Red Wine Quality Classification
1. Dataset Introduction
The dataset contains 11 features and 3,269 samples in total. We train a model to predict red wine quality, which falls into 6 classes, labeled with the digits 0, 1, 2, 3, 4, and 5.
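As a quick sanity check, the raw file can be inspected before any preprocessing. This is a minimal sketch, assuming the same CSV used in the code below (data/红酒品质分类.csv) with the quality label stored in the last column:

import pandas as pd
from collections import Counter

data = pd.read_csv('data/红酒品质分类.csv')
print(data.shape)                 # expected: (n_samples, 12), i.e. 11 features + 1 label column
print(Counter(data.iloc[:, -1]))  # raw quality labels, before remapping them to 0-5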
2. Case Implementation
2.1 Import the Required Libraries
import itertools
import joblib
import numpy as np
import xgboost as xgb
import pandas as pd
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
import time
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)
2.2 Basic Data Processing
def test01():
    # 1. Load the raw data
    data = pd.read_csv('data/红酒品质分类.csv')
    x = data.iloc[:, :-1]
    # The original quality labels are 3-8; shift them down to 0-5
    y = data.iloc[:, -1] - 3

    # 2. Handle class imbalance with SMOTE oversampling
    print('Class distribution:', Counter(y))
    x, y = SMOTE().fit_resample(x, y)
    print('Class distribution:', Counter(y))

    # 3. Split into training and validation sets
    x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, stratify=y, random_state=22)

    # 4. Save the splits (index=False avoids writing an extra index column that would later be read back as a feature)
    pd.concat([x_train, y_train], axis=1).to_csv('data/红酒品质分类-train.csv', index=False)
    pd.concat([x_valid, y_valid], axis=1).to_csv('data/红酒品质分类-valid.csv', index=False)
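To make the effect of the oversampling step above concrete, here is a small self-contained sketch that is not part of the case itself: it uses sklearn's make_classification only to fabricate an imbalanced toy set, then shows that SMOTE synthesizes new minority-class samples until every class matches the size of the majority class.

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

# Fabricate an imbalanced 3-class toy dataset
X_demo, y_demo = make_classification(n_samples=300, n_classes=3, n_informative=4,
                                     weights=[0.7, 0.2, 0.1], random_state=0)
print(Counter(y_demo))   # imbalanced, roughly {0: 210, 1: 60, 2: 30}
X_res, y_res = SMOTE(random_state=0).fit_resample(X_demo, y_demo)
print(Counter(y_res))    # balanced: every class now has as many samples as the largest one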
2.3 Basic Model Training
def test02():
    # 1. Load the prepared training and validation data
    train_data = pd.read_csv('data/红酒品质分类-train.csv')
    valid_data = pd.read_csv('data/红酒品质分类-valid.csv')
    # Training set
    x_train = train_data.iloc[:, :-1]
    y_train = train_data.iloc[:, -1]
    # Validation set
    x_valid = valid_data.iloc[:, :-1]
    y_valid = valid_data.iloc[:, -1]

    # 2. Train the XGBoost model
    estimator = xgb.XGBClassifier(n_estimators=100,
                                  objective='multi:softmax',
                                  eval_metric='merror',
                                  eta=0.1,
                                  use_label_encoder=False,
                                  random_state=22)
    estimator.fit(x_train, y_train)

    # 3. Evaluate the model
    y_pred = estimator.predict(x_valid)
    print(classification_report(y_true=y_valid, y_pred=y_pred))

    # 4. Persist the model
    joblib.dump(estimator, 'model/xgboost.pth')
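Once test02 has been run and 'model/xgboost.pth' exists, the persisted model can be reloaded with joblib and reused without retraining. A minimal usage sketch (the helper name load_and_evaluate is an assumption, not part of the original code):

def load_and_evaluate():
    # Reload the validation split and the persisted model
    valid_data = pd.read_csv('data/红酒品质分类-valid.csv')
    x_valid = valid_data.iloc[:, :-1]
    y_valid = valid_data.iloc[:, -1]
    estimator = joblib.load('model/xgboost.pth')
    # Score the reloaded model on the validation set
    print('Validation accuracy:', estimator.score(x_valid, y_valid))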
2.4 Hyperparameter Tuning
def test03():
    # 1. Load the prepared training and validation data
    train_data = pd.read_csv('data/红酒品质分类-train.csv')
    valid_data = pd.read_csv('data/红酒品质分类-valid.csv')
    # Training set
    x_train = train_data.iloc[:, :-1]
    y_train = train_data.iloc[:, -1]
    # Validation set
    x_valid = valid_data.iloc[:, :-1]
    y_valid = valid_data.iloc[:, -1]

    # 2. Define the hyperparameter grid
    param_grid = {'max_depth': np.arange(3, 11, 1),
                  'n_estimators': np.arange(50, 550, 50),
                  'eta': np.arange(0.1, 1, 0.1)}

    # 3. Enumerate every hyperparameter combination ("odometer" style counting)
    params = list(param_grid.items())
    params_start = [0] * len(params)
    params_end = [len(value) - 1 for _, value in params]
    param_list = []
    while True:
        input_params = {}
        for index, (key, value) in zip(params_start, params):
            input_params[key] = value[index]
        param_list.append(input_params)
        if params_start == params_end:
            break
        # Advance the first counter and propagate carries to the following positions
        params_start[0] += 1
        for idx in range(len(params_start) - 1):
            if params_start[idx] > params_end[idx]:
                params_start[idx] = 0
                params_start[idx + 1] += 1

    # 4. Build the cross-validation splits (stratified 5-fold)
    cv_splits = []
    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    for train_index, valid_index in splitter.split(x_train, y_train):
        cv_splits.append({'train': train_index, 'valid': valid_index})

    # 5. Search for the best hyperparameter combination
    base_estimator = xgb.XGBClassifier(use_label_encoder=False, eval_metric='merror')
    train_result = []
    for param in param_list:
        start = time.time()
        # Scores of the current combination on each fold
        train_score = []
        for data in cv_splits:
            # Get this fold's training and validation indices
            train_index, valid_index = data['train'], data['valid']
            train_x, train_y = x_train.values[train_index], y_train.values[train_index]
            valid_x, valid_y = x_train.values[valid_index], y_train.values[valid_index]
            # Train the model on this fold
            estimator = clone(base_estimator)
            estimator.set_params(**param)
            estimator.fit(train_x, train_y)
            # Evaluate the model on the fold's validation part
            score = estimator.score(valid_x, valid_y)
            train_score.append(score)
        # Record the mean fold score for this combination
        train_result.append([np.mean(train_score), param])
        end = time.time()
        print('score: %.2f parameters: %s time: %.2fs' % (np.mean(train_score), param, end - start))

    # Sort the results by score, best first
    train_result.sort(key=lambda x: x[0], reverse=True)

    # Retrain the final model with the best hyperparameters
    best_param = train_result[0][1]
    print('best_param:', best_param)
    best_estimator = clone(base_estimator)
    best_estimator.set_params(**best_param)
    best_estimator.fit(x_train, y_train)

    # Persist the best model
    joblib.dump(best_estimator, 'model/best_xgboost.pth')

    # Evaluate the best model on the held-out validation set
    acc = best_estimator.score(x_valid, y_valid)
    print('Accuracy:', acc)
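Incidentally, itertools is imported at the top of the script but never used. The hand-rolled odometer enumeration in step 3 above produces the same combinations as itertools.product over the grid's value lists; a compact alternative sketch (build_param_list is a hypothetical helper, not part of the original code):

def build_param_list(param_grid):
    # Cartesian product of all value lists, keyed back to the parameter names
    keys = list(param_grid.keys())
    return [dict(zip(keys, values)) for values in itertools.product(*param_grid.values())]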
