跳转至

使用 sklearn 实现支持向量机分类算法

对银行客户流失数据进行特征筛选,构建 SVM 分类器,并与逻辑回归、决策树模型的分类效果进行对比。

Python
# 导入包
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
from scipy.stats import pearsonr
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import metrics
Python
# 读取数据
train = pd.read_excel('BankChurners2.xlsx', sheet_name='training')
test = pd.read_excel('BankChurners2.xlsx', sheet_name='testing')
x_train = train.drop(['Attrition_Flag'], axis=1)
y_train = train['Attrition_Flag']
x_test = test.drop(['Attrition_Flag'], axis=1)
y_test = test['Attrition_Flag']

变量编码

Python
# 提取文本类型的特征
text_features = [column for column in x_train.columns if x_train[column].dtype == 'object']
# 对文本类型的特征进行编码
le = LabelEncoder()
for column in text_features:
    x_train[column] = le.fit_transform(train[column])
    x_test[column] = le.transform(test[column])
# 对标签进行编码
y_train.replace({'Existing Customer': 0, 'Attrited Customer': 1}, inplace=True)
y_test.replace({'Existing Customer': 0, 'Attrited Customer': 1}, inplace=True)

特征选择

移除低方差特征

Python
# 删除 id 列
x_train.drop(['id'], axis=1, inplace=True)
x_test.drop(['id'], axis=1, inplace=True)
# 将特征进行最小最大归一化
scaler = MinMaxScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train), index=x_train.index, columns=x_train.columns)
x_test = pd.DataFrame(scaler.transform(x_test), index=x_test.index, columns=x_test.columns)
# 移除低方差(归一化后方差小于 0.02)特征
selector = VarianceThreshold(threshold=0.02)
selector.fit(x_train)
# 打印因为低方差被移除的特征
print('因为低方差被移除特征:', x_train.columns[~selector.get_support()])
x_train = x_train.loc[:, selector.get_support()]
x_test = x_test.loc[:, selector.get_support()]
Text Only
因为低方差被移除特征: Index(['Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1'], dtype='object')

移除高相关特征中方差较低的那一个

Python
# 绘制特征之间的相关性热力图
plt.figure(figsize=(20, 20))
corr = x_train.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

Python
# 打印相关性绝对值大于 0.8 的特征
corr = corr.abs()
corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
corr = corr.stack()
corr = corr[corr > 0.8]
print('相关性绝对值大于 0.8 的特征:', [i for i in corr.index])
# 从数据集中移除相关性绝对值大于 0.8 的特征中方差较小的特征
feature_to_remove = []
corr = corr.reset_index()
corr.columns = ['feature1', 'feature2', 'corr']
for row in corr.itertuples():
    if x_train[row.feature1].var() < x_train[row.feature2].var():
        feature_to_remove.append(row.feature1)
    else:
        feature_to_remove.append(row.feature2)
feature_to_remove = list(set(feature_to_remove))
print('相关性绝对值大于 0.8 的特征中方差较小的特征:', feature_to_remove)
x_train.drop(feature_to_remove, axis=1, inplace=True)
x_test.drop(feature_to_remove, axis=1, inplace=True)
Text Only
相关性绝对值大于0.8的特征: [('Credit_Limit', 'Avg_Open_To_Buy'), ('Total_Trans_Amt', 'Total_Trans_Ct')]
相关性绝对值大于0.8的特征中方差较小的特征: ['Avg_Open_To_Buy', 'Total_Trans_Ct']

移除与标签相关性较低的特征

Python
# 移除与标签相关性绝对值小于 0.02 的特征
feature_to_remove = []
for i in x_train.columns:
    pearson_corr = pearsonr(x_train[i], y_train)[0]
    if abs(pearson_corr)<0.02:
        feature_to_remove.append(i)
print('与标签相关性绝对值小于 0.02 的特征:', feature_to_remove)
x_train.drop(feature_to_remove, axis=1, inplace=True)
x_test.drop(feature_to_remove, axis=1, inplace=True)
Text Only
与标签相关性绝对值小于0.02的特征: ['Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book']
Python
# 打印最终选择的特征
print('最终选择的特征', x_train.columns.values)
Text Only
最终选择的特征 ['Customer_Age' 'Gender' 'Dependent_count' 'Total_Relationship_Count'
 'Months_Inactive_12_mon' 'Contacts_Count_12_mon' 'Credit_Limit'
 'Total_Revolving_Bal' 'Total_Trans_Amt' 'Avg_Utilization_Ratio']

构建 SVM 分类器

Python
svm = SVC(kernel='poly')
svm.fit(x_train, y_train)
print('训练集准确率:%.2f%%' % (svm.score(x_train, y_train) * 100))
print("超平面的截距项 b 为{}".format(svm.intercept_[0]))
print("决策函数为{}".format(svm.decision_function(x_train)))
Text Only
训练集准确率: 88.04%
超平面的截距项b为-0.9575823695081735
决策函数为[-1.59427586 -1.8345902  -1.00014489 ... -1.60371229 -1.94589042
 -1.6133594 ]

在测试集上给出模型分类的效果

Python
def evaluate(true, pred):
    acc = metrics.accuracy_score(true, pred)
    presision = metrics.precision_score(true, pred)
    recall = metrics.recall_score(true, pred)
    f1 = metrics.f1_score(true, pred)
    print('整体分类误差率:{:.2%}'.format(acc))
    print('Precision:{:.2%}'.format(presision))
    print('Recall:{:.2%}'.format(recall))
    print('F1 得分:{:.2%}'.format(f1))
    return acc, presision, recall, f1
Python
# 在测试集上给出模型分类的效果
y_pred_svm = svm.predict(x_test)
print(classification_report(y_test, y_pred_svm))
acc_svm, presision_svm, recall_svm, f1_svm = evaluate(y_test, y_pred_svm)
Text Only
              precision    recall  f1-score   support

           0       0.89      0.99      0.93      2637
           1       0.82      0.32      0.46       490

    accuracy                           0.88      3127
   macro avg       0.85      0.66      0.70      3127
weighted avg       0.88      0.88      0.86      3127

整体分类误差率:88.26%
Precision:81.54%
Recall:32.45%
F1得分:46.42%

将 SVM 的结果与逻辑回归以及决策树结果进行对比分析

逻辑回归

Python
lr = LogisticRegression()
lr.fit(x_train, y_train)
# 测试集上的预测结果
y_pred_lr = lr.predict(x_test)
print(classification_report(y_test, y_pred_lr))
acc_lr, presision_lr, recall_lr, f1_lr = evaluate(y_test, y_pred_lr)
Text Only
              precision    recall  f1-score   support

           0       0.88      0.98      0.93      2637
           1       0.78      0.31      0.44       490

    accuracy                           0.88      3127
   macro avg       0.83      0.65      0.69      3127
weighted avg       0.87      0.88      0.85      3127

整体分类误差率:87.78%
Precision:77.55%
Recall:31.02%
F1得分:44.31%

决策树

Python
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
# 测试集上的预测结果
y_pred_dt = dt.predict(x_test)
print(classification_report(y_test, y_pred_dt))
acc_dt, presision_dt, recall_dt, f1_dt = evaluate(y_test, y_pred_dt)
Text Only
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      2637
           1       0.73      0.72      0.72       490

    accuracy                           0.91      3127
   macro avg       0.84      0.83      0.84      3127
weighted avg       0.91      0.91      0.91      3127

整体分类误差率:91.43%
Precision:73.12%
Recall:71.63%
F1得分:72.37%

绘制各模型在测试集上的训练效果

ROC 曲线

Python
# 绘制 ROC 曲线
# SVM 的 ROC 曲线
y_pred_proba_svm = svm.decision_function(x_test)
fpr_svm, tprr_svm, thresholdsr_svm = metrics.roc_curve(y_test, y_pred_proba_svm)
plt.plot(fpr_svm, tprr_svm, color='r', lw=2, label='SVM')
# 逻辑回归的 ROC 曲线
y_pred_proba_lr = lr.predict_proba(x_test)[:, 1]
fpr_lr, tprr_lr, thresholdsr_lr = metrics.roc_curve(y_test, y_pred_proba_lr)
plt.plot(fpr_lr, tprr_lr, color='b', lw=2, label='LR')
# 决策树的 ROC 曲线
y_pred_proba_dt = dt.predict_proba(x_test)[:,1]
fpr_df, tpr_dt, thresholds_dt = metrics.roc_curve(y_test, y_pred_proba_dt)
plt.plot(fpr_df, tpr_dt, color='g', lw=2, label='决策树')

plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

混淆矩阵

Python
# 绘制测试集上的混淆矩阵
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap='Blues', colorbar=True):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    if colorbar:
        plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
Python
fig = plt.figure(figsize=(15, 5))
# SVM 的混淆矩阵
ax1 = fig.add_subplot(131)
cm_svm = metrics.confusion_matrix(y_test, y_pred_svm)
plot_confusion_matrix(cm_svm, classes=['0', '1'], normalize=True, title='SVM', colorbar=False)
# 逻辑回归的混淆矩阵
ax2 = fig.add_subplot(132)
cm_lr = metrics.confusion_matrix(y_test, y_pred_lr)
plot_confusion_matrix(cm_lr, classes=['0', '1'], normalize=True, title='逻辑回归', colorbar=False)
# 决策树的混淆矩阵
ax3 = fig.add_subplot(133)
cm_dt = metrics.confusion_matrix(y_test, y_pred_dt)
plot_confusion_matrix(cm_dt, classes=['0', '1'], normalize=True, title='决策树', colorbar=False)

准确率、精确率、召回率、F1 值

Python
# 绘制模型的评价指标
def to_percent(temp, position):
        return '%1.0f'%(100*temp) + '%'
fig = plt.figure(figsize=(15, 5))
ax1 = fig.add_subplot(131)
plt.bar(['SVM', 'LR', 'DT'], [acc_svm, acc_lr, acc_dt], color=['r', 'b', 'g'])
plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))
plt.title('Accuracy')
plt.ylim(0.8, 1)
ax2 = fig.add_subplot(132)
plt.bar(['SVM', 'LR', 'DT'], [presision_svm, presision_lr, presision_dt], color=['r', 'b', 'g'])
plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))
plt.title('Presision')
plt.ylim(0.6, 1)
ax3 = fig.add_subplot(133)
plt.bar(['SVM', 'LR', 'DT'], [recall_svm, recall_lr, recall_dt], color=['r', 'b', 'g'])
plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))
plt.title('Recall')
plt.show()

评论