Cross-Validation

如何在多標籤分類器上使用 scikit-learn 的交叉驗證功能

  • July 28, 2013

我正在一個有 5 個類別的數據集上測試不同的分類器。每個實例可以屬於其中一個或多個類別,所以我使用 scikit-learn 的多標籤分類器,具體來說是 sklearn.multiclass.OneVsRestClassifier。現在我想使用 sklearn.cross_validation.StratifiedKFold 執行交叉驗證,但這會產生以下錯誤:

Traceback (most recent call last):
 File "mlfromcsv.py", line 93, in <module>
   main()
 File "mlfromcsv.py", line 77, in main
   test_classifier_multilabel(svm.LinearSVC(), X, Y, 'Linear Support Vector Machine')
 File "mlfromcsv.py", line 44, in test_classifier_multilabel
   scores = cross_validation.cross_val_score(clf_ml, X, Y_list, cv=cv, score_func=metrics.precision_recall_fscore_support, n_jobs=jobs)
 File "/usr/lib/pymodules/python2.7/sklearn/cross_validation.py", line 1046, in cross_val_score
   X, y = check_arrays(X, y, sparse_format='csr')
 File "/usr/lib/pymodules/python2.7/sklearn/utils/validation.py", line 144, in check_arrays
   size, n_samples))
ValueError: Found array with dim 5. Expected 98816

請注意,訓練多標籤分類器本身不會崩潰,只有交叉驗證會崩潰。我該如何對這個多標籤分類器執行交叉驗證?

我還編寫了第二個版本,將問題分解為 5 個單獨的分類器,並分別對它們進行訓練和交叉驗證。這個版本運作良好。

這是我的代碼。函數 test_classifier_multilabel 是出問題的那一個;test_classifier 是我的另一個嘗試(將問題分解為 5 個分類器和 5 次交叉驗證)。

import numpy as np
from sklearn import *
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
import time

def test_classifier(clf, X, Y, description, jobs=1):
    """Cross-validate ``clf`` against each column of ``Y`` and print a report.

    For every column index of ``Y`` a 3-fold ``StratifiedKFold`` is built from
    that column, ``cross_val_score`` is run with
    ``metrics.precision_recall_fscore_support`` as the score function, and a
    table with mean +/- standard deviation over the folds is printed for the
    'Negative' (index 0) and 'Positive' (index 1) score classes.

    Args:
        clf: Estimator handed unchanged to ``cross_val_score``.
        X: Feature data handed unchanged to ``cross_val_score``.
        Y: Two-dimensional label array; one cross-validation run is made per
            column (``Y[:, class_idx]``).
        description: Text inserted into the header line of the report.
        jobs: Forwarded as ``n_jobs`` to ``cross_val_score``.

    Returns:
        None. All results are written to standard output.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('=== Testing classifier {0} ==='.format(description))

    # The table layout does not depend on the class, so define it once.
    str_tbl_fmt = '{:>15s}{:>15s}{:>15s}{:>15s}{:>15s}'
    str_tbl_entry_fmt = '{:0.2f} +/- {:0.2f}'

    for class_idx in range(Y.shape[1]):
        print(' > Cross-validating for class {:d}'.format(class_idx))
        class_labels = Y[:, class_idx]
        cv = cross_validation.StratifiedKFold(class_labels, 3)

        # Wall-clock time: time.clock() reports CPU time of this process on
        # Unix, which does not reflect work done by n_jobs worker processes.
        t_start = time.time()
        scores = cross_validation.cross_val_score(
            clf, X, class_labels, cv=cv,
            score_func=metrics.precision_recall_fscore_support, n_jobs=jobs)
        t_end = time.time()
        print('Cross validation time: {:0.3f}s.'.format(t_end - t_start))

        print(str_tbl_fmt.format('', 'Precision', 'Recall', 'F1 score', 'Support'))
        # scores is indexed as [fold, metric, score_class]; metrics 0..2 are
        # precision, recall and F1 score, metric 3 is the support.
        for (score_class, lbl) in [(0, 'Negative'), (1, 'Positive')]:
            cells = []
            for metric_idx in (0, 1, 2):
                per_fold = scores[:, metric_idx, score_class]
                cells.append(
                    str_tbl_entry_fmt.format(per_fold.mean(), per_fold.std()))
            support = scores[:, 3, score_class].mean()
            print(str_tbl_fmt.format(
                lbl, cells[0], cells[1], cells[2], '{:0.2f}'.format(support)))

def test_classifier_multilabel(clf, X, Y, description, jobs=1):
   """Wrap ``clf`` in a OneVsRestClassifier, fit it and try to cross-validate it.

   Prints a header, the shape information of ``Y_list``, a training-set
   mismatch count (labelled 'Accuracy') and, if cross-validation succeeds, a
   precision / recall / F1 / support table for score classes 0 and 1.

   Args:
       clf: Base estimator passed to ``OneVsRestClassifier``.
       X: Feature data; must expose ``.shape``.
       Y: Label array; must expose ``.T`` and be comparable element-wise
           with the output of ``predict``.
       description: Text inserted into the header line.
       jobs: Forwarded as ``n_jobs`` to ``cross_val_score``.

   NOTE(review): the traceback reported for this script points at the
   ``cross_val_score`` call below (ValueError from ``check_arrays``).
   """
   print '=== Testing multi-label classifier {0} ==='.format(description)
   # NOTE(review): n_samples is assigned but never read in this function.
   n_samples = X.shape[0]
   # Iterating over Y.T yields one entry per *column* of Y, so Y_list has as
   # many entries as there are label columns, not as many as there are samples.
   Y_list = [value for value in Y.T]
   print 'Y_list[0].shape:', Y_list[0].shape, 'len(Y_list):', len(Y_list)
   # NOTE(review): the folds are built from Y_list (per-column data), not from
   # a per-sample label vector -- confirm this is what StratifiedKFold expects.
   cv = cross_validation.StratifiedKFold(Y_list, 3)
   clf_ml = OneVsRestClassifier(clf)
   # NOTE(review): this is the number of label entries where the prediction on
   # the training data differs from Y (an error count), yet it is printed
   # under the label 'Accuracy'.
   accuracy = (clf_ml.fit(X, Y).predict(X) != Y).sum()
   print 'Accuracy: {:0.2f}'.format(accuracy)
   # NOTE(review): X is passed together with Y_list, whose length is the number
   # of label columns; this is the line the reported ValueError originates from.
   scores = cross_validation.cross_val_score(clf_ml, X, Y_list, cv=cv, score_func=metrics.precision_recall_fscore_support, n_jobs=jobs)
   str_tbl_fmt = '{:>15s}{:>15s}{:>15s}{:>15s}{:>15s}'
   str_tbl_entry_fmt = '{:0.2f} +/- {:0.2f}'
   print str_tbl_fmt.format('', 'Precision', 'Recall', 'F1 score', 'Support')
   # The indexing below treats scores as [fold, metric, score_class] with
   # metrics 0..3 = precision, recall, F1 score, support (per the column
   # headers printed above) -- TODO confirm this holds for multi-label output.
   for (score_class, lbl) in [(0, 'Negative'), (1, 'Positive')]:
       mean_precision = scores[:,0,score_class].mean()
       std_precision = scores[:,0,score_class].std()
       mean_recall = scores[:,1,score_class].mean()
       std_recall = scores[:,1,score_class].std()
       mean_f1_score = scores[:,2,score_class].mean()
       std_f1_score = scores[:,2,score_class].std()
       support = scores[:,3,score_class].mean()
       print str_tbl_fmt.format(
           lbl,
           str_tbl_entry_fmt.format(mean_precision, std_precision),
           str_tbl_entry_fmt.format(mean_recall, std_recall),
           str_tbl_entry_fmt.format(mean_f1_score, std_f1_score),
           '{:0.2f}'.format(support))

def main():
   nfeatures = 13
   nclasses = 5
   ncolumns = nfeatures + nclasses

   data = np.loadtxt('./feature_db.csv', delimiter=',', usecols=range(ncolumns))

   print data, data.shape
   X = np.hstack((data[:,0:3], data[:,(nfeatures-1):nfeatures]))
   print 'X.shape:', X.shape
   Y = data[:,nfeatures:ncolumns]
   print 'Y.shape:', Y.shape

   test_classifier(svm.LinearSVC(), X, Y, 'Linear Support Vector Machine', jobs=-1)
   test_classifier_multilabel(svm.LinearSVC(), X, Y, 'Linear Support Vector Machine')

if  __name__ =='__main__':
   main()

我正在使用 Ubuntu 13.04 和 scikit-learn 0.12。我的數據由兩個數組 X 和 Y 組成,形狀分別為 (98816, 4) 和 (98816, 5),即每個實例有 4 個特徵和 5 個類標籤。標籤為 1 或 0,表示該實例是否屬於對應的類。我沒有找到太多相關文檔,所以想確認:我使用的格式是否正確?

分層抽樣意味著在 KFold 抽樣的每一折中都保留類成員的分佈。這在多標籤情況下沒有多大意義,因為目標向量中的每個觀測可能帶有多個標籤。

在這種情況下,「分層」有兩種可能的解釋。

第一種解釋:對於 $n$ 個標籤、且每個觀測至少有一個標籤被設定的情況,總共有 $2^n - 1$ 種不同的標籤組合。您可以把每一種組合視為一個分箱,並對這些分箱執行分層抽樣。

另一種選擇是嘗試劃分訓練數據,使標籤向量分佈的概率質量在各折(fold)之間大致相同。例如:

import numpy as np

# Reproducible demo data: 5000 random 0/1 label vectors over 5 classes,
# keeping only the rows that have at least one label set.
np.random.seed(1)
y = np.random.randint(0, 2, size=(5000, 5))
y = y[y.any(axis=1)]


def proba_mass_split(y, folds=7):
    """Greedily assign rows of a label matrix to folds with similar label mix.

    The overall label distribution is the column sums of ``y`` normalised to
    sum to one. Rows are processed in order: the first ``folds`` rows seed one
    fold each; every later row goes to the fold whose current normalised label
    distribution is corrected most by adding that row.

    Args:
        y: Two-dimensional array of shape (observations, classes) holding the
            label indicators (the demo data uses 0/1 values).
        folds: Number of folds to create.

    Returns:
        A list of ``folds`` lists; list ``k`` holds the row indices assigned
        to fold ``k``. Every row index appears in exactly one list.

    Side effects:
        Prints the per-fold label counts to standard output.
    """
    obs, classes = y.shape

    # Target distribution over classes. Skip the division when there is no
    # label mass at all, which would otherwise produce NaNs.
    dist = y.sum(axis=0).astype('float')
    total_mass = dist.sum()
    if total_mass > 0:
        dist /= total_mass

    index_list = [[] for _ in range(folds)]
    fold_dist = np.zeros((folds, classes), dtype='float')

    for i in range(obs):
        if i < folds:
            # Seed each fold with one observation.
            target_fold = i
        else:
            fold_mass = fold_dist.sum(axis=1)
            empty_folds = np.flatnonzero(fold_mass == 0)
            if empty_folds.size:
                # A fold without any label mass has no defined distribution.
                # Fill the first such fold; this is the fold the NaN-based
                # argmin used to select, but without dividing by zero.
                target_fold = empty_folds[0]
            else:
                normed_folds = fold_dist.T / fold_mass
                # Positive entries: the fold over-represents that class.
                how_off = normed_folds.T - dist
                # y[i] - .5 is +.5 for set labels and -.5 for unset ones, so
                # the smallest product marks the fold that lacks this row's
                # labels the most.
                target_fold = np.argmin(
                    np.dot((y[i] - .5).reshape(1, -1), how_off.T))
        fold_dist[target_fold] += y[i]
        index_list[target_fold].append(i)

    print("Fold distributions are")
    print(fold_dist)
    return index_list


if __name__ == '__main__':
    proba_mass_split(y)

若要得到與 KFold 相同形式的(訓練、測試)索引,可以改寫上面的函數:把每一折的索引當作測試集,再用 np.setdiff1d(np.arange(y.shape[0]), 該折的索引) 取得對應的訓練索引,然後把它包裝在一個帶有 __iter__ 方法的類中。

引用自:https://stats.stackexchange.com/questions/65828

comments powered by Disqus