def get_gridsearch_params(param_grid):
    """Expand a dict of candidate-value lists into a list of all parameter combinations."""
    params_combination = [dict()]
    for k, v_list in param_grid.items():
        tmp = [{k: v} for v in v_list]
        n = len(params_combination)
        # Duplicate the combinations built so far once per candidate value of k,
        # then merge each candidate value into its own copy.
        copy_params = [copy.deepcopy(params_combination) for _ in range(len(tmp))]
        params_combination = sum(copy_params, [])
        _ = [params_combination[i * n + j].update(tmp[i])
             for j in range(n) for i in range(len(tmp))]
    return params_combination
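# Illustration (not part of the original script): get_gridsearch_params returns
# the Cartesian product of the grid, one dict per combination, e.g.
#
#   get_gridsearch_params({'lr': [0.5, 1.0], 'epoch': [30]})
#   -> [{'lr': 0.5, 'epoch': 30}, {'lr': 1.0, 'epoch': 30}]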
def get_KFold_scores(df, params, kf, metric, n_classes):
    metric_score = 0.0

    for train_idx, val_idx in kf.split(df['words'], df['label']):
        df_train = df.iloc[train_idx]
        df_val = df.iloc[val_idx]

        # Write the training fold to a temporary tab-separated file for fastText.
        tmpdir = tempfile.mkdtemp()
        tmp_train_file = tmpdir + '/train.txt'
        df_train.to_csv(tmp_train_file, sep='\t', index=False, header=None, encoding='UTF-8')

        fast_model = fasttext.train_supervised(tmp_train_file, label_prefix='__label__', thread=3, **params)
        # predict() returns (labels, probabilities); keep the trailing character of
        # each predicted '__label__N' tag (assumes single-digit class ids).
        predicted = fast_model.predict(df_val['words'].tolist())
        y_val_pred = [int(label[0][-1:]) for label in predicted[0]]
        y_val = df_val['label'].values

        score = get_metrics(y_val, y_val_pred, n_classes)[metric]
        metric_score += score
        shutil.rmtree(tmpdir, ignore_errors=True)

    print('average score:', metric_score / kf.n_splits)
    return metric_score / kf.n_splits
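# Note on the training-file format: fastText's supervised mode expects each line
# to carry the class as a token with the '__label__' prefix next to the
# (pre-tokenised) text, e.g.
#
#   __label__1 today the weather is great
#   __label__0 the market fell sharply
#
# This snippet assumes the 'words'/'label' columns have already been prepared so
# that the tab-separated file written above matches that format; the preparation
# step is not shown here.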
def my_gridsearch_cv(df, param_grid, metrics, kfold=10):
    n_classes = len(np.unique(df['label']))
    print('n_classes', n_classes)

    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=1)

    params_combination = get_gridsearch_params(param_grid)

    best_score = 0.0
    best_params = dict()
    for params in params_combination:
        avg_score = get_KFold_scores(df, params, skf, metrics, n_classes)
        if avg_score > best_score:
            best_score = avg_score
            best_params = copy.deepcopy(params)

    return best_score, best_params
import copy
import shutil
import tempfile

import fasttext
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report)
def get_metrics(y_true, y_pred, n_classes):
    metrics = {}

    if n_classes == 2:
        # Binary case: score the positive class.
        metrics['precision'] = precision_score(y_true, y_pred, pos_label=1)
        metrics['recall'] = recall_score(y_true, y_pred, pos_label=1)
        metrics['f1'] = f1_score(y_true, y_pred, pos_label=1)
    else:
        # Multi-class case: macro-average over classes.
        average = 'macro'
        metrics[average + '_precision'] = precision_score(y_true, y_pred, average=average)
        metrics[average + '_recall'] = recall_score(y_true, y_pred, average=average)
        metrics[average + '_f1'] = f1_score(y_true, y_pred, average=average)

    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['confusion_matrix'] = confusion_matrix(y_true, y_pred)
    metrics['classification_report'] = classification_report(y_true, y_pred)
    return metrics
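# Quick sanity check for get_metrics (illustrative values only):
#
#   get_metrics([0, 1, 1, 0], [0, 1, 0, 0], n_classes=2)['f1']        # ~0.667
#   get_metrics([0, 1, 2, 2], [0, 2, 2, 1], n_classes=3)['macro_f1']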
tuned_parameters = {
    'lr': [1.0, 0.85, 0.5],
    'epoch': [30, 50],
    'dim': [200],
    'wordNgrams': [2, 3],
}
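# This grid expands to 3 (lr) x 2 (epoch) x 1 (dim) x 2 (wordNgrams) = 12
# parameter combinations; with kfold=10 below, the search trains fastText
# 12 x 10 = 120 times in total.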
if __name__ == '__main__':
    # train_x: a DataFrame with tokenised text in 'words' and class ids in
    # 'label', prepared earlier (the loading step is not shown in this snippet).
    print(train_x.head())
    print(train_x.shape)
    best_score, best_params = my_gridsearch_cv(train_x, tuned_parameters, 'accuracy', kfold=10)
    print('best_score', best_score)
    print('best_params', best_params)