# Source code for dynapipe.autoCV

#!/usr/bin/env python

from dynapipe.estimatorCV import clf_cv,reg_cv
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score
from dynapipe.utilis_func import update_progress,delete_old_log_files
import joblib
import datetime
import numpy as np
from time import time
import warnings
import os
import logging

path = os.getcwd()

# Timestamp shared by the log file name and the banner written at import time.
LOG_TS = datetime.datetime.now().strftime("%Y.%m.%d.%H.%M.%S")
logs_folder = os.path.join(os.getcwd(), 'logs')
# exist_ok avoids the check-then-create race when several processes import
# this module at the same time.
os.makedirs(logs_folder, exist_ok=True)
log_name = os.path.join(logs_folder, f'{os.path.basename(__file__).split(".")[0]}_log_{LOG_TS}.log')

# NOTE(review): the listing this module was recovered from references
# DELETE_FLAG without showing its definition, and shows no log level at all.
# Both values below are assumptions -- confirm against the upstream package.
LOG_LEVEL = logging.DEBUG
DELETE_FLAG = True

TS = time()
logger = logging.getLogger(__name__)
logger.setLevel(LOG_LEVEL)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s','%d/%m %H:%M:%S')
fh = logging.FileHandler(filename = log_name)
# Without these three calls the formatter is never used and nothing reaches
# the log file.
fh.setLevel(LOG_LEVEL)
fh.setFormatter(formatter)
logger.addHandler(fh)
Test_case = f'Dynamic Pipeline - autoCV - Auto Model Selection w/ Cross Validation :: {LOG_TS}'
Test_comment = '-' * len(Test_case) * 3
Start_log = '#' * len(Test_case) * 3
# NOTE(review): banner emission is reconstructed from the otherwise unused
# Start_log / Test_case / Test_comment names -- confirm the exact sequence.
logger.info(Start_log)
logger.info(Test_case)
logger.info(Start_log)
delete_old_log_files(directory = logs_folder ,delete_flag = DELETE_FLAG, logger = logger, extension_list = ['.log'],filename_list = ['autoCV_log'],log_ts = LOG_TS)
logger.info(Test_comment)

def warn(*args, **kwargs):
    """Do nothing; drop-in replacement used to silence ``warnings.warn``.

    Accepts and ignores any positional or keyword arguments so it can stand
    in for ``warnings.warn`` regardless of how callers invoke it.
    """
    pass

def print_results(results,in_pipeline = False):
    """Report the outcome of a fitted hyperparameter search.

    Parameters
    ----------
    results : object
        Fitted search object exposing ``best_params_``, ``best_score_`` and a
        ``cv_results_`` mapping with ``mean_test_score``, ``std_test_score``
        and ``params`` entries.
    in_pipeline : bool, default = False
        When False the best parameters and best score are also printed to
        stdout; the module logger receives everything in both modes.

    Returns
    -------
    None
    """
    if not in_pipeline:
        print('Best Parameters: {}\n'.format(results.best_params_))
        print('Best CV Score: {}\n'.format(results.best_score_))
    logger.info('Best Paramaters: {}\n'.format(results.best_params_))
    logger.info('Best CV Score: {}\n'.format(results.best_score_))

    cv_results = results.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    # One log line per parameter combination: mean score with a two-sigma band.
    for mean, std, params in zip(means, stds, cv_results['params']):
        logger.info('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

class evaluate_model:
    """This class implements model evaluation and return key score results.

    Parameters
    ----------
    model_type : str, default = None
        Value in ["reg","cls"]. The "reg" for regression problem, and "cls" for classification problem.

    in_pipeline : bool, default = False
        Should be set to "True" when using autoPipe module to build Pipeline Cluster Traveral Experiments.

    Example
    -------
    .. [Example]

    References
    ----------
    None
    """

    _SUPPORTED_TYPES = ("cls", "reg")

    def __init__(self,model_type = None, in_pipeline = False):
        self.model_type = model_type
        self.in_pipeline = in_pipeline

    def fit(self,name = None, model = None, features = None, labels = None):
        """Model evaluation with all models by the validate datasets.

        Parameters
        ----------
        name : str, default = None
            Estimator name.

        model : pkl, default = None
            Model needs to evaluate. Needs pkl file as input when in_pipeline = "False"; otherwise, should use DICT_EST[estimator name] as the input here.
            Only its ``predict(features)`` method is used.

        features : df, default = None
            Validate features columns.
            ( NOTE: In the Pipeline Cluster Traversal Experiments, the features columns should be from the same pipeline dataset).

        labels : df ,default = None
            Validate label column.
            ( NOTE: In the Pipeline Cluster Traversal Experiments, the label column should be from the same pipeline dataset).

        Returns
        -------
        optimal_scores : list
            When model_type = "cls", will return [name,accuracy,precision,recall,latency] info of model validation results.
            when model_type = "reg", will return [name,R-squared,MAE,MSE,RMSE,latency] info of model validation results.
            ``latency`` is the wall-clock time of the ``predict`` call in milliseconds.

        Raises
        ------
        ValueError
            If ``model_type`` is neither "cls" nor "reg". Previously this
            surfaced as an UnboundLocalError on the return statement.
        """
        if self.model_type not in self._SUPPORTED_TYPES:
            raise ValueError(
                'model_type must be "cls" or "reg", got {!r}'.format(self.model_type)
            )

        # Only the predict call is timed; metric computation is excluded.
        start = time()
        pred = model.predict(features)
        end = time()
        latency = round((end - start)*1000, 1)  # seconds -> milliseconds

        if self.model_type == "cls":
            accuracy = round(accuracy_score(labels, pred), 3)
            precision = round(precision_score(labels, pred), 3)
            recall = round(recall_score(labels, pred), 3)
            optimal_scores = [name,accuracy,precision,recall,latency]
            summary = '{} -- Accuracy: {} / Precision: {} / Recall: {} / Latency: {}ms'.format(name,accuracy,precision,recall,latency)
            if not self.in_pipeline:
                print(summary)
            logger.info(summary)
            if self.in_pipeline:
                logger.info('>>> {} Modle Validation Results -- Accuracy: {} / Precision: {} / Recall: {} / Latency: {}ms'.format(name,accuracy,precision,recall,latency))
        else:
            raw_mse = metrics.mean_squared_error(labels, pred)
            R2 = round(metrics.r2_score(labels, pred),3)
            MAE = round(metrics.mean_absolute_error(labels, pred),3)
            MSE = round(raw_mse,3)
            # RMSE is the square root of the *unrounded* MSE; the earlier code
            # reported the MSE value under the RMSE label.
            RMSE = round(float(np.sqrt(raw_mse)),3)
            optimal_scores = [name,R2,MAE,MSE,RMSE,latency]
            summary = f'{name} -- R^2 Score: {R2} / Mean Absolute Error: {MAE} / Mean Squared Error: {MSE} / Root Mean Squared Error: {RMSE} / Latency: {latency}ms'
            if not self.in_pipeline:
                print(summary)
            logger.info(summary)
            if self.in_pipeline:
                logger.info(f'>>> {name} Model Validation Results -- R^2 Score: {R2} / Mean Absolute Error: {MAE} / Mean Squared Error: {MSE} / Root Mean Squared Error: {RMSE} / Latency: {latency}ms')

        return optimal_scores
class dynaClassifier:
    """This class implements classification model selection with hyperparameters grid search and cross-validation.

    Parameters
    ----------
    custom_estimators : list, default = None
        Custom set the estimators in the autoCV classification module(if set None, will use all available estimators).
        Current version's default available estimators are ['lgr','svm','mlp','rf','ada','gb','xgb'].

    random_state : int, default = 13
        Random state value.

    cv_num : int, default = 5
        # of folds for cross-validation.

    in_pipeline : bool, default = False
        Should be set to "True" when using autoPipe module to build Pipeline Cluster Traveral Experiments.

    input_from_file : bool, default = True
        When input dataset is df, needs to set "True"; Otherwise, i.e. array, needs to set "False".

    Example
    -------
    .. [Example]

    References
    ----------
    None
    """

    def __init__(self,custom_estimators = None, random_state = 13,cv_num = 5,in_pipeline = False, input_from_file = True):
        default_estimators = ['lgr','svm','mlp','rf','ada','gb','xgb']
        if custom_estimators is None:
            self.set_estimators = default_estimators
        else:
            self.set_estimators = custom_estimators
        self.random_state = random_state
        self.cv_num = cv_num
        self.input_from_file = input_from_file
        self.in_pipeline = in_pipeline
        # Maps estimator name -> fitted search object; filled by fit().
        self.DICT_EST = {}

    def fit(self,tr_features = None,tr_labels = None):
        """Fit and train datasets with classification hyperparameters GridSearch and CV across multiple estimators.
        Module will Auto save trained model as {estimator_name}_clf_model.pkl file to ./pkl folder.

        Parameters
        ----------
        tr_features : df, default = None
            Train features columns.
            ( NOTE: In the Pipeline Cluster Traversal Experiments, the features columns should be from the same pipeline dataset).

        tr_labels : df ,default = None
            Train label column.
            ( NOTE: In the Pipeline Cluster Traversal Experiments, the label column should be from the same pipeline dataset).

        Returns
        -------
        cv_num : int
            # of fold for cross-validation.

        DICT_EST : dictionary
            key is the name of estimators, value is the ralated trained model

        NOTE - Trained model auto save function only avalable when in_pipeline = "False".
        NOTE - Log records will generate and save to ./logs folder automatedly.
        NOTE - An estimator that fails to build or fit is reported and skipped; it does not abort the run.
        """
        # NOTE: this replaces warnings.warn for the whole process and is never
        # undone; kept as-is because callers may rely on the silence.
        warnings.warn = warn
        if self.input_from_file:
            tr_labels = tr_labels.values.ravel()
        clf = clf_cv(cv_val = self.cv_num,random_state = self.random_state)
        estimators = self.set_estimators
        total_loop = len(estimators)

        pkl_folder = None
        if not self.in_pipeline:
            pkl_folder = os.path.join(os.getcwd(),'pkl')
            os.makedirs(pkl_folder, exist_ok=True)

        for loop_num, est in enumerate(estimators, start=1):
            start_time = time()
            logger.info(f"Current Running:{est} estimator")
            try:
                # NOTE(review): the fit call is reconstructed from a damaged
                # listing; assumes each clf_cv method returns an unfitted
                # search object exposing fit() and best_estimator_ -- confirm
                # against dynapipe.estimatorCV.
                cv_est = getattr(clf, est)()
                cv_est.fit(tr_features,tr_labels)
                if not self.in_pipeline:
                    model_name = os.path.join(pkl_folder, f'{est}_clf_model.pkl')
                    joblib.dump(cv_est.best_estimator_, model_name)
                    time_est = round(((time()-start_time)/60)*(total_loop - loop_num),4)
                    update_progress(loop_num/total_loop, clear_flag = True, process_name = "Model Selection w/ Cross-validation",time_est= time_est)
                    print(f"\n *DynaPipe* autoCV Module ===> {est}_CrossValidation with {self.cv_num} folds:")
                print_results(cv_est,self.in_pipeline)
                self.DICT_EST[est] = cv_est
            except Exception:
                # Narrower than a bare except: Ctrl-C / SystemExit still
                # propagate, and the real cause is kept in the log file.
                logger.exception(f"{est} estimator failed")
                print(f"{est} estimator is not availible.")
                if not self.in_pipeline:
                    time_est = round(((time()-start_time)/60)*(total_loop - loop_num),4)
                    update_progress(loop_num/total_loop, clear_flag = True, process_name = "Model Selection w/ Cross-validation",time_est= time_est)
            logger.info(f"This estimator executed {round((time()-start_time)/60,4)} minutes")

        return (self.cv_num,self.DICT_EST)
class dynaRegressor:
    """This class implements regression model selection with hyperparameters grid search and cross-validation.
    Module will Auto save trained model as {estimator_name}_reg_model.pkl file to ./pkl folder.

    Parameters
    ----------
    custom_estimators : list, default = None
        Custom set the estimators in the autoCV regression module(if set None, will use all available estimators).
        Current version's default available estimators are ['lr','knn','tree','svm','mlp','rf','gb','ada','xgb','hgboost','huber','rgcv','cvlasso','sgd'].

    random_state : int, default = 25
        Random state value.

    cv_num : int, default = 5
        # of folds for cross-validation.

    in_pipeline : bool, default = False
        Should be set to "True" when using autoPipe module to build Pipeline Cluster Traveral Experiments.

    input_from_file : bool, default = True
        When input dataset is df, needs to set "True"; Otherwise, i.e. array, needs to set "False".

    Example
    -------
    .. [Example]

    References
    ----------
    None
    """

    def __init__(self ,custom_estimators = None, random_state = 25 ,cv_num = 5,in_pipeline = False, input_from_file = True):
        default_estimators = ['lr','knn','tree','svm','mlp','rf','gb','ada','xgb','hgboost','huber','rgcv','cvlasso','sgd']
        if custom_estimators is None:
            self.set_estimators = default_estimators
        else:
            self.set_estimators = custom_estimators
        self.random_state = random_state
        self.cv_num = cv_num
        self.input_from_file = input_from_file
        self.in_pipeline = in_pipeline
        # Maps estimator name -> fitted search object; filled by fit().
        self.DICT_EST = {}

    def fit(self,tr_features = None,tr_labels = None):
        """Fit and train datasets with regression hyperparameters GridSearch and CV across multiple estimators.

        Parameters
        ----------
        tr_features : df, default = None
            Train features columns.
            ( NOTE: In the Pipeline Cluster Traversal Experiments, the features columns should be from the same pipeline dataset).

        tr_labels : df ,default = None
            Train label column.
            ( NOTE: In the Pipeline Cluster Traversal Experiments, the label column should be from the same pipeline dataset).

        Returns
        -------
        cv_num : int
            # of fold for cross-validation.

        DICT_EST : dictionary
            key is the name of estimators, value is the ralated trained model.

        NOTE - Trained model auto save function only avalable when in_pipeline = "False".
        NOTE - Log records will generate and save to ./logs folder automatedly.
        NOTE - An estimator that fails to build or fit is reported and skipped; it does not abort the run.
        """
        if self.input_from_file:
            tr_labels = tr_labels.values.ravel()
        reg = reg_cv(cv_val = self.cv_num,random_state = self.random_state)
        estimators = self.set_estimators
        total_loop = len(estimators)

        pkl_folder = None
        if not self.in_pipeline:
            pkl_folder = os.path.join(os.getcwd(),'pkl')
            os.makedirs(pkl_folder, exist_ok=True)

        for loop_num, est in enumerate(estimators, start=1):
            start_time = time()
            logger.info(f"Current Running:{est} estimator")
            try:
                # NOTE(review): the fit call is reconstructed from a damaged
                # listing; assumes each reg_cv method returns an unfitted
                # search object exposing fit() and best_estimator_ -- confirm
                # against dynapipe.estimatorCV.
                cv_est = getattr(reg, est)()
                cv_est.fit(tr_features,tr_labels)
                if not self.in_pipeline:
                    model_name = os.path.join(pkl_folder, f'{est}_reg_model.pkl')
                    joblib.dump(cv_est.best_estimator_, model_name)
                    time_est = round(((time()-start_time)/60)*(total_loop - loop_num),4)
                    update_progress(loop_num/total_loop, clear_flag = False, process_name = "Model Selection w/ Cross-validation",time_est= time_est)
                    print(f"\n *DynaPipe* autoCV Module ===> {est} model CrossValidation with {self.cv_num} folds:")
                print_results(cv_est,self.in_pipeline)
                self.DICT_EST[est] = cv_est
            except Exception:
                # Narrower than a bare except: Ctrl-C / SystemExit still
                # propagate, and the real cause is kept in the log file.
                logger.exception(f"{est} estimator failed")
                print(f"{est} estimator is not availible.")
                if not self.in_pipeline:
                    time_est = round(((time()-start_time)/60)*(total_loop - loop_num),4)
                    update_progress(loop_num/total_loop, clear_flag = True, process_name = "Model Selection w/ Cross-validation",time_est= time_est)
            logger.info(f"This estimator executed {round((time()-start_time)/60,4)} minutes")

        return (self.cv_num,self.DICT_EST)