Module hipe4ml.model_handler
Module containing the class used for wrapping the models from different ML libraries to build a new model with common methods
Classes
class ModelHandler(input_model=None,
                   training_columns=None,
                   model_params=None,
                   task_type='classification')
class ModelHandler:
    """
    Class used for wrapping the models from different ML libraries to
    build a new model with common methods. Currently LightGBM, XGBoost
    (through their sklearn interface) and sklearn models are supported.

    Parameters
    -------------------------------------------------
    input_model: XGBoost, LightGBM or sklearn model
    training_columns: list
        Contains the name of the features used for the training.
        Example: ['dEdx', 'pT', 'ct']
    model_params: dict
        Model hyper-parameter values. For example (XGBoost):
        max_depth, learning_rate, n_estimators, gamma, min_child_weight, ...
    task_type: str
        Task type of the model: 'classification' or 'regression'
    """

    def __init__(self, input_model=None, training_columns=None,
                 model_params=None, task_type='classification'):
        self.model = input_model
        self.training_columns = training_columns
        self.model_params = model_params
        self._n_classes = None
        self.optuna_cross_val_score = None
        self._task_type = task_type
        if self._task_type not in ['classification', 'regression']:
            raise ValueError(
                "Task type must be either 'classification' or 'regression'")
        if self.model is not None:
            self.model_string = inspect.getmodule(
                self.model).__name__.partition('.')[0]
            if self.model_string not in ["xgboost", "lightgbm", "sklearn"]:
                raise ValueError(
                    "Model must be either 'xgboost', 'lightgbm' or 'sklearn'")
            if self.model_params is None:
                self.model_params = self.model.get_params()
            else:
                self.model.set_params(**self.model_params)

    def set_model_params(self, model_params):
        """
        Set the model (hyper-)parameters

        Parameters
        ------------------------------------
        model_params: dict
            Model hyper-parameter values. For example (XGBoost):
            max_depth, learning_rate, n_estimators, gamma, min_child_weight, ...
        """
        self.model_params = model_params
        self.model.set_params(**self.model_params)

    def get_model_params(self):
        """
        Get the model (hyper-)parameters

        Returns
        ------------------------------------
        out: dict
            Model hyper-parameter values. For example (XGBoost):
            max_depth, learning_rate, n_estimators, gamma, min_child_weight, ...
        """
        return self.model.get_params()

    def set_training_columns(self, training_columns):
        """
        Set the features used for the training process

        Parameters
        ------------------------------------
        training_columns: list
            Contains the name of the features used for the training.
            Example: ['dEdx', 'pT', 'ct']
        """
        self.training_columns = training_columns

    def get_training_columns(self):
        """
        Get the features used for the training process

        Returns
        ------------------------------------
        out: list
            Names of the features used for the training.
            Example: ['dEdx', 'pT', 'ct']
        """
        return self.training_columns

    def get_original_model(self):
        """
        Get the original unwrapped model

        Returns
        ---------------------------
        out: XGBoost, LGBM or sklearn model
        """
        return self.model

    def get_model_module(self):
        """
        Get the string containing the name of the model module

        Returns
        ---------------------------
        out: str
            Name of the model module
        """
        return self.model_string

    def get_n_classes(self):
        """
        Get the number of classes

        Returns
        ---------------------------
        out: int
            Number of classes
        """
        return self._n_classes

    def get_task_type(self):
        """
        Get the task type of the model

        Returns
        ---------------------------
        out: str
            Task type of the model: 'classification' or 'regression'
        """
        return self._task_type

    def fit(self, x_train, y_train, **kwargs):
        """
        Fit the model

        Parameters
        ---------------------------
        x_train: array-like, sparse matrix
            Training data
        y_train: array-like, sparse matrix
            Target data
        **kwargs:
            Extra kwargs passed on to the model.fit() method
        """
        if self._task_type == 'classification':
            n_classes = len(np.unique(y_train))
            self._n_classes = n_classes
        if self.training_columns is None:
            self.training_columns = list(x_train.columns)
        self.model.fit(x_train[self.training_columns], y_train, **kwargs)

    def predict(self, x_test, output_margin=True, **kwargs):
        """
        Return model prediction for the array x_test

        Parameters
        --------------------------------------
        x_test: hipe4ml tree_handler, array-like, sparse matrix
            The input sample.
        output_margin: bool
            Whether to output the raw untransformed margin value. If False,
            model probabilities are returned. Not used when task_type is
            'regression'.
        **kwargs:
            Extra kwargs passed on to the underlying prediction function:
            if task_type == 'classification':
                - predict() (XGBoost and LGBM) or decision_function() (sklearn)
                  if output_margin==True
                - predict_proba() if output_margin==False
            if task_type == 'regression':
                - predict()

        Returns
        ---------------------------------------
        out: numpy array
            Model predictions
        """
        if isinstance(x_test, hipe4ml.tree_handler.TreeHandler):
            x_test = x_test.get_data_frame()
        x_test = x_test[self.training_columns]
        # regression
        if self._task_type == 'regression':
            return self.model.predict(x_test, **kwargs)
        # classification
        if output_margin:
            if self.model_string == 'xgboost':
                return self.model.predict(x_test, output_margin=True, **kwargs)
            if self.model_string == 'lightgbm':
                return self.model.predict(x_test, raw_score=True, **kwargs)
            if self.model_string == 'sklearn':
                if not hasattr(self.model, 'decision_function'):
                    raise ValueError(
                        "This Model does not support a decision_function(): use output_margin=False")
                return self.model.decision_function(x_test, **kwargs).ravel()
        pred = self.model.predict_proba(x_test, **kwargs)
        # in case of binary classification return only the scores of
        # the signal class
        if pred.shape[1] <= 2:
            pred = pred[:, 1]
        return pred

    def train_test_model(self, data, return_prediction=False, output_margin=False,
                         average='macro', multi_class_opt='raise', **kwargs):
        """
        Perform the training and the testing of the model. The model
        performance is estimated using the ROC AUC metric for classification
        and the MSE for regression.

        Parameters
        ----------------------------------------------
        data: list
            Contains respectively: training set dataframe, training label array,
            test set dataframe, test label array
        return_prediction: bool
            If True, model predictions on the test set are returned
        output_margin: bool
            Whether to output the raw untransformed margin value. If False,
            model probabilities are returned. Not used when task_type is
            'regression'.
        average: string
            Option for the average of ROC AUC scores used only in case of
            multi-classification. You can choose between 'macro' and 'weighted'.
            For more information see
            https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
        multi_class_opt: string
            Option to compute ROC AUC scores used only in case of
            multi-classification. The one-vs-one 'ovo' and one-vs-rest 'ovr'
            approaches are available
        **kwargs: dict
            Extra kwargs passed on to the model fit method

        Returns
        ---------------------------------------
        out: numpy array or None
            If return_prediction==True, model predictions on the test set
            are returned
        """
        # get number of classes
        n_classes = len(np.unique(data[1]))
        self._n_classes = n_classes
        print('==============================')
        print(f"Training {self.model_string} model for {self._task_type}")
        if self._task_type == 'classification':
            print('Number of detected classes:', n_classes)
        # final training with the optimized hyperparams
        print('Training the model: ...')
        self.fit(data[0], data[1], **kwargs)
        print('Training the model: Done!')
        print('Testing the model: ...')
        y_pred = self.predict(data[2], output_margin=output_margin)
        if self._task_type == 'classification':
            roc_score = roc_auc_score(
                data[3], y_pred, average=average, multi_class=multi_class_opt)
            print(f'ROC_AUC_score: {roc_score:.6f}')
        else:
            mse_score = mean_squared_error(data[3], y_pred)
            print(f'Mean squared error: {mse_score:.6f}')
        print('Testing the model: Done!')
        print('==============================')
        if return_prediction:
            return y_pred
        return None

    def optimize_params_optuna(self, data, hyperparams_ranges, cross_val_scoring,
                               nfold=5, direction='maximize', optuna_sampler=None,
                               resume_study=None, save_study=None, **kwargs):
        """
        Perform hyperparameter optimization of ModelHandler using the Optuna
        module. The model hyperparameters are automatically set as the ones
        that provided the best result during the optimization.

        Parameters
        ------------------------------------------------------
        data: list
            Contains respectively: training set dataframe, training label array,
            test set dataframe, test label array
        hyperparams_ranges: dict
            Hyperparameter ranges (in tuples or lists). If a parameter is not
            in a tuple or a list it will be considered constant. Important:
            the type of the params must be preserved when passing the ranges.
            For example:
            {'max_depth': (10, 100),
             'learning_rate': (0.01, 0.03),
             'n_jobs': 8}
        cross_val_scoring: string, callable or None
            Scoring metric used for the cross-validation. A string (see the
            sklearn model evaluation documentation:
            https://scikit-learn.org/stable/modules/model_evaluation.html)
            or a scorer callable object / function with signature
            scorer(estimator, X, y) which should return only a single value.
            In binary classification 'roc_auc' is suggested. In
            multi-classification one between 'roc_auc_ovr', 'roc_auc_ovo',
            'roc_auc_ovr_weighted' and 'roc_auc_ovo_weighted' is suggested.
            For more information see
            https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
        nfold: int
            Number of folds used to calculate the cross-validation error
        direction: str
            The direction of optimization. Either 'maximize' or 'minimize'
            (e.g. for the metric 'roc_auc' the direction is 'maximize').
        optuna_sampler: optuna.samplers.BaseSampler
            Sampler to be used for the Optuna optimization. If None, the
            default TPESampler is used. For more information see:
            https://optuna.readthedocs.io/en/stable/reference/samplers.html
        resume_study: str
            Filename of the study to be resumed. If None, the study is not
            resumed.
        save_study: str
            Filename in which the study is saved. If None, the study is not
            saved into a file.
        **kwargs: dict
            Optuna study parameters

        Returns
        ------------------------------------------------------
        study: optuna.study.Study
            The Optuna object which stores the whole study. See Optuna's
            documentation for more details:
            https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study
        """
        n_classes = len(np.unique(data[1]))
        self._n_classes = n_classes
        if self.training_columns is None:
            self.training_columns = list(data[0].columns)
        x_train, y_train, _, _ = data

        def __get_int_or_uniform(hyperparams_ranges, trial):
            params = {}
            for key in hyperparams_ranges:
                if isinstance(hyperparams_ranges[key][0], int):
                    params[key] = trial.suggest_int(
                        key, hyperparams_ranges[key][0], hyperparams_ranges[key][1])
                elif isinstance(hyperparams_ranges[key][0], float):
                    params[key] = trial.suggest_uniform(
                        key, hyperparams_ranges[key][0], hyperparams_ranges[key][1])
            return params

        def __objective(trial):
            params = __get_int_or_uniform(hyperparams_ranges, trial)
            model_copy = deepcopy(self.model)
            model_copy.set_params(**{**self.model_params, **params})
            return np.mean(cross_val_score(model_copy, x_train[self.training_columns],
                                           y_train, cv=nfold,
                                           scoring=cross_val_scoring, n_jobs=1))

        if resume_study:
            with open(resume_study, 'rb') as resume_study_file:
                study = pickle.load(resume_study_file)
        else:
            study = optuna.create_study(
                direction=direction, sampler=optuna_sampler)
        study.optimize(__objective, **kwargs)
        if save_study:
            with open(save_study, 'wb') as study_file:
                pickle.dump(study, study_file)
        print(f"Number of finished trials: {len(study.trials)}")
        print("Best trial:")
        best_trial = study.best_trial
        print(f"Value: {best_trial.value}")
        print("Params: ")
        for key, value in best_trial.params.items():
            print(f"    {key}: {value}")
        self.set_model_params({**self.model_params, **best_trial.params})
        self.optuna_cross_val_score = best_trial.value
        return study

    def dump_original_model(self, filename, xgb_format=False):
        """
        Save the trained model into a pickle file. Only for xgboost models
        it is also possible to save them into a .model file.

        Parameters
        -----------------------------------------------------
        filename: str
            Name of the file in which the model is saved
        xgb_format: bool
            If True, saves the xgboost model into a .model file
        """
        if xgb_format is False:
            with open(filename, "wb") as output_file:
                pickle.dump(self.model, output_file)
        else:
            if self.model_string == 'xgboost':
                self.model.save_model(filename)
            else:
                print("File not saved: only xgboost models support the .model extension")

    def dump_model_handler(self, filename):
        """
        Save the model handler into a pickle file

        Parameters
        -----------------------------------------------------
        filename: str
            Name of the file in which the model is saved
        """
        with open(filename, "wb") as output_file:
            pickle.dump(self, output_file)

    def load_model_handler(self, filename):
        """
        Load a model handler saved into a pickle file

        Parameters
        -----------------------------------------------------
        filename: str
            Name of the file in which the model is saved
        """
        with open(filename, "rb") as input_file:
            loaded_model = pickle.load(input_file)
            self.model = loaded_model.get_original_model()
            self.training_columns = loaded_model.get_training_columns()
            self.model_params = loaded_model.get_model_params()
            self.model.set_params(**self.model_params)
            self.model_string = loaded_model.get_model_module()
            self._n_classes = loaded_model.get_n_classes()
            self._task_type = loaded_model.get_task_type()

Class used for wrapping the models from different ML libraries to build a new model with common methods. Currently LightGBM, XGBoost (through their sklearn interface) and sklearn models are supported.
Parameters
input_model: XGBoost, LightGBM or sklearn model
training_columns: list - Contains the name of the features used for the training. Example: ['dEdx', 'pT', 'ct']
model_params: dict - Model hyper-parameter values. For example (XGBoost): max_depth, learning_rate, n_estimators, gamma, min_child_weight, ...
task_type: str - Task type of the model: 'classification' or 'regression'
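A minimal construction sketch (hypothetical names and hyper-parameter values; assumes xgboost and hipe4ml are installed):

from xgboost import XGBClassifier
from hipe4ml.model_handler import ModelHandler

# wrap an XGBoost classifier; the feature names and params are illustrative
model_hdl = ModelHandler(XGBClassifier(),
                         training_columns=['dEdx', 'pT', 'ct'],
                         model_params={'max_depth': 4, 'n_estimators': 200})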
Methods
def dump_model_handler(self, filename)
def dump_model_handler(self, filename):
    """
    Save the model handler into a pickle file

    Parameters
    -----------------------------------------------------
    filename: str
        Name of the file in which the model is saved
    """
    with open(filename, "wb") as output_file:
        pickle.dump(self, output_file)

Save the model handler into a pickle file.
Parameters
filename: str - Name of the file in which the model is saved
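For example, to pickle the whole handler (hypothetical filename):

model_hdl.dump_model_handler('model_handler.pkl')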
def dump_original_model(self, filename, xgb_format=False)
def dump_original_model(self, filename, xgb_format=False):
    """
    Save the trained model into a pickle file. Only for xgboost models
    it is also possible to save them into a .model file.

    Parameters
    -----------------------------------------------------
    filename: str
        Name of the file in which the model is saved
    xgb_format: bool
        If True, saves the xgboost model into a .model file
    """
    if xgb_format is False:
        with open(filename, "wb") as output_file:
            pickle.dump(self.model, output_file)
    else:
        if self.model_string == 'xgboost':
            self.model.save_model(filename)
        else:
            print("File not saved: only xgboost models support the .model extension")

Save the trained model into a pickle file. Only for xgboost models it is also possible to save them into a .model file.
Parameters
filename: str - Name of the file in which the model is saved
xgb_format: bool - If True, saves the xgboost model into a .model file
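A short sketch of both saving modes (hypothetical filenames; the second call only applies to xgboost models):

model_hdl.dump_original_model('model.pkl')                       # pickle dump, any supported model
model_hdl.dump_original_model('model.model', xgb_format=True)    # native xgboost format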
def fit(self, x_train, y_train, **kwargs)
def fit(self, x_train, y_train, **kwargs):
    """
    Fit the model

    Parameters
    ---------------------------
    x_train: array-like, sparse matrix
        Training data
    y_train: array-like, sparse matrix
        Target data
    **kwargs:
        Extra kwargs passed on to the model.fit() method
    """
    if self._task_type == 'classification':
        n_classes = len(np.unique(y_train))
        self._n_classes = n_classes
    if self.training_columns is None:
        self.training_columns = list(x_train.columns)
    self.model.fit(x_train[self.training_columns], y_train, **kwargs)

Fit the model.
Parameters
x_train: array-like, sparse matrix - Training data
y_train: array-like, sparse matrix - Target data
**kwargs: Extra kwargs passed on to the model.fit() method
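Note that fit() indexes x_train with the training columns, so in practice a pandas DataFrame is expected. A minimal sketch with hypothetical toy data:

import numpy as np
import pandas as pd

# toy training sample using the illustrative feature names from above
x_train = pd.DataFrame(np.random.rand(100, 3), columns=['dEdx', 'pT', 'ct'])
y_train = np.random.randint(0, 2, size=100)   # binary labels

model_hdl.fit(x_train, y_train)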
def get_model_module(self)
def get_model_module(self):
    """
    Get the string containing the name of the model module

    Returns
    ---------------------------
    out: str
        Name of the model module
    """
    return self.model_string

Get the string containing the name of the model module.
Returns
out: str - Name of the model module
def get_model_params(self)
def get_model_params(self):
    """
    Get the model (hyper-)parameters

    Returns
    ------------------------------------
    out: dict
        Model hyper-parameter values. For example (XGBoost):
        max_depth, learning_rate, n_estimators, gamma, min_child_weight, ...
    """
    return self.model.get_params()

Get the model (hyper-)parameters.
Returns
out: dict - Model hyper-parameter values. For example (XGBoost): max_depth, learning_rate, n_estimators, gamma, min_child_weight, ...
def get_n_classes(self)
def get_n_classes(self):
    """
    Get the number of classes

    Returns
    ---------------------------
    out: int
        Number of classes
    """
    return self._n_classes

Get the number of classes.
Returns
out: int - Number of classes
def get_original_model(self)
def get_original_model(self):
    """
    Get the original unwrapped model

    Returns
    ---------------------------
    out: XGBoost, LGBM or sklearn model
    """
    return self.model

Get the original unwrapped model.
Returns
out: XGBoost, LGBM or sklearn model
def get_task_type(self)
def get_task_type(self):
    """
    Get the task type of the model

    Returns
    ---------------------------
    out: str
        Task type of the model: 'classification' or 'regression'
    """
    return self._task_type

Get the task type of the model.
Returns
out: str - Task type of the model: 'classification' or 'regression'
def get_training_columns(self)
def get_training_columns(self):
    """
    Get the features used for the training process

    Returns
    ------------------------------------
    out: list
        Names of the features used for the training.
        Example: ['dEdx', 'pT', 'ct']
    """
    return self.training_columns

Get the features used for the training process.
Returns
out: list - Names of the features used for the training. Example: ['dEdx', 'pT', 'ct']
def load_model_handler(self, filename)
def load_model_handler(self, filename):
    """
    Load a model handler saved into a pickle file

    Parameters
    -----------------------------------------------------
    filename: str
        Name of the file in which the model is saved
    """
    with open(filename, "rb") as input_file:
        loaded_model = pickle.load(input_file)
        self.model = loaded_model.get_original_model()
        self.training_columns = loaded_model.get_training_columns()
        self.model_params = loaded_model.get_model_params()
        self.model.set_params(**self.model_params)
        self.model_string = loaded_model.get_model_module()
        self._n_classes = loaded_model.get_n_classes()
        self._task_type = loaded_model.get_task_type()

Load a model handler saved into a pickle file.
Parameters
filename: str - Name of the file in which the model is saved
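Since load_model_handler() fills the attributes of an existing instance, the usual pattern is to start from an empty handler (hypothetical filename):

model_hdl = ModelHandler()                          # empty handler
model_hdl.load_model_handler('model_handler.pkl')   # restores model, columns, params, ...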
def optimize_params_optuna(self, data, hyperparams_ranges, cross_val_scoring,
                           nfold=5, direction='maximize', optuna_sampler=None,
                           resume_study=None, save_study=None, **kwargs)
def optimize_params_optuna(self, data, hyperparams_ranges, cross_val_scoring,
                           nfold=5, direction='maximize', optuna_sampler=None,
                           resume_study=None, save_study=None, **kwargs):
    """
    Perform hyperparameter optimization of ModelHandler using the Optuna
    module. The model hyperparameters are automatically set as the ones
    that provided the best result during the optimization.

    Parameters
    ------------------------------------------------------
    data: list
        Contains respectively: training set dataframe, training label array,
        test set dataframe, test label array
    hyperparams_ranges: dict
        Hyperparameter ranges (in tuples or lists). If a parameter is not
        in a tuple or a list it will be considered constant. Important:
        the type of the params must be preserved when passing the ranges.
        For example:
        {'max_depth': (10, 100),
         'learning_rate': (0.01, 0.03),
         'n_jobs': 8}
    cross_val_scoring: string, callable or None
        Scoring metric used for the cross-validation. A string (see the
        sklearn model evaluation documentation:
        https://scikit-learn.org/stable/modules/model_evaluation.html)
        or a scorer callable object / function with signature
        scorer(estimator, X, y) which should return only a single value.
        In binary classification 'roc_auc' is suggested. In
        multi-classification one between 'roc_auc_ovr', 'roc_auc_ovo',
        'roc_auc_ovr_weighted' and 'roc_auc_ovo_weighted' is suggested.
        For more information see
        https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    nfold: int
        Number of folds used to calculate the cross-validation error
    direction: str
        The direction of optimization. Either 'maximize' or 'minimize'
        (e.g. for the metric 'roc_auc' the direction is 'maximize').
    optuna_sampler: optuna.samplers.BaseSampler
        Sampler to be used for the Optuna optimization. If None, the
        default TPESampler is used. For more information see:
        https://optuna.readthedocs.io/en/stable/reference/samplers.html
    resume_study: str
        Filename of the study to be resumed. If None, the study is not
        resumed.
    save_study: str
        Filename in which the study is saved. If None, the study is not
        saved into a file.
    **kwargs: dict
        Optuna study parameters

    Returns
    ------------------------------------------------------
    study: optuna.study.Study
        The Optuna object which stores the whole study. See Optuna's
        documentation for more details:
        https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study
    """
    n_classes = len(np.unique(data[1]))
    self._n_classes = n_classes
    if self.training_columns is None:
        self.training_columns = list(data[0].columns)
    x_train, y_train, _, _ = data

    def __get_int_or_uniform(hyperparams_ranges, trial):
        params = {}
        for key in hyperparams_ranges:
            if isinstance(hyperparams_ranges[key][0], int):
                params[key] = trial.suggest_int(
                    key, hyperparams_ranges[key][0], hyperparams_ranges[key][1])
            elif isinstance(hyperparams_ranges[key][0], float):
                params[key] = trial.suggest_uniform(
                    key, hyperparams_ranges[key][0], hyperparams_ranges[key][1])
        return params

    def __objective(trial):
        params = __get_int_or_uniform(hyperparams_ranges, trial)
        model_copy = deepcopy(self.model)
        model_copy.set_params(**{**self.model_params, **params})
        return np.mean(cross_val_score(model_copy, x_train[self.training_columns],
                                       y_train, cv=nfold,
                                       scoring=cross_val_scoring, n_jobs=1))

    if resume_study:
        with open(resume_study, 'rb') as resume_study_file:
            study = pickle.load(resume_study_file)
    else:
        study = optuna.create_study(
            direction=direction, sampler=optuna_sampler)
    study.optimize(__objective, **kwargs)
    if save_study:
        with open(save_study, 'wb') as study_file:
            pickle.dump(study, study_file)
    print(f"Number of finished trials: {len(study.trials)}")
    print("Best trial:")
    best_trial = study.best_trial
    print(f"Value: {best_trial.value}")
    print("Params: ")
    for key, value in best_trial.params.items():
        print(f"    {key}: {value}")
    self.set_model_params({**self.model_params, **best_trial.params})
    self.optuna_cross_val_score = best_trial.value
    return study

Perform hyperparameter optimization of ModelHandler using the Optuna module. The model hyperparameters are automatically set as the ones that provided the best result during the optimization.
Parameters
data: list - Contains respectively: training set dataframe, training label array, test set dataframe, test label array
hyperparams_ranges: dict - Hyperparameter ranges (in tuples or lists). If a parameter is not in a tuple or a list it will be considered constant. Important: the type of the params must be preserved when passing the ranges. For example: {'max_depth': (10, 100), 'learning_rate': (0.01, 0.03), 'n_jobs': 8}
cross_val_scoring: string, callable or None - Scoring metric used for the cross-validation. A string (see the sklearn model evaluation documentation: https://scikit-learn.org/stable/modules/model_evaluation.html) or a scorer callable object / function with signature scorer(estimator, X, y) which should return only a single value. In binary classification 'roc_auc' is suggested. In multi-classification one between 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted' and 'roc_auc_ovo_weighted' is suggested. For more information see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
nfold: int - Number of folds used to calculate the cross-validation error
direction: str - The direction of optimization. Either 'maximize' or 'minimize' (e.g. for the metric 'roc_auc' the direction is 'maximize').
optuna_sampler: optuna.samplers.BaseSampler - Sampler to be used for the Optuna optimization. If None, the default TPESampler is used. For more information see: https://optuna.readthedocs.io/en/stable/reference/samplers.html
resume_study: str - Filename of the study to be resumed. If None, the study is not resumed.
save_study: str - Filename in which the study is saved. If None, the study is not saved into a file.
**kwargs: dict - Optuna study parameters
Returns
study: optuna.study.Study - The Optuna object which stores the whole study. See Optuna's documentation for more details: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study
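A usage sketch with hypothetical data (n_trials is forwarded to Optuna's study.optimize()):

# train_test_data = [x_train, y_train, x_test, y_test]
hyperparams_ranges = {'max_depth': (2, 10),          # int bounds -> suggest_int
                      'learning_rate': (0.01, 0.3)}  # float bounds -> suggest_uniform
study = model_hdl.optimize_params_optuna(train_test_data, hyperparams_ranges,
                                         'roc_auc', nfold=5,
                                         direction='maximize', n_trials=50)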
def predict(self, x_test, output_margin=True, **kwargs)
def predict(self, x_test, output_margin=True, **kwargs):
    """
    Return model prediction for the array x_test

    Parameters
    --------------------------------------
    x_test: hipe4ml tree_handler, array-like, sparse matrix
        The input sample.
    output_margin: bool
        Whether to output the raw untransformed margin value. If False,
        model probabilities are returned. Not used when task_type is
        'regression'.
    **kwargs:
        Extra kwargs passed on to the underlying prediction function:
        if task_type == 'classification':
            - predict() (XGBoost and LGBM) or decision_function() (sklearn)
              if output_margin==True
            - predict_proba() if output_margin==False
        if task_type == 'regression':
            - predict()

    Returns
    ---------------------------------------
    out: numpy array
        Model predictions
    """
    if isinstance(x_test, hipe4ml.tree_handler.TreeHandler):
        x_test = x_test.get_data_frame()
    x_test = x_test[self.training_columns]
    # regression
    if self._task_type == 'regression':
        return self.model.predict(x_test, **kwargs)
    # classification
    if output_margin:
        if self.model_string == 'xgboost':
            return self.model.predict(x_test, output_margin=True, **kwargs)
        if self.model_string == 'lightgbm':
            return self.model.predict(x_test, raw_score=True, **kwargs)
        if self.model_string == 'sklearn':
            if not hasattr(self.model, 'decision_function'):
                raise ValueError(
                    "This Model does not support a decision_function(): use output_margin=False")
            return self.model.decision_function(x_test, **kwargs).ravel()
    pred = self.model.predict_proba(x_test, **kwargs)
    # in case of binary classification return only the scores of
    # the signal class
    if pred.shape[1] <= 2:
        pred = pred[:, 1]
    return pred

Return model prediction for the array x_test.

Parameters
x_test: hipe4ml tree_handler, array-like, sparse matrix - The input sample.
output_margin: bool - Whether to output the raw untransformed margin value. If False, model probabilities are returned. Not used when task_type is 'regression'.
**kwargs: Extra kwargs passed on to the underlying prediction function: for task_type == 'classification', predict() (XGBoost and LGBM) or decision_function() (sklearn) if output_margin==True, and predict_proba() if output_margin==False; for task_type == 'regression', predict().
Returns
out: numpy array - Model predictions
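For instance, for a binary classifier (hypothetical x_test, e.g. a DataFrame or a TreeHandler):

raw_scores = model_hdl.predict(x_test, output_margin=True)   # raw margin / decision values
proba = model_hdl.predict(x_test, output_margin=False)       # probabilities of the signal class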
def set_model_params(self, model_params)
def set_model_params(self, model_params):
    """
    Set the model (hyper-)parameters

    Parameters
    ------------------------------------
    model_params: dict
        Model hyper-parameter values. For example (XGBoost):
        max_depth, learning_rate, n_estimators, gamma, min_child_weight, ...
    """
    self.model_params = model_params
    self.model.set_params(**self.model_params)

Set the model (hyper-)parameters.
Parameters
model_params: dict - Model hyper-parameter values. For example (XGBoost): max_depth, learning_rate, n_estimators, gamma, min_child_weight, ...
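For example (illustrative values):

model_hdl.set_model_params({'max_depth': 6, 'learning_rate': 0.05})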
def set_training_columns(self, training_columns)
def set_training_columns(self, training_columns):
    """
    Set the features used for the training process

    Parameters
    ------------------------------------
    training_columns: list
        Contains the name of the features used for the training.
        Example: ['dEdx', 'pT', 'ct']
    """
    self.training_columns = training_columns

Set the features used for the training process.
Parameters
training_columns: list - Contains the name of the features used for the training. Example: ['dEdx', 'pT', 'ct']
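For example (illustrative feature names):

model_hdl.set_training_columns(['dEdx', 'pT'])   # restrict the training to two features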
def train_test_model(self, data, return_prediction=False, output_margin=False,
                     average='macro', multi_class_opt='raise', **kwargs)
def train_test_model(self, data, return_prediction=False, output_margin=False,
                     average='macro', multi_class_opt='raise', **kwargs):
    """
    Perform the training and the testing of the model. The model
    performance is estimated using the ROC AUC metric for classification
    and the MSE for regression.

    Parameters
    ----------------------------------------------
    data: list
        Contains respectively: training set dataframe, training label array,
        test set dataframe, test label array
    return_prediction: bool
        If True, model predictions on the test set are returned
    output_margin: bool
        Whether to output the raw untransformed margin value. If False,
        model probabilities are returned. Not used when task_type is
        'regression'.
    average: string
        Option for the average of ROC AUC scores used only in case of
        multi-classification. You can choose between 'macro' and 'weighted'.
        For more information see
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
    multi_class_opt: string
        Option to compute ROC AUC scores used only in case of
        multi-classification. The one-vs-one 'ovo' and one-vs-rest 'ovr'
        approaches are available
    **kwargs: dict
        Extra kwargs passed on to the model fit method

    Returns
    ---------------------------------------
    out: numpy array or None
        If return_prediction==True, model predictions on the test set
        are returned
    """
    # get number of classes
    n_classes = len(np.unique(data[1]))
    self._n_classes = n_classes
    print('==============================')
    print(f"Training {self.model_string} model for {self._task_type}")
    if self._task_type == 'classification':
        print('Number of detected classes:', n_classes)
    # final training with the optimized hyperparams
    print('Training the model: ...')
    self.fit(data[0], data[1], **kwargs)
    print('Training the model: Done!')
    print('Testing the model: ...')
    y_pred = self.predict(data[2], output_margin=output_margin)
    if self._task_type == 'classification':
        roc_score = roc_auc_score(
            data[3], y_pred, average=average, multi_class=multi_class_opt)
        print(f'ROC_AUC_score: {roc_score:.6f}')
    else:
        mse_score = mean_squared_error(data[3], y_pred)
        print(f'Mean squared error: {mse_score:.6f}')
    print('Testing the model: Done!')
    print('==============================')
    if return_prediction:
        return y_pred
    return None

Perform the training and the testing of the model. The model performance is estimated using the ROC AUC metric for classification and the MSE for regression.
Parameters
data: list - Contains respectively: training set dataframe, training label array, test set dataframe, test label array
return_prediction: bool - If True, model predictions on the test set are returned
output_margin: bool - Whether to output the raw untransformed margin value. If False, model probabilities are returned. Not used when task_type is 'regression'.
average: string - Option for the average of ROC AUC scores used only in case of multi-classification. You can choose between 'macro' and 'weighted'. For more information see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
multi_class_opt: string - Option to compute ROC AUC scores used only in case of multi-classification. The one-vs-one 'ovo' and one-vs-rest 'ovr' approaches are available.
**kwargs: dict - Extra kwargs passed on to the model fit method
Returns
out: numpy array or None - If return_prediction==True, model predictions on the test set are returned
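A usage sketch with the hypothetical data list used above:

# train_test_data = [x_train, y_train, x_test, y_test]
y_pred = model_hdl.train_test_model(train_test_data, return_prediction=True,
                                    output_margin=False)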