Module hipe4ml.model_handler
Module containing the class used for wrapping the models from different ML libraries to build a new model with common methods
Classes
class ModelHandler (input_model=None,
training_columns=None,
model_params=None,
task_type='classification')-
Expand source code
class ModelHandler:
    """
    Class used for wrapping the models from different ML libraries to build a new model
    with common methods. Currently LightGBM, XGBoost (through their sklearn interface)
    and sklearn models are supported.

    Parameters
    -------------------------------------------------
    input_model: XGBoost, LightGBM or sklearn model

    training_columns: list
        Contains the name of the features used for the training.
        Example: ['dEdx', 'pT', 'ct']

    model_params: dict
        Model hyper-parameter values. For example (XGBoost): max_depth,
        learning_rate, n_estimators, gamma, min_child_weight, ...

    task_type: str
        Task type of the model: 'classification' or 'regression'

    Raises
    -------------------------------------------------
    ValueError
        If task_type is not 'classification' or 'regression', or if the
        top-level module of input_model is not xgboost, lightgbm or sklearn
    """

    def __init__(self, input_model=None, training_columns=None, model_params=None,
                 task_type='classification'):
        self.model = input_model
        self.training_columns = training_columns
        self.model_params = model_params
        self._n_classes = None
        self.optuna_cross_val_score = None
        # Always defined, so that get_model_module() also works on a handler
        # created without a model (e.g. before load_model_handler() is called)
        self.model_string = None
        self._task_type = task_type
        if self._task_type not in ['classification', 'regression']:
            raise ValueError(
                "Task type must be either 'classification' or 'regression'")

        if self.model is not None:
            # top-level package name of the module the model comes from
            self.model_string = inspect.getmodule(
                self.model).__name__.partition('.')[0]
            if self.model_string not in ["xgboost", "lightgbm", "sklearn"]:
                raise ValueError(
                    "Model must be either 'xgboost', 'lightgbm' or 'sklearn'")

            if self.model_params is None:
                self.model_params = self.model.get_params()
            else:
                self.model.set_params(**self.model_params)

    def set_model_params(self, model_params):
        """
        Set the model (hyper-)parameters

        Parameters
        ------------------------------------
        model_params: dict
            Model hyper-parameter values. For example (XGBoost): max_depth,
            learning_rate, n_estimators, gamma, min_child_weight, ...
        """
        self.model_params = model_params
        self.model.set_params(**self.model_params)

    def get_model_params(self):
        """
        Get the model (hyper-)parameters

        Returns
        ------------------------------------
        out: dict
            Model hyper-parameter values, as returned by the get_params()
            method of the wrapped model
        """
        return self.model.get_params()

    def set_training_columns(self, training_columns):
        """
        Set the features used for the training process

        Parameters
        ------------------------------------
        training_columns: list
            Contains the name of the features used for the training.
            Example: ['dEdx', 'pT', 'ct']
        """
        self.training_columns = training_columns

    def get_training_columns(self):
        """
        Get the features used for the training process

        Returns
        ------------------------------------
        out: list
            Names of the features used for the training.
            Example: ['dEdx', 'pT', 'ct']
        """
        return self.training_columns

    def get_original_model(self):
        """
        Get the original unwrapped model

        Returns
        ---------------------------
        out: XGBoost, LGBM or sklearn model
        """
        return self.model

    def get_model_module(self):
        """
        Get the string containing the name of the model module

        Returns
        ---------------------------
        out: str or None
            Name of the model module, None if no model has been set
        """
        return self.model_string

    def get_n_classes(self):
        """
        Get the number of classes

        Returns
        ---------------------------
        out: int
            Number of classes
        """
        return self._n_classes

    def get_task_type(self):
        """
        Get the task type of the model

        Returns
        ---------------------------
        out: str
            Task type of the model: 'classification' or 'regression'
        """
        return self._task_type

    def fit(self, x_train, y_train, **kwargs):
        """
        Fit Model

        Parameters
        ---------------------------
        x_train: array-like, sparse matrix
            Training data. It is indexed with the training columns; if these
            are not set they are taken from x_train.columns

        y_train: array-like, sparse matrix
            Target data

        **kwargs:
            Extra kwargs passed on to model.fit() method
        """
        if self._task_type == 'classification':
            n_classes = len(np.unique(y_train))
            self._n_classes = n_classes

        if self.training_columns is None:
            self.training_columns = list(x_train.columns)

        self.model.fit(x_train[self.training_columns], y_train, **kwargs)

    def predict(self, x_test, output_margin=True, **kwargs):
        """
        Return model prediction for the array x_test

        Parameters
        --------------------------------------
        x_test: hipe4ml tree_handler, array-like, sparse matrix
            The input sample.

        output_margin: bool
            Whether to output the raw untransformed margin value. If False model
            probabilities are returned. Not used when task_type is 'regression'.

        **kwargs:
            Extra kwargs passed on to the following model prediction function:
            if (task_type == 'classification')
                - predict() (XGBoost and LGBM) or decision_function() (sklearn)
                  if output_margin==True
                - predict_proba() if output_margin==False
            if (task_type == 'regression')
                - predict()

        Returns
        ---------------------------------------
        out: numpy array
            Model predictions

        Raises
        ---------------------------------------
        ValueError
            If output_margin is True and the sklearn model has no
            decision_function()
        """
        if isinstance(x_test, hipe4ml.tree_handler.TreeHandler):
            x_test = x_test.get_data_frame()

        x_test = x_test[self.training_columns]

        # regression
        if self._task_type == 'regression':
            return self.model.predict(x_test, **kwargs)

        # classification
        if output_margin:
            if self.model_string == 'xgboost':
                return self.model.predict(x_test, output_margin=True, **kwargs)
            if self.model_string == 'lightgbm':
                return self.model.predict(x_test, raw_score=True, **kwargs)
            if self.model_string == 'sklearn':
                if not hasattr(self.model, 'decision_function'):
                    raise ValueError(
                        "This Model does not support a decision_function(): use output_margin=False")
                return self.model.decision_function(x_test, **kwargs).ravel()

        pred = self.model.predict_proba(x_test, **kwargs)
        # in case of binary classification return only the scores of
        # the signal class
        if pred.shape[1] <= 2:
            pred = pred[:, 1]
        return pred

    def train_test_model(self, data, return_prediction=False, output_margin=False, average='macro',
                         multi_class_opt='raise', **kwargs):
        """
        Perform the training and the testing of the model. The model performance is
        estimated using the ROC AUC metric for classification and the MSE for regression.

        Parameters
        ----------------------------------------------
        data: list
            Contains respectively: training set dataframe, training label array,
            test set dataframe, test label array

        return_prediction: bool
            If True Model predictions on the test set are returned

        output_margin: bool
            Whether to output the raw untransformed margin value. If False model
            probabilities are returned. Not used when task_type is 'regression'.

        average: string
            Option for the average of ROC AUC scores used only in case of
            multi-classification. You can choose between 'macro' and 'weighted'.
            For more information see
            https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score

        multi_class_opt: string
            Option to compute ROC AUC scores used only in case of multi-classification.
            The one-vs-one 'ovo' and one-vs-rest 'ovr' approaches are available

        **kwargs: dict
            Extra kwargs passed on to the model fit method

        Returns
        ---------------------------------------
        out: numpy array or None
            If return_prediction==True, Model predictions on the test set are returned
        """
        is_classification = self._task_type == 'classification'

        print('==============================')
        print(f"Training {self.model_string} model for {self._task_type}")
        if is_classification:
            # the number of classes is only meaningful for classification:
            # same convention as in fit()
            n_classes = len(np.unique(data[1]))
            self._n_classes = n_classes
            print('Number of detected classes:', n_classes)

        # final training with the optimized hyperparams
        print('Training the model: ...')
        self.fit(data[0], data[1], **kwargs)
        print('Training the model: Done!')
        print('Testing the model: ...')
        y_pred = self.predict(data[2], output_margin=output_margin)
        if is_classification:
            roc_score = roc_auc_score(
                data[3], y_pred, average=average, multi_class=multi_class_opt)
            print(f'ROC_AUC_score: {roc_score:.6f}')
        else:
            mse_score = mean_squared_error(data[3], y_pred)
            print(f'Mean squared error: {mse_score:.6f}')
        print('Testing the model: Done!')
        print('==============================')
        if return_prediction:
            return y_pred
        return None

    def optimize_params_optuna(self, data, hyperparams_ranges, cross_val_scoring, nfold=5, direction='maximize',
                               optuna_sampler=None, resume_study=None, save_study=None, **kwargs):
        """
        Perform hyperparameter optimization of ModelHandler using the Optuna module.
        The model hyperparameters are automatically set as the ones that provided the
        best result during the optimization.

        Parameters
        ------------------------------------------------------
        data: list
            Contains respectively: training set dataframe, training label array,
            test set dataframe, test label array

        hyperparams_ranges: dict
            Hyperparameter ranges (in tuples or list). If a parameter is not
            in a tuple or a list it will be considered constant.
            Important: the type of the params must be preserved
            when passing the ranges.
            For example:
            dict={
                'max_depth': (10, 100),
                'learning_rate': (0.01, 0.03),
                'n_jobs': 8
            }

        cross_val_scoring: string, callable or None
            Score metrics used for the cross-validation.
            A string (see sklearn model evaluation documentation:
            https://scikit-learn.org/stable/modules/model_evaluation.html)
            or a scorer callable object / function with signature scorer(estimator, X, y)
            which should return only a single value.
            In binary classification 'roc_auc' is suggested.
            In multi-classification one between 'roc_auc_ovr', 'roc_auc_ovo',
            'roc_auc_ovr_weighted' and 'roc_auc_ovo_weighted' is suggested.
            For more information see
            https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

        direction: str
            The direction of optimization. Either 'maximize' or 'minimize'.
            (e.g. for the metric 'roc_auc' the direction is 'maximize')

        optuna_sampler: optuna.samplers.BaseSampler
            Sampler to be used for the optuna (maxi-)minimisation.
            If None, default TPESampler is used. For more information see:
            https://optuna.readthedocs.io/en/stable/reference/samplers.html

        nfold: int
            Number of folds to calculate the cross validation error

        resume_study: str
            A string indicating the filename of the study to be resumed.
            If None, the study is not resumed.

        save_study: str
            A string indicating the filename of the study. If None, the study
            is not saved into a file.

        **kwargs: dict
            Optuna study parameters

        Returns
        ------------------------------------------------------
        study: optuna.study.Study
            The Optuna object which stores the whole study. See Optuna's documentation for more details:
            https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study
        """
        if self._task_type == 'classification':
            # same convention as in fit(): only classification has classes
            self._n_classes = len(np.unique(data[1]))
        if self.training_columns is None:
            self.training_columns = list(data[0].columns)

        x_train, y_train, _, _ = data

        # Entries that are not a tuple/list are constants: they must not be
        # indexed like a range, and are applied unchanged to every trial.
        search_space = {}
        constants = {}
        for key, value in hyperparams_ranges.items():
            if isinstance(value, (tuple, list)):
                search_space[key] = value
            else:
                constants[key] = value

        def _suggest_params(trial):
            # the type of the lower bound selects integer or real sampling
            # NOTE(review): suggest_uniform appears to be deprecated in recent
            # Optuna releases in favour of suggest_float - confirm the minimum
            # supported Optuna version before switching
            params = {}
            for key, bounds in search_space.items():
                if isinstance(bounds[0], int):
                    params[key] = trial.suggest_int(key, bounds[0], bounds[1])
                elif isinstance(bounds[0], float):
                    params[key] = trial.suggest_uniform(key, bounds[0], bounds[1])
            return params

        def _objective(trial):
            params = _suggest_params(trial)
            # work on a copy so that the trials never modify the wrapped model
            model_copy = deepcopy(self.model)
            model_copy.set_params(**{**self.model_params, **constants, **params})
            return np.mean(cross_val_score(model_copy, x_train[self.training_columns], y_train,
                                           cv=nfold, scoring=cross_val_scoring, n_jobs=1))

        if resume_study:
            # SECURITY: unpickling can execute arbitrary code, resume only
            # from study files coming from a trusted source
            with open(resume_study, 'rb') as resume_study_file:
                study = pickle.load(resume_study_file)
        else:
            study = optuna.create_study(
                direction=direction, sampler=optuna_sampler)
        study.optimize(_objective, **kwargs)

        if save_study:
            with open(save_study, 'wb') as study_file:
                pickle.dump(study, study_file)

        print(f"Number of finished trials: {len(study.trials)}")
        print("Best trial:")
        best_trial = study.best_trial

        print(f"Value: {best_trial.value}")
        print("Params: ")
        for key, value in best_trial.params.items():
            print(f"    {key}: {value}")

        self.set_model_params(
            {**self.model_params, **constants, **best_trial.params})
        self.optuna_cross_val_score = best_trial.value

        return study

    def dump_original_model(self, filename, xgb_format=False):
        """
        Save the trained model into a pickle file. For xgboost models only,
        it is also possible to save the model into a .model file

        Parameters
        -----------------------------------------------------
        filename: str
            Name of the file in which the model is saved

        xgb_format : bool
            If True saves the xgboost model into a .model file
        """
        if xgb_format is False:
            with open(filename, "wb") as output_file:
                pickle.dump(self.model, output_file)
        else:
            if self.model_string == 'xgboost':
                self.model.save_model(filename)
            else:
                print("File not saved: only xgboost models support the .model extension")

    def dump_model_handler(self, filename):
        """
        Save the model handler into a pickle file

        Parameters
        -----------------------------------------------------
        filename: str
            Name of the file in which the model is saved
        """
        with open(filename, "wb") as output_file:
            pickle.dump(self, output_file)

    def load_model_handler(self, filename):
        """
        Load a model handler saved into a pickle file

        Parameters
        -----------------------------------------------------
        filename: str
            Name of the file in which the model is saved
        """
        # SECURITY: unpickling can execute arbitrary code, load only files
        # coming from a trusted source
        with open(filename, "rb") as input_file:
            loaded_model = pickle.load(input_file)
            self.model = loaded_model.get_original_model()
            self.training_columns = loaded_model.get_training_columns()
            self.model_params = loaded_model.get_model_params()
            self.model.set_params(**self.model_params)
            self.model_string = loaded_model.get_model_module()
            self._n_classes = loaded_model.get_n_classes()
            self._task_type = loaded_model.get_task_type()
            # handlers saved without this attribute fall back to None
            self.optuna_cross_val_score = getattr(
                loaded_model, 'optuna_cross_val_score', None)
Class used for wrapping the models from different ML libraries to build a new model with common methods. Currently LightGBM, XGBoost (through their sklearn interface) and sklearn models are supported.
Parameters
input_model
:XGBoost, LightGBM
or sklearn model
training_columns
:list
- Contains the name of the features used for the training. Example: ['dEdx', 'pT', 'ct']
model_params
:dict
- Model hyper-parameter values. For example (XGBoost): max_depth, learning_rate, n_estimators, gamma, min_child_weight, …
task_type
:str
- Task type of the model: 'classification' or 'regression'
Methods
def dump_model_handler(self, filename)
-
Expand source code
def dump_model_handler(self, filename):
    """
    Serialize the whole model handler with pickle and write it to a file

    Parameters
    -----------------------------------------------------
    filename: str
        Path of the file the pickled handler is written to
    """
    with open(filename, mode="wb") as handle:
        # the file is opened before pickling, exactly as pickle.dump() does
        handle.write(pickle.dumps(self))
Save the model handler into a pickle file
Parameters
filename
:str
- Name of the file in which the model is saved
def dump_original_model(self, filename, xgb_format=False)
-
Expand source code
def dump_original_model(self, filename, xgb_format=False):
    """
    Write the wrapped model to disk.

    By default the model is pickled. For xgboost models the native
    .model format can be used instead.

    Parameters
    -----------------------------------------------------
    filename: str
        Path of the output file

    xgb_format : bool
        If exactly False the model is pickled. Otherwise the model is saved
        with its save_model() method, which is done only for xgboost models:
        for any other model a message is printed and nothing is written
    """
    if xgb_format is False:
        with open(filename, "wb") as pickle_file:
            pickle.dump(self.model, pickle_file)
        return

    if self.model_string != 'xgboost':
        print("File not saved: only xgboost models support the .model extension")
        return

    self.model.save_model(filename)
Save the trained model into a pickle file. For xgboost models only, it is also possible to save the model into a .model file
Parameters
filename
:str
- Name of the file in which the model is saved
xgb_format
:bool
- If True saves the xgboost model into a .model file
def fit(self, x_train, y_train, **kwargs)
-
Expand source code
def fit(self, x_train, y_train, **kwargs):
    """
    Train the wrapped model

    Parameters
    ---------------------------
    x_train: array-like, sparse matrix
        Training data. It is indexed with the training columns; when these
        are not set yet they are read from x_train.columns

    y_train: array-like, sparse matrix
        Target data. For classification the number of distinct values is
        stored as the number of classes

    **kwargs:
        Extra kwargs forwarded to the fit() method of the model
    """
    if self._task_type == 'classification':
        self._n_classes = len(np.unique(y_train))

    columns = self.training_columns
    if columns is None:
        columns = list(x_train.columns)
        self.training_columns = columns

    self.model.fit(x_train[columns], y_train, **kwargs)
Fit Model
Parameters
x_train
:array-like, sparse matrix
- Training data
y_train
:array-like, sparse matrix
- Target data
**kwargs: Extra kwargs passed on to model.fit() method
def get_model_module(self)
-
Expand source code
def get_model_module(self):
    """
    Get the string containing the name of the model module

    Returns
    ---------------------------
    out: str or None
        Name of the model module. None is returned when the attribute has
        never been set, i.e. when no model has been attached to the handler
    """
    # the attribute is only created once a model is attached: fall back to
    # None instead of raising AttributeError on an empty handler
    return getattr(self, 'model_string', None)
Get the string containing the name of the model module
Returns
out
:str
- Name of the model module
def get_model_params(self)
-
Expand source code
def get_model_params(self):
    """
    Read the (hyper-)parameters back from the wrapped model

    Returns
    ------------------------------------
    out: dict
        Whatever the get_params() method of the model returns. For example
        (XGBoost): max_depth, learning_rate, n_estimators, gamma,
        min_child_weight, ...
    """
    current_params = self.model.get_params()
    return current_params
Get the model (hyper-)parameters
Returns
out
:dict
- Model hyper-parameter values. For example (XGBoost): max_depth, learning_rate, n_estimators, gamma, min_child_weight, …
def get_n_classes(self)
-
Expand source code
def get_n_classes(self):
    """
    Return the number of classes stored in the handler

    Returns
    ---------------------------
    out: int
        Number of classes
    """
    n_classes = self._n_classes
    return n_classes
Get the number of classes
Returns
out
:int
- Number of classes
def get_original_model(self)
-
Expand source code
def get_original_model(self):
    """
    Return the wrapped model itself, not a copy

    Returns
    ---------------------------
    out: XGBoost, LGBM or sklearn model
    """
    wrapped_model = self.model
    return wrapped_model
Get the original unwrapped model
Returns
out
:XGBoost, LGBM
or sklearn model
def get_task_type(self)
-
Expand source code
def get_task_type(self):
    """
    Return the task type stored in the handler

    Returns
    ---------------------------
    out: str
        Task type of the model: 'classification' or 'regression'
    """
    task_type = self._task_type
    return task_type
Get the task type of the model
Returns
out
:str
- Task type of the model: 'classification' or 'regression'
def get_training_columns(self)
-
Expand source code
def get_training_columns(self):
    """
    Return the features used for the training process

    Returns
    ------------------------------------
    out: list
        Names of the training features, the stored list itself and not a copy.
        Example: ['dEdx', 'pT', 'ct']
    """
    columns = self.training_columns
    return columns
Get the features used for the training process
Returns
out
:list
- Names of the features used for the training. Example: ['dEdx', 'pT', 'ct']
def load_model_handler(self, filename)
-
Expand source code
def load_model_handler(self, filename):
    """
    Load a model handler saved into a pickle file and copy its state
    (model, training columns, parameters, model module, number of classes,
    task type and Optuna cross-validation score) into this handler

    Parameters
    -----------------------------------------------------
    filename: str
        Name of the file in which the model is saved
    """
    # SECURITY: unpickling can execute arbitrary code, load only files
    # coming from a trusted source
    with open(filename, "rb") as input_file:
        loaded_model = pickle.load(input_file)

    self.model = loaded_model.get_original_model()
    self.training_columns = loaded_model.get_training_columns()
    self.model_params = loaded_model.get_model_params()
    self.model.set_params(**self.model_params)
    self.model_string = loaded_model.get_model_module()
    self._n_classes = loaded_model.get_n_classes()
    self._task_type = loaded_model.get_task_type()
    # previously lost on reload; objects saved without the attribute
    # fall back to None
    self.optuna_cross_val_score = getattr(
        loaded_model, 'optuna_cross_val_score', None)
Load a model handler saved into a pickle file
Parameters
filename
:str
- Name of the file in which the model is saved
def optimize_params_optuna(self,
data,
hyperparams_ranges,
cross_val_scoring,
nfold=5,
direction='maximize',
optuna_sampler=None,
resume_study=None,
save_study=None,
**kwargs)-
Expand source code
def optimize_params_optuna(self, data, hyperparams_ranges, cross_val_scoring, nfold=5, direction='maximize',
                           optuna_sampler=None, resume_study=None, save_study=None, **kwargs):
    """
    Perform hyperparameter optimization of ModelHandler using the Optuna module.
    The model hyperparameters are automatically set as the ones that provided the
    best result during the optimization.

    Parameters
    ------------------------------------------------------
    data: list
        Contains respectively: training set dataframe, training label array,
        test set dataframe, test label array

    hyperparams_ranges: dict
        Hyperparameter ranges (in tuples or list). If a parameter is not
        in a tuple or a list it will be considered constant.
        Important: the type of the params must be preserved
        when passing the ranges.
        For example:
        dict={
            'max_depth': (10, 100),
            'learning_rate': (0.01, 0.03),
            'n_jobs': 8
        }

    cross_val_scoring: string, callable or None
        Score metrics used for the cross-validation.
        A string (see sklearn model evaluation documentation:
        https://scikit-learn.org/stable/modules/model_evaluation.html)
        or a scorer callable object / function with signature scorer(estimator, X, y)
        which should return only a single value.
        In binary classification 'roc_auc' is suggested.
        In multi-classification one between 'roc_auc_ovr', 'roc_auc_ovo',
        'roc_auc_ovr_weighted' and 'roc_auc_ovo_weighted' is suggested.
        For more information see
        https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

    direction: str
        The direction of optimization. Either 'maximize' or 'minimize'.
        (e.g. for the metric 'roc_auc' the direction is 'maximize')

    optuna_sampler: optuna.samplers.BaseSampler
        Sampler to be used for the optuna (maxi-)minimisation.
        If None, default TPESampler is used. For more information see:
        https://optuna.readthedocs.io/en/stable/reference/samplers.html

    nfold: int
        Number of folds to calculate the cross validation error

    resume_study: str
        A string indicating the filename of the study to be resumed.
        If None, the study is not resumed.

    save_study: str
        A string indicating the filename of the study. If None, the study
        is not saved into a file.

    **kwargs: dict
        Optuna study parameters

    Returns
    ------------------------------------------------------
    study: optuna.study.Study
        The Optuna object which stores the whole study. See Optuna's documentation for more details:
        https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study
    """
    if self._task_type == 'classification':
        # the number of classes is only meaningful for classification
        self._n_classes = len(np.unique(data[1]))
    if self.training_columns is None:
        self.training_columns = list(data[0].columns)

    x_train, y_train, _, _ = data

    # Entries that are not a tuple/list are constants: they must not be
    # indexed like a range, and are applied unchanged to every trial.
    search_space = {}
    constants = {}
    for key, value in hyperparams_ranges.items():
        if isinstance(value, (tuple, list)):
            search_space[key] = value
        else:
            constants[key] = value

    def _suggest_params(trial):
        # the type of the lower bound selects integer or real sampling
        # NOTE(review): suggest_uniform appears to be deprecated in recent
        # Optuna releases in favour of suggest_float - confirm the minimum
        # supported Optuna version before switching
        params = {}
        for key, bounds in search_space.items():
            if isinstance(bounds[0], int):
                params[key] = trial.suggest_int(key, bounds[0], bounds[1])
            elif isinstance(bounds[0], float):
                params[key] = trial.suggest_uniform(key, bounds[0], bounds[1])
        return params

    def _objective(trial):
        params = _suggest_params(trial)
        # work on a copy so that the trials never modify the wrapped model
        model_copy = deepcopy(self.model)
        model_copy.set_params(**{**self.model_params, **constants, **params})
        return np.mean(cross_val_score(model_copy, x_train[self.training_columns], y_train,
                                       cv=nfold, scoring=cross_val_scoring, n_jobs=1))

    if resume_study:
        # SECURITY: unpickling can execute arbitrary code, resume only
        # from study files coming from a trusted source
        with open(resume_study, 'rb') as resume_study_file:
            study = pickle.load(resume_study_file)
    else:
        study = optuna.create_study(
            direction=direction, sampler=optuna_sampler)
    study.optimize(_objective, **kwargs)

    if save_study:
        with open(save_study, 'wb') as study_file:
            pickle.dump(study, study_file)

    print(f"Number of finished trials: {len(study.trials)}")
    print("Best trial:")
    best_trial = study.best_trial

    print(f"Value: {best_trial.value}")
    print("Params: ")
    for key, value in best_trial.params.items():
        print(f"    {key}: {value}")

    self.set_model_params(
        {**self.model_params, **constants, **best_trial.params})
    self.optuna_cross_val_score = best_trial.value

    return study
Perform hyperparameter optimization of ModelHandler using the Optuna module. The model hyperparameters are automatically set as the ones that provided the best result during the optimization.
Parameters
data
:list
- Contains respectively: training set dataframe, training label array, test set dataframe, test label array
hyperparams_ranges
:dict
- Hyperparameter ranges (in tuples or list). If a parameter is not in a tuple or a list it will be considered constant. Important: the type of the params must be preserved when passing the ranges. For example: dict={ 'max_depth': (10, 100), 'learning_rate': (0.01, 0.03), 'n_jobs': 8 }
cross_val_scoring
:string, callable
or None
- Score metrics used for the cross-validation. A string (see sklearn model evaluation documentation: https://scikit-learn.org/stable/modules/model_evaluation.html) or a scorer callable object / function with signature scorer(estimator, X, y) which should return only a single value. In binary classification 'roc_auc' is suggested. In multi-classification one between ‘roc_auc_ovr’, ‘roc_auc_ovo’, ‘roc_auc_ovr_weighted’ and ‘roc_auc_ovo_weighted’ is suggested. For more information see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
direction
:str
- The direction of optimization. Either 'maximize' or 'minimize'. (e.g. for the metric 'roc_auc' the direction is 'maximize')
optuna_sampler
:optuna.samplers.BaseSampler
- Sampler to be used for the optuna (maxi-)minimisation. If None, default TPESampler is used. For more information see: https://optuna.readthedocs.io/en/stable/reference/samplers.html
nfold
:int
- Number of folds to calculate the cross validation error
resume_study
:str
- A string indicating the filename of the study to be resumed. If None, the study is not resumed.
save_study
:str
- A string indicating the filename of the study. If None, the study is not saved into a file.
**kwargs
:dict
- Optuna study parameters
Returns
study
:optuna.study.Study
- The Optuna object which stores the whole study. See Optuna's documentation for more details: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study
def predict(self, x_test, output_margin=True, **kwargs)
-
Expand source code
def predict(self, x_test, output_margin=True, **kwargs):
    """
    Evaluate the model on the sample x_test

    Parameters
    --------------------------------------
    x_test: hipe4ml tree_handler, array-like, sparse matrix
        The input sample. A TreeHandler is first converted with its
        get_data_frame() method; only the training columns are used.

    output_margin: bool
        Whether to output the raw untransformed margin value. If False model
        probabilities are returned. Not used when task_type is 'regression'.

    **kwargs:
        Extra kwargs passed on to the following model prediction function:
        if (task_type == 'classification')
            - predict() (XGBoost and LGBM) or decision_function() (sklearn)
              if output_margin==True
            - predict_proba() if output_margin==False
        if (task_type == 'regression')
            - predict()

    Returns
    ---------------------------------------
    out: numpy array
        Model predictions

    Raises
    ---------------------------------------
    ValueError
        If output_margin is True and the sklearn model has no
        decision_function()
    """
    if isinstance(x_test, hipe4ml.tree_handler.TreeHandler):
        sample = x_test.get_data_frame()
    else:
        sample = x_test
    features = sample[self.training_columns]

    if self._task_type == 'regression':
        return self.model.predict(features, **kwargs)

    # classification: raw margins, when requested, depend on the library
    if output_margin:
        library = self.model_string
        if library == 'xgboost':
            return self.model.predict(features, output_margin=True, **kwargs)
        if library == 'lightgbm':
            return self.model.predict(features, raw_score=True, **kwargs)
        if library == 'sklearn':
            if hasattr(self.model, 'decision_function'):
                return self.model.decision_function(features, **kwargs).ravel()
            raise ValueError(
                "This Model does not support a decision_function(): use output_margin=False")

    probabilities = self.model.predict_proba(features, **kwargs)
    # binary classification: keep only the scores of the signal class
    if probabilities.shape[1] <= 2:
        return probabilities[:, 1]
    return probabilities
Return model prediction for the array x_test
Parameters
x_test
:hipe4ml tree_handler, array-like, sparse matrix
- The input sample.
output_margin
:bool
- Whether to output the raw untransformed margin value. If False model probabilities are returned. Not used when task_type is 'regression'.
**kwargs: Extra kwargs passed on to the following model prediction function: if (task_type == 'classification') - predict() (XGBoost and LGBM) or decision_function() (sklearn) if output_margin==True - predict_proba() if output_margin==False if (task_type == 'regression') - predict()
Returns
out
:numpy array
- Model predictions
def set_model_params(self, model_params)
-
Expand source code
def set_model_params(self, model_params):
    """
    Store the (hyper-)parameters in the handler and apply them to the model

    Parameters
    ------------------------------------
    model_params: dict
        Model hyper-parameter values, passed as keyword arguments to the
        set_params() method of the model. For example (XGBoost): max_depth,
        learning_rate, n_estimators, gamma, min_child_weight, ...
    """
    # stored first, so the attribute is updated even if set_params() raises
    self.model_params = model_params
    wrapped_model = self.model
    wrapped_model.set_params(**model_params)
Set the model (hyper-)parameters
Parameters
model_params
:dict
- Model hyper-parameter values. For example (XGBoost): max_depth, learning_rate, n_estimators, gamma, min_child_weight, …
def set_training_columns(self, training_columns)
-
Expand source code
def set_training_columns(self, training_columns):
    """
    Define which features are used for the training process

    Parameters
    ------------------------------------
    training_columns: list
        Names of the training features, stored as given (no copy is made).
        Example: ['dEdx', 'pT', 'ct']
    """
    setattr(self, 'training_columns', training_columns)
Set the features used for the training process
Parameters
training_columns
:list
- Contains the name of the features used for the training. Example: ['dEdx', 'pT', 'ct']
def train_test_model(self,
data,
return_prediction=False,
output_margin=False,
average='macro',
multi_class_opt='raise',
**kwargs)-
Expand source code
def train_test_model(self, data, return_prediction=False, output_margin=False, average='macro',
                     multi_class_opt='raise', **kwargs):
    """
    Perform the training and the testing of the model. The model performance is
    estimated using the ROC AUC metric for classification and the MSE for regression.

    Parameters
    ----------------------------------------------
    data: list
        Contains respectively: training set dataframe, training label array,
        test set dataframe, test label array

    return_prediction: bool
        If True Model predictions on the test set are returned

    output_margin: bool
        Whether to output the raw untransformed margin value. If False model
        probabilities are returned. Not used when task_type is 'regression'.

    average: string
        Option for the average of ROC AUC scores used only in case of
        multi-classification. You can choose between 'macro' and 'weighted'.
        For more information see
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score

    multi_class_opt: string
        Option to compute ROC AUC scores used only in case of multi-classification.
        The one-vs-one 'ovo' and one-vs-rest 'ovr' approaches are available

    **kwargs: dict
        Extra kwargs passed on to the model fit method

    Returns
    ---------------------------------------
    out: numpy array or None
        If return_prediction==True, Model predictions on the test set are returned
    """
    is_classification = self._task_type == 'classification'

    print('==============================')
    print(f"Training {self.model_string} model for {self._task_type}")
    if is_classification:
        # the number of classes is only meaningful for classification: for
        # regression it would just count the distinct target values
        n_classes = len(np.unique(data[1]))
        self._n_classes = n_classes
        print('Number of detected classes:', n_classes)

    # final training with the optimized hyperparams
    print('Training the model: ...')
    self.fit(data[0], data[1], **kwargs)
    print('Training the model: Done!')
    print('Testing the model: ...')
    y_pred = self.predict(data[2], output_margin=output_margin)
    if is_classification:
        roc_score = roc_auc_score(
            data[3], y_pred, average=average, multi_class=multi_class_opt)
        print(f'ROC_AUC_score: {roc_score:.6f}')
    else:
        mse_score = mean_squared_error(data[3], y_pred)
        print(f'Mean squared error: {mse_score:.6f}')
    print('Testing the model: Done!')
    print('==============================')
    if return_prediction:
        return y_pred
    return None
Perform the training and the testing of the model. The model performance is estimated using the ROC AUC metric for classification and the MSE for regression.
Parameters
data
:list
- Contains respectively: training set dataframe, training label array, test set dataframe, test label array
return_prediction
:bool
- If True Model predictions on the test set are returned
output_margin
:bool
- Whether to output the raw untransformed margin value. If False model probabilities are returned. Not used when task_type is 'regression'.
average
:string
- Option for the average of ROC AUC scores used only in case of multi-classification. You can choose between 'macro' and 'weighted'. For more information see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
multi_class_opt
:string
- Option to compute ROC AUC scores used only in case of multi-classification. The one-vs-one 'ovo' and one-vs-rest 'ovr' approaches are available
**kwargs
:dict
- Extra kwargs passed on to the model fit method
Returns
out
:numpy array
or None
- If return_prediction==True, Model predictions on the test set are returned