Module hipe4ml.model_handler

Module containing the class used to wrap models from different ML libraries and expose them through a common set of methods

"""
Module containing the class used to wrap models from different
ML libraries and expose them through a common set of methods
"""
from copy import deepcopy
import inspect
import pickle

import numpy as np
import optuna
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import cross_val_score

import hipe4ml.tree_handler


class ModelHandler:
    """
    Class used to wrap models from different ML libraries and expose
    them through a common set of methods. LightGBM, XGBoost (through
    their sklearn interface) and sklearn models are currently supported.

    Parameters
    -------------------------------------------------
    input_model: XGBoost, LightGBM or sklearn model

    training_columns: list
        Contains the name of the features used for the training.
        Example: ['dEdx', 'pT', 'ct']

    model_params: dict
        Model hyper-parameter values. For
        example (XGBoost): max_depth, learning_rate,
        n_estimators, gamma, min_child_weight, ...

    task_type: str
        Task type of the model: 'classification' or 'regression'
    """

    def __init__(self, input_model=None, training_columns=None, model_params=None, task_type='classification'):
        self.model = input_model
        self.training_columns = training_columns
        self.model_params = model_params
        self._n_classes = None
        self._task_type = task_type
        if self._task_type not in ['classification', 'regression']:
            raise ValueError(
                "Task type must be either 'classification' or 'regression'")

        if self.model is not None:
            self.model_string = inspect.getmodule(
                self.model).__name__.partition('.')[0]

            if self.model_string not in ["xgboost", "lightgbm", "sklearn"]:
                raise ValueError(
                    "Model must be either 'xgboost', 'lightgbm' or 'sklearn'")

            if self.model_params is None:
                self.model_params = self.model.get_params()
            else:
                self.model.set_params(**self.model_params)

    def set_model_params(self, model_params):
        """
        Set the model (hyper-)parameters

        Parameters
        ------------------------------------
        model_params: dict
            Model hyper-parameter values. For
            example (XGBoost): max_depth, learning_rate,
            n_estimators, gamma, min_child_weight, ...
        """
        self.model_params = model_params
        self.model.set_params(**self.model_params)

    def get_model_params(self):
        """
        Get the model (hyper-)parameters

        Returns
        ------------------------------------
        out: dict
            Model hyper-parameter values. For
            example (XGBoost): max_depth, learning_rate,
            n_estimators, gamma, min_child_weight, ...
        """
        return self.model.get_params()

    def set_training_columns(self, training_columns):
        """
        Set the features used for the training process

        Parameters
        ------------------------------------
        training_columns: list
            Contains the name of the features used for the training.
            Example: ['dEdx', 'pT', 'ct']
        """
        self.training_columns = training_columns

    def get_training_columns(self):
        """
        Get the features used for the training process

        Returns
        ------------------------------------
        out: list
            Names of the features used for the training.
            Example: ['dEdx', 'pT', 'ct']
        """

        return self.training_columns

    def get_original_model(self):
        """
        Get the original unwrapped model

        Returns
        ---------------------------
        out: XGBoost, LGBM or sklearn model
        """
        return self.model

    def get_model_module(self):
        """
        Get the string containing the name
        of the model module

        Returns
        ---------------------------
        out: str
            Name of the model module
        """
        return self.model_string

    def get_n_classes(self):
        """
        Get the number of classes

        Returns
        ---------------------------
        out: int
            Number of classes
        """
        return self._n_classes

    def get_task_type(self):
        """
        Get the task type of the model

        Returns
        ---------------------------
        out: str
            Task type of the model: 'classification' or 'regression'
        """
        return self._task_type

    def fit(self, x_train, y_train, **kwargs):
        """
        Fit Model

        Parameters
        ---------------------------
        x_train: array-like, sparse matrix
            Training data

        y_train: array-like, sparse matrix
            Target data

        **kwargs:
            Extra kwargs passed on to model.fit() method
        """
        if self._task_type == 'classification':
            n_classes = len(np.unique(y_train))
            self._n_classes = n_classes
        if self.training_columns is None:
            self.training_columns = list(x_train.columns)

        self.model.fit(x_train[self.training_columns], y_train, **kwargs)

    def predict(self, x_test, output_margin=True, **kwargs):
        """
        Return model prediction for the array x_test

        Parameters
        --------------------------------------
        x_test: hipe4ml tree_handler, array-like, sparse matrix
            The input sample.

        output_margin: bool
            Whether to output the raw untransformed margin value. If False,
            model probabilities are returned. Not used when task_type is 'regression'.

        **kwargs:
            Extra kwargs passed on to the following model prediction function:
            if (task_type == 'classification')
            - predict() (XGBoost and LGBM) or decision_function() (sklearn) if output_margin==True
            - predict_proba() if output_margin==False
            if (task_type == 'regression')
            - predict()

        Returns
        ---------------------------------------
        out: numpy array
            Model predictions
        """
        if isinstance(x_test, hipe4ml.tree_handler.TreeHandler):
            x_test = x_test.get_data_frame()

        x_test = x_test[self.training_columns]

        # regression
        if self._task_type == 'regression':
            return self.model.predict(x_test, **kwargs)

        # classification
        if output_margin:
            if self.model_string == 'xgboost':
                return self.model.predict(x_test, output_margin=True, **kwargs)
            if self.model_string == 'lightgbm':
                return self.model.predict(x_test, raw_score=True, **kwargs)
            if self.model_string == 'sklearn':
                if not hasattr(self.model, 'decision_function'):
                    raise ValueError(
                        "This Model does not support a decision_function(): use output_margin=False")
                return self.model.decision_function(x_test, **kwargs).ravel()

        pred = self.model.predict_proba(x_test, **kwargs)
        # in case of binary classification return only the scores of
        # the signal class
        if pred.shape[1] <= 2:
            pred = pred[:, 1]
        return pred

    def train_test_model(self, data, return_prediction=False, output_margin=False, average='macro',
                         multi_class_opt='raise', **kwargs):
        """
        Perform the training and the testing of the model. The model performance is estimated
        using the ROC AUC metric for classification and the MSE for regression.

        Parameters
        ----------------------------------------------
        data: list
            Contains respectively: training
            set dataframe, training label array,
            test set dataframe, test label array

        return_prediction: bool
            If True, model predictions on the test set are
            returned

        output_margin: bool
            Whether to output the raw untransformed margin value. If False,
            model probabilities are returned. Not used when task_type is 'regression'.

        average: string
            Option for the average of ROC AUC scores used only in case of multi-classification.
            You can choose between 'macro' and 'weighted'. For more information see
            https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score

        multi_class_opt: string
            Option to compute ROC AUC scores used only in case of multi-classification.
            The one-vs-one 'ovo' and one-vs-rest 'ovr' approaches are available

        **kwargs: dict
            Extra kwargs passed on to the model fit method

        Returns
        ---------------------------------------
        out: numpy array or None
            If return_prediction==True, model predictions on the test set are
            returned

        """

        # get number of classes
        n_classes = len(np.unique(data[1]))
        self._n_classes = n_classes
        print('==============================')
        print(f"Training {self.model_string} model for {self._task_type}")
        if self._task_type == 'classification':
            print('Number of detected classes:', n_classes)

        # final training with the optimized hyperparams
        print('Training the model: ...')
        self.fit(data[0], data[1], **kwargs)
        print('Training the model: Done!')
        print('Testing the model: ...')
        y_pred = self.predict(data[2], output_margin=output_margin)
        if self._task_type == 'classification':
            roc_score = roc_auc_score(
                data[3], y_pred, average=average, multi_class=multi_class_opt)
            print(f'ROC_AUC_score: {roc_score:.6f}')
        else:
            mse_score = mean_squared_error(data[3], y_pred)
            print(f'Mean squared error: {mse_score:.6f}')
        print('Testing the model: Done!')
        print('==============================')
        if return_prediction:
            return y_pred
        return None

    def optimize_params_optuna(self, data, hyperparams_ranges, cross_val_scoring, nfold=5, direction='maximize',
                               optuna_sampler=None, resume_study=None, save_study=None, **kwargs):
        """
        Perform hyperparameter optimization of ModelHandler using the Optuna module.
        The model hyperparameters are automatically set as the ones that provided the
        best result during the optimization.

        Parameters
        ------------------------------------------------------
        data: list
            Contains respectively: training
            set dataframe, training label array,
            test set dataframe, test label array

        hyperparams_ranges: dict
            Hyperparameter ranges (as tuples or lists). If a parameter is not
            given as a tuple or a list, it is considered constant.
            Important: the numeric type of each parameter must be preserved
            when passing the ranges.
            For example:
            hyperparams_ranges = {
                'max_depth': (10, 100),
                'learning_rate': (0.01, 0.03),
                'n_jobs': 8
            }

        cross_val_scoring: string, callable or None
            Score metrics used for the cross-validation.
            A string (see sklearn model evaluation documentation:
            https://scikit-learn.org/stable/modules/model_evaluation.html)
            or a scorer callable object / function with signature scorer(estimator, X, y)
            which should return only a single value.
            In binary classification 'roc_auc' is suggested.
            In multi-classification one among 'roc_auc_ovr', 'roc_auc_ovo',
            'roc_auc_ovr_weighted' and 'roc_auc_ovo_weighted' is suggested.
            For more information see
            https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

        nfold: int
            Number of folds used to calculate the cross-validation error

        direction: str
            The direction of optimization. Either 'maximize' or 'minimize'.
            (e.g. for the metric 'roc_auc' the direction is 'maximize')

        optuna_sampler: optuna.samplers.BaseSampler
            Sampler to be used for the Optuna optimization.
            If None, the default TPESampler is used. For more information see:
            https://optuna.readthedocs.io/en/stable/reference/samplers.html

        resume_study: str
            A string indicating the filename of the study to be resumed.
            If None, the study is not resumed.

        save_study: str
            A string indicating the filename of the study. If None,
            the study is not saved into a file.

        **kwargs: dict
            Optuna study parameters

        Returns
        ------------------------------------------------------

        study: optuna.study.Study
            The optuna object which stores the whole study. See Optuna's documentation for more details:
            https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study
        """

        n_classes = len(np.unique(data[1]))
        self._n_classes = n_classes
        if self.training_columns is None:
            self.training_columns = list(data[0].columns)

        x_train, y_train, _, _ = data

        def __get_int_or_uniform(hyperparams_ranges, trial):

            params = {}

            for key in hyperparams_ranges:
                # parameters not passed as a tuple or list are kept constant,
                # as documented in the docstring above
                if not isinstance(hyperparams_ranges[key], (tuple, list)):
                    params[key] = hyperparams_ranges[key]
                elif isinstance(hyperparams_ranges[key][0], int):
                    params[key] = trial.suggest_int(
                        key, hyperparams_ranges[key][0], hyperparams_ranges[key][1])
                elif isinstance(hyperparams_ranges[key][0], float):
                    # note: suggest_uniform is deprecated in recent Optuna
                    # releases; suggest_float is its drop-in replacement
                    params[key] = trial.suggest_uniform(
                        key, hyperparams_ranges[key][0], hyperparams_ranges[key][1])

            return params

        def __objective(trial):

            params = __get_int_or_uniform(hyperparams_ranges, trial)
            model_copy = deepcopy(self.model)
            model_copy.set_params(**{**self.model_params, **params})
            return np.mean(cross_val_score(model_copy, x_train[self.training_columns], y_train,
                                           cv=nfold, scoring=cross_val_scoring, n_jobs=1))
        if resume_study:
            with open(resume_study, 'rb') as resume_study_file:
                study = pickle.load(resume_study_file)
        else:
            study = optuna.create_study(
                direction=direction, sampler=optuna_sampler)

        study.optimize(__objective, **kwargs)

        if save_study:
            with open(save_study, 'wb') as study_file:
                pickle.dump(study, study_file)

        print(f"Number of finished trials: {len(study.trials)}")
        print("Best trial:")
        best_trial = study.best_trial

        print(f"Value: {best_trial.value}")
        print("Params: ")
        for key, value in best_trial.params.items():
            print(f"    {key}: {value}")

        self.set_model_params({**self.model_params, **best_trial.params})

        return study

    def dump_original_model(self, filename, xgb_format=False):
        """
        Save the trained model into a pickle
        file. XGBoost models can alternatively
        be saved into a .model file

        Parameters
        -----------------------------------------------------
        filename: str
            Name of the file in which the model is saved

        xgb_format : bool
            If True saves the xgboost model into a .model file
        """
        if xgb_format is False:
            with open(filename, "wb") as output_file:
                pickle.dump(self.model, output_file)
        else:
            if self.model_string == 'xgboost':
                self.model.save_model(filename)
            else:
                print("File not saved: only xgboost models support the .model extension")

    def dump_model_handler(self, filename):
        """
        Save the model handler into a pickle file

        Parameters
        -----------------------------------------------------
        filename: str
            Name of the file in which the model is saved
        """
        with open(filename, "wb") as output_file:
            pickle.dump(self, output_file)

    def load_model_handler(self, filename):
        """
        Load a model handler saved into a pickle file

        Parameters
        -----------------------------------------------------
        filename: str
            Name of the file in which the model is saved
        """
        with open(filename, "rb") as input_file:
            loaded_model = pickle.load(input_file)
            self.model = loaded_model.get_original_model()
            self.training_columns = loaded_model.get_training_columns()
            self.model_params = loaded_model.get_model_params()
            self.model.set_params(**self.model_params)
            self.model_string = loaded_model.get_model_module()
            self._n_classes = loaded_model.get_n_classes()
            self._task_type = loaded_model.get_task_type()

Classes

class ModelHandler (input_model=None, training_columns=None, model_params=None, task_type='classification')

Class used to wrap models from different ML libraries and expose them through a common set of methods. LightGBM, XGBoost (through their sklearn interface) and sklearn models are currently supported.

Parameters

input_model : XGBoost, LightGBM or sklearn model
The model to be wrapped by the handler
training_columns : list
Contains the name of the features used for the training. Example: ['dEdx', 'pT', 'ct']
model_params : dict
Model hyper-parameter values. For example (XGBoost): max_depth, learning_rate, n_estimators, gamma, min_child_weight, …
task_type : str
Task type of the model: 'classification' or 'regression'
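
A minimal usage sketch (not part of the original page): it assumes scikit-learn is installed and wraps a GradientBoostingClassifier; any XGBoost or LightGBM model exposing the sklearn interface can be wrapped in the same way.

# sketch: wrap a scikit-learn classifier in a ModelHandler
from sklearn.ensemble import GradientBoostingClassifier
from hipe4ml.model_handler import ModelHandler

clf = GradientBoostingClassifier()
hdl = ModelHandler(clf, training_columns=['dEdx', 'pT', 'ct'],
                   model_params={'n_estimators': 100, 'max_depth': 3})
print(hdl.get_model_module())   # 'sklearn'
print(hdl.get_task_type())      # 'classification'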
Methods

def dump_model_handler(self, filename)

Save the model handler into a pickle file

Parameters

filename : str
Name of the file in which the model is saved
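
A short sketch (the filename is illustrative): the whole handler, including training columns and parameters, goes into a single pickle file.

# sketch: persist a configured handler to disk
from sklearn.ensemble import RandomForestClassifier
from hipe4ml.model_handler import ModelHandler

hdl = ModelHandler(RandomForestClassifier(), training_columns=['dEdx', 'pT'])
hdl.dump_model_handler('model_handler.pkl')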
def dump_original_model(self, filename, xgb_format=False)

Save the trained model into a pickle file. XGBoost models can alternatively be saved into a .model file

Parameters

filename : str
Name of the file in which the model is saved
xgb_format : bool
If True saves the xgboost model into a .model file
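
A hedged sketch assuming xgboost is installed (filenames and data are illustrative); xgb_format=True only has an effect for XGBoost models, which must be trained before the native save can be performed.

# sketch: save the bare model, as a pickle and in native XGBoost format
import numpy as np
import pandas as pd
import xgboost as xgb
from hipe4ml.model_handler import ModelHandler

x = pd.DataFrame(np.random.rand(100, 2), columns=['dEdx', 'pT'])
y = np.random.randint(0, 2, 100)

hdl = ModelHandler(xgb.XGBClassifier())
hdl.fit(x, y)
hdl.dump_original_model('model.pkl')                     # generic pickle
hdl.dump_original_model('model.model', xgb_format=True)  # native XGBoost file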
def fit(self, x_train, y_train, **kwargs)

Fit Model

Parameters

x_train : array-like, sparse matrix
Training data
y_train : array-like, sparse matrix
Target data

**kwargs
Extra kwargs passed on to model.fit() method
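
A minimal sketch with synthetic data (not from the original page). Note that the implementation selects feature columns by name, so x_train is expected to be a pandas DataFrame; if training_columns was never set, all of its columns are used.

# sketch: fit on a DataFrame; all columns are used as features here
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from hipe4ml.model_handler import ModelHandler

x_train = pd.DataFrame(np.random.rand(200, 3), columns=['dEdx', 'pT', 'ct'])
y_train = np.random.randint(0, 2, 200)

hdl = ModelHandler(GradientBoostingClassifier())
hdl.fit(x_train, y_train)
print(hdl.get_n_classes())   # 2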

def get_model_module(self)

Get the string containing the name of the model module

Returns

out : str
Name of the model module
def get_model_params(self)

Get the model (hyper-)parameters

Returns

out : dict
Model hyper-parameter values. For example (XGBoost): max_depth, learning_rate, n_estimators, gamma, min_child_weight, …
def get_n_classes(self)

Get the number of classes

Returns

out : int
Number of classes
def get_original_model(self)

Get the original unwrapped model

Returns

out : XGBoost, LGBM or sklearn model
 
def get_task_type(self)

Get the task type of the model

Returns

out : str
Task type of the model: 'classification' or 'regression'
def get_training_columns(self)

Get the features used for the training process

Returns

out : list
Names of the features used for the training. Example: ['dEdx', 'pT', 'ct']
def load_model_handler(self, filename)

Load a model handler saved into a pickle file

Parameters

filename : str
Name of the file in which the model is saved
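
A round-trip sketch (filename illustrative): an empty handler is filled from a file previously written by dump_model_handler().

# sketch: dump a handler, then load it back into a fresh instance
from sklearn.ensemble import RandomForestClassifier
from hipe4ml.model_handler import ModelHandler

ModelHandler(RandomForestClassifier()).dump_model_handler('handler.pkl')

hdl = ModelHandler()
hdl.load_model_handler('handler.pkl')
print(hdl.get_model_module())   # 'sklearn'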
def optimize_params_optuna(self, data, hyperparams_ranges, cross_val_scoring, nfold=5, direction='maximize', optuna_sampler=None, resume_study=None, save_study=None, **kwargs)

Perform hyperparameter optimization of ModelHandler using the Optuna module. The model hyperparameters are automatically set as the ones that provided the best result during the optimization.

Parameters

data : list
Contains respectively: training set dataframe, training label array, test set dataframe, test label array
hyperparams_ranges : dict
Hyperparameter ranges (as tuples or lists). If a parameter is not given as a tuple or a list, it is considered constant. Important: the numeric type of each parameter must be preserved when passing the ranges. For example: hyperparams_ranges = {'max_depth': (10, 100), 'learning_rate': (0.01, 0.03), 'n_jobs': 8}
cross_val_scoring : string, callable or None
Score metrics used for the cross-validation. A string (see sklearn model evaluation documentation: https://scikit-learn.org/stable/modules/model_evaluation.html) or a scorer callable object / function with signature scorer(estimator, X, y) which should return only a single value. In binary classification 'roc_auc' is suggested. In multi-classification one among 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted' and 'roc_auc_ovo_weighted' is suggested. For more information see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
nfold : int
Number of folds used to calculate the cross-validation error
direction : str
The direction of optimization. Either 'maximize' or 'minimize'. (e.g. for the metric 'roc_auc' the direction is 'maximize')
optuna_sampler : optuna.samplers.BaseSampler
Sampler to be used for the Optuna optimization. If None, the default TPESampler is used. For more information see: https://optuna.readthedocs.io/en/stable/reference/samplers.html
resume_study : str
A string indicating the filename of the study to be resumed. If None, the study is not resumed.
save_study : str
A string indicating the filename of the study. If None, the study is not saved into a file.
**kwargs : dict
Optuna study parameters

Returns

study : optuna.study.Study
The optuna object which stores the whole study. See Optuna's documentation for more details: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study
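
A hedged sketch with synthetic data: n_trials is not a named argument of this method, it is forwarded to study.optimize() through **kwargs.

# sketch: optimize two hyperparameters over 10 trials
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from hipe4ml.model_handler import ModelHandler

x = pd.DataFrame(np.random.rand(300, 2), columns=['dEdx', 'pT'])
y = np.random.randint(0, 2, 300)
data = [x[:200], y[:200], x[200:], y[200:]]

hdl = ModelHandler(GradientBoostingClassifier())
ranges = {'max_depth': (2, 6), 'learning_rate': (0.05, 0.2)}  # int and float ranges
study = hdl.optimize_params_optuna(data, ranges, 'roc_auc', nfold=3, n_trials=10)
print(study.best_params)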
def predict(self, x_test, output_margin=True, **kwargs)

Return model prediction for the array x_test

Parameters

x_test : hipe4ml tree_handler, array-like, sparse matrix
The input sample.
output_margin : bool
Whether to output the raw untransformed margin value. If False, model probabilities are returned. Not used when task_type is 'regression'.

**kwargs
Extra kwargs passed on to the following model prediction function:
if task_type == 'classification': predict() (XGBoost and LGBM) or decision_function() (sklearn) if output_margin==True; predict_proba() if output_margin==False
if task_type == 'regression': predict()

Returns

out : numpy array
Model predictions
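
A minimal sketch for a binary sklearn classifier (synthetic data): with output_margin=True the raw decision_function() values are returned, with output_margin=False the probability of the signal class.

# sketch: raw margins vs. signal-class probabilities
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from hipe4ml.model_handler import ModelHandler

x = pd.DataFrame(np.random.rand(100, 2), columns=['dEdx', 'pT'])
y = np.random.randint(0, 2, 100)

hdl = ModelHandler(GradientBoostingClassifier())
hdl.fit(x, y)
margins = hdl.predict(x, output_margin=True)   # decision_function() output
probas = hdl.predict(x, output_margin=False)   # P(signal) per candidate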
def set_model_params(self, model_params)

Set the model (hyper-)parameters

Parameters

model_params : dict
Model hyper-parameter values. For example (XGBoost): max_depth, learning_rate, n_estimators, gamma, min_child_weight, …
def set_training_columns(self, training_columns)

Set the features used for the training process

Parameters

training_columns : list
Contains the name of the features used for the training. Example: ['dEdx', 'pT', 'ct']
def train_test_model(self, data, return_prediction=False, output_margin=False, average='macro', multi_class_opt='raise', **kwargs)

Perform the training and the testing of the model. The model performance is estimated using the ROC AUC metric for classification and the MSE for regression.

Parameters

data : list
Contains respectively: training set dataframe, training label array, test set dataframe, test label array
return_prediction : bool
If True, model predictions on the test set are returned
output_margin : bool
Whether to output the raw untransformed margin value. If False, model probabilities are returned. Not used when task_type is 'regression'.
average : string
Option for the average of ROC AUC scores used only in case of multi-classification. You can choose between 'macro' and 'weighted'. For more information see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
multi_class_opt : string
Option to compute ROC AUC scores used only in case of multi-classification. The one-vs-one 'ovo' and one-vs-rest 'ovr' approaches are available
**kwargs : dict
Extra kwargs passed on to the model fit method

Returns

out : numpy array or None
If return_prediction==True, model predictions on the test set are returned
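
A complete sketch with synthetic data: data holds the four pieces in the documented order, and the test-set ROC AUC is printed during the call.

# sketch: train and test in one call, keeping the test-set predictions
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from hipe4ml.model_handler import ModelHandler

x = pd.DataFrame(np.random.rand(300, 2), columns=['dEdx', 'pT'])
y = np.random.randint(0, 2, 300)
data = [x[:200], y[:200], x[200:], y[200:]]

hdl = ModelHandler(GradientBoostingClassifier())
y_pred = hdl.train_test_model(data, return_prediction=True)  # signal probabilities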