Module hipe4ml.tree_handler
Simple module with a class to manage the data used in the analysis
Classes
class TreeHandler (file_name=None, tree_name=None, column_names=None, folder_name=None, **kwds)
-
Expand source code
class TreeHandler:
    """
    Class for storing and managing the data of a ROOT tree from a .root file
    or a pandas.DataFrame from a .parquet file
    """

    def __init__(self, file_name=None, tree_name=None, column_names=None, folder_name=None, **kwds):
        """
        Open the file in which the selected tree leaves are converted
        into pandas dataframe columns. If tree_name is not provided
        file_name is assumed to be associated to a .parquet file

        Parameters
        ------------------------------------------------
        file_name: str or list of str
            Name of the input file where the data sit or list of input files.
            If None an empty handler is created and no file is read

        tree_name: str
            Name of the tree within the input file, must be the same for all files.
            If None the method pandas.read_parquet is called

        column_names: list
            List of the names of the branches that one wants to analyse. If column_names is
            not specified all the branches are converted

        folder_name: str
            Name of the folder/folders within the input file. If the folder_name ends with a '*'
            all the folders containing the string folder_name are read and merged into a
            single dataframe.
            Example: folder_name = "DF*" will read all the folders containing the string "DF"
            and merge them into a single dataframe.

        **kwds: extra arguments are passed on to the uproot.TTree.arrays() or
                pandas.read_parquet() methods:
                https://uproot.readthedocs.io/en/latest/uproot.behaviors.TTree.TTree.html#uproot.behaviors.TTree.TTree.arrays
                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html#pandas.read_parquet
        """
        self._tree = tree_name
        # Always defined, so that print_summary() also works on an empty handler
        self._files = None
        self._full_data_frame = None
        self._preselections = None
        self._projection_variable = None
        self._projection_binning = None
        self._sliced_df_list = None
        if file_name is None:
            return

        self._files = file_name if isinstance(file_name, list) else [file_name]

        def read_tree(path):
            # the context manager releases the file handle once the arrays are in memory
            with uproot.open(path) as tree:
                return tree.arrays(filter_name=column_names, library='pd', **kwds)

        # The frames are collected and concatenated only once at the end:
        # concatenating inside the loop copies the accumulated data at every step
        frames = []
        for file in self._files:
            if self._tree is None:  # read from a parquet file
                frames.append(pd.read_parquet(file, columns=column_names, **kwds))
                continue
            if not folder_name:
                frames.append(read_tree(f'{file}:{self._tree}'))
                continue
            if not folder_name.endswith('*'):
                frames.append(read_tree(f'{file}:{folder_name}/{self._tree}'))
                continue

            # wildcard: read the tree from every folder whose name contains the pattern
            with uproot.open(file) as root_file:
                file_keys = root_file.keys()
            pattern = folder_name[:-1]
            for tree_path in self._keep_latest_cycles(file_keys):
                if pattern in tree_path and self._tree in tree_path:
                    print(f"Reading {file}:{tree_path}")
                    frames.append(read_tree(f'{file}:{tree_path}'))

        if frames:
            self._full_data_frame = pd.concat(frames, ignore_index=True, copy=False)
        else:
            self._full_data_frame = pd.DataFrame()

    @staticmethod
    def _keep_latest_cycles(keys):
        """
        Keep, for each object name, only the key with the highest cycle number

        Parameters
        ------------------------------------------------
        keys: list of str
            Keys in the form "name;cycle"

        Returns
        ------------------------------------------------
        out: list of str
            Surviving keys, sorted in reverse lexicographic order
        """
        latest = {}
        for key in keys:
            name, _, cycle = key.partition(";")
            # cycles are compared as numbers ("10" > "9") and names exactly,
            # so that "tree" is never mistaken for a cycle of "treeX"
            cycle_number = int(cycle) if cycle.isdigit() else 0
            if name not in latest or cycle_number > latest[name][0]:
                latest[name] = (cycle_number, key)
        return sorted((key for _, key in latest.values()), reverse=True)

    def __getitem__(self, column):
        """
        Access to the elements of the full data frame using a dictionary-like syntax.
        Accessing to the slices of the data frame in this way is not supported

        Parameters
        ------------------------------------------------
        column: string or list
            Column name/s of the full data frame
        """
        return self._full_data_frame[column]

    def __len__(self):
        """
        Evaluate the number of entries in the full data frame
        """
        return len(self._full_data_frame)

    def get_handler_from_large_file(self, file_name, tree_name, model_handler=None,
                                    preselection='', output_margin=True, max_workers=None):
        """
        Read a ROOT.TTree lazily, in chunks. Chunks are read sequentially or in
        parallel and, optionally, pre-selections or ML selections are applied to
        each of them. This limits the memory usage and speeds up the reading.
        The chunk size is decided automatically

        Parameters
        -----------------------------------------------
        file_name: str or list of str
            Name of the input file where the data sit or list of input files

        tree_name: str
            Name of the tree within the input file, must be the same for all files

        model_handler: hipe4ml ModelHandler
            Model handler to be applied as a preselection on the data contained in the
            original tree. A column named model_output is added to the tree_handler.
            In case of multi-classification a new column is added for each class with
            name: model_output_{i}

        preselection: str
            String containing the cuts to be applied as preselection on the data contained
            in the original tree. The string syntax is the one required in the
            pandas.DataFrame.query() method. You can refer to variables in the environment
            by prefixing them with an '@' character like @a + b.
            You can apply ML based preselections like in the example below:
            - "model_output > @score_cut"  # binary classification
            - "model_output_0 > @score_cut[0] and model_output_1 <= @score_cut[1]"  # multi-classification

        output_margin: bool
            Whether to predict the raw untransformed margin value. If False model
            probabilities are returned

        max_workers: int
            Maximum number of workers employed to read the chunks. If max_workers is None
            or not given, the default of concurrent.futures.ThreadPoolExecutor is used.
            If max_workers==-1 the multi-threading computation is turned off.
            More details in: https://docs.python.org/3/library/concurrent.futures.html
        """
        self._files = file_name if isinstance(file_name, list) else [file_name]
        self._tree = tree_name
        self._preselections = preselection

        inputs = [f'{file}:{tree_name}' for file in self._files]
        executor = ThreadPoolExecutor(max_workers) if max_workers != -1 else None
        chunks = []
        try:
            iterator = uproot.iterate(inputs, library='pd',
                                      decompression_executor=executor,
                                      interpretation_executor=executor)
            for data in iterator:
                if model_handler is not None:
                    predictions = model_handler.predict(data, output_margin)
                    n_classes = model_handler.get_n_classes()
                    if n_classes > 2:
                        for i_class in range(n_classes):
                            data[f'model_output_{i_class}'] = predictions[:, i_class]
                    else:
                        data["model_output"] = predictions
                if preselection:
                    data = data.query(preselection)
                chunks.append(data)
        finally:
            # the worker threads must be released even if reading or predicting fails
            if executor is not None:
                executor.shutdown()

        # pd.concat raises on an empty list (e.g. trees without entries)
        self._full_data_frame = pd.concat(chunks) if chunks else pd.DataFrame()

    def set_data_frame(self, df_orig):
        """
        Set the pandas DataFrame in the TreeHandler

        Parameters
        ------------------------------------------------
        df_orig: pandas.DataFrame
            DataFrame stored in the TreeHandler
        """
        self._full_data_frame = df_orig

    def get_data_frame(self):
        """
        Get the pandas DataFrame stored in the TreeHandler

        Returns
        ------------------------------------------------
        out: pandas.DataFrame
            DataFrame stored in the TreeHandler
        """
        return self._full_data_frame

    def get_preselections(self):
        """
        Get the preselections applied to the stored DataFrame

        Returns
        ------------------------------------------------
        out: str
            String containing the cuts applied to the stored DataFrame
        """
        return self._preselections

    def get_projection_variable(self):
        """
        Get the name of the sliced variable

        Returns
        ------------------------------------------------
        out: str
            Sliced variable
        """
        return self._projection_variable

    def get_projection_binning(self):
        """
        Get the bins used for slicing the DataFrame

        Returns
        ------------------------------------------------
        out: list
            Each element of the list is a list containing the bin edges
        """
        return self._projection_binning

    def get_n_cand(self):
        """
        Get the number of candidates stored in the full DataFrame

        Returns
        ------------------------------------------------
        out: int
           Number of candidates
        """
        return len(self._full_data_frame)

    def get_var_names(self):
        """
        Get a list containing the name of the variables

        Returns
        ------------------------------------------------
        out: list
           Names of the variables
        """
        return list(self._full_data_frame.columns)

    def get_slice(self, n_bin):
        """
        Get the n-th slice of the original DataFrame

        Parameters
        ------------------------------------------------
        n_bin: int
            n-th element of _projection_binning list.

        Returns
        ------------------------------------------------
        out: pandas.DataFrame
            N-th Slice of the original DataFrame
        """
        return self._sliced_df_list[n_bin]

    def get_sliced_df_list(self):
        """
        Get the list containing the slices of the original DataFrame

        Returns
        ------------------------------------------------
        out: list
            List containing the slices of the original DataFrame
        """
        return self._sliced_df_list

    def apply_preselections(self, preselections, inplace=True, **kwds):
        """
        Apply preselections to data

        Parameters
        ------------------------------------------------
        preselections: str
            String containing the cuts to be applied as preselection on the data contained
            in the original tree. The string syntax is the one required in the
            pandas.DataFrame.query() method. You can refer to variables in the environment
            by prefixing them with an '@' character like @a + b.

        inplace: bool
            If True, the preselected dataframe replaces the initial dataframe. Otherwise
            return a copy of the preselected df

        **kwds: extra arguments are passed on to the pandas.DataFrame.query method:
                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html#pandas.DataFrame.query

        Returns
        ------------------------------------------------
        out: TreeHandler or None
            If inplace == True, None is returned and the full DataFrame is replaced
        """
        target = self if inplace else copy.deepcopy(self)
        # query first: the record of the applied cuts must not change if the query fails
        target._full_data_frame.query(preselections, inplace=True, **kwds)  # pylint: disable=W0212
        # cuts applied earlier are kept in the record also when working on a copy
        if target._preselections:  # pylint: disable=W0212
            target._preselections += " and " + preselections  # pylint: disable=W0212
        else:
            target._preselections = preselections  # pylint: disable=W0212
        return None if inplace else target

    def apply_model_handler(self, model_handler, output_margin=True, column_name=None):
        """
        Apply the ML model to data: a new column is added to the DataFrame
        If a list is given the application is performed on the slices.

        Parameters
        ------------------------------------------------
        model_handler: list or hipe4ml model_handler
            If a list of handlers(one for each bin) is provided, the ML
            model is applied to the slices

        output_margin: bool
            Whether to output the raw untransformed margin value.

        column_name: str or list of str
            Name of the new column with the model output. For more than two classes
            one name per class is expected
        """
        sliced = isinstance(model_handler, list)
        n_class = (model_handler[0] if sliced else model_handler).get_n_classes()

        if column_name is None:
            if n_class > 2:
                column_name = [f'model_output_{i_class}' for i_class in range(n_class)]
            else:
                column_name = "model_output"

        if sliced:
            targets = zip(model_handler, self._sliced_df_list)
        else:
            targets = [(model_handler, self._full_data_frame)]

        for mod_handl, data_frame in targets:
            prediction = mod_handl.predict(data_frame, output_margin)
            if n_class > 2:
                for i_class in range(n_class):
                    data_frame[column_name[i_class]] = prediction[:, i_class]
            else:
                data_frame[column_name] = prediction

    def get_subset(self, selections=None, frac=None, size=None, rndm_state=None):
        """
        Returns a TreeHandler containing a subset of the data

        Parameters
        ------------------------------------------------
        selections: str
            String containing the cuts to be applied as preselection on the data contained
            in the original tree. The string syntax is the one required in the
            pandas.DataFrame.query() method. You can refer to variables in the environment
            by prefixing them with an '@' character like @a + b.

        frac: float
            Fraction of candidates to return.

        size: int
            Number of candidates to return. Cannot be used with frac.

        rndm_state: int or numpy.random.RandomState, optional
            Seed for the random number generator (if int), or numpy RandomState object,
            passed to the pandas.DataFrame.sample() method:
            https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html

        Returns
        ------------------------------------------------
        out: TreeHandler
            TreeHandler containing a subset of the current data
        """
        subset = copy.deepcopy(self)
        if selections:
            subset.apply_preselections(selections, inplace=True)
        if frac or size:
            subset.shuffle_data_frame(frac=frac, size=size, inplace=True, random_state=rndm_state)
        return subset

    def slice_data_frame(self, projection_variable, projection_binning, delete_original_df=False):
        """
        Create a list containing slices of the original DataFrame.
        The original DataFrame is split in N sub-DataFrames following
        the binning(projection_binning) of a given variable(projection_variable)

        Parameters
        ------------------------------------------------
        projection_variable: str
            Name of the variable that will be sliced in the analysis

        projection_binning: list
            Binning of the sliced variable should be given as a list of
            [min, max) values for each bin

        delete_original_df: bool
            If True delete the original DataFrame. Only the slice array will
            be accessible in this case
        """
        self._projection_variable = projection_variable
        self._projection_binning = projection_binning

        values = self._full_data_frame[projection_variable]
        self._sliced_df_list = []
        for low_edge, up_edge in ((ibin[0], ibin[1]) for ibin in projection_binning):
            bin_mask = np.logical_and(values >= low_edge, values < up_edge)
            self._sliced_df_list.append(self._full_data_frame[bin_mask].copy())

        if delete_original_df:
            self._full_data_frame = None

    def shuffle_data_frame(self, size=None, frac=None, inplace=True, **kwds):
        """
        Extract a random sample from the DataFrame

        Parameters
        ------------------------------------------------
        size: int
            Number of candidates to return. Cannot be used with frac. Default = 1 if
            frac = None.

        frac: float
            Fraction of candidates to return. Cannot be used with size.

        inplace: bool
            If True the shuffled dataframe replaces the initial dataframe. Otherwise
            return a copy of the shuffled df

        **kwds: extra arguments are passed on to the pandas.DataFrame.sample method:
                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html

        Returns
        ------------------------------------------------
        out: TreeHandler or None
            If inplace == True None is returned and the full DataFrame is replaced
        """
        # keywords make the mapping size -> n explicit
        sampled = self._full_data_frame.sample(n=size, frac=frac, **kwds)
        if inplace:
            self._full_data_frame = sampled
            return None

        new_hndl = copy.deepcopy(self)
        new_hndl._full_data_frame = sampled  # pylint: disable=W0212
        return new_hndl

    def eval_data_frame(self, ev_str, inplace=True, **kwds):
        """
        Evaluate a string describing operations on DataFrame columns

        Parameters
        ------------------------------------------------
        ev_str: str
            The expression string to evaluate. The string syntax is the one required in the
            pandas.DataFrame.eval() method.

        inplace: bool
            If the expression contains an assignment, whether to perform the operation
            inplace and mutate the existing DataFrame. Otherwise, a new handler is returned.

        **kwds: extra arguments are passed on to the pandas.DataFrame.eval method:
                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.eval.html

        Returns
        ------------------------------------------------
        out: TreeHandler or None
            if inplace == True None is returned and the full dataframe is evaluated
        """
        target = self if inplace else copy.deepcopy(self)
        target._full_data_frame.eval(ev_str, inplace=True, **kwds)  # pylint: disable=W0212
        return None if inplace else target

    def print_summary(self):
        """
        Print information about the TreeHandler object and its
        data members
        """
        print("\nFile name: ", self._files)
        print("Tree name: ", self._tree)
        print("DataFrame head:\n", self._full_data_frame.head(5))
        print("\nPreselections:", self._preselections)
        print("Sliced variable: ", self._projection_variable)
        print("Slices binning: ", self._projection_binning)

    def write_df_to_parquet_files(self, base_file_name="TreeDataFrame", path="./", save_slices=False):
        """
        Write the pandas dataframe to parquet files

        Parameters
        ------------------------------------------------
        base_file_name: str
            Base filename used to save the parquet files

        path: str
            Base path of the output files

        save_slices: bool
            If True and the slices are available, single parquet files for each
            bins are created
        """
        if self._full_data_frame is not None:
            name = os.path.join(path, f"{base_file_name}.parquet.gzip")
            self._full_data_frame.to_parquet(name, compression="gzip")
        else:
            print("\nWarning: original DataFrame not available")

        if not save_slices:
            return
        if self._sliced_df_list is None:
            print("\nWarning: slices not available")
            return
        for i_bin, sliced_df in zip(self._projection_binning, self._sliced_df_list):
            name = os.path.join(
                path, f"{base_file_name}_{self._projection_variable}_{i_bin[0]}_{i_bin[1]}.parquet.gzip")
            sliced_df.to_parquet(name, compression="gzip")

    def write_df_to_root_files(self, base_file_name="TreeDataFrame", tree_name="df", path="./",
                               save_slices=False):
        """
        Write the pandas dataframe to root files

        Parameters
        ------------------------------------------------
        base_file_name: str
            Base filename used to save the root files

        tree_name: str
            Name of the tree written in the output files

        path: str
            Base path of the output files

        save_slices: bool
            If True and the slices are available, single root files for each
            bins are created
        """
        if self._full_data_frame is not None:
            name = os.path.join(path, f"{base_file_name}.root")
            # the context manager closes the file even if the write fails
            with uproot.recreate(name) as out_file:
                out_file[tree_name] = self._full_data_frame
        else:
            print("\nWarning: original DataFrame not available")

        if not save_slices:
            return
        if self._sliced_df_list is None:
            print("\nWarning: slices not available")
            return
        for i_bin, sliced_df in zip(self._projection_binning, self._sliced_df_list):
            name = os.path.join(
                path, f"{base_file_name}_{self._projection_variable}_{i_bin[0]}_{i_bin[1]}.root")
            with uproot.recreate(name) as out_file:
                out_file[tree_name] = sliced_df
Class for storing and managing the data of a ROOT tree from a .root file or a pandas.DataFrame from a .parquet file
Open the file in which the selected tree leaves are converted into pandas dataframe columns. If tree_name is not provided file_name is assumed to be associated to a .parquet file
Parameters
file_name
:str or list of str
- Name of the input file where the data sit or list of input files
tree_name
:str
- Name of the tree within the input file, must be the same for all files. If None the method pandas.read_parquet is called
column_names
:list
- List of the names of the branches that one wants to analyse. If column_names is not specified all the branches are converted
folder_name
:str
- Name of the folder/folders within the input file. If the folder_name ends with a '*' all the folders containing the string folder_name are read and merged into a single dataframe. Example: folder_name = "DF*" will read all the folders containing the string "DF" and merge them into a single dataframe.
**kwds
:extra arguments are passed on to the uproot.TTree.arrays() or pandas.read_parquet() methods:
- https://uproot.readthedocs.io/en/latest/uproot.behaviors.TTree.TTree.html#uproot.behaviors.TTree.TTree.arrays https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html#pandas.read_parquet
Methods
def apply_model_handler(self, model_handler, output_margin=True, column_name=None)
-
Expand source code
def apply_model_handler(self, model_handler, output_margin=True, column_name=None):
    """
    Add the output of the ML model to the data as new column(s).
    A single handler is applied to the full DataFrame; a list of handlers
    is applied to the slices, one handler per slice.

    Parameters
    ------------------------------------------------
    model_handler: list or hipe4ml model_handler
        Handler applied to the full DataFrame, or list of handlers
        (one for each bin) applied to the slices

    output_margin: bool
        Whether to output the raw untransformed margin value.

    column_name: str
        Name of the new column with the model output. When not given,
        "model_output" is used, or "model_output_{i}" for each class
        if there are more than two classes
    """
    per_slice = isinstance(model_handler, list)
    reference_handler = model_handler[0] if per_slice else model_handler
    n_class = reference_handler.get_n_classes()
    multi_class = n_class > 2

    if column_name is None:
        column_name = (
            [f'model_output_{i_class}' for i_class in range(n_class)]
            if multi_class else "model_output"
        )

    # pair every handler with the frame it has to fill
    if per_slice:
        jobs = zip(model_handler, self._sliced_df_list)
    else:
        jobs = [(model_handler, self._full_data_frame)]

    for handler, frame in jobs:
        scores = handler.predict(frame, output_margin)
        if not multi_class:
            frame[column_name] = scores
            continue
        for i_class in range(n_class):
            frame[column_name[i_class]] = scores[:, i_class]
Apply the ML model to data: a new column is added to the DataFrame If a list is given the application is performed on the slices.
Parameters
model_handler
:list or hipe4ml model_handler
- If a list of handlers(one for each bin) is provided, the ML model is applied to the slices
output_margin
:bool
- Whether to output the raw untransformed margin value.
column_name
:str
- Name of the new column with the model output
def apply_preselections(self, preselections, inplace=True, **kwds)
-
Expand source code
def apply_preselections(self, preselections, inplace=True, **kwds):
    """
    Apply preselections to data

    Parameters
    ------------------------------------------------
    preselections: str
        String containing the cuts to be applied as preselection on the data contained
        in the original tree. The string syntax is the one required in the
        pandas.DataFrame.query() method. You can refer to variables in the environment
        by prefixing them with an '@' character like @a + b.

    inplace: bool
        If True, the preselected dataframe replaces the initial dataframe. Otherwise
        return a copy of the preselected df

    **kwds: extra arguments are passed on to the pandas.DataFrame.query method:
            https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html#pandas.DataFrame.query

    Returns
    ------------------------------------------------
    out: TreeHandler or None
        If inplace == True, None is returned and the full DataFrame is replaced
    """
    target = self if inplace else copy.deepcopy(self)
    # query first: the record of the applied cuts must not change if the query fails
    target._full_data_frame.query(preselections, inplace=True, **kwds)  # pylint: disable=W0212
    # cuts applied earlier are kept in the record also when working on a copy
    if target._preselections:  # pylint: disable=W0212
        target._preselections += " and " + preselections  # pylint: disable=W0212
    else:
        target._preselections = preselections  # pylint: disable=W0212
    return None if inplace else target
Apply preselections to data
Parameters
preselections
:str
- String containing the cuts to be applied as preselection on the data contained in the original tree. The string syntax is the one required in the pandas.DataFrame.query() method. You can refer to variables in the environment by prefixing them with an ‘@’ character like @a + b.
inplace
:bool
- If True, the preselected dataframe replaces the initial dataframe. Otherwise return a copy of the preselected df
**kwds
:extra arguments are passed on to the pandas.DataFrame.query method:
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html#pandas.DataFrame.query
Returns
out
:TreeHandler or None
- If inplace == True, None is returned and the full DataFrame is replaced
def eval_data_frame(self, ev_str, inplace=True, **kwds)
-
Expand source code
def eval_data_frame(self, ev_str, inplace=True, **kwds):
    """
    Evaluate an expression on the columns of the stored DataFrame

    Parameters
    ------------------------------------------------
    ev_str: str
        Expression to evaluate, written with the syntax of the
        pandas.DataFrame.eval() method.

    inplace: bool
        If True the stored DataFrame is modified. Otherwise the expression
        is evaluated on a deep copy of the handler, which is returned

    **kwds: extra arguments are passed on to the pandas.DataFrame.eval method:
            https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.eval.html

    Returns
    ------------------------------------------------
    out: TreeHandler or None
        None if inplace == True, the modified copy of the handler otherwise
    """
    # work either on this handler or on an independent copy of it
    target = self if inplace else copy.deepcopy(self)
    target._full_data_frame.eval(ev_str, inplace=True, **kwds)  # pylint: disable=W0212
    return None if inplace else target
Evaluate a string describing operations on DataFrame columns
Parameters
ev_str
:str
- The expression string to evaluate. The string syntax is the one required in the pandas.DataFrame.eval() method.
inplace
:bool
- If the expression contains an assignment, whether to perform the operation inplace and mutate the existing DataFrame. Otherwise, a new DataFrame is returned.
**kwds
:extra arguments are passed on to the pandas.DataFrame.eval method:
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.eval.html
Returns
out
:TreeHandler or None
- if inplace == True None is returned and the full dataframe is evaluated
def get_data_frame(self)
-
Expand source code
def get_data_frame(self):
    """
    Return the full pandas DataFrame held by the TreeHandler

    Returns
    ------------------------------------------------
    out: pandas.DataFrame
        The stored DataFrame (the object itself, not a copy)
    """
    stored_df = self._full_data_frame
    return stored_df
Get the pandas DataFrame stored in the TreeHandler
Returns
out
:pandas.DataFrame
- DataFrame stored in the TreeHandler
def get_handler_from_large_file(self,
file_name,
tree_name,
model_handler=None,
preselection='',
output_margin=True,
max_workers=None)-
Expand source code
def get_handler_from_large_file(self, file_name, tree_name, model_handler=None,
                                preselection='', output_margin=True, max_workers=None):
    """
    Read a ROOT.TTree lazily, in chunks. Chunks are read sequentially or in
    parallel and, optionally, pre-selections or ML selections are applied to
    each of them. This limits the memory usage and speeds up the reading.
    The chunk size is decided automatically

    Parameters
    -----------------------------------------------
    file_name: str or list of str
        Name of the input file where the data sit or list of input files

    tree_name: str
        Name of the tree within the input file, must be the same for all files

    model_handler: hipe4ml ModelHandler
        Model handler to be applied as a preselection on the data contained in the
        original tree. A column named model_output is added to the tree_handler.
        In case of multi-classification a new column is added for each class with
        name: model_output_{i}

    preselection: str
        String containing the cuts to be applied as preselection on the data contained
        in the original tree. The string syntax is the one required in the
        pandas.DataFrame.query() method. You can refer to variables in the environment
        by prefixing them with an '@' character like @a + b.
        You can apply ML based preselections like in the example below:
        - "model_output > @score_cut"  # binary classification
        - "model_output_0 > @score_cut[0] and model_output_1 <= @score_cut[1]"  # multi-classification

    output_margin: bool
        Whether to predict the raw untransformed margin value. If False model
        probabilities are returned

    max_workers: int
        Maximum number of workers employed to read the chunks. If max_workers is None
        or not given, the default of concurrent.futures.ThreadPoolExecutor is used.
        If max_workers==-1 the multi-threading computation is turned off.
        More details in: https://docs.python.org/3/library/concurrent.futures.html
    """
    self._files = file_name if isinstance(file_name, list) else [file_name]
    self._tree = tree_name
    self._preselections = preselection

    inputs = [f'{file}:{tree_name}' for file in self._files]
    executor = ThreadPoolExecutor(max_workers) if max_workers != -1 else None
    chunks = []
    try:
        iterator = uproot.iterate(inputs, library='pd',
                                  decompression_executor=executor,
                                  interpretation_executor=executor)
        for data in iterator:
            if model_handler is not None:
                predictions = model_handler.predict(data, output_margin)
                n_classes = model_handler.get_n_classes()
                if n_classes > 2:
                    for i_class in range(n_classes):
                        data[f'model_output_{i_class}'] = predictions[:, i_class]
                else:
                    data["model_output"] = predictions
            if preselection:
                data = data.query(preselection)
            chunks.append(data)
    finally:
        # the worker threads must be released even if reading or predicting fails
        if executor is not None:
            executor.shutdown()

    # pd.concat raises on an empty list (e.g. trees without entries)
    self._full_data_frame = pd.concat(chunks) if chunks else pd.DataFrame()
Read a ROOT.TTree lazily, in chunks. Chunks are read sequentially or in parallel and, optionally, pre-selections or ML selections are applied to each of them. This limits the memory usage and speeds up the reading. The chunk size is decided automatically
Parameters
file_name
:str or list of str
- Name of the input file where the data sit or list of input files
tree_name
:str
- Name of the tree within the input file, must be the same for all files
model_handler
:hipe4ml ModelHandler
- Model handler to be applied as a preselection on the data contained in the original tree. A column named model_output is added to the tree_handler. In case of multi-classification a new column is added for each class with name: model_output_{i}
preselection
:str
- String containing the cuts to be applied as preselection on the data contained in the original tree. The string syntax is the one required in the pandas.DataFrame.query() method. You can refer to variables in the environment by prefixing them with an ‘@’ character like @a + b. You can apply ML based preselections like in the example below: - "model_output > @score_cut" # binary classification - "model_output_0 > @score_cut[0] and model_output_1 <= @score_cut[1]" # multi-classification
output_margin
:bool
- Whether to predict the raw untransformed margin value. If False model probabilities are returned
max_workers
:int
- Maximum number of workers employed to read the chunks. If max_workers is None or not given, it will default to the number of processors on the machine, multiplied by 5. If max_workers==-1 the multi-threading computation is turned off. More details in: https://docs.python.org/3/library/concurrent.futures.html
def get_n_cand(self)
-
Expand source code
def get_n_cand(self):
    """
    Count the candidates (rows) stored in the full DataFrame

    Returns
    ------------------------------------------------
    out: int
       Number of candidates
    """
    n_cand = len(self._full_data_frame)
    return n_cand
Get the number of candidates stored in the full DataFrame
Returns
out
:int
Number of candidates
def get_preselections(self)
-
Expand source code
def get_preselections(self):
    """
    Return the record of the preselections applied to the stored DataFrame

    Returns
    ------------------------------------------------
    out: str
        String containing the cuts applied to the stored DataFrame
    """
    applied_cuts = self._preselections
    return applied_cuts
Get the preselections applied to the stored DataFrame
Returns
out
:str
- String containing the cuts applied to the stored DataFrame
def get_projection_binning(self)
-
Expand source code
def get_projection_binning(self):
    """
    Return the binning used for slicing the DataFrame

    Returns
    ------------------------------------------------
    out: list
        Each element of the list is a list containing the bin edges
    """
    binning = self._projection_binning
    return binning
Get the bins used for slicing the DataFrame
Returns
out
:list
- Each element of the list is a list containing the bin edges
def get_projection_variable(self)
-
Expand source code
def get_projection_variable(self):
    """
    Return the name of the variable used for slicing

    Returns
    ------------------------------------------------
    out: str
        Name of the sliced variable
    """
    sliced_variable = self._projection_variable
    return sliced_variable
Get the name of the sliced variable
Returns
out
:str
- Sliced variable
def get_slice(self, n_bin)
-
Expand source code
def get_slice(self, n_bin):
    """
    Return one slice of the original DataFrame

    Parameters
    ------------------------------------------------
    n_bin: int
        Position of the requested slice, i.e. the index of the
        corresponding element of the _projection_binning list

    Returns
    ------------------------------------------------
    out: pandas.DataFrame
        The n_bin-th slice of the original DataFrame
    """
    slices = self._sliced_df_list
    return slices[n_bin]
Get the n-th slice of the original DataFrame
Parameters
n_bin
:int
- n-th element of _projection_binning list.
Returns
out
:pandas.DataFrame
- N-th Slice of the original DataFrame
def get_sliced_df_list(self)
-
Expand source code
def get_sliced_df_list(self):
    """
    Return every slice of the original DataFrame

    Returns
    ------------------------------------------------
    out: list
        List containing the slices of the original DataFrame
    """
    slices = self._sliced_df_list
    return slices
Get the list containing the slices of the original DataFrame
Returns
out
:list
- List containing the slices of the original DataFrame
def get_subset(self, selections=None, frac=None, size=None, rndm_state=None)
-
Expand source code
def get_subset(self, selections=None, frac=None, size=None, rndm_state=None):
    """
    Build a new TreeHandler holding a subset of the data

    The current handler is deep-copied; the selections and the random
    sampling are then applied to the copy only.

    Parameters
    ------------------------------------------------
    selections: str
        String containing the cuts to be applied as preselection on the
        data contained in the original tree. The string syntax is the one
        required in the pandas.DataFrame.query() method.
        You can refer to variables in the environment by prefixing them
        with an '@' character like @a + b.

    frac: float
        Fraction of candidates to return.

    size: int
        Number of candidates to return. Cannot be used with frac.

    rndm_state: int or numpy.random.RandomState, optional
        Seed for the random number generator (if int), or numpy RandomState
        object, passed to the pandas.DataFrame.sample() method:
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html

    Returns
    ------------------------------------------------
    out: TreeHandler
        TreeHandler containing a subset of the current data
    """
    handler_copy = copy.deepcopy(self)

    if selections:
        handler_copy.apply_preselections(selections, inplace=True)

    # nothing to sample when neither a fraction nor a size is requested
    if not (frac or size):
        return handler_copy

    handler_copy.shuffle_data_frame(
        frac=frac, size=size, inplace=True, random_state=rndm_state)
    return handler_copy
Returns a TreeHandler containing a subset of the data
Parameters
selections
:str
- String containing the cuts to be applied as preselection on the data contained in the original tree. The string syntax is the one required in the pandas.DataFrame.query() method. You can refer to variables in the environment by prefixing them with an ‘@’ character like @a + b.
frac
:float
- Fraction of candidates to return.
size
:int
- Number of candidates to return. Cannot be used with frac.
rndm_state
:int
or numpy.random.RandomState
, optional - Seed for the random number generator (if int), or numpy RandomState object, passed to the pandas.DataFrame.sample() method: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html
Returns
out
:TreeHandler
- TreeHandler containing a subset of the current data
def get_var_names(self)
-
Expand source code
def get_var_names(self):
    """
    List the variables available in the full DataFrame

    Returns
    ------------------------------------------------
    out: list
        Names of the variables (columns of the full DataFrame)
    """
    column_index = self._full_data_frame.columns
    return list(column_index)
Get a list containing the name of the variables
Returns
out
:list
Names of the variables
def print_summary(self)
-
Expand source code
def print_summary(self):
    """
    Print information about the TreeHandler object and its data members

    The summary is printed even when the handler was created without input
    files (data provided through set_data_frame) or when the full DataFrame
    has been dropped (slice_data_frame with delete_original_df=True): the
    missing pieces are reported as None instead of raising.
    """
    # _files is assigned by __init__ only when a file name is provided
    file_names = getattr(self, "_files", None)
    # the full DataFrame is set to None once deleted after slicing
    full_df = self._full_data_frame
    df_head = full_df.head(5) if full_df is not None else None

    print("\nFile name: ", file_names)
    print("Tree name: ", self._tree)
    print("DataFrame head:\n", df_head)
    print("\nPreselections:", self._preselections)
    print("Sliced variable: ", self._projection_variable)
    print("Slices binning: ", self._projection_binning)
Print information about the TreeHandler object and its data members
def set_data_frame(self, df_orig)
-
Expand source code
def set_data_frame(self, df_orig):
    """
    Store a pandas DataFrame as the full DataFrame of the TreeHandler

    Parameters
    ------------------------------------------------
    df_orig: pandas.DataFrame
        DataFrame to be stored in the TreeHandler. The object is stored
        as given, no copy is made
    """
    self._full_data_frame = df_orig
Set the pandas DataFrame in the TreeHandler
Parameters
df_orig
:pandas.DataFrame
- DataFrame stored in the TreeHandler
def shuffle_data_frame(self, size=None, frac=None, inplace=True, **kwds)
-
Expand source code
def shuffle_data_frame(self, size=None, frac=None, inplace=True, **kwds):
    """
    Draw a random sample of candidates from the full DataFrame

    Parameters
    ------------------------------------------------
    size: int
        Number of candidates to return. Cannot be used with frac.
        Default = 1 if frac = None.

    frac: float
        Fraction of candidates to return. Cannot be used with size.

    inplace: bool
        If True the sampled DataFrame replaces the one stored in this
        handler. Otherwise a deep copy of the handler holding the sampled
        DataFrame is returned

    **kwds: extra arguments are passed on to the pandas.DataFrame.sample method:
            https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html

    Returns
    ------------------------------------------------
    out: TreeHandler or None
        If inplace == True None is returned and the full DataFrame is replaced
    """
    if not inplace:
        shuffled_hndl = copy.deepcopy(self)
        shuffled_hndl._full_data_frame = self._full_data_frame.sample(  # pylint: disable=W0212
            n=size, frac=frac, **kwds)
        return shuffled_hndl

    self._full_data_frame = self._full_data_frame.sample(n=size, frac=frac, **kwds)
    return None
Extract a random sample from the DataFrame
Parameters
size
:int
- Number of candidates to return. Cannot be used with frac. Default = 1 if frac = None.
frac
:float
- Fraction of candidates to return. Cannot be used with size.
inplace
:bool
- If True the shuffled dataframe replaces the initial dataframe. Otherwise return a copy of the shuffled df
**kwds
:extra arguments are passed on to the pandas.DataFrame.sample method:
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html
Returns
out
:TreeHandler
or None
- If inplace == True None is returned and the full DataFrame is replaced
def slice_data_frame(self, projection_variable, projection_binning, delete_original_df=False)
-
Expand source code
def slice_data_frame(self, projection_variable, projection_binning, delete_original_df=False):
    """
    Split the original DataFrame into a list of slices. One sub-DataFrame
    is created for each bin of projection_binning, selecting the rows whose
    projection_variable value falls inside that bin

    Parameters
    ------------------------------------------------
    projection_variable: str
        Name of the variable that will be sliced in the analysis

    projection_binning: list
        Binning of the sliced variable should be given
        as a list of [min, max) values for each bin

    delete_original_df: bool
        If True delete the original DataFrame. Only the
        slice array will be accessible in this case
    """
    self._projection_variable = projection_variable
    self._projection_binning = projection_binning
    self._sliced_df_list = []

    for bin_edges in projection_binning:
        # lower edge included, upper edge excluded
        above_low_edge = self._full_data_frame[projection_variable] >= bin_edges[0]
        below_high_edge = self._full_data_frame[projection_variable] < bin_edges[1]
        in_bin = np.logical_and(above_low_edge, below_high_edge)
        # store an independent copy of the selected rows
        bin_df = self._full_data_frame[in_bin].copy()
        self._sliced_df_list.append(bin_df)

    if delete_original_df:
        self._full_data_frame = None
Create a list containing slices of the original DataFrame. The original DataFrame is split into N sub-DataFrames following the binning (projection_binning) of a given variable (projection_variable)
Parameters
projection_variable
:str
- Name of the variable that will be sliced in the analysis
projection_binning
:list
- Binning of the sliced variable should be given as a list of [min, max) values for each bin
delete_original_df
:bool
- If True delete the original DataFrame. Only the slice array will be accessible in this case
def write_df_to_parquet_files(self, base_file_name='TreeDataFrame', path='./', save_slices=False)
-
Expand source code
def write_df_to_parquet_files(self, base_file_name="TreeDataFrame", path="./", save_slices=False):
    """
    Save the stored pandas DataFrame(s) as gzip-compressed parquet files

    Parameters
    ------------------------------------------------
    base_file_name: str
        Base filename used to save the parquet files

    path: str
        Base path of the output files

    save_slices: bool
        If True and the slices are available, a single parquet
        file is created for each bin
    """
    if self._full_data_frame is None:
        print("\nWarning: original DataFrame not available")
    else:
        full_df_file = os.path.join(path, f"{base_file_name}.parquet.gzip")
        self._full_data_frame.to_parquet(full_df_file, compression="gzip")

    if not save_slices:
        return

    if self._sliced_df_list is None:
        print("\nWarning: slices not available")
        return

    # one file per bin, tagged with the sliced variable and the bin edges
    for bin_index, bin_edges in enumerate(self._projection_binning):
        slice_tag = f"{base_file_name}_{self._projection_variable}_{bin_edges[0]}_{bin_edges[1]}"
        slice_file = os.path.join(path, f"{slice_tag}.parquet.gzip")
        self._sliced_df_list[bin_index].to_parquet(slice_file, compression="gzip")
Write the pandas dataframe to parquet files
Parameters
base_file_name
:str
- Base filename used to save the parquet files
path
:str
- Base path of the output files
save_slices
:bool
- If True and the slices are available, a single parquet file is created for each bin
def write_df_to_root_files(self, base_file_name='TreeDataFrame', tree_name='df', path='./', save_slices=False)
-
Expand source code
def write_df_to_root_files(self, base_file_name="TreeDataFrame", tree_name="df", path="./", save_slices=False):
    """
    Write the pandas dataframe to root files

    Parameters
    ------------------------------------------------
    base_file_name: str
        Base filename used to save the root files

    tree_name: str
        Name of the tree written in each output file

    path: str
        Base path of the output files

    save_slices: bool
        If True and the slices are available, a single root
        file is created for each bin
    """
    if self._full_data_frame is not None:
        name = os.path.join(path, f"{base_file_name}.root")
        # context manager: the file is closed even if writing the tree fails
        with uproot.recreate(name) as out_file:
            out_file[tree_name] = self._full_data_frame
    else:
        print("\nWarning: original DataFrame not available")

    if not save_slices:
        return

    if self._sliced_df_list is None:
        print("\nWarning: slices not available")
        return

    # one file per bin, tagged with the sliced variable and the bin edges
    for ind, i_bin in enumerate(self._projection_binning):
        name = os.path.join(
            path, f"{base_file_name}_{self._projection_variable}_{i_bin[0]}_{i_bin[1]}.root")
        with uproot.recreate(name) as out_file:
            out_file[tree_name] = self._sliced_df_list[ind]
Write the pandas dataframe to root files
Parameters
base_file_name
:str
- Base filename used to save the root files
path
:str
- Base path of the output files
save_slices
:bool
- If True and the slices are available, a single root file is created for each bin