# Source code for etna.analysis.feature_selection.mrmr_selection

import warnings
from enum import Enum
from typing import List

import numpy as np
import pandas as pd

class AggregationMode(str, Enum):
    """Enum for different aggregation modes.

    Members are also ``str`` instances, so a plain string such as ``"mean"``
    can be converted with ``AggregationMode("mean")`` and compared directly
    with a member.
    """

    mean = "mean"
    max = "max"
    min = "min"
    median = "median"

    @classmethod
    def _missing_(cls, value):
        """Raise a ``ValueError`` that lists the supported modes for an unknown ``value``."""
        # Keep the stock "... is not a valid <Enum>" prefix so existing message
        # matching keeps working; only the list of accepted values is appended.
        supported = ", ".join(repr(mode.value) for mode in cls)
        raise ValueError(f"{value!r} is not a valid {cls.__name__}; supported modes: {supported}")
# Lookup table: aggregation mode -> NumPy reduction that implements it.
AGGREGATION_FN = {
    AggregationMode.mean: np.mean,
    AggregationMode.max: np.max,
    AggregationMode.min: np.min,
    AggregationMode.median: np.median,
}
def mrmr(
    relevance_table: pd.DataFrame,
    regressors: pd.DataFrame,
    top_k: int,
    fast_redundancy: bool = False,
    relevance_aggregation_mode: str = AggregationMode.mean,
    redundancy_aggregation_mode: str = AggregationMode.mean,
    atol: float = 1e-10,
) -> List[str]:
    r"""
    Maximum Relevance and Minimum Redundancy feature selection method.

    Here relevance for each regressor is calculated as the per-segment aggregation of the relevance
    values in relevance_table. The redundancy term for the regressor is calculated as a mean absolute correlation
    between this regressor and other ones. The correlation between the two regressors is an aggregated pairwise
    correlation for the regressors values in each segment.

    Features are picked greedily: at every step the not-yet-selected feature with the highest
    ``relevance / redundancy`` score is added to the result.

    Parameters
    ----------
    relevance_table:
        dataframe of shape n_segment x n_exog_series with relevance table, where ``relevance_table[i][j]``
        contains relevance of j-th ``df_exog`` series to i-th df series
    regressors:
        dataframe with regressors in etna format
    top_k:
        num of regressors to select; if there are not enough regressors, then all will be selected
    fast_redundancy:
        * True: compute redundancy only inside the segments,
          time complexity :math:`O(top\_k * n\_segments * n\_features * history\_len)`
        * False: compute redundancy for all the pairs of segments,
          time complexity :math:`O(top\_k * n\_segments^2 * n\_features * history\_len)`
    relevance_aggregation_mode:
        the method for relevance values per-segment aggregation
    redundancy_aggregation_mode:
        the method for redundancy values per-segment aggregation
    atol:
        the absolute tolerance to compare the float values

    Returns
    -------
    selected_features: List[str]
        list of ``top_k`` selected regressors, sorted by their importance

    Raises
    ------
    ValueError:
        if ``regressors`` cannot be converted to float, or if an aggregation mode
        is not a valid :class:`AggregationMode` value

    Warns
    -----
    DeprecationWarning:
        if ``fast_redundancy`` is False
    """
    if not fast_redundancy:
        # stacklevel=2 attributes the warning to the caller's line rather than to this module.
        warnings.warn(
            "Option `fast_redundancy=False` was added for backward compatibility and will be removed in etna 3.0.0.",
            DeprecationWarning,
            stacklevel=2,
        )
    relevance_aggregation_fn = AGGREGATION_FN[AggregationMode(relevance_aggregation_mode)]
    redundancy_aggregation_fn = AGGREGATION_FN[AggregationMode(redundancy_aggregation_mode)]

    # can't compute correlation of categorical column with the others
    try:
        regressors = regressors.astype(float)
    except ValueError as e:
        # Chain the original error so the offending value stays visible in the traceback.
        raise ValueError(f"Only convertible to float features are allowed! Error: {str(e)}") from e

    # One aggregated relevance value per feature; missing relevance counts as 0.
    relevance = relevance_table.apply(relevance_aggregation_fn).fillna(0)

    all_features = relevance.index.to_list()
    # Keep the segments in column order (instead of a set) so the row order of the
    # concatenated correlations, and hence float rounding, is reproducible between runs.
    segments = regressors.columns.get_level_values("segment").unique()
    selected_features: List[str] = []
    not_selected_features = all_features.copy()

    # redundancy_table[f, s] holds the aggregated |correlation| between a candidate f and a selected s;
    # columns are filled lazily, one per selected feature.
    redundancy_table = pd.DataFrame(np.inf, index=all_features, columns=all_features)
    top_k = min(top_k, len(all_features))

    for i in range(top_k):
        score_numerator = relevance.loc[not_selected_features]
        # On the first pick nothing is selected yet, so the score is the plain relevance.
        score_denominator = pd.Series(1, index=not_selected_features)
        if i > 0:
            # Only the column of the most recently selected feature is still missing.
            last_selected_feature = selected_features[-1]
            last_selected_regressor = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, last_selected_feature]]
            not_selected_regressors = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, not_selected_features]]

            if fast_redundancy:
                # Correlate candidates with the last selected feature inside each segment only.
                segment_redundancy = pd.concat(
                    [
                        not_selected_regressors[segment].apply(
                            lambda col: last_selected_regressor[segment].corrwith(col)  # noqa: B023
                        )
                        for segment in segments
                    ]
                ).abs()
            else:
                # Correlate every candidate column with every segment's column of the last selected feature.
                segment_redundancy = (
                    not_selected_regressors.apply(lambda col: last_selected_regressor.corrwith(col))  # noqa: B023
                    .abs()
                    .groupby("feature")
                    .apply(redundancy_aggregation_fn)
                    .T.groupby("feature")
                )

            # Lower-bound the redundancy by ``atol`` to keep the division below finite;
            # an undefined (NaN) correlation is treated as infinite redundancy.
            redundancy_table.loc[not_selected_features, last_selected_feature] = (
                segment_redundancy.apply(redundancy_aggregation_fn)
                .clip(atol)
                .fillna(np.inf)
                .loc[not_selected_features]
                .values.squeeze()
            )

            score_denominator = redundancy_table.loc[not_selected_features, selected_features].mean(axis=1)
            # A mean redundancy numerically equal to 1 gets an infinite denominator, i.e. a zero score.
            score_denominator[np.isclose(score_denominator, 1, atol=atol)] = np.inf
        score = score_numerator / score_denominator
        best_feature = score.index[score.argmax()]
        selected_features.append(best_feature)
        not_selected_features.remove(best_feature)
    return selected_features