src.FeatureSelection

  1# ************************************************************************************************************************* #
  2#   UTC Header                                                                                                              #
  3#                                                         ::::::::::::::::::::       :::    ::: :::::::::::  ::::::::       #
  4#      FeatureSelection.py                                ::::::::::::::::::::       :+:    :+:     :+:     :+:    :+:      #
  5#                                                         ::::::::::::::+++#####+++  +:+    +:+     +:+     +:+             #
  6#      By: branlyst and ismailkad < >                     ::+++##############+++     +:+    +:+     +:+     +:+             #
  7#                                                     +++##############+++::::       +#+    +:+     +#+     +#+             #
  8#                                                       +++##+++::::::::::::::       +#+    +:+     +#+     +#+             #
  9#                                                         ::::::::::::::::::::       +#+    +#+     +#+     +#+             #
 10#                                                         ::::::::::::::::::::       #+#    #+#     #+#     #+#    #+#      #
 11#      Update: 2022/06/16 19:02:58 by branlyst and ismai  ::::::::::::::::::::        ########      ###      ######## .fr   #
 12#                                                                                                                           #
 13# ************************************************************************************************************************* #
 14
 15import geopandas
 16from matplotlib import pyplot as plt
 17import seaborn as sns
 18import re
 19import folium
 20import wrapt
 21import contextily as ctx
 22
 23from src.FeatureSelectionMethods.PearsonCorrelation import PearsonCorrelation
 24from src.FeatureSelectionMethods.GrangerCausality import GrangerCausality
 25
 26
 27class FeatureSelection:
 28    """
  29    Feature Selection is a module which applies feature selection methods to a dataframe. It is specialised for geospatial time series and provides visualisation functions.
 30
 31    Attributes:
 32        _stations_dataframe (GeoDataFrame) : contains the registered stations
 33        _stations_geometry_column (str) : indicates the column name of _stations_dataframe which contains the geometry of the station
 34        _stations_name_column (str | None) : indicates the column name of _stations_dataframe which contains the name of the station
 35        _stations_id_column (str) : indicates the column name of _stations_dataframe which contains the id of the station
 36        _stations_get_id_from_sensor_regex (str) : regex used to find the station id in the dataframe containing all records used for feature selection
 37        _stations_crs (str) : current crs of the _stations_dataframe
 38
 39        _feature_selection_method_objects (TemplateMethod[]) : Array of TemplateMethod implemented objects
 40        _last_used_methods (str[]) : last used method names
 41        _last_used_targets (str[]) : last used targets names
 42
 43    Example:
 44    ```python
 45    # Import module
 46    from src.FeatureSelection import FeatureSelection
 47    import pandas as pd
 48
 49    # Import data sample
 50    data = pd.read_csv('./data/sample.csv', index_col=0)
 51
 52    # Import stations references
 53    stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")
 54
  55    # Instantiation of FeatureSelection
 56    fs = FeatureSelection()
 57
 58    # Registering the stations
 59    fs.register_stations(
 60        stations_references[stations_references['statut'] == 'ouvert'], # Select open stations
 61        id_column="numero_station", # Indicate the unique id column name
 62        get_id_from_sensor_regex="station_([0-9]+)", # Indicate how to get this unique id from the data column's names
 63        lon_column='longitude', # Indicate longitude column
 64        lat_column='latitude', # Indicate latitude column
 65        name_column='nom' # Indicate name column
 66    )
 67    ```
 68    """
 69
 70    _stations_dataframe = None
 71    _stations_geometry_column = "geometry"
 72    _stations_name_column = None
 73    _stations_id_column = None
 74    _stations_get_id_from_sensor_regex = None
 75    _stations_crs = None
 76
 77    _feature_selection_method_objects = None
 78    _last_used_methods = None
 79    _last_used_targets = None
 80
 81    def __init__(self):
 82        self._feature_selection_method_objects = [
 83            PearsonCorrelation(),
 84            GrangerCausality(),
 85        ]
 86
 87    def register_stations(
 88        self,
 89        stations_dataframe,
 90        id_column,
 91        get_id_from_sensor_regex,
 92        lon_column="lon",
 93        lat_column="lat",
 94        geometry_column=None,
 95        name_column=None,
 96        crs="EPSG:4326",
 97    ):
 98        """
 99        Register the stations in order to have a visualisation
100
101        Args:
102            stations_dataframe (DataFrame | GeoDataFrame) : DataFrame containing the stations
103            id_column (str) : column name of the stations_dataframe which contains the unique id of the station
104            get_id_from_sensor_regex (str) : regex used to find the station id in the dataframe containing all records used for feature selection
105            lon_column (str) : column name of the stations_dataframe which contains the longitude of the station
106            lat_column (str) : column name of the stations_dataframe which contains the latitude of the station
107            geometry_column (str | None) : if provided and dataframe is a GeoDataFrame, column name of the stations_dataframe which contains the geometry
108            name_column (str | None) : if provided, column name of the stations_dataframe which contains the name of the station
109            crs (str) : crs used for the location
110
111        Example:
112        ```python
113        # Import stations references
114        stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")
115
116        # Registering the stations
117        fs.register_stations(
118            stations_references[stations_references['statut'] == 'ouvert'], # Select open stations
119            id_column="numero_station", # Indicate the unique id column name
120            get_id_from_sensor_regex="station_([0-9]+)", # Indicate how to get this unique id from the data column's names
121            lon_column='longitude', # Indicate longitude column
122            lat_column='latitude', # Indicate latitude column
123            name_column='nom' # Indicate name column
124        )
125        ```
126        """
127
128        self._stations_dataframe = stations_dataframe
129        self._stations_name_column = name_column
130        self._stations_id_column = id_column
131        self._stations_get_id_from_sensor_regex = get_id_from_sensor_regex
132        self._stations_crs = crs
133
134        if geometry_column:
135            self._stations_geometry_column = geometry_column
136        else:
137            self._stations_dataframe.loc[:, "geometry"] = geopandas.points_from_xy(
138                self._stations_dataframe.loc[:, lon_column],
139                self._stations_dataframe.loc[:, lat_column],
140            )
141            self._stations_geometry_column = "geometry"
142            self._stations_dataframe = self._stations_dataframe.drop(
143                columns=[lon_column, lat_column]
144            )
145
146        self._stations_dataframe = geopandas.GeoDataFrame(
147            self._stations_dataframe,
148            geometry=self._stations_dataframe[self._stations_geometry_column],
149            crs=crs,
150        )
151
152    def explore_stations(self, **explore_kwargs):
153        """
154        Explore the different registered stations on an interactive map
155
156        Example:
157        ```python
158        # Explore the stations
159        fs.explore_stations()
160        ```
161        """
162        map = self._stations_dataframe.explore(
163            column=self._stations_id_column,
164            categorical=True,
165            legend=True,
166            popup=True,
167            marker_kwds=dict(radius=5, fill=True),
168            tiles="CartoDB dark_matter",
169            tooltip_kwds=dict(labels=True),
170            **explore_kwargs,
171        )
172        return map
173
174    def _get_feature_selection_object_by_name(self, name):
175        """
176        Private method, get the feature selection method object by is name
177
178        Args:
179            name (str) : Feature selection method registered name
180        """
181
182        for method in self._feature_selection_method_objects:
183            if method.get_method_name() == name:
184                return method
185        return None
186
187    def select(
188        self, dataframe, target_columns, method_names=None, number_of_target_to_keep=1
189    ):
190        """
191        Apply feature selection methods on target_columns for a given dataframe
192
193        Args:
194            dataframe (DataFrame) : dataframe which contains the data used to apply the feature selection. 1 column by feature and 1 line by entry
195            target_columns (str[]) : array of the target column names used to apply the feature selection
196            method_names (str[] | None) : array of the method names to use for feature selection, if None, all registered methods will be applied
197
198        Example:
199        ```python
200        # Apply a feature selection method (PearsonCorrelation) to the data for the targets pm2_5_station_3 and no_station_3
201        fs.select(data, target_columns=['pm2_5_station_3', 'no_station_3'], method_names=['PearsonCorrelation'], number_of_target_to_keep=15)
202        ```
203        """
204        methods = (
205            self._feature_selection_method_objects
206            if not method_names
207            else [
208                method
209                for method in self._feature_selection_method_objects
210                if method.get_method_name() in method_names
211            ]
212        )
213
214        for method in methods:
215            method.select(dataframe, target_columns, number_of_target_to_keep)
216
217        self._last_used_methods = [method.get_method_name() for method in methods]
218        self._last_used_targets = target_columns
219
220    def explore(self, used_target, used_method, **explore_kwargs):
221        """
222        Explore the results of the feature selection on an interactive map for a method and a target. Feature selection (`select()`) must be done before
223
224        Args:
225            used_target (str) : the name of the target that we wan't to see (must be referenced in `target_columns` when `select()`)
226            used_method (str) : the name of the method that we wan't to see (must be referenced in `method_names` when `select()`, or None used)
227
228        Example:
229        ```python
230        # Explore the results for the method PearsonCorrelation and the target pm2_5_station_3
231        fs.explore(used_target='pm2_5_station_3', used_method='PearsonCorrelation')
232        ```
233        """
234
235        # overide of the stylefunction, should be added soon as a Geopandas feature.
236        @wrapt.patch_function_wrapper(folium, "GeoJson")
237        def new_style(wrapped, instance, args, kwargs):
238            def style_fn(x):
239                return {
240                    "fillColor": x["properties"]["__folium_color"],
241                    "color": x["properties"]["__folium_color"],
242                    "radius": x["properties"]["nb_important_sensors"] + 1,
243                    "fillOpacity": 0.8,
244                }
245
246            if "_style_column" in str(kwargs["style_function"]):
247                kwargs["style_function"] = style_fn
248            return wrapped(*args, **kwargs)
249
250        stations_importance = self.get_station_importances(used_target, used_method)
251        map = stations_importance.explore(
252            column="max_importance_value",
253            legend=True,
254            marker_kwds=dict(radius=10, fill=True),
255            vmin=0,
256            vmax=1,
257            tiles="CartoDB dark_matter",
258            tooltip=[self._stations_name_column, self._stations_id_column, "sensors"],
259            popup=[self._stations_name_column, self._stations_id_column, "sensors"],
260            tooltip_kwds=dict(labels=True),
261            **explore_kwargs,
262        )
263        return map
264
265    def plot(self, used_targets=None, used_methods=None):
266        """
267        Plot the results of the feature selection. Feature selection (`select()`) must be done before.
268        (Cannot plot results for multiple methods and multiple targets at once)
269
270        Args:
271            used_targets (str[] | None) : the name of the targets that we wan't to see (must be referenced in `target_columns` when `select()`). If None, all last used_targets will be used
272            used_methods (str[] | None) : the name of the methods that we wan't to see (must be referenced in `method_names` when `select()`, or None used). If None, all last used_methods will be used
273
274        Example:
275        ```python
276        # Plot the results
277        fs.plot()
278
279        # Plot the results only for the target no_station_3
280        fs.plot(used_targets=['no_station_3'])
281        ```
282        """
283
284        if not used_targets:
285            used_targets = self._last_used_targets
286        if not used_methods:
287            used_methods = self._last_used_methods
288        if len(used_methods) > 1 and len(used_targets) > 1:
289            raise NotImplementedError(
290                "Cannot plot results for multiple methods and multiple targets at once yet..."
291            )
292
293        if len(used_methods) > 1:
294            fig, axs = plt.subplots(
295                len(used_methods),
296                2,
297                figsize=(30, 10 * len(used_methods)),
298                gridspec_kw={"width_ratios": [3, 1]},
299            )
300            fig.suptitle(
301                f"Feature importance visualization for the target {used_targets[0]}",
302                fontsize=36,
303            )
304            for i, method in enumerate(used_methods):
305                self._plot(
306                    used_targets[0],
307                    method,
308                    axs[i, 0],
309                    axs[i, 1],
310                    title=f"Stations importance for the method {method}",
311                )
312        elif len(used_targets) > 1:
313            fig, axs = plt.subplots(
314                len(used_targets),
315                2,
316                figsize=(30, 10 * len(used_targets)),
317                gridspec_kw={"width_ratios": [3, 1]},
318            )
319            fig.suptitle(
320                f"Feature importance visualization for the method {used_methods[0]}",
321                fontsize=36,
322            )
323            for i, target in enumerate(used_targets):
324                self._plot(
325                    target,
326                    used_methods[0],
327                    axs[i, 0],
328                    axs[i, 1],
329                    title=f"Stations importance for the target {target}",
330                )
331        else:
332            fig, (ax1, ax2) = plt.subplots(
333                1, 2, figsize=(30, 10), gridspec_kw={"width_ratios": [3, 1]}
334            )
335            fig.suptitle(
336                f"Feature importance visualization for the method {used_methods[0]} and the target {used_targets[0]}",
337                fontsize=36,
338            )
339            self._plot(used_targets[0], used_methods[0], ax1, ax2)
340
341    def _plot(self, target, method, ax1, ax2, title="Stations importance"):
342        """
343        Private method. Plot the result for a target and a method. Feature selection (`select()`) must be done before
344
345        Args:
346            target (str) : target name (must be referenced in `target_columns` when `select()`)
347            method (str) : method name (must be referenced in `method_names` when `select()`, or None used)
348            ax1 (Axe) : axe which will contains the map
349            ax2 (Axe) : axe which will contains the heatmap
350            title (str) : title of the figure
351        """
352
353        stations_importance = self.get_station_importances(target, method)
354        stations_importance = stations_importance.to_crs(
355            epsg=3857
356        )  # change to Spherical Mercator to add ctx base map properly
357        features_importance = self.get_feature_importances()
358        stations_importance.plot(
359            ax=ax1,
360            column="max_importance_value",
361            legend=True,
362            markersize=(stations_importance["nb_important_sensors"] * 40 + 5),
363            cmap=plt.cm.get_cmap("plasma"),
364            vmin=0,
365            vmax=1,
366        )
367        ax1.set_xlabel("Longitude", fontsize=10)
368        ax1.set_ylabel("Latitude", fontsize="medium")
369        ax1.set_title(title)
370        ctx.add_basemap(ax1, source=ctx.providers.CartoDB.Positron)
371        for x, y, label, offsetY in zip(
372            stations_importance.geometry.x,
373            stations_importance.geometry.y,
374            stations_importance[self._stations_id_column],
375            stations_importance["nb_important_sensors"] + 5,
376        ):
377            ax1.annotate(
378                label, xy=(x, y), xytext=(0, offsetY), textcoords="offset points"
379            )
380
381        features_importance = (
382            features_importance[method]
383            .dropna()
384            .sort_values(by=[target], ascending=False)
385        )
386        sns.heatmap(
387            features_importance[[target]],
388            ax=ax2,
389            annot=True,
390            linewidths=0.5,
391            cmap=plt.cm.get_cmap("plasma"),
392            cbar=False,
393            vmin=0,
394            vmax=1,
395        )
396
397    def get_feature_importances(self):
398        """
399        Get the features importance. Feature selection (`select()`) must be done before
400        """
401        method_names = self._last_used_methods
402
403        methods = (
404            self._feature_selection_method_objects
405            if not method_names
406            else [
407                method
408                for method in self._feature_selection_method_objects
409                if method.get_method_name() in method_names
410            ]
411        )
412        return dict(
413            zip(
414                [method.get_method_name() for method in methods],
415                [method.get_feature_importances() for method in methods],
416            )
417        )
418
419    def get_selected_features(self):
420        """
421        Get the selected features. Feature selection (`select()`) must be done before
422
423        Example:
424        ```python
425        # Get access to the selected features for Pearson Correlation method and the target pm2_5_station_3
426        fs.get_selected_features()['PearsonCorrelation']['pm2_5_station_3']
427        ```
428        """
429        method_names = self._last_used_methods
430
431        methods = (
432            self._feature_selection_method_objects
433            if not method_names
434            else [
435                method
436                for method in self._feature_selection_method_objects
437                if method.get_method_name() in method_names
438            ]
439        )
440        return dict(
441            zip(
442                [method.get_method_name() for method in methods],
443                [method.get_selected_features() for method in methods],
444            )
445        )
446
447    def get_station_importances(self, target, method):
448        """
449        Generates a Stations importance for a target and a method. Feature selection (`select()`) must be done before
450
451        Args:
452            target (str) : target name (must be referenced in `target_columns` when `select()`)
453            method (str) : method name (must be referenced in `method_names` when `select()`, or None used)
454        """
455        stations_importance = self._stations_dataframe.copy()
456        stations_importance["nb_important_sensors"] = 0
457        stations_importance["max_importance_value"] = 0
458        stations_importance["sensors"] = ""
459        score = self.get_feature_importances()[method]
460        for index in score.index:
461            x = re.search(self._stations_get_id_from_sensor_regex, index)
462            if x:
463                station_id = int(x.group(1))
464                importance_value = score[target][index]
465                stations_importance.loc[
466                    stations_importance[self._stations_id_column] == station_id,
467                    "nb_important_sensors",
468                ] += 1
469                stations_importance.loc[
470                    stations_importance[self._stations_id_column] == station_id,
471                    "sensors",
472                ] += f"{index} : {'{:.2f}'.format(importance_value)}\n</br>"
473                stations_importance.loc[
474                    (stations_importance[self._stations_id_column] == station_id)
475                    & (stations_importance["max_importance_value"] < importance_value),
476                    "max_importance_value",
477                ] = importance_value
478
479        return stations_importance
480
481    def get_available_methods(self):
482        """
483        Get the name of all registered Feature Selection Methods
484        """
485
486        return [
487            method.get_method_name()
488            for method in self._feature_selection_method_objects
489        ]
class FeatureSelection:
 28class FeatureSelection:
 29    """
  30    Feature Selection is a module which applies feature selection methods to a dataframe. It is specialised for geospatial time series and provides visualisation functions.
 31
 32    Attributes:
 33        _stations_dataframe (GeoDataFrame) : contains the registered stations
 34        _stations_geometry_column (str) : indicates the column name of _stations_dataframe which contains the geometry of the station
 35        _stations_name_column (str | None) : indicates the column name of _stations_dataframe which contains the name of the station
 36        _stations_id_column (str) : indicates the column name of _stations_dataframe which contains the id of the station
 37        _stations_get_id_from_sensor_regex (str) : regex used to find the station id in the dataframe containing all records used for feature selection
 38        _stations_crs (str) : current crs of the _stations_dataframe
 39
 40        _feature_selection_method_objects (TemplateMethod[]) : Array of TemplateMethod implemented objects
 41        _last_used_methods (str[]) : last used method names
 42        _last_used_targets (str[]) : last used targets names
 43
 44    Example:
 45    ```python
 46    # Import module
 47    from src.FeatureSelection import FeatureSelection
 48    import pandas as pd
 49
 50    # Import data sample
 51    data = pd.read_csv('./data/sample.csv', index_col=0)
 52
 53    # Import stations references
 54    stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")
 55
  56    # Instantiation of FeatureSelection
 57    fs = FeatureSelection()
 58
 59    # Registering the stations
 60    fs.register_stations(
 61        stations_references[stations_references['statut'] == 'ouvert'], # Select open stations
 62        id_column="numero_station", # Indicate the unique id column name
 63        get_id_from_sensor_regex="station_([0-9]+)", # Indicate how to get this unique id from the data column's names
 64        lon_column='longitude', # Indicate longitude column
 65        lat_column='latitude', # Indicate latitude column
 66        name_column='nom' # Indicate name column
 67    )
 68    ```
 69    """
 70
 71    _stations_dataframe = None
 72    _stations_geometry_column = "geometry"
 73    _stations_name_column = None
 74    _stations_id_column = None
 75    _stations_get_id_from_sensor_regex = None
 76    _stations_crs = None
 77
 78    _feature_selection_method_objects = None
 79    _last_used_methods = None
 80    _last_used_targets = None
 81
 82    def __init__(self):
 83        self._feature_selection_method_objects = [
 84            PearsonCorrelation(),
 85            GrangerCausality(),
 86        ]
 87
 88    def register_stations(
 89        self,
 90        stations_dataframe,
 91        id_column,
 92        get_id_from_sensor_regex,
 93        lon_column="lon",
 94        lat_column="lat",
 95        geometry_column=None,
 96        name_column=None,
 97        crs="EPSG:4326",
 98    ):
 99        """
100        Register the stations in order to have a visualisation
101
102        Args:
103            stations_dataframe (DataFrame | GeoDataFrame) : DataFrame containing the stations
104            id_column (str) : column name of the stations_dataframe which contains the unique id of the station
105            get_id_from_sensor_regex (str) : regex used to find the station id in the dataframe containing all records used for feature selection
106            lon_column (str) : column name of the stations_dataframe which contains the longitude of the station
107            lat_column (str) : column name of the stations_dataframe which contains the latitude of the station
108            geometry_column (str | None) : if provided and dataframe is a GeoDataFrame, column name of the stations_dataframe which contains the geometry
109            name_column (str | None) : if provided, column name of the stations_dataframe which contains the name of the station
110            crs (str) : crs used for the location
111
112        Example:
113        ```python
114        # Import stations references
115        stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")
116
117        # Registering the stations
118        fs.register_stations(
119            stations_references[stations_references['statut'] == 'ouvert'], # Select open stations
120            id_column="numero_station", # Indicate the unique id column name
121            get_id_from_sensor_regex="station_([0-9]+)", # Indicate how to get this unique id from the data column's names
122            lon_column='longitude', # Indicate longitude column
123            lat_column='latitude', # Indicate latitude column
124            name_column='nom' # Indicate name column
125        )
126        ```
127        """
128
129        self._stations_dataframe = stations_dataframe
130        self._stations_name_column = name_column
131        self._stations_id_column = id_column
132        self._stations_get_id_from_sensor_regex = get_id_from_sensor_regex
133        self._stations_crs = crs
134
135        if geometry_column:
136            self._stations_geometry_column = geometry_column
137        else:
138            self._stations_dataframe.loc[:, "geometry"] = geopandas.points_from_xy(
139                self._stations_dataframe.loc[:, lon_column],
140                self._stations_dataframe.loc[:, lat_column],
141            )
142            self._stations_geometry_column = "geometry"
143            self._stations_dataframe = self._stations_dataframe.drop(
144                columns=[lon_column, lat_column]
145            )
146
147        self._stations_dataframe = geopandas.GeoDataFrame(
148            self._stations_dataframe,
149            geometry=self._stations_dataframe[self._stations_geometry_column],
150            crs=crs,
151        )
152
153    def explore_stations(self, **explore_kwargs):
154        """
155        Explore the different registered stations on an interactive map
156
157        Example:
158        ```python
159        # Explore the stations
160        fs.explore_stations()
161        ```
162        """
163        map = self._stations_dataframe.explore(
164            column=self._stations_id_column,
165            categorical=True,
166            legend=True,
167            popup=True,
168            marker_kwds=dict(radius=5, fill=True),
169            tiles="CartoDB dark_matter",
170            tooltip_kwds=dict(labels=True),
171            **explore_kwargs,
172        )
173        return map
174
175    def _get_feature_selection_object_by_name(self, name):
176        """
177        Private method, get the feature selection method object by is name
178
179        Args:
180            name (str) : Feature selection method registered name
181        """
182
183        for method in self._feature_selection_method_objects:
184            if method.get_method_name() == name:
185                return method
186        return None
187
188    def select(
189        self, dataframe, target_columns, method_names=None, number_of_target_to_keep=1
190    ):
191        """
192        Apply feature selection methods on target_columns for a given dataframe
193
194        Args:
195            dataframe (DataFrame) : dataframe which contains the data used to apply the feature selection. 1 column by feature and 1 line by entry
196            target_columns (str[]) : array of the target column names used to apply the feature selection
197            method_names (str[] | None) : array of the method names to use for feature selection, if None, all registered methods will be applied
198
199        Example:
200        ```python
201        # Apply a feature selection method (PearsonCorrelation) to the data for the targets pm2_5_station_3 and no_station_3
202        fs.select(data, target_columns=['pm2_5_station_3', 'no_station_3'], method_names=['PearsonCorrelation'], number_of_target_to_keep=15)
203        ```
204        """
205        methods = (
206            self._feature_selection_method_objects
207            if not method_names
208            else [
209                method
210                for method in self._feature_selection_method_objects
211                if method.get_method_name() in method_names
212            ]
213        )
214
215        for method in methods:
216            method.select(dataframe, target_columns, number_of_target_to_keep)
217
218        self._last_used_methods = [method.get_method_name() for method in methods]
219        self._last_used_targets = target_columns
220
221    def explore(self, used_target, used_method, **explore_kwargs):
222        """
223        Explore the results of the feature selection on an interactive map for a method and a target. Feature selection (`select()`) must be done before
224
225        Args:
226            used_target (str) : the name of the target that we wan't to see (must be referenced in `target_columns` when `select()`)
227            used_method (str) : the name of the method that we wan't to see (must be referenced in `method_names` when `select()`, or None used)
228
229        Example:
230        ```python
231        # Explore the results for the method PearsonCorrelation and the target pm2_5_station_3
232        fs.explore(used_target='pm2_5_station_3', used_method='PearsonCorrelation')
233        ```
234        """
235
236        # overide of the stylefunction, should be added soon as a Geopandas feature.
237        @wrapt.patch_function_wrapper(folium, "GeoJson")
238        def new_style(wrapped, instance, args, kwargs):
239            def style_fn(x):
240                return {
241                    "fillColor": x["properties"]["__folium_color"],
242                    "color": x["properties"]["__folium_color"],
243                    "radius": x["properties"]["nb_important_sensors"] + 1,
244                    "fillOpacity": 0.8,
245                }
246
247            if "_style_column" in str(kwargs["style_function"]):
248                kwargs["style_function"] = style_fn
249            return wrapped(*args, **kwargs)
250
251        stations_importance = self.get_station_importances(used_target, used_method)
252        map = stations_importance.explore(
253            column="max_importance_value",
254            legend=True,
255            marker_kwds=dict(radius=10, fill=True),
256            vmin=0,
257            vmax=1,
258            tiles="CartoDB dark_matter",
259            tooltip=[self._stations_name_column, self._stations_id_column, "sensors"],
260            popup=[self._stations_name_column, self._stations_id_column, "sensors"],
261            tooltip_kwds=dict(labels=True),
262            **explore_kwargs,
263        )
264        return map
265
266    def plot(self, used_targets=None, used_methods=None):
267        """
268        Plot the results of the feature selection. Feature selection (`select()`) must be done before.
269        (Cannot plot results for multiple methods and multiple targets at once)
270
271        Args:
272            used_targets (str[] | None) : the name of the targets that we wan't to see (must be referenced in `target_columns` when `select()`). If None, all last used_targets will be used
273            used_methods (str[] | None) : the name of the methods that we wan't to see (must be referenced in `method_names` when `select()`, or None used). If None, all last used_methods will be used
274
275        Example:
276        ```python
277        # Plot the results
278        fs.plot()
279
280        # Plot the results only for the target no_station_3
281        fs.plot(used_targets=['no_station_3'])
282        ```
283        """
284
285        if not used_targets:
286            used_targets = self._last_used_targets
287        if not used_methods:
288            used_methods = self._last_used_methods
289        if len(used_methods) > 1 and len(used_targets) > 1:
290            raise NotImplementedError(
291                "Cannot plot results for multiple methods and multiple targets at once yet..."
292            )
293
294        if len(used_methods) > 1:
295            fig, axs = plt.subplots(
296                len(used_methods),
297                2,
298                figsize=(30, 10 * len(used_methods)),
299                gridspec_kw={"width_ratios": [3, 1]},
300            )
301            fig.suptitle(
302                f"Feature importance visualization for the target {used_targets[0]}",
303                fontsize=36,
304            )
305            for i, method in enumerate(used_methods):
306                self._plot(
307                    used_targets[0],
308                    method,
309                    axs[i, 0],
310                    axs[i, 1],
311                    title=f"Stations importance for the method {method}",
312                )
313        elif len(used_targets) > 1:
314            fig, axs = plt.subplots(
315                len(used_targets),
316                2,
317                figsize=(30, 10 * len(used_targets)),
318                gridspec_kw={"width_ratios": [3, 1]},
319            )
320            fig.suptitle(
321                f"Feature importance visualization for the method {used_methods[0]}",
322                fontsize=36,
323            )
324            for i, target in enumerate(used_targets):
325                self._plot(
326                    target,
327                    used_methods[0],
328                    axs[i, 0],
329                    axs[i, 1],
330                    title=f"Stations importance for the target {target}",
331                )
332        else:
333            fig, (ax1, ax2) = plt.subplots(
334                1, 2, figsize=(30, 10), gridspec_kw={"width_ratios": [3, 1]}
335            )
336            fig.suptitle(
337                f"Feature importance visualization for the method {used_methods[0]} and the target {used_targets[0]}",
338                fontsize=36,
339            )
340            self._plot(used_targets[0], used_methods[0], ax1, ax2)
341
342    def _plot(self, target, method, ax1, ax2, title="Stations importance"):
343        """
344        Private method. Plot the result for a target and a method. Feature selection (`select()`) must be done before
345
346        Args:
347            target (str) : target name (must be referenced in `target_columns` when `select()`)
348            method (str) : method name (must be referenced in `method_names` when `select()`, or None used)
349            ax1 (Axe) : axe which will contains the map
350            ax2 (Axe) : axe which will contains the heatmap
351            title (str) : title of the figure
352        """
353
354        stations_importance = self.get_station_importances(target, method)
355        stations_importance = stations_importance.to_crs(
356            epsg=3857
357        )  # change to Spherical Mercator to add ctx base map properly
358        features_importance = self.get_feature_importances()
359        stations_importance.plot(
360            ax=ax1,
361            column="max_importance_value",
362            legend=True,
363            markersize=(stations_importance["nb_important_sensors"] * 40 + 5),
364            cmap=plt.cm.get_cmap("plasma"),
365            vmin=0,
366            vmax=1,
367        )
368        ax1.set_xlabel("Longitude", fontsize=10)
369        ax1.set_ylabel("Latitude", fontsize="medium")
370        ax1.set_title(title)
371        ctx.add_basemap(ax1, source=ctx.providers.CartoDB.Positron)
372        for x, y, label, offsetY in zip(
373            stations_importance.geometry.x,
374            stations_importance.geometry.y,
375            stations_importance[self._stations_id_column],
376            stations_importance["nb_important_sensors"] + 5,
377        ):
378            ax1.annotate(
379                label, xy=(x, y), xytext=(0, offsetY), textcoords="offset points"
380            )
381
382        features_importance = (
383            features_importance[method]
384            .dropna()
385            .sort_values(by=[target], ascending=False)
386        )
387        sns.heatmap(
388            features_importance[[target]],
389            ax=ax2,
390            annot=True,
391            linewidths=0.5,
392            cmap=plt.cm.get_cmap("plasma"),
393            cbar=False,
394            vmin=0,
395            vmax=1,
396        )
397
398    def get_feature_importances(self):
399        """
400        Get the features importance. Feature selection (`select()`) must be done before
401        """
402        method_names = self._last_used_methods
403
404        methods = (
405            self._feature_selection_method_objects
406            if not method_names
407            else [
408                method
409                for method in self._feature_selection_method_objects
410                if method.get_method_name() in method_names
411            ]
412        )
413        return dict(
414            zip(
415                [method.get_method_name() for method in methods],
416                [method.get_feature_importances() for method in methods],
417            )
418        )
419
420    def get_selected_features(self):
421        """
422        Get the selected features. Feature selection (`select()`) must be done before
423
424        Example:
425        ```python
426        # Get access to the selected features for Pearson Correlation method and the target pm2_5_station_3
427        fs.get_selected_features()['PearsonCorrelation']['pm2_5_station_3']
428        ```
429        """
430        method_names = self._last_used_methods
431
432        methods = (
433            self._feature_selection_method_objects
434            if not method_names
435            else [
436                method
437                for method in self._feature_selection_method_objects
438                if method.get_method_name() in method_names
439            ]
440        )
441        return dict(
442            zip(
443                [method.get_method_name() for method in methods],
444                [method.get_selected_features() for method in methods],
445            )
446        )
447
448    def get_station_importances(self, target, method):
449        """
450        Generates a Stations importance for a target and a method. Feature selection (`select()`) must be done before
451
452        Args:
453            target (str) : target name (must be referenced in `target_columns` when `select()`)
454            method (str) : method name (must be referenced in `method_names` when `select()`, or None used)
455        """
456        stations_importance = self._stations_dataframe.copy()
457        stations_importance["nb_important_sensors"] = 0
458        stations_importance["max_importance_value"] = 0
459        stations_importance["sensors"] = ""
460        score = self.get_feature_importances()[method]
461        for index in score.index:
462            x = re.search(self._stations_get_id_from_sensor_regex, index)
463            if x:
464                station_id = int(x.group(1))
465                importance_value = score[target][index]
466                stations_importance.loc[
467                    stations_importance[self._stations_id_column] == station_id,
468                    "nb_important_sensors",
469                ] += 1
470                stations_importance.loc[
471                    stations_importance[self._stations_id_column] == station_id,
472                    "sensors",
473                ] += f"{index} : {'{:.2f}'.format(importance_value)}\n</br>"
474                stations_importance.loc[
475                    (stations_importance[self._stations_id_column] == station_id)
476                    & (stations_importance["max_importance_value"] < importance_value),
477                    "max_importance_value",
478                ] = importance_value
479
480        return stations_importance
481
482    def get_available_methods(self):
483        """
484        Get the name of all registered Feature Selection Methods
485        """
486
487        return [
488            method.get_method_name()
489            for method in self._feature_selection_method_objects
490        ]

FeatureSelection is a module for applying feature selection methods to a dataframe. It is specialised for geospatial time series and provides visualisation functions.

Attributes
  • _stations_dataframe (GeoDataFrame) : contains the registered stations
  • _stations_geometry_column (str) : indicates the column name of _stations_dataframe which contains the geometry of the station
  • _stations_name_column (str | None) : indicates the column name of _stations_dataframe which contains the name of the station
  • _stations_id_column (str) : indicates the column name of _stations_dataframe which contains the id of the station
  • _stations_get_id_from_sensor_regex (str) : regex used to find the station id in the dataframe containing all records used for feature selection
  • _stations_crs (str) : current crs of the _stations_dataframe
  • _feature_selection_method_objects (TemplateMethod[]) : Array of TemplateMethod implemented objects
  • _last_used_methods (str[]) : last used method names
  • _last_used_targets (str[]) : last used targets names

Example:

# Import module
from src.FeatureSelection import FeatureSelection
import pandas as pd

# Import data sample
data = pd.read_csv('./data/sample.csv', index_col=0)

# Import stations references
stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")

# Instantiation of FeatureSelection
fs = FeatureSelection()

# Registering the stations
fs.register_stations(
    stations_references[stations_references['statut'] == 'ouvert'], # Select open stations
    id_column="numero_station", # Indicate the unique id column name
    get_id_from_sensor_regex="station_([0-9]+)", # Indicate how to get this unique id from the data column's names
    lon_column='longitude', # Indicate longitude column
    lat_column='latitude', # Indicate latitude column
    name_column='nom' # Indicate name column
)
FeatureSelection()
82    def __init__(self):
83        self._feature_selection_method_objects = [
84            PearsonCorrelation(),
85            GrangerCausality(),
86        ]
def register_stations( self, stations_dataframe, id_column, get_id_from_sensor_regex, lon_column='lon', lat_column='lat', geometry_column=None, name_column=None, crs='EPSG:4326')
 88    def register_stations(
 89        self,
 90        stations_dataframe,
 91        id_column,
 92        get_id_from_sensor_regex,
 93        lon_column="lon",
 94        lat_column="lat",
 95        geometry_column=None,
 96        name_column=None,
 97        crs="EPSG:4326",
 98    ):
 99        """
100        Register the stations in order to have a visualisation
101
102        Args:
103            stations_dataframe (DataFrame | GeoDataFrame) : DataFrame containing the stations
104            id_column (str) : column name of the stations_dataframe which contains the unique id of the station
105            get_id_from_sensor_regex (str) : regex used to find the station id in the dataframe containing all records used for feature selection
106            lon_column (str) : column name of the stations_dataframe which contains the longitude of the station
107            lat_column (str) : column name of the stations_dataframe which contains the latitude of the station
108            geometry_column (str | None) : if provided and dataframe is a GeoDataFrame, column name of the stations_dataframe which contains the geometry
109            name_column (str | None) : if provided, column name of the stations_dataframe which contains the name of the station
110            crs (str) : crs used for the location
111
112        Example:
113        ```python
114        # Import stations references
115        stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")
116
117        # Registering the stations
118        fs.register_stations(
119            stations_references[stations_references['statut'] == 'ouvert'], # Select open stations
120            id_column="numero_station", # Indicate the unique id column name
121            get_id_from_sensor_regex="station_([0-9]+)", # Indicate how to get this unique id from the data column's names
122            lon_column='longitude', # Indicate longitude column
123            lat_column='latitude', # Indicate latitude column
124            name_column='nom' # Indicate name column
125        )
126        ```
127        """
128
129        self._stations_dataframe = stations_dataframe
130        self._stations_name_column = name_column
131        self._stations_id_column = id_column
132        self._stations_get_id_from_sensor_regex = get_id_from_sensor_regex
133        self._stations_crs = crs
134
135        if geometry_column:
136            self._stations_geometry_column = geometry_column
137        else:
138            self._stations_dataframe.loc[:, "geometry"] = geopandas.points_from_xy(
139                self._stations_dataframe.loc[:, lon_column],
140                self._stations_dataframe.loc[:, lat_column],
141            )
142            self._stations_geometry_column = "geometry"
143            self._stations_dataframe = self._stations_dataframe.drop(
144                columns=[lon_column, lat_column]
145            )
146
147        self._stations_dataframe = geopandas.GeoDataFrame(
148            self._stations_dataframe,
149            geometry=self._stations_dataframe[self._stations_geometry_column],
150            crs=crs,
151        )

Register the stations in order to have a visualisation

Args
  • stations_dataframe (DataFrame | GeoDataFrame) : DataFrame containing the stations
  • id_column (str) : column name of the stations_dataframe which contains the unique id of the station
  • get_id_from_sensor_regex (str) : regex used to find the station id in the dataframe containing all records used for feature selection
  • lon_column (str) : column name of the stations_dataframe which contains the longitude of the station
  • lat_column (str) : column name of the stations_dataframe which contains the latitude of the station
  • geometry_column (str | None) : if provided and dataframe is a GeoDataFrame, column name of the stations_dataframe which contains the geometry
  • name_column (str | None) : if provided, column name of the stations_dataframe which contains the name of the station
  • crs (str) : crs used for the location

Example:

# Import stations references
stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")

# Registering the stations
fs.register_stations(
    stations_references[stations_references['statut'] == 'ouvert'], # Select open stations
    id_column="numero_station", # Indicate the unique id column name
    get_id_from_sensor_regex="station_([0-9]+)", # Indicate how to get this unique id from the data column's names
    lon_column='longitude', # Indicate longitude column
    lat_column='latitude', # Indicate latitude column
    name_column='nom' # Indicate name column
)
def explore_stations(self, **explore_kwargs)
153    def explore_stations(self, **explore_kwargs):
154        """
155        Explore the different registered stations on an interactive map
156
157        Example:
158        ```python
159        # Explore the stations
160        fs.explore_stations()
161        ```
162        """
163        map = self._stations_dataframe.explore(
164            column=self._stations_id_column,
165            categorical=True,
166            legend=True,
167            popup=True,
168            marker_kwds=dict(radius=5, fill=True),
169            tiles="CartoDB dark_matter",
170            tooltip_kwds=dict(labels=True),
171            **explore_kwargs,
172        )
173        return map

Explore the different registered stations on an interactive map

Example:

# Explore the stations
fs.explore_stations()
def select( self, dataframe, target_columns, method_names=None, number_of_target_to_keep=1)
188    def select(
189        self, dataframe, target_columns, method_names=None, number_of_target_to_keep=1
190    ):
191        """
192        Apply feature selection methods on target_columns for a given dataframe
193
194        Args:
195            dataframe (DataFrame) : dataframe which contains the data used to apply the feature selection. 1 column by feature and 1 line by entry
196            target_columns (str[]) : array of the target column names used to apply the feature selection
197            method_names (str[] | None) : array of the method names to use for feature selection, if None, all registered methods will be applied
198
199        Example:
200        ```python
201        # Apply a feature selection method (PearsonCorrelation) to the data for the targets pm2_5_station_3 and no_station_3
202        fs.select(data, target_columns=['pm2_5_station_3', 'no_station_3'], method_names=['PearsonCorrelation'], number_of_target_to_keep=15)
203        ```
204        """
205        methods = (
206            self._feature_selection_method_objects
207            if not method_names
208            else [
209                method
210                for method in self._feature_selection_method_objects
211                if method.get_method_name() in method_names
212            ]
213        )
214
215        for method in methods:
216            method.select(dataframe, target_columns, number_of_target_to_keep)
217
218        self._last_used_methods = [method.get_method_name() for method in methods]
219        self._last_used_targets = target_columns

Apply feature selection methods on target_columns for a given dataframe

Args
  • dataframe (DataFrame) : dataframe which contains the data used to apply the feature selection. 1 column by feature and 1 line by entry
  • target_columns (str[]) : array of the target column names used to apply the feature selection
  • method_names (str[] | None) : array of the method names to use for feature selection, if None, all registered methods will be applied

Example:

# Apply a feature selection method (PearsonCorrelation) to the data for the targets pm2_5_station_3 and no_station_3
fs.select(data, target_columns=['pm2_5_station_3', 'no_station_3'], method_names=['PearsonCorrelation'], number_of_target_to_keep=15)
def explore(self, used_target, used_method, **explore_kwargs)
221    def explore(self, used_target, used_method, **explore_kwargs):
222        """
223        Explore the results of the feature selection on an interactive map for a method and a target. Feature selection (`select()`) must be done before
224
225        Args:
226            used_target (str) : the name of the target that we wan't to see (must be referenced in `target_columns` when `select()`)
227            used_method (str) : the name of the method that we wan't to see (must be referenced in `method_names` when `select()`, or None used)
228
229        Example:
230        ```python
231        # Explore the results for the method PearsonCorrelation and the target pm2_5_station_3
232        fs.explore(used_target='pm2_5_station_3', used_method='PearsonCorrelation')
233        ```
234        """
235
236        # overide of the stylefunction, should be added soon as a Geopandas feature.
237        @wrapt.patch_function_wrapper(folium, "GeoJson")
238        def new_style(wrapped, instance, args, kwargs):
239            def style_fn(x):
240                return {
241                    "fillColor": x["properties"]["__folium_color"],
242                    "color": x["properties"]["__folium_color"],
243                    "radius": x["properties"]["nb_important_sensors"] + 1,
244                    "fillOpacity": 0.8,
245                }
246
247            if "_style_column" in str(kwargs["style_function"]):
248                kwargs["style_function"] = style_fn
249            return wrapped(*args, **kwargs)
250
251        stations_importance = self.get_station_importances(used_target, used_method)
252        map = stations_importance.explore(
253            column="max_importance_value",
254            legend=True,
255            marker_kwds=dict(radius=10, fill=True),
256            vmin=0,
257            vmax=1,
258            tiles="CartoDB dark_matter",
259            tooltip=[self._stations_name_column, self._stations_id_column, "sensors"],
260            popup=[self._stations_name_column, self._stations_id_column, "sensors"],
261            tooltip_kwds=dict(labels=True),
262            **explore_kwargs,
263        )
264        return map

Explore the results of the feature selection on an interactive map for a method and a target. Feature selection (select()) must be done before

Args
  • used_target (str) : the name of the target that we want to see (must be referenced in target_columns when select())
  • used_method (str) : the name of the method that we want to see (must be referenced in method_names when select(), or None used)

Example:

# Explore the results for the method PearsonCorrelation and the target pm2_5_station_3
fs.explore(used_target='pm2_5_station_3', used_method='PearsonCorrelation')
def plot(self, used_targets=None, used_methods=None)
266    def plot(self, used_targets=None, used_methods=None):
267        """
268        Plot the results of the feature selection. Feature selection (`select()`) must be done before.
269        (Cannot plot results for multiple methods and multiple targets at once)
270
271        Args:
272            used_targets (str[] | None) : the name of the targets that we wan't to see (must be referenced in `target_columns` when `select()`). If None, all last used_targets will be used
273            used_methods (str[] | None) : the name of the methods that we wan't to see (must be referenced in `method_names` when `select()`, or None used). If None, all last used_methods will be used
274
275        Example:
276        ```python
277        # Plot the results
278        fs.plot()
279
280        # Plot the results only for the target no_station_3
281        fs.plot(used_targets=['no_station_3'])
282        ```
283        """
284
285        if not used_targets:
286            used_targets = self._last_used_targets
287        if not used_methods:
288            used_methods = self._last_used_methods
289        if len(used_methods) > 1 and len(used_targets) > 1:
290            raise NotImplementedError(
291                "Cannot plot results for multiple methods and multiple targets at once yet..."
292            )
293
294        if len(used_methods) > 1:
295            fig, axs = plt.subplots(
296                len(used_methods),
297                2,
298                figsize=(30, 10 * len(used_methods)),
299                gridspec_kw={"width_ratios": [3, 1]},
300            )
301            fig.suptitle(
302                f"Feature importance visualization for the target {used_targets[0]}",
303                fontsize=36,
304            )
305            for i, method in enumerate(used_methods):
306                self._plot(
307                    used_targets[0],
308                    method,
309                    axs[i, 0],
310                    axs[i, 1],
311                    title=f"Stations importance for the method {method}",
312                )
313        elif len(used_targets) > 1:
314            fig, axs = plt.subplots(
315                len(used_targets),
316                2,
317                figsize=(30, 10 * len(used_targets)),
318                gridspec_kw={"width_ratios": [3, 1]},
319            )
320            fig.suptitle(
321                f"Feature importance visualization for the method {used_methods[0]}",
322                fontsize=36,
323            )
324            for i, target in enumerate(used_targets):
325                self._plot(
326                    target,
327                    used_methods[0],
328                    axs[i, 0],
329                    axs[i, 1],
330                    title=f"Stations importance for the target {target}",
331                )
332        else:
333            fig, (ax1, ax2) = plt.subplots(
334                1, 2, figsize=(30, 10), gridspec_kw={"width_ratios": [3, 1]}
335            )
336            fig.suptitle(
337                f"Feature importance visualization for the method {used_methods[0]} and the target {used_targets[0]}",
338                fontsize=36,
339            )
340            self._plot(used_targets[0], used_methods[0], ax1, ax2)

Plot the results of the feature selection. Feature selection (select()) must be done before. (Cannot plot results for multiple methods and multiple targets at once)

Args
  • used_targets (str[] | None) : the names of the targets that we want to see (must be referenced in target_columns when select()). If None, all last used_targets will be used
  • used_methods (str[] | None) : the names of the methods that we want to see (must be referenced in method_names when select(), or None used). If None, all last used_methods will be used

Example:

# Plot the results
fs.plot()

# Plot the results only for the target no_station_3
fs.plot(used_targets=['no_station_3'])
def get_feature_importances(self)
398    def get_feature_importances(self):
399        """
400        Get the features importance. Feature selection (`select()`) must be done before
401        """
402        method_names = self._last_used_methods
403
404        methods = (
405            self._feature_selection_method_objects
406            if not method_names
407            else [
408                method
409                for method in self._feature_selection_method_objects
410                if method.get_method_name() in method_names
411            ]
412        )
413        return dict(
414            zip(
415                [method.get_method_name() for method in methods],
416                [method.get_feature_importances() for method in methods],
417            )
418        )

Get the features importance. Feature selection (select()) must be done before

def get_selected_features(self)
420    def get_selected_features(self):
421        """
422        Get the selected features. Feature selection (`select()`) must be done before
423
424        Example:
425        ```python
426        # Get access to the selected features for Pearson Correlation method and the target pm2_5_station_3
427        fs.get_selected_features()['PearsonCorrelation']['pm2_5_station_3']
428        ```
429        """
430        method_names = self._last_used_methods
431
432        methods = (
433            self._feature_selection_method_objects
434            if not method_names
435            else [
436                method
437                for method in self._feature_selection_method_objects
438                if method.get_method_name() in method_names
439            ]
440        )
441        return dict(
442            zip(
443                [method.get_method_name() for method in methods],
444                [method.get_selected_features() for method in methods],
445            )
446        )

Get the selected features. Feature selection (select()) must be done before

Example:

# Get access to the selected features for Pearson Correlation method and the target pm2_5_station_3
fs.get_selected_features()['PearsonCorrelation']['pm2_5_station_3']
def get_station_importances(self, target, method)
448    def get_station_importances(self, target, method):
449        """
450        Generates a Stations importance for a target and a method. Feature selection (`select()`) must be done before
451
452        Args:
453            target (str) : target name (must be referenced in `target_columns` when `select()`)
454            method (str) : method name (must be referenced in `method_names` when `select()`, or None used)
455        """
456        stations_importance = self._stations_dataframe.copy()
457        stations_importance["nb_important_sensors"] = 0
458        stations_importance["max_importance_value"] = 0
459        stations_importance["sensors"] = ""
460        score = self.get_feature_importances()[method]
461        for index in score.index:
462            x = re.search(self._stations_get_id_from_sensor_regex, index)
463            if x:
464                station_id = int(x.group(1))
465                importance_value = score[target][index]
466                stations_importance.loc[
467                    stations_importance[self._stations_id_column] == station_id,
468                    "nb_important_sensors",
469                ] += 1
470                stations_importance.loc[
471                    stations_importance[self._stations_id_column] == station_id,
472                    "sensors",
473                ] += f"{index} : {'{:.2f}'.format(importance_value)}\n</br>"
474                stations_importance.loc[
475                    (stations_importance[self._stations_id_column] == station_id)
476                    & (stations_importance["max_importance_value"] < importance_value),
477                    "max_importance_value",
478                ] = importance_value
479
480        return stations_importance

Generates a Stations importance for a target and a method. Feature selection (select()) must be done before

Args
  • target (str) : target name (must be referenced in target_columns when select())
  • method (str) : method name (must be referenced in method_names when select(), or None used)
def get_available_methods(self)
482    def get_available_methods(self):
483        """
484        Get the name of all registered Feature Selection Methods
485        """
486
487        return [
488            method.get_method_name()
489            for method in self._feature_selection_method_objects
490        ]

Get the name of all registered Feature Selection Methods