src.FeatureSelection
# ************************************************************************************************************************* #
# UTC Header #
# :::::::::::::::::::: ::: ::: ::::::::::: :::::::: #
# FeatureSelection.py :::::::::::::::::::: :+: :+: :+: :+: :+: #
# ::::::::::::::+++#####+++ +:+ +:+ +:+ +:+ #
# By: branlyst and ismailkad < > ::+++##############+++ +:+ +:+ +:+ +:+ #
# +++##############+++:::: +#+ +:+ +#+ +#+ #
# +++##+++:::::::::::::: +#+ +:+ +#+ +#+ #
# :::::::::::::::::::: +#+ +#+ +#+ +#+ #
# :::::::::::::::::::: #+# #+# #+# #+# #+# #
# Update: 2022/06/16 19:02:58 by branlyst and ismai :::::::::::::::::::: ######## ### ######## .fr #
# #
# ************************************************************************************************************************* #

import geopandas
from matplotlib import pyplot as plt
import seaborn as sns
import re
import folium
import wrapt
import contextily as ctx

from src.FeatureSelectionMethods.PearsonCorrelation import PearsonCorrelation
from src.FeatureSelectionMethods.GrangerCausality import GrangerCausality


class FeatureSelection:
    """
    Apply feature selection methods to a dataframe and visualise the results.

    The module is specialised for geospatial time series: sensor columns are
    linked to registered stations so importances can be drawn on a map.

    Attributes:
        _stations_dataframe (GeoDataFrame) : contains the registered stations
        _stations_geometry_column (str) : column name of _stations_dataframe which contains the geometry of the station
        _stations_name_column (str | None) : column name of _stations_dataframe which contains the name of the station
        _stations_id_column (str) : column name of _stations_dataframe which contains the id of the station
        _stations_get_id_from_sensor_regex (str) : regex used to find the station id in the column names of the records dataframe
        _stations_crs (str) : current crs of the _stations_dataframe

        _feature_selection_method_objects (TemplateMethod[]) : registered feature selection method objects
        _last_used_methods (str[]) : method names used by the last `select()`
        _last_used_targets (str[]) : target names used by the last `select()`

    Example:
        ```python
        from src.FeatureSelection import FeatureSelection
        import pandas as pd

        data = pd.read_csv('./data/sample.csv', index_col=0)
        stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")

        fs = FeatureSelection()
        fs.register_stations(
            stations_references[stations_references['statut'] == 'ouvert'],  # Select open stations
            id_column="numero_station",  # Unique id column name
            get_id_from_sensor_regex="station_([0-9]+)",  # How to read that id from the data column names
            lon_column='longitude',
            lat_column='latitude',
            name_column='nom'
        )
        ```
    """

    _stations_dataframe = None
    _stations_geometry_column = "geometry"
    _stations_name_column = None
    _stations_id_column = None
    _stations_get_id_from_sensor_regex = None
    _stations_crs = None

    _feature_selection_method_objects = None
    _last_used_methods = None
    _last_used_targets = None

    def __init__(self):
        """Register the feature selection methods shipped with the module."""
        self._feature_selection_method_objects = [
            PearsonCorrelation(),
            GrangerCausality(),
        ]

    def register_stations(
        self,
        stations_dataframe,
        id_column,
        get_id_from_sensor_regex,
        lon_column="lon",
        lat_column="lat",
        geometry_column=None,
        name_column=None,
        crs="EPSG:4326",
    ):
        """
        Register the stations in order to have a visualisation.

        The given dataframe is copied, so the caller's object is left untouched.

        Args:
            stations_dataframe (DataFrame | GeoDataFrame) : DataFrame containing the stations
            id_column (str) : column name of the stations_dataframe which contains the unique id of the station
            get_id_from_sensor_regex (str) : regex whose first group captures the station id in the column names of the records dataframe
            lon_column (str) : column holding the longitude (used, then dropped, when geometry_column is not given)
            lat_column (str) : column holding the latitude (used, then dropped, when geometry_column is not given)
            geometry_column (str | None) : if provided, column name of the stations_dataframe which already contains the geometry
            name_column (str | None) : if provided, column name of the stations_dataframe which contains the name of the station
            crs (str) : crs used for the location

        Example:
            ```python
            stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")
            fs.register_stations(
                stations_references[stations_references['statut'] == 'ouvert'],
                id_column="numero_station",
                get_id_from_sensor_regex="station_([0-9]+)",
                lon_column='longitude',
                lat_column='latitude',
                name_column='nom'
            )
            ```
        """
        self._stations_name_column = name_column
        self._stations_id_column = id_column
        self._stations_get_id_from_sensor_regex = get_id_from_sensor_regex
        self._stations_crs = crs

        # Work on a copy: the argument is frequently a filtered slice of a
        # larger frame and must not receive the extra "geometry" column.
        stations = stations_dataframe.copy()

        if geometry_column:
            self._stations_geometry_column = geometry_column
        else:
            stations["geometry"] = geopandas.points_from_xy(
                stations[lon_column],
                stations[lat_column],
            )
            self._stations_geometry_column = "geometry"
            stations = stations.drop(columns=[lon_column, lat_column])

        self._stations_dataframe = geopandas.GeoDataFrame(
            stations,
            geometry=stations[self._stations_geometry_column],
            crs=crs,
        )

    def _require_stations(self):
        """Private method, return the registered stations or raise RuntimeError if there are none."""
        if self._stations_dataframe is None:
            raise RuntimeError(
                "No stations registered: call register_stations() first"
            )
        return self._stations_dataframe

    def _resolve_methods(self, method_names=None, strict=False):
        """
        Private method, get the registered method objects matching some names.

        Args:
            method_names (str[] | None) : names to keep, every registered method is returned when empty or None
            strict (bool) : when True, raise ValueError if a requested name is not registered

        Returns:
            The matching method objects, in registration order.
        """
        registered = self._feature_selection_method_objects
        if not method_names:
            return list(registered)
        if strict:
            known = {method.get_method_name() for method in registered}
            unknown = [name for name in method_names if name not in known]
            if unknown:
                raise ValueError(
                    f"Unknown feature selection method(s) {unknown}, "
                    f"available methods are {sorted(known)}"
                )
        return [
            method for method in registered if method.get_method_name() in method_names
        ]

    def explore_stations(self, **explore_kwargs):
        """
        Explore the different registered stations on an interactive map.

        Args:
            **explore_kwargs : extra keyword arguments forwarded to `GeoDataFrame.explore`

        Returns:
            The object returned by `GeoDataFrame.explore`.

        Raises:
            RuntimeError : if no stations were registered

        Example:
            ```python
            fs.explore_stations()
            ```
        """
        return self._require_stations().explore(
            column=self._stations_id_column,
            categorical=True,
            legend=True,
            popup=True,
            marker_kwds=dict(radius=5, fill=True),
            tiles="CartoDB dark_matter",
            tooltip_kwds=dict(labels=True),
            **explore_kwargs,
        )

    def _get_feature_selection_object_by_name(self, name):
        """
        Private method, get the feature selection method object by its name.

        Args:
            name (str) : Feature selection method registered name

        Returns:
            The matching method object, or None when the name is not registered.
        """
        return next(
            (
                method
                for method in self._feature_selection_method_objects
                if method.get_method_name() == name
            ),
            None,
        )

    def select(
        self, dataframe, target_columns, method_names=None, number_of_target_to_keep=1
    ):
        """
        Apply feature selection methods on target_columns for a given dataframe.

        Args:
            dataframe (DataFrame) : data used to apply the feature selection. 1 column by feature and 1 line by entry
            target_columns (str[]) : array of the target column names used to apply the feature selection
            method_names (str[] | None) : array of the method names to use, if None, all registered methods will be applied
            number_of_target_to_keep (int) : forwarded unchanged to each method's `select()`

        Raises:
            ValueError : if a name of method_names is not a registered method

        Example:
            ```python
            fs.select(data, target_columns=['pm2_5_station_3', 'no_station_3'], method_names=['PearsonCorrelation'], number_of_target_to_keep=15)
            ```
        """
        # Resolve (and validate) first so a typo cannot leave the object with
        # an empty "last used methods" list.
        methods = self._resolve_methods(method_names, strict=True)

        for method in methods:
            method.select(dataframe, target_columns, number_of_target_to_keep)

        self._last_used_methods = [method.get_method_name() for method in methods]
        self._last_used_targets = target_columns

    def explore(self, used_target, used_method, **explore_kwargs):
        """
        Explore the results of the feature selection on an interactive map for a method and a target.
        Feature selection (`select()`) must be done before.

        Args:
            used_target (str) : the name of the target to display (must be referenced in `target_columns` when `select()`)
            used_method (str) : the name of the method to display (must be referenced in `method_names` when `select()`, or None used)
            **explore_kwargs : extra keyword arguments forwarded to `GeoDataFrame.explore`

        Returns:
            The object returned by `GeoDataFrame.explore`.

        Example:
            ```python
            fs.explore(used_target='pm2_5_station_3', used_method='PearsonCorrelation')
            ```
        """
        stations_importance = self.get_station_importances(used_target, used_method)
        # The name column is optional; a None entry is not a valid column.
        displayed_columns = [
            column
            for column in (
                self._stations_name_column,
                self._stations_id_column,
                "sensors",
            )
            if column is not None
        ]

        # Kept so the module-level patch below can be undone afterwards.
        original_geojson = folium.GeoJson

        # Override of the style function (marker radius driven by the number of
        # important sensors) until GeoPandas offers it natively.
        @wrapt.patch_function_wrapper(folium, "GeoJson")
        def new_style(wrapped, instance, args, kwargs):
            def style_fn(x):
                return {
                    "fillColor": x["properties"]["__folium_color"],
                    "color": x["properties"]["__folium_color"],
                    "radius": x["properties"]["nb_important_sensors"] + 1,
                    "fillOpacity": 0.8,
                }

            # .get(): GeoJson may be built without a style_function keyword.
            if "_style_column" in str(kwargs.get("style_function")):
                kwargs["style_function"] = style_fn
            return wrapped(*args, **kwargs)

        try:
            return stations_importance.explore(
                column="max_importance_value",
                legend=True,
                marker_kwds=dict(radius=10, fill=True),
                vmin=0,
                vmax=1,
                tiles="CartoDB dark_matter",
                tooltip=displayed_columns,
                popup=displayed_columns,
                tooltip_kwds=dict(labels=True),
                **explore_kwargs,
            )
        finally:
            # Do not leave folium patched (and do not stack wrappers) once the
            # map has been built.
            folium.GeoJson = original_geojson

    def plot(self, used_targets=None, used_methods=None):
        """
        Plot the results of the feature selection. Feature selection (`select()`) must be done before.
        (Cannot plot results for multiple methods and multiple targets at once)

        Args:
            used_targets (str[] | None) : targets to display (must be referenced in `target_columns` when `select()`). If None, all last used targets will be used
            used_methods (str[] | None) : methods to display (must be referenced in `method_names` when `select()`, or None used). If None, all last used methods will be used

        Raises:
            RuntimeError : if there is nothing to plot because `select()` was not called
            NotImplementedError : if several methods and several targets are requested together

        Example:
            ```python
            # Plot the results
            fs.plot()

            # Plot the results only for the target no_station_3
            fs.plot(used_targets=['no_station_3'])
            ```
        """
        used_targets = used_targets or self._last_used_targets
        used_methods = used_methods or self._last_used_methods
        if not used_targets or not used_methods:
            raise RuntimeError(
                "No feature selection results to plot: call select() first"
            )
        if len(used_methods) > 1 and len(used_targets) > 1:
            raise NotImplementedError(
                "Cannot plot results for multiple methods and multiple targets at once yet..."
            )

        # One figure row per (target, method, title) triple.
        if len(used_methods) > 1:
            rows = [
                (used_targets[0], method, f"Stations importance for the method {method}")
                for method in used_methods
            ]
            suptitle = f"Feature importance visualization for the target {used_targets[0]}"
        elif len(used_targets) > 1:
            rows = [
                (target, used_methods[0], f"Stations importance for the target {target}")
                for target in used_targets
            ]
            suptitle = f"Feature importance visualization for the method {used_methods[0]}"
        else:
            rows = [(used_targets[0], used_methods[0], "Stations importance")]
            suptitle = f"Feature importance visualization for the method {used_methods[0]} and the target {used_targets[0]}"

        # squeeze=False keeps a 2-D axes array even for a single row.
        fig, axs = plt.subplots(
            len(rows),
            2,
            figsize=(30, 10 * len(rows)),
            gridspec_kw={"width_ratios": [3, 1]},
            squeeze=False,
        )
        fig.suptitle(suptitle, fontsize=36)
        for (target, method, title), (ax_map, ax_heatmap) in zip(rows, axs):
            self._plot(target, method, ax_map, ax_heatmap, title=title)

    def _plot(self, target, method, ax1, ax2, title="Stations importance"):
        """
        Private method. Plot the result for a target and a method. Feature selection (`select()`) must be done before.

        Args:
            target (str) : target name (must be referenced in `target_columns` when `select()`)
            method (str) : method name (must be referenced in `method_names` when `select()`, or None used)
            ax1 (Axes) : axes which will contain the map
            ax2 (Axes) : axes which will contain the heatmap
            title (str) : title of the map
        """
        # Spherical Mercator so that the contextily base map lines up.
        stations_importance = self.get_station_importances(target, method).to_crs(
            epsg=3857
        )
        stations_importance.plot(
            ax=ax1,
            column="max_importance_value",
            legend=True,
            markersize=(stations_importance["nb_important_sensors"] * 40 + 5),
            # Colormap given by name: plt.cm.get_cmap no longer exists in
            # recent Matplotlib releases.
            cmap="plasma",
            vmin=0,
            vmax=1,
        )
        ax1.set_xlabel("Longitude", fontsize=10)
        ax1.set_ylabel("Latitude", fontsize="medium")
        ax1.set_title(title)
        ctx.add_basemap(ax1, source=ctx.providers.CartoDB.Positron)
        for x, y, label, offset_y in zip(
            stations_importance.geometry.x,
            stations_importance.geometry.y,
            stations_importance[self._stations_id_column],
            stations_importance["nb_important_sensors"] + 5,
        ):
            ax1.annotate(
                label, xy=(x, y), xytext=(0, offset_y), textcoords="offset points"
            )

        features_importance = (
            self.get_feature_importances()[method]
            .dropna()
            .sort_values(by=[target], ascending=False)
        )
        sns.heatmap(
            features_importance[[target]],
            ax=ax2,
            annot=True,
            linewidths=0.5,
            cmap="plasma",
            cbar=False,
            vmin=0,
            vmax=1,
        )

    def get_feature_importances(self):
        """
        Get the features importance. Feature selection (`select()`) must be done before.

        Returns:
            dict mapping each last used method name to that method's `get_feature_importances()` result.
        """
        return {
            method.get_method_name(): method.get_feature_importances()
            for method in self._resolve_methods(self._last_used_methods)
        }

    def get_selected_features(self):
        """
        Get the selected features. Feature selection (`select()`) must be done before.

        Returns:
            dict mapping each last used method name to that method's `get_selected_features()` result.

        Example:
            ```python
            fs.get_selected_features()['PearsonCorrelation']['pm2_5_station_3']
            ```
        """
        return {
            method.get_method_name(): method.get_selected_features()
            for method in self._resolve_methods(self._last_used_methods)
        }

    def get_station_importances(self, target, method):
        """
        Generate the stations importance for a target and a method. Feature selection (`select()`) must be done before.

        Args:
            target (str) : target name (must be referenced in `target_columns` when `select()`)
            method (str) : method name (must be referenced in `method_names` when `select()`, or None used)

        Returns:
            A copy of the registered stations with three extra columns:
            `nb_important_sensors` (scored sensors attached to the station),
            `max_importance_value` (highest score among them, 0.0 if none) and
            `sensors` (HTML listing of the sensors and their scores).

        Raises:
            RuntimeError : if no stations were registered
        """
        stations_importance = self._require_stations().copy()
        stations_importance["nb_important_sensors"] = 0
        # Float from the start: scores are floats, and writing them into an
        # integer column is an incompatible-dtype assignment for pandas.
        stations_importance["max_importance_value"] = 0.0
        stations_importance["sensors"] = ""

        score = self.get_feature_importances()[method]
        id_pattern = re.compile(self._stations_get_id_from_sensor_regex)
        station_ids = stations_importance[self._stations_id_column]

        for index in score.index:
            match = id_pattern.search(index)
            if not match:
                continue
            # NOTE(review): station ids are compared as integers; confirm the
            # id column is numeric for every stations file in use.
            is_station = station_ids == int(match.group(1))
            importance_value = score[target][index]

            stations_importance.loc[is_station, "nb_important_sensors"] += 1
            stations_importance.loc[
                is_station, "sensors"
            ] += f"{index} : {importance_value:.2f}\n</br>"
            stations_importance.loc[
                is_station
                & (stations_importance["max_importance_value"] < importance_value),
                "max_importance_value",
            ] = importance_value

        return stations_importance

    def get_available_methods(self):
        """
        Get the name of all registered Feature Selection Methods.

        Returns:
            list of method names, in registration order.
        """
        return [
            method.get_method_name()
            for method in self._feature_selection_method_objects
        ]
class FeatureSelection:
    """
    Apply feature selection methods to a dataframe and visualise the results.

    The module is specialised for geospatial time series: sensor columns are
    linked to registered stations so importances can be drawn on a map.

    Attributes:
        _stations_dataframe (GeoDataFrame) : contains the registered stations
        _stations_geometry_column (str) : column name of _stations_dataframe which contains the geometry of the station
        _stations_name_column (str | None) : column name of _stations_dataframe which contains the name of the station
        _stations_id_column (str) : column name of _stations_dataframe which contains the id of the station
        _stations_get_id_from_sensor_regex (str) : regex used to find the station id in the column names of the records dataframe
        _stations_crs (str) : current crs of the _stations_dataframe

        _feature_selection_method_objects (TemplateMethod[]) : registered feature selection method objects
        _last_used_methods (str[]) : method names used by the last `select()`
        _last_used_targets (str[]) : target names used by the last `select()`

    Example:
        ```python
        from src.FeatureSelection import FeatureSelection
        import pandas as pd

        data = pd.read_csv('./data/sample.csv', index_col=0)
        stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")

        fs = FeatureSelection()
        fs.register_stations(
            stations_references[stations_references['statut'] == 'ouvert'],  # Select open stations
            id_column="numero_station",  # Unique id column name
            get_id_from_sensor_regex="station_([0-9]+)",  # How to read that id from the data column names
            lon_column='longitude',
            lat_column='latitude',
            name_column='nom'
        )
        ```
    """

    _stations_dataframe = None
    _stations_geometry_column = "geometry"
    _stations_name_column = None
    _stations_id_column = None
    _stations_get_id_from_sensor_regex = None
    _stations_crs = None

    _feature_selection_method_objects = None
    _last_used_methods = None
    _last_used_targets = None

    def __init__(self):
        """Register the feature selection methods shipped with the module."""
        self._feature_selection_method_objects = [
            PearsonCorrelation(),
            GrangerCausality(),
        ]

    def register_stations(
        self,
        stations_dataframe,
        id_column,
        get_id_from_sensor_regex,
        lon_column="lon",
        lat_column="lat",
        geometry_column=None,
        name_column=None,
        crs="EPSG:4326",
    ):
        """
        Register the stations in order to have a visualisation.

        The given dataframe is copied, so the caller's object is left untouched.

        Args:
            stations_dataframe (DataFrame | GeoDataFrame) : DataFrame containing the stations
            id_column (str) : column name of the stations_dataframe which contains the unique id of the station
            get_id_from_sensor_regex (str) : regex whose first group captures the station id in the column names of the records dataframe
            lon_column (str) : column holding the longitude (used, then dropped, when geometry_column is not given)
            lat_column (str) : column holding the latitude (used, then dropped, when geometry_column is not given)
            geometry_column (str | None) : if provided, column name of the stations_dataframe which already contains the geometry
            name_column (str | None) : if provided, column name of the stations_dataframe which contains the name of the station
            crs (str) : crs used for the location

        Example:
            ```python
            stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")
            fs.register_stations(
                stations_references[stations_references['statut'] == 'ouvert'],
                id_column="numero_station",
                get_id_from_sensor_regex="station_([0-9]+)",
                lon_column='longitude',
                lat_column='latitude',
                name_column='nom'
            )
            ```
        """
        self._stations_name_column = name_column
        self._stations_id_column = id_column
        self._stations_get_id_from_sensor_regex = get_id_from_sensor_regex
        self._stations_crs = crs

        # Work on a copy: the argument is frequently a filtered slice of a
        # larger frame and must not receive the extra "geometry" column.
        stations = stations_dataframe.copy()

        if geometry_column:
            self._stations_geometry_column = geometry_column
        else:
            stations["geometry"] = geopandas.points_from_xy(
                stations[lon_column],
                stations[lat_column],
            )
            self._stations_geometry_column = "geometry"
            stations = stations.drop(columns=[lon_column, lat_column])

        self._stations_dataframe = geopandas.GeoDataFrame(
            stations,
            geometry=stations[self._stations_geometry_column],
            crs=crs,
        )

    def _require_stations(self):
        """Private method, return the registered stations or raise RuntimeError if there are none."""
        if self._stations_dataframe is None:
            raise RuntimeError(
                "No stations registered: call register_stations() first"
            )
        return self._stations_dataframe

    def _resolve_methods(self, method_names=None, strict=False):
        """
        Private method, get the registered method objects matching some names.

        Args:
            method_names (str[] | None) : names to keep, every registered method is returned when empty or None
            strict (bool) : when True, raise ValueError if a requested name is not registered

        Returns:
            The matching method objects, in registration order.
        """
        registered = self._feature_selection_method_objects
        if not method_names:
            return list(registered)
        if strict:
            known = {method.get_method_name() for method in registered}
            unknown = [name for name in method_names if name not in known]
            if unknown:
                raise ValueError(
                    f"Unknown feature selection method(s) {unknown}, "
                    f"available methods are {sorted(known)}"
                )
        return [
            method for method in registered if method.get_method_name() in method_names
        ]

    def explore_stations(self, **explore_kwargs):
        """
        Explore the different registered stations on an interactive map.

        Args:
            **explore_kwargs : extra keyword arguments forwarded to `GeoDataFrame.explore`

        Returns:
            The object returned by `GeoDataFrame.explore`.

        Raises:
            RuntimeError : if no stations were registered

        Example:
            ```python
            fs.explore_stations()
            ```
        """
        return self._require_stations().explore(
            column=self._stations_id_column,
            categorical=True,
            legend=True,
            popup=True,
            marker_kwds=dict(radius=5, fill=True),
            tiles="CartoDB dark_matter",
            tooltip_kwds=dict(labels=True),
            **explore_kwargs,
        )

    def _get_feature_selection_object_by_name(self, name):
        """
        Private method, get the feature selection method object by its name.

        Args:
            name (str) : Feature selection method registered name

        Returns:
            The matching method object, or None when the name is not registered.
        """
        return next(
            (
                method
                for method in self._feature_selection_method_objects
                if method.get_method_name() == name
            ),
            None,
        )

    def select(
        self, dataframe, target_columns, method_names=None, number_of_target_to_keep=1
    ):
        """
        Apply feature selection methods on target_columns for a given dataframe.

        Args:
            dataframe (DataFrame) : data used to apply the feature selection. 1 column by feature and 1 line by entry
            target_columns (str[]) : array of the target column names used to apply the feature selection
            method_names (str[] | None) : array of the method names to use, if None, all registered methods will be applied
            number_of_target_to_keep (int) : forwarded unchanged to each method's `select()`

        Raises:
            ValueError : if a name of method_names is not a registered method

        Example:
            ```python
            fs.select(data, target_columns=['pm2_5_station_3', 'no_station_3'], method_names=['PearsonCorrelation'], number_of_target_to_keep=15)
            ```
        """
        # Resolve (and validate) first so a typo cannot leave the object with
        # an empty "last used methods" list.
        methods = self._resolve_methods(method_names, strict=True)

        for method in methods:
            method.select(dataframe, target_columns, number_of_target_to_keep)

        self._last_used_methods = [method.get_method_name() for method in methods]
        self._last_used_targets = target_columns

    def explore(self, used_target, used_method, **explore_kwargs):
        """
        Explore the results of the feature selection on an interactive map for a method and a target.
        Feature selection (`select()`) must be done before.

        Args:
            used_target (str) : the name of the target to display (must be referenced in `target_columns` when `select()`)
            used_method (str) : the name of the method to display (must be referenced in `method_names` when `select()`, or None used)
            **explore_kwargs : extra keyword arguments forwarded to `GeoDataFrame.explore`

        Returns:
            The object returned by `GeoDataFrame.explore`.

        Example:
            ```python
            fs.explore(used_target='pm2_5_station_3', used_method='PearsonCorrelation')
            ```
        """
        stations_importance = self.get_station_importances(used_target, used_method)
        # The name column is optional; a None entry is not a valid column.
        displayed_columns = [
            column
            for column in (
                self._stations_name_column,
                self._stations_id_column,
                "sensors",
            )
            if column is not None
        ]

        # Kept so the module-level patch below can be undone afterwards.
        original_geojson = folium.GeoJson

        # Override of the style function (marker radius driven by the number of
        # important sensors) until GeoPandas offers it natively.
        @wrapt.patch_function_wrapper(folium, "GeoJson")
        def new_style(wrapped, instance, args, kwargs):
            def style_fn(x):
                return {
                    "fillColor": x["properties"]["__folium_color"],
                    "color": x["properties"]["__folium_color"],
                    "radius": x["properties"]["nb_important_sensors"] + 1,
                    "fillOpacity": 0.8,
                }

            # .get(): GeoJson may be built without a style_function keyword.
            if "_style_column" in str(kwargs.get("style_function")):
                kwargs["style_function"] = style_fn
            return wrapped(*args, **kwargs)

        try:
            return stations_importance.explore(
                column="max_importance_value",
                legend=True,
                marker_kwds=dict(radius=10, fill=True),
                vmin=0,
                vmax=1,
                tiles="CartoDB dark_matter",
                tooltip=displayed_columns,
                popup=displayed_columns,
                tooltip_kwds=dict(labels=True),
                **explore_kwargs,
            )
        finally:
            # Do not leave folium patched (and do not stack wrappers) once the
            # map has been built.
            folium.GeoJson = original_geojson

    def plot(self, used_targets=None, used_methods=None):
        """
        Plot the results of the feature selection. Feature selection (`select()`) must be done before.
        (Cannot plot results for multiple methods and multiple targets at once)

        Args:
            used_targets (str[] | None) : targets to display (must be referenced in `target_columns` when `select()`). If None, all last used targets will be used
            used_methods (str[] | None) : methods to display (must be referenced in `method_names` when `select()`, or None used). If None, all last used methods will be used

        Raises:
            RuntimeError : if there is nothing to plot because `select()` was not called
            NotImplementedError : if several methods and several targets are requested together

        Example:
            ```python
            # Plot the results
            fs.plot()

            # Plot the results only for the target no_station_3
            fs.plot(used_targets=['no_station_3'])
            ```
        """
        used_targets = used_targets or self._last_used_targets
        used_methods = used_methods or self._last_used_methods
        if not used_targets or not used_methods:
            raise RuntimeError(
                "No feature selection results to plot: call select() first"
            )
        if len(used_methods) > 1 and len(used_targets) > 1:
            raise NotImplementedError(
                "Cannot plot results for multiple methods and multiple targets at once yet..."
            )

        # One figure row per (target, method, title) triple.
        if len(used_methods) > 1:
            rows = [
                (used_targets[0], method, f"Stations importance for the method {method}")
                for method in used_methods
            ]
            suptitle = f"Feature importance visualization for the target {used_targets[0]}"
        elif len(used_targets) > 1:
            rows = [
                (target, used_methods[0], f"Stations importance for the target {target}")
                for target in used_targets
            ]
            suptitle = f"Feature importance visualization for the method {used_methods[0]}"
        else:
            rows = [(used_targets[0], used_methods[0], "Stations importance")]
            suptitle = f"Feature importance visualization for the method {used_methods[0]} and the target {used_targets[0]}"

        # squeeze=False keeps a 2-D axes array even for a single row.
        fig, axs = plt.subplots(
            len(rows),
            2,
            figsize=(30, 10 * len(rows)),
            gridspec_kw={"width_ratios": [3, 1]},
            squeeze=False,
        )
        fig.suptitle(suptitle, fontsize=36)
        for (target, method, title), (ax_map, ax_heatmap) in zip(rows, axs):
            self._plot(target, method, ax_map, ax_heatmap, title=title)

    def _plot(self, target, method, ax1, ax2, title="Stations importance"):
        """
        Private method. Plot the result for a target and a method. Feature selection (`select()`) must be done before.

        Args:
            target (str) : target name (must be referenced in `target_columns` when `select()`)
            method (str) : method name (must be referenced in `method_names` when `select()`, or None used)
            ax1 (Axes) : axes which will contain the map
            ax2 (Axes) : axes which will contain the heatmap
            title (str) : title of the map
        """
        # Spherical Mercator so that the contextily base map lines up.
        stations_importance = self.get_station_importances(target, method).to_crs(
            epsg=3857
        )
        stations_importance.plot(
            ax=ax1,
            column="max_importance_value",
            legend=True,
            markersize=(stations_importance["nb_important_sensors"] * 40 + 5),
            # Colormap given by name: plt.cm.get_cmap no longer exists in
            # recent Matplotlib releases.
            cmap="plasma",
            vmin=0,
            vmax=1,
        )
        ax1.set_xlabel("Longitude", fontsize=10)
        ax1.set_ylabel("Latitude", fontsize="medium")
        ax1.set_title(title)
        ctx.add_basemap(ax1, source=ctx.providers.CartoDB.Positron)
        for x, y, label, offset_y in zip(
            stations_importance.geometry.x,
            stations_importance.geometry.y,
            stations_importance[self._stations_id_column],
            stations_importance["nb_important_sensors"] + 5,
        ):
            ax1.annotate(
                label, xy=(x, y), xytext=(0, offset_y), textcoords="offset points"
            )

        features_importance = (
            self.get_feature_importances()[method]
            .dropna()
            .sort_values(by=[target], ascending=False)
        )
        sns.heatmap(
            features_importance[[target]],
            ax=ax2,
            annot=True,
            linewidths=0.5,
            cmap="plasma",
            cbar=False,
            vmin=0,
            vmax=1,
        )

    def get_feature_importances(self):
        """
        Get the features importance. Feature selection (`select()`) must be done before.

        Returns:
            dict mapping each last used method name to that method's `get_feature_importances()` result.
        """
        return {
            method.get_method_name(): method.get_feature_importances()
            for method in self._resolve_methods(self._last_used_methods)
        }

    def get_selected_features(self):
        """
        Get the selected features. Feature selection (`select()`) must be done before.

        Returns:
            dict mapping each last used method name to that method's `get_selected_features()` result.

        Example:
            ```python
            fs.get_selected_features()['PearsonCorrelation']['pm2_5_station_3']
            ```
        """
        return {
            method.get_method_name(): method.get_selected_features()
            for method in self._resolve_methods(self._last_used_methods)
        }

    def get_station_importances(self, target, method):
        """
        Generate the stations importance for a target and a method. Feature selection (`select()`) must be done before.

        Args:
            target (str) : target name (must be referenced in `target_columns` when `select()`)
            method (str) : method name (must be referenced in `method_names` when `select()`, or None used)

        Returns:
            A copy of the registered stations with three extra columns:
            `nb_important_sensors` (scored sensors attached to the station),
            `max_importance_value` (highest score among them, 0.0 if none) and
            `sensors` (HTML listing of the sensors and their scores).

        Raises:
            RuntimeError : if no stations were registered
        """
        stations_importance = self._require_stations().copy()
        stations_importance["nb_important_sensors"] = 0
        # Float from the start: scores are floats, and writing them into an
        # integer column is an incompatible-dtype assignment for pandas.
        stations_importance["max_importance_value"] = 0.0
        stations_importance["sensors"] = ""

        score = self.get_feature_importances()[method]
        id_pattern = re.compile(self._stations_get_id_from_sensor_regex)
        station_ids = stations_importance[self._stations_id_column]

        for index in score.index:
            match = id_pattern.search(index)
            if not match:
                continue
            # NOTE(review): station ids are compared as integers; confirm the
            # id column is numeric for every stations file in use.
            is_station = station_ids == int(match.group(1))
            importance_value = score[target][index]

            stations_importance.loc[is_station, "nb_important_sensors"] += 1
            stations_importance.loc[
                is_station, "sensors"
            ] += f"{index} : {importance_value:.2f}\n</br>"
            stations_importance.loc[
                is_station
                & (stations_importance["max_importance_value"] < importance_value),
                "max_importance_value",
            ] = importance_value

        return stations_importance

    def get_available_methods(self):
        """
        Get the name of all registered Feature Selection Methods.

        Returns:
            list of method names, in registration order.
        """
        return [
            method.get_method_name()
            for method in self._feature_selection_method_objects
        ]
Feature Selection is a module which lets you apply feature selection methods to a dataframe. It is specialised for geospatial time series and provides visualisation functions.
Attributes
- _stations_dataframe (GeoDataFrame) : contains the registered stations
- _stations_geometry_column (str) : indicates the column name of _stations_dataframe which contains the geometry of the station
- _stations_name_column (str | None) : indicates the column name of _stations_dataframe which contains the name of the station
- _stations_id_column (str) : indicates the column name of _stations_dataframe which contains the id of the station
- _stations_get_id_from_sensor_regex (str) : regex used to find the station id in the dataframe containing all records used for feature selection
- _stations_crs (str) : current crs of the _stations_dataframe
- _feature_selection_method_objects (TemplateMethod[]) : Array of TemplateMethod implemented objects
- _last_used_methods (str[]) : last used method names
- _last_used_targets (str[]) : last used targets names
Example:
# Import module
from src.FeatureSelection import FeatureSelection
import pandas as pd
# Import data sample
data = pd.read_csv('./data/sample.csv', index_col=0)
# Import stations references
stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")
# Instantiation of FeatureSelection
fs = FeatureSelection()
# Registering the stations
fs.register_stations(
stations_references[stations_references['statut'] == 'ouvert'], # Select open stations
id_column="numero_station", # Indicate the unique id column name
get_id_from_sensor_regex="station_([0-9]+)", # Indicate how to get this unique id from the data column's names
lon_column='longitude', # Indicate longitude column
lat_column='latitude', # Indicate latitude column
name_column='nom' # Indicate name column
)
def register_stations(
    self,
    stations_dataframe,
    id_column,
    get_id_from_sensor_regex,
    lon_column="lon",
    lat_column="lat",
    geometry_column=None,
    name_column=None,
    crs="EPSG:4326",
):
    """
    Register the stations so that they can be visualised.

    The stations are stored as a GeoDataFrame built from a private copy of
    `stations_dataframe`; the dataframe passed by the caller is left untouched.

    Args:
        stations_dataframe (DataFrame | GeoDataFrame) : DataFrame containing the stations
        id_column (str) : column name of the stations_dataframe which contains the unique id of the station
        get_id_from_sensor_regex (str) : regex used to find the station id in the dataframe containing all records used for feature selection
        lon_column (str) : column name of the stations_dataframe which contains the longitude of the station (only read when `geometry_column` is not provided)
        lat_column (str) : column name of the stations_dataframe which contains the latitude of the station (only read when `geometry_column` is not provided)
        geometry_column (str | None) : if provided, column name of the stations_dataframe which contains the geometry
        name_column (str | None) : if provided, column name of the stations_dataframe which contains the name of the station
        crs (str) : crs used for the location

    Example:
    ```python
    # Import stations references
    stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")

    # Registering the stations
    fs.register_stations(
        stations_references[stations_references['statut'] == 'ouvert'], # Select open stations
        id_column="numero_station", # Indicate the unique id column name
        get_id_from_sensor_regex="station_([0-9]+)", # Indicate how to get this unique id from the data column's names
        lon_column='longitude', # Indicate longitude column
        lat_column='latitude', # Indicate latitude column
        name_column='nom' # Indicate name column
    )
    ```
    """

    # Work on a copy: callers usually pass a filtered slice (see the example),
    # and writing the geometry column into it would both mutate their data
    # and trigger pandas' SettingWithCopyWarning.
    stations = stations_dataframe.copy()

    self._stations_name_column = name_column
    self._stations_id_column = id_column
    self._stations_get_id_from_sensor_regex = get_id_from_sensor_regex
    self._stations_crs = crs

    if geometry_column:
        self._stations_geometry_column = geometry_column
    else:
        # No geometry supplied: build points from the lon/lat columns, which
        # are then dropped because the geometry column supersedes them.
        stations.loc[:, "geometry"] = geopandas.points_from_xy(
            stations.loc[:, lon_column],
            stations.loc[:, lat_column],
        )
        self._stations_geometry_column = "geometry"
        stations = stations.drop(columns=[lon_column, lat_column])

    self._stations_dataframe = geopandas.GeoDataFrame(
        stations,
        geometry=stations[self._stations_geometry_column],
        crs=crs,
    )
Register the stations in order to have a visualisation
Args
- stations_dataframe (DataFrame | GeoDataFrame) : DataFrame containing the stations
- id_column (str) : column name of the stations_dataframe which contains the unique id of the station
- get_id_from_sensor_regex (str) : regex used to find the station id in the dataframe containing all records used for feature selection
- lon_column (str) : column name of the stations_dataframe which contains the longitude of the station
- lat_column (str) : column name of the stations_dataframe which contains the latitude of the station
- geometry_column (str | None) : if provided and dataframe is a GeoDataFrame, column name of the stations_dataframe which contains the geometry
- name_column (str | None) : if provided, column name of the stations_dataframe which contains the name of the station
- crs (str) : crs used for the location
Example:
# Import stations references
stations_references = pd.read_csv("./data/liste-des-stations-rsqa.csv")
# Registering the stations
fs.register_stations(
stations_references[stations_references['statut'] == 'ouvert'], # Select open stations
id_column="numero_station", # Indicate the unique id column name
get_id_from_sensor_regex="station_([0-9]+)", # Indicate how to get this unique id from the data column's names
lon_column='longitude', # Indicate longitude column
lat_column='latitude', # Indicate latitude column
name_column='nom' # Indicate name column
)
def explore_stations(self, **explore_kwargs):
    """
    Display the registered stations on an interactive map.

    Args:
        **explore_kwargs : extra keyword arguments forwarded as-is to the
            `explore()` call made on the registered stations dataframe

    Returns:
        the object returned by `explore()` on the registered stations dataframe

    Example:
    ```python
    # Explore the stations
    fs.explore_stations()
    ```
    """
    marker_options = dict(radius=5, fill=True)
    tooltip_options = dict(labels=True)
    # Stations are coloured by their id, treated as a categorical value.
    return self._stations_dataframe.explore(
        column=self._stations_id_column,
        categorical=True,
        legend=True,
        popup=True,
        marker_kwds=marker_options,
        tiles="CartoDB dark_matter",
        tooltip_kwds=tooltip_options,
        **explore_kwargs,
    )
Explore the different registered stations on an interactive map
Example:
# Explore the stations
fs.explore_stations()
def select(
    self, dataframe, target_columns, method_names=None, number_of_target_to_keep=1
):
    """
    Run the feature selection methods on `target_columns` for a given dataframe.

    The names of the methods that were run and the targets are remembered on
    the instance as `_last_used_methods` and `_last_used_targets`.

    Args:
        dataframe (DataFrame) : dataframe which contains the data used to apply the feature selection. One column per feature and one row per entry
        target_columns (str[]) : array of the target column names used to apply the feature selection
        method_names (str[] | None) : array of the method names to use for feature selection, if None (or empty), all registered methods will be applied
        number_of_target_to_keep (int) : passed unchanged as the third argument of each method's `select()`; defaults to 1

    Example:
    ```python
    # Apply a feature selection method (PearsonCorrelation) to the data for the targets pm2_5_station_3 and no_station_3
    fs.select(data, target_columns=['pm2_5_station_3', 'no_station_3'], method_names=['PearsonCorrelation'], number_of_target_to_keep=15)
    ```
    """
    registered = self._feature_selection_method_objects
    if method_names:
        # Keep registration order; names that match no registered method are ignored.
        chosen = [
            candidate
            for candidate in registered
            if candidate.get_method_name() in method_names
        ]
    else:
        chosen = registered

    for selector in chosen:
        selector.select(dataframe, target_columns, number_of_target_to_keep)

    self._last_used_methods = [selector.get_method_name() for selector in chosen]
    self._last_used_targets = target_columns
Apply feature selection methods on target_columns for a given dataframe
Args
- dataframe (DataFrame) : dataframe which contains the data used to apply the feature selection. One column per feature and one row per entry
- target_columns (str[]) : array of the target column names used to apply the feature selection
- method_names (str[] | None) : array of the method names to use for feature selection, if None, all registered methods will be applied
Example:
# Apply a feature selection method (PearsonCorrelation) to the data for the targets pm2_5_station_3 and no_station_3
fs.select(data, target_columns=['pm2_5_station_3', 'no_station_3'], method_names=['PearsonCorrelation'], number_of_target_to_keep=15)
def explore(self, used_target, used_method, **explore_kwargs):
    """
    Explore the results of the feature selection on an interactive map for a method and a target.
    Feature selection (`select()`) must be done before.

    Each station is coloured by its `max_importance_value` and its marker
    radius grows with its `nb_important_sensors`.

    Args:
        used_target (str) : the name of the target that we want to see (must be referenced in `target_columns` when `select()`)
        used_method (str) : the name of the method that we want to see (must be referenced in `method_names` when `select()`, or None used)
        **explore_kwargs : extra keyword arguments forwarded as-is to the `explore()` call made on the station importances

    Returns:
        the object returned by `explore()` on the station importances

    Example:
    ```python
    # Explore the results for the method PearsonCorrelation and the target pm2_5_station_3
    fs.explore(used_target='pm2_5_station_3', used_method='PearsonCorrelation')
    ```
    """

    # Override of the style function, should be added soon as a Geopandas feature.
    # NOTE(review): the wrapper is installed on the folium module itself, so it
    # looks like it outlives this call and stacks on repeated calls - confirm.
    @wrapt.patch_function_wrapper(folium, "GeoJson")
    def new_style(wrapped, instance, args, kwargs):
        def style_fn(x):
            return {
                "fillColor": x["properties"]["__folium_color"],
                "color": x["properties"]["__folium_color"],
                "radius": x["properties"]["nb_important_sensors"] + 1,
                "fillOpacity": 0.8,
            }

        # GeoJson can be called without a `style_function` keyword; indexing
        # kwargs directly would then raise KeyError for that unrelated caller.
        if "_style_column" in str(kwargs.get("style_function")):
            kwargs["style_function"] = style_fn
        return wrapped(*args, **kwargs)

    stations_importance = self.get_station_importances(used_target, used_method)

    # The name column is optional at registration time; leave it out of the
    # displayed fields instead of passing None as a column name.
    displayed_fields = [
        field
        for field in (self._stations_name_column, self._stations_id_column, "sensors")
        if field is not None
    ]
    return stations_importance.explore(
        column="max_importance_value",
        legend=True,
        marker_kwds=dict(radius=10, fill=True),
        vmin=0,
        vmax=1,
        tiles="CartoDB dark_matter",
        tooltip=displayed_fields,
        popup=list(displayed_fields),
        tooltip_kwds=dict(labels=True),
        **explore_kwargs,
    )
Explore the results of the feature selection on an interactive map for a method and a target. Feature selection (`select()`) must be done before.
Args
- used_target (str) : the name of the target that we want to see (must be referenced in `target_columns` when calling `select()`)
- used_method (str) : the name of the method that we want to see (must be referenced in `method_names` when calling `select()`, or `method_names` left as None)
Example:
# Explore the results for the method PearsonCorrelation and the target pm2_5_station_3
fs.explore(used_target='pm2_5_station_3', used_method='PearsonCorrelation')
def plot(self, used_targets=None, used_methods=None):
    """
    Plot the results of the feature selection. Feature selection (`select()`) must be done before.
    (Cannot plot results for multiple methods and multiple targets at once)

    One row of two axes is drawn per method (several methods, one target),
    per target (several targets, one method), or a single row otherwise.

    Args:
        used_targets (str[] | None) : the name of the targets that we want to see (must be referenced in `target_columns` when `select()`). If None, all last used_targets will be used
        used_methods (str[] | None) : the name of the methods that we want to see (must be referenced in `method_names` when `select()`, or None used). If None, all last used_methods will be used

    Raises:
        NotImplementedError : when more than one method and more than one target are requested together

    Example:
    ```python
    # Plot the results
    fs.plot()

    # Plot the results only for the target no_station_3
    fs.plot(used_targets=['no_station_3'])
    ```
    """

    targets = used_targets if used_targets else self._last_used_targets
    methods = used_methods if used_methods else self._last_used_methods

    several_methods = len(methods) > 1
    several_targets = len(targets) > 1
    if several_methods and several_targets:
        raise NotImplementedError(
            "Cannot plot results for multiple methods and multiple targets at once yet..."
        )

    if several_methods:
        # One row per method, all for the single target.
        row_count = len(methods)
        fig, axs = plt.subplots(
            row_count,
            2,
            figsize=(30, 10 * row_count),
            gridspec_kw={"width_ratios": [3, 1]},
        )
        fig.suptitle(
            f"Feature importance visualization for the target {targets[0]}",
            fontsize=36,
        )
        for row, method in enumerate(methods):
            self._plot(
                targets[0],
                method,
                axs[row, 0],
                axs[row, 1],
                title=f"Stations importance for the method {method}",
            )
        return

    if several_targets:
        # One row per target, all for the single method.
        row_count = len(targets)
        fig, axs = plt.subplots(
            row_count,
            2,
            figsize=(30, 10 * row_count),
            gridspec_kw={"width_ratios": [3, 1]},
        )
        fig.suptitle(
            f"Feature importance visualization for the method {methods[0]}",
            fontsize=36,
        )
        for row, target in enumerate(targets):
            self._plot(
                target,
                methods[0],
                axs[row, 0],
                axs[row, 1],
                title=f"Stations importance for the target {target}",
            )
        return

    # Exactly one method and one target: a single row, no per-row title.
    fig, (left_ax, right_ax) = plt.subplots(
        1, 2, figsize=(30, 10), gridspec_kw={"width_ratios": [3, 1]}
    )
    fig.suptitle(
        f"Feature importance visualization for the method {methods[0]} and the target {targets[0]}",
        fontsize=36,
    )
    self._plot(targets[0], methods[0], left_ax, right_ax)
Plot the results of the feature selection. Feature selection (`select()`) must be done before.
(Cannot plot results for multiple methods and multiple targets at once)
Args
- used_targets (str[] | None) : the name of the targets that we want to see (must be referenced in `target_columns` when calling `select()`). If None, all last used_targets will be used
- used_methods (str[] | None) : the name of the methods that we want to see (must be referenced in `method_names` when calling `select()`, or `method_names` left as None). If None, all last used_methods will be used
Example:
# Plot the results
fs.plot()
# Plot the results only for the target no_station_3
fs.plot(used_targets=['no_station_3'])
def get_feature_importances(self):
    """
    Return the feature importances of the last used methods.
    Feature selection (`select()`) must be done before.

    Returns:
        dict mapping each method name to the value returned by that method's
        `get_feature_importances()`. When no last used method is recorded,
        every registered method is included.
    """
    wanted_names = self._last_used_methods
    registered = self._feature_selection_method_objects

    if wanted_names:
        active = [
            candidate
            for candidate in registered
            if candidate.get_method_name() in wanted_names
        ]
    else:
        active = registered

    names = [entry.get_method_name() for entry in active]
    importances = [entry.get_feature_importances() for entry in active]
    return dict(zip(names, importances))
Get the feature importances. Feature selection (`select()`) must be done before
def get_selected_features(self):
    """
    Return the features selected by the last used methods.
    Feature selection (`select()`) must be done before.

    Returns:
        dict mapping each method name to the value returned by that method's
        `get_selected_features()`. When no last used method is recorded,
        every registered method is included.

    Example:
    ```python
    # Get access to the selected features for Pearson Correlation method and the target pm2_5_station_3
    fs.get_selected_features()['PearsonCorrelation']['pm2_5_station_3']
    ```
    """
    wanted_names = self._last_used_methods
    registered = self._feature_selection_method_objects

    if wanted_names:
        active = [
            candidate
            for candidate in registered
            if candidate.get_method_name() in wanted_names
        ]
    else:
        active = registered

    names = [entry.get_method_name() for entry in active]
    selections = [entry.get_selected_features() for entry in active]
    return dict(zip(names, selections))
Get the selected features. Feature selection (`select()`) must be done before
Example:
# Get access to the selected features for Pearson Correlation method and the target pm2_5_station_3
fs.get_selected_features()['PearsonCorrelation']['pm2_5_station_3']
def get_station_importances(self, target, method):
    """
    Build a per-station summary of the feature importances for a target and a method.
    Feature selection (`select()`) must be done before.

    Every scored feature whose name matches the station-id regex is attributed
    to the station carrying that id (the first regex group, converted to int).

    Args:
        target (str) : target name (must be referenced in `target_columns` when `select()`)
        method (str) : method name (must be referenced in `method_names` when `select()`, or None used)

    Returns:
        a copy of the registered stations dataframe with three extra columns:
        `nb_important_sensors` (number of scored features attributed to the station),
        `max_importance_value` (largest attributed importance, starting from 0.0 so it is never below 0)
        and `sensors` (one "<feature> : <value>" entry per attributed feature, value with two decimals)
    """
    stations_importance = self._stations_dataframe.copy()
    stations_importance["nb_important_sensors"] = 0
    # Float from the start: the column receives float importances, and writing
    # them into an integer column relies on an implicit upcast that recent
    # pandas versions deprecate.
    stations_importance["max_importance_value"] = 0.0
    stations_importance["sensors"] = ""

    score = self.get_feature_importances()[method]

    # Loop invariants: the compiled pattern and the station id column.
    station_id_pattern = re.compile(self._stations_get_id_from_sensor_regex)
    station_ids = stations_importance[self._stations_id_column]

    for index in score.index:
        found = station_id_pattern.search(index)
        if not found:
            # Feature name carries no station id: nothing to attribute.
            continue

        importance_value = score[target][index]
        is_station = station_ids == int(found.group(1))

        stations_importance.loc[is_station, "nb_important_sensors"] += 1
        stations_importance.loc[
            is_station, "sensors"
        ] += f"{index} : {importance_value:.2f}\n</br>"
        # Only raise the stored maximum, never lower it.
        stations_importance.loc[
            is_station
            & (stations_importance["max_importance_value"] < importance_value),
            "max_importance_value",
        ] = importance_value

    return stations_importance
def get_available_methods(self):
    """
    Return the names of all registered Feature Selection Methods,
    in registration order.
    """
    registered = self._feature_selection_method_objects
    return [entry.get_method_name() for entry in registered]
Get the name of all registered Feature Selection Methods