Source code for sdgx.data_processors.transformers.numeric

from __future__ import annotations

from typing import Any, Dict, Set

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from sdgx.data_loader import DataLoader
from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.extension import hookimpl
from sdgx.data_processors.transformers.base import Transformer
from sdgx.utils import logger


class NumericValueTransformer(Transformer):
    """
    A transformer class for numeric data.

    This class is used to transform numeric data by scaling it using the
    StandardScaler from sklearn. One scaler is fitted per numeric column, so
    every column can be converted and reverse-converted independently.

    Attributes:
        standard_scale (bool): A flag indicating whether to scale the data using StandardScaler.
        int_columns (Set): A set of column names that are of integer type.
        float_columns (Set): A set of column names that are of float type.
        scalers (Dict): A dictionary of scalers for each numeric column.
    """

    standard_scale: bool = True
    """
    A flag indicating whether to scale the data using StandardScaler.
    If True, the data will be scaled using StandardScaler.
    If False, the data will not be scaled.
    """

    int_columns: Set
    """
    A set of column names that are of integer type.
    These columns will be considered for scaling if `standard_scale` is True.
    """

    float_columns: Set
    """
    A set of column names that are of float type.
    These columns will be considered for scaling if `standard_scale` is True.
    """

    scalers: Dict
    """
    A dictionary of scalers for each numeric column.
    The keys are the column names and the values are the corresponding scalers.
    """

    def __init__(self):
        # Per-instance containers, so state is never shared between transformers.
        self.int_columns = set()
        self.float_columns = set()
        self.scalers = {}

    def _numeric_columns(self) -> list:
        """
        Return every recorded numeric column name, int columns first, then float columns.
        """
        return list(self.int_columns) + list(self.float_columns)

    def fit(
        self,
        metadata: Metadata | None = None,
        tabular_data: DataLoader | pd.DataFrame = None,
        **kwargs: dict[str, Any],
    ):
        """
        Record the numeric columns described by ``metadata`` and fit one scaler per column.

        Despite the ``None`` defaults, ``metadata`` is dereferenced unconditionally, and
        ``tabular_data`` is indexed with ``[[column]]`` as soon as at least one numeric
        column is found, so both must be provided in practice.

        Args:
            metadata: Source of ``int_columns``, ``float_columns``, ``column_list`` and
                ``get_column_data_type``; it decides which columns are treated as numeric.
            tabular_data: Data used to fit the per-column scalers.
            **kwargs: Accepted for interface compatibility; not used here.
        """
        # Get the exact final data type from metadata.
        # Int columns: columns whose final type is "int" or "id" are both handled as ints.
        for each_col in metadata.int_columns:
            if each_col not in metadata.column_list:
                continue
            if metadata.get_column_data_type(each_col) in ("int", "id"):
                self.int_columns.add(each_col)

        # Float columns.
        for each_col in metadata.float_columns:
            if each_col not in metadata.column_list:
                continue
            if metadata.get_column_data_type(each_col) == "float":
                self.float_columns.add(each_col)

        if not self.int_columns and not self.float_columns:
            # Nothing to scale is still a successful fit, so the flag must be set here too.
            self.fitted = True
            logger.info("NumericValueTransformer Fitted (No numeric columns).")
            return

        # Fit each column.
        for each_col in self._numeric_columns():
            self._fit_column(each_col, tabular_data[[each_col]])

        self.fitted = True
        logger.info("NumericValueTransformer Fitted.")

    def _fit_column(self, column_name: str, column_data: pd.DataFrame) -> None:
        """
        Fit a single numeric (int or float) column.

        Only scaling is currently supported, so this is a no-op when
        ``standard_scale`` is False.

        Args:
            column_name: Name of the column, used as the key into ``scalers``.
            column_data: Single-column frame holding that column's values.
        """
        if self.standard_scale:
            self._fit_column_scale(column_name, column_data)

    def _fit_column_scale(self, column_name: str, column_data: pd.DataFrame) -> None:
        """
        Fit a sklearn StandardScaler on one numeric column and store it in ``scalers``.

        Args:
            column_name: Name of the column, used as the key into ``scalers``.
            column_data: Single-column frame holding that column's values.
        """
        scaler = StandardScaler()
        scaler.fit(column_data)
        self.scalers[column_name] = scaler

    def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame:
        """
        Convert the recorded numeric columns of ``raw_data`` (scaling them when enabled).

        Args:
            raw_data: Data containing every column recorded during ``fit``.

        Returns:
            A copy of ``raw_data`` with the numeric columns converted. When no numeric
            column was recorded, ``raw_data`` itself is returned unchanged.
        """
        logger.info("Converting data using NumericValueTransformer...")

        if not self.int_columns and not self.float_columns:
            logger.info("Converting data using NumericValueTransformer... Finished (No column).")
            # Hand the data back untouched; returning None would break the caller.
            return raw_data

        processed_data = raw_data.copy()

        for each_col in self._numeric_columns():
            # Convert every column, then replace the column in the copy.
            processed_data[each_col] = self._covert_column(each_col, processed_data[[each_col]])

        logger.info("Converting data using NumericValueTransformer... Finished.")

        return processed_data

    def _covert_column(self, column_name: str, column_data: pd.DataFrame):
        """
        Convert a single numeric (int or float) column.

        Args:
            column_name: Name of the column, used to look up its scaler.
            column_data: Single-column frame holding that column's values.

        Returns:
            The scaled values when ``standard_scale`` is True, otherwise the original
            values as an array, so the column is never replaced by ``None``.
        """
        if self.standard_scale:
            return self._covert_column_scale(column_name=column_name, column_data=column_data)
        # No conversion configured: pass the values through in the same array form
        # that the scaling path produces.
        return column_data.to_numpy()

    def _covert_column_scale(self, column_name: str, column_data: pd.DataFrame):
        """
        Scale one numeric column with the StandardScaler fitted for it.

        Args:
            column_name: Name of the column, used to look up its scaler.
            column_data: Single-column frame holding that column's values.

        Returns:
            The result of the scaler's ``transform`` call.
        """
        return self.scalers[column_name].transform(column_data)

    def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame:
        """
        Reverse the conversion of the recorded numeric columns (undo the scaling).

        Args:
            processed_data: Data containing every column recorded during ``fit``.

        Returns:
            A copy of ``processed_data`` with the numeric columns reverse-converted. When
            no numeric column was recorded, ``processed_data`` itself is returned unchanged.
        """
        if not self.int_columns and not self.float_columns:
            logger.info("Data reverse-converted by NumericValueTransformer (No Action).")
            return processed_data

        # Work on a copy, mirroring ``convert``, so the caller's frame is not mutated.
        restored_data = processed_data.copy()

        for each_col in self._numeric_columns():
            # Reverse convert every column, then replace the column in the copy.
            restored_data[each_col] = self._reverse_convert_column(
                each_col, restored_data[[each_col]]
            )

        logger.info("Data reverse-converted by NumericValueTransformer.")

        return restored_data

    def _reverse_convert_column(self, column_name: str, column_data: pd.DataFrame):
        """
        Reverse the conversion of a single numeric column.

        Args:
            column_name: Name of the column, used to look up its scaler.
            column_data: Single-column frame holding that column's values.

        Returns:
            The inverse-scaled values when ``standard_scale`` is True, otherwise the
            original values as an array, so the column is never replaced by ``None``.
        """
        if self.standard_scale:
            return self._reverse_convert_column_scale(
                column_name=column_name, column_data=column_data
            )
        # Nothing was applied in ``convert``, so there is nothing to undo.
        return column_data.to_numpy()

    def _reverse_convert_column_scale(self, column_name: str, column_data: pd.DataFrame):
        """
        Undo the scaling of one column with the StandardScaler fitted for it.

        Args:
            column_name: Name of the column, used to look up its scaler.
            column_data: Single-column frame holding that column's values.

        Returns:
            The result of the scaler's ``inverse_transform`` call.
        """
        return self.scalers[column_name].inverse_transform(column_data)
@hookimpl
def register(manager):
    """
    Register this module's transformer with the given manager.

    Args:
        manager: Object exposing ``register(name, cls)``; it is called once with
            the name ``"NumericValueTransformer"`` and the transformer class.
    """
    transformer_name = "NumericValueTransformer"
    manager.register(transformer_name, NumericValueTransformer)