Source code for sdgx.data_models.inspectors.numeric

from __future__ import annotations

from typing import Any

import pandas as pd

from sdgx.data_models.inspectors.base import Inspector
from sdgx.data_models.inspectors.extension import hookimpl


class NumericInspector(Inspector):
    """
    Inspector that classifies the numeric columns of a DataFrame.

    For every column whose dtype is an integer or float dtype, the inspector
    decides whether the column is integer-valued or float-valued and then
    whether it is predominantly positive or predominantly negative. The sign
    information (introduced in August 2024) is reported alongside the type
    so that later processing steps can use it to improve the quality of the
    synthetic data.
    """

    int_columns: set = set()
    """
    A set of column names that contain integer values.
    """

    float_columns: set = set()
    """
    A set of column names that contain float values.
    """

    positive_columns: set = set()
    """
    A set of column names that contain only positive numeric values.
    """

    negative_columns: set = set()
    """
    A set of column names that contain only negative numeric values.
    """

    pos_threshold: float = 0.95
    """
    The threshold proportion of positive values in a column to consider it as a positive column.
    """

    negative_threshold: float = 0.95
    """
    The threshold proportion of negative values in a column to consider it as a negative column.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # A column counts as integer when MORE than this share of its
        # numeric values are whole numbers.
        self._int_rate = 0.9
        # Number of rows seen by the most recent call to ``fit``.
        self.df_length = 0

    def _is_int_column(self, col_series: pd.Series) -> bool:
        """
        Determine if a column contains predominantly integer values.

        The column is coerced to numeric (unparseable entries become NaN and
        are dropped) and the share of whole-number values is compared with
        ``self._int_rate``.

        Infinite values are counted as non-integers. They stay in the
        denominator but are excluded from the integer cast, because casting
        ``inf`` to ``int`` makes pandas raise.

        Args:
            col_series (pd.Series): The column series to be inspected.

        Returns:
            bool: True if the share of integer values is strictly greater
            than ``self._int_rate``; False otherwise, including when the
            column holds no numeric values at all.
        """
        numeric_values = pd.to_numeric(col_series, errors="coerce").dropna()

        # Nothing numeric left: bail out early to avoid dividing by zero.
        if len(numeric_values) == 0:
            return False

        # ``astype(int)`` raises on +/-inf, so only finite values take part
        # in the whole-number comparison.
        finite_values = numeric_values[numeric_values.abs() != float("inf")]
        int_cnt = (finite_values == finite_values.astype(int)).sum()

        # Non-finite values remain in the denominator and therefore lower
        # the integer rate instead of crashing the inspection.
        int_rate = int_cnt / len(numeric_values)

        return bool(int_rate > self._int_rate)

    def _is_positive_or_negative_column(
        self, col_series: pd.Series, threshold: float, comparison_func
    ) -> bool:
        """
        Determine if a column predominantly satisfies a sign predicate.

        Args:
            col_series (pd.Series): The column series to be inspected.
            threshold (float): Minimum proportion of values that must
                satisfy ``comparison_func``.
            comparison_func (function): Callable applied to the whole numeric
                Series; it must return a boolean Series (for example
                ``lambda x: x > 0``).

        Returns:
            bool: True if the proportion of matching values is greater than
            or equal to ``threshold``; False otherwise, including when the
            column holds no numeric values at all.
        """
        # Coerce to numeric; entries that cannot be parsed become NaN and are dropped.
        numeric_values = pd.to_numeric(col_series, errors="coerce").dropna()

        # Nothing numeric left: bail out early to avoid dividing by zero.
        if len(numeric_values) == 0:
            return False

        count = comparison_func(numeric_values).sum()
        proportion = count / len(numeric_values)

        return bool(proportion >= threshold)

    def _is_positive_column(self, col_series: pd.Series) -> bool:
        """
        Determine if a column contains predominantly positive values.

        Args:
            col_series (pd.Series): The column series to be inspected.

        Returns:
            bool: True if at least ``self.pos_threshold`` of the numeric
            values are strictly greater than zero, False otherwise.
        """
        return self._is_positive_or_negative_column(col_series, self.pos_threshold, lambda x: x > 0)

    def _is_negative_column(self, col_series: pd.Series) -> bool:
        """
        Determine if a column contains predominantly negative values.

        Args:
            col_series (pd.Series): The column series to be inspected.

        Returns:
            bool: True if at least ``self.negative_threshold`` of the numeric
            values are strictly less than zero, False otherwise.
        """
        return self._is_positive_or_negative_column(
            col_series, self.negative_threshold, lambda x: x < 0
        )

    def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
        """Fit the inspector.

        Classifies every integer- or float-dtype column of ``raw_data`` as an
        int or float column and, independently, as a positive or negative
        column when one sign dominates. Columns of any other dtype are
        ignored.

        Args:
            raw_data (pd.DataFrame): Raw data
        """
        # Start from fresh per-instance sets so results of an earlier fit
        # (or the shared class-level defaults) are never mutated.
        self.int_columns = set()
        self.float_columns = set()
        self.positive_columns = set()
        self.negative_columns = set()

        # Store the length of the DataFrame
        self.df_length = len(raw_data)

        # Iterate over all columns and determine the final data type
        for col in raw_data.columns:
            column = raw_data[col]
            dtype = column.dtype
            # The dtype may be any 32/64-bit integer or float variant.
            if not (
                pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype)
            ):
                continue

            # float or int
            if self._is_int_column(column):
                self.int_columns.add(col)
            else:
                self.float_columns.add(col)

            # positive? negative? (a column may be neither)
            if self._is_positive_column(column):
                self.positive_columns.add(col)
            elif self._is_negative_column(column):
                self.negative_columns.add(col)

        # Mark the inspector as ready
        self.ready = True

    def inspect(self, *args, **kwargs) -> dict[str, Any]:
        """Inspect raw data and generate metadata.

        Returns:
            dict[str, Any]: ``int_columns`` and ``float_columns`` as lists,
            plus ``numeric_format`` holding the sorted ``positive`` and
            ``negative`` column names.
        """
        # Positive and negative columns should not be strictly considered as label columns
        # We use the format dict to inspect and output to metadata
        numeric_format: dict = {
            "positive": sorted(self.positive_columns),
            "negative": sorted(self.negative_columns),
        }

        return {
            "int_columns": list(self.int_columns),
            "float_columns": list(self.float_columns),
            "numeric_format": numeric_format,
        }
@hookimpl
def register(manager):
    """Plugin hook: register ``NumericInspector`` with ``manager`` under its class name."""
    inspector_name = "NumericInspector"
    manager.register(inspector_name, NumericInspector)