Source code for sdgx.data_processors.formatters.int
from __future__ import annotations
from typing import Any, List
import pandas as pd
from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.extension import hookimpl
from sdgx.data_processors.formatters.base import Formatter
from sdgx.utils import logger
[docs]
class IntValueFormatter(Formatter):
    """
    Formatter class for handling Int values in pd.DataFrame.

    ``fit`` records which columns the metadata reports as ``"int"`` or ``"id"``,
    ``convert`` passes data through untouched, and ``reverse_convert`` casts the
    recorded columns back to ``int``.
    """

    int_columns: set
    """
    Set of column names that are of type int, populated by the fit method using metadata.
    """

    def __init__(self):
        self.int_columns = set()

    def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
        """
        Fit method for the formatter.

        The formatter uses metadata to record which columns belong to the int type,
        so that they can be converted back to the int type during post-processing.

        Columns are added to the existing ``int_columns`` set; columns recorded by
        an earlier ``fit`` call are not cleared.

        Args:
            metadata: Metadata describing the table. The signature allows ``None``
                for interface compatibility, but this formatter cannot be fitted
                without it.
            **kwargs: Accepted for interface compatibility; not used here.

        Raises:
            ValueError: If ``metadata`` is ``None``.
        """
        # Fail with a clear message instead of an AttributeError on NoneType.
        if metadata is None:
            raise ValueError("IntValueFormatter.fit requires metadata, but got None.")

        # get from metadata
        for each_col in metadata.int_columns:
            # Ignore int columns that are not part of the table's column list.
            if each_col not in metadata.column_list:
                continue
            # Both "int" and "id" typed columns are restored as int afterwards.
            if metadata.get_column_data_type(each_col) in ("int", "id"):
                self.int_columns.add(each_col)

        logger.info("IntValueFormatter Fitted.")
        self.fitted = True

    def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame:
        """
        No action for convert.

        Args:
            raw_data: Input data.

        Returns:
            The same ``raw_data`` object, unchanged.
        """
        logger.info("Converting data using IntValueFormatter... Finished (No Action).")
        return raw_data

    def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame:
        """
        reverse_convert method for the formatter.

        Do format conversion for int columns: every column recorded in
        ``int_columns`` that exists in ``processed_data`` is cast with
        ``astype(int)``. Recorded columns that are missing are reported through
        ``logger.error`` and skipped.

        Args:
            processed_data: Data to post-process. It is modified in place.

        Returns:
            The same ``processed_data`` object, with the int columns cast.

        Note:
            ``astype(int)`` is applied as-is, so pandas raises if a column holds
            values that cannot be cast to int (for example NaN).
        """
        for col in self.int_columns:
            if col in processed_data.columns:
                processed_data[col] = processed_data[col].astype(int)
            else:
                logger.error(f"Column {col} not found in processed_data.")

        logger.info("Data reverse-converted by IntValueFormatter.")
        return processed_data
@hookimpl
def register(manager):
    """
    Hook implementation that registers IntValueFormatter with ``manager``.

    Args:
        manager: Object exposing ``register(name, cls)``; the formatter class is
            registered under the name ``"IntValueFormatter"``.
    """
    formatter_cls = IntValueFormatter
    manager.register("IntValueFormatter", formatter_cls)