Source code for sdgx.data_processors.formatters.datetime
from __future__ import annotations
from collections import defaultdict
from datetime import datetime
from typing import Any, Dict
import numpy as np
import pandas as pd
from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.extension import hookimpl
from sdgx.data_processors.formatters.base import Formatter
from sdgx.utils import logger
[docs]
class DatetimeFormatter(Formatter):
"""
A class for formatting datetime columns in a pandas DataFrame.
DatetimeFormatter is designed to handle the conversion of datetime columns to timestamp format and vice versa.
It uses metadata to identify datetime columns and their corresponding datetime formats.
Attributes:
datetime_columns (list): List of column names that are of datetime type.
datetime_formats (dict): Dictionary with column names as keys and datetime formats as values.
dead_columns (list): List of column names that are no longer needed or to be removed.
fitted (bool): Indicates whether the formatter has been fitted.
Methods:
fit(metadata: Metadata | None = None, **kwargs: dict[str, Any]): Fits the formatter by recording the datetime columns and their formats.
convert(raw_data: pd.DataFrame) -> pd.DataFrame: Converts datetime columns in raw_data to timestamp format.
reverse_convert(processed_data: pd.DataFrame) -> pd.DataFrame: Converts timestamp columns in processed_data back to datetime format.
"""
datetime_columns: list
"""
List to store the columns that are of datetime type.
"""
datetime_formats: Dict
"""
Dictionary to store the datetime formats for each column, with default value as an empty string.
"""
dead_columns: list
"""
List to store columns that are no longer needed or to be removed.
"""
def __init__(self):
self.fitted = False
self.datetime_columns = []
self.datetime_formats = defaultdict(str)
self.dead_columns = []
[docs]
def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
"""
Fit method for datetime formatter, the datetime column and datetime format need to be recorded.
If there is a column without format, the default format will be used for output (this may cause some problems).
Formatter need to use metadata to record which columns belong to datetime type, and convert timestamp back to datetime type during post-processing.
"""
# get from metadata
self.datetime_formats = metadata.get("datetime_format")
datetime_columns = []
dead_columns = []
# Check datetime_formats and columns
# exclude columns without format as there is huge risk of handling errors
meta_datetime_columns = metadata.get("datetime_columns")
for each_col in meta_datetime_columns:
if each_col in self.datetime_formats.keys():
datetime_columns.append(each_col)
else:
dead_columns.append(each_col)
logger.warning(
f"Column {each_col} has no datetime_format, DatetimeFormatter will REMOVE this column!"
)
# Remove successful formatted datetime columns from metadata.discrete_columns
if not (set(datetime_columns) - set(metadata.discrete_columns)):
metadata.change_column_type(datetime_columns, "discrete", "datetime")
# Remove dead_columns from metadata
metadata.remove_column(dead_columns)
self.datetime_columns = datetime_columns
self.dead_columns = dead_columns
logger.info("DatetimeFormatter Fitted.")
self.fitted = True
return
[docs]
def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame:
"""
Convert method to convert datetime samples into timestamp.
Args:
- raw_data (pd.DataFrame): Unprocessed table data
"""
if len(self.datetime_columns) == 0:
logger.info(
"Converting data using DatetimeFormatter... Finished (No datetime columns)."
)
return raw_data
# remove the column without format
for each_col in self.dead_columns:
raw_data = self.remove_columns(raw_data, [each_col])
logger.warning(f"Column {each_col} was removed because lack of format info.")
logger.info("Converting data using DatetimeFormatter...")
res_data = self.convert_datetime_columns(
self.datetime_columns, self.datetime_formats, raw_data
)
logger.info("Converting data using DatetimeFormatter... Finished.")
return res_data
[docs]
@staticmethod
def convert_datetime_columns(datetime_column_list, datetime_formats, processed_data):
"""
Convert datetime columns in processed_data from string to timestamp (int)
Args:
- datetime_column_list (list): List of columns that are date time type
- processed_data (pd.DataFrame): Processed table data
Returns:
- result_data (pd.DataFrame): Processed table data with datetime columns converted to timestamp
"""
def datetime_formatter(each_value, datetime_format):
"""
convert each single column datetime string to timestamp int value.
"""
try:
datetime_obj = datetime.strptime(str(each_value), datetime_format)
each_stamp = datetime.timestamp(datetime_obj)
except Exception as e:
logger.warning(
f"An error occured when convert str to timestamp {e}, we set as mean."
)
logger.warning(f"Input parameters: ({str(each_value)}, {datetime_format})")
logger.warning(f"Input type: ({type(each_value)}, {type(datetime_format)})")
each_stamp = np.nan
return each_stamp
# Make a copy of processed_data to avoid modifying the original data
result_data: pd.DataFrame = processed_data.copy()
# Convert each datetime column in datetime_column_list to timestamp
for column in datetime_column_list:
# Convert datetime to timestamp (int)
result_data[column] = result_data[column].apply(
datetime_formatter, datetime_format=datetime_formats[column]
)
result_data[column].fillna(result_data[column].mean(), inplace=True)
return result_data
[docs]
def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame:
"""
reverse_convert method for datetime formatter.
Does not require any action.
"""
if len(self.datetime_columns) == 0:
logger.info("Data reverse-converted by DatetimeFormatter (No datetime columns).")
return processed_data
logger.info("Data reverse-converting by DatetimeFormatter...")
logger.info(f"parameters : {self.datetime_columns}, {self.datetime_formats}")
result_data = self.convert_timestamp_to_datetime(
self.datetime_columns, self.datetime_formats, processed_data
)
logger.info("Data reverse-converted by DatetimeFormatter... Finished.")
return result_data
[docs]
@staticmethod
def convert_timestamp_to_datetime(timestamp_column_list, format_dict, processed_data):
"""
Convert timestamp columns to datetime format in a DataFrame.
Parameters:
- timestamp_column_list (list): List of column names in the DataFrame which are of timestamp type.
- datetime_column_dict (dict): Dictionary with column names as keys and datetime format as values.
- processed_data (pd.DataFrame): DataFrame containing the processed data.
Returns:
- result_data (pd.DataFrame): DataFrame with timestamp columns converted to datetime format.
TODO:
if the value <0, the result will be `No Datetime`, try to fix it.
"""
def column_timestamp_formatter(each_stamp: int, timestamp_format: str) -> str:
try:
each_str = datetime.fromtimestamp(each_stamp).strftime(timestamp_format)
except Exception as e:
logger.debug(f"An error occured when convert timestamp to str {e}.")
each_str = "No Datetime"
return each_str
# Copy the processed data to result_data
result_data = processed_data.copy()
# Iterate over each column in the timestamp_column_list
for column in timestamp_column_list:
# Check if the column is in the DataFrame
if column in result_data.columns:
# Convert the timestamp to datetime format using the format provided in datetime_column_dict
result_data[column] = result_data[column].apply(
column_timestamp_formatter, timestamp_format=format_dict[column]
)
else:
logger.error(f"Column {column} not in processed data's column list!")
return result_data
@hookimpl
def register(manager):
manager.register("DatetimeFormatter", DatetimeFormatter)