Source code for sdgx.data_models.inspectors.datetime
from __future__ import annotations
from typing import Any
import pandas as pd
from pandas._libs.tslibs.parsing import DateParseError
from sdgx.data_models.inspectors.base import Inspector
from sdgx.data_models.inspectors.extension import hookimpl
from sdgx.utils import ignore_warnings
[docs]
class DatetimeInspector(Inspector):
    """Inspector that finds datetime columns and records a format string for each.

    After :meth:`fit`, ``datetime_columns`` holds the names of the columns that
    are (or can be converted to) datetimes and ``column_formats`` maps a column
    name to the format string detected for it.
    """

    _inspect_level = 20
    """
    The inspect_level of DatetimeInspector is higher than DiscreteInspector.
    Hard-to-recognize date or datetime columns are often also recognized as discrete types,
    causing the column to be marked repeatedly.
    """

    _format_match_rate = 0.9
    """
    When specifically checking the datetime format, problems caused by missing values and
    incorrect values will inevitably occur.
    To fix this, we discard the .any() method and use the `match_rate` to increase the
    robustness of this inspector.
    """

    # Candidate format strings kept alongside any user-supplied formats.
    PRESET_FORMAT_STRINGS = [
        "%Y-%m-%d",
        "%d %b %Y",
        "%b-%Y",
        "%Y/%m/%d",
    ]

    def __init__(self, user_formats: list[str] | None = None, *args, **kwargs) -> None:
        """Create the inspector.

        Args:
            user_formats: Optional extra datetime format strings supplied by the
                user. ``None`` (or an empty sequence) means no user formats.
            *args: Forwarded to the base ``Inspector``.
            **kwargs: Forwarded to the base ``Inspector``.
        """
        super().__init__(*args, **kwargs)
        self.datetime_columns: set[str] = set()
        # Copy so later mutation of the caller's list cannot change this inspector.
        self.user_defined_formats: list[str] = list(user_formats) if user_formats else []
        self.column_formats: dict[str, str] = {}

    @classmethod
    @ignore_warnings(category=UserWarning)
    def can_convert_to_datetime(cls, input_col: pd.Series) -> bool:
        """Whether a df column can be converted to datetime.

        Args:
            input_col(pd.Series): A column of a dataframe.

        Returns:
            bool: ``True`` if ``pd.to_datetime`` accepts the column, ``False`` if
            it raises.
        """
        try:
            pd.to_datetime(input_col)
        except Exception:
            # Any parsing failure (DateParseError, ValueError, TypeError, ...) means
            # "not a datetime column". Catch ``Exception`` rather than using a bare
            # ``except`` so KeyboardInterrupt / SystemExit still propagate.
            return False
        return True

    def fit(self, raw_data: pd.DataFrame, *args, **kwargs) -> None:
        """Fit the inspector.

        Collects the datetime columns of the raw data and, for each of them,
        the detected datetime format string (when one is found).

        Args:
            raw_data (pd.DataFrame): Raw data
        """
        # Reset both results so a re-fit never reports stale columns or formats.
        self.datetime_columns = set(
            raw_data.infer_objects().select_dtypes(include=["datetime64"]).columns
        )
        self.column_formats = {}

        # Some columns containing dates are still marked as object after
        # inference; try an explicit conversion on those.
        for col_name in raw_data.select_dtypes(include=["object"]).columns:
            if DatetimeInspector.can_convert_to_datetime(raw_data[col_name]):
                self.datetime_columns.add(col_name)

        # Process for detecting format strings.
        # NOTE(review): detect_datetime_format is not defined in the part of the
        # class visible here; it is assumed to be provided elsewhere — confirm.
        for col_name in self.datetime_columns:
            datetime_format = self.detect_datetime_format(raw_data[col_name])
            if datetime_format:
                self.column_formats[col_name] = datetime_format

        self.ready = True

    def inspect(self, *args, **kwargs) -> dict[str, Any]:
        """Inspect raw data and generate metadata.

        Returns:
            dict[str, Any]: ``"datetime_columns"`` (list of column names) and
            ``"datetime_formats"`` (column name -> format string).
        """
        return {
            "datetime_columns": list(self.datetime_columns),
            # Hand out a copy so callers cannot mutate the inspector's state.
            "datetime_formats": dict(self.column_formats),
        }
@hookimpl
def register(manager):
    """Register :class:`DatetimeInspector` with ``manager`` under its class name."""
    inspector_cls = DatetimeInspector
    manager.register("DatetimeInspector", inspector_cls)