| |
|
| |
|
| |
|
| | from datetime import date
|
| | import polars as pl
|
| | from pandas import (
|
| | DataFrame as pd_DataFrame,
|
| | read_csv as pd_read_csv,
|
| | to_datetime,
|
| | )
|
| |
|
| |
|
def read_csv_data(
    start_date: date | str,
    retrieve_columns: list | tuple = (
        "publication_date",
        "document_number",
        "significant",
        "econ_significant",
        "3(f)(1) significant",
        "Major"
    ),
    url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv"
):
    """Read the significance-tracking CSV and return it as a polars DataFrame.

    Args:
        start_date: Start of the period of interest, as a ``date`` or an
            ISO-format string (``YYYY-MM-DD``). It only decides whether the
            "econ_significant" column is requested.
        retrieve_columns: Column names to read from the CSV. Must include
            "publication_date", which is used to compute the latest date in
            the file and is then dropped from the returned data.
        url: Location of the CSV (anything ``pandas.read_csv`` accepts).

    Returns:
        A 3-tuple ``(df, cols, max_date)``:

        * ``df`` -- polars DataFrame holding ``cols``, or ``None`` when the
          converted frame does not have exactly ``len(cols)`` columns.
        * ``cols`` -- the column names kept (without "publication_date"),
          after renaming where it was applied.
        * ``max_date`` -- the most recent publication date found in the CSV.
    """
    if isinstance(start_date, str):
        start_date = date.fromisoformat(start_date)

    # "econ_significant" is not requested for start dates on/after 2023-04-06.
    if start_date >= date(2023, 4, 6):
        cols = [col for col in retrieve_columns if col != "econ_significant"]
    else:
        cols = list(retrieve_columns)

    # Fall back to latin encoding when the default decoding fails.
    try:
        df_pd = pd_read_csv(url, usecols=cols)
    except UnicodeDecodeError:
        df_pd = pd_read_csv(url, usecols=cols, encoding="latin")

    publication_dt = to_datetime(
        df_pd["publication_date"], format="mixed", dayfirst=False, yearfirst=False
    )
    # Series.max() skips NaT; the builtin max() over a list does not, because
    # every comparison against NaT is False and the result becomes
    # order-dependent when a publication date is missing.
    max_date = publication_dt.max().date()

    # The publication date was only needed for max_date.
    cols.remove("publication_date")
    df = pl.from_pandas(df_pd.loc[:, cols])

    if df.shape[1] != len(cols):
        return None, cols, max_date

    # Normalize the awkward source column names, but only when both are present.
    rename_cols = {"3(f)(1) significant": "3f1_significant", "Major": "major"}
    if all(name in cols for name in rename_cols):
        df = df.rename(rename_cols)
        cols = [rename_cols.get(col, col) for col in cols]

    return df, cols, max_date
|
| |
|
| |
|
def clean_data(df: pl.DataFrame,
               document_numbers: list,
               clean_columns: list | tuple,
               return_optimized_plan = False
               ):
    """Restrict the tracking data to the requested document numbers.

    Whitespace is stripped from "document_number" before rows are matched
    against ``document_numbers``. The work is expressed as a lazy query.

    Args:
        df: Tracking data containing a "document_number" string column.
        document_numbers: Document numbers to keep.
        clean_columns: Accepted for interface compatibility; not used by the
            current implementation.
        return_optimized_plan: When truthy, return the optimized query plan
            instead of executing the query.

    Returns:
        The collected polars DataFrame, or the plan produced by
        ``explain(optimized=True)`` when ``return_optimized_plan`` is truthy.
    """
    doc_number = pl.col("document_number")

    # Strip first so padded values in the source still match the input list.
    stripped = df.lazy().with_columns(doc_number.str.strip_chars())
    query = stripped.filter(doc_number.is_in(document_numbers))

    if return_optimized_plan:
        return query.explain(optimized=True)
    return query.collect()
|
| |
|
| |
|
def merge_with_api_results(pd_df: pd_DataFrame,
                           pl_df: pl.DataFrame
                           ):
    """Left-join the tracking data onto the API results by document number.

    Args:
        pd_df: pandas DataFrame of API results; must contain "document_number".
        pl_df: polars DataFrame to attach; must contain "document_number".

    Returns:
        A pandas DataFrame with every row of ``pd_df`` plus the matching
        columns from ``pl_df``. The join is validated as one-to-one on
        "document_number".
    """
    return (
        pl.from_pandas(pd_df)
        .join(pl_df, on="document_number", how="left", validate="1:1")
        .to_pandas()
    )
|
| |
|
| |
|
def get_significant_info(input_df, start_date, document_numbers):
    """Attach significance-tracking columns to a DataFrame of documents.

    Args:
        input_df: pandas DataFrame of retrieved documents; must contain a
            "document_number" column.
        start_date: ``date`` or ISO-format string passed to ``read_csv_data``.
        document_numbers: Document numbers used to filter the tracking data.

    Returns:
        A 2-tuple ``(df, max_date)``. ``df`` is ``input_df`` merged with the
        tracking data, or ``input_df`` unchanged when the tracking data could
        not be loaded in the expected shape. ``max_date`` is the latest
        publication date found in the tracking data.
    """
    pl_df, clean_cols, max_date = read_csv_data(start_date)
    if pl_df is None:
        print("Failed to integrate significance tracking data with retrieved documents.")
        # Keep the return shape identical to the success path so callers can
        # always unpack two values.
        return input_df, max_date

    pl_df = clean_data(pl_df, document_numbers, clean_cols)
    pd_df = merge_with_api_results(input_df, pl_df)
    return pd_df, max_date
|
| |
|
| |
|
if __name__ == "__main__":

    # Manual smoke test: one start date on each side of the 2023-04-06 cutoff
    # that decides whether "econ_significant" is requested.
    date_a = "2023-04-05"
    date_b = "2023-04-06"
    numbers = [
        "2021-01303",
        "2023-28006",
        "2024-00149",
        "2024-00089",
        "2023-28828",
        "2024-00300",
        "2024-00045",
        "2024-00192",
        "2024-00228",
        "2024-00187",
    ]

    # read_csv_data returns (df, cols, max_date); the date is not needed here.
    df_a, clean_cols, _ = read_csv_data(date_a)
    df_a = clean_data(df_a, numbers, clean_cols)

    df_b, clean_cols, _ = read_csv_data(date_b)
    df_b = clean_data(df_b, numbers, clean_cols)
|
| |
|
| |
|
| |
|
| |
|