| | |
| | |
| |
|
| | from datetime import date |
| | import polars as pl |
| | from pandas import ( |
| | DataFrame as pd_DataFrame, |
| | read_csv as pd_read_csv, |
| | to_datetime, |
| | ) |
| |
|
| |
|
def read_csv_data(
    start_date: date | str,
    retrieve_columns: list | tuple = (
        "publication_date",
        "document_number",
        "significant",
        "econ_significant",
        "3(f)(1) significant",
        "Major"
    ),
    url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv"
) -> tuple[pl.DataFrame | None, list, date]:
    """Read the significance tracking CSV and return it as a polars DataFrame.

    The CSV is loaded with pandas (retrying with latin encoding on a
    ``UnicodeDecodeError``), de-duplicated on ``document_number``, and then
    converted to polars without the ``publication_date`` column.

    Args:
        start_date (date | str): Start date of read data; strings are parsed as ISO dates.
            On or after 2023-04-06 the "econ_significant" column is not retrieved.
        retrieve_columns (list | tuple, optional): Columns to read from the CSV.
            Must include "publication_date" and "document_number"; the de-duplication
            step also sorts on "significant", "3(f)(1) significant" and "Major".
        url (str, optional): URL where data are located.

    Returns:
        tuple: (data or None, column names of the returned data, max publication date in dataset).
        The data element is None when the converted frame does not have the expected
        number of columns. When both "3(f)(1) significant" and "Major" are present they
        are renamed to "3f1_significant" and "major" in the data and the column list.
    """
    if isinstance(start_date, str):
        start_date = date.fromisoformat(start_date)

    # "econ_significant" is only requested for start dates before the cutoff.
    if start_date >= date(2023, 4, 6):
        cols = [col for col in retrieve_columns if col != "econ_significant"]
    else:
        cols = list(retrieve_columns)

    # Fall back to latin encoding when the default decoding fails.
    try:
        df_pd = pd_read_csv(url, usecols=cols)
    except UnicodeDecodeError:
        df_pd = pd_read_csv(url, usecols=cols, encoding="latin")

    # When a document number appears more than once, keep the row that sorts last.
    if df_pd.duplicated(subset=["document_number"], keep=False).any():
        sort_keys = [
            "document_number",
            "publication_date",
            "significant",
            "3(f)(1) significant",
            "Major",
        ]
        df_pd = df_pd.sort_values(sort_keys).drop_duplicates(
            subset=["document_number"], keep="last", ignore_index=True
        )

    publication_dt = to_datetime(
        df_pd["publication_date"], format="mixed", dayfirst=False, yearfirst=False
    )
    # Series.max() skips missing dates (NaT); the builtin max() over a list is
    # order-dependent when NaT is present and can return NaT.
    max_date = publication_dt.max().date()

    # The publication date is only needed for max_date; drop it from the output.
    cols.remove("publication_date")
    df = pl.from_pandas(df_pd.loc[:, cols])

    if df.shape[1] != len(cols):
        return None, cols, max_date

    # Rename only when every column to be renamed was actually retrieved.
    rename_cols = {"3(f)(1) significant": "3f1_significant", "Major": "major"}
    if all(name in cols for name in rename_cols):
        df = df.rename(rename_cols)
        cols = [rename_cols.get(col, col) for col in cols]

    return df, cols, max_date
| |
|
| |
|
def clean_data(
    df: pl.DataFrame,
    document_numbers: list,
    *,
    return_optimized_plan: bool = False
):
    """Strip and filter the document numbers of a polars dataframe.

    Args:
        df (pl.DataFrame): Input polars dataframe with a "document_number" string column.
        document_numbers (list): Document numbers whose rows are kept.
        return_optimized_plan (bool, optional): If True, return the optimized query plan
            instead of executing the query. Defaults to False.

    Returns:
        DataFrame | str: Cleaned data (or string representation of the query plan)
    """
    # Strip surrounding characters first so the membership test sees clean values.
    stripped_numbers = pl.col("document_number").str.strip_chars()
    keep_row = pl.col("document_number").is_in(document_numbers)

    query = df.lazy().with_columns(stripped_numbers)
    query = query.filter(keep_row)

    return query.explain(optimized=True) if return_optimized_plan else query.collect()
| |
|
| |
|
def merge_with_api_results(
    pd_df: pd_DataFrame,
    pl_df: pl.DataFrame
):
    """Left-join significance data onto the FR API data by document number.

    Args:
        pd_df (pd_DataFrame): Main dataset of FR rules (pandas); every row is kept.
        pl_df (pl.DataFrame): Significance data (polars).

    Returns:
        DataFrame: Merged data as a pandas DataFrame.
    """
    # validate="1:1" makes polars check that "document_number" is unique on both sides.
    merged = pl.from_pandas(pd_df).join(
        pl_df,
        on="document_number",
        how="left",
        validate="1:1",
        coalesce=True,
    )
    return merged.to_pandas()
| |
|
| |
|
def get_significant_info(input_df: pd_DataFrame, start_date: str, document_numbers: list):
    """Retrieve significance information for input data.

    Args:
        input_df (pd.DataFrame): Input data.
        start_date (str): Start date of data.
        document_numbers (list): Documents to keep.

    Returns:
        tuple[DataFrame, datetime.date]: Data with significance information, max date in dataset.
        If the significance data could not be read, the input data is returned unchanged
        alongside the max date.
    """
    pl_df, _, max_date = read_csv_data(start_date)
    if pl_df is None:
        print("Failed to integrate significance tracking data with retrieved documents.")
        # Return the same (data, max_date) shape as the success path so callers
        # that unpack the result keep working.
        return input_df, max_date
    pl_df = clean_data(pl_df, document_numbers)
    pd_df = merge_with_api_results(input_df, pl_df)
    return pd_df, max_date
| |
|
| |
|
if __name__ == "__main__":

    # Start dates on either side of the 2023-04-06 cutoff used by read_csv_data.
    date_a = "2023-04-05"
    date_b = "2023-04-06"
    numbers = [
        "2021-01303",
        '2023-28006',
        '2024-00149',
        '2024-00089',
        '2023-28828',
        '2024-00300',
        '2024-00045',
        '2024-00192',
        '2024-00228',
        '2024-00187'
    ]

    # read_csv_data returns (data, columns, max_date); clean_data takes only the
    # dataframe and the document numbers as positional arguments.
    df_a, clean_cols, max_date_a = read_csv_data(date_a)
    df_a = clean_data(df_a, numbers)

    df_b, clean_cols, max_date_b = read_csv_data(date_b)
    df_b = clean_data(df_b, numbers)
| | |
| | |
| | |
| |
|