File size: 8,729 Bytes
bb93e21 393578a b8a8b48 bb93e21 393578a bb93e21 b8a8b48 f520ef1 bb93e21 393578a bb93e21 d04838b fdf2d60 d04838b fdf2d60 bb93e21 393578a bb93e21 a0ce115 bb93e21 a0ce115 bb93e21 82bf3a2 bb93e21 a0ce115 bb93e21 a0ce115 bb93e21 a0ce115 bb93e21 a0ce115 58bb4c7 bb93e21 a0ce115 e36a0a8 58bb4c7 e36a0a8 bb93e21 e36a0a8 58bb4c7 393578a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 | from datetime import date
from fr_toolbelt.api_requests import get_documents_by_date
from fr_toolbelt.preprocessing import process_documents, AgencyMetadata
from numpy import array
from pandas import DataFrame, to_datetime
try:
from search_columns import search_columns, SearchError
from significant import get_significant_info
from utils import get_agency_metadata_values
except (ModuleNotFoundError, ImportError):
from .search_columns import search_columns, SearchError
from .significant import get_significant_info
from .utils import get_agency_metadata_values
# Agency metadata is retrieved once at import; the second return value is unused here.
METADATA, _ = AgencyMetadata().get_agency_metadata()

# First publication date of documents retrieved by this module.
START_DATE = "2024-01-01"
WINDOW_OPEN_DATE = "2024-08-16"

# Significant-rule data are only requested when the start date is on or after April 6, 2023.
GET_SIGNIFICANT = date.fromisoformat(START_DATE) >= date(2023, 4, 6)
class DataAvailabilityError(Exception):
    """Signal that no data are available for the inputs that were requested."""
def get_date_range(start_date: str, end_mmdd: str = "01-20"):
    """Build the date range of documents returned by the app.

    The range ends in the calendar year after the start date's year, on the
    month and day given by `end_mmdd`.

    Args:
        start_date (str): ISO-formatted (YYYY-MM-DD) start date for retrieving documents.
        end_mmdd (str, optional): Month and day of the end date in MM-DD format. Defaults to "01-20".

    Returns:
        dict: Dictionary with keys "start", "end", and "transition_year".
    """
    # the transition year is always the year following the start date's year
    transition_year = date.fromisoformat(start_date).year + 1
    return {
        "start": start_date,
        "end": f"{transition_year}-{end_mmdd}",
        "transition_year": transition_year,
    }
def get_rules(date_range: dict) -> list[dict]:
    """Retrieve documents of type "RULE" published within a date range.

    Args:
        date_range (dict): Mapping whose "start" and "end" values bound the request.

    Returns:
        list[dict]: Documents returned by `get_documents_by_date`; its second return value is discarded.
    """
    start, end = date_range.get("start"), date_range.get("end")
    documents, _ = get_documents_by_date(
        start_date=start,
        end_date=end,
        document_types=("RULE", ),
    )
    return documents
def format_documents(documents: list[dict]):
    """Format Federal Register documents to generate count by presidential year.

    Processes the agency and president fields of each document, loads the
    result into a DataFrame, and derives date columns from `publication_date`.

    Args:
        documents (list[dict]): List of documents; each is expected to carry a
            "publication_date" value that `pandas.to_datetime` can parse.

    Returns:
        DataFrame: Pandas DataFrame with formatted data. Adds the columns
            "publication_dt" (datetime), "publication_year", "publication_month",
            and "publication_day", and replaces "publication_date" with
            `datetime.date` objects.
    """
    # process agency and president info in documents
    documents = process_documents(
        documents,
        which=("agencies", "presidents"),
        return_values_as_str=False
        )

    # create dataframe
    df = DataFrame(documents)

    # parse the publication date once, then derive every date part from it with
    # the vectorized `.dt` accessor instead of a Python-level lambda per row
    publication_dt = to_datetime(df["publication_date"])
    df["publication_dt"] = publication_dt
    df["publication_date"] = publication_dt.dt.date
    df["publication_year"] = publication_dt.dt.year
    df["publication_month"] = publication_dt.dt.month
    df["publication_day"] = publication_dt.dt.day

    # return dataframe
    return df
def filter_new_admin_rules(
        df: DataFrame,
        transition_year: int,
        date_col: str = "publication_date",
    ):
    """Drop rules issued by the incoming administration.

    A row is removed only when it is dated on or after January 20 of the
    transition year AND its "president_id" matches the president mapped to
    that year. Years without a mapping remove nothing.

    Args:
        df (DataFrame): Input data; must contain `date_col` and "president_id".
        transition_year (int): The year of the presidential transition.
        date_col (str, optional): Column containing date information. Defaults to "publication_date".

    Returns:
        DataFrame: Filtered data.
    """
    # president taking office in each transition year
    admin_transitions = {
        2001: "george-w-bush",
        2009: "barack-obama",
        2017: "donald-trump",
        2021: "joe-biden",
        2025: "donald-trump",
        }
    inauguration_day = date(transition_year, 1, 20)
    incoming_president = admin_transitions.get(transition_year)

    on_or_after_inauguration = array(df[date_col] >= inauguration_day)
    attributed_to_incoming = array(df["president_id"] == incoming_president)

    # keep everything except rows that satisfy both conditions
    keep = ~(on_or_after_inauguration & attributed_to_incoming)
    return df.loc[keep]
def filter_corrections(df: DataFrame):
    """Split Federal Register documents into non-corrections and corrections.

    A document is treated as a correction when its `correction_of` field is
    populated, or when a regex search matches its `document_number`, or its
    `title` or `action` fields.

    Args:
        df (DataFrame): Federal Register data.

    Raises:
        SearchError: If the two resulting subsets do not add up to the input length.

    Returns:
        tuple: DataFrame with corrections removed, DataFrame of corrections
    """
    # remember the incoming columns so both outputs are restricted to them
    original_columns = df.columns.tolist()

    # signal 1: the correction field is populated
    has_correction_of = array(df["correction_of"].notna())

    # signal 2: regex matches on document number, or on title/action text
    number_hits = search_columns(
        df,
        [r"^[crxz][\d]{1,2}-(?:[\w]{2,4}-)?[\d]+"],
        ["document_number"],
        return_column="indicator1",
    )
    text_hits = search_columns(
        df,
        [r"(?:;\scorrection\b)|(?:\bcorrecting\samend[\w]+\b)"],
        ["title", "action"],
        return_column="indicator2",
    )
    matched_search = array(number_hits["indicator1"] == 1) | array(text_hits["indicator2"] == 1)

    # a document is a correction if either signal fires; everything else is kept
    is_correction = has_correction_of | matched_search
    df_corrections = df.loc[is_correction, original_columns]
    df_no_corrections = df.loc[~is_correction, original_columns]

    # sanity check: the two subsets must partition the input
    if len(df) != len(df_no_corrections) + len(df_corrections):
        raise SearchError(f"{len(df)} != {len(df_no_corrections)} + {len(df_corrections)}")
    return df_no_corrections, df_corrections
def get_significant_rules(df: DataFrame, start_date: str) -> tuple[DataFrame, date]:
    """Attach significant-rule indicators to Federal Register data.

    After merging, the "significant" and "3f1_significant" columns are coerced
    to integers (missing values and "." become 0), "3f1_significant" is
    rewritten as a 0/1 flag, and "other_significant" is added to flag rows
    where "significant" is 1 but "3f1_significant" is not.

    Args:
        df (DataFrame): Input data; must contain a "document_number" column.
        start_date (str): Start date of significant rule data.

    Raises:
        DataAvailabilityError: Raised when requesting significant rule counts prior to Executive Order 14094 of April 6, 2023.

    Returns:
        tuple[DataFrame, datetime.date]: Data with significant rules, last updated date for significant data
    """
    # guard: nothing is computed for start dates before April 6, 2023
    if date.fromisoformat(start_date) < date(2023, 4, 6):
        raise DataAvailabilityError("This program does not calculate significant rule counts prior to Executive Order 14094 of April 6, 2023.")

    numbers = df.loc[:, "document_number"].to_list()
    df, last_updated = get_significant_info(df, start_date, numbers)

    # normalize both indicator columns to integers
    for indicator in ("significant", "3f1_significant", ):
        missing = df[indicator].isna()
        df.loc[missing, indicator] = "0"
        df.loc[:, indicator] = df[indicator].replace(".", "0").astype("int64")

    # capture the flags before the 3f1 column is overwritten below
    is_3f1 = df["3f1_significant"] == 1
    is_significant = df["significant"] == 1

    df.loc[:, "3f1_significant"] = 0
    df.loc[is_3f1, "3f1_significant"] = 1

    df.loc[:, "other_significant"] = 0
    df.loc[(is_significant & ~is_3f1), "other_significant"] = 1

    return df, last_updated
def get_rules_in_window(start_date: str, get_significant: bool = True, metadata: dict = METADATA):
    """Retrieve and process the rules published in a given CRA window.

    Steps: build the date range, retrieve the rules, format them, drop
    corrections, drop rules from the incoming administration, add an
    "acronym" column from agency metadata, and optionally merge significant
    rule data.

    Args:
        start_date (str): Start date of window.
        get_significant (bool, optional): Get significant rule data. Defaults to True.
        metadata (dict, optional): Agency metadata. Defaults to METADATA.

    Returns:
        tuple[DataFrame, datetime.date]: Processed data and the last updated date for
            significant data (today's date when significant data are not requested).
    """
    window = get_date_range(start_date)
    documents = get_rules(window)

    # format, then discard the corrections half of the split
    df, _ = filter_corrections(format_documents(documents))
    df = filter_new_admin_rules(df, window.get("transition_year"))

    df.loc[:, "acronym"] = get_agency_metadata_values(
        df,
        "parent_slug",
        metadata=metadata,
        metadata_value="acronym",
    )

    if not get_significant:
        last_updated = date.today()
    else:
        df, last_updated = get_significant_rules(df, start_date)
    return df, last_updated
def get_list_agencies(
        start_date: str,
        agency_column: str = "parent_slug",
        significant: bool = True,
        *,
        df=None,
        **kwargs
    ):
    """Get list of agencies with rules in dataset.

    Args:
        start_date (str): Start date of window. Only used when `df` is not supplied.
        agency_column (str, optional): Column containing agency values. Defaults to "parent_slug".
        significant (bool, optional): Get significant rule data. Only used when `df` is not supplied. Defaults to True.
        df (DataFrame, optional): Already-processed rules for the window. When given, it is
            used as-is and no data are retrieved; when None (the default), the rules are
            retrieved with `get_rules_in_window`.
        **kwargs: Passed through to `get_rules_in_window` when `df` is not supplied.

    Returns:
        list: Sorted list of the distinct, non-missing agency values.
    """
    if df is None:
        df, _ = get_rules_in_window(start_date, get_significant=significant, **kwargs)
    # explode returns a new frame, so a caller-supplied `df` is not modified
    df_ex = df.explode(agency_column, ignore_index=True)
    return sorted(df_ex[agency_column].value_counts().index.to_list())


# create objects to import in app
DF, LAST_UPDATED = get_rules_in_window(START_DATE, get_significant=GET_SIGNIFICANT)
# reuse DF so the same window is not retrieved and processed a second time at import
AGENCIES = get_list_agencies(START_DATE, significant=GET_SIGNIFICANT, df=DF)
if __name__ == "__main__":
    # quick manual check of the objects this module exports
    summary = (DF.columns, LAST_UPDATED, AGENCIES, len(METADATA))
    for item in summary:
        print(item)
|