| import pandas as pd |
| import subprocess |
| import datetime |
| from dateutil import parser |
| import re |
| import os |
|
|
| |
| dir_path=os.path.dirname(os.path.realpath(__file__)) |
|
|
| |
| rd_stages={0:('RD Introduced','introdate'), |
| 1:('RD Passed One Chamber','chamber1passdate'), |
| 2:('RD Passed Two Chambers','chamber2passdate'), |
| 3:('RD Became Law','becamelawdate'), |
| 4:('RD Vetoed by President','vetodate'), |
| 9:('RD Failed','faildate'), |
| 100:('RD Passed Senate','senatevotedate'), |
| 200:('RD Passed House','housevotedate'), |
| 901:('RD Failed Senate','senatevotedate'), |
| 902:('RD Failed House','housevotedate')} |
|
|
| |
| def read_cra_data(file_path): |
| |
| |
| df_rd=pd.read_csv(file_path) |
|
|
| |
| df_rd=df_rd[df_rd['congress']==119].reset_index(drop=True) |
|
|
| |
| df_rd=df_rd[(df_rd['ridentifier']!='.') & (df_rd['ridentifier'].notnull())].reset_index(drop=True) |
|
|
| |
| for d in ['introdate', 'housevotedate', 'senatevotedate', 'becamelawdate', 'vetodate', 'voteovervetodate']: |
| df_rd.loc[df_rd[d]=='.',d]=None |
| df_rd[d]=pd.to_datetime(df_rd[d]).dt.strftime('%Y-%m-%d') |
|
|
| |
| df_rd.loc[df_rd['chamberpass_atleast1']==1,'chamber1passdate']=df_rd[['housevotedate','senatevotedate']].\ |
| astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d') |
| df_rd.loc[df_rd['chamberpass_2']==1,'chamber2passdate']=df_rd[['housevotedate','senatevotedate']].\ |
| astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d') |
| df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) | |
| ((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)), |
| 'faildate']=df_rd[['housevotedate','senatevotedate']].astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d') |
|
|
| |
| def determine_stages(row): |
| """Determine RD stages based on chamber and voting conditions.""" |
| if row['intro_chamber'] == 'Senate': |
| if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']): |
| select_stages = [0, 100, 200, 3, 4] |
| elif row['senatevotebin'] != row['senatepassbin']: |
| select_stages = [0, 901] |
| else: |
| select_stages = [0, 100, 902] |
| elif row['intro_chamber'] == 'House': |
| if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']): |
| select_stages = [0, 200, 100, 3, 4] |
| elif row['housevotebin'] != row['housepassbin']: |
| select_stages = [0, 902] |
| else: |
| select_stages = [0, 200, 901] |
| else: |
| select_stages = [] |
|
|
| |
| stages = [ |
| f"{rd_stages[stage][0]} {row[rd_stages[stage][1]]}" |
| for stage in select_stages |
| if pd.notnull(row.get(rd_stages[stage][1])) |
| ] |
|
|
| return '; '.join(stages) |
|
|
| |
| df_rd['all_stages'] = df_rd.apply(determine_stages, axis=1) |
|
|
| |
| df_rd['rd_seq']=df_rd.sort_values(['ridentifier','introdate']).groupby('ridentifier').cumcount()+1 |
| df_wide = df_rd[['ridentifier','rd','rd_seq','all_stages','rd_link']].set_index(['ridentifier','rd_seq']).unstack() |
| df_wide.columns = df_wide.columns.map(lambda x: '{}_{}'.format(x[0], x[1])) |
| df_wide = df_wide.reset_index() |
|
|
| |
| last_rd_date=df_rd[[rd_stages[k][1] for k in rd_stages]].astype('datetime64[ns]').max().max() |
|
|
| |
| df_rd['rd_status_no']=df_rd[['chamberpass_atleast1','chamberpass_2']].sum(axis=1) |
| df_rd.loc[df_rd['becamelawbin']==1,'rd_status_no']=3 |
| df_rd.loc[df_rd['vetobin']==1,'rd_status_no']=4 |
| df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) | |
| ((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)), |
| 'rd_status_no']=9 |
|
|
| |
| df_rd['rd_status'] = [rd_stages[k][0] for k in df_rd['rd_status_no']] |
|
|
| |
| df_rd['rd_status'] = df_rd.apply( |
| lambda row: f"{row['rd_status']} ({row[rd_stages[row['rd_status_no']][1]]})", |
| axis=1) |
|
|
| |
| df_rd=df_rd.sort_values(['ridentifier','rd_status_no','introdate'],ascending=False).groupby('ridentifier').\ |
| agg({'introducedbin':'max','rd_status':'first','rd':lambda x: ';\n'.join(x)}).reset_index() |
|
|
| |
| df_rd=df_rd.merge(df_wide,on='ridentifier',how='left') |
|
|
| |
| df_rd.rename(columns={'ridentifier':'citation','introducedbin':'CRA_Target', |
| 'rd_status':'Latest_CRA_Stage','rd': 'RD_No'},inplace=True) |
|
|
| |
| |
| indices = set() |
| for col in df_rd.columns: |
| match = re.search(r'_(\d+)$', col) |
| if match: |
| indices.add(int(match.group(1))) |
| max_index = max(indices) if indices else 0 |
|
|
| |
| rename_mapping={} |
| for i in range(1, max_index + 1): |
| rename_mapping.update({ |
| f'rd_{i}': f'RD{i}_No', |
| f'all_stages_{i}': f'RD{i}_Stage', |
| f'rd_link_{i}': f'RD{i}_Link', |
| }) |
|
|
| |
| df_rd.rename(columns=rename_mapping, inplace=True) |
|
|
| |
| new_order = [] |
| for i in range(1, max_index + 1): |
| new_order.extend([f'RD{i}_No', f'RD{i}_Stage', f'RD{i}_Link']) |
|
|
| |
| df_rd = df_rd[['citation','CRA_Target', 'Latest_CRA_Stage', 'RD_No']+[col for col in new_order if col in df_rd.columns]] |
|
|
| return df_rd, last_rd_date |
|
|
| |
| def get_rd(df_fr): |
| |
| cra_file_path=f'{dir_path}/../congress_data/cra_data.csv' |
| df_rd,_=read_cra_data(cra_file_path) |
|
|
| |
| df_merged=df_fr.merge(df_rd,on='citation',how='left') |
|
|
| |
| df_merged['CRA_Target']=df_merged['CRA_Target'].fillna(0) |
|
|
| return df_merged |
|
|
| |
| def extract_date_after_keyword(text, keyword='data last updated'): |
| |
| date_patterns = [ |
| r'\d{4}-\d{2}-\d{2}', |
| r'\d{2}/\d{2}/\d{4}', |
| r'\d{1}/\d{2}/\d{4}', |
| r'\d{1}/\d{1}/\d{4}', |
| r'\d{2}-\d{2}-\d{4}', |
| r'\d{4}/\d{2}/\d{2}', |
| r'\d{2} \w{3,9} \d{4}', |
| r'\w{3,9} \d{2}, \d{4}', |
| ] |
|
|
| |
| date_regex = '|'.join(date_patterns) |
|
|
| |
| pattern = rf'{re.escape(keyword)}\s*(?:[:\-]?\s*)({date_regex})' |
| match = re.search(pattern, text, re.IGNORECASE) |
| |
| if match: |
| return parser.parse(match.group(1)).strftime("%Y-%m-%d") |
| return None |
|
|
| |
| def get_cra_updated_date(cra_file_path=f'{dir_path}/../congress_data/cra_data.csv'): |
| with open(f'{dir_path}/../congress_data/README.md', 'r', encoding='utf-8') as file: |
| text = file.read() |
| cra_last_updated = extract_date_after_keyword(text) |
|
|
| if cra_last_updated: |
| return cra_last_updated |
| else: |
| _,last_rd_date=read_cra_data(cra_file_path) |
| return last_rd_date.strftime("%Y-%m-%d") |
| |
| |
| CRA_LAST_UPDATED=get_cra_updated_date() |