#!/usr/bin/env python3
"""
Split meetings_transcripts.parquet by state and build a per-state
meetings_calendar.parquet from each state's existing meetings.parquet.
Writes the state-specific files to data/gold/states/{STATE}/ directories.
"""
from pathlib import Path
import pandas as pd
from loguru import logger
def split_transcripts_by_state():
    """Split the national transcripts file into one parquet file per state.

    Reads ``data/gold/national/meetings_transcripts.parquet`` and, for each
    distinct non-null value of its ``state`` column, writes the matching rows
    to ``data/gold/states/{state}/meetings_transcripts.parquet`` (snappy
    compression, index not written). Rows whose ``state`` is null cannot be
    mapped to a directory; they are counted, reported, and skipped.

    Returns:
        The full transcripts DataFrame exactly as loaded from the national
        file (including any rows that were skipped when writing).
    """
    logger.info("📝 Splitting meetings_transcripts.parquet by state...")

    # Load transcripts from national directory
    transcripts_file = Path("data/gold/national/meetings_transcripts.parquet")
    df_trans = pd.read_parquet(transcripts_file)

    state_column = df_trans['state']
    state_count = state_column.nunique()
    logger.info(f" Loaded {len(df_trans):,} transcripts from {state_count} states")

    # A null state has no directory name; surface it instead of failing mid-run.
    missing_state = int(state_column.isna().sum())
    if missing_state:
        logger.warning(f" ⚠️ Skipping {missing_state:,} transcripts with no state value")

    states_dir = Path("data/gold/states")
    states_dir.mkdir(parents=True, exist_ok=True)

    total_size = 0
    # One groupby pass partitions the frame (keys come back sorted, null keys
    # are dropped) instead of re-scanning the whole frame once per state.
    # observed=True keeps unused categorical levels from producing empty files.
    for state, state_df in df_trans.groupby('state', sort=True, observed=True):
        # Create state directory
        state_dir = states_dir / str(state)
        state_dir.mkdir(parents=True, exist_ok=True)

        # Save transcripts
        output_file = state_dir / "meetings_transcripts.parquet"
        state_df.to_parquet(output_file, index=False, compression='snappy')

        size = output_file.stat().st_size
        total_size += size
        logger.success(
            f" ✅ {state}: {len(state_df):,} transcripts → "
            f"{output_file} ({size / 1024 / 1024:.1f} MB)"
        )

    logger.success(
        f" 📦 Total: {state_count} states, "
        f"{total_size / 1024 / 1024:.1f} MB"
    )
    return df_trans
def create_calendar_from_state_meetings():
    """Create calendar files from existing state meetings.parquet files.

    For every directory under ``data/gold/states`` that contains a
    ``meetings.parquet``, selects the calendar-relevant columns that are
    present, renames them to their calendar names, and writes the result to
    ``meetings_calendar.parquet`` in the same directory (snappy compression,
    index not written).

    Returns:
        None. Logs a warning and returns early when no state directory with
        a ``meetings.parquet`` exists (including when ``data/gold/states``
        itself is missing).
    """
    logger.info("\n📅 Creating meetings_calendar.parquet from state meetings...")

    states_dir = Path("data/gold/states")

    # Find all state directories with meetings.parquet. Guard the listing so
    # a missing root directory is reported as "nothing found" rather than
    # raising from iterdir().
    state_dirs = []
    if states_dir.is_dir():
        state_dirs = sorted(
            d for d in states_dir.iterdir()
            if d.is_dir() and (d / "meetings.parquet").exists()
        )

    if not state_dirs:
        logger.warning(" ⚠️ No state directories with meetings.parquet found")
        return

    logger.info(f" Found {len(state_dirs)} states with meeting data")

    # Source column -> calendar column. Loop-invariant, so built once.
    calendar_cols = {
        'vid_id': 'meeting_id',
        'place_name': 'jurisdiction',
        'channel_type': 'channel_type',
        'meeting_date': 'meeting_date',
        'vid_upload_date': 'upload_date',
        'vid_title': 'title',
        'vid_length_min': 'duration_min',
    }

    total_size = 0
    total_records = 0
    for state_dir in state_dirs:
        state = state_dir.name

        # Load full meetings data
        df = pd.read_parquet(state_dir / "meetings.parquet")

        # Select and rename only the columns that exist in this state's file;
        # rename() already returns a new frame, so no explicit copy is needed.
        available_cols = {src: dst for src, dst in calendar_cols.items() if src in df.columns}
        calendar_df = df[list(available_cols)].rename(columns=available_cols)

        # Save calendar
        output_file = state_dir / "meetings_calendar.parquet"
        calendar_df.to_parquet(output_file, index=False, compression='snappy')

        size = output_file.stat().st_size
        total_size += size
        total_records += len(calendar_df)
        logger.success(
            f" ✅ {state}: {len(calendar_df):,} calendar records → "
            f"{output_file} ({size / 1024 / 1024:.2f} MB)"
        )

    logger.success(
        f" 📦 Total: {len(state_dirs)} states, {total_records:,} records, "
        f"{total_size / 1024 / 1024:.1f} MB"
    )
def main():
    """Run both steps: split transcripts by state, then build state calendars."""
    logger.info("🚀 Starting meetings data split by state...\n")

    # Split transcripts (has state column in source file)
    split_transcripts_by_state()

    # Create calendar files from existing state meetings data
    create_calendar_from_state_meetings()

    logger.success("\n✅ Done! Meetings data split by state")
    # "{STATE}" is a literal placeholder in the message, not a format field.
    logger.info("\nFiles created in: data/gold/states/{STATE}/")
    logger.info(" - meetings_transcripts.parquet")
    logger.info(" - meetings_calendar.parquet")
# Run the split only when executed as a script, not when imported.
if __name__ == "__main__":
    main()