File size: 4,448 Bytes
896453f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
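"""
Produce per-state meetings_transcripts.parquet and meetings_calendar.parquet files.

Transcripts are split from data/gold/national/meetings_transcripts.parquet by
their ``state`` column; calendars are derived from each state's existing
meetings.parquet.

Creates state-specific files in data/gold/states/{STATE}/ directories.
"""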

from pathlib import Path
import pandas as pd
from loguru import logger


def split_transcripts_by_state():
    """Split the national meetings_transcripts.parquet into one file per state.

    Reads ``data/gold/national/meetings_transcripts.parquet``, groups its rows
    by the ``state`` column and writes every group to
    ``data/gold/states/{state}/meetings_transcripts.parquet`` (snappy
    compression, index dropped). Rows whose ``state`` is missing cannot be
    mapped to a directory, so they are skipped and reported with a warning.

    Returns:
        The full, unsplit transcripts DataFrame exactly as loaded from disk.

    Raises:
        KeyError: If the loaded frame has no ``state`` column.
        Any error raised by ``pd.read_parquet`` when the national file is
        missing or unreadable is propagated unchanged.
    """
    logger.info("📝 Splitting meetings_transcripts.parquet by state...")

    # Load transcripts from national directory
    transcripts_file = Path("data/gold/national/meetings_transcripts.parquet")
    df_trans = pd.read_parquet(transcripts_file)

    # nunique() ignores missing values, so NaN is not counted as a state.
    state_count = df_trans['state'].nunique()
    logger.info(f"   Loaded {len(df_trans):,} transcripts from {state_count} states")

    # Rows without a state would otherwise vanish silently; make it visible.
    missing_state = int(df_trans['state'].isna().sum())
    if missing_state:
        logger.warning(f"   ⚠️  Skipping {missing_state:,} transcripts with no state")

    states_dir = Path("data/gold/states")
    states_dir.mkdir(parents=True, exist_ok=True)

    total_size = 0

    # A single groupby pass replaces one full boolean-mask scan per state.
    # sort=True keeps the alphabetical processing order, the default
    # dropna=True excludes missing states (sorting NaN together with strings
    # raises TypeError), and observed=True avoids empty groups for unused
    # categories when the column is categorical.
    for state, state_df in df_trans.groupby('state', sort=True, observed=True):
        # Create state directory; str() lets non-string keys form a path.
        state_dir = states_dir / str(state)
        state_dir.mkdir(parents=True, exist_ok=True)

        # Save transcripts
        output_file = state_dir / "meetings_transcripts.parquet"
        state_df.to_parquet(output_file, index=False, compression='snappy')

        size = output_file.stat().st_size
        total_size += size

        logger.success(
            f"   ✅ {state}: {len(state_df):,} transcripts → "
            f"{output_file} ({size / 1024 / 1024:.1f} MB)"
        )

    logger.success(
        f"   📦 Total: {state_count} states, "
        f"{total_size / 1024 / 1024:.1f} MB"
    )

    return df_trans


def create_calendar_from_state_meetings():
    """Create a meetings_calendar.parquet next to each state's meetings.parquet.

    Scans ``data/gold/states`` for sub-directories that contain a
    ``meetings.parquet`` file. For each one, the calendar-relevant columns
    that are present are selected, renamed to their calendar names and written
    to ``meetings_calendar.parquet`` in the same directory (snappy
    compression, index dropped). Source columns that are absent from a state's
    file are simply left out of that state's calendar.

    If the states directory does not exist, or no sub-directory has a
    ``meetings.parquet``, a warning is logged and nothing is written.

    Returns:
        None.
    """
    logger.info("\n📅 Creating meetings_calendar.parquet from state meetings...")

    states_dir = Path("data/gold/states")

    # Source column -> calendar column. Defined once; it does not vary by state.
    calendar_cols = {
        'vid_id': 'meeting_id',
        'place_name': 'jurisdiction',
        'channel_type': 'channel_type',
        'meeting_date': 'meeting_date',
        'vid_upload_date': 'upload_date',
        'vid_title': 'title',
        'vid_length_min': 'duration_min',
    }

    # iterdir() raises FileNotFoundError on a missing directory; treat that
    # the same as "no state data" so the warning below handles it.
    candidates = states_dir.iterdir() if states_dir.is_dir() else ()

    # Find all state directories with meetings.parquet
    state_dirs = sorted(
        d for d in candidates
        if d.is_dir() and (d / "meetings.parquet").exists()
    )

    if not state_dirs:
        logger.warning("   ⚠️  No state directories with meetings.parquet found")
        return

    logger.info(f"   Found {len(state_dirs)} states with meeting data")

    total_size = 0
    total_records = 0

    for state_dir in state_dirs:
        state = state_dir.name

        # Load full meetings data
        df = pd.read_parquet(state_dir / "meetings.parquet")

        # Select and rename only the columns this state's file actually has.
        available_cols = {
            src: dst for src, dst in calendar_cols.items() if src in df.columns
        }
        # rename() returns a new frame, so no explicit copy is needed.
        calendar_df = df[list(available_cols)].rename(columns=available_cols)

        # Save calendar
        output_file = state_dir / "meetings_calendar.parquet"
        calendar_df.to_parquet(output_file, index=False, compression='snappy')

        size = output_file.stat().st_size
        total_size += size
        total_records += len(calendar_df)

        logger.success(
            f"   ✅ {state}: {len(calendar_df):,} calendar records → "
            f"{output_file} ({size / 1024 / 1024:.2f} MB)"
        )

    logger.success(
        f"   📦 Total: {len(state_dirs)} states, {total_records:,} records, "
        f"{total_size / 1024 / 1024:.1f} MB"
    )


def main():
    """Run both per-state steps in order and log a summary.

    First splits the national transcripts file by state, then builds a
    calendar file from each state's existing meetings.parquet. The DataFrame
    returned by the transcript split is not used here.

    Returns:
        None.
    """
    logger.info("🚀 Starting meetings data split by state...\n")

    # Split transcripts (has state column in source file)
    split_transcripts_by_state()

    # Create calendar files from existing state meetings data
    create_calendar_from_state_meetings()

    logger.success("\n✅ Done! Meetings data split by state")
    # "{STATE}" is a literal placeholder for the reader, not a format field.
    logger.info("\nFiles created in: data/gold/states/{STATE}/")
    logger.info("  - meetings_transcripts.parquet")
    logger.info("  - meetings_calendar.parquet")


# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()