File size: 5,805 Bytes
61d29fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env python3
"""
Fix and publish the 5 failed datasets
"""
import os
from pathlib import Path
from datetime import datetime
import pandas as pd
from huggingface_hub import HfApi, create_repo
from datasets import Dataset
from loguru import logger
from dotenv import load_dotenv

# Load variables from a .env file (if present) into the environment before
# the settings below are read.
load_dotenv()

# Hugging Face access token; None when the variable is unset (main() checks
# for this and refuses to run without it).
HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
# Hub namespace the datasets are published under; falls back to 'CommunityOne'.
HF_ORGANIZATION = os.getenv('HF_ORGANIZATION', 'CommunityOne')
# Root directory of the gold data files, relative to the working directory.
# NOTE(review): the functions visible in this file spell out 'data/gold/...'
# paths directly instead of using this constant.
GOLD_DIR = Path("data/gold")


def _stringify_preserving_nulls(series):
    """Return *series* with non-null values cast to ``str``; nulls stay null.

    A plain ``astype(str)`` would turn missing values into the literal
    strings "None" / "nan", which would then be published as real data.
    """
    as_objects = series.astype(object)
    # Series.where keeps entries where the condition is True (the nulls) and
    # takes the stringified value everywhere else.
    return as_objects.where(as_objects.isna(), as_objects.astype(str))


def fix_and_publish_jurisdictions():
    """Normalize and publish the 4 jurisdiction reference files.

    For each parquet file the frame is loaded, categorical and object
    columns are converted to plain strings (missing values are kept as
    missing), and the result is pushed as a public dataset to
    ``{HF_ORGANIZATION}/reference-<file stem with '_' replaced by '-'>``.

    A failure on one file is logged and does not stop the remaining files
    from being processed.  Returns None.
    """
    jurisdiction_files = [
        'data/gold/reference/jurisdictions_cities.parquet',
        'data/gold/reference/jurisdictions_counties.parquet',
        'data/gold/reference/jurisdictions_school_districts.parquet',
        'data/gold/reference/jurisdictions_townships.parquet',
    ]

    for file_str in jurisdiction_files:
        file_path = Path(file_str)
        dataset_name = f"reference-{file_path.stem.replace('_', '-')}"
        repo_id = f"{HF_ORGANIZATION}/{dataset_name}"

        logger.info(f"πŸ“€ Processing {file_path.name}...")

        try:
            # Load file
            df = pd.read_parquet(file_path)
            logger.info(f"   Loaded {len(df):,} records, {len(df.columns)} columns")

            # FIX: convert categorical and object columns to plain strings.
            # This works around the Arrow dictionary/categorical issue.
            for col in df.columns:
                dtype = df[col].dtype
                if dtype.name == 'category':
                    logger.info(f"   Converting categorical column: {col}")
                    df[col] = _stringify_preserving_nulls(df[col])
                elif dtype == 'object':
                    # Object columns may hold mixed Python values; make
                    # every non-null entry a string.
                    df[col] = _stringify_preserving_nulls(df[col])

            # Drop whatever index the file carried so it is not published
            df = df.reset_index(drop=True)

            logger.info("   Creating HuggingFace dataset...")
            dataset = Dataset.from_pandas(df, preserve_index=False)

            # Create repo.  exist_ok=True already covers the "repo exists"
            # case, so an exception here is a real problem (auth, network,
            # ...).  Keep going anyway: the push below will surface the
            # failure if the repo is truly unusable.
            try:
                create_repo(
                    repo_id=repo_id,
                    repo_type="dataset",
                    private=False,
                    exist_ok=True,
                    token=HUGGINGFACE_TOKEN
                )
            except Exception as e:
                logger.warning(f"   Could not create repo {repo_id}: {e}")

            # Push to hub
            logger.info(f"   Pushing to {repo_id}...")
            dataset.push_to_hub(
                repo_id=repo_id,
                private=False,
                commit_message=f"Update {dataset_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
                token=HUGGINGFACE_TOKEN
            )

            url = f"https://huggingface.co/datasets/{repo_id}"
            logger.success(f"   βœ… Published {len(df):,} records to {url}\n")

        except Exception as e:
            # Top-level boundary for one file: log and move on to the next.
            logger.error(f"   ❌ Failed: {e}\n")


def check_old_meeting_files():
    """Log the state of the events file and of any legacy meeting files.

    Looks for events_events.parquet (new naming) and for
    meetings_calendar.parquet / meetings.parquet (old naming) under
    data/gold/national, logging the record count of each file that can be
    read.  The function only reports; no file is renamed or deleted.
    """
    logger.info("πŸ” Checking for old meeting file naming...")

    current_events = Path('data/gold/national/events_events.parquet')
    legacy_calendar = Path('data/gold/national/meetings_calendar.parquet')
    legacy_meetings = Path('data/gold/national/meetings.parquet')

    if not current_events.exists():
        logger.warning("⚠️  events_events.parquet not found - run pipeline to generate")
    else:
        try:
            events_df = pd.read_parquet(current_events)
            logger.success(f"βœ… Found events_events.parquet with {len(events_df):,} records (new naming)")

            # List whichever legacy files are still sitting next to the new one
            calendar_left = legacy_calendar.exists()
            meetings_left = legacy_meetings.exists()
            if calendar_left or meetings_left:
                logger.warning("⚠️  Old meeting files still exist - these can be deleted:")
                if calendar_left:
                    logger.info("   - meetings_calendar.parquet")
                if meetings_left:
                    logger.info("   - meetings.parquet")
                logger.info("   Run migration to rename old files to events_* naming\n")
        except Exception as err:
            logger.error(f"❌ events_events.parquet error: {err}")

    # Independently of the new file, try to read each legacy file
    if legacy_calendar.exists():
        try:
            calendar_df = pd.read_parquet(legacy_calendar)
            logger.info(f"πŸ“‹ Old meetings_calendar.parquet has {len(calendar_df):,} records")
        except Exception as err:
            logger.error(f"❌ meetings_calendar.parquet is corrupted: {err}")

    if legacy_meetings.exists():
        try:
            meetings_df = pd.read_parquet(legacy_meetings)
            logger.info(f"πŸ“‹ Old meetings.parquet has {len(meetings_df):,} records")
        except Exception as err:
            logger.error(f"❌ meetings.parquet is corrupted: {err}")
            size_mb = legacy_meetings.stat().st_size / 1024 / 1024
            logger.info(f"   File size: {size_mb:.2f} MB")

    # Blank line to separate this report from what follows
    logger.info("")


def main():
    """Run the meeting-file check, then publish the jurisdiction datasets.

    Logs an error and returns immediately when HUGGINGFACE_TOKEN is not
    set.  Returns None.
    """
    if not HUGGINGFACE_TOKEN:
        logger.error("❌ HUGGINGFACE_TOKEN not set")
        return

    logger.info("=" * 80)
    logger.info("πŸ”§ Fixing and Publishing Failed Datasets")
    logger.info("=" * 80)
    print()

    # Check for old meeting file naming
    check_old_meeting_files()

    # Fix and publish jurisdiction files
    logger.info("πŸ“‹ Publishing 4 jurisdiction reference datasets...")
    print()
    fix_and_publish_jurisdictions()

    # Summary.  The link is built from HF_ORGANIZATION so that it still
    # points at the right namespace when the organization is overridden
    # through the environment.
    logger.info("=" * 80)
    logger.success(f"βœ… Done! Check your datasets at: https://huggingface.co/{HF_ORGANIZATION}")
    logger.info("=" * 80)


if __name__ == "__main__":
    main()