#!/usr/bin/env python3
"""
Upload discovery and processed data to Hugging Face Datasets.

Tabular data is pushed as dataset splits (stored as Parquet on the Hub)
rather than as thousands of individual files.

Usage:
    # Install requirements
    pip install huggingface_hub datasets

    # Get your token from https://huggingface.co/settings/tokens
    export HUGGINGFACE_TOKEN="hf_YOUR_TOKEN_HERE"

    # Upload discovery results (reads data/bronze/discovered_sources)
    python scripts/upload_to_huggingface.py --discovery

    # Upload meeting data (CSV, JSON or Parquet with extracted text)
    python scripts/upload_to_huggingface.py --meetings meetings.csv

    # Upload oral health subset
    python scripts/upload_to_huggingface.py --oral-health oral_health.csv

    # Create/refresh the dataset card
    python scripts/upload_to_huggingface.py --create-card

    # Extract text from a list of PDF URLs (one per line) into Parquet
    python scripts/upload_to_huggingface.py --process-pdfs urls.txt
"""

import argparse
import io
import os
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Optional

import pandas as pd
from datasets import Dataset, DatasetDict, Features, Value, Sequence
from huggingface_hub import login, create_repo, HfApi
from loguru import logger

# Configuration
DEFAULT_REPO_NAME = "oral-health-policy-data"

# Columns that identify one jurisdiction in the discovery CSVs.
_DISCOVERY_KEY_COLUMNS = ["name", "state"]

# Supported input formats, keyed by lower-cased file suffix.
_TABLE_READERS = {
    ".csv": pd.read_csv,
    ".json": pd.read_json,
    ".parquet": pd.read_parquet,
}


def _read_table(file_path: Path) -> Optional[pd.DataFrame]:
    """Load a CSV/JSON/Parquet file into a DataFrame.

    Returns None (after logging an error) when the suffix is not supported.
    The suffix is matched case-insensitively.
    """
    reader = _TABLE_READERS.get(file_path.suffix.lower())
    if reader is None:
        logger.error(
            f"Unsupported file type: {file_path.suffix}. "
            "Use .csv, .json, or .parquet"
        )
        return None
    return reader(file_path)


def _parquet_size_mb(df: pd.DataFrame) -> float:
    """Return the size in MB of *df* serialized as snappy-compressed Parquet.

    The file is written inside a temporary directory so that nothing in the
    current working directory is overwritten or left behind if a later step
    fails.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = Path(tmp_dir) / "data.parquet"
        df.to_parquet(tmp_file, compression="snappy", index=False)
        return tmp_file.stat().st_size / (1024 * 1024)


def _render_dataset_card(repo_name: str) -> str:
    """Build the README.md (dataset card) text for *repo_name*."""
    last_updated = pd.Timestamp.now().strftime('%Y-%m-%d')
    # NOTE(review): the copy of this file that was reviewed had lost the part
    # of the card between the `size_categories` value and the
    # "## Data Collection" heading (only the fragment "0.8)" and a closing
    # code fence survived). The front matter is closed here so the card is
    # valid; restore the missing description/usage sections from version
    # control if they exist.
    return f"""---
license: cc0-1.0
task_categories:
- text-classification
- summarization
language:
- en
tags:
- government
- public-health
- oral-health
- policy
- meeting-minutes
size_categories:
- 10K<n<100K
---

# Oral Health Policy Data

## Data Collection

Data was collected through:

1. **Automated discovery** across 22,000+ U.S. jurisdictions
2. **Pattern matching** for official websites and social media
3. **API integration** with Legistar, SuiteOne, and other platforms
4. **Web scraping** of public government websites
5. **YouTube channel discovery** using multiple handle patterns
6. **Text extraction** from public PDF documents
7. **Keyword filtering** for oral health topics

## Ethical Considerations

- All data is from public government sources
- No personal information is included
- Documents are public records under Freedom of Information laws
- Dataset helps research community access public policy information

## Citation

If you use this dataset in your research, please cite:

```
@dataset{{oral_health_policy_data,
  author = {{Oral Health Policy Pulse Project}},
  title = {{Oral Health Policy Data}},
  year = {{2026}},
  publisher = {{Hugging Face}},
  url = {{https://huggingface.co/datasets/{repo_name}}}
}}
```

## Maintenance

This dataset is actively maintained. Updates are pushed regularly as new data is discovered.

Last updated: {last_updated}
"""


class HuggingFaceUploader:
    """Upload oral health policy data to Hugging Face Datasets."""

    def __init__(self, repo_name: str, token: Optional[str] = None):
        """
        Initialize uploader: resolve the token, log in and ensure the repo exists.

        Args:
            repo_name: Hugging Face repo name (e.g., "username/oral-health-policy-data")
            token: HF token (or set HUGGINGFACE_TOKEN environment variable)

        Raises:
            ValueError: If no token is passed and neither HUGGINGFACE_TOKEN
                nor HF_TOKEN is set.
        """
        self.repo_name = repo_name
        # Check HUGGINGFACE_TOKEN first (matches .env), fall back to HF_TOKEN
        # for backwards compatibility
        self.token = token or os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")

        if not self.token:
            raise ValueError(
                "Hugging Face token required! "
                "Get it from https://huggingface.co/settings/tokens "
                "and set HUGGINGFACE_TOKEN environment variable"
            )

        # Login
        login(token=self.token)
        logger.info("✅ Logged in to Hugging Face")

        # Create repo if doesn't exist. Kept best-effort: a failure here is
        # logged and the later push will surface any real problem.
        try:
            repo_url = create_repo(
                repo_id=self.repo_name,
                repo_type="dataset",
                private=False,  # Public = FREE unlimited storage!
                exist_ok=True,
            )
            # A bare name such as the default has no "username/" prefix; use
            # the id reported by the Hub (when the installed huggingface_hub
            # exposes it) so the URLs logged below point at the real repo.
            self.repo_name = getattr(repo_url, "repo_id", None) or self.repo_name
            logger.info(f"✅ Repository ready: https://huggingface.co/datasets/{self.repo_name}")
        except Exception as e:
            logger.warning(f"Repository may already exist: {e}")

    def _push_split(self, df: pd.DataFrame, split: str, commit_message: str) -> Dataset:
        """Convert *df* to a Dataset and push it to the repo as *split*."""
        # preserve_index=False: a filtered/deduplicated frame has a
        # non-range index that would otherwise be uploaded as an extra
        # "__index_level_0__" column.
        dataset = Dataset.from_pandas(df, preserve_index=False)
        dataset.push_to_hub(
            self.repo_name,
            split=split,
            commit_message=commit_message,
        )
        return dataset

    def upload_discovery_results(self, data_dir: str = "data/bronze/discovered_sources"):
        """
        Upload discovery results to Hugging Face as Parquet.

        IMPORTANT: This uploads 1 Parquet file (not thousands of individual files).
        This keeps you under Hugging Face's 100k file limit.

        All ``discovery_*.csv`` files in *data_dir* are concatenated in
        file-name order and de-duplicated on (name, state); for duplicates
        the row from the last file wins.

        Args:
            data_dir: Directory containing discovery CSV files

        Returns:
            The uploaded Dataset, or None when there was nothing to upload.
        """
        logger.info(f"📤 Uploading discovery results from {data_dir}")

        data_path = Path(data_dir)
        if not data_path.exists():
            logger.error(f"Directory not found: {data_dir}")
            return None

        # Sorted so that "keep='last'" below is deterministic; glob() order
        # is filesystem-dependent.
        csv_files = sorted(data_path.glob("discovery_*.csv"))
        if not csv_files:
            logger.warning(f"No discovery CSV files found in {data_dir}")
            return None

        # Load all CSVs, skipping files that are completely empty
        frames = []
        for csv_file in csv_files:
            logger.info(f" Loading {csv_file.name}...")
            try:
                frames.append(pd.read_csv(csv_file))
            except pd.errors.EmptyDataError:
                logger.warning(f" Skipping empty file: {csv_file.name}")

        if not frames:
            logger.warning(f"No discovery CSV files found in {data_dir}")
            return None

        combined = pd.concat(frames, ignore_index=True)

        # Remove duplicates on whichever key columns are actually present
        dedup_keys = [col for col in _DISCOVERY_KEY_COLUMNS if col in combined.columns]
        if len(dedup_keys) < len(_DISCOVERY_KEY_COLUMNS):
            absent = [col for col in _DISCOVERY_KEY_COLUMNS if col not in dedup_keys]
            logger.warning(f" Missing de-duplication columns: {absent}")
        if dedup_keys:
            combined = combined.drop_duplicates(subset=dedup_keys, keep='last')
        combined = combined.reset_index(drop=True)

        logger.info(f" Total jurisdictions: {len(combined)}")
        logger.info(f" Columns: {', '.join(map(str, combined.columns))}")

        # Measure the compressed Parquet size (informational only)
        file_size_mb = _parquet_size_mb(combined)
        logger.info(f" Parquet file size: {file_size_mb:.2f} MB")

        # Upload (Hugging Face stores as Parquet internally)
        logger.info(" Uploading to Hugging Face as Parquet...")
        dataset = self._push_split(combined, "discovery", "Update discovery results")

        logger.success(f"✅ Uploaded {len(combined)} jurisdictions in 1 Parquet file!")
        logger.success(f" File size: {file_size_mb:.2f} MB (not {len(combined)} individual files)")
        logger.success(f" View at: https://huggingface.co/datasets/{self.repo_name}")

        return dataset

    def upload_meeting_data(self, meetings_file: str):
        """
        Upload meeting data to Hugging Face as Parquet.

        IMPORTANT: Pass a CSV/JSON with extracted text, NOT individual PDF files.
        This keeps you under Hugging Face's 100k file limit.

        Expected columns:
        - jurisdiction: City/county name
        - state: State code
        - date: Meeting date
        - title: Meeting title
        - agenda_text: Extracted text from agenda PDF
        - minutes_text: Extracted text from minutes PDF
        - source_url: Link to original PDF
        - video_url: Link to YouTube (optional)

        Only jurisdiction/state/date are checked, and a missing one only
        produces a warning.

        Args:
            meetings_file: CSV/JSON/Parquet file with meeting data
                (text extracted, not PDF bytes)

        Returns:
            The uploaded Dataset, or None when the file is missing or of an
            unsupported type.
        """
        logger.info(f"📤 Uploading meeting data from {meetings_file}")

        file_path = Path(meetings_file)
        if not file_path.exists():
            logger.error(f"File not found: {meetings_file}")
            return None

        df = _read_table(file_path)
        if df is None:
            return None

        logger.info(f" Meetings: {len(df)}")
        logger.info(f" Columns: {', '.join(map(str, df.columns))}")

        # Validate expected columns
        required_cols = ['jurisdiction', 'state', 'date']
        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            logger.warning(f" Missing recommended columns: {missing}")

        # Measure the compressed Parquet size (informational only)
        file_size_mb = _parquet_size_mb(df)
        logger.info(f" Parquet file size: {file_size_mb:.2f} MB")

        # Upload
        logger.info(f" Uploading {len(df):,} meetings as 1 Parquet file...")
        dataset = self._push_split(df, "meetings", "Update meeting data")

        logger.success(f"✅ Uploaded {len(df):,} meetings in 1 Parquet file!")
        logger.success(f" File size: {file_size_mb:.2f} MB")
        logger.success(f" NOT {len(df):,} individual PDF files (would exceed limits)")

        return dataset

    def upload_oral_health_subset(self, filtered_file: str):
        """
        Upload filtered oral health documents to Hugging Face.

        Args:
            filtered_file: CSV/JSON/Parquet with oral health-related documents

        Returns:
            The uploaded Dataset, or None when the file is missing or of an
            unsupported type.
        """
        logger.info(f"📤 Uploading oral health subset from {filtered_file}")

        file_path = Path(filtered_file)
        if not file_path.exists():
            logger.error(f"File not found: {filtered_file}")
            return None

        df = _read_table(file_path)
        if df is None:
            return None

        logger.info(f" Documents: {len(df)}")

        dataset = self._push_split(df, "oral_health", "Update oral health documents")

        logger.success(f"✅ Uploaded {len(df)} oral health documents!")

        return dataset

    def create_dataset_card(self):
        """Create README.md for the dataset and upload it to the repo."""
        readme = _render_dataset_card(self.repo_name)

        # Upload README
        api = HfApi()
        api.upload_file(
            path_or_fileobj=readme.encode('utf-8'),
            path_in_repo="README.md",
            repo_id=self.repo_name,
            repo_type="dataset",
            token=self.token,
        )

        logger.success("✅ Dataset card created!")


def process_pdfs_to_parquet(pdf_urls: list, output_file: str = "meetings_processed.parquet"):
    """
    CORRECT WAY: Extract text from PDFs and save as Parquet (not individual files).

    This function demonstrates the proper workflow:
    1. Download PDF into memory
    2. Extract text
    3. Store metadata + text in DataFrame
    4. Drop the PDF bytes
    5. Save all as single Parquet file

    This avoids uploading millions of individual files to Hugging Face.
    A URL that cannot be downloaded or parsed is logged and skipped.

    Args:
        pdf_urls: List of PDF URLs to process
        output_file: Output Parquet file path

    Returns:
        DataFrame with processed meetings, or None when httpx/PyPDF2 are
        not installed.
    """
    # Optional dependencies: only needed for this code path.
    try:
        import httpx
        from PyPDF2 import PdfReader
    except ImportError:
        logger.error("Install required packages: pip install httpx PyPDF2")
        return None

    logger.info(f"Processing {len(pdf_urls)} PDFs to Parquet format...")

    all_meetings = []
    total = len(pdf_urls)

    # The context manager closes the connection pool even on interruption.
    with httpx.Client(timeout=30) as client:
        for i, pdf_url in enumerate(pdf_urls, 1):
            try:
                logger.info(f" [{i}/{total}] Processing {pdf_url}...")

                # Download PDF into memory (don't save to disk)
                response = client.get(pdf_url)
                # Fail on 4xx/5xx instead of feeding an error page to the
                # PDF parser.
                response.raise_for_status()
                pdf_bytes = response.content

                # Extract text from PDF; each page's text is followed by a
                # newline. `or ""` guards against a page yielding no text.
                pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
                text = "".join(
                    (page.extract_text() or "") + "\n" for page in pdf_reader.pages
                )

                # Store metadata + text (NOT PDF bytes)
                all_meetings.append({
                    'source_url': pdf_url,
                    'text': text,
                    'page_count': len(pdf_reader.pages),
                    'file_size_kb': len(pdf_bytes) // 1024,
                    'processed_at': datetime.now().isoformat(),
                    # Add your metadata extraction here:
                    # 'jurisdiction': extract_jurisdiction(text),
                    # 'date': extract_date(text),
                    # 'title': extract_title(text),
                })

                # Delete PDF bytes immediately (free memory!)
                del pdf_bytes

            except Exception as e:
                # Best-effort batch: one bad URL must not abort the run.
                logger.warning(f" Failed to process {pdf_url}: {e}")
                continue

    # Convert to DataFrame
    df = pd.DataFrame(all_meetings)

    logger.info(f"Successfully processed {len(df)} PDFs")

    # Save as Parquet (compressed)
    df.to_parquet(output_file, compression='snappy', index=False)

    file_size_mb = Path(output_file).stat().st_size / (1024 * 1024)
    logger.success(f"✅ Saved to {output_file} ({file_size_mb:.2f} MB)")
    logger.success(f" This is 1 file, not {len(df)} individual PDFs!")

    return df


def main():
    """Main entry point: parse CLI flags and run the requested actions."""
    parser = argparse.ArgumentParser(
        description="Upload oral health policy data to Hugging Face"
    )
    parser.add_argument(
        "--repo",
        default=DEFAULT_REPO_NAME,
        help="Hugging Face repo name (e.g., 'username/oral-health-policy-data')"
    )
    parser.add_argument(
        "--discovery",
        action="store_true",
        help="Upload discovery results"
    )
    parser.add_argument(
        "--meetings",
        help="Upload meeting data from CSV/JSON file"
    )
    parser.add_argument(
        "--oral-health",
        help="Upload oral health subset from CSV/JSON file"
    )
    parser.add_argument(
        "--create-card",
        action="store_true",
        help="Create dataset README card"
    )
    parser.add_argument(
        "--process-pdfs",
        help="Process PDF URLs from file and save as Parquet (CORRECT way to handle PDFs)"
    )

    args = parser.parse_args()

    # Special case: Process PDFs to Parquet first (no upload, no token needed)
    if args.process_pdfs:
        logger.info("Processing PDFs to Parquet format (not uploading individual files)...")

        # Load PDF URLs from file: one URL per line, blank lines ignored
        with open(args.process_pdfs, encoding="utf-8") as f:
            pdf_urls = [line.strip() for line in f if line.strip()]

        # Process to Parquet
        df = process_pdfs_to_parquet(pdf_urls, "meetings_processed.parquet")

        if df is not None:
            logger.success("✅ PDFs processed! Now upload the Parquet file:")
            logger.success(f" python {__file__} --meetings meetings_processed.parquet")

        return

    # Bail out before touching the network: constructing the uploader logs
    # in and creates the repository, which is pointless with nothing to do.
    if not any([args.discovery, args.meetings, args.oral_health, args.create_card]):
        logger.warning(
            "No action specified. Use --discovery, --meetings, --oral-health, "
            "--create-card, or --process-pdfs"
        )
        parser.print_help()
        return

    # Initialize uploader
    uploader = HuggingFaceUploader(args.repo)

    # Upload based on flags
    if args.discovery:
        uploader.upload_discovery_results()

    if args.meetings:
        uploader.upload_meeting_data(args.meetings)

    if args.oral_health:
        uploader.upload_oral_health_subset(args.oral_health)

    if args.create_card:
        uploader.create_dataset_card()


if __name__ == "__main__":
    main()