open-navigator / scripts /huggingface /upload_to_huggingface.py
jcbowyer's picture
Clean HuggingFace deployment without binary files
61d29fc
#!/usr/bin/env python3
"""
Upload discovery and processed data to Hugging Face Datasets.
This script allows you to store unlimited data for FREE on Hugging Face.
Usage:
# Install requirements
pip install huggingface_hub datasets
# Get your token from https://huggingface.co/settings/tokens
export HUGGINGFACE_TOKEN="hf_YOUR_TOKEN_HERE"
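# Upload discovery results
python scripts/huggingface/upload_to_huggingface.py --discovery
# Upload meeting data (pass the path to a data file with extracted text)
python scripts/huggingface/upload_to_huggingface.py --meetings meetings.csv
# Upload oral health subset (pass the path to the filtered data file)
python scripts/huggingface/upload_to_huggingface.py --oral-health oral_health.csv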
"""
import argparse
import os
from pathlib import Path
import pandas as pd
from datasets import Dataset, DatasetDict, Features, Value, Sequence
from huggingface_hub import login, create_repo, HfApi
from loguru import logger
# Configuration
# Repo id used by main() when --repo is not supplied on the command line.
# NOTE(review): this value has no "username/" namespace prefix; presumably the
# Hub resolves it against the logged-in account -- confirm before relying on it.
DEFAULT_REPO_NAME = "oral-health-policy-data"
class HuggingFaceUploader:
    """Upload oral health policy data to Hugging Face Datasets.

    Every ``upload_*`` method loads tabular data into a pandas DataFrame and
    pushes it to ``repo_name`` as a single named split, so the repository
    holds a handful of Parquet files instead of one file per document.
    """

    def __init__(self, repo_name: str, token: "str | None" = None):
        """
        Log in to Hugging Face and make sure the dataset repository exists.

        Args:
            repo_name: Hugging Face repo name (e.g., "username/oral-health-policy-data")
            token: HF token (or set HUGGINGFACE_TOKEN environment variable)

        Raises:
            ValueError: If no token is passed and neither HUGGINGFACE_TOKEN
                nor HF_TOKEN is set in the environment.
        """
        self.repo_name = repo_name
        # Check HUGGINGFACE_TOKEN first (matches .env), fall back to HF_TOKEN for backwards compatibility
        self.token = token or os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")
        if not self.token:
            raise ValueError(
                "Hugging Face token required! "
                "Get it from https://huggingface.co/settings/tokens "
                "and set HUGGINGFACE_TOKEN environment variable"
            )

        # Login
        login(token=self.token)
        logger.info("✅ Logged in to Hugging Face")

        # Create repo if doesn't exist. Failure is deliberately non-fatal:
        # it is only logged, and the upload methods are still allowed to run.
        try:
            create_repo(
                repo_id=self.repo_name,
                repo_type="dataset",
                private=False,  # Public = FREE unlimited storage!
                exist_ok=True,
            )
            logger.info(f"✅ Repository ready: https://huggingface.co/datasets/{self.repo_name}")
        except Exception as e:
            logger.warning(f"Repository may already exist: {e}")

    @staticmethod
    def _read_table(file_path):
        """Load ``file_path`` into a DataFrame, or return None for an unsupported suffix.

        The suffix is matched case-insensitively; ``.csv``, ``.json`` and
        ``.parquet`` are supported.
        """
        suffix = file_path.suffix.lower()
        if suffix == ".csv":
            return pd.read_csv(file_path)
        if suffix == ".json":
            return pd.read_json(file_path)
        if suffix == ".parquet":
            return pd.read_parquet(file_path)
        return None

    @staticmethod
    def _parquet_size_mb(df):
        """Return the size in MB of ``df`` serialised as snappy-compressed Parquet.

        The frame is serialised to bytes in memory (``to_parquet`` with no
        path) rather than to a scratch file in the working directory, so no
        user file can be overwritten or deleted and nothing is left behind if
        the upload fails afterwards.
        """
        payload = df.to_parquet(compression="snappy", index=False)
        return len(payload) / (1024 * 1024)

    def upload_discovery_results(self, data_dir: str = "data/bronze/discovered_sources"):
        """
        Upload discovery results to Hugging Face as Parquet.

        IMPORTANT: This uploads 1 Parquet file (not thousands of individual files).
        This keeps you under Hugging Face's 100k file limit.

        Args:
            data_dir: Directory containing discovery CSV files

        Returns:
            The uploaded Dataset, or None when the directory does not exist
            or contains no ``discovery_*.csv`` files.
        """
        logger.info(f"📤 Uploading discovery results from {data_dir}")
        data_path = Path(data_dir)
        if not data_path.exists():
            logger.error(f"Directory not found: {data_dir}")
            return None

        # glob() yields files in filesystem order; sort so that keep="last"
        # below picks the same row on every machine.
        # NOTE(review): assumes file names sort oldest-to-newest -- confirm.
        csv_files = sorted(data_path.glob("discovery_*.csv"))
        if not csv_files:
            logger.warning(f"No discovery CSV files found in {data_dir}")
            return None

        # Load and combine all CSVs
        frames = []
        for csv_file in csv_files:
            logger.info(f" Loading {csv_file.name}...")
            frames.append(pd.read_csv(csv_file))
        combined = pd.concat(frames, ignore_index=True)

        # Remove duplicates (one row per jurisdiction)
        key_cols = ["name", "state"]
        absent = [col for col in key_cols if col not in combined.columns]
        if absent:
            logger.warning(f" Skipping de-duplication, missing key columns: {absent}")
        else:
            # reset_index: a non-default index would otherwise be exported by
            # Dataset.from_pandas as an extra "__index_level_0__" column.
            combined = combined.drop_duplicates(subset=key_cols, keep="last").reset_index(drop=True)

        column_list = ", ".join(map(str, combined.columns))
        logger.info(f" Total jurisdictions: {len(combined)}")
        logger.info(f" Columns: {column_list}")

        file_size_mb = self._parquet_size_mb(combined)
        logger.info(f" Parquet file size: {file_size_mb:.2f} MB")

        # Convert to Dataset (will be stored as Parquet on HF)
        dataset = Dataset.from_pandas(combined)

        # Upload (Hugging Face stores as Parquet internally)
        logger.info(" Uploading to Hugging Face as Parquet...")
        dataset.push_to_hub(
            self.repo_name,
            split="discovery",
            commit_message="Update discovery results",
        )

        logger.success(f"✅ Uploaded {len(combined)} jurisdictions in 1 Parquet file!")
        logger.success(f" File size: {file_size_mb:.2f} MB (not {len(combined)} individual files)")
        logger.success(f" View at: https://huggingface.co/datasets/{self.repo_name}")
        return dataset

    def upload_meeting_data(self, meetings_file: str):
        """
        Upload meeting data to Hugging Face as Parquet.

        IMPORTANT: Pass a CSV/JSON with extracted text, NOT individual PDF files.
        This keeps you under Hugging Face's 100k file limit.

        Expected columns:
            - jurisdiction: City/county name
            - state: State code
            - date: Meeting date
            - title: Meeting title
            - agenda_text: Extracted text from agenda PDF
            - minutes_text: Extracted text from minutes PDF
            - source_url: Link to original PDF
            - video_url: Link to YouTube (optional)

        Args:
            meetings_file: CSV/JSON/Parquet file with meeting data (text extracted, not PDF bytes)

        Returns:
            The uploaded Dataset, or None when the file is missing or has an
            unsupported extension.
        """
        logger.info(f"📤 Uploading meeting data from {meetings_file}")
        file_path = Path(meetings_file)
        if not file_path.exists():
            logger.error(f"File not found: {meetings_file}")
            return None

        # Load data
        df = self._read_table(file_path)
        if df is None:
            logger.error(f"Unsupported file type: {file_path.suffix}. Use .csv, .json, or .parquet")
            return None

        column_list = ", ".join(map(str, df.columns))
        logger.info(f" Meetings: {len(df)}")
        logger.info(f" Columns: {column_list}")

        # These columns are recommended, not enforced: a gap is only logged.
        recommended_cols = ["jurisdiction", "state", "date"]
        missing = [col for col in recommended_cols if col not in df.columns]
        if missing:
            logger.warning(f" Missing recommended columns: {missing}")

        file_size_mb = self._parquet_size_mb(df)
        logger.info(f" Parquet file size: {file_size_mb:.2f} MB")

        # Convert to Dataset (stored as Parquet on HF)
        dataset = Dataset.from_pandas(df)

        # Upload
        logger.info(f" Uploading {len(df):,} meetings as 1 Parquet file...")
        dataset.push_to_hub(
            self.repo_name,
            split="meetings",
            commit_message="Update meeting data",
        )

        logger.success(f"✅ Uploaded {len(df):,} meetings in 1 Parquet file!")
        logger.success(f" File size: {file_size_mb:.2f} MB")
        logger.success(f" NOT {len(df):,} individual PDF files (would exceed limits)")
        return dataset

    def upload_oral_health_subset(self, filtered_file: str):
        """
        Upload filtered oral health documents to Hugging Face.

        Args:
            filtered_file: CSV/JSON/Parquet file with oral health-related documents

        Returns:
            The uploaded Dataset, or None when the file is missing or has an
            unsupported extension.
        """
        logger.info(f"📤 Uploading oral health subset from {filtered_file}")
        file_path = Path(filtered_file)
        if not file_path.exists():
            logger.error(f"File not found: {filtered_file}")
            return None

        # Load data (same formats as upload_meeting_data)
        df = self._read_table(file_path)
        if df is None:
            logger.error(f"Unsupported file type: {file_path.suffix}. Use .csv, .json, or .parquet")
            return None
        logger.info(f" Documents: {len(df)}")

        # Convert to Dataset
        dataset = Dataset.from_pandas(df)

        # Upload
        dataset.push_to_hub(
            self.repo_name,
            split="oral_health",
            commit_message="Update oral health documents",
        )
        logger.success(f"✅ Uploaded {len(df)} oral health documents!")
        return dataset

    def create_dataset_card(self):
        """Create README.md for the dataset and upload it to the repository."""
        # The usage snippet needs a "namespace/name" repo id. When only a bare
        # name was configured, keep the YOUR_USERNAME placeholder.
        if "/" in self.repo_name:
            usage_repo = self.repo_name
        else:
            usage_repo = f"YOUR_USERNAME/{self.repo_name}"

        # The card text is kept flush-left because it is uploaded verbatim.
        readme = f"""---
license: cc0-1.0
task_categories:
- text-classification
- summarization
language:
- en
tags:
- government
- public-health
- oral-health
- policy
- meeting-minutes
size_categories:
- 10K<n<100K
---
# Oral Health Policy Data
This dataset contains comprehensive data about oral health policy discussions in U.S. government meetings.
## Dataset Description
- **Curated by:** Oral Health Policy Pulse Project
- **Language(s):** English
- **License:** CC0 (Public Domain)
## Dataset Structure
### Discovery Split
Contains information about 22,000+ U.S. jurisdictions (cities and counties), including:
- Official websites
- YouTube channels (with subscriber/video counts)
- Meeting platforms (Legistar, SuiteOne, Granicus, etc.)
- Agenda portals
- Social media accounts
- Completeness scores
**Columns:**
- `name`: Jurisdiction name
- `state`: State code
- `type`: "city" or "county"
- `population`: Population estimate
- `website`: Official website URL
- `youtube_channels`: Number of YouTube channels found
- `meeting_platforms`: Number of meeting platforms detected
- `agenda_portals`: Number of agenda portal URLs
- `completeness`: Completeness score (0-1)
### Meetings Split
Contains processed meeting data including:
- Meeting metadata (date, time, body, location)
- Agenda items
- Minutes/transcripts
- Video URLs
- Source links
### Oral Health Split
Contains filtered subset of meetings/documents that mention oral health topics:
- Fluoridation discussions
- Dental clinic approvals
- Water treatment policy
- School dental programs
- Public health initiatives
## Usage
```python
from datasets import load_dataset
# Load discovery data
discovery = load_dataset("{usage_repo}", split="discovery")
# Load meeting data
meetings = load_dataset("{usage_repo}", split="meetings")
# Load oral health subset
oral_health = load_dataset("{usage_repo}", split="oral_health")
# Filter for specific state
alabama_data = discovery.filter(lambda x: x['state'] == 'AL')
# Find high-quality sources
high_quality = discovery.filter(lambda x: x['completeness'] > 0.8)
```
## Data Collection
Data was collected through:
1. **Automated discovery** across 22,000+ U.S. jurisdictions
2. **Pattern matching** for official websites and social media
3. **API integration** with Legistar, SuiteOne, and other platforms
4. **Web scraping** of public government websites
5. **YouTube channel discovery** using multiple handle patterns
6. **Text extraction** from public PDF documents
7. **Keyword filtering** for oral health topics
## Ethical Considerations
- All data is from public government sources
- No personal information is included
- Documents are public records under Freedom of Information laws
- Dataset helps research community access public policy information
## Citation
If you use this dataset in your research, please cite:
```
@dataset{{oral_health_policy_data,
author = {{Oral Health Policy Pulse Project}},
title = {{Oral Health Policy Data}},
year = {{2026}},
publisher = {{Hugging Face}},
url = {{https://huggingface.co/datasets/{self.repo_name}}}
}}
```
## Maintenance
This dataset is actively maintained. Updates are pushed regularly as new data is discovered.
Last updated: {pd.Timestamp.now().strftime('%Y-%m-%d')}
"""
        # Upload README
        api = HfApi()
        api.upload_file(
            path_or_fileobj=readme.encode('utf-8'),
            path_in_repo="README.md",
            repo_id=self.repo_name,
            repo_type="dataset",
            token=self.token,
        )
        logger.success("✅ Dataset card created!")
def process_pdfs_to_parquet(pdf_urls: list, output_file: str = "meetings_processed.parquet"):
    """
    CORRECT WAY: Extract text from PDFs and save as Parquet (not individual files).

    This function demonstrates the proper workflow:
    1. Download PDF into memory
    2. Extract text
    3. Store metadata + text in DataFrame
    4. Discard the PDF bytes
    5. Save all as single Parquet file

    This avoids uploading millions of individual files to Hugging Face.

    Args:
        pdf_urls: List of PDF URLs to process
        output_file: Output Parquet file path

    Returns:
        DataFrame with one row per successfully processed PDF (empty when
        none succeeded, in which case ``output_file`` is not written), or
        None if httpx / PyPDF2 are not installed.
    """
    import io
    from datetime import datetime

    # Optional third-party dependencies: report how to install them instead of crashing.
    try:
        import httpx
        from PyPDF2 import PdfReader
    except ImportError:
        logger.error("Install required packages: pip install httpx PyPDF2")
        return None

    logger.info(f"Processing {len(pdf_urls)} PDFs to Parquet format...")
    all_meetings = []

    # The context manager closes the client even if the loop is interrupted.
    with httpx.Client(timeout=30) as client:
        for i, pdf_url in enumerate(pdf_urls, 1):
            try:
                logger.info(f" [{i}/{len(pdf_urls)}] Processing {pdf_url}...")

                # Download PDF into memory (don't save to disk)
                response = client.get(pdf_url)
                # Fail with a clear HTTP error rather than trying to parse an
                # error page as if it were a PDF.
                response.raise_for_status()
                pdf_bytes = response.content

                # Extract text from PDF. extract_text() may yield None for a
                # page without a text layer; treat that as an empty page
                # instead of losing the whole document.
                pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
                text = "".join(
                    f"{page.extract_text() or ''}\n" for page in pdf_reader.pages
                )

                # Store metadata + text (NOT PDF bytes)
                all_meetings.append({
                    'source_url': pdf_url,
                    'text': text,
                    'page_count': len(pdf_reader.pages),
                    'file_size_kb': len(pdf_bytes) // 1024,
                    'processed_at': datetime.now().isoformat(),
                    # Add your metadata extraction here:
                    # 'jurisdiction': extract_jurisdiction(text),
                    # 'date': extract_date(text),
                    # 'title': extract_title(text),
                })
            except Exception as e:
                # Best effort: one bad URL or corrupt PDF must not abort the batch.
                logger.warning(f" Failed to process {pdf_url}: {e}")
                continue

    # Convert to DataFrame
    df = pd.DataFrame(all_meetings)
    logger.info(f"Successfully processed {len(df)} PDFs")

    if df.empty:
        # Nothing to store: skip writing a column-less Parquet file.
        logger.warning(f"No PDFs could be processed; {output_file} was not written")
        return df

    # Save as Parquet (compressed)
    df.to_parquet(output_file, compression='snappy', index=False)
    file_size_mb = Path(output_file).stat().st_size / (1024 * 1024)
    logger.success(f"✅ Saved to {output_file} ({file_size_mb:.2f} MB)")
    logger.success(f" This is 1 file, not {len(df)} individual PDFs!")
    return df
def main():
    """Parse command-line arguments and run the requested upload actions.

    ``--process-pdfs`` is a standalone local step (no Hugging Face login) that
    returns as soon as it finishes. All other flags may be combined and are
    executed in the order: discovery, meetings, oral health, dataset card.
    """
    parser = argparse.ArgumentParser(
        description="Upload oral health policy data to Hugging Face"
    )
    parser.add_argument(
        "--repo",
        default=DEFAULT_REPO_NAME,
        help="Hugging Face repo name (e.g., 'username/oral-health-policy-data')"
    )
    parser.add_argument(
        "--discovery",
        action="store_true",
        help="Upload discovery results"
    )
    parser.add_argument(
        "--meetings",
        help="Upload meeting data from CSV/JSON/Parquet file"
    )
    parser.add_argument(
        "--oral-health",
        help="Upload oral health subset from CSV/JSON file"
    )
    parser.add_argument(
        "--create-card",
        action="store_true",
        help="Create dataset README card"
    )
    parser.add_argument(
        "--process-pdfs",
        help="Process PDF URLs from file and save as Parquet (CORRECT way to handle PDFs)"
    )
    args = parser.parse_args()

    # Special case: Process PDFs to Parquet first
    if args.process_pdfs:
        logger.info("Processing PDFs to Parquet format (not uploading individual files)...")

        # Load PDF URLs from file: one URL per line, blank lines ignored
        with open(args.process_pdfs, encoding="utf-8") as f:
            pdf_urls = [line.strip() for line in f if line.strip()]

        # Process to Parquet
        df = process_pdfs_to_parquet(pdf_urls, "meetings_processed.parquet")
        # Only suggest the upload step when at least one PDF was processed.
        if df is not None and not df.empty:
            logger.success("✅ PDFs processed! Now upload the Parquet file:")
            logger.success(f" python {__file__} --meetings meetings_processed.parquet")
        return

    # Check for work before constructing the uploader: its constructor
    # requires a token, logs in, and creates the remote repository, none of
    # which should happen just to print the help text.
    if not any([args.discovery, args.meetings, args.oral_health, args.create_card]):
        logger.warning(
            "No action specified. Use --discovery, --meetings, --oral-health, "
            "--create-card, or --process-pdfs"
        )
        parser.print_help()
        return

    # Initialize uploader
    uploader = HuggingFaceUploader(args.repo)

    # Upload based on flags
    if args.discovery:
        uploader.upload_discovery_results()
    if args.meetings:
        uploader.upload_meeting_data(args.meetings)
    if args.oral_health:
        uploader.upload_oral_health_subset(args.oral_health)
    if args.create_card:
        uploader.create_dataset_card()


if __name__ == "__main__":
    main()