#!/usr/bin/env python3
"""
Upload data/facilities.csv to a Hugging Face Dataset repo.
Uses the Datasets library to create a dataset from the local CSV and push to the Hub.
See: https://huggingface.co/docs/datasets/en/create_dataset
Usage:
pip install datasets
# Auth: put HF_TOKEN=your_token in .env (same as the app), or run: python -m huggingface_hub.cli.login
python scripts/upload_facilities_to_hf.py [REPO_ID]
Example:
python scripts/upload_facilities_to_hf.py phanny/samhsa-facilities
If REPO_ID is omitted, you will be prompted. The dataset repo must already exist
on the Hub (create it at https://huggingface.co/datasets → New dataset).
"""
import os
import sys
from pathlib import Path
# Paths are resolved relative to the repository root (this script lives in scripts/,
# one level below the root).
REPO_ROOT = Path(__file__).resolve().parent.parent
DATA_DIR = REPO_ROOT / "data"
# Input CSV; produced by scripts/download_findtreatment_data.py (see error in main()).
CSV_PATH = DATA_DIR / "facilities.csv"
# Load .env so HF_TOKEN is available (same as app)
def _load_dotenv():
env_file = REPO_ROOT / ".env"
if env_file.exists():
for line in env_file.read_text().splitlines():
line = line.strip()
if line and not line.startswith("#") and "=" in line:
k, _, v = line.partition("=")
os.environ.setdefault(k.strip(), v.strip().strip('"').strip("'"))
def main():
    """Validate inputs, then upload data/facilities.csv to a Hugging Face dataset repo."""
    _load_dotenv()

    # The local CSV must exist before anything else is worth doing.
    if not CSV_PATH.exists():
        print(f"Error: {CSV_PATH} not found. Run scripts/download_findtreatment_data.py first.", file=sys.stderr)
        sys.exit(1)

    # Import lazily so a missing dependency yields an actionable install hint.
    try:
        from datasets import load_dataset
    except ImportError as e:
        print(f"Error: {e}. Install with: pip install datasets huggingface_hub", file=sys.stderr)
        sys.exit(1)

    # Repo id: first CLI argument, else an interactive prompt.
    cli_args = sys.argv[1:]
    repo_id = cli_args[0].strip() if cli_args else ""
    if not repo_id:
        repo_id = input("Hugging Face dataset repo id (e.g. phanny/samhsa-facilities): ").strip()
    if not repo_id or "/" not in repo_id:
        print("Error: repo id must be like username/dataset-name", file=sys.stderr)
        sys.exit(1)

    # push_to_hub reads the token from the environment; fail early when absent.
    if not os.environ.get("HF_TOKEN"):
        print("Error: Hugging Face token not set. Add HF_TOKEN to .env or run: python -m huggingface_hub.cli.login", file=sys.stderr)
        sys.exit(1)

    print(f"Loading {CSV_PATH}...")
    # Build a Dataset from the local CSV (per https://huggingface.co/docs/datasets/en/create_dataset)
    dataset = load_dataset("csv", data_files=str(CSV_PATH), split="train")

    print(f"Uploading to {repo_id}...")
    dataset.push_to_hub(repo_id, private=False)

    print(f"Done. Use in your Space: set variable FACILITIES_DATASET={repo_id}")
    print(f"Then load in code: load_dataset({repo_id!r}, split='train')")
if __name__ == "__main__":  # script entry point
    main()