Spaces:
Running
Running
| """Upload the final clean dataset to Hugging Face.""" | |
| import json | |
| import os | |
| import sys | |
| from pathlib import Path | |
| from datasets import Dataset | |
| from dotenv import load_dotenv | |
| from huggingface_hub import HfApi | |
# Load variables from a .env file (if one is found) so the os.getenv()
# lookups in this script can see them.
load_dotenv()

# ROOT is the directory above the one that contains this script.
ROOT = Path(__file__).resolve().parent.parent
FINAL_FILE = ROOT / "data" / "retro-alpha-final.jsonl"

# Prefer explicit env vars, then default to build-small-hackathon namespace.
# `or` is used instead of a getenv() default so that a variable which is set
# but empty (e.g. a bare `HF_USER=` line in .env) also falls back; otherwise
# it would produce the invalid repo id "/retro-alpha-dataset".
HF_USER = os.getenv("HF_USER") or "build-small-hackathon"
REPO_ID = os.getenv("DATASET_REPO") or f"{HF_USER}/retro-alpha-dataset"
# Fields every record must carry; _build_conversation() indexes them directly.
_REQUIRED_KEYS = ("system", "user", "response", "task")


def _load_rows(path: Path) -> list:
    """Parse and validate a JSONL file, returning its records as dicts.

    Blank lines are skipped. Every other line must be a JSON object that
    contains all of ``_REQUIRED_KEYS``.

    Raises:
        ValueError: if a line is not valid JSON, is not a JSON object, or is
            missing a required field. The message is prefixed with
            ``path:lineno`` (1-based) so the offending line can be located.
    """
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"{path}:{lineno}: invalid JSON ({e})") from e
            if not isinstance(row, dict):
                raise ValueError(
                    f"{path}:{lineno}: expected a JSON object, "
                    f"got {type(row).__name__}"
                )
            missing = [key for key in _REQUIRED_KEYS if key not in row]
            if missing:
                raise ValueError(
                    f"{path}:{lineno}: missing required field(s): "
                    f"{', '.join(missing)}"
                )
            rows.append(row)
    return rows


def _build_conversation(row: dict) -> dict:
    """Convert one raw record into the chat-style record that gets uploaded.

    The ``system``, ``user`` and ``response`` fields become a three-message
    conversation; ``task`` is copied through and ``metadata`` defaults to an
    empty dict when absent.
    """
    return {
        "messages": [
            {"role": "system", "content": row["system"]},
            {"role": "user", "content": row["user"]},
            {"role": "assistant", "content": row["response"]},
        ],
        "task": row["task"],
        "metadata": row.get("metadata", {}),
    }


def main() -> None:
    """Load the final JSONL dataset and push it to the Hugging Face Hub.

    Reads the access token from the ``HF_TOKEN`` environment variable and the
    data from ``FINAL_FILE``. The dataset is pushed to ``REPO_ID`` with
    ``private=False``. If that repo cannot be created or found, the script
    falls back to ``<name>/retro-alpha-dataset`` where ``<name>`` is the
    account name returned by ``whoami()``.

    Exits with status 1 (after printing a message) when the token is missing,
    the data file does not exist, the file fails validation, or it contains
    no records. If repo creation fails and the fallback repo id equals
    ``REPO_ID``, the original exception is re-raised.
    """
    token = os.getenv("HF_TOKEN")
    if not token:
        print("HF_TOKEN not found in .env or environment")
        sys.exit(1)

    if not FINAL_FILE.exists():
        print(f"Final dataset not found: {FINAL_FILE}")
        sys.exit(1)

    print(f"Loading dataset from {FINAL_FILE}...")
    try:
        rows = _load_rows(FINAL_FILE)
    except ValueError as e:
        # Fail with the offending line number instead of a bare traceback.
        print(f"Invalid dataset file: {e}")
        sys.exit(1)
    print(f"Loaded {len(rows)} rows")

    if not rows:
        # Stop before touching the Hub: pushing would publish an empty dataset.
        print(f"No rows to upload in {FINAL_FILE}")
        sys.exit(1)

    dataset = Dataset.from_list([_build_conversation(r) for r in rows])

    api = HfApi(token=token)
    try:
        api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True)
    except Exception as e:
        print(f"Could not create/find dataset repo {REPO_ID}: {e}")
        # Fallback to user namespace if org namespace fails.
        me = api.whoami()["name"]
        fallback = f"{me}/retro-alpha-dataset"
        if fallback == REPO_ID:
            # The fallback is the repo that just failed; nothing else to try.
            raise
        print(f"Trying fallback repo: {fallback}")
        api.create_repo(repo_id=fallback, repo_type="dataset", exist_ok=True)
        dataset.push_to_hub(fallback, token=token, private=False)
        print(f"Pushed to fallback {fallback}")
        return

    print(f"Pushing to {REPO_ID}...")
    dataset.push_to_hub(REPO_ID, token=token, private=False)
    print("Done.")
# Run the upload only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()