File size: 2,272 Bytes
122cc3c
 
94d68b4
122cc3c
 
 
 
 
 
 
 
 
 
 
 
94d68b4
 
 
 
122cc3c
 
 
 
 
94d68b4
122cc3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94d68b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122cc3c
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""Upload the final clean dataset to Hugging Face."""

import json
import os
import sys
from pathlib import Path

from datasets import Dataset
from dotenv import load_dotenv
from huggingface_hub import HfApi

# Load variables from a local .env file (if present) into the process
# environment before any of the settings below are read.
load_dotenv()

# Directory two levels above this file; the data directory is resolved
# relative to it.
ROOT = Path(__file__).resolve().parent.parent
# JSONL file holding the final, cleaned dataset to upload.
FINAL_FILE = ROOT / "data" / "retro-alpha-final.jsonl"

# Prefer explicit env vars, then default to build-small-hackathon namespace.
# `or` is used instead of a getenv default so that a variable that is set but
# empty (e.g. a bare `HF_USER=` line in .env) also falls back to the default
# rather than producing an invalid repo id such as "/retro-alpha-dataset".
HF_USER = os.getenv("HF_USER") or "build-small-hackathon"
REPO_ID = os.getenv("DATASET_REPO") or f"{HF_USER}/retro-alpha-dataset"


# Fields every JSONL row must provide to build a chat-style record.
_REQUIRED_FIELDS = ("system", "user", "response", "task")


def _load_rows(path):
    """Read the JSONL file at *path* and return its rows as a list of dicts.

    Blank (whitespace-only) lines are skipped.

    Raises:
        ValueError: If a line is not valid JSON, is not a JSON object, or
            lacks one of ``_REQUIRED_FIELDS``. The message includes the
            file path and 1-based line number of the offending line.
    """
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError as err:
                raise ValueError(f"{path}:{lineno}: invalid JSON: {err}") from err
            if not isinstance(row, dict):
                kind = type(row).__name__
                raise ValueError(
                    f"{path}:{lineno}: expected a JSON object, got {kind}"
                )
            missing = ", ".join(k for k in _REQUIRED_FIELDS if k not in row)
            if missing:
                raise ValueError(
                    f"{path}:{lineno}: missing required field(s): {missing}"
                )
            rows.append(row)
    return rows


def _build_conversation(row):
    """Convert one raw row into a record with chat messages, task and metadata."""
    return {
        "messages": [
            {"role": "system", "content": row["system"]},
            {"role": "user", "content": row["user"]},
            {"role": "assistant", "content": row["response"]},
        ],
        "task": row["task"],
        "metadata": row.get("metadata", {}),
    }


def main():
    """Upload the final dataset JSONL to the Hugging Face Hub.

    Reads ``HF_TOKEN`` from the environment, loads and validates
    ``FINAL_FILE``, converts each row into a chat-style record and pushes the
    result to ``REPO_ID`` with ``private=False``. If creating ``REPO_ID``
    fails, the same repository name is tried under the namespace of the
    account that owns the token.

    Prints a message and exits with status 1 when the token is missing, the
    file does not exist, the file is malformed, or it contains no rows.

    Raises:
        Exception: Re-raises the ``create_repo`` failure when the fallback
            repository would be identical to ``REPO_ID``; errors from the
            fallback ``create_repo`` or from ``push_to_hub`` propagate.
    """
    token = os.getenv("HF_TOKEN")
    if not token:
        print("HF_TOKEN not found in .env or environment")
        sys.exit(1)

    if not FINAL_FILE.exists():
        print(f"Final dataset not found: {FINAL_FILE}")
        sys.exit(1)

    print(f"Loading dataset from {FINAL_FILE}...")
    try:
        rows = _load_rows(FINAL_FILE)
    except ValueError as err:
        # Report the offending line instead of dumping a raw traceback.
        print(f"Invalid dataset file: {err}")
        sys.exit(1)

    print(f"Loaded {len(rows)} rows")
    if not rows:
        # Refuse to publish an empty dataset over whatever is on the Hub.
        print(f"No rows found in {FINAL_FILE}; nothing to upload")
        sys.exit(1)

    dataset = Dataset.from_list([_build_conversation(r) for r in rows])

    api = HfApi(token=token)
    try:
        api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True)
    except Exception as e:
        print(f"Could not create/find dataset repo {REPO_ID}: {e}")
        # Fallback to user namespace if org namespace fails. Keep the repo
        # name from REPO_ID so a custom DATASET_REPO is honoured, and so a
        # failure on the user's own repo is re-raised rather than silently
        # redirected to a differently named repo.
        me = api.whoami()["name"]
        repo_name = REPO_ID.rsplit("/", 1)[-1]
        fallback = f"{me}/{repo_name}"
        if fallback == REPO_ID:
            raise
        print(f"Trying fallback repo: {fallback}")
        api.create_repo(repo_id=fallback, repo_type="dataset", exist_ok=True)
        dataset.push_to_hub(fallback, token=token, private=False)
        print(f"Pushed to fallback {fallback}")
        return

    print(f"Pushing to {REPO_ID}...")
    dataset.push_to_hub(REPO_ID, token=token, private=False)
    print("Done.")


# Run the upload only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()