File size: 1,435 Bytes
549c270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# src/agents/data_agent.py

import subprocess
import sys
import os
from pathlib import Path
from typing import Literal
import urllib.request

class DataAgent:
    """
    Runs data prep scripts for a dataset:
    - Downloads raw files if not present
    - join_meta.py
    - build_text_emb.py
    - build_image_emb.py
    - build_meta_emb.py
    """

    def _run(self, argv):
        print("→", " ".join(argv))
        subprocess.check_call(argv)

    def _download_raw_data(self, dataset: str):
        if dataset != "beauty":
            raise ValueError(f"Auto-download is only supported for 'beauty' dataset")

        base_dir = Path("data/raw/beauty")
        base_dir.mkdir(parents=True, exist_ok=True)

        files = {
            "reviews.json": "https://huggingface.co/datasets/mickey1976/mayankc-amazon_beauty_subset/resolve/main/reviews.json",
            "meta.json": "https://huggingface.co/datasets/mickey1976/mayankc-amazon_beauty_subset/resolve/main/meta.json",
        }

        for fname, url in files.items():
            out_path = base_dir / fname
            if not out_path.exists():
                print(f"⬇️ Downloading {fname}...")
                urllib.request.urlretrieve(url, out_path)
                print(f"✅ Saved to {out_path}")
            else:
                print(f"✔️ Already exists: {out_path}")

    def prepare(self, dataset: Literal["beauty"] = "beauty"):
        print(f"