Spaces:
Sleeping
Sleeping
File size: 1,435 Bytes
549c270 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# src/agents/data_agent.py
import subprocess
import sys
import os
from pathlib import Path
from typing import Literal
import urllib.request
class DataAgent:
"""
Runs data prep scripts for a dataset:
- Downloads raw files if not present
- join_meta.py
- build_text_emb.py
- build_image_emb.py
- build_meta_emb.py
"""
def _run(self, argv):
print("→", " ".join(argv))
subprocess.check_call(argv)
def _download_raw_data(self, dataset: str):
if dataset != "beauty":
raise ValueError(f"Auto-download is only supported for 'beauty' dataset")
base_dir = Path("data/raw/beauty")
base_dir.mkdir(parents=True, exist_ok=True)
files = {
"reviews.json": "https://huggingface.co/datasets/mickey1976/mayankc-amazon_beauty_subset/resolve/main/reviews.json",
"meta.json": "https://huggingface.co/datasets/mickey1976/mayankc-amazon_beauty_subset/resolve/main/meta.json",
}
for fname, url in files.items():
out_path = base_dir / fname
if not out_path.exists():
print(f"⬇️ Downloading {fname}...")
urllib.request.urlretrieve(url, out_path)
print(f"✅ Saved to {out_path}")
else:
print(f"✔️ Already exists: {out_path}")
def prepare(self, dataset: Literal["beauty"] = "beauty"):
print(f" |