Spaces:
Sleeping
Sleeping
| # src/agents/data_agent.py | |
| import subprocess | |
| import sys | |
| import os | |
| from pathlib import Path | |
| from typing import Literal | |
| import urllib.request | |
| class DataAgent: | |
| """ | |
| Runs data prep scripts for a dataset: | |
| - Downloads raw files if not present | |
| - join_meta.py | |
| - build_text_emb.py | |
| - build_image_emb.py | |
| - build_meta_emb.py | |
| """ | |
| def _run(self, argv): | |
| print("β", " ".join(argv)) | |
| subprocess.check_call(argv) | |
| def _download_raw_data(self, dataset: str): | |
| if dataset != "beauty": | |
| raise ValueError(f"Auto-download is only supported for 'beauty' dataset") | |
| base_dir = Path("data/raw/beauty") | |
| base_dir.mkdir(parents=True, exist_ok=True) | |
| files = { | |
| "reviews.json": "https://huggingface.co/datasets/mickey1976/mayankc-amazon_beauty_subset/resolve/main/reviews.json", | |
| "meta.json": "https://huggingface.co/datasets/mickey1976/mayankc-amazon_beauty_subset/resolve/main/meta.json", | |
| } | |
| for fname, url in files.items(): | |
| out_path = base_dir / fname | |
| if not out_path.exists(): | |
| print(f"β¬οΈ Downloading {fname}...") | |
| urllib.request.urlretrieve(url, out_path) | |
| print(f"β Saved to {out_path}") | |
| else: | |
| print(f"βοΈ Already exists: {out_path}") | |
| def prepare(self, dataset: Literal["beauty"] = "beauty"): | |
| print(f" |