Spaces:
Running on Zero
Running on Zero
| """Upload a curated Objectverse Diary SFT JSONL file to Hugging Face Datasets.""" | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| from pathlib import Path | |
| from typing import Any | |
| def validate_dataset_file(dataset_file: Path) -> dict[str, object]: | |
| if not dataset_file.exists() or not dataset_file.is_file(): | |
| raise FileNotFoundError(f"Dataset file does not exist: {dataset_file}") | |
| record_count = 0 | |
| sources: set[str] = set() | |
| modes: set[str] = set() | |
| object_names: set[str] = set() | |
| for line_number, line in enumerate( | |
| dataset_file.read_text(encoding="utf-8").splitlines(), | |
| start=1, | |
| ): | |
| if not line.strip(): | |
| continue | |
| try: | |
| record = json.loads(line) | |
| except json.JSONDecodeError as exc: | |
| raise ValueError(f"Invalid JSON on line {line_number}: {exc.msg}") from exc | |
| if not isinstance(record, dict): | |
| raise ValueError(f"Line {line_number} must be a JSON object.") | |
| messages = record.get("messages") | |
| if not isinstance(messages, list) or not messages: | |
| raise ValueError(f"Line {line_number} must include a non-empty messages list.") | |
| assistant_messages = [ | |
| message | |
| for message in messages | |
| if isinstance(message, dict) and message.get("role") == "assistant" | |
| ] | |
| if not assistant_messages: | |
| raise ValueError(f"Line {line_number} must include an assistant message.") | |
| assistant_content = assistant_messages[-1].get("content") | |
| if not isinstance(assistant_content, str): | |
| raise ValueError(f"Line {line_number} assistant content must be a string.") | |
| try: | |
| assistant_payload = json.loads(assistant_content) | |
| except json.JSONDecodeError as exc: | |
| raise ValueError( | |
| f"Line {line_number} assistant content is not valid JSON: {exc.msg}" | |
| ) from exc | |
| if not isinstance(assistant_payload, dict): | |
| raise ValueError(f"Line {line_number} assistant content must be a JSON object.") | |
| if "persona" not in assistant_payload or "diary" not in assistant_payload: | |
| raise ValueError( | |
| f"Line {line_number} assistant content must include persona and diary." | |
| ) | |
| record_count += 1 | |
| if isinstance(record.get("source"), str): | |
| sources.add(str(record["source"])) | |
| if isinstance(record.get("mode"), str): | |
| modes.add(str(record["mode"])) | |
| object_understanding = record.get("object_understanding") | |
| if isinstance(object_understanding, dict): | |
| raw_object = object_understanding.get("object") | |
| if isinstance(raw_object, dict) and isinstance(raw_object.get("name"), str): | |
| object_names.add(str(raw_object["name"])) | |
| if record_count == 0: | |
| raise ValueError(f"Dataset file has no records: {dataset_file}") | |
| return { | |
| "dataset_file": str(dataset_file), | |
| "record_count": record_count, | |
| "sources": sorted(sources), | |
| "modes": sorted(modes), | |
| "unique_object_count": len(object_names), | |
| } | |
| def upload_dataset( | |
| *, | |
| dataset_file: Path, | |
| repo_id: str, | |
| path_in_repo: str, | |
| private: bool, | |
| commit_message: str, | |
| dry_run: bool, | |
| ) -> dict[str, object]: | |
| summary = validate_dataset_file(dataset_file) | |
| summary.update( | |
| { | |
| "repo_id": repo_id, | |
| "path_in_repo": path_in_repo, | |
| "private": private, | |
| "commit_message": commit_message, | |
| "dry_run": dry_run, | |
| } | |
| ) | |
| if dry_run: | |
| summary["uploaded"] = False | |
| return summary | |
| from huggingface_hub import HfApi | |
| api = HfApi() | |
| api.create_repo(repo_id=repo_id, repo_type="dataset", private=private, exist_ok=True) | |
| api.upload_file( | |
| path_or_fileobj=str(dataset_file), | |
| path_in_repo=path_in_repo, | |
| repo_id=repo_id, | |
| repo_type="dataset", | |
| commit_message=commit_message, | |
| ) | |
| summary["uploaded"] = True | |
| summary["url"] = f"https://huggingface.co/datasets/{repo_id}" | |
| return summary | |
| def _print_json(payload: dict[str, Any]) -> None: | |
| print(json.dumps(payload, indent=2, sort_keys=True), flush=True) | |
| def _parse_args() -> argparse.Namespace: | |
| parser = argparse.ArgumentParser(description=__doc__) | |
| parser.add_argument("--dataset-file", type=Path, required=True) | |
| parser.add_argument("--repo-id", required=True) | |
| parser.add_argument("--path-in-repo", required=True) | |
| parser.add_argument("--private", action="store_true") | |
| parser.add_argument( | |
| "--commit-message", | |
| default="Upload Objectverse Diary curated SFT dataset", | |
| ) | |
| parser.add_argument("--dry-run", action="store_true") | |
| return parser.parse_args() | |
| def main() -> None: | |
| args = _parse_args() | |
| _print_json( | |
| upload_dataset( | |
| dataset_file=args.dataset_file, | |
| repo_id=args.repo_id, | |
| path_in_repo=args.path_in_repo, | |
| private=args.private, | |
| commit_message=args.commit_message, | |
| dry_run=args.dry_run, | |
| ) | |
| ) | |
| if __name__ == "__main__": | |
| try: | |
| main() | |
| except Exception as exc: | |
| raise SystemExit(str(exc)) from exc | |