Spaces:
Sleeping
Sleeping
| ''' | |
| Aggregate the synthetic datasets generated by triplets_synthesis.py across multiple runs into a single combined dataset. | |
| ''' | |
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict
# Root directory that is scanned for per-run sub-directories.
BASE_SYNTHETIC_DIR = Path("data", "synthetic")
# Destination of the merged dataset; it sits inside the root directory, so
# the aggregation step has to skip it when scanning for runs.
OUTPUT_DIR = BASE_SYNTHETIC_DIR.joinpath("combined")
def load_jsonl(path: Path) -> List[Dict]:
    """Read a JSON Lines file and return its records in file order.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped rather than passed to ``json.loads``, which
    would raise ``json.JSONDecodeError`` on them.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        One parsed JSON value per non-blank line.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with path.open("r", encoding="utf-8") as f:
        # line.strip() is falsy for "", "\n" and whitespace-only lines.
        return [json.loads(line) for line in f if line.strip()]
def save_jsonl(path: Path, records: List[Dict]):
    """Write ``records`` to ``path`` in JSON Lines format.

    Each record is serialized on its own line with non-ASCII characters kept
    as-is (``ensure_ascii=False``). Missing parent directories are created,
    and an existing file at ``path`` is overwritten.

    Args:
        path: Destination file.
        records: JSON-serializable objects, written in the given order.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )
def save_json(path: Path, records: List[Dict]):
    """Write ``records`` to ``path`` as a single indented JSON array.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``), the
    same as ``save_jsonl`` does, so the file stays human-readable instead of
    being filled with ``\\uXXXX`` escapes. The file is opened as UTF-8, so
    this is safe. Missing parent directories are created, and an existing
    file at ``path`` is overwritten.

    Args:
        path: Destination file.
        records: JSON-serializable objects to dump as one array.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
def aggregate():
    """Merge every run under ``BASE_SYNTHETIC_DIR`` into one combined dataset.

    A run is any sub-directory (other than the output directory itself) that
    contains both ``positive_pairs.jsonl`` and ``triplets.jsonl``; directories
    missing either file are ignored. Runs are processed in sorted name order
    so that the combined files and the ``included_runs`` list are identical
    from one invocation to the next.

    Writes into ``OUTPUT_DIR``:
        * ``positive_pairs.jsonl`` / ``triplets.jsonl`` - training format.
        * ``positive_pairs.json`` / ``triplets.json`` - inspection / upload.
        * ``metadata.json`` - included runs, record counts and a
          timezone-aware UTC creation timestamp.

    Raises:
        FileNotFoundError: If ``BASE_SYNTHETIC_DIR`` does not exist.
    """
    positive_pairs_all: List[Dict] = []
    triplets_all: List[Dict] = []
    included_runs: List[str] = []

    # iterdir() yields entries in arbitrary order; sorting makes the
    # combined dataset deterministic.
    for run_dir in sorted(BASE_SYNTHETIC_DIR.iterdir()):
        # Skip stray files and the output directory from a previous run.
        if not run_dir.is_dir() or run_dir.name == OUTPUT_DIR.name:
            continue

        pos_path = run_dir / "positive_pairs.jsonl"
        tri_path = run_dir / "triplets.jsonl"
        # Only take runs that have both files.
        if not (pos_path.exists() and tri_path.exists()):
            continue

        positive_pairs_all.extend(load_jsonl(pos_path))
        triplets_all.extend(load_jsonl(tri_path))
        included_runs.append(run_dir.name)

    # Save JSONL (training)
    save_jsonl(OUTPUT_DIR / "positive_pairs.jsonl", positive_pairs_all)
    save_jsonl(OUTPUT_DIR / "triplets.jsonl", triplets_all)

    # Save JSON (inspection / upload)
    save_json(OUTPUT_DIR / "positive_pairs.json", positive_pairs_all)
    save_json(OUTPUT_DIR / "triplets.json", triplets_all)

    # Metadata
    metadata = {
        "type": "combined_dataset",
        "included_runs": included_runs,
        "total_positive_pairs": len(positive_pairs_all),
        "total_triplets": len(triplets_all),
        # datetime.utcnow() is deprecated and returns a naive value; an aware
        # UTC timestamp is unambiguous (it serializes with a "+00:00" offset).
        "created_at": datetime.now(timezone.utc).isoformat(),
    }
    # OUTPUT_DIR already exists here: the save_* calls above created it.
    with (OUTPUT_DIR / "metadata.json").open("w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    print("✅ Combined dataset created at:", OUTPUT_DIR)
# Run the aggregation only when executed as a script, not when imported.
if __name__ == "__main__":
    aggregate()