Spaces:
Sleeping
Sleeping
| """ | |
Generate training datasets for ALL frameworks automatically.

This script auto-discovers all chunk files and processes them,
generating a separate dataset for each framework PLUS a combined dataset.

Usage:
    python scripts/generate_all_frameworks.py

Output Structure:
    data/processed/training_crewai/
        - positive_pairs.json
        - triplets.json
    data/processed/training_langgraph/
        - positive_pairs.json
        - triplets.json
    data/processed/training_combined/
        - positive_pairs.json (ALL frameworks merged)
        - triplets.json (ALL frameworks merged)

The combined dataset is also written as positive_pairs.jsonl and
triplets.jsonl; the per-framework datasets are requested in both JSON and
JSONL formats as well.
| """ | |
| import sys | |
| import json | |
| from pathlib import Path | |
| from typing import List, Tuple | |
| from dataclasses import asdict | |
| # Add project root to path | |
| PROJECT_ROOT = Path(__file__).parent.parent | |
| sys.path.insert(0, str(PROJECT_ROOT)) | |
| from src.task_3_data_engineering.export.pairs_triplets_generator import ( | |
| generate_pairs_and_triplets, | |
| PositivePair, | |
| Triplet | |
| ) | |
def discover_all_chunk_files() -> List[Tuple[Path, str]]:
    """
    Discover all chunk files in the workspace.

    Two locations under ``PROJECT_ROOT / "data" / "processed"`` are searched:

    * ``chunks/<dir>/chunks.jsonl`` for the known local directories
      (``Local_saved_files`` is labelled "crewai", ``sample_code`` is
      labelled "sample").
    * ``repos/<repo>/*_chunks.jsonl``; the framework name is the part of the
      file stem before the first underscore, after dropping ``_chunks``.

    Returns:
        List of (chunk_path, framework_name) tuples. Local files come first,
        followed by repository files in sorted path order. Files that do not
        exist are skipped, so the list may be empty.
    """
    processed_dir = PROJECT_ROOT / "data" / "processed"
    chunk_files: List[Tuple[Path, str]] = []

    # Local chunks: key on the directory name itself instead of searching the
    # full path string, so a project checked out under a directory whose name
    # happens to contain "Local_saved_files" or "sample_code" cannot cause a
    # file to be mislabelled.
    local_frameworks = (
        ("Local_saved_files", "crewai"),
        ("sample_code", "sample"),
    )
    for dir_name, framework in local_frameworks:
        path = processed_dir / "chunks" / dir_name / "chunks.jsonl"
        if path.exists():
            chunk_files.append((path, framework))

    # Repository chunks. Directory listings have no guaranteed order, so sort
    # them to keep the processing order (and the merged dataset) reproducible.
    repos_dir = processed_dir / "repos"
    if repos_dir.exists():
        for repo_dir in sorted(repos_dir.iterdir()):
            if not repo_dir.is_dir():
                continue
            for jsonl_file in sorted(repo_dir.glob("*_chunks.jsonl")):
                # e.g. "langgraph_core_chunks" -> "langgraph"
                stem_prefix = jsonl_file.stem.replace("_chunks", "").split("_")[0]
                # A file named exactly "_chunks.jsonl" has no prefix; fall
                # back to the repository directory name in that case.
                framework = stem_prefix or repo_dir.name
                chunk_files.append((jsonl_file, framework))

    return chunk_files
def _export_json_and_jsonl(records: List[dict], output_dir: Path,
                           stem: str, label: str) -> None:
    """Write *records* as ``<stem>.json`` and ``<stem>.jsonl`` in *output_dir*."""
    json_path = output_dir / f"{stem}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
    print(f"β Combined {label} (JSON): {json_path}")

    # JSONL: one compact JSON document per line.
    jsonl_path = output_dir / f"{stem}.jsonl"
    with open(jsonl_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in records)
    print(f"β Combined {label} (JSONL): {jsonl_path}")


def merge_datasets(all_pairs: List[List[PositivePair]],
                   all_triplets: List[List[Triplet]],
                   output_dir: Path) -> Tuple[int, int]:
    """
    Merge all framework datasets into combined files (JSON + JSONL).

    Args:
        all_pairs: One list of positive pairs per framework.
        all_triplets: One list of triplets per framework.
        output_dir: Directory that receives the four combined files
            (``positive_pairs.json/.jsonl`` and ``triplets.json/.jsonl``);
            it is created, including parents, if it does not exist.

    Returns:
        ``(number of combined pairs, number of combined triplets)``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Flatten the per-framework lists and convert each dataclass instance to
    # a plain dict once, so the JSON and JSONL exports share the same records.
    pair_records = [asdict(p) for pairs in all_pairs for p in pairs]
    triplet_records = [asdict(t) for triplets in all_triplets for t in triplets]

    _export_json_and_jsonl(pair_records, output_dir, "positive_pairs", "positive pairs")
    _export_json_and_jsonl(triplet_records, output_dir, "triplets", "triplets")

    return len(pair_records), len(triplet_records)
def main():
    """Generate datasets for all discovered frameworks + combined dataset.

    Steps:
        1. Discover chunk files; print a hint and return if none are found.
        2. Generate pairs/triplets for each chunk file into its own
           ``data/processed/training_<framework>`` directory. A failure for
           one framework is recorded and does not stop the others.
        3. Merge the successfully generated datasets into
           ``data/processed/training_combined``.
        4. Print a per-framework and overall summary.
    """
    banner = "=" * 80

    print(banner)
    print("π MULTI-FRAMEWORK TRAINING DATA GENERATOR")
    print(banner)

    # Discover all chunk files
    print("\nπ Discovering chunk files...")
    chunk_files = discover_all_chunk_files()
    if not chunk_files:
        print("β No chunk files found!")
        print("\nPlease ensure chunks exist in:")
        print(" - data/processed/chunks/Local_saved_files/")
        print(" - data/processed/chunks/sample_code/")
        print(" - data/processed/repos/*/")
        return

    print(f"β Found {len(chunk_files)} chunk file(s):\n")
    for path, framework in chunk_files:
        print(f" π¦ {framework}: {path.name}")

    # Process each framework
    print("\n" + banner)
    print("π PROCESSING INDIVIDUAL FRAMEWORKS")
    print(banner + "\n")

    results = []
    all_pairs = []
    all_triplets = []
    name_uses = {}  # framework name -> number of chunk files seen with it

    for i, (chunks_path, framework) in enumerate(chunk_files, 1):
        print(f"\n[{i}/{len(chunk_files)}] Processing {framework.upper()}...")
        print("-" * 60)

        # Several chunk files can resolve to the same framework name. Give
        # every occurrence after the first its own directory so that a later
        # run does not silently overwrite an earlier one's output.
        name_uses[framework] = name_uses.get(framework, 0) + 1
        occurrence = name_uses[framework]
        dir_label = framework if occurrence == 1 else f"{framework}_{occurrence}"
        output_dir = PROJECT_ROOT / "data" / "processed" / f"training_{dir_label}"

        try:
            pairs, triplets = generate_pairs_and_triplets(
                chunks_path=chunks_path,
                output_dir=output_dir,
                num_pairs=100,
                num_triplets=100,
                variance=5,
                export_format="both",  # JSON + JSONL
            )
            summary = {
                "framework": framework,
                "status": "β SUCCESS",
                "pairs": len(pairs),
                "variations": sum(len(p.variations) for p in pairs),
                "triplets": len(triplets),
                "output": output_dir,
            }
        except Exception as e:
            # Best effort: record the failure and continue with the next file.
            results.append({
                "framework": framework,
                "status": f"β FAILED: {str(e)}",
                "output": output_dir,
            })
        else:
            # Only collect for the combined dataset once the whole step has
            # succeeded, so a framework is never both "failed" and merged.
            all_pairs.append(pairs)
            all_triplets.append(triplets)
            results.append(summary)

    # Create combined dataset
    print("\n" + banner)
    print("π CREATING COMBINED DATASET (ALL FRAMEWORKS)")
    print(banner + "\n")
    combined_dir = PROJECT_ROOT / "data" / "processed" / "training_combined"
    total_pairs, total_triplets = merge_datasets(all_pairs, all_triplets, combined_dir)

    # Final summary
    print("\n" + banner)
    print("π FINAL SUMMARY")
    print(banner + "\n")
    print("INDIVIDUAL FRAMEWORK DATASETS:")
    print("-" * 40)
    for result in results:
        print(f"\nπ¦ {result['framework'].upper()}")
        print(f" Status: {result['status']}")
        if "pairs" in result:
            print(f" - positive_pairs.json: {result['pairs']} docs ({result['variations']} variations)")
            print(f" - triplets.json: {result['triplets']} docs")
        print(f" π {result['output']}")

    print("\n\nCOMBINED DATASET (ALL FRAMEWORKS):")
    print("-" * 40)
    print(f"π {combined_dir}")
    print(f" - positive_pairs.json: {total_pairs} docs")
    print(f" - triplets.json: {total_triplets} docs")

    # File count summary. Success is detected by the presence of the "pairs"
    # key rather than by searching the status text, which for failures embeds
    # an arbitrary exception message.
    successful = sum(1 for r in results if "pairs" in r)
    total_files = (successful * 4) + 4  # 4 per framework + 4 combined
    print(f"\n\nπ TOTAL FILES GENERATED: {total_files}")
    print(f" - {successful} frameworks Γ 4 files = {successful * 4} files")
    print(" - Combined dataset = 4 files")
    print(banner)
# Run the generator only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()