| """ |
| Generate training datasets for ALL frameworks automatically. |
| |
| This script auto-discovers all chunk files and processes them, |
| generating separate datasets for each framework PLUS a combined dataset. |
| |
| Usage: |
| python scripts/generate_all_frameworks.py |
| |
| Output Structure: |
| data/processed/training_crewai/ |
| - positive_pairs.json / positive_pairs.jsonl |
| - triplets.json / triplets.jsonl |
| data/processed/training_langgraph/ |
| - positive_pairs.json / positive_pairs.jsonl |
| - triplets.json / triplets.jsonl |
| data/processed/training_combined/ |
| - positive_pairs.json / positive_pairs.jsonl (ALL frameworks merged) |
| - triplets.json / triplets.jsonl (ALL frameworks merged) |
| """ |
|
|
| import sys |
| import json |
| from pathlib import Path |
| from typing import List, Tuple |
| from dataclasses import asdict |
|
|
| |
# Resolve to an absolute path first so the project root does not depend on
# how the script was launched (relative path, symlink) or on later changes
# of the working directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Prepend the project root so the `src.*` imports below resolve against it.
sys.path.insert(0, str(PROJECT_ROOT))
|
|
| from src.task_3_data_engineering.export.pairs_triplets_generator import ( |
| generate_pairs_and_triplets, |
| PositivePair, |
| Triplet |
| ) |
|
|
|
|
def discover_all_chunk_files() -> List[Tuple[Path, str]]:
    """
    Discover all chunk files in the workspace.

    Two locations under ``PROJECT_ROOT / "data" / "processed"`` are searched:

    * ``chunks/Local_saved_files/chunks.jsonl`` (labelled ``"crewai"``) and
      ``chunks/sample_code/chunks.jsonl`` (labelled ``"sample"``);
    * ``repos/<repo>/*_chunks.jsonl``, labelled with the part of the file
      name that precedes the first underscore.

    Returns:
        List of (chunk_path, framework_name) tuples. The fixed local files
        come first, followed by repository files in sorted path order so
        that repeated runs process the files in the same order.
    """
    processed_dir = PROJECT_ROOT / "data" / "processed"
    chunk_files: List[Tuple[Path, str]] = []

    # Map each known local directory to its framework label explicitly.
    # Substring-matching the full path would mislabel files whenever the
    # project itself lives under a directory containing one of these names.
    local_sources = (
        ("Local_saved_files", "crewai"),
        ("sample_code", "sample"),
    )
    for dir_name, framework in local_sources:
        path = processed_dir / "chunks" / dir_name / "chunks.jsonl"
        if path.is_file():
            chunk_files.append((path, framework))

    repos_dir = processed_dir / "repos"
    if repos_dir.is_dir():
        # iterdir()/glob() yield entries in arbitrary order; sort them so the
        # processing order (and the combined dataset order) is reproducible.
        for repo_dir in sorted(repos_dir.iterdir()):
            if not repo_dir.is_dir():
                continue
            for jsonl_file in sorted(repo_dir.glob("*_chunks.jsonl")):
                framework = jsonl_file.stem.replace("_chunks", "").split("_")[0]
                # A file named exactly "_chunks.jsonl" yields an empty label;
                # fall back to the repository directory name in that case.
                chunk_files.append((jsonl_file, framework or repo_dir.name))

    return chunk_files
|
|
|
|
def merge_datasets(all_pairs: "List[List[PositivePair]]",
                   all_triplets: "List[List[Triplet]]",
                   output_dir: Path) -> Tuple[int, int]:
    """Merge all framework datasets into combined files (JSON + JSONL).

    Args:
        all_pairs: One list of positive pairs per framework.
        all_triplets: One list of triplets per framework.
        output_dir: Directory that receives the combined files; created
            (including parents) if it does not exist.

    Returns:
        ``(number_of_pairs, number_of_triplets)`` written to the combined
        dataset.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Flatten the per-framework lists and convert each dataclass to a plain
    # dict once, so the JSON and JSONL files are built from the same records.
    pair_records = [asdict(pair) for group in all_pairs for pair in group]
    triplet_records = [asdict(trip) for group in all_triplets for trip in group]

    _write_combined(pair_records, output_dir, "positive_pairs", "positive pairs")
    _write_combined(triplet_records, output_dir, "triplets", "triplets")

    return len(pair_records), len(triplet_records)


def _write_combined(records: List[dict], output_dir: Path,
                    stem: str, label: str) -> None:
    """Write *records* to ``<stem>.json`` and ``<stem>.jsonl`` in *output_dir*."""
    json_path = output_dir / f"{stem}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
    print(f"✅ Combined {label} (JSON): {json_path}")

    jsonl_path = output_dir / f"{stem}.jsonl"
    with open(jsonl_path, "w", encoding="utf-8") as f:
        # One JSON document per line.
        f.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )
    print(f"✅ Combined {label} (JSONL): {jsonl_path}")
|
|
|
|
def main() -> None:
    """Generate datasets for all discovered frameworks + combined dataset.

    Steps:
        1. Discover chunk files via ``discover_all_chunk_files``.
        2. Run ``generate_pairs_and_triplets`` once per chunk file, writing
           to ``data/processed/training_<framework>``.
        3. Merge every successful result into
           ``data/processed/training_combined`` via ``merge_datasets``.
        4. Print a per-framework and combined summary.

    A failure in one framework is recorded in the summary and does not stop
    the remaining frameworks from being processed.
    """
    banner = "=" * 80
    processed_dir = PROJECT_ROOT / "data" / "processed"

    print(banner)
    print("MULTI-FRAMEWORK TRAINING DATA GENERATOR")
    print(banner)

    print("\nDiscovering chunk files...")
    chunk_files = discover_all_chunk_files()

    if not chunk_files:
        print("❌ No chunk files found!")
        print("\nPlease ensure chunks exist in:")
        print(" - data/processed/chunks/Local_saved_files/")
        print(" - data/processed/chunks/sample_code/")
        print(" - data/processed/repos/*/")
        return

    print(f"✅ Found {len(chunk_files)} chunk file(s):\n")
    for path, framework in chunk_files:
        print(f" 📦 {framework}: {path.name}")

    print("\n" + banner)
    print("PROCESSING INDIVIDUAL FRAMEWORKS")
    print(banner + "\n")

    results = []
    all_pairs = []
    all_triplets = []
    name_counts = {}

    for i, (chunks_path, framework) in enumerate(chunk_files, 1):
        # Several chunk files can resolve to the same framework name; give
        # repeats a numeric suffix so a later run does not overwrite the
        # output directory of an earlier one.
        name_counts[framework] = name_counts.get(framework, 0) + 1
        occurrence = name_counts[framework]
        label = framework if occurrence == 1 else f"{framework}_{occurrence}"

        print(f"\n[{i}/{len(chunk_files)}] Processing {label.upper()}...")
        print("-" * 60)

        output_dir = processed_dir / f"training_{label}"

        try:
            pairs, triplets = generate_pairs_and_triplets(
                chunks_path=chunks_path,
                output_dir=output_dir,
                num_pairs=100,
                num_triplets=100,
                variance=5,
                export_format="both",
            )
            variations = sum(len(p.variations) for p in pairs)
        except Exception as e:
            # Top-level boundary: record the failure and keep going so one
            # broken framework does not abort the whole batch.
            results.append({
                "framework": label,
                "status": f"❌ FAILED: {e}",
                "output": output_dir,
            })
        else:
            # Only fully successful runs contribute to the combined dataset.
            all_pairs.append(pairs)
            all_triplets.append(triplets)
            results.append({
                "framework": label,
                "status": "✅ SUCCESS",
                "pairs": len(pairs),
                "variations": variations,
                "triplets": len(triplets),
                "output": output_dir,
            })

    print("\n" + banner)
    print("CREATING COMBINED DATASET (ALL FRAMEWORKS)")
    print(banner + "\n")

    combined_dir = processed_dir / "training_combined"
    total_pairs, total_triplets = merge_datasets(all_pairs, all_triplets, combined_dir)

    print("\n" + banner)
    print("FINAL SUMMARY")
    print(banner + "\n")

    print("INDIVIDUAL FRAMEWORK DATASETS:")
    print("-" * 40)
    for result in results:
        print(f"\n📦 {result['framework'].upper()}")
        print(f" Status: {result['status']}")
        # Count keys are only present for successful runs.
        if "pairs" in result:
            print(f" - positive_pairs.json: {result['pairs']} docs ({result['variations']} variations)")
            print(f" - triplets.json: {result['triplets']} docs")
        print(f" {result['output']}")

    print("\n\nCOMBINED DATASET (ALL FRAMEWORKS):")
    print("-" * 40)
    print(f"{combined_dir}")
    print(f" - positive_pairs.json: {total_pairs} docs")
    print(f" - triplets.json: {total_triplets} docs")

    # Each successful framework and the combined dataset produce four files:
    # JSON + JSONL for pairs, JSON + JSONL for triplets.
    successful = sum(1 for r in results if "pairs" in r)
    total_files = (successful * 4) + 4

    print(f"\n\nTOTAL FILES GENERATED: {total_files}")
    print(f" - {successful} frameworks × 4 files = {successful * 4} files")
    print(" - Combined dataset = 4 files")
    print(banner)
|
|
|
|
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
|