| """ |
| Script to generate positive pairs and triplets from code chunks. |
| |
| This script loads code chunks and generates: |
| 1. Positive Pairs: (question, code) with 4-5 variations per sample |
| 2. Triplets: (anchor_question, positive_code, negative_code) |
| |
| Usage: |
| python -m scripts.run_pairs_triplets_pipeline --chunks <path> --output <dir> |
| python -m scripts.run_pairs_triplets_pipeline --help |
| |
| Examples: |
| # Generate from local chunks with default settings |
| python -m scripts.run_pairs_triplets_pipeline \\ |
| --chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\ |
| --output data/processed/training |
| |
| # Generate from repository chunks |
| python -m scripts.run_pairs_triplets_pipeline \\ |
| --chunks data/processed/repos/langgraph_20260116_123638/langgraph_chunks.jsonl \\ |
| --output data/processed/training/langgraph |
| |
| # Custom settings |
| python -m scripts.run_pairs_triplets_pipeline \\ |
| --chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\ |
| --output data/processed/training \\ |
| --pairs 100 --triplets 100 --variance 5 |
| """ |
|
|
| import sys |
| from pathlib import Path |
|
|
| |
| PROJECT_ROOT = Path(__file__).parent.parent |
| sys.path.insert(0, str(PROJECT_ROOT)) |
|
|
| from src.task_3_data_engineering.export.pairs_triplets_generator import ( |
| generate_pairs_and_triplets, |
| main as cli_main |
| ) |
|
|
|
|
def _candidate_chunk_files(project_root: Path) -> "list[Path]":
    """Return the chunk files to try, in priority order.

    The two conventional locations come first (whether or not they exist),
    followed by every ``chunks.jsonl`` found one level below
    ``data/processed/chunks`` and every ``*_chunks.jsonl`` found one level
    below ``data/processed/repos``.
    """
    processed_dir = project_root / "data" / "processed"
    chunks_dir = processed_dir / "chunks"
    repos_dir = processed_dir / "repos"

    candidates = [
        chunks_dir / "Local_saved_files" / "chunks.jsonl",
        chunks_dir / "sample_code" / "chunks.jsonl",
    ]

    # Directory listings come back in arbitrary order; sorting keeps the
    # "first match wins" selection reproducible from run to run.
    # is_dir() (rather than exists()) avoids iterating a path that happens
    # to be a regular file.
    if chunks_dir.is_dir():
        for subdir in sorted(chunks_dir.iterdir()):
            if not subdir.is_dir():
                continue
            chunks_file = subdir / "chunks.jsonl"
            if chunks_file.exists() and chunks_file not in candidates:
                candidates.append(chunks_file)

    if repos_dir.is_dir():
        for repo_dir in sorted(repos_dir.iterdir()):
            if repo_dir.is_dir():
                candidates.extend(sorted(repo_dir.glob("*_chunks.jsonl")))

    return candidates


def run_default_pipeline() -> None:
    """Run the generator with default settings on the first chunks file found.

    Candidate files are looked up under ``PROJECT_ROOT / "data" / "processed"``.
    When none exists, the checked locations are printed and the function
    returns without generating anything. Otherwise pairs and triplets are
    written to ``data/processed/training`` in both export formats.

    Status messages use plain-ASCII tags so the output does not depend on the
    encoding of the source file or of the console.
    """
    candidates = _candidate_chunk_files(PROJECT_ROOT)
    chunks_path = next((path for path in candidates if path.exists()), None)

    if chunks_path is None:
        print("[ERROR] No chunks files found. Please specify a chunks file with --chunks")
        print("\nPossible locations checked:")
        for candidate in candidates[:5]:
            print(f" - {candidate}")
        return

    output_dir = PROJECT_ROOT / "data" / "processed" / "training"

    # Single source of truth for the defaults, so the banner below can never
    # drift from the values actually passed to the generator.
    num_pairs = 100
    num_triplets = 100
    variance = 5

    print("=" * 60)
    print("Positive Pairs & Triplets Generator")
    print("=" * 60)
    print(f"\nChunks Path: {chunks_path}")
    print(f"Output Dir: {output_dir}")
    print(f"Settings: pairs={num_pairs}, triplets={num_triplets}, variance={variance}")
    print("\n" + "-" * 60)

    # The returned pairs/triplets are not needed here; the generator is
    # called for the files it exports into output_dir.
    generate_pairs_and_triplets(
        chunks_path=chunks_path,
        output_dir=output_dir,
        num_pairs=num_pairs,
        num_triplets=num_triplets,
        variance=variance,
        export_format="both",
    )

    print("\n" + "=" * 60)
    print("[OK] Pipeline Complete!")
    print("=" * 60)
    print(f"\nOutput files saved to: {output_dir}")
    print(" - positive_pairs.jsonl")
    print(" - positive_pairs.json")
    print(" - triplets.jsonl")
    print(" - triplets.json")
|
|
|
|
if __name__ == "__main__":
    # Dispatch on whether any command-line argument was supplied:
    #   - with arguments, hand control to the generator module's own entry
    #     point (imported as cli_main), which presumably parses sys.argv
    #     itself (the module docstring lists --chunks / --output / --help);
    #   - with none, run the zero-configuration default pipeline.
    # No argument parsing happens in this file, so argparse is not imported.
    if len(sys.argv) > 1:
        cli_main()
    else:
        run_default_pipeline()
|
|