Spaces:
Sleeping
Sleeping
| """ | |
| Script to generate positive pairs and triplets from code chunks. | |
| This script loads code chunks and generates: | |
| 1. Positive Pairs: (question, code) with 4-5 variations per sample | |
| 2. Triplets: (anchor_question, positive_code, negative_code) | |
| Usage: | |
| python -m scripts.run_pairs_triplets_pipeline --chunks <path> --output <dir> | |
| python -m scripts.run_pairs_triplets_pipeline --help | |
| Examples: | |
| # Generate from local chunks with default settings | |
| python -m scripts.run_pairs_triplets_pipeline \\ | |
| --chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\ | |
| --output data/processed/training | |
| # Generate from repository chunks | |
| python -m scripts.run_pairs_triplets_pipeline \\ | |
| --chunks data/processed/repos/langgraph_20260116_123638/langgraph_chunks.jsonl \\ | |
| --output data/processed/training/langgraph | |
| # Custom settings | |
| python -m scripts.run_pairs_triplets_pipeline \\ | |
| --chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\ | |
| --output data/processed/training \\ | |
| --pairs 100 --triplets 100 --variance 5 | |
| """ | |
| import sys | |
| from pathlib import Path | |
| # Add project root to path | |
| PROJECT_ROOT = Path(__file__).parent.parent | |
| sys.path.insert(0, str(PROJECT_ROOT)) | |
| from src.task_3_data_engineering.export.pairs_triplets_generator import ( | |
| generate_pairs_and_triplets, | |
| main as cli_main | |
| ) | |
def run_default_pipeline() -> None:
    """Generate pairs and triplets from the first chunks file found.

    Candidate chunk files are considered in this order:

    1. The two well-known locations under ``data/processed/chunks``
       (``Local_saved_files/chunks.jsonl`` and ``sample_code/chunks.jsonl``).
    2. Any ``chunks.jsonl`` directly inside a subdirectory of
       ``data/processed/chunks``.
    3. Any ``*_chunks.jsonl`` directly inside a subdirectory of
       ``data/processed/repos``.

    Discovered entries are visited in sorted order so that the same file is
    selected on every run and on every platform. The first candidate that
    exists is passed to ``generate_pairs_and_triplets`` together with
    ``data/processed/training`` as the output directory.

    If no candidate exists, a hint listing the checked locations is printed
    and the function returns without generating anything.
    """
    # Single source of truth for the defaults: the banner and the generator
    # call below both read these, so they cannot drift apart.
    num_pairs = 100
    num_triplets = 100
    variance = 5

    processed_dir = PROJECT_ROOT / "data" / "processed"
    chunks_dir = processed_dir / "chunks"
    repos_dir = processed_dir / "repos"

    # Well-known locations are tried first, in this order.
    possible_paths = [
        chunks_dir / "Local_saved_files" / "chunks.jsonl",
        chunks_dir / "sample_code" / "chunks.jsonl",
    ]

    # Find all chunks.jsonl files in chunks folder subdirectories.
    # is_dir() (rather than exists()) keeps iterdir() from raising when the
    # path is unexpectedly a regular file; sorted() makes the order stable
    # because iterdir() yields entries in arbitrary order.
    if chunks_dir.is_dir():
        for subdir in sorted(chunks_dir.iterdir()):
            if not subdir.is_dir():
                continue
            chunks_file = subdir / "chunks.jsonl"
            if chunks_file.exists() and chunks_file not in possible_paths:
                possible_paths.append(chunks_file)

    # Find repository chunks (files named "<something>_chunks.jsonl").
    if repos_dir.is_dir():
        for repo_dir in sorted(repos_dir.iterdir()):
            if repo_dir.is_dir():
                possible_paths.extend(sorted(repo_dir.glob("*_chunks.jsonl")))

    # First existing candidate wins.
    chunks_path = next((p for p in possible_paths if p.exists()), None)

    # NOTE(review): the "β" / "π" glyphs in the messages below look like
    # mis-decoded emoji; they are left unchanged here — confirm against the
    # original file before replacing them.
    if chunks_path is None:
        print("β No chunks files found. Please specify a chunks file with --chunks")
        print("\nPossible locations checked:")
        for p in possible_paths[:5]:
            print(f" - {p}")
        return

    output_dir = processed_dir / "training"

    print("=" * 60)
    print("π Positive Pairs & Triplets Generator")
    print("=" * 60)
    print(f"\nπ Chunks Path: {chunks_path}")
    print(f"π Output Dir: {output_dir}")
    print(
        f"π Settings: pairs={num_pairs}, triplets={num_triplets}, "
        f"variance={variance}"
    )
    print("\n" + "-" * 60)

    # The returned pairs/triplets are not needed here; the generator is
    # called with export_format="both" and the files it is expected to
    # write are listed in the summary below.
    generate_pairs_and_triplets(
        chunks_path=chunks_path,
        output_dir=output_dir,
        num_pairs=num_pairs,
        num_triplets=num_triplets,
        variance=variance,
        export_format="both",
    )

    print("\n" + "=" * 60)
    print("β Pipeline Complete!")
    print("=" * 60)
    print(f"\nπ Output files saved to: {output_dir}")
    print(" - positive_pairs.jsonl")
    print(" - positive_pairs.json")
    print(" - triplets.jsonl")
    print(" - triplets.json")
if __name__ == "__main__":
    # NOTE(review): argparse is imported here but not referenced in the
    # visible code — presumably the CLI entry point does its own parsing.
    import argparse

    # Any command-line argument hands control to the generator's own CLI;
    # a bare invocation runs the zero-configuration default pipeline.
    entry_point = cli_main if sys.argv[1:] else run_default_pipeline
    entry_point()