"""
Script to generate positive pairs and triplets from code chunks.

This script loads code chunks and generates:
1. Positive Pairs: (question, code) with 4-5 variations per sample
2. Triplets: (anchor_question, positive_code, negative_code)

When invoked with command-line arguments, they are handed to the generator's
own CLI. When invoked with no arguments, the script looks for a chunks file
under ``data/processed`` and runs the generator with the default settings
defined below.

Usage:
    python -m scripts.run_pairs_triplets_pipeline --chunks <chunks_file> --output <output_dir>
    python -m scripts.run_pairs_triplets_pipeline --help

Examples:
    # Generate from local chunks with default settings
    python -m scripts.run_pairs_triplets_pipeline \\
        --chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\
        --output data/processed/training

    # Generate from repository chunks
    python -m scripts.run_pairs_triplets_pipeline \\
        --chunks data/processed/repos/langgraph_20260116_123638/langgraph_chunks.jsonl \\
        --output data/processed/training/langgraph

    # Custom settings
    python -m scripts.run_pairs_triplets_pipeline \\
        --chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\
        --output data/processed/training \\
        --pairs 100 --triplets 100 --variance 5
"""

import sys
from pathlib import Path

# Add project root to path so that the ``src`` package resolves when this file
# is executed directly. This must happen before the project import below.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from src.task_3_data_engineering.export.pairs_triplets_generator import (  # noqa: E402
    generate_pairs_and_triplets,
    main as cli_main,
)

# Settings used by the no-argument run. They are defined once so that the
# banner printed to the user and the values actually passed to the generator
# cannot drift apart.
DEFAULT_NUM_PAIRS = 100
DEFAULT_NUM_TRIPLETS = 100
DEFAULT_VARIANCE = 5
DEFAULT_EXPORT_FORMAT = "both"


def _candidate_chunk_paths():
    """Return candidate chunk files in priority order (earlier entries win).

    The list starts with the two well-known locations, followed by any
    ``chunks.jsonl`` found in a direct subdirectory of
    ``data/processed/chunks``, followed by any ``*_chunks.jsonl`` found in a
    direct subdirectory of ``data/processed/repos``. The well-known entries
    are listed whether or not they exist; the caller filters on existence.
    """
    chunks_dir = PROJECT_ROOT / "data" / "processed" / "chunks"
    repos_dir = PROJECT_ROOT / "data" / "processed" / "repos"

    candidates = [
        chunks_dir / "Local_saved_files" / "chunks.jsonl",
        chunks_dir / "sample_code" / "chunks.jsonl",
    ]

    # Directory listings come back in arbitrary order, so sort them: the first
    # existing candidate is the one that gets used, and that choice should not
    # depend on the filesystem.
    if chunks_dir.is_dir():
        for subdir in sorted(chunks_dir.iterdir()):
            if not subdir.is_dir():
                continue
            chunks_file = subdir / "chunks.jsonl"
            # Skip the well-known locations that are already listed above.
            if chunks_file.exists() and chunks_file not in candidates:
                candidates.append(chunks_file)

    if repos_dir.is_dir():
        for repo_dir in sorted(repos_dir.iterdir()):
            if repo_dir.is_dir():
                candidates.extend(sorted(repo_dir.glob("*_chunks.jsonl")))

    return candidates


def run_default_pipeline():
    """Run the generator with default settings on the first chunks file found.

    Picks the first existing path from ``_candidate_chunk_paths()`` and passes
    it to ``generate_pairs_and_triplets`` together with the ``DEFAULT_*``
    settings, writing to ``data/processed/training`` under the project root.

    If no candidate exists, prints the first five locations that were checked
    and returns without running the generator.

    Returns:
        None.
    """
    possible_paths = _candidate_chunk_paths()
    chunks_path = next((path for path in possible_paths if path.exists()), None)

    if chunks_path is None:
        print("āŒ No chunks files found. Please specify a chunks file with --chunks")
        print("\nPossible locations checked:")
        # Only the first few entries are shown to keep the message short.
        for p in possible_paths[:5]:
            print(f" - {p}")
        return

    output_dir = PROJECT_ROOT / "data" / "processed" / "training"

    print("=" * 60)
    print("šŸš€ Positive Pairs & Triplets Generator")
    print("=" * 60)
    print(f"\nšŸ“‚ Chunks Path: {chunks_path}")
    print(f"šŸ“ Output Dir: {output_dir}")
    print(
        f"šŸ“Š Settings: pairs={DEFAULT_NUM_PAIRS}, "
        f"triplets={DEFAULT_NUM_TRIPLETS}, variance={DEFAULT_VARIANCE}"
    )
    print("\n" + "-" * 60)

    # The return value is not needed here; results are reported via the
    # output directory below.
    generate_pairs_and_triplets(
        chunks_path=chunks_path,
        output_dir=output_dir,
        num_pairs=DEFAULT_NUM_PAIRS,
        num_triplets=DEFAULT_NUM_TRIPLETS,
        variance=DEFAULT_VARIANCE,
        export_format=DEFAULT_EXPORT_FORMAT,
    )

    print("\n" + "=" * 60)
    print("āœ… Pipeline Complete!")
    print("=" * 60)
    print(f"\nšŸ“ Output files saved to: {output_dir}")
    print(" - positive_pairs.jsonl")
    print(" - positive_pairs.json")
    print(" - triplets.jsonl")
    print(" - triplets.json")


if __name__ == "__main__":
    # Retained from the original entry point; not referenced in this file.
    import argparse  # noqa: F401

    if len(sys.argv) > 1:
        # Arguments were supplied: defer to the generator's own CLI.
        cli_main()
    else:
        # No arguments: discover a chunks file and run with defaults.
        run_default_pipeline()