Spaces:
Sleeping
Sleeping
File size: 3,982 Bytes
"""
Script to generate positive pairs and triplets from code chunks.
This script loads code chunks and generates:
1. Positive Pairs: (question, code) with 4-5 variations per sample
2. Triplets: (anchor_question, positive_code, negative_code)
Usage:
python -m scripts.run_pairs_triplets_pipeline --chunks <path> --output <dir>
python -m scripts.run_pairs_triplets_pipeline --help
Examples:
# Generate from local chunks with default settings
python -m scripts.run_pairs_triplets_pipeline \\
--chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\
--output data/processed/training
# Generate from repository chunks
python -m scripts.run_pairs_triplets_pipeline \\
--chunks data/processed/repos/langgraph_20260116_123638/langgraph_chunks.jsonl \\
--output data/processed/training/langgraph
# Custom settings
python -m scripts.run_pairs_triplets_pipeline \\
--chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\
--output data/processed/training \\
--pairs 100 --triplets 100 --variance 5
"""
import sys
from pathlib import Path
# PROJECT_ROOT is the parent of the directory that contains this file.
# It is placed at the front of sys.path *before* the `src.` import below so
# that import can be resolved; it is also the base for every data path
# used by run_default_pipeline().
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
from src.task_3_data_engineering.export.pairs_triplets_generator import (
generate_pairs_and_triplets,
main as cli_main
)
def _candidate_chunk_files(root: Path) -> "list[Path]":
    """Return the chunk files to try under *root*, in priority order.

    The two well-known locations come first, followed by any
    ``chunks.jsonl`` found in a sub-directory of ``data/processed/chunks``
    and any ``*_chunks.jsonl`` found in a sub-directory of
    ``data/processed/repos``.  The fixed entries are listed even when they
    do not exist so the caller can report which locations were checked.
    """
    processed = root / "data" / "processed"
    chunks_dir = processed / "chunks"
    candidates = [
        chunks_dir / "Local_saved_files" / "chunks.jsonl",
        chunks_dir / "sample_code" / "chunks.jsonl",
    ]

    # iterdir()/glob() yield entries in arbitrary order; sorting makes the
    # file that ends up being selected reproducible between runs.
    if chunks_dir.is_dir():
        for subdir in sorted(chunks_dir.iterdir()):
            chunks_file = subdir / "chunks.jsonl"
            if (
                subdir.is_dir()
                and chunks_file.exists()
                and chunks_file not in candidates
            ):
                candidates.append(chunks_file)

    repos_dir = processed / "repos"
    if repos_dir.is_dir():
        for repo_dir in sorted(repos_dir.iterdir()):
            if repo_dir.is_dir():
                candidates.extend(sorted(repo_dir.glob("*_chunks.jsonl")))

    return candidates


def run_default_pipeline() -> None:
    """Generate pairs and triplets from the first chunks file that exists.

    Candidate files are looked up under ``PROJECT_ROOT/data/processed``.
    When none exists, the first few checked locations are printed and the
    function returns without generating anything.  Otherwise
    ``generate_pairs_and_triplets`` is called with 100 pairs, 100 triplets,
    a variance of 5 and ``export_format="both"``, writing into
    ``PROJECT_ROOT/data/processed/training``.

    Returns:
        None. Progress and results are reported on stdout only.
    """
    # NOTE(review): the leading "β" / "π" glyphs in the messages below look
    # like mis-decoded emoji. They are reproduced as found; confirm against
    # the upstream file before restoring them.
    candidates = _candidate_chunk_files(PROJECT_ROOT)
    chunks_path = next((path for path in candidates if path.exists()), None)

    if chunks_path is None:
        print("β No chunks files found. Please specify a chunks file with --chunks")
        print("\nPossible locations checked:")
        for checked in candidates[:5]:
            print(f" - {checked}")
        return

    output_dir = PROJECT_ROOT / "data" / "processed" / "training"

    # Single source of truth for the settings, so the banner cannot drift
    # from the values actually passed to the generator.
    num_pairs = 100
    num_triplets = 100
    variance = 5

    print("=" * 60)
    print("π Positive Pairs & Triplets Generator")
    print("=" * 60)
    print(f"\nπ Chunks Path: {chunks_path}")
    print(f"π Output Dir: {output_dir}")
    print(
        f"π Settings: pairs={num_pairs}, triplets={num_triplets}, "
        f"variance={variance}"
    )
    print("\n" + "-" * 60)

    # The returned (pairs, triplets) are not needed here; the results are
    # reported below by file name only.
    generate_pairs_and_triplets(
        chunks_path=chunks_path,
        output_dir=output_dir,
        num_pairs=num_pairs,
        num_triplets=num_triplets,
        variance=variance,
        export_format="both",
    )

    print("\n" + "=" * 60)
    # This literal was split across two physical lines (a syntax error).
    # Rejoined, with the glyph reconstructed as a check mark -- presumably
    # the original character; TODO confirm.
    print("✅ Pipeline Complete!")
    print("=" * 60)
    print(f"\nπ Output files saved to: {output_dir}")
    print(" - positive_pairs.jsonl")
    print(" - positive_pairs.json")
    print(" - triplets.jsonl")
    print(" - triplets.json")
if __name__ == "__main__":
    # Any command-line argument hands control to cli_main (imported from the
    # generator module; presumably it parses sys.argv itself -- confirm).
    # A bare invocation runs the built-in defaults instead.
    if sys.argv[1:]:
        cli_main()
    else:
        run_default_pipeline()
|