File size: 3,982 Bytes
463fc7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""
Script to generate positive pairs and triplets from code chunks.

This script loads code chunks and generates:
1. Positive Pairs: (question, code) with 4-5 variations per sample
2. Triplets: (anchor_question, positive_code, negative_code)

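Usage:
    python -m scripts.run_pairs_triplets_pipeline --chunks <path> --output <dir>
    python -m scripts.run_pairs_triplets_pipeline --help

    With no arguments, the script searches data/processed/ for a chunks
    file and runs the generator with default settings.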

Examples:
    # Generate from local chunks with default settings
    python -m scripts.run_pairs_triplets_pipeline \\
        --chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\
        --output data/processed/training

    # Generate from repository chunks  
    python -m scripts.run_pairs_triplets_pipeline \\
        --chunks data/processed/repos/langgraph_20260116_123638/langgraph_chunks.jsonl \\
        --output data/processed/training/langgraph

    # Custom settings
    python -m scripts.run_pairs_triplets_pipeline \\
        --chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\
        --output data/processed/training \\
        --pairs 100 --triplets 100 --variance 5
"""

import sys
from pathlib import Path

# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from src.task_3_data_engineering.export.pairs_triplets_generator import (
    generate_pairs_and_triplets,
    main as cli_main
)


def run_default_pipeline(
    num_pairs: int = 100,
    num_triplets: int = 100,
    variance: int = 5,
) -> None:
    """Run the generator on the first chunks file found under the project.

    Candidate files are considered in this order, and the first one that
    exists is used:

    1. ``data/processed/chunks/Local_saved_files/chunks.jsonl``
    2. ``data/processed/chunks/sample_code/chunks.jsonl``
    3. ``chunks.jsonl`` in any other subdirectory of ``data/processed/chunks``
    4. ``*_chunks.jsonl`` in any subdirectory of ``data/processed/repos``

    Discovered entries (3 and 4) are visited in sorted order so that the
    selected file does not depend on the filesystem's directory ordering.

    The selected file is passed to ``generate_pairs_and_triplets`` with
    ``data/processed/training`` as the output directory and
    ``export_format="both"``. If no candidate exists, a short diagnostic is
    printed and the function returns without generating anything.

    Args:
        num_pairs: Forwarded as ``num_pairs`` to the generator.
        num_triplets: Forwarded as ``num_triplets`` to the generator.
        variance: Forwarded as ``variance`` to the generator.
    """
    processed_dir = PROJECT_ROOT / "data" / "processed"
    chunks_dir = processed_dir / "chunks"
    repos_dir = processed_dir / "repos"

    # Well-known locations are tried first, before anything discovered.
    possible_paths = [
        chunks_dir / "Local_saved_files" / "chunks.jsonl",
        chunks_dir / "sample_code" / "chunks.jsonl",
    ]

    # Any other <subdir>/chunks.jsonl below the chunks folder.
    if chunks_dir.exists():
        for subdir in sorted(chunks_dir.iterdir()):
            if not subdir.is_dir():
                continue
            chunks_file = subdir / "chunks.jsonl"
            if chunks_file.exists() and chunks_file not in possible_paths:
                possible_paths.append(chunks_file)

    # Per-repository chunk files, named <something>_chunks.jsonl.
    if repos_dir.exists():
        for repo_dir in sorted(repos_dir.iterdir()):
            if repo_dir.is_dir():
                possible_paths.extend(sorted(repo_dir.glob("*_chunks.jsonl")))

    chunks_path = next((p for p in possible_paths if p.exists()), None)

    if chunks_path is None:
        print("❌ No chunks files found. Please specify a chunks file with --chunks")
        print("\nPossible locations checked:")
        # Only the first few candidates are listed to keep the message short.
        for p in possible_paths[:5]:
            print(f"   - {p}")
        return

    output_dir = processed_dir / "training"

    print("=" * 60)
    print("🚀 Positive Pairs & Triplets Generator")
    print("=" * 60)
    print(f"\n📂 Chunks Path: {chunks_path}")
    print(f"📁 Output Dir: {output_dir}")
    # Report the values actually passed below rather than a duplicated literal.
    print(
        f"📊 Settings: pairs={num_pairs}, "
        f"triplets={num_triplets}, variance={variance}"
    )
    print("\n" + "-" * 60)

    # The returned pairs/triplets are not needed here; results are exported
    # to output_dir by the generator.
    generate_pairs_and_triplets(
        chunks_path=chunks_path,
        output_dir=output_dir,
        num_pairs=num_pairs,
        num_triplets=num_triplets,
        variance=variance,
        export_format="both",
    )

    print("\n" + "=" * 60)
    print("✅ Pipeline Complete!")
    print("=" * 60)
    print(f"\n📁 Output files saved to: {output_dir}")
    print("   - positive_pairs.jsonl")
    print("   - positive_pairs.json")
    print("   - triplets.jsonl")
    print("   - triplets.json")


if __name__ == "__main__":
    # Any command-line argument hands control to the generator's own CLI
    # (which parses sys.argv itself); a bare invocation falls back to
    # auto-discovery of a chunks file with default settings.
    if len(sys.argv) > 1:
        cli_main()
    else:
        run_default_pipeline()