Spaces:
Sleeping
Sleeping
"""Synthesize triplet and positive-pair datasets from chunked code files."""
| import argparse | |
| import json | |
| import random | |
| import hashlib | |
| from pathlib import Path | |
| from typing import Dict, List | |
| from datetime import datetime | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| # ============================ | |
| # CONFIG | |
| # ============================ | |
# Sampling limits: how many chunks a run keeps, and how many anchor
# questions are requested for each of them.
MAX_DOCUMENTS = 200
POSITIVE_VARIANTS = 5

# Vocabulary cap passed to the TF-IDF vectorizer used for negative mining.
TFIDF_MAX_FEATURES = 5_000

# Every run is written to its own sub-directory below this folder.
BASE_OUTPUT_DIR = Path("data", "synthetic")

# The global `random` generator is seeded once at import time so that chunk
# sampling and template ordering are repeatable between runs.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
| # ============================ | |
| # UTILITIES | |
| # ============================ | |
def load_chunks(file_path):
    """
    Load a list of chunk records from a ``.json`` or ``.jsonl`` file.

    ``.jsonl`` files are read line by line: blank lines are skipped and every
    other line is parsed as one JSON value. ``.json`` files must contain a
    top-level list. The extension is matched case-insensitively, so
    ``chunks.JSONL`` is treated like ``chunks.jsonl``.

    Args:
        file_path: Location of the file to read (``str`` or ``Path``).

    Returns:
        The parsed records, in file order.

    Raises:
        ValueError: If a ``.jsonl`` line is not valid JSON, if a ``.json``
            file does not hold a list, or if the extension is unsupported.
    """
    path = Path(file_path)
    # Normalise once so upper/mixed-case extensions are accepted too.
    suffix = path.suffix.lower()

    if suffix == ".jsonl":
        chunks = []
        with open(path, "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    chunks.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Report the offending line; the parser error stays
                    # attached as the cause.
                    raise ValueError(
                        f"Invalid JSON on line {line_no} in {path}"
                    ) from e
        return chunks

    if suffix == ".json":
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, list):
            raise ValueError(f"{path} must contain a list of chunks")
        return data

    raise ValueError(
        f"Unsupported file format {path.suffix}. Use .json or .jsonl"
    )
def save_jsonl(path: Path, records: List[Dict]):
    """
    Write ``records`` to ``path`` as JSON Lines, one record per line.

    Missing parent directories are created and an existing file is
    overwritten. Non-ASCII characters are written as-is instead of being
    escaped.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n"
            for record in records
        )
def save_json(path: Path, data):
    """
    Write ``data`` to ``path`` as indented (2-space) JSON.

    Missing parent directories are created and an existing file is
    overwritten.

    Args:
        path: Destination file.
        data: Any JSON-serialisable value.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        # ensure_ascii=False matches save_jsonl: the file is opened as UTF-8,
        # so non-ASCII text is stored readably instead of as \uXXXX escapes.
        json.dump(data, f, indent=2, ensure_ascii=False)
def stable_document_id(chunk: Dict, idx: int) -> str:
    """
    Build the canonical ``doc_<sha1>`` identifier for a chunk.

    The digest covers the chunk's ``file_path`` (``"unknown"`` when the key
    is absent) joined to ``idx`` with ``::``, so the same pair always maps to
    the same id.
    """
    source_path = chunk.get("file_path", "unknown")
    digest = hashlib.sha1(f"{source_path}::{idx}".encode()).hexdigest()
    return f"doc_{digest}"
def infer_framework(input_path: Path) -> str:
    """
    Guess the framework a chunk file belongs to from its path.

    Each path component is lower-cased and compared, as a whole component,
    against the known framework names in a fixed priority order. The first
    name present wins; ``"unknown"`` is returned when none is.
    """
    known_frameworks = ("crewai", "langchain", "langgraph", "autogen")
    components = {part.lower() for part in input_path.parts}
    return next(
        (name for name in known_frameworks if name in components),
        "unknown",
    )
| # ============================ | |
| # ANCHOR GENERATION (LLM PLACEHOLDER) | |
| # ============================ | |
def generate_anchor_questions(code: str, n: int) -> List[str]:
    """
    Build up to ``n`` template questions about the symbol defined in ``code``.

    Placeholder for an LLM call. The symbol is the name in the first
    ``def`` / ``async def`` / ``class`` header found in ``code``, so
    decorators, leading comments and classes declared without parentheses no
    longer leak into the questions. When no such header exists, the text
    before the first ``(`` is used instead.

    The five templates are shuffled with the module-level ``random``
    generator, so the order is repeatable only for a given seed and call
    sequence.

    Args:
        code: Source text of one chunk.
        n: Maximum number of questions to return (at most five exist).

    Returns:
        The first ``n`` questions after shuffling.
    """
    import re  # function-scope import keeps this block self-contained

    header = re.search(
        r"^[ \t]*(?:async[ \t]+)?(?:def|class)[ \t]+([^\W\d]\w*)",
        code,
        flags=re.MULTILINE,
    )
    if header:
        symbol = header.group(1)
    else:
        # No def/class header (e.g. a bare expression): keep the previous
        # best-effort heuristic.
        symbol = (
            code.split("(")[0]
            .replace("def ", "")
            .replace("class ", "")
            .strip()
        )

    templates = [
        f"How does {symbol} work in Python?",
        f"How to implement {symbol}?",
        f"Example usage of {symbol}",
        f"Explain the {symbol} logic",
        f"Best practices for {symbol}",
    ]
    random.shuffle(templates)
    return templates[:n]
| # ============================ | |
| # NEGATIVE MINING | |
| # ============================ | |
def build_tfidf(chunks: List[Dict]):
    """
    Fit a TF-IDF model over the ``code`` field of every chunk.

    Returns the fitted vectorizer together with the matrix produced by
    fitting it on the chunk texts, which are passed in ``chunks`` order.
    """
    tfidf = TfidfVectorizer(
        max_features=TFIDF_MAX_FEATURES,
        stop_words="english",
    )
    code_texts = [chunk["code"] for chunk in chunks]
    doc_term_matrix = tfidf.fit_transform(code_texts)
    return tfidf, doc_term_matrix
def mine_hard_negative(
    anchor: str,
    positive_idx: int,
    chunks: List[Dict],
    vectorizer,
    matrix,
) -> Dict:
    """
    Pick the chunk most similar to ``anchor`` that is not the positive.

    The anchor is projected with ``vectorizer`` and compared to every row of
    ``matrix`` by cosine similarity. Candidates are visited from highest to
    lowest score (ties keep their order in ``chunks``). A candidate whose
    ``code`` is identical to the positive's is skipped, because it would give
    a triplet whose negative equals its positive. If every other chunk is
    such a duplicate, the best-ranked one is still returned, so this never
    fails where a non-positive chunk exists.

    Args:
        anchor: Query text.
        positive_idx: Index in ``chunks`` of the positive example.
        chunks: Chunk records; the index is assumed to line up with the rows
            of ``matrix``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        matrix: Document matrix produced by the same vectorizer.

    Returns:
        The selected negative chunk record.

    Raises:
        RuntimeError: If there is no chunk other than the positive.
    """
    query_vec = vectorizer.transform([anchor])
    scores = cosine_similarity(query_vec, matrix)[0]

    # sorted() is stable, so equal scores stay in corpus order.
    ranked = sorted(
        range(len(scores)), key=lambda i: scores[i], reverse=True
    )

    positive_code = None
    if 0 <= positive_idx < len(chunks):
        positive_code = chunks[positive_idx].get("code")

    fallback = None
    for idx in ranked:
        if idx == positive_idx:
            continue
        candidate = chunks[idx]
        if positive_code is None or candidate.get("code") != positive_code:
            return candidate
        # Exact duplicate of the positive: remember the best one in case
        # nothing distinct exists.
        if fallback is None:
            fallback = candidate

    if fallback is not None:
        return fallback
    raise RuntimeError("No negative candidate found")
| # ============================ | |
| # MAIN PIPELINE | |
| # ============================ | |
def generate_datasets(input_path: Path, run_name: str):
    """
    Build positive-pair and triplet datasets from a chunk file and save them.

    Steps:
      1. Load the chunks and keep those whose ``chunk_type`` is ``class``,
         ``method`` or ``function`` and whose ``code`` is a string.
      2. Shuffle, keep at most ``MAX_DOCUMENTS`` and assign each a
         ``document_id`` (written into the chunk dict).
      3. For every chunk, create ``POSITIVE_VARIANTS`` anchor questions as
         positive pairs, plus one triplet whose negative is mined via TF-IDF
         similarity to the first anchor.
      4. Write ``positive_pairs`` and ``triplets`` as both ``.jsonl`` and
         ``.json``, plus ``metadata.json``, under
         ``BASE_OUTPUT_DIR / run_name``.

    Args:
        input_path: Chunk file (``.json`` or ``.jsonl``).
        run_name: Name of the output sub-directory, also stored in metadata.

    Raises:
        ValueError: If the input cannot be loaded or no usable chunk remains
            after filtering.
        RuntimeError: If a negative cannot be mined (e.g. only one chunk).
    """
    # Function-scope import: only needed for the timestamp below.
    from datetime import timezone

    output_dir = BASE_OUTPUT_DIR / run_name
    framework = infer_framework(input_path)

    # Keep only semantic code chunks. Requiring a string `code` prevents
    # failures later in TF-IDF fitting and anchor generation.
    semantic_types = {"class", "method", "function"}
    chunks = [
        c for c in load_chunks(input_path)
        if c.get("chunk_type") in semantic_types
        and isinstance(c.get("code"), str)
    ]
    if not chunks:
        # Fail early with an actionable message instead of the vectorizer's
        # "empty vocabulary" ValueError.
        raise ValueError(
            f"No class/method/function chunks with code found in {input_path}"
        )

    random.shuffle(chunks)
    chunks = chunks[:MAX_DOCUMENTS]

    # Assign canonical document_id (index is the post-shuffle position).
    for idx, c in enumerate(chunks):
        c["document_id"] = stable_document_id(c, idx)

    vectorizer, matrix = build_tfidf(chunks)

    positive_pairs = []
    triplets = []
    for idx, chunk in enumerate(chunks):
        code = chunk["code"]
        doc_id = chunk["document_id"]

        # -------- POSITIVE PAIRS --------
        anchors = generate_anchor_questions(code, POSITIVE_VARIANTS)
        positive_pairs.extend(
            {
                "document_id": doc_id,
                "anchor": a,
                "positive": code,
                "framework": framework,
                "source": "synthetic_positive_v2",
            }
            for a in anchors
        )

        # -------- TRIPLET --------
        anchor = anchors[0]
        negative_chunk = mine_hard_negative(
            anchor, idx, chunks, vectorizer, matrix
        )
        triplets.append({
            "document_id": doc_id,
            "anchor": anchor,
            "positive": code,
            "negative": negative_chunk["code"],
            "framework": framework,
            "source": "synthetic_triplet_v2",
        })

    # -------- SAVE --------
    save_jsonl(output_dir / "positive_pairs.jsonl", positive_pairs)
    save_jsonl(output_dir / "triplets.jsonl", triplets)
    save_json(output_dir / "positive_pairs.json", positive_pairs)
    save_json(output_dir / "triplets.json", triplets)

    # datetime.utcnow() is deprecated; take the aware UTC time and drop the
    # tzinfo so the stored string keeps the same offset-less format.
    created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    metadata = {
        "name": run_name,
        "framework": framework,
        "input_file": str(input_path),
        "num_chunks": len(chunks),
        "positive_pairs": len(positive_pairs),
        "triplets": len(triplets),
        "created_at": created_at,
        "random_seed": RANDOM_SEED,
    }
    save_json(output_dir / "metadata.json", metadata)

    print(f"✅ Dataset generated at: {output_dir}")
| # ============================ | |
| # ENTRY POINT | |
| # ============================ | |
if __name__ == "__main__":
    # Command-line entry point: read the chunk file given by --input and
    # write the generated datasets to a run directory named by --name.
    parser = argparse.ArgumentParser()
    # load_chunks accepts both extensions, so the help text names both.
    parser.add_argument(
        "--input", required=True, help="Chunked .json or .jsonl file"
    )
    parser.add_argument("--name", required=True, help="Synthetic dataset name")
    args = parser.parse_args()
    generate_datasets(
        input_path=Path(args.input),
        run_name=args.name,
    )
# NOTE: intended canonical scheme for document ids. stable_document_id
# currently hashes file_path and the chunk index instead.
# document_id := sha1(
#     normalized_repo_path +
#     file_path +
#     top_level_symbol
# )