"""Code chunker.

Takes the raw definitions from parser.py and produces retrieval-ready chunks.

The only real job here is building `chunk_text` -- the string that actually
gets embedded and BM25-indexed. We prepend a short header (file, kind, name,
line range) to the code so search can match on *location and signature* as
well as the code body. A query like "where is JWT created" then matches both
the symbol names and the implementation.

Run standalone to preview chunks (the target may be a directory or a .zip):

    python -m src.ingestion.chunker path/to/repo_dir
"""

# Definitions longer than this many characters get their body trimmed in
# chunk_text so one giant function can't dominate an embedding. The full code
# is always kept in `code` for display/citations -- only the embedded text is
# trimmed.
_MAX_BODY_CHARS = 2000


def _build_chunk_text(d):
    """Return the text to embed/index for one definition dict.

    The result is a one-line header followed by the definition's code.

    Args:
        d: Mapping that provides the keys `file`, `type`, `name`,
            `start_line`, `end_line` and `code`.

    Returns:
        The header and the (possibly truncated) code joined by a newline.

    Raises:
        KeyError: If any of the keys listed above is missing from `d`.
    """
    header = (
        f"File: {d['file']} | {d['type']} {d['name']} "
        f"| lines {d['start_line']}-{d['end_line']}"
    )
    body = d["code"]
    if len(body) > _MAX_BODY_CHARS:
        # Trim only the embedded text; the caller's `code` value is untouched.
        body = body[:_MAX_BODY_CHARS] + "\n# ... (truncated)"
    return f"{header}\n{body}"


def build_chunks(definitions):
    """Attach chunk_text (and a stable id) to each definition dict.

    Args:
        definitions: Iterable of definition dicts; each must provide the keys
            `file`, `type`, `name`, `start_line`, `end_line` and `code`.

    Returns:
        A new list with one shallow copy per definition, each extended with
        `id` ("<file>::<name>::<start_line>") and `chunk_text`. The input
        dicts are not modified.
    """
    chunks = []
    for definition in definitions:
        chunk = dict(definition)  # don't mutate the parser's output
        chunk["id"] = (
            f"{definition['file']}::{definition['name']}"
            f"::{definition['start_line']}"
        )
        chunk["chunk_text"] = _build_chunk_text(definition)
        chunks.append(chunk)
    return chunks


def chunk_repo(files):
    """Full ingestion: parse files -> chunks. `files` is scanner output."""
    # Imported lazily so the pure helpers above can be used without pulling
    # in the parser module.
    from src.ingestion.parser import parse_files

    return build_chunks(parse_files(files))


def _main(argv):
    """Preview the first few chunks built from the path given in `argv[1]`."""
    import sys

    from src.ingestion.scanner import scan_python_files, scan_repo

    if len(argv) < 2:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit(
            "usage: python -m src.ingestion.chunker "
            "path/to/repo_dir (or path/to/repo.zip)"
        )

    target = argv[1]
    if target.lower().endswith(".zip"):
        # Only the first element of scan_repo's result is used as the files.
        files = scan_repo(target)[0]
    else:
        files = scan_python_files(target)

    chunks = chunk_repo(files)
    print(f"Built {len(chunks)} chunks. First 3:\n")
    for c in chunks[:3]:
        print("-" * 70)
        print(f"id: {c['id']}")
        print(c["chunk_text"][:400])
    print("-" * 70)


if __name__ == "__main__":
    import sys

    _main(sys.argv)