Spaces:
Running
Running
"""
Code chunker.

Takes the raw definitions from parser.py and produces retrieval-ready chunks.
The only real job here is building `chunk_text` -- the string that actually gets
embedded and BM25-indexed. We prepend a short header (file, kind, name) to the
code so search can match on *location and signature* as well as the code body.
A query like "where is JWT created" then matches both the symbol names and the
implementation.

Run standalone to preview chunks:
    python -m src.ingestion.chunker path/to/repo_dir
(a path to a .zip archive of the repo is accepted in place of the directory)
"""
| # Definitions longer than this many characters get their body trimmed in | |
| # chunk_text so one giant function can't dominate an embedding. The full code | |
| # is always kept in `code` for display/citations -- only the embedded text is | |
| # trimmed. | |
| _MAX_BODY_CHARS = 2000 | |
| def _build_chunk_text(d): | |
| header = f"File: {d['file']} | {d['type']} {d['name']} | lines {d['start_line']}-{d['end_line']}" | |
| body = d["code"] | |
| if len(body) > _MAX_BODY_CHARS: | |
| body = body[:_MAX_BODY_CHARS] + "\n# ... (truncated)" | |
| return f"{header}\n{body}" | |
def build_chunks(definitions):
    """Attach `chunk_text` (and a stable id) to each definition dict.

    Args:
        definitions: Iterable of definition dicts. Each one is read for the
            keys "file", "name" and "start_line" here, plus whatever
            `_build_chunk_text` reads.

    Returns:
        A new list with one dict per definition, in input order. Each dict is
        a shallow copy of its definition with two keys set:
        "id" (``"<file>::<name>::<start_line>"``) and "chunk_text".
        The input dicts themselves are not modified.
    """
    return [
        {
            # Copy first so the parser's output is never mutated.
            **definition,
            "id": f"{definition['file']}::{definition['name']}::{definition['start_line']}",
            "chunk_text": _build_chunk_text(definition),
        }
        for definition in definitions
    ]
def chunk_repo(files):
    """Run the whole ingestion step: parse `files`, then build chunks from the result.

    `files` is the scanner's output; it is handed to `parse_files` unchanged,
    and the parsed definitions are passed on to `build_chunks`.
    """
    # The parser is imported at call time rather than at module import.
    from src.ingestion.parser import parse_files

    definitions = parse_files(files)
    return build_chunks(definitions)
if __name__ == "__main__":
    # Standalone preview: chunk a repo and print the first few chunks.
    import sys

    from src.ingestion.scanner import scan_python_files, scan_repo

    # At least one argument is required; only the first one is used.
    if len(sys.argv) < 2:
        print("usage: python -m src.ingestion.chunker <repo.zip | repo_dir>")
        sys.exit(1)

    target = sys.argv[1]
    if target.lower().endswith(".zip"):
        # For archives, only the first element of scan_repo's result is used.
        files = scan_repo(target)[0]
    else:
        files = scan_python_files(target)

    chunks = chunk_repo(files)
    print(f"Built {len(chunks)} chunks. First 3:\n")

    rule = "-" * 70
    for chunk in chunks[:3]:
        print(rule)
        print(f"id: {chunk['id']}")
        # Cap the preview so a long chunk does not flood the terminal.
        print(chunk["chunk_text"][:400])
    print(rule)