# Source: codebase-agent/src/ingestion/chunker.py
# Commit 8e72e1f: "Codebase Intelligence Agent: code-aware RAG + test-gen agent + eval"
# (File-viewer residue: avatar caption "AishaSurve's picture"; Raw / History /
# Blame / Contribute / Delete links; file size 2.23 kB.)
"""
Code chunker.
Takes the raw definitions from parser.py and produces retrieval-ready chunks.
The only real job here is building `chunk_text` -- the string that actually gets
embedded and BM25-indexed. We prepend a short header (file, kind, name) to the
code so search can match on *location and signature* as well as the code body.
A query like "where is JWT created" then matches both the symbol names and the
implementation.
Run standalone to preview chunks (accepts a repo directory or a .zip archive):
    python -m src.ingestion.chunker path/to/repo_dir
"""
# Definitions longer than this many characters get their body trimmed in
# chunk_text so one giant function can't dominate an embedding. The full code
# is always kept in `code` for display/citations -- only the embedded text is
# trimmed.
_MAX_BODY_CHARS = 2000


def _build_chunk_text(d, *, max_body_chars=None) -> str:
    """Return the text that gets embedded and BM25-indexed for one definition.

    The result is a one-line header (file, kind, name, line range) followed by
    a newline and the definition's code, so a search can match on location and
    signature as well as on the code body.

    Args:
        d: Definition mapping. Reads the keys ``file``, ``type``, ``name``,
            ``start_line``, ``end_line`` and ``code``; it is not modified.
        max_body_chars: Optional non-negative cap on how many characters of
            ``code`` are kept. When omitted, the module-level
            ``_MAX_BODY_CHARS`` is used (looked up at call time).

    Returns:
        The header line plus the code. When the code is longer than the cap,
        it is cut to exactly the cap and a ``# ... (truncated)`` marker line
        is appended.

    Raises:
        KeyError: If ``d`` lacks one of the keys listed above.
    """
    # Resolve the cap per call so a changed module constant is still honoured.
    limit = _MAX_BODY_CHARS if max_body_chars is None else max_body_chars
    header = (
        f"File: {d['file']} | {d['type']} {d['name']} "
        f"| lines {d['start_line']}-{d['end_line']}"
    )
    body = d["code"]
    if len(body) > limit:
        # Only the embedded text is shortened; d["code"] keeps the full source.
        body = body[:limit] + "\n# ... (truncated)"
    return f"{header}\n{body}"
def build_chunks(definitions):
    """Attach chunk_text (and a stable id) to each definition dict.

    Args:
        definitions: Iterable of definition dicts (the raw definitions from
            parser.py). Each must provide ``file``, ``name`` and
            ``start_line`` for the id, plus the keys ``_build_chunk_text``
            reads.

    Returns:
        A new list, in input order, of shallow copies of the definitions with
        ``id`` set to ``"<file>::<name>::<start_line>"`` and ``chunk_text``
        set to the retrieval text for that definition.
    """
    chunks = []
    for d in definitions:
        chunk = dict(d)  # shallow copy: don't mutate the parser's output
        chunk["id"] = f"{d['file']}::{d['name']}::{d['start_line']}"
        chunk["chunk_text"] = _build_chunk_text(d)
        chunks.append(chunk)
    return chunks
def chunk_repo(files):
    """Run the whole ingestion step: parse `files`, then build chunks.

    `files` is the scanner's output; it is handed to `parse_files` unchanged
    and the resulting definitions go straight to `build_chunks`.
    """
    # Imported lazily, when the function is called, not at module import.
    from src.ingestion.parser import parse_files

    definitions = parse_files(files)
    return build_chunks(definitions)
if __name__ == "__main__":
    # Standalone preview: scan a repo (directory or .zip), chunk it, and print
    # the beginning of the first few chunks.
    import sys

    from src.ingestion.scanner import scan_python_files, scan_repo

    if len(sys.argv) < 2:
        print("usage: python -m src.ingestion.chunker <repo.zip | repo_dir>")
        sys.exit(1)

    target = sys.argv[1]
    if target.lower().endswith(".zip"):
        # Only the first element of scan_repo's result is used here.
        files = scan_repo(target)[0]
    else:
        files = scan_python_files(target)

    chunks = chunk_repo(files)
    separator = "-" * 70
    print(f"Built {len(chunks)} chunks. First 3:\n")
    for preview in chunks[:3]:
        print(separator)
        print(f"id: {preview['id']}")
        # Cap each preview at 400 characters to keep the output readable.
        print(preview["chunk_text"][:400])
    # Closing rule after the last preview.
    print(separator)