Spaces:

AishaSurve
/

codebase-agent

Running

App Files Files Community

codebase-agent / src /ingestion /chunker.py

AishaSurve

Codebase Intelligence Agent: code-aware RAG + test-gen agent + eval

8e72e1f 9 days ago

Raw

History Blame Contribute Delete

2.23 kB

	"""
	Code chunker.

	Takes the raw definitions from parser.py and produces retrieval-ready chunks.
	The only real job here is building `chunk_text` -- the string that actually gets
	embedded and BM25-indexed. We prepend a short header (file, kind, name) to the
	code so search can match on location and signature as well as the code body.
	A query like "where is JWT created" then matches both the symbol names and the
	implementation.

	Run standalone to preview chunks (accepts a repo directory or a .zip archive):
	    python -m src.ingestion.chunker path/to/repo_dir
	"""

	# Definitions longer than this many characters get their body trimmed in
	# chunk_text so one giant function can't dominate an embedding. The full code
	# is always kept in `code` for display/citations -- only the embedded text is
	# trimmed.
	_MAX_BODY_CHARS = 2000


	def _build_chunk_text(d):
	    """Return the text that gets embedded and indexed for one definition.

	    The result is a one-line header (file, kind, name, line span) followed by
	    the definition's code. Code longer than `_MAX_BODY_CHARS` characters is
	    cut at that length and a truncation marker line is appended.

	    `d` must provide the keys `file`, `type`, `name`, `start_line`,
	    `end_line` and `code`; a missing key raises `KeyError`.
	    """
	    # Separators are plain "|". A backslash in front of the pipe would be an
	    # invalid escape sequence and would end up verbatim in the indexed text.
	    header = f"File: {d['file']} | {d['type']} {d['name']} | lines {d['start_line']}-{d['end_line']}"
	    body = d["code"]
	    if len(body) > _MAX_BODY_CHARS:
	        body = body[:_MAX_BODY_CHARS] + "\n# ... (truncated)"
	    return f"{header}\n{body}"


	def build_chunks(definitions):
	    """Attach `chunk_text` and a stable `id` to each definition dict.

	    Every input dict is shallow-copied, so the caller's dicts are never
	    mutated. The id has the form ``<file>::<name>::<start_line>``.

	    Returns a new list of chunk dicts in the same order as `definitions`.
	    """
	    return [
	        {
	            **d,  # copy first, then add/override the two derived keys
	            "id": f"{d['file']}::{d['name']}::{d['start_line']}",
	            "chunk_text": _build_chunk_text(d),
	        }
	        for d in definitions
	    ]


	def chunk_repo(files):
	    """Run the whole ingestion path: parse the scanned files, then chunk them.

	    `files` is the scanner's output; it is passed to `parse_files` unchanged
	    and the resulting definitions are turned into chunks by `build_chunks`.
	    """
	    # Imported here rather than at module level, as in the original.
	    from src.ingestion.parser import parse_files

	    definitions = parse_files(files)
	    return build_chunks(definitions)


	if __name__ == "__main__":
	    # Standalone preview: scan a repo, chunk it, and print the first few chunks.
	    import sys

	    from src.ingestion.scanner import scan_python_files, scan_repo

	    if len(sys.argv) < 2:
	        # Plain "|" here: a backslash before the pipe is an invalid escape
	        # sequence and would be printed verbatim in the usage line.
	        print("usage: python -m src.ingestion.chunker <repo.zip | repo_dir>")
	        sys.exit(1)

	    target = sys.argv[1]
	    # A path ending in .zip (case-insensitive) goes through scan_repo and only
	    # the first element of its result is used -- presumably the file list;
	    # confirm against scanner.py. Anything else is scanned as a directory.
	    if target.lower().endswith(".zip"):
	        files = scan_repo(target)[0]
	    else:
	        files = scan_python_files(target)
	    chunks = chunk_repo(files)

	    print(f"Built {len(chunks)} chunks. First 3:\n")
	    for c in chunks[:3]:
	        print("-" * 70)
	        print(f"id: {c['id']}")
	        # Cap the preview so a long definition doesn't flood the terminal.
	        print(c["chunk_text"][:400])
	    print("-" * 70)