"""
Code chunker.

Takes the raw definitions from parser.py and produces retrieval-ready chunks.
The only real job here is building `chunk_text` -- the string that actually gets
embedded and BM25-indexed. We prepend a short header (file, kind, name, line
range) to the code so search can match on *location and signature* as well as
the code body.
A query like "where is JWT created" then matches both the symbol names and the
implementation.

Run standalone to preview chunks (accepts a repo directory or a .zip archive):
    python -m src.ingestion.chunker path/to/repo_dir
"""

# Definitions whose code is longer than this many characters get their body
# trimmed in chunk_text so one giant function can't dominate an embedding. The
# full code is always kept in `code` for display/citations -- only the embedded
# text is trimmed.
_MAX_BODY_CHARS = 2000


def _build_chunk_text(d, *, max_body_chars=None):
    """Return the header-plus-code string stored as a chunk's ``chunk_text``.

    Args:
        d: Definition mapping. The keys ``file``, ``type``, ``name``,
            ``start_line``, ``end_line`` and ``code`` are read.
        max_body_chars: Optional override for the body length limit. When
            omitted (``None``) the module-level ``_MAX_BODY_CHARS`` is used,
            looked up at call time.

    Returns:
        A one-line header, a newline, then the code. Code longer than the
        limit is cut to exactly that many characters and followed by a
        ``# ... (truncated)`` marker line.

    Raises:
        ValueError: If ``max_body_chars`` is given and is negative.
    """
    if max_body_chars is None:
        limit = _MAX_BODY_CHARS
    elif max_body_chars < 0:
        # A negative slice bound would silently trim from the end instead.
        raise ValueError("max_body_chars must be non-negative")
    else:
        limit = max_body_chars

    header = (
        f"File: {d['file']} | {d['type']} {d['name']} "
        f"| lines {d['start_line']}-{d['end_line']}"
    )
    body = d["code"]
    if len(body) > limit:
        # Only the embedded text is shortened; the caller's dict is untouched.
        body = body[:limit] + "\n# ... (truncated)"
    return f"{header}\n{body}"


def build_chunks(definitions):
    """Attach ``chunk_text`` and an ``id`` to a copy of each definition.

    Args:
        definitions: Iterable of definition mappings. Each must provide the
            keys ``file``, ``name`` and ``start_line`` (used for the id) plus
            whatever ``_build_chunk_text`` reads.

    Returns:
        A new list with one dict per definition, in input order. Each dict
        holds the definition's own items plus ``id``
        (``"<file>::<name>::<start_line>"``) and ``chunk_text``. The input
        mappings are not mutated.
    """
    return [
        {
            # Copy first so the parser's output is left untouched; the keys
            # below overwrite any same-named entries from the definition.
            **d,
            "id": f"{d['file']}::{d['name']}::{d['start_line']}",
            "chunk_text": _build_chunk_text(d),
        }
        for d in definitions
    ]


def chunk_repo(files):
    """Run the whole ingestion step: parse ``files`` and turn them into chunks.

    ``files`` is the scanner's output; it is handed to ``parse_files`` as-is
    and the resulting definitions are passed through ``build_chunks``.
    """
    # Function-scope import: the parser module is only loaded when this runs.
    from src.ingestion.parser import parse_files

    definitions = parse_files(files)
    return build_chunks(definitions)


if __name__ == "__main__":
    # Preview mode: chunk the given repo and print the first few results.
    import sys
    from src.ingestion.scanner import scan_python_files, scan_repo

    cli_args = sys.argv[1:]
    if not cli_args:
        print("usage: python -m src.ingestion.chunker <repo.zip | repo_dir>")
        sys.exit(1)

    repo_path = cli_args[0]
    if repo_path.lower().endswith(".zip"):
        # Only the first element of scan_repo's result is used as the file list.
        scanned = scan_repo(repo_path)[0]
    else:
        scanned = scan_python_files(repo_path)
    preview = chunk_repo(scanned)

    rule = "-" * 70
    print(f"Built {len(preview)} chunks. First 3:\n")
    for item in preview[:3]:
        print(rule)
        print(f"id: {item['id']}")
        # Cap the preview so one long chunk doesn't flood the terminal.
        print(item["chunk_text"][:400])
    print(rule)