from __future__ import annotations import hashlib import logging from pathlib import Path from typing import Any from ingestion.models import DocumentChunk from src.file_agent.config import file_config logger = logging.getLogger(__name__) def _word_windows(text: str, max_words: int) -> list[str]: words = text.split() if not words: return [] parts = [] for i in range(0, len(words), max_words): chunk = " ".join(words[i : i + max_words]) if chunk.strip(): parts.append(chunk) return parts def _format_table_rows(rows: list[list[str]]) -> str: return "\n".join(" | ".join(str(c) for c in row) for row in rows) def chunk_file_content( blocks: list[dict[str, Any]], file_path: str, team_id: str, max_words: int = 0, ) -> list[DocumentChunk]: max_words = max_words or file_config.max_words_per_chunk file_name = Path(file_path).name file_url = f"file://{Path(file_path).resolve().as_posix()}" doc_id = hashlib.sha256(f"file:{file_name}".encode()).hexdigest() chunks: list[DocumentChunk] = [] for block_idx, block in enumerate(blocks): btype = block.get("type", "text") content = block.get("content", "") heading = block.get("heading", block.get("tag_path", "")) page = block.get("page", "") sheet = block.get("sheet", "") base_meta = { "file_name": file_name, "block_index": block_idx, "block_type": btype, } if page: base_meta["page"] = page if sheet: base_meta["sheet"] = sheet if btype in ("table",): if isinstance(content, list): text = _format_table_rows(content) else: text = str(content) prefix = f"File: {file_name}\nTable:\n" chunk_id = hashlib.sha256(f"file:{file_name}:block_{block_idx}_part_0".encode()).hexdigest() chunks.append( DocumentChunk( chunk_id=chunk_id, doc_id=doc_id, text=(prefix + text).strip(), source=file_url, source_type="file", team_id=team_id, chunk_index=len(chunks), metadata={**base_meta, "title": file_name}, ) ) elif btype == "row": text = str(content) prefix = f"File: {file_name}\nRow: " chunk_id = hashlib.sha256(f"file:{file_name}:block_{block_idx}_part_0".encode()).hexdigest() chunks.append( DocumentChunk( chunk_id=chunk_id, doc_id=doc_id, text=(prefix + text).strip(), source=file_url, source_type="file", team_id=team_id, chunk_index=len(chunks), metadata={**base_meta, "title": file_name}, ) ) else: # text / section / xml_node — word-window split text = str(content).strip() if not text: continue heading_line = f"Section: {heading}\n\n" if heading else "" prefix = f"File: {file_name}\n{heading_line}" parts = _word_windows(text, max_words) for part_idx, part in enumerate(parts): chunk_id = hashlib.sha256( f"file:{file_name}:block_{block_idx}_part_{part_idx}".encode() ).hexdigest() chunks.append( DocumentChunk( chunk_id=chunk_id, doc_id=doc_id, text=(prefix + part).strip(), source=file_url, source_type="file", team_id=team_id, chunk_index=len(chunks), metadata={**base_meta, "title": file_name, "section_heading": heading}, ) ) logger.debug("file_chunker: %s -> %d chunks from %d blocks", file_name, len(chunks), len(blocks)) return chunks