from __future__ import annotations

from typing import Mapping

from backend.api.utils.text_extractor import extract_text
from backend.mcp_server.common.database import insert_document_chunks
from backend.mcp_server.common.embeddings import embed_text
from backend.mcp_server.common.tenant import TenantContext
from backend.mcp_server.common.utils import ToolValidationError, tool_handler

# Bounds applied to the caller-supplied ``chunk_words`` value; out-of-range
# values are clamped into this interval rather than rejected.
_DEFAULT_CHUNK_WORDS = 300
_MIN_CHUNK_WORDS = 50
_MAX_CHUNK_WORDS = 800


@tool_handler("rag.ingest")
async def rag_ingest(
    context: TenantContext, payload: Mapping[str, object]
) -> dict[str, object]:
    """Ingest raw text into the tenant's knowledge base.

    The text is split into chunks with ``extract_text``; each chunk is
    embedded with ``embed_text`` and stored through
    ``insert_document_chunks`` under ``context.tenant_id``.

    Args:
        context: Tenant context; only ``tenant_id`` is read here.
        payload: Tool arguments. Recognised keys:
            ``content`` -- required, a non-blank string to ingest.
            ``chunk_words`` -- optional, defaults to 300. Converted with
            ``int()`` and clamped to the range 50..800.

    Returns:
        A dict with ``tenant_id``, ``chunks_ingested`` (number of chunks
        stored) and ``metadata`` holding the effective ``chunk_words``.

    Raises:
        ToolValidationError: If ``content`` is missing, not a string, or
            blank; if ``chunk_words`` cannot be converted to an integer;
            or if ``extract_text`` yields no chunks.
    """
    content = payload.get("content")
    if not isinstance(content, str) or not content.strip():
        raise ToolValidationError("content must be a non-empty string")

    max_words = payload.get("chunk_words", _DEFAULT_CHUNK_WORDS)
    try:
        requested_words = int(max_words)
    except (TypeError, ValueError, OverflowError) as exc:
        # OverflowError covers float infinities, which int() rejects with a
        # different exception type than malformed strings or None.
        raise ToolValidationError(
            "chunk_words must be an integer between 50 and 800"
        ) from exc
    max_words_value = max(_MIN_CHUNK_WORDS, min(requested_words, _MAX_CHUNK_WORDS))

    chunks = extract_text(content, max_words=max_words_value)
    if not chunks:
        raise ToolValidationError("no text detected after preprocessing")

    stored = 0
    for chunk in chunks:
        vector = embed_text(chunk)
        insert_document_chunks(context.tenant_id, chunk, vector)
        stored += 1

    return {
        "tenant_id": context.tenant_id,
        "chunks_ingested": stored,
        "metadata": {"chunk_words": max_words_value},
    }