Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from typing import Mapping | |
| from backend.api.utils.text_extractor import extract_text | |
| from backend.mcp_server.common.database import insert_document_chunks | |
| from backend.mcp_server.common.embeddings import embed_text | |
| from backend.mcp_server.common.tenant import TenantContext | |
| from backend.mcp_server.common.utils import ToolValidationError, tool_handler | |
async def rag_ingest(context: TenantContext, payload: Mapping[str, object]) -> dict[str, object]:
    """Ingest raw text into the tenant's knowledge base.

    Splits the text into chunks, embeds each chunk, and stores the
    resulting vectors scoped to the caller's tenant.

    Args:
        context: Tenant scoping for all database writes.
        payload: Tool arguments. Expected keys:
            - "content" (required): non-empty string to ingest.
            - "chunk_words" (optional): target words per chunk; values
              outside [50, 800] are silently clamped. Defaults to 300.

    Returns:
        Summary dict with "tenant_id", "chunks_ingested", and "metadata"
        (the effective chunk size actually used).

    Raises:
        ToolValidationError: if "content" is missing/empty/non-string, if
            "chunk_words" cannot be coerced to an integer, or if
            preprocessing yields no text.
    """
    content = payload.get("content")
    if not isinstance(content, str) or not content.strip():
        raise ToolValidationError("content must be a non-empty string")

    max_words = payload.get("chunk_words", 300)
    try:
        # Out-of-range values are clamped rather than rejected; only a
        # non-integer input is treated as a hard error.
        max_words_value = max(50, min(int(max_words), 800))
    except (TypeError, ValueError) as exc:
        # Chain the original conversion error so the root cause survives
        # in tracebacks (PEP 3134 / ruff B904).
        raise ToolValidationError(
            "chunk_words must be an integer between 50 and 800"
        ) from exc

    chunks = extract_text(content, max_words=max_words_value)
    if not chunks:
        raise ToolValidationError("no text detected after preprocessing")

    # NOTE(review): embed_text / insert_document_chunks are invoked
    # synchronously inside an async handler — confirm they are non-blocking
    # or consider offloading them to a thread/executor.
    stored = 0
    for chunk in chunks:
        vector = embed_text(chunk)
        insert_document_chunks(context.tenant_id, chunk, vector)
        stored += 1

    return {
        "tenant_id": context.tenant_id,
        "chunks_ingested": stored,
        "metadata": {"chunk_words": max_words_value},
    }