from __future__ import annotations
from typing import Mapping
from backend.api.utils.text_extractor import extract_text
from backend.mcp_server.common.database import insert_document_chunks
from backend.mcp_server.common.embeddings import embed_text
from backend.mcp_server.common.tenant import TenantContext
from backend.mcp_server.common.utils import ToolValidationError, tool_handler
@tool_handler("rag.ingest")
async def rag_ingest(context: TenantContext, payload: Mapping[str, object]) -> dict[str, object]:
    """
    Ingest raw text into the tenant's knowledge base.

    Payload keys:
        content (str, required): raw text to ingest; must be non-empty
            after stripping whitespace.
        chunk_words (int, optional, default 300): target words per chunk.
            Values outside [50, 800] are clamped into that range rather
            than rejected.

    Returns:
        dict with ``tenant_id``, ``chunks_ingested`` (number of chunks
        stored), and ``metadata`` echoing the effective ``chunk_words``.

    Raises:
        ToolValidationError: if ``content`` is missing/empty/not a string,
            if ``chunk_words`` cannot be converted to an integer, or if
            preprocessing yields no text.
    """
    content = payload.get("content")
    if not isinstance(content, str) or not content.strip():
        raise ToolValidationError("content must be a non-empty string")

    max_words = payload.get("chunk_words", 300)
    # bool is a subclass of int, so int(True) == 1 would otherwise be
    # silently accepted and clamped to 50 — reject it explicitly.
    if isinstance(max_words, bool):
        raise ToolValidationError("chunk_words must be an integer between 50 and 800")
    try:
        # Clamp out-of-range values into [50, 800] (existing contract:
        # clamp, not reject).
        max_words_value = max(50, min(int(max_words), 800))
    except (TypeError, ValueError) as exc:
        # Chain the conversion error so the root cause survives in
        # tracebacks/logs instead of being swallowed.
        raise ToolValidationError(
            "chunk_words must be an integer between 50 and 800"
        ) from exc

    chunks = extract_text(content, max_words=max_words_value)
    if not chunks:
        raise ToolValidationError("no text detected after preprocessing")

    stored = 0
    for chunk in chunks:
        # One embedding + one insert per chunk, scoped to the tenant.
        vector = embed_text(chunk)
        insert_document_chunks(context.tenant_id, chunk, vector)
        stored += 1

    return {
        "tenant_id": context.tenant_id,
        "chunks_ingested": stored,
        "metadata": {"chunk_words": max_words_value},
    }