from __future__ import annotations
from typing import Mapping
from backend.api.utils.text_extractor import extract_text
from backend.mcp_server.common.database import insert_document_chunks
from backend.mcp_server.common.embeddings import embed_text
from backend.mcp_server.common.tenant import TenantContext
from backend.mcp_server.common.utils import ToolValidationError, tool_handler
@tool_handler("rag.ingest")
async def rag_ingest(context: TenantContext, payload: Mapping[str, object]) -> dict[str, object]:
    """
    Ingest raw text into the tenant's knowledge base.

    Payload keys:
        content (str, required): raw text to ingest; must be non-empty
            after stripping whitespace.
        chunk_words (int, optional, default 300): target words per chunk.
            Values outside [50, 800] are clamped into that range rather
            than rejected.

    Returns:
        dict with ``tenant_id``, ``chunks_ingested`` (number of chunks
        stored), and ``metadata`` echoing the effective ``chunk_words``.

    Raises:
        ToolValidationError: if ``content`` is missing/empty/not a string,
            if ``chunk_words`` cannot be converted to an integer, or if
            preprocessing yields no text.
    """
    content = payload.get("content")
    if not isinstance(content, str) or not content.strip():
        raise ToolValidationError("content must be a non-empty string")

    max_words = payload.get("chunk_words", 300)
    # bool is a subclass of int, so int(True) == 1 would otherwise be
    # silently accepted and clamped to 50 — reject it explicitly.
    if isinstance(max_words, bool):
        raise ToolValidationError("chunk_words must be an integer between 50 and 800")
    try:
        # Clamp out-of-range values into [50, 800] (existing contract:
        # clamp, not reject).
        max_words_value = max(50, min(int(max_words), 800))
    except (TypeError, ValueError) as exc:
        # Chain the conversion error so the root cause survives in
        # tracebacks/logs instead of being swallowed.
        raise ToolValidationError(
            "chunk_words must be an integer between 50 and 800"
        ) from exc

    chunks = extract_text(content, max_words=max_words_value)
    if not chunks:
        raise ToolValidationError("no text detected after preprocessing")

    stored = 0
    for chunk in chunks:
        # One embedding + one insert per chunk, scoped to the tenant.
        vector = embed_text(chunk)
        insert_document_chunks(context.tenant_id, chunk, vector)
        stored += 1

    return {
        "tenant_id": context.tenant_id,
        "chunks_ingested": stored,
        "metadata": {"chunk_words": max_words_value},
    }