import os
from typing import Type, Optional, Any
from pydantic import BaseModel, Field
from markitdown import MarkItDown
from chonkie import SemanticChunker
from qdrant_client import QdrantClient

try:
    from crewai.tools import BaseTool
except ImportError:
    from langchain.tools import BaseTool

class SearchInput(BaseModel):
    query: str = Field(..., description="Search query")

class DocumentSearchTool(BaseTool):
    name: str = "DocumentSearchTool"
    description: str = "Search uploaded document for relevant passages."
    args_schema: Type[BaseModel] = SearchInput
    file_path: Optional[str] = None
    client: Optional[Any] = None

    COLLECTION  : str   = "neuraldocs_collection"
    EMBED_MODEL : str   = "minishlab/potion-base-8M"
    CHUNK_SIZE  : int   = 128
    SIMILARITY_T: float = 0.5
    TOP_K       : int   = 2
    SEPARATOR   : str   = "\n---\n"

    def __init__(self, file_path: str):
        super().__init__(file_path=file_path, client=QdrantClient(":memory:"))
        self._build_index()

    def _to_text(self) -> str:
        converter = MarkItDown()
        result = converter.convert(self.file_path)
        text = result.text_content.strip()
        if not text:
            raise ValueError(f"Could not extract text from '{self.file_path}'.")
        return text[:5000]

    def _chunk(self, text: str) -> list:
        chunker = SemanticChunker(
            embedding_model=self.EMBED_MODEL,
            threshold=self.SIMILARITY_T,
            chunk_size=self.CHUNK_SIZE,
            min_sentences=1,
        )
        return [c.text for c in chunker.chunk(text) if c.text.strip()]

    def _build_index(self) -> None:
        chunks = self._chunk(self._to_text())
        source_name = os.path.basename(self.file_path)
        self.client.add(
            collection_name=self.COLLECTION,
            documents=chunks,
            metadata=[{"source": source_name, "chunk_id": i} for i in range(len(chunks))],
            ids=list(range(len(chunks))),
        )

    def _run(self, query: str) -> str:
        hits = self.client.query(
            collection_name=self.COLLECTION,
            query_text=query,
            limit=self.TOP_K,
        )
        passages = [h.document for h in hits if h.document and h.document.strip()]
        if not passages:
            return "No relevant passages found in the document."
        return self.SEPARATOR.join(passages)