Spaces:
Running
Running
| """Shared data models used across the project.""" | |
| from dataclasses import dataclass, field | |
| from enum import Enum | |
class ChunkStrategy(Enum):
    """Text chunking strategies a document chunk can be produced with.

    Every member's value is a lowercase string identifier
    (``"fixed_size"``, ``"recursive"``, ``"semantic"``), so a member can be
    looked up from its string form with ``ChunkStrategy(value)``.
    """

    FIXED_SIZE = "fixed_size"
    RECURSIVE = "recursive"
    SEMANTIC = "semantic"
class IntentType(Enum):
    """Intent labels assigned to incoming queries by classification.

    Every member's value is the lowercase string form of its name, so a
    member can be looked up from its string form with ``IntentType(value)``.
    ``UNKNOWN`` is the member available for queries that match no other
    label.
    """

    FACTUAL = "factual"
    RAG = "rag"
    SUMMARY = "summary"
    COMPARISON = "comparison"
    PROCEDURAL = "procedural"
    UNKNOWN = "unknown"
@dataclass
class DocumentChunk:
    """A single chunk of text extracted from a PDF document.

    Declared as a dataclass so the annotated fields become constructor
    parameters and ``field(default_factory=dict)`` is honoured: each
    instance created without ``metadata`` gets its own empty dict rather
    than a shared one.

    Attributes:
        chunk_id: Unique identifier for the chunk.
        document_id: Identifier of the source document.
        text: The chunk text content.
        metadata: Additional metadata (page number, source file, etc.).
            Defaults to a fresh empty dict per instance.
        strategy: The chunking strategy used to produce this chunk.
            Defaults to ``ChunkStrategy.FIXED_SIZE``.
    """

    chunk_id: str
    document_id: str
    text: str
    metadata: dict[str, str | int] = field(default_factory=dict)
    strategy: ChunkStrategy = ChunkStrategy.FIXED_SIZE
@dataclass
class QueryResult:
    """A single retrieval result with relevance scoring.

    Declared as a dataclass so ``chunk``, ``score`` and ``source`` are
    constructor parameters; ``to_dict`` reads all three from the instance.

    Attributes:
        chunk: The retrieved document chunk.
        score: Relevance score (higher is better).
        source: Which retrieval method produced this result.
    """

    chunk: DocumentChunk
    score: float
    source: str

    def to_dict(self, *, include_text: bool = True) -> dict:
        """Serialise to a JSON-safe dictionary.

        Args:
            include_text: Whether to include the chunk text (default True).

        Returns:
            Dictionary with chunk_id, document_id, score, source, metadata,
            and optionally text.
        """
        payload: dict = {
            "chunk_id": self.chunk.chunk_id,
            "document_id": self.chunk.document_id,
            "score": self.score,
            "source": self.source,
            # NOTE: the chunk's metadata dict is passed through by reference,
            # not copied; mutating it in the result also mutates the chunk.
            "metadata": self.chunk.metadata,
        }
        if include_text:
            payload["text"] = self.chunk.text
        return payload
@dataclass
class PipelineDetails:
    """Intermediate pipeline data for debugging and transparency.

    Declared as a dataclass so every field is an optional constructor
    parameter and each ``field(default_factory=list)`` yields a fresh list
    per instance. ``PipelineDetails()`` with no arguments is therefore a
    valid, empty record.

    Attributes:
        original_query: The user's original query text.
        retrieval_query: The query used for retrieval (may be translated).
        detected_language: Detected language of the original query.
        translated: Whether the query was translated for retrieval.
        dense_results: Results from dense (vector) retrieval.
        sparse_results: Results from sparse (BM25) retrieval.
        fused_results: Results after reciprocal rank fusion.
        reranked_results: Results after cross-encoder reranking.
        plan_steps: Ordered descriptions of planned steps (Plan-and-Execute mode).
        tool_calls: Log of tool invocations as "tool_name: argument" strings.
    """

    original_query: str = ""
    retrieval_query: str = ""
    detected_language: str = ""
    translated: bool = False
    dense_results: list[QueryResult] = field(default_factory=list)
    sparse_results: list[QueryResult] = field(default_factory=list)
    fused_results: list[QueryResult] = field(default_factory=list)
    reranked_results: list[QueryResult] = field(default_factory=list)
    plan_steps: list[str] = field(default_factory=list)
    tool_calls: list[str] = field(default_factory=list)
@dataclass
class GenerationResponse:
    """Structured response from the generation pipeline.

    Declared as a dataclass so ``answer``, ``sources``, ``intent`` and
    ``confidence`` are required constructor parameters, while
    ``pipeline_details`` falls back to a freshly constructed, empty
    ``PipelineDetails`` when omitted.

    Attributes:
        answer: The generated answer text.
        sources: List of source chunks used for generation.
        intent: Classified intent of the original query.
        confidence: Model confidence in the answer (0.0-1.0).
        pipeline_details: Optional intermediate pipeline data.
    """

    answer: str
    sources: list[QueryResult]
    intent: IntentType
    confidence: float
    pipeline_details: PipelineDetails = field(default_factory=PipelineDetails)