| """Data models for document processing""" | |
| from pydantic import BaseModel, Field | |
| from typing import Optional, List | |
class DocumentChunk(BaseModel):
    """Represents a single document chunk with metadata"""

    # --- Required fields: identity and text payload ---
    chunk_id: str = Field(
        ...,
        description="Unique identifier for the chunk",
    )
    content: str = Field(
        ...,
        description="The actual text content",
    )
    source_doc: str = Field(
        ...,
        description="Original document source/filename",
    )

    # --- Required fields: position of the chunk within its source ---
    chunk_index: int = Field(
        ...,
        description="Index of this chunk in the document",
    )
    # NOTE(review): whether end_char is inclusive or exclusive is not
    # established by this model, and no validator ties start_char to
    # end_char -- confirm against the code that produces chunks.
    start_char: int = Field(
        ...,
        description="Starting character position in original document",
    )
    end_char: int = Field(
        ...,
        description="Ending character position in original document",
    )
    token_count: int = Field(
        ...,
        description="Number of tokens in this chunk",
    )

    # --- Optional fields ---
    # default_factory gives each instance its own fresh dict.
    metadata: dict = Field(
        default_factory=dict,
        description="Additional metadata",
    )
    # Defaults to None when no embedding is supplied.
    embedding: Optional[List[float]] = Field(
        default=None,
        description="Vector embedding (optional)",
    )
class Document(BaseModel):
    """Represents a source document"""

    # --- Required fields ---
    doc_id: str = Field(
        ...,
        description="Unique document identifier",
    )
    filename: str = Field(
        ...,
        description="Source filename",
    )
    content: str = Field(
        ...,
        description="Full document content",
    )

    # --- Fields with defaults ---
    # Falls back to "product_manual" when the caller does not pass doc_type.
    doc_type: str = Field(
        "product_manual",
        description="Type of document",
    )
    # default_factory gives each instance its own fresh dict.
    metadata: dict = Field(
        default_factory=dict,
        description="Document-level metadata",
    )
class RetrievalResult(BaseModel):
    """Result from retrieval"""

    # Plain required fields (no default, no Field metadata). The names match
    # fields on DocumentChunk; presumably they are copied from the matched
    # chunk -- verify against the retrieval code.
    chunk_id: str
    content: str
    source_doc: str

    # NOTE: the 0-1 range is stated in the description only; nothing in this
    # model constrains the value.
    score: float = Field(
        ...,
        description="Relevance score (0-1)",
    )
    # Free-form string; the set of valid search types is not defined here.
    search_type: str = Field(
        ...,
        description="Type of search that returned this result",
    )