| """ | |
| Data models for embeddings and vector storage. | |
| This module defines Pydantic models for embedding metadata. | |
| """ | |
from pydantic import BaseModel, ConfigDict, Field
class EmbeddingMetadata(BaseModel):
    """Metadata stored with each embedding in ChromaDB.

    Every field is a plain ``str`` or ``int``. The page list is carried as a
    JSON-encoded string (``page_numbers``) rather than a native list, and the
    UUIDs and ingestion timestamp are carried as strings.
    NOTE(review): presumably this flat, scalar-only shape exists because the
    vector store only accepts scalar metadata values -- confirm against the
    storage layer before changing any field type.

    All fields are required except ``parent_id``, which defaults to the empty
    string. The integer fields are validated as non-negative (``ge=0``).
    """

    # Schema-level configuration. Declared through ``model_config`` because the
    # nested ``class Config`` form is deprecated in Pydantic v2 and emits a
    # deprecation warning when the model class is created.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "chunk_id": "uuid-string",
                "document_id": "doc-uuid-string",
                "parent_id": "parent-uuid-string",
                "filename": "sample.pdf",
                "file_hash": "abc123...",
                "page_numbers": "[1, 2]",
                "chunk_index": 0,
                "chunk_type": "child",
                "token_count": 800,
                "start_char": 0,
                "end_char": 1000,
                "ingestion_date": "2024-01-01T12:00:00",
            }
        }
    )

    # --- Identity -----------------------------------------------------------
    chunk_id: str = Field(..., description="Chunk UUID as string")
    document_id: str = Field(..., description="Document UUID as string")
    # Empty string (not None) marks a chunk that has no parent.
    parent_id: str = Field(
        default="",
        description="Parent chunk UUID (empty for parent chunks)",
    )

    # --- Source file --------------------------------------------------------
    filename: str = Field(..., description="Original PDF filename")
    file_hash: str = Field(..., description="SHA256 hash of source file")
    # Serialized list; callers must JSON-decode it to get the page numbers.
    page_numbers: str = Field(..., description="JSON-encoded list of page numbers")

    # --- Position and size within the document ------------------------------
    chunk_index: int = Field(..., ge=0, description="Position in document")
    chunk_type: str = Field(..., description="'parent' or 'child'")
    token_count: int = Field(..., ge=0, description="Number of tokens")
    start_char: int = Field(..., ge=0, description="Start position in document")
    end_char: int = Field(..., ge=0, description="End position in document")

    # --- Provenance ---------------------------------------------------------
    ingestion_date: str = Field(..., description="ISO datetime of ingestion")