"""
Data models for embeddings and vector storage.

This module defines Pydantic models for embedding metadata.
"""

from pydantic import BaseModel, ConfigDict, Field


class EmbeddingMetadata(BaseModel):
    """Metadata stored with each embedding in ChromaDB.

    Every field is a plain ``str`` or ``int``. Identifiers are carried as
    strings, and the list of page numbers is carried as a JSON-encoded
    string rather than a native list.

    All fields are required except ``parent_id``, which defaults to the
    empty string (used for parent chunks, which have no parent of their
    own). Integer fields are validated to be non-negative (``ge=0``).

    NOTE(review): no cross-field validation is performed here -- e.g.
    ``end_char >= start_char`` and ``chunk_type in {"parent", "child"}``
    are not enforced by this model; confirm callers guarantee them.
    """

    # Pydantic v2 configuration. Replaces the deprecated nested
    # ``class Config``; the example is attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "chunk_id": "uuid-string",
                "document_id": "doc-uuid-string",
                "parent_id": "parent-uuid-string",
                "filename": "sample.pdf",
                "file_hash": "abc123...",
                "page_numbers": "[1, 2]",
                "chunk_index": 0,
                "chunk_type": "child",
                "token_count": 800,
                "start_char": 0,
                "end_char": 1000,
                "ingestion_date": "2024-01-01T12:00:00",
            }
        }
    )

    # --- Identity -----------------------------------------------------
    chunk_id: str = Field(..., description="Chunk UUID as string")
    document_id: str = Field(..., description="Document UUID as string")
    parent_id: str = Field(
        default="",
        description="Parent chunk UUID (empty for parent chunks)",
    )

    # --- Source file --------------------------------------------------
    filename: str = Field(..., description="Original PDF filename")
    file_hash: str = Field(..., description="SHA256 hash of source file")
    page_numbers: str = Field(
        ...,
        description="JSON-encoded list of page numbers",
    )

    # --- Position and size within the document ------------------------
    chunk_index: int = Field(..., ge=0, description="Position in document")
    chunk_type: str = Field(..., description="'parent' or 'child'")
    token_count: int = Field(..., ge=0, description="Number of tokens")
    start_char: int = Field(
        ...,
        ge=0,
        description="Start position in document",
    )
    end_char: int = Field(..., ge=0, description="End position in document")

    # --- Provenance ---------------------------------------------------
    ingestion_date: str = Field(
        ...,
        description="ISO datetime of ingestion",
    )