zeta / src /embedding /models.py
rodrigo-moonray
Deploy zeta-only embeddings (NV-Embed-v2 + E5-small)
9b457ed
"""
Data models for embeddings and vector storage.
This module defines Pydantic models for embedding metadata.
"""
from pydantic import BaseModel, ConfigDict, Field
class EmbeddingMetadata(BaseModel):
    """Metadata stored with each embedding in ChromaDB.

    All fields are plain ``str`` or ``int`` values. ``page_numbers`` carries
    a JSON-encoded list as a string, and ``parent_id`` defaults to the empty
    string (used for parent chunks, which have no parent of their own).

    Only per-field constraints are enforced: the integer fields must be
    ``>= 0``. No cross-field checks are performed -- ``end_char`` is not
    compared against ``start_char``, and ``chunk_type`` is not restricted
    to the two documented values.
    """

    # Pydantic v2 configuration. This replaces the deprecated nested
    # ``class Config`` while publishing the same JSON-schema example.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "chunk_id": "uuid-string",
                "document_id": "doc-uuid-string",
                "parent_id": "parent-uuid-string",
                "filename": "sample.pdf",
                "file_hash": "abc123...",
                "page_numbers": "[1, 2]",
                "chunk_index": 0,
                "chunk_type": "child",
                "token_count": 800,
                "start_char": 0,
                "end_char": 1000,
                "ingestion_date": "2024-01-01T12:00:00",
            }
        }
    )

    # --- Identity ---
    chunk_id: str = Field(..., description="Chunk UUID as string")
    document_id: str = Field(..., description="Document UUID as string")
    parent_id: str = Field(default="", description="Parent chunk UUID (empty for parent chunks)")

    # --- Source file ---
    filename: str = Field(..., description="Original PDF filename")
    file_hash: str = Field(..., description="SHA256 hash of source file")
    page_numbers: str = Field(..., description="JSON-encoded list of page numbers")

    # --- Position and size within the document ---
    chunk_index: int = Field(..., ge=0, description="Position in document")
    chunk_type: str = Field(..., description="'parent' or 'child'")
    token_count: int = Field(..., ge=0, description="Number of tokens")
    start_char: int = Field(..., ge=0, description="Start position in document")
    end_char: int = Field(..., ge=0, description="End position in document")

    # --- Provenance ---
    ingestion_date: str = Field(..., description="ISO datetime of ingestion")