File size: 1,769 Bytes
9b457ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
"""
Data models for embeddings and vector storage.

This module defines Pydantic models for embedding metadata.
"""

from pydantic import BaseModel, ConfigDict, Field


class EmbeddingMetadata(BaseModel):
    """Metadata stored with each embedding in ChromaDB.

    Every field is a plain ``str`` or ``int``. Identifiers are carried as
    strings and the page list is carried as a JSON-encoded string rather
    than a native list. All fields are required except ``parent_id``,
    which defaults to an empty string.
    """

    # Pydantic v2 configuration. The class-based ``Config`` form is
    # deprecated in v2, so the schema example is attached via ``model_config``.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "chunk_id": "uuid-string",
                "document_id": "doc-uuid-string",
                "parent_id": "parent-uuid-string",
                "filename": "sample.pdf",
                "file_hash": "abc123...",
                "page_numbers": "[1, 2]",
                "chunk_index": 0,
                "chunk_type": "child",
                "token_count": 800,
                "start_char": 0,
                "end_char": 1000,
                "ingestion_date": "2024-01-01T12:00:00",
            }
        }
    )

    # --- Identity -----------------------------------------------------------
    chunk_id: str = Field(..., description="Chunk UUID as string")
    document_id: str = Field(..., description="Document UUID as string")
    # Empty string (the default) marks a chunk that has no parent.
    parent_id: str = Field(default="", description="Parent chunk UUID (empty for parent chunks)")

    # --- Source file --------------------------------------------------------
    filename: str = Field(..., description="Original PDF filename")
    file_hash: str = Field(..., description="SHA256 hash of source file")
    # Stored as a JSON string (e.g. "[1, 2]"); consumers must decode it.
    page_numbers: str = Field(..., description="JSON-encoded list of page numbers")

    # --- Position and size (all constrained to be non-negative) -------------
    chunk_index: int = Field(..., ge=0, description="Position in document")
    # NOTE(review): only described as 'parent' or 'child'; the value is not
    # validated against that set here.
    chunk_type: str = Field(..., description="'parent' or 'child'")
    token_count: int = Field(..., ge=0, description="Number of tokens")
    # NOTE(review): no check that end_char >= start_char is enforced here.
    start_char: int = Field(..., ge=0, description="Start position in document")
    end_char: int = Field(..., ge=0, description="End position in document")

    # --- Provenance ---------------------------------------------------------
    # Kept as a string; the ISO format is not validated by this model.
    ingestion_date: str = Field(..., description="ISO datetime of ingestion")