File size: 741 Bytes
f258251
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7612224
 
 
 
 
 
f258251
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from __future__ import annotations

import hashlib
from enum import Enum
from typing import NewType

# Distinct static types layered over plain builtins. NewType only affects type
# checking: at runtime, calling one of these returns its argument unchanged.

# An embedding represented as a list of floats.
EmbeddingVector = NewType("EmbeddingVector", list[float])
# Identifier string for a chunk (see build_chunk_id for how one is composed).
ChunkId = NewType("ChunkId", str)
# Identifier string for a document (see compute_doc_id for how one is derived).
DocId = NewType("DocId", str)


class Language(str, Enum):
    """Closed set of language codes.

    Members also inherit from ``str``, so each one compares equal to its
    plain string value (e.g. ``Language.EN == "en"``).

    NOTE(review): the values look like ISO 639-1 codes (English and
    Indonesian) — confirm against the callers.
    """

    EN = "en"
    ID = "id"


class GroundingStatus(str, Enum):
    """Three-state grounding label: grounded, partial, or unsupported.

    Members also inherit from ``str``, so each one compares equal to its
    plain string value (e.g. ``GroundingStatus.PARTIAL == "partial"``).

    NOTE(review): presumably describes how well some generated output is
    backed by source material — confirm against the callers.
    """

    GROUNDED = "grounded"
    PARTIAL = "partial"
    UNSUPPORTED = "unsupported"


# Complexity: Time O(n) in len(file_bytes) | Space O(1) extra — a single
# SHA-256 pass over the buffer that is already in memory
def compute_doc_id(file_bytes: bytes) -> DocId:
    """Derive a document identifier from raw file content.

    The identifier is the lowercase hexadecimal SHA-256 digest of
    ``file_bytes``, so identical content always produces the same id.

    Args:
        file_bytes: The complete content to fingerprint.

    Returns:
        The 64-character hex digest, typed as ``DocId``.
    """
    hasher = hashlib.sha256()
    hasher.update(file_bytes)
    digest_hex = hasher.hexdigest()
    return DocId(digest_hex)


# Complexity: Time O(1) | Space O(1) — at most 16 characters of doc_id are copied
def build_chunk_id(doc_id: DocId, page: int, index: int) -> ChunkId:
    """Compose a chunk identifier from a document id, a page and an index.

    The result has the form ``<doc prefix>_<page>_<index>``, where the doc
    prefix is the first 16 characters of ``doc_id`` (or all of it when it
    is shorter than that).

    Args:
        doc_id: Identifier of the document the chunk belongs to.
        page: Page number rendered into the id.
        index: Chunk index rendered into the id.

    Returns:
        The composed identifier, typed as ``ChunkId``.
    """
    doc_prefix = doc_id[:16]
    chunk_key = f"{doc_prefix}_{page}_{index}"
    return ChunkId(chunk_key)