File size: 2,421 Bytes
e3994d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import re
import os
import hashlib
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional

@dataclass
class ValidationResult:
    """Outcome of a validation check: a pass/fail flag plus collected messages.

    Only ``is_valid`` is required; the message lists default to fresh empty
    lists per instance and ``is_scanned`` defaults to ``False``.
    """

    # Overall verdict of the check.
    is_valid: bool
    # Error messages; default_factory gives each instance its own list.
    errors: List[str] = field(default_factory=list)
    # Warning messages; default_factory gives each instance its own list.
    warnings: List[str] = field(default_factory=list)
    # NOTE(review): presumably flags an image-only (scanned) document with no
    # extractable text -- confirm against the code that sets it.
    is_scanned: bool = False

@dataclass
class PageData:
    """Text and geometry recorded for a single page.

    ``page_num``, ``text``, ``width`` and ``height`` are required; ``section``
    defaults to ``"Abstract"`` and ``font_sizes`` to a fresh empty dict.
    """

    # Page number. NOTE(review): 0- vs 1-based is not visible here -- confirm.
    page_num: int
    # Text content of the page.
    text: str
    # Page dimensions. NOTE(review): units (presumably PDF points) -- confirm.
    width: float
    height: float
    # Section label for the page; starts out as "Abstract" unless overridden.
    section: str = "Abstract"
    # Mapping of string key -> font size. NOTE(review): the meaning of the keys
    # is not shown in this file -- verify against the producer.
    font_sizes: Dict[str, float] = field(default_factory=dict)

@dataclass
class PaperMetadata:
    """Bibliographic metadata for a paper.

    Every field has a default, so ``PaperMetadata()`` is valid; the string
    fields fall back to ``"Unknown ..."`` placeholders rather than ``None``.
    """

    # Placeholder strings used when the corresponding value is not supplied.
    title: str = "Unknown Title"
    # Authors are held as a single string, not a list.
    authors: str = "Unknown Authors"
    # Year is stored as a string (the placeholder is non-numeric).
    year: str = "Unknown Year"
    doi: str = "Unknown DOI"
    # Page count; 0 until set.
    n_pages: int = 0
    # Path of the source file; empty string until set.
    filepath: str = ""

@dataclass
class ChildChunk:
    """A chunk of text that belongs to a parent chunk, referenced by ``parent_id``.

    The same chunk is carried in three textual forms (``text``,
    ``display_text``, ``enriched_text``) alongside free-form metadata.
    """

    # Chunk text.
    text: str
    # NOTE(review): presumably the variant shown to users -- confirm.
    display_text: str
    # NOTE(review): presumably the text augmented with extra context for
    # embedding/indexing -- confirm against the chunking code.
    enriched_text: str
    # Identifier of the parent chunk this child belongs to.
    parent_id: str
    # Arbitrary key/value metadata; the schema is not defined in this file.
    metadata: Dict[str, Any]
    # Position index of the chunk; defaults to 0.
    # NOTE(review): scope of the index (per parent vs per paper) -- confirm.
    chunk_index: int = 0

@dataclass
class ParentChunk:
    """A chunk of text together with the ``ChildChunk`` objects grouped under it.

    All four fields are required.
    """

    # Text of the parent chunk.
    text: str
    # Identifier of this parent. NOTE(review): presumably matches
    # ChildChunk.parent_id of each entry in `children` -- confirm.
    parent_id: str
    # Child chunks associated with this parent.
    children: List[ChildChunk]
    # Arbitrary key/value metadata; the schema is not defined in this file.
    metadata: Dict[str, Any]

@dataclass
class PaperResult:
    """Bundle of the artifacts held for one paper: metadata, chunks and indices.

    All fields are required. The two index fields are typed ``Any``; the
    trailing comments name the concrete types expected.
    """

    # Bibliographic metadata of the paper.
    metadata: PaperMetadata
    # Parent chunks keyed by string. NOTE(review): presumably keyed by
    # ParentChunk.parent_id -- confirm against the code that builds it.
    parent_store: Dict[str, ParentChunk]
    # Child chunks of the paper. NOTE(review): presumably in the same order as
    # the rows of the indices below -- confirm.
    children: List[ChildChunk]
    faiss_index: Any  # faiss.Index
    bm25_index: Any   # BM25Okapi
    # Identifier of the paper.
    paper_id: str

@dataclass
class UnifiedIndex:
    """A FAISS index and a BM25 index stored together with a list of child chunks.

    NOTE(review): presumably a combined index spanning several papers --
    confirm against the code that constructs it.
    """

    # Typed Any. NOTE(review): presumably a faiss index object -- confirm.
    faiss_index: Any
    # Typed Any. NOTE(review): presumably a BM25 index object -- confirm.
    bm25_index: Any
    # Child chunks held alongside the two indices.
    children: List[ChildChunk]

def snap_to_sentence(text: str, direction: str = "end", min_chars: int = 20) -> str:
    """Trim ``text`` to a nearby sentence boundary.

    A boundary is a ``.``, ``!`` or ``?`` followed by at least one whitespace
    character. This is a heuristic: abbreviations such as ``"e.g. "`` also
    count as boundaries.

    Args:
        text: The text to trim.
        direction: ``"end"`` drops everything after the last boundary (the
            whitespace that closes the boundary is kept). ``"start"`` drops
            everything up to and including the first boundary. Any other
            value returns ``text`` unchanged.
        min_chars: The trimmed text is used only when its stripped length is
            strictly greater than this value; otherwise ``text`` is returned
            unchanged. Defaults to 20.

    Returns:
        The trimmed text, or the original ``text`` when no boundary exists or
        the trimmed result would be too short.
    """
    boundary = r'[.!?]\s+'

    if direction == "end":
        # Walk the matches and keep only the final one; no list is built.
        last = None
        for last in re.finditer(boundary, text):
            pass
        snapped = None if last is None else text[:last.end()]
    elif direction == "start":
        first = re.search(boundary, text)
        snapped = None if first is None else text[first.end():]
    else:
        # Unknown directions are a deliberate pass-through, not an error.
        return text

    # Guard against snapping down to a tiny fragment.
    if snapped is not None and len(snapped.strip()) > min_chars:
        return snapped
    return text

def generate_paper_id(filepath: str) -> str:
    """Generate a deterministic ID from the filename.

    The ID is ``"<stem>_<hash6>"``. ``stem`` is the file's basename with a
    trailing ``.pdf`` removed, and ``hash6`` is the first six hex digits of
    the MD5 of the full basename. Only the basename is used, so the same
    filename in different directories gives the same ID.

    Args:
        filepath: Path (or bare filename) of the paper.

    Returns:
        The identifier string.
    """
    basename = os.path.basename(filepath)
    # MD5 serves only as a short, stable fingerprint here, not for security.
    digest = hashlib.md5(basename.encode("utf-8")).hexdigest()[:6]

    # Strip only the trailing extension. A blanket str.replace would also
    # delete ".pdf" from the middle of names such as "my.pdf.notes.pdf".
    suffix = ".pdf"
    stem = basename[:-len(suffix)] if basename.endswith(suffix) else basename
    return f"{stem}_{digest}"

def ensure_data_dirs(base_dir: str = "data/indices") -> None:
    """Create ``base_dir`` and any missing parent directories.

    Calling this when the directory already exists is a no-op.

    Args:
        base_dir: Directory to create; defaults to ``"data/indices"``.
    """
    os.makedirs(name=base_dir, exist_ok=True)