File size: 4,455 Bytes
822c114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import re
import hashlib
from pathlib import Path
from dataclasses import dataclass
from typing import Optional

try:
    from pypdf import PdfReader
    HAS_PYPDF = True
except ImportError:
    HAS_PYPDF = False


@dataclass
class ParsedDocument:
    """Result of parsing one document: extracted text plus detected structure."""
    title: str  # best-guess title (first plausible line, else derived from filename)
    full_text: str  # all page texts joined with blank lines
    sections: list[dict]  # dicts with "title", "content", "start", "end" keys (see detect_sections)
    page_count: int  # number of pages the text was extracted from


def extract_title(text: str, filename: str) -> str:
    """Guess a document title from its opening lines, else derive one from *filename*.

    Scans the first ten lines for a plausible heading: strictly between 20 and
    200 characters and not starting like a URL/DOI. Falls back to the filename
    stem with underscores/hyphens spaced out and title-cased.
    """
    for raw in text.strip().split('\n')[:10]:
        candidate = raw.strip()
        plausible_length = 20 < len(candidate) < 200
        looks_like_link = candidate.startswith(('http', 'www', 'doi'))
        if plausible_length and not looks_like_link:
            return candidate
    stem = Path(filename).stem
    return stem.replace('_', ' ').replace('-', ' ').title()


def detect_sections(text: str) -> list[dict]:
    """Split *text* into sections keyed on common academic headings.

    A heading is a whole line matching one of the known section names
    (optionally numbered), case-insensitively. Returns dicts with "title",
    "content", "start", "end"; when no heading is found (or every section
    body is empty) a single catch-all "Content" section covers the text.
    """
    heading_re = re.compile(
        r'^(?:(\d+\.?\s*)?)(Abstract|Introduction|Background|Related Work|'
        r'Methodology|Methods|Method|Approach|Model|Architecture|'
        r'Experiments?|Results?|Discussion|Conclusion|Conclusions|'
        r'References|Acknowledgments?|Appendix)\s*$',
        re.IGNORECASE | re.MULTILINE
    )

    hits = list(heading_re.finditer(text))
    fallback = [{"title": "Content", "content": text, "start": 0, "end": len(text)}]
    if not hits:
        return fallback

    # Each section runs from the end of its heading to the start of the next
    # heading (or end of text for the last one).
    ends = [nxt.start() for nxt in hits[1:]] + [len(text)]
    found = []
    for hit, end in zip(hits, ends):
        begin = hit.end()
        body = text[begin:end].strip()
        if not body:
            continue  # skip headings with no text under them
        found.append({
            "title": hit.group(2).strip(),
            "content": body,
            "start": begin,
            "end": end,
        })

    return found or fallback


def ingest_pdf(file_path: Path) -> Optional[ParsedDocument]:
    """Read a PDF from disk and return a ParsedDocument, or None on any failure.

    Returns None when pypdf is not installed, when parsing raises, or when
    the extracted text is too short (< 100 chars) to be worth indexing.
    """
    if not HAS_PYPDF:
        return None

    try:
        page_texts = [
            page.extract_text() or ""  # extract_text may return None for image-only pages
            for page in PdfReader(str(file_path)).pages
        ]
        combined = "\n\n".join(page_texts)

        # Reject near-empty extractions (e.g. scanned PDFs with no text layer).
        if len(combined.strip()) < 100:
            return None

        return ParsedDocument(
            title=extract_title(combined, file_path.name),
            full_text=combined,
            sections=detect_sections(combined),
            page_count=len(page_texts),
        )
    except Exception:
        # Best-effort ingestion: any parsing error simply yields no document.
        return None


def chunk_document(doc: ParsedDocument, paper_id: str, chunk_size: int = 2000) -> list[dict]:
    """Split a parsed document into retrieval chunks of roughly *chunk_size* chars.

    Sections that fit within chunk_size become one chunk; longer sections are
    packed greedily on paragraph ('\\n\\n') boundaries. NOTE: a single paragraph
    longer than chunk_size is still emitted as one oversized chunk — it is
    never split mid-paragraph.

    Args:
        doc: the parsed document to chunk.
        paper_id: stable identifier mixed into each chunk id.
        chunk_size: soft maximum chunk length in characters.

    Returns:
        A list of DocumentChunk objects (declared in vector_store).
    """
    from vector_store import DocumentChunk

    def _make_chunk(section_title: str, stored: str, hashed: Optional[str] = None):
        # Build one DocumentChunk. The id hashes paper, section, and a 100-char
        # text prefix so re-ingesting identical text yields stable ids. *hashed*
        # lets the caller seed the id from the un-stripped paragraph buffer,
        # preserving the ids produced before this refactor.
        basis = stored if hashed is None else hashed
        chunk_id = hashlib.md5(
            f"{paper_id}:{section_title}:{basis[:100]}".encode()
        ).hexdigest()
        return DocumentChunk(
            chunk_id=chunk_id,
            paper_id=paper_id,
            paper_name=doc.title,
            content=stored,
            section_title=section_title,
        )

    chunks = []
    for section in doc.sections:
        content = section["content"]
        section_title = section["title"]

        if len(content) <= chunk_size:
            chunks.append(_make_chunk(section_title, content))
            continue

        # Greedily pack paragraphs until adding the next one would overflow.
        current = ""
        for para in content.split('\n\n'):
            if len(current) + len(para) <= chunk_size:
                current += para + "\n\n"
            else:
                if current.strip():
                    chunks.append(_make_chunk(section_title, current.strip(), hashed=current))
                current = para + "\n\n"

        # Flush whatever remains in the buffer.
        if current.strip():
            chunks.append(_make_chunk(section_title, current.strip(), hashed=current))

    return chunks