# stochastic/document.py
# Author: Sonu Prasad — initial commit (822c114)
import re
import hashlib
from pathlib import Path
from dataclasses import dataclass
from typing import Optional
try:
from pypdf import PdfReader
HAS_PYPDF = True
except ImportError:
HAS_PYPDF = False
@dataclass
class ParsedDocument:
    """Structured result of parsing a source document (e.g. a PDF)."""
    title: str  # extracted title, or a cleaned-up filename fallback
    full_text: str  # concatenated text of all pages, pages joined by blank lines
    sections: list[dict]  # [{"title", "content", "start", "end"}, ...] from detect_sections
    page_count: int  # number of pages the text was extracted from
def extract_title(text: str, filename: str) -> str:
    """Pick a plausible document title.

    Scans the first ten lines for one of reasonable title length that is
    not a URL/DOI; otherwise falls back to a prettified file-name stem.
    """
    candidates = (raw.strip() for raw in text.strip().split('\n')[:10])
    for candidate in candidates:
        looks_like_link = candidate.startswith(('http', 'www', 'doi'))
        if not looks_like_link and 20 < len(candidate) < 200:
            return candidate
    # Fallback: "my_paper-final" -> "My Paper Final"
    stem = Path(filename).stem
    return stem.replace('_', ' ').replace('-', ' ').title()
def detect_sections(text: str) -> list[dict]:
    """Slice *text* into sections at standard paper headings.

    A heading is a line containing only an optional numeric prefix plus a
    known section name. Each returned dict has "title", "content", and the
    "start"/"end" character offsets of the content within *text*. If no
    heading is found (or every section body is empty), the whole text is
    returned as a single "Content" section.
    """
    heading_re = re.compile(
        r'^(?:(\d+\.?\s*)?)(Abstract|Introduction|Background|Related Work|'
        r'Methodology|Methods|Method|Approach|Model|Architecture|'
        r'Experiments?|Results?|Discussion|Conclusion|Conclusions|'
        r'References|Acknowledgments?|Appendix)\s*$',
        re.IGNORECASE | re.MULTILINE
    )
    fallback = [{"title": "Content", "content": text, "start": 0, "end": len(text)}]
    hits = list(heading_re.finditer(text))
    if not hits:
        return fallback
    found = []
    for idx, hit in enumerate(hits):
        begin = hit.end()
        # Each section's body runs up to the next heading (or end of text).
        finish = hits[idx + 1].start() if idx + 1 < len(hits) else len(text)
        body = text[begin:finish].strip()
        if body:
            found.append({
                "title": hit.group(2).strip(),
                "content": body,
                "start": begin,
                "end": finish
            })
    return found or fallback
def ingest_pdf(file_path: Path) -> Optional[ParsedDocument]:
    """Parse a PDF into a ParsedDocument.

    Returns None when pypdf is unavailable, the file cannot be read,
    or the extracted text is too short to be useful (likely a scanned
    document with no text layer).
    """
    if not HAS_PYPDF:
        return None
    try:
        document = PdfReader(str(file_path))
        page_texts = []
        for page in document.pages:
            page_texts.append(page.extract_text() or "")
        combined = "\n\n".join(page_texts)
        # Under 100 chars of text -> treat as empty/unusable.
        if len(combined.strip()) < 100:
            return None
        return ParsedDocument(
            title=extract_title(combined, file_path.name),
            full_text=combined,
            sections=detect_sections(combined),
            page_count=len(page_texts)
        )
    except Exception:
        # Deliberate best-effort: any parsing failure yields None.
        return None
def chunk_document(doc: ParsedDocument, paper_id: str, chunk_size: int = 2000) -> list["DocumentChunk"]:
    """Split a parsed document into retrieval-sized chunks.

    Sections that fit within *chunk_size* become a single chunk; longer
    sections are packed paragraph-by-paragraph. A single paragraph longer
    than *chunk_size* is hard-split so that no chunk ever exceeds the limit
    (the previous implementation emitted oversized chunks in that case).

    Args:
        doc: parsed document providing ``title`` and ``sections``.
        paper_id: stable identifier mixed into each chunk id.
        chunk_size: maximum chunk length in characters.

    Returns:
        List of ``DocumentChunk`` objects (one or more per non-empty section).
    """
    # Local import mirrors the original code (avoids a circular import at module load).
    from vector_store import DocumentChunk

    def _make_chunk(section_title: str, content: str) -> "DocumentChunk":
        # MD5 of id/section/content-prefix gives a deterministic chunk id.
        # NOTE(review): two chunks sharing their first 100 chars within one
        # section would collide — acceptable for near-duplicate content.
        chunk_id = hashlib.md5(
            f"{paper_id}:{section_title}:{content[:100]}".encode()
        ).hexdigest()
        return DocumentChunk(
            chunk_id=chunk_id,
            paper_id=paper_id,
            paper_name=doc.title,
            content=content,
            section_title=section_title,
        )

    chunks = []
    for section in doc.sections:
        content = section["content"]
        section_title = section["title"]
        if len(content) <= chunk_size:
            chunks.append(_make_chunk(section_title, content))
            continue
        current_chunk = ""
        for para in content.split('\n\n'):
            # Hard-split paragraphs that alone exceed chunk_size; otherwise
            # the accumulator below would emit an oversized chunk.
            if len(para) > chunk_size:
                pieces = [para[i:i + chunk_size] for i in range(0, len(para), chunk_size)]
            else:
                pieces = [para]
            for piece in pieces:
                if len(current_chunk) + len(piece) <= chunk_size:
                    current_chunk += piece + "\n\n"
                else:
                    if current_chunk.strip():
                        chunks.append(_make_chunk(section_title, current_chunk.strip()))
                    current_chunk = piece + "\n\n"
        # Flush the trailing partial chunk for this section.
        if current_chunk.strip():
            chunks.append(_make_chunk(section_title, current_chunk.strip()))
    return chunks