import hashlib
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

# pypdf is an optional dependency: without it, ingest_pdf degrades to a no-op.
try:
    from pypdf import PdfReader
    HAS_PYPDF = True
except ImportError:
    HAS_PYPDF = False


@dataclass
class ParsedDocument:
    """A parsed PDF document ready for chunking."""

    title: str          # best-guess document title
    full_text: str      # all pages joined with blank lines
    sections: list[dict]  # each: {"title", "content", "start", "end"}
    page_count: int     # number of pages extracted


def extract_title(text: str, filename: str) -> str:
    """Guess a document title from its opening lines.

    Scans the first 10 lines for one that looks like a title: 21-199
    characters and not a URL/DOI line. Falls back to a prettified
    filename stem (underscores/hyphens to spaces, title-cased).
    """
    for line in text.strip().split('\n')[:10]:
        line = line.strip()
        if 20 < len(line) < 200 and not line.startswith(('http', 'www', 'doi')):
            return line
    # Fallback: "my_cool_paper.pdf" -> "My Cool Paper"
    return Path(filename).stem.replace('_', ' ').replace('-', ' ').title()


def detect_sections(text: str) -> list[dict]:
    """Split *text* into sections at common academic-paper headings.

    A heading is a line consisting solely of an optional section number
    and one of the known heading words (case-insensitive). Each returned
    dict has "title", "content", and the "start"/"end" offsets of the
    content within *text*. Text before the first heading is not included
    in any section. If no headings are found (or all sections are empty),
    the whole text is returned as a single "Content" section.
    """
    section_pattern = re.compile(
        r'^(?:(\d+\.?\s*)?)(Abstract|Introduction|Background|Related Work|'
        r'Methodology|Methods|Method|Approach|Model|Architecture|'
        r'Experiments?|Results?|Discussion|Conclusion|Conclusions|'
        r'References|Acknowledgments?|Appendix)\s*$',
        re.IGNORECASE | re.MULTILINE
    )
    matches = list(section_pattern.finditer(text))
    if not matches:
        return [{"title": "Content", "content": text, "start": 0, "end": len(text)}]

    sections = []
    for i, match in enumerate(matches):
        # A section's content runs from the end of its heading to the
        # start of the next heading (or end of text for the last one).
        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        section_title = match.group(2).strip()
        section_content = text[start:end].strip()
        if section_content:
            sections.append({
                "title": section_title,
                "content": section_content,
                "start": start,
                "end": end,
            })
    return sections if sections else [{"title": "Content", "content": text, "start": 0, "end": len(text)}]


def ingest_pdf(file_path: Path) -> Optional[ParsedDocument]:
    """Parse a PDF file into a ParsedDocument.

    Best-effort by design: returns None when pypdf is unavailable, the
    file cannot be read/parsed, or fewer than 100 characters of text
    were extracted (likely a scanned/image-only PDF).
    """
    if not HAS_PYPDF:
        return None
    try:
        reader = PdfReader(str(file_path))
        # extract_text() can return None for image-only pages.
        pages = [page.extract_text() or "" for page in reader.pages]
        full_text = "\n\n".join(pages)
        if len(full_text.strip()) < 100:
            return None
        return ParsedDocument(
            title=extract_title(full_text, file_path.name),
            full_text=full_text,
            sections=detect_sections(full_text),
            page_count=len(pages),
        )
    except Exception:
        # Deliberate broad catch: ingestion is best-effort and any
        # pypdf/IO failure simply means "no document".
        return None


def chunk_document(doc: ParsedDocument, paper_id: str, chunk_size: int = 2000) -> list:
    """Split *doc* into DocumentChunk objects of at most ~chunk_size chars.

    A section that fits within chunk_size becomes a single chunk;
    longer sections are packed greedily paragraph-by-paragraph (split
    on blank lines). A single paragraph longer than chunk_size still
    becomes one oversized chunk. Returns a list of DocumentChunk.
    """
    # Local import keeps module import cheap and avoids a potential
    # circular import with vector_store at load time.
    from vector_store import DocumentChunk

    def make_chunk(section_title: str, content: str):
        # Deterministic id from paper, section, and a content prefix.
        # md5 is used as a fingerprint here, not for security.
        digest = hashlib.md5(
            f"{paper_id}:{section_title}:{content[:100]}".encode()
        ).hexdigest()
        return DocumentChunk(
            chunk_id=digest,
            paper_id=paper_id,
            paper_name=doc.title,
            content=content,
            section_title=section_title,
        )

    chunks = []
    for section in doc.sections:
        content = section["content"]
        title = section["title"]
        if len(content) <= chunk_size:
            chunks.append(make_chunk(title, content))
            continue
        current = ""
        for para in content.split('\n\n'):
            if len(current) + len(para) <= chunk_size:
                current += para + "\n\n"
            else:
                if current.strip():
                    chunks.append(make_chunk(title, current.strip()))
                current = para + "\n\n"
        if current.strip():
            chunks.append(make_chunk(title, current.strip()))
    return chunks