Spaces:
Runtime error
Runtime error
| import re | |
| import hashlib | |
| from pathlib import Path | |
| from dataclasses import dataclass | |
| from typing import Optional | |
| try: | |
| from pypdf import PdfReader | |
| HAS_PYPDF = True | |
| except ImportError: | |
| HAS_PYPDF = False | |
@dataclass
class ParsedDocument:
    """A parsed document: extracted text plus detected structure.

    Fix: the @dataclass decorator was missing — the class was written as a
    dataclass (bare annotated fields, `dataclass` already imported) and is
    constructed with keyword arguments elsewhere in this file, which raises
    TypeError without the decorator.
    """

    title: str            # best-guess document title (see extract_title)
    full_text: str        # concatenated text of all pages
    sections: list[dict]  # detect_sections() output: title/content/start/end dicts
    page_count: int       # number of pages the text was extracted from
def extract_title(text: str, filename: str) -> str:
    """Guess a title from the document's leading lines, else derive one from the filename.

    Scans the first 10 lines for a plausible title: 21-199 characters and not
    a URL/DOI-looking line. Falls back to the filename stem, with separators
    turned into spaces and title-cased.
    """
    candidates = (raw.strip() for raw in text.strip().split('\n')[:10])
    for candidate in candidates:
        plausible_length = 20 < len(candidate) < 200
        if plausible_length and not candidate.startswith(('http', 'www', 'doi')):
            return candidate
    # No usable line found — fall back to a humanized filename.
    fallback = Path(filename).stem.replace('_', ' ').replace('-', ' ')
    return fallback.title()
def detect_sections(text: str) -> list[dict]:
    """Split text at recognized academic section headings.

    Returns a list of dicts with keys "title", "content", "start", "end"
    (start/end are offsets of the section body within `text`). When no
    headings are found, or every detected body is empty, a single "Content"
    section covering the whole text is returned instead.
    """
    heading_re = re.compile(
        r'^(?:(\d+\.?\s*)?)(Abstract|Introduction|Background|Related Work|'
        r'Methodology|Methods|Method|Approach|Model|Architecture|'
        r'Experiments?|Results?|Discussion|Conclusion|Conclusions|'
        r'References|Acknowledgments?|Appendix)\s*$',
        re.IGNORECASE | re.MULTILINE
    )
    headings = list(heading_re.finditer(text))
    whole_document = [{"title": "Content", "content": text, "start": 0, "end": len(text)}]
    if not headings:
        return whole_document

    found: list[dict] = []
    for idx, heading in enumerate(headings):
        body_start = heading.end()
        # Body runs to the next heading, or to the end of the text.
        body_end = headings[idx + 1].start() if idx + 1 < len(headings) else len(text)
        body = text[body_start:body_end].strip()
        if not body:
            continue  # skip headings with nothing under them
        found.append({
            "title": heading.group(2).strip(),
            "content": body,
            "start": body_start,
            "end": body_end,
        })
    return found or whole_document
def ingest_pdf(file_path: Path) -> Optional[ParsedDocument]:
    """Parse a PDF file into a ParsedDocument; return None on any failure.

    None is returned when pypdf is unavailable, when extraction yields fewer
    than 100 characters of text, or when reading/parsing raises anything.
    """
    if not HAS_PYPDF:
        return None
    try:
        page_texts = [page.extract_text() or "" for page in PdfReader(str(file_path)).pages]
        combined = "\n\n".join(page_texts)
        # Too little text usually means a scanned/image-only PDF — treat as failure.
        if len(combined.strip()) < 100:
            return None
        return ParsedDocument(
            title=extract_title(combined, file_path.name),
            full_text=combined,
            sections=detect_sections(combined),
            page_count=len(page_texts),
        )
    except Exception:
        # Deliberate best-effort: corrupt or encrypted PDFs simply yield None.
        return None
def chunk_document(doc: ParsedDocument, paper_id: str, chunk_size: int = 2000) -> list[dict]:
    """Split a ParsedDocument into DocumentChunk objects of at most ~chunk_size chars.

    Sections that fit within chunk_size become a single chunk; longer sections
    are packed paragraph-by-paragraph. A single paragraph longer than
    chunk_size still becomes one oversized chunk rather than being split
    mid-paragraph.

    Note: despite the `list[dict]` annotation (kept for compatibility), the
    returned list holds vector_store.DocumentChunk instances, matching
    existing callers.
    """
    from vector_store import DocumentChunk

    def _make_chunk(section_title: str, content: str):
        # Single place that builds a chunk — previously this construction was
        # triplicated. The id is deterministic (paper/section/leading text),
        # so re-ingesting the same paper yields stable ids; it is now derived
        # from the exact stored content (the original hashed the unstripped
        # buffer while storing the stripped text).
        digest = hashlib.md5(f"{paper_id}:{section_title}:{content[:100]}".encode()).hexdigest()
        return DocumentChunk(
            chunk_id=digest,
            paper_id=paper_id,
            paper_name=doc.title,
            content=content,
            section_title=section_title,
        )

    chunks = []
    for section in doc.sections:
        content = section["content"]
        section_title = section["title"]
        if len(content) <= chunk_size:
            chunks.append(_make_chunk(section_title, content))
            continue
        # Section too large: greedily pack paragraphs until the budget is hit.
        current = ""
        for para in content.split('\n\n'):
            if len(current) + len(para) <= chunk_size:
                current += para + "\n\n"
            else:
                if current.strip():
                    chunks.append(_make_chunk(section_title, current.strip()))
                current = para + "\n\n"
        # Flush the trailing partial chunk.
        if current.strip():
            chunks.append(_make_chunk(section_title, current.strip()))
    return chunks