import re
import hashlib
from pathlib import Path
from dataclasses import dataclass
from typing import Optional
try:
from pypdf import PdfReader
HAS_PYPDF = True
except ImportError:
HAS_PYPDF = False
@dataclass
class ParsedDocument:
    """A PDF parsed into plain text plus its detected section structure."""
    title: str  # heading guessed from the first lines, or a cleaned-up filename
    full_text: str  # all page texts joined with blank lines
    sections: list[dict]  # per section: {"title", "content", "start", "end"} offsets into full_text
    page_count: int  # number of pages in the source PDF
def extract_title(text: str, filename: str) -> str:
    """Guess a document title.

    Scans the first ten lines for a plausible heading (20-200 chars, not a
    URL/DOI); if none is found, falls back to a title-cased version of the
    filename stem with underscores/hyphens turned into spaces.
    """
    candidates = (raw.strip() for raw in text.strip().split('\n')[:10])
    for candidate in candidates:
        plausible_length = 20 < len(candidate) < 200
        if plausible_length and not candidate.startswith(('http', 'www', 'doi')):
            return candidate
    # No heading-looking line: derive a readable title from the filename.
    fallback = Path(filename).stem.replace('_', ' ').replace('-', ' ')
    return fallback.title()
def detect_sections(text: str) -> list[dict]:
    """Split *text* at standard academic-paper headings.

    A heading is a line consisting only of an (optionally numbered) section
    name such as "Abstract" or "2. Methods". Returns one dict per non-empty
    section with keys "title", "content", "start", "end" (offsets into
    *text*). If no headings are found — or all sections are empty — the whole
    text is returned as a single "Content" section.
    """
    heading_re = re.compile(
        r'^(?:(\d+\.?\s*)?)(Abstract|Introduction|Background|Related Work|'
        r'Methodology|Methods|Method|Approach|Model|Architecture|'
        r'Experiments?|Results?|Discussion|Conclusion|Conclusions|'
        r'References|Acknowledgments?|Appendix)\s*$',
        re.IGNORECASE | re.MULTILINE
    )
    fallback = [{"title": "Content", "content": text, "start": 0, "end": len(text)}]
    hits = list(heading_re.finditer(text))
    if not hits:
        return fallback
    found = []
    # Each section ends where the next heading begins (or at end of text).
    boundaries = [m.start() for m in hits[1:]] + [len(text)]
    for hit, end in zip(hits, boundaries):
        start = hit.end()
        body = text[start:end].strip()
        if body:
            found.append({
                "title": hit.group(2).strip(),
                "content": body,
                "start": start,
                "end": end
            })
    return found or fallback
def ingest_pdf(file_path: Path) -> Optional[ParsedDocument]:
    """Parse a PDF file into a ParsedDocument.

    Returns None when pypdf is unavailable, when the extracted text is too
    short to be useful (under 100 characters — e.g. a scanned/image-only
    PDF), or when pypdf raises for any reason (corrupt or encrypted file).
    """
    if not HAS_PYPDF:
        return None
    try:
        page_texts = [page.extract_text() or "" for page in PdfReader(str(file_path)).pages]
        combined = "\n\n".join(page_texts)
        if len(combined.strip()) < 100:
            # Too little text to title/section meaningfully.
            return None
        return ParsedDocument(
            title=extract_title(combined, file_path.name),
            full_text=combined,
            sections=detect_sections(combined),
            page_count=len(page_texts),
        )
    except Exception:
        # Best-effort ingestion: swallow parser failures and signal with None.
        return None
def chunk_document(doc: ParsedDocument, paper_id: str, chunk_size: int = 2000) -> list[dict]:
    """Split *doc* into embedding-sized chunks, respecting section boundaries.

    Sections that fit within ``chunk_size`` characters become a single chunk;
    longer sections are packed paragraph by paragraph. A single paragraph
    longer than ``chunk_size`` is hard-split into ``chunk_size`` slices (the
    previous implementation emitted it as one oversized chunk, breaking the
    size contract).

    Args:
        doc: Parsed document whose sections are chunked.
        paper_id: Stable identifier folded into each chunk's hash-based id.
        chunk_size: Maximum chunk length in characters.

    Returns:
        List of DocumentChunk objects (annotation kept as ``list[dict]`` for
        backward compatibility with existing callers).
    """
    # Local import mirrors the original — presumably avoids a circular import
    # with vector_store at module load time.
    from vector_store import DocumentChunk

    chunks: list = []

    def _emit(text: str, section_title: str) -> None:
        """Append one chunk; id is md5 over paper, section and content prefix."""
        text = text.strip()
        if not text:
            return
        chunk_id = hashlib.md5(
            f"{paper_id}:{section_title}:{text[:100]}".encode()
        ).hexdigest()
        chunks.append(DocumentChunk(
            chunk_id=chunk_id,
            paper_id=paper_id,
            paper_name=doc.title,
            content=text,
            section_title=section_title,
        ))

    for section in doc.sections:
        content = section["content"]
        section_title = section["title"]
        if len(content) <= chunk_size:
            _emit(content, section_title)
            continue
        current = ""
        for para in content.split('\n\n'):
            if len(current) + len(para) <= chunk_size:
                current += para + "\n\n"
            else:
                _emit(current, section_title)
                if len(para) > chunk_size:
                    # Bug fix: hard-split paragraphs that alone exceed the
                    # limit instead of emitting one oversized chunk.
                    for i in range(0, len(para), chunk_size):
                        _emit(para[i:i + chunk_size], section_title)
                    current = ""
                else:
                    current = para + "\n\n"
        _emit(current, section_title)  # flush the trailing partial chunk
    return chunks
|