# stochastic/document.py
# Author: Sonu Prasad — initial commit (822c114)
import re
import hashlib
from pathlib import Path
from dataclasses import dataclass
from typing import Optional
try:
from pypdf import PdfReader
HAS_PYPDF = True
except ImportError:
HAS_PYPDF = False
@dataclass
class ParsedDocument:
    """Structured result of parsing a source document (e.g. a PDF)."""
    title: str  # extracted title, or a cleaned-up filename fallback
    full_text: str  # concatenated text of all pages, pages joined by blank lines
    sections: list[dict]  # [{"title", "content", "start", "end"}, ...] from detect_sections
    page_count: int  # number of pages the text was extracted from
def extract_title(text: str, filename: str) -> str:
    """Pick a plausible document title.

    Scans the first ten lines for one of reasonable title length that is
    not a URL/DOI; otherwise falls back to a prettified file-name stem.
    """
    candidates = (raw.strip() for raw in text.strip().split('\n')[:10])
    for candidate in candidates:
        looks_like_link = candidate.startswith(('http', 'www', 'doi'))
        if not looks_like_link and 20 < len(candidate) < 200:
            return candidate
    # Fallback: "my_paper-final" -> "My Paper Final"
    stem = Path(filename).stem
    return stem.replace('_', ' ').replace('-', ' ').title()
def detect_sections(text: str) -> list[dict]:
    """Slice *text* into sections at standard paper headings.

    A heading is a line containing only an optional numeric prefix plus a
    known section name. Each returned dict has "title", "content", and the
    "start"/"end" character offsets of the content within *text*. If no
    heading is found (or every section body is empty), the whole text is
    returned as a single "Content" section.
    """
    heading_re = re.compile(
        r'^(?:(\d+\.?\s*)?)(Abstract|Introduction|Background|Related Work|'
        r'Methodology|Methods|Method|Approach|Model|Architecture|'
        r'Experiments?|Results?|Discussion|Conclusion|Conclusions|'
        r'References|Acknowledgments?|Appendix)\s*$',
        re.IGNORECASE | re.MULTILINE
    )
    fallback = [{"title": "Content", "content": text, "start": 0, "end": len(text)}]
    hits = list(heading_re.finditer(text))
    if not hits:
        return fallback
    found = []
    for idx, hit in enumerate(hits):
        begin = hit.end()
        # Each section's body runs up to the next heading (or end of text).
        finish = hits[idx + 1].start() if idx + 1 < len(hits) else len(text)
        body = text[begin:finish].strip()
        if body:
            found.append({
                "title": hit.group(2).strip(),
                "content": body,
                "start": begin,
                "end": finish
            })
    return found or fallback
def ingest_pdf(file_path: Path) -> Optional[ParsedDocument]:
    """Parse a PDF into a ParsedDocument.

    Returns None when pypdf is unavailable, the file cannot be read,
    or the extracted text is too short to be useful (likely a scanned
    document with no text layer).
    """
    if not HAS_PYPDF:
        return None
    try:
        document = PdfReader(str(file_path))
        page_texts = []
        for page in document.pages:
            page_texts.append(page.extract_text() or "")
        combined = "\n\n".join(page_texts)
        # Under 100 chars of text -> treat as empty/unusable.
        if len(combined.strip()) < 100:
            return None
        return ParsedDocument(
            title=extract_title(combined, file_path.name),
            full_text=combined,
            sections=detect_sections(combined),
            page_count=len(page_texts)
        )
    except Exception:
        # Deliberate best-effort: any parsing failure yields None.
        return None
def chunk_document(doc: ParsedDocument, paper_id: str, chunk_size: int = 2000) -> list["DocumentChunk"]:
    """Split a parsed document into retrieval-sized chunks.

    Sections that fit within *chunk_size* become a single chunk; longer
    sections are packed paragraph-by-paragraph. A single paragraph longer
    than *chunk_size* is hard-split so that no chunk ever exceeds the limit
    (the previous implementation emitted oversized chunks in that case).

    Args:
        doc: parsed document providing ``title`` and ``sections``.
        paper_id: stable identifier mixed into each chunk id.
        chunk_size: maximum chunk length in characters.

    Returns:
        List of ``DocumentChunk`` objects (one or more per non-empty section).
    """
    # Local import mirrors the original code (avoids a circular import at module load).
    from vector_store import DocumentChunk

    def _make_chunk(section_title: str, content: str) -> "DocumentChunk":
        # MD5 of id/section/content-prefix gives a deterministic chunk id.
        # NOTE(review): two chunks sharing their first 100 chars within one
        # section would collide — acceptable for near-duplicate content.
        chunk_id = hashlib.md5(
            f"{paper_id}:{section_title}:{content[:100]}".encode()
        ).hexdigest()
        return DocumentChunk(
            chunk_id=chunk_id,
            paper_id=paper_id,
            paper_name=doc.title,
            content=content,
            section_title=section_title,
        )

    chunks = []
    for section in doc.sections:
        content = section["content"]
        section_title = section["title"]
        if len(content) <= chunk_size:
            chunks.append(_make_chunk(section_title, content))
            continue
        current_chunk = ""
        for para in content.split('\n\n'):
            # Hard-split paragraphs that alone exceed chunk_size; otherwise
            # the accumulator below would emit an oversized chunk.
            if len(para) > chunk_size:
                pieces = [para[i:i + chunk_size] for i in range(0, len(para), chunk_size)]
            else:
                pieces = [para]
            for piece in pieces:
                if len(current_chunk) + len(piece) <= chunk_size:
                    current_chunk += piece + "\n\n"
                else:
                    if current_chunk.strip():
                        chunks.append(_make_chunk(section_title, current_chunk.strip()))
                    current_chunk = piece + "\n\n"
        # Flush the trailing partial chunk for this section.
        if current_chunk.strip():
            chunks.append(_make_chunk(section_title, current_chunk.strip()))
    return chunks