Spaces:

Peterase
/

Ragora-Server

Sleeping

App Files Files Community

Ragora-Server / app /services /document_processor.py

Peterase

deploy: initial deployment to Hugging Face Spaces

f02c5b9 18 days ago

raw

history blame contribute delete

3.3 kB

	import io
	from typing import List
	import pypdf
	import pdfplumber
	from docx import Document as DocxDocument
	from app.ports.document_processor import DocumentProcessorPort, ExtractedText


	class DocumentProcessorAdapter(DocumentProcessorPort):
	    """Extract plain text from uploaded PDF and Word documents.

	    PDFs are read with pdfplumber by default, falling back to pypdf when
	    pdfplumber raises. Word documents are read with python-docx.
	    """

	    # MIME types are declared once so that ``extract`` and ``supports``
	    # cannot drift apart.
	    _PDF_TYPE = "application/pdf"
	    _WORD_TYPES = (
	        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	        # NOTE(review): this is the legacy binary .doc type, yet it is routed
	        # to python-docx below. Presumably genuine .doc uploads fail inside
	        # ``_extract_docx`` -- confirm whether this type should stay listed.
	        "application/msword",
	    )

	    SUPPORTED_TYPES = {_PDF_TYPE, *_WORD_TYPES}

	    async def extract(self, file_bytes: bytes, content_type: str) -> ExtractedText:
	        """Extract text from a document, dispatching on its MIME type.

	        Args:
	            file_bytes: Raw bytes of the document.
	            content_type: MIME type declared for the document.

	        Returns:
	            An ``ExtractedText`` carrying the extracted content, a metadata
	            dict holding the content type, and a page count (the PDF's page
	            count for PDFs, always ``1`` for Word documents).

	        Raises:
	            ValueError: If ``content_type`` is not a supported type.
	        """
	        if content_type == self._PDF_TYPE:
	            return ExtractedText(
	                content=self._extract_pdf(file_bytes),
	                metadata={"content_type": content_type},
	                page_count=self._get_pdf_page_count(file_bytes),
	            )
	        if content_type in self._WORD_TYPES:
	            return ExtractedText(
	                content=self._extract_docx(file_bytes),
	                metadata={"content_type": content_type},
	                # The Word path does not compute pages; it reports 1.
	                page_count=1,
	            )
	        raise ValueError(f"Unsupported content type: {content_type}")

	    def supports(self, content_type: str) -> bool:
	        """Return True when ``content_type`` is listed in ``SUPPORTED_TYPES``."""
	        return content_type in self.SUPPORTED_TYPES

	    def _extract_pdf(self, file_bytes: bytes, use_pdfplumber: bool = True) -> str:
	        """Extract text from a PDF.

	        Args:
	            file_bytes: Raw bytes of the PDF.
	            use_pdfplumber: When True (default) try pdfplumber first and fall
	                back to pypdf if it raises; when False use pypdf directly.

	        Returns:
	            The extracted text with surrounding whitespace stripped.
	        """
	        if not use_pdfplumber:
	            return self._extract_with_pypdf(file_bytes)
	        try:
	            return self._extract_with_pdfplumber(file_bytes)
	        except Exception:
	            # Deliberate best-effort: whatever pdfplumber choked on, give
	            # pypdf a chance before failing the whole extraction.
	            return self._extract_with_pypdf(file_bytes)

	    def _extract_with_pdfplumber(self, file_bytes: bytes) -> str:
	        """Return the text of all pages that yield text, one page per line group."""
	        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
	            page_texts = [page.extract_text() for page in pdf.pages]
	        # Pages with no extractable text are skipped entirely.
	        return "\n".join(chunk for chunk in page_texts if chunk).strip()

	    def _extract_with_pypdf(self, file_bytes: bytes) -> str:
	        """Return the text of every page, newline-separated, using pypdf."""
	        reader = pypdf.PdfReader(io.BytesIO(file_bytes))
	        # ``or ""`` keeps a page that yields nothing from breaking the join,
	        # mirroring the empty-page guard in the pdfplumber path.
	        return "\n".join(page.extract_text() or "" for page in reader.pages).strip()

	    def _get_pdf_page_count(self, file_bytes: bytes) -> int:
	        """Return the number of pages in the PDF, or 1 if it cannot be read."""
	        try:
	            return len(pypdf.PdfReader(io.BytesIO(file_bytes)).pages)
	        except Exception:
	            # Best-effort: the count is secondary to the text, so a parse
	            # failure degrades to 1. ``Exception`` (not a bare ``except``)
	            # lets KeyboardInterrupt/SystemExit propagate.
	            return 1

	    def _extract_docx(self, file_bytes: bytes) -> str:
	        """Return the non-blank paragraphs of a Word document joined by newlines.

	        Only ``doc.paragraphs`` is read; content not exposed through that
	        attribute is not included.
	        """
	        doc = DocxDocument(io.BytesIO(file_bytes))
	        return "\n".join(para.text for para in doc.paragraphs if para.text.strip())


	# Shared adapter instance for dependency injection, created on first use.
	_document_processor = None


	def get_document_processor() -> DocumentProcessorPort:
	    """Return the shared ``DocumentProcessorAdapter`` instance.

	    The adapter is built lazily on the first call and reused afterwards, so
	    every caller receives the same object.
	    """
	    global _document_processor
	    if _document_processor is None:
	        _document_processor = DocumentProcessorAdapter()
	    return _document_processor