Spaces:

ranaspark
/

voice

Running

App Files Files Community

voice / pipeline /document_parser.py

rahulrana0001

Sync: Full project update including premium UI and advanced chat engine

b0bfc45 about 1 month ago

raw

history blame contribute delete

2.77 kB

	import fitz # PyMuPDF
	from docx import Document
	import os

	def extract_text_from_pdf(file_path: str) -> str:
	    """Extract the text of every page of a document using PyMuPDF.

	    Args:
	        file_path: Path of the file to open with ``fitz.open``.

	    Returns:
	        The text of all pages concatenated in page order. If opening or
	        reading fails, the error is printed and whatever text was collected
	        before the failure (possibly an empty string) is returned.
	    """
	    chunks = []
	    try:
	        # The context manager closes the document even when a page raises,
	        # so a failure halfway through no longer leaks the file handle.
	        with fitz.open(file_path) as doc:
	            for page in doc:
	                chunks.append(page.get_text())
	    except Exception as e:
	        print(f"ERROR: PDF extraction failed: {e}")
	    return "".join(chunks)

	def extract_text_from_docx(file_path: str) -> str:
	    """Read the paragraphs of a Word (.docx) file into one string.

	    Each paragraph's text is followed by a newline. If loading or reading
	    fails, the error is printed and the text gathered up to that point
	    (possibly an empty string) is returned.
	    """
	    pieces = []
	    try:
	        word_doc = Document(file_path)
	        for paragraph in word_doc.paragraphs:
	            pieces.append(paragraph.text + "\n")
	    except Exception as e:
	        print(f"ERROR: DOCX extraction failed: {e}")
	    return "".join(pieces)

	def get_pdf_page_count(file_path: str) -> int:
	    """Return the number of pages in a document opened with PyMuPDF.

	    Args:
	        file_path: Path of the file to open with ``fitz.open``.

	    Returns:
	        The page count, or 0 if the document cannot be opened or read.
	    """
	    try:
	        # The context manager guarantees the document is closed on every path.
	        with fitz.open(file_path) as doc:
	            return len(doc)
	    except Exception:
	        # Best-effort: any failure is reported as "no pages". Catching
	        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
	        # propagate.
	        return 0

	def get_pdf_page_as_image(file_path: str, page_num: int) -> str:
	    """Render one page of a document to a temporary PNG file.

	    Essential for comics/manga, which are image-based.

	    Args:
	        file_path: Path of the file to open with ``fitz.open``.
	        page_num: Index of the page to render. Only the upper bound is
	            checked here.

	    Returns:
	        The path of the newly created ``.png`` temporary file. The file is
	        not deleted by this function; the caller owns it. ``None`` is
	        returned when ``page_num`` is past the last page or when rendering
	        fails (the error is printed).
	    """
	    import tempfile

	    try:
	        # The context manager closes the document on every path, including
	        # the early out-of-range return below.
	        with fitz.open(file_path) as doc:
	            if page_num >= len(doc):
	                return None

	            # Balanced resolution for speed and OCR accuracy (2.5x zoom)
	            matrix = fitz.Matrix(2.5, 2.5)
	            pix = doc[page_num].get_pixmap(matrix=matrix)

	            # mkstemp returns an open descriptor; close it right away so no
	            # handle is left dangling and the path can be reopened for
	            # writing by pix.save().
	            fd, tmp_img = tempfile.mkstemp(suffix=".png")
	            os.close(fd)
	            try:
	                pix.save(tmp_img)
	            except Exception:
	                # Do not leave an empty/partial temp file behind on failure.
	                try:
	                    os.remove(tmp_img)
	                except OSError:
	                    pass
	                raise
	            return tmp_img
	    except Exception as e:
	        print(f"ERROR: PDF rendering failed: {e}")
	        return None

	def get_text_from_page(file_path: str, page_num: int) -> str:
	    """Extract the digital (embedded) text of a single page.

	    Args:
	        file_path: Path of the file to open with ``fitz.open``.
	        page_num: Index of the page to read. Only the upper bound is
	            checked here.

	    Returns:
	        The page text with leading/trailing whitespace stripped, or an empty
	        string when ``page_num`` is past the last page or any error occurs.
	    """
	    try:
	        # The context manager closes the document on every path, including
	        # the early out-of-range return below.
	        with fitz.open(file_path) as doc:
	            if page_num >= len(doc):
	                return ""
	            return doc[page_num].get_text().strip()
	    except Exception:
	        # Best-effort: failures yield "no text". Catching Exception (not a
	        # bare except) lets KeyboardInterrupt/SystemExit propagate.
	        return ""

	def extract_text_from_document(file_path: str) -> str:
	    """Dispatch text extraction based on the file extension.

	    Supported extensions (case-insensitive):
	        * ``.pdf`` / ``.epub`` -- handled by ``extract_text_from_pdf``.
	        * ``.docx`` -- handled by ``extract_text_from_docx``.
	        * ``.txt`` -- read as UTF-8, falling back to latin-1 when the bytes
	          are not valid UTF-8.

	    Args:
	        file_path: Path of the document to read.

	    Returns:
	        The extracted text, or an empty string when ``file_path`` is empty,
	        does not exist, or has an unsupported extension.
	    """
	    if not file_path or not os.path.exists(file_path):
	        return ""

	    ext = os.path.splitext(file_path)[1].lower()

	    if ext in (".pdf", ".epub"):
	        return extract_text_from_pdf(file_path)
	    if ext == ".docx":
	        return extract_text_from_docx(file_path)
	    if ext == ".txt":
	        try:
	            with open(file_path, "r", encoding="utf-8") as f:
	                return f.read()
	        except UnicodeDecodeError:
	            # Only a decoding failure warrants the fallback; latin-1 maps
	            # every byte to a character, so this second read cannot fail
	            # on decoding.
	            with open(file_path, "r", encoding="latin-1") as f:
	                return f.read()
	    return ""