# voice/pipeline/document_parser.py
# Author: rahulrana0001
# Sync: Full project update including premium UI and advanced chat engine
# Commit: b0bfc45
import fitz # PyMuPDF
from docx import Document
import os
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extracts text from a PDF file using PyMuPDF.

    Args:
        file_path: Path of the document to open with ``fitz.open``.

    Returns:
        The text of every page, concatenated in page order. If an error
        occurs it is printed and whatever text was collected before the
        failure (possibly an empty string) is returned.
    """
    page_texts = []
    try:
        doc = fitz.open(file_path)
        try:
            for page in doc:
                page_texts.append(page.get_text())
        finally:
            # Release the document even when a page fails to extract.
            doc.close()
    except Exception as e:
        print(f"ERROR: PDF extraction failed: {e}")
    # Join once instead of growing a string with += on every page.
    return "".join(page_texts)
def extract_text_from_docx(file_path: str) -> str:
    """
    Extracts text from a Word (.docx) file.

    Each paragraph's text is emitted followed by a newline. If an error
    occurs it is printed and the text gathered so far (possibly an empty
    string) is returned.
    """
    collected = []
    try:
        for paragraph in Document(file_path).paragraphs:
            collected.append(paragraph.text + "\n")
    except Exception as e:
        print(f"ERROR: DOCX extraction failed: {e}")
    return "".join(collected)
def get_pdf_page_count(file_path: str) -> int:
    """
    Returns the number of pages in a document opened with PyMuPDF.

    Args:
        file_path: Path of the document to open with ``fitz.open``.

    Returns:
        ``len(doc)`` for the opened document, or 0 if opening or counting
        fails for any reason.
    """
    try:
        doc = fitz.open(file_path)
        try:
            return len(doc)
        finally:
            # Always release the document, even if len() raises.
            doc.close()
    except Exception:
        # Best-effort helper: failures are reported as "no pages".
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        return 0
def get_pdf_page_as_image(file_path: str, page_num: int) -> "str | None":
    """
    Renders a PDF page as an image and returns the temporary file path.
    Essential for comics/manga which are image-based.

    Args:
        file_path: Path of the document to open with ``fitz.open``.
        page_num: Index of the page to render, as used in ``doc[page_num]``.

    Returns:
        Path of a newly created ``.png`` temporary file holding the rendered
        page, or ``None`` when ``page_num >= len(doc)`` or rendering fails
        (the error is printed). The file is created with ``delete=False``,
        so the caller is responsible for removing it.
    """
    import tempfile

    tmp_img = None
    try:
        doc = fitz.open(file_path)
        try:
            if page_num >= len(doc):
                return None
            page = doc[page_num]
            # Balanced resolution for speed and OCR accuracy (2.5x zoom)
            matrix = fitz.Matrix(2.5, 2.5)
            pix = page.get_pixmap(matrix=matrix)
            # Reserve a unique path, then close our handle right away so the
            # file is not held open while the pixmap is written to it.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                tmp_img = tmp.name
            pix.save(tmp_img)
            return tmp_img
        finally:
            # Runs on success, on the out-of-range early return, and on error.
            doc.close()
    except Exception as e:
        print(f"ERROR: PDF rendering failed: {e}")
        if tmp_img is not None:
            # Do not leave a stray temp file behind when saving failed.
            try:
                os.remove(tmp_img)
            except OSError:
                # Cleanup is best-effort; the failure was already reported.
                pass
        return None
def get_text_from_page(file_path: str, page_num: int) -> str:
    """
    Tries to extract digital text directly from a specific page.

    Args:
        file_path: Path of the document to open with ``fitz.open``.
        page_num: Index of the page to read, as used in ``doc[page_num]``.

    Returns:
        The page's text with leading and trailing whitespace stripped, or an
        empty string when ``page_num >= len(doc)`` or any error occurs.
    """
    try:
        doc = fitz.open(file_path)
        try:
            if page_num >= len(doc):
                return ""
            return doc[page_num].get_text().strip()
        finally:
            # Close on every path, including the out-of-range early return.
            doc.close()
    except Exception:
        # Best-effort helper: failures are reported as "no text".
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        return ""
def extract_text_from_document(file_path: str) -> str:
    """
    Dispatcher to extract text based on file extension.

    The extension is compared case-insensitively:

    * ``.pdf`` and ``.epub`` are handed to ``extract_text_from_pdf``.
    * ``.docx`` is handed to ``extract_text_from_docx``.
    * ``.txt`` is read as UTF-8, falling back to latin-1 when the bytes are
      not valid UTF-8.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text, or an empty string when ``file_path`` is empty,
        does not exist, or has an unsupported extension.

    Raises:
        OSError: If a ``.txt`` file exists but cannot be opened or read.
    """
    if not file_path or not os.path.exists(file_path):
        return ""

    ext = os.path.splitext(file_path)[1].lower()
    if ext in (".pdf", ".epub"):
        return extract_text_from_pdf(file_path)
    if ext == ".docx":
        return extract_text_from_docx(file_path)
    if ext == ".txt":
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        except UnicodeDecodeError:
            # Only a decoding failure warrants the fallback; other errors
            # (e.g. permissions) would fail the same way on a second attempt.
            with open(file_path, "r", encoding="latin-1") as f:
                return f.read()
    return ""