"""Helpers for extracting text and page images from PDF, EPUB, DOCX and TXT files."""

import contextlib
import os
import tempfile
from typing import Optional

import fitz  # PyMuPDF
from docx import Document

# Balanced resolution for speed and OCR accuracy (2.5x zoom).
_DEFAULT_ZOOM = 2.5


def extract_text_from_pdf(file_path: str) -> str:
    """Extract the text of every page of a PDF using PyMuPDF.

    Args:
        file_path: Path of the document to open with ``fitz.open``.

    Returns:
        The concatenated text of all pages. If an error occurs, the error is
        printed and whatever text was collected before the failure is
        returned (possibly the empty string).
    """
    chunks = []
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(file_path) as doc:
            # Explicit loop (not a comprehension) so that pages read before a
            # failure are still returned.
            for page in doc:
                chunks.append(page.get_text())
    except Exception as e:
        print(f"ERROR: PDF extraction failed: {e}")
    return "".join(chunks)


def extract_text_from_docx(file_path: str) -> str:
    """Extract text from a Word (.docx) file.

    Args:
        file_path: Path of the .docx file to open with python-docx.

    Returns:
        The text of every paragraph, each followed by a newline. If an error
        occurs, the error is printed and the text collected so far is
        returned (possibly the empty string).
    """
    chunks = []
    try:
        doc = Document(file_path)
        for para in doc.paragraphs:
            chunks.append(para.text + "\n")
    except Exception as e:
        print(f"ERROR: DOCX extraction failed: {e}")
    return "".join(chunks)


def get_pdf_page_count(file_path: str) -> int:
    """Return the number of pages in a PDF, or 0 if it cannot be opened.

    Args:
        file_path: Path of the document to open with ``fitz.open``.

    Returns:
        ``len(doc)`` for the opened document; 0 on any error.
    """
    try:
        with fitz.open(file_path) as doc:
            return len(doc)
    except Exception:
        # Best effort: callers treat an unreadable document as having no pages.
        return 0


def get_pdf_page_as_image(
    file_path: str, page_num: int, zoom: float = _DEFAULT_ZOOM
) -> Optional[str]:
    """Render a PDF page to a temporary PNG file and return its path.

    Essential for comics/manga which are image-based.

    Args:
        file_path: Path of the document to open with ``fitz.open``.
        page_num: Index of the page to render. Values greater than or equal
            to the page count yield ``None``; other values are passed to
            PyMuPDF's document indexing unchanged.
        zoom: Scale factor applied to both axes when rendering.

    Returns:
        The path of the newly created ``.png`` file, or ``None`` if the page
        index is out of range or rendering fails (the error is printed).
        The file is not deleted automatically; the caller owns it.
    """
    try:
        with fitz.open(file_path) as doc:
            if page_num >= len(doc):
                return None

            matrix = fitz.Matrix(zoom, zoom)
            pix = doc[page_num].get_pixmap(matrix=matrix)

            # Reserve a unique path, then close our handle right away so the
            # file is not held open while PyMuPDF writes to it.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                tmp_img = tmp.name

            try:
                pix.save(tmp_img)
            except Exception:
                # Do not leave an empty/partial temp file behind on failure.
                with contextlib.suppress(OSError):
                    os.remove(tmp_img)
                raise
            return tmp_img
    except Exception as e:
        print(f"ERROR: PDF rendering failed: {e}")
        return None


def get_text_from_page(file_path: str, page_num: int) -> str:
    """Try to extract digital text directly from a specific page.

    Args:
        file_path: Path of the document to open with ``fitz.open``.
        page_num: Index of the page to read. Values greater than or equal to
            the page count yield ``""``; other values are passed to PyMuPDF's
            document indexing unchanged.

    Returns:
        The page text with leading/trailing whitespace stripped, or ``""`` if
        the page index is out of range or any error occurs.
    """
    try:
        with fitz.open(file_path) as doc:
            if page_num >= len(doc):
                return ""
            return doc[page_num].get_text().strip()
    except Exception:
        # Best effort: a page that cannot be read is reported as having no text.
        return ""


def _read_text_file(file_path: str) -> str:
    """Read a text file as UTF-8, falling back to latin-1 if decoding fails."""
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        # latin-1 maps every byte to a character, so this decode cannot fail.
        with open(file_path, "r", encoding="latin-1") as f:
            return f.read()


def extract_text_from_document(file_path: str) -> str:
    """Dispatch text extraction based on the file extension.

    ``.pdf`` and ``.epub`` files go to :func:`extract_text_from_pdf`,
    ``.docx`` files to :func:`extract_text_from_docx`, and ``.txt`` files are
    read directly (UTF-8 first, then latin-1). The extension comparison is
    case-insensitive.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text, or ``""`` if ``file_path`` is empty, does not
        exist, or has an unsupported extension.

    Raises:
        OSError: If a ``.txt`` file exists but cannot be opened or read.
    """
    if not file_path or not os.path.exists(file_path):
        return ""

    ext = os.path.splitext(file_path)[1].lower()
    if ext in (".pdf", ".epub"):
        return extract_text_from_pdf(file_path)
    if ext == ".docx":
        return extract_text_from_docx(file_path)
    if ext == ".txt":
        return _read_text_file(file_path)
    return ""