import os
import tempfile
from typing import Optional

import fitz
from docx import Document
|
|
def extract_text_from_pdf(file_path: str) -> str:
    """Extract the text of every page of a PDF file using PyMuPDF.

    Extraction is best-effort: if opening the file or reading a page
    fails, the error is printed and whatever text was collected before
    the failure is returned (an empty string if nothing was read).

    Args:
        file_path: Path of the document to open with ``fitz.open``.

    Returns:
        The concatenated text of all pages read successfully.
    """
    pages = []
    try:
        # The context manager closes the document even when a page fails
        # to extract, which the manual close() call did not guarantee.
        with fitz.open(file_path) as doc:
            # Explicit loop (not a comprehension) so pages gathered before
            # a mid-document failure are still returned.
            for page in doc:
                pages.append(page.get_text())
    except Exception as e:
        print(f"ERROR: PDF extraction failed: {e}")
    return "".join(pages)
|
|
def extract_text_from_docx(file_path: str) -> str:
    """Return the paragraph text of a Word (.docx) file.

    Each paragraph is followed by a newline. On any failure the error is
    printed and the text gathered up to that point is returned.
    """
    lines = []
    try:
        document = Document(file_path)
        for paragraph in document.paragraphs:
            lines.append(paragraph.text + "\n")
    except Exception as e:
        print(f"ERROR: DOCX extraction failed: {e}")
    return "".join(lines)
|
|
def get_pdf_page_count(file_path: str) -> int:
    """Return the number of pages in a PDF, or 0 if it cannot be opened.

    Args:
        file_path: Path of the document to open with ``fitz.open``.

    Returns:
        ``len(doc)`` for the opened document; 0 on any failure.
    """
    try:
        # Context manager guarantees the document is closed on every path.
        with fitz.open(file_path) as doc:
            return len(doc)
    except Exception:
        # Best-effort: callers treat 0 as "unreadable". Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return 0
|
|
def get_pdf_page_as_image(
    file_path: str, page_num: int, zoom: float = 2.5
) -> Optional[str]:
    """Render one PDF page to a PNG file and return the file's path.

    Essential for comics/manga, which are image-based.

    Args:
        file_path: Path of the document to open with ``fitz.open``.
        page_num: Index of the page to render.
        zoom: Scale factor applied on both axes when rasterising the
            page. Defaults to 2.5, the value previously hard-coded.

    Returns:
        Path of a newly created temporary ``.png`` file, or ``None`` when
        ``page_num`` is past the last page or rendering fails. The file is
        created with ``delete=False``, so the caller is responsible for
        removing it.
    """
    tmp_path = None
    try:
        # Context manager closes the document on the early return and on
        # errors, both of which previously leaked the open handle.
        with fitz.open(file_path) as doc:
            if page_num >= len(doc):
                return None

            pix = doc[page_num].get_pixmap(matrix=fitz.Matrix(zoom, zoom))

            # Only the unique name is needed; close the handle right away
            # so the renderer can write to the path itself.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                tmp_path = tmp.name
            pix.save(tmp_path)
        return tmp_path
    except Exception as e:
        print(f"ERROR: PDF rendering failed: {e}")
        if tmp_path is not None:
            # The caller never learns this path, so do not leave an
            # orphaned temp file behind.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        return None
|
|
def get_text_from_page(file_path: str, page_num: int) -> str:
    """Try to extract digital text directly from a specific page.

    Args:
        file_path: Path of the document to open with ``fitz.open``.
        page_num: Index of the page to read.

    Returns:
        The page's text with surrounding whitespace stripped, or an empty
        string when ``page_num`` is past the last page or anything fails.
    """
    try:
        # Context manager closes the document on the out-of-range early
        # return too, which previously skipped doc.close().
        with fitz.open(file_path) as doc:
            if page_num >= len(doc):
                return ""
            return doc[page_num].get_text().strip()
    except Exception:
        # Best-effort lookup: an unreadable page is reported as "no text".
        # Catching Exception lets KeyboardInterrupt/SystemExit propagate.
        return ""
|
|
def extract_text_from_document(file_path: str) -> str:
    """Extract text from a document, dispatching on its file extension.

    ``.pdf`` and ``.epub`` go to ``extract_text_from_pdf``, ``.docx`` to
    ``extract_text_from_docx``, and ``.txt`` is read directly. The
    extension comparison is case-insensitive.

    Args:
        file_path: Path of the document. May be empty or ``None``.

    Returns:
        The extracted text, or an empty string when the path is empty,
        does not point to a regular file, or has an unsupported extension.
    """
    # isfile (rather than exists) also rejects directories, which would
    # otherwise fail when opened as a text file.
    if not file_path or not os.path.isfile(file_path):
        return ""

    ext = os.path.splitext(file_path)[1].lower()

    if ext in (".pdf", ".epub"):
        return extract_text_from_pdf(file_path)
    if ext == ".docx":
        return extract_text_from_docx(file_path)
    if ext == ".txt":
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        except UnicodeDecodeError:
            # Only a decoding failure justifies the fallback; latin-1 maps
            # every byte, so this second read cannot fail on decoding.
            with open(file_path, "r", encoding="latin-1") as f:
                return f.read()
    return ""
|
|