Spaces:

ranaspark
/

voice

Running

File size: 2,765 Bytes

import os
from typing import Optional

import fitz  # PyMuPDF
from docx import Document

def extract_text_from_pdf(file_path: str) -> str:
    """
    Extracts the text of every page of a document using PyMuPDF.

    Args:
        file_path: Path handed to ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order. If opening or
        reading fails, the error is printed and whatever text was collected
        before the failure is returned (an empty string if there was none).
    """
    page_texts = []
    try:
        doc = fitz.open(file_path)
        try:
            for page in doc:
                page_texts.append(page.get_text())
        finally:
            # Close even when a page fails to extract, so the underlying
            # file handle is not leaked.
            doc.close()
    except Exception as e:
        # Best-effort extraction: report the problem and fall through to
        # return the text gathered so far.
        print(f"ERROR: PDF extraction failed: {e}")
    return "".join(page_texts)

def extract_text_from_docx(file_path: str) -> str:
    """
    Reads a Word (.docx) file and returns its paragraph text.

    Each paragraph of the document contributes its text followed by a
    newline. If loading or reading fails, the error is printed and the
    text collected up to that point is returned (possibly empty).
    """
    lines = []
    try:
        for paragraph in Document(file_path).paragraphs:
            lines.append(paragraph.text + "\n")
    except Exception as e:
        print(f"ERROR: DOCX extraction failed: {e}")
    return "".join(lines)

def get_pdf_page_count(file_path: str) -> int:
    """
    Returns the number of pages in a document opened with PyMuPDF.

    Args:
        file_path: Path handed to ``fitz.open``.

    Returns:
        The page count, or 0 if the document cannot be opened or read.
    """
    try:
        doc = fitz.open(file_path)
        try:
            return len(doc)
        finally:
            # Always release the document, including when len() fails.
            doc.close()
    except Exception:
        # Deliberately best-effort: any failure is reported as "no pages".
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return 0

def get_pdf_page_as_image(file_path: str, page_num: int) -> Optional[str]:
    """
    Renders a PDF page as a PNG image and returns the temporary file path.
    Essential for comics/manga which are image-based.

    Args:
        file_path: Path handed to ``fitz.open``.
        page_num: Zero-based index of the page to render.

    Returns:
        Path of a newly created ``.png`` temporary file containing the
        rendered page, or ``None`` if ``page_num`` is past the last page or
        rendering fails (the error is printed). The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    import tempfile

    try:
        doc = fitz.open(file_path)
        try:
            if page_num >= len(doc):
                return None

            page = doc[page_num]
            # Balanced resolution for speed and OCR accuracy (2.5x zoom)
            matrix = fitz.Matrix(2.5, 2.5)
            pix = page.get_pixmap(matrix=matrix)

            # Reserve a unique path and close our handle before writing,
            # so the save below is not blocked by an open handle.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                tmp_img = tmp.name
            try:
                pix.save(tmp_img)
            except Exception:
                # Don't leave an orphaned empty file behind on failure.
                if os.path.exists(tmp_img):
                    os.remove(tmp_img)
                raise
            return tmp_img
        finally:
            # Runs on the early return and on errors as well, so the
            # document is never leaked.
            doc.close()
    except Exception as e:
        print(f"ERROR: PDF rendering failed: {e}")
        return None

def get_text_from_page(file_path: str, page_num: int) -> str:
    """
    Tries to extract digital text directly from a specific page.

    Args:
        file_path: Path handed to ``fitz.open``.
        page_num: Zero-based index of the page to read.

    Returns:
        The page's text with leading/trailing whitespace stripped, or an
        empty string if ``page_num`` is past the last page or the document
        cannot be opened or read.
    """
    try:
        doc = fitz.open(file_path)
        try:
            if page_num >= len(doc):
                return ""
            return doc[page_num].get_text().strip()
        finally:
            # Runs on the early return and on errors as well, so the
            # document is never leaked.
            doc.close()
    except Exception:
        # Best-effort: any failure means "no digital text available".
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return ""

def extract_text_from_document(file_path: str) -> str:
    """
    Dispatcher to extract text based on file extension.

    Args:
        file_path: Path of the document. The extension is matched
            case-insensitively.

    Returns:
        The extracted text. ``.pdf`` and ``.epub`` go through
        ``extract_text_from_pdf``, ``.docx`` through
        ``extract_text_from_docx``, and ``.txt`` files are read directly.
        An empty string is returned for an empty/missing path or any other
        extension.
    """
    if not file_path or not os.path.exists(file_path):
        return ""

    ext = os.path.splitext(file_path)[1].lower()

    if ext in (".pdf", ".epub"):
        return extract_text_from_pdf(file_path)
    if ext == ".docx":
        return extract_text_from_docx(file_path)
    if ext == ".txt":
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        except UnicodeDecodeError:
            # Only a decoding failure should trigger the fallback; latin-1
            # maps every byte to a character, so this read cannot fail on
            # decoding. Other errors (e.g. permissions) propagate as-is.
            with open(file_path, "r", encoding="latin-1") as f:
                return f.read()
    return ""