File size: 2,765 Bytes
e87f3ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0bfc45
 
e87f3ac
 
 
 
 
 
 
 
 
 
 
ac3541e
 
 
 
 
 
 
 
 
 
 
 
 
 
e87f3ac
 
 
 
 
 
 
 
 
4c4ec72
e87f3ac
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
from typing import Optional

import fitz  # PyMuPDF
from docx import Document

def extract_text_from_pdf(file_path: str) -> str:
    """
    Extracts the text of every page of a PDF file using PyMuPDF.

    Args:
        file_path: Path of the document to open with ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order. If opening or
        reading fails, the error is printed and whatever text was collected
        before the failure (possibly an empty string) is returned.
    """
    # Collect per-page text in a list and join once at the end instead of
    # growing a string with repeated "+=".
    page_texts = []
    try:
        doc = fitz.open(file_path)
        try:
            for page in doc:
                page_texts.append(page.get_text())
        finally:
            # Always release the document, even if a page fails to extract.
            doc.close()
    except Exception as e:
        print(f"ERROR: PDF extraction failed: {e}")
    return "".join(page_texts)

def extract_text_from_docx(file_path: str) -> str:
    """
    Reads a Word (.docx) file and returns its paragraph text.

    Each paragraph's text is followed by a newline. If the document cannot
    be read, the error is printed and the text gathered so far (possibly
    empty) is returned.
    """
    collected = []
    try:
        for paragraph in Document(file_path).paragraphs:
            collected.append(paragraph.text + "\n")
    except Exception as e:
        print(f"ERROR: DOCX extraction failed: {e}")
    return "".join(collected)

def get_pdf_page_count(file_path: str) -> int:
    """
    Returns the number of pages in a document opened with PyMuPDF.

    Args:
        file_path: Path of the document to open with ``fitz.open``.

    Returns:
        The page count, or 0 if the document cannot be opened or counted.
    """
    try:
        doc = fitz.open(file_path)
        try:
            return len(doc)
        finally:
            # Release the handle whether or not counting succeeded.
            doc.close()
    except Exception:
        # Best-effort helper: any failure is reported as "no pages".
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return 0

def get_pdf_page_as_image(file_path: str, page_num: int) -> Optional[str]:
    """
    Renders a PDF page as a PNG image and returns the temporary file path.
    Essential for comics/manga which are image-based.

    Args:
        file_path: Path of the document to open with ``fitz.open``.
        page_num: Zero-based index of the page to render.

    Returns:
        Path of a newly created ``.png`` temporary file holding the rendered
        page, or ``None`` if ``page_num`` is past the last page or rendering
        fails (the error is printed). The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    import tempfile

    tmp_img = None
    try:
        doc = fitz.open(file_path)
        try:
            if page_num >= len(doc):
                return None

            page = doc[page_num]
            # Balanced resolution for speed and OCR accuracy (2.5x zoom)
            matrix = fitz.Matrix(2.5, 2.5)
            pix = page.get_pixmap(matrix=matrix)

            # Reserve a unique path, then close the handle before writing so
            # the pixmap can be saved to that path on every platform.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as handle:
                tmp_img = handle.name
            pix.save(tmp_img)
        finally:
            # Close the document on every path, including the early return.
            doc.close()
        return tmp_img
    except Exception as e:
        print(f"ERROR: PDF rendering failed: {e}")
        # Do not leave an orphaned temp file behind when rendering failed.
        if tmp_img is not None:
            try:
                os.remove(tmp_img)
            except OSError:
                pass
        return None

def get_text_from_page(file_path: str, page_num: int) -> str:
    """
    Tries to extract digital text directly from a specific page.

    Args:
        file_path: Path of the document to open with ``fitz.open``.
        page_num: Zero-based index of the page to read.

    Returns:
        The page text with leading/trailing whitespace stripped, or an empty
        string if ``page_num`` is past the last page or extraction fails.
    """
    try:
        doc = fitz.open(file_path)
        try:
            if page_num >= len(doc):
                return ""
            return doc[page_num].get_text().strip()
        finally:
            # Close the document on every path, including the early return.
            doc.close()
    except Exception:
        # Best-effort helper: failures are reported as "no text".
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return ""

def extract_text_from_document(file_path: str) -> str:
    """
    Dispatcher to extract text based on file extension.

    Args:
        file_path: Path of the document. The extension is matched
            case-insensitively.

    Returns:
        The extracted text. An empty string is returned when ``file_path``
        is empty/``None``, does not exist, or has an unsupported extension.

    Raises:
        OSError: If a ``.txt`` file exists but cannot be opened or read.
    """
    if not file_path or not os.path.exists(file_path):
        return ""

    ext = os.path.splitext(file_path)[1].lower()

    # .epub is routed through the same PyMuPDF-based extractor as .pdf.
    if ext in (".pdf", ".epub"):
        return extract_text_from_pdf(file_path)
    if ext == ".docx":
        return extract_text_from_docx(file_path)
    if ext == ".txt":
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        except UnicodeDecodeError:
            # Not valid UTF-8: fall back to latin-1, which can decode any
            # byte sequence. Other errors (e.g. permissions) are not retried.
            with open(file_path, "r", encoding="latin-1") as f:
                return f.read()
    return ""