File size: 2,822 Bytes
af5f677
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import logging
import os

import pandas as pd
import pytesseract
import textract
from PIL import Image

# Windows only: point pytesseract at the first of the candidate
# tesseract.exe locations below that actually exists on disk.
if os.name == 'nt':
    tesseract_paths = [
        r'C:\Program Files\Tesseract-OCR\tesseract.exe',
        r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
    ]
    for path in tesseract_paths:
        if not os.path.exists(path):
            continue
        pytesseract.pytesseract.tesseract_cmd = path
        break

# File extensions (lowercase, without the dot) handled by extract_text().
SUPPORTED_TYPES = [
    "pdf",
    "docx", "doc", "txt",
    "xlsx", "csv",
    "png", "jpg", "jpeg",
]


def _extract_pdf(file_path):
    """Extract text from PDF. Try pymupdf, pdfplumber, then textract."""
    # PyMuPDF (fitz) - very reliable, handles most PDFs
    try:
        import fitz
        doc = fitz.open(file_path)
        parts = []
        for page in doc:
            t = page.get_text()
            if t:
                parts.append(t)
        doc.close()
        text = "\n".join(parts).strip() if parts else ""
        if text:
            return text
    except Exception:
        pass
    # pdfplumber
    try:
        import pdfplumber
        with pdfplumber.open(file_path) as pdf:
            parts = []
            for page in pdf.pages:
                t = page.extract_text()
                if t:
                    parts.append(t)
            text = "\n".join(parts).strip() if parts else ""
            if text:
                return text
    except Exception:
        pass
    # textract (last resort)
    try:
        text = textract.process(file_path).decode('utf-8', errors='replace').strip()
        if text:
            return text
    except Exception:
        pass
    return ""


def extract_text(file_path):
    """Extract plain text from a file, dispatching on its extension.

    Handled extensions (case-insensitive):

    * ``pdf`` -- delegated to ``_extract_pdf``.
    * ``doc``, ``docx``, ``txt`` -- read through ``textract``.
    * ``xlsx``, ``csv`` -- loaded with pandas and rendered with
      ``DataFrame.to_string()``.
    * ``png``, ``jpg``, ``jpeg`` -- OCR via ``pytesseract``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text with surrounding whitespace stripped, or one of
        these fallbacks:

        * ``""`` when ``file_path`` is empty or does not exist, when a
          document/spreadsheet cannot be read, or when nothing was extracted.
        * ``"[IMAGE_FILE: Could not extract text from image]"`` when OCR
          ran but found no text.
        * ``"[IMAGE_FILE: Could not process image]"`` when the image could
          not be opened or OCR failed.
        * ``"[Unsupported file type: <ext>]"`` for any other extension.
    """
    if not file_path or not os.path.exists(file_path):
        return ""

    log = logging.getLogger(__name__)

    # splitext only inspects the last path component, so a dot in a
    # directory name (e.g. "v1.2/README") is not mistaken for an extension.
    ext = os.path.splitext(file_path)[1].lstrip('.').lower()
    text = ""

    if ext == "pdf":
        text = _extract_pdf(file_path)

    elif ext in ["doc", "docx", "txt"]:
        try:
            text = textract.process(file_path).decode('utf-8', errors='replace')
        except Exception:
            log.warning("Could not extract text from %s",
                        file_path, exc_info=True)
            return ""

    elif ext in ["xlsx", "csv"]:
        # Unreadable or empty spreadsheets return "" like the document
        # branch above, rather than propagating the pandas error.
        try:
            df = pd.read_excel(file_path) if ext == "xlsx" else pd.read_csv(file_path)
            text = df.to_string()
        except Exception:
            log.warning("Could not read spreadsheet %s",
                        file_path, exc_info=True)
            return ""

    elif ext in ["png", "jpg", "jpeg"]:
        try:
            # The context manager releases the image file handle after OCR.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image)
            if not text.strip():
                return "[IMAGE_FILE: Could not extract text from image]"
        except Exception:
            log.warning("Could not process image %s",
                        file_path, exc_info=True)
            return "[IMAGE_FILE: Could not process image]"

    else:
        return f"[Unsupported file type: {ext}]"

    return text.strip() if text else ""