import logging
import os

import pandas as pd
import pytesseract
import textract
from PIL import Image

_logger = logging.getLogger(__name__)

# Try to set Tesseract path for Windows: use the first default install
# location that actually exists; otherwise leave pytesseract's default.
if os.name == 'nt':
    tesseract_paths = [
        r'C:\Program Files\Tesseract-OCR\tesseract.exe',
        r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
    ]
    for path in tesseract_paths:
        if os.path.exists(path):
            pytesseract.pytesseract.tesseract_cmd = path
            break

SUPPORTED_TYPES = ["pdf", "docx", "doc", "txt", "xlsx", "csv", "png", "jpg", "jpeg"]


def _pdf_text_pymupdf(file_path):
    """Return the text of every page of a PDF using PyMuPDF (fitz)."""
    import fitz  # optional dependency, imported lazily

    doc = fitz.open(file_path)
    try:
        parts = [t for t in (page.get_text() for page in doc) if t]
    finally:
        # Close the document even when a page fails to extract.
        doc.close()
    return "\n".join(parts).strip()


def _pdf_text_pdfplumber(file_path):
    """Return the text of every page of a PDF using pdfplumber."""
    import pdfplumber  # optional dependency, imported lazily

    with pdfplumber.open(file_path) as pdf:
        parts = [t for t in (page.extract_text() for page in pdf.pages) if t]
    return "\n".join(parts).strip()


def _pdf_text_textract(file_path):
    """Return the text of a PDF using textract."""
    return textract.process(file_path).decode('utf-8', errors='replace').strip()


def _extract_pdf(file_path):
    """Extract text from a PDF, trying PyMuPDF, pdfplumber, then textract.

    Each backend is attempted in turn; a backend that raises (including a
    failed import of an optional library) or yields no text is skipped.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The stripped text from the first backend that produced any, or ""
        when every backend failed or found no text.
    """
    backends = (_pdf_text_pymupdf, _pdf_text_pdfplumber, _pdf_text_textract)
    for backend in backends:
        try:
            text = backend(file_path)
        except Exception:
            # Deliberate best-effort: record the failure and fall through
            # to the next backend instead of aborting the extraction.
            _logger.debug(
                "%s failed for %s", backend.__name__, file_path, exc_info=True
            )
            continue
        if text:
            return text
    return ""


def extract_text(file_path):
    """Extract text from a file, dispatching on its extension.

    Args:
        file_path: Path of the file to read.

    Returns:
        The stripped extracted text. Returns "" when the path is empty or
        does not exist, when nothing could be extracted, or when a
        document/spreadsheet reader fails. For images, returns an
        "[IMAGE_FILE: ...]" message when OCR fails or finds no text. For
        any other extension, returns "[Unsupported file type: <ext>]".
    """
    if not file_path or not os.path.exists(file_path):
        return ""

    # splitext only looks at the final path component, so dots in directory
    # names are not mistaken for an extension.
    ext = os.path.splitext(file_path)[1].lstrip('.').lower()
    text = ""

    if ext == "pdf":
        text = _extract_pdf(file_path)
    elif ext in ("doc", "docx", "txt"):
        try:
            text = textract.process(file_path).decode('utf-8', errors='replace')
        except Exception:
            _logger.warning(
                "Could not extract text from %s", file_path, exc_info=True
            )
            return ""
    elif ext in ("xlsx", "csv"):
        try:
            if ext == "xlsx":
                df = pd.read_excel(file_path)
            else:
                df = pd.read_csv(file_path)
            text = df.to_string()
        except Exception:
            # Same best-effort contract as the document branch above.
            _logger.warning(
                "Could not read spreadsheet %s", file_path, exc_info=True
            )
            return ""
    elif ext in ("png", "jpg", "jpeg"):
        try:
            # The context manager releases the image's file handle.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image)
            if not text.strip():
                return "[IMAGE_FILE: Could not extract text from image]"
        except Exception:
            _logger.warning(
                "Could not process image %s", file_path, exc_info=True
            )
            return "[IMAGE_FILE: Could not process image]"
    else:
        return f"[Unsupported file type: {ext}]"

    return text.strip() if text else ""