Spaces:
Sleeping
Sleeping
| import os | |
| import textract | |
| import pandas as pd | |
| from PIL import Image | |
| import pytesseract | |
# On Windows, point pytesseract at the first of the standard install
# locations that exists; if none exists the default command is left as is.
if os.name == 'nt':
    tesseract_paths = [
        r'C:\Program Files\Tesseract-OCR\tesseract.exe',
        r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
    ]
    path = next(filter(os.path.exists, tesseract_paths), None)
    if path is not None:
        pytesseract.pytesseract.tesseract_cmd = path

# File extensions (lowercase, no leading dot) that extract_text has a branch for.
SUPPORTED_TYPES = [
    "pdf",
    "docx", "doc", "txt",
    "xlsx", "csv",
    "png", "jpg", "jpeg",
]
def _extract_pdf(file_path):
    """Extract text from a PDF, trying several backends in turn.

    Backends are tried in this order: PyMuPDF (``fitz``), ``pdfplumber``,
    then ``textract``. The first one that yields non-empty text wins. Any
    exception raised by a backend (including its import failing) is
    swallowed on purpose so that the next backend gets a chance.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text with surrounding whitespace stripped, or ``""``
        if no backend produced any text.
    """
    # 1) PyMuPDF (fitz)
    try:
        import fitz

        doc = fitz.open(file_path)
        try:
            page_texts = [page.get_text() for page in doc]
        finally:
            # Release the document even when a page fails to extract.
            doc.close()
        text = "\n".join(t for t in page_texts if t).strip()
        if text:
            return text
    except Exception:
        # Best-effort: fall through to the next backend.
        pass

    # 2) pdfplumber
    try:
        import pdfplumber

        with pdfplumber.open(file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages without text contribute a falsy value and are skipped.
        text = "\n".join(t for t in page_texts if t).strip()
        if text:
            return text
    except Exception:
        # Best-effort: fall through to the last backend.
        pass

    # 3) textract (last resort)
    try:
        raw = textract.process(file_path)
        text = raw.decode('utf-8', errors='replace').strip()
        if text:
            return text
    except Exception:
        pass

    return ""
def extract_text(file_path):
    """Extract the text content of a file, dispatching on its extension.

    Args:
        file_path: Path of the file to read. The file type is taken from
            the extension of the final path component (case-insensitive).

    Returns:
        The extracted text with surrounding whitespace stripped. Special
        cases:

        * ``""`` if ``file_path`` is empty or does not exist, or if a
          PDF / Word / text / spreadsheet file could not be read.
        * ``"[IMAGE_FILE: Could not extract text from image]"`` if OCR ran
          but produced only whitespace.
        * ``"[IMAGE_FILE: Could not process image]"`` if opening the image
          or running OCR raised.
        * ``"[Unsupported file type: <ext>]"`` for any other extension
          (``<ext>`` is empty when the file has no extension).
    """
    if not file_path or not os.path.exists(file_path):
        return ""

    # splitext only inspects the final path component, so a dot in a
    # directory name (or a file with no extension) is not mistaken for one.
    ext = os.path.splitext(file_path)[1].lstrip('.').lower()

    if ext == "pdf":
        text = _extract_pdf(file_path)
    elif ext in ("doc", "docx", "txt"):
        try:
            text = textract.process(file_path).decode('utf-8', errors='replace')
        except Exception:
            return ""
    elif ext in ("xlsx", "csv"):
        # An unreadable or empty spreadsheet yields "" like the other
        # document branches instead of propagating the parser's exception.
        try:
            df = pd.read_excel(file_path) if ext == "xlsx" else pd.read_csv(file_path)
            text = df.to_string()
        except Exception:
            return ""
    elif ext in ("png", "jpg", "jpeg"):
        try:
            # Context manager closes the underlying image file after OCR.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image)
            if not text.strip():
                return "[IMAGE_FILE: Could not extract text from image]"
        except Exception:
            return "[IMAGE_FILE: Could not process image]"
    else:
        return f"[Unsupported file type: {ext}]"

    return text.strip() if text else ""