Spaces:
Sleeping
Sleeping
File size: 2,822 Bytes
af5f677 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 | import os
import textract
import pandas as pd
from PIL import Image
import pytesseract
# On Windows, pytesseract cannot always find the Tesseract binary on PATH,
# so probe the default installer locations and use the first one present.
if os.name == 'nt':
    tesseract_paths = [
        r'C:\Program Files\Tesseract-OCR\tesseract.exe',
        r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
    ]
    for path in tesseract_paths:
        if not os.path.exists(path):
            continue
        pytesseract.pytesseract.tesseract_cmd = path
        break

# File extensions (lowercase, without the leading dot) this module handles.
SUPPORTED_TYPES = [
    "pdf",
    "docx",
    "doc",
    "txt",
    "xlsx",
    "csv",
    "png",
    "jpg",
    "jpeg",
]
def _extract_pdf(file_path):
    """Extract text from a PDF, trying several backends in order.

    Backends are attempted in this order: PyMuPDF (``fitz``), ``pdfplumber``,
    then ``textract``. The first backend that yields non-empty text wins.
    A backend that is not installed, or that raises while reading the file,
    is skipped and the next one is tried.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text with surrounding whitespace stripped, or ``""``
        when no backend produced any text.
    """
    import logging

    log = logging.getLogger(__name__)

    def _via_pymupdf():
        # Join the non-empty text of every page, one page per line group.
        import fitz

        doc = fitz.open(file_path)
        try:
            return "\n".join(t for t in (page.get_text() for page in doc) if t)
        finally:
            # Release the document even if a page fails to render.
            doc.close()

    def _via_pdfplumber():
        # Same page-by-page join, using pdfplumber's extractor.
        import pdfplumber

        with pdfplumber.open(file_path) as pdf:
            return "\n".join(
                t for t in (page.extract_text() for page in pdf.pages) if t
            )

    def _via_textract():
        # Last resort: let textract pick its own PDF strategy.
        return textract.process(file_path).decode('utf-8', errors='replace')

    backends = (
        ("pymupdf", _via_pymupdf),
        ("pdfplumber", _via_pdfplumber),
        ("textract", _via_textract),
    )
    for backend_name, extract in backends:
        try:
            text = extract().strip()
        except Exception:
            # Best-effort chain: a missing or failing backend must not abort
            # the remaining attempts, but leave a trace for debugging.
            log.debug(
                "PDF backend %s failed for %r", backend_name, file_path,
                exc_info=True,
            )
            continue
        if text:
            return text
    return ""
def extract_text(file_path):
    """Extract plain text from a document, spreadsheet, or image file.

    The file type is chosen from the extension of the final path component
    (case-insensitive).

    Args:
        file_path: Path of the file to read. A falsy value or a path that
            does not exist yields an empty string.

    Returns:
        One of the following strings:

        * the extracted text, stripped of surrounding whitespace;
        * ``""`` when the path is missing, or a PDF, Word, text, or
          spreadsheet file could not be read or contained no text;
        * ``"[IMAGE_FILE: Could not extract text from image]"`` when OCR
          ran but found no text;
        * ``"[IMAGE_FILE: Could not process image]"`` when the image could
          not be opened or OCR failed;
        * ``"[Unsupported file type: <ext>]"`` for any other extension.
    """
    import logging

    log = logging.getLogger(__name__)

    if not file_path or not os.path.exists(file_path):
        return ""

    # splitext only inspects the last path component, so dots in directory
    # names and extensionless files are not mistaken for an extension.
    ext = os.path.splitext(file_path)[1].lstrip('.').lower()

    if ext == "pdf":
        text = _extract_pdf(file_path)
    elif ext in ("doc", "docx", "txt"):
        try:
            text = textract.process(file_path).decode('utf-8', errors='replace')
        except Exception:
            log.warning("Could not extract text from %r", file_path, exc_info=True)
            return ""
    elif ext in ("xlsx", "csv"):
        # Guarded like the other branches: an empty or corrupt spreadsheet
        # yields "" instead of propagating a pandas exception.
        try:
            df = pd.read_excel(file_path) if ext == "xlsx" else pd.read_csv(file_path)
            text = df.to_string()
        except Exception:
            log.warning("Could not read spreadsheet %r", file_path, exc_info=True)
            return ""
    elif ext in ("png", "jpg", "jpeg"):
        try:
            # Context manager closes the underlying image file handle.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image)
                if not text.strip():
                    return "[IMAGE_FILE: Could not extract text from image]"
        except Exception:
            log.warning("Could not OCR image %r", file_path, exc_info=True)
            return "[IMAGE_FILE: Could not process image]"
    else:
        return f"[Unsupported file type: {ext}]"

    return text.strip() if text else ""
|