psle_video / file_processor.py
arhamTariq's picture
Upload 5 files
af5f677 verified
import os
import textract
import pandas as pd
from PIL import Image
import pytesseract
# Windows only: probe the default Tesseract install locations and point
# pytesseract at the first one that exists on disk.
if os.name == 'nt':
    tesseract_paths = [
        r'C:\Program Files\Tesseract-OCR\tesseract.exe',
        r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
    ]
    for path in tesseract_paths:
        if not os.path.exists(path):
            continue
        pytesseract.pytesseract.tesseract_cmd = path
        break

# Lower-case file extensions (no leading dot) that this module accepts.
SUPPORTED_TYPES = [
    "pdf",
    "docx",
    "doc",
    "txt",
    "xlsx",
    "csv",
    "png",
    "jpg",
    "jpeg",
]
def _extract_pdf(file_path):
    """Extract text from a PDF, trying several backends in turn.

    The backends are tried in this order: PyMuPDF (``fitz``), ``pdfplumber``,
    then ``textract``. The first backend that produces non-empty text wins.
    Any exception raised by a backend (including the backend package not
    being importable) is treated as "this backend failed" and the next one
    is tried.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text with leading/trailing whitespace removed, or
        ``""`` when every backend failed or produced no text.
    """
    # PyMuPDF (fitz). Imported lazily so that a missing package only
    # disables this backend instead of breaking the module import.
    try:
        import fitz
        doc = fitz.open(file_path)
        try:
            parts = [t for t in (page.get_text() for page in doc) if t]
        finally:
            # Always release the document, even if a page fails to parse.
            doc.close()
        text = "\n".join(parts).strip()
        if text:
            return text
    except Exception:
        # Deliberate best effort: fall through to the next backend.
        pass

    # pdfplumber
    try:
        import pdfplumber
        with pdfplumber.open(file_path) as pdf:
            parts = [t for t in (page.extract_text() for page in pdf.pages) if t]
        text = "\n".join(parts).strip()
        if text:
            return text
    except Exception:
        # Deliberate best effort: fall through to the next backend.
        pass

    # textract (last resort)
    try:
        text = textract.process(file_path).decode('utf-8', errors='replace').strip()
        if text:
            return text
    except Exception:
        # Nothing left to try; report "no text" below.
        pass

    return ""
def extract_text(file_path):
    """Extract text from a file, choosing the method by file extension.

    Handling per extension (case-insensitive):

    * ``pdf`` - delegated to ``_extract_pdf``.
    * ``doc`` / ``docx`` / ``txt`` - ``textract.process`` decoded as UTF-8
      (undecodable bytes are replaced).
    * ``xlsx`` / ``csv`` - loaded with ``pd.read_excel`` / ``pd.read_csv``
      and rendered with ``DataFrame.to_string()``.
    * ``png`` / ``jpg`` / ``jpeg`` - OCR via ``pytesseract.image_to_string``.

    Args:
        file_path: Path of the file to read. May be empty or ``None``.

    Returns:
        The extracted text with leading/trailing whitespace removed.
        Special cases:

        * ``""`` if ``file_path`` is empty, does not exist, or a document or
          spreadsheet could not be read.
        * ``"[IMAGE_FILE: Could not extract text from image]"`` if OCR ran
          but produced only whitespace.
        * ``"[IMAGE_FILE: Could not process image]"`` if opening the image
          or running OCR raised.
        * ``"[Unsupported file type: <ext>]"`` for any other extension.
    """
    if not file_path or not os.path.exists(file_path):
        return ""

    # splitext only inspects the final path component, so dots in directory
    # names and extension-less files are not mistaken for an extension.
    ext = os.path.splitext(file_path)[1].lstrip('.').lower()
    text = ""

    if ext == "pdf":
        text = _extract_pdf(file_path)
    elif ext in ["doc", "docx", "txt"]:
        try:
            text = textract.process(file_path).decode('utf-8', errors='replace')
        except Exception:
            return ""
    elif ext in ["xlsx", "csv"]:
        # Same best-effort contract as the document branch: an unreadable
        # or empty spreadsheet yields "" instead of raising to the caller.
        try:
            df = pd.read_excel(file_path) if ext == "xlsx" else pd.read_csv(file_path)
            text = df.to_string()
        except Exception:
            return ""
    elif ext in ["png", "jpg", "jpeg"]:
        try:
            # The context manager closes the underlying file handle once
            # OCR is done.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image)
            if not text.strip():
                return "[IMAGE_FILE: Could not extract text from image]"
        except Exception:
            return "[IMAGE_FILE: Could not process image]"
    else:
        return f"[Unsupported file type: {ext}]"

    return text.strip() if text else ""