Spaces:

arhamTariq
/

psle_video

Sleeping

App Files Files Community

psle_video / file_processor.py

arhamTariq

Upload 5 files

af5f677 verified about 2 months ago

raw

history blame contribute delete

2.82 kB

	import os
	import textract
	import pandas as pd
	from PIL import Image
	import pytesseract

	# On Windows, probe the default Tesseract install locations and point
	# pytesseract at the first executable that exists on disk.
	if os.name == 'nt':
	    tesseract_paths = [
	        r'C:\Program Files\Tesseract-OCR\tesseract.exe',
	        r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
	    ]
	    for path in tesseract_paths:
	        if not os.path.exists(path):
	            continue
	        # First hit wins; the remaining candidates are not checked.
	        pytesseract.pytesseract.tesseract_cmd = path
	        break

	# File extensions (lower-case, without the leading dot) listed as supported.
	SUPPORTED_TYPES = [
	    "pdf",
	    "docx",
	    "doc",
	    "txt",
	    "xlsx",
	    "csv",
	    "png",
	    "jpg",
	    "jpeg",
	]


	def _extract_pdf(file_path):
	"""Extract text from PDF. Try pymupdf, pdfplumber, then textract."""
	# PyMuPDF (fitz) - very reliable, handles most PDFs
	try:
	import fitz
	doc = fitz.open(file_path)
	parts = []
	for page in doc:
	t = page.get_text()
	if t:
	parts.append(t)
	doc.close()
	text = "\n".join(parts).strip() if parts else ""
	if text:
	return text
	except Exception:
	pass
	# pdfplumber
	try:
	import pdfplumber
	with pdfplumber.open(file_path) as pdf:
	parts = []
	for page in pdf.pages:
	t = page.extract_text()
	if t:
	parts.append(t)
	text = "\n".join(parts).strip() if parts else ""
	if text:
	return text
	except Exception:
	pass
	# textract (last resort)
	try:
	text = textract.process(file_path).decode('utf-8', errors='replace').strip()
	if text:
	return text
	except Exception:
	pass
	return ""


	def extract_text(file_path):
	    """Extract plain text from a document, spreadsheet or image file.

	    The file type is chosen from the extension of the final path component
	    (case-insensitive):

	    * ``pdf`` is delegated to ``_extract_pdf``.
	    * ``doc`` / ``docx`` / ``txt`` are read with textract and decoded as UTF-8.
	    * ``xlsx`` / ``csv`` are loaded with pandas and rendered with
	      ``DataFrame.to_string()``.
	    * ``png`` / ``jpg`` / ``jpeg`` are run through Tesseract OCR.

	    Args:
	        file_path: Path of the file to read.

	    Returns:
	        The extracted text with surrounding whitespace stripped. Returns
	        ``""`` when the path is empty or missing, or when a document or
	        spreadsheet cannot be parsed. For images that cannot be opened or
	        yield no text, returns one of the ``"[IMAGE_FILE: ...]"`` marker
	        strings. For any other extension, returns
	        ``"[Unsupported file type: <ext>]"``.
	    """
	    import logging  # local import keeps this block self-contained

	    if not file_path or not os.path.exists(file_path):
	        return ""

	    log = logging.getLogger(__name__)

	    # splitext only inspects the last path component, so a dot in a directory
	    # name (e.g. "data.v2/README") is not mistaken for a file extension.
	    ext = os.path.splitext(file_path)[1][1:].lower()
	    text = ""

	    if ext == "pdf":
	        text = _extract_pdf(file_path)

	    elif ext in ["doc", "docx", "txt"]:
	        try:
	            text = textract.process(file_path).decode('utf-8', errors='replace')
	        except Exception:
	            log.debug("textract failed for %r", file_path, exc_info=True)
	            return ""

	    elif ext in ["xlsx", "csv"]:
	        # Empty or corrupt spreadsheets make pandas raise; degrade to "" like
	        # the document branch instead of crashing the caller.
	        try:
	            if ext == "xlsx":
	                df = pd.read_excel(file_path)
	            else:
	                df = pd.read_csv(file_path)
	            text = df.to_string()
	        except Exception:
	            log.debug("pandas could not read %r", file_path, exc_info=True)
	            return ""

	    elif ext in ["png", "jpg", "jpeg"]:
	        try:
	            # The context manager releases the image's file handle after OCR.
	            with Image.open(file_path) as image:
	                text = pytesseract.image_to_string(image)
	        except Exception:
	            log.debug("OCR failed for %r", file_path, exc_info=True)
	            return "[IMAGE_FILE: Could not process image]"
	        if not text.strip():
	            return "[IMAGE_FILE: Could not extract text from image]"

	    else:
	        return f"[Unsupported file type: {ext}]"

	    return text.strip() if text else ""