# TranscriptWriting/extractors.py
from docx import Document
import pdfplumber
import re
from typing import Tuple
import os


def extract_docx(file_obj) -> str:
"""
Extract text from DOCX with enhanced error handling and formatting preservation
"""
try:
doc = Document(file_obj)
# Extract paragraphs with better handling
paragraphs = []
for para in doc.paragraphs:
text = para.text.strip()
if text: # Only include non-empty paragraphs
paragraphs.append(text)
# Also extract text from tables
for table in doc.tables:
for row in table.rows:
row_text = []
for cell in row.cells:
cell_text = cell.text.strip()
if cell_text:
row_text.append(cell_text)
if row_text:
paragraphs.append(" | ".join(row_text))
extracted_text = "\n\n".join(paragraphs)
# Clean up common issues
extracted_text = clean_extracted_text(extracted_text)
return extracted_text
except Exception as e:
error_msg = f"[DOCX Extraction Error] {str(e)}"
print(error_msg)
return f"Error extracting DOCX: {str(e)}"
def extract_pdf(file_obj) -> str:
"""
Extract text from PDF with multiple strategies and enhanced error handling
"""
try:
extracted_pages = []
with pdfplumber.open(file_obj) as pdf:
# Track extraction success
successful_pages = 0
total_pages = len(pdf.pages)
for page_num, page in enumerate(pdf.pages, 1):
try:
# Strategy 1: Standard text extraction
page_text = page.extract_text()
# Strategy 2: If standard fails, try with layout
if not page_text or len(page_text.strip()) < 50:
page_text = page.extract_text(layout=True)
# Strategy 3: If still poor, try with custom settings
if not page_text or len(page_text.strip()) < 50:
page_text = page.extract_text(
x_tolerance=2,
y_tolerance=2
)
if page_text and page_text.strip():
# Clean and add page marker
clean_text = page_text.strip()
extracted_pages.append(f"--- Page {page_num} ---\n{clean_text}")
successful_pages += 1
else:
print(f"[PDF Warning] Page {page_num} yielded no text")
except Exception as page_error:
print(f"[PDF Warning] Error on page {page_num}: {page_error}")
continue
if successful_pages == 0:
return "[PDF Error] No text could be extracted from any page. The PDF may be image-based or corrupted."
if successful_pages < total_pages * 0.5:
print(f"[PDF Warning] Only {successful_pages}/{total_pages} pages extracted successfully")
full_text = "\n\n".join(extracted_pages)
# Clean up the extracted text
full_text = clean_extracted_text(full_text)
return full_text
except Exception as e:
error_msg = f"[PDF Extraction Error] {str(e)}"
print(error_msg)
return f"Error extracting PDF: {str(e)}"
def clean_extracted_text(text: str) -> str:
"""
Clean up common issues in extracted text
"""
# Remove excessive whitespace
text = re.sub(r'\n{3,}', '\n\n', text)
text = re.sub(r' {2,}', ' ', text)
# Remove page numbers that appear alone on lines
text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
# Remove common headers/footers patterns
text = re.sub(r'^\s*Page \d+ of \d+\s*$', '', text, flags=re.MULTILINE)
text = re.sub(r'^\s*\d+/\d+\s*$', '', text, flags=re.MULTILINE)
    # Fix common OCR issues: normalize curly quotes and dashes
    text = text.replace('\u2019', "'")  # Right single (curly) apostrophe
    text = text.replace('\u2018', "'")  # Left single (curly) apostrophe
    text = text.replace('\u201c', '"')  # Left curly double quote
    text = text.replace('\u201d', '"')  # Right curly double quote
    text = text.replace('\u2013', '-')  # En dash
    text = text.replace('\u2014', '-')  # Em dash

    # Remove zero-width characters
    text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)

    return text.strip()

def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
"""
Validate extracted text quality
"""
# Check if text is empty
if not text or not text.strip():
return False, "No text extracted"
# Check for minimum length
if len(text) < 100:
return False, f"Extracted text too short ({len(text)} characters)"
# Check for error messages
if text.startswith("Error") or text.startswith("["):
return False, "Extraction error detected"
# Check for gibberish (too many non-alphanumeric characters)
#alphanumeric = sum(c.isalnum() or c.isspace() for c in text)
#ratio = alphanumeric / len(text) if text else 0
#if ratio < 0.2:
# return False, f"Text appears garbled (only {ratio*100:.1f}% readable)"
# Check word count
words = text.split()
if len(words) < 50:
return False, f"Too few words ({len(words)})"
# Check for reasonable word lengths (catch binary junk)
#avg_word_length = sum(len(w) for w in words) / len(words) if words else 0
#if avg_word_length < 2 or avg_word_length > 20:
# return False, f"Unusual average word length ({avg_word_length:.1f})"
# All checks passed
return True, f"Valid extraction: {len(words)} words, {len(text)} characters"
def detect_file_encoding(file_path: str) -> str:
"""
Detect file encoding for text files
"""
try:
import chardet
with open(file_path, 'rb') as f:
raw_data = f.read()
result = chardet.detect(raw_data)
return result['encoding']
except:
return 'utf-8' # Default fallback
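
# A small convenience helper sketched here for illustration (not part of the
# original upload): it shows how detect_file_encoding might be paired with a
# plain read when the caller has a file path rather than an open file object.
# The helper name and the errors='replace' policy are assumptions.
def read_text_with_detected_encoding(file_path: str) -> str:
    """
    Read a text file using the encoding reported by detect_file_encoding,
    falling back to UTF-8 when detection returns nothing.
    """
    encoding = detect_file_encoding(file_path) or 'utf-8'
    with open(file_path, 'r', encoding=encoding, errors='replace') as f:
        return f.read()
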
def extract_text_file(file_obj) -> str:
"""
Extract from plain text file with encoding detection
"""
try:
# Try UTF-8 first
try:
return file_obj.read().decode('utf-8')
except UnicodeDecodeError:
# Try other common encodings
file_obj.seek(0)
try:
return file_obj.read().decode('latin-1')
except:
file_obj.seek(0)
return file_obj.read().decode('cp1252')
except Exception as e:
return f"Error reading text file: {str(e)}"