Upload folder using huggingface_hub

e4f4981 verified 9 months ago

10.9 kB

	import os
	import sys
	import subprocess
	import platform
	from pathlib import Path
	from typing import Optional, List, Dict, Any
	import logging

	# Configure logging.
	# NOTE: basicConfig() runs at import time and targets the root logger.
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)

	# Import specialized parsers.
	# When this file is executed directly (``python universal_parser.py``) it has
	# no parent package, so a relative import can never succeed; in that case the
	# sibling modules are imported by their plain names instead. Inside a package
	# the behaviour is the original relative import.
	try:
	    if __package__:
	        from .pdf_parser import extract_resume_text as extract_pdf_text
	    else:
	        from pdf_parser import extract_resume_text as extract_pdf_text
	    PDF_PARSER_AVAILABLE = True
	except ImportError:
	    PDF_PARSER_AVAILABLE = False
	    logger.warning("PDF parser not available")

	try:
	    if __package__:
	        from .word_parser import extract_word_text as extract_docx_text
	    else:
	        from word_parser import extract_word_text as extract_docx_text
	    WORD_PARSER_AVAILABLE = True
	except ImportError:
	    WORD_PARSER_AVAILABLE = False
	    logger.warning("Word parser not available")

	# Tika imports (optional dependency; TIKA_AVAILABLE records whether it loaded).
	try:
	    import tika
	    from tika import parser
	    TIKA_AVAILABLE = True
	    logger.info("Tika is available")
	except ImportError:
	    TIKA_AVAILABLE = False
	    logger.warning("Tika not available. Install with: pip install tika")

	class JavaChecker:
	    """Check for a Java runtime and try to install one for Tika.

	    All methods are static; the class only serves as a namespace.
	    """

	    @staticmethod
	    def check_java_installed() -> bool:
	        """Check if Java is installed and accessible.

	        Returns:
	            True when ``java -version`` exits with status 0; False when it
	            exits non-zero, does not finish within 10 seconds, or cannot be
	            started at all.
	        """
	        try:
	            result = subprocess.run(['java', '-version'],
	                                    capture_output=True, text=True, timeout=10)
	        except (subprocess.TimeoutExpired, OSError):
	            # OSError covers a missing binary (FileNotFoundError) as well as
	            # one that exists but cannot be executed (PermissionError).
	            result = None

	        if result is not None and result.returncode == 0:
	            logger.info("Java is already installed")
	            return True

	        logger.warning("Java not found or not accessible")
	        return False

	    @staticmethod
	    def _install_java_macos() -> bool:
	        """Install OpenJDK 11 through Homebrew and link it system-wide."""
	        logger.info("Installing Java on macOS...")
	        result = subprocess.run(['brew', 'install', 'openjdk@11'],
	                                capture_output=True, text=True, timeout=300)
	        if result.returncode != 0:
	            logger.error(f"Failed to install Java: {result.stderr}")
	            return False

	        # The Homebrew prefix is not the same on every machine, so ask brew
	        # where the formula lives and only fall back to the original
	        # hard-coded location if that query fails.
	        jdk_prefix = ''
	        try:
	            located = subprocess.run(['brew', '--prefix', 'openjdk@11'],
	                                     capture_output=True, text=True, timeout=30)
	            if located.returncode == 0:
	                jdk_prefix = located.stdout.strip()
	        except (subprocess.TimeoutExpired, OSError):
	            pass
	        if not jdk_prefix:
	            jdk_prefix = '/opt/homebrew/opt/openjdk@11'

	        # Create symlink for system-wide access. Output is not captured so
	        # that a sudo password prompt stays visible to the user.
	        link = subprocess.run(['sudo', 'ln', '-sfn',
	                               f'{jdk_prefix}/libexec/openjdk.jdk',
	                               '/Library/Java/JavaVirtualMachines/openjdk-11.jdk'])
	        if link.returncode != 0:
	            logger.warning("Could not create the system-wide JDK symlink")

	        logger.info("Java installed successfully on macOS")
	        return True

	    @staticmethod
	    def _install_java_linux() -> bool:
	        """Try apt-get, then yum, then dnf to install OpenJDK 11."""
	        logger.info("Installing Java on Linux...")
	        # Each entry is a sequence of argv lists run in order without a shell.
	        # Passing a list together with shell=True would execute only its first
	        # item, and '&&' is shell syntax, so the chained apt-get commands are
	        # expressed as two separate steps.
	        install_plans = [
	            [['sudo', 'apt-get', 'update'],
	             ['sudo', 'apt-get', 'install', '-y', 'openjdk-11-jdk']],
	            [['sudo', 'yum', 'install', '-y', 'java-11-openjdk']],
	            [['sudo', 'dnf', 'install', '-y', 'java-11-openjdk']],
	        ]

	        for steps in install_plans:
	            try:
	                # all() stops at the first failing step, like '&&' would.
	                succeeded = all(
	                    subprocess.run(step, capture_output=True, text=True,
	                                   timeout=300).returncode == 0
	                    for step in steps
	                )
	            except (subprocess.TimeoutExpired, FileNotFoundError):
	                # Timed out, or the executable is missing: try the next one.
	                continue
	            if succeeded:
	                logger.info("Java installed successfully on Linux")
	                return True

	        logger.error("Failed to install Java on Linux")
	        return False

	    @staticmethod
	    def install_java() -> bool:
	        """Install Java based on the operating system.

	        Returns:
	            True if an installation command completed successfully, False
	            otherwise. Windows and unrecognised systems always return False;
	            on Windows manual-install instructions are logged. Any exception
	            raised while installing is logged and reported as False.
	        """
	        system = platform.system().lower()

	        try:
	            if system == "darwin":  # macOS
	                return JavaChecker._install_java_macos()

	            if system == "linux":
	                return JavaChecker._install_java_linux()

	            if system == "windows":
	                logger.info("Please install Java manually on Windows:")
	                logger.info("Download from: https://adoptium.net/")
	                return False

	            logger.error(f"Unsupported operating system: {system}")
	            return False

	        except Exception as e:
	            logger.error(f"Error installing Java: {e}")
	            return False

	    @staticmethod
	    def ensure_java_available() -> bool:
	        """Ensure Java is available, install if needed.

	        Returns:
	            True if Java was already present or the installation attempt
	            reported success, False otherwise.
	        """
	        if JavaChecker.check_java_installed():
	            return True

	        logger.info("Java not found. Attempting to install...")
	        return JavaChecker.install_java()

	class UniversalParser:
	    """Universal document parser with fallback to Tika.

	    PDF and Word files are first handed to the specialized parsers when those
	    were importable; everything else, and every specialized failure, goes to
	    Apache Tika.
	    """

	    def __init__(self):
	        """Record Tika availability and make sure Java is usable for it.

	        Note that when Tika is importable this calls
	        ``JavaChecker.ensure_java_available()``, which may attempt to install
	        Java on the machine.
	        """
	        self.java_available = False
	        self.tika_available = TIKA_AVAILABLE

	        # Ensure Java is available if Tika is needed
	        if self.tika_available:
	            self.java_available = JavaChecker.ensure_java_available()

	    @staticmethod
	    def _tika_failure(file_path: str, error: str) -> Dict[str, Any]:
	        """Build the failure result returned by the Tika code path."""
	        return {
	            "success": False,
	            "file_path": file_path,
	            "error": error,
	            "method": "tika"
	        }

	    def extract_with_tika(self, file_path: str) -> Dict[str, Any]:
	        """Extract text using Apache Tika.

	        Args:
	            file_path: Path of the document to parse.

	        Returns:
	            A dict with ``success``, ``file_path`` and ``method`` ("tika").
	            On success it also carries ``text``, ``metadata`` and
	            ``content_type``; on failure it carries ``error``. Exceptions
	            raised by Tika are logged and converted into a failure result.
	        """
	        if not self.tika_available:
	            return self._tika_failure(file_path, "Tika not available")

	        if not self.java_available:
	            return self._tika_failure(file_path, "Java not available for Tika")

	        try:
	            logger.info(f"Extracting with Tika: {file_path}")

	            # Parse document with Tika
	            raw = parser.from_file(file_path)

	            # Content that is missing or only whitespace is not a usable
	            # extraction, so it is reported as a failure.
	            text = ((raw or {}).get("content") or "").strip()
	            if not text:
	                return self._tika_failure(file_path, "Tika returned no content")

	            # Guard against a "metadata" key that is present but None.
	            metadata = raw.get("metadata") or {}
	            return {
	                "success": True,
	                "file_path": file_path,
	                "text": text,
	                "metadata": metadata,
	                "content_type": metadata.get("Content-Type", ""),
	                "method": "tika"
	            }

	        except Exception as e:
	            logger.error(f"Tika extraction failed: {e}")
	            return self._tika_failure(file_path, str(e))

	    def _try_specialized(self, path: Path, label: str, extractor: Any,
	                         method: str) -> Optional[Dict[str, Any]]:
	        """Run one specialized extractor; return its result on success, else None."""
	        logger.info(f"Trying {label} parser: {path}")
	        try:
	            result = extractor(str(path))
	        except Exception as e:
	            # A crashing specialized parser must not prevent the Tika fallback.
	            logger.warning(f"{label} parser failed, trying Tika: {e}")
	            return None

	        if isinstance(result, dict) and result.get("success"):
	            result["method"] = method
	            return result

	        error = result.get("error") if isinstance(result, dict) else result
	        logger.warning(f"{label} parser failed, trying Tika: {error}")
	        return None

	    def extract_document(self, file_path: str) -> Dict[str, Any]:
	        """
	        Extract text from document using specialized parsers first,
	        then fallback to Tika if needed.

	        Args:
	            file_path: Path of the document to parse.

	        Returns:
	            The result dict of whichever extractor handled the file, with
	            ``method`` set to "pdf_parser", "word_parser" or "tika"; or a
	            failure dict with ``method`` "none" when the path does not exist.
	        """
	        path = Path(file_path)

	        if not path.exists():
	            return {
	                "success": False,
	                "file_path": str(path),
	                "error": "File not found",
	                "method": "none"
	            }

	        # Try specialized parsers first
	        suffix = path.suffix.lower()
	        result = None
	        if suffix == '.pdf' and PDF_PARSER_AVAILABLE:
	            result = self._try_specialized(path, "PDF", extract_pdf_text, "pdf_parser")
	        elif suffix in ('.docx', '.doc') and WORD_PARSER_AVAILABLE:
	            result = self._try_specialized(path, "Word", extract_docx_text, "word_parser")

	        if result is not None:
	            return result

	        # Fallback to Tika for any format
	        logger.info(f"Falling back to Tika: {path}")
	        return self.extract_with_tika(str(path))

	    def process_batch_documents(self, file_paths: List[str]) -> List[Dict[str, Any]]:
	        """
	        Process multiple documents with fallback strategy.

	        Args:
	            file_paths: Paths of the documents to process, in order.

	        Returns:
	            One result dict per input path, in the same order. Progress and
	            a summary (success/failure counts, per-method counts) are logged.
	        """
	        # Materialize once so any iterable can be counted and then iterated.
	        file_paths = list(file_paths)
	        results = []
	        total_files = len(file_paths)

	        logger.info(f"Starting batch processing of {total_files} files")

	        for i, file_path in enumerate(file_paths, 1):
	            logger.info(f"Processing file {i}/{total_files}: {file_path}")
	            result = self.extract_document(file_path)
	            results.append(result)

	            if result.get("success"):
	                logger.info(f"✓ Successfully processed: {file_path} (method: {result.get('method')})")
	            else:
	                logger.warning(f"✗ Failed to process: {file_path} - {result.get('error')}")

	        # Summary statistics
	        successful = sum(1 for r in results if r.get("success"))
	        failed = total_files - successful

	        # Method breakdown
	        methods = {}
	        for r in results:
	            method = r.get("method", "unknown")
	            methods[method] = methods.get(method, 0) + 1

	        logger.info(f"Batch processing complete: {successful} successful, {failed} failed")
	        logger.info(f"Method breakdown: {methods}")

	        return results

	def extract_any_document(file_path: str) -> Dict[str, Any]:
	    """
	    Extract text from a single document of any supported format.

	    A fresh UniversalParser is created for the call and its
	    extract_document() result is returned unchanged.

	    Args:
	        file_path: Path to the document file

	    Returns:
	        Dict containing extraction results with method used
	    """
	    return UniversalParser().extract_document(file_path)

	def process_batch_any_documents(file_paths: List[str]) -> List[Dict[str, Any]]:
	    """
	    Run batch extraction over documents of any format.

	    A fresh UniversalParser is created for the call and its
	    process_batch_documents() result is returned unchanged.

	    Args:
	        file_paths: List of file paths to process

	    Returns:
	        List of extraction results for each file
	    """
	    return UniversalParser().process_batch_documents(file_paths)

	def main() -> int:
	    """Main function for command line usage.

	    Reads the document path from ``sys.argv[1]``, extracts it and prints a
	    short report.

	    Returns:
	        Process exit status: 0 when extraction succeeded, 1 when it failed,
	        2 when no file path was given (usage is printed).
	    """
	    if len(sys.argv) <= 1:
	        print("Usage: python universal_parser.py <file_path>")
	        print("Supports: PDF, Word, PowerPoint, Excel, Text, and many more formats")
	        return 2

	    file_path = sys.argv[1]
	    result = extract_any_document(file_path)

	    if result["success"]:
	        print(f"✓ Successfully extracted text from: {file_path}")
	        print(f"Method used: {result['method']}")
	        print(f"Text length: {len(result['text'])} characters")
	        print(f"Content type: {result.get('content_type', 'N/A')}")
	        return 0

	    print(f"✗ Failed to extract text: {result['error']}")
	    print(f"Method attempted: {result.get('method', 'N/A')}")
	    return 1


	if __name__ == "__main__":
	    # Propagate the status so shell callers can detect a failed extraction.
	    sys.exit(main())