Spaces:
Building
Building
"""
Base analyzer module providing shared SpaCy infrastructure.
Eliminates code duplication and provides common functionality for all SpaCy-based analyzers.
"""
| import spacy | |
| from typing import Dict, List, Any, Optional, Iterator, Tuple, TYPE_CHECKING | |
| import logging | |
| import tempfile | |
| from pathlib import Path | |
| import os | |
| from .app_config import AppConfig | |
| from .text_utility import TextUtility | |
# Module logger. NOTE: this must be defined BEFORE the optional-import block
# below — the except branch logs a warning, and in the original code `logger`
# was defined only after the try/except, so a failed UniDic import raised
# NameError instead of degrading gracefully.
logger = logging.getLogger(__name__)

# Import UniDic extensions and enricher (optional dependency — Japanese only).
try:
    from . import unidic_extensions  # side effect: registers the token extensions
    from .unidic_enricher import UniDicEnricher
    UNIDIC_AVAILABLE = True
except ImportError as e:
    logger.warning(f"UniDic integration not available: {e}")
    UNIDIC_AVAILABLE = False
    UniDicEnricher = None

if TYPE_CHECKING:
    import spacy
class BaseAnalyzer:
    """
    Base class for all SpaCy-based text analyzers.

    Provides shared model loading (with optional GPU enforcement), document
    processing, and token/sentence utility functions.
    """

    def __init__(self, language: Optional[str] = None, model_size: Optional[str] = None,
                 gpu_device: Optional[int] = None):
        """
        Initialize the base analyzer.

        Args:
            language: Language code ('en' or 'ja'). Defaults to AppConfig.DEFAULT_LANGUAGE.
            model_size: Model size ('md' or 'trf'). Defaults to AppConfig.DEFAULT_MODEL_SIZE.
            gpu_device: GPU device ID to use (None for auto-detect, -1 for CPU only).
        """
        self.language = language or AppConfig.DEFAULT_LANGUAGE
        self.model_size = model_size or AppConfig.DEFAULT_MODEL_SIZE
        self.gpu_device = gpu_device
        self.nlp = None  # spaCy Language pipeline, set by _load_spacy_model()
        self._model_info: Dict[str, Any] = {}
        self.unidic_enricher = None
        self._using_gpu = False
        self._load_spacy_model()
        # UniDic enrichment is Japanese-only and best-effort: a failure here
        # must not prevent the analyzer from working without enrichment.
        if self.language == 'ja' and UNIDIC_AVAILABLE:
            try:
                self.unidic_enricher = UniDicEnricher()
                logger.info("UniDic enricher initialized for Japanese analysis")
            except Exception as e:
                logger.warning(f"Failed to initialize UniDic enricher: {e}")
                self.unidic_enricher = None

    def _detect_gpu_availability(self) -> Tuple[bool, Optional[str], Optional[int]]:
        """
        Detect if GPU/CUDA is available for spaCy processing.

        Returns:
            Tuple of (is_available, device_name, device_id). Device name/id are
            None when no GPU is available or detection fails.
        """
        try:
            import torch
            if torch.cuda.is_available():
                device_count = torch.cuda.device_count()
                if device_count > 0:
                    # Use the requested device, clamped to what actually exists;
                    # otherwise default to device 0.
                    if self.gpu_device is not None and self.gpu_device >= 0:
                        device_id = min(self.gpu_device, device_count - 1)
                    else:
                        device_id = 0
                    device_name = torch.cuda.get_device_name(device_id)
                    return True, device_name, device_id
            return False, None, None
        except ImportError:
            logger.debug("PyTorch not available - GPU support disabled")
            return False, None, None
        except Exception as e:
            # Detection must never crash the analyzer; fall back to CPU.
            logger.warning(f"Error detecting GPU: {e}")
            return False, None, None

    def _configure_gpu_for_spacy(self) -> bool:
        """
        Configure spaCy to use GPU if available with strong enforcement.

        Honors two opt-outs: gpu_device == -1 and SPACY_USE_GPU=false.

        Returns:
            True if GPU was successfully configured, False otherwise.
        """
        # Explicit opt-out via constructor argument.
        if self.gpu_device == -1:
            logger.info("GPU explicitly disabled by user")
            return False
        # Explicit opt-out via environment variable.
        if os.environ.get('SPACY_USE_GPU', '').lower() == 'false':
            logger.info("GPU disabled via SPACY_USE_GPU environment variable")
            return False
        gpu_available, device_name, device_id = self._detect_gpu_availability()
        if not gpu_available:
            # Transformer models are effectively unusable on CPU, so warn loudly.
            if self.model_size == 'trf':
                logger.warning("No GPU/CUDA device available for transformer model - performance will be degraded")
            else:
                logger.info("No GPU/CUDA device available - using CPU")
            return False
        try:
            import torch
            # Set the CUDA device globally for all torch operations.
            torch.cuda.set_device(device_id)
            # NOTE(review): CUDA_VISIBLE_DEVICES is set after torch has already
            # initialized CUDA in this process; it mainly affects child
            # processes/libraries that read it later — confirm this is intended.
            os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id)
            # require_gpu raises on failure (strong enforcement); fall back to
            # prefer_gpu, which returns False instead of raising.
            try:
                spacy.require_gpu(gpu_id=device_id)
                logger.info("Successfully enforced GPU usage with spacy.require_gpu()")
            except Exception as e:
                logger.warning(f"spacy.require_gpu() failed: {e}, trying prefer_gpu()")
                gpu_ok = spacy.prefer_gpu(gpu_id=device_id)
                if gpu_ok is False:
                    raise RuntimeError("spacy.prefer_gpu() returned False despite GPU being available")
            logger.info(f"GPU strongly configured for spaCy - using {device_name} (device {device_id})")
            os.environ['SPACY_PREFER_GPU'] = '1'
            return True
        except Exception as e:
            logger.error(f"Failed to enable GPU for spaCy: {e}")
            if self.model_size == 'trf':
                logger.error("GPU initialization failed for transformer model - processing will be slow")
            return False

    def _configure_batch_sizes(self) -> None:
        """Configure optimal batch sizes for GPU processing.

        Mutates each pipeline component's cfg in place. Batch sizes are
        conservative defaults intended to work on most GPUs.
        """
        if not hasattr(self.nlp, 'pipe'):
            return
        for pipe_name in self.nlp.pipe_names:
            pipe = self.nlp.get_pipe(pipe_name)
            if not hasattr(pipe, 'cfg'):
                continue
            if self.model_size == 'trf':
                # Transformer models need smaller batches due to GPU memory
                # constraints; non-transformer components can take more.
                pipe.cfg['batch_size'] = 128 if pipe_name == 'transformer' else 256
            else:
                # Non-transformer models can use larger batches.
                pipe.cfg['batch_size'] = 1024

    def _force_model_to_gpu(self) -> bool:
        """
        Force all model components to GPU after loading.

        Returns:
            True if successful, False otherwise (including when GPU is not in use).
        """
        if not self._using_gpu or not self.nlp:
            return False
        try:
            import torch
            # Move every pipeline component that exposes a torch-style model.
            # NOTE(review): 'cuda:0' is hard-coded; after CUDA_VISIBLE_DEVICES
            # is restricted to the chosen device this should map to it, but
            # confirm for multi-GPU setups.
            for pipe_name, pipe in self.nlp.pipeline:
                if hasattr(pipe, 'model'):
                    if hasattr(pipe.model, 'to'):
                        pipe.model.to('cuda:0')
                        logger.debug(f"Moved '{pipe_name}' component to GPU")
                # The transformer component nests its torch module one level down.
                if pipe_name == 'transformer' and hasattr(pipe, 'model'):
                    if hasattr(pipe.model, 'transformer'):
                        pipe.model.transformer.to('cuda:0')
                        logger.info("Transformer component forcefully moved to GPU")
            return True
        except Exception as e:
            logger.error(f"Failed to force model components to GPU: {e}")
            return False

    def _verify_gpu_usage(self) -> bool:
        """
        Verify that model components are actually using GPU.

        Inspects each component's torch parameters (or device attribute) and
        logs which components live on GPU vs CPU.

        Returns:
            True if at least one component is on GPU (and, for 'trf' models,
            the transformer component itself is on GPU), False otherwise.
        """
        if not self._using_gpu or not self.nlp:
            return False
        try:
            import torch
            gpu_components = []
            cpu_components = []
            for pipe_name, pipe in self.nlp.pipeline:
                if not hasattr(pipe, 'model'):
                    continue
                is_on_gpu = False
                if hasattr(pipe.model, 'parameters'):
                    # A single CUDA parameter is enough to count as on-GPU.
                    for param in pipe.model.parameters():
                        if param.is_cuda:
                            is_on_gpu = True
                            break
                elif hasattr(pipe.model, 'device'):
                    is_on_gpu = 'cuda' in str(pipe.model.device)
                if is_on_gpu:
                    gpu_components.append(pipe_name)
                else:
                    cpu_components.append(pipe_name)
            if gpu_components:
                logger.info(f"Components on GPU: {', '.join(gpu_components)}")
            if cpu_components:
                logger.warning(f"Components still on CPU: {', '.join(cpu_components)}")
            # For transformer models the transformer component MUST be on GPU.
            if self.model_size == 'trf' and 'transformer' not in gpu_components:
                logger.error("Transformer component is not on GPU!")
                return False
            return len(gpu_components) > 0
        except Exception as e:
            logger.error(f"Failed to verify GPU usage: {e}")
            return False

    def _load_spacy_model(self) -> None:
        """Load the appropriate SpaCy model based on language and size, with
        strong GPU enforcement and a CPU fallback on GPU failure.

        Raises:
            ValueError: On unsupported language/model combination, unknown
                model name, or if both GPU and CPU loading fail.
            OSError: If the model package is not installed.
        """
        if not AppConfig.validate_language_model_combination(self.language, self.model_size):
            raise ValueError(f"Unsupported language/model combination: {self.language}/{self.model_size}")
        model_name = AppConfig.get_spacy_model_name(self.language, self.model_size)
        if not model_name:
            raise ValueError(f"No model found for language '{self.language}' and size '{self.model_size}'")
        # GPU must be configured BEFORE spacy.load so weights land on the GPU.
        self._using_gpu = self._configure_gpu_for_spacy()
        try:
            if self._using_gpu and self.model_size == 'trf':
                # Mixed precision halves transformer memory use on GPU.
                self.nlp = spacy.load(model_name, config={"components": {"transformer": {"model": {"mixed_precision": True}}}})
            else:
                self.nlp = spacy.load(model_name)
            gpu_info = "CPU"
            if self._using_gpu:
                if not self._force_model_to_gpu():
                    logger.warning("Failed to force model components to GPU")
                # Verify once and reuse the result (the original code verified
                # twice, duplicating log output).
                gpu_verified = self._verify_gpu_usage()
                if not gpu_verified and self.model_size == 'trf':
                    logger.error("GPU verification failed for transformer model")
                gpu_available, device_name, device_id = self._detect_gpu_availability()
                if gpu_available:
                    gpu_info = f"GPU ({device_name}, device {device_id})"
                    gpu_info += " [VERIFIED]" if gpu_verified else " [NOT VERIFIED]"
            self._model_info = {
                'name': model_name,
                'language': self.language,
                'model_size': self.model_size,
                'version': spacy.__version__,
                'device': gpu_info,
                'gpu_enabled': self._using_gpu
            }
            logger.info(f"Loaded SpaCy model: {model_name} on {gpu_info}")
            if self._using_gpu and hasattr(self.nlp, 'pipe'):
                self._configure_batch_sizes()
        except OSError as e:
            error_msg = f"SpaCy model {model_name} not found. Please install it first."
            logger.error(error_msg)
            raise OSError(error_msg) from e
        except Exception as e:
            logger.error(f"Error loading SpaCy model: {e}")
            if self._using_gpu:
                # GPU path failed mid-load; retry once on CPU.
                logger.warning("Falling back to CPU after GPU loading failed")
                self._using_gpu = False
                try:
                    self.nlp = spacy.load(model_name)
                    # Rebuild the FULL info dict: the original code only set two
                    # keys on what could still be an empty dict, leaving
                    # name/language/model_size/version missing after fallback.
                    self._model_info = {
                        'name': model_name,
                        'language': self.language,
                        'model_size': self.model_size,
                        'version': spacy.__version__,
                        'device': 'CPU (fallback)',
                        'gpu_enabled': False
                    }
                    logger.info(f"Successfully loaded {model_name} on CPU after GPU failure")
                except Exception as cpu_error:
                    raise ValueError(f"Failed to load model on both GPU and CPU: {cpu_error}") from cpu_error
            else:
                raise

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get information about the loaded model.

        Returns:
            Copy of the model-info dictionary (name, language, model_size,
            version, device, gpu_enabled). Annotated Dict[str, Any] because
            'gpu_enabled' is a bool (the original Dict[str, str] was wrong).
        """
        return self._model_info.copy()

    def process_document(self, text: str) -> "spacy.Doc":
        """
        Process text into a SpaCy document.

        Args:
            text: Input text to process.

        Returns:
            Processed SpaCy document (UniDic-enriched for Japanese when available).

        Raises:
            ValueError: If model not loaded or the text is empty.
        """
        if not self.nlp:
            raise ValueError("SpaCy model not loaded")
        if not text or not text.strip():
            raise ValueError("Empty text provided")
        try:
            # Normalize/clean input before handing it to the pipeline.
            cleaned_text = TextUtility.clean_text_input(text)
            doc = self.nlp(cleaned_text)
            # UniDic enrichment is best-effort: failures degrade to plain spaCy.
            if self.unidic_enricher and self.language == 'ja':
                try:
                    self.unidic_enricher.enrich_spacy_doc(doc, cleaned_text)
                    logger.debug("UniDic enrichment completed")
                except Exception as e:
                    logger.warning(f"UniDic enrichment failed: {e}")
            return doc
        except Exception as e:
            self.handle_processing_error(e, f"processing text of length {len(text)}")
            raise

    def handle_processing_error(self, error: Exception, context: str) -> None:
        """
        Handle processing errors with appropriate logging.

        Args:
            error: The exception that occurred.
            context: Context description for the error message.
        """
        error_msg = f"Error {context}: {error}"
        logger.error(error_msg)

    def filter_tokens(self,
                      doc: "spacy.Doc",
                      exclude_punct: bool = True,
                      exclude_space: bool = True,
                      word_type_filter: Optional[str] = None) -> List["spacy.Token"]:
        """
        Filter tokens based on various criteria.

        Args:
            doc: SpaCy document.
            exclude_punct: Whether to exclude punctuation tokens.
            exclude_space: Whether to exclude whitespace tokens.
            word_type_filter: Keep only this word type ('CW', 'FW', or None for all).

        Returns:
            List of tokens passing all filters, in document order.
        """
        filtered_tokens = []
        for token in doc:
            if exclude_space and token.is_space:
                continue
            if exclude_punct and token.is_punct:
                continue
            # Content-word / function-word filtering.
            if word_type_filter and self._classify_pos(token) != word_type_filter:
                continue
            filtered_tokens.append(token)
        return filtered_tokens

    def _classify_pos(self, token: "spacy.Token") -> str:
        """
        Classify token as content word (CW) or function word (FW).

        Args:
            token: SpaCy token object (only token.pos_ is consulted).

        Returns:
            'CW' for content words, 'FW' for function words. POS tags in
            neither set default to 'CW' unless they are PUNCT/SPACE/X.
        """
        content_pos = {'NOUN', 'VERB', 'ADJ', 'ADV'}
        function_pos = {'DET', 'PRON', 'ADP', 'CONJ', 'CCONJ', 'SCONJ'}
        if token.pos_ in content_pos:
            return 'CW'
        elif token.pos_ in function_pos:
            return 'FW'
        else:
            # Ambiguous tags (e.g. PROPN, NUM, INTJ) lean content-word.
            return 'CW' if token.pos_ not in {'PUNCT', 'SPACE', 'X'} else 'FW'

    def format_token_for_display(self, token: "spacy.Token", include_syntax: bool = True) -> Dict[str, Any]:
        """
        Format token for UI display - only call when needed for output.

        Args:
            token: SpaCy token.
            include_syntax: Whether to include syntactic information (dep_, head, etc.).

        Returns:
            Formatted token data dictionary for display.
        """
        result = {
            'token': token.text,
            'lemma': token.lemma_,
            'pos': token.pos_,
            'tag': token.tag_,
            'word_type': self._classify_pos(token)
        }
        if include_syntax:
            result.update({
                'dep_': token.dep_,
                'head_text': token.head.text,
                'head_pos': token.head.pos_,
            })
        return result

    def get_syntactic_context(self, token: "spacy.Token") -> Dict[str, Any]:
        """
        Get comprehensive syntactic relationships for a token.

        Args:
            token: SpaCy token.

        Returns:
            Dictionary with syntactic context. Note 'subtree_span' is the raw
            token.subtree iterator (single-use), unlike the materialized
            'children'/'ancestors' lists.
        """
        return {
            'dep_': token.dep_,
            'head': token.head,
            'children': list(token.children),
            'ancestors': list(token.ancestors),
            'subtree_span': token.subtree,
            'left_edge': token.left_edge,
            'right_edge': token.right_edge
        }

    def process_sentences(self,
                          doc: "spacy.Doc",
                          max_tokens: Optional[int] = None) -> List["spacy.Span"]:
        """
        Process sentences with optional token limits.

        Args:
            doc: SpaCy document.
            max_tokens: Maximum non-space tokens per sentence
                (AppConfig.MAX_TOKENS_FOR_VISUALIZATION when None).

        Returns:
            List of sentence spans; over-long sentences are truncated to a
            span covering the first max_tokens non-space tokens.
        """
        max_tokens = max_tokens or AppConfig.MAX_TOKENS_FOR_VISUALIZATION
        processed_sentences = []
        for sent in doc.sents:
            # Count only non-space tokens against the limit.
            sent_tokens = [token for token in sent if not token.is_space]
            if len(sent_tokens) > max_tokens:
                truncated_tokens = sent_tokens[:max_tokens]
                # Rebuild a contiguous doc span from the first to last kept
                # token (it may still contain interleaved space tokens).
                start_idx = truncated_tokens[0].i
                end_idx = truncated_tokens[-1].i + 1
                processed_sentences.append(doc[start_idx:end_idx])
            else:
                processed_sentences.append(sent)
        return processed_sentences

    def setup_batch_processing(self, file_paths: List[str]) -> Iterator[Tuple[str, str]]:
        """
        Set up batch processing for multiple files.

        Args:
            file_paths: List of file paths to process.

        Yields:
            Tuples of (file_path, text_content); on extraction failure the
            content is the sentinel string "ERROR: <message>".
        """
        for file_path in file_paths:
            try:
                text_content = TextUtility.extract_text_from_file(file_path)
                yield file_path, text_content
            except Exception as e:
                logger.error(f"Error processing file {file_path}: {e}")
                yield file_path, f"ERROR: {e}"

    def cleanup_batch_processing(self, temp_files: List[str]) -> None:
        """
        Clean up temporary files from batch processing.

        Args:
            temp_files: List of temporary file paths.
        """
        TextUtility.cleanup_temp_files(temp_files)