|
|
| import re
|
| import logging
|
| from typing import List, Tuple
|
| from datetime import datetime
|
| import os
|
| import unicodedata
|
| import nltk
|
|
|
|
|
| nltk.download('punkt', quiet=True)
|
| from nltk.tokenize import sent_tokenize
|
|
|
class SentenceAnalyzer:
    """Analyze text sentence-by-sentence for display and TTS preparation.

    Splits raw text into sentences with NLTK, classifies each sentence
    (exclamation, question, statement, ellipsis, quote, emphasis), and
    renders HTML fragments for interactive or legacy display. All
    activity is logged to a dated file under ``logs/`` and the console.
    """

    def __init__(self):
        """Set up logging and the classification lookup tables."""
        self._setup_logger()

        # Closed set of sentence categories, and the short flag code each
        # category maps to (flags are embedded as data attributes in the
        # legacy process_text() HTML output).
        self.SENTENCE_TYPES = ['exclamation', 'question', 'statement', 'ellipsis', 'quote', 'emphasis']
        self.FLAGS = {
            'exclamation': 'EXCL',
            'question': 'QUES',
            'statement': 'STMT',
            'ellipsis': 'ELIP',
            'quote': 'QUOT',
            'emphasis': 'EMPH'
        }

        self.logger.info("SentenceAnalyzer initialized successfully")

    def _setup_logger(self):
        """Set up logging configuration.

        DEBUG and above go to ``logs/sentence_analyzer_<YYYY-MM-DD>.log``
        (UTF-8); INFO and above also go to the console. Existing handlers
        on the named logger are cleared first so repeated instantiation
        does not duplicate output.

        Raises:
            Exception: re-raised after printing if the log directory,
                file, or handlers cannot be created.
        """
        try:
            os.makedirs('logs', exist_ok=True)

            # One log file per calendar day.
            current_date = datetime.now().strftime('%Y-%m-%d')
            log_file = f'logs/sentence_analyzer_{current_date}.log'

            self.logger = logging.getLogger('SentenceAnalyzer')
            self.logger.setLevel(logging.DEBUG)

            # Avoid duplicated handlers when more than one analyzer is created.
            if self.logger.handlers:
                self.logger.handlers.clear()

            file_handler = logging.FileHandler(log_file, encoding='utf-8')
            file_handler.setLevel(logging.DEBUG)

            console_handler = logging.StreamHandler()
            console_handler.setLevel(logging.INFO)

            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            )
            file_handler.setFormatter(formatter)
            console_handler.setFormatter(formatter)

            self.logger.addHandler(file_handler)
            self.logger.addHandler(console_handler)

            self.logger.debug("Logger set up successfully")

        except Exception as e:
            # Logging is not usable yet, so fall back to stdout before
            # propagating the failure to the caller.
            print(f"Error setting up logger: {str(e)}")
            raise

    def split_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences using NLTK's sentence tokenizer.

        The text is NFC-normalized and scrubbed of page markers, chapter
        headings, hyphenated line breaks, and redundant whitespace before
        tokenization.

        Args:
            text: Raw input text; may span multiple lines.

        Returns:
            A list of non-empty, stripped sentences ([] for empty input).
        """
        if not text:
            return []

        self.logger.debug("Starting sentence splitting")

        text = unicodedata.normalize('NFC', text)
        self.logger.debug("Normalized text using NFC")

        # Strip page markers and chapter headings (artifacts of e.g. PDF
        # extraction). '.' does not cross newlines, so only the heading's
        # own line is consumed.
        text = re.sub(r'Page \d+|Chapter \d+:.*', '', text)
        self.logger.debug("Removed page numbers and chapter titles")

        # Re-join words hyphenated across a line break ("exam-\nple" ->
        # "example"). The lookarounds require a word character on both
        # sides, so spaced hyphens such as "a - b" survive — the previous
        # blanket r'-\s+' substitution stripped those too, mangling
        # ordinary punctuation.
        text = re.sub(r'(?<=\w)-\s+(?=\w)', '', text)
        self.logger.debug("Replaced hyphenated line breaks")

        text = re.sub(r'[\r\n]+', ' ', text)
        self.logger.debug("Replaced multiple newlines with a space")

        text = re.sub(r'\s+', ' ', text).strip()
        self.logger.debug("Normalized whitespace")

        sentences = sent_tokenize(text)
        self.logger.debug("Split text into %d sentences using NLTK", len(sentences))

        # Drop any empty fragments the tokenizer produced.
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
        self.logger.info("Split text into %d sentences after cleanup", len(sentences))
        return sentences

    def analyze_sentence(self, sentence: str) -> Tuple[str, str, str]:
        """Analyze a sentence and return its type, color (handled by CSS), and flag.

        Classification precedence: quote > emphasis > exclamation >
        question > ellipsis > statement (the fallback). The color slot is
        always '' — styling is delegated to CSS classes — and is kept only
        for interface compatibility.

        Args:
            sentence: A single sentence; surrounding whitespace is ignored.

        Returns:
            Tuple of (sentence type, '' placeholder color, flag code).
        """
        if not sentence:
            return ('statement', '', self.FLAGS['statement'])

        sentence = sentence.strip()
        self.logger.debug("Analyzing sentence: '%s'", sentence)

        def has_complete_quote(text):
            # A sentence counts as fully quoted when it starts and ends
            # with a matching pair AND the open/close counts balance
            # (guards against an unbalanced trailing quote).
            quote_pairs = [
                ('"', '"'),
                ("'", "'"),
                ('“', '”'),
                ('‘', '’'),
                ('«', '»')
            ]
            text = text.strip()
            for open_quote, close_quote in quote_pairs:
                if text.startswith(open_quote) and text.endswith(close_quote):
                    if text.count(open_quote) == text.count(close_quote):
                        self.logger.debug("Sentence starts and ends with matching quotes: %s%s", open_quote, close_quote)
                        return True
            return False

        if has_complete_quote(sentence):
            sent_type = 'quote'
        elif re.search(r'\*[^*]+\*', sentence):
            # Markdown-style *emphasis* anywhere in the sentence.
            sent_type = 'emphasis'
        elif sentence.endswith(('!', '！')):
            # ASCII or fullwidth exclamation mark. The original tuple
            # listed '!' twice (an encoding casualty), so fullwidth
            # punctuation fell through to 'statement'.
            sent_type = 'exclamation'
        elif sentence.endswith(('?', '？')):
            # ASCII or fullwidth question mark (same fix as above).
            sent_type = 'question'
        elif sentence.endswith(('…', '...')):
            sent_type = 'ellipsis'
        else:
            sent_type = 'statement'
        self.logger.debug("Sentence classified as '%s'", sent_type)

        color = ''  # styling handled by CSS; kept for API compatibility
        self.logger.debug("Sentence type: %s, Flag: %s", sent_type, self.FLAGS[sent_type])
        return (sent_type, color, self.FLAGS[sent_type])

    def clean_sentence(self, sentence: str) -> str:
        """Remove special characters from the sentence that might confuse TTS models.

        Keeps word characters, whitespace, common punctuation, straight and
        curly quotes, guillemets, hyphens/dashes, and parentheses; strips
        everything else.

        Args:
            sentence: The sentence to sanitize.

        Returns:
            The sanitized sentence.
        """
        pattern = r'[^\w\s.,!?\'"“”‘’«»\-—()]'
        cleaned_sentence = re.sub(pattern, '', sentence)
        self.logger.debug("Cleaned sentence: '%s'", cleaned_sentence)
        return cleaned_sentence

    def process_text_interactive(self, text: str) -> str:
        """Process the text and return HTML-formatted output with interactive sentences.

        Each sentence becomes a ``.sentence-row`` div carrying its type as
        a CSS class, plus its 1-based number, the text, and a readable
        type label.

        Args:
            text: Raw input text.

        Returns:
            Concatenated HTML, '' for empty input, or an error ``<span>``
            if processing fails (no exception escapes).
        """
        self.logger.info("Starting interactive text processing")

        if not text:
            self.logger.warning("Empty text received")
            return ''

        try:
            text = unicodedata.normalize('NFC', text)
            self.logger.debug("Normalized text using NFC in interactive processing")

            sentences = self.split_into_sentences(text)
            formatted_output = []

            for index, sentence in enumerate(sentences, 1):
                # Only the type is used here; the color/flag slots matter
                # for the legacy process_text() output.
                sent_type, _, _ = self.analyze_sentence(sentence)

                formatted_sentence = f'''
                <div class="sentence-row {sent_type}">
                    <div class="sentence-number">{index}.</div>
                    <div class="sentence-content">
                        {sentence}
                    </div>
                    <div class="sentence-type">{sent_type.capitalize()}</div>
                </div>
                '''
                formatted_output.append(formatted_sentence)
                self.logger.info("Processed sentence %d/%d - Type: %s", index, len(sentences), sent_type)
                self.logger.debug("Formatted HTML for sentence %d: %s", index, formatted_sentence)

            result = ''.join(formatted_output)
            self.logger.info("Text processing completed successfully")
            return result

        except Exception as e:
            self.logger.error(f"Error processing text: {str(e)}", exc_info=True)
            return f'<span style="color: red;">Error processing text: {str(e)}</span>'

    def prepare_text_for_tts(self, sentences: List[str]) -> str:
        """Prepare the text for TTS by cleaning special characters from each sentence.

        Args:
            sentences: Sentences to sanitize and re-join.

        Returns:
            A single space-joined string of cleaned sentences.
        """
        cleaned_sentences = [self.clean_sentence(sentence) for sentence in sentences]
        tts_text = ' '.join(cleaned_sentences)
        self.logger.debug("Prepared text for TTS: '%s'", tts_text)
        return tts_text

    def process_text(self, text: str) -> str:
        """Legacy method for non-interactive processing. Kept for compatibility.

        Wraps each sentence in a ``<span>`` whose class is the sentence
        type, with the flag code in ``data-flag`` and the type in the
        ``title`` tooltip.

        Args:
            text: Raw input text.

        Returns:
            Space-joined HTML spans, "" for empty input, or an error
            ``<span>`` if processing fails (no exception escapes).
        """
        self.logger.info("Starting text processing (legacy method)")

        if not text:
            self.logger.warning("Empty text received")
            return ""

        try:
            text = unicodedata.normalize('NFC', text)
            self.logger.debug("Normalized text using NFC in legacy processing")

            sentences = self.split_into_sentences(text)
            formatted_output = []

            for index, sentence in enumerate(sentences, 1):
                sent_type, _, flag = self.analyze_sentence(sentence)

                formatted_sentence = (
                    f'<span class="{sent_type}" '
                    f'data-flag="{flag}" '
                    f'title="Sentence type: {sent_type}">'
                    f'{sentence}</span>'
                )
                formatted_output.append(formatted_sentence)
                self.logger.info("Processed sentence %d/%d - Type: %s", index, len(sentences), sent_type)
                self.logger.debug("Formatted HTML for sentence %d: %s", index, formatted_sentence)

            result = " ".join(formatted_output)
            self.logger.info("Text processing completed successfully")
            return result

        except Exception as e:
            self.logger.error(f"Error processing text: {str(e)}", exc_info=True)
            return f'<span style="color: red;">Error processing text: {str(e)}</span>'
|
|
|