# madriClaro/analyzers/analyzer_wrapper.py
# Commit 28aa7d9 (Ruben): Integrate Aclarador with Groq API for clarity analysis
"""
Aclarador Analyzer Wrapper
Uses Groq API to analyze text clarity based on Aclarador principles
"""
import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# Try to import Groq
try:
from groq import Groq
GROQ_AVAILABLE = True
except ImportError:
GROQ_AVAILABLE = False
logger.warning("Groq library not available - install with: pip install groq")
class AclaradorAnalyzer:
    """
    Clarity analyzer using the Groq API.

    Based on Aclarador's system prompt for Spanish clarity analysis.
    When the Groq SDK is missing or GROQ_API_KEY is not set, analysis
    degrades to a simple local heuristic (see ``_fallback_analysis``).
    """

    # Known Spanish administrative jargon matched by _detect_jargon().
    # Hoisted to a class-level frozenset so it is built once, not per call.
    _ADMIN_JARGON = frozenset({
        'normativa', 'procedimiento', 'expediente', 'tramitación',
        'reglamento', 'disposición', 'resolución', 'acreditación',
        'competencias', 'subsanación', 'notificación', 'administrativo',
    })
    # Punctuation stripped from a word before jargon matching.
    _WORD_PUNCT = '.,;:¿?¡!'

    def __init__(self):
        # Load the prompt once; create the Groq client only when both the
        # SDK and the API key are available, otherwise stay on the fallback.
        self.system_prompt = self._load_system_prompt()
        self.groq_client = None
        if GROQ_AVAILABLE:
            # Get Groq API key from environment
            api_key = os.getenv('GROQ_API_KEY')
            if api_key:
                self.groq_client = Groq(api_key=api_key)
                logger.info("✅ Aclarador analyzer initialized with Groq API")
            else:
                logger.warning("⚠️ GROQ_API_KEY not found - using fallback analyzer")
        else:
            logger.warning("⚠️ Groq not available - using fallback analyzer")

    def _load_system_prompt(self) -> str:
        """Load the system prompt from aclarador/system_prompt.md.

        The prompt file is expected to wrap the prompt in a fenced ``` block;
        the first fenced section is returned when present, otherwise the whole
        file content.  On any read/decode error the built-in default is used.
        """
        prompt_path = Path(__file__).parent / 'aclarador' / 'system_prompt.md'
        try:
            content = prompt_path.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            logger.error(f"Error loading system prompt: {e}")
            return self._get_default_system_prompt()
        # Extract content between ``` markers; with fewer than two markers
        # there is no complete fence, so fall back to the whole content.
        parts = content.split('```')
        if len(parts) >= 3:
            return parts[1].strip()
        return content

    def _get_default_system_prompt(self) -> str:
        """Default system prompt used when system_prompt.md cannot be read."""
        return """Eres un experto en lenguaje claro especializado en la mejora de textos en español.
Analiza el texto y proporciona:
1. Una versión mejorada más clara
2. Explicación de las mejoras realizadas
3. Identificación de problemas de claridad"""

    def analyze(self, text: str, title: Optional[str] = None) -> Dict[str, Any]:
        """
        Analyze ``text`` for clarity using the Groq API.

        Args:
            text: The Spanish text to analyze.
            title: Optional title; currently unused, kept for interface
                compatibility with callers that pass it.

        Returns:
            A dict in Madrid Analyzer's expected format with clarity scores,
            statistics and suggestions.
        """
        # Use Groq if available, otherwise the local heuristic fallback.
        if not self.groq_client:
            return self._fallback_analysis(text)
        try:
            response = self.groq_client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": text}
                ],
                temperature=0.3,  # low temperature for more stable analyses
                max_tokens=2000
            )
            analysis_text = response.choices[0].message.content
            # Map the free-form LLM response onto the expected result schema.
            return self._parse_groq_response(analysis_text, text)
        except Exception as e:
            # Any API failure (network, quota, schema) degrades to fallback;
            # logger.exception records the full traceback.
            logger.exception(f"Error calling Groq API: {e}")
            return self._fallback_analysis(text)

    def _parse_groq_response(self, analysis_text: str, original_text: str) -> Dict[str, Any]:
        """
        Parse Groq's response and map it to the expected result format.

        Groq returns sections like:
        ### TEXTO CORREGIDO
        ### EXPLICACIÓN DE MEJORAS
        ### PRINCIPIOS APLICADOS
        """
        sections = self._extract_sections(analysis_text)
        # Statistics are computed from the ORIGINAL text, not the correction.
        sentences = [s.strip() for s in original_text.split('.') if s.strip()]
        words = original_text.split()
        # Issues and suggestions both come from the explanation section.
        issues = self._extract_issues_from_explanation(sections.get('explicacion', ''))
        readability_score = self._calculate_readability_from_analysis(original_text, issues)
        complexity_score = self._calculate_complexity_from_analysis(issues)
        overall_score = (readability_score * 0.5 + complexity_score * 0.5)
        suggestions = self._extract_suggestions(sections.get('explicacion', ''))
        jargon_words = self._detect_jargon(words)
        sentence_stats = self._get_sentence_stats(sentences)
        vocabulary_stats = self._get_vocabulary_stats(words)
        return {
            'overall_score': overall_score,
            'readability_score': readability_score,
            'complexity_score': complexity_score,
            'sentence_stats': sentence_stats,
            'vocabulary_stats': vocabulary_stats,
            'readability_metrics': {
                'issues_detected': issues,
                'corrected_text': sections.get('corregido', '')
            },
            'grammar_stats': {
                'issues_count': len(issues)
            },
            'jargon_count': len(jargon_words),
            'jargon_words': jargon_words,
            'long_sentences_count': sentence_stats.get('long_sentences', 0),
            'suggestions': suggestions
        }

    def _extract_sections(self, text: str) -> Dict[str, str]:
        """Extract the corregido/explicacion/principios sections from a response.

        Returns an empty dict when the expected '### TEXTO CORREGIDO' header
        (with or without a space after '###') is absent.
        """
        sections: Dict[str, str] = {}
        if '### TEXTO CORREGIDO' not in text and '###TEXTO CORREGIDO' not in text:
            return sections
        for part in text.split('###'):
            part_lower = part.lower()
            # Drop the header line; the remainder is the section body.
            content = (part.split('\n', 1)[1] if '\n' in part else part).strip()
            if 'texto corregido' in part_lower:
                sections['corregido'] = content
            elif 'explicación' in part_lower or 'explicacion' in part_lower:
                sections['explicacion'] = content
            elif 'principios' in part_lower:
                sections['principios'] = content
        return sections

    def _extract_issues_from_explanation(self, explanation: str) -> List[str]:
        """Map issue mentions in the explanation text to issue identifiers."""
        issues: List[str] = []
        explanation_lower = explanation.lower()
        # Check for common issue mentions (order is fixed and deterministic).
        if 'oración' in explanation_lower and ('larga' in explanation_lower or 'compleja' in explanation_lower):
            issues.append('long_sentences')
        if 'vocabulario' in explanation_lower or 'tecnicismo' in explanation_lower or 'jerga' in explanation_lower:
            issues.append('complex_vocabulary')
        # 'pasiva' alone also matches the phrase 'voz pasiva'.
        if 'pasiva' in explanation_lower:
            issues.append('passive_voice')
        if 'redundancia' in explanation_lower or 'repetición' in explanation_lower:
            issues.append('redundancy')
        return issues

    def _extract_suggestions(self, explanation: str) -> List[str]:
        """Extract up to 5 improvement suggestions (bullets or numbered items)."""
        suggestions: List[str] = []
        for line in explanation.split('\n'):
            line = line.strip()
            # Bullet points ('-', '*') or numbered items ('1.', '12.', ...).
            if line.startswith(('-', '*')) or (line and line[0].isdigit() and '.' in line[:3]):
                clean_line = line.lstrip('-*0123456789. ').strip()
                if clean_line and len(clean_line) > 10:  # Meaningful suggestion
                    suggestions.append(clean_line)
        # If no suggestions found, add a general one
        if not suggestions:
            suggestions.append('Texto analizado con principios de lenguaje claro')
        return suggestions[:5]  # Limit to 5

    def _calculate_readability_from_analysis(self, text: str, issues: List[str]) -> float:
        """Score readability in [0, 100] from sentence length and issue count.

        Texts averaging ~20 words/sentence score highest; each detected issue
        subtracts 8 points.
        """
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        if not sentences:
            return 50.0  # neutral score for unsplittable text
        avg_length = sum(len(s.split()) for s in sentences) / len(sentences)
        score = 100 - abs(avg_length - 20) * 2
        score -= len(issues) * 8
        return max(0, min(100, score))

    def _calculate_complexity_from_analysis(self, issues: List[str]) -> float:
        """Score simplicity in [0, 100]: 100 minus 12 per detected issue."""
        score = 100.0
        score -= len(issues) * 12
        return max(0, min(100, score))

    def _get_sentence_stats(self, sentences: List[str]) -> Dict[str, Any]:
        """Return sentence statistics; >30 words counts as a long sentence."""
        if not sentences:
            return {'count': 0, 'avg_length': 0, 'long_sentences': 0}
        sentence_lengths = [len(s.split()) for s in sentences]
        return {
            'count': len(sentences),
            'avg_length': sum(sentence_lengths) / len(sentences),
            'max_length': max(sentence_lengths),
            'min_length': min(sentence_lengths),
            'long_sentences': sum(1 for n in sentence_lengths if n > 30)
        }

    def _get_vocabulary_stats(self, words: List[str]) -> Dict[str, Any]:
        """Return vocabulary statistics (case-insensitive lexical diversity)."""
        if not words:
            return {'total_words': 0, 'unique_words': 0, 'lexical_diversity': 0}
        unique_words = {w.lower() for w in words}
        return {
            'total_words': len(words),
            'unique_words': len(unique_words),
            'lexical_diversity': len(unique_words) / len(words),
            'avg_word_length': sum(len(w) for w in words) / len(words)
        }

    def _detect_jargon(self, words: List[str]) -> List[str]:
        """Detect potential jargon/technical terms, capped at 10.

        Long words (>12 chars after stripping punctuation) are collected
        first, then known administrative jargon, preserving text order
        within each pass and deduplicating across both.
        """
        cleaned = [w.lower().strip(self._WORD_PUNCT) for w in words]
        jargon: List[str] = []
        # Pass 1: unusually long words are likely technical terms.
        for word in cleaned:
            if len(word) > 12 and word not in jargon:
                jargon.append(word)
        # Pass 2: known administrative jargon.
        for word in cleaned:
            if word in self._ADMIN_JARGON and word not in jargon:
                jargon.append(word)
        return jargon[:10]  # Limit to 10 terms

    def _fallback_analysis(self, text: str) -> Dict[str, Any]:
        """
        Fallback analysis when Groq is not available.

        Uses simple local heuristics: sentence-length based readability and a
        long-sentence penalty for complexity.
        """
        logger.warning("Using fallback analysis - Groq API not available")
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        words = text.split()
        if not sentences or not words:
            return self._get_empty_result()
        avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
        readability_score = max(0, 100 - abs(avg_sentence_length - 20) * 2)
        long_sentences = [s for s in sentences if len(s.split()) > 30]
        complexity_score = max(0, 100 - len(long_sentences) * 10)
        overall_score = (readability_score + complexity_score) / 2
        # Compute jargon once and reuse (original called _detect_jargon twice).
        jargon_words = self._detect_jargon(words)
        return {
            'overall_score': overall_score,
            'readability_score': readability_score,
            'complexity_score': complexity_score,
            'sentence_stats': self._get_sentence_stats(sentences),
            'vocabulary_stats': self._get_vocabulary_stats(words),
            'readability_metrics': {'issues_detected': []},
            'grammar_stats': {'issues_count': 0},
            'jargon_count': len(jargon_words),
            'jargon_words': jargon_words,
            'long_sentences_count': len(long_sentences),
            'suggestions': [
                'Groq API no disponible - usando análisis simple',
                'Configurar GROQ_API_KEY para análisis completo'
            ]
        }

    def _get_empty_result(self) -> Dict[str, Any]:
        """Return a zeroed result dict for empty or invalid input text."""
        return {
            'overall_score': 0,
            'readability_score': 0,
            'complexity_score': 0,
            'sentence_stats': {'count': 0, 'avg_length': 0, 'long_sentences': 0},
            'vocabulary_stats': {'total_words': 0, 'unique_words': 0, 'lexical_diversity': 0},
            'readability_metrics': {},
            'grammar_stats': {},
            'jargon_count': 0,
            'jargon_words': [],
            'long_sentences_count': 0,
            'suggestions': ['Texto vacío o inválido']
        }