# madriClaro/analyzers/analyzer_wrapper.py
# Commit 28aa7d9 (Ruben): Integrate Aclarador with Groq API for clarity analysis
"""
Aclarador Analyzer Wrapper
Uses Groq API to analyze text clarity based on Aclarador principles
"""
import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# Try to import Groq
try:
from groq import Groq
GROQ_AVAILABLE = True
except ImportError:
GROQ_AVAILABLE = False
logger.warning("Groq library not available - install with: pip install groq")
class AclaradorAnalyzer:
    """
    Clarity analyzer using the Groq API.

    Based on Aclarador's system prompt for Spanish clarity analysis.
    When the Groq SDK is missing or GROQ_API_KEY is not set, analysis
    degrades to a simple local heuristic (see ``_fallback_analysis``).
    """

    # Known Spanish administrative jargon matched by _detect_jargon().
    # Hoisted to a class-level frozenset so it is built once, not per call.
    _ADMIN_JARGON = frozenset({
        'normativa', 'procedimiento', 'expediente', 'tramitación',
        'reglamento', 'disposición', 'resolución', 'acreditación',
        'competencias', 'subsanación', 'notificación', 'administrativo',
    })
    # Punctuation stripped from a word before jargon matching.
    _WORD_PUNCT = '.,;:¿?¡!'

    def __init__(self):
        # Load the prompt once; create the Groq client only when both the
        # SDK and the API key are available, otherwise stay on the fallback.
        self.system_prompt = self._load_system_prompt()
        self.groq_client = None
        if GROQ_AVAILABLE:
            # Get Groq API key from environment
            api_key = os.getenv('GROQ_API_KEY')
            if api_key:
                self.groq_client = Groq(api_key=api_key)
                logger.info("✅ Aclarador analyzer initialized with Groq API")
            else:
                logger.warning("⚠️ GROQ_API_KEY not found - using fallback analyzer")
        else:
            logger.warning("⚠️ Groq not available - using fallback analyzer")

    def _load_system_prompt(self) -> str:
        """Load the system prompt from aclarador/system_prompt.md.

        The prompt file is expected to wrap the prompt in a fenced ``` block;
        the first fenced section is returned when present, otherwise the whole
        file content.  On any read/decode error the built-in default is used.
        """
        prompt_path = Path(__file__).parent / 'aclarador' / 'system_prompt.md'
        try:
            content = prompt_path.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            logger.error(f"Error loading system prompt: {e}")
            return self._get_default_system_prompt()
        # Extract content between ``` markers; with fewer than two markers
        # there is no complete fence, so fall back to the whole content.
        parts = content.split('```')
        if len(parts) >= 3:
            return parts[1].strip()
        return content

    def _get_default_system_prompt(self) -> str:
        """Default system prompt used when system_prompt.md cannot be read."""
        return """Eres un experto en lenguaje claro especializado en la mejora de textos en español.
Analiza el texto y proporciona:
1. Una versión mejorada más clara
2. Explicación de las mejoras realizadas
3. Identificación de problemas de claridad"""

    def analyze(self, text: str, title: Optional[str] = None) -> Dict[str, Any]:
        """
        Analyze ``text`` for clarity using the Groq API.

        Args:
            text: The Spanish text to analyze.
            title: Optional title; currently unused, kept for interface
                compatibility with callers that pass it.

        Returns:
            A dict in Madrid Analyzer's expected format with clarity scores,
            statistics and suggestions.
        """
        # Use Groq if available, otherwise the local heuristic fallback.
        if not self.groq_client:
            return self._fallback_analysis(text)
        try:
            response = self.groq_client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": text}
                ],
                temperature=0.3,  # low temperature for more stable analyses
                max_tokens=2000
            )
            analysis_text = response.choices[0].message.content
            # Map the free-form LLM response onto the expected result schema.
            return self._parse_groq_response(analysis_text, text)
        except Exception as e:
            # Any API failure (network, quota, schema) degrades to fallback;
            # logger.exception records the full traceback.
            logger.exception(f"Error calling Groq API: {e}")
            return self._fallback_analysis(text)

    def _parse_groq_response(self, analysis_text: str, original_text: str) -> Dict[str, Any]:
        """
        Parse Groq's response and map it to the expected result format.

        Groq returns sections like:
        ### TEXTO CORREGIDO
        ### EXPLICACIÓN DE MEJORAS
        ### PRINCIPIOS APLICADOS
        """
        sections = self._extract_sections(analysis_text)
        # Statistics are computed from the ORIGINAL text, not the correction.
        sentences = [s.strip() for s in original_text.split('.') if s.strip()]
        words = original_text.split()
        # Issues and suggestions both come from the explanation section.
        issues = self._extract_issues_from_explanation(sections.get('explicacion', ''))
        readability_score = self._calculate_readability_from_analysis(original_text, issues)
        complexity_score = self._calculate_complexity_from_analysis(issues)
        overall_score = (readability_score * 0.5 + complexity_score * 0.5)
        suggestions = self._extract_suggestions(sections.get('explicacion', ''))
        jargon_words = self._detect_jargon(words)
        sentence_stats = self._get_sentence_stats(sentences)
        vocabulary_stats = self._get_vocabulary_stats(words)
        return {
            'overall_score': overall_score,
            'readability_score': readability_score,
            'complexity_score': complexity_score,
            'sentence_stats': sentence_stats,
            'vocabulary_stats': vocabulary_stats,
            'readability_metrics': {
                'issues_detected': issues,
                'corrected_text': sections.get('corregido', '')
            },
            'grammar_stats': {
                'issues_count': len(issues)
            },
            'jargon_count': len(jargon_words),
            'jargon_words': jargon_words,
            'long_sentences_count': sentence_stats.get('long_sentences', 0),
            'suggestions': suggestions
        }

    def _extract_sections(self, text: str) -> Dict[str, str]:
        """Extract the corregido/explicacion/principios sections from a response.

        Returns an empty dict when the expected '### TEXTO CORREGIDO' header
        (with or without a space after '###') is absent.
        """
        sections: Dict[str, str] = {}
        if '### TEXTO CORREGIDO' not in text and '###TEXTO CORREGIDO' not in text:
            return sections
        for part in text.split('###'):
            part_lower = part.lower()
            # Drop the header line; the remainder is the section body.
            content = (part.split('\n', 1)[1] if '\n' in part else part).strip()
            if 'texto corregido' in part_lower:
                sections['corregido'] = content
            elif 'explicación' in part_lower or 'explicacion' in part_lower:
                sections['explicacion'] = content
            elif 'principios' in part_lower:
                sections['principios'] = content
        return sections

    def _extract_issues_from_explanation(self, explanation: str) -> List[str]:
        """Map issue mentions in the explanation text to issue identifiers."""
        issues: List[str] = []
        explanation_lower = explanation.lower()
        # Check for common issue mentions (order is fixed and deterministic).
        if 'oración' in explanation_lower and ('larga' in explanation_lower or 'compleja' in explanation_lower):
            issues.append('long_sentences')
        if 'vocabulario' in explanation_lower or 'tecnicismo' in explanation_lower or 'jerga' in explanation_lower:
            issues.append('complex_vocabulary')
        # 'pasiva' alone also matches the phrase 'voz pasiva'.
        if 'pasiva' in explanation_lower:
            issues.append('passive_voice')
        if 'redundancia' in explanation_lower or 'repetición' in explanation_lower:
            issues.append('redundancy')
        return issues

    def _extract_suggestions(self, explanation: str) -> List[str]:
        """Extract up to 5 improvement suggestions (bullets or numbered items)."""
        suggestions: List[str] = []
        for line in explanation.split('\n'):
            line = line.strip()
            # Bullet points ('-', '*') or numbered items ('1.', '12.', ...).
            if line.startswith(('-', '*')) or (line and line[0].isdigit() and '.' in line[:3]):
                clean_line = line.lstrip('-*0123456789. ').strip()
                if clean_line and len(clean_line) > 10:  # Meaningful suggestion
                    suggestions.append(clean_line)
        # If no suggestions found, add a general one
        if not suggestions:
            suggestions.append('Texto analizado con principios de lenguaje claro')
        return suggestions[:5]  # Limit to 5

    def _calculate_readability_from_analysis(self, text: str, issues: List[str]) -> float:
        """Score readability in [0, 100] from sentence length and issue count.

        Texts averaging ~20 words/sentence score highest; each detected issue
        subtracts 8 points.
        """
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        if not sentences:
            return 50.0  # neutral score for unsplittable text
        avg_length = sum(len(s.split()) for s in sentences) / len(sentences)
        score = 100 - abs(avg_length - 20) * 2
        score -= len(issues) * 8
        return max(0, min(100, score))

    def _calculate_complexity_from_analysis(self, issues: List[str]) -> float:
        """Score simplicity in [0, 100]: 100 minus 12 per detected issue."""
        score = 100.0
        score -= len(issues) * 12
        return max(0, min(100, score))

    def _get_sentence_stats(self, sentences: List[str]) -> Dict[str, Any]:
        """Return sentence statistics; >30 words counts as a long sentence."""
        if not sentences:
            return {'count': 0, 'avg_length': 0, 'long_sentences': 0}
        sentence_lengths = [len(s.split()) for s in sentences]
        return {
            'count': len(sentences),
            'avg_length': sum(sentence_lengths) / len(sentences),
            'max_length': max(sentence_lengths),
            'min_length': min(sentence_lengths),
            'long_sentences': sum(1 for n in sentence_lengths if n > 30)
        }

    def _get_vocabulary_stats(self, words: List[str]) -> Dict[str, Any]:
        """Return vocabulary statistics (case-insensitive lexical diversity)."""
        if not words:
            return {'total_words': 0, 'unique_words': 0, 'lexical_diversity': 0}
        unique_words = {w.lower() for w in words}
        return {
            'total_words': len(words),
            'unique_words': len(unique_words),
            'lexical_diversity': len(unique_words) / len(words),
            'avg_word_length': sum(len(w) for w in words) / len(words)
        }

    def _detect_jargon(self, words: List[str]) -> List[str]:
        """Detect potential jargon/technical terms, capped at 10.

        Long words (>12 chars after stripping punctuation) are collected
        first, then known administrative jargon, preserving text order
        within each pass and deduplicating across both.
        """
        cleaned = [w.lower().strip(self._WORD_PUNCT) for w in words]
        jargon: List[str] = []
        # Pass 1: unusually long words are likely technical terms.
        for word in cleaned:
            if len(word) > 12 and word not in jargon:
                jargon.append(word)
        # Pass 2: known administrative jargon.
        for word in cleaned:
            if word in self._ADMIN_JARGON and word not in jargon:
                jargon.append(word)
        return jargon[:10]  # Limit to 10 terms

    def _fallback_analysis(self, text: str) -> Dict[str, Any]:
        """
        Fallback analysis when Groq is not available.

        Uses simple local heuristics: sentence-length based readability and a
        long-sentence penalty for complexity.
        """
        logger.warning("Using fallback analysis - Groq API not available")
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        words = text.split()
        if not sentences or not words:
            return self._get_empty_result()
        avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
        readability_score = max(0, 100 - abs(avg_sentence_length - 20) * 2)
        long_sentences = [s for s in sentences if len(s.split()) > 30]
        complexity_score = max(0, 100 - len(long_sentences) * 10)
        overall_score = (readability_score + complexity_score) / 2
        # Compute jargon once and reuse (original called _detect_jargon twice).
        jargon_words = self._detect_jargon(words)
        return {
            'overall_score': overall_score,
            'readability_score': readability_score,
            'complexity_score': complexity_score,
            'sentence_stats': self._get_sentence_stats(sentences),
            'vocabulary_stats': self._get_vocabulary_stats(words),
            'readability_metrics': {'issues_detected': []},
            'grammar_stats': {'issues_count': 0},
            'jargon_count': len(jargon_words),
            'jargon_words': jargon_words,
            'long_sentences_count': len(long_sentences),
            'suggestions': [
                'Groq API no disponible - usando análisis simple',
                'Configurar GROQ_API_KEY para análisis completo'
            ]
        }

    def _get_empty_result(self) -> Dict[str, Any]:
        """Return a zeroed result dict for empty or invalid input text."""
        return {
            'overall_score': 0,
            'readability_score': 0,
            'complexity_score': 0,
            'sentence_stats': {'count': 0, 'avg_length': 0, 'long_sentences': 0},
            'vocabulary_stats': {'total_words': 0, 'unique_words': 0, 'lexical_diversity': 0},
            'readability_metrics': {},
            'grammar_stats': {},
            'jargon_count': 0,
            'jargon_words': [],
            'long_sentences_count': 0,
            'suggestions': ['Texto vacío o inválido']
        }