"""
Aclarador Analyzer Wrapper

Uses Groq API to analyze text clarity based on Aclarador principles
"""

import logging
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)

# Try to import Groq
try:
    from groq import Groq
    GROQ_AVAILABLE = True
except ImportError:
    GROQ_AVAILABLE = False
    logger.warning("Groq library not available - install with: pip install groq")

# Sentence terminators: '.', '!', '?' and the ellipsis character.
_SENTENCE_END_RE = re.compile(r'[.!?…]+')

# A markdown header marker of level three or deeper.
_SECTION_HEADER_RE = re.compile(r'#{3,}')

# Leading list markers: '-', '*' or a one/two digit number followed by a dot
# (but not a decimal such as "1.5"). Repeated so "- **text" is handled too.
_LIST_MARKER_RE = re.compile(r'^(?:(?:[-*]+|\d{1,2}\.(?!\d))\s*)+')

# Result key -> lower-case keywords looked up in a section's header line.
_SECTION_KEYWORDS = (
    ('corregido', ('texto corregido',)),
    ('explicacion', ('explicación', 'explicacion')),
    ('principios', ('principios',)),
)

# Common administrative jargon in Spanish.
_ADMIN_JARGON = frozenset({
    'normativa', 'procedimiento', 'expediente', 'tramitación',
    'reglamento', 'disposición', 'resolución', 'acreditación',
    'competencias', 'subsanación', 'notificación', 'administrativo',
})

# Punctuation removed from both ends of a word before jargon checks.
_WORD_PUNCTUATION = '.,;:¿?¡!'

_LONG_SENTENCE_WORDS = 30   # a sentence with more words than this is "long"
_LONG_WORD_CHARS = 12       # a word with more characters is treated as jargon
_IDEAL_SENTENCE_WORDS = 20  # average sentence length that scores 100


class AclaradorAnalyzer:
    """
    Clarity analyzer using Groq API

    Based on Aclarador's system prompt for Spanish clarity analysis.
    When the Groq library or the GROQ_API_KEY environment variable is
    missing, a heuristic fallback analysis is used instead.
    """

    def __init__(self):
        self.system_prompt = self._load_system_prompt()
        self.groq_client = None

        if GROQ_AVAILABLE:
            # Get Groq API key from environment
            api_key = os.getenv('GROQ_API_KEY')
            if api_key:
                self.groq_client = Groq(api_key=api_key)
                logger.info("✅ Aclarador analyzer initialized with Groq API")
            else:
                logger.warning("⚠️ GROQ_API_KEY not found - using fallback analyzer")
        else:
            logger.warning("⚠️ Groq not available - using fallback analyzer")

    def _load_system_prompt(self) -> str:
        """
        Load the system prompt from aclarador/system_prompt.md.

        If the file contains a fenced block (text between two ``` markers),
        only the first fenced block is returned; otherwise the whole file
        is used. The built-in default prompt is returned when the file
        cannot be read or yields an empty prompt.
        """
        prompt_path = Path(__file__).parent / 'aclarador' / 'system_prompt.md'
        try:
            content = prompt_path.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            logger.error(f"Error loading system prompt: {e}")
            return self._get_default_system_prompt()

        # Extract content between ``` markers; fall back to whole content
        parts = content.split('```')
        prompt = parts[1].strip() if len(parts) >= 3 else content

        if not prompt.strip():
            # An empty system prompt would give the model no instructions.
            logger.error("Error loading system prompt: prompt file is empty")
            return self._get_default_system_prompt()
        return prompt

    def _get_default_system_prompt(self) -> str:
        """Default system prompt if file not found"""
        return """Eres un experto en lenguaje claro especializado en la mejora de textos en español.

Analiza el texto y proporciona:
1. Una versión mejorada más clara
2. Explicación de las mejoras realizadas
3. Identificación de problemas de claridad"""

    def analyze(self, text: str, title: Optional[str] = None) -> Dict[str, Any]:
        """
        Analyze text using Groq API

        Args:
            text: The text to analyze.
            title: Accepted for interface compatibility; currently unused.

        Returns:
            A dict in Madrid Analyzer's expected format with clarity scores,
            statistics and suggestions. Empty or blank text yields the
            "empty" result; any Groq failure yields the fallback analysis.
        """
        # Nothing to analyze: avoid an API call (and a crash on None).
        if not text or not text.strip():
            return self._get_empty_result()

        # Use Groq if available, otherwise fallback
        if not self.groq_client:
            return self._fallback_analysis(text)

        try:
            response = self.groq_client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": text}
                ],
                temperature=0.3,
                max_tokens=2000
            )
            analysis_text = response.choices[0].message.content
            # Parse response and calculate scores
            return self._parse_groq_response(analysis_text or '', text)
        except Exception as e:
            # Boundary with an external service: log with traceback and
            # degrade to the heuristic analysis instead of failing.
            logger.exception("Error calling Groq API: %s", e)
            return self._fallback_analysis(text)

    def _split_sentences(self, text: str) -> List[str]:
        """Split text into stripped, non-empty sentences on . ! ? and …"""
        if not text:
            return []
        return [s.strip() for s in _SENTENCE_END_RE.split(text) if s.strip()]

    def _parse_groq_response(self, analysis_text: str, original_text: str) -> Dict[str, Any]:
        """
        Parse Groq's response and map to expected format

        Groq returns sections like:
        ### TEXTO CORREGIDO
        ### EXPLICACIÓN DE MEJORAS
        ### PRINCIPIOS APLICADOS

        Issues and suggestions come from the explanation section; the
        statistics and jargon list are computed from the original text.
        """
        sections = self._extract_sections(analysis_text)
        explanation = sections.get('explicacion', '')

        sentences = self._split_sentences(original_text)
        words = original_text.split()

        issues = self._extract_issues_from_explanation(explanation)

        # Both partial scores weigh equally in the overall score.
        readability_score = self._calculate_readability_from_analysis(original_text, issues)
        complexity_score = self._calculate_complexity_from_analysis(issues)
        overall_score = (readability_score * 0.5 + complexity_score * 0.5)

        jargon_words = self._detect_jargon(words)
        sentence_stats = self._get_sentence_stats(sentences)

        return {
            'overall_score': overall_score,
            'readability_score': readability_score,
            'complexity_score': complexity_score,
            'sentence_stats': sentence_stats,
            'vocabulary_stats': self._get_vocabulary_stats(words),
            'readability_metrics': {
                'issues_detected': issues,
                'corrected_text': sections.get('corregido', '')
            },
            'grammar_stats': {
                'issues_count': len(issues)
            },
            'jargon_count': len(jargon_words),
            'jargon_words': jargon_words,
            'long_sentences_count': sentence_stats.get('long_sentences', 0),
            'suggestions': self._extract_suggestions(explanation)
        }

    def _extract_sections(self, text: str) -> Dict[str, str]:
        """
        Extract sections from Groq response

        The text is cut at every '###' (or deeper) header marker. Each
        chunk is classified by its header line only, so a section body
        that merely mentions another section's name is not misfiled.
        Text before the first header is ignored.

        Returns:
            Dict with any of the keys 'corregido', 'explicacion' and
            'principios' mapped to the stripped section body. If a section
            appears more than once, the last occurrence wins.
        """
        sections: Dict[str, str] = {}
        if not text:
            return sections

        chunks = _SECTION_HEADER_RE.split(text)
        for chunk in chunks[1:]:  # chunks[0] precedes the first header
            header, _, body = chunk.partition('\n')
            header = header.lower()
            for key, keywords in _SECTION_KEYWORDS:
                if any(keyword in header for keyword in keywords):
                    sections[key] = body.strip()
                    break
        return sections

    def _extract_issues_from_explanation(self, explanation: str) -> List[str]:
        """
        Extract detected issues from explanation text

        Looks for Spanish keywords in the explanation and returns the
        matching issue codes, in a fixed order: 'long_sentences',
        'complex_vocabulary', 'passive_voice', 'redundancy'.
        """
        if not explanation:
            return []
        lowered = explanation.lower()

        def mentions(*keywords: str) -> bool:
            return any(keyword in lowered for keyword in keywords)

        issues = []
        # 'oracion' (no accent) also covers the plural "oraciones".
        if mentions('oración', 'oracion') and mentions('larga', 'compleja'):
            issues.append('long_sentences')
        if mentions('vocabulario', 'tecnicismo', 'jerga'):
            issues.append('complex_vocabulary')
        if mentions('pasiva'):  # also covers "voz pasiva"
            issues.append('passive_voice')
        # 'repeticion' (no accent) also covers the plural "repeticiones".
        if mentions('redundancia', 'repetición', 'repeticion'):
            issues.append('redundancy')
        return issues

    def _extract_suggestions(self, explanation: str) -> List[str]:
        """
        Extract improvement suggestions from explanation

        Every bulleted ('-', '*') or numbered ('1.') line whose text is
        longer than 10 characters becomes a suggestion. Only the list
        marker is removed, so digits that belong to the sentence are kept.

        Returns:
            At most 5 suggestions; a single generic suggestion when none
            were found.
        """
        suggestions = []
        for raw_line in (explanation or '').split('\n'):
            line = raw_line.strip()
            marker = _LIST_MARKER_RE.match(line)
            if not marker:
                continue
            clean_line = line[marker.end():].strip()
            if len(clean_line) > 10:  # Meaningful suggestion
                suggestions.append(clean_line)

        # If no suggestions found, add a general one
        if not suggestions:
            suggestions.append('Texto analizado con principios de lenguaje claro')

        return suggestions[:5]  # Limit to 5

    def _calculate_readability_from_analysis(self, text: str, issues: List[str]) -> float:
        """
        Calculate readability score based on text and detected issues

        Starts at 100, loses 2 points per word of distance between the
        average sentence length and 20 words, and 8 points per detected
        issue. The result is clamped to 0-100; text without sentences
        scores 50.
        """
        sentences = self._split_sentences(text)
        if not sentences:
            return 50.0

        avg_length = sum(len(s.split()) for s in sentences) / len(sentences)
        score = 100 - abs(avg_length - _IDEAL_SENTENCE_WORDS) * 2
        score -= len(issues) * 8
        return max(0.0, min(100.0, score))

    def _calculate_complexity_from_analysis(self, issues: List[str]) -> float:
        """Calculate complexity score: 100 minus 12 per issue, clamped to 0-100"""
        score = 100.0 - len(issues) * 12
        return max(0.0, min(100.0, score))

    def _get_sentence_stats(self, sentences: List[str]) -> Dict[str, Any]:
        """
        Get statistics about sentences

        Returns:
            Dict with 'count', 'avg_length', 'max_length', 'min_length'
            (all measured in words) and 'long_sentences' (number of
            sentences with more than 30 words). All keys are present,
            with zero values, for an empty list.
        """
        if not sentences:
            return {
                'count': 0,
                'avg_length': 0,
                'max_length': 0,
                'min_length': 0,
                'long_sentences': 0
            }

        lengths = [len(s.split()) for s in sentences]
        return {
            'count': len(sentences),
            'avg_length': sum(lengths) / len(sentences),
            'max_length': max(lengths),
            'min_length': min(lengths),
            'long_sentences': sum(1 for n in lengths if n > _LONG_SENTENCE_WORDS)
        }

    def _get_vocabulary_stats(self, words: List[str]) -> Dict[str, Any]:
        """
        Get statistics about vocabulary

        Returns:
            Dict with 'total_words', 'unique_words' (case-insensitive),
            'lexical_diversity' (unique / total) and 'avg_word_length'.
            All keys are present, with zero values, for an empty list.
        """
        if not words:
            return {
                'total_words': 0,
                'unique_words': 0,
                'lexical_diversity': 0,
                'avg_word_length': 0
            }

        unique_words = {w.lower() for w in words}
        return {
            'total_words': len(words),
            'unique_words': len(unique_words),
            'lexical_diversity': len(unique_words) / len(words),
            'avg_word_length': sum(len(w) for w in words) / len(words)
        }

    def _detect_jargon(self, words: List[str]) -> List[str]:
        """
        Detect potential jargon/technical terms

        Words are lower-cased and stripped of surrounding punctuation.
        Long words (more than 12 characters) are listed first, followed by
        known administrative terms; duplicates are dropped.

        Returns:
            At most 10 terms, in first-seen order within each group.
        """
        cleaned = [w.lower().strip(_WORD_PUNCTUATION) for w in words]

        jargon: List[str] = []
        seen = set()
        # Two passes keep long words ahead of the known-jargon terms.
        passes = (
            (w for w in cleaned if len(w) > _LONG_WORD_CHARS),
            (w for w in cleaned if w in _ADMIN_JARGON),
        )
        for candidates in passes:
            for word in candidates:
                if word not in seen:
                    seen.add(word)
                    jargon.append(word)

        return jargon[:10]  # Limit to 10 terms

    def _fallback_analysis(self, text: str) -> Dict[str, Any]:
        """
        Fallback analysis when Groq is not available

        Uses simple heuristics: readability from the distance between the
        average sentence length and 20 words, complexity from the number
        of long sentences (10 points each). Text without sentences or
        words yields the "empty" result.
        """
        logger.warning("Using fallback analysis - Groq API not available")

        sentences = self._split_sentences(text)
        words = text.split() if text else []

        if not sentences or not words:
            return self._get_empty_result()

        sentence_stats = self._get_sentence_stats(sentences)
        long_sentences_count = sentence_stats['long_sentences']
        jargon_words = self._detect_jargon(words)

        # Simple scoring
        readability_score = max(
            0.0, 100 - abs(sentence_stats['avg_length'] - _IDEAL_SENTENCE_WORDS) * 2
        )
        complexity_score = max(0.0, 100.0 - long_sentences_count * 10)
        overall_score = (readability_score + complexity_score) / 2

        return {
            'overall_score': overall_score,
            'readability_score': readability_score,
            'complexity_score': complexity_score,
            'sentence_stats': sentence_stats,
            'vocabulary_stats': self._get_vocabulary_stats(words),
            'readability_metrics': {'issues_detected': []},
            'grammar_stats': {'issues_count': 0},
            'jargon_count': len(jargon_words),
            'jargon_words': jargon_words,
            'long_sentences_count': long_sentences_count,
            'suggestions': [
                'Groq API no disponible - usando análisis simple',
                'Configurar GROQ_API_KEY para análisis completo'
            ]
        }

    def _get_empty_result(self) -> Dict[str, Any]:
        """
        Return empty result for invalid text

        Carries the same keys as a regular result (including the nested
        'issues_detected' and 'issues_count') so callers can read them
        without checking for missing keys.
        """
        return {
            'overall_score': 0,
            'readability_score': 0,
            'complexity_score': 0,
            'sentence_stats': self._get_sentence_stats([]),
            'vocabulary_stats': self._get_vocabulary_stats([]),
            'readability_metrics': {'issues_detected': []},
            'grammar_stats': {'issues_count': 0},
            'jargon_count': 0,
            'jargon_words': [],
            'long_sentences_count': 0,
            'suggestions': ['Texto vacío o inválido']
        }