Spaces:
Sleeping
Sleeping
| """ | |
| Aclarador Analyzer Wrapper | |
| Uses Groq API to analyze text clarity based on Aclarador principles | |
| """ | |
| import logging | |
| import os | |
| from typing import Dict, Any, List | |
| from pathlib import Path | |
| logger = logging.getLogger(__name__) | |
| # Try to import Groq | |
| try: | |
| from groq import Groq | |
| GROQ_AVAILABLE = True | |
| except ImportError: | |
| GROQ_AVAILABLE = False | |
| logger.warning("Groq library not available - install with: pip install groq") | |
class AclaradorAnalyzer:
    """
    Clarity analyzer using Groq API.

    Based on Aclarador's system prompt for Spanish clarity analysis.
    When the Groq client cannot be created (library missing or GROQ_API_KEY
    unset), every call falls back to simple local heuristics, so callers
    always receive a result in the same dictionary format.
    """

    def __init__(self):
        # Prompt is loaded once per instance; loading failures fall back to
        # a built-in default inside _load_system_prompt.
        self.system_prompt = self._load_system_prompt()
        self.groq_client = None
        if GROQ_AVAILABLE:
            # Get Groq API key from environment
            api_key = os.getenv('GROQ_API_KEY')
            if api_key:
                self.groq_client = Groq(api_key=api_key)
                logger.info("✅ Aclarador analyzer initialized with Groq API")
            else:
                logger.warning("⚠️ GROQ_API_KEY not found - using fallback analyzer")
        else:
            logger.warning("⚠️ Groq not available - using fallback analyzer")

    def _load_system_prompt(self) -> str:
        """Load the system prompt from aclarador/system_prompt.md.

        Returns:
            The content of the first ```-fenced block when the file contains
            one, otherwise the whole file content; on any error, the built-in
            default prompt.
        """
        try:
            prompt_path = Path(__file__).parent / 'aclarador' / 'system_prompt.md'
            # pathlib read_text replaces the manual open/read pair.
            content = prompt_path.read_text(encoding='utf-8')
            # Extract content between ``` markers (first fenced block).
            if '```' in content:
                parts = content.split('```')
                if len(parts) >= 3:
                    return parts[1].strip()
            # Fallback: use whole content
            return content
        except Exception as e:
            # Deliberately broad: a missing or unreadable prompt file must
            # never prevent the analyzer from starting.
            logger.error(f"Error loading system prompt: {e}")
            return self._get_default_system_prompt()

    def _get_default_system_prompt(self) -> str:
        """Return the default system prompt used when system_prompt.md is unavailable."""
        return """Eres un experto en lenguaje claro especializado en la mejora de textos en español.
Analiza el texto y proporciona:
1. Una versión mejorada más clara
2. Explicación de las mejoras realizadas
3. Identificación de problemas de claridad"""

    def analyze(self, text: str, title: str = None) -> Dict[str, Any]:
        """
        Analyze text using the Groq API.

        Args:
            text: Text to analyze.
            title: Optional title; currently unused, kept for interface
                compatibility with existing callers.

        Returns:
            Madrid Analyzer's expected format with clarity scores and
            suggestions (see _parse_groq_response / _fallback_analysis).
        """
        # Use Groq if available, otherwise fallback
        if not self.groq_client:
            return self._fallback_analysis(text)
        try:
            # Call Groq API
            response = self.groq_client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": text}
                ],
                temperature=0.3,
                max_tokens=2000
            )
            # Extract response text
            analysis_text = response.choices[0].message.content
            # Parse response and calculate scores
            return self._parse_groq_response(analysis_text, text)
        except Exception as e:
            # Fix: route the traceback through the logging system instead of
            # traceback.print_exc(), which wrote to stderr and bypassed the
            # configured log handlers.
            logger.exception(f"Error calling Groq API: {e}")
            return self._fallback_analysis(text)

    def _parse_groq_response(self, analysis_text: str, original_text: str) -> Dict[str, Any]:
        """
        Parse Groq's response and map it to the expected result format.

        Groq returns sections like:
            ### TEXTO CORREGIDO
            ### EXPLICACIÓN DE MEJORAS
            ### PRINCIPIOS APLICADOS

        Args:
            analysis_text: Raw LLM response text.
            original_text: The text that was analyzed (used for statistics).

        Returns:
            Result dictionary with scores, statistics and suggestions.
        """
        # Extract sections
        sections = self._extract_sections(analysis_text)
        # Analyze original text for statistics; naive '.'-split sentence model.
        sentences = [s.strip() for s in original_text.split('.') if s.strip()]
        words = original_text.split()
        # Detect issues from the explanation
        issues = self._extract_issues_from_explanation(sections.get('explicacion', ''))
        # Calculate scores based on analysis
        readability_score = self._calculate_readability_from_analysis(original_text, issues)
        complexity_score = self._calculate_complexity_from_analysis(issues)
        overall_score = (readability_score * 0.5 + complexity_score * 0.5)
        # Extract suggestions from explanation
        suggestions = self._extract_suggestions(sections.get('explicacion', ''))
        # Detect jargon from original text
        jargon_words = self._detect_jargon(words)
        # Get sentence statistics
        sentence_stats = self._get_sentence_stats(sentences)
        vocabulary_stats = self._get_vocabulary_stats(words)
        return {
            'overall_score': overall_score,
            'readability_score': readability_score,
            'complexity_score': complexity_score,
            'sentence_stats': sentence_stats,
            'vocabulary_stats': vocabulary_stats,
            'readability_metrics': {
                'issues_detected': issues,
                'corrected_text': sections.get('corregido', '')
            },
            'grammar_stats': {
                'issues_count': len(issues)
            },
            'jargon_count': len(jargon_words),
            'jargon_words': jargon_words,
            'long_sentences_count': sentence_stats.get('long_sentences', 0),
            'suggestions': suggestions
        }

    def _extract_sections(self, text: str) -> Dict[str, str]:
        """Extract named sections from a '###'-delimited Groq response.

        Returns:
            Dict with any of the keys 'corregido', 'explicacion',
            'principios'; empty when the expected headers are absent.
        """
        sections = {}
        # Look for section headers
        if '### TEXTO CORREGIDO' in text or '###TEXTO CORREGIDO' in text:
            for part in text.split('###'):
                part_lower = part.lower()
                # Drop the header line itself; the rest is the section body.
                # (Parts produced by split('###') cannot contain '###', so the
                # original extra split on it was dead code and is removed.)
                content = part.split('\n', 1)[1] if '\n' in part else part
                if 'texto corregido' in part_lower:
                    sections['corregido'] = content.strip()
                elif 'explicación' in part_lower or 'explicacion' in part_lower:
                    sections['explicacion'] = content.strip()
                elif 'principios' in part_lower:
                    sections['principios'] = content.strip()
        return sections

    def _extract_issues_from_explanation(self, explanation: str) -> List[str]:
        """Extract detected issue tags from the explanation text.

        Returns:
            Subset of ['long_sentences', 'complex_vocabulary',
            'passive_voice', 'redundancy'], keyword-matched in that order.
        """
        issues = []
        explanation_lower = explanation.lower()
        # Check for common issue mentions
        if 'oración' in explanation_lower and ('larga' in explanation_lower or 'compleja' in explanation_lower):
            issues.append('long_sentences')
        if 'vocabulario' in explanation_lower or 'tecnicismo' in explanation_lower or 'jerga' in explanation_lower:
            issues.append('complex_vocabulary')
        if 'voz pasiva' in explanation_lower or 'pasiva' in explanation_lower:
            issues.append('passive_voice')
        if 'redundancia' in explanation_lower or 'repetición' in explanation_lower:
            issues.append('redundancy')
        return issues

    def _extract_suggestions(self, explanation: str) -> List[str]:
        """Extract up to 5 improvement suggestions from the explanation.

        Bullet ('-'/'*') and numbered lines are kept; always returns at least
        one generic suggestion.
        """
        suggestions = []
        # Split by sections in the explanation
        for line in explanation.split('\n'):
            line = line.strip()
            # Look for bullet points or numbered items
            if line.startswith('-') or line.startswith('*') or (line and line[0].isdigit() and '.' in line[:3]):
                # Clean up the line (strip bullet/number prefix)
                clean_line = line.lstrip('-*0123456789. ').strip()
                if clean_line and len(clean_line) > 10:  # Meaningful suggestion
                    suggestions.append(clean_line)
        # If no suggestions found, add a general one
        if not suggestions:
            suggestions.append('Texto analizado con principios de lenguaje claro')
        return suggestions[:5]  # Limit to 5

    def _calculate_readability_from_analysis(self, text: str, issues: List[str]) -> float:
        """Score readability in [0, 100] from sentence length and issue count.

        Ideal average sentence length is 20 words; each detected issue
        subtracts 8 points. Returns 50.0 when there are no sentences.
        """
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        if not sentences:
            return 50.0
        # Base score from sentence structure
        avg_length = sum(len(s.split()) for s in sentences) / len(sentences)
        score = 100 - abs(avg_length - 20) * 2
        # Penalize for issues
        score -= len(issues) * 8
        return max(0, min(100, score))

    def _calculate_complexity_from_analysis(self, issues: List[str]) -> float:
        """Score complexity in [0, 100]; higher means simpler text.

        Starts at 100 and deducts 12 points per detected issue type.
        """
        score = 100.0
        score -= len(issues) * 12
        return max(0, min(100, score))

    def _get_sentence_stats(self, sentences: List[str]) -> Dict[str, Any]:
        """Return count/avg/max/min word-length statistics for sentences.

        A sentence with more than 30 words counts as 'long'.
        """
        if not sentences:
            return {'count': 0, 'avg_length': 0, 'long_sentences': 0}
        sentence_lengths = [len(s.split()) for s in sentences]
        long_sentences = [s for s in sentences if len(s.split()) > 30]
        return {
            'count': len(sentences),
            'avg_length': sum(sentence_lengths) / len(sentences),
            'max_length': max(sentence_lengths) if sentence_lengths else 0,
            'min_length': min(sentence_lengths) if sentence_lengths else 0,
            'long_sentences': len(long_sentences)
        }

    def _get_vocabulary_stats(self, words: List[str]) -> Dict[str, Any]:
        """Return word-count, uniqueness and average-length statistics.

        Uniqueness is case-insensitive; lexical diversity is
        unique_words / total_words.
        """
        if not words:
            return {'total_words': 0, 'unique_words': 0, 'lexical_diversity': 0}
        unique_words = set(w.lower() for w in words)
        lexical_diversity = len(unique_words) / len(words)
        return {
            'total_words': len(words),
            'unique_words': len(unique_words),
            'lexical_diversity': lexical_diversity,
            'avg_word_length': sum(len(w) for w in words) / len(words)
        }

    def _detect_jargon(self, words: List[str]) -> List[str]:
        """Detect potential jargon/technical terms (at most 10, deduplicated).

        Long words (>12 chars) are listed first, then known Spanish
        administrative jargon — same ordering as the original two-pass scan.
        """
        # Common administrative jargon in Spanish (set: O(1) membership).
        admin_jargon = {
            'normativa', 'procedimiento', 'expediente', 'tramitación',
            'reglamento', 'disposición', 'resolución', 'acreditación',
            'competencias', 'subsanación', 'notificación', 'administrativo'
        }
        jargon: List[str] = []
        seen = set()  # Fix: O(1) dedup instead of list-membership scans.
        # First pass: long words (likely technical).
        for word in words:
            clean_word = word.lower().strip('.,;:¿?¡!')
            if len(clean_word) > 12 and clean_word not in seen:
                seen.add(clean_word)
                jargon.append(clean_word)
        # Second pass: known administrative jargon.
        for word in words:
            clean_word = word.lower().strip('.,;:¿?¡!')
            if clean_word in admin_jargon and clean_word not in seen:
                seen.add(clean_word)
                jargon.append(clean_word)
        return jargon[:10]  # Limit to 10 terms

    def _fallback_analysis(self, text: str) -> Dict[str, Any]:
        """
        Fallback analysis when Groq is not available.

        Uses simple heuristics (sentence lengths only) and returns the same
        dictionary shape as _parse_groq_response.
        """
        logger.warning("Using fallback analysis - Groq API not available")
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        words = text.split()
        if not sentences or not words:
            return self._get_empty_result()
        # Simple scoring
        avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
        readability_score = max(0, 100 - abs(avg_sentence_length - 20) * 2)
        long_sentences = [s for s in sentences if len(s.split()) > 30]
        complexity_score = max(0, 100 - len(long_sentences) * 10)
        overall_score = (readability_score + complexity_score) / 2
        # Fix: detect jargon once (the original called _detect_jargon twice,
        # once for the count and once for the list).
        jargon_words = self._detect_jargon(words)
        return {
            'overall_score': overall_score,
            'readability_score': readability_score,
            'complexity_score': complexity_score,
            'sentence_stats': self._get_sentence_stats(sentences),
            'vocabulary_stats': self._get_vocabulary_stats(words),
            'readability_metrics': {'issues_detected': []},
            'grammar_stats': {'issues_count': 0},
            'jargon_count': len(jargon_words),
            'jargon_words': jargon_words,
            'long_sentences_count': len(long_sentences),
            'suggestions': [
                'Groq API no disponible - usando análisis simple',
                'Configurar GROQ_API_KEY para análisis completo'
            ]
        }

    def _get_empty_result(self) -> Dict[str, Any]:
        """Return a zeroed result dictionary for empty or invalid text."""
        return {
            'overall_score': 0,
            'readability_score': 0,
            'complexity_score': 0,
            'sentence_stats': {'count': 0, 'avg_length': 0, 'long_sentences': 0},
            'vocabulary_stats': {'total_words': 0, 'unique_words': 0, 'lexical_diversity': 0},
            'readability_metrics': {},
            'grammar_stats': {},
            'jargon_count': 0,
            'jargon_words': [],
            'long_sentences_count': 0,
            'suggestions': ['Texto vacío o inválido']
        }