Spaces:
Sleeping
Sleeping
| """ | |
| simplification_service.py | |
| ========================= | |
| Context-aware text simplification for the Highlight-to-Understand feature. | |
| When a student highlights text, this service: | |
| 1. Simplifies without destroying literary voice | |
| 2. Explains the author's intent | |
| 3. Provides vocabulary support with relatable analogies | |
| 4. Adjusts explanation level based on student reading profile | |
| 5. Detects literary devices (metaphor, irony, foreshadowing, etc.) | |
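| 3-tier architecture (tried in order): | |
| Tier 1: Gemini API (best quality) | |
| Tier 1.5: Hugging Face Inference (optional; used only when USE_HF_INFERENCE=1 and HF_API_TOKEN is set) | |
| Tier 2: Rule-based (always available) | |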
| """ | |
| from __future__ import annotations | |
| import re | |
| from typing import Any, Dict, List, Optional | |
| from services.gemini_service import GeminiService | |
| import os | |
| from services.hf_inference_service import HFInferenceService | |
| from services.pronunciation_service import PronunciationService | |
# Shared service instances, created once when this module is imported.
# The HF client receives the HF_API_TOKEN environment variable; os.getenv
# returns None when the variable is unset.
_hf_inference = HFInferenceService(os.getenv("HF_API_TOKEN"))
# Used by SimplificationService.simplify for single-word phonics breakdowns.
_pronunciation = PronunciationService()
# ── Kinyarwanda cultural bridge ──────────────────────────────────────────────
# Maps Western literary concepts → Rwandan cultural parallels.
# Helps students from Rwanda connect unfamiliar ideas to lived experience.
#
# Keys are lower-case English concept words; values are ready-to-display
# sentences in which *asterisks* mark Kinyarwanda terms.
# Entry order matters: _get_kinyarwanda_bridge walks this dict in insertion
# order and returns the first concept it finds, so earlier entries win when a
# passage mentions several concepts.
# NOTE(review): the Kinyarwanda glosses have not been verified here; a few look
# doubtful (e.g. *inkike* for hills, *ubusa* for loneliness, *umuganda* as a
# celebration) — have a Kinyarwanda speaker confirm before relying on them.
_KINYARWANDA_BRIDGE: Dict[str, str] = {
    # Honour & family
    "honour": "Like *icyubahiro* (respect) in Rwandan culture — something earned through how you treat others.",
    "honor": "Like *icyubahiro* (respect) in Rwandan culture — something earned through how you treat others.",
    "noble": "Think of the *umwami* (king) and the respect shown to community elders.",
    "loyalty": "Like the loyalty of *inshuti* (close friends) who stand with you no matter what.",
    "clan": "Similar to *umuryango* — your extended family who share your identity and history.",
    "tribe": "Like *ubwoko* — the group you belong to that shapes your community ties.",
    # Fate & nature
    "destiny": "Like *inzira* (the path) — Rwandans believe each person walks a path shaped by God (*Imana*).",
    "fate": "Similar to the Rwandan idea that *Imana* (God) guides every life's course.",
    "harvest": "Like the *isarura* — the season when families come together to gather crops from the hills.",
    "drought": "Like the *inzara* (hunger seasons) that families in rural Rwanda still endure.",
    # Power & conflict
    "tyranny": "Like oppressive rule — Rwanda's history of *ubukuru bubi* (bad leadership) makes this very real.",
    "exile": "Like those forced to leave home — many Rwandans know exile from 1959 and 1994.",
    "war": "Like *intambara* — Rwanda's history makes the weight of war deeply understood.",
    "betrayal": "Like breaking *amasezerano* (a sworn agreement) — considered deeply shameful.",
    # Social life
    "feast": "Like an *umuganda* celebration or wedding feast — a gathering of community with food and song.",
    "market": "Like the *amasoko* — where community members exchange goods and news.",
    "elder": "Like *umukuru* — the respected voice of wisdom in every Rwandan village.",
    "ancestors": "Like *imigabane y'ababyeyi* — the legacy and spirit of those who came before.",
    # Love & relationships
    "courtship": "Like traditional *gusaba* — the formal process of asking a family for their daughter's hand.",
    "jealousy": "Like *ishyari* — the feeling of wanting what another person has.",
    # Abstract concepts
    "ambition": "Like *ubushake* (strong desire) — the drive to achieve that many Rwandan students carry.",
    "grief": "Like *agahinda* — the deep sorrow felt when loss touches a family.",
    "justice": "Like *ubutabera* and the *gacaca* courts — where truth and community healing meet.",
    "identity": "Like *indangamuntu* (identity card) — but deeper, who you are inside, your values and story.",
    # Education & learning
    "school": "Like *ishuri* — a place of growth and opportunity that many Rwandan families sacrifice for.",
    "teacher": "Like *umwarimu* — a deeply respected figure who shapes the next generation.",
    "student": "Like *umunyeshuri* — one who works hard today for a better tomorrow.",
    "exam": "Like *ikizamini* — a test of what you have learned, taken seriously in Rwanda's education system.",
    "knowledge": "Like *ubumenyi* — the wisdom passed down from elders and gained through study.",
    "learning": "Like *kwiga* — the lifelong Rwandan value of seeking understanding and growth.",
    # Nature & environment
    "river": "Like the *Nyabarongo* — Rwanda's largest river, central to life, farming, and history.",
    "mountain": "Like the *ibirunga* (volcanoes) of the north — awe-inspiring landmarks that shape the land.",
    "forest": "Like *ishyamba* — forests that many Rwandan families depend on for firewood and food.",
    "rain": "Like *imvura* — in Rwanda, rain is a blessing that brings green hills to life.",
    "sun": "Like *izuba* — the sun that rises over a thousand hills every morning.",
    "hill": "Like the *inkike* (hills) that define Rwanda's famous landscape — *pays des mille collines*.",
    # Community & belonging
    "community": "Like *umuryango w'abaturage* — the village community that raises every child together.",
    "refugee": "Like *impunzi* — a word with deep meaning in Rwanda's history of displacement and return.",
    "home": "Like *urugo* — the family homestead, a sacred place of belonging.",
    "neighbour": "Like *umuturanyi* — whom Rwandans greet every morning by name.",
    "stranger": "Like *umunyamahanga* — in Rwandan culture, even strangers deserve hospitality (*ubushyingiro*).",
    # Modern concepts
    "government": "Like *leta* — the state that many Rwandans have seen transform dramatically since 1994.",
    "democracy": "Like *demokarasi* — a concept Rwanda works towards through its *Gacaca* and local governance.",
    "city": "Like *umujyi wa Kigali* — a rapidly growing city that many students dream of reaching one day.",
    "technology": "Like Rwanda's *Smart Rwanda* vision — a country that has embraced mobile phones and internet.",
    "phone": "Like the mobile phones that connect *abaturage* (citizens) across Rwanda's hills.",
    # More literary themes
    "power": "Like *ingufu* — in Rwanda, the abuse of power has deep historical resonance.",
    "freedom": "Like *ubwigenge* — independence, celebrated every 1st of July in Rwanda.",
    "hope": "Like *ibyiringiro* — the hope that Rwanda's *Agaciro* (dignity, self-worth) is built upon.",
    "shame": "Like *isoni* — a powerful feeling in Rwandan culture, tied to community reputation.",
    "pride": "Like *ikuzo* — the pride Rwandans feel in their culture, resilience, and progress.",
    "courage": "Like *ubutwari* — praised in Rwandan history as the quality of great leaders and liberators.",
    "loneliness": "Like *ubusa* — an unusual feeling in a culture where communal ties are deep.",
    "forgiveness": "Like *imbabazi* — central to Rwanda's post-genocide reconciliation process.",
    "revenge": "Like *impyisi* (hyena) in Rwandan fables — seeking revenge but ultimately paying the price.",
    "corruption": "Like *akarengane* — a betrayal of *agaciro* (dignity) that Rwanda actively fights.",
    "sacrifice": "Like *gutanga* — giving something precious for others; a virtue celebrated in Rwandan songs.",
}
def _get_kinyarwanda_bridge(text: str) -> Optional[str]:
    """Return a Kinyarwanda cultural analogy for the first known concept in *text*.

    Concepts are tried in the insertion order of ``_KINYARWANDA_BRIDGE`` and the
    first one present in the text wins, so only one short bridge is returned.

    Matching is case-insensitive and on whole words (a simple ``s``/``es``
    plural is also accepted). A plain substring test would fire on unrelated
    words: "war" inside "toward", "sun" inside "Sunday", "exam" inside
    "example".

    Args:
        text: The highlighted passage.

    Returns:
        The analogy sentence, or ``None`` when no concept occurs in the text.
    """
    if not text:
        return None
    text_lower = text.lower()
    for concept, bridge in _KINYARWANDA_BRIDGE.items():
        # \b on both sides keeps the concept from matching inside longer words.
        if re.search(rf"\b{re.escape(concept)}(?:s|es)?\b", text_lower):
            return bridge
    return None
# ── Literary device detection ─────────────────────────────────────────────────
# Each entry is (regex, device name, learner-facing explanation).
# SimplificationService._detect_devices runs every regex with re.IGNORECASE and
# keeps one result per device name, so several patterns may share a name.
_DEVICE_PATTERNS = [
    (r"\blike\s+(?:a|an)\s+\w+", "simile", "a comparison using 'like' or 'as'"),
    (r"\bas\s+\w+\s+as\b", "simile", "a comparison using 'like' or 'as'"),
    (r"\bnot\s+\w+\s+but\s+\w+", "antithesis", "contrasting ideas placed side by side"),
    # Letters only: a bare (.) also fired on "...", "---" and runs of spaces.
    (r"([^\W\d_])\1{2,}", "repetition", "a word or sound repeated for emphasis"),
    # (?-i:...) keeps this pattern case-sensitive even under re.IGNORECASE, so
    # "O Romeo" matches but an ordinary lower-case "oh well" does not.
    (r"(?-i:\b(?:O|Oh)\s+[A-Z])", "apostrophe", "addressing someone absent or an abstract idea"),
]
# Archaic English word (lower-case) → modern equivalent.
# Used for in-place replacement by the rule-based tier and for vocabulary cards.
# NOTE: "art" is also an everyday modern noun, and "hence" is still in use;
# the rule-based tier replaces every occurrence it finds, so such words can be
# rewritten incorrectly.
_ARCHAIC_GLOSSARY: Dict[str, str] = {
    "thou": "you", "thee": "you", "thy": "your", "thine": "yours",
    "hath": "has", "doth": "does", "art": "are", "wilt": "will",
    "shalt": "shall", "ere": "before", "hence": "from here / therefore",
    "hither": "to here", "thither": "to there", "wherefore": "why",
    "forsooth": "truly", "prithee": "please", "methinks": "I think",
    "anon": "soon", "perchance": "perhaps", "nay": "no", "yea": "yes",
    "betwixt": "between", "whence": "from where", "oft": "often",
    "alas": "unfortunately",
}
class SimplificationService:
    """Context-aware simplification engine for highlighted text.

    ``simplify`` tries each tier in order and returns the first usable result:
    Gemini, then (optionally) Hugging Face Inference, then a local rule-based
    fallback.
    """

    # Punctuation peeled off a token before the archaic-glossary lookup and
    # re-attached around the replacement afterwards.
    _EDGE_PUNCT = ".,!?;:'\"()-"
    # The rule-based tier splits sentences longer than this many words.
    _MAX_SENTENCE_WORDS = 20

    def __init__(self):
        self.gemini = GeminiService()

    def simplify(
        self,
        highlighted_text: str,
        book_title: str = "",
        author: str = "",
        doc_type: str = "generic",
        chapter_context: str = "",
        speaker: str = "",
        reading_level: str = "intermediate",
        language: str = "en",
    ) -> Dict[str, Any]:
        """Simplify highlighted text with full context awareness.

        Args:
            highlighted_text: The passage (or single word) the student selected.
            book_title: Title of the source work; may be empty.
            author: Author of the source work; only used with ``book_title``.
            doc_type: Kind of document (e.g. ``"play"``, ``"generic"``).
            chapter_context: Surrounding text; the Gemini prompt uses the first
                500 characters.
            speaker: Character speaking the passage, if any.
            reading_level: ``"beginner"``, ``"intermediate"`` or ``"advanced"``.
            language: ``"fr"`` selects the French Gemini prompt; anything else
                uses the English one.

        Returns:
            {
                "simple_version": str,
                "author_intent": str,
                "vocabulary": [{word, meaning, analogy, ...}],
                "literary_devices": [{device, explanation}],
                "cultural_context": str | None,
                "kinyarwanda_bridge": str | None,
                "phonics": Dict[str, Any] | None,
                "tier": "gemini" | "hf_inference" | "rule_based",
            }
            Results from the remote tiers may carry extra keys supplied by the
            model (for example ``phonics_guide``).
        """
        # Phonics breakdown applies to a single highlighted word only.
        phonics_data = None
        clean_text = highlighted_text.strip(".,!?;:\"' ").lower()
        if len(clean_text) > 1 and len(clean_text.split()) == 1:
            phonics_data = _pronunciation.get_phonics_breakdown(highlighted_text)

        # Tier 1: Gemini cloud LLM.
        if self.gemini.is_available():
            try:
                result = self._simplify_gemini(
                    highlighted_text, book_title, author, doc_type,
                    chapter_context, speaker, reading_level, language,
                )
                if result and result.get("simple_version"):
                    result["tier"] = "gemini"
                    # Priority: dedicated vocabulary call, then the vocabulary
                    # from the main answer, then the local glossary.
                    result["vocabulary"] = self._merge_vocabulary(
                        self._extract_vocabulary_gemini(highlighted_text, reading_level),
                        result.get("vocabulary"),
                        self._extract_vocabulary(highlighted_text),
                    )
                    result["literary_devices"] = self._detect_devices(highlighted_text)
                    if not result.get("kinyarwanda_bridge"):
                        result["kinyarwanda_bridge"] = _get_kinyarwanda_bridge(highlighted_text)
                    # Prefer Gemini's pronunciation guide when it is well formed.
                    result["phonics"] = (
                        self._phonics_from_gemini(highlighted_text, result.get("phonics_guide"))
                        or phonics_data
                    )
                    return result
            except Exception as e:  # tier boundary: any failure falls through
                print(f"Gemini simplification failed: {e}")

        # Tier 1.5: HF Inference (serverless), opt-in via environment.
        if os.getenv("USE_HF_INFERENCE") == "1" and _hf_inference.api_token:
            try:
                result = _hf_inference.simplify_text(
                    highlighted_text, book_title, author, doc_type,
                    chapter_context, speaker, reading_level, language
                )
                if result and result.get("simple_version"):
                    result["tier"] = "hf_inference"
                    if not result.get("kinyarwanda_bridge"):
                        result["kinyarwanda_bridge"] = _get_kinyarwanda_bridge(highlighted_text)
                    # Guarantee the documented key even if the model omitted it.
                    if not result.get("vocabulary"):
                        result["vocabulary"] = self._extract_vocabulary(highlighted_text)
                    result["literary_devices"] = self._detect_devices(highlighted_text)
                    result["phonics"] = phonics_data
                    return result
            except Exception as e:  # tier boundary: any failure falls through
                print(f"HF Inference simplification failed: {e}")

        # Tier 2: rule-based fallback.
        return self._simplify_rule_based(
            highlighted_text, book_title, author, doc_type,
            speaker, reading_level, language, phonics_data,
        )

    @staticmethod
    def _merge_vocabulary(*sources: Any) -> List[Dict[str, Any]]:
        """Merge vocabulary lists, earlier sources winning on duplicate words.

        Sources that are not lists, and entries that are not dicts with a
        non-empty string ``"word"``, are skipped: model output is untrusted.
        """
        merged: List[Dict[str, Any]] = []
        seen = set()
        for source in sources:
            if not isinstance(source, list):
                continue
            for entry in source:
                if not isinstance(entry, dict):
                    continue
                word = entry.get("word")
                if not isinstance(word, str) or not word.strip():
                    continue
                key = word.strip().lower()
                if key not in seen:
                    seen.add(key)
                    merged.append(entry)
        return merged

    @staticmethod
    def _phonics_from_gemini(word: str, guide: Any) -> Optional[Dict[str, Any]]:
        """Build the phonics payload from Gemini's guide, or None if malformed."""
        if not isinstance(guide, dict):
            return None
        syllables = guide.get("syllables")
        # A bare string would be joined character by character, so require a
        # non-empty list of strings.
        if not isinstance(syllables, list) or not syllables:
            return None
        if not all(isinstance(part, str) for part in syllables):
            return None
        return {
            "word": word,
            "syllables": syllables,
            "phonics": syllables,
            "display": " · ".join(syllables),
            "pronunciation": guide.get("pronunciation"),
        }

    def _simplify_gemini(
        self,
        text: str,
        book_title: str,
        author: str,
        doc_type: str,
        chapter_context: str,
        speaker: str,
        reading_level: str,
        language: str,
    ) -> Dict[str, Any]:
        """Ask Gemini for a JSON simplification of *text*.

        Returns the parsed dict, or ``{}`` when Gemini's answer is not a dict.
        ``doc_type`` is accepted for signature parity with the other tiers but
        is not used in the prompt.
        """
        level_desc = {
            "beginner": "a 9-year-old Primary 4 (P4) student",
            "intermediate": "a 10-year-old Primary 5 (P5) student",
            "advanced": "an 11-12 year old Primary 6 (P6) student",
        }.get(reading_level, "a 10-year-old Primary 5 student")
        speaker_ctx = f' spoken by the character "{speaker}"' if speaker else ""
        book_ctx = f' from "{book_title}" by {author}' if book_title else ""
        system_instruction = (
            f"You are an expert pedagogical AI specializing in simplifying literature for {level_desc}. "
            "You focus on clarity, cultural relevance (Rwanda), and preserving the author's emotional resonance. "
            "You provide extremely simple, visual explanations for difficult words, often using metaphors a child can understand."
        )
        if language == "fr":
            prompt = f"""Analyse ce passage{book_ctx}{speaker_ctx}:
"{text}"
Contexte du chapitre: {chapter_context[:500] if chapter_context else 'Non fourni'}
Réponds en JSON avec exactement ces clés:
{{
"simple_version": "Version simplifiée (2-3 phrases)",
"author_intent": "Pourquoi l'auteur a écrit cela ainsi",
"cultural_context": "Contexte culturel si pertinent, sinon null",
"kinyarwanda_bridge": "Analogie culturelle rwandaise — compare avec un concept rwandais (imigabane, umuganda, icyubahiro, etc.)",
"vocabulary": [
{{"word": "string", "meaning": "définition simple pour un enfant", "analogy": "comparaison simple", "category": "ex: Archaïque, Vocabulaire, Métaphore"}}
],
"phonics_guide": {{
"syllables": ["syl", "la", "be"],
"pronunciation": "pro-non-cia-cion"
}}
}} (phonics_guide uniquement pour un seul mot, sinon null)"""
        else:
            prompt = f"""Analyze this passage{book_ctx}{speaker_ctx}:
"{text}"
Chapter context: {chapter_context[:500] if chapter_context else 'Not provided'}
Respond in JSON with exactly these keys:
{{
"simple_version": "Simplified version (2-3 sentences)",
"author_intent": "Why the author wrote it this way",
"cultural_context": "Cultural context if relevant, otherwise null",
"kinyarwanda_bridge": "Rwanda cultural connection — compare to umuganda, gacaca, icyubahiro, or another Rwandan concept",
"vocabulary": [
{{"word": "string", "meaning": "simple child-friendly definition", "analogy": "simple comparison for a 10-year-old", "category": "e.g. Archaic, Vocabulary, Idiom"}}
],
"phonics_guide": {{
"syllables": ["syl", "la", "bles"],
"pronunciation": "pro-nun-see-ay-shun"
}}
}} (phonics_guide is only for single words, otherwise null)"""
        result = self.gemini.generate_json(prompt, system_instruction)
        return result if isinstance(result, dict) else {}

    def _simplify_rule_based(
        self,
        text: str,
        book_title: str,
        author: str,
        doc_type: str,
        speaker: str,
        reading_level: str,
        language: str,
        phonics_data: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Rule-based fallback: swap archaic words and split long sentences.

        ``reading_level`` and ``language`` are accepted for signature parity
        with the other tiers; this tier does not use them.
        """
        punct = self._EDGE_PUNCT
        replaced = []
        for token in text.split():
            core = token.strip(punct)
            modern = _ARCHAIC_GLOSSARY.get(core.lower())
            if modern is None:
                replaced.append(token)
                continue
            # Keep every punctuation mark on both sides of the word, so that
            # '"Thou' becomes '"You' and 'thee!"' becomes 'you!"'.
            lead = token[: len(token) - len(token.lstrip(punct))]
            trail = token[len(token.rstrip(punct)):]
            # Carry over an initial capital so sentence starts stay capitalised.
            if core[:1].isupper():
                modern = modern[:1].upper() + modern[1:]
            replaced.append(f"{lead}{modern}{trail}")
        simple_version = " ".join(replaced)

        # Break over-long sentences at the first ", and " or "; ".
        shortened: List[str] = []
        for sentence in re.split(r"(?<=[.!?])\s+", simple_version):
            shortened.extend(self._split_long_sentence(sentence))
        simple_version = " ".join(shortened)

        # Generic author-intent sentence.
        if doc_type == "play":
            who = f"{speaker} is saying" if speaker else "the author is saying"
            intent = f"In this {doc_type}, {who} something important about the situation or characters."
        else:
            intent = "The author uses these words to create a vivid picture and advance the story."
        if book_title:
            intent += f' This is from "{book_title}"'
            if author:
                intent += f" by {author}"
            intent += "."

        return {
            "simple_version": simple_version,
            "author_intent": intent,
            "vocabulary": self._extract_vocabulary(text),
            "literary_devices": self._detect_devices(text),
            "cultural_context": None,
            "kinyarwanda_bridge": _get_kinyarwanda_bridge(text),
            "phonics": phonics_data,
            "tier": "rule_based",
        }

    @classmethod
    def _split_long_sentence(cls, sentence: str) -> List[str]:
        """Split a sentence over the word limit in two at ", and " or "; "."""
        if len(sentence.split()) <= cls._MAX_SENTENCE_WORDS:
            return [sentence]
        for separator in (", and ", "; "):
            if separator in sentence:
                head, tail = sentence.split(separator, 1)
                tail = tail.strip()
                # Upper-case only the first letter: str.capitalize() would also
                # lower-case the rest and strip capitals from proper nouns.
                return [head + ".", tail[:1].upper() + tail[1:]]
        return [sentence]

    def _extract_vocabulary_gemini(self, text: str, reading_level: str) -> List[Dict[str, str]]:
        """Ask Gemini for 3-5 difficult words with child-friendly definitions.

        Falls back to the local glossary when Gemini is unavailable, raises,
        or returns something other than a list. Entries that are not dicts
        with a non-empty string ``"word"`` are dropped.
        """
        if not self.gemini.is_available():
            return self._extract_vocabulary(text)
        level_desc = {
            "beginner": "Primary 4 (9 years old)",
            "intermediate": "Primary 5 (10 years old)",
            "advanced": "Primary 6 (11-12 years old)",
        }.get(reading_level, "Primary 5 student")
        prompt = f"""Identify 3-5 difficult or important words from this text and provide child-friendly definitions:
"{text}"
The definitions should be suitable for a {level_desc}.
Respond in JSON as a list of objects with "word", "meaning", "analogy" (a simple comparison), and "category" (e.g. Vocabulary, Archaic, Figurative).
Example: [{{"word": "astounded", "meaning": "very surprised", "analogy": "like when you see a magic trick", "category": "Vocabulary"}}]
"""
        try:
            result = self.gemini.generate_json(prompt)
        except Exception as e:
            # This call only supplements the main answer, so a failure here
            # should degrade to the glossary rather than propagate.
            print(f"Gemini vocabulary extraction failed: {e}")
            return self._extract_vocabulary(text)
        if not isinstance(result, list):
            return self._extract_vocabulary(text)
        return [
            entry for entry in result
            if isinstance(entry, dict)
            and isinstance(entry.get("word"), str)
            and entry["word"].strip()
        ]

    def _extract_vocabulary(self, text: str) -> List[Dict[str, str]]:
        """Return one vocabulary card per distinct archaic word in *text*.

        Words are compared case-insensitively with surrounding apostrophes
        removed, so ``'Thou`` and ``thou`` count as the same word; the first
        spelling seen is the one reported.
        """
        vocab = []
        seen = set()
        for word in re.findall(r"[a-zA-Z']+", text):
            clean = word.lower().strip("'")
            if clean in seen or clean not in _ARCHAIC_GLOSSARY:
                continue
            seen.add(clean)
            meaning = _ARCHAIC_GLOSSARY[clean]
            vocab.append({
                "word": word.strip("'"),
                "meaning": meaning,
                "analogy": f'Think of it as a fancy old way of saying "{meaning}".',
                "type": "archaic",
                "category": "Archaic",
            })
        return vocab

    def _detect_devices(self, text: str) -> List[Dict[str, str]]:
        """Return the literary devices whose pattern matches *text*.

        Each device name is reported once, in the order its first matching
        pattern appears in ``_DEVICE_PATTERNS``.
        """
        found: Dict[str, Dict[str, str]] = {}
        for pattern, device_name, explanation in _DEVICE_PATTERNS:
            if device_name in found:
                continue
            if re.search(pattern, text, re.IGNORECASE):
                found[device_name] = {
                    "device": device_name,
                    "explanation": explanation,
                }
        return list(found.values())