File size: 22,507 Bytes
162cb6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
"""
simplification_service.py
=========================
Context-aware text simplification for the Highlight-to-Understand feature.

When a student highlights text, this service:
1. Simplifies without destroying literary voice
2. Explains the author's intent
3. Provides vocabulary support with relatable analogies
4. Adjusts explanation level based on student reading profile
5. Detects literary devices (metaphor, irony, foreshadowing, etc.)

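Tiered architecture (each tier falls through to the next on failure):
  Tier 1:   Gemini API (best quality)
  Tier 1.5: Hugging Face Inference (only when USE_HF_INFERENCE=1 and an API token is set)
  Tier 2:   Rule-based (always available)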
"""

from __future__ import annotations

import re
from typing import Any, Dict, List, Optional

from services.gemini_service import GeminiService
import os
from services.hf_inference_service import HFInferenceService
from services.pronunciation_service import PronunciationService

# Module-level service singletons, created once when this module is imported
# and referenced by SimplificationService.simplify().
# HF_API_TOKEN is read from the environment; os.getenv() yields None when the
# variable is unset.
_hf_inference = HFInferenceService(os.getenv("HF_API_TOKEN"))
# Supplies the per-word phonics breakdown for single-word highlights.
_pronunciation = PronunciationService()


# ── Literary device detection ─────────────────────────────────────────────────

# ── Kinyarwanda cultural bridge ──────────────────────────────────────────────
# Maps Western literary concepts → Rwandan cultural parallels
# Helps students from Rwanda connect unfamiliar ideas to lived experience.

# Concept keyword (lowercase English) -> one-sentence Rwandan cultural analogy
# shown to the student.
# Insertion order is significant: the lookup helper walks this dict in order
# and uses the first concept it finds in the highlighted text.
# The em dashes below were previously stored as mis-decoded bytes ("β€”"),
# which would have been displayed verbatim to students.
_KINYARWANDA_BRIDGE: Dict[str, str] = {
    # Honour & family
    "honour":        "Like *icyubahiro* (respect) in Rwandan culture — something earned through how you treat others.",
    "honor":         "Like *icyubahiro* (respect) in Rwandan culture — something earned through how you treat others.",
    "noble":         "Think of the *umwami* (king) and the respect shown to community elders.",
    "loyalty":       "Like the loyalty of *inshuti* (close friends) who stand with you no matter what.",
    "clan":          "Similar to *umuryango* — your extended family who share your identity and history.",
    "tribe":         "Like *ubwoko* — the group you belong to that shapes your community ties.",
    # Fate & nature
    "destiny":       "Like *inzira* (the path) — Rwandans believe each person walks a path shaped by God (*Imana*).",
    "fate":          "Similar to the Rwandan idea that *Imana* (God) guides every life's course.",
    "harvest":       "Like the *isarura* — the season when families come together to gather crops from the hills.",
    "drought":       "Like the *inzara* (hunger seasons) that families in rural Rwanda still endure.",
    # Power & conflict
    "tyranny":       "Like oppressive rule — Rwanda's history of *ubukuru bubi* (bad leadership) makes this very real.",
    "exile":         "Like those forced to leave home — many Rwandans know exile from 1959 and 1994.",
    "war":           "Like *intambara* — Rwanda's history makes the weight of war deeply understood.",
    "betrayal":      "Like breaking *amasezerano* (a sworn agreement) — considered deeply shameful.",
    # Social life
    "feast":         "Like an *umuganda* celebration or wedding feast — a gathering of community with food and song.",
    "market":        "Like the *amasoko* — where community members exchange goods and news.",
    "elder":         "Like *umukuru* — the respected voice of wisdom in every Rwandan village.",
    "ancestors":     "Like *imigabane y'ababyeyi* — the legacy and spirit of those who came before.",
    # Love & relationships
    "courtship":     "Like traditional *gusaba* — the formal process of asking a family for their daughter's hand.",
    "jealousy":      "Like *ishyari* — the feeling of wanting what another person has.",
    # Abstract concepts
    "ambition":      "Like *ubushake* (strong desire) — the drive to achieve that many Rwandan students carry.",
    "grief":         "Like *agahinda* — the deep sorrow felt when loss touches a family.",
    "justice":       "Like *ubutabera* and the *gacaca* courts — where truth and community healing meet.",
    "identity":      "Like *indangamuntu* (identity card) — but deeper, who you are inside, your values and story.",
    # Education & learning
    "school":        "Like *ishuri* — a place of growth and opportunity that many Rwandan families sacrifice for.",
    "teacher":       "Like *umwarimu* — a deeply respected figure who shapes the next generation.",
    "student":       "Like *umunyeshuri* — one who works hard today for a better tomorrow.",
    "exam":          "Like *ikizamini* — a test of what you have learned, taken seriously in Rwanda's education system.",
    "knowledge":     "Like *ubumenyi* — the wisdom passed down from elders and gained through study.",
    "learning":      "Like *kwiga* — the lifelong Rwandan value of seeking understanding and growth.",
    # Nature & environment
    "river":         "Like the *Nyabarongo* — Rwanda's largest river, central to life, farming, and history.",
    "mountain":      "Like the *ibirunga* (volcanoes) of the north — awe-inspiring landmarks that shape the land.",
    "forest":        "Like *ishyamba* — forests that many Rwandan families depend on for firewood and food.",
    "rain":          "Like *imvura* — in Rwanda, rain is a blessing that brings green hills to life.",
    "sun":           "Like *izuba* — the sun that rises over a thousand hills every morning.",
    "hill":          "Like the *inkike* (hills) that define Rwanda's famous landscape — *pays des mille collines*.",
    # Community & belonging
    "community":     "Like *umuryango w'abaturage* — the village community that raises every child together.",
    "refugee":       "Like *impunzi* — a word with deep meaning in Rwanda's history of displacement and return.",
    "home":          "Like *urugo* — the family homestead, a sacred place of belonging.",
    "neighbour":     "Like *umuturanyi* — whom Rwandans greet every morning by name.",
    "stranger":      "Like *umunyamahanga* — in Rwandan culture, even strangers deserve hospitality (*ubushyingiro*).",
    # Modern concepts
    "government":    "Like *leta* — the state that many Rwandans have seen transform dramatically since 1994.",
    "democracy":     "Like *demokarasi* — a concept Rwanda works towards through its *Gacaca* and local governance.",
    "city":          "Like *umujyi wa Kigali* — a rapidly growing city that many students dream of reaching one day.",
    "technology":    "Like Rwanda's *Smart Rwanda* vision — a country that has embraced mobile phones and internet.",
    "phone":         "Like the mobile phones that connect *abaturage* (citizens) across Rwanda's hills.",
    # More literary themes
    "power":         "Like *ingufu* — in Rwanda, the abuse of power has deep historical resonance.",
    "freedom":       "Like *ubwigenge* — independence, celebrated every 1st of July in Rwanda.",
    "hope":          "Like *ibyiringiro* — the hope that Rwanda's *Agaciro* (dignity, self-worth) is built upon.",
    "shame":         "Like *isoni* — a powerful feeling in Rwandan culture, tied to community reputation.",
    "pride":         "Like *ikuzo* — the pride Rwandans feel in their culture, resilience, and progress.",
    "courage":       "Like *ubutwari* — praised in Rwandan history as the quality of great leaders and liberators.",
    "loneliness":    "Like *ubusa* — an unusual feeling in a culture where communal ties are deep.",
    "forgiveness":   "Like *imbabazi* — central to Rwanda's post-genocide reconciliation process.",
    "revenge":       "Like *impyisi* (hyena) in Rwandan fables — seeking revenge but ultimately paying the price.",
    "corruption":    "Like *akarengane* — a betrayal of *agaciro* (dignity) that Rwanda actively fights.",
    "sacrifice":     "Like *gutanga* — giving something precious for others; a virtue celebrated in Rwandan songs.",
}



def _get_kinyarwanda_bridge(text: str) -> Optional[str]:
    """Return the first Kinyarwanda cultural analogy whose concept occurs in *text*.

    Concepts are tried in the insertion order of ``_KINYARWANDA_BRIDGE`` and
    matching is case-insensitive. A concept only matches as a whole word
    (optionally followed by a plain ``s``/``es`` plural), so "war" is no
    longer found inside "toward" and "exam" is no longer found inside
    "example".

    Args:
        text: The highlighted passage.

    Returns:
        The analogy sentence for the first matching concept, or ``None`` when
        *text* is empty or mentions no known concept.
    """
    if not text:
        return None

    lowered = text.lower()
    for concept, bridge in _KINYARWANDA_BRIDGE.items():
        # \b on both sides rejects matches buried inside a longer word.
        if re.search(rf"\b{re.escape(concept)}(?:s|es)?\b", lowered):
            return bridge
    return None


# (regex, device name, student-facing explanation) triples.
# The detector applies every pattern with re.IGNORECASE, so a pattern that
# depends on letter case must switch the flag off locally with (?-i:...).
_DEVICE_PATTERNS = [
    (r"\blike\s+(?:a|an)\s+\w+", "simile", "a comparison using 'like' or 'as'"),
    (r"\bas\s+\w+\s+as\b", "simile", "a comparison using 'like' or 'as'"),
    (r"\bnot\s+\w+\s+but\s+\w+", "antithesis", "contrasting ideas placed side by side"),
    # Letters only: the old "(.)" also fired on "...", "!!!" and runs of spaces.
    (r"([^\W\d_])\1{2,}", "repetition", "a word or sound repeated for emphasis"),
    # Case-sensitive on purpose: vocative "O Romeo" / "Oh Death", not "oh no".
    (r"(?-i:\b(?:O|Oh)\s+[A-Z])", "apostrophe", "addressing someone absent or an abstract idea"),
]

# Archaic / Early Modern English word (lowercase) -> plain modern equivalent.
# Used for both the rule-based rewrite and the rule-based vocabulary list.
# NOTE(review): lookups are context-free, so the modern noun "art" is also
# rewritten to "are" — confirm whether that entry should stay.
_ARCHAIC_GLOSSARY: Dict[str, str] = {
    "thou": "you",
    "thee": "you",
    "thy": "your",
    "thine": "yours",
    "hath": "has",
    "doth": "does",
    "art": "are",
    "wilt": "will",
    "shalt": "shall",
    "ere": "before",  # was listed twice; the duplicate key has been dropped
    "hence": "from here / therefore",
    "hither": "to here",
    "thither": "to there",
    "wherefore": "why",
    "forsooth": "truly",
    "prithee": "please",
    "methinks": "I think",
    "anon": "soon",
    "perchance": "perhaps",
    "nay": "no",
    "yea": "yes",
    "betwixt": "between",
    "whence": "from where",
    "oft": "often",
    "alas": "unfortunately",
}


class SimplificationService:
    """
    Context-aware simplification engine for highlighted text.
    """

    def __init__(self) -> None:
        """Create the service and the Gemini client used by the primary tier."""
        # simplify() asks this client whether it is available before using it.
        self.gemini = GeminiService()

    def simplify(
        self,
        highlighted_text: str,
        book_title: str = "",
        author: str = "",
        doc_type: str = "generic",
        chapter_context: str = "",
        speaker: str = "",
        reading_level: str = "intermediate",
        language: str = "en",
    ) -> Dict[str, Any]:
        """
        Simplify highlighted text with full context awareness.

        Tiers are tried in order and the first usable result is returned:
        Gemini, then HF Inference (only when ``USE_HF_INFERENCE=1`` and an
        API token is set), then the rule-based fallback. An exception inside
        a tier is printed and the next tier is tried.

        Args:
            highlighted_text: The passage (or single word) the student selected.
            book_title: Title of the source work, if known.
            author: Author of the source work, if known.
            doc_type: Kind of document, e.g. "play" or "generic".
            chapter_context: Surrounding text passed to the LLM tiers.
            speaker: Character speaking the passage, if any.
            reading_level: "beginner", "intermediate" or "advanced".
            language: "fr" selects the French Gemini prompt; anything else
                selects the English one.

        Returns:
            For the Gemini and rule-based tiers:
            {
                "simple_version": str,
                "author_intent": str,
                "vocabulary": [{word, meaning, analogy, category}],
                "literary_devices": [{device, explanation}],
                "cultural_context": str | None,
                "kinyarwanda_bridge": str | None,
                "phonics": Dict[str, Any] | None,
                "tier": "gemini" | "rule_based",
            }
            For the HF tier the dict is whatever the HF service returned,
            with "tier" set to "hf_inference" and "literary_devices",
            "phonics" and (if missing) "kinyarwanda_bridge" filled in locally.
        """
        phonics_data = self._single_word_phonics(highlighted_text)

        # Tier 1: Gemini Cloud LLM
        if self.gemini.is_available():
            try:
                result = self._simplify_gemini(
                    highlighted_text, book_title, author, doc_type,
                    chapter_context, speaker, reading_level, language,
                )
                if result and result.get("simple_version"):
                    result["tier"] = "gemini"
                    # NOTE(review): the simplification prompt already asks for a
                    # "vocabulary" list, which this overwrites with the result of
                    # a second Gemini call — confirm the extra call is intended.
                    result["vocabulary"] = self._merge_vocabulary(
                        self._extract_vocabulary_gemini(highlighted_text, reading_level),
                        self._extract_vocabulary(highlighted_text),
                    )
                    result["literary_devices"] = self._detect_devices(highlighted_text)
                    if not result.get("kinyarwanda_bridge"):
                        result["kinyarwanda_bridge"] = _get_kinyarwanda_bridge(highlighted_text)

                    # Prefer Gemini's syllable guide; otherwise keep the local one.
                    gemini_phonics = self._phonics_from_guide(
                        highlighted_text, result.get("phonics_guide")
                    )
                    result["phonics"] = gemini_phonics if gemini_phonics else phonics_data
                    return result
            except Exception as e:
                print(f"Gemini simplification failed: {e}")

        # Tier 1.5: HF Inference (Serverless)
        if os.getenv("USE_HF_INFERENCE") == "1" and _hf_inference.api_token:
            try:
                result = _hf_inference.simplify_text(
                    highlighted_text, book_title, author, doc_type,
                    chapter_context, speaker, reading_level, language
                )
                if result and result.get("simple_version"):
                    result["tier"] = "hf_inference"
                    # Add Kinyarwanda bridge if missing
                    if not result.get("kinyarwanda_bridge"):
                        result["kinyarwanda_bridge"] = _get_kinyarwanda_bridge(highlighted_text)
                    # Literary devices always come from the local detector.
                    result["literary_devices"] = self._detect_devices(highlighted_text)
                    result["phonics"] = phonics_data
                    return result
            except Exception as e:
                print(f"HF Inference simplification failed: {e}")

        # Fallback: Rule-based (always works)
        return self._simplify_rule_based(
            highlighted_text, book_title, author, doc_type,
            speaker, reading_level, language, phonics_data,
        )

    @staticmethod
    def _single_word_phonics(highlighted_text: str) -> Optional[Dict[str, Any]]:
        """Return the local phonics breakdown when the highlight is one word, else None."""
        clean_text = highlighted_text.strip(".,!?;:\"' ").lower()
        # split() treats tabs and newlines as separators too, not just " ".
        if len(clean_text) <= 1 or len(clean_text.split()) != 1:
            return None
        try:
            # NOTE(review): the raw highlight (punctuation included) is passed
            # on, as before — confirm the pronunciation service expects that.
            return _pronunciation.get_phonics_breakdown(highlighted_text)
        except Exception as e:
            # Phonics is an optional extra; it must not stop simplification.
            print(f"Phonics breakdown failed: {e}")
            return None

    @staticmethod
    def _merge_vocabulary(
        llm_vocab: List[Dict[str, Any]],
        rule_vocab: List[Dict[str, Any]],
    ) -> List[Dict[str, Any]]:
        """Merge vocabulary lists, LLM entries first, one entry per word (case-insensitive)."""
        merged: List[Dict[str, Any]] = []
        seen_words = set()
        for entry in [*llm_vocab, *rule_vocab]:
            # LLM output is untrusted: skip anything without a usable "word".
            if not isinstance(entry, dict):
                continue
            word = entry.get("word")
            if not isinstance(word, str) or not word.strip():
                continue
            key = word.lower()
            if key in seen_words:
                continue
            seen_words.add(key)
            merged.append(entry)
        return merged

    @staticmethod
    def _phonics_from_guide(word: str, guide: Any) -> Optional[Dict[str, Any]]:
        """Build the phonics payload from Gemini's ``phonics_guide``, or None if unusable."""
        if not isinstance(guide, dict):
            return None
        syllables = guide.get("syllables")
        # Only a non-empty list of strings can be joined for display.
        if not isinstance(syllables, list) or not syllables:
            return None
        if not all(isinstance(part, str) for part in syllables):
            return None
        return {
            "word": word,
            "syllables": syllables,
            "phonics": syllables,
            "display": " · ".join(syllables),
            "pronunciation": guide.get("pronunciation"),
        }

    def _simplify_gemini(
        self,
        text: str,
        book_title: str,
        author: str,
        doc_type: str,
        chapter_context: str,
        speaker: str,
        reading_level: str,
        language: str,
    ) -> Dict[str, Any]:
        """Use Gemini for high-performance simplification."""
        level_desc = {
            "beginner": "a 9-year-old Primary 4 (P4) student",
            "intermediate": "a 10-year-old Primary 5 (P5) student",
            "advanced": "an 11-12 year old Primary 6 (P6) student",
        }.get(reading_level, "a 10-year-old Primary 5 student")

        speaker_ctx = f' spoken by the character "{speaker}"' if speaker else ""
        book_ctx = f' from "{book_title}" by {author}' if book_title else ""

        system_instruction = f"You are an expert pedagogical AI specializing in simplifying literature for {level_desc}. You focus on clarity, cultural relevance (Rwanda), and preserving the author's emotional resonance. You provide extremely simple, visual explanations for difficult words, often using metaphors a child can understand."

        if language == "fr":
            prompt = f"""Analyse ce passage{book_ctx}{speaker_ctx}:

"{text}"

Contexte du chapitre: {chapter_context[:500] if chapter_context else 'Non fourni'}

RΓ©ponds en JSON avec exactement ces clΓ©s:
{{
  "simple_version": "Version simplifiΓ©e (2-3 phrases)",
  "author_intent": "Pourquoi l'auteur a Γ©crit cela ainsi",
  "cultural_context": "Contexte culturel si pertinent, sinon null",
  "kinyarwanda_bridge": "Analogie culturelle rwandaise β€” compare avec un concept rwandais (imigabane, umuganda, icyubahiro, etc.)",
  "vocabulary": [
    {{"word": "string", "meaning": "dΓ©finition simple pour un enfant", "analogy": "comparaison simple", "category": "ex: ArchaΓ―que, Vocabulaire, MΓ©taphore"}}
  ],
  "phonics_guide": {{
    "syllables": ["syl", "la", "be"],
    "pronunciation": "pro-non-cia-cion"
  }}
}} (phonics_guide uniquement pour un seul mot, sinon null)"""
        else:
            prompt = f"""Analyze this passage{book_ctx}{speaker_ctx}:

"{text}"

Chapter context: {chapter_context[:500] if chapter_context else 'Not provided'}

Respond in JSON with exactly these keys:
{{
  "simple_version": "Simplified version (2-3 sentences)",
  "author_intent": "Why the author wrote it this way",
  "cultural_context": "Cultural context if relevant, otherwise null",
  "kinyarwanda_bridge": "Rwanda cultural connection β€” compare to umuganda, gacaca, icyubahiro, or another Rwandan concept",
  "vocabulary": [
    {{"word": "string", "meaning": "simple child-friendly definition", "analogy": "simple comparison for a 10-year-old", "category": "e.g. Archaic, Vocabulary, Idiom"}}
  ],
  "phonics_guide": {{
    "syllables": ["syl", "la", "bles"],
    "pronunciation": "pro-nun-see-ay-shun"
  }}
}} (phonics_guide is only for single words, otherwise null)"""

        result = self.gemini.generate_json(prompt, system_instruction)
        return result if isinstance(result, dict) else {}

    def _simplify_rule_based(
        self,
        text: str,
        book_title: str,
        author: str,
        doc_type: str,
        speaker: str,
        reading_level: str,
        language: str,
        phonics_data: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Rule-based fallback simplification."""
        # Replace archaic words
        simplified_words = text.split()
        replaced = []
        for word in simplified_words:
            clean = word.lower().strip(".,!?;:'\"()-")
            if clean in _ARCHAIC_GLOSSARY:
                replacement = _ARCHAIC_GLOSSARY[clean]
                # Preserve punctuation
                trailing = ""
                if word and word[-1] in ".,!?;:'\"()-":
                    trailing = word[-1]
                replaced.append(f"{replacement}{trailing}")
            else:
                replaced.append(word)
        simple_version = " ".join(replaced)

        # Break long sentences
        sentences = re.split(r"(?<=[.!?])\s+", simple_version)
        shortened = []
        for sent in sentences:
            if len(sent.split()) > 20:
                if ", and " in sent:
                    parts = sent.split(", and ", 1)
                    shortened.append(parts[0] + ".")
                    shortened.append(parts[1].strip().capitalize())
                elif "; " in sent:
                    parts = sent.split("; ", 1)
                    shortened.append(parts[0] + ".")
                    shortened.append(parts[1].strip().capitalize())
                else:
                    shortened.append(sent)
            else:
                shortened.append(sent)
        simple_version = " ".join(shortened)

        # Generate basic author intent
        speaker_note = f" {speaker} is saying" if speaker else " The author is saying"
        if doc_type == "play":
            intent = f"In this {doc_type},{speaker_note} something important about the situation or characters."
        else:
            intent = f"The author uses these words to create a vivid picture and advance the story."

        if book_title:
            intent += f' This is from "{book_title}"'
            if author:
                intent += f" by {author}"
            intent += "."

        # Add Kinyarwanda cultural bridge if applicable
        kinyarwanda_bridge = _get_kinyarwanda_bridge(text)

        return {
            "simple_version": simple_version,
            "author_intent": intent,
            "vocabulary": self._extract_vocabulary(text),
            "literary_devices": self._detect_devices(text),
            "cultural_context": None,
            "kinyarwanda_bridge": kinyarwanda_bridge,
            "phonics": phonics_data,
            "tier": "rule_based",
        }

    def _extract_vocabulary_gemini(self, text: str, reading_level: str) -> List[Dict[str, str]]:
        """Extract difficult words and generate child-friendly definitions using Gemini."""
        if not self.gemini.is_available():
            return self._extract_vocabulary(text)

        level_desc = {
            "beginner": "Primary 4 (9 years old)",
            "intermediate": "Primary 5 (10 years old)",
            "advanced": "Primary 6 (11-12 years old)",
        }.get(reading_level, "Primary 5 student")

        prompt = f"""Identify 3-5 difficult or important words from this text and provide child-friendly definitions:
"{text}"

The definitions should be suitable for a {level_desc}.
Respond in JSON as a list of objects with "word", "meaning", "analogy" (a simple comparison), and "category" (e.g. Vocabulary, Archaic, Figurative).
Example: [{{"word": "astounded", "meaning": "very surprised", "analogy": "like when you see a magic trick", "category": "Vocabulary"}}]
"""
        result = self.gemini.generate_json(prompt)
        return result if isinstance(result, list) else self._extract_vocabulary(text)

    def _extract_vocabulary(self, text: str) -> List[Dict[str, str]]:
        """Extract difficult words with definitions."""
        vocab = []
        words = re.findall(r"[a-zA-Z']+", text)

        for word in words:
            clean = word.lower().strip("'")
            if clean in _ARCHAIC_GLOSSARY:
                vocab.append({
                    "word": word,
                    "meaning": _ARCHAIC_GLOSSARY[clean],
                    "analogy": f'Think of it as a fancy old way of saying "{_ARCHAIC_GLOSSARY[clean]}".',
                    "type": "archaic",
                    "category": "Archaic",
                })

        # Deduplicate
        seen = set()
        unique = []
        for v in vocab:
            if v["word"].lower() not in seen:
                seen.add(v["word"].lower())
                unique.append(v)

        return unique

    def _detect_devices(self, text: str) -> List[Dict[str, str]]:
        """Detect literary devices in the text."""
        devices = []
        for pattern, device_name, explanation in _DEVICE_PATTERNS:
            if re.search(pattern, text, re.IGNORECASE):
                devices.append({
                    "device": device_name,
                    "explanation": explanation,
                })

        # Deduplicate by device name
        seen = set()
        unique = []
        for d in devices:
            if d["device"] not in seen:
                seen.add(d["device"])
                unique.append(d)

        return unique