# Source: Hugging Face Space "Akshay30/decipherai-api" (Space status when captured: Sleeping)
# File size: 33,814 Bytes

import pytesseract
import re
import os
import cv2
import numpy as np
import torch
from PIL import Image
from .base_processor import BaseScriptProcessor
from utils.text_utils import is_gibberish

# Absolute path of the "models" directory located one level above this module's directory.
BACKEND_MODELS_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "models"))
# Sub-directory handed to `from_pretrained(cache_dir=...)` when the Greek TrOCR checkpoint is loaded.
GREEK_TROCR_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "greek_trocr")

class GreekProcessor(BaseScriptProcessor):
    def __init__(self, groq_client, references, clip_classifier):
        """Initialise the Greek processor.

        Forwards the three collaborators to the base class, probes Tesseract
        for the 'grc' language pack, creates empty TrOCR placeholders (the
        model itself is loaded later by ``setup_greek_trocr``), picks the torch
        device, registers this instance under the name "greek" with
        ``utils.gpu_diagnostics`` and zeroes the glossary metrics.
        """
        super().__init__(groq_client, references, clip_classifier)
        self.clip_classifier = clip_classifier
        self.setup_ancient_greek_ocr()

        # TrOCR is loaded on demand; until then everything is unset.
        self.trocr_model = self.trocr_processor = None
        self.trocr_available = False
        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)

        # Register for dynamic VRAM management
        from utils.gpu_diagnostics import register_processor
        register_processor("greek", self)

        # Counters describing how Greek glossary generation has fared so far
        self.glossary_success_count = 0
        self.glossary_json_failure_count = 0
        self.regex_recovery_count = 0

    def setup_greek_trocr(self):
        """Load the Greek TrOCR processor and model onto ``self.device``.

        Steps: ask the VRAM manager to reclaim memory for "greek", load the
        ``rithwikn/trocr_greek_combined`` processor and model (cached under
        ``GREEK_TROCR_MODEL_DIR``, using the ``HF_TOKEN`` environment variable
        as the token), move the model to ``self.device`` and switch it to
        evaluation mode.

        The loaded objects are only published on ``self`` once every step has
        succeeded. On any failure the error is printed and
        ``trocr_model``/``trocr_processor`` are reset to ``None`` with
        ``trocr_available = False``, so a partially loaded model is never left
        behind and a later call can retry from scratch.
        """
        model_id = 'rithwikn/trocr_greek_combined'
        try:
            from utils.gpu_diagnostics import reclaim_vram_for
            reclaim_vram_for("greek")

            print("[INFO] Lazily loading TrOCR model for ancient Greek...")
            from transformers import TrOCRProcessor, VisionEncoderDecoderModel

            # Shared arguments for both from_pretrained calls.
            load_kwargs = {
                "cache_dir": GREEK_TROCR_MODEL_DIR,
                "local_files_only": False,
                "token": os.getenv("HF_TOKEN"),
            }
            processor = TrOCRProcessor.from_pretrained(model_id, **load_kwargs)
            model = VisionEncoderDecoderModel.from_pretrained(model_id, **load_kwargs)

            model.to(self.device)
            model.eval()  # Put in evaluation mode

            from utils.gpu_diagnostics import log_model_device
            log_model_device("Greek TrOCR", self.device)

            # Publish only after every step above succeeded.
            self.trocr_processor = processor
            self.trocr_model = model
            self.trocr_available = True
            print(f"[INFO] Ancient Greek TrOCR loaded successfully on {self.device}")

        except Exception as e:
            print(f"[ERROR] Ancient Greek TrOCR failed to load: {e}")
            # Keep the three TrOCR attributes consistent with each other.
            self.trocr_model = None
            self.trocr_processor = None
            self.trocr_available = False
    
    def setup_ancient_greek_ocr(self):
        """Probe Tesseract for the 'grc' pack and store the answer in ``self.grc_available``.

        Any error raised while querying Tesseract is printed and treated as
        "not available".
        """
        try:
            has_grc = "grc" in pytesseract.get_languages(config='')
            self.grc_available = has_grc
            if not has_grc:
                print("[WARN] Ancient Greek Tesseract language pack 'grc' is NOT available")
            else:
                print("[INFO] Ancient Greek Tesseract language pack 'grc' is available")
        except Exception as e:
            print(f"[ERROR] Failed to check Tesseract languages: {e}")
            self.grc_available = False
    def detect_script(self, image_path):
        """Simplified detection - Groq Vision handles main classification"""
        try:
            if not getattr(self, 'trocr_available', False):
                # Check if Ancient Greek OCR is available as fallback
                if not getattr(self, 'grc_available', False):
                    print("[INFO] Greek processor not available (neither TrOCR nor Tesseract)")
                    return False, 0.5
            
            # If called by Groq Vision classification, accept with high confidence
            print("[INFO] Greek processor activated by Groq Vision (Llama-4-Scout)")
            return True, 0.95
            
        except Exception as e:
            print(f"[ERROR] Greek detection failed: {e}")
            return False, 0.0

    
    def _quick_greek_ocr_test(self, image_path):
        """Run a cheap OCR pass on the centre of the image to look for Greek.

        The middle half of the image (a quarter trimmed from every side) is
        OCRed with the modern Greek ('ell') Tesseract pack.

        Returns:
            True when at least 3 Greek characters are recognised; False
            otherwise, including when opening the file or OCR raises.
        """
        try:
            # The context manager releases the file handle on every exit path.
            with Image.open(image_path) as image:
                w, h = image.size
                crop_box = (w // 4, h // 4, 3 * w // 4, 3 * h // 4)
                test_crop = image.crop(crop_box)

                # Test with standard Greek OCR
                test_text = pytesseract.image_to_string(test_crop, lang="ell")

            # A handful of Greek characters is taken as evidence of Greek text.
            return self._count_greek_chars(test_text or "") >= 3

        except Exception:
            return False
    
    def extract_text(self, image_path):
        """Extract Greek text from an image, trying several engines in order.

        Order of attempts, each accepted only if ``_validate_greek_text``
        approves the result:
          1. TrOCR (lazily loaded / re-activated on ``self.device``),
          2. Tesseract with the ancient Greek 'grc' pack, if available,
          3. Tesseract with the modern Greek 'ell' pack,
          4. layout-aware line-by-line Tesseract OCR.

        Returns:
            The first validated text, or "" when nothing validates or any
            step raises.
        """
        try:
            # Opened via a context manager: when TrOCR succeeds the PIL image
            # is never decoded, so without this the file handle would stay
            # open until garbage collection.
            with Image.open(image_path) as image:
                # Ensure the Greek TrOCR model is loaded dynamically
                if self.trocr_model is None:
                    self.setup_greek_trocr()
                else:
                    from utils.gpu_diagnostics import reclaim_vram_for
                    reclaim_vram_for("greek")
                    # Move the model to the target device only if it is elsewhere.
                    if str(next(self.trocr_model.parameters()).device) != str(self.device):
                        print(f"[VRAM MANAGER] Activating Greek TrOCR model on {self.device}...")
                        self.trocr_model.to(self.device)

                # Method 1: Ancient Greek TrOCR (if available)
                if getattr(self, 'trocr_available', False) and self.trocr_model is not None:
                    print("[INFO] Attempting Ancient Greek extraction with TrOCR...")
                    trocr_text = self._extract_with_trocr(image_path)
                    if trocr_text and self._validate_greek_text(trocr_text):
                        print("[INFO] Using Ancient Greek TrOCR result")
                        return trocr_text
                    print("[WARN] TrOCR extraction returned poor quality result, trying Tesseract fallback...")

                # Method 2: Ancient Greek OCR (if available and safe)
                if getattr(self, 'grc_available', False):
                    ancient_greek_text = self._extract_with_ancient_greek_ocr(image)
                    if ancient_greek_text and self._validate_greek_text(ancient_greek_text):
                        print("[INFO] Using Ancient Greek OCR result")
                        return ancient_greek_text

                # Method 3: Standard Greek OCR
                standard_greek_text = self._extract_with_standard_greek_ocr(image)
                if standard_greek_text and self._validate_greek_text(standard_greek_text):
                    print("[INFO] Using standard Greek OCR result")
                    return standard_greek_text

                # Method 4: Layout-aware line segment fallback
                print("[INFO] Trying layout-aware Greek segmentation fallback...")
                layout_aware_greek_text = self._extract_layout_aware_ocr(image_path)
                if layout_aware_greek_text and self._validate_greek_text(layout_aware_greek_text):
                    print("[INFO] Using layout-aware Greek OCR result")
                    return layout_aware_greek_text

                # Nothing produced acceptable Greek text.
                print("[INFO] No valid Greek text detected")
                return ""

        except Exception as e:
            print(f"[ERROR] Greek text extraction failed: {e}")
            return ""

    def _extract_with_trocr(self, image_path):
        """Run the Greek TrOCR model line by line and join the decoded lines.

        The model is loaded if it has not been yet; otherwise VRAM is
        reclaimed for "greek" and the model is moved to ``self.device`` when
        it currently lives on another device. The page is segmented with
        ``self.layout_parser``; if no line crops come back the whole image is
        processed as a single crop.

        Returns:
            Non-empty decoded lines joined with newlines, or "" when TrOCR is
            unavailable or inference raises.
        """
        if self.trocr_model is None:
            self.setup_greek_trocr()
        else:
            from utils.gpu_diagnostics import reclaim_vram_for
            reclaim_vram_for("greek")
            if str(next(self.trocr_model.parameters()).device) != str(self.device):
                print(f"[VRAM MANAGER] Activating Greek TrOCR model on {self.device}...")
                self.trocr_model.to(self.device)

        if not getattr(self, 'trocr_available', False) or self.trocr_model is None:
            return ""

        try:
            print("[INFO] Segmenting layout for Greek TrOCR...")
            layout = self.layout_parser.analyze_layout(image_path)
            crops = self.layout_parser.crop_lines(image_path, layout)

            # Fallback to whole image if no crops detected
            if not crops:
                print("[WARN] No line crops found, processing full image with TrOCR")
                # convert() yields an independent in-memory image, so the file
                # handle can be closed right away instead of being leaked.
                with Image.open(image_path) as full_page:
                    crops = [full_page.convert("RGB")]

            line_texts = []
            print(f"[INFO] Running Ancient Greek TrOCR inference on {len(crops)} crops...")
            for crop in crops:
                # Ensure RGB mode for TrOCR
                crop_rgb = crop.convert("RGB")

                pixel_values = self.trocr_processor(
                    images=crop_rgb,
                    return_tensors="pt"
                ).pixel_values.to(self.device)

                # Inference only: no autograd bookkeeping needed.
                with torch.inference_mode():
                    generated_ids = self.trocr_model.generate(
                        pixel_values,
                        max_length=256,
                        num_beams=4,
                        early_stopping=True,
                        repetition_penalty=1.2
                    )

                text = self.trocr_processor.batch_decode(
                    generated_ids,
                    skip_special_tokens=True
                )[0].strip()

                # Blank lines carry no information; skip them.
                if text:
                    line_texts.append(text)

            full_text = "\n".join(line_texts)
            print(f"[SUCCESS] TrOCR extracted {len(line_texts)} lines from Greek image")
            return full_text

        except Exception as e:
            print(f"[ERROR] Greek TrOCR extraction failed: {e}")
            return ""

    
    def _extract_with_ancient_greek_ocr(self, image):
        """Extract using specialized Ancient Greek OCR"""
        try:
            if not getattr(self, 'grc_available', False):
                return ""
            
            # Use ancient Greek language code 'grc' with optimized settings
            config = "--psm 6 --oem 1 -c preserve_interword_spaces=1"
            
            # Try ancient Greek language pack
            text = pytesseract.image_to_string(
                image, 
                lang="grc",  # Ancient Greek language code
                config=config
            )
            return text.strip()
            
        except Exception as e:
            print(f"[WARN] Ancient Greek OCR failed: {e}")
            return ""

    def _extract_layout_aware_ocr(self, image_path):
        """OCR the page one line crop at a time, preserving the crop order.

        ``self.layout_parser`` supplies the line crops. Each crop is converted
        to grayscale, contrast-enhanced with CLAHE and OCRed as a single text
        line (``--psm 7``). The 'grc' pack is tried first when available; the
        'ell' pack is used when 'grc' is unavailable or yields nothing.

        Returns:
            Non-empty line texts joined with newlines, or "" when no crops are
            found or anything raises.
        """
        try:
            print("[INFO] Running layout-aware line segmentation for Greek...")
            layout = self.layout_parser.analyze_layout(image_path)
            crops = self.layout_parser.crop_lines(image_path, layout)

            if not crops:
                print("[WARN] Layout parser returned no line crops for Greek")
                return ""

            print(f"[INFO] Layout-aware Greek line parser cropped {len(crops)} lines")

            # Ancient Greek first (when installed), modern Greek as fallback.
            if getattr(self, 'grc_available', False):
                languages = ('grc', 'ell')
            else:
                languages = ('ell',)

            # Loop-invariant objects are built once instead of per crop.
            clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(4, 4))
            config = '--oem 3 --psm 7'

            line_texts = []
            for crop in crops:
                # Force 3-channel RGB so grayscale/RGBA/palette crops do not
                # make the colour conversion raise and discard the whole page.
                rgb = np.array(crop.convert("RGB"))
                gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
                crop_pil = Image.fromarray(clahe.apply(gray))

                text = ""
                for lang in languages:
                    text = pytesseract.image_to_string(
                        crop_pil,
                        lang=lang,
                        config=config
                    ).strip()
                    if text:
                        break

                if text:
                    line_texts.append(text)

            return "\n".join(line_texts)
        except Exception as e:
            print(f"[WARN] Layout aware Greek OCR failed: {e}")
            return ""

    
    def _extract_with_standard_greek_ocr(self, image):
        """OCR ``image`` with the modern Greek ('ell') pack under several page-segmentation modes.

        Modes are tried in the order 6 (uniform block), 4 (single column),
        3 (automatic) and 8 (single word). The first output that passes
        ``_validate_greek_text`` is returned stripped; a mode that raises is
        skipped. Returns "" when no mode produces valid text.
        """
        psm_modes = ("6", "4", "3", "8")
        try:
            for psm in psm_modes:
                try:
                    candidate = pytesseract.image_to_string(
                        image,
                        lang="ell",  # Modern Greek
                        config=f"--psm {psm} --oem 1",
                    )
                    if candidate and self._validate_greek_text(candidate):
                        return candidate.strip()
                except Exception:
                    # This mode failed; move on to the next one.
                    continue
            return ""
        except Exception as e:
            print(f"[WARN] Standard Greek OCR failed: {e}")
            return ""
    
    def _extract_with_preprocessing(self, image):
        """Fallback OCR that retries on several preprocessed versions of the image.

        The image is reduced to grayscale and OCRed with the modern Greek
        ('ell') pack in four variants: plain grayscale, 3x3 Gaussian blur,
        3x3 median blur and Otsu binarisation. The first output accepted by
        ``_validate_greek_text`` is returned stripped.

        Args:
            image: A PIL image (any mode) or an RGB array.

        Returns:
            The validated text, or "" when no variant validates or
            preprocessing raises.
        """
        try:
            # PIL images in other modes (L, RGBA, P, ...) would make the
            # colour conversion below raise, so normalise them to RGB first.
            if isinstance(image, Image.Image):
                image = image.convert("RGB")
            gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)

            preprocessed_images = [
                gray,  # Original grayscale
                # A 1x1 kernel is an identity operation; 3x3 actually blurs.
                cv2.GaussianBlur(gray, (3, 3), 0),  # Slight blur
                cv2.medianBlur(gray, 3),  # Noise reduction
                # Global threshold chosen automatically by Otsu's method.
                cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1],
            ]

            for processed_img in preprocessed_images:
                try:
                    text = pytesseract.image_to_string(
                        Image.fromarray(processed_img),
                        lang="ell",
                        config="--psm 6 --oem 1"
                    )

                    if self._validate_greek_text(text):
                        return text.strip()

                except Exception:
                    # This variant failed; try the next one.
                    continue

            return ""

        except Exception as e:
            print(f"[WARN] Fallback Greek OCR failed: {e}")
            return ""
    
    def _count_greek_chars(self, text):
        """Count Greek Unicode characters including polytonic marks"""
        if not text:
            return 0
            
        def is_greek_char(ch):
            o = ord(ch)
            # Greek and Coptic (0x0370-0x03FF)
            # Greek Extended (0x1F00-0x1FFF) - includes polytonic marks
            return (0x0370 <= o <= 0x03FF) or (0x1F00 <= o <= 0x1FFF)
        
        return sum(is_greek_char(ch) for ch in text)
    
    def _validate_greek_text(self, text):
        """Validate if text contains meaningful Greek content"""
        if not text or len(text.strip()) < 3:
            return False
        
        # Count Greek characters
        greek_char_count = self._count_greek_chars(text)
        total_chars = len(re.sub(r'\s+', '', text))
        
        if total_chars == 0:
            return False
        
        # Check for Latin characters (should reject if too many)
        latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
        latin_ratio = latin_chars / total_chars if total_chars > 0 else 0
        
        # If text is mostly Latin characters, reject it
        if latin_ratio > 0.8 and greek_char_count < 3:
            print(f"[INFO] Rejecting text as Greek - too many Latin chars: {latin_ratio:.2f}")
            return False
        
        # At least 20% should be Greek characters, or minimum 5 Greek chars
        greek_ratio = greek_char_count / total_chars
        
        return greek_char_count >= 5 or greek_ratio >= 0.20

    
    def _extract_distinct_terms(self, text):
        """Extract distinct Greek terms from text"""
        if not text:
            return []
        
        # Find Greek words (including those with diacritical marks)
        tokens = re.findall(r"[^\W\d_]+", text, flags=re.UNICODE)
        
        def is_greek_word(word):
            return any((0x0370 <= ord(ch) <= 0x03FF) or (0x1F00 <= ord(ch) <= 0x1FFF) 
                      for ch in word)
        
        distinct_terms = []
        seen = set()
        
        for token in tokens:
            if len(token) < 2:  # Skip single characters
                continue
                
            if is_greek_word(token):
                normalized = token.lower()
                if normalized not in seen:
                    distinct_terms.append(token)
                    seen.add(normalized)
        
        return distinct_terms[:20]  # Limit to 20 terms
    
    def process_text(self, greek_text):
        """Process extracted Greek text"""
        if not greek_text:
            return {"text": "", "terms": [], "char_analysis": {}, "validation": {}}
        
        # Extract distinct terms
        terms = self._extract_distinct_terms(greek_text)
        
        # Character analysis
        char_analysis = {
            "total_chars": len(greek_text),
            "greek_chars": self._count_greek_chars(greek_text),
            "unique_chars": len(set(greek_text)),
            "words": len(greek_text.split())
        }
        
        # Validation metrics
        validation = {
            "has_polytonic": self._has_polytonic_marks(greek_text),
            "greek_ratio": char_analysis["greek_chars"] / max(1, char_analysis["total_chars"]),
            "quality_score": self._calculate_quality_score(greek_text)
        }
        
        return {
            "text": greek_text,
            "terms": terms,
            "char_analysis": char_analysis,
            "validation": validation
        }
    
    def _has_polytonic_marks(self, text):
        """Check if text contains polytonic Greek marks"""
        # Greek Extended block contains polytonic diacritical marks
        return any(0x1F00 <= ord(ch) <= 0x1FFF for ch in text)
    
    def _calculate_quality_score(self, text):
        """Calculate a quality score for the extracted text"""
        if not text:
            return 0.0
        
        score = 0.0
        
        # Base score from Greek character ratio
        greek_ratio = self._count_greek_chars(text) / max(1, len(text))
        score += greek_ratio * 0.4
        
        # Bonus for polytonic marks (indicates authentic ancient Greek)
        if self._has_polytonic_marks(text):
            score += 0.3
        
        # Penalty for too many non-alphabetic characters
        alpha_chars = sum(ch.isalpha() for ch in text)
        alpha_ratio = alpha_chars / max(1, len(text))
        score += alpha_ratio * 0.3
        
        return min(1.0, score)
    
    def generate_historical_context(self, processed_result):
        """Assemble the historical-context payload for processed Greek text.

        Reads ``text`` and ``terms`` from ``processed_result``, asks Groq for
        a context paragraph, queries ``self.rag_service`` for up to six
        references, and returns a dict with ``uses_box``, ``meaning_box`` and
        ``references``.
        """
        greek_text = processed_result.get("text", "")
        terms = processed_result.get("terms", [])

        groq_detail = self._generate_groq_context(greek_text)

        # The RAG query is made of the words followed by every non-whitespace
        # character of the text.
        # NOTE(review): characters are neither de-duplicated nor capped, so a
        # long text yields a very long query list -- confirm this is intended.
        query_terms = list(terms) if terms else []
        for symbol in greek_text or "":
            if symbol.strip():
                query_terms.append(symbol)

        # ASCII-escaped so the debug output is safe on any console encoding.
        print(f"[DEBUG GREEK RAG] query_terms: {[t.encode('ascii', 'backslashreplace').decode() for t in query_terms]}")
        refs = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)
        print(f"[DEBUG GREEK RAG] refs: {[r['term'].encode('ascii', 'backslashreplace').decode() for r in refs]}")

        uses_box = {
            "title": "Each symbol's possible use by the Greek people",
            "items": self._build_uses_list(terms, greek_text),
        }
        meaning_box = self._build_meaning_box(terms, groq_detail)

        return {
            "uses_box": uses_box,
            "meaning_box": meaning_box,
            "references": refs,
        }
    
    def _generate_groq_context(self, greek_text):
        """Generate contextual information using Groq"""
        if not self.groq_client.is_available():
            return "(Groq unavailable) Context generation requires GROQ_API_KEY and groq package."
        
        prompt = (
            f"This ancient Greek text was found: {greek_text}\n\n"
            "Write a concise, scholarly paragraph (6-10 sentences) giving cultural and historical context: textual tradition, "
            "possible meanings, links to Greek culture/myth/philosophy, manuscript practices (accents, breathings, ligatures, nomina sacra), "
            "and paleographic cues. Avoid repeating the prompt."
        )
        
        system_prompt = "You are an expert philologist of Ancient Greece. Provide concise, accurate scholarly context."
        enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, greek_text)
        
        return self.groq_client.generate_response(
            system_prompt=enriched_system_prompt,
            user_prompt=prompt
        ) or "(context unavailable due to Groq error)"
        
    def _generate_batch_explanations(self, terms):
        """Generate scholarly glossary definitions for Greek terms in a single batch query"""
        if not terms or not self.groq_client or not self.groq_client.is_available():
            return {}
            
        # Limit to first 15 terms to prevent token limit/truncation issues
        terms_to_query = list(terms)[:15]
        terms_list = ", ".join(terms_to_query)
        
        system_prompt = (
            "You are an expert classicist and lexicographer of Ancient Greek. "
            "Return ONLY valid JSON matching the requested schema. "
            "No markdown, no code fences (like ```json), no explanations, no prose."
        )
        user_prompt = (
            f"For each of the following Ancient Greek words, provide a scholarly definition, "
            f"etymological note, and grammatical gloss:\n\n"
            f"Words: {terms_list}\n\n"
            f"You MUST format the output as a single JSON object where the keys are the exact words "
            f"and the values are objects containing 'definition', 'gloss', and 'etymology' keys.\n\n"
            f"Output schema:\n"
            f"{{\n"
            f"  \"TERM\": {{\n"
            f"    \"definition\": \"...\",\n"
            f"    \"gloss\": \"...\",\n"
            f"    \"etymology\": \"...\"\n"
            f"  }}\n"
            f"}}\n"
        )
        
        try:
            raw_response = self.groq_client.generate_response(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                max_tokens=2048,
                response_format={"type": "json_object"}
            )
            # Safe print to avoid UnicodeEncodeError in Windows command prompt
            print(f"[INFO] Groq glossary raw response: {raw_response.encode('ascii', 'backslashreplace').decode()}")
            
            # Find JSON block in response
            json_str = raw_response.strip()
            if "{" in json_str and "}" in json_str:
                start = json_str.find("{")
                end = json_str.rfind("}") + 1
                json_str = json_str[start:end]
            
            import json
            definitions = {}
            try:
                definitions = json.loads(json_str)
                self.glossary_success_count += 1
            except Exception as je:
                self.glossary_json_failure_count += 1
                import logging
                logger = logging.getLogger(__name__)
                logger.warning(
                    "Malformed Greek glossary JSON",
                    extra={"response": raw_response[:2000]}
                )
                print(f"[WARN] Standard JSON load failed: {je}. Attempting regex recovery...")
                
                # Regex recovery fallback
                import re
                self.regex_recovery_count += 1
                term_blocks = re.findall(r'"([^"]+)"\s*:\s*\{([^}]+)\}', json_str)
                for term, block in term_blocks:
                    def_match = re.search(r'"definition"\s*:\s*["\']([^"\']+)["\']', block)
                    gloss_match = re.search(r'"gloss"\s*:\s*["\']([^"\']+)["\']', block)
                    ety_match = re.search(r'"etymology"\s*:\s*["\']([^"\']+)["\']', block)
                    definitions[term] = {
                        "definition": def_match.group(1) if def_match else "",
                        "gloss": gloss_match.group(1) if gloss_match else "",
                        "etymology": ety_match.group(1) if ety_match else ""
                    }
                    
            return definitions
        except Exception as e:
            print(f"[WARN] Failed to generate batch Greek explanations: {e}")
            
        return {}

    def _build_uses_list(self, terms, greek_text):
        """Build list of symbol/word uses using RAG and batch Groq explanations"""
        import unicodedata
        items = []
        
        # 1. Get definitions for the extracted Greek words (terms)
        if terms:
            # Unique terms preserving order
            unique_terms = list(dict.fromkeys(terms))
            # Limit to top 15 terms to be concise
            unique_terms = unique_terms[:15]
            print(f"[INFO] Generating glossary for {len(unique_terms)} Greek terms...")
            definitions = {}
            missing_terms = []
            
            for term in unique_terms:
                # Check RAG corpus (normalize search query)
                norm_term = unicodedata.normalize('NFC', term).strip()
                rag_matches = self.rag_service.retrieve_grounding_list([norm_term], max_results=1)
                if rag_matches:
                    definitions[term] = rag_matches[0]["definition"]
                else:
                    missing_terms.append(term)
            
            # Generate remaining definitions with Groq in a single batch
            if missing_terms:
                groq_defs = self._generate_batch_explanations(missing_terms)
                # Normalize groq keys for matching
                normalized_groq_defs = {}
                for k, v in groq_defs.items():
                    nk = unicodedata.normalize('NFC', k).strip().lower()
                    normalized_groq_defs[nk] = v
                
                # Assign matching definitions
                for term in missing_terms:
                    nt = unicodedata.normalize('NFC', term).strip().lower()
                    if nt in normalized_groq_defs:
                        definitions[term] = normalized_groq_defs[nt]
                    else:
                        # Case/accent insensitive backup match (in case Groq stripped accents)
                        import unicodedata as ud
                        def strip_accents(s):
                            return "".join(c for c in ud.normalize('NFD', s) if ud.category(c) != 'Mn')
                        
                        stripped_t = strip_accents(nt)
                        for gk, gv in normalized_groq_defs.items():
                            if strip_accents(gk) == stripped_t:
                                definitions[term] = gv
                                break
                
            for term in unique_terms:
                definition = definitions.get(term)
                if not definition:
                    definition = f"Ancient Greek lexical term. Characterized by specific diacritics and phonological values."
                elif isinstance(definition, dict):
                    parts = []
                    d_val = definition.get("definition", "").strip()
                    g_val = definition.get("gloss", "").strip()
                    e_val = definition.get("etymology", "").strip()
                    if d_val:
                        parts.append(d_val)
                    if g_val:
                        parts.append(f"Gloss: {g_val}")
                    if e_val:
                        parts.append(f"Etymology: {e_val}")
                    definition = " | ".join(parts) if parts else "Ancient Greek lexical term."
                items.append(f"{term}: {definition}")
        
        # 2. Add significant paleographical/character markers found in the text if they are in the references
        notes = self.references.get("greek_symbol_notes", {}) or {}
        seen_chars = set()
        char_items = []
        for ch in greek_text:
            if ch in notes and ch not in seen_chars:
                seen_chars.add(ch)
                char_items.append(f"Character '{ch}': {notes[ch]}")
                
        # Limit character notes to prevent clutter
        items.extend(char_items[:5])
        
        # Format as list items with bullets
        formatted_items = [f"- {item}" for item in items]
        
        if not formatted_items:
            default_hint = self.references.get("greek_hint", 
                "Ancient Greek script marker; values are determined by polytonic diacritical marks.")
            formatted_items.append(f"- —: {default_hint}")
            
        return formatted_items

    
    def _build_meaning_box(self, terms, groq_detail):
        """Build meaning interpretation box"""
        intro_lines = [
            "The lexical concentration suggests a connected passage with recurring words or themes, consistent with Greek manuscript traditions.",
            "Scribal features such as accents/breathings, abbreviations, and marginal cues guide reading and assist with dating and genre identification."
        ]
        
        points = [
            "• Presence of nomina sacra, lection signs, or ekphonetic marks indicates liturgical usage; scholia imply classroom or commentary context.",
            "• Orthographic variation (e.g., iotacism) and common ligatures inform palaeographic placement and regional practice.",
        ]
        
        if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
            points.append(groq_detail.strip())
        
        return {
            "title": "Possible meaning:",
            "intro_lines": intro_lines,
            "frequent_label": "Key terms noted",
            "frequent": terms[:10],
            "points": points
        }
    
    def generate_story(self, processed_result):
        """Generate creative story for Greek text"""
        greek_text = processed_result.get("text", "")
        
        if not self.groq_client.is_available():
            return "Groq client unavailable, cannot generate story."
        
        styles = [
            "as an epic poem told by a travelling rhapsode",
            "as a prophecy inscribed on the Oracle at Delphi",
            "as a philosophical dialogue in the Academy",
            "as a myth recounted by ancient storytellers",
            "as a recovered scroll from the Library of Alexandria",
            "as a hymn sung in honor of the gods"
        ]
        
        import random
        chosen_style = random.choice(styles)
        seed = random.randint(1000, 9999)
        
        prompt = (
            f"The following ancient Greek text was found: {greek_text}\n\n"
            f"Create a long, vivid, imaginative story from ancient Greek times "
            f"based on this Greek text. Write it as one rich paragraph with "
            f"much detail, mystery, and cultural atmosphere. At least 200 words.\n\n"
            f"Creative seed: {seed}\n"
            f"Write a detailed, imaginative myth-like story {chosen_style}. "
            "Include multiple characters, rich imagery, and scenes. "
            "Avoid repetition and keep it unpredictable."
        )
        
        system_prompt = "You are a learned ancient Greek storyteller and scholar of Hellenic culture."
        
        story = self.groq_client.generate_response(
            system_prompt=system_prompt,
            user_prompt=prompt
        )
        
        if not story or is_gibberish(story):
            return "Failed to create quality story; the ancient texts remain silent."
        
        return story