Spaces:

Akshay30
/

decipherai-api

Sleeping

File size: 61,650 Bytes

import os
import cv2
import numpy as np
import re
import time
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch
from .base_processor import BaseScriptProcessor
from utils.text_utils import is_gibberish

# Absolute path of the "models" directory that sits one level above this file.
BACKEND_MODELS_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, "models")
)
# Per-checkpoint Hugging Face cache directories inside BACKEND_MODELS_DIR.
TRIDIS_MODEL_DIR, TROCR_LATIN_MODEL_DIR = (
    os.path.join(BACKEND_MODELS_DIR, leaf) for leaf in ("tridis", "trocr_latin")
)


class LatinProcessor(BaseScriptProcessor):
    def __init__(self, groq_client, references, clip_classifier):
        """Create the processor with both neural recognizers still unloaded.

        The three arguments are forwarded unchanged to the base-class
        constructor. The TRIDIS and TrOCR slots start out empty and are
        filled on demand by ``setup_tridis_htr`` / ``setup_trocr_base_latin``;
        only the Tesseract fallback is configured eagerly. The instance is
        finally registered under the key ``"latin"`` with
        ``utils.gpu_diagnostics.register_processor``.
        """
        super().__init__(groq_client, references, clip_classifier)

        # Use the GPU whenever torch reports one, otherwise stay on the CPU.
        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)

        # Cursive recognizer state (model, processor, availability flag).
        self.tridis_model, self.tridis_processor, self.tridis_available = None, None, False

        # Printed recognizer state (model, processor, availability flag).
        self.trocr_latin_model, self.trocr_latin_processor, self.trocr_latin_available = None, None, False

        # Bookkeeping about the most recent extraction; updated by extract_text().
        self.active_style = "cursive"
        self.active_model = "None"

        self.setup_tesseract_fallback()

        # Register for dynamic VRAM management
        from utils.gpu_diagnostics import register_processor
        register_processor("latin", self)

    def setup_tridis_htr(self):
        """Setup TRIDIS HTR model - BEST for medieval Latin manuscripts"""
        try:
            from utils.gpu_diagnostics import reclaim_vram_for
            reclaim_vram_for("latin")
            
            print("[INFO] Lazily loading TRIDIS HTR model for medieval Latin...")
            print("[INFO] This model specializes in 13th-16th century manuscripts with automatic abbreviation expansion")
            
            # TRIDIS model from Hugging Face - runs locally after download
            import os
            HF_TOKEN = os.getenv("HF_TOKEN")
            self.tridis_processor = TrOCRProcessor.from_pretrained(
                'magistermilitum/tridis_HTR',
                cache_dir=TRIDIS_MODEL_DIR,
                local_files_only=False,
                token=HF_TOKEN
            )
            self.tridis_model = VisionEncoderDecoderModel.from_pretrained(
                'magistermilitum/tridis_HTR', 
                cache_dir=TRIDIS_MODEL_DIR,
                local_files_only=False,
                token=HF_TOKEN
            )
            
            self.tridis_model.to(self.device)
            self.tridis_model.eval()  # Put in evaluation mode
            
            from utils.gpu_diagnostics import log_model_device
            log_model_device("Latin TRIDIS HTR (Cursive)", self.device)
            
            print(f"[INFO] TRIDIS HTR loaded successfully on {self.device}")
            print("[INFO] Training: 245,000 lines of Latin/Old French/Old Spanish medieval manuscripts")
            print("[INFO] Features: Automatic abbreviation expansion, named entity capitalization, cancellation markers")
            self.tridis_available = True
            
        except Exception as e:
            print(f"[ERROR] TRIDIS HTR model failed to load: {e}")
            print("[WARN] Falling back to Tesseract for basic Latin recognition...")
            self.tridis_available = False

    def setup_trocr_base_latin(self):
        """Setup TRIDIS v2 HTR model - Primary for printed or manuscript Latin, fallback to printed"""
        import os
        HF_TOKEN = os.getenv("HF_TOKEN")
        try:
            from utils.gpu_diagnostics import reclaim_vram_for
            reclaim_vram_for("latin")
            
            print("[LATIN] Loading TRIDIS v2 model...")
            self.trocr_latin_processor = TrOCRProcessor.from_pretrained(
                'magistermilitum/tridis_v2_HTR_historical_manuscripts',
                cache_dir=TROCR_LATIN_MODEL_DIR,
                local_files_only=False,
                token=HF_TOKEN
            )
            self.trocr_latin_model = VisionEncoderDecoderModel.from_pretrained(
                'magistermilitum/tridis_v2_HTR_historical_manuscripts',
                cache_dir=TROCR_LATIN_MODEL_DIR,
                local_files_only=False,
                token=HF_TOKEN
            )
            
            self.trocr_latin_model.to(self.device)
            self.trocr_latin_model.eval()  # Put in evaluation mode
            
            from utils.gpu_diagnostics import log_model_device
            log_model_device("Latin TRIDIS v2 HTR", self.device)
            
            self.trocr_latin_available = True
            self.loaded_printed_model_name = "tridis_v2_HTR_historical_manuscripts"
            print("[LATIN] TRIDIS v2 model loaded successfully")
            print(f"processor class: {type(self.trocr_latin_processor).__name__}")
            print(f"model class: {type(self.trocr_latin_model).__name__}")
            print(f"device: {self.device}")
            print(f"parameter count: {sum(p.numel() for p in self.trocr_latin_model.parameters())}")
        except Exception as e:
            print(f"[LATIN] TRIDIS unavailable, using microsoft/trocr-base-printed")
            try:
                # Free VRAM again in case partial allocation left residue
                reclaim_vram_for("latin")
                self.trocr_latin_processor = TrOCRProcessor.from_pretrained(
                    'microsoft/trocr-base-printed',
                    cache_dir=TROCR_LATIN_MODEL_DIR,
                    local_files_only=False,
                    token=HF_TOKEN
                )
                self.trocr_latin_model = VisionEncoderDecoderModel.from_pretrained(
                    'microsoft/trocr-base-printed',
                    cache_dir=TROCR_LATIN_MODEL_DIR,
                    local_files_only=False,
                    token=HF_TOKEN
                )
                self.trocr_latin_model.to(self.device)
                self.trocr_latin_model.eval()  # Put in evaluation mode
                
                from utils.gpu_diagnostics import log_model_device
                log_model_device("Latin TrOCR (Printed Fallback)", self.device)
                
                self.trocr_latin_available = True
                self.loaded_printed_model_name = "trocr-base-printed"
                print(f"[INFO] Public fallback microsoft/trocr-base-printed loaded successfully on {self.device}")
                print(f"processor class: {type(self.trocr_latin_processor).__name__}")
                print(f"model class: {type(self.trocr_latin_model).__name__}")
                print(f"device: {self.device}")
                print(f"parameter count: {sum(p.numel() for p in self.trocr_latin_model.parameters())}")
            except Exception as ex:
                print(f"[ERROR] All printed Latin models failed to load: {ex}")
                self.trocr_latin_available = False


    def setup_tesseract_fallback(self):
        """Setup Tesseract as fallback for basic Latin recognition"""
        try:
            import pytesseract
            
            # Test Tesseract availability
            try:
                version = pytesseract.get_tesseract_version()
                print(f"[INFO] Tesseract fallback version: {version}")
            except:
                print("[INFO] Tesseract version check skipped")
            
            self.ocr_configs = {
                'medieval_extended': r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,;:!?()[]{}/-·&℞℟℣†‡¶§꜠꜡ꜢꜣꜤꜥꝀꝁꝐꝑꝒꝓꝔꝕꝖꝗꝘꝙꝚꝛꝜꝝꞀꞁꞂꞃ$',
                'medieval_basic': r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,;:!?()[]{}/-',
                'standard': r'--oem 3 --psm 6',
                'single_line': r'--oem 3 --psm 7',
                'single_word': r'--oem 3 --psm 8',
                'auto': r'--oem 3 --psm 3'
            }
            self.tesseract_available = True
            print("[INFO] Tesseract fallback configured with medieval symbol support")
            
        except ImportError:
            print("[ERROR] pytesseract not available")
            self.tesseract_available = False
        except Exception as e:
            print(f"[WARN] Tesseract setup failed: {e}")
            self.tesseract_available = False

    def detect_script(self, image_path):
        """Detection handled by Groq Vision classification"""
        try:
            if not self.tridis_available and not self.tesseract_available:
                print("[ERROR] No OCR engines available for Latin processing")
                return False, 0.0
            
            method = "TRIDIS HTR (medieval specialist)" if self.tridis_available else "Tesseract fallback"
            print(f"[INFO] Latin processor activated - Using {method}")
            return True, 0.98 if self.tridis_available else 0.85
            
        except Exception as e:
            print(f"[ERROR] Latin detection failed: {e}")
            return False, 0.0

    def extract_text(self, image_path):
        """Extract text using dual-mode routing: trocr-base-latin for printed, tridis_HTR for cursive"""
        try:
            start_time = time.time()
            
            # Step 1: Detect script style
            style = self.layout_parser.detect_writing_style(image_path, self.clip_classifier)
            print(f"[INFO] Latin writing style detected: {style.upper()}")
            
            primary_text = ""
            fallback_text = ""
            
            # Ensure the required model is loaded dynamically
            if style == "printed":
                if self.trocr_latin_model is None:
                    self.setup_trocr_base_latin()
                else:
                    from utils.gpu_diagnostics import reclaim_vram_for
                    reclaim_vram_for("latin")
                    if str(next(self.trocr_latin_model.parameters()).device) != str(self.device):
                        print(f"[VRAM MANAGER] Activating Latin TrOCR (Printed) model on {self.device}...")
                        self.trocr_latin_model.to(self.device)
            else:
                if self.tridis_model is None:
                    self.setup_tridis_htr()
                else:
                    from utils.gpu_diagnostics import reclaim_vram_for
                    reclaim_vram_for("latin")
                    if str(next(self.tridis_model.parameters()).device) != str(self.device):
                        print(f"[VRAM MANAGER] Activating Latin TRIDIS HTR (Cursive) model on {self.device}...")
                        self.tridis_model.to(self.device)
            
            if style == "printed" and self.trocr_latin_available:
                print("[INFO] Routing to printed/carved Latin model (trocr-base-latin)...")
                primary_text = self._extract_with_trocr_base_latin(image_path)
                if primary_text and self._validate_latin_text(primary_text, style):
                    processing_time = time.time() - start_time
                    print(f"[SUCCESS] Routed to trocr-base-latin and completed in {processing_time:.2f}s")
                    self.active_style = "printed"
                    self.active_model = getattr(self, "loaded_printed_model_name", "tridis_v2_HTR_historical_manuscripts")
                    return primary_text
                else:
                    print("[WARN] trocr-base-latin returned poor quality result, trying TRIDIS HTR fallback...")
                    if self.tridis_model is None:
                        self.setup_tridis_htr()
                    if self.tridis_available:
                        fallback_text = self._extract_with_tridis_htr(image_path)
            
            else:  # cursive / manuscript
                print("[INFO] Routing to medieval manuscript model (tridis_HTR)...")
                if self.tridis_available:
                    primary_text = self._extract_with_tridis_htr(image_path)
                    if primary_text and self._validate_latin_text(primary_text, style):
                        processing_time = time.time() - start_time
                        print(f"[SUCCESS] Routed to tridis_HTR and completed in {processing_time:.2f}s")
                        self.active_style = "cursive"
                        self.active_model = "tridis_HTR"
                        return primary_text
                    else:
                        print("[WARN] TRIDIS HTR returned poor quality result, trying trocr-base-latin fallback...")
                        if self.trocr_latin_model is None:
                            self.setup_trocr_base_latin()
                        if self.trocr_latin_available:
                            fallback_text = self._extract_with_trocr_base_latin(image_path)
            
            # Step 2: Check fallback text from the other model
            if fallback_text and self._validate_latin_text(fallback_text, "printed" if style == "cursive" else "cursive"):
                processing_time = time.time() - start_time
                print(f"[SUCCESS] Fallback model transcription successful in {processing_time:.2f}s")
                self.active_style = "printed" if style == "cursive" else "cursive"
                self.active_model = getattr(self, "loaded_printed_model_name", "tridis_v2_HTR_historical_manuscripts") if style == "cursive" else "tridis_HTR"
                return fallback_text
                
            # Step 3: Tesseract fallback
            if self.tesseract_available:
                print("[INFO] Neural models failed. Processing with Tesseract fallback...")
                tesseract_text = self._extract_with_tesseract_enhanced(image_path)
                
                if tesseract_text and self._validate_latin_text(tesseract_text, "any"):
                    processing_time = time.time() - start_time
                    print(f"[SUCCESS] Tesseract fallback completed in {processing_time:.2f}s")
                    self.active_style = "printed"  # Tesseract works best on printed
                    self.active_model = "Tesseract OCR"
                    return tesseract_text
                else:
                    print("[WARN] Tesseract returned poor quality result, trying layout-aware segmentation fallback...")
                    
                    # Method 3: Layout-aware line segment fallback
                    layout_aware_text = self._extract_layout_aware_ocr(image_path)
                    if layout_aware_text and self._validate_latin_text(layout_aware_text, "any"):
                        processing_time = time.time() - start_time
                        print(f"[SUCCESS] Layout-aware OCR completed in {processing_time:.2f}s")
                        self.active_style = "printed"
                        self.active_model = "Tesseract Layout-Aware"
                        return layout_aware_text
            
            print("[ERROR] All OCR methods failed or returned poor quality results")
            self.active_style = "unknown"
            self.active_model = "None"
            return "No readable Latin text detected with sufficient confidence"
            
        except Exception as e:
            print(f"[ERROR] Latin text extraction failed: {e}")
            self.active_style = "error"
            self.active_model = "None"
            return f"Error during text extraction: {str(e)}"

    def _extract_with_trocr_base_latin(self, image_path):
        """Extract text using trocr-base-latin - SPECIALIZED for printed/carved Latin"""
        if self.trocr_latin_model is None:
            self.setup_trocr_base_latin()
        else:
            from utils.gpu_diagnostics import reclaim_vram_for
            reclaim_vram_for("latin")
            if str(next(self.trocr_latin_model.parameters()).device) != str(self.device):
                print(f"[VRAM MANAGER] Activating Latin TrOCR model on {self.device}...")
                self.trocr_latin_model.to(self.device)
                
        if not getattr(self, 'trocr_latin_available', False) or self.trocr_latin_model is None:
            return ""
            
        try:
            image = Image.open(image_path).convert("RGB")
            print(f"[INFO] Processing image with trocr-base-latin: {image.size[0]}x{image.size[1]} pixels")
            
            # Since trocr models are line-level OCR models, segment into lines first
            layout = self.layout_parser.analyze_layout(image_path)
            crops = self.layout_parser.crop_lines(image_path, layout)
            
            if crops and len(crops) > 1:
                print(f"[INFO] Image contains multiple lines ({len(crops)}). Running line-by-line trocr-base-latin...")
                line_texts = []
                for idx, crop in enumerate(crops):
                    text = self._ocr_single_crop_with_trocr_base_latin(crop)
                    if text:
                        line_texts.append(text)
                return "\n".join(line_texts)
            else:
                print("[INFO] Single line detected or layout parser returned no lines. Processing full image...")
                return self._ocr_single_crop_with_trocr_base_latin(image)
                
        except Exception as e:
            print(f"[ERROR] trocr-base-latin extraction failed: {e}")
            return ""

    def _ocr_single_crop_with_trocr_base_latin(self, crop_image):
        """Helper to run trocr-base-latin inference on a single image crop"""
        try:
            pixel_values = self.trocr_latin_processor(
                images=crop_image, 
                return_tensors="pt"
            ).pixel_values.to(self.device)
            
            with torch.inference_mode():
                generated_ids = self.trocr_latin_model.generate(
                    pixel_values,
                    max_length=512,
                    num_beams=4,
                    early_stopping=True
                )
                
            text = self.trocr_latin_processor.batch_decode(
                generated_ids, 
                skip_special_tokens=True
            )[0]
            
            text = ' '.join(text.split())
            return text.strip()
        except Exception as e:
            print(f"[ERROR] Single line OCR with trocr-base-latin failed: {e}")
            return ""

    def _extract_with_tridis_htr(self, image_path):
        """Extract text using TRIDIS HTR - SPECIALIZED for medieval Latin manuscripts.
        Uses layout-aware line segmentation so multi-line documents are fully transcribed."""
        if self.tridis_model is None:
            self.setup_tridis_htr()
        else:
            from utils.gpu_diagnostics import reclaim_vram_for
            reclaim_vram_for("latin")
            if str(next(self.tridis_model.parameters()).device) != str(self.device):
                print(f"[VRAM MANAGER] Activating Latin TRIDIS model on {self.device}...")
                self.tridis_model.to(self.device)
                
        if not getattr(self, 'tridis_available', False) or self.tridis_model is None:
            return ""
            
        try:
            # Load and validate image
            image = Image.open(image_path).convert("RGB")
            print(f"[INFO] Processing image with TRIDIS HTR: {image.size[0]}x{image.size[1]} pixels")
            
            # Use layout parser to segment into individual lines
            layout = self.layout_parser.analyze_layout(image_path)
            crops = self.layout_parser.crop_lines(image_path, layout)
            
            if crops and len(crops) > 1:
                # Cap lines to prevent timeout on very large documents (CPU inference)
                MAX_LINES = 50
                total_detected = len(crops)
                if len(crops) > MAX_LINES:
                    print(f"[INFO] Layout parser detected {total_detected} text lines. Capping to {MAX_LINES} for performance.")
                    crops = crops[:MAX_LINES]
                else:
                    print(f"[INFO] Layout parser detected {total_detected} text lines. Running line-by-line TRIDIS HTR...")
                
                line_texts = []
                for idx, crop in enumerate(crops):
                    # Preprocess each line crop for medieval manuscripts
                    enhanced_crop = self._preprocess_for_medieval_manuscript(crop)
                    text = self._ocr_single_crop_with_tridis(enhanced_crop)
                    if text:
                        line_texts.append(text)
                        print(f"  [LINE {idx+1}/{len(crops)}] {text[:80]}...")
                
                if line_texts:
                    full_text = "\n".join(line_texts)
                    # Post-process medieval abbreviations, corrections, and formatting
                    processed_text = self._post_process_medieval_text(full_text)
                    
                    char_count = len(processed_text)
                    word_count = len(processed_text.split())
                    print(f"[INFO] TRIDIS HTR extracted (multi-line): {char_count} characters, {word_count} words from {len(line_texts)} lines")
                    
                    medieval_features = self._analyze_medieval_features(processed_text)
                    if medieval_features:
                        print(f"[INFO] Medieval features detected: {', '.join(medieval_features)}")
                    
                    return processed_text.strip()
            
            # Single line or no layout detected — process full image
            print("[INFO] Single line or no layout segmentation. Processing full image with TRIDIS HTR...")
            enhanced_image = self._preprocess_for_medieval_manuscript(image)
            
            # Process with TRIDIS HTR
            print("[INFO] Running TRIDIS HTR inference...")
            pixel_values = self.tridis_processor(
                images=enhanced_image, 
                return_tensors="pt"
            ).pixel_values.to(self.device)
            
            # Generate text with parameters optimized for medieval manuscripts
            with torch.inference_mode():
                generated_ids = self.tridis_model.generate(
                    pixel_values,
                    max_length=768,  # Longer sequences for medieval texts with abbreviations
                    num_beams=6,     # Higher quality beam search for historical accuracy
                    early_stopping=True,
                    do_sample=False,
                    repetition_penalty=1.15,  # Avoid repetition common in medieval texts
                    length_penalty=0.8,       # Don't penalize longer expansions
                    no_repeat_ngram_size=2    # Avoid immediate repetitions
                )
            
            # Decode the generated text
            generated_text = self.tridis_processor.batch_decode(
                generated_ids, 
                skip_special_tokens=True
            )[0]
            
            # Post-process medieval abbreviations, corrections, and formatting
            processed_text = self._post_process_medieval_text(generated_text)
            
            # Log extraction results
            char_count = len(processed_text)
            word_count = len(processed_text.split())
            print(f"[INFO] TRIDIS HTR extracted: {char_count} characters, {word_count} words")
            
            # Detect medieval features
            medieval_features = self._analyze_medieval_features(processed_text)
            if medieval_features:
                print(f"[INFO] Medieval features detected: {', '.join(medieval_features)}")
            
            return processed_text.strip()
            
        except Exception as e:
            print(f"[ERROR] TRIDIS HTR extraction failed: {e}")
            return ""

    def _ocr_single_crop_with_tridis(self, crop_image):
        """Helper to run TRIDIS HTR inference on a single line crop image"""
        try:
            pixel_values = self.tridis_processor(
                images=crop_image, 
                return_tensors="pt"
            ).pixel_values.to(self.device)
            
            with torch.inference_mode():
                generated_ids = self.tridis_model.generate(
                    pixel_values,
                    max_length=768,
                    num_beams=6,
                    early_stopping=True,
                    do_sample=False,
                    repetition_penalty=1.15,
                    length_penalty=0.8,
                    no_repeat_ngram_size=2
                )
                
            text = self.tridis_processor.batch_decode(
                generated_ids, 
                skip_special_tokens=True
            )[0]
            
            text = ' '.join(text.split())
            return text.strip()
        except Exception as e:
            print(f"[ERROR] Single line OCR with TRIDIS failed: {e}")
            return ""

    def _preprocess_for_medieval_manuscript(self, image):
        """Enhanced preprocessing specifically optimized for medieval manuscripts"""
        try:
            print("[INFO] Applying medieval manuscript preprocessing...")
            
            # Convert to OpenCV format
            image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
            
            # Step 1: Handle parchment/paper background variations
            # CLAHE for local contrast enhancement (handles uneven illumination)
            clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8,8))
            contrast_enhanced = clahe.apply(gray)
            
            # Step 2: Gentle denoising to preserve medieval letterforms and ink variations
            # Bilateral filter preserves edges while reducing noise
            denoised = cv2.bilateralFilter(contrast_enhanced, 7, 80, 80)
            
            # Step 3: Enhance faded ink while preserving original stroke width
            # Subtle sharpening kernel
            sharpen_kernel = np.array([
                [-0.5, -1, -0.5],
                [-1,   6, -1  ],
                [-0.5, -1, -0.5]
            ])
            sharpened = cv2.filter2D(denoised, -1, sharpen_kernel)
            
            # Step 4: Normalize intensity range for optimal TRIDIS input
            normalized = cv2.normalize(sharpened, None, 0, 255, cv2.NORM_MINMAX)
            
            # Convert back to PIL format and ensure it is RGB mode
            processed_image = Image.fromarray(normalized).convert("RGB")
            
            print("[INFO] Medieval preprocessing completed: contrast enhanced, denoised, sharpened")
            return processed_image
            
        except Exception as e:
            print(f"[WARN] Medieval preprocessing failed: {e}, using original image")
            return image

    def _post_process_medieval_text(self, text):
        """Post-process text from TRIDIS HTR with medieval-specific corrections"""
        try:
            if not text:
                return text
            
            print("[INFO] Post-processing TRIDIS HTR output for medieval features...")
            processed = text
            
            # Handle TRIDIS cancellation/correction markers
            # TRIDIS uses $word$ to mark cancelled/corrected text
            import re
            
            # Count cancellations before processing
            cancellation_count = processed.count('$') // 2
            
            # Convert $word$ to editorial brackets [word] for scholarly display
            processed = re.sub(r'\$([^$]*)\$', r'[\1]', processed)
            
            if cancellation_count > 0:
                print(f"[INFO] Processed {cancellation_count} scribal corrections/cancellations")
            
            # Clean up multiple spaces and normalize whitespace
            processed = ' '.join(processed.split())
            
            # Detect and log TRIDIS abbreviation expansions
            # Common medieval abbreviations that TRIDIS expands automatically
            medieval_expansions = {
                'domini': 'dñi/dni/dom̃',
                'facimus': 'facim̃/facimꝰ', 
                'quod': 'qd/q̃d',
                'enim': 'enim̃/en̄',
                'pro': 'ꝓ/p̃',
                'et': '⁊/et̃',
                'cum': 'cũ/cum̃',
                'per': 'p̃/ꝑ',
                'sunt': 'sũt/sunt̃',
                'omnia': 'om̃ia/omn̄a'
            }
            
            expansions_found = []
            for expansion, abbreviations in medieval_expansions.items():
                if expansion in processed.lower():
                    expansions_found.append(f"{abbreviations}→{expansion}")
            
            if expansions_found:
                print(f"[INFO] TRIDIS expanded abbreviations: {', '.join(expansions_found[:5])}")
                if len(expansions_found) > 5:
                    print(f"[INFO] ... and {len(expansions_found) - 5} more abbreviations")
            
            # Detect capitalization patterns (TRIDIS capitalizes named entities)
            capitalized_words = re.findall(r'\b[A-Z][a-z]+', processed)
            if capitalized_words:
                unique_caps = list(set(capitalized_words))
                print(f"[INFO] Named entities capitalized: {', '.join(unique_caps[:5])}")
                if len(unique_caps) > 5:
                    print(f"[INFO] ... and {len(unique_caps) - 5} more entities")
            
            return processed
            
        except Exception as e:
            print(f"[WARN] Medieval post-processing failed: {e}")
            return text

    def _analyze_medieval_features(self, text):
        """Analyze and identify medieval manuscript features in the text"""
        features = []
        
        if not text:
            return features
        
        try:
            # Cancellation markers
            if '[' in text and ']' in text:
                features.append("scribal corrections")
            
            # Expanded abbreviations
            medieval_words = ['domini', 'facimus', 'quod', 'enim', 'pro', 'cum', 'per', 'sunt', 'omnia']
            found_expansions = [word for word in medieval_words if word in text.lower()]
            if found_expansions:
                features.append(f"abbreviation expansions ({len(found_expansions)})")
            
            # Named entity capitalization
            import re
            caps_count = len(re.findall(r'\b[A-Z][a-z]+', text))
            if caps_count > 0:
                features.append(f"capitalized entities ({caps_count})")
            
            # Medieval punctuation patterns
            if '.' in text or ',' in text or ':' in text:
                features.append("punctuation normalization")
            
            # Special medieval characters
            medieval_chars = sum(1 for c in text if c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§")
            if medieval_chars > 0:
                features.append(f"medieval symbols ({medieval_chars})")
            
        except Exception as e:
            print(f"[WARN] Medieval feature analysis failed: {e}")
        
        return features

    def _extract_with_tesseract_enhanced(self, image_path):
        """Enhanced Tesseract extraction with multiple configurations"""
        try:
            import pytesseract
            
            image = Image.open(image_path).convert("RGB")
            
            # Multiple preprocessing approaches
            preprocessed_images = {
                'enhanced': self._preprocess_for_tesseract_enhanced(image),
                'basic': self._preprocess_for_tesseract_basic(image),
                'original': image
            }
            
            best_text = ""
            best_score = 0
            best_config = ""
            best_preprocessing = ""
            
            # Try different combinations of preprocessing and OCR configurations
            for prep_name, prep_image in preprocessed_images.items():
                for config_name, config in self.ocr_configs.items():
                    try:
                        # Try with Latin language first
                        text = pytesseract.image_to_string(
                            prep_image, 
                            lang='lat', 
                            config=config
                        ).strip()
                        
                        # If Latin fails or produces poor results, try English
                        if not text or len(text) < 5:
                            text = pytesseract.image_to_string(
                                prep_image, 
                                lang='eng', 
                                config=config
                            ).strip()
                        
                        # Score the result
                        score = self._score_tesseract_result(text)
                        
                        if text and score > best_score:
                            best_text = text
                            best_score = score
                            best_config = config_name
                            best_preprocessing = prep_name
                    
                    except Exception as e:
                        continue  # Skip failed configurations
            
            if best_text:
                print(f"[INFO] Best Tesseract result: {best_preprocessing} + {best_config} (score: {best_score:.3f})")
                return self._post_process_tesseract_text(best_text)
            
            return ""
            
        except Exception as e:
            print(f"[ERROR] Enhanced Tesseract extraction failed: {e}")
            return ""

    def _extract_layout_aware_ocr(self, image_path):
        """Extract text by segmenting the page layout into lines first for improved readability order"""
        try:
            import pytesseract
            print("[INFO] Running layout-aware line segmentation...")
            layout = self.layout_parser.analyze_layout(image_path)
            crops = self.layout_parser.crop_lines(image_path, layout)
            
            if not crops:
                print("[WARN] Layout parser returned no line crops")
                return ""
                
            print(f"[INFO] Layout-aware line parser cropped {len(crops)} lines")
            line_texts = []
            
            for idx, crop in enumerate(crops):
                # Enhance line crop for OCR
                crop_cv = cv2.cvtColor(np.array(crop), cv2.COLOR_RGB2BGR)
                gray = cv2.cvtColor(crop_cv, cv2.COLOR_BGR2GRAY)
                clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(4,4))
                enhanced = clahe.apply(gray)
                crop_pil = Image.fromarray(enhanced)
                
                # Single line OCR configuration
                config = '--oem 3 --psm 7'
                
                # Try Latin OCR first
                text = pytesseract.image_to_string(
                    crop_pil,
                    lang='lat',
                    config=config
                ).strip()
                
                # Try English fallback
                if not text or len(text) < 3:
                    text = pytesseract.image_to_string(
                        crop_pil,
                        lang='eng',
                        config=config
                    ).strip()
                    
                if text:
                    line_texts.append(self._post_process_tesseract_text(text))
            
            return "\n".join(line_texts)
        except Exception as e:
            print(f"[WARN] Layout aware Latin OCR failed: {e}")
            return ""

    def _preprocess_for_tesseract_enhanced(self, image):
        """Enhanced preprocessing for Tesseract OCR"""
        try:
            image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
            
            # More aggressive enhancement for Tesseract
            clahe = cv2.createCLAHE(clipLimit=4.0, tileGridSize=(8,8))
            enhanced = clahe.apply(gray)
            
            # Morphological operations to clean up characters
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
            cleaned = cv2.morphologyEx(enhanced, cv2.MORPH_CLOSE, kernel)
            
            return Image.fromarray(cleaned)
            
        except Exception as e:
            print(f"[WARN] Enhanced Tesseract preprocessing failed: {e}")
            return image

    def _preprocess_for_tesseract_basic(self, image):
        """Basic preprocessing for Tesseract OCR"""
        try:
            image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
            
            # Simple contrast enhancement
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
            enhanced = clahe.apply(gray)
            
            return Image.fromarray(enhanced)
            
        except Exception as e:
            return image

    def _score_tesseract_result(self, text):
        """Score Tesseract OCR result quality"""
        if not text or len(text.strip()) < 2:
            return 0.0
        
        score = 0.0
        words = text.split()
        
        # Base length bonus
        score += min(len(words) / 15.0, 0.25)
        
        # Latin character ratio
        latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
        if len(text) > 0:
            latin_ratio = latin_chars / len(text)
            score += latin_ratio * 0.35
        
        # Word formation bonus
        if len(words) > 1:
            score += 0.2
        
        # Common Latin words bonus
        common_latin = ['et', 'in', 'de', 'ad', 'cum', 'pro', 'per', 'ex', 'ab', 'post', 'ante', 'inter']
        latin_matches = sum(1 for word in words if word.lower() in common_latin)
        if latin_matches > 0:
            score += latin_matches * 0.05
        
        # Medieval symbols bonus
        medieval_symbols = ['꜠', '꜡', 'ꜣ', 'ꜥ', 'ꝁ', 'ꝑ', 'ꝛ', 'ꞁ', 'ꞃ', '℞', '℟', '℣', '†', '‡', '¶', '§']
        symbol_count = sum(1 for symbol in medieval_symbols if symbol in text)
        if symbol_count > 0:
            score += 0.15
        
        # Penalize excessive garbage characters
        garbage_chars = sum(1 for c in text if not c.isalnum() and c not in " .,;:!?()[]{}/-·&℞℟℣†‡¶§꜠꜡ꜢꜣꜤꜥꝀꝁ")
        if len(text) > 0:
            garbage_ratio = garbage_chars / len(text)
            score -= garbage_ratio * 0.3
        
        return max(0.0, min(1.0, score))

    def _post_process_tesseract_text(self, text):
        """Post-process Tesseract OCR result"""
        try:
            # Clean up common OCR errors
            corrections = {
                'rn': 'm',
                'cl': 'd', 
                '|': 'I',
                '°': 'o',
                '¢': 'c',
                '£': 'E'
            }
            
            processed = text
            for wrong, correct in corrections.items():
                processed = processed.replace(wrong, correct)
            
            # Normalize whitespace
            processed = ' '.join(processed.split())
            
            return processed
            
        except Exception as e:
            print(f"[WARN] Tesseract post-processing failed: {e}")
            return text

    def _validate_latin_text(self, text, style="any"):
        """Validate text with criteria appropriate for classical/printed or medieval Latin"""
        if not text or len(text.strip()) < 3:
            return False
            
        try:
            # Count Latin characters
            latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
            total_chars = len(text.replace(' ', ''))
            
            if total_chars == 0:
                return False
                
            latin_ratio = latin_chars / max(total_chars, 1)
            
            # For printed/classical Latin, we require a high ratio of standard alphabetical letters
            if style == "printed":
                return latin_chars >= 5 and latin_ratio >= 0.6
                
            # For cursive/medieval Latin, we can be more generous and include medieval symbol weight
            medieval_symbols = sum(1 for c in text if c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§[]")
            medieval_words = ['domini', 'facimus', 'quod', 'enim', 'pro', 'cum', 'per', 'sunt']
            word_bonus = sum(3 for word in medieval_words if word in text.lower())
            
            total_meaningful = latin_chars + medieval_symbols + word_bonus
            meaningful_ratio = total_meaningful / max(total_chars, 1)
            
            if total_meaningful >= 10:
                return True
            elif meaningful_ratio >= 0.6:
                return True
            elif total_meaningful >= 5 and meaningful_ratio >= 0.3:
                return True
            else:
                return False
                
        except Exception as e:
            print(f"[WARN] Text validation failed: {e}")
            return len(text.strip()) >= 5  # Fallback validation


    def process_text(self, latin_text):
        """Process extracted Latin text with comprehensive TRIDIS-aware analysis"""
        if not latin_text:
            return {"text": "", "symbols": [], "char_analysis": {}, "validation": {}}
        
        print("[INFO] Processing Latin text with medieval manuscript analysis...")
        
        # Extract symbols including medieval markers and corrections
        symbols = ''.join(filter(lambda x: x.isalnum() or x in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§$[]", latin_text))
        
        # Comprehensive medieval character analysis
        medieval_symbols = [c for c in latin_text if c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§"]
        correction_markers = latin_text.count('[') + latin_text.count('$')
        
        # Detect expanded abbreviations
        medieval_abbreviations = ['domini', 'facimus', 'pro', 'quod', 'enim', 'cum', 'per', 'sunt', 'omnia']
        expansions_found = [word for word in medieval_abbreviations if word in latin_text.lower()]
        
        # Count capitalized entities (TRIDIS feature)
        import re
        capitalized_entities = re.findall(r'\b[A-Z][a-z]+', latin_text)
        unique_entities = list(set(capitalized_entities))
        
        # Comprehensive character analysis
        char_analysis = {
            "total_chars": len(latin_text),
            "alpha_chars": sum(c.isalpha() for c in latin_text),
            "unique_chars": len(set(latin_text)),
            "word_count": len(latin_text.split()),
            "medieval_symbols": len(medieval_symbols),
            "medieval_symbol_types": list(set(medieval_symbols)),
            "abbreviation_expansions": expansions_found,
            "expansion_count": len(expansions_found),
            "correction_markers": correction_markers,
            "capitalized_entities": unique_entities,
            "entity_count": len(unique_entities),
            "avg_word_length": sum(len(word) for word in latin_text.split()) / max(1, len(latin_text.split()))
        }
        
        # Enhanced validation with medieval features
        validation = {
            "latin_ratio": sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in latin_text) / max(1, len(latin_text)),
            "quality_score": self._calculate_comprehensive_quality_score(latin_text),
            "ocr_method": getattr(self, 'active_model', "TRIDIS HTR (Medieval Manuscript Specialist)" if self.tridis_available else "Tesseract OCR"),
            "model_specialization": "General Latin text" if getattr(self, 'active_style', '') == 'printed' else ("13th-16th century manuscripts" if self.tridis_available else "General Latin text"),
            "medieval_features_detected": bool(medieval_symbols or expansions_found or correction_markers),
            "tridis_used": getattr(self, 'active_model', '') == 'tridis_HTR',
            "manuscript_period": "Classical/Roman Monumental" if getattr(self, 'active_style', '') == 'printed' else ("Late Medieval (13th-16th centuries)" if (medieval_symbols or expansions_found) else "Classical/Modern"),
            "text_type": "classical_inscription" if getattr(self, 'active_style', '') == 'printed' else self._determine_text_type(latin_text),
            "abbreviations_expanded": len(expansions_found) > 0,
            "named_entities_detected": len(unique_entities) > 0,
            "scribal_corrections_found": correction_markers > 0,
            "confidence_level": self._determine_confidence_level(latin_text),
            "writing_style": getattr(self, 'active_style', 'cursive')
        }
        
        return {
            "text": latin_text,
            "symbols": symbols,
            "char_analysis": char_analysis,
            "validation": validation
        }

    def _calculate_comprehensive_quality_score(self, text):
        """Calculate comprehensive quality score with medieval bonuses"""
        if not text:
            return 0.0
        
        score = 0.0
        words = text.split()
        
        # Base metrics
        score += min(len(words) / 15.0, 0.2)  # Length bonus (max 0.2)
        
        # Latin character ratio
        latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
        score += (latin_chars / max(1, len(text))) * 0.25
        
        # TRIDIS Medieval bonuses (only if TRIDIS was used)
        if self.tridis_available and getattr(self, 'active_model', '') == 'tridis_HTR':
            # Expanded abbreviations (major quality indicator)
            medieval_expansions = ['domini', 'facimus', 'pro', 'quod', 'enim', 'cum', 'per', 'sunt']
            expansion_count = sum(1 for exp in medieval_expansions if exp in text.lower())
            score += min(expansion_count * 0.05, 0.2)  # Max 0.2 bonus
            
            # Named entity capitalization (TRIDIS feature)
            import re
            caps_count = len(re.findall(r'\b[A-Z][a-z]+', text))
            score += min(caps_count * 0.02, 0.15)  # Max 0.15 bonus
            
            # Correction markers (authenticity indicator)
            corrections = text.count('[') + text.count('$')
            score += min(corrections * 0.03, 0.1)  # Max 0.1 bonus
        
        # Medieval symbols (regardless of OCR method)
        medieval_symbols = ['꜠', '꜡', 'ꜣ', 'ꜥ', 'ꝁ', 'ꝑ', 'ꝛ', 'ꞁ', 'ꞃ', '℞', '℟', '℣', '†', '‡', '¶', '§']
        symbol_count = sum(1 for symbol in medieval_symbols if symbol in text)
        score += min(symbol_count * 0.04, 0.15)  # Max 0.15 bonus

        
        # Word formation
        if len(words) > 1:
            score += 0.1
        
        # Common Latin words
        common_latin = ['et', 'in', 'de', 'ad', 'cum', 'pro', 'per', 'ex', 'ab']
        latin_matches = sum(1 for word in words if word.lower() in common_latin)
        score += min(latin_matches * 0.02, 0.1)
        
        return max(0.0, min(1.0, score))

    def _determine_text_type(self, text):
        """Determine the type of Latin text based on features"""
        if not text:
            return "unknown"
        
        # Medieval indicators
        medieval_expansions = ['domini', 'facimus', 'quod', 'enim']
        has_expansions = any(exp in text.lower() for exp in medieval_expansions)
        has_corrections = '[' in text or '$' in text
        has_medieval_symbols = any(c in text for c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§")
        
        if has_expansions and has_corrections:
            return "medieval_documentary_manuscript"
        elif has_expansions or has_medieval_symbols:
            return "medieval_manuscript"  
        elif has_corrections:
            return "manuscript_with_corrections"
        else:
            return "classical_latin_text"

    def _determine_confidence_level(self, text):
        """Determine confidence level based on text characteristics"""
        score = self._calculate_comprehensive_quality_score(text)
        
        if score >= 0.8:
            return "Very High"
        elif score >= 0.6:
            return "High"
        elif score >= 0.4:
            return "Medium"
        elif score >= 0.2:
            return "Low"
        else:
            return "Very Low"

    def generate_historical_context(self, processed_result):
        """Generate comprehensive historical context for Latin text"""
        latin_text = processed_result.get("text", "")
        
        groq_detail = self._generate_groq_context(latin_text)
        
        # Build references using words/symbols in Latin text
        words = re.findall(r'\w+', latin_text) if latin_text else []
        query_terms = list(words)
        if latin_text:
            query_terms.extend([char for char in latin_text if char.strip()])
        refs = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)
        
        return {
            "uses_box": {
                "title": "Medieval Latin manuscript analysis",
                "items": self._build_uses_list(latin_text)
            },
            "meaning_box": self._build_enhanced_meaning_box(latin_text, groq_detail, processed_result),
            "references": refs
        }

    def _generate_groq_context(self, latin_text):
        """Generate contextual information using Groq with medieval awareness"""
        if not self.groq_client.is_available():
            return "(Groq unavailable) Historical context generation requires GROQ_API_KEY and groq package."
        
        # Analyze medieval features for context
        has_expansions = any(word in latin_text.lower() for word in ['domini', 'facimus', 'quod', 'enim'])
        has_corrections = '[' in latin_text or '$' in latin_text
        has_caps = any(c.isupper() for c in latin_text)
        
        if is_gibberish(latin_text):
            prompt = (
                "The following sequence appears to be fragmentary medieval Latin text, possibly with scribal abbreviations or corrections. "
                "Provide a concise, scholarly paragraph (6-10 sentences) covering possible meanings, historical context of medieval Latin manuscripts, "
                "common abbreviation practices, and typical documentary uses in 13th-16th century Europe."
            )
        else:
            context_note = ""
            if has_expansions:
                context_note += "The text contains expanded medieval abbreviations. "
            if has_corrections:
                context_note += "Scribal corrections or cancellations are present. "
            if has_caps:
                context_note += "Named entities appear to be properly capitalized. "
                
            prompt = (
                f"Analyze this medieval Latin text: {latin_text}\n\n"
                f"Context: {context_note}This appears to be from a medieval manuscript (13th-16th centuries). "
                f"Provide a scholarly paragraph (6-10 sentences) on its historical significance, cultural context, "
                f"likely documentary purpose, and interpretations. Focus on medieval manuscript practices, "
                f"legal/administrative contexts, and paleographic significance."
            )
        
        system_prompt = "You are a medieval Latin paleography specialist and historian. Provide accurate, concise scholarly analysis focusing on manuscript traditions, abbreviation practices, and documentary contexts of the late medieval period."
        enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, latin_text)
        
        return self.groq_client.generate_response(
            system_prompt=enriched_system_prompt,
            user_prompt=prompt
        ) or "(Historical context unavailable due to Groq error)"

    def _build_uses_list(self, latin_text):
        """Build enhanced list of character uses with TRIDIS context"""
        notes = self.references.get("latin_symbol_notes", {}) or {}
        default_hint = self.references.get("latin_hint", 
            "Letters and symbols reflect phonetic values and scribal practices in medieval manuscripts.")
        
        seen = set()
        items = []
        
        # Add TRIDIS-specific information for medieval features
        tridis_notes = {
            '[': "Editorial bracket indicating scribal correction or cancellation (TRIDIS transcription standard)",
            '$': "Cancellation marker for struck-through text (TRIDIS notation)",
        }
        
        for ch in latin_text:
            if ch in seen or not ch.strip():
                continue
            seen.add(ch)
            
            # Check TRIDIS-specific notes first
            if ch in tridis_notes:
                note = tridis_notes[ch]
            elif ch in notes:
                note = notes[ch]
            else:
                note = default_hint
                
            items.append(f"- {ch}: {note}")
        
        if not items:
            items.append("- —: " + default_hint)
        
        # Limit to prevent overwhelming output
        return items[:20]

    def _build_enhanced_meaning_box(self, latin_text, groq_detail, processed_result):
        """Build comprehensive meaning box with TRIDIS medieval analysis"""
        char_analysis = processed_result.get("char_analysis", {})
        validation = processed_result.get("validation", {})
        
        # Enhanced introduction with TRIDIS context
        processing_method = validation.get("ocr_method", "Unknown OCR")
        text_type = validation.get("text_type", "unknown")
        confidence = validation.get("confidence_level", "Unknown")
        
        intro_lines = [
            f"Text processed using {processing_method} with confidence level: {confidence}.",
        ]
        
        if self.tridis_available:
            intro_lines.extend([
                "TRIDIS HTR model trained on 245,000 lines of medieval manuscripts (13th-16th centuries).",
                "Specializes in Latin, Old French, Old Spanish documentary texts with automatic abbreviation expansion."
            ])
        
        # Medieval features summary
        medieval_features = []
        expansion_count = char_analysis.get("expansion_count", 0)
        if expansion_count > 0:
            medieval_features.append(f"{expansion_count} abbreviation expansions")
        
        correction_count = char_analysis.get("correction_markers", 0)
        if correction_count > 0:
            medieval_features.append(f"{correction_count} scribal corrections")
        
        entity_count = char_analysis.get("entity_count", 0)
        if entity_count > 0:
            medieval_features.append(f"{entity_count} named entities")
        
        if medieval_features:
            intro_lines.append(f"Medieval features detected: {', '.join(medieval_features)}.")
        
        # Key terms for frequent list
        expansions = char_analysis.get("abbreviation_expansions", [])
        entities = char_analysis.get("capitalized_entities", [])
        frequent_terms = expansions + entities
        
        if not frequent_terms:
            frequent_terms = list(set(w for w in latin_text.split() if len(w) > 2))[:10]
        
        # Enhanced analysis points
        points = []
        
        if self.tridis_available:
            points.extend([
                "• TRIDIS HTR provides semi-diplomatic transcription following scholarly editorial standards.",
                "• Automatic abbreviation expansion: dom̃→domini, facimꝰ→facimus, ꝓ→pro, ⁊→et.",
                "• Named entity capitalization and punctuation normalization applied."
            ])
        else:
            points.append("• Tesseract OCR provides basic Latin character recognition with limited medieval symbol support.")
        
        if correction_count > 0:
            points.append(f"• [{correction_count}] scribal corrections/cancellations indicate active manuscript editing process.")
        
        if expansion_count > 0:
            expansions_list = ", ".join(char_analysis.get("abbreviation_expansions", [])[:5])
            points.append(f"• Expanded abbreviations suggest legal/administrative document: {expansions_list}.")
        
        if validation.get("medieval_features_detected", False):
            manuscript_period = validation.get("manuscript_period", "Medieval")
            points.append(f"• {manuscript_period} characteristics indicate documentary manuscript tradition.")
        
        if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
            points.append(f"• Historical analysis: {groq_detail.strip()}")
        
        return {
            "title": "Medieval Latin manuscript analysis:",
            "intro_lines": intro_lines,
            "frequent_label": "Key medieval terms identified",
            "frequent": frequent_terms[:12],
            "points": points
        }

    def generate_story(self, processed_result):
        """Generate creative story with medieval manuscript context"""
        latin_text = processed_result.get("text", "")
        
        if not self.groq_client.is_available():
            return "Groq client unavailable, cannot generate historical narrative."
        
        # Analyze text features for story context
        char_analysis = processed_result.get("char_analysis", {})
        validation = processed_result.get("validation", {})
        
        has_expansions = char_analysis.get("expansion_count", 0) > 0
        has_corrections = char_analysis.get("correction_markers", 0) > 0
        has_entities = char_analysis.get("entity_count", 0) > 0
        text_type = validation.get("text_type", "unknown")
        used_tridis = validation.get("tridis_used", False)
        
        # Choose appropriate narrative style based on detected features
        if "documentary" in text_type or has_expansions:
            styles = [
                "as a legal charter discovered in monastic archives",
                "as an administrative record from a medieval royal court",
                "as a property deed found in cathedral scriptorium",
                "as a guild register from a medieval trading city",
                "as a tax record from a 14th-century monastery"
            ]
        elif has_corrections or has_entities:
            styles = [
                "as a monk's working manuscript with personal annotations", 
                "as a scholar's commentary on ancient texts",
                "as a chronicle being revised by a medieval historian",
                "as a theological treatise with scribal corrections",
                "as a copy of classical texts with medieval glosses"
            ]
        else:
            styles = [
                "as a sacred text illuminated by medieval scribes",
                "as a philosophical work from a cathedral school",
                "as a liturgical manuscript from a monastic library",
                "as a medical treatise translated in medieval Spain",
                "as an astronomical text from a medieval university"
            ]
        
        import random
        chosen_style = random.choice(styles)
        seed = random.randint(1000, 9999)
        
        # Craft historically-informed prompt
        processing_context = "deciphered using advanced medieval manuscript AI" if used_tridis else "carefully transcribed from the original"
        time_period = "13th-16th centuries" if (has_expansions or has_corrections) else "medieval period"
        
        prompt = (
            f"This Latin manuscript text was {processing_context}: {latin_text}\n\n"
            f"Historical context: The text appears to be from the {time_period}, "
            f"{'with expanded abbreviations and scribal corrections typical of documentary manuscripts' if has_expansions else 'showing characteristics of medieval scholarly tradition'}.\n\n"
            f"Create a vivid, historically accurate narrative (250+ words) set in medieval Europe, "
            f"telling the story of this manuscript's creation and significance. "
            f"Write {chosen_style}.\n\n"
            f"Include: Medieval setting, authentic historical details, multiple characters, "
            f"the process of manuscript creation, and the document's importance to its community.\n"
            f"Narrative seed: {seed}"
        )
        
        system_prompt = (
            "You are a medieval historian and storyteller specializing in manuscript culture, "
            "paleography, and daily life in 13th-16th century Europe. Create authentic, "
            "engaging narratives that reflect accurate historical knowledge of medieval "
            "scriptoriums, legal practices, and scholarly traditions."
        )
        
        story = self.groq_client.generate_response(
            system_prompt=system_prompt,
            user_prompt=prompt
        )
        
        if not story or is_gibberish(story):
            return "Failed to generate historical narrative; medieval story creation unavailable."
        
        return story