Spaces:

Akshay30
/

decipherai-api

Sleeping

App Files Files Community

Akshay30 commited on 3 days ago

Commit

2f4af3f

0 Parent(s):

Initial DecipherAI backend deployment

Browse files

Files changed (30) hide show

.gitattributes +35 -0
Dockerfile +49 -0
README.md +11 -0
app.py +473 -0
config.py +48 -0
decipherai-api +1 -0
models/clip_classifier.py +129 -0
models/groq_client.py +51 -0
models/huggingface_models.py +62 -0
models/tesseract_ocr.py +7 -0
processors/__init__.py +0 -0
processors/base_processor.py +71 -0
processors/cuneiform_processor.py +804 -0
processors/egyptian_processor.py +390 -0
processors/greek_processor.py +774 -0
processors/latin_processor.py +1281 -0
references.json +149 -0
requirements.txt +30 -0
services/__init__.py +0 -0
services/context_generator.py +0 -0
services/groq_vision_classifier.py +254 -0
services/layout_parser.py +244 -0
services/rag_service.py +186 -0
services/script_detector.py +223 -0
services/story_generator.py +0 -0
utils/__init__.py +0 -0
utils/gpu_diagnostics.py +137 -0
utils/image_utils.py +121 -0
utils/text_utils.py +82 -0
utils/validation.py +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,49 @@

+# DecipherAI Backend — Hugging Face Spaces Docker Configuration
+# Space SDK: Docker
+# Port: 7860
+FROM python:3.11-slim
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    tesseract-ocr \
+    wget \
+    libgl1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+# Download Ancient Greek Tesseract model
+RUN mkdir -p /usr/share/tesseract-ocr/5/tessdata && \
+    wget -q \
+    https://github.com/tesseract-ocr/tessdata/raw/main/grc.traineddata \
+    -O /usr/share/tesseract-ocr/5/tessdata/grc.traineddata
+# Create non-root user (HF Spaces recommendation)
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user
+ENV PATH=/home/user/.local/bin:$PATH
+WORKDIR /home/user/app
+# Install Python dependencies
+COPY --chown=user:user requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+# Copy application
+COPY --chown=user:user . .
+# Hugging Face Space port
+EXPOSE 7860
+# Production server
+# NOTE: --preload is intentionally not used. app.py starts model loading on a
+# background thread at import time, and threads are not inherited across
+# fork(); the app must therefore be imported inside the worker process so the
+# loader thread (and the models it assigns) live in the process that serves
+# requests.
+CMD ["gunicorn", \
+     "--bind", "0.0.0.0:7860", \
+     "--workers", "1", \
+     "--timeout", "300", \
+     "app:app"]

README.md ADDED Viewed

	@@ -0,0 +1,11 @@

+---
+title: Decipherai Api
+emoji: 🔥
+colorFrom: indigo
+colorTo: blue
+sdk: docker
+pinned: false
+short_description: Ancient script analysis, OCR, translation and historical int
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,473 @@

+import os
+import sys
+import tempfile
+from dotenv import load_dotenv
+# Safe stdout/stderr wrapper to prevent OSError: [Errno 22] when stdout pipe is closed/unbuffered in background
+class SafeStream:
+    def __init__(self, original_stream):
+        self.original_stream = original_stream
+    def write(self, data):
+        try:
+            if self.original_stream:
+                self.original_stream.write(data)
+        except OSError as e:
+            if e.errno != 22:
+                raise
+    def flush(self):
+        try:
+            if self.original_stream:
+                self.original_stream.flush()
+        except OSError:
+            pass
+    def __getattr__(self, attr):
+        return getattr(self.original_stream, attr)
+sys.stdout = SafeStream(sys.stdout)
+sys.stderr = SafeStream(sys.stderr)
+# Load .env variables (including HF_HOME and GROQ_API_KEY) before imports
+load_dotenv()
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+# Import modular components
+from config import Config
+from models.groq_client import GroqClient
+from models.clip_classifier import CLIPClassifier
+from models.tesseract_ocr import TesseractOCR
+from models.huggingface_models import HuggingFaceModels
+from services.groq_vision_classifier import GroqVisionScriptClassifier
+from services.script_detector import ScriptDetectionService
+from utils.image_utils import validate_image
+from utils.text_utils import clean_text
+from processors.cuneiform_processor import CuneiformProcessor
+from utils.gpu_diagnostics import log_gpu_info
+# Initialize Flask app
+app = Flask(__name__)
+# CORS — restrict origins in production via ALLOWED_ORIGINS env var
+# Example: ALLOWED_ORIGINS=https://your-frontend.vercel.app,https://custom-domain.com
+allowed_origins = os.getenv(
+    "ALLOWED_ORIGINS",
+    "http://localhost:3000,http://localhost:5173,http://localhost:5000"
+)
+CORS(app, origins=allowed_origins.split(","))
+# Global components
+import threading
+config = Config()
+# The handles below stay None until initialize_models_async() assigns them.
+groq_client = None
+clip_classifier = None
+hf_models = None
+script_detector = None
+cuneiform_processor = None
+# Filled by load_references()
+references = {}
+# Live model preloading status tracking.
+# "status" is the overall state ("loading", then "ready" or "failed"); every
+# other key holds one component's own state and starts at "pending".
+model_status = {
+    "status": "loading",
+    "groq": "pending",
+    "clip": "pending",
+    "translator": "pending",
+    "cuneiform": "pending",
+    "script_detector": "pending"
+}
+def load_references():
+    """Load references from JSON file"""
+    global references
+    try:
+        import json
+        with open(config.REFERENCES_PATH, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        references = {
+            "egypt_symbol_notes": data.get("egypt_symbol_notes", {}),
+            "greek_symbol_notes": data.get("greek_symbol_notes", {}),
+            "greek_hint": data.get("greek_hint", "If no specific character note is found, treat as lexical marker considering diacriticals (breathing marks, accents, vowel quantity) which affect pronunciation, meaning, and grammatical function in ancient Greek texts."),
+            "latin_symbol_notes": data.get("latin_symbol_notes", {}),
+            "latin_hint": data.get("latin_hint", "If no specific character note is found, consider standard Latin letters or medieval scribal abbreviations."),
+            # Cuneiform references
+            "cuneiform_symbol_notes": data.get("cuneiform_symbol_notes", {}),
+            "cuneiform_hint": data.get("cuneiform_hint", "Cuneiform signs represent syllables, words, or concepts in ancient Mesopotamian languages (Sumerian, Akkadian, etc.)")
+        }
+        print(f"[INFO] Loaded references from {config.REFERENCES_PATH}")
+    except Exception as e:
+        print(f"[WARN] Failed to load references: {e}")
+        references = {
+            "egypt_symbol_notes": {},
+            "greek_symbol_notes": {},
+            "greek_hint": "Possible Greek lexical marker.",
+            "latin_symbol_notes": {},
+            "latin_hint": "Latin scribal practice.",
+            "cuneiform_symbol_notes": {},
+            "cuneiform_hint": "Ancient cuneiform sign."
+        }
+def initialize_models_async():
+    """Load every model in sequence and publish progress in ``model_status``.
+
+    Intended to run on the background thread started by initialize_models().
+    Assigns the module-level component handles as each one is built. Any
+    exception that escapes a step sets the overall status to "failed"; only a
+    cuneiform-processor failure is contained and recorded on its own key.
+    """
+    global groq_client, clip_classifier, hf_models, script_detector, cuneiform_processor, model_status
+    try:
+        print("[INFO] Background model preloading thread started...")
+        # Log GPU Diagnostics
+        log_gpu_info()
+        # Load references first (later components receive the references dict)
+        load_references()
+        # Groq
+        model_status["groq"] = "loading"
+        groq_client = GroqClient()
+        model_status["groq"] = "ready" if groq_client.is_available() else "unavailable"
+        print(f"[INFO] Groq client initialization complete: {model_status['groq']}")
+        # CLIP
+        model_status["clip"] = "loading"
+        clip_classifier = CLIPClassifier()
+        model_status["clip"] = "ready" if (clip_classifier and clip_classifier.pipeline is not None) else "failed"
+        print(f"[INFO] CLIP classifier initialization complete: {model_status['clip']}")
+        # HF Translator
+        model_status["translator"] = "loading"
+        hf_models = HuggingFaceModels()
+        model_status["translator"] = "ready" if (hf_models and hf_models.get_translator() is not None) else "failed"
+        print(f"[INFO] Hugging Face models initialization complete: {model_status['translator']}")
+        # Cuneiform Processor (optional: a failure here does not abort the rest)
+        model_status["cuneiform"] = "loading"
+        try:
+            print("[INFO] Initializing cuneiform processor...")
+            cuneiform_processor = CuneiformProcessor(
+                groq_client=groq_client,
+                references=references,
+                clip_classifier=clip_classifier
+            )
+            model_status["cuneiform"] = "ready" if cuneiform_processor.cuneiform_available else "unavailable"
+        except Exception as e:
+            print(f"[ERROR] Failed to initialize cuneiform processor: {e}")
+            model_status["cuneiform"] = "failed"
+            cuneiform_processor = None
+        print(f"[INFO] Cuneiform processor initialization complete: {model_status['cuneiform']}")
+        # Script Detection Service (wired to every component built above)
+        model_status["script_detector"] = "loading"
+        script_detector = ScriptDetectionService(
+            groq_client=groq_client,
+            references=references,
+            clip_classifier=clip_classifier,
+            translator_pipe=hf_models.get_translator(),
+            cuneiform_processor=cuneiform_processor
+        )
+        model_status["script_detector"] = "ready"
+        print(f"[INFO] Script detection service initialization complete: {model_status['script_detector']}")
+        # NOTE(review): the overall status becomes "ready" even when individual
+        # components above reported "failed"/"unavailable" — confirm intended.
+        model_status["status"] = "ready"
+        print("[SUCCESS] All models initialized successfully in the background")
+    except Exception as e:
+        model_status["status"] = "failed"
+        print(f"[ERROR] Critical failure in background model initialization: {e}")
+def initialize_models():
+    """Spawn background thread to load models"""
+    print("[INFO] Spawning background thread for model initialization...")
+    model_status["status"] = "loading"
+    threading.Thread(target=initialize_models_async, daemon=True).start()
+@app.route('/analyze', methods=['POST'])
+def analyze():
+    """Main analysis endpoint with Groq Vision classification"""
+    tmp_path = None
+    try:
+        # Check if models are fully loaded
+        if model_status["status"] != "ready":
+            return jsonify({
+                "error": "Models are still loading in the background. Please try again in a few moments.",
+                "status": "loading",
+                "models_status": model_status
+            }), 503
+        # Validate request
+        if 'image' not in request.files:
+            return jsonify({"error": "No image uploaded"}), 400
+        img_file = request.files['image']
+        if img_file.filename == '':
+            return jsonify({"error": "Empty filename"}), 400
+        # Validate image file
+        try:
+            validate_image(img_file)
+        except ValueError as e:
+            return jsonify({"error": str(e)}), 400
+        # Save temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
+            tmp_path = tmp.name
+            img_file.save(tmp_path)
+        # Process image with Groq Vision classification
+        result = script_detector.detect_and_process(tmp_path)
+        if not result:
+            return jsonify({"error": "Could not process image"}), 500
+        # Get Vision classification info
+        vision_classification = result.get('vision_classification', 'unknown')
+        classification_method = result.get('classification_method', 'unknown')
+        classification_confidence = result.get('classification_confidence', 0.0)
+        script_type = result.get('script_type', 'egyptian')
+        # Base response with Vision classification info
+        base_response = {
+            "script_type": script_type,
+            "vision_classification": vision_classification,
+            "classification_method": classification_method,
+            "classification_confidence": classification_confidence,
+            "confidence": result.get('confidence', 0.0),
+            "historical_context": result.get('historical_context', {}),
+            "creative_story": result.get('creative_story', ''),
+            "model_used": "llama-3.2-90b-vision-preview"
+        }
+        # Handle cuneiform processing
+        if script_type == 'cuneiform':
+            if not cuneiform_processor or not cuneiform_processor.cuneiform_available:
+                return jsonify({
+                    **base_response,
+                    "error": "Cuneiform processing unavailable",
+                    "labels": [],
+                    "gardiner_codes": [],
+                    "translation": "Cuneiform translation model not available",
+                    "translation_ok": False
+                }), 200
+            try:
+                # Process cuneiform text
+                processed_result = result.get('processed_result', {})
+                cuneiform_text = processed_result.get('text', '')
+                # Translate cuneiform to English
+                translation = ""
+                translation_ok = False
+                if cuneiform_text and len(cuneiform_text.strip()) > 2:
+                    print(f"[INFO] Translating cuneiform: {cuneiform_text[:50]}...")
+                    translation = cuneiform_processor.translate_cuneiform(cuneiform_text)
+                    translation_ok = bool(translation and not translation.startswith("Error"))
+                else:
+                    translation = "No readable cuneiform text extracted"
+                # Build cuneiform response
+                response_data = {
+                    **base_response,
+                    "labels": [],
+                    "gardiner_codes": [],
+                    "translation": translation,
+                    "translation_ok": translation_ok,
+                    "cuneiform_text": cuneiform_text,
+                    "validation": {
+                        "quality_score": processed_result.get('validation', {}).get('quality_score', 0.0),
+                        "cuneiform_ratio": processed_result.get('validation', {}).get('cuneiform_ratio', 0.0),
+                        "atf_ratio": processed_result.get('validation', {}).get('atf_ratio', 0.0),
+                        "char_analysis": processed_result.get('char_analysis', {}),
+                        "ocr_method": "praeclarum/cuneiform (T5-based translation)",
+                        "supports_translation": True,
+                        "input_format": processed_result.get('char_analysis', {}).get('text_format', 'Unknown')
+                    }
+                }
+                return jsonify(response_data)
+            except Exception as e:
+                print(f"[ERROR] Cuneiform processing failed: {e}")
+                return jsonify({
+                    **base_response,
+                    "error": f"Cuneiform processing error: {str(e)}",
+                    "labels": [],
+                    "gardiner_codes": [],
+                    "translation": "Cuneiform processing failed",
+                    "translation_ok": False
+                }), 200
+        elif script_type in ['greek', 'latin']:
+            processed_result = result.get('processed_result', {})
+            validation = processed_result.get('validation', {})
+            response_data = {
+                **base_response,
+                "labels": [],
+                "gardiner_codes": [],
+                "translation": processed_result.get('text', ''),
+                "translation_ok": True,
+            }
+            # Add enhanced validation info for Greek
+            if script_type == 'greek':
+                response_data["validation"] = {
+                    "quality_score": validation.get('quality_score', 0.0),
+                    "greek_ratio": validation.get('greek_ratio', 0.0),
+                    "has_polytonic": validation.get('has_polytonic', False),
+                    "char_analysis": processed_result.get('char_analysis', {}),
+                    "ocr_method": "ancient_greek_ocr" if validation.get('quality_score', 0) > 0.7 else "standard_greek_ocr"
+                }
+            elif script_type == 'latin':
+                response_data["validation"] = {
+                    "quality_score": validation.get('quality_score', 0.0),
+                    "latin_ratio": validation.get('latin_ratio', 0.0),
+                    "trocr_used": validation.get('tridis_used', False) or (validation.get('ocr_method') in ['trocr-base-latin', 'tridis_HTR']),
+                    "char_analysis": processed_result.get('char_analysis', {}),
+                    "ocr_method": validation.get('ocr_method', 'standard_latin_ocr'),
+                    "writing_style": validation.get('writing_style', 'cursive')
+                }
+            return jsonify(response_data)
+        else:  # Egyptian
+            processed = result['processed_result']
+            return jsonify({
+                **base_response,
+                "labels": processed['labels'],
+                "gardiner_codes": processed['codes'],
+                "translation": processed['translation'],
+                "translation_ok": processed['translation_ok']
+            })
+    except Exception as e:
+        print(f"[ERROR] Analysis failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return jsonify({"error": "Processing failed"}), 500
+    finally:
+        # Cleanup temporary file
+        if tmp_path:
+            try:
+                os.remove(tmp_path)
+            except Exception:
+                pass
+@app.route('/chat', methods=['POST'])
+def chat():
+    """Chatbot endpoint for manuscript queries and general dialogue"""
+    try:
+        data = request.get_json() or {}
+        message = data.get("message", "")
+        history = data.get("history", [])
+        context = data.get("context", "")
+        if not message:
+            return jsonify({"error": "Message is required"}), 400
+        system_prompt = (
+            "You are DecipherAI's helpful historical assistant. You are an expert paleographer and historian.\n"
+            "Answer the user's questions about ancient scripts, translations, and history in a helpful, "
+            "academic yet accessible manner. Cite historical sources when appropriate."
+        )
+        if context:
+            system_prompt += f"\n\nHere is the context of the current manuscript translation:\n{context}"
+        if not groq_client or not groq_client.is_available():
+            reply = (
+                f"Thank you for your question: '{message}'. I'm currently running in offline fallback mode "
+                f"because the Groq API key is not set. Once configured, I will be able to answer all your "
+                f"scholarly questions about the translated scripts, historical context, and paleography in real time!"
+            )
+        else:
+            prompt = ""
+            for turn in history[-5:]:
+                role = turn.get("role", "user")
+                content = turn.get("content", "")
+                prompt += f"{role.upper()}: {content}\n"
+            prompt += f"USER: {message}\nASSISTANT:"
+            reply = groq_client.generate_response(
+                system_prompt=system_prompt,
+                user_prompt=prompt,
+                max_tokens=500
+            ) or "I'm sorry, I encountered an error generating a response."
+        return jsonify({"reply": reply})
+    except Exception as e:
+        print(f"[ERROR] Chat failed: {e}")
+        return jsonify({"error": "Failed to process chat message"}), 500
+@app.route('/health', methods=['GET'])
+def health_check():
+    """Health check endpoint returning real-time load status"""
+    return jsonify({
+        "status": "healthy" if model_status["status"] == "ready" else "initializing",
+        "models_status": model_status
+    })
+@app.route('/info', methods=['GET'])
+def info():
+    """Information endpoint"""
+    return jsonify({
+        "app": "Ancient Script Recognition System",
+        "version": "2.1.0",
+        "supported_scripts": [
+            "Egyptian Hieroglyphs",
+            "Ancient Greek",
+            "Latin",
+            "Ancient Cuneiform"
+        ],
+        "features": [
+            "Multi-script detection",
+            "OCR text extraction",
+            "Historical context generation",
+            "Creative story generation",
+            "Cuneiform translation (Sumerian/Akkadian → English)"
+        ]
+    })
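+# --- Model initialization ---
+# When running under gunicorn (or any WSGI server), __name__ != "__main__",
+# so we initialize models at module level. NOTE: initialization runs on a
+# background thread, and threads are not inherited across fork(). With
+# gunicorn --preload this module is imported in the master process, so the
+# loader thread (and the models it assigns) never reaches the forked workers;
+# run gunicorn without --preload so each worker imports and initializes itself.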
+def _auto_initialize():
+    """Initialize models when running under a WSGI server (gunicorn, waitress, etc.)"""
+    if os.getenv("WERKZEUG_RUN_MAIN") == "true":
+        # Flask reloader child process — handled by __main__ block
+        return
+    print("[INIT] WSGI server detected — initializing models...")
+    initialize_models()
+# Entry point: direct execution starts Flask's built-in server; any other
+# import path (e.g. a WSGI server loading "app:app") goes through
+# _auto_initialize().
+if __name__ == "__main__":
+    print("[INIT] Starting Ancient Script Recognition System...")
+    # Server settings come from the PORT and DEBUG environment variables
+    port = int(os.getenv("PORT", 7860))
+    debug = os.getenv("DEBUG", "False").lower() == "true"
+    # Initialize all models (only in child process if debug mode is on to avoid duplicate threads)
+    if not debug or os.environ.get("WERKZEUG_RUN_MAIN") == "true":
+        initialize_models()
+    else:
+        print("[INFO] Reloader active. Model initialization deferred to child process.")
+    print(f"[INFO] Starting server on port {port}")
+    app.run(host="0.0.0.0", port=port, debug=debug)
+else:
+    # Running under gunicorn / WSGI
+    _auto_initialize()

config.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import os
+from pathlib import Path
+import torch
+class Config:
+    # Paths
+    BASE_DIR = Path(__file__).parent
+    TESSERACT_EXE = os.getenv("TESSERACT_EXE", "tesseract")
+    TESSDATA_PREFIX = os.getenv("TESSDATA_PREFIX")
+    REFERENCES_PATH = BASE_DIR / "references.json"
+    ANCIENT_GREEK_TESSDATA = BASE_DIR / "tessdata" / "ancient-greek"
+    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+    # Model Settings
+    HF_TRANSLATOR_MODEL = "AnushS/Hieroglyph-Translator-Using-Gardiner-Codes"
+    CLIP_MODEL = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
+    DEVICE = 0 if torch.cuda.is_available() else -1
+    # Groq Settings
+    GROQ_MODEL = "openai/gpt-oss-120b"
+    GROQ_TEMPERATURE = 1.0
+    GROQ_STORY_MAX_TOKENS = 1024
+    GROQ_CONTEXT_MAX_TOKENS = 2048
+    # File Upload Settings
+    MAX_FILE_SIZE = 16 * 1024 * 1024  # 16MB
+    ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'bmp'}
+    # Gardiner Code Mapping
+    GARDINER_MAP = {
+        "man_seated": "A1", "woman_seated": "B1", "god_figure": "C1",
+        "eye": "D4", "hippopotamus": "E25", "leg": "F28", "owl": "G17",
+        "feather": "H2", "lizard": "I1", "fish": "K1", "insect": "L1",
+        "reed": "M17", "sun": "N5", "crown": "S39", "bow": "T14",
+        "hoe": "U25", "rope": "V1", "jar": "W1", "bread": "X3", "scribe_tools": "Y5"
+    }
+    TESSERACT_CONFIGS = {
+        'ancient_greek': "--psm 6 --oem 1 -c preserve_interword_spaces=1",
+        'standard_greek': "--psm 6 --oem 1",
+        'fallback': "--psm 3 --oem 1"
+    }
+    @property
+    def CODE_TO_LABEL(self):
+        return {v: k for k, v in self.GARDINER_MAP.items()}

decipherai-api ADDED Viewed

	@@ -0,0 +1 @@


1	+ Subproject commit ed9e7fdd210f252a5309a7e6fc728a29fce274dd

models/clip_classifier.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import torch
+from transformers import CLIPProcessor, CLIPModel
+from PIL import Image
+import numpy as np
+from config import Config
+from utils.gpu_diagnostics import log_model_device
+class CLIPClassifier:
+    def __init__(self):
+        self.config = Config()
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = None
+        self.processor = None
+        # Load CLIP model and processor with fallback
+        model_name = getattr(self.config, 'CLIP_MODEL', 'openai/clip-vit-base-patch32')
+        try:
+            print(f"[INFO] Loading CLIP model: {model_name}...")
+            self.model = CLIPModel.from_pretrained(model_name)
+            self.processor = CLIPProcessor.from_pretrained(model_name)
+            self.model.to(self.device)
+            self.model.eval()  # Set model to evaluation mode
+            log_model_device("CLIP script classifier", self.device)
+            print(f"[INFO] CLIP model loaded on {self.device}")
+        except Exception as e:
+            print(f"[WARN] Failed to load CLIP model '{model_name}': {e}")
+            fallback_name = "openai/clip-vit-base-patch32"
+            try:
+                print(f"[INFO] Loading fallback CLIP model: {fallback_name}...")
+                self.model = CLIPModel.from_pretrained(fallback_name)
+                self.processor = CLIPProcessor.from_pretrained(fallback_name)
+                self.model.to(self.device)
+                self.model.eval()  # Set model to evaluation mode
+                log_model_device("CLIP script classifier (fallback)", self.device)
+                print(f"[INFO] Fallback CLIP model loaded on {self.device}")
+            except Exception as fe:
+                print(f"[ERROR] Failed to load fallback CLIP model: {fe}")
+    @property
+    def pipeline(self):
+        """Property checked in app.py/test.py to ensure model is initialized"""
+        return self.model if self.model is not None else None
+    def classify_script_type(self, image):
+        """Classify script type of image into one of the four supported categories"""
+        if not self.pipeline:
+            return "unknown", 0.0
+        try:
+            if isinstance(image, np.ndarray):
+                image = Image.fromarray(image)
+            # Prompts representing the four classes
+            scripts = ["egyptian", "greek", "latin", "cuneiform"]
+            descriptions = [
+                "ancient Egyptian hieroglyphic writing with drawings of animals and humans",
+                "ancient Greek alphabet script on papyrus or stone with polytonic symbols",
+                "medieval Latin manuscript text written in ink on parchment",
+                "ancient Mesopotamian cuneiform tablet with wedge-shaped markings in clay"
+            ]
+            inputs = self.processor(
+                text=descriptions,
+                images=image,
+                return_tensors="pt",
+                padding=True
+            ).to(self.device)
+            with torch.inference_mode():
+                outputs = self.model(**inputs)
+                logits_per_image = outputs.logits_per_image
+                probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]
+            best_idx = np.argmax(probs)
+            score = float(probs[best_idx])
+            script_label = scripts[best_idx]
+            print(f"[INFO] CLIP script classification: {script_label} ({score:.3f})")
+            return script_label, score
+        except Exception as e:
+            print(f"[ERROR] CLIP script classification failed: {e}")
+            return "unknown", 0.0
+    def classify_symbols(self, crops, candidate_labels):
+        """Classify segmented symbol image crops against candidate labels"""
+        if not self.pipeline or not crops or not candidate_labels:
+            return [None] * len(crops) if crops else []
+        try:
+            print(f"[INFO] Batch classifying {len(crops)} crops using CLIP...")
+            # Format candidate labels into descriptive prompts for better visual matching
+            prompts = [f"an ancient Egyptian hieroglyph symbol of a {label.replace('_', ' ')}" for label in candidate_labels]
+            # Tokenize prompts once
+            text_inputs = self.processor(
+                text=prompts,
+                return_tensors="pt",
+                padding=True
+            ).to(self.device)
+            with torch.inference_mode():
+                text_features = self.model.get_text_features(**text_inputs)
+                text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)
+            results = []
+            # Process crops (images)
+            for crop in crops:
+                if isinstance(crop, np.ndarray):
+                    crop = Image.fromarray(crop)
+                image_inputs = self.processor(images=crop, return_tensors="pt").to(self.device)
+                with torch.inference_mode():
+                    image_features = self.model.get_image_features(**image_inputs)
+                    image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
+                    # Compute cosine similarities
+                    similarities = (image_features @ text_features.T).squeeze(0)
+                    best_idx = torch.argmax(similarities).item()
+                results.append(candidate_labels[best_idx])
+            return results
+        except Exception as e:
+            print(f"[ERROR] CLIP symbol classification failed: {e}")
+            return [candidate_labels[0]] * len(crops) if candidate_labels else [None] * len(crops)

models/groq_client.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import os
+from config import Config
+class GroqClient:
+    def __init__(self):
+        self.config = Config()
+        self.api_key = self.config.GROQ_API_KEY or os.getenv("GROQ_API_KEY")
+        self.client = None
+        if self.api_key:
+            try:
+                from groq import Groq
+                self.client = Groq(api_key=self.api_key)
+                print("[INFO] Groq client initialized successfully")
+            except ImportError:
+                print("[WARN] groq package not installed. Run 'pip install groq'.")
+            except Exception as e:
+                print(f"[ERROR] Failed to initialize Groq client: {e}")
+        else:
+            print("[WARN] GROQ_API_KEY not found in configuration or environment.")
+    def is_available(self) -> bool:
+        """Check if Groq API client is available and configured"""
+        return self.client is not None
+    def generate_response(self, system_prompt: str, user_prompt: str, max_tokens: int = 1024) -> str:
+        """Generate response from Groq LLM"""
+        if not self.is_available():
+            print("[WARN] GroqClient not available for generating response.")
+            return ""
+        try:
+            # Use stable model name or configured fallback
+            model = self.config.GROQ_MODEL
+            # Common model fallbacks if config is generic or outdated
+            if model == "openai/gpt-oss-120b":
+                model = "llama-3.1-8b-instant"  # standard Groq model
+            completion = self.client.chat.completions.create(
+                model=model,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt}
+                ],
+                temperature=getattr(self.config, 'GROQ_TEMPERATURE', 0.7),
+                max_completion_tokens=max_tokens,
+            )
+            return completion.choices[0].message.content
+        except Exception as e:
+            print(f"[ERROR] Groq API call failed: {e}")
+            return ""

models/huggingface_models.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from config import Config
+from utils.gpu_diagnostics import log_model_device, register_processor, reclaim_vram_for
+class HuggingFaceModels:
+    def __init__(self):
+        self.config = Config()
+        self.device = torch.device("cpu")  # Force Egyptian translator to CPU to save GPU VRAM
+        self._tokenizer = None
+        self._model = None
+        self.translator = self._translate_fn
+        print("[INFO] Egyptian translator initialized (Forced to CPU)")
+    def setup_translation_model(self):
+        """Load T5 Seq2Seq model on CPU."""
+        model_name = getattr(self.config, 'HF_TRANSLATOR_MODEL', 'AnushS/Hieroglyph-Translator-Using-Gardiner-Codes')
+        try:
+            print(f"[INFO] Lazily loading Hugging Face translation model on CPU: {model_name}...")
+            self._tokenizer = AutoTokenizer.from_pretrained(model_name)
+            self._model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+            self._model.to(self.device)
+            self._model.eval()
+            log_model_device("Egyptian T5 Translator", self.device)
+            print("[INFO] Translation model loaded successfully on CPU (Seq2Seq direct)")
+        except Exception as e:
+            print(f"[ERROR] Failed to load translation model '{model_name}': {e}")
+            self.translator = self._get_mock_translator()
+    def _translate_fn(self, prompt, max_new_tokens=128, **kwargs):
+        """Translate using the T5 model directly on CPU."""
+        try:
+            if self._model is None:
+                self.setup_translation_model()
+            inputs = self._tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+            with torch.inference_mode():
+                outputs = self._model.generate(
+                    **inputs,
+                    max_new_tokens=max_new_tokens,
+                    num_beams=kwargs.get("num_beams", 4),
+                    do_sample=kwargs.get("do_sample", False),
+                )
+            decoded = self._tokenizer.decode(outputs[0], skip_special_tokens=True)
+            return [{"generated_text": decoded, "translation_text": decoded}]
+        except Exception as e:
+            print(f"[ERROR] Translation inference failed: {e}")
+            return [{"generated_text": "", "translation_text": ""}]
+    def get_translator(self):
+        """Return the loaded translation function or mock fallback"""
+        return self.translator
+    def _get_mock_translator(self):
+        """Returns a dummy translator function that mimics pipeline behavior on error"""
+        print("[INFO] Setting up mock fallback translator")
+        def mock_pipeline(prompt, *args, **kwargs):
+            return [{"generated_text": "", "translation_text": ""}]
+        return mock_pipeline

models/tesseract_ocr.py ADDED Viewed

	@@ -0,0 +1,7 @@

+class TesseractOCR:
+    """
+    Placeholder class to satisfy imports in app.py and test.py.
+    The processors themselves communicate directly with pytesseract.
+    """
+    def __init__(self):
+        pass

processors/__init__.py ADDED Viewed

File without changes

processors/base_processor.py ADDED Viewed

	@@ -0,0 +1,71 @@

+from abc import ABC, abstractmethod
+from PIL import Image
+class BaseScriptProcessor(ABC):
+    def __init__(self, groq_client, references, clip_classifier=None):  # Add clip_classifier parameter
+        self.groq_client = groq_client
+        self.references = references
+        self.clip_classifier = clip_classifier  # Store clip_classifier
+        from services.rag_service import RAGService
+        from services.layout_parser import LayoutParser
+        self.rag_service = RAGService()
+        self.layout_parser = LayoutParser()
+    @abstractmethod
+    def detect_script(self, image_path):
+        """Detect if image contains this script type"""
+        pass
+    @abstractmethod
+    def extract_text(self, image_path):
+        """Extract text/symbols from image"""
+        pass
+    @abstractmethod
+    def process_text(self, extracted_text):
+        """Process extracted text into meaningful output"""
+        pass
+    @abstractmethod
+    def generate_historical_context(self, processed_text):
+        """Generate historical context for the text"""
+        pass
+    @abstractmethod
+    def generate_story(self, processed_text):
+        """Generate creative story based on the text"""
+        pass
+    def process_image(self, image_path):
+        """Main processing pipeline"""
+        try:
+            # Step 1: Detect script
+            is_detected, confidence = self.detect_script(image_path)
+            if not is_detected:
+                return None
+            # Step 2: Extract text
+            extracted_text = self.extract_text(image_path)
+            if not extracted_text:
+                return None
+            # Step 3: Process text
+            processed_result = self.process_text(extracted_text)
+            # Step 4: Generate context and story
+            historical_context = self.generate_historical_context(processed_result)
+            creative_story = self.generate_story(processed_result)
+            return {
+                "script_type": self.__class__.__name__.replace("Processor", "").lower(),
+                "confidence": confidence,
+                "extracted_text": extracted_text,
+                "processed_result": processed_result,
+                "historical_context": historical_context,
+                "creative_story": creative_story
+            }
+        except Exception as e:
+            print(f"[ERROR] Processing failed in {self.__class__.__name__}: {e}")
+            return None

processors/cuneiform_processor.py ADDED Viewed

	@@ -0,0 +1,804 @@

+import os
+import cv2
+import numpy as np
+import re
+import time
+from PIL import Image, ImageEnhance, ImageFilter
+import torch
+from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForSeq2SeqLM
+from .base_processor import BaseScriptProcessor
+from utils.text_utils import is_gibberish
+# Absolute path of the "models" directory that sits one level above this file's directory.
+BACKEND_MODELS_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "models"))
+# Passed as cache_dir when the praeclarum/cuneiform tokenizer and model are loaded.
+CUNEIFORM_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "cuneiform")
+class CuneiformProcessor(BaseScriptProcessor):
+    def __init__(self, groq_client, references, clip_classifier):
+        super().__init__(groq_client, references, clip_classifier)
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.clip_model = None
+        self.clip_processor = None
+        self.clip_available = False
+        self.cuneiform_model = None
+        self.cuneiform_tokenizer = None
+        self.translator_available = False
+        # Register for dynamic VRAM management
+        from utils.gpu_diagnostics import register_processor
+        register_processor("cuneiform", self)
+    @property
+    def cuneiform_available(self):
+        """Property to match interface expected by ScriptDetectionService"""
+        # Always return True since we load lazily on demand
+        return True
+    def setup_cuneiform_clip(self):
+        """Load the CLIP model and processor and define the label prompts used for classification.
+        On success sets ``clip_processor``, ``clip_model``, ``cuneiform_signs``,
+        ``tablet_layouts`` and ``clip_available = True``. Any exception is
+        logged and leaves ``clip_available = False``; nothing is raised.
+        """
+        try:
+            # Ask the VRAM manager to make room before loading weights.
+            from utils.gpu_diagnostics import reclaim_vram_for
+            reclaim_vram_for("cuneiform")
+            print("[INFO] Lazily loading CLIP for cuneiform visual recognition...")
+            # Use a powerful CLIP model for better ancient script understanding
+            model_name = "openai/clip-vit-large-patch14"
+            self.clip_processor = CLIPProcessor.from_pretrained(model_name)
+            self.clip_model = CLIPModel.from_pretrained(model_name)
+            self.clip_model.to(self.device)
+            self.clip_model.eval()  # Put in evaluation mode
+            from utils.gpu_diagnostics import log_model_device
+            log_model_device("Cuneiform CLIP Recognition", self.device)
+            # Text prompts scored against the image in _analyze_cuneiform_with_clip;
+            # _convert_visual_to_atf later maps the winning prompts to ATF tokens.
+            self.cuneiform_signs = [
+                "ancient Sumerian cuneiform sign AN meaning god or heaven",
+                "ancient Akkadian cuneiform sign LUGAL meaning king or ruler",
+                "ancient cuneiform sign KI meaning earth or place",
+                "ancient cuneiform sign DINGIR divine determinative marker",
+                "ancient cuneiform sign UD meaning day or sun",
+                "ancient cuneiform sign E meaning house or temple",
+                "ancient cuneiform sign EN meaning lord or priest",
+                "ancient cuneiform sign NIN meaning lady or queen",
+                "ancient cuneiform administrative record with numbers",
+                "ancient cuneiform legal contract or treaty text",
+                "ancient cuneiform royal inscription or decree",
+                "ancient cuneiform literary or mythological text",
+                "ancient cuneiform school exercise or practice tablet"
+            ]
+            # Text prompts scored against the image in _describe_tablet_layout
+            self.tablet_layouts = [
+                "clay tablet with cuneiform text arranged in horizontal lines",
+                "cuneiform tablet with vertical column organization",
+                "administrative record tablet with numerical entries",
+                "legal document tablet with witness signatures",
+                "literary tablet with continuous narrative text",
+                "damaged or fragmentary cuneiform tablet",
+                "clear well-preserved cuneiform inscription",
+                "practice tablet with student exercises"
+            ]
+            print(f"[INFO] CLIP cuneiform recognition loaded on {self.device}")
+            print("[INFO] Using visual pattern recognition instead of character OCR")
+            # Set last, so the flag is only True when every step above succeeded.
+            self.clip_available = True
+        except Exception as e:
+            print(f"[ERROR] CLIP cuneiform setup failed: {e}")
+            self.clip_available = False
+    def setup_praeclarum_translator(self):
+        """Load the ``praeclarum/cuneiform`` tokenizer and Seq2Seq model onto ``self.device``.
+        Weights are cached under ``CUNEIFORM_MODEL_DIR``. On success sets
+        ``translator_available = True``; any exception is logged and leaves it
+        ``False``. Nothing is raised.
+        """
+        try:
+            # Ask the VRAM manager to make room before loading weights.
+            from utils.gpu_diagnostics import reclaim_vram_for
+            reclaim_vram_for("cuneiform")
+            print("[INFO] Lazily loading praeclarum cuneiform translation model...")
+            self.cuneiform_tokenizer = AutoTokenizer.from_pretrained(
+                "praeclarum/cuneiform",
+                cache_dir=CUNEIFORM_MODEL_DIR
+            )
+            self.cuneiform_model = AutoModelForSeq2SeqLM.from_pretrained(
+                "praeclarum/cuneiform",
+                cache_dir=CUNEIFORM_MODEL_DIR
+            )
+            self.cuneiform_model.to(self.device)
+            self.cuneiform_model.eval()  # Put in evaluation mode
+            from utils.gpu_diagnostics import log_model_device
+            log_model_device("Cuneiform Translator (Praeclarum T5)", self.device)
+            # Set only after tokenizer and model both loaded without error.
+            self.translator_available = True
+            print("[INFO] Cuneiform translator ready for CLIP-recognized content")
+        except Exception as e:
+            print(f"[ERROR] Translation model setup failed: {e}")
+            self.translator_available = False
+    def detect_script(self, image_path):
+        """Detection handled by enhanced CLIP classification"""
+        try:
+            if not self.clip_available:
+                print("[ERROR] No cuneiform processing engines available")
+                return False, 0.0
+            print(f"[INFO] Cuneiform processor activated - Using CLIP visual recognition")
+            return True, 0.95
+        except Exception as e:
+            print(f"[ERROR] Cuneiform detection failed: {e}")
+            return False, 0.0
+    def extract_text(self, image_path):
+        """Describe the tablet image with CLIP and return the result as a string.
+        Returns one of: the ATF-like description from
+        ``_analyze_cuneiform_with_clip`` (when its confidence exceeds 0.3), a
+        ``tablet_layout: ...`` string from ``_describe_tablet_layout``, or a
+        sentinel starting with ``CUNEIFORM_CLIP_FAILED``,
+        ``CUNEIFORM_VISUAL_ANALYSIS_INCOMPLETE`` or ``CUNEIFORM_ERROR:``.
+        """
+        # First use: load CLIP. Later uses: reclaim VRAM and, if the model is
+        # no longer on self.device, move it back.
+        if self.clip_model is None:
+            self.setup_cuneiform_clip()
+        else:
+            from utils.gpu_diagnostics import reclaim_vram_for
+            reclaim_vram_for("cuneiform")
+            if str(next(self.clip_model.parameters()).device) != str(self.device):
+                print(f"[VRAM MANAGER] Activating Cuneiform CLIP model on {self.device}...")
+                self.clip_model.to(self.device)
+        if not getattr(self, 'clip_available', False) or self.clip_model is None:
+            return "CUNEIFORM_CLIP_FAILED: Visual recognition model not available"
+        try:
+            start_time = time.time()
+            # Method 1: CLIP-based visual analysis
+            print("[INFO] Analyzing cuneiform using CLIP visual recognition...")
+            visual_analysis = self._analyze_cuneiform_with_clip(image_path)
+            if visual_analysis and visual_analysis['confidence'] > 0.3:
+                processing_time = time.time() - start_time
+                print(f"[SUCCESS] CLIP visual analysis completed in {processing_time:.2f}s")
+                return visual_analysis['description']
+            # Method 2: Fallback to basic tablet description
+            tablet_description = self._describe_tablet_layout(image_path)
+            if tablet_description:
+                return tablet_description
+            return "CUNEIFORM_VISUAL_ANALYSIS_INCOMPLETE: Clay tablet detected but content analysis requires higher resolution or clearer image"
+        except Exception as e:
+            print(f"[ERROR] CLIP cuneiform analysis failed: {e}")
+            return f"CUNEIFORM_ERROR: {str(e)}"
+    def _analyze_cuneiform_with_clip(self, image_path):
+        """Use CLIP to analyze cuneiform content visually"""
+        try:
+            image = Image.open(image_path).convert("RGB")
+            # Enhanced preprocessing for CLIP analysis
+            enhanced_image = self._preprocess_for_clip_analysis(image)
+            # CLIP classification of cuneiform content
+            print("[INFO] Running CLIP classification on cuneiform signs...")
+            inputs = self.clip_processor(
+                text=self.cuneiform_signs,
+                images=enhanced_image,
+                return_tensors="pt",
+                padding=True
+            ).to(self.device)
+            with torch.inference_mode():
+                outputs = self.clip_model(**inputs)
+                logits_per_image = outputs.logits_per_image
+                probs = logits_per_image.softmax(dim=1)
+            # Get top predictions
+            top_probs, top_indices = torch.topk(probs, k=3)
+            # Build description based on CLIP analysis
+            descriptions = []
+            confidences = []
+            for i, (prob, idx) in enumerate(zip(top_probs[0], top_indices[0])):
+                if prob > 0.2:  # Reasonable confidence threshold
+                    sign_desc = self.cuneiform_signs[idx]
+                    descriptions.append(sign_desc)
+                    confidences.append(prob.item())
+                    print(f"[INFO] CLIP detected: {sign_desc} (confidence: {prob:.3f})")
+            if descriptions:
+                # Convert visual analysis to ATF-like description
+                atf_description = self._convert_visual_to_atf(descriptions, confidences)
+                return {
+                    'description': atf_description,
+                    'confidence': max(confidences),
+                    'visual_elements': descriptions,
+                    'method': 'CLIP_visual_analysis'
+                }
+            return None
+        except Exception as e:
+            print(f"[ERROR] CLIP cuneiform analysis failed: {e}")
+            return None
+    def _preprocess_for_clip_analysis(self, image):
+        """Return a contrast-enhanced, sharpened copy of a PIL RGB image.
+        Applies CLAHE to the L channel in LAB space, then a 3x3 sharpening
+        kernel. If any step raises, the input image is returned unchanged.
+        """
+        try:
+            # Convert to numpy for OpenCV processing
+            image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+            # Enhance for CLIP visual understanding
+            # 1. Increase contrast to make wedges more visible
+            lab = cv2.cvtColor(image_cv, cv2.COLOR_BGR2LAB)
+            l_channel, a, b = cv2.split(lab)
+            # Apply CLAHE to lightness channel only, leaving the colour channels untouched
+            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
+            l_channel = clahe.apply(l_channel)
+            # Merge back
+            enhanced_lab = cv2.merge((l_channel, a, b))
+            enhanced_bgr = cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)
+            # 2. Sharpen edges to help CLIP see wedge boundaries
+            # Kernel weights sum to 1 (9 - 8), so overall brightness is preserved.
+            kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
+            sharpened = cv2.filter2D(enhanced_bgr, -1, kernel)
+            # Convert back to PIL RGB
+            enhanced_rgb = cv2.cvtColor(sharpened, cv2.COLOR_BGR2RGB)
+            return Image.fromarray(enhanced_rgb)
+        except Exception as e:
+            print(f"[WARN] CLIP preprocessing failed: {e}")
+            return image
+    def _convert_visual_to_atf(self, visual_descriptions, confidences):
+        """Convert CLIP visual analysis to ATF-like transliteration"""
+        try:
+            atf_elements = []
+            for desc, conf in zip(visual_descriptions, confidences):
+                desc_lower = desc.lower()
+                # Map visual descriptions to ATF transliterations
+                if 'lugal' in desc_lower or 'king' in desc_lower:
+                    atf_elements.append('lugal')
+                elif 'an' in desc_lower or 'god' in desc_lower or 'heaven' in desc_lower:
+                    atf_elements.append('an')
+                elif 'ki' in desc_lower or 'earth' in desc_lower or 'place' in desc_lower:
+                    atf_elements.append('ki')
+                elif 'dingir' in desc_lower or 'divine' in desc_lower:
+                    atf_elements.append('{d}')
+                elif 'ud' in desc_lower or 'day' in desc_lower or 'sun' in desc_lower:
+                    atf_elements.append('ud')
+                elif 'e' in desc_lower and ('house' in desc_lower or 'temple' in desc_lower):
+                    atf_elements.append('e2')
+                elif 'en' in desc_lower and 'lord' in desc_lower:
+                    atf_elements.append('en')
+                elif 'nin' in desc_lower and ('lady' in desc_lower or 'queen' in desc_lower):
+                    atf_elements.append('nin')
+                elif 'administrative' in desc_lower or 'numbers' in desc_lower:
+                    atf_elements.extend(['1(disz)', '2(disz)', 'sze'])
+                elif 'royal' in desc_lower or 'inscription' in desc_lower:
+                    atf_elements.extend(['lugal', 'kur', 'kur'])
+                elif 'legal' in desc_lower or 'contract' in desc_lower:
+                    atf_elements.extend(['kiszib3', 'mu', 'pad'])
+                elif 'literary' in desc_lower or 'mythological' in desc_lower:
+                    atf_elements.extend(['en', 'dingir', 'kur'])
+                elif 'school' in desc_lower or 'practice' in desc_lower:
+                    atf_elements.extend(['a', 'ba', 'ka', 'la'])
+            # Build coherent ATF string
+            if atf_elements:
+                # Add line structure typical of cuneiform tablets
+                atf_text = f"1. {' '.join(atf_elements[:3])}"
+                if len(atf_elements) > 3:
+                    atf_text += f"\n2. {' '.join(atf_elements[3:6])}"
+                if len(atf_elements) > 6:
+                    atf_text += f"\n3. {' '.join(atf_elements[6:])}"
+                return atf_text
+            else:
+                return "cuneiform tablet content analysis incomplete"
+        except Exception as e:
+            print(f"[ERROR] Visual to ATF conversion failed: {e}")
+            return "visual analysis available but ATF conversion failed"
+    def _describe_tablet_layout(self, image_path):
+        """Describe tablet layout and structure using CLIP"""
+        try:
+            image = Image.open(image_path).convert("RGB")
+            inputs = self.clip_processor(
+                text=self.tablet_layouts,
+                images=image,
+                return_tensors="pt",
+                padding=True
+            ).to(self.device)
+            with torch.inference_mode():
+                outputs = self.clip_model(**inputs)
+                probs = outputs.logits_per_image.softmax(dim=1)
+            # Get best layout description
+            best_idx = torch.argmax(probs)
+            best_desc = self.tablet_layouts[best_idx]
+            confidence = probs[0][best_idx].item()
+            print(f"[INFO] Tablet layout: {best_desc} (confidence: {confidence:.3f})")
+            if confidence > 0.4:
+                return f"tablet_layout: {best_desc}"
+            return "tablet_layout: unidentified cuneiform tablet structure"
+        except Exception as e:
+            print(f"[ERROR] Tablet layout analysis failed: {e}")
+            return "tablet_layout: analysis_failed"
+    def translate_cuneiform(self, cuneiform_text):
+        """Translate CLIP-analyzed cuneiform content using praeclarum model"""
+        if self.cuneiform_model is None:
+            self.setup_praeclarum_translator()
+        else:
+            from utils.gpu_diagnostics import reclaim_vram_for
+            reclaim_vram_for("cuneiform")
+            if str(next(self.cuneiform_model.parameters()).device) != str(self.device):
+                print(f"[VRAM MANAGER] Activating Cuneiform Translator model on {self.device}...")
+                self.cuneiform_model.to(self.device)
+        if not getattr(self, 'translator_available', False) or self.cuneiform_model is None:
+            return "Translation unavailable - praeclarum model not loaded"
+        # Handle CLIP analysis results
+        if cuneiform_text.startswith(("CUNEIFORM_CLIP_FAILED", "CUNEIFORM_ERROR:")):
+            return "Translation failed: Visual analysis could not identify cuneiform content"
+        if cuneiform_text.startswith("tablet_layout:"):
+            layout_desc = cuneiform_text.replace("tablet_layout: ", "")
+            return f"Visual analysis indicates: {layout_desc}. Specific text translation requires clearer wedge visibility."
+        try:
+            print(f"[INFO] Translating CLIP-analyzed content: {cuneiform_text[:50]}...")
+            # Use the praeclarum model for translation
+            inputs = self.cuneiform_tokenizer(
+                cuneiform_text,
+                return_tensors="pt",
+                max_length=512,
+                truncation=True
+            ).input_ids.to(self.device)
+            with torch.inference_mode():
+                outputs = self.cuneiform_model.generate(
+                    inputs,
+                    max_new_tokens=200,
+                    do_sample=True,
+                    top_k=30,
+                    top_p=0.95,
+                    temperature=0.7,
+                    pad_token_id=self.cuneiform_tokenizer.eos_token_id
+                )
+            translation = self.cuneiform_tokenizer.decode(
+                outputs[0],
+                skip_special_tokens=True
+            )
+            translation = self._post_process_translation(translation)
+            if translation.strip():
+                print(f"[INFO] CLIP+Translation completed: {translation[:100]}...")
+                return translation
+            else:
+                return "Visual analysis successful, but textual translation inconclusive. This may be a non-textual or damaged tablet section."
+        except Exception as e:
+            print(f"[ERROR] Translation of CLIP content failed: {e}")
+            return f"Visual analysis successful, translation error: {str(e)}"
+    def _post_process_translation(self, translation):
+        """Post-process cuneiform translation"""
+        try:
+            # Clean up common translation artifacts
+            cleaned = translation.strip()
+            # Check for dots-only output (failed translation)
+            if cleaned in ["", "...", ". . .", "... ... ..."] or cleaned.count('.') > len(cleaned) * 0.8:
+                print(f"[WARN] Translation appears to be dots/empty, marking as failed")
+                return ""
+            # Remove any input text that might have been echoed
+            if cleaned.startswith(('lugal', 'an ', 'ki ', 'dingir')):
+                lines = cleaned.split('\n')
+                for line in lines:
+                    if not any(line.lower().startswith(pattern) for pattern in ['lugal', 'an ', 'ki ']):
+                        if len(line.strip()) > 10:
+                            cleaned = line.strip()
+                            break
+            # Capitalize first letter
+            if cleaned and not cleaned[0].isupper():
+                cleaned = cleaned[0].upper() + cleaned[1:]
+            return cleaned
+        except Exception as e:
+            print(f"[WARN] Translation post-processing failed: {e}")
+            return translation
+    def process_text(self, cuneiform_text):
+        """Process extracted cuneiform text with comprehensive CLIP-aware analysis"""
+        if not cuneiform_text:
+            return {"text": "", "symbols": [], "char_analysis": {}, "validation": {}}
+        print("[INFO] Processing cuneiform text with CLIP visual analysis...")
+        # Handle error messages
+        if cuneiform_text.startswith(("CUNEIFORM_CLIP_FAILED", "CUNEIFORM_ERROR:", "CUNEIFORM_VISUAL_ANALYSIS_INCOMPLETE")):
+            return {
+                "text": cuneiform_text,
+                "symbols": [],
+                "char_analysis": {
+                    "total_chars": 0,
+                    "error": "CLIP visual analysis failed",
+                    "text_format": "Error"
+                },
+                "validation": {
+                    "quality_score": 0.0,
+                    "confidence_level": "Failed",
+                    "ocr_method": "CLIP Visual Recognition (Failed)",
+                    "error": cuneiform_text
+                }
+            }
+        # Extract symbols for visual analysis
+        if cuneiform_text.startswith("tablet_layout:"):
+            # Layout analysis
+            symbols = ""
+            char_analysis = {
+                "total_chars": len(cuneiform_text),
+                "layout_analysis": True,
+                "text_format": "Layout Description"
+            }
+        else:
+            # ATF or visual analysis content
+            symbols = ''.join(filter(lambda x: x.isalnum() or x in "{}[]().-", cuneiform_text))
+            char_analysis = {
+                "total_chars": len(cuneiform_text),
+                "atf_elements": len(cuneiform_text.split()),
+                "unique_chars": len(set(cuneiform_text)),
+                "word_count": len(cuneiform_text.split()),
+                "text_format": "CLIP Visual Analysis + ATF"
+            }
+        # Enhanced validation with CLIP-specific metrics
+        validation = {
+            "quality_score": self._calculate_clip_quality_score(cuneiform_text),
+            "recognition_method": "CLIP Visual Pattern Recognition",
+            "model_specialization": "Large-scale Vision Transformer for Ancient Scripts",
+            "clip_analysis": True,
+            "supports_translation": self.translator_available,
+            "input_format": char_analysis.get("text_format", "Unknown"),
+            "confidence_level": self._determine_confidence_level(cuneiform_text)
+        }
+        return {
+            "text": cuneiform_text,
+            "symbols": symbols,
+            "char_analysis": char_analysis,
+            "validation": validation
+        }
+    def _calculate_clip_quality_score(self, text):
+        """Calculate quality score for CLIP-analyzed text"""
+        if not text:
+            return 0.0
+        score = 0.0
+        # Layout analysis bonus
+        if text.startswith("tablet_layout:"):
+            score = 0.7  # Good layout analysis
+        # ATF content bonuses
+        elif any(pattern in text.lower() for pattern in ['lugal', 'an', 'ki', 'dingir', '{d}', 'e2']):
+            score += 0.8  # High quality CLIP recognition
+            # Multiple lines bonus
+            if '\n' in text:
+                score += 0.1
+            # Coherent structure bonus
+            words = text.split()
+            if len(words) >= 3:
+                score += 0.1
+        # Error penalty
+        elif text.startswith(("CUNEIFORM_", "visual analysis", "tablet content")):
+            score = 0.3  # Some recognition but incomplete
+        return max(0.0, min(1.0, score))
+    def _determine_confidence_level(self, text):
+        """Determine confidence level for CLIP analysis"""
+        score = self._calculate_clip_quality_score(text)
+        if score >= 0.8:
+            return "Very High"
+        elif score >= 0.6:
+            return "High"
+        elif score >= 0.4:
+            return "Medium"
+        elif score >= 0.2:
+            return "Low"
+        else:
+            return "Very Low"
+    def process_image(self, image_path):
+        """Main processing method - same interface as other processors"""
+        try:
+            print(f"[INFO] Processing cuneiform image: {image_path}")
+            # Extract text using CLIP
+            extracted_text = self.extract_text(image_path)
+            # Process the extracted content
+            processed_result = self.process_text(extracted_text)
+            # Generate historical context
+            historical_context = self.generate_historical_context(processed_result)
+            # Generate creative story
+            creative_story = self.generate_story(processed_result)
+            return {
+                'script_type': 'cuneiform',
+                'confidence': processed_result['validation'].get('quality_score', 0.0),
+                'processed_result': processed_result,
+                'historical_context': historical_context,
+                'creative_story': creative_story
+            }
+        except Exception as e:
+            print(f"[ERROR] Cuneiform image processing failed: {e}")
+            return None
+    def generate_historical_context(self, processed_result):
+        """Generate historical context for cuneiform text"""
+        cuneiform_text = processed_result.get("text", "")
+        groq_detail = self._generate_groq_context(cuneiform_text)
+        # Build references using words/symbols in cuneiform text
+        words = re.findall(r'\w+', cuneiform_text) if cuneiform_text else []
+        query_terms = list(words)
+        if cuneiform_text:
+            query_terms.extend([char for char in cuneiform_text if char.strip()])
+        refs = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)
+        return {
+            "uses_box": {
+                "title": "Cuneiform symbols and their ancient usage",
+                "items": self._build_uses_list(cuneiform_text)
+            },
+            "meaning_box": self._build_meaning_box(cuneiform_text, groq_detail, processed_result),
+            "references": refs
+        }
+    def _generate_groq_context(self, cuneiform_text):
+        """Generate contextual information using Groq"""
+        if not self.groq_client.is_available():
+            return "(Groq unavailable) Historical context generation requires GROQ_API_KEY and groq package."
+        if cuneiform_text.startswith(("CUNEIFORM_", "tablet_layout:")):
+            prompt = (
+                "This appears to be a cuneiform clay tablet analyzed using computer vision. "
+                "Provide a concise, scholarly paragraph (6-10 sentences) covering the history of cuneiform writing, "
+                "its use in ancient Mesopotamia, common contexts (administrative, legal, literary), "
+                "and the languages it represented (Sumerian, Akkadian, etc.). Include information about "
+                "clay tablet creation, scribal practices, and the significance of cuneiform in ancient civilizations."
+            )
+        else:
+            prompt = (
+                f"Analyze this cuneiform content identified through visual analysis: {cuneiform_text}\n\n"
+                f"Provide a scholarly paragraph (6-10 sentences) on its likely historical context, "
+                f"period (3200 BCE to 100 CE), probable purpose (administrative, legal, literary, religious), "
+                f"language (Sumerian/Akkadian/other), and cultural significance in ancient Mesopotamian civilization. "
+                f"Consider that this was analyzed using AI vision recognition rather than traditional transliteration."
+            )
+        system_prompt = "You are an expert Assyriologist and ancient Near Eastern historian. Provide accurate, concise scholarly analysis of cuneiform texts, focusing on historical context, linguistic analysis, and cultural significance."
+        enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, cuneiform_text)
+        return self.groq_client.generate_response(
+            system_prompt=enriched_system_prompt,
+            user_prompt=prompt
+        ) or "(Historical context unavailable due to Groq error)"
+    def _build_uses_list(self, cuneiform_text):
+        """Build list of cuneiform symbol uses"""
+        # Handle error messages
+        if cuneiform_text.startswith(("CUNEIFORM_", "tablet_layout:")):
+            return [
+                "- Visual analysis attempted but content recognition incomplete",
+                "- This may be due to image quality, tablet damage, or complex wedge patterns",
+                "- CLIP visual recognition specializes in identifying cuneiform sign types and layouts",
+                "- For detailed transliteration, consider using CDLI tools or consulting cuneiform specialists"
+            ]
+        notes = self.references.get("cuneiform_symbol_notes", {}) or {}
+        default_hint = self.references.get("cuneiform_hint",
+            "Cuneiform signs represent syllables, words, or concepts in ancient Mesopotamian languages")
+        seen = set()
+        items = []
+        # Process ATF elements
+        for element in cuneiform_text.split():
+            if element in seen or not element.strip():
+                continue
+            seen.add(element)
+            if element in notes:
+                note = notes[element]
+            else:
+                note = default_hint
+            items.append(f"- {element}: {note}")
+        if not items:
+            items.append("- Analysis incomplete: CLIP visual recognition in progress")
+        return items[:15]  # Limit display
+    def _build_meaning_box(self, cuneiform_text, groq_detail, processed_result):
+        """Build meaning interpretation box for cuneiform"""
+        char_analysis = processed_result.get("char_analysis", {})
+        validation = processed_result.get("validation", {})
+        # Build introduction with CLIP context
+        text_format = char_analysis.get("text_format", "Unknown")
+        confidence = validation.get("confidence_level", "Unknown")
+        intro_lines = [
+            f"Cuneiform processed using CLIP visual recognition with confidence: {confidence}.",
+        ]
+        if validation.get("clip_analysis"):
+            intro_lines.extend([
+                "Analysis powered by OpenAI CLIP Vision Transformer (Large) for ancient script recognition.",
+                "Visual pattern recognition identifies cuneiform signs, layouts, and tablet structures."
+            ])
+        if self.translator_available:
+            intro_lines.append("Translation provided by praeclarum/cuneiform model trained on 210,247 examples.")
+        # Add format-specific information
+        if text_format == "Layout Description":
+            intro_lines.append("Tablet structure and organization analyzed through computer vision.")
+        elif text_format == "CLIP Visual Analysis + ATF":
+            intro_lines.append("Visual elements converted to ATF transliteration format.")
+        # Analysis points
+        points = []
+        points.extend([
+            "• CLIP Vision Transformer provides advanced visual understanding of cuneiform wedge patterns.",
+            "• Model trained on large-scale image-text datasets enables zero-shot cuneiform recognition.",
+            "• Visual analysis identifies sign types, tablet layouts, and manuscript characteristics."
+        ])
+        if validation.get("supports_translation"):
+            points.append("• Recognized visual elements translated using specialized Mesopotamian language models.")
+        if text_format == "Layout Description":
+            points.append("• Tablet structure analysis indicates overall document type and organization.")
+        layout_analysis = char_analysis.get("layout_analysis", False)
+        if layout_analysis:
+            points.append("• Computer vision successfully identified tablet layout and structural elements.")
+        if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
+            points.append(f"• Historical analysis: {groq_detail.strip()}")
+        # Extract key elements for frequent display
+        if text_format == "CLIP Visual Analysis + ATF":
+            frequent_elements = cuneiform_text.split()[:10]
+        else:
+            frequent_elements = ["Visual", "Analysis", "CLIP", "Recognition"]
+        return {
+            "title": "Cuneiform visual analysis:",
+            "intro_lines": intro_lines,
+            "frequent_label": "Key elements identified",
+            "frequent": frequent_elements,
+            "points": points
+        }
+    def generate_story(self, processed_result):
+        """Generate creative story for cuneiform text"""
+        cuneiform_text = processed_result.get("text", "")
+        if not self.groq_client.is_available():
+            return "Groq client unavailable, cannot generate historical narrative."
+        # Determine story context based on analysis type
+        char_analysis = processed_result.get("char_analysis", {})
+        validation = processed_result.get("validation", {})
+        text_format = char_analysis.get("text_format", "Unknown")
+        # Choose appropriate narrative style based on CLIP analysis
+        if "lugal" in cuneiform_text.lower() or "royal" in cuneiform_text.lower():
+            styles = [
+                "as a royal inscription from the court of Hammurabi",
+                "as a victory stela from ancient Assyria",
+                "as a chronicle of Mesopotamian kings",
+                "as a royal decree from Nebuchadnezzar's reign"
+            ]
+        elif "administrative" in cuneiform_text.lower() or "numbers" in cuneiform_text.lower():
+            styles = [
+                "as a merchant's inventory from ancient Babylon",
+                "as a tax record from a Sumerian temple",
+                "as a grain distribution list from Ur",
+                "as an administrative archive from Mari"
+            ]
+        elif text_format == "Layout Description":
+            styles = [
+                "as a damaged tablet discovered in archaeological excavation",
+                "as a mysterious cuneiform fragment found in ancient ruins",
+                "as a clay tablet uncovered in a Mesopotamian library",
+                "as an ancient document preserved in palace archives"
+            ]
+        else:
+            styles = [
+                "as a scribe's practice tablet from ancient Sumer",
+                "as a legal contract from Babylonian courts",
+                "as a temple inscription from Mesopotamia",
+                "as a literary work from the ancient Near East"
+            ]
+        import random
+        chosen_style = random.choice(styles)
+        seed = random.randint(1000, 9999)
+        processing_note = "analyzed through advanced computer vision AI specialized in ancient scripts"
+        prompt = (
+            f"This cuneiform tablet was {processing_note}: {cuneiform_text[:100]}...\n\n"
+            f"Historical context: This represents one of humanity's oldest writing systems, "
+            f"used across ancient Mesopotamia from 3200 BCE to 100 CE.\n\n"
+            f"Create a vivid, historically accurate narrative (250+ words) set in ancient Mesopotamia, "
+            f"telling the story of this cuneiform tablet's creation and significance. "
+            f"Write {chosen_style}.\n\n"
+            f"Include: Clay tablet creation process, scribe's daily life, the tablet's importance "
+            f"to ancient Mesopotamian society, and authentic historical details of Sumerian/Babylonian/Assyrian culture.\n"
+            f"Narrative seed: {seed}"
+        )
+        system_prompt = (
+            "You are a master storyteller and Assyriologist specializing in ancient Mesopotamian "
+            "history, cuneiform literature, and daily life in Sumerian, Babylonian, and Assyrian "
+            "civilizations. Create authentic, engaging narratives that reflect accurate knowledge "
+            "of ancient Near Eastern cultures, writing practices, and social contexts."
+        )
+        story = self.groq_client.generate_response(
+            system_prompt=system_prompt,
+            user_prompt=prompt
+        )
+        if not story or is_gibberish(story):
+            return "Failed to generate historical narrative; ancient Mesopotamian story creation unavailable."
+        return story

processors/egyptian_processor.py ADDED Viewed

	@@ -0,0 +1,390 @@

+import cv2
+import numpy as np
+import base64
+import json
+from PIL import Image
+from io import BytesIO
+from itertools import groupby
+from collections import Counter
+from .base_processor import BaseScriptProcessor
+from utils.image_utils import segment_hieroglyphs
+from utils.text_utils import is_gibberish, build_description_from_codes
+from config import Config
+class EgyptianProcessor(BaseScriptProcessor):
+    def __init__(self, groq_client, references, clip_classifier, translator_pipe):
+        super().__init__(groq_client, references)
+        self.clip_classifier = clip_classifier
+        self.translator_pipe = translator_pipe
+        self.config = Config()
+    def detect_script(self, image_path):
+        """Simplified detection - Groq Vision handles main classification"""
+        try:
+            print("[INFO] Egyptian processor activated by Groq Vision (Llama-4-Scout)")
+            return True, 0.95
+        except Exception as e:
+            print(f"[ERROR] Egyptian detection failed: {e}")
+            return False, 0.0
+    def _identify_hieroglyphs_with_vision(self, image_path):
+        """Use Groq Vision (Llama-4-Scout) to identify hieroglyphic symbols from the full image."""
+        if not self.groq_client or not self.groq_client.is_available():
+            return None
+        try:
+            from groq import Groq
+            # Load and encode image
+            image = Image.open(image_path)
+            if max(image.size) > 1200:
+                image.thumbnail((1200, 1200), Image.Resampling.LANCZOS)
+            buffer = BytesIO()
+            image.save(buffer, format="JPEG", quality=90)
+            b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+            gardiner_labels = list(self.config.GARDINER_MAP.keys())
+            gardiner_codes = list(self.config.GARDINER_MAP.values())
+            label_list = ", ".join(
+                f"{lbl} ({code})" for lbl, code in zip(gardiner_labels, gardiner_codes)
+            )
+            prompt = (
+                "You are an expert Egyptologist analyzing an image of Egyptian hieroglyphs.\n\n"
+                f"Known Gardiner signs: {label_list}\n\n"
+                "Identify up to 15 of the most prominent hieroglyphic symbols visible in the image, in reading order (left-to-right, top-to-bottom).\n"
+                "For each identified symbol, pick the BEST matching Gardiner label from the list above.\n"
+                "Do not output more than 15 symbols. If a symbol doesn't match any known label, use \"unknown\".\n\n"
+                "Respond ONLY with a JSON object:\n"
+                "{\"symbols\": [\"label1\", \"label2\", \"label3\", ...]}\n"
+                "Example: {\"symbols\": [\"owl\", \"eye\", \"reed\", \"bread\", \"sun\"]}"
+            )
+            print("[INFO] Sending request to Groq Vision model meta-llama/llama-4-scout-17b-16e-instruct...")
+            client = Groq(api_key=self.groq_client.api_key)
+            completion = client.chat.completions.create(
+                model="meta-llama/llama-4-scout-17b-16e-instruct",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": prompt},
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{b64}",
+                                },
+                            },
+                        ],
+                    }
+                ],
+                temperature=0.1,
+                max_completion_tokens=1024,
+                response_format={"type": "json_object"},
+            )
+            raw = completion.choices[0].message.content
+            print(f"[INFO] Groq Vision raw response received: {raw[:150]}...")
+            data = json.loads(raw)
+            symbols = data.get("symbols", [])
+            if symbols and isinstance(symbols, list) and len(symbols) > 0:
+                # Validate labels against known set + "unknown"
+                valid = set(gardiner_labels) | {"unknown"}
+                cleaned = [s if s in valid else "unknown" for s in symbols]
+                if all(s == "unknown" for s in cleaned):
+                    print("[INFO] Groq Vision identified only 'unknown' symbols. Falling back.")
+                    return None
+                print(f"[INFO] Groq Vision identified {len(cleaned)} hieroglyphs: {cleaned}")
+                return cleaned
+        except Exception as e:
+            print(f"[WARN] Groq Vision hieroglyph identification failed: {e}")
+        return None
+    def extract_text(self, image_path):
+        """Extract hieroglyphs — Groq Vision primary, CLIP fallback"""
+        try:
+            print("[INFO] Starting Egyptian hieroglyph extraction...")
+            # PRIMARY: Use Groq Vision to identify symbols from the full image
+            vision_labels = self._identify_hieroglyphs_with_vision(image_path)
+            if vision_labels:
+                print(f"[INFO] Using Groq Vision result ({len(vision_labels)} symbols)")
+                return vision_labels
+            # FALLBACK: Segment + CLIP zero-shot
+            print("[INFO] Falling back to CLIP segmentation-based classification...")
+            from utils.image_utils import segment_hieroglyphs
+            crops = segment_hieroglyphs(image_path)
+            print(f"[INFO] Segmented {len(crops)} hieroglyph regions")
+            if not crops:
+                print("[WARN] No hieroglyph regions found")
+                return []
+            candidate_labels = list(self.config.GARDINER_MAP.keys())
+            labels = self.clip_classifier.classify_symbols(crops, candidate_labels)
+            print(f"[INFO] CLIP classified {len(labels)} symbols: {labels}")
+            return labels
+        except Exception as e:
+            print(f"[ERROR] Egyptian text extraction failed: {e}")
+            import traceback
+            traceback.print_exc()
+            return []
+    def process_text(self, labels):
+        """Process hieroglyph labels into translation"""
+        if not labels:
+            return {"labels": [], "codes": [], "translation": "", "translation_ok": False}
+        # Convert labels to Gardiner codes
+        codes = [self.config.GARDINER_MAP.get((lbl or "").lower(), "?") for lbl in labels]
+        # Attempt translation
+        translation, translation_ok = self._translate_sequence(labels, codes)
+        return {
+            "labels": labels,
+            "codes": codes,
+            "translation": translation,
+            "translation_ok": translation_ok
+        }
+    def _translate_sequence(self, labels, codes):
+        """Translate Gardiner sequence using HuggingFace model or Groq fallback"""
+        valid_codes = [c for c in codes if c != "?"]
+        if valid_codes and self.translator_pipe:
+            seq = " ".join(valid_codes)
+            prompt = f"Translate hieroglyph unicode sequence to English: {seq}"
+            try:
+                output = self.translator_pipe(prompt, max_new_tokens=128, do_sample=False, num_beams=4)
+                text = output[0].get('generated_text') or output[0].get('translation_text') or str(output[0])
+                if text and text.strip() != "?" and not is_gibberish(text):
+                    return text.strip(), True
+                # Try alternative approach
+                alt_output = self.translator_pipe(seq, max_new_tokens=128, do_sample=False, num_beams=4)
+                alt_text = alt_output[0].get('generated_text') or alt_output[0].get('translation_text') or str(alt_output[0])
+                if alt_text and alt_text.strip() != "?" and not is_gibberish(alt_text):
+                    return alt_text.strip(), True
+            except Exception as e:
+                print(f"[WARN] Seq2Seq translation failed: {e}")
+        # Groq Fallback for translating known symbols
+        if self.groq_client and self.groq_client.is_available():
+            try:
+                known_labels = [lbl for lbl in labels if lbl and lbl != "unknown"]
+                if known_labels:
+                    symbols_str = ", ".join(known_labels)
+                    system_prompt = "You are an expert Egyptologist and translator of ancient Egyptian hieroglyphs."
+                    user_prompt = (
+                        f"We detected a sequence of ancient Egyptian hieroglyphic symbols: {symbols_str}.\n"
+                        "Provide a concise, scholarly English translation or logical interpretation of this combination of signs.\n"
+                        "Keep it direct, under 15 words, and do not include any introductory phrases, explanations, or quotes."
+                    )
+                    translation = self.groq_client.generate_response(system_prompt, user_prompt, max_tokens=64)
+                    translation = translation.strip().replace('"', '')
+                    if translation and not is_gibberish(translation):
+                        return translation, True
+            except Exception as e:
+                print(f"[WARN] Groq fallback translation failed: {e}")
+        # Fallback to description
+        description = build_description_from_codes(codes)
+        return f"(Symbols described as: {description})", False
+    def generate_historical_context(self, processed_result):
+        """Generate historical context for Egyptian text"""
+        translation = processed_result.get("translation", "")
+        codes = processed_result.get("codes", [])
+        labels = processed_result.get("labels", [])
+        # Generate Groq context
+        groq_detail = self._generate_groq_context(translation, codes)
+        # Build references
+        query_terms = list(labels) + list(codes)
+        refs = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)
+        # Build structured context
+        return {
+            "uses_box": {
+                "title": "Each symbol's possible use by the egyptian people",
+                "items": self._build_uses_list(labels)
+            },
+            "meaning_box": self._build_meaning_box(labels, groq_detail),
+            "references": refs
+        }
+    def _generate_groq_context(self, translation_text, codes):
+        """Generate contextual information using Groq"""
+        if not self.groq_client.is_available():
+            return "(Groq unavailable) Context generation requires GROQ_API_KEY and groq package."
+        if is_gibberish(translation_text):
+            prompt_body = build_description_from_codes(codes)
+            prompt = (
+                f"The following sequence of ancient Egyptian symbols is described as: {prompt_body}.\n\n"
+                "Provide a concise, scholarly paragraph (6-10 sentences) covering cultural context, symbolic meanings, "
+                "typical usage, probable time period, and relevant archaeological comparisons. Avoid repeating the prompt."
+            )
+        else:
+            prompt = (
+                f"Provide a concise, scholarly paragraph (6-10 sentences) on the historical significance, cultural context, "
+                f"symbolism, and possible interpretations of this ancient Egyptian text: {translation_text}. Avoid repeating the prompt."
+            )
+        system_prompt = "You are a careful Egyptologist and historian. Provide accurate, concise scholarly context."
+        enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, translation_text, codes)
+        return self.groq_client.generate_response(
+            system_prompt=enriched_system_prompt,
+            user_prompt=prompt,
+            max_tokens=self.config.GROQ_CONTEXT_MAX_TOKENS
+        ) or "(context unavailable due to Groq error)"
+    def _build_uses_list(self, labels):
+        """Build list of symbol uses"""
+        groups = []
+        for key, g in groupby(labels):
+            if not key:
+                continue
+            groups.append((key, len(list(g))))
+        notes = self.references.get("egypt_symbol_notes", {}) or {}
+        seen = set()
+        items = []
+        for name, count in groups:
+            if not name or name.lower() in seen:
+                continue
+            seen.add(name.lower())
+            count_str = f" (x{count})" if count > 1 else ""
+            note = notes.get(name.lower(), "Common sign whose meaning varies by phonetic/ideogram/determinative roles.")
+            items.append(f"- {name}{count_str}: {note}")
+        if not items:
+            items.append("- unknown: No stable mapping; likely decorative or damaged glyphs.")
+        return items
+    def _build_meaning_box(self, labels, groq_detail):
+        """Build meaning interpretation box"""
+        freq = Counter([l for l in labels if l])
+        frequent = [f"{name} (x{cnt})" for name, cnt in freq.most_common(6)]
+        intro_lines = [
+            "The dense recurrence of signs suggests a formulaic or protective sequence, where phonograms articulate a core utterance and determinatives or iconic signs reinforce ritual intent.",
+            "Comparable sequences appear on funerary equipment from the Middle Kingdom onward."
+        ]
+        points = [
+            "• Offering and action signs (bread, jar, hoe, bow) commonly structure invocations or provisioning lists for the afterlife.",
+            "• Repetition often encodes names or epithets; determinatives (eye, feather, god_figure) frame a protective or ritual context.",
+            "• Repertoire and layout align with New Kingdom funerary practice focused on protection, sustenance, and legitimation."
+        ]
+        if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
+            points.append(groq_detail.strip())
+        return {
+            "title": "Possible meaning:",
+            "intro_lines": intro_lines,
+            "frequent_label": "Frequently observed signs",
+            "frequent": frequent,
+            "points": points
+        }
+    def generate_story(self, processed_result):
+        """Generate creative story for Egyptian text"""
+        labels = processed_result.get("labels", [])
+        description = ", ".join([lbl for lbl in labels if lbl])
+        if not self.groq_client.is_available():
+            return self._simple_templated_story(description)
+        style = [
+            "as an epic poem from a wandering bard",
+            "as a prophecy carved in stone",
+            "as a fireside tale with vivid emotions",
+            "as a dialogue between two ancient gods",
+            "as a lost papyrus narrative recovered from the sands",
+            "as a myth told by a court poet"
+        ]
+        import random
+        chosen_style = random.choice(style)
+        seed = random.randint(1000, 9999)
+        prompt = (
+            f"The following sequence of ancient Egyptian symbols is described as: {description}\n\n"
+            f"Can you create a long, vivid, imaginative story from ancient times "
+            f"based on this sequence of Egyptian symbols: [your sequence]. "
+            f"Write it as one rich paragraph with a lot of detail, mystery, and historical atmosphere. "
+            f"At least 200 words.\n\n"
+            f"Creative seed: {seed}\n"
+            f"Write a richly detailed, imaginative myth-like story {chosen_style}. "
+            "Include multiple characters, vivid imagery, and at least 3 short scenes. "
+            "Do NOT repeat the same sentence or phrase verbatim. "
+            "Keep it evocative and unpredictable."
+        )
+        system_prompt = "You are a creative ancient historian and myth-maker. Invent rich, imaginative tales."
+        story = self.groq_client.generate_response(
+            system_prompt=system_prompt,
+            user_prompt=prompt,
+            max_tokens=self.config.GROQ_STORY_MAX_TOKENS
+        )
+        if not story or is_gibberish(story):
+            return self._simple_templated_story(description)
+        return story
+    def _simple_templated_story(self, description):
+        """Fallback story generation"""
+        import re
+        parts = [p.strip() for p in re.split(r',\s*', description) if p.strip()]
+        keywords = []
+        for p in parts:
+            m = re.match(r'([a-zA-Z0-9_-]+)', p)
+            if m:
+                kw = m.group(1)
+                if kw not in keywords:
+                    keywords.append(kw)
+            if len(keywords) >= 8:
+                break
+        flavor = {
+            "bow": "strength and vigilance",
+            "hoe": "the work of the fields",
+            "reed": "the scribe's craft",
+            "owl": "hidden wisdom of the night",
+            "eye": "divine sight",
+            "bread": "offerings to the ka",
+            "unknown": "mysterious signs"
+        }
+        lead = []
+        if keywords:
+            lead.append(f"In an age of river and stone, a tale was told of {flavor.get(keywords[0], keywords[0])}.")
+        if len(keywords) > 1:
+            second = flavor.get(keywords[1], keywords[1])
+            third = flavor.get(keywords[2], keywords[2]) if len(keywords) > 2 else "omens"
+            lead.append(f"It spoke of {second} and {third} guiding a soul beyond the horizon.")
+        lead.append("Under the stars, elders whispered a vow that the names would endure.")
+        return " ".join(lead)

processors/greek_processor.py ADDED Viewed

	@@ -0,0 +1,774 @@

+import pytesseract
+import re
+import os
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from .base_processor import BaseScriptProcessor
+from utils.text_utils import is_gibberish
+BACKEND_MODELS_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "models"))
+GREEK_TROCR_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "greek_trocr")
+class GreekProcessor(BaseScriptProcessor):
+    def __init__(self, groq_client, references, clip_classifier):
+        super().__init__(groq_client, references, clip_classifier)
+        self.clip_classifier = clip_classifier
+        self.setup_ancient_greek_ocr()
+        self.trocr_model = None
+        self.trocr_processor = None
+        self.trocr_available = False
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        # Register for dynamic VRAM management
+        from utils.gpu_diagnostics import register_processor
+        register_processor("greek", self)
+    def setup_greek_trocr(self):
+        """Setup TrOCR model — BEST for ancient Greek manuscripts"""
+        try:
+            from utils.gpu_diagnostics import reclaim_vram_for
+            reclaim_vram_for("greek")
+            print("[INFO] Lazily loading TrOCR model for ancient Greek...")
+            from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+            import torch
+            self.trocr_processor = TrOCRProcessor.from_pretrained(
+                'rithwikn/trocr_greek_combined',
+                cache_dir=GREEK_TROCR_MODEL_DIR,
+                local_files_only=False
+            )
+            self.trocr_model = VisionEncoderDecoderModel.from_pretrained(
+                'rithwikn/trocr_greek_combined',
+                cache_dir=GREEK_TROCR_MODEL_DIR,
+                local_files_only=False
+            )
+            self.trocr_model.to(self.device)
+            self.trocr_model.eval()  # Put in evaluation mode
+            from utils.gpu_diagnostics import log_model_device
+            log_model_device("Greek TrOCR", self.device)
+            self.trocr_available = True
+            print(f"[INFO] Ancient Greek TrOCR loaded successfully on {self.device}")
+        except Exception as e:
+            print(f"[ERROR] Ancient Greek TrOCR failed to load: {e}")
+            self.trocr_available = False
+    def setup_ancient_greek_ocr(self):
+        """Setup Ancient Greek OCR with specialized tessdata"""
+        # Path to Ancient Greek tessdata (download from ancientgreekocr.org)
+        self.ancient_greek_tessdata = os.path.join(
+            os.path.dirname(__file__),
+            "..", "tessdata", "ancient-greek"
+        )
+        # Verify tessdata exists
+        if os.path.exists(self.ancient_greek_tessdata):
+            print(f"[INFO] Ancient Greek tessdata found: {self.ancient_greek_tessdata}")
+        else:
+            print(f"[WARN] Ancient Greek tessdata not found at: {self.ancient_greek_tessdata}")
+            print("[INFO] Download from: https://ancientgreekocr.org")
+    def detect_script(self, image_path):
+        """Simplified detection - Groq Vision handles main classification"""
+        try:
+            if not getattr(self, 'trocr_available', False):
+                # Check if Ancient Greek OCR is available as fallback
+                grc_file = os.path.join(self.ancient_greek_tessdata, "grc.traineddata")
+                if not os.path.exists(grc_file):
+                    print("[INFO] Greek processor not available (neither TrOCR nor Tesseract)")
+                    return False, 0.5
+            # If called by Groq Vision classification, accept with high confidence
+            print("[INFO] Greek processor activated by Groq Vision (Llama-4-Scout)")
+            return True, 0.95
+        except Exception as e:
+            print(f"[ERROR] Greek detection failed: {e}")
+            return False, 0.0
+    def _quick_greek_ocr_test(self, image_path):
+        """Quick OCR test to validate Greek content"""
+        try:
+            # Quick test with small image crop
+            image = Image.open(image_path)
+            # Take center crop for testing
+            w, h = image.size
+            crop_box = (w//4, h//4, 3*w//4, 3*h//4)
+            test_crop = image.crop(crop_box)
+            # Test with standard Greek OCR
+            test_text = pytesseract.image_to_string(test_crop, lang="ell")
+            greek_char_count = self._count_greek_chars(test_text or "")
+            # If we find Greek characters, it's likely Greek
+            return greek_char_count >= 3
+        except Exception:
+            return False
+    def extract_text(self, image_path):
+        """Extract Greek text from an image, trying progressively weaker OCR strategies.
+
+        Attempts, in order: TrOCR (``_extract_with_trocr``), Tesseract with the ``grc`` data
+        (only if ``grc.traineddata`` exists), Tesseract with ``ell``, then layout-aware
+        per-line OCR. The first result accepted by ``_validate_greek_text`` is returned.
+
+        Args:
+            image_path: Path to the image file.
+
+        Returns:
+            The extracted text, or "" when no attempt yields valid Greek text or when any
+            step raises (the error is printed, not propagated).
+        """
+        try:
+            image = Image.open(image_path)
+            # Lazy model management: load the TrOCR model on first use, otherwise ask the
+            # VRAM manager to make room and move the model back to self.device if needed.
+            # NOTE(review): _extract_with_trocr repeats this same load/activate step.
+            if self.trocr_model is None:
+                self.setup_greek_trocr()
+            else:
+                from utils.gpu_diagnostics import reclaim_vram_for
+                reclaim_vram_for("greek")
+                if str(next(self.trocr_model.parameters()).device) != str(self.device):
+                    print(f"[VRAM MANAGER] Activating Greek TrOCR model on {self.device}...")
+                    self.trocr_model.to(self.device)
+            # Method 1: Ancient Greek TrOCR (if available)
+            if getattr(self, 'trocr_available', False) and self.trocr_model is not None:
+                print("[INFO] Attempting Ancient Greek extraction with TrOCR...")
+                trocr_text = self._extract_with_trocr(image_path)
+                if trocr_text and self._validate_greek_text(trocr_text):
+                    print("[INFO] Using Ancient Greek TrOCR result")
+                    return trocr_text
+                print("[WARN] TrOCR extraction returned poor quality result, trying Tesseract fallback...")
+            # Method 2: Tesseract with the Ancient Greek ('grc') data, only when the
+            # traineddata file is actually present
+            grc_file = os.path.join(self.ancient_greek_tessdata, "grc.traineddata")
+            if os.path.exists(grc_file):
+                ancient_greek_text = self._extract_with_ancient_greek_ocr(image)
+                if ancient_greek_text and self._validate_greek_text(ancient_greek_text):
+                    print("[INFO] Using Ancient Greek OCR result")
+                    return ancient_greek_text
+            # Method 3: Standard (modern) Greek OCR
+            standard_greek_text = self._extract_with_standard_greek_ocr(image)
+            if standard_greek_text and self._validate_greek_text(standard_greek_text):
+                print("[INFO] Using standard Greek OCR result")
+                return standard_greek_text
+            # Method 4: Layout-aware line segment fallback
+            print("[INFO] Trying layout-aware Greek segmentation fallback...")
+            layout_aware_greek_text = self._extract_layout_aware_ocr(image_path)
+            if layout_aware_greek_text and self._validate_greek_text(layout_aware_greek_text):
+                print("[INFO] Using layout-aware Greek OCR result")
+                return layout_aware_greek_text
+            # Nothing validated: report and return empty rather than low-quality text
+            print("[INFO] No valid Greek text detected")
+            return ""
+        except Exception as e:
+            print(f"[ERROR] Greek text extraction failed: {e}")
+            return ""
+    def _extract_with_trocr(self, image_path):
+        """Extract text with the Ancient Greek TrOCR model, one layout line at a time.
+
+        Loads the model on first use (or re-activates it on ``self.device``), asks
+        ``self.layout_parser`` for line crops, and decodes each crop with beam search.
+        When no crops are found the whole image is processed as a single crop.
+
+        Args:
+            image_path: Path to the image file.
+
+        Returns:
+            Non-empty decoded lines joined with newlines; "" when the model is
+            unavailable or inference raises.
+        """
+        # Lazy load / VRAM hand-over, mirroring the step performed in extract_text.
+        if self.trocr_model is None:
+            self.setup_greek_trocr()
+        else:
+            from utils.gpu_diagnostics import reclaim_vram_for
+            reclaim_vram_for("greek")
+            if str(next(self.trocr_model.parameters()).device) != str(self.device):
+                print(f"[VRAM MANAGER] Activating Greek TrOCR model on {self.device}...")
+                self.trocr_model.to(self.device)
+        # The load above may have failed; bail out instead of raising.
+        if not getattr(self, 'trocr_available', False) or self.trocr_model is None:
+            return ""
+        try:
+            import torch
+            from PIL import Image
+            print("[INFO] Segmenting layout for Greek TrOCR...")
+            layout = self.layout_parser.analyze_layout(image_path)
+            crops = self.layout_parser.crop_lines(image_path, layout)
+            # Fallback to whole image if no crops detected
+            if not crops:
+                print("[WARN] No line crops found, processing full image with TrOCR")
+                crops = [Image.open(image_path).convert("RGB")]
+            line_texts = []
+            print(f"[INFO] Running Ancient Greek TrOCR inference on {len(crops)} crops...")
+            for idx, crop in enumerate(crops):
+                # Ensure RGB mode for TrOCR
+                crop_rgb = crop.convert("RGB")
+                pixel_values = self.trocr_processor(
+                    images=crop_rgb,
+                    return_tensors="pt"
+                ).pixel_values.to(self.device)
+                # inference_mode disables autograd tracking during generation
+                with torch.inference_mode():
+                    generated_ids = self.trocr_model.generate(
+                        pixel_values,
+                        max_length=256,
+                        num_beams=4,
+                        early_stopping=True,
+                        repetition_penalty=1.2
+                    )
+                text = self.trocr_processor.batch_decode(
+                    generated_ids,
+                    skip_special_tokens=True
+                )[0]
+                # Lines that decode to whitespace only are dropped
+                if text.strip():
+                    line_texts.append(text.strip())
+            full_text = "\n".join(line_texts)
+            print(f"[SUCCESS] TrOCR extracted {len(line_texts)} lines from Greek image")
+            return full_text
+        except Exception as e:
+            print(f"[ERROR] Greek TrOCR extraction failed: {e}")
+            return ""
+    def _extract_with_ancient_greek_ocr(self, image):
+        """Extract using specialized Ancient Greek OCR"""
+        try:
+            # Save original tessdata path
+            original_tessdata = os.environ.get("TESSDATA_PREFIX", "")
+            # Set tessdata path properly (fix the path format)
+            if os.path.exists(self.ancient_greek_tessdata):
+                # Ensure proper path format without trailing quotes
+                clean_path = str(self.ancient_greek_tessdata).replace('"', '')
+                os.environ["TESSDATA_PREFIX"] = clean_path
+                print(f"[INFO] Set TESSDATA_PREFIX to: {clean_path}")
+            else:
+                print(f"[WARN] Ancient Greek tessdata not found at: {self.ancient_greek_tessdata}")
+                return ""
+            # Use ancient Greek language code 'grc' with optimized settings
+            config = "--psm 6 --oem 1 -c preserve_interword_spaces=1"
+            # Try ancient Greek language pack
+            text = pytesseract.image_to_string(
+                image,
+                lang="grc",  # Ancient Greek language code
+                config=config
+            )
+            # Restore original tessdata path
+            if original_tessdata:
+                os.environ["TESSDATA_PREFIX"] = original_tessdata
+            else:
+                # Remove the environment variable if it wasn't set before
+                if "TESSDATA_PREFIX" in os.environ:
+                    del os.environ["TESSDATA_PREFIX"]
+            return text.strip()
+        except Exception as e:
+            print(f"[WARN] Ancient Greek OCR failed: {e}")
+            # Make sure to restore tessdata path even on error
+            if 'original_tessdata' in locals() and original_tessdata:
+                os.environ["TESSDATA_PREFIX"] = original_tessdata
+            return ""
+    def _extract_layout_aware_ocr(self, image_path):
+        """Extract text by segmenting the page layout into lines first for improved readability order"""
+        try:
+            import pytesseract
+            print("[INFO] Running layout-aware line segmentation for Greek...")
+            layout = self.layout_parser.analyze_layout(image_path)
+            crops = self.layout_parser.crop_lines(image_path, layout)
+            if not crops:
+                print("[WARN] Layout parser returned no line crops for Greek")
+                return ""
+            print(f"[INFO] Layout-aware Greek line parser cropped {len(crops)} lines")
+            line_texts = []
+            # Try to use Ancient Greek first
+            grc_file = os.path.join(self.ancient_greek_tessdata, "grc.traineddata")
+            use_grc = os.path.exists(grc_file)
+            # Save original TESSDATA_PREFIX
+            original_tessdata = os.environ.get("TESSDATA_PREFIX", "")
+            if use_grc:
+                clean_path = str(self.ancient_greek_tessdata).replace('"', '')
+                os.environ["TESSDATA_PREFIX"] = clean_path
+            try:
+                for idx, crop in enumerate(crops):
+                    # Enhance line crop for OCR
+                    crop_cv = cv2.cvtColor(np.array(crop), cv2.COLOR_RGB2BGR)
+                    gray = cv2.cvtColor(crop_cv, cv2.COLOR_BGR2GRAY)
+                    clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(4,4))
+                    enhanced = clahe.apply(gray)
+                    crop_pil = Image.fromarray(enhanced)
+                    config = '--oem 3 --psm 7'
+                    text = ""
+                    if use_grc:
+                        text = pytesseract.image_to_string(
+                            crop_pil,
+                            lang='grc',
+                            config=config
+                        ).strip()
+                    if not text:
+                        text = pytesseract.image_to_string(
+                            crop_pil,
+                            lang='ell',
+                            config=config
+                        ).strip()
+                    if text:
+                        line_texts.append(text)
+            finally:
+                if use_grc and original_tessdata:
+                    os.environ["TESSDATA_PREFIX"] = original_tessdata
+            return "\n".join(line_texts)
+        except Exception as e:
+            print(f"[WARN] Layout aware Greek OCR failed: {e}")
+            return ""
+    def _extract_with_standard_greek_ocr(self, image):
+        """Extract using standard Greek OCR with optimized settings"""
+        try:
+            # Multiple OCR attempts with different settings
+            configs = [
+                "--psm 6 --oem 1",  # Uniform text block
+                "--psm 4 --oem 1",  # Single column text
+                "--psm 3 --oem 1",  # Default, automatic page segmentation
+                "--psm 8 --oem 1"   # Single word
+            ]
+            for config in configs:
+                try:
+                    text = pytesseract.image_to_string(
+                        image,
+                        lang="ell",  # Modern Greek
+                        config=config
+                    )
+                    if text and self._validate_greek_text(text):
+                        return text.strip()
+                except Exception:
+                    continue
+            return ""
+        except Exception as e:
+            print(f"[WARN] Standard Greek OCR failed: {e}")
+            return ""
+    def _extract_with_preprocessing(self, image):
+        """Fallback extraction with image preprocessing"""
+        try:
+            # Convert PIL to CV2
+            cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+            # Image preprocessing for better OCR
+            gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
+            # Try different preprocessing approaches
+            preprocessed_images = [
+                gray,  # Original grayscale
+                cv2.GaussianBlur(gray, (1, 1), 0),  # Slight blur
+                cv2.medianBlur(gray, 3),  # Noise reduction
+                cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]  # Adaptive threshold
+            ]
+            for processed_img in preprocessed_images:
+                try:
+                    pil_img = Image.fromarray(processed_img)
+                    text = pytesseract.image_to_string(
+                        pil_img,
+                        lang="ell",
+                        config="--psm 6 --oem 1"
+                    )
+                    if self._validate_greek_text(text):
+                        return text.strip()
+                except Exception:
+                    continue
+            return ""
+        except Exception as e:
+            print(f"[WARN] Fallback Greek OCR failed: {e}")
+            return ""
+    def _count_greek_chars(self, text):
+        """Count Greek Unicode characters including polytonic marks"""
+        if not text:
+            return 0
+        def is_greek_char(ch):
+            o = ord(ch)
+            # Greek and Coptic (0x0370-0x03FF)
+            # Greek Extended (0x1F00-0x1FFF) - includes polytonic marks
+            return (0x0370 <= o <= 0x03FF) or (0x1F00 <= o <= 0x1FFF)
+        return sum(is_greek_char(ch) for ch in text)
+    def _validate_greek_text(self, text):
+        """Validate if text contains meaningful Greek content"""
+        if not text or len(text.strip()) < 3:
+            return False
+        # Count Greek characters
+        greek_char_count = self._count_greek_chars(text)
+        total_chars = len(re.sub(r'\s+', '', text))
+        if total_chars == 0:
+            return False
+        # Check for Latin characters (should reject if too many)
+        latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
+        latin_ratio = latin_chars / total_chars if total_chars > 0 else 0
+        # If text is mostly Latin characters, reject it
+        if latin_ratio > 0.8 and greek_char_count < 3:
+            print(f"[INFO] Rejecting text as Greek - too many Latin chars: {latin_ratio:.2f}")
+            return False
+        # At least 20% should be Greek characters, or minimum 5 Greek chars
+        greek_ratio = greek_char_count / total_chars
+        return greek_char_count >= 5 or greek_ratio >= 0.20
+    def _extract_distinct_terms(self, text):
+        """Extract distinct Greek terms from text"""
+        if not text:
+            return []
+        # Find Greek words (including those with diacritical marks)
+        tokens = re.findall(r"[^\W\d_]+", text, flags=re.UNICODE)
+        def is_greek_word(word):
+            return any((0x0370 <= ord(ch) <= 0x03FF) or (0x1F00 <= ord(ch) <= 0x1FFF)
+                      for ch in word)
+        distinct_terms = []
+        seen = set()
+        for token in tokens:
+            if len(token) < 2:  # Skip single characters
+                continue
+            if is_greek_word(token):
+                normalized = token.lower()
+                if normalized not in seen:
+                    distinct_terms.append(token)
+                    seen.add(normalized)
+        return distinct_terms[:20]  # Limit to 20 terms
+    def process_text(self, greek_text):
+        """Process extracted Greek text"""
+        if not greek_text:
+            return {"text": "", "terms": [], "char_analysis": {}, "validation": {}}
+        # Extract distinct terms
+        terms = self._extract_distinct_terms(greek_text)
+        # Character analysis
+        char_analysis = {
+            "total_chars": len(greek_text),
+            "greek_chars": self._count_greek_chars(greek_text),
+            "unique_chars": len(set(greek_text)),
+            "words": len(greek_text.split())
+        }
+        # Validation metrics
+        validation = {
+            "has_polytonic": self._has_polytonic_marks(greek_text),
+            "greek_ratio": char_analysis["greek_chars"] / max(1, char_analysis["total_chars"]),
+            "quality_score": self._calculate_quality_score(greek_text)
+        }
+        return {
+            "text": greek_text,
+            "terms": terms,
+            "char_analysis": char_analysis,
+            "validation": validation
+        }
+    def _has_polytonic_marks(self, text):
+        """Check if text contains polytonic Greek marks"""
+        # Greek Extended block contains polytonic diacritical marks
+        return any(0x1F00 <= ord(ch) <= 0x1FFF for ch in text)
+    def _calculate_quality_score(self, text):
+        """Calculate a quality score for the extracted text"""
+        if not text:
+            return 0.0
+        score = 0.0
+        # Base score from Greek character ratio
+        greek_ratio = self._count_greek_chars(text) / max(1, len(text))
+        score += greek_ratio * 0.4
+        # Bonus for polytonic marks (indicates authentic ancient Greek)
+        if self._has_polytonic_marks(text):
+            score += 0.3
+        # Penalty for too many non-alphabetic characters
+        alpha_chars = sum(ch.isalpha() for ch in text)
+        alpha_ratio = alpha_chars / max(1, len(text))
+        score += alpha_ratio * 0.3
+        return min(1.0, score)
+    def generate_historical_context(self, processed_result):
+        """Generate historical context for Greek text"""
+        greek_text = processed_result.get("text", "")
+        terms = processed_result.get("terms", [])
+        # Generate Groq context
+        groq_detail = self._generate_groq_context(greek_text)
+        # Build references - query both words and individual characters
+        query_terms = list(terms) if terms else []
+        if greek_text:
+            query_terms.extend([char for char in greek_text if char.strip()])
+        print(f"[DEBUG GREEK RAG] query_terms: {[t.encode('ascii', 'backslashreplace').decode() for t in query_terms]}")
+        refs = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)
+        print(f"[DEBUG GREEK RAG] refs: {[r['term'].encode('ascii', 'backslashreplace').decode() for r in refs]}")
+        return {
+            "uses_box": {
+                "title": "Each symbol's possible use by the Greek people",
+                "items": self._build_uses_list(terms, greek_text)
+            },
+            "meaning_box": self._build_meaning_box(terms, groq_detail),
+            "references": refs
+        }
+    def _generate_groq_context(self, greek_text):
+        """Generate contextual information using Groq"""
+        if not self.groq_client.is_available():
+            return "(Groq unavailable) Context generation requires GROQ_API_KEY and groq package."
+        prompt = (
+            f"This ancient Greek text was found: {greek_text}\n\n"
+            "Write a concise, scholarly paragraph (6-10 sentences) giving cultural and historical context: textual tradition, "
+            "possible meanings, links to Greek culture/myth/philosophy, manuscript practices (accents, breathings, ligatures, nomina sacra), "
+            "and paleographic cues. Avoid repeating the prompt."
+        )
+        system_prompt = "You are an expert philologist of Ancient Greece. Provide concise, accurate scholarly context."
+        enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, greek_text)
+        return self.groq_client.generate_response(
+            system_prompt=enriched_system_prompt,
+            user_prompt=prompt
+        ) or "(context unavailable due to Groq error)"
+    def _generate_batch_explanations(self, terms):
+        """Generate one-sentence glosses for Greek terms with a single Groq request.
+
+        Args:
+            terms: Iterable of Greek words; only the first 15 are sent.
+
+        Returns:
+            A dict mapping the keys of the model's JSON reply to their definitions (as
+            strings). Empty when there are no terms, the client is unavailable, the
+            reply contains no JSON object, or anything raises.
+        """
+        if not terms or not self.groq_client or not self.groq_client.is_available():
+            return {}
+        # Limit to first 15 terms to prevent token limit/truncation issues
+        terms_to_query = list(terms)[:15]
+        terms_list = ", ".join(terms_to_query)
+        system_prompt = (
+            "You are an expert classicist and lexicographer of Ancient Greek. "
+            "Respond ONLY with a JSON object. Do NOT wrap values in double quotes inside the strings. "
+            "Use single quotes '...' for any internal quotes, definitions, or translations."
+        )
+        user_prompt = (
+            f"For each of the following Ancient Greek words, provide a brief, scholarly one-sentence definition, "
+            f"etymological note, or grammatical gloss:\n\n"
+            f"Words: {terms_list}\n\n"
+            f"Respond ONLY with a JSON object where the keys are the exact words and the values are the definitions.\n"
+            f"Do NOT use double quotes inside the definitions/values; use single quotes instead.\n"
+            f"Example: {{\"word1\": \"definition1\", \"word2\": \"definition2\"}}"
+        )
+        try:
+            raw_response = self.groq_client.generate_response(
+                system_prompt=system_prompt,
+                user_prompt=user_prompt,
+                max_tokens=2048
+            )
+            # Safe print to avoid UnicodeEncodeError in Windows command prompt.
+            # If generate_response returned None, .encode raises here and the outer
+            # except turns that into an empty result.
+            print(f"[INFO] Groq glossary raw response: {raw_response.encode('ascii', 'backslashreplace').decode()}")
+            # Take everything from the first '{' to the last '}' as the JSON candidate,
+            # which tolerates prose or code fences around the object
+            if "{" in raw_response and "}" in raw_response:
+                start = raw_response.find("{")
+                end = raw_response.rfind("}") + 1
+                json_str = raw_response[start:end]
+                import json
+                try:
+                    definitions = json.loads(json_str)
+                except Exception as je:
+                    print(f"[WARN] Standard JSON load failed: {je}. Attempting regex recovery...")
+                    import re
+                    definitions = {}
+                    # Salvage individual "key": "value" pairs (value may contain
+                    # backslash escapes) from JSON that does not parse as a whole
+                    pattern = re.compile(r'"([^"]+)":\s*"((?:[^"\\]|\\.)*)"')
+                    matches = pattern.findall(json_str)
+                    for k, v in matches:
+                        definitions[k] = v
+                # Coerce every value to str so callers can format them uniformly
+                return {k: str(v) for k, v in definitions.items()}
+        except Exception as e:
+            print(f"[WARN] Failed to generate batch Greek explanations: {e}")
+        return {}
+    def _build_uses_list(self, terms, greek_text):
+        """Build the bulleted glossary shown in the "uses" box.
+
+        Definitions come from the RAG service first; terms it does not know are sent to
+        Groq in one batch. Up to 5 per-character notes from
+        ``self.references["greek_symbol_notes"]`` are appended.
+
+        Args:
+            terms: Greek words to define; de-duplicated and capped at 15.
+            greek_text: Full extracted text, scanned for annotated characters.
+
+        Returns:
+            A list of strings, each prefixed with "- ". Never empty: a default hint is
+            returned when nothing else was produced.
+        """
+        import unicodedata
+        items = []
+        # 1. Get definitions for the extracted Greek words (terms)
+        if terms:
+            # Unique terms preserving order
+            unique_terms = list(dict.fromkeys(terms))
+            # Limit to top 15 terms to be concise
+            unique_terms = unique_terms[:15]
+            print(f"[INFO] Generating glossary for {len(unique_terms)} Greek terms...")
+            definitions = {}
+            missing_terms = []
+            for term in unique_terms:
+                # Check RAG corpus (normalize search query)
+                norm_term = unicodedata.normalize('NFC', term).strip()
+                rag_matches = self.rag_service.retrieve_grounding_list([norm_term], max_results=1)
+                if rag_matches:
+                    definitions[term] = rag_matches[0]["definition"]
+                else:
+                    missing_terms.append(term)
+            # Generate remaining definitions with Groq in a single batch
+            if missing_terms:
+                groq_defs = self._generate_batch_explanations(missing_terms)
+                # Normalize groq keys for matching (NFC, trimmed, lower-cased)
+                normalized_groq_defs = {}
+                for k, v in groq_defs.items():
+                    nk = unicodedata.normalize('NFC', k).strip().lower()
+                    normalized_groq_defs[nk] = v
+                # Assign matching definitions
+                for term in missing_terms:
+                    nt = unicodedata.normalize('NFC', term).strip().lower()
+                    if nt in normalized_groq_defs:
+                        definitions[term] = normalized_groq_defs[nt]
+                    else:
+                        # Case/accent insensitive backup match (in case Groq stripped accents):
+                        # decompose to NFD and drop combining marks (category 'Mn')
+                        import unicodedata as ud
+                        def strip_accents(s):
+                            return "".join(c for c in ud.normalize('NFD', s) if ud.category(c) != 'Mn')
+                        stripped_t = strip_accents(nt)
+                        for gk, gv in normalized_groq_defs.items():
+                            if strip_accents(gk) == stripped_t:
+                                definitions[term] = gv
+                                break
+            # Emit one entry per term, with a generic fallback for anything still undefined
+            for term in unique_terms:
+                definition = definitions.get(term)
+                if not definition:
+                    definition = f"Ancient Greek lexical term. Characterized by specific diacritics and phonological values."
+                items.append(f"{term}: {definition}")
+        # 2. Add significant paleographical/character markers found in the text if they are in the references
+        notes = self.references.get("greek_symbol_notes", {}) or {}
+        seen_chars = set()
+        char_items = []
+        for ch in greek_text:
+            if ch in notes and ch not in seen_chars:
+                seen_chars.add(ch)
+                char_items.append(f"Character '{ch}': {notes[ch]}")
+        # Limit character notes to prevent clutter
+        items.extend(char_items[:5])
+        # Format as list items with bullets
+        formatted_items = [f"- {item}" for item in items]
+        if not formatted_items:
+            default_hint = self.references.get("greek_hint",
+                "Ancient Greek script marker; values are determined by polytonic diacritical marks.")
+            formatted_items.append(f"- —: {default_hint}")
+        return formatted_items
+    def _build_meaning_box(self, terms, groq_detail):
+        """Build meaning interpretation box"""
+        intro_lines = [
+            "The lexical concentration suggests a connected passage with recurring words or themes, consistent with Greek manuscript traditions.",
+            "Scribal features such as accents/breathings, abbreviations, and marginal cues guide reading and assist with dating and genre identification."
+        ]
+        points = [
+            "• Presence of nomina sacra, lection signs, or ekphonetic marks indicates liturgical usage; scholia imply classroom or commentary context.",
+            "• Orthographic variation (e.g., iotacism) and common ligatures inform palaeographic placement and regional practice.",
+        ]
+        if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
+            points.append(groq_detail.strip())
+        return {
+            "title": "Possible meaning:",
+            "intro_lines": intro_lines,
+            "frequent_label": "Key terms noted",
+            "frequent": terms[:10],
+            "points": points
+        }
+    def generate_story(self, processed_result):
+        """Generate creative story for Greek text"""
+        greek_text = processed_result.get("text", "")
+        if not self.groq_client.is_available():
+            return "Groq client unavailable, cannot generate story."
+        styles = [
+            "as an epic poem told by a travelling rhapsode",
+            "as a prophecy inscribed on the Oracle at Delphi",
+            "as a philosophical dialogue in the Academy",
+            "as a myth recounted by ancient storytellers",
+            "as a recovered scroll from the Library of Alexandria",
+            "as a hymn sung in honor of the gods"
+        ]
+        import random
+        chosen_style = random.choice(styles)
+        seed = random.randint(1000, 9999)
+        prompt = (
+            f"The following ancient Greek text was found: {greek_text}\n\n"
+            f"Create a long, vivid, imaginative story from ancient Greek times "
+            f"based on this Greek text. Write it as one rich paragraph with "
+            f"much detail, mystery, and cultural atmosphere. At least 200 words.\n\n"
+            f"Creative seed: {seed}\n"
+            f"Write a detailed, imaginative myth-like story {chosen_style}. "
+            "Include multiple characters, rich imagery, and scenes. "
+            "Avoid repetition and keep it unpredictable."
+        )
+        system_prompt = "You are a learned ancient Greek storyteller and scholar of Hellenic culture."
+        story = self.groq_client.generate_response(
+            system_prompt=system_prompt,
+            user_prompt=prompt
+        )
+        if not story or is_gibberish(story):
+            return "Failed to create quality story; the ancient texts remain silent."
+        return story

processors/latin_processor.py ADDED Viewed

	@@ -0,0 +1,1281 @@

+import os
+import cv2
+import numpy as np
+import re
+import time
+from PIL import Image
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+import torch
+from .base_processor import BaseScriptProcessor
+from utils.text_utils import is_gibberish
+BACKEND_MODELS_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "models"))
+TRIDIS_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "tridis")
+TROCR_LATIN_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "trocr_latin")
+class LatinProcessor(BaseScriptProcessor):
+    def __init__(self, groq_client, references, clip_classifier):
+        super().__init__(groq_client, references, clip_classifier)
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.tridis_model = None
+        self.tridis_processor = None
+        self.tridis_available = False
+        self.trocr_latin_model = None
+        self.trocr_latin_processor = None
+        self.trocr_latin_available = False
+        self.active_style = "cursive"
+        self.active_model = "None"
+        self.setup_tesseract_fallback()
+        # Register for dynamic VRAM management
+        from utils.gpu_diagnostics import register_processor
+        register_processor("latin", self)
+    def setup_tridis_htr(self):
+        """Setup TRIDIS HTR model - BEST for medieval Latin manuscripts"""
+        try:
+            from utils.gpu_diagnostics import reclaim_vram_for
+            reclaim_vram_for("latin")
+            print("[INFO] Lazily loading TRIDIS HTR model for medieval Latin...")
+            print("[INFO] This model specializes in 13th-16th century manuscripts with automatic abbreviation expansion")
+            # TRIDIS model from Hugging Face - runs locally after download
+            self.tridis_processor = TrOCRProcessor.from_pretrained(
+                'magistermilitum/tridis_HTR',
+                cache_dir=TRIDIS_MODEL_DIR,
+                local_files_only=False  # Download first time, then cache locally
+            )
+            self.tridis_model = VisionEncoderDecoderModel.from_pretrained(
+                'magistermilitum/tridis_HTR',
+                cache_dir=TRIDIS_MODEL_DIR,
+                local_files_only=False
+            )
+            self.tridis_model.to(self.device)
+            self.tridis_model.eval()  # Put in evaluation mode
+            from utils.gpu_diagnostics import log_model_device
+            log_model_device("Latin TRIDIS HTR (Cursive)", self.device)
+            print(f"[INFO] TRIDIS HTR loaded successfully on {self.device}")
+            print("[INFO] Training: 245,000 lines of Latin/Old French/Old Spanish medieval manuscripts")
+            print("[INFO] Features: Automatic abbreviation expansion, named entity capitalization, cancellation markers")
+            self.tridis_available = True
+        except Exception as e:
+            print(f"[ERROR] TRIDIS HTR model failed to load: {e}")
+            print("[WARN] Falling back to Tesseract for basic Latin recognition...")
+            self.tridis_available = False
+    def setup_trocr_base_latin(self):
+        """Setup trocr-base-latin model - BEST for printed or carved classical Latin"""
+        try:
+            from utils.gpu_diagnostics import reclaim_vram_for
+            reclaim_vram_for("latin")
+            print("[INFO] Lazily loading trocr-base-latin model for printed/carved Latin...")
+            self.trocr_latin_processor = TrOCRProcessor.from_pretrained(
+                'magistermilitum/trocr-base-latin',
+                cache_dir=TROCR_LATIN_MODEL_DIR,
+                local_files_only=False
+            )
+            self.trocr_latin_model = VisionEncoderDecoderModel.from_pretrained(
+                'magistermilitum/trocr-base-latin',
+                cache_dir=TROCR_LATIN_MODEL_DIR,
+                local_files_only=False
+            )
+            self.trocr_latin_model.to(self.device)
+            self.trocr_latin_model.eval()  # Put in evaluation mode
+            from utils.gpu_diagnostics import log_model_device
+            log_model_device("Latin TrOCR (Printed)", self.device)
+            self.trocr_latin_available = True
+            print(f"[INFO] trocr-base-latin loaded successfully on {self.device}")
+        except Exception as e:
+            print(f"[WARN] magistermilitum/trocr-base-latin model failed to load ({e}). Trying public fallback 'microsoft/trocr-base-printed'...")
+            try:
+                # Free VRAM again in case partial allocation left residue
+                reclaim_vram_for("latin")
+                self.trocr_latin_processor = TrOCRProcessor.from_pretrained(
+                    'microsoft/trocr-base-printed',
+                    cache_dir=TROCR_LATIN_MODEL_DIR,
+                    local_files_only=False
+                )
+                self.trocr_latin_model = VisionEncoderDecoderModel.from_pretrained(
+                    'microsoft/trocr-base-printed',
+                    cache_dir=TROCR_LATIN_MODEL_DIR,
+                    local_files_only=False
+                )
+                self.trocr_latin_model.to(self.device)
+                self.trocr_latin_model.eval()  # Put in evaluation mode
+                from utils.gpu_diagnostics import log_model_device
+                log_model_device("Latin TrOCR (Printed Fallback)", self.device)
+                self.trocr_latin_available = True
+                print(f"[INFO] Public fallback microsoft/trocr-base-printed loaded successfully on {self.device}")
+            except Exception as ex:
+                print(f"[ERROR] All printed Latin models failed to load: {ex}")
+                self.trocr_latin_available = False
+    def setup_tesseract_fallback(self):
+        """Setup Tesseract as fallback for basic Latin recognition"""
+        try:
+            import pytesseract
+            # Test Tesseract availability
+            try:
+                version = pytesseract.get_tesseract_version()
+                print(f"[INFO] Tesseract fallback version: {version}")
+            except:
+                print("[INFO] Tesseract version check skipped")
+            self.ocr_configs = {
+                'medieval_extended': r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,;:!?()[]{}/-·&℞℟℣†‡¶§꜠꜡ꜢꜣꜤꜥꝀꝁꝐꝑꝒꝓꝔꝕꝖꝗꝘꝙꝚꝛꝜꝝꞀꞁꞂꞃ$',
+                'medieval_basic': r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,;:!?()[]{}/-',
+                'standard': r'--oem 3 --psm 6',
+                'single_line': r'--oem 3 --psm 7',
+                'single_word': r'--oem 3 --psm 8',
+                'auto': r'--oem 3 --psm 3'
+            }
+            self.tesseract_available = True
+            print("[INFO] Tesseract fallback configured with medieval symbol support")
+        except ImportError:
+            print("[ERROR] pytesseract not available")
+            self.tesseract_available = False
+        except Exception as e:
+            print(f"[WARN] Tesseract setup failed: {e}")
+            self.tesseract_available = False
+    def detect_script(self, image_path):
+        """Detection handled by Groq Vision classification"""
+        try:
+            if not self.tridis_available and not self.tesseract_available:
+                print("[ERROR] No OCR engines available for Latin processing")
+                return False, 0.0
+            method = "TRIDIS HTR (medieval specialist)" if self.tridis_available else "Tesseract fallback"
+            print(f"[INFO] Latin processor activated - Using {method}")
+            return True, 0.98 if self.tridis_available else 0.85
+        except Exception as e:
+            print(f"[ERROR] Latin detection failed: {e}")
+            return False, 0.0
+    def extract_text(self, image_path):
+        """Extract text using dual-mode routing: trocr-base-latin for printed, tridis_HTR for cursive"""
+        try:
+            start_time = time.time()
+            # Step 1: Detect script style
+            style = self.layout_parser.detect_writing_style(image_path, self.clip_classifier)
+            print(f"[INFO] Latin writing style detected: {style.upper()}")
+            primary_text = ""
+            fallback_text = ""
+            # Ensure the required model is loaded dynamically
+            if style == "printed":
+                if self.trocr_latin_model is None:
+                    self.setup_trocr_base_latin()
+                else:
+                    from utils.gpu_diagnostics import reclaim_vram_for
+                    reclaim_vram_for("latin")
+                    if str(next(self.trocr_latin_model.parameters()).device) != str(self.device):
+                        print(f"[VRAM MANAGER] Activating Latin TrOCR (Printed) model on {self.device}...")
+                        self.trocr_latin_model.to(self.device)
+            else:
+                if self.tridis_model is None:
+                    self.setup_tridis_htr()
+                else:
+                    from utils.gpu_diagnostics import reclaim_vram_for
+                    reclaim_vram_for("latin")
+                    if str(next(self.tridis_model.parameters()).device) != str(self.device):
+                        print(f"[VRAM MANAGER] Activating Latin TRIDIS HTR (Cursive) model on {self.device}...")
+                        self.tridis_model.to(self.device)
+            if style == "printed" and self.trocr_latin_available:
+                print("[INFO] Routing to printed/carved Latin model (trocr-base-latin)...")
+                primary_text = self._extract_with_trocr_base_latin(image_path)
+                if primary_text and self._validate_latin_text(primary_text, style):
+                    processing_time = time.time() - start_time
+                    print(f"[SUCCESS] Routed to trocr-base-latin and completed in {processing_time:.2f}s")
+                    self.active_style = "printed"
+                    self.active_model = "trocr-base-latin"
+                    return primary_text
+                else:
+                    print("[WARN] trocr-base-latin returned poor quality result, trying TRIDIS HTR fallback...")
+                    if self.tridis_model is None:
+                        self.setup_tridis_htr()
+                    if self.tridis_available:
+                        fallback_text = self._extract_with_tridis_htr(image_path)
+            else:  # cursive / manuscript
+                print("[INFO] Routing to medieval manuscript model (tridis_HTR)...")
+                if self.tridis_available:
+                    primary_text = self._extract_with_tridis_htr(image_path)
+                    if primary_text and self._validate_latin_text(primary_text, style):
+                        processing_time = time.time() - start_time
+                        print(f"[SUCCESS] Routed to tridis_HTR and completed in {processing_time:.2f}s")
+                        self.active_style = "cursive"
+                        self.active_model = "tridis_HTR"
+                        return primary_text
+                    else:
+                        print("[WARN] TRIDIS HTR returned poor quality result, trying trocr-base-latin fallback...")
+                        if self.trocr_latin_model is None:
+                            self.setup_trocr_base_latin()
+                        if self.trocr_latin_available:
+                            fallback_text = self._extract_with_trocr_base_latin(image_path)
+            # Step 2: Check fallback text from the other model
+            if fallback_text and self._validate_latin_text(fallback_text, "printed" if style == "cursive" else "cursive"):
+                processing_time = time.time() - start_time
+                print(f"[SUCCESS] Fallback model transcription successful in {processing_time:.2f}s")
+                self.active_style = "printed" if style == "cursive" else "cursive"
+                self.active_model = "trocr-base-latin" if style == "cursive" else "tridis_HTR"
+                return fallback_text
+            # Step 3: Tesseract fallback
+            if self.tesseract_available:
+                print("[INFO] Neural models failed. Processing with Tesseract fallback...")
+                tesseract_text = self._extract_with_tesseract_enhanced(image_path)
+                if tesseract_text and self._validate_latin_text(tesseract_text, "any"):
+                    processing_time = time.time() - start_time
+                    print(f"[SUCCESS] Tesseract fallback completed in {processing_time:.2f}s")
+                    self.active_style = "printed"  # Tesseract works best on printed
+                    self.active_model = "Tesseract OCR"
+                    return tesseract_text
+                else:
+                    print("[WARN] Tesseract returned poor quality result, trying layout-aware segmentation fallback...")
+                    # Method 3: Layout-aware line segment fallback
+                    layout_aware_text = self._extract_layout_aware_ocr(image_path)
+                    if layout_aware_text and self._validate_latin_text(layout_aware_text, "any"):
+                        processing_time = time.time() - start_time
+                        print(f"[SUCCESS] Layout-aware OCR completed in {processing_time:.2f}s")
+                        self.active_style = "printed"
+                        self.active_model = "Tesseract Layout-Aware"
+                        return layout_aware_text
+            print("[ERROR] All OCR methods failed or returned poor quality results")
+            self.active_style = "unknown"
+            self.active_model = "None"
+            return "No readable Latin text detected with sufficient confidence"
+        except Exception as e:
+            print(f"[ERROR] Latin text extraction failed: {e}")
+            self.active_style = "error"
+            self.active_model = "None"
+            return f"Error during text extraction: {str(e)}"
+    def _extract_with_trocr_base_latin(self, image_path):
+        """Extract text using trocr-base-latin - SPECIALIZED for printed/carved Latin"""
+        if self.trocr_latin_model is None:
+            self.setup_trocr_base_latin()
+        else:
+            from utils.gpu_diagnostics import reclaim_vram_for
+            reclaim_vram_for("latin")
+            if str(next(self.trocr_latin_model.parameters()).device) != str(self.device):
+                print(f"[VRAM MANAGER] Activating Latin TrOCR model on {self.device}...")
+                self.trocr_latin_model.to(self.device)
+        if not getattr(self, 'trocr_latin_available', False) or self.trocr_latin_model is None:
+            return ""
+        try:
+            image = Image.open(image_path).convert("RGB")
+            print(f"[INFO] Processing image with trocr-base-latin: {image.size[0]}x{image.size[1]} pixels")
+            # Since trocr models are line-level OCR models, segment into lines first
+            layout = self.layout_parser.analyze_layout(image_path)
+            crops = self.layout_parser.crop_lines(image_path, layout)
+            if crops and len(crops) > 1:
+                print(f"[INFO] Image contains multiple lines ({len(crops)}). Running line-by-line trocr-base-latin...")
+                line_texts = []
+                for idx, crop in enumerate(crops):
+                    text = self._ocr_single_crop_with_trocr_base_latin(crop)
+                    if text:
+                        line_texts.append(text)
+                return "\n".join(line_texts)
+            else:
+                print("[INFO] Single line detected or layout parser returned no lines. Processing full image...")
+                return self._ocr_single_crop_with_trocr_base_latin(image)
+        except Exception as e:
+            print(f"[ERROR] trocr-base-latin extraction failed: {e}")
+            return ""
+    def _ocr_single_crop_with_trocr_base_latin(self, crop_image):
+        """Helper to run trocr-base-latin inference on a single image crop"""
+        try:
+            pixel_values = self.trocr_latin_processor(
+                images=crop_image,
+                return_tensors="pt"
+            ).pixel_values.to(self.device)
+            with torch.inference_mode():
+                generated_ids = self.trocr_latin_model.generate(
+                    pixel_values,
+                    max_length=512,
+                    num_beams=4,
+                    early_stopping=True
+                )
+            text = self.trocr_latin_processor.batch_decode(
+                generated_ids,
+                skip_special_tokens=True
+            )[0]
+            text = ' '.join(text.split())
+            return text.strip()
+        except Exception as e:
+            print(f"[ERROR] Single line OCR with trocr-base-latin failed: {e}")
+            return ""
+    def _extract_with_tridis_htr(self, image_path):
+        """Extract text using TRIDIS HTR - SPECIALIZED for medieval Latin manuscripts.
+        Uses layout-aware line segmentation so multi-line documents are fully transcribed."""
+        if self.tridis_model is None:
+            self.setup_tridis_htr()
+        else:
+            from utils.gpu_diagnostics import reclaim_vram_for
+            reclaim_vram_for("latin")
+            if str(next(self.tridis_model.parameters()).device) != str(self.device):
+                print(f"[VRAM MANAGER] Activating Latin TRIDIS model on {self.device}...")
+                self.tridis_model.to(self.device)
+        if not getattr(self, 'tridis_available', False) or self.tridis_model is None:
+            return ""
+        try:
+            # Load and validate image
+            image = Image.open(image_path).convert("RGB")
+            print(f"[INFO] Processing image with TRIDIS HTR: {image.size[0]}x{image.size[1]} pixels")
+            # Use layout parser to segment into individual lines
+            layout = self.layout_parser.analyze_layout(image_path)
+            crops = self.layout_parser.crop_lines(image_path, layout)
+            if crops and len(crops) > 1:
+                # Cap lines to prevent timeout on very large documents (CPU inference)
+                MAX_LINES = 50
+                total_detected = len(crops)
+                if len(crops) > MAX_LINES:
+                    print(f"[INFO] Layout parser detected {total_detected} text lines. Capping to {MAX_LINES} for performance.")
+                    crops = crops[:MAX_LINES]
+                else:
+                    print(f"[INFO] Layout parser detected {total_detected} text lines. Running line-by-line TRIDIS HTR...")
+                line_texts = []
+                for idx, crop in enumerate(crops):
+                    # Preprocess each line crop for medieval manuscripts
+                    enhanced_crop = self._preprocess_for_medieval_manuscript(crop)
+                    text = self._ocr_single_crop_with_tridis(enhanced_crop)
+                    if text:
+                        line_texts.append(text)
+                        print(f"  [LINE {idx+1}/{len(crops)}] {text[:80]}...")
+                if line_texts:
+                    full_text = "\n".join(line_texts)
+                    # Post-process medieval abbreviations, corrections, and formatting
+                    processed_text = self._post_process_medieval_text(full_text)
+                    char_count = len(processed_text)
+                    word_count = len(processed_text.split())
+                    print(f"[INFO] TRIDIS HTR extracted (multi-line): {char_count} characters, {word_count} words from {len(line_texts)} lines")
+                    medieval_features = self._analyze_medieval_features(processed_text)
+                    if medieval_features:
+                        print(f"[INFO] Medieval features detected: {', '.join(medieval_features)}")
+                    return processed_text.strip()
+            # Single line or no layout detected — process full image
+            print("[INFO] Single line or no layout segmentation. Processing full image with TRIDIS HTR...")
+            enhanced_image = self._preprocess_for_medieval_manuscript(image)
+            # Process with TRIDIS HTR
+            print("[INFO] Running TRIDIS HTR inference...")
+            pixel_values = self.tridis_processor(
+                images=enhanced_image,
+                return_tensors="pt"
+            ).pixel_values.to(self.device)
+            # Generate text with parameters optimized for medieval manuscripts
+            with torch.inference_mode():
+                generated_ids = self.tridis_model.generate(
+                    pixel_values,
+                    max_length=768,  # Longer sequences for medieval texts with abbreviations
+                    num_beams=6,     # Higher quality beam search for historical accuracy
+                    early_stopping=True,
+                    do_sample=False,
+                    repetition_penalty=1.15,  # Avoid repetition common in medieval texts
+                    length_penalty=0.8,       # Don't penalize longer expansions
+                    no_repeat_ngram_size=2    # Avoid immediate repetitions
+                )
+            # Decode the generated text
+            generated_text = self.tridis_processor.batch_decode(
+                generated_ids,
+                skip_special_tokens=True
+            )[0]
+            # Post-process medieval abbreviations, corrections, and formatting
+            processed_text = self._post_process_medieval_text(generated_text)
+            # Log extraction results
+            char_count = len(processed_text)
+            word_count = len(processed_text.split())
+            print(f"[INFO] TRIDIS HTR extracted: {char_count} characters, {word_count} words")
+            # Detect medieval features
+            medieval_features = self._analyze_medieval_features(processed_text)
+            if medieval_features:
+                print(f"[INFO] Medieval features detected: {', '.join(medieval_features)}")
+            return processed_text.strip()
+        except Exception as e:
+            print(f"[ERROR] TRIDIS HTR extraction failed: {e}")
+            return ""
+    def _ocr_single_crop_with_tridis(self, crop_image):
+        """Helper to run TRIDIS HTR inference on a single line crop image"""
+        try:
+            pixel_values = self.tridis_processor(
+                images=crop_image,
+                return_tensors="pt"
+            ).pixel_values.to(self.device)
+            with torch.inference_mode():
+                generated_ids = self.tridis_model.generate(
+                    pixel_values,
+                    max_length=768,
+                    num_beams=6,
+                    early_stopping=True,
+                    do_sample=False,
+                    repetition_penalty=1.15,
+                    length_penalty=0.8,
+                    no_repeat_ngram_size=2
+                )
+            text = self.tridis_processor.batch_decode(
+                generated_ids,
+                skip_special_tokens=True
+            )[0]
+            text = ' '.join(text.split())
+            return text.strip()
+        except Exception as e:
+            print(f"[ERROR] Single line OCR with TRIDIS failed: {e}")
+            return ""
+    def _preprocess_for_medieval_manuscript(self, image):
+        """Enhanced preprocessing specifically optimized for medieval manuscripts"""
+        try:
+            print("[INFO] Applying medieval manuscript preprocessing...")
+            # Convert to OpenCV format
+            image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+            gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
+            # Step 1: Handle parchment/paper background variations
+            # CLAHE for local contrast enhancement (handles uneven illumination)
+            clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8,8))
+            contrast_enhanced = clahe.apply(gray)
+            # Step 2: Gentle denoising to preserve medieval letterforms and ink variations
+            # Bilateral filter preserves edges while reducing noise
+            denoised = cv2.bilateralFilter(contrast_enhanced, 7, 80, 80)
+            # Step 3: Enhance faded ink while preserving original stroke width
+            # Subtle sharpening kernel
+            sharpen_kernel = np.array([
+                [-0.5, -1, -0.5],
+                [-1,   6, -1  ],
+                [-0.5, -1, -0.5]
+            ])
+            sharpened = cv2.filter2D(denoised, -1, sharpen_kernel)
+            # Step 4: Normalize intensity range for optimal TRIDIS input
+            normalized = cv2.normalize(sharpened, None, 0, 255, cv2.NORM_MINMAX)
+            # Convert back to PIL format and ensure it is RGB mode
+            processed_image = Image.fromarray(normalized).convert("RGB")
+            print("[INFO] Medieval preprocessing completed: contrast enhanced, denoised, sharpened")
+            return processed_image
+        except Exception as e:
+            print(f"[WARN] Medieval preprocessing failed: {e}, using original image")
+            return image
+    def _post_process_medieval_text(self, text):
+        """Post-process text from TRIDIS HTR with medieval-specific corrections"""
+        try:
+            if not text:
+                return text
+            print("[INFO] Post-processing TRIDIS HTR output for medieval features...")
+            processed = text
+            # Handle TRIDIS cancellation/correction markers
+            # TRIDIS uses $word$ to mark cancelled/corrected text
+            import re
+            # Count cancellations before processing
+            cancellation_count = processed.count('$') // 2
+            # Convert $word$ to editorial brackets [word] for scholarly display
+            processed = re.sub(r'\$([^$]*)\$', r'[\1]', processed)
+            if cancellation_count > 0:
+                print(f"[INFO] Processed {cancellation_count} scribal corrections/cancellations")
+            # Clean up multiple spaces and normalize whitespace
+            processed = ' '.join(processed.split())
+            # Detect and log TRIDIS abbreviation expansions
+            # Common medieval abbreviations that TRIDIS expands automatically
+            medieval_expansions = {
+                'domini': 'dñi/dni/dom̃',
+                'facimus': 'facim̃/facimꝰ',
+                'quod': 'qd/q̃d',
+                'enim': 'enim̃/en̄',
+                'pro': 'ꝓ/p̃',
+                'et': '⁊/et̃',
+                'cum': 'cũ/cum̃',
+                'per': 'p̃/ꝑ',
+                'sunt': 'sũt/sunt̃',
+                'omnia': 'om̃ia/omn̄a'
+            }
+            expansions_found = []
+            for expansion, abbreviations in medieval_expansions.items():
+                if expansion in processed.lower():
+                    expansions_found.append(f"{abbreviations}→{expansion}")
+            if expansions_found:
+                print(f"[INFO] TRIDIS expanded abbreviations: {', '.join(expansions_found[:5])}")
+                if len(expansions_found) > 5:
+                    print(f"[INFO] ... and {len(expansions_found) - 5} more abbreviations")
+            # Detect capitalization patterns (TRIDIS capitalizes named entities)
+            capitalized_words = re.findall(r'\b[A-Z][a-z]+', processed)
+            if capitalized_words:
+                unique_caps = list(set(capitalized_words))
+                print(f"[INFO] Named entities capitalized: {', '.join(unique_caps[:5])}")
+                if len(unique_caps) > 5:
+                    print(f"[INFO] ... and {len(unique_caps) - 5} more entities")
+            return processed
+        except Exception as e:
+            print(f"[WARN] Medieval post-processing failed: {e}")
+            return text
+    def _analyze_medieval_features(self, text):
+        """Analyze and identify medieval manuscript features in the text"""
+        features = []
+        if not text:
+            return features
+        try:
+            # Cancellation markers
+            if '[' in text and ']' in text:
+                features.append("scribal corrections")
+            # Expanded abbreviations
+            medieval_words = ['domini', 'facimus', 'quod', 'enim', 'pro', 'cum', 'per', 'sunt', 'omnia']
+            found_expansions = [word for word in medieval_words if word in text.lower()]
+            if found_expansions:
+                features.append(f"abbreviation expansions ({len(found_expansions)})")
+            # Named entity capitalization
+            import re
+            caps_count = len(re.findall(r'\b[A-Z][a-z]+', text))
+            if caps_count > 0:
+                features.append(f"capitalized entities ({caps_count})")
+            # Medieval punctuation patterns
+            if '.' in text or ',' in text or ':' in text:
+                features.append("punctuation normalization")
+            # Special medieval characters
+            medieval_chars = sum(1 for c in text if c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§")
+            if medieval_chars > 0:
+                features.append(f"medieval symbols ({medieval_chars})")
+        except Exception as e:
+            print(f"[WARN] Medieval feature analysis failed: {e}")
+        return features
+    def _extract_with_tesseract_enhanced(self, image_path):
+        """Enhanced Tesseract extraction with multiple configurations"""
+        try:
+            import pytesseract
+            image = Image.open(image_path).convert("RGB")
+            # Multiple preprocessing approaches
+            preprocessed_images = {
+                'enhanced': self._preprocess_for_tesseract_enhanced(image),
+                'basic': self._preprocess_for_tesseract_basic(image),
+                'original': image
+            }
+            best_text = ""
+            best_score = 0
+            best_config = ""
+            best_preprocessing = ""
+            # Try different combinations of preprocessing and OCR configurations
+            for prep_name, prep_image in preprocessed_images.items():
+                for config_name, config in self.ocr_configs.items():
+                    try:
+                        # Try with Latin language first
+                        text = pytesseract.image_to_string(
+                            prep_image,
+                            lang='lat',
+                            config=config
+                        ).strip()
+                        # If Latin fails or produces poor results, try English
+                        if not text or len(text) < 5:
+                            text = pytesseract.image_to_string(
+                                prep_image,
+                                lang='eng',
+                                config=config
+                            ).strip()
+                        # Score the result
+                        score = self._score_tesseract_result(text)
+                        if text and score > best_score:
+                            best_text = text
+                            best_score = score
+                            best_config = config_name
+                            best_preprocessing = prep_name
+                    except Exception as e:
+                        continue  # Skip failed configurations
+            if best_text:
+                print(f"[INFO] Best Tesseract result: {best_preprocessing} + {best_config} (score: {best_score:.3f})")
+                return self._post_process_tesseract_text(best_text)
+            return ""
+        except Exception as e:
+            print(f"[ERROR] Enhanced Tesseract extraction failed: {e}")
+            return ""
+    def _extract_layout_aware_ocr(self, image_path):
+        """Extract text by segmenting the page layout into lines first for improved readability order"""
+        try:
+            import pytesseract
+            print("[INFO] Running layout-aware line segmentation...")
+            layout = self.layout_parser.analyze_layout(image_path)
+            crops = self.layout_parser.crop_lines(image_path, layout)
+            if not crops:
+                print("[WARN] Layout parser returned no line crops")
+                return ""
+            print(f"[INFO] Layout-aware line parser cropped {len(crops)} lines")
+            line_texts = []
+            for idx, crop in enumerate(crops):
+                # Enhance line crop for OCR
+                crop_cv = cv2.cvtColor(np.array(crop), cv2.COLOR_RGB2BGR)
+                gray = cv2.cvtColor(crop_cv, cv2.COLOR_BGR2GRAY)
+                clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(4,4))
+                enhanced = clahe.apply(gray)
+                crop_pil = Image.fromarray(enhanced)
+                # Single line OCR configuration
+                config = '--oem 3 --psm 7'
+                # Try Latin OCR first
+                text = pytesseract.image_to_string(
+                    crop_pil,
+                    lang='lat',
+                    config=config
+                ).strip()
+                # Try English fallback
+                if not text or len(text) < 3:
+                    text = pytesseract.image_to_string(
+                        crop_pil,
+                        lang='eng',
+                        config=config
+                    ).strip()
+                if text:
+                    line_texts.append(self._post_process_tesseract_text(text))
+            return "\n".join(line_texts)
+        except Exception as e:
+            print(f"[WARN] Layout aware Latin OCR failed: {e}")
+            return ""
+    def _preprocess_for_tesseract_enhanced(self, image):
+        """Enhanced preprocessing for Tesseract OCR"""
+        try:
+            image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+            gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
+            # More aggressive enhancement for Tesseract
+            clahe = cv2.createCLAHE(clipLimit=4.0, tileGridSize=(8,8))
+            enhanced = clahe.apply(gray)
+            # Morphological operations to clean up characters
+            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
+            cleaned = cv2.morphologyEx(enhanced, cv2.MORPH_CLOSE, kernel)
+            return Image.fromarray(cleaned)
+        except Exception as e:
+            print(f"[WARN] Enhanced Tesseract preprocessing failed: {e}")
+            return image
+    def _preprocess_for_tesseract_basic(self, image):
+        """Basic preprocessing for Tesseract OCR"""
+        try:
+            image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+            gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
+            # Simple contrast enhancement
+            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
+            enhanced = clahe.apply(gray)
+            return Image.fromarray(enhanced)
+        except Exception as e:
+            return image
+    def _score_tesseract_result(self, text):
+        """Score Tesseract OCR result quality"""
+        if not text or len(text.strip()) < 2:
+            return 0.0
+        score = 0.0
+        words = text.split()
+        # Base length bonus
+        score += min(len(words) / 15.0, 0.25)
+        # Latin character ratio
+        latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
+        if len(text) > 0:
+            latin_ratio = latin_chars / len(text)
+            score += latin_ratio * 0.35
+        # Word formation bonus
+        if len(words) > 1:
+            score += 0.2
+        # Common Latin words bonus
+        common_latin = ['et', 'in', 'de', 'ad', 'cum', 'pro', 'per', 'ex', 'ab', 'post', 'ante', 'inter']
+        latin_matches = sum(1 for word in words if word.lower() in common_latin)
+        if latin_matches > 0:
+            score += latin_matches * 0.05
+        # Medieval symbols bonus
+        medieval_symbols = ['꜠', '꜡', 'ꜣ', 'ꜥ', 'ꝁ', 'ꝑ', 'ꝛ', 'ꞁ', 'ꞃ', '℞', '℟', '℣', '†', '‡', '¶', '§']
+        symbol_count = sum(1 for symbol in medieval_symbols if symbol in text)
+        if symbol_count > 0:
+            score += 0.15
+        # Penalize excessive garbage characters
+        garbage_chars = sum(1 for c in text if not c.isalnum() and c not in " .,;:!?()[]{}/-·&℞℟℣†‡¶§꜠꜡ꜢꜣꜤꜥꝀꝁ")
+        if len(text) > 0:
+            garbage_ratio = garbage_chars / len(text)
+            score -= garbage_ratio * 0.3
+        return max(0.0, min(1.0, score))
+    def _post_process_tesseract_text(self, text):
+        """Post-process Tesseract OCR result"""
+        try:
+            # Clean up common OCR errors
+            corrections = {
+                'rn': 'm',
+                'cl': 'd',
+                '|': 'I',
+                '°': 'o',
+                '¢': 'c',
+                '£': 'E'
+            }
+            processed = text
+            for wrong, correct in corrections.items():
+                processed = processed.replace(wrong, correct)
+            # Normalize whitespace
+            processed = ' '.join(processed.split())
+            return processed
+        except Exception as e:
+            print(f"[WARN] Tesseract post-processing failed: {e}")
+            return text
+    def _validate_latin_text(self, text, style="any"):
+        """Validate text with criteria appropriate for classical/printed or medieval Latin"""
+        if not text or len(text.strip()) < 3:
+            return False
+        try:
+            # Count Latin characters
+            latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
+            total_chars = len(text.replace(' ', ''))
+            if total_chars == 0:
+                return False
+            latin_ratio = latin_chars / max(total_chars, 1)
+            # For printed/classical Latin, we require a high ratio of standard alphabetical letters
+            if style == "printed":
+                return latin_chars >= 5 and latin_ratio >= 0.6
+            # For cursive/medieval Latin, we can be more generous and include medieval symbol weight
+            medieval_symbols = sum(1 for c in text if c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§[]")
+            medieval_words = ['domini', 'facimus', 'quod', 'enim', 'pro', 'cum', 'per', 'sunt']
+            word_bonus = sum(3 for word in medieval_words if word in text.lower())
+            total_meaningful = latin_chars + medieval_symbols + word_bonus
+            meaningful_ratio = total_meaningful / max(total_chars, 1)
+            if total_meaningful >= 10:
+                return True
+            elif meaningful_ratio >= 0.6:
+                return True
+            elif total_meaningful >= 5 and meaningful_ratio >= 0.3:
+                return True
+            else:
+                return False
+        except Exception as e:
+            print(f"[WARN] Text validation failed: {e}")
+            return len(text.strip()) >= 5  # Fallback validation
+    def process_text(self, latin_text):
+        """Process extracted Latin text with comprehensive TRIDIS-aware analysis"""
+        if not latin_text:
+            return {"text": "", "symbols": [], "char_analysis": {}, "validation": {}}
+        print("[INFO] Processing Latin text with medieval manuscript analysis...")
+        # Extract symbols including medieval markers and corrections
+        symbols = ''.join(filter(lambda x: x.isalnum() or x in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§$[]", latin_text))
+        # Comprehensive medieval character analysis
+        medieval_symbols = [c for c in latin_text if c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§"]
+        correction_markers = latin_text.count('[') + latin_text.count('$')
+        # Detect expanded abbreviations
+        medieval_abbreviations = ['domini', 'facimus', 'pro', 'quod', 'enim', 'cum', 'per', 'sunt', 'omnia']
+        expansions_found = [word for word in medieval_abbreviations if word in latin_text.lower()]
+        # Count capitalized entities (TRIDIS feature)
+        import re
+        capitalized_entities = re.findall(r'\b[A-Z][a-z]+', latin_text)
+        unique_entities = list(set(capitalized_entities))
+        # Comprehensive character analysis
+        char_analysis = {
+            "total_chars": len(latin_text),
+            "alpha_chars": sum(c.isalpha() for c in latin_text),
+            "unique_chars": len(set(latin_text)),
+            "word_count": len(latin_text.split()),
+            "medieval_symbols": len(medieval_symbols),
+            "medieval_symbol_types": list(set(medieval_symbols)),
+            "abbreviation_expansions": expansions_found,
+            "expansion_count": len(expansions_found),
+            "correction_markers": correction_markers,
+            "capitalized_entities": unique_entities,
+            "entity_count": len(unique_entities),
+            "avg_word_length": sum(len(word) for word in latin_text.split()) / max(1, len(latin_text.split()))
+        }
+        # Enhanced validation with medieval features
+        validation = {
+            "latin_ratio": sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in latin_text) / max(1, len(latin_text)),
+            "quality_score": self._calculate_comprehensive_quality_score(latin_text),
+            "ocr_method": getattr(self, 'active_model', "TRIDIS HTR (Medieval Manuscript Specialist)" if self.tridis_available else "Tesseract OCR"),
+            "model_specialization": "General Latin text" if getattr(self, 'active_style', '') == 'printed' else ("13th-16th century manuscripts" if self.tridis_available else "General Latin text"),
+            "medieval_features_detected": bool(medieval_symbols or expansions_found or correction_markers),
+            "tridis_used": getattr(self, 'active_model', '') == 'tridis_HTR',
+            "manuscript_period": "Classical/Roman Monumental" if getattr(self, 'active_style', '') == 'printed' else ("Late Medieval (13th-16th centuries)" if (medieval_symbols or expansions_found) else "Classical/Modern"),
+            "text_type": "classical_inscription" if getattr(self, 'active_style', '') == 'printed' else self._determine_text_type(latin_text),
+            "abbreviations_expanded": len(expansions_found) > 0,
+            "named_entities_detected": len(unique_entities) > 0,
+            "scribal_corrections_found": correction_markers > 0,
+            "confidence_level": self._determine_confidence_level(latin_text),
+            "writing_style": getattr(self, 'active_style', 'cursive')
+        }
+        return {
+            "text": latin_text,
+            "symbols": symbols,
+            "char_analysis": char_analysis,
+            "validation": validation
+        }
+    def _calculate_comprehensive_quality_score(self, text):
+        """Calculate comprehensive quality score with medieval bonuses"""
+        if not text:
+            return 0.0
+        score = 0.0
+        words = text.split()
+        # Base metrics
+        score += min(len(words) / 15.0, 0.2)  # Length bonus (max 0.2)
+        # Latin character ratio
+        latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
+        score += (latin_chars / max(1, len(text))) * 0.25
+        # TRIDIS Medieval bonuses (only if TRIDIS was used)
+        if self.tridis_available and getattr(self, 'active_model', '') == 'tridis_HTR':
+            # Expanded abbreviations (major quality indicator)
+            medieval_expansions = ['domini', 'facimus', 'pro', 'quod', 'enim', 'cum', 'per', 'sunt']
+            expansion_count = sum(1 for exp in medieval_expansions if exp in text.lower())
+            score += min(expansion_count * 0.05, 0.2)  # Max 0.2 bonus
+            # Named entity capitalization (TRIDIS feature)
+            import re
+            caps_count = len(re.findall(r'\b[A-Z][a-z]+', text))
+            score += min(caps_count * 0.02, 0.15)  # Max 0.15 bonus
+            # Correction markers (authenticity indicator)
+            corrections = text.count('[') + text.count('$')
+            score += min(corrections * 0.03, 0.1)  # Max 0.1 bonus
+        # Medieval symbols (regardless of OCR method)
+        medieval_symbols = ['꜠', '꜡', 'ꜣ', 'ꜥ', 'ꝁ', 'ꝑ', 'ꝛ', 'ꞁ', 'ꞃ', '℞', '℟', '℣', '†', '‡', '¶', '§']
+        symbol_count = sum(1 for symbol in medieval_symbols if symbol in text)
+        score += min(symbol_count * 0.04, 0.15)  # Max 0.15 bonus
+        # Word formation
+        if len(words) > 1:
+            score += 0.1
+        # Common Latin words
+        common_latin = ['et', 'in', 'de', 'ad', 'cum', 'pro', 'per', 'ex', 'ab']
+        latin_matches = sum(1 for word in words if word.lower() in common_latin)
+        score += min(latin_matches * 0.02, 0.1)
+        return max(0.0, min(1.0, score))
+    def _determine_text_type(self, text):
+        """Determine the type of Latin text based on features"""
+        if not text:
+            return "unknown"
+        # Medieval indicators
+        medieval_expansions = ['domini', 'facimus', 'quod', 'enim']
+        has_expansions = any(exp in text.lower() for exp in medieval_expansions)
+        has_corrections = '[' in text or '$' in text
+        has_medieval_symbols = any(c in text for c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§")
+        if has_expansions and has_corrections:
+            return "medieval_documentary_manuscript"
+        elif has_expansions or has_medieval_symbols:
+            return "medieval_manuscript"
+        elif has_corrections:
+            return "manuscript_with_corrections"
+        else:
+            return "classical_latin_text"
+    def _determine_confidence_level(self, text):
+        """Determine confidence level based on text characteristics"""
+        score = self._calculate_comprehensive_quality_score(text)
+        if score >= 0.8:
+            return "Very High"
+        elif score >= 0.6:
+            return "High"
+        elif score >= 0.4:
+            return "Medium"
+        elif score >= 0.2:
+            return "Low"
+        else:
+            return "Very Low"
+    def generate_historical_context(self, processed_result):
+        """Generate comprehensive historical context for Latin text"""
+        latin_text = processed_result.get("text", "")
+        groq_detail = self._generate_groq_context(latin_text)
+        # Build references using words/symbols in Latin text
+        words = re.findall(r'\w+', latin_text) if latin_text else []
+        query_terms = list(words)
+        if latin_text:
+            query_terms.extend([char for char in latin_text if char.strip()])
+        refs = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)
+        return {
+            "uses_box": {
+                "title": "Medieval Latin manuscript analysis",
+                "items": self._build_uses_list(latin_text)
+            },
+            "meaning_box": self._build_enhanced_meaning_box(latin_text, groq_detail, processed_result),
+            "references": refs
+        }
+    def _generate_groq_context(self, latin_text):
+        """Generate contextual information using Groq with medieval awareness"""
+        if not self.groq_client.is_available():
+            return "(Groq unavailable) Historical context generation requires GROQ_API_KEY and groq package."
+        # Analyze medieval features for context
+        has_expansions = any(word in latin_text.lower() for word in ['domini', 'facimus', 'quod', 'enim'])
+        has_corrections = '[' in latin_text or '$' in latin_text
+        has_caps = any(c.isupper() for c in latin_text)
+        if is_gibberish(latin_text):
+            prompt = (
+                "The following sequence appears to be fragmentary medieval Latin text, possibly with scribal abbreviations or corrections. "
+                "Provide a concise, scholarly paragraph (6-10 sentences) covering possible meanings, historical context of medieval Latin manuscripts, "
+                "common abbreviation practices, and typical documentary uses in 13th-16th century Europe."
+            )
+        else:
+            context_note = ""
+            if has_expansions:
+                context_note += "The text contains expanded medieval abbreviations. "
+            if has_corrections:
+                context_note += "Scribal corrections or cancellations are present. "
+            if has_caps:
+                context_note += "Named entities appear to be properly capitalized. "
+            prompt = (
+                f"Analyze this medieval Latin text: {latin_text}\n\n"
+                f"Context: {context_note}This appears to be from a medieval manuscript (13th-16th centuries). "
+                f"Provide a scholarly paragraph (6-10 sentences) on its historical significance, cultural context, "
+                f"likely documentary purpose, and interpretations. Focus on medieval manuscript practices, "
+                f"legal/administrative contexts, and paleographic significance."
+            )
+        system_prompt = "You are a medieval Latin paleography specialist and historian. Provide accurate, concise scholarly analysis focusing on manuscript traditions, abbreviation practices, and documentary contexts of the late medieval period."
+        enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, latin_text)
+        return self.groq_client.generate_response(
+            system_prompt=enriched_system_prompt,
+            user_prompt=prompt
+        ) or "(Historical context unavailable due to Groq error)"
+    def _build_uses_list(self, latin_text):
+        """Build enhanced list of character uses with TRIDIS context"""
+        notes = self.references.get("latin_symbol_notes", {}) or {}
+        default_hint = self.references.get("latin_hint",
+            "Letters and symbols reflect phonetic values and scribal practices in medieval manuscripts.")
+        seen = set()
+        items = []
+        # Add TRIDIS-specific information for medieval features
+        tridis_notes = {
+            '[': "Editorial bracket indicating scribal correction or cancellation (TRIDIS transcription standard)",
+            '$': "Cancellation marker for struck-through text (TRIDIS notation)",
+        }
+        for ch in latin_text:
+            if ch in seen or not ch.strip():
+                continue
+            seen.add(ch)
+            # Check TRIDIS-specific notes first
+            if ch in tridis_notes:
+                note = tridis_notes[ch]
+            elif ch in notes:
+                note = notes[ch]
+            else:
+                note = default_hint
+            items.append(f"- {ch}: {note}")
+        if not items:
+            items.append("- —: " + default_hint)
+        # Limit to prevent overwhelming output
+        return items[:20]
+    def _build_enhanced_meaning_box(self, latin_text, groq_detail, processed_result):
+        """Build comprehensive meaning box with TRIDIS medieval analysis"""
+        char_analysis = processed_result.get("char_analysis", {})
+        validation = processed_result.get("validation", {})
+        # Enhanced introduction with TRIDIS context
+        processing_method = validation.get("ocr_method", "Unknown OCR")
+        text_type = validation.get("text_type", "unknown")
+        confidence = validation.get("confidence_level", "Unknown")
+        intro_lines = [
+            f"Text processed using {processing_method} with confidence level: {confidence}.",
+        ]
+        if self.tridis_available:
+            intro_lines.extend([
+                "TRIDIS HTR model trained on 245,000 lines of medieval manuscripts (13th-16th centuries).",
+                "Specializes in Latin, Old French, Old Spanish documentary texts with automatic abbreviation expansion."
+            ])
+        # Medieval features summary
+        medieval_features = []
+        expansion_count = char_analysis.get("expansion_count", 0)
+        if expansion_count > 0:
+            medieval_features.append(f"{expansion_count} abbreviation expansions")
+        correction_count = char_analysis.get("correction_markers", 0)
+        if correction_count > 0:
+            medieval_features.append(f"{correction_count} scribal corrections")
+        entity_count = char_analysis.get("entity_count", 0)
+        if entity_count > 0:
+            medieval_features.append(f"{entity_count} named entities")
+        if medieval_features:
+            intro_lines.append(f"Medieval features detected: {', '.join(medieval_features)}.")
+        # Key terms for frequent list
+        expansions = char_analysis.get("abbreviation_expansions", [])
+        entities = char_analysis.get("capitalized_entities", [])
+        frequent_terms = expansions + entities
+        if not frequent_terms:
+            frequent_terms = list(set(w for w in latin_text.split() if len(w) > 2))[:10]
+        # Enhanced analysis points
+        points = []
+        if self.tridis_available:
+            points.extend([
+                "• TRIDIS HTR provides semi-diplomatic transcription following scholarly editorial standards.",
+                "• Automatic abbreviation expansion: dom̃→domini, facimꝰ→facimus, ꝓ→pro, ⁊→et.",
+                "• Named entity capitalization and punctuation normalization applied."
+            ])
+        else:
+            points.append("��� Tesseract OCR provides basic Latin character recognition with limited medieval symbol support.")
+        if correction_count > 0:
+            points.append(f"• [{correction_count}] scribal corrections/cancellations indicate active manuscript editing process.")
+        if expansion_count > 0:
+            expansions_list = ", ".join(char_analysis.get("abbreviation_expansions", [])[:5])
+            points.append(f"• Expanded abbreviations suggest legal/administrative document: {expansions_list}.")
+        if validation.get("medieval_features_detected", False):
+            manuscript_period = validation.get("manuscript_period", "Medieval")
+            points.append(f"• {manuscript_period} characteristics indicate documentary manuscript tradition.")
+        if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
+            points.append(f"• Historical analysis: {groq_detail.strip()}")
+        return {
+            "title": "Medieval Latin manuscript analysis:",
+            "intro_lines": intro_lines,
+            "frequent_label": "Key medieval terms identified",
+            "frequent": frequent_terms[:12],
+            "points": points
+        }
+    def generate_story(self, processed_result):
+        """Generate creative story with medieval manuscript context"""
+        latin_text = processed_result.get("text", "")
+        if not self.groq_client.is_available():
+            return "Groq client unavailable, cannot generate historical narrative."
+        # Analyze text features for story context
+        char_analysis = processed_result.get("char_analysis", {})
+        validation = processed_result.get("validation", {})
+        has_expansions = char_analysis.get("expansion_count", 0) > 0
+        has_corrections = char_analysis.get("correction_markers", 0) > 0
+        has_entities = char_analysis.get("entity_count", 0) > 0
+        text_type = validation.get("text_type", "unknown")
+        used_tridis = validation.get("tridis_used", False)
+        # Choose appropriate narrative style based on detected features
+        if "documentary" in text_type or has_expansions:
+            styles = [
+                "as a legal charter discovered in monastic archives",
+                "as an administrative record from a medieval royal court",
+                "as a property deed found in cathedral scriptorium",
+                "as a guild register from a medieval trading city",
+                "as a tax record from a 14th-century monastery"
+            ]
+        elif has_corrections or has_entities:
+            styles = [
+                "as a monk's working manuscript with personal annotations",
+                "as a scholar's commentary on ancient texts",
+                "as a chronicle being revised by a medieval historian",
+                "as a theological treatise with scribal corrections",
+                "as a copy of classical texts with medieval glosses"
+            ]
+        else:
+            styles = [
+                "as a sacred text illuminated by medieval scribes",
+                "as a philosophical work from a cathedral school",
+                "as a liturgical manuscript from a monastic library",
+                "as a medical treatise translated in medieval Spain",
+                "as an astronomical text from a medieval university"
+            ]
+        import random
+        chosen_style = random.choice(styles)
+        seed = random.randint(1000, 9999)
+        # Craft historically-informed prompt
+        processing_context = "deciphered using advanced medieval manuscript AI" if used_tridis else "carefully transcribed from the original"
+        time_period = "13th-16th centuries" if (has_expansions or has_corrections) else "medieval period"
+        prompt = (
+            f"This Latin manuscript text was {processing_context}: {latin_text}\n\n"
+            f"Historical context: The text appears to be from the {time_period}, "
+            f"{'with expanded abbreviations and scribal corrections typical of documentary manuscripts' if has_expansions else 'showing characteristics of medieval scholarly tradition'}.\n\n"
+            f"Create a vivid, historically accurate narrative (250+ words) set in medieval Europe, "
+            f"telling the story of this manuscript's creation and significance. "
+            f"Write {chosen_style}.\n\n"
+            f"Include: Medieval setting, authentic historical details, multiple characters, "
+            f"the process of manuscript creation, and the document's importance to its community.\n"
+            f"Narrative seed: {seed}"
+        )
+        system_prompt = (
+            "You are a medieval historian and storyteller specializing in manuscript culture, "
+            "paleography, and daily life in 13th-16th century Europe. Create authentic, "
+            "engaging narratives that reflect accurate historical knowledge of medieval "
+            "scriptoriums, legal practices, and scholarly traditions."
+        )
+        story = self.groq_client.generate_response(
+            system_prompt=system_prompt,
+            user_prompt=prompt
+        )
+        if not story or is_gibberish(story):
+            return "Failed to generate historical narrative; medieval story creation unavailable."
+        return story

references.json ADDED Viewed

	@@ -0,0 +1,149 @@

+{
+  "egypt_symbol_notes": {
+    "bow": "Warfare and hunting tool; often a phonogram (q/p/k depending on context) and a determinative for martial power, subjugation, or protection.",
+    "hoe": "Agricultural implement tied to cultivation and ritual ‘preparation’; used as a determinative and phonetic sign.",
+    "god_figure": "Anthropomorphic marker indicating divine agency; commonly a determinative for deities.",
+    "bread": "Offering loaf symbolizing food and sustenance; phonetic value 't'.",
+    "feather": "Represents the concepts of truth and balance; phonetic value 'm'.",
+    "eye": "Wedjat eye symbolizing protection and healing; phonetic for 'ir' or 'jr'.",
+    "owl": "Common phonogram 'm'; also signifies night and hidden knowledge.",
+    "reed": "Symbol for writing and speech; phonetic and determinative use.",
+    "scribe_tools": "Represents writing, record-keeping, and administration.",
+    "leg": "Indicates motion, going forth, phonetic sign.",
+    "lizard": "Represents reptiles; associated with protective symbolism.",
+    "woman_seated": "Determinative for female persons or roles.",
+    "jar": "Associated with offerings, fluids, and ritual context.",
+    "crown": "Symbol of royal authority and divine power.",
+    "man_seated": "Determinative for male persons or generic agents."
+  },
+  "greek_symbol_notes": {
+  "Κ": "Kappa: Tenth letter, consonant /k/ sound, equivalent to Latin C/K.",
+  "γ": "Lowercase gamma, consonant /g/ sound, or nasal /ng/ before γ, κ, χ, ξ sounds.",
+  "ι": "Lowercase iota, vowel /i/ sound, can form diphthongs with other vowels.",
+  "Ν": "Nu: Thirteenth letter, consonant /n/ sound, nasal consonant with grammatical functions.",
+  "Τ": "Tau: Nineteenth letter, consonant /t/ sound, common in grammatical endings.",
+  "ο": "Lowercase omicron, short vowel /o/ sound, frequent in grammatical morphemes.",
+  "λ": "Lowercase lambda, consonant /l/ sound, liquid consonant with metrical significance.",
+  "θ": "Lowercase theta, aspirated /tʰ/ sound, distinguishes words from similar tau forms.",
+  "υ": "Lowercase upsilon, vowel /y/ sound, forms diphthongs and appears in many endings.",
+  "Θ": "Theta: Eighth letter, aspirated /tʰ/ sound in ancient Greek, /θ/ (voiceless th) in modern.",
+  "Η": "Eta: Seventh letter, long vowel /ē/ sound in ancient Greek, /i/ in modern Greek.",
+  "ς": "Lowercase sigma (final form), consonant /s/ sound, used only at word endings.",
+  "ε": "Lowercase epsilon, short vowel /e/ sound, appears frequently in verb conjugations.",
+  "-": "Hyphen; marks word division or compound elements in Greek texts.",
+  "Ὶ": "Capital Iota with grave accent, indicates lowered pitch or unstressed position.",
+  "ῖ": "Lowercase iota with circumflex accent, indicates falling tone on long vowel /ī/.",
+  "ί": "Lowercase iota with acute accent, vowel /i/ with raised pitch indicating word stress.",
+  "Ἰ": "Capital Iota with smooth breathing, vowel /i/ without initial aspiration.",
+  "Ἑ": "Capital Epsilon with rough breathing, indicates /h/ sound before vowel.",
+  "'": "Apostrophe; indicates elision (omitted vowel) or contraction in Greek.",
+  "Π": "Pi: Sixteenth letter, consonant /p/ sound, appears in mathematical and scientific contexts.",
+  "Ο": "Omicron: Fifteenth letter, short vowel /o/ sound, distinct from omega (long o).",
+  "Μ": "Mu: Twelfth letter, consonant /m/ sound, nasal consonant often used in word formation.",
+  "[": "Opening square bracket; typically editorial reconstructions or uncertain readings.",
+  "Α": "Alpha: First letter of Greek alphabet, vowel /a/ sound, often marks beginnings or primacy.",
+  "μ": "Lowercase mu, consonant /m/ sound, nasal consonant often in prefixes and roots.",
+  "ὲ": "Lowercase epsilon with grave accent, short /e/ sound with lowered pitch.",
+  "Ᾱ": "Capital Alpha with macron (long mark), indicates long /ā/ vowel quantity.",
+  "Γ": "Gamma: Third letter, consonant /g/ sound, or /ng/ before γ, κ, χ, ξ sounds.",
+  "Υ": "Upsilon: Twentieth letter, vowel /y/ sound in ancient Greek, /i/ in modern pronunciation.",
+  "(": "Opening parenthesis; editorial or explanatory insertions.",
+  ")": "Closing parenthesis; completes editorial or explanatory insertions.",
+  "ω": "Lowercase omega, long vowel /ō/ sound, often in verb endings and declensions.",
+  "ῑ": "Lowercase iota with macron, explicitly marks long vowel quantity /ī/.",
+  "·": "Middle dot (Greek semicolon); equivalent to modern semicolon, marks major pause.",
+  "ῐ": "Lowercase iota with breve, explicitly marks short vowel quantity /ĭ/.",
+  "Ξ": "Xi: Fourteenth letter, consonant cluster /ks/ sound, compound sound written as single letter.",
+  "ν": "Lowercase nu, consonant /n/ sound, assimilates before consonants in pronunciation.",
+  "Ε": "Epsilon: Fifth letter, short vowel /e/ sound, distinct from eta (long e).",
+  "η": "Lowercase eta, long vowel /ē/ sound in ancient Greek, /i/ in modern pronunciation.",
+  "]": "Closing square bracket; completes editorial reconstructions.",
+  "Ι": "Iota: Ninth letter, vowel /i/ sound, can form diphthongs with other vowels.",
+  "κ": "Lowercase kappa, consonant /k/ sound, common in word formation and inflection.",
+  "1": "Numeral one; manuscript numbering, line numbers, or verse citations.",
+  "ῃ": "Lowercase eta with iota subscript, indicates original diphthong /ēi/ sound.",
+  "ψ": "Lowercase psi, consonant cluster /ps/ sound, compound phoneme as single letter.",
+  "ἢ": "Lowercase eta with rough breathing and grave accent, aspirated long vowel with lowered tone.",
+  "Ὗ": "Capital Upsilon with rough breathing and circumflex, indicates aspiration and falling tone.",
+  "Ἱ": "Capital Iota with rough breathing, vowel /i/ with initial aspiration /h/.",
+  "Ᾰ": "Capital Alpha with breve (short mark), indicates short /ă/ vowel quantity.",
+  "Ί": "Capital Iota with acute accent, indicates raised pitch or primary word stress.",
+  "Λ": "Lambda: Eleventh letter, consonant /l/ sound, liquid consonant in Greek phonology.",
+  "\"": "Quotation mark; marks direct speech or citations in Greek texts.",
+  "σ": "Lowercase sigma (medial form), consonant /s/ sound, used within words.",
+  "Ἡ": "Capital Eta with rough breathing, long /ē/ sound with initial aspiration /h/.",
+  "Χ": "Chi: Twenty-second letter, aspirated /kʰ/ sound in ancient Greek, /x/ (voiceless velar fricative) in modern.",
+  "ζ": "Lowercase zeta, consonant cluster /zd/ sound, represents compound phoneme.",
+  "Ἷ": "Capital Iota with rough breathing and circumflex accent, complex tonal marking.",
+  "ὶ": "Lowercase iota with grave accent, vowel /i/ with lowered pitch or unstressed.",
+  "ἰ": "Lowercase iota with smooth breathing, vowel /i/ without initial aspiration.",
+  "α": "Lowercase alpha, vowel /a/ sound, fundamental vowel in Greek phonology.",
+  ",": "Comma; punctuation for pauses, lists, or grammatical separation.",
+  "ᾗ": "Lowercase eta with rough breathing, circumflex accent, and iota subscript, complex phonetic marking.",
+  "τ": "Lowercase tau, consonant /t/ sound, appears in many grammatical suffixes.",
+  "<": "Less-than symbol; editorial mark for textual corrections or variants.",
+  "Σ": "Sigma: Eighteenth letter, consonant /s/ sound, has special final form (ς) at word end.",
+  "ρ": "Lowercase rho, consonant /r/ sound, when word-initial requires rough breathing mark.",
+  "ἡ": "Lowercase eta with rough breathing, long /ē/ sound with initial /h/.",
+  "Ω": "Omega: Twenty-fourth letter, long vowel /ō/ sound in ancient Greek, /o/ in modern.",
+  ".": "Period (full stop); marks end of sentences in Greek texts.",
+  "Ῥ": "Capital Rho with rough breathing, indicates /hr/ sound at word beginning (all word-initial rhos are aspirated).",
+  "ἕ": "Lowercase epsilon with rough breathing and acute accent, aspirated short vowel with raised tone.",
+  "ῆ": "Lowercase eta with circumflex accent, falling tone on long vowel /ē/.",
+  "Ἶ": "Capital Iota with smooth breathing and circumflex accent, vowel /ī/ with falling tone, no initial aspiration.",
+  "β": "Lowercase beta, consonant /b/ sound in ancient Greek, /v/ sound in modern Greek pronunciation.",
+  "Ὁ": "Capital Omicron with rough breathing, short /o/ sound with initial aspiration /h/.",
+  "Ϊ": "Capital Iota with diaeresis (trema), indicates /i/ vowel pronounced separately, not as diphthong.",
+  "Φ": "Phi: Twenty-first letter, aspirated /pʰ/ sound in ancient Greek, /f/ in modern pronunciation.",
+  "ῗ": "Lowercase iota with diaeresis and circumflex, /ī/ vowel with falling tone, pronounced separately.",
+  "έ": "Lowercase epsilon with acute accent (modern Greek), short /e/ sound with stress marking.",
+  "ἷ": "Lowercase iota with rough breathing and circumflex accent, aspirated /ī/ vowel with falling tone."
+}
+,
+  "latin_symbol_notes": {
+    "꜠": "Modifier letter for stress and high tone, used in phonetic transcription and transliteration.",
+    "꜡": "Modifier letter for stress and low tone, common in linguistic notation.",
+    "Ꜣ": "Capital Letter Egyptological Alef, used in transliterating Egyptian hieroglyphs.",
+    "ꜣ": "Small Letter Egyptological Alef, counterpart to capital version.",
+    "Ꜥ": "Capital Letter Egyptological Ain, reflecting voiced pharyngeal sounds in transliteration.",
+    "ꜥ": "Small Letter Egyptological Ain, used in Semitic transliterations.",
+    "Ꝁ": "Capital Letter K with Stroke, scribal abbreviation mark for legal or medieval texts.",
+    "ꝁ": "Small Letter K with Stroke, similar abbreviation symbol.",
+    "ꝑ": "Small Letter P with Stroke Through Descender, abbreviation of 'per' in medieval Latin manuscripts.",
+    "ꝛ": "Small Letter R Rotunda, a stylistic medieval form of 'r' to save space.",
+    "Ꞁ": "Capital Letter Turned L, used in paleography to denote variant forms.",
+    "ꞁ": "Small Letter Turned L, lowercase variant in medieval scripts.",
+    "Ꞃ": "Capital Letter Insular R, found in Insular script manuscripts in medieval Britain and Ireland.",
+    "Ꝼ": "Capital Letter Insular F, distinct letter in Celtic Insular manuscripts.",
+    "ꟽ": "Epigraphic Letter Inverted M, used as a logogram for 'mulier' or 'matrona' in Roman inscriptions.",
+    "ꟿ": "Epigraphic Letter Archaic M, represents the praenomen 'Manius' in inscriptions."
+  },"cuneiform_symbol_notes": {
+    "𒀀": "Cuneiform sign A: vowel sound /a/ in Sumerian and Akkadian, fundamental vowel marker",
+    "𒀭": "Cuneiform sign AN/DINGIR: divine determinative, heaven, god concept in religious texts",
+    "𒈗": "Cuneiform sign LUGAL: king, ruler, sovereign used in royal inscriptions and titles",
+    "𒊕": "Cuneiform sign UD: day, sun, light, time marker in calendrical and chronological contexts",
+    "𒄿": "Cuneiform sign I: vowel /i/, often used in verbal forms and grammatical particles",
+    "𒂍": "Cuneiform sign E: house, temple, building in architectural and religious contexts",
+    "𒀸": "Cuneiform sign ARAD: servant, slave, worker in administrative and legal documents",
+    "𒁹": "Cuneiform sign DIRIG: to exceed, surplus, extra in mathematical and accounting texts",
+    "𒉋": "Cuneiform sign TI: life, to live, arrow in medical, military, and philosophical contexts",
+    "𒆠": "Cuneiform sign KI: earth, place, land in geographical and territorial designations",
+    "𒌓": "Cuneiform sign ZU: to know, knowledge, wisdom in educational and scribal contexts",
+    "𒈨": "Cuneiform sign ME: divine powers, cultural practices in mythological and religious texts",
+    "𒉿": "Cuneiform sign TUKU: to have, possess, hold in commercial and legal transactions",
+    "𒄩": "Cuneiform sign HA: fish, to catch in texts about fishing, food, and economy",
+    "𒁇": "Cuneiform sign DU: to go, walk, build in construction, travel, and action contexts",
+    "lugal": "ATF: lugal - Sumerian/Akkadian for 'king', royal title in administrative texts",
+    "an": "ATF: an - Sky god An/Anu, heaven concept in religious and mythological contexts",
+    "ki": "ATF: ki - Earth, place, land in geographical and cosmological descriptions",
+    "dingir": "ATF: dingir - God, divine being, deity in religious and ceremonial texts",
+    "sar": "ATF: sar - To write, inscription, totality in scribal and administrative contexts",
+    "{d}": "ATF: determinative for divine names, indicates following word refers to a deity"
+  },
+  "cuneiform_hint": "Cuneiform signs represent syllables, words, or concepts in ancient Mesopotamian languages (Sumerian, Akkadian, Hittite, etc.). ATF format uses Latin transliteration of cuneiform symbols.",
+  "greek_hint": "If no specific character note is found, treat as lexical marker considering diacriticals (breathing marks, accents, vowel quantity) which affect pronunciation, meaning, and grammatical function in ancient Greek texts.",
+  "latin_hint": "Letters and symbols reflect phonetic values and scribal practices in manuscripts."
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,30 @@

+# DecipherAI Backend — Production Dependencies (Hugging Face Spaces)
+# CPU-only torch for HF Spaces free tier
+# Web framework
+Flask==3.1.3
+flask-cors==6.0.2
+python-dotenv==1.2.2
+# AI / ML
+groq==1.2.0
+transformers==5.9.0
+safetensors==0.7.0
+# PyTorch CPU-only (HF Spaces free tier does not have GPU)
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.12.0+cpu
+torchvision==0.27.0+cpu
+# Image processing
+opencv-python-headless==4.13.0.92
+pillow==12.2.0
+pytesseract==0.3.13
+# Utilities
+numpy==2.4.4
+regex==2026.5.9
+tqdm==4.67.3
+# Production WSGI server
+gunicorn==23.0.0

services/__init__.py ADDED Viewed

File without changes

services/context_generator.py ADDED Viewed

File without changes

services/groq_vision_classifier.py ADDED Viewed

	@@ -0,0 +1,254 @@

+import base64
+import json
+import os
+from io import BytesIO
+from PIL import Image
+from groq import Groq
+class GroqVisionScriptClassifier:
+    def __init__(self, groq_api_key):
+        self.groq_client = Groq(api_key=groq_api_key)
+        # FIXED: Use the correct stable model name
+        self.vision_model = "meta-llama/llama-4-scout-17b-16e-instruct"
+        print(f"[INFO] Groq Vision Classifier initialized with {self.vision_model}")
+    def classify_script(self, image_path):
+        """Enhanced script classification including cuneiform using Groq's Llama Vision model"""
+        try:
+            # Convert image to base64
+            base64_image = self._image_to_base64(image_path)
+            if not base64_image:
+                return "unknown"
+            # Query Groq Vision API
+            response = self._query_groq_vision(base64_image)
+            # Parse the response
+            script_type = self._parse_classification_response(response)
+            print(f"[INFO] Llama Vision classified script as: {script_type}")
+            return script_type.lower()
+        except Exception as e:
+            print(f"[ERROR] Groq Vision script classification failed: {e}")
+            return "unknown"
+    def _image_to_base64(self, image_path):
+        """Convert image to base64 for Groq Vision API (4MB limit)"""
+        try:
+            image = Image.open(image_path)
+            # Resize if too large (keep under 4MB base64 limit)
+            if max(image.size) > 1200:
+                image.thumbnail((1200, 1200), Image.Resampling.LANCZOS)
+            # Convert to base64 JPEG (smaller than PNG)
+            buffer = BytesIO()
+            image.save(buffer, format="JPEG", quality=90)
+            image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+            # Check size (base64 should be < 4MB)
+            if len(image_b64) > 4 * 1024 * 1024:  # 4MB limit
+                # Reduce quality and try again
+                buffer = BytesIO()
+                image.save(buffer, format="JPEG", quality=70)
+                image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+            return image_b64
+        except Exception as e:
+            print(f"[ERROR] Image to base64 conversion failed: {e}")
+            return None
+    def _query_groq_vision(self, base64_image):
+        """Enhanced query for Groq Llama Vision API including cuneiform"""
+        try:
+            # FIXED: Simplified prompt to avoid token limit issues
+            prompt = """Analyze this image of ancient text/script as an expert paleographer.
+Classify it as ONE of these ancient script types:
+- EGYPTIAN: Hieroglyphic symbols (birds, eyes, human figures, cartouches)
+- GREEK: Ancient/medieval Greek alphabet (α,β,γ,δ,ε,ζ,η,θ) with diacritics
+- LATIN: Latin alphabet letters, Roman inscriptions, medieval manuscripts
+- CUNEIFORM: Wedge-shaped impressions on clay tablets (triangular marks)
+IMPORTANT: Cuneiform has geometric wedge patterns, NOT pictures like hieroglyphs.
+Respond ONLY with JSON:
+{"classification": "EGYPTIAN" or "GREEK" or "LATIN" or "CUNEIFORM", "confidence": 0.0-1.0}"""
+            completion = self.groq_client.chat.completions.create(
+                model=self.vision_model,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": prompt},
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{base64_image}"
+                                }
+                            }
+                        ]
+                    }
+                ],
+                temperature=0.1,  # Low temperature for consistent classification
+                max_completion_tokens=100,  # FIXED: Reduced to avoid token errors
+                top_p=0.9,
+                stream=False,
+                response_format={"type": "json_object"}
+            )
+            return completion.choices[0].message.content
+        except Exception as e:
+            print(f"[ERROR] Groq Vision API call failed: {e}")
+            return None
+    def _parse_classification_response(self, response):
+        """Enhanced parsing for JSON response including cuneiform"""
+        if not response:
+            return "unknown"
+        try:
+            # Parse JSON response
+            data = json.loads(response)
+            classification = data.get('classification', '').upper()
+            confidence = data.get('confidence', 0.0)
+            print(f"[INFO] Vision model confidence: {confidence:.3f}")
+            # Enhanced classification mapping including cuneiform
+            if classification == "EGYPTIAN":
+                return "egyptian"
+            elif classification == "GREEK":
+                return "greek"
+            elif classification == "LATIN":
+                return "latin"
+            elif classification == "CUNEIFORM":
+                return "cuneiform"
+            else:
+                print(f"[WARN] Unknown classification: {classification}")
+                return "unknown"
+        except json.JSONDecodeError:
+            print(f"[WARN] Failed to parse JSON response, trying text parsing: {response}")
+            # Enhanced fallback to text parsing
+            response_upper = response.strip().upper()
+            # Priority order: cuneiform keywords first (most specific)
+            cuneiform_keywords = ["CUNEIFORM", "WEDGE", "CLAY", "MESOPOTAMIAN", "AKKADIAN", "SUMERIAN", "BABYLONIAN"]
+            if any(keyword in response_upper for keyword in cuneiform_keywords):
+                return "cuneiform"
+            elif "EGYPTIAN" in response_upper or "HIEROGLYPH" in response_upper:
+                return "egyptian"
+            elif "GREEK" in response_upper:
+                return "greek"
+            elif "LATIN" in response_upper or "ROMAN" in response_upper:
+                return "latin"
+        except Exception as e:
+            print(f"[ERROR] Response parsing failed: {e}")
+        return "unknown"
+    def classify_with_fallback(self, image_path, max_retries=2):
+        """Enhanced classification with retry logic"""
+        for attempt in range(max_retries + 1):
+            try:
+                result = self.classify_script(image_path)
+                if result != "unknown":
+                    return result
+                elif attempt < max_retries:
+                    print(f"[INFO] Classification attempt {attempt + 1} returned unknown, retrying...")
+                    continue
+                else:
+                    print(f"[WARN] All classification attempts returned unknown")
+                    return "unknown"
+            except Exception as e:
+                if attempt < max_retries:
+                    print(f"[WARN] Classification attempt {attempt + 1} failed: {e}, retrying...")
+                    continue
+                else:
+                    print(f"[ERROR] All classification attempts failed: {e}")
+                    return "unknown"
+        return "unknown"
+    def get_supported_scripts(self):
+        """Get list of supported script types"""
+        return ["egyptian", "greek", "latin", "cuneiform"]
+    def validate_classification(self, script_type, confidence_threshold=0.7):
+        """Validate classification result"""
+        supported_scripts = self.get_supported_scripts()
+        if script_type not in supported_scripts:
+            print(f"[WARN] Unsupported script type: {script_type}")
+            return False
+        # All classifications from Llama Vision are considered valid
+        return True
+    def get_model_info(self):
+        """Get information about the vision model being used"""
+        return {
+            "model": self.vision_model,
+            "provider": "Groq",
+            "supported_scripts": self.get_supported_scripts(),
+            "features": [
+                "Ancient script classification",
+                "Multi-script support",
+                "Cuneiform wedge detection",
+                "Clay tablet recognition",
+                "High-resolution image processing"
+            ]
+        }
+    def debug_classification(self, image_path, save_debug_info=False):
+        """Debug classification with detailed information"""
+        try:
+            print(f"[DEBUG] Starting classification for: {image_path}")
+            # Check image properties
+            image = Image.open(image_path)
+            print(f"[DEBUG] Image size: {image.size}")
+            print(f"[DEBUG] Image mode: {image.mode}")
+            # Get base64 size
+            base64_image = self._image_to_base64(image_path)
+            if base64_image:
+                print(f"[DEBUG] Base64 size: {len(base64_image)} characters")
+            # Get raw response
+            response = self._query_groq_vision(base64_image)
+            print(f"[DEBUG] Raw API response: {response}")
+            # Parse and return
+            result = self._parse_classification_response(response)
+            print(f"[DEBUG] Final classification: {result}")
+            if save_debug_info:
+                debug_info = {
+                    "image_path": image_path,
+                    "image_size": image.size,
+                    "base64_length": len(base64_image) if base64_image else 0,
+                    "raw_response": response,
+                    "classification": result
+                }
+                debug_file = f"debug_classification_{result}_{hash(image_path) % 10000}.json"
+                with open(debug_file, 'w') as f:
+                    json.dump(debug_info, f, indent=2)
+                print(f"[DEBUG] Debug info saved to: {debug_file}")
+            return result
+        except Exception as e:
+            print(f"[ERROR] Debug classification failed: {e}")
+            return "unknown"

services/layout_parser.py ADDED Viewed

	@@ -0,0 +1,244 @@

+import cv2
+import numpy as np
+from PIL import Image
+from typing import List, Dict, Tuple
+class LayoutParser:
+    def __init__(self):
+        pass
+    def analyze_layout(self, image_path: str) -> Dict:
+        """Analyze document image layout to detect columns, blocks, and lines of text"""
+        try:
+            img = cv2.imread(image_path)
+            if img is None:
+                raise FileNotFoundError(f"Image not found: {image_path}")
+            h_img, w_img, _ = img.shape
+            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+            # Step 1: Preprocess to remove noise and binarize
+            # Use Otsu's thresholding after Gaussian blur
+            blur = cv2.GaussianBlur(gray, (5, 5), 0)
+            _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+            # Step 2: Dilation to merge words into horizontal line segments
+            # Use larger horizontal kernel to join words along text lines
+            line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 3))
+            dilated = cv2.dilate(thresh, line_kernel, iterations=2)
+            # Step 3: Find contours of lines
+            contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            lines = []
+            for cnt in contours:
+                x, y, w, h = cv2.boundingRect(cnt)
+                # Filter out small noise and full page boundaries
+                if w < 15 or h < 5:
+                    continue
+                if w > w_img * 0.98 or h > h_img * 0.98:
+                    continue
+                lines.append({
+                    "box": (x, y, w, h),
+                    "area": w * h
+                })
+            # Sort lines from top-to-bottom, left-to-right (handles multi-column layouts)
+            # We group lines into columns based on horizontal positions
+            lines = sorted(lines, key=lambda l: l["box"][1])  # sort by top coord first
+            columns = self._group_lines_into_columns(lines, w_img)
+            structured_layout = {
+                "width": w_img,
+                "height": h_img,
+                "column_count": len(columns),
+                "columns": columns
+            }
+            print(f"[INFO] Layout parsing complete. Detected {len(columns)} text columns.")
+            return structured_layout
+        except Exception as e:
+            print(f"[ERROR] Layout parsing failed: {e}")
+            return {"width": 0, "height": 0, "column_count": 1, "columns": []}
+    def _group_lines_into_columns(self, lines: List[Dict], page_width: int) -> List[Dict]:
+        """Group detected text lines into column blocks based on horizontal overlap"""
+        if not lines:
+            return []
+        # Find horizontal overlaps using a histogram projection
+        hist = np.zeros(page_width, dtype=np.int32)
+        for line in lines:
+            x, _, w, _ = line["box"]
+            hist[x:x+w] += 1
+        # Threshold histogram to find column boundaries
+        min_col_width = int(page_width * 0.1)
+        columns_x = []
+        in_col = False
+        start_x = 0
+        for x, val in enumerate(hist):
+            if val > 1 and not in_col:
+                in_col = True
+                start_x = x
+            elif val <= 1 and in_col:
+                in_col = False
+                end_x = x
+                if (end_x - start_x) >= min_col_width:
+                    columns_x.append((start_x, end_x))
+        # Handle case where column stretches to the end
+        if in_col:
+            columns_x.append((start_x, page_width))
+        if not columns_x:
+            columns_x = [(0, page_width)]
+        # Assign lines to closest columns
+        cols_data = [{"x_range": rx, "lines": []} for rx in columns_x]
+        for line in lines:
+            x, y, w, h = line["box"]
+            line_center_x = x + w / 2
+            # Find the best column index
+            best_idx = 0
+            min_dist = page_width
+            for idx, col in enumerate(cols_data):
+                cx_start, cx_end = col["x_range"]
+                if cx_start <= line_center_x <= cx_end:
+                    best_idx = idx
+                    break
+                else:
+                    dist = min(abs(line_center_x - cx_start), abs(line_center_x - cx_end))
+                    if dist < min_dist:
+                        min_dist = dist
+                        best_idx = idx
+            cols_data[best_idx]["lines"].append((x, y, w, h))
+        # Sort lines inside each column by vertical (y) coordinate
+        for col in cols_data:
+            col["lines"] = sorted(col["lines"], key=lambda box: box[1])
+        return cols_data
+    def crop_lines(self, image_path: str, layout: Dict) -> List[Image.Image]:
+        """Crop and return PIL images of detected text lines in reading order"""
+        try:
+            img = cv2.imread(image_path)
+            if img is None:
+                return []
+            crops = []
+            h_img, w_img, _ = img.shape
+            for col in layout.get("columns", []):
+                for (x, y, w, h) in col["lines"]:
+                    # Add small padding for HTR/OCR context
+                    pad_y = int(h * 0.1) + 2
+                    pad_x = int(w * 0.05) + 2
+                    y0 = max(0, y - pad_y)
+                    y1 = min(h_img, y + h + pad_y)
+                    x0 = max(0, x - pad_x)
+                    x1 = min(w_img, x + w + pad_x)
+                    crop = img[y0:y1, x0:x1]
+                    if crop.size > 0:
+                        crops.append(Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)))
+            return crops
+        except Exception as e:
+            print(f"[ERROR] Failed to crop layout lines: {e}")
+            return []
+    def detect_writing_style(self, image_path: str, clip_classifier=None) -> str:
+        """Detect if document image contains 'printed' capital letters or 'cursive' handwriting"""
+        try:
+            # 1. Try using CLIP classifier if provided
+            if clip_classifier and clip_classifier.model and clip_classifier.processor:
+                try:
+                    from PIL import Image
+                    image = Image.open(image_path).convert("RGB")
+                    styles = ["printed", "cursive"]
+                    descriptions = [
+                        "classical printed Latin text or carved Roman stone monumental inscription with clean block capital letters",
+                        "medieval handwritten Latin manuscript text written in ink on parchment with cursive handwriting"
+                    ]
+                    inputs = clip_classifier.processor(
+                        text=descriptions,
+                        images=image,
+                        return_tensors="pt",
+                        padding=True
+                    ).to(clip_classifier.device)
+                    import torch
+                    with torch.no_grad():
+                        outputs = clip_classifier.model(**inputs)
+                        logits_per_image = outputs.logits_per_image
+                        probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]
+                    best_idx = np.argmax(probs)
+                    style_label = styles[best_idx]
+                    confidence = float(probs[best_idx])
+                    print(f"[INFO] CLIP Latin style classification: {style_label} ({confidence:.3f})")
+                    return style_label
+                except Exception as e:
+                    print(f"[WARN] CLIP Latin style detection failed: {e}. Falling back to heuristics.")
+            # 2. Fallback: Computer Vision heuristics
+            print("[INFO] Running computer vision heuristics for Latin style detection...")
+            img = cv2.imread(image_path)
+            if img is None:
+                return "cursive"  # Safe default
+            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+            blur = cv2.GaussianBlur(gray, (5, 5), 0)
+            _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+            # Find contours without heavy dilation (character level components)
+            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            if not contours:
+                return "cursive"
+            aspect_ratios = []
+            widths = []
+            heights = []
+            for cnt in contours:
+                x, y, w, h = cv2.boundingRect(cnt)
+                # Filter noise
+                if w < 5 or h < 5:
+                    continue
+                aspect_ratios.append(w / h)
+                widths.append(w)
+                heights.append(h)
+            if not aspect_ratios:
+                return "cursive"
+            avg_aspect_ratio = np.mean(aspect_ratios)
+            median_width = np.median(widths)
+            # Printed characters are typically individual, tall/square shapes: width ~ height, aspect ratio close to 0.7 - 1.2
+            # Cursive handwriting consists of connected letters, forming wider horizontal segments: aspect ratio > 1.5
+            print(f"[DEBUG] Layout heuristics - connected components: {len(aspect_ratios)}, avg aspect ratio: {avg_aspect_ratio:.3f}, median width: {median_width:.1f}")
+            if avg_aspect_ratio < 1.3:
+                return "printed"
+            else:
+                return "cursive"
+        except Exception as e:
+            print(f"[WARN] Latin style detection failed completely: {e}. Defaulting to cursive.")
+            return "cursive"

services/rag_service.py ADDED Viewed

	@@ -0,0 +1,186 @@

+import json
+import os
+import re
+from typing import Dict, List, Tuple
+from config import Config
+class RAGService:
+    def __init__(self, references_path: str = None):
+        self.config = Config()
+        self.references_path = references_path or str(self.config.REFERENCES_PATH)
+        self.corpus = []
+        self.load_corpus()
+    def load_corpus(self):
+        """Load and index the historical reference document corpus"""
+        try:
+            if os.path.exists(self.references_path):
+                with open(self.references_path, "r", encoding="utf-8") as f:
+                    data = json.load(f)
+                # Index Egyptian
+                for term, note in data.get("egypt_symbol_notes", {}).items():
+                    self.corpus.append({
+                        "category": "Egyptian Hieroglyphic Sign",
+                        "term": term,
+                        "definition": note,
+                        "citation": "Gardiner, A. H. (1957). Egyptian Grammar: Being an Introduction to the Study of Hieroglyphs. Oxford: Griffith Institute."
+                    })
+                # Index Greek
+                for term, note in data.get("greek_symbol_notes", {}).items():
+                    self.corpus.append({
+                        "category": "Greek Paleography Mark",
+                        "term": term,
+                        "definition": note,
+                        "citation": "Thompson, E. M. (1912). An Introduction to Greek and Latin Palaeography. Oxford: Clarendon Press."
+                    })
+                # Index Latin
+                for term, note in data.get("latin_symbol_notes", {}).items():
+                    self.corpus.append({
+                        "category": "Latin Scribal Abbreviation",
+                        "term": term,
+                        "definition": note,
+                        "citation": "Cappelli, A. (1928). Dizionario di Abbreviature Latine ed Italiane. Milan: Hoepli."
+                    })
+                # Index Cuneiform
+                for term, note in data.get("cuneiform_symbol_notes", {}).items():
+                    self.corpus.append({
+                        "category": "Mesopotamian Cuneiform Logogram",
+                        "term": term,
+                        "definition": note,
+                        "citation": "Borger, R. (2004). Mesopotamisches Zeichenlexikon. Münster: Ugarit-Verlag."
+                    })
+                print(f"[INFO] RAG Service successfully indexed {len(self.corpus)} reference records.")
+            else:
+                print(f"[WARN] Reference path {self.references_path} not found for RAG index.")
+        except Exception as e:
+            print(f"[ERROR] Failed to initialize RAG index: {e}")
+    def retrieve_grounding_context(self, query_terms: List[str], max_results: int = 5) -> str:
+        """Search reference records and build a grounding context string with academic citations"""
+        if not query_terms or not self.corpus:
+            return ""
+        matches = []
+        seen = set()
+        for term in query_terms:
+            if not term or len(term.strip()) < 1:
+                continue
+            clean_term = term.lower().strip()
+            # Simple keyword search with scoring
+            for record in self.corpus:
+                score = 0
+                record_term = record["term"].lower()
+                record_def = record["definition"].lower()
+                if clean_term == record_term:
+                    score += 10
+                else:
+                    # Check if term matches as a whole word part (e.g. "woman" in "woman_seated")
+                    term_parts = re.split(r'[_ \-]', record_term)
+                    if clean_term in term_parts:
+                        score += 5
+                    elif len(clean_term) > 3:
+                        if clean_term in record_term:
+                            score += 5
+                        elif clean_term in record_def:
+                            score += 2
+                if score > 0:
+                    record_key = f"{record['category']}:{record['term']}"
+                    if record_key not in seen:
+                        seen.add(record_key)
+                        matches.append((score, record))
+        # Sort matches by relevance score
+        matches.sort(key=lambda x: x[0], reverse=True)
+        top_matches = [m[1] for m in matches[:max_results]]
+        if not top_matches:
+            return ""
+        context_lines = ["### Scholarly Grounding and Sign References:"]
+        for idx, match in enumerate(top_matches, 1):
+            context_lines.append(
+                f"{idx}. **[{match['category']}] '{match['term']}'**: {match['definition']}\n"
+                f"   *Source Citation:* {match['citation']}"
+            )
+        return "\n".join(context_lines)
+    def retrieve_grounding_list(self, query_terms: List[str], max_results: int = 5) -> List[Dict]:
+        """Search reference records and return the raw list of matching reference dicts with citations"""
+        if not query_terms or not self.corpus:
+            return []
+        matches = []
+        seen = set()
+        for term in query_terms:
+            if not term or len(term.strip()) < 1:
+                continue
+            clean_term = term.lower().strip()
+            # Simple keyword search with scoring
+            for record in self.corpus:
+                score = 0
+                record_term = record["term"].lower()
+                record_def = record["definition"].lower()
+                if clean_term == record_term:
+                    score += 10
+                else:
+                    # Check if term matches as a whole word part (e.g. "woman" in "woman_seated")
+                    term_parts = re.split(r'[_ \-]', record_term)
+                    if clean_term in term_parts:
+                        score += 5
+                    elif len(clean_term) > 3:
+                        if clean_term in record_term:
+                            score += 5
+                        elif clean_term in record_def:
+                            score += 2
+                if score > 0:
+                    record_key = f"{record['category']}:{record['term']}"
+                    if record_key not in seen:
+                        seen.add(record_key)
+                        matches.append((score, record))
+        # Sort matches by relevance score
+        matches.sort(key=lambda x: x[0], reverse=True)
+        return [m[1] for m in matches[:max_results]]
+    def enrich_prompt(self, base_system_prompt: str, extracted_text: str, extracted_symbols: List[str] = None) -> str:
+        """Enrich LLM prompts with RAG context and citation grounding instructions"""
+        # Parse query terms from extracted text or labels
+        query_terms = []
+        if extracted_symbols:
+            query_terms.extend(extracted_symbols)
+        # Split clean words from text
+        if extracted_text:
+            words = re.findall(r'[a-zA-Z0-9\u0370-\u03FF\u1F00-\u1FFF\u4E00-\u9FFF]+', extracted_text)
+            query_terms.extend(words[:15])  # Cap to prevent excessive token use
+        grounding_context = self.retrieve_grounding_context(query_terms, max_results=6)
+        if not grounding_context:
+            return base_system_prompt
+        enriched_prompt = (
+            f"{base_system_prompt}\n\n"
+            f"Here is some verified historical and paleographical grounding information that you MUST use "
+            f"in your analysis. Cite the specific sources (e.g. Gardiner, Cappelli, Thompson, Borger) "
+            f"whenever discussing these symbols:\n\n"
+            f"{grounding_context}\n\n"
+            f"In your final output, append a short section titled 'References and Citations' detailing the relevant sources used."
+        )
+        return enriched_prompt

services/script_detector.py ADDED Viewed

	@@ -0,0 +1,223 @@

+from processors.egyptian_processor import EgyptianProcessor
+from processors.greek_processor import GreekProcessor
+from processors.latin_processor import LatinProcessor
+from processors.cuneiform_processor import CuneiformProcessor
+from .groq_vision_classifier import GroqVisionScriptClassifier
+class ScriptDetectionService:
+    def __init__(self, groq_client, references, clip_classifier, translator_pipe, cuneiform_processor=None):
+        # Initialize processors including cuneiform
+        self.egyptian_processor = EgyptianProcessor(groq_client, references, clip_classifier, translator_pipe)
+        self.greek_processor = GreekProcessor(groq_client, references, clip_classifier)
+        self.latin_processor = LatinProcessor(groq_client, references, clip_classifier)
+        # Initialize cuneiform processor or use the shared instance
+        if cuneiform_processor:
+            self.cuneiform_processor = cuneiform_processor
+            print("[INFO] Cuneiform processor shared from global app instance")
+        else:
+            try:
+                print("[INFO] Initializing cuneiform processor in detection service...")
+                self.cuneiform_processor = CuneiformProcessor(groq_client, references, clip_classifier)
+                print("[INFO] Cuneiform processor initialized successfully")
+            except Exception as e:
+                print(f"[WARN] Failed to initialize cuneiform processor: {e}")
+                self.cuneiform_processor = None
+        # FIXED: Get API key from groq_client with multiple fallback options
+        api_key = None
+        if hasattr(groq_client, 'api_key'):
+            api_key = groq_client.api_key
+        elif hasattr(groq_client, 'client') and hasattr(groq_client.client, 'api_key'):
+            api_key = groq_client.client.api_key
+        else:
+            # Fallback: get from config or environment
+            try:
+                from config import Config
+                config = Config()
+                api_key = config.GROQ_API_KEY
+            except:
+                import os
+                api_key = os.getenv('GROQ_API_KEY')
+        # Initialize Groq Vision script classifier if API key is present
+        if api_key:
+            try:
+                self.vision_classifier = GroqVisionScriptClassifier(api_key)
+                print("[INFO] Groq Vision Script Detection Service initialized")
+            except Exception as e:
+                print(f"[WARN] Failed to initialize Groq Vision script classifier: {e}")
+                self.vision_classifier = None
+        else:
+            print("[WARN] GROQ_API_KEY not found! Groq Vision classifier disabled. Falling back to zero-shot CLIP classifier.")
+            self.vision_classifier = None
+        # Keep track of clip_classifier
+        self.clip_classifier = clip_classifier
+        # Enhanced processor mapping with cuneiform
+        self.processors = {
+            'egyptian': self.egyptian_processor,
+            'greek': self.greek_processor,
+            'latin': self.latin_processor,
+            'cuneiform': self.cuneiform_processor
+        }
+        if self.cuneiform_processor:
+            print("[INFO] Cuneiform support: ENABLED (praeclarum/cuneiform model)")
+        else:
+            print("[WARN] Cuneiform support: DISABLED (processor initialization failed)")
+    def detect_and_process(self, image_path):
+        """Enhanced detection with cuneiform support - uses Groq Vision with CLIP fallback"""
+        try:
+            # Step 1: Get script classification from Groq Vision or CLIP
+            script_type = "unknown"
+            classification_method = "unknown"
+            classification_confidence = 0.0
+            if self.vision_classifier:
+                try:
+                    script_type = self.vision_classifier.classify_script(image_path)
+                    classification_method = 'groq_vision'
+                    classification_confidence = 0.95
+                except Exception as e:
+                    print(f"[WARN] Groq Vision classification failed: {e}. Falling back to CLIP.")
+            if script_type == "unknown" or not self.vision_classifier:
+                from PIL import Image
+                try:
+                    img = Image.open(image_path)
+                    script_type, classification_confidence = self.clip_classifier.classify_script_type(img)
+                    classification_method = 'clip_zero_shot'
+                    print(f"[INFO] CLIP fallback classification: {script_type} (conf={classification_confidence:.3f})")
+                except Exception as ce:
+                    print(f"[ERROR] CLIP fallback classification failed: {ce}")
+                    script_type = "egyptian"  # default fallback
+                    classification_method = "default_fallback"
+                    classification_confidence = 0.5
+            print(f"[INFO] Final classification routed: {script_type} via {classification_method}")
+            # Step 2: Route to appropriate processor including cuneiform
+            if script_type == "egyptian":
+                print("[INFO] Routing to Egyptian processor...")
+                result = self.egyptian_processor.process_image(image_path)
+            elif script_type == "greek":
+                print("[INFO] Routing to Greek processor...")
+                result = self.greek_processor.process_image(image_path)
+            elif script_type == "latin":
+                print("[INFO] Routing to Latin processor...")
+                result = self.latin_processor.process_image(image_path)
+            elif script_type == "cuneiform":
+                print("[INFO] Routing to Cuneiform processor...")
+                if self.cuneiform_processor and self.cuneiform_processor.cuneiform_available:
+                    result = self.cuneiform_processor.process_image(image_path)
+                else:
+                    print("[ERROR] Cuneiform processor not available!")
+                    # Create error result
+                    result = {
+                        'script_type': 'cuneiform',
+                        'confidence': 0.0,
+                        'processed_result': {
+                            'text': 'Cuneiform processor unavailable',
+                            'validation': {'quality_score': 0.0, 'error': 'Model not loaded'}
+                        },
+                        'historical_context': {},
+                        'creative_story': 'Cuneiform processing failed - model not available'
+                    }
+            else:  # unknown
+                print(f"[INFO] Unknown classification '{script_type}', defaulting to Egyptian...")
+                result = self.egyptian_processor.process_image(image_path)
+            # Step 3: Return result with classification metadata
+            if result:
+                result['vision_classification'] = script_type
+                result['classification_method'] = classification_method
+                result['classification_confidence'] = classification_confidence
+                print(f"[INFO] {script_type.title()} processing completed successfully")
+                return result
+            else:
+                print(f"[ERROR] {script_type.title()} processor returned None")
+                return None
+        except Exception as e:
+            print(f"[ERROR] Classification and processing failed: {e}")
+            import traceback
+            traceback.print_exc()
+            return None
+    def get_processor_by_type(self, script_type):
+        """Get processor by script type - now includes cuneiform"""
+        processor = self.processors.get(script_type.lower())
+        if script_type.lower() == 'cuneiform' and processor and not processor.cuneiform_available:
+            print(f"[WARN] Cuneiform processor exists but model not available")
+            return None
+        return processor
+    def get_supported_scripts(self):
+        """Get list of supported script types"""
+        scripts = ['egyptian', 'greek', 'latin']
+        if self.cuneiform_processor and self.cuneiform_processor.cuneiform_available:
+            scripts.append('cuneiform')
+        return scripts
+    def get_processor_status(self):
+        """Get status of all processors"""
+        status = {
+            'egyptian': self.egyptian_processor is not None,
+            'greek': self.greek_processor is not None,
+            'latin': self.latin_processor is not None,
+            'cuneiform': self.cuneiform_processor is not None and getattr(self.cuneiform_processor, 'cuneiform_available', False)
+        }
+        return status
+    def validate_script_detection(self, script_type, processed_result):
+        """Validate script detection results - enhanced for cuneiform"""
+        try:
+            validation = processed_result.get('validation', {})
+            quality_score = validation.get('quality_score', 0.0)
+            # Script-specific validation thresholds
+            thresholds = {
+                'egyptian': 0.3,
+                'greek': 0.4,
+                'latin': 0.4,
+                'cuneiform': 0.2  # Lower threshold due to OCR challenges
+            }
+            threshold = thresholds.get(script_type, 0.3)
+            # Additional cuneiform validation
+            if script_type == 'cuneiform':
+                cuneiform_ratio = validation.get('cuneiform_ratio', 0.0)
+                atf_ratio = validation.get('atf_ratio', 0.0)
+                # Accept if either Unicode cuneiform or ATF format detected
+                if cuneiform_ratio > 0.1 or atf_ratio > 0.3:
+                    print(f"[INFO] Cuneiform validation passed: cuneiform_ratio={cuneiform_ratio:.3f}, atf_ratio={atf_ratio:.3f}")
+                    return True
+            # Standard quality validation
+            is_valid = quality_score >= threshold
+            if is_valid:
+                print(f"[INFO] {script_type.title()} validation passed: quality={quality_score:.3f} >= {threshold}")
+            else:
+                print(f"[WARN] {script_type.title()} validation failed: quality={quality_score:.3f} < {threshold}")
+            return is_valid
+        except Exception as e:
+            print(f"[ERROR] Validation failed: {e}")
+            return False

services/story_generator.py ADDED Viewed

File without changes

utils/__init__.py ADDED Viewed

File without changes

utils/gpu_diagnostics.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import torch
+import gc
+_active_processors = {}
+def register_processor(name, processor_instance):
+    """Register a processor instance for active VRAM offloading."""
+    _active_processors[name] = processor_instance
+    print(f"[VRAM MANAGER] Registered processor: {name}")
+def reclaim_vram_for(target_processor_name):
+    """Offload other processors' models from GPU to CPU to avoid Out of Memory (OOM) crashes."""
+    if not torch.cuda.is_available():
+        return
+    print(f"[VRAM MANAGER] Reclaiming GPU VRAM for '{target_processor_name}'...")
+    offloaded = False
+    for name, proc in list(_active_processors.items()):
+        if name == target_processor_name:
+            continue
+        try:
+            # 1. Greek Processor
+            if name == "greek" and getattr(proc, "trocr_model", None) is not None:
+                current_device = next(proc.trocr_model.parameters()).device
+                if str(current_device).startswith("cuda"):
+                    print("[VRAM MANAGER] Offloading Greek TrOCR to CPU...")
+                    proc.trocr_model.to("cpu")
+                    offloaded = True
+            # 2. Latin Processor
+            elif name == "latin":
+                if getattr(proc, "tridis_model", None) is not None:
+                    current_device = next(proc.tridis_model.parameters()).device
+                    if str(current_device).startswith("cuda"):
+                        print("[VRAM MANAGER] Offloading Latin TRIDIS to CPU...")
+                        proc.tridis_model.to("cpu")
+                        offloaded = True
+                if getattr(proc, "trocr_latin_model", None) is not None:
+                    current_device = next(proc.trocr_latin_model.parameters()).device
+                    if str(current_device).startswith("cuda"):
+                        print("[VRAM MANAGER] Offloading Latin TrOCR to CPU...")
+                        proc.trocr_latin_model.to("cpu")
+                        offloaded = True
+            # 3. Cuneiform Processor
+            elif name == "cuneiform":
+                if getattr(proc, "clip_model", None) is not None:
+                    current_device = next(proc.clip_model.parameters()).device
+                    if str(current_device).startswith("cuda"):
+                        print("[VRAM MANAGER] Offloading Cuneiform CLIP to CPU...")
+                        proc.clip_model.to("cpu")
+                        offloaded = True
+                if getattr(proc, "cuneiform_model", None) is not None:
+                    current_device = next(proc.cuneiform_model.parameters()).device
+                    if str(current_device).startswith("cuda"):
+                        print("[VRAM MANAGER] Offloading Cuneiform Translator to CPU...")
+                        proc.cuneiform_model.to("cpu")
+                        offloaded = True
+            # 4. Egyptian Processor (HuggingFaceModels)
+            elif name == "egyptian" and getattr(proc, "_model", None) is not None:
+                current_device = next(proc._model.parameters()).device
+                if str(current_device).startswith("cuda"):
+                    print("[VRAM MANAGER] Offloading Egyptian T5 to CPU...")
+                    proc._model.to("cpu")
+                    offloaded = True
+        except Exception as e:
+            print(f"[WARN] Failed to offload '{name}' models: {e}")
+    if offloaded:
+        gc.collect()
+        torch.cuda.empty_cache()
+        print("[VRAM MANAGER] VRAM cache cleared successfully.")
+def get_gpu_info():
+    """Get diagnostic information about the NVIDIA GPU if available."""
+    info = {
+        "cuda_available": torch.cuda.is_available(),
+        "gpu_name": "N/A",
+        "vram_total_gb": 0.0,
+        "vram_allocated_gb": 0.0,
+        "vram_cached_gb": 0.0,
+        "vram_free_gb": 0.0,
+        "cuda_version": torch.version.cuda if torch.cuda.is_available() else "N/A",
+        "device": "cpu"
+    }
+    if info["cuda_available"]:
+        info["device"] = "cuda"
+        try:
+            info["gpu_name"] = torch.cuda.get_device_name(0)
+            props = torch.cuda.get_device_properties(0)
+            info["vram_total_gb"] = round(props.total_memory / 1024**3, 2)
+            allocated = torch.cuda.memory_allocated(0)
+            cached = torch.cuda.memory_reserved(0)
+            info["vram_allocated_gb"] = round(allocated / 1024**3, 3)
+            info["vram_cached_gb"] = round(cached / 1024**3, 3)
+            try:
+                free_mem, total_mem = torch.cuda.mem_get_info(0)
+                info["vram_free_gb"] = round(free_mem / 1024**3, 3)
+            except Exception:
+                info["vram_free_gb"] = round((props.total_memory - allocated) / 1024**3, 3)
+        except Exception as e:
+            print(f"[WARN] Error gathering detailed GPU info: {e}")
+    return info
+def log_gpu_info():
+    """Print clean diagnostic logs at startup."""
+    info = get_gpu_info()
+    print("=" * 60)
+    print("        NVIDIA GPU & CUDA INITIALIZATION DIAGNOSTICS")
+    print("=" * 60)
+    print(f"CUDA Available:      {info['cuda_available']}")
+    if info["cuda_available"]:
+        print(f"CUDA Version:        {info['cuda_version']}")
+        print(f"GPU Model:           {info['gpu_name']}")
+        print(f"Total VRAM:          {info['vram_total_gb']} GB")
+        print(f"Free VRAM:           {info['vram_free_gb']} GB")
+        print(f"Active Device:       CUDA (Dynamic Offloading Enabled)")
+    else:
+        print("Active Device:       CPU (GPU acceleration not available)")
+    print("=" * 60)
+def log_model_device(model_name, device):
+    """Log the device selected for a specific model."""
+    print(f"[DEVICE LOG] Model '{model_name}' -> Assigned to: {str(device).upper()}")
+def clear_gpu_cache():
+    """Utility to clean memory cache during benchmarks or processing."""
+    if torch.cuda.is_available():
+        gc.collect()
+        torch.cuda.empty_cache()

utils/image_utils.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import cv2
+import numpy as np
+from PIL import Image
+def segment_hieroglyphs(image_path):
+    """Segment hieroglyphs from image using OpenCV"""
+    try:
+        img = cv2.imread(image_path)
+        if img is None:
+            raise FileNotFoundError(f"Image not found or cannot be read: {image_path}")
+        # Convert to grayscale and apply adaptive thresholding
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        th = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                   cv2.THRESH_BINARY_INV, 25, 10)
+        # Apply morphological operations
+        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
+        th = cv2.morphologyEx(th, cv2.MORPH_CLOSE, kernel, iterations=1)
+        # Find contours
+        contours, _ = cv2.findContours(th, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+        boxes = []
+        h_img, w_img = th.shape
+        for cnt in contours:
+            x, y, w, h = cv2.boundingRect(cnt)
+            area = w * h
+            # Filter small areas and full-image contours
+            if area < 200:
+                continue
+            if w > 0.95*w_img or h > 0.95*h_img:
+                continue
+            boxes.append((x, y, w, h))
+        # If no boxes found, return full image
+        if not boxes:
+            return [Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))]
+        # Sort boxes by position (top to bottom, left to right)
+        boxes = sorted(boxes, key=lambda b: (b[1]//50, b[0]))
+        # Extract crops
+        crops = []
+        for (x, y, w, h) in boxes:
+            pad = 6
+            x0 = max(0, x - pad)
+            y0 = max(0, y - pad)
+            x1 = min(w_img, x + w + pad)
+            y1 = min(h_img, y + h + pad)
+            crop = img[y0:y1, x0:x1]
+            crops.append(Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)))
+        return crops
+    except Exception as e:
+        print(f"[ERROR] Hieroglyph segmentation failed: {e}")
+        return []
+def validate_image(file):
+    """Validate uploaded image file"""
+    from config import Config
+    config = Config()
+    # Check file size
+    if hasattr(file, 'content_length') and file.content_length > config.MAX_FILE_SIZE:
+        raise ValueError(f"File too large. Maximum size: {config.MAX_FILE_SIZE} bytes")
+    # Check file extension
+    if not file.filename or '.' not in file.filename:
+        raise ValueError("Invalid filename")
+    extension = file.filename.rsplit('.', 1)[1].lower()
+    if extension not in config.ALLOWED_EXTENSIONS:
+        raise ValueError(f"Invalid file type. Allowed: {', '.join(config.ALLOWED_EXTENSIONS)}")
+    # Try to open as image
+    try:
+        image = Image.open(file.stream)
+        image.verify()
+        file.stream.seek(0)  # Reset stream for later use
+        return True
+    except Exception:
+        raise ValueError("File is not a valid image")
+def preprocess_for_latin_ocr(image_path):
+    """Specialized preprocessing for Latin texts"""
+    try:
+        # Load image
+        image = cv2.imread(image_path)
+        if image is None:
+            raise ValueError(f"Cannot load image: {image_path}")
+        # Convert to grayscale
+        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        # Apply bilateral filter to reduce noise while preserving edges
+        filtered = cv2.bilateralFilter(gray, 9, 75, 75)
+        # Adaptive thresholding for varying lighting
+        thresh = cv2.adaptiveThreshold(
+            filtered, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY, 11, 2
+        )
+        return thresh
+    except Exception as e:
+        print(f"[ERROR] Latin preprocessing failed: {e}")
+        return None
+def enhance_contrast_for_manuscripts(image):
+    """Enhanced contrast specifically for manuscript images"""
+    # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
+    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
+    enhanced = clahe.apply(image)
+    return enhanced

utils/text_utils.py ADDED Viewed

	@@ -0,0 +1,82 @@

+import re
+from collections import Counter
+from itertools import groupby
+def is_gibberish(text):
+    """Check if text appears to be gibberish"""
+    if not text or not isinstance(text, str):
+        return True
+    words = re.findall(r"\w+", text.lower())
+    if len(words) == 0:
+        return True
+    # Check for excessive repetition
+    word_counts = Counter(words)
+    if word_counts:
+        most_common, count = word_counts.most_common(1)[0]
+        if count > 12 or (count / len(words)) > 0.4:
+            return True
+    # Check minimum word count
+    if len(words) < 1:
+        return True
+    if len(words) == 1 and len(words[0]) < 3:
+        return True
+    return False
+def build_description_from_codes(codes):
+    """Build description from Gardiner codes"""
+    from config import Config
+    config = Config()
+    labels = [config.CODE_TO_LABEL.get(code, code) for code in codes]
+    compressed = []
+    for key, group in groupby(labels):
+        count = len(list(group))
+        name = "unknown" if (key == "?" or key is None) else key
+        compressed.append(f"{name} (x{count})" if count > 1 else name)
+    return ", ".join(compressed)
+def clean_text(text):
+    """Clean and normalize text"""
+    if not text:
+        return ""
+    # Remove excessive whitespace
+    text = re.sub(r'\s+', ' ', text)
+    # Strip leading/trailing whitespace
+    text = text.strip()
+    return text
+def extract_words(text, min_length=2):
+    """Extract words from text with minimum length"""
+    if not text:
+        return []
+    words = re.findall(r"\w+", text, flags=re.UNICODE)
+    return [word for word in words if len(word) >= min_length]
+def calculate_text_stats(text):
+    """Calculate basic text statistics"""
+    if not text:
+        return {
+            "char_count": 0,
+            "word_count": 0,
+            "unique_chars": 0,
+            "avg_word_length": 0
+        }
+    words = extract_words(text)
+    return {
+        "char_count": len(text),
+        "word_count": len(words),
+        "unique_chars": len(set(text)),
+        "avg_word_length": sum(len(word) for word in words) / max(1, len(words))
+    }

utils/validation.py ADDED Viewed

File without changes