Spaces:
Sleeping
Sleeping
Commit ·
2f4af3f
0
Parent(s):
Initial DecipherAI backend deployment
Browse files- .gitattributes +35 -0
- Dockerfile +49 -0
- README.md +11 -0
- app.py +473 -0
- config.py +48 -0
- decipherai-api +1 -0
- models/clip_classifier.py +129 -0
- models/groq_client.py +51 -0
- models/huggingface_models.py +62 -0
- models/tesseract_ocr.py +7 -0
- processors/__init__.py +0 -0
- processors/base_processor.py +71 -0
- processors/cuneiform_processor.py +804 -0
- processors/egyptian_processor.py +390 -0
- processors/greek_processor.py +774 -0
- processors/latin_processor.py +1281 -0
- references.json +149 -0
- requirements.txt +30 -0
- services/__init__.py +0 -0
- services/context_generator.py +0 -0
- services/groq_vision_classifier.py +254 -0
- services/layout_parser.py +244 -0
- services/rag_service.py +186 -0
- services/script_detector.py +223 -0
- services/story_generator.py +0 -0
- utils/__init__.py +0 -0
- utils/gpu_diagnostics.py +137 -0
- utils/image_utils.py +121 -0
- utils/text_utils.py +82 -0
- utils/validation.py +0 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# DecipherAI Backend — Hugging Face Spaces Docker Configuration
# Space SDK: Docker
# Port: 7860

FROM python:3.11-slim

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    wget \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Download Ancient Greek Tesseract model
RUN mkdir -p /usr/share/tesseract-ocr/5/tessdata && \
    wget -q \
    https://github.com/tesseract-ocr/tessdata/raw/main/grc.traineddata \
    -O /usr/share/tesseract-ocr/5/tessdata/grc.traineddata

# Create non-root user (HF Spaces recommendation)
RUN useradd -m -u 1000 user

USER user

ENV HOME=/home/user
ENV PATH=/home/user/.local/bin:$PATH

WORKDIR /home/user/app

# Install Python dependencies
COPY --chown=user:user requirements.txt .

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application
COPY --chown=user:user . .

# Hugging Face Space port
EXPOSE 7860

# Production server
# NOTE: do not add --preload here. app.py starts its model loader on a
# background thread at import time; with --preload that import happens in the
# gunicorn master and the thread is not carried into the forked worker, which
# would then report "loading" forever. Without --preload the single worker
# imports the app itself and owns the loader thread.
CMD ["gunicorn", \
     "--bind", "0.0.0.0:7860", \
     "--workers", "1", \
     "--timeout", "300", \
     "app:app"]
|
README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Decipherai Api
|
| 3 |
+
emoji: 🔥
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
short_description: Ancient script analysis, OCR, translation and historical int
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,473 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import tempfile
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
# Safe stdout/stderr wrapper to prevent OSError: [Errno 22] when stdout pipe is closed/unbuffered in background
|
| 7 |
+
class SafeStream:
    """Proxy around a text stream that tolerates a closed or invalid output pipe.

    ``write`` swallows ``OSError`` with errno 22 (EINVAL) and ``flush`` swallows
    any ``OSError``; every other attribute is delegated to the wrapped stream.
    A falsy wrapped stream (e.g. ``None``) turns writes and flushes into no-ops.
    """

    # Numeric value of errno.EINVAL ("Invalid argument"), kept as a literal so
    # the class needs no extra import.
    _EINVAL = 22

    def __init__(self, original_stream):
        self.original_stream = original_stream

    def write(self, data):
        """Write ``data`` to the wrapped stream.

        Returns whatever the wrapped stream's ``write`` returns. When the write
        is skipped (no stream) or the EINVAL error is swallowed, ``len(data)``
        is returned so callers that check the count see the data as consumed.

        Raises:
            OSError: any OS-level error other than errno 22.
        """
        try:
            if self.original_stream:
                return self.original_stream.write(data)
        except OSError as e:
            if e.errno != self._EINVAL:
                raise
        return len(data)

    def flush(self):
        """Flush the wrapped stream, ignoring any ``OSError`` (best effort)."""
        try:
            if self.original_stream:
                self.original_stream.flush()
        except OSError:
            pass

    def __getattr__(self, attr):
        # __getattr__ only runs when normal lookup fails. If that happens for
        # "original_stream" itself (instance created without __init__, e.g. by
        # copy/pickle), delegating would re-enter this method forever.
        if attr == "original_stream":
            raise AttributeError(attr)
        return getattr(self.original_stream, attr)
|
| 28 |
+
|
| 29 |
+
# Route both standard streams through SafeStream so print() cannot crash the
# process when the output pipe is closed or invalid.
sys.stdout, sys.stderr = SafeStream(sys.stdout), SafeStream(sys.stderr)

# Environment variables from .env (HF_HOME, GROQ_API_KEY, ...) must be in place
# before the imports below run.
load_dotenv()
|
| 34 |
+
|
| 35 |
+
from flask import Flask, request, jsonify
|
| 36 |
+
from flask_cors import CORS
|
| 37 |
+
|
| 38 |
+
# Import modular components
|
| 39 |
+
from config import Config
|
| 40 |
+
from models.groq_client import GroqClient
|
| 41 |
+
from models.clip_classifier import CLIPClassifier
|
| 42 |
+
from models.tesseract_ocr import TesseractOCR
|
| 43 |
+
from models.huggingface_models import HuggingFaceModels
|
| 44 |
+
from services.groq_vision_classifier import GroqVisionScriptClassifier
|
| 45 |
+
from services.script_detector import ScriptDetectionService
|
| 46 |
+
from utils.image_utils import validate_image
|
| 47 |
+
from utils.text_utils import clean_text
|
| 48 |
+
from processors.cuneiform_processor import CuneiformProcessor
|
| 49 |
+
from utils.gpu_diagnostics import log_gpu_info
|
| 50 |
+
|
| 51 |
+
# Initialize Flask app
app = Flask(__name__)

# CORS — restrict origins in production via ALLOWED_ORIGINS env var
# Example: ALLOWED_ORIGINS=https://your-frontend.vercel.app,https://custom-domain.com
allowed_origins = os.getenv(
    "ALLOWED_ORIGINS",
    "http://localhost:3000,http://localhost:5173,http://localhost:5000"
)
# Strip each entry and drop blanks: "a.com, b.com" or a trailing comma would
# otherwise yield origins (" b.com", "") that can never match a request.
CORS(app, origins=[o.strip() for o in allowed_origins.split(",") if o.strip()])
|
| 61 |
+
|
| 62 |
+
# Shared module state. The component handles stay None until the background
# loader (initialize_models_async) assigns them.
import threading

config = Config()
groq_client = clip_classifier = hf_models = None
script_detector = cuneiform_processor = None
references = dict()

# Per-component load progress plus the overall "status"; read by /health and
# /analyze, written by the loader thread.
model_status = dict(
    status="loading",
    groq="pending",
    clip="pending",
    translator="pending",
    cuneiform="pending",
    script_detector="pending",
)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def load_references():
    """Fill the module-level ``references`` dict from ``config.REFERENCES_PATH``.

    Missing keys in the JSON file get a default (empty dict for symbol notes,
    a descriptive sentence for hints). If reading or parsing fails for any
    reason, a warning is printed and a minimal fallback table is installed.
    """
    global references
    import json

    # Defaults used when the file loads but a hint key is absent.
    hint_defaults = {
        "greek_hint": "If no specific character note is found, treat as lexical marker considering diacriticals (breathing marks, accents, vowel quantity) which affect pronunciation, meaning, and grammatical function in ancient Greek texts.",
        "latin_hint": "If no specific character note is found, consider standard Latin letters or medieval scribal abbreviations.",
        "cuneiform_hint": "Cuneiform signs represent syllables, words, or concepts in ancient Mesopotamian languages (Sumerian, Akkadian, etc.)",
    }
    key_order = (
        "egypt_symbol_notes",
        "greek_symbol_notes",
        "greek_hint",
        "latin_symbol_notes",
        "latin_hint",
        # Cuneiform references
        "cuneiform_symbol_notes",
        "cuneiform_hint",
    )

    try:
        with open(config.REFERENCES_PATH, "r", encoding="utf-8") as handle:
            payload = json.load(handle)

        loaded = {}
        for key in key_order:
            # Symbol-note keys default to a fresh empty dict each time.
            loaded[key] = payload.get(key, hint_defaults[key] if key in hint_defaults else {})
        references = loaded

        print(f"[INFO] Loaded references from {config.REFERENCES_PATH}")
    except Exception as exc:
        print(f"[WARN] Failed to load references: {exc}")
        references = {
            "egypt_symbol_notes": {},
            "greek_symbol_notes": {},
            "greek_hint": "Possible Greek lexical marker.",
            "latin_symbol_notes": {},
            "latin_hint": "Latin scribal practice.",
            "cuneiform_symbol_notes": {},
            "cuneiform_hint": "Ancient cuneiform sign."
        }
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def initialize_models_async():
    """Load every model and service in sequence; meant to run on a background thread.

    Order: GPU diagnostics, references, Groq client, CLIP classifier, Hugging
    Face translator, cuneiform processor, script detection service. Each step
    records its progress in ``model_status``. A cuneiform failure is contained
    (the processor is set to ``None``); any other exception aborts the run,
    sets the overall status to "failed", and marks whichever component was
    mid-load as "failed" so the status table does not show it loading forever.
    """
    global groq_client, clip_classifier, hf_models, script_detector, cuneiform_processor
    try:
        print("[INFO] Background model preloading thread started...")

        # Log GPU Diagnostics
        log_gpu_info()

        # Load references first
        load_references()

        # Groq
        model_status["groq"] = "loading"
        groq_client = GroqClient()
        model_status["groq"] = "ready" if groq_client.is_available() else "unavailable"
        print(f"[INFO] Groq client initialization complete: {model_status['groq']}")

        # CLIP
        model_status["clip"] = "loading"
        clip_classifier = CLIPClassifier()
        model_status["clip"] = "ready" if (clip_classifier and clip_classifier.pipeline is not None) else "failed"
        print(f"[INFO] CLIP classifier initialization complete: {model_status['clip']}")

        # HF Translator
        model_status["translator"] = "loading"
        hf_models = HuggingFaceModels()
        model_status["translator"] = "ready" if (hf_models and hf_models.get_translator() is not None) else "failed"
        print(f"[INFO] Hugging Face models initialization complete: {model_status['translator']}")

        # Cuneiform Processor — optional: its failure must not abort the rest
        model_status["cuneiform"] = "loading"
        try:
            print("[INFO] Initializing cuneiform processor...")
            cuneiform_processor = CuneiformProcessor(
                groq_client=groq_client,
                references=references,
                clip_classifier=clip_classifier
            )
            model_status["cuneiform"] = "ready" if cuneiform_processor.cuneiform_available else "unavailable"
        except Exception as e:
            print(f"[ERROR] Failed to initialize cuneiform processor: {e}")
            model_status["cuneiform"] = "failed"
            cuneiform_processor = None
        print(f"[INFO] Cuneiform processor initialization complete: {model_status['cuneiform']}")

        # Script Detection Service
        model_status["script_detector"] = "loading"
        script_detector = ScriptDetectionService(
            groq_client=groq_client,
            references=references,
            clip_classifier=clip_classifier,
            translator_pipe=hf_models.get_translator(),
            cuneiform_processor=cuneiform_processor
        )
        model_status["script_detector"] = "ready"
        print(f"[INFO] Script detection service initialization complete: {model_status['script_detector']}")

        model_status["status"] = "ready"
        print("[SUCCESS] All models initialized successfully in the background")

    except Exception as e:
        model_status["status"] = "failed"
        # The component that raised was left at "loading"; nothing will ever
        # finish it, so record the failure. Untouched ones stay "pending".
        for component in list(model_status):
            if component != "status" and model_status[component] == "loading":
                model_status[component] = "failed"
        print(f"[ERROR] Critical failure in background model initialization: {e}")
        # Same diagnostics as the /analyze error path: keep the stack trace.
        import traceback
        traceback.print_exc()
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def initialize_models():
    """Mark the service as loading and start ``initialize_models_async`` on a daemon thread."""
    print("[INFO] Spawning background thread for model initialization...")
    model_status["status"] = "loading"
    loader = threading.Thread(target=initialize_models_async, daemon=True)
    loader.start()
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def _cuneiform_response(result, base_response):
    """Build the /analyze response for an image classified as cuneiform.

    Returns a degraded payload (HTTP 200 with an ``error`` field) when the
    cuneiform processor is missing/unavailable or when translation raises.
    """
    if not cuneiform_processor or not cuneiform_processor.cuneiform_available:
        return jsonify({
            **base_response,
            "error": "Cuneiform processing unavailable",
            "labels": [],
            "gardiner_codes": [],
            "translation": "Cuneiform translation model not available",
            "translation_ok": False
        }), 200

    try:
        # Process cuneiform text
        processed_result = result.get('processed_result', {})
        cuneiform_text = processed_result.get('text', '')

        # Translate cuneiform to English
        translation = ""
        translation_ok = False

        # Require more than two non-blank characters before translating.
        if cuneiform_text and len(cuneiform_text.strip()) > 2:
            print(f"[INFO] Translating cuneiform: {cuneiform_text[:50]}...")
            translation = cuneiform_processor.translate_cuneiform(cuneiform_text)
            # A translation starting with "Error" is treated as a failure marker.
            translation_ok = bool(translation and not translation.startswith("Error"))
        else:
            translation = "No readable cuneiform text extracted"

        validation = processed_result.get('validation', {})
        char_analysis = processed_result.get('char_analysis', {})

        # Build cuneiform response
        return jsonify({
            **base_response,
            "labels": [],
            "gardiner_codes": [],
            "translation": translation,
            "translation_ok": translation_ok,
            "cuneiform_text": cuneiform_text,
            "validation": {
                "quality_score": validation.get('quality_score', 0.0),
                "cuneiform_ratio": validation.get('cuneiform_ratio', 0.0),
                "atf_ratio": validation.get('atf_ratio', 0.0),
                "char_analysis": char_analysis,
                "ocr_method": "praeclarum/cuneiform (T5-based translation)",
                "supports_translation": True,
                "input_format": char_analysis.get('text_format', 'Unknown')
            }
        })

    except Exception as e:
        print(f"[ERROR] Cuneiform processing failed: {e}")
        return jsonify({
            **base_response,
            "error": f"Cuneiform processing error: {str(e)}",
            "labels": [],
            "gardiner_codes": [],
            "translation": "Cuneiform processing failed",
            "translation_ok": False
        }), 200


def _alphabetic_response(script_type, result, base_response):
    """Build the /analyze response for Greek or Latin, adding script-specific validation info."""
    processed_result = result.get('processed_result', {})
    validation = processed_result.get('validation', {})

    response_data = {
        **base_response,
        "labels": [],
        "gardiner_codes": [],
        "translation": processed_result.get('text', ''),
        "translation_ok": True,
    }

    # Add enhanced validation info for Greek
    if script_type == 'greek':
        response_data["validation"] = {
            "quality_score": validation.get('quality_score', 0.0),
            "greek_ratio": validation.get('greek_ratio', 0.0),
            "has_polytonic": validation.get('has_polytonic', False),
            "char_analysis": processed_result.get('char_analysis', {}),
            "ocr_method": "ancient_greek_ocr" if validation.get('quality_score', 0) > 0.7 else "standard_greek_ocr"
        }

    elif script_type == 'latin':
        response_data["validation"] = {
            "quality_score": validation.get('quality_score', 0.0),
            "latin_ratio": validation.get('latin_ratio', 0.0),
            "trocr_used": validation.get('tridis_used', False) or (validation.get('ocr_method') in ['trocr-base-latin', 'tridis_HTR']),
            "char_analysis": processed_result.get('char_analysis', {}),
            "ocr_method": validation.get('ocr_method', 'standard_latin_ocr'),
            "writing_style": validation.get('writing_style', 'cursive')
        }

    return jsonify(response_data)


def _egyptian_response(result, base_response):
    """Build the /analyze response for Egyptian hieroglyphs (the default script)."""
    # Direct indexing: a missing key raises and is reported as a 500 by analyze().
    processed = result['processed_result']
    return jsonify({
        **base_response,
        "labels": processed['labels'],
        "gardiner_codes": processed['codes'],
        "translation": processed['translation'],
        "translation_ok": processed['translation_ok']
    })


@app.route('/analyze', methods=['POST'])
def analyze():
    """Main analysis endpoint with Groq Vision classification.

    Expects a multipart upload under the ``image`` field. Responses:
      * 503 while models are not ready; ``status`` carries the real overall
        state ("loading" or "failed") so clients can stop retrying after a
        failed initialization.
      * 400 for a missing file, empty filename, or an image rejected by
        ``validate_image``.
      * 500 when detection returns nothing or an unexpected error occurs.
      * 200 with the script-specific payload otherwise.
    The uploaded image is written to a temporary file that is always removed.
    """
    tmp_path = None

    try:
        # Check if models are fully loaded
        current_status = model_status["status"]
        if current_status != "ready":
            if current_status == "failed":
                message = "Model initialization failed. Check the server logs; the service must be restarted."
            else:
                message = "Models are still loading in the background. Please try again in a few moments."
            return jsonify({
                "error": message,
                "status": current_status,
                "models_status": model_status
            }), 503

        # Validate request
        if 'image' not in request.files:
            return jsonify({"error": "No image uploaded"}), 400

        img_file = request.files['image']
        if img_file.filename == '':
            return jsonify({"error": "Empty filename"}), 400

        # Validate image file
        try:
            validate_image(img_file)
        except ValueError as e:
            return jsonify({"error": str(e)}), 400

        # Save temporary file (delete=False: removed explicitly in `finally`)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
            tmp_path = tmp.name
            img_file.save(tmp_path)

        # Process image with Groq Vision classification
        result = script_detector.detect_and_process(tmp_path)

        if not result:
            return jsonify({"error": "Could not process image"}), 500

        script_type = result.get('script_type', 'egyptian')

        # Base response with Vision classification info
        base_response = {
            "script_type": script_type,
            "vision_classification": result.get('vision_classification', 'unknown'),
            "classification_method": result.get('classification_method', 'unknown'),
            "classification_confidence": result.get('classification_confidence', 0.0),
            "confidence": result.get('confidence', 0.0),
            "historical_context": result.get('historical_context', {}),
            "creative_story": result.get('creative_story', ''),
            # NOTE(review): hard-coded label — confirm it matches the model the
            # vision classifier actually calls.
            "model_used": "llama-3.2-90b-vision-preview"
        }

        if script_type == 'cuneiform':
            return _cuneiform_response(result, base_response)
        if script_type in ['greek', 'latin']:
            return _alphabetic_response(script_type, result, base_response)
        return _egyptian_response(result, base_response)  # Egyptian

    except Exception as e:
        print(f"[ERROR] Analysis failed: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({"error": "Processing failed"}), 500

    finally:
        # Cleanup temporary file (best effort)
        if tmp_path:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
@app.route('/chat', methods=['POST'])
def chat():
    """Chatbot endpoint for manuscript queries and general dialogue.

    Expects a JSON object with ``message`` (required), optional ``history``
    (list of ``{"role", "content"}`` turns; only the last five are used) and
    optional ``context`` (appended to the system prompt).

    Returns ``{"reply": ...}``; 400 when no message is supplied (including
    malformed or non-JSON bodies); 500 on unexpected errors. When the Groq
    client is missing or unavailable a canned offline reply is returned.
    """
    try:
        # silent=True: a malformed or non-JSON body yields None instead of
        # raising, so it reaches the 400 below rather than the generic 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        message = data.get("message", "")
        history = data.get("history", [])
        context = data.get("context", "")

        if not message:
            return jsonify({"error": "Message is required"}), 400

        # Client-supplied history: ignore anything that is not a list of dicts.
        if not isinstance(history, list):
            history = []

        system_prompt = (
            "You are DecipherAI's helpful historical assistant. You are an expert paleographer and historian.\n"
            "Answer the user's questions about ancient scripts, translations, and history in a helpful, "
            "academic yet accessible manner. Cite historical sources when appropriate."
        )
        if context:
            system_prompt += f"\n\nHere is the context of the current manuscript translation:\n{context}"

        if not groq_client or not groq_client.is_available():
            reply = (
                f"Thank you for your question: '{message}'. I'm currently running in offline fallback mode "
                f"because the Groq API key is not set. Once configured, I will be able to answer all your "
                f"scholarly questions about the translated scripts, historical context, and paleography in real time!"
            )
        else:
            # One "ROLE: content" line per recent turn, then the new message.
            lines = [
                f"{str(turn.get('role') or 'user').upper()}: {turn.get('content', '')}"
                for turn in history[-5:]
                if isinstance(turn, dict)
            ]
            lines.append(f"USER: {message}\nASSISTANT:")
            prompt = "\n".join(lines)

            reply = groq_client.generate_response(
                system_prompt=system_prompt,
                user_prompt=prompt,
                max_tokens=500
            ) or "I'm sorry, I encountered an error generating a response."

        return jsonify({"reply": reply})
    except Exception as e:
        print(f"[ERROR] Chat failed: {e}")
        return jsonify({"error": "Failed to process chat message"}), 500
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
@app.route('/health', methods=['GET'])
def health_check():
    """Health check endpoint returning real-time load status.

    ``status`` is "healthy" once all models are ready, "failed" when background
    initialization aborted (so pollers can stop waiting), and "initializing"
    otherwise. ``models_status`` exposes the per-component table.
    """
    overall = model_status["status"]
    if overall == "ready":
        label = "healthy"
    elif overall == "failed":
        label = "failed"
    else:
        label = "initializing"
    return jsonify({
        "status": label,
        "models_status": model_status
    })
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
@app.route('/info', methods=['GET'])
def info():
    """Return static metadata about the service: name, version, scripts and features."""
    supported_scripts = [
        "Egyptian Hieroglyphs",
        "Ancient Greek",
        "Latin",
        "Ancient Cuneiform",
    ]
    features = [
        "Multi-script detection",
        "OCR text extraction",
        "Historical context generation",
        "Creative story generation",
        "Cuneiform translation (Sumerian/Akkadian → English)",
    ]
    payload = {
        "app": "Ancient Script Recognition System",
        "version": "2.1.0",
        "supported_scripts": supported_scripts,
        "features": features,
    }
    return jsonify(payload)
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
# --- Model initialization ---
|
| 444 |
+
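# When running under gunicorn (or any WSGI server), __name__ != "__main__",
# so we initialize models at module level, in whichever process imports this module.
# NOTE: threads are not copied across fork(). With gunicorn's --preload flag the
# import (and the loader thread it starts) happens in the master process, so a
# forked worker would inherit model_status == "loading" with no thread left to
# finish the job. Run without --preload so the worker imports the module itself.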
|
| 447 |
+
def _auto_initialize():
    """Start model loading on import, except inside the Flask reloader child.

    The reloader child (``WERKZEUG_RUN_MAIN == "true"``) is left to the
    ``__main__`` block, so nothing is started here in that case.
    """
    in_reloader_child = os.getenv("WERKZEUG_RUN_MAIN") == "true"
    if not in_reloader_child:
        print("[INIT] WSGI server detected — initializing models...")
        initialize_models()
|
| 454 |
+
|
| 455 |
+
|
| 456 |
+
if __name__ != "__main__":
    # Running under gunicorn / WSGI
    _auto_initialize()
else:
    print("[INIT] Starting Ancient Script Recognition System...")

    # Server settings from the environment
    port = int(os.getenv("PORT", 7860))
    debug = os.getenv("DEBUG", "False").lower() == "true"

    # In debug mode the Werkzeug reloader runs this script twice; only the
    # child (WERKZEUG_RUN_MAIN == "true") loads models, to avoid duplicate threads.
    reloader_parent = debug and os.environ.get("WERKZEUG_RUN_MAIN") != "true"
    if reloader_parent:
        print("[INFO] Reloader active. Model initialization deferred to child process.")
    else:
        initialize_models()

    print(f"[INFO] Starting server on port {port}")
    app.run(host="0.0.0.0", port=port, debug=debug)
|
config.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
class Config:
    """Static application settings: paths, model identifiers, limits and lookup tables.

    Values read from the environment are captured once, when the class body runs.
    """

    # --- Paths -----------------------------------------------------------
    BASE_DIR = Path(__file__).parent
    REFERENCES_PATH = BASE_DIR / "references.json"
    ANCIENT_GREEK_TESSDATA = BASE_DIR / "tessdata" / "ancient-greek"

    # --- Environment-driven settings --------------------------------------
    TESSERACT_EXE = os.getenv("TESSERACT_EXE", "tesseract")
    TESSDATA_PREFIX = os.getenv("TESSDATA_PREFIX")
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")

    # --- Model settings ---------------------------------------------------
    HF_TRANSLATOR_MODEL = "AnushS/Hieroglyph-Translator-Using-Gardiner-Codes"
    CLIP_MODEL = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
    # 0 when CUDA is available, otherwise -1
    DEVICE = -1 if not torch.cuda.is_available() else 0

    # --- Groq settings ----------------------------------------------------
    GROQ_MODEL = "openai/gpt-oss-120b"
    GROQ_TEMPERATURE = 1.0
    GROQ_STORY_MAX_TOKENS = 1024
    GROQ_CONTEXT_MAX_TOKENS = 2048

    # --- File upload settings ---------------------------------------------
    MAX_FILE_SIZE = 16 * 1024 * 1024  # 16MB
    ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'bmp'}

    # --- Gardiner code mapping (label -> code) ------------------------------
    GARDINER_MAP = {
        "man_seated": "A1",
        "woman_seated": "B1",
        "god_figure": "C1",
        "eye": "D4",
        "hippopotamus": "E25",
        "leg": "F28",
        "owl": "G17",
        "feather": "H2",
        "lizard": "I1",
        "fish": "K1",
        "insect": "L1",
        "reed": "M17",
        "sun": "N5",
        "crown": "S39",
        "bow": "T14",
        "hoe": "U25",
        "rope": "V1",
        "jar": "W1",
        "bread": "X3",
        "scribe_tools": "Y5",
    }

    # --- Tesseract command-line option strings, keyed by OCR mode -----------
    TESSERACT_CONFIGS = {
        'ancient_greek': "--psm 6 --oem 1 -c preserve_interword_spaces=1",
        'standard_greek': "--psm 6 --oem 1",
        'fallback': "--psm 3 --oem 1"
    }

    @property
    def CODE_TO_LABEL(self):
        """Inverse of ``GARDINER_MAP`` (code -> label), rebuilt on each access."""
        return {code: label for label, code in self.GARDINER_MAP.items()}
|
decipherai-api
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Subproject commit ed9e7fdd210f252a5309a7e6fc728a29fce274dd
|
models/clip_classifier.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from transformers import CLIPProcessor, CLIPModel
|
| 3 |
+
from PIL import Image
|
| 4 |
+
import numpy as np
|
| 5 |
+
from config import Config
|
| 6 |
+
from utils.gpu_diagnostics import log_model_device
|
| 7 |
+
|
| 8 |
+
class CLIPClassifier:
    """Zero-shot CLIP wrapper used for script-type and symbol classification.

    The constructor tries the checkpoint named by ``Config.CLIP_MODEL`` and
    falls back to ``openai/clip-vit-base-patch32``. If both attempts fail the
    instance stays usable but every classification call reports "no result".
    """

    # Checkpoint tried when the configured one cannot be loaded.
    _FALLBACK_MODEL = "openai/clip-vit-base-patch32"

    def __init__(self):
        """Pick a device and load the CLIP model/processor (with fallback)."""
        self.config = Config()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = None
        self.processor = None

        # Load CLIP model and processor with fallback
        model_name = getattr(self.config, 'CLIP_MODEL', self._FALLBACK_MODEL)
        try:
            print(f"[INFO] Loading CLIP model: {model_name}...")
            self._load_checkpoint(model_name)
            log_model_device("CLIP script classifier", self.device)
            print(f"[INFO] CLIP model loaded on {self.device}")
        except Exception as e:
            print(f"[WARN] Failed to load CLIP model '{model_name}': {e}")
            fallback_name = self._FALLBACK_MODEL
            try:
                print(f"[INFO] Loading fallback CLIP model: {fallback_name}...")
                self._load_checkpoint(fallback_name)
                log_model_device("CLIP script classifier (fallback)", self.device)
                print(f"[INFO] Fallback CLIP model loaded on {self.device}")
            except Exception as fe:
                print(f"[ERROR] Failed to load fallback CLIP model: {fe}")

    def _load_checkpoint(self, model_name):
        """Load model and processor for ``model_name`` and publish them together.

        Both objects are built in locals first, so a failure half-way through
        never leaves ``self.model`` set while ``self.processor`` is still None.

        Raises:
            Exception: whatever ``from_pretrained`` / ``.to`` raise.
        """
        model = CLIPModel.from_pretrained(model_name)
        processor = CLIPProcessor.from_pretrained(model_name)
        model.to(self.device)
        model.eval()  # Set model to evaluation mode
        self.model, self.processor = model, processor

    @property
    def pipeline(self):
        """Property checked in app.py/test.py to ensure model is initialized.

        Returns:
            The loaded CLIP model, or ``None`` when loading failed.
        """
        return self.model

    def classify_script_type(self, image):
        """Classify script type of image into one of the four supported categories.

        Args:
            image: a PIL image or a NumPy array (converted with
                ``Image.fromarray``).

        Returns:
            tuple: ``(label, score)`` where ``label`` is one of ``"egyptian"``,
            ``"greek"``, ``"latin"``, ``"cuneiform"`` and ``score`` is the
            softmax probability of that label. ``("unknown", 0.0)`` when the
            model is not loaded or inference fails.
        """
        if not self.pipeline:
            return "unknown", 0.0

        try:
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)

            # Prompts representing the four classes (same order as `scripts`).
            scripts = ["egyptian", "greek", "latin", "cuneiform"]
            descriptions = [
                "ancient Egyptian hieroglyphic writing with drawings of animals and humans",
                "ancient Greek alphabet script on papyrus or stone with polytonic symbols",
                "medieval Latin manuscript text written in ink on parchment",
                "ancient Mesopotamian cuneiform tablet with wedge-shaped markings in clay"
            ]

            inputs = self.processor(
                text=descriptions,
                images=image,
                return_tensors="pt",
                padding=True
            ).to(self.device)

            with torch.inference_mode():
                outputs = self.model(**inputs)
                logits_per_image = outputs.logits_per_image
                probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]

            best_idx = np.argmax(probs)
            score = float(probs[best_idx])
            script_label = scripts[best_idx]

            print(f"[INFO] CLIP script classification: {script_label} ({score:.3f})")
            return script_label, score

        except Exception as e:
            print(f"[ERROR] CLIP script classification failed: {e}")
            return "unknown", 0.0

    def classify_symbols(self, crops, candidate_labels):
        """Classify segmented symbol image crops against candidate labels.

        Args:
            crops: sequence of PIL images or NumPy arrays, one per symbol.
            candidate_labels: label names; underscores are replaced by spaces
                when the text prompts are built.

        Returns:
            list: one entry per crop, holding the best-matching label. Every
            entry is ``None`` when the model is unavailable, there are no
            candidate labels, or inference fails; ``[]`` when ``crops`` is
            empty.
        """
        if not self.pipeline or not crops or not candidate_labels:
            return [None] * len(crops) if crops else []

        try:
            print(f"[INFO] Batch classifying {len(crops)} crops using CLIP...")

            # Format candidate labels into descriptive prompts for better visual matching
            prompts = [
                f"an ancient Egyptian hieroglyph symbol of a {label.replace('_', ' ')}"
                for label in candidate_labels
            ]

            # Tokenize and embed the prompts once; they are shared by all crops.
            text_inputs = self.processor(
                text=prompts,
                return_tensors="pt",
                padding=True
            ).to(self.device)

            with torch.inference_mode():
                text_features = self.model.get_text_features(**text_inputs)
                text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

            results = []
            for crop in crops:
                if isinstance(crop, np.ndarray):
                    crop = Image.fromarray(crop)

                image_inputs = self.processor(images=crop, return_tensors="pt").to(self.device)

                with torch.inference_mode():
                    image_features = self.model.get_image_features(**image_inputs)
                    image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)

                    # Both sides are L2-normalised, so the dot product is the cosine similarity.
                    similarities = (image_features @ text_features.T).squeeze(0)
                    best_idx = torch.argmax(similarities).item()

                results.append(candidate_labels[best_idx])

            return results

        except Exception as e:
            print(f"[ERROR] CLIP symbol classification failed: {e}")
            # Report "no result" rather than labelling every crop with the first
            # candidate, which would be indistinguishable from a real prediction.
            return [None] * len(crops)
|
models/groq_client.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from config import Config
|
| 3 |
+
|
| 4 |
+
class GroqClient:
    """Thin wrapper around the Groq chat-completions SDK.

    The client is optional: when the API key or the ``groq`` package is
    missing, the instance is still created and ``generate_response`` returns
    an empty string.
    """

    def __init__(self):
        """Read the API key (config first, then environment) and build the client."""
        self.config = Config()
        # getattr keeps construction working when the config defines no key at all,
        # matching how GROQ_TEMPERATURE is read below.
        self.api_key = getattr(self.config, "GROQ_API_KEY", None) or os.getenv("GROQ_API_KEY")
        self.client = None

        if self.api_key:
            try:
                # Imported here so a missing package only disables this client.
                from groq import Groq
                self.client = Groq(api_key=self.api_key)
                print("[INFO] Groq client initialized successfully")
            except ImportError:
                print("[WARN] groq package not installed. Run 'pip install groq'.")
            except Exception as e:
                print(f"[ERROR] Failed to initialize Groq client: {e}")
        else:
            print("[WARN] GROQ_API_KEY not found in configuration or environment.")

    def is_available(self) -> bool:
        """Check if Groq API client is available and configured."""
        return self.client is not None

    def generate_response(self, system_prompt: str, user_prompt: str, max_tokens: int = 1024) -> str:
        """Generate response from Groq LLM.

        Args:
            system_prompt: content of the ``system`` message.
            user_prompt: content of the ``user`` message.
            max_tokens: forwarded as ``max_completion_tokens``.

        Returns:
            The text of the first choice. An empty string when the client is
            unavailable, the call fails, or the API returns no text content.
        """
        if not self.is_available():
            print("[WARN] GroqClient not available for generating response.")
            return ""

        try:
            # Use stable model name or configured fallback
            model = self.config.GROQ_MODEL
            # Common model fallbacks if config is generic or outdated
            if model == "openai/gpt-oss-120b":
                model = "llama-3.1-8b-instant"  # standard Groq model

            completion = self.client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=getattr(self.config, 'GROQ_TEMPERATURE', 0.7),
                max_completion_tokens=max_tokens,
            )
            content = completion.choices[0].message.content
            # content may be None; honour the declared `str` return type.
            return "" if content is None else content
        except Exception as e:
            print(f"[ERROR] Groq API call failed: {e}")
            return ""
|
models/huggingface_models.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 3 |
+
from config import Config
|
| 4 |
+
from utils.gpu_diagnostics import log_model_device, register_processor, reclaim_vram_for
|
| 5 |
+
|
| 6 |
+
class HuggingFaceModels:
    """Lazy, CPU-only wrapper around the Egyptian (Gardiner code) Seq2Seq translator.

    ``translator`` / ``get_translator()`` expose a callable that returns a
    one-element list shaped like a Hugging Face pipeline result:
    ``[{"generated_text": ..., "translation_text": ...}]``.
    """

    def __init__(self):
        """Set up lazy state; the model is loaded on the first translation call."""
        self.config = Config()
        self.device = torch.device("cpu")  # Force Egyptian translator to CPU to save GPU VRAM
        self._tokenizer = None
        self._model = None
        # Set once a load attempt fails so later calls do not retry the load.
        self._load_failed = False
        self.translator = self._translate_fn
        print("[INFO] Egyptian translator initialized (Forced to CPU)")

    @staticmethod
    def _empty_result():
        """Return the pipeline-shaped "no translation" result."""
        return [{"generated_text": "", "translation_text": ""}]

    def setup_translation_model(self):
        """Load T5 Seq2Seq model on CPU.

        On failure the error is logged, the instance is marked as failed and
        ``self.translator`` is replaced by the mock translator.
        """
        model_name = getattr(self.config, 'HF_TRANSLATOR_MODEL', 'AnushS/Hieroglyph-Translator-Using-Gardiner-Codes')
        try:
            print(f"[INFO] Lazily loading Hugging Face translation model on CPU: {model_name}...")
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            model.to(self.device)
            model.eval()
            # Publish both together so a partial load is never observable.
            self._tokenizer, self._model = tokenizer, model
            log_model_device("Egyptian T5 Translator", self.device)
            print("[INFO] Translation model loaded successfully on CPU (Seq2Seq direct)")
        except Exception as e:
            print(f"[ERROR] Failed to load translation model '{model_name}': {e}")
            self._load_failed = True
            self.translator = self._get_mock_translator()

    def _translate_fn(self, prompt, max_new_tokens=128, **kwargs):
        """Translate using the T5 model directly on CPU.

        Args:
            prompt: input text for the tokenizer (truncated to 512 tokens).
            max_new_tokens: generation length limit.
            **kwargs: optional ``num_beams`` (default 4) and ``do_sample``
                (default False); other keys are ignored.

        Returns:
            list: one dict with ``generated_text`` and ``translation_text``;
            both are empty strings when the model is unavailable or
            inference fails.
        """
        try:
            if self._model is None:
                # getattr: tolerate instances created without running __init__.
                if getattr(self, "_load_failed", False):
                    return self._empty_result()
                self.setup_translation_model()
                if self._model is None or self._tokenizer is None:
                    # Load did not produce a usable model: remember that instead of
                    # calling a None tokenizer and retrying the load on every call.
                    self._load_failed = True
                    return self._empty_result()

            inputs = self._tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.inference_mode():
                outputs = self._model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    num_beams=kwargs.get("num_beams", 4),
                    do_sample=kwargs.get("do_sample", False),
                )

            decoded = self._tokenizer.decode(outputs[0], skip_special_tokens=True)
            return [{"generated_text": decoded, "translation_text": decoded}]
        except Exception as e:
            print(f"[ERROR] Translation inference failed: {e}")
            return self._empty_result()

    def get_translator(self):
        """Return the loaded translation function or mock fallback."""
        return self.translator

    def _get_mock_translator(self):
        """Return a dummy translator function that mimics pipeline behavior on error."""
        print("[INFO] Setting up mock fallback translator")

        def mock_pipeline(prompt, *args, **kwargs):
            return [{"generated_text": "", "translation_text": ""}]

        return mock_pipeline
|
models/tesseract_ocr.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class TesseractOCR:
    """Import-compatibility stub with no behaviour of its own.

    It exists so that app.py and test.py can import and instantiate it;
    the processors talk to pytesseract directly.
    """

    def __init__(self):
        """Create the stub; there is no state to initialise."""
        return None
|
processors/__init__.py
ADDED
|
File without changes
|
processors/base_processor.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
from PIL import Image
|
| 3 |
+
|
| 4 |
+
class BaseScriptProcessor(ABC):
    """Abstract base for per-script processors.

    Subclasses implement the five pipeline stages; ``process_image`` chains
    them and packages the results.
    """

    def __init__(self, groq_client, references, clip_classifier=None):
        """Store shared collaborators and build the RAG / layout services.

        Args:
            groq_client: LLM client made available to subclasses.
            references: reference data made available to subclasses.
            clip_classifier: optional CLIP classifier made available to subclasses.
        """
        self.groq_client = groq_client
        self.references = references
        self.clip_classifier = clip_classifier

        # Service classes are imported at construction time, not module import time.
        from services.rag_service import RAGService
        from services.layout_parser import LayoutParser

        self.rag_service = RAGService()
        self.layout_parser = LayoutParser()

    @abstractmethod
    def detect_script(self, image_path):
        """Return ``(is_detected, confidence)`` for this script type."""

    @abstractmethod
    def extract_text(self, image_path):
        """Return the text/symbols extracted from the image."""

    @abstractmethod
    def process_text(self, extracted_text):
        """Turn extracted text into a meaningful processed result."""

    @abstractmethod
    def generate_historical_context(self, processed_text):
        """Return historical context for the processed text."""

    @abstractmethod
    def generate_story(self, processed_text):
        """Return a creative story based on the processed text."""

    def process_image(self, image_path):
        """Run detect -> extract -> process -> context/story on one image.

        Returns:
            dict with keys ``script_type``, ``confidence``, ``extracted_text``,
            ``processed_result``, ``historical_context`` and ``creative_story``;
            ``None`` when the script is not detected, nothing is extracted, or
            any stage raises (the error is printed).
        """
        try:
            # Stage 1: detection gates everything else.
            detected, confidence = self.detect_script(image_path)
            if not detected:
                return None

            # Stage 2: extraction; a falsy result ends the pipeline.
            raw_text = self.extract_text(image_path)
            if not raw_text:
                return None

            # Stage 3: interpretation of the extracted text.
            processed = self.process_text(raw_text)

            # Stage 4: both generators work from the processed result.
            context = self.generate_historical_context(processed)
            story = self.generate_story(processed)

            # "FooProcessor" -> "foo"
            script_type = self.__class__.__name__.replace("Processor", "").lower()
            result = {}
            result["script_type"] = script_type
            result["confidence"] = confidence
            result["extracted_text"] = raw_text
            result["processed_result"] = processed
            result["historical_context"] = context
            result["creative_story"] = story
            return result

        except Exception as exc:
            print(f"[ERROR] Processing failed in {self.__class__.__name__}: {exc}")
            return None
|
processors/cuneiform_processor.py
ADDED
|
@@ -0,0 +1,804 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import re
|
| 5 |
+
import time
|
| 6 |
+
from PIL import Image, ImageEnhance, ImageFilter
|
| 7 |
+
import torch
|
| 8 |
+
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForSeq2SeqLM
|
| 9 |
+
from .base_processor import BaseScriptProcessor
|
| 10 |
+
from utils.text_utils import is_gibberish
|
| 11 |
+
|
| 12 |
+
BACKEND_MODELS_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "models"))
|
| 13 |
+
CUNEIFORM_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "cuneiform")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class CuneiformProcessor(BaseScriptProcessor):
|
| 17 |
+
def __init__(self, groq_client, references, clip_classifier):
    """Initialise lazy model slots and register with the VRAM manager.

    No model is loaded here; the CLIP and translation slots start empty
    with their availability flags set to False.
    """
    super().__init__(groq_client, references, clip_classifier)

    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.device = torch.device(device_name)

    # CLIP recogniser slots, filled by setup_cuneiform_clip().
    self.clip_model = self.clip_processor = None
    self.clip_available = False

    # Translator slots, filled by setup_praeclarum_translator().
    self.cuneiform_model = self.cuneiform_tokenizer = None
    self.translator_available = False

    # Register for dynamic VRAM management
    from utils.gpu_diagnostics import register_processor
    register_processor("cuneiform", self)
|
| 32 |
+
|
| 33 |
+
@property
def cuneiform_available(self):
    """Availability flag matching the interface ScriptDetectionService expects.

    Unconditionally ``True``: the models are loaded lazily on demand, so
    this property does not reflect any load state.
    """
    return True
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def setup_cuneiform_clip(self):
    """Load the CLIP model used for visual cuneiform recognition.

    On success sets ``clip_processor``, ``clip_model``, the prompt lists
    ``cuneiform_signs`` / ``tablet_layouts`` and ``clip_available = True``.
    On any failure logs the error and sets ``clip_available = False``.
    """
    try:
        from utils.gpu_diagnostics import reclaim_vram_for
        reclaim_vram_for("cuneiform")

        print("[INFO] Lazily loading CLIP for cuneiform visual recognition...")

        # Large CLIP checkpoint, chosen for better ancient-script understanding.
        checkpoint = "openai/clip-vit-large-patch14"
        self.clip_processor = CLIPProcessor.from_pretrained(checkpoint)
        self.clip_model = CLIPModel.from_pretrained(checkpoint)

        self.clip_model.to(self.device)
        self.clip_model.eval()  # inference only

        from utils.gpu_diagnostics import log_model_device
        log_model_device("Cuneiform CLIP Recognition", self.device)

        # Text prompts scored against the image: individual signs first,
        # then whole-document genres.
        sign_prompts = [
            "ancient Sumerian cuneiform sign AN meaning god or heaven",
            "ancient Akkadian cuneiform sign LUGAL meaning king or ruler",
            "ancient cuneiform sign KI meaning earth or place",
            "ancient cuneiform sign DINGIR divine determinative marker",
            "ancient cuneiform sign UD meaning day or sun",
            "ancient cuneiform sign E meaning house or temple",
            "ancient cuneiform sign EN meaning lord or priest",
            "ancient cuneiform sign NIN meaning lady or queen",
        ]
        genre_prompts = [
            "ancient cuneiform administrative record with numbers",
            "ancient cuneiform legal contract or treaty text",
            "ancient cuneiform royal inscription or decree",
            "ancient cuneiform literary or mythological text",
            "ancient cuneiform school exercise or practice tablet",
        ]
        self.cuneiform_signs = sign_prompts + genre_prompts

        # Tablet layout descriptions for structural analysis
        self.tablet_layouts = [
            "clay tablet with cuneiform text arranged in horizontal lines",
            "cuneiform tablet with vertical column organization",
            "administrative record tablet with numerical entries",
            "legal document tablet with witness signatures",
            "literary tablet with continuous narrative text",
            "damaged or fragmentary cuneiform tablet",
            "clear well-preserved cuneiform inscription",
            "practice tablet with student exercises",
        ]

        print(f"[INFO] CLIP cuneiform recognition loaded on {self.device}")
        print("[INFO] Using visual pattern recognition instead of character OCR")
        self.clip_available = True

    except Exception as exc:
        print(f"[ERROR] CLIP cuneiform setup failed: {exc}")
        self.clip_available = False
|
| 96 |
+
|
| 97 |
+
def setup_praeclarum_translator(self):
    """Load the ``praeclarum/cuneiform`` Seq2Seq translation model.

    On success sets ``cuneiform_tokenizer``, ``cuneiform_model`` and
    ``translator_available = True``. On any failure logs the error and
    sets ``translator_available = False``.
    """
    try:
        from utils.gpu_diagnostics import reclaim_vram_for
        reclaim_vram_for("cuneiform")

        print("[INFO] Lazily loading praeclarum cuneiform translation model...")

        repo_id = "praeclarum/cuneiform"
        # Tokenizer and model share the same on-disk cache directory.
        load_options = {"cache_dir": CUNEIFORM_MODEL_DIR}

        self.cuneiform_tokenizer = AutoTokenizer.from_pretrained(repo_id, **load_options)
        self.cuneiform_model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, **load_options)
        self.cuneiform_model.to(self.device)
        self.cuneiform_model.eval()  # inference only

        from utils.gpu_diagnostics import log_model_device
        log_model_device("Cuneiform Translator (Praeclarum T5)", self.device)

        self.translator_available = True
        print("[INFO] Cuneiform translator ready for CLIP-recognized content")

    except Exception as exc:
        print(f"[ERROR] Translation model setup failed: {exc}")
        self.translator_available = False
|
| 125 |
+
|
| 126 |
+
def detect_script(self, image_path):
    """Confirm that this processor can handle the image.

    Script detection itself is handled by the CLIP classification that
    runs before this processor is chosen, so this method only reports
    whether the cuneiform engine is usable.

    The CLIP model is loaded lazily by ``extract_text``. A processor whose
    model has not been loaded yet (``clip_model is None``) is therefore
    treated as usable; otherwise ``process_image`` would stop here and the
    lazy load would never run. Only a model object that exists but is
    flagged unavailable counts as a failure.

    Args:
        image_path: path of the image (not inspected here).

    Returns:
        tuple: ``(True, 0.95)`` when usable, ``(False, 0.0)`` otherwise.
    """
    try:
        load_failed = self.clip_model is not None and not self.clip_available
        if load_failed:
            print("[ERROR] No cuneiform processing engines available")
            return False, 0.0

        print("[INFO] Cuneiform processor activated - Using CLIP visual recognition")
        return True, 0.95

    except Exception as e:
        print(f"[ERROR] Cuneiform detection failed: {e}")
        return False, 0.0
|
| 139 |
+
|
| 140 |
+
def extract_text(self, image_path):
    """Describe the tablet's content using CLIP visual recognition, not OCR.

    Loads the CLIP model on first use, or reclaims VRAM and moves the model
    back to ``self.device`` if it is already loaded.

    Returns:
        str: the CLIP-derived description when its confidence exceeds 0.3,
        else the tablet-layout description if there is one, else a
        ``CUNEIFORM_VISUAL_ANALYSIS_INCOMPLETE`` message. Returns a
        ``CUNEIFORM_CLIP_FAILED`` message when the model is unavailable and
        a ``CUNEIFORM_ERROR`` message when analysis raises.
    """
    if self.clip_model is not None:
        from utils.gpu_diagnostics import reclaim_vram_for
        reclaim_vram_for("cuneiform")
        current_device = str(next(self.clip_model.parameters()).device)
        if current_device != str(self.device):
            print(f"[VRAM MANAGER] Activating Cuneiform CLIP model on {self.device}...")
            self.clip_model.to(self.device)
    else:
        self.setup_cuneiform_clip()

    model_ready = getattr(self, 'clip_available', False) and self.clip_model is not None
    if not model_ready:
        return "CUNEIFORM_CLIP_FAILED: Visual recognition model not available"

    try:
        started = time.time()

        # First choice: CLIP-based visual analysis.
        print("[INFO] Analyzing cuneiform using CLIP visual recognition...")
        analysis = self._analyze_cuneiform_with_clip(image_path)

        if analysis and analysis['confidence'] > 0.3:
            elapsed = time.time() - started
            print(f"[SUCCESS] CLIP visual analysis completed in {elapsed:.2f}s")
            return analysis['description']

        # Second choice: basic tablet layout description, then a fixed message.
        layout_text = self._describe_tablet_layout(image_path)
        return layout_text or (
            "CUNEIFORM_VISUAL_ANALYSIS_INCOMPLETE: Clay tablet detected but content analysis requires higher resolution or clearer image"
        )

    except Exception as exc:
        print(f"[ERROR] CLIP cuneiform analysis failed: {exc}")
        return f"CUNEIFORM_ERROR: {str(exc)}"
|
| 176 |
+
|
| 177 |
+
def _analyze_cuneiform_with_clip(self, image_path):
|
| 178 |
+
"""Use CLIP to analyze cuneiform content visually"""
|
| 179 |
+
try:
|
| 180 |
+
image = Image.open(image_path).convert("RGB")
|
| 181 |
+
|
| 182 |
+
# Enhanced preprocessing for CLIP analysis
|
| 183 |
+
enhanced_image = self._preprocess_for_clip_analysis(image)
|
| 184 |
+
|
| 185 |
+
# CLIP classification of cuneiform content
|
| 186 |
+
print("[INFO] Running CLIP classification on cuneiform signs...")
|
| 187 |
+
|
| 188 |
+
inputs = self.clip_processor(
|
| 189 |
+
text=self.cuneiform_signs,
|
| 190 |
+
images=enhanced_image,
|
| 191 |
+
return_tensors="pt",
|
| 192 |
+
padding=True
|
| 193 |
+
).to(self.device)
|
| 194 |
+
|
| 195 |
+
with torch.inference_mode():
|
| 196 |
+
outputs = self.clip_model(**inputs)
|
| 197 |
+
logits_per_image = outputs.logits_per_image
|
| 198 |
+
probs = logits_per_image.softmax(dim=1)
|
| 199 |
+
|
| 200 |
+
# Get top predictions
|
| 201 |
+
top_probs, top_indices = torch.topk(probs, k=3)
|
| 202 |
+
|
| 203 |
+
# Build description based on CLIP analysis
|
| 204 |
+
descriptions = []
|
| 205 |
+
confidences = []
|
| 206 |
+
|
| 207 |
+
for i, (prob, idx) in enumerate(zip(top_probs[0], top_indices[0])):
|
| 208 |
+
if prob > 0.2: # Reasonable confidence threshold
|
| 209 |
+
sign_desc = self.cuneiform_signs[idx]
|
| 210 |
+
descriptions.append(sign_desc)
|
| 211 |
+
confidences.append(prob.item())
|
| 212 |
+
print(f"[INFO] CLIP detected: {sign_desc} (confidence: {prob:.3f})")
|
| 213 |
+
|
| 214 |
+
if descriptions:
|
| 215 |
+
# Convert visual analysis to ATF-like description
|
| 216 |
+
atf_description = self._convert_visual_to_atf(descriptions, confidences)
|
| 217 |
+
|
| 218 |
+
return {
|
| 219 |
+
'description': atf_description,
|
| 220 |
+
'confidence': max(confidences),
|
| 221 |
+
'visual_elements': descriptions,
|
| 222 |
+
'method': 'CLIP_visual_analysis'
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
return None
|
| 226 |
+
|
| 227 |
+
except Exception as e:
|
| 228 |
+
print(f"[ERROR] CLIP cuneiform analysis failed: {e}")
|
| 229 |
+
return None
|
| 230 |
+
|
| 231 |
+
def _preprocess_for_clip_analysis(self, image):
|
| 232 |
+
"""Preprocess image specifically for CLIP cuneiform analysis"""
|
| 233 |
+
try:
|
| 234 |
+
# Convert to numpy for OpenCV processing
|
| 235 |
+
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 236 |
+
|
| 237 |
+
# Enhance for CLIP visual understanding
|
| 238 |
+
# 1. Increase contrast to make wedges more visible
|
| 239 |
+
lab = cv2.cvtColor(image_cv, cv2.COLOR_BGR2LAB)
|
| 240 |
+
l_channel, a, b = cv2.split(lab)
|
| 241 |
+
|
| 242 |
+
# Apply CLAHE to lightness channel
|
| 243 |
+
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
|
| 244 |
+
l_channel = clahe.apply(l_channel)
|
| 245 |
+
|
| 246 |
+
# Merge back
|
| 247 |
+
enhanced_lab = cv2.merge((l_channel, a, b))
|
| 248 |
+
enhanced_bgr = cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)
|
| 249 |
+
|
| 250 |
+
# 2. Sharpen edges to help CLIP see wedge boundaries
|
| 251 |
+
kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
|
| 252 |
+
sharpened = cv2.filter2D(enhanced_bgr, -1, kernel)
|
| 253 |
+
|
| 254 |
+
# Convert back to PIL RGB
|
| 255 |
+
enhanced_rgb = cv2.cvtColor(sharpened, cv2.COLOR_BGR2RGB)
|
| 256 |
+
return Image.fromarray(enhanced_rgb)
|
| 257 |
+
|
| 258 |
+
except Exception as e:
|
| 259 |
+
print(f"[WARN] CLIP preprocessing failed: {e}")
|
| 260 |
+
return image
|
| 261 |
+
|
| 262 |
+
def _convert_visual_to_atf(self, visual_descriptions, confidences):
    """Convert CLIP visual descriptions into an ATF-like transliteration.

    Each description is mapped to at most one group of ATF elements by
    keyword. The short sign names ``an``, ``ki`` and ``ud`` are matched as
    whole words only: as plain substrings they also matched ordinary words
    such as "ancient" or "meaning" and shadowed every later branch.

    Args:
        visual_descriptions: Iterable of description strings.
        confidences: Iterable paired with ``visual_descriptions``; only its
            length matters (pairs are formed with ``zip``), the values are
            not used for the mapping.

    Returns:
        Up to three numbered lines ("1. ...", "2. ...", "3. ...") holding
        the collected elements, or a fixed message when nothing matched or
        the conversion failed.
    """
    import re  # local import keeps this method self-contained

    try:
        atf_elements = []

        for desc, _conf in zip(visual_descriptions, confidences):
            desc_lower = desc.lower()
            # Whole-word view used only for the short, ambiguous sign names.
            words = set(re.findall(r"[a-z0-9]+", desc_lower))

            # Map visual descriptions to ATF transliterations.
            if 'lugal' in desc_lower or 'king' in desc_lower:
                atf_elements.append('lugal')
            elif 'an' in words or 'god' in desc_lower or 'heaven' in desc_lower:
                atf_elements.append('an')
            elif 'ki' in words or 'earth' in desc_lower or 'place' in desc_lower:
                atf_elements.append('ki')
            elif 'dingir' in desc_lower or 'divine' in desc_lower:
                atf_elements.append('{d}')
            elif 'ud' in words or 'day' in desc_lower or 'sun' in desc_lower:
                atf_elements.append('ud')
            elif 'e' in desc_lower and ('house' in desc_lower or 'temple' in desc_lower):
                atf_elements.append('e2')
            elif 'en' in desc_lower and 'lord' in desc_lower:
                atf_elements.append('en')
            elif 'nin' in desc_lower and ('lady' in desc_lower or 'queen' in desc_lower):
                atf_elements.append('nin')
            elif 'administrative' in desc_lower or 'numbers' in desc_lower:
                atf_elements.extend(['1(disz)', '2(disz)', 'sze'])
            elif 'royal' in desc_lower or 'inscription' in desc_lower:
                atf_elements.extend(['lugal', 'kur', 'kur'])
            elif 'legal' in desc_lower or 'contract' in desc_lower:
                atf_elements.extend(['kiszib3', 'mu', 'pad'])
            elif 'literary' in desc_lower or 'mythological' in desc_lower:
                atf_elements.extend(['en', 'dingir', 'kur'])
            elif 'school' in desc_lower or 'practice' in desc_lower:
                atf_elements.extend(['a', 'ba', 'ka', 'la'])

        if not atf_elements:
            return "cuneiform tablet content analysis incomplete"

        # Three elements on each of the first two lines; the rest on line 3.
        atf_text = f"1. {' '.join(atf_elements[:3])}"
        if len(atf_elements) > 3:
            atf_text += f"\n2. {' '.join(atf_elements[3:6])}"
        if len(atf_elements) > 6:
            atf_text += f"\n3. {' '.join(atf_elements[6:])}"
        return atf_text

    except Exception as e:
        print(f"[ERROR] Visual to ATF conversion failed: {e}")
        return "visual analysis available but ATF conversion failed"
|
| 314 |
+
|
| 315 |
+
def _describe_tablet_layout(self, image_path):
    """Describe the tablet's layout by scoring ``self.tablet_layouts`` with CLIP.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        ``"tablet_layout: <description>"`` when the best candidate scores
        above 0.4, a generic "unidentified" layout string otherwise, or
        ``"tablet_layout: analysis_failed"`` when any step raises.
    """
    try:
        # The context manager closes the underlying file as soon as the
        # pixel data has been copied by convert(); previously the handle
        # stayed open until garbage collection.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        inputs = self.clip_processor(
            text=self.tablet_layouts,
            images=image,
            return_tensors="pt",
            padding=True
        ).to(self.device)

        with torch.inference_mode():
            outputs = self.clip_model(**inputs)
            probs = outputs.logits_per_image.softmax(dim=1)

        # Single image in the batch, so row 0 holds every layout's score.
        best_idx = int(torch.argmax(probs[0]).item())
        best_desc = self.tablet_layouts[best_idx]
        confidence = probs[0][best_idx].item()

        print(f"[INFO] Tablet layout: {best_desc} (confidence: {confidence:.3f})")

        if confidence > 0.4:
            return f"tablet_layout: {best_desc}"

        return "tablet_layout: unidentified cuneiform tablet structure"

    except Exception as e:
        print(f"[ERROR] Tablet layout analysis failed: {e}")
        return "tablet_layout: analysis_failed"
|
| 346 |
+
|
| 347 |
+
def translate_cuneiform(self, cuneiform_text):
    """Translate CLIP-analysed cuneiform content with the praeclarum model.

    Args:
        cuneiform_text: ATF-like text, a ``tablet_layout:`` description, or
            one of the ``CUNEIFORM_*`` failure markers.

    Returns:
        The post-processed translation, or an explanatory message when the
        input is empty, is a failure marker, is only a layout description,
        the translator is unavailable, or generation raises.
    """
    failure_message = "Translation failed: Visual analysis could not identify cuneiform content"

    # Nothing to translate: do not load the model, and do not let None reach
    # the str methods used below.
    if not cuneiform_text or not cuneiform_text.strip():
        return failure_message

    if self.cuneiform_model is None:
        self.setup_praeclarum_translator()
    else:
        from utils.gpu_diagnostics import reclaim_vram_for
        reclaim_vram_for("cuneiform")
        if str(next(self.cuneiform_model.parameters()).device) != str(self.device):
            print(f"[VRAM MANAGER] Activating Cuneiform Translator model on {self.device}...")
            self.cuneiform_model.to(self.device)

    if not getattr(self, 'translator_available', False) or self.cuneiform_model is None:
        return "Translation unavailable - praeclarum model not loaded"

    # Failure markers are not text. The INCOMPLETE marker is included so it
    # is handled the same way process_text handles it.
    if cuneiform_text.startswith((
        "CUNEIFORM_CLIP_FAILED",
        "CUNEIFORM_ERROR:",
        "CUNEIFORM_VISUAL_ANALYSIS_INCOMPLETE",
    )):
        return failure_message

    if cuneiform_text.startswith("tablet_layout:"):
        layout_desc = cuneiform_text.replace("tablet_layout: ", "")
        return f"Visual analysis indicates: {layout_desc}. Specific text translation requires clearer wedge visibility."

    try:
        print(f"[INFO] Translating CLIP-analyzed content: {cuneiform_text[:50]}...")

        # Use the praeclarum model for translation.
        inputs = self.cuneiform_tokenizer(
            cuneiform_text,
            return_tensors="pt",
            max_length=512,
            truncation=True
        ).input_ids.to(self.device)

        with torch.inference_mode():
            outputs = self.cuneiform_model.generate(
                inputs,
                max_new_tokens=200,
                do_sample=True,
                top_k=30,
                top_p=0.95,
                temperature=0.7,
                pad_token_id=self.cuneiform_tokenizer.eos_token_id
            )

        translation = self.cuneiform_tokenizer.decode(
            outputs[0],
            skip_special_tokens=True
        )

        translation = self._post_process_translation(translation)

        if translation.strip():
            print(f"[INFO] CLIP+Translation completed: {translation[:100]}...")
            return translation

        return "Visual analysis successful, but textual translation inconclusive. This may be a non-textual or damaged tablet section."

    except Exception as e:
        print(f"[ERROR] Translation of CLIP content failed: {e}")
        return f"Visual analysis successful, translation error: {str(e)}"
|
| 407 |
+
|
| 408 |
+
def _post_process_translation(self, translation):
    """Clean up raw model output from the cuneiform translator.

    Steps: treat dots-only/empty output as a failed translation, drop
    echoed transliteration lines from the start of the output, then
    capitalise the first character.

    Args:
        translation: Decoded model output.

    Returns:
        The cleaned translation, ``""`` when the output is empty or mostly
        dots, or ``translation`` unchanged if post-processing raises.
    """
    # One prefix set for both the trigger and the per-line filter. The
    # per-line filter used to omit 'dingir', so an echoed "dingir ..." line
    # was itself returned as the translation.
    echo_prefixes = ('lugal', 'an ', 'ki ', 'dingir')

    try:
        cleaned = translation.strip()

        # Dots-only output means the model produced nothing usable.
        if cleaned in ["", "...", ". . .", "... ... ..."] or cleaned.count('.') > len(cleaned) * 0.8:
            print("[WARN] Translation appears to be dots/empty, marking as failed")
            return ""

        # Remove any input text that might have been echoed: keep the first
        # sufficiently long line that does not look like transliteration.
        if cleaned.startswith(echo_prefixes):
            for line in cleaned.split('\n'):
                candidate = line.strip()
                if candidate.lower().startswith(echo_prefixes):
                    continue
                if len(candidate) > 10:
                    cleaned = candidate
                    break

        # Capitalize first letter.
        if cleaned and not cleaned[0].isupper():
            cleaned = cleaned[0].upper() + cleaned[1:]

        return cleaned

    except Exception as e:
        print(f"[WARN] Translation post-processing failed: {e}")
        return translation
|
| 437 |
+
|
| 438 |
+
def process_text(self, cuneiform_text):
    """Build the analysis record for text produced by the CLIP extraction step.

    Args:
        cuneiform_text: ATF-like text, a ``tablet_layout:`` description, or
            a ``CUNEIFORM_*`` failure marker.

    Returns:
        A dict with the keys ``text``, ``symbols``, ``char_analysis`` and
        ``validation``. Empty input yields empty values; failure markers
        yield a record with ``quality_score`` 0.0 and the marker as error.
    """
    if not cuneiform_text:
        return {"text": "", "symbols": [], "char_analysis": {}, "validation": {}}

    print("[INFO] Processing cuneiform text with CLIP visual analysis...")

    # Failure markers are reported as-is with a zero score.
    if cuneiform_text.startswith(("CUNEIFORM_CLIP_FAILED", "CUNEIFORM_ERROR:", "CUNEIFORM_VISUAL_ANALYSIS_INCOMPLETE")):
        return {
            "text": cuneiform_text,
            "symbols": [],
            "char_analysis": {
                "total_chars": 0,
                "error": "CLIP visual analysis failed",
                "text_format": "Error"
            },
            "validation": {
                "quality_score": 0.0,
                "confidence_level": "Failed",
                "ocr_method": "CLIP Visual Recognition (Failed)",
                "error": cuneiform_text
            }
        }

    if cuneiform_text.startswith("tablet_layout:"):
        # Layout description only: there are no signs to extract.
        symbols = ""
        char_analysis = {
            "total_chars": len(cuneiform_text),
            "layout_analysis": True,
            "text_format": "Layout Description"
        }
    else:
        # ATF or visual analysis content: keep alphanumerics and ATF
        # punctuation, dropping whitespace and everything else.
        symbols = ''.join(ch for ch in cuneiform_text if ch.isalnum() or ch in "{}[]().-")
        element_count = len(cuneiform_text.split())
        char_analysis = {
            "total_chars": len(cuneiform_text),
            "atf_elements": element_count,
            "unique_chars": len(set(cuneiform_text)),
            "word_count": element_count,
            "text_format": "CLIP Visual Analysis + ATF"
        }

    validation = {
        "quality_score": self._calculate_clip_quality_score(cuneiform_text),
        "recognition_method": "CLIP Visual Pattern Recognition",
        "model_specialization": "Large-scale Vision Transformer for Ancient Scripts",
        "clip_analysis": True,
        # Same defensive lookup as translate_cuneiform: the attribute may
        # not exist if the translator was never set up.
        "supports_translation": getattr(self, 'translator_available', False),
        "input_format": char_analysis.get("text_format", "Unknown"),
        "confidence_level": self._determine_confidence_level(cuneiform_text)
    }

    return {
        "text": cuneiform_text,
        "symbols": symbols,
        "char_analysis": char_analysis,
        "validation": validation
    }
|
| 500 |
+
|
| 501 |
+
def _calculate_clip_quality_score(self, text):
    """Score CLIP-analysed text between 0.0 and 1.0.

    Scoring:
        * ``tablet_layout:`` descriptions score 0.7.
        * Failure/placeholder messages score 0.3.
        * Text containing a known sign (``lugal``, ``an``, ``ki``,
          ``dingir``, ``e2`` or the ``{d}`` determinative) scores 0.8,
          plus 0.1 for multiple lines and 0.1 for three or more words.
        * Anything else scores 0.0.

    Signs are matched as whole tokens and failure messages are checked
    before signs. With plain substring matching, a message such as
    "cuneiform tablet content analysis incomplete" matched ``an`` inside
    "analysis" and was rated 0.9.

    Args:
        text: Text produced by the extraction step.

    Returns:
        The score, clamped to ``[0.0, 1.0]``.
    """
    import re  # local import keeps this method self-contained

    if not text:
        return 0.0

    score = 0.0
    lowered = text.lower()
    known_signs = {'lugal', 'an', 'ki', 'dingir', 'e2'}

    if text.startswith("tablet_layout:"):
        # Layout analysis succeeded even though no signs were read.
        score = 0.7
    elif text.startswith(("CUNEIFORM_", "visual analysis", "cuneiform tablet content", "tablet content")):
        # Some recognition happened but the result is incomplete.
        score = 0.3
    elif '{d}' in lowered or known_signs & set(re.findall(r"[a-z]+\d*", lowered)):
        score += 0.8
        # Multiple lines bonus.
        if '\n' in text:
            score += 0.1
        # Coherent structure bonus.
        if len(text.split()) >= 3:
            score += 0.1

    return max(0.0, min(1.0, score))
|
| 530 |
+
|
| 531 |
+
def _determine_confidence_level(self, text):
    """Translate the CLIP quality score of ``text`` into a named band.

    Returns "Very High" (>= 0.8), "High" (>= 0.6), "Medium" (>= 0.4),
    "Low" (>= 0.2) or "Very Low" for anything lower.
    """
    quality = self._calculate_clip_quality_score(text)

    # Bands are ordered from the highest floor down; the first hit wins.
    bands = (
        (0.8, "Very High"),
        (0.6, "High"),
        (0.4, "Medium"),
        (0.2, "Low"),
    )
    for floor, label in bands:
        if quality >= floor:
            return label
    return "Very Low"
|
| 545 |
+
|
| 546 |
+
def process_image(self, image_path):
    """Run the full cuneiform pipeline on one image.

    Extraction, text processing, historical context and story generation
    run in that order. Returns the combined result dict, or ``None`` if any
    stage raises.
    """
    try:
        print(f"[INFO] Processing cuneiform image: {image_path}")

        # Extraction feeds straight into text processing.
        processed_result = self.process_text(self.extract_text(image_path))

        # Both generators work from the processed record.
        context = self.generate_historical_context(processed_result)
        story = self.generate_story(processed_result)

        quality = processed_result['validation'].get('quality_score', 0.0)
        result = {
            'script_type': 'cuneiform',
            'confidence': quality,
            'processed_result': processed_result,
            'historical_context': context,
            'creative_story': story
        }
        return result

    except Exception as e:
        print(f"[ERROR] Cuneiform image processing failed: {e}")
        return None
|
| 574 |
+
|
| 575 |
+
def generate_historical_context(self, processed_result):
    """Assemble the historical-context payload for processed cuneiform text.

    The result holds a ``uses_box`` (title plus per-element usage notes), a
    ``meaning_box`` and up to six ``references`` retrieved by the RAG
    service.
    """
    text = processed_result.get("text", "")

    groq_detail = self._generate_groq_context(text)

    # Query terms: every word of the text followed by every non-blank
    # character of the text.
    query_terms = []
    if text:
        query_terms = list(re.findall(r'\w+', text))
        query_terms += [ch for ch in text if ch.strip()]
    references = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)

    uses_box = {
        "title": "Cuneiform symbols and their ancient usage",
        "items": self._build_uses_list(text)
    }
    meaning_box = self._build_meaning_box(text, groq_detail, processed_result)

    return {
        "uses_box": uses_box,
        "meaning_box": meaning_box,
        "references": references
    }
|
| 596 |
+
|
| 597 |
+
def _generate_groq_context(self, cuneiform_text):
    """Ask Groq for a scholarly paragraph about the given cuneiform content.

    Failure markers and layout descriptions get a general prompt about
    cuneiform; anything else is embedded in a content-specific prompt. The
    system prompt is enriched through the RAG service before the request.
    Returns a fixed placeholder when Groq is unavailable or returns nothing.
    """
    if not self.groq_client.is_available():
        return "(Groq unavailable) Historical context generation requires GROQ_API_KEY and groq package."

    is_placeholder = cuneiform_text.startswith(("CUNEIFORM_", "tablet_layout:"))
    if not is_placeholder:
        # Real content: quote it in the prompt.
        prompt = (
            f"Analyze this cuneiform content identified through visual analysis: {cuneiform_text}\n\n"
            "Provide a scholarly paragraph (6-10 sentences) on its likely historical context, "
            "period (3200 BCE to 100 CE), probable purpose (administrative, legal, literary, religious), "
            "language (Sumerian/Akkadian/other), and cultural significance in ancient Mesopotamian civilization. "
            "Consider that this was analyzed using AI vision recognition rather than traditional transliteration."
        )
    else:
        # Nothing readable: ask for general background instead.
        prompt = (
            "This appears to be a cuneiform clay tablet analyzed using computer vision. "
            "Provide a concise, scholarly paragraph (6-10 sentences) covering the history of cuneiform writing, "
            "its use in ancient Mesopotamia, common contexts (administrative, legal, literary), "
            "and the languages it represented (Sumerian, Akkadian, etc.). Include information about "
            "clay tablet creation, scribal practices, and the significance of cuneiform in ancient civilizations."
        )

    base_system_prompt = "You are an expert Assyriologist and ancient Near Eastern historian. Provide accurate, concise scholarly analysis of cuneiform texts, focusing on historical context, linguistic analysis, and cultural significance."
    system_prompt = self.rag_service.enrich_prompt(base_system_prompt, cuneiform_text)

    answer = self.groq_client.generate_response(
        system_prompt=system_prompt,
        user_prompt=prompt
    )
    if answer:
        return answer
    return "(Historical context unavailable due to Groq error)"
|
| 626 |
+
|
| 627 |
+
def _build_uses_list(self, cuneiform_text):
    """Build the bullet list describing how each recognised element was used.

    Notes come from ``self.references["cuneiform_symbol_notes"]``; elements
    without a note get ``self.references["cuneiform_hint"]`` or a built-in
    default. Each distinct element is listed once, in order of first
    appearance. ATF line markers such as ``1.`` are skipped: they number
    lines and are not signs, but were previously listed with the default
    hint.

    Args:
        cuneiform_text: ATF-like text, a failure marker, or a layout
            description.

    Returns:
        At most 15 strings of the form ``"- <element>: <note>"``, or a
        fixed explanatory list for failure markers and layout descriptions.
    """
    # Failure markers and layout descriptions carry no signs to explain.
    if cuneiform_text.startswith(("CUNEIFORM_", "tablet_layout:")):
        return [
            "- Visual analysis attempted but content recognition incomplete",
            "- This may be due to image quality, tablet damage, or complex wedge patterns",
            "- CLIP visual recognition specializes in identifying cuneiform sign types and layouts",
            "- For detailed transliteration, consider using CDLI tools or consulting cuneiform specialists"
        ]

    notes = self.references.get("cuneiform_symbol_notes", {}) or {}
    default_hint = self.references.get(
        "cuneiform_hint",
        "Cuneiform signs represent syllables, words, or concepts in ancient Mesopotamian languages")

    seen = set()
    items = []

    for element in cuneiform_text.split():
        # "1.", "2.", ... are line numbers; numerals like "1(disz)" are kept.
        if element.endswith('.') and element[:-1].isdigit():
            continue
        if element in seen:
            continue
        seen.add(element)
        items.append(f"- {element}: {notes.get(element, default_hint)}")

    if not items:
        items.append("- Analysis incomplete: CLIP visual recognition in progress")

    return items[:15]  # Limit display
|
| 663 |
+
|
| 664 |
+
def _build_meaning_box(self, cuneiform_text, groq_detail, processed_result):
    """Build the "meaning" panel shown for a cuneiform analysis.

    Args:
        cuneiform_text: The analysed text.
        groq_detail: Historical-context paragraph; added as a bullet when
            it is a non-blank string.
        processed_result: Dict whose ``char_analysis`` and ``validation``
            entries select which lines and bullets are included.

    Returns:
        A dict with ``title``, ``intro_lines``, ``frequent_label``,
        ``frequent`` and ``points``. For ATF content ``frequent`` holds the
        first ten elements of the text, excluding line markers such as
        ``1.``, which number lines and are not elements.
    """
    char_analysis = processed_result.get("char_analysis", {})
    validation = processed_result.get("validation", {})

    text_format = char_analysis.get("text_format", "Unknown")
    confidence = validation.get("confidence_level", "Unknown")

    # Introduction with CLIP context.
    intro_lines = [
        f"Cuneiform processed using CLIP visual recognition with confidence: {confidence}.",
    ]

    if validation.get("clip_analysis"):
        intro_lines.extend([
            "Analysis powered by OpenAI CLIP Vision Transformer (Large) for ancient script recognition.",
            "Visual pattern recognition identifies cuneiform signs, layouts, and tablet structures."
        ])

    # Defensive lookup, as in translate_cuneiform: the attribute may be
    # missing if the translator was never set up.
    if getattr(self, 'translator_available', False):
        intro_lines.append("Translation provided by praeclarum/cuneiform model trained on 210,247 examples.")

    # Format-specific information.
    if text_format == "Layout Description":
        intro_lines.append("Tablet structure and organization analyzed through computer vision.")
    elif text_format == "CLIP Visual Analysis + ATF":
        intro_lines.append("Visual elements converted to ATF transliteration format.")

    # Analysis points.
    points = [
        "• CLIP Vision Transformer provides advanced visual understanding of cuneiform wedge patterns.",
        "• Model trained on large-scale image-text datasets enables zero-shot cuneiform recognition.",
        "• Visual analysis identifies sign types, tablet layouts, and manuscript characteristics."
    ]

    if validation.get("supports_translation"):
        points.append("• Recognized visual elements translated using specialized Mesopotamian language models.")

    if text_format == "Layout Description":
        points.append("• Tablet structure analysis indicates overall document type and organization.")

    if char_analysis.get("layout_analysis", False):
        points.append("• Computer vision successfully identified tablet layout and structural elements.")

    if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
        points.append(f"• Historical analysis: {groq_detail.strip()}")

    # Key elements for display.
    if text_format == "CLIP Visual Analysis + ATF":
        frequent_elements = [
            element for element in cuneiform_text.split()
            if not (element.endswith('.') and element[:-1].isdigit())
        ][:10]
    else:
        frequent_elements = ["Visual", "Analysis", "CLIP", "Recognition"]

    return {
        "title": "Cuneiform visual analysis:",
        "intro_lines": intro_lines,
        "frequent_label": "Key elements identified",
        "frequent": frequent_elements,
        "points": points
    }
|
| 727 |
+
|
| 728 |
+
def generate_story(self, processed_result):
    """Ask Groq for a historical narrative about the analysed tablet.

    A narrative framing is picked at random from a style group chosen by
    the text content (royal, administrative, layout-only or general).
    Returns the story, or a fixed message when Groq is unavailable or the
    response is empty or judged gibberish.
    """
    cuneiform_text = processed_result.get("text", "")

    if not self.groq_client.is_available():
        return "Groq client unavailable, cannot generate historical narrative."

    text_format = processed_result.get("char_analysis", {}).get("text_format", "Unknown")
    lowered = cuneiform_text.lower()

    # Pick the pool of narrative framings that fits the analysis.
    if "lugal" in lowered or "royal" in lowered:
        styles = [
            "as a royal inscription from the court of Hammurabi",
            "as a victory stela from ancient Assyria",
            "as a chronicle of Mesopotamian kings",
            "as a royal decree from Nebuchadnezzar's reign"
        ]
    elif "administrative" in lowered or "numbers" in lowered:
        styles = [
            "as a merchant's inventory from ancient Babylon",
            "as a tax record from a Sumerian temple",
            "as a grain distribution list from Ur",
            "as an administrative archive from Mari"
        ]
    elif text_format == "Layout Description":
        styles = [
            "as a damaged tablet discovered in archaeological excavation",
            "as a mysterious cuneiform fragment found in ancient ruins",
            "as a clay tablet uncovered in a Mesopotamian library",
            "as an ancient document preserved in palace archives"
        ]
    else:
        styles = [
            "as a scribe's practice tablet from ancient Sumer",
            "as a legal contract from Babylonian courts",
            "as a temple inscription from Mesopotamia",
            "as a literary work from the ancient Near East"
        ]

    import random
    chosen_style = random.choice(styles)
    seed = random.randint(1000, 9999)

    processing_note = "analyzed through advanced computer vision AI specialized in ancient scripts"

    prompt = "".join([
        f"This cuneiform tablet was {processing_note}: {cuneiform_text[:100]}...\n\n",
        "Historical context: This represents one of humanity's oldest writing systems, ",
        "used across ancient Mesopotamia from 3200 BCE to 100 CE.\n\n",
        "Create a vivid, historically accurate narrative (250+ words) set in ancient Mesopotamia, ",
        "telling the story of this cuneiform tablet's creation and significance. ",
        f"Write {chosen_style}.\n\n",
        "Include: Clay tablet creation process, scribe's daily life, the tablet's importance ",
        "to ancient Mesopotamian society, and authentic historical details of Sumerian/Babylonian/Assyrian culture.\n",
        f"Narrative seed: {seed}",
    ])

    system_prompt = (
        "You are a master storyteller and Assyriologist specializing in ancient Mesopotamian "
        "history, cuneiform literature, and daily life in Sumerian, Babylonian, and Assyrian "
        "civilizations. Create authentic, engaging narratives that reflect accurate knowledge "
        "of ancient Near Eastern cultures, writing practices, and social contexts."
    )

    story = self.groq_client.generate_response(
        system_prompt=system_prompt,
        user_prompt=prompt
    )

    # Empty responses short-circuit before the gibberish check runs.
    if not story or is_gibberish(story):
        return "Failed to generate historical narrative; ancient Mesopotamian story creation unavailable."

    return story
|
processors/egyptian_processor.py
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import numpy as np
|
| 3 |
+
import base64
|
| 4 |
+
import json
|
| 5 |
+
from PIL import Image
|
| 6 |
+
from io import BytesIO
|
| 7 |
+
from itertools import groupby
|
| 8 |
+
from collections import Counter
|
| 9 |
+
from .base_processor import BaseScriptProcessor
|
| 10 |
+
from utils.image_utils import segment_hieroglyphs
|
| 11 |
+
from utils.text_utils import is_gibberish, build_description_from_codes
|
| 12 |
+
from config import Config
|
| 13 |
+
|
| 14 |
+
class EgyptianProcessor(BaseScriptProcessor):
|
| 15 |
+
def __init__(self, groq_client, references, clip_classifier, translator_pipe):
    """Store the collaborators used for Egyptian hieroglyph processing.

    ``groq_client`` and ``references`` are handed to the base processor;
    the CLIP classifier, the translator pipeline and a fresh ``Config``
    are kept on the instance.
    """
    super().__init__(groq_client, references)
    self.config = Config()
    self.translator_pipe = translator_pipe
    self.clip_classifier = clip_classifier
|
| 20 |
+
|
| 21 |
+
def detect_script(self, image_path):
    """Report this processor as matching with a fixed confidence of 0.95.

    The image is not inspected here; per the log message, classification is
    done upstream by Groq Vision. Returns ``(False, 0.0)`` only if logging
    itself raises.
    """
    try:
        print("[INFO] Egyptian processor activated by Groq Vision (Llama-4-Scout)")
    except Exception as e:
        print(f"[ERROR] Egyptian detection failed: {e}")
        return False, 0.0
    return True, 0.95
|
| 30 |
+
|
| 31 |
+
def _identify_hieroglyphs_with_vision(self, image_path):
    """Use Groq Vision (Llama-4-Scout) to identify hieroglyphic symbols from the full image.

    The image is converted to RGB, downscaled to at most 1200 px on its
    longest side, JPEG-encoded and sent with a prompt listing the labels
    from ``self.config.GARDINER_MAP``.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A list of labels, each a key of ``GARDINER_MAP`` or ``"unknown"``,
        in the order the model returned them. ``None`` when the Groq client
        is unavailable, the request or parsing fails, or every returned
        label is unknown; callers then use the fallback path.
    """
    if not self.groq_client or not self.groq_client.is_available():
        return None

    try:
        from groq import Groq

        # Load and encode the image. JPEG cannot store alpha or palette
        # modes, so convert to RGB first; otherwise such images raised on
        # save and silently forced the fallback. The context manager
        # releases the file handle once the pixels are copied.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        if max(image.size) > 1200:
            image.thumbnail((1200, 1200), Image.Resampling.LANCZOS)
        buffer = BytesIO()
        image.save(buffer, format="JPEG", quality=90)
        b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

        gardiner_labels = list(self.config.GARDINER_MAP.keys())
        label_list = ", ".join(
            f"{lbl} ({code})" for lbl, code in self.config.GARDINER_MAP.items()
        )

        prompt = (
            "You are an expert Egyptologist analyzing an image of Egyptian hieroglyphs.\n\n"
            f"Known Gardiner signs: {label_list}\n\n"
            "Identify up to 15 of the most prominent hieroglyphic symbols visible in the image, in reading order (left-to-right, top-to-bottom).\n"
            "For each identified symbol, pick the BEST matching Gardiner label from the list above.\n"
            "Do not output more than 15 symbols. If a symbol doesn't match any known label, use \"unknown\".\n\n"
            "Respond ONLY with a JSON object:\n"
            "{\"symbols\": [\"label1\", \"label2\", \"label3\", ...]}\n"
            "Example: {\"symbols\": [\"owl\", \"eye\", \"reed\", \"bread\", \"sun\"]}"
        )

        print("[INFO] Sending request to Groq Vision model meta-llama/llama-4-scout-17b-16e-instruct...")

        client = Groq(api_key=self.groq_client.api_key)
        completion = client.chat.completions.create(
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{b64}",
                            },
                        },
                    ],
                }
            ],
            temperature=0.1,
            max_completion_tokens=1024,
            response_format={"type": "json_object"},
        )

        raw = completion.choices[0].message.content
        print(f"[INFO] Groq Vision raw response received: {raw[:150]}...")
        data = json.loads(raw)
        symbols = data.get("symbols", [])

        if symbols and isinstance(symbols, list):
            # Validate labels against the known set + "unknown". Non-string
            # entries become "unknown" instead of raising; labels that only
            # differ in case or padding are accepted in lowercase, which is
            # how process_text looks them up.
            valid = set(gardiner_labels) | {"unknown"}
            cleaned = []
            for symbol in symbols:
                label = symbol.strip() if isinstance(symbol, str) else "unknown"
                if label not in valid and label.lower() in valid:
                    label = label.lower()
                cleaned.append(label if label in valid else "unknown")

            if all(s == "unknown" for s in cleaned):
                print("[INFO] Groq Vision identified only 'unknown' symbols. Falling back.")
                return None
            print(f"[INFO] Groq Vision identified {len(cleaned)} hieroglyphs: {cleaned}")
            return cleaned

    except Exception as e:
        # Best effort: any failure just means the fallback path is used.
        print(f"[WARN] Groq Vision hieroglyph identification failed: {e}")

    return None
|
| 107 |
+
|
| 108 |
+
def extract_text(self, image_path):
    """Identify hieroglyph labels in an image.

    Groq Vision is asked first, on the whole image. When it yields nothing,
    the image is segmented and the crops are classified with CLIP against
    the keys of ``config.GARDINER_MAP``.

    Returns:
        A list of labels, or an empty list when no regions were found or
        any step raised.
    """
    try:
        print("[INFO] Starting Egyptian hieroglyph extraction...")

        # Primary path: whole-image identification via Groq Vision.
        from_vision = self._identify_hieroglyphs_with_vision(image_path)
        if from_vision:
            print(f"[INFO] Using Groq Vision result ({len(from_vision)} symbols)")
            return from_vision

        # Fallback path: segmentation followed by CLIP classification.
        print("[INFO] Falling back to CLIP segmentation-based classification...")
        from utils.image_utils import segment_hieroglyphs

        regions = segment_hieroglyphs(image_path)
        print(f"[INFO] Segmented {len(regions)} hieroglyph regions")
        if not regions:
            print("[WARN] No hieroglyph regions found")
            return []

        known_names = list(self.config.GARDINER_MAP.keys())
        clip_labels = self.clip_classifier.classify_symbols(regions, known_names)
        print(f"[INFO] CLIP classified {len(clip_labels)} symbols: {clip_labels}")
        return clip_labels

    except Exception as exc:
        # Best-effort API: log with a traceback and report "nothing found".
        print(f"[ERROR] Egyptian text extraction failed: {exc}")
        import traceback

        traceback.print_exc()
        return []
|
| 141 |
+
def process_text(self, labels):
    """Turn hieroglyph labels into Gardiner codes plus a translation.

    Returns:
        A dict with the keys "labels", "codes", "translation" and
        "translation_ok". With no labels, all fields are empty and
        "translation_ok" is False.
    """
    if not labels:
        return {"labels": [], "codes": [], "translation": "", "translation_ok": False}

    gardiner = self.config.GARDINER_MAP
    codes = []
    for label in labels:
        # Lookup is on the lower-cased label (None is looked up as "");
        # labels missing from the map get the placeholder "?".
        codes.append(gardiner.get((label or "").lower(), "?"))

    translated, succeeded = self._translate_sequence(labels, codes)

    return {
        "labels": labels,
        "codes": codes,
        "translation": translated,
        "translation_ok": succeeded
    }
|
| 158 |
+
|
| 159 |
+
def _translate_sequence(self, labels, codes):
    """Translate a hieroglyph sequence to English.

    Strategies are tried in order:
      1. ``self.translator_pipe`` (when set), first with an instruction
         prompt and then with the bare code sequence;
      2. the Groq client, given the recognised label names;
      3. a plain description built from the codes.

    Args:
        labels: Symbol labels; falsy entries and "unknown" are not sent to Groq.
        codes: Gardiner codes for the labels; "?" marks an unmapped label.

    Returns:
        ``(text, ok)``. ``ok`` is True for strategies 1 and 2 and False for
        the descriptive fallback.
    """
    valid_codes = [c for c in codes if c != "?"]

    if valid_codes and self.translator_pipe:
        seq = " ".join(valid_codes)
        # Second entry: retry with the bare sequence if the prompted form
        # yields nothing usable.
        attempts = (
            f"Translate hieroglyph unicode sequence to English: {seq}",
            seq,
        )
        try:
            for model_input in attempts:
                output = self.translator_pipe(model_input, max_new_tokens=128, do_sample=False, num_beams=4)
                first = output[0]
                text = first.get('generated_text') or first.get('translation_text') or str(first)

                if text and text.strip() != "?" and not is_gibberish(text):
                    return text.strip(), True

        except Exception as e:
            print(f"[WARN] Seq2Seq translation failed: {e}")

    # Groq Fallback for translating known symbols
    if self.groq_client and self.groq_client.is_available():
        try:
            known_labels = [lbl for lbl in labels if lbl and lbl != "unknown"]
            if known_labels:
                symbols_str = ", ".join(known_labels)
                system_prompt = "You are an expert Egyptologist and translator of ancient Egyptian hieroglyphs."
                user_prompt = (
                    f"We detected a sequence of ancient Egyptian hieroglyphic symbols: {symbols_str}.\n"
                    "Provide a concise, scholarly English translation or logical interpretation of this combination of signs.\n"
                    "Keep it direct, under 15 words, and do not include any introductory phrases, explanations, or quotes."
                )
                response = self.groq_client.generate_response(system_prompt, user_prompt, max_tokens=64)
                # NOTE(review): generate_response presumably returns None/empty on
                # API errors; treat that as "no translation" instead of letting
                # .strip() raise into the except below.
                translation = (response or "").strip().replace('"', '')
                if translation and not is_gibberish(translation):
                    return translation, True
        except Exception as e:
            print(f"[WARN] Groq fallback translation failed: {e}")

    # Fallback to description
    description = build_description_from_codes(codes)
    return f"(Symbols described as: {description})", False
|
| 205 |
+
|
| 206 |
+
def generate_historical_context(self, processed_result):
    """Assemble the structured historical-context payload for Egyptian text.

    Args:
        processed_result: Dict read for the keys "translation", "codes"
            and "labels"; missing keys default to "" / [].

    Returns:
        A dict with "uses_box" (title and per-symbol items), "meaning_box"
        and "references" (up to 6 grounding entries requested from the RAG
        service, queried with the labels followed by the codes).
    """
    labels = processed_result.get("labels", [])
    codes = processed_result.get("codes", [])
    translation = processed_result.get("translation", "")

    # Free-text context from Groq, later appended to the meaning box.
    narrative = self._generate_groq_context(translation, codes)

    grounding = self.rag_service.retrieve_grounding_list([*labels, *codes], max_results=6)

    uses_box = {
        "title": "Each symbol's possible use by the egyptian people",
        "items": self._build_uses_list(labels)
    }
    meaning_box = self._build_meaning_box(labels, narrative)

    return {
        "uses_box": uses_box,
        "meaning_box": meaning_box,
        "references": grounding
    }
|
| 228 |
+
|
| 229 |
+
def _generate_groq_context(self, translation_text, codes):
|
| 230 |
+
"""Generate contextual information using Groq"""
|
| 231 |
+
if not self.groq_client.is_available():
|
| 232 |
+
return "(Groq unavailable) Context generation requires GROQ_API_KEY and groq package."
|
| 233 |
+
|
| 234 |
+
if is_gibberish(translation_text):
|
| 235 |
+
prompt_body = build_description_from_codes(codes)
|
| 236 |
+
prompt = (
|
| 237 |
+
f"The following sequence of ancient Egyptian symbols is described as: {prompt_body}.\n\n"
|
| 238 |
+
"Provide a concise, scholarly paragraph (6-10 sentences) covering cultural context, symbolic meanings, "
|
| 239 |
+
"typical usage, probable time period, and relevant archaeological comparisons. Avoid repeating the prompt."
|
| 240 |
+
)
|
| 241 |
+
else:
|
| 242 |
+
prompt = (
|
| 243 |
+
f"Provide a concise, scholarly paragraph (6-10 sentences) on the historical significance, cultural context, "
|
| 244 |
+
f"symbolism, and possible interpretations of this ancient Egyptian text: {translation_text}. Avoid repeating the prompt."
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
system_prompt = "You are a careful Egyptologist and historian. Provide accurate, concise scholarly context."
|
| 248 |
+
enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, translation_text, codes)
|
| 249 |
+
|
| 250 |
+
return self.groq_client.generate_response(
|
| 251 |
+
system_prompt=enriched_system_prompt,
|
| 252 |
+
user_prompt=prompt,
|
| 253 |
+
max_tokens=self.config.GROQ_CONTEXT_MAX_TOKENS
|
| 254 |
+
) or "(context unavailable due to Groq error)"
|
| 255 |
+
|
| 256 |
+
def _build_uses_list(self, labels):
|
| 257 |
+
"""Build list of symbol uses"""
|
| 258 |
+
groups = []
|
| 259 |
+
for key, g in groupby(labels):
|
| 260 |
+
if not key:
|
| 261 |
+
continue
|
| 262 |
+
groups.append((key, len(list(g))))
|
| 263 |
+
|
| 264 |
+
notes = self.references.get("egypt_symbol_notes", {}) or {}
|
| 265 |
+
seen = set()
|
| 266 |
+
items = []
|
| 267 |
+
|
| 268 |
+
for name, count in groups:
|
| 269 |
+
if not name or name.lower() in seen:
|
| 270 |
+
continue
|
| 271 |
+
seen.add(name.lower())
|
| 272 |
+
|
| 273 |
+
count_str = f" (x{count})" if count > 1 else ""
|
| 274 |
+
note = notes.get(name.lower(), "Common sign whose meaning varies by phonetic/ideogram/determinative roles.")
|
| 275 |
+
items.append(f"- {name}{count_str}: {note}")
|
| 276 |
+
|
| 277 |
+
if not items:
|
| 278 |
+
items.append("- unknown: No stable mapping; likely decorative or damaged glyphs.")
|
| 279 |
+
|
| 280 |
+
return items
|
| 281 |
+
|
| 282 |
+
def _build_meaning_box(self, labels, groq_detail):
|
| 283 |
+
"""Build meaning interpretation box"""
|
| 284 |
+
freq = Counter([l for l in labels if l])
|
| 285 |
+
frequent = [f"{name} (x{cnt})" for name, cnt in freq.most_common(6)]
|
| 286 |
+
|
| 287 |
+
intro_lines = [
|
| 288 |
+
"The dense recurrence of signs suggests a formulaic or protective sequence, where phonograms articulate a core utterance and determinatives or iconic signs reinforce ritual intent.",
|
| 289 |
+
"Comparable sequences appear on funerary equipment from the Middle Kingdom onward."
|
| 290 |
+
]
|
| 291 |
+
|
| 292 |
+
points = [
|
| 293 |
+
"• Offering and action signs (bread, jar, hoe, bow) commonly structure invocations or provisioning lists for the afterlife.",
|
| 294 |
+
"• Repetition often encodes names or epithets; determinatives (eye, feather, god_figure) frame a protective or ritual context.",
|
| 295 |
+
"• Repertoire and layout align with New Kingdom funerary practice focused on protection, sustenance, and legitimation."
|
| 296 |
+
]
|
| 297 |
+
|
| 298 |
+
if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
|
| 299 |
+
points.append(groq_detail.strip())
|
| 300 |
+
|
| 301 |
+
return {
|
| 302 |
+
"title": "Possible meaning:",
|
| 303 |
+
"intro_lines": intro_lines,
|
| 304 |
+
"frequent_label": "Frequently observed signs",
|
| 305 |
+
"frequent": frequent,
|
| 306 |
+
"points": points
|
| 307 |
+
}
|
| 308 |
+
|
| 309 |
+
def generate_story(self, processed_result):
    """Generate a creative story inspired by the detected symbols.

    The non-empty labels are joined into a description that is sent to Groq
    together with a randomly chosen narrative style and a random seed number.
    The templated fallback story is used when no Groq client is
    configured/available, or when the response is empty or gibberish.

    Args:
        processed_result: Dict read for the key "labels".

    Returns:
        The story text.
    """
    labels = processed_result.get("labels", [])
    description = ", ".join([lbl for lbl in labels if lbl])

    # A missing client is treated the same as an unavailable one.
    if self.groq_client is None or not self.groq_client.is_available():
        return self._simple_templated_story(description)

    style = [
        "as an epic poem from a wandering bard",
        "as a prophecy carved in stone",
        "as a fireside tale with vivid emotions",
        "as a dialogue between two ancient gods",
        "as a lost papyrus narrative recovered from the sands",
        "as a myth told by a court poet"
    ]

    import random
    chosen_style = random.choice(style)
    seed = random.randint(1000, 9999)

    # The symbol list is interpolated in both places that mention it; the
    # second mention previously shipped the literal placeholder
    # "[your sequence]" to the model.
    prompt = (
        f"The following sequence of ancient Egyptian symbols is described as: {description}\n\n"
        "Can you create a long, vivid, imaginative story from ancient times "
        f"based on this sequence of Egyptian symbols: {description}. "
        "Write it as one rich paragraph with a lot of detail, mystery, and historical atmosphere. "
        "At least 200 words.\n\n"
        f"Creative seed: {seed}\n"
        f"Write a richly detailed, imaginative myth-like story {chosen_style}. "
        "Include multiple characters, vivid imagery, and at least 3 short scenes. "
        "Do NOT repeat the same sentence or phrase verbatim. "
        "Keep it evocative and unpredictable."
    )

    system_prompt = "You are a creative ancient historian and myth-maker. Invent rich, imaginative tales."

    story = self.groq_client.generate_response(
        system_prompt=system_prompt,
        user_prompt=prompt,
        max_tokens=self.config.GROQ_STORY_MAX_TOKENS
    )

    if not story or is_gibberish(story):
        return self._simple_templated_story(description)

    return story
|
| 355 |
+
|
| 356 |
+
def _simple_templated_story(self, description):
|
| 357 |
+
"""Fallback story generation"""
|
| 358 |
+
import re
|
| 359 |
+
parts = [p.strip() for p in re.split(r',\s*', description) if p.strip()]
|
| 360 |
+
keywords = []
|
| 361 |
+
|
| 362 |
+
for p in parts:
|
| 363 |
+
m = re.match(r'([a-zA-Z0-9_-]+)', p)
|
| 364 |
+
if m:
|
| 365 |
+
kw = m.group(1)
|
| 366 |
+
if kw not in keywords:
|
| 367 |
+
keywords.append(kw)
|
| 368 |
+
if len(keywords) >= 8:
|
| 369 |
+
break
|
| 370 |
+
|
| 371 |
+
flavor = {
|
| 372 |
+
"bow": "strength and vigilance",
|
| 373 |
+
"hoe": "the work of the fields",
|
| 374 |
+
"reed": "the scribe's craft",
|
| 375 |
+
"owl": "hidden wisdom of the night",
|
| 376 |
+
"eye": "divine sight",
|
| 377 |
+
"bread": "offerings to the ka",
|
| 378 |
+
"unknown": "mysterious signs"
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
lead = []
|
| 382 |
+
if keywords:
|
| 383 |
+
lead.append(f"In an age of river and stone, a tale was told of {flavor.get(keywords[0], keywords[0])}.")
|
| 384 |
+
if len(keywords) > 1:
|
| 385 |
+
second = flavor.get(keywords[1], keywords[1])
|
| 386 |
+
third = flavor.get(keywords[2], keywords[2]) if len(keywords) > 2 else "omens"
|
| 387 |
+
lead.append(f"It spoke of {second} and {third} guiding a soul beyond the horizon.")
|
| 388 |
+
lead.append("Under the stars, elders whispered a vow that the names would endure.")
|
| 389 |
+
|
| 390 |
+
return " ".join(lead)
|
processors/greek_processor.py
ADDED
|
@@ -0,0 +1,774 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytesseract
|
| 2 |
+
import re
|
| 3 |
+
import os
|
| 4 |
+
import cv2
|
| 5 |
+
import numpy as np
|
| 6 |
+
import torch
|
| 7 |
+
from PIL import Image
|
| 8 |
+
from .base_processor import BaseScriptProcessor
|
| 9 |
+
from utils.text_utils import is_gibberish
|
| 10 |
+
|
| 11 |
+
BACKEND_MODELS_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "models"))
|
| 12 |
+
GREEK_TROCR_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "greek_trocr")
|
| 13 |
+
|
| 14 |
+
class GreekProcessor(BaseScriptProcessor):
|
| 15 |
+
def __init__(self, groq_client, references, clip_classifier):
    """Initialise the Greek processor.

    Forwards the shared services to the base class, resolves the Ancient
    Greek tessdata location, initialises the (not yet loaded) TrOCR state
    and registers this instance with the GPU diagnostics module under the
    name "greek".
    """
    super().__init__(groq_client, references, clip_classifier)
    self.clip_classifier = clip_classifier

    # Resolves self.ancient_greek_tessdata and logs whether it exists.
    self.setup_ancient_greek_ocr()

    # TrOCR is not loaded here; setup_greek_trocr() fills these in later.
    self.trocr_model = self.trocr_processor = None
    self.trocr_available = False

    use_cuda = torch.cuda.is_available()
    self.device = torch.device('cuda' if use_cuda else 'cpu')

    # Register for dynamic VRAM management
    from utils.gpu_diagnostics import register_processor

    register_processor("greek", self)
|
| 28 |
+
|
| 29 |
+
def setup_greek_trocr(self):
    """Load the Greek TrOCR processor and model onto ``self.device``.

    The processor and model are published on ``self`` only after every
    loading step has succeeded, at which point ``self.trocr_available`` is
    set to True. On any failure the error is logged, ``trocr_available`` is
    set to False and ``trocr_model`` / ``trocr_processor`` are reset to None,
    so a half-initialised model is never left behind (callers use
    ``trocr_model is None`` to decide whether loading must be attempted).
    """
    model_id = 'rithwikn/trocr_greek_combined'
    try:
        from utils.gpu_diagnostics import log_model_device, reclaim_vram_for
        reclaim_vram_for("greek")

        print("[INFO] Lazily loading TrOCR model for ancient Greek...")
        from transformers import TrOCRProcessor, VisionEncoderDecoderModel

        processor = TrOCRProcessor.from_pretrained(
            model_id,
            cache_dir=GREEK_TROCR_MODEL_DIR,
            local_files_only=False
        )
        model = VisionEncoderDecoderModel.from_pretrained(
            model_id,
            cache_dir=GREEK_TROCR_MODEL_DIR,
            local_files_only=False
        )

        model.to(self.device)
        model.eval()  # Put in evaluation mode

        log_model_device("Greek TrOCR", self.device)

        # Publish only now, so a failure above cannot leave partial state.
        self.trocr_processor = processor
        self.trocr_model = model
        self.trocr_available = True
        print(f"[INFO] Ancient Greek TrOCR loaded successfully on {self.device}")

    except Exception as e:
        print(f"[ERROR] Ancient Greek TrOCR failed to load: {e}")
        self.trocr_model = None
        self.trocr_processor = None
        self.trocr_available = False
|
| 62 |
+
|
| 63 |
+
def setup_ancient_greek_ocr(self):
    """Record where the Ancient Greek tessdata is expected and log its presence.

    Sets ``self.ancient_greek_tessdata`` to ``../tessdata/ancient-greek``
    relative to this module's directory. Nothing is loaded here; a warning
    with the download site is printed when the directory is missing.
    """
    # Path to Ancient Greek tessdata (download from ancientgreekocr.org)
    module_dir = os.path.dirname(__file__)
    tessdata_dir = os.path.join(module_dir, "..", "tessdata", "ancient-greek")
    self.ancient_greek_tessdata = tessdata_dir

    if not os.path.exists(tessdata_dir):
        print(f"[WARN] Ancient Greek tessdata not found at: {tessdata_dir}")
        print("[INFO] Download from: https://ancientgreekocr.org")
        return

    print(f"[INFO] Ancient Greek tessdata found: {tessdata_dir}")
|
| 77 |
+
def detect_script(self, image_path):
    """Report whether this processor is able to handle an image routed to it.

    The image itself is not inspected; per the original design, Groq Vision
    performs the actual script classification. The check is purely about
    which OCR backends are usable.

    Returns:
        ``(True, 0.95)`` when TrOCR is flagged available or the Tesseract
        ``grc.traineddata`` file exists; ``(False, 0.5)`` when neither is
        the case; ``(False, 0.0)`` if the check itself raises.
    """
    try:
        has_trocr = getattr(self, 'trocr_available', False)
        # The Tesseract data file is only looked for when TrOCR is not flagged.
        if not has_trocr and not os.path.exists(
            os.path.join(self.ancient_greek_tessdata, "grc.traineddata")
        ):
            print("[INFO] Greek processor not available (neither TrOCR nor Tesseract)")
            return False, 0.5

        # If called by Groq Vision classification, accept with high confidence
        print("[INFO] Greek processor activated by Groq Vision (Llama-4-Scout)")
        return True, 0.95

    except Exception as exc:
        print(f"[ERROR] Greek detection failed: {exc}")
        return False, 0.0
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _quick_greek_ocr_test(self, image_path):
|
| 97 |
+
"""Quick OCR test to validate Greek content"""
|
| 98 |
+
try:
|
| 99 |
+
# Quick test with small image crop
|
| 100 |
+
image = Image.open(image_path)
|
| 101 |
+
# Take center crop for testing
|
| 102 |
+
w, h = image.size
|
| 103 |
+
crop_box = (w//4, h//4, 3*w//4, 3*h//4)
|
| 104 |
+
test_crop = image.crop(crop_box)
|
| 105 |
+
|
| 106 |
+
# Test with standard Greek OCR
|
| 107 |
+
test_text = pytesseract.image_to_string(test_crop, lang="ell")
|
| 108 |
+
greek_char_count = self._count_greek_chars(test_text or "")
|
| 109 |
+
|
| 110 |
+
# If we find Greek characters, it's likely Greek
|
| 111 |
+
return greek_char_count >= 3
|
| 112 |
+
|
| 113 |
+
except Exception:
|
| 114 |
+
return False
|
| 115 |
+
|
| 116 |
+
def extract_text(self, image_path):
    """Extract Greek text from an image, trying several OCR paths in order.

    Order of attempts:
      1. TrOCR (loaded on first use, re-activated on later calls);
      2. Tesseract with the Ancient Greek ``grc`` data, when that file exists;
      3. Tesseract with standard Greek;
      4. layout-aware, line-by-line Tesseract.

    A failure while loading, activating or running TrOCR is logged and does
    not prevent the Tesseract attempts from running.

    Args:
        image_path: Path of the image to read.

    Returns:
        The first result accepted by ``_validate_greek_text``, or "" when no
        method produced valid Greek text or the image could not be opened.
    """
    try:
        # The context manager closes the file handle once the whole-image
        # attempts are done (or on early return).
        with Image.open(image_path) as image:
            # Ensure the Greek TrOCR model is loaded / active. Kept in its own
            # try so that a GPU or import problem only disables Method 1.
            trocr_ready = False
            try:
                if self.trocr_model is None:
                    self.setup_greek_trocr()
                else:
                    from utils.gpu_diagnostics import reclaim_vram_for
                    reclaim_vram_for("greek")
                    if str(next(self.trocr_model.parameters()).device) != str(self.device):
                        print(f"[VRAM MANAGER] Activating Greek TrOCR model on {self.device}...")
                        self.trocr_model.to(self.device)
                trocr_ready = bool(getattr(self, 'trocr_available', False)) and self.trocr_model is not None
            except Exception as e:
                print(f"[WARN] Greek TrOCR activation failed, continuing with Tesseract: {e}")

            # Method 1: Ancient Greek TrOCR (if available)
            if trocr_ready:
                print("[INFO] Attempting Ancient Greek extraction with TrOCR...")
                try:
                    trocr_text = self._extract_with_trocr(image_path)
                except Exception as e:
                    print(f"[WARN] Greek TrOCR extraction raised: {e}")
                    trocr_text = ""
                if trocr_text and self._validate_greek_text(trocr_text):
                    print("[INFO] Using Ancient Greek TrOCR result")
                    return trocr_text
                print("[WARN] TrOCR extraction returned poor quality result, trying Tesseract fallback...")

            # Method 2: Ancient Greek OCR (if available and safe)
            grc_file = os.path.join(self.ancient_greek_tessdata, "grc.traineddata")
            if os.path.exists(grc_file):
                ancient_greek_text = self._extract_with_ancient_greek_ocr(image)
                if ancient_greek_text and self._validate_greek_text(ancient_greek_text):
                    print("[INFO] Using Ancient Greek OCR result")
                    return ancient_greek_text

            # Method 3: Standard Greek OCR
            standard_greek_text = self._extract_with_standard_greek_ocr(image)
            if standard_greek_text and self._validate_greek_text(standard_greek_text):
                print("[INFO] Using standard Greek OCR result")
                return standard_greek_text

        # Method 4: Layout-aware line segment fallback (works from the path)
        print("[INFO] Trying layout-aware Greek segmentation fallback...")
        layout_aware_greek_text = self._extract_layout_aware_ocr(image_path)
        if layout_aware_greek_text and self._validate_greek_text(layout_aware_greek_text):
            print("[INFO] Using layout-aware Greek OCR result")
            return layout_aware_greek_text

        # Method 5: Final validation - if no good Greek text found, return empty
        print("[INFO] No valid Greek text detected")
        return ""

    except Exception as e:
        print(f"[ERROR] Greek text extraction failed: {e}")
        return ""
|
| 168 |
+
|
| 169 |
+
def _extract_with_trocr(self, image_path):
|
| 170 |
+
"""Extract text using TrOCR Ancient Greek model line-by-line"""
|
| 171 |
+
if self.trocr_model is None:
|
| 172 |
+
self.setup_greek_trocr()
|
| 173 |
+
else:
|
| 174 |
+
from utils.gpu_diagnostics import reclaim_vram_for
|
| 175 |
+
reclaim_vram_for("greek")
|
| 176 |
+
if str(next(self.trocr_model.parameters()).device) != str(self.device):
|
| 177 |
+
print(f"[VRAM MANAGER] Activating Greek TrOCR model on {self.device}...")
|
| 178 |
+
self.trocr_model.to(self.device)
|
| 179 |
+
|
| 180 |
+
if not getattr(self, 'trocr_available', False) or self.trocr_model is None:
|
| 181 |
+
return ""
|
| 182 |
+
|
| 183 |
+
try:
|
| 184 |
+
import torch
|
| 185 |
+
from PIL import Image
|
| 186 |
+
print("[INFO] Segmenting layout for Greek TrOCR...")
|
| 187 |
+
layout = self.layout_parser.analyze_layout(image_path)
|
| 188 |
+
crops = self.layout_parser.crop_lines(image_path, layout)
|
| 189 |
+
|
| 190 |
+
# Fallback to whole image if no crops detected
|
| 191 |
+
if not crops:
|
| 192 |
+
print("[WARN] No line crops found, processing full image with TrOCR")
|
| 193 |
+
crops = [Image.open(image_path).convert("RGB")]
|
| 194 |
+
|
| 195 |
+
line_texts = []
|
| 196 |
+
print(f"[INFO] Running Ancient Greek TrOCR inference on {len(crops)} crops...")
|
| 197 |
+
for idx, crop in enumerate(crops):
|
| 198 |
+
# Ensure RGB mode for TrOCR
|
| 199 |
+
crop_rgb = crop.convert("RGB")
|
| 200 |
+
|
| 201 |
+
pixel_values = self.trocr_processor(
|
| 202 |
+
images=crop_rgb,
|
| 203 |
+
return_tensors="pt"
|
| 204 |
+
).pixel_values.to(self.device)
|
| 205 |
+
|
| 206 |
+
with torch.inference_mode():
|
| 207 |
+
generated_ids = self.trocr_model.generate(
|
| 208 |
+
pixel_values,
|
| 209 |
+
max_length=256,
|
| 210 |
+
num_beams=4,
|
| 211 |
+
early_stopping=True,
|
| 212 |
+
repetition_penalty=1.2
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
text = self.trocr_processor.batch_decode(
|
| 216 |
+
generated_ids,
|
| 217 |
+
skip_special_tokens=True
|
| 218 |
+
)[0]
|
| 219 |
+
|
| 220 |
+
if text.strip():
|
| 221 |
+
line_texts.append(text.strip())
|
| 222 |
+
|
| 223 |
+
full_text = "\n".join(line_texts)
|
| 224 |
+
print(f"[SUCCESS] TrOCR extracted {len(line_texts)} lines from Greek image")
|
| 225 |
+
return full_text
|
| 226 |
+
|
| 227 |
+
except Exception as e:
|
| 228 |
+
print(f"[ERROR] Greek TrOCR extraction failed: {e}")
|
| 229 |
+
return ""
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def _extract_with_ancient_greek_ocr(self, image):
|
| 233 |
+
"""Extract using specialized Ancient Greek OCR"""
|
| 234 |
+
try:
|
| 235 |
+
# Save original tessdata path
|
| 236 |
+
original_tessdata = os.environ.get("TESSDATA_PREFIX", "")
|
| 237 |
+
|
| 238 |
+
# Set tessdata path properly (fix the path format)
|
| 239 |
+
if os.path.exists(self.ancient_greek_tessdata):
|
| 240 |
+
# Ensure proper path format without trailing quotes
|
| 241 |
+
clean_path = str(self.ancient_greek_tessdata).replace('"', '')
|
| 242 |
+
os.environ["TESSDATA_PREFIX"] = clean_path
|
| 243 |
+
print(f"[INFO] Set TESSDATA_PREFIX to: {clean_path}")
|
| 244 |
+
else:
|
| 245 |
+
print(f"[WARN] Ancient Greek tessdata not found at: {self.ancient_greek_tessdata}")
|
| 246 |
+
return ""
|
| 247 |
+
|
| 248 |
+
# Use ancient Greek language code 'grc' with optimized settings
|
| 249 |
+
config = "--psm 6 --oem 1 -c preserve_interword_spaces=1"
|
| 250 |
+
|
| 251 |
+
# Try ancient Greek language pack
|
| 252 |
+
text = pytesseract.image_to_string(
|
| 253 |
+
image,
|
| 254 |
+
lang="grc", # Ancient Greek language code
|
| 255 |
+
config=config
|
| 256 |
+
)
|
| 257 |
+
|
| 258 |
+
# Restore original tessdata path
|
| 259 |
+
if original_tessdata:
|
| 260 |
+
os.environ["TESSDATA_PREFIX"] = original_tessdata
|
| 261 |
+
else:
|
| 262 |
+
# Remove the environment variable if it wasn't set before
|
| 263 |
+
if "TESSDATA_PREFIX" in os.environ:
|
| 264 |
+
del os.environ["TESSDATA_PREFIX"]
|
| 265 |
+
|
| 266 |
+
return text.strip()
|
| 267 |
+
|
| 268 |
+
except Exception as e:
|
| 269 |
+
print(f"[WARN] Ancient Greek OCR failed: {e}")
|
| 270 |
+
# Make sure to restore tessdata path even on error
|
| 271 |
+
if 'original_tessdata' in locals() and original_tessdata:
|
| 272 |
+
os.environ["TESSDATA_PREFIX"] = original_tessdata
|
| 273 |
+
return ""
|
| 274 |
+
|
| 275 |
+
def _extract_layout_aware_ocr(self, image_path):
|
| 276 |
+
"""Extract text by segmenting the page layout into lines first for improved readability order"""
|
| 277 |
+
try:
|
| 278 |
+
import pytesseract
|
| 279 |
+
print("[INFO] Running layout-aware line segmentation for Greek...")
|
| 280 |
+
layout = self.layout_parser.analyze_layout(image_path)
|
| 281 |
+
crops = self.layout_parser.crop_lines(image_path, layout)
|
| 282 |
+
|
| 283 |
+
if not crops:
|
| 284 |
+
print("[WARN] Layout parser returned no line crops for Greek")
|
| 285 |
+
return ""
|
| 286 |
+
|
| 287 |
+
print(f"[INFO] Layout-aware Greek line parser cropped {len(crops)} lines")
|
| 288 |
+
line_texts = []
|
| 289 |
+
|
| 290 |
+
# Try to use Ancient Greek first
|
| 291 |
+
grc_file = os.path.join(self.ancient_greek_tessdata, "grc.traineddata")
|
| 292 |
+
use_grc = os.path.exists(grc_file)
|
| 293 |
+
|
| 294 |
+
# Save original TESSDATA_PREFIX
|
| 295 |
+
original_tessdata = os.environ.get("TESSDATA_PREFIX", "")
|
| 296 |
+
if use_grc:
|
| 297 |
+
clean_path = str(self.ancient_greek_tessdata).replace('"', '')
|
| 298 |
+
os.environ["TESSDATA_PREFIX"] = clean_path
|
| 299 |
+
|
| 300 |
+
try:
|
| 301 |
+
for idx, crop in enumerate(crops):
|
| 302 |
+
# Enhance line crop for OCR
|
| 303 |
+
crop_cv = cv2.cvtColor(np.array(crop), cv2.COLOR_RGB2BGR)
|
| 304 |
+
gray = cv2.cvtColor(crop_cv, cv2.COLOR_BGR2GRAY)
|
| 305 |
+
clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(4,4))
|
| 306 |
+
enhanced = clahe.apply(gray)
|
| 307 |
+
crop_pil = Image.fromarray(enhanced)
|
| 308 |
+
|
| 309 |
+
config = '--oem 3 --psm 7'
|
| 310 |
+
text = ""
|
| 311 |
+
|
| 312 |
+
if use_grc:
|
| 313 |
+
text = pytesseract.image_to_string(
|
| 314 |
+
crop_pil,
|
| 315 |
+
lang='grc',
|
| 316 |
+
config=config
|
| 317 |
+
).strip()
|
| 318 |
+
|
| 319 |
+
if not text:
|
| 320 |
+
text = pytesseract.image_to_string(
|
| 321 |
+
crop_pil,
|
| 322 |
+
lang='ell',
|
| 323 |
+
config=config
|
| 324 |
+
).strip()
|
| 325 |
+
|
| 326 |
+
if text:
|
| 327 |
+
line_texts.append(text)
|
| 328 |
+
finally:
|
| 329 |
+
if use_grc and original_tessdata:
|
| 330 |
+
os.environ["TESSDATA_PREFIX"] = original_tessdata
|
| 331 |
+
|
| 332 |
+
return "\n".join(line_texts)
|
| 333 |
+
except Exception as e:
|
| 334 |
+
print(f"[WARN] Layout aware Greek OCR failed: {e}")
|
| 335 |
+
return ""
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
def _extract_with_standard_greek_ocr(self, image):
    """Run Tesseract's Modern Greek model over ``image`` with several layouts.

    The page-segmentation modes are tried in order and the first result that
    ``self._validate_greek_text`` accepts is returned.

    Args:
        image: Image object accepted by ``pytesseract.image_to_string``.

    Returns:
        The stripped OCR text of the first attempt that validates as Greek,
        or "" when no attempt does. OCR errors are not propagated.
    """
    # Multiple OCR attempts with different settings
    configs = (
        "--psm 6 --oem 1",  # Uniform text block
        "--psm 4 --oem 1",  # Single column text
        "--psm 3 --oem 1",  # Default, automatic page segmentation
        "--psm 8 --oem 1",  # Single word
    )

    last_error = None
    for config in configs:
        try:
            text = pytesseract.image_to_string(
                image,
                lang="ell",  # Modern Greek
                config=config
            )
            if text and self._validate_greek_text(text):
                return text.strip()
        except Exception as e:
            # Best effort: remember the failure and try the next layout
            last_error = e

    # Surface the failure instead of hiding it (e.g. missing traineddata),
    # but only once every attempt has been exhausted.
    if last_error is not None:
        print(f"[WARN] Standard Greek OCR failed: {last_error}")
    return ""
|
| 368 |
+
|
| 369 |
+
def _extract_with_preprocessing(self, image):
    """Fallback Greek OCR that retries Tesseract on preprocessed variants.

    The image is converted to grayscale and OCR is attempted on the plain
    grayscale, a lightly blurred copy, a median-filtered copy and an
    Otsu-binarised copy, in that order.

    Args:
        image: Source image; the original code comments describe it as a PIL
            image, and anything ``np.array`` turns into an RGB array works.

    Returns:
        The stripped text of the first variant accepted by
        ``self._validate_greek_text``, or "" if none is accepted or the
        image cannot be prepared. Errors are not propagated.
    """
    try:
        # COLOR_RGB2BGR rejects single-channel arrays, so normalise
        # grayscale/palette PIL images to RGB before converting.
        if getattr(image, "mode", "RGB") != "RGB":
            image = image.convert("RGB")

        # Convert PIL to CV2
        cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
    except Exception as e:
        print(f"[WARN] Fallback Greek OCR failed: {e}")
        return ""

    # Each variant is built lazily so one failing filter cannot abort the rest.
    variants = (
        lambda: gray,  # Original grayscale
        # A 1x1 Gaussian kernel leaves the image unchanged; 3x3 is the
        # smallest kernel that actually blurs.
        lambda: cv2.GaussianBlur(gray, (3, 3), 0),  # Slight blur
        lambda: cv2.medianBlur(gray, 3),  # Noise reduction
        # Otsu picks one global threshold automatically (not adaptive)
        lambda: cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1],
    )

    last_error = None
    for build_variant in variants:
        try:
            pil_img = Image.fromarray(build_variant())
            text = pytesseract.image_to_string(
                pil_img,
                lang="ell",
                config="--psm 6 --oem 1"
            )
            if self._validate_greek_text(text):
                return text.strip()
        except Exception as e:
            # Best effort: remember the failure and try the next variant
            last_error = e

    if last_error is not None:
        print(f"[WARN] Fallback Greek OCR failed: {last_error}")
    return ""
|
| 406 |
+
|
| 407 |
+
def _count_greek_chars(self, text):
    """Return how many characters of ``text`` lie in the Greek Unicode blocks.

    Two ranges are counted: Greek and Coptic (U+0370-U+03FF) and Greek
    Extended (U+1F00-U+1FFF), which holds the precomposed polytonic letters.
    Empty or ``None`` input yields 0.
    """
    if not text:
        return 0

    greek_blocks = ((0x0370, 0x03FF), (0x1F00, 0x1FFF))
    total = 0
    for symbol in text:
        code_point = ord(symbol)
        if any(low <= code_point <= high for low, high in greek_blocks):
            total += 1
    return total
|
| 419 |
+
|
| 420 |
+
def _validate_greek_text(self, text):
    """Decide whether OCR output looks like meaningful Greek text.

    Text shorter than three characters (after stripping) is rejected.
    Otherwise the text is accepted when it contains at least five Greek
    characters, or when Greek characters make up at least 20% of its
    non-whitespace characters. Predominantly Latin text is rejected with an
    informational log line.

    Args:
        text: Candidate OCR output; may be empty or ``None``.

    Returns:
        ``True`` if the text passes the Greek-content heuristic.
    """
    if not text:
        return False
    if len(text.strip()) < 3:
        return False

    greek_total = self._count_greek_chars(text)
    visible_total = len(re.sub(r'\s+', '', text))
    if visible_total == 0:
        return False

    # Share of basic Latin letters among the visible characters
    latin_total = 0
    for c in text:
        if c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz":
            latin_total += 1
    latin_ratio = latin_total / visible_total

    mostly_latin = latin_ratio > 0.8
    if mostly_latin and greek_total < 3:
        print(f"[INFO] Rejecting text as Greek - too many Latin chars: {latin_ratio:.2f}")
        return False

    # Either an absolute minimum of Greek letters or a 20% share suffices
    if greek_total >= 5:
        return True
    return greek_total / visible_total >= 0.20
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
def _extract_distinct_terms(self, text):
    """Extract the distinct Greek words of ``text`` in first-seen order.

    A token is a run of letters; it counts as Greek when at least one of its
    characters lies in the Greek and Coptic (U+0370-U+03FF) or Greek
    Extended (U+1F00-U+1FFF) block. Single-character tokens are skipped and
    duplicates are detected case-insensitively.

    Args:
        text: Text to scan; may be empty or ``None``.

    Returns:
        Up to 20 distinct Greek words, each in the spelling of its first
        occurrence (NFC-normalised).
    """
    import unicodedata

    if not text:
        return []

    # Combining accents are not word characters, so decomposed (NFD) input
    # would be cut into fragments at every accent. Compose them first.
    text = unicodedata.normalize("NFC", text)

    # Find Greek words (including those with diacritical marks)
    tokens = re.findall(r"[^\W\d_]+", text, flags=re.UNICODE)

    def is_greek_word(word):
        return any((0x0370 <= ord(ch) <= 0x03FF) or (0x1F00 <= ord(ch) <= 0x1FFF)
                   for ch in word)

    distinct_terms = []
    seen = set()

    for token in tokens:
        if len(token) < 2:  # Skip single characters
            continue
        if not is_greek_word(token):
            continue

        normalized = token.lower()
        if normalized not in seen:
            seen.add(normalized)
            distinct_terms.append(token)

    return distinct_terms[:20]  # Limit to 20 terms
|
| 473 |
+
|
| 474 |
+
def process_text(self, greek_text):
    """Summarise extracted Greek text as terms, character stats and quality.

    Args:
        greek_text: OCR output to analyse; may be empty or ``None``.

    Returns:
        A dict with the keys ``text``, ``terms``, ``char_analysis`` and
        ``validation``. For empty input the text is "" and the other three
        values are empty containers.
    """
    if not greek_text:
        return {"text": "", "terms": [], "char_analysis": {}, "validation": {}}

    # Distinct Greek vocabulary found in the text
    found_terms = self._extract_distinct_terms(greek_text)

    # Raw character statistics (whitespace is included in the totals)
    length = len(greek_text)
    greek_count = self._count_greek_chars(greek_text)
    stats = {
        "total_chars": length,
        "greek_chars": greek_count,
        "unique_chars": len(set(greek_text)),
        "words": len(greek_text.split()),
    }

    # Heuristic quality indicators
    checks = {
        "has_polytonic": self._has_polytonic_marks(greek_text),
        "greek_ratio": greek_count / max(1, length),
        "quality_score": self._calculate_quality_score(greek_text),
    }

    return {
        "text": greek_text,
        "terms": found_terms,
        "char_analysis": stats,
        "validation": checks,
    }
|
| 503 |
+
|
| 504 |
+
def _has_polytonic_marks(self, text):
    """Return ``True`` if ``text`` has a character in U+1F00-U+1FFF.

    That range is the Greek Extended block, which holds the precomposed
    polytonic (accented / breathing-marked) letters.
    """
    for symbol in text:
        if 0x1F00 <= ord(symbol) <= 0x1FFF:
            return True
    return False
|
| 508 |
+
|
| 509 |
+
def _calculate_quality_score(self, text):
    """Score extracted text between 0.0 and 1.0.

    The score is the sum of three parts, capped at 1.0:
    the Greek-character ratio weighted 0.4, a flat 0.3 bonus when polytonic
    characters are present, and the alphabetic-character ratio weighted 0.3.
    Both ratios are taken over the full text length, whitespace included.
    Empty input scores 0.0.
    """
    if not text:
        return 0.0

    length = max(1, len(text))

    # Share of Greek letters
    score = 0.0
    score += (self._count_greek_chars(text) / length) * 0.4

    # Polytonic letters hint at ancient rather than modern Greek
    if self._has_polytonic_marks(text):
        score += 0.3

    # Share of alphabetic characters: digits, punctuation and whitespace
    # lower this part of the score
    letters = len([ch for ch in text if ch.isalpha()])
    score += (letters / length) * 0.3

    return min(1.0, score)
|
| 530 |
+
|
| 531 |
+
def generate_historical_context(self, processed_result):
    """Assemble the historical-context payload for processed Greek text.

    Args:
        processed_result: Dict whose ``text`` and ``terms`` entries are read
            (both optional).

    Returns:
        A dict with ``uses_box`` (title plus per-term/per-symbol items),
        ``meaning_box`` and ``references`` (the grounding entries returned
        by the RAG service).
    """
    def ascii_safe(value):
        # Escape non-ASCII so the debug output prints on any console
        return value.encode('ascii', 'backslashreplace').decode()

    source_text = processed_result.get("text", "")
    found_terms = processed_result.get("terms", [])

    # Narrative context from Groq
    narrative = self._generate_groq_context(source_text)

    # Grounding lookup: the words first, then every non-whitespace character
    lookup_keys = list(found_terms) if found_terms else []
    if source_text:
        for symbol in source_text:
            if symbol.strip():
                lookup_keys.append(symbol)
    print(f"[DEBUG GREEK RAG] query_terms: {[ascii_safe(key) for key in lookup_keys]}")
    grounding = self.rag_service.retrieve_grounding_list(lookup_keys, max_results=6)
    print(f"[DEBUG GREEK RAG] refs: {[ascii_safe(entry['term']) for entry in grounding]}")

    uses_box = {
        "title": "Each symbol's possible use by the Greek people",
        "items": self._build_uses_list(found_terms, source_text),
    }
    meaning_box = self._build_meaning_box(found_terms, narrative)

    return {
        "uses_box": uses_box,
        "meaning_box": meaning_box,
        "references": grounding,
    }
|
| 555 |
+
|
| 556 |
+
def _generate_groq_context(self, greek_text):
    """Ask Groq for a scholarly paragraph about ``greek_text``.

    The system prompt is passed through ``self.rag_service.enrich_prompt``
    before the request is made.

    Returns:
        The model's reply, or a parenthesised placeholder message when the
        Groq client is unavailable or returns nothing.
    """
    if not self.groq_client.is_available():
        return "(Groq unavailable) Context generation requires GROQ_API_KEY and groq package."

    base_system_prompt = "You are an expert philologist of Ancient Greece. Provide concise, accurate scholarly context."
    request_parts = [
        f"This ancient Greek text was found: {greek_text}\n\n",
        "Write a concise, scholarly paragraph (6-10 sentences) giving cultural and historical context: textual tradition, ",
        "possible meanings, links to Greek culture/myth/philosophy, manuscript practices (accents, breathings, ligatures, nomina sacra), ",
        "and paleographic cues. Avoid repeating the prompt.",
    ]
    request = "".join(request_parts)

    grounded_system_prompt = self.rag_service.enrich_prompt(base_system_prompt, greek_text)
    reply = self.groq_client.generate_response(
        system_prompt=grounded_system_prompt,
        user_prompt=request,
    )
    if reply:
        return reply
    return "(context unavailable due to Groq error)"
|
| 575 |
+
|
| 576 |
+
def _generate_batch_explanations(self, terms):
    """Fetch one-sentence glosses for Greek terms with a single Groq request.

    Only the first 15 terms are sent. The reply is expected to contain a
    JSON object mapping each word to its definition; if strict JSON parsing
    fails, ``"key": "value"`` pairs are recovered with a regular expression.

    Args:
        terms: Iterable of Greek words; may be empty or ``None``.

    Returns:
        A dict mapping the keys found in the reply to definition strings.
        Empty when there are no terms, the Groq client is missing or
        unavailable, the reply is empty, or anything fails along the way.
    """
    import json
    import re

    if not terms or not self.groq_client or not self.groq_client.is_available():
        return {}

    # Limit to first 15 terms to prevent token limit/truncation issues
    terms_to_query = list(terms)[:15]
    terms_list = ", ".join(terms_to_query)

    system_prompt = (
        "You are an expert classicist and lexicographer of Ancient Greek. "
        "Respond ONLY with a JSON object. Do NOT wrap values in double quotes inside the strings. "
        "Use single quotes '...' for any internal quotes, definitions, or translations."
    )
    user_prompt = (
        "For each of the following Ancient Greek words, provide a brief, scholarly one-sentence definition, "
        "etymological note, or grammatical gloss:\n\n"
        f"Words: {terms_list}\n\n"
        "Respond ONLY with a JSON object where the keys are the exact words and the values are the definitions.\n"
        "Do NOT use double quotes inside the definitions/values; use single quotes instead.\n"
        'Example: {"word1": "definition1", "word2": "definition2"}'
    )

    try:
        raw_response = self.groq_client.generate_response(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            max_tokens=2048
        )

        # The client can come back empty-handed; treat that as "no glosses"
        # explicitly instead of tripping over it further down.
        if not raw_response or not isinstance(raw_response, str):
            print("[WARN] Failed to generate batch Greek explanations: empty response from Groq")
            return {}

        # Safe print to avoid UnicodeEncodeError in Windows command prompt
        print(f"[INFO] Groq glossary raw response: {raw_response.encode('ascii', 'backslashreplace').decode()}")

        # Find JSON block in response
        start = raw_response.find("{")
        end = raw_response.rfind("}") + 1
        if start == -1 or end <= start:
            return {}
        json_str = raw_response[start:end]

        try:
            definitions = json.loads(json_str)
        except ValueError as je:
            print(f"[WARN] Standard JSON load failed: {je}. Attempting regex recovery...")
            # Matches "key": "value"
            pattern = re.compile(r'"([^"]+)":\s*"((?:[^"\\]|\\.)*)"')
            definitions = dict(pattern.findall(json_str))

        return {k: str(v) for k, v in definitions.items()}
    except Exception as e:
        # Glosses are optional; never let a Groq/parsing problem propagate
        print(f"[WARN] Failed to generate batch Greek explanations: {e}")

    return {}
|
| 630 |
+
|
| 631 |
+
def _build_uses_list(self, terms, greek_text):
    """Build the bulleted glossary shown in the "uses" box.

    Each of the first 15 distinct terms gets a definition, taken from the
    RAG corpus when it has a match and otherwise from one batched Groq
    request (matched case-insensitively, then accent-insensitively); terms
    still undefined receive a generic description. Up to five notes are then
    added for characters of ``greek_text`` that have an entry in the
    ``greek_symbol_notes`` reference table.

    Args:
        terms: Greek words to define; may be empty.
        greek_text: The full extracted text, scanned character by character.

    Returns:
        A list of "- ..." strings; a single default hint line when nothing
        else was produced.
    """
    import unicodedata

    def composed(value):
        return unicodedata.normalize('NFC', value).strip()

    def without_accents(value):
        decomposed = unicodedata.normalize('NFD', value)
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    entries = []

    # 1. Definitions for the extracted Greek words
    if terms:
        # Distinct terms in first-seen order, capped at 15 to stay concise
        glossary_terms = list(dict.fromkeys(terms))[:15]
        print(f"[INFO] Generating glossary for {len(glossary_terms)} Greek terms...")

        resolved = {}
        unresolved = []
        for word in glossary_terms:
            # The RAG corpus is queried with the NFC-normalised spelling
            hits = self.rag_service.retrieve_grounding_list([composed(word)], max_results=1)
            if hits:
                resolved[word] = hits[0]["definition"]
            else:
                unresolved.append(word)

        # Everything the corpus could not answer goes to Groq in one batch
        if unresolved:
            groq_by_key = {
                composed(key).lower(): meaning
                for key, meaning in self._generate_batch_explanations(unresolved).items()
            }

            for word in unresolved:
                lookup = composed(word).lower()
                if lookup in groq_by_key:
                    resolved[word] = groq_by_key[lookup]
                    continue

                # Backup match ignoring accents (in case Groq stripped them)
                bare = without_accents(lookup)
                for candidate, meaning in groq_by_key.items():
                    if without_accents(candidate) == bare:
                        resolved[word] = meaning
                        break

        generic = "Ancient Greek lexical term. Characterized by specific diacritics and phonological values."
        for word in glossary_terms:
            entries.append(f"{word}: {resolved.get(word) or generic}")

    # 2. Notes for characters of the text that appear in the reference table
    notes = self.references.get("greek_symbol_notes", {}) or {}
    noted_chars = list(dict.fromkeys(ch for ch in greek_text if ch in notes))
    # Limit character notes to prevent clutter
    for ch in noted_chars[:5]:
        entries.append(f"Character '{ch}': {notes[ch]}")

    # Format as list items with bullets
    bullets = ["- " + entry for entry in entries]
    if not bullets:
        default_hint = self.references.get("greek_hint",
            "Ancient Greek script marker; values are determined by polytonic diacritical marks.")
        bullets.append(f"- —: {default_hint}")

    return bullets
|
| 708 |
+
|
| 709 |
+
|
| 710 |
+
def _build_meaning_box(self, terms, groq_detail):
    """Build the "possible meaning" box of the context payload.

    Args:
        terms: Extracted Greek terms; the first ten are listed as key terms.
        groq_detail: Optional narrative text; when it is a non-blank string
            it is appended (stripped) after the two fixed bullet points.

    Returns:
        A dict with ``title``, ``intro_lines``, ``frequent_label``,
        ``frequent`` and ``points``.
    """
    bullet_points = [
        "• Presence of nomina sacra, lection signs, or ekphonetic marks indicates liturgical usage; scholia imply classroom or commentary context.",
        "• Orthographic variation (e.g., iotacism) and common ligatures inform palaeographic placement and regional practice.",
    ]

    # Only real, non-blank strings are worth appending
    extra = groq_detail.strip() if isinstance(groq_detail, str) else ""
    if extra:
        bullet_points.append(extra)

    box = {"title": "Possible meaning:"}
    box["intro_lines"] = [
        "The lexical concentration suggests a connected passage with recurring words or themes, consistent with Greek manuscript traditions.",
        "Scribal features such as accents/breathings, abbreviations, and marginal cues guide reading and assist with dating and genre identification.",
    ]
    box["frequent_label"] = "Key terms noted"
    box["frequent"] = terms[:10]
    box["points"] = bullet_points
    return box
|
| 732 |
+
|
| 733 |
+
def generate_story(self, processed_result):
    """Ask Groq for an imaginative story inspired by the extracted Greek text.

    A narrative style and a four-digit "creative seed" are picked at random
    and embedded in the prompt.

    Args:
        processed_result: Dict whose optional ``text`` entry holds the Greek
            text.

    Returns:
        The generated story, or a fixed message when the Groq client is
        unavailable or the reply is empty or judged gibberish by
        ``is_gibberish``.
    """
    source_text = processed_result.get("text", "")

    if not self.groq_client.is_available():
        return "Groq client unavailable, cannot generate story."

    import random

    narrative_styles = (
        "as an epic poem told by a travelling rhapsode",
        "as a prophecy inscribed on the Oracle at Delphi",
        "as a philosophical dialogue in the Academy",
        "as a myth recounted by ancient storytellers",
        "as a recovered scroll from the Library of Alexandria",
        "as a hymn sung in honor of the gods",
    )
    picked_style = random.choice(narrative_styles)
    creative_seed = random.randint(1000, 9999)

    request = "".join([
        f"The following ancient Greek text was found: {source_text}\n\n",
        "Create a long, vivid, imaginative story from ancient Greek times ",
        "based on this Greek text. Write it as one rich paragraph with ",
        "much detail, mystery, and cultural atmosphere. At least 200 words.\n\n",
        f"Creative seed: {creative_seed}\n",
        f"Write a detailed, imaginative myth-like story {picked_style}. ",
        "Include multiple characters, rich imagery, and scenes. ",
        "Avoid repetition and keep it unpredictable.",
    ])

    tale = self.groq_client.generate_response(
        system_prompt="You are a learned ancient Greek storyteller and scholar of Hellenic culture.",
        user_prompt=request,
    )

    # Empty or nonsensical output is replaced by a fixed apology
    if tale and not is_gibberish(tale):
        return tale
    return "Failed to create quality story; the ancient texts remain silent."
|
processors/latin_processor.py
ADDED
|
@@ -0,0 +1,1281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import re
|
| 5 |
+
import time
|
| 6 |
+
from PIL import Image
|
| 7 |
+
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
|
| 8 |
+
import torch
|
| 9 |
+
from .base_processor import BaseScriptProcessor
|
| 10 |
+
from utils.text_utils import is_gibberish
|
| 11 |
+
|
| 12 |
+
# Local cache locations for model files: the "models" directory that sits one
# level above this file's directory, with one sub-directory per checkpoint.
BACKEND_MODELS_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, "models")
)
TRIDIS_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "tridis")
TROCR_LATIN_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "trocr_latin")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class LatinProcessor(BaseScriptProcessor):
|
| 18 |
+
def __init__(self, groq_client, references, clip_classifier):
    """Create the Latin processor with both transformer model slots empty.

    The TRIDIS and TrOCR slots are filled by ``setup_tridis_htr`` and
    ``setup_trocr_base_latin``; only the Tesseract fallback is configured
    here. The instance is then registered under the name "latin" with
    ``utils.gpu_diagnostics``.

    Args:
        groq_client: Forwarded to the base processor.
        references: Forwarded to the base processor.
        clip_classifier: Forwarded to the base processor.
    """
    super().__init__(groq_client, references, clip_classifier)

    # Prefer the GPU whenever torch can see one
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.device = torch.device(device_name)

    # TRIDIS HTR slot
    self.tridis_model = self.tridis_processor = None
    self.tridis_available = False

    # TrOCR (printed Latin) slot
    self.trocr_latin_model = self.trocr_latin_processor = None
    self.trocr_latin_available = False

    # Initial routing state; no transformer model is active yet
    self.active_style = "cursive"
    self.active_model = "None"

    self.setup_tesseract_fallback()

    # Register for dynamic VRAM management
    from utils.gpu_diagnostics import register_processor
    register_processor("latin", self)
|
| 38 |
+
|
| 39 |
+
def setup_tridis_htr(self):
    """Load the TRIDIS HTR checkpoint used for medieval Latin manuscripts.

    On success the processor and model are stored on the instance, the model
    is moved to ``self.device`` and switched to evaluation mode, and
    ``self.tridis_available`` is set to ``True``. Any failure is logged and
    leaves ``self.tridis_available`` set to ``False``; nothing is raised.
    """
    checkpoint = 'magistermilitum/tridis_HTR'
    try:
        from utils.gpu_diagnostics import reclaim_vram_for
        reclaim_vram_for("latin")

        print("[INFO] Lazily loading TRIDIS HTR model for medieval Latin...")
        print("[INFO] This model specializes in 13th-16th century manuscripts with automatic abbreviation expansion")

        # Files are fetched from Hugging Face on first use and cached in
        # TRIDIS_MODEL_DIR afterwards
        hub_options = {"cache_dir": TRIDIS_MODEL_DIR, "local_files_only": False}
        self.tridis_processor = TrOCRProcessor.from_pretrained(checkpoint, **hub_options)
        self.tridis_model = VisionEncoderDecoderModel.from_pretrained(checkpoint, **hub_options)

        self.tridis_model.to(self.device)
        self.tridis_model.eval()  # Put in evaluation mode

        from utils.gpu_diagnostics import log_model_device
        log_model_device("Latin TRIDIS HTR (Cursive)", self.device)

        for line in (
            f"[INFO] TRIDIS HTR loaded successfully on {self.device}",
            "[INFO] Training: 245,000 lines of Latin/Old French/Old Spanish medieval manuscripts",
            "[INFO] Features: Automatic abbreviation expansion, named entity capitalization, cancellation markers",
        ):
            print(line)
        self.tridis_available = True

    except Exception as load_error:
        print(f"[ERROR] TRIDIS HTR model failed to load: {load_error}")
        print("[WARN] Falling back to Tesseract for basic Latin recognition...")
        self.tridis_available = False
|
| 75 |
+
|
| 76 |
+
def setup_trocr_base_latin(self):
    """Load a TrOCR model for printed or carved classical Latin.

    The ``magistermilitum/trocr-base-latin`` checkpoint is tried first; if it
    cannot be loaded, the public ``microsoft/trocr-base-printed`` checkpoint
    is tried instead. ``self.trocr_latin_available`` reflects the outcome.
    Failures are logged, never raised.
    """
    try:
        self._load_trocr_latin_checkpoint(
            'magistermilitum/trocr-base-latin',
            "Latin TrOCR (Printed)",
            banner="[INFO] Lazily loading trocr-base-latin model for printed/carved Latin...",
        )
        print(f"[INFO] trocr-base-latin loaded successfully on {self.device}")
        return
    except Exception as e:
        print(f"[WARN] magistermilitum/trocr-base-latin model failed to load ({e}). Trying public fallback 'microsoft/trocr-base-printed'...")

    try:
        # The helper reclaims VRAM again before loading, after the failed
        # attempt's half-loaded objects have been dropped.
        self._load_trocr_latin_checkpoint(
            'microsoft/trocr-base-printed',
            "Latin TrOCR (Printed Fallback)",
        )
        print(f"[INFO] Public fallback microsoft/trocr-base-printed loaded successfully on {self.device}")
    except Exception as ex:
        print(f"[ERROR] All printed Latin models failed to load: {ex}")
        self.trocr_latin_available = False


def _load_trocr_latin_checkpoint(self, repo_id, device_label, banner=None):
    """Load one TrOCR checkpoint into the printed-Latin slots; raise on failure.

    Args:
        repo_id: Hugging Face repository id of the checkpoint.
        device_label: Name passed to ``log_model_device`` for the log line.
        banner: Optional message printed after VRAM has been reclaimed.
    """
    # Imported here so that every attempt, including the fallback, has the
    # helpers bound regardless of how an earlier attempt failed.
    from utils.gpu_diagnostics import log_model_device, reclaim_vram_for

    reclaim_vram_for("latin")
    if banner:
        print(banner)

    processor = TrOCRProcessor.from_pretrained(
        repo_id,
        cache_dir=TROCR_LATIN_MODEL_DIR,
        local_files_only=False
    )
    model = VisionEncoderDecoderModel.from_pretrained(
        repo_id,
        cache_dir=TROCR_LATIN_MODEL_DIR,
        local_files_only=False
    )
    model.to(self.device)
    model.eval()  # Put in evaluation mode

    # Publish only a fully loaded pair, so a failed attempt never leaves a
    # processor/model mismatch (or a half-moved model) on the instance.
    self.trocr_latin_processor = processor
    self.trocr_latin_model = model

    log_model_device(device_label, self.device)
    self.trocr_latin_available = True
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def setup_tesseract_fallback(self):
    """Configure Tesseract as the fallback OCR engine for Latin text.

    On success sets ``self.ocr_configs`` (config name -> Tesseract CLI flags)
    and ``self.tesseract_available = True``. If pytesseract cannot be
    imported, or anything else fails, the problem is logged and
    ``self.tesseract_available`` is set to ``False``. Only ``Exception``
    subclasses are handled; ``KeyboardInterrupt`` and ``SystemExit``
    propagate to the caller.
    """
    try:
        import pytesseract

        # The version probe is informational only: a failure is logged but
        # does not disable the fallback.
        try:
            version = pytesseract.get_tesseract_version()
            print(f"[INFO] Tesseract fallback version: {version}")
        except Exception:
            # Deliberately not a bare ``except:`` so Ctrl-C and interpreter
            # shutdown are not swallowed here.
            print("[INFO] Tesseract version check skipped")

        self.ocr_configs = {
            'medieval_extended': r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,;:!?()[]{}/-·&℞℟℣†‡¶§꜠꜡ꜢꜣꜤꜥꝀꝁꝐꝑꝒꝓꝔꝕꝖꝗꝘꝙꝚꝛꝜꝝꞀꞁꞂꞃ$',
            'medieval_basic': r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,;:!?()[]{}/-',
            'standard': r'--oem 3 --psm 6',
            'single_line': r'--oem 3 --psm 7',
            'single_word': r'--oem 3 --psm 8',
            'auto': r'--oem 3 --psm 3',
        }
        self.tesseract_available = True
        print("[INFO] Tesseract fallback configured with medieval symbol support")

    except ImportError:
        print("[ERROR] pytesseract not available")
        self.tesseract_available = False
    except Exception as e:
        print(f"[WARN] Tesseract setup failed: {e}")
        self.tesseract_available = False
|
| 159 |
+
|
| 160 |
+
def detect_script(self, image_path):
    """Report whether this processor can handle the image, with a confidence.

    No image analysis happens here (``image_path`` is not read); the result
    depends only on which OCR engines are flagged as available.

    Returns:
        ``(True, 0.98)`` when TRIDIS is available, ``(True, 0.85)`` when only
        Tesseract is, and ``(False, 0.0)`` when neither is or on any error.
    """
    try:
        has_tridis = self.tridis_available
        if not (has_tridis or self.tesseract_available):
            print("[ERROR] No OCR engines available for Latin processing")
            return False, 0.0

        if has_tridis:
            method, confidence = "TRIDIS HTR (medieval specialist)", 0.98
        else:
            method, confidence = "Tesseract fallback", 0.85

        print(f"[INFO] Latin processor activated - Using {method}")
        return True, confidence

    except Exception as e:
        print(f"[ERROR] Latin detection failed: {e}")
        return False, 0.0
|
| 174 |
+
|
| 175 |
+
def extract_text(self, image_path):
    """Transcribe Latin text from an image using style-based model routing.

    The style reported by ``self.layout_parser.detect_writing_style`` picks
    the primary neural model: trocr-base-latin for ``"printed"``, TRIDIS HTR
    for anything else. When the primary model is unavailable or its output
    fails ``_validate_latin_text``, the other neural model is tried, then
    Tesseract (if ``self.tesseract_available``), then layout-aware
    line-by-line OCR.

    Side effects: sets ``self.active_style`` and ``self.active_model`` to the
    engine that actually produced the returned text (``"unknown"``/``"None"``
    when nothing validated, ``"error"``/``"None"`` on an exception).

    Args:
        image_path: Path of the image to transcribe.

    Returns:
        The transcription, or a human-readable failure message. Exceptions
        are caught and reported in the returned string.
    """
    try:
        start_time = time.time()

        # Step 1: Detect script style
        style = self.layout_parser.detect_writing_style(image_path, self.clip_classifier)
        print(f"[INFO] Latin writing style detected: {style.upper()}")

        # Static description of the two neural engines, keyed by the style
        # each one is specialised for.
        engines = {
            "printed": {
                "model_attr": "trocr_latin_model",
                "flag_attr": "trocr_latin_available",
                "setup": "setup_trocr_base_latin",
                "extract": "_extract_with_trocr_base_latin",
                "model_name": "trocr-base-latin",
                "display": "trocr-base-latin",
                "vram_label": "Latin TrOCR (Printed)",
                "routing_msg": "[INFO] Routing to printed/carved Latin model (trocr-base-latin)...",
            },
            "cursive": {
                "model_attr": "tridis_model",
                "flag_attr": "tridis_available",
                "setup": "setup_tridis_htr",
                "extract": "_extract_with_tridis_htr",
                "model_name": "tridis_HTR",
                "display": "TRIDIS HTR",
                "vram_label": "Latin TRIDIS HTR (Cursive)",
                "routing_msg": "[INFO] Routing to medieval manuscript model (tridis_HTR)...",
            },
        }
        load_attempted = set()

        def lazy_load(kind):
            """Run the engine's setup at most once per call if its model is not loaded."""
            spec = engines[kind]
            if getattr(self, spec["model_attr"], None) is None and kind not in load_attempted:
                load_attempted.add(kind)
                getattr(self, spec["setup"])()

        def is_available(kind):
            """Return True when the engine's availability flag is set."""
            return bool(getattr(self, engines[kind]["flag_attr"], False))

        # Any style other than "printed" is treated as manuscript/cursive.
        primary_kind = "printed" if style == "printed" else "cursive"
        secondary_kind = "cursive" if primary_kind == "printed" else "printed"
        primary = engines[primary_kind]
        secondary = engines[secondary_kind]

        # Ensure the routed model is loaded, or re-activate it on the device.
        primary_model = getattr(self, primary["model_attr"], None)
        if primary_model is None:
            lazy_load(primary_kind)
        else:
            from utils.gpu_diagnostics import reclaim_vram_for
            reclaim_vram_for("latin")
            if str(next(primary_model.parameters()).device) != str(self.device):
                print(f"[VRAM MANAGER] Activating {primary['vram_label']} model on {self.device}...")
                primary_model.to(self.device)

        if is_available(primary_kind):
            print(primary["routing_msg"])
            primary_text = getattr(self, primary["extract"])(image_path)
            if primary_text and self._validate_latin_text(primary_text, style):
                processing_time = time.time() - start_time
                print(f"[SUCCESS] Routed to {primary['model_name']} and completed in {processing_time:.2f}s")
                self.active_style = primary_kind
                self.active_model = primary["model_name"]
                return primary_text
            print(f"[WARN] {primary['display']} returned poor quality result, trying {secondary['display']} fallback...")
        else:
            # The other neural model is still worth trying before Tesseract.
            print(f"[WARN] {primary['display']} is unavailable, trying {secondary['display']} fallback...")

        # Step 2: Try the other neural model; validate and label the result
        # by the model that produced it, not by the detected style.
        lazy_load(secondary_kind)
        if is_available(secondary_kind):
            fallback_text = getattr(self, secondary["extract"])(image_path)
            if fallback_text and self._validate_latin_text(fallback_text, secondary_kind):
                processing_time = time.time() - start_time
                print(f"[SUCCESS] Fallback model transcription successful in {processing_time:.2f}s")
                self.active_style = secondary_kind
                self.active_model = secondary["model_name"]
                return fallback_text

        # Step 3: Tesseract fallback
        if getattr(self, 'tesseract_available', False):
            print("[INFO] Neural models failed. Processing with Tesseract fallback...")
            tesseract_text = self._extract_with_tesseract_enhanced(image_path)

            if tesseract_text and self._validate_latin_text(tesseract_text, "any"):
                processing_time = time.time() - start_time
                print(f"[SUCCESS] Tesseract fallback completed in {processing_time:.2f}s")
                self.active_style = "printed"  # Tesseract works best on printed
                self.active_model = "Tesseract OCR"
                return tesseract_text
            print("[WARN] Tesseract returned poor quality result, trying layout-aware segmentation fallback...")

        # Step 4: Layout-aware line segment fallback
        layout_aware_text = self._extract_layout_aware_ocr(image_path)
        if layout_aware_text and self._validate_latin_text(layout_aware_text, "any"):
            processing_time = time.time() - start_time
            print(f"[SUCCESS] Layout-aware OCR completed in {processing_time:.2f}s")
            self.active_style = "printed"
            self.active_model = "Tesseract Layout-Aware"
            return layout_aware_text

        print("[ERROR] All OCR methods failed or returned poor quality results")
        self.active_style = "unknown"
        self.active_model = "None"
        return "No readable Latin text detected with sufficient confidence"

    except Exception as e:
        print(f"[ERROR] Latin text extraction failed: {e}")
        self.active_style = "error"
        self.active_model = "None"
        return f"Error during text extraction: {str(e)}"
|
| 281 |
+
|
| 282 |
+
def _extract_with_trocr_base_latin(self, image_path):
|
| 283 |
+
"""Extract text using trocr-base-latin - SPECIALIZED for printed/carved Latin"""
|
| 284 |
+
if self.trocr_latin_model is None:
|
| 285 |
+
self.setup_trocr_base_latin()
|
| 286 |
+
else:
|
| 287 |
+
from utils.gpu_diagnostics import reclaim_vram_for
|
| 288 |
+
reclaim_vram_for("latin")
|
| 289 |
+
if str(next(self.trocr_latin_model.parameters()).device) != str(self.device):
|
| 290 |
+
print(f"[VRAM MANAGER] Activating Latin TrOCR model on {self.device}...")
|
| 291 |
+
self.trocr_latin_model.to(self.device)
|
| 292 |
+
|
| 293 |
+
if not getattr(self, 'trocr_latin_available', False) or self.trocr_latin_model is None:
|
| 294 |
+
return ""
|
| 295 |
+
|
| 296 |
+
try:
|
| 297 |
+
image = Image.open(image_path).convert("RGB")
|
| 298 |
+
print(f"[INFO] Processing image with trocr-base-latin: {image.size[0]}x{image.size[1]} pixels")
|
| 299 |
+
|
| 300 |
+
# Since trocr models are line-level OCR models, segment into lines first
|
| 301 |
+
layout = self.layout_parser.analyze_layout(image_path)
|
| 302 |
+
crops = self.layout_parser.crop_lines(image_path, layout)
|
| 303 |
+
|
| 304 |
+
if crops and len(crops) > 1:
|
| 305 |
+
print(f"[INFO] Image contains multiple lines ({len(crops)}). Running line-by-line trocr-base-latin...")
|
| 306 |
+
line_texts = []
|
| 307 |
+
for idx, crop in enumerate(crops):
|
| 308 |
+
text = self._ocr_single_crop_with_trocr_base_latin(crop)
|
| 309 |
+
if text:
|
| 310 |
+
line_texts.append(text)
|
| 311 |
+
return "\n".join(line_texts)
|
| 312 |
+
else:
|
| 313 |
+
print("[INFO] Single line detected or layout parser returned no lines. Processing full image...")
|
| 314 |
+
return self._ocr_single_crop_with_trocr_base_latin(image)
|
| 315 |
+
|
| 316 |
+
except Exception as e:
|
| 317 |
+
print(f"[ERROR] trocr-base-latin extraction failed: {e}")
|
| 318 |
+
return ""
|
| 319 |
+
|
| 320 |
+
def _ocr_single_crop_with_trocr_base_latin(self, crop_image):
|
| 321 |
+
"""Helper to run trocr-base-latin inference on a single image crop"""
|
| 322 |
+
try:
|
| 323 |
+
pixel_values = self.trocr_latin_processor(
|
| 324 |
+
images=crop_image,
|
| 325 |
+
return_tensors="pt"
|
| 326 |
+
).pixel_values.to(self.device)
|
| 327 |
+
|
| 328 |
+
with torch.inference_mode():
|
| 329 |
+
generated_ids = self.trocr_latin_model.generate(
|
| 330 |
+
pixel_values,
|
| 331 |
+
max_length=512,
|
| 332 |
+
num_beams=4,
|
| 333 |
+
early_stopping=True
|
| 334 |
+
)
|
| 335 |
+
|
| 336 |
+
text = self.trocr_latin_processor.batch_decode(
|
| 337 |
+
generated_ids,
|
| 338 |
+
skip_special_tokens=True
|
| 339 |
+
)[0]
|
| 340 |
+
|
| 341 |
+
text = ' '.join(text.split())
|
| 342 |
+
return text.strip()
|
| 343 |
+
except Exception as e:
|
| 344 |
+
print(f"[ERROR] Single line OCR with trocr-base-latin failed: {e}")
|
| 345 |
+
return ""
|
| 346 |
+
|
| 347 |
+
def _extract_with_tridis_htr(self, image_path):
|
| 348 |
+
"""Extract text using TRIDIS HTR - SPECIALIZED for medieval Latin manuscripts.
|
| 349 |
+
Uses layout-aware line segmentation so multi-line documents are fully transcribed."""
|
| 350 |
+
if self.tridis_model is None:
|
| 351 |
+
self.setup_tridis_htr()
|
| 352 |
+
else:
|
| 353 |
+
from utils.gpu_diagnostics import reclaim_vram_for
|
| 354 |
+
reclaim_vram_for("latin")
|
| 355 |
+
if str(next(self.tridis_model.parameters()).device) != str(self.device):
|
| 356 |
+
print(f"[VRAM MANAGER] Activating Latin TRIDIS model on {self.device}...")
|
| 357 |
+
self.tridis_model.to(self.device)
|
| 358 |
+
|
| 359 |
+
if not getattr(self, 'tridis_available', False) or self.tridis_model is None:
|
| 360 |
+
return ""
|
| 361 |
+
|
| 362 |
+
try:
|
| 363 |
+
# Load and validate image
|
| 364 |
+
image = Image.open(image_path).convert("RGB")
|
| 365 |
+
print(f"[INFO] Processing image with TRIDIS HTR: {image.size[0]}x{image.size[1]} pixels")
|
| 366 |
+
|
| 367 |
+
# Use layout parser to segment into individual lines
|
| 368 |
+
layout = self.layout_parser.analyze_layout(image_path)
|
| 369 |
+
crops = self.layout_parser.crop_lines(image_path, layout)
|
| 370 |
+
|
| 371 |
+
if crops and len(crops) > 1:
|
| 372 |
+
# Cap lines to prevent timeout on very large documents (CPU inference)
|
| 373 |
+
MAX_LINES = 50
|
| 374 |
+
total_detected = len(crops)
|
| 375 |
+
if len(crops) > MAX_LINES:
|
| 376 |
+
print(f"[INFO] Layout parser detected {total_detected} text lines. Capping to {MAX_LINES} for performance.")
|
| 377 |
+
crops = crops[:MAX_LINES]
|
| 378 |
+
else:
|
| 379 |
+
print(f"[INFO] Layout parser detected {total_detected} text lines. Running line-by-line TRIDIS HTR...")
|
| 380 |
+
|
| 381 |
+
line_texts = []
|
| 382 |
+
for idx, crop in enumerate(crops):
|
| 383 |
+
# Preprocess each line crop for medieval manuscripts
|
| 384 |
+
enhanced_crop = self._preprocess_for_medieval_manuscript(crop)
|
| 385 |
+
text = self._ocr_single_crop_with_tridis(enhanced_crop)
|
| 386 |
+
if text:
|
| 387 |
+
line_texts.append(text)
|
| 388 |
+
print(f" [LINE {idx+1}/{len(crops)}] {text[:80]}...")
|
| 389 |
+
|
| 390 |
+
if line_texts:
|
| 391 |
+
full_text = "\n".join(line_texts)
|
| 392 |
+
# Post-process medieval abbreviations, corrections, and formatting
|
| 393 |
+
processed_text = self._post_process_medieval_text(full_text)
|
| 394 |
+
|
| 395 |
+
char_count = len(processed_text)
|
| 396 |
+
word_count = len(processed_text.split())
|
| 397 |
+
print(f"[INFO] TRIDIS HTR extracted (multi-line): {char_count} characters, {word_count} words from {len(line_texts)} lines")
|
| 398 |
+
|
| 399 |
+
medieval_features = self._analyze_medieval_features(processed_text)
|
| 400 |
+
if medieval_features:
|
| 401 |
+
print(f"[INFO] Medieval features detected: {', '.join(medieval_features)}")
|
| 402 |
+
|
| 403 |
+
return processed_text.strip()
|
| 404 |
+
|
| 405 |
+
# Single line or no layout detected — process full image
|
| 406 |
+
print("[INFO] Single line or no layout segmentation. Processing full image with TRIDIS HTR...")
|
| 407 |
+
enhanced_image = self._preprocess_for_medieval_manuscript(image)
|
| 408 |
+
|
| 409 |
+
# Process with TRIDIS HTR
|
| 410 |
+
print("[INFO] Running TRIDIS HTR inference...")
|
| 411 |
+
pixel_values = self.tridis_processor(
|
| 412 |
+
images=enhanced_image,
|
| 413 |
+
return_tensors="pt"
|
| 414 |
+
).pixel_values.to(self.device)
|
| 415 |
+
|
| 416 |
+
# Generate text with parameters optimized for medieval manuscripts
|
| 417 |
+
with torch.inference_mode():
|
| 418 |
+
generated_ids = self.tridis_model.generate(
|
| 419 |
+
pixel_values,
|
| 420 |
+
max_length=768, # Longer sequences for medieval texts with abbreviations
|
| 421 |
+
num_beams=6, # Higher quality beam search for historical accuracy
|
| 422 |
+
early_stopping=True,
|
| 423 |
+
do_sample=False,
|
| 424 |
+
repetition_penalty=1.15, # Avoid repetition common in medieval texts
|
| 425 |
+
length_penalty=0.8, # Don't penalize longer expansions
|
| 426 |
+
no_repeat_ngram_size=2 # Avoid immediate repetitions
|
| 427 |
+
)
|
| 428 |
+
|
| 429 |
+
# Decode the generated text
|
| 430 |
+
generated_text = self.tridis_processor.batch_decode(
|
| 431 |
+
generated_ids,
|
| 432 |
+
skip_special_tokens=True
|
| 433 |
+
)[0]
|
| 434 |
+
|
| 435 |
+
# Post-process medieval abbreviations, corrections, and formatting
|
| 436 |
+
processed_text = self._post_process_medieval_text(generated_text)
|
| 437 |
+
|
| 438 |
+
# Log extraction results
|
| 439 |
+
char_count = len(processed_text)
|
| 440 |
+
word_count = len(processed_text.split())
|
| 441 |
+
print(f"[INFO] TRIDIS HTR extracted: {char_count} characters, {word_count} words")
|
| 442 |
+
|
| 443 |
+
# Detect medieval features
|
| 444 |
+
medieval_features = self._analyze_medieval_features(processed_text)
|
| 445 |
+
if medieval_features:
|
| 446 |
+
print(f"[INFO] Medieval features detected: {', '.join(medieval_features)}")
|
| 447 |
+
|
| 448 |
+
return processed_text.strip()
|
| 449 |
+
|
| 450 |
+
except Exception as e:
|
| 451 |
+
print(f"[ERROR] TRIDIS HTR extraction failed: {e}")
|
| 452 |
+
return ""
|
| 453 |
+
|
| 454 |
+
def _ocr_single_crop_with_tridis(self, crop_image):
|
| 455 |
+
"""Helper to run TRIDIS HTR inference on a single line crop image"""
|
| 456 |
+
try:
|
| 457 |
+
pixel_values = self.tridis_processor(
|
| 458 |
+
images=crop_image,
|
| 459 |
+
return_tensors="pt"
|
| 460 |
+
).pixel_values.to(self.device)
|
| 461 |
+
|
| 462 |
+
with torch.inference_mode():
|
| 463 |
+
generated_ids = self.tridis_model.generate(
|
| 464 |
+
pixel_values,
|
| 465 |
+
max_length=768,
|
| 466 |
+
num_beams=6,
|
| 467 |
+
early_stopping=True,
|
| 468 |
+
do_sample=False,
|
| 469 |
+
repetition_penalty=1.15,
|
| 470 |
+
length_penalty=0.8,
|
| 471 |
+
no_repeat_ngram_size=2
|
| 472 |
+
)
|
| 473 |
+
|
| 474 |
+
text = self.tridis_processor.batch_decode(
|
| 475 |
+
generated_ids,
|
| 476 |
+
skip_special_tokens=True
|
| 477 |
+
)[0]
|
| 478 |
+
|
| 479 |
+
text = ' '.join(text.split())
|
| 480 |
+
return text.strip()
|
| 481 |
+
except Exception as e:
|
| 482 |
+
print(f"[ERROR] Single line OCR with TRIDIS failed: {e}")
|
| 483 |
+
return ""
|
| 484 |
+
|
| 485 |
+
def _preprocess_for_medieval_manuscript(self, image):
|
| 486 |
+
"""Enhanced preprocessing specifically optimized for medieval manuscripts"""
|
| 487 |
+
try:
|
| 488 |
+
print("[INFO] Applying medieval manuscript preprocessing...")
|
| 489 |
+
|
| 490 |
+
# Convert to OpenCV format
|
| 491 |
+
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 492 |
+
gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
|
| 493 |
+
|
| 494 |
+
# Step 1: Handle parchment/paper background variations
|
| 495 |
+
# CLAHE for local contrast enhancement (handles uneven illumination)
|
| 496 |
+
clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8,8))
|
| 497 |
+
contrast_enhanced = clahe.apply(gray)
|
| 498 |
+
|
| 499 |
+
# Step 2: Gentle denoising to preserve medieval letterforms and ink variations
|
| 500 |
+
# Bilateral filter preserves edges while reducing noise
|
| 501 |
+
denoised = cv2.bilateralFilter(contrast_enhanced, 7, 80, 80)
|
| 502 |
+
|
| 503 |
+
# Step 3: Enhance faded ink while preserving original stroke width
|
| 504 |
+
# Subtle sharpening kernel
|
| 505 |
+
sharpen_kernel = np.array([
|
| 506 |
+
[-0.5, -1, -0.5],
|
| 507 |
+
[-1, 6, -1 ],
|
| 508 |
+
[-0.5, -1, -0.5]
|
| 509 |
+
])
|
| 510 |
+
sharpened = cv2.filter2D(denoised, -1, sharpen_kernel)
|
| 511 |
+
|
| 512 |
+
# Step 4: Normalize intensity range for optimal TRIDIS input
|
| 513 |
+
normalized = cv2.normalize(sharpened, None, 0, 255, cv2.NORM_MINMAX)
|
| 514 |
+
|
| 515 |
+
# Convert back to PIL format and ensure it is RGB mode
|
| 516 |
+
processed_image = Image.fromarray(normalized).convert("RGB")
|
| 517 |
+
|
| 518 |
+
print("[INFO] Medieval preprocessing completed: contrast enhanced, denoised, sharpened")
|
| 519 |
+
return processed_image
|
| 520 |
+
|
| 521 |
+
except Exception as e:
|
| 522 |
+
print(f"[WARN] Medieval preprocessing failed: {e}, using original image")
|
| 523 |
+
return image
|
| 524 |
+
|
| 525 |
+
def _post_process_medieval_text(self, text):
|
| 526 |
+
"""Post-process text from TRIDIS HTR with medieval-specific corrections"""
|
| 527 |
+
try:
|
| 528 |
+
if not text:
|
| 529 |
+
return text
|
| 530 |
+
|
| 531 |
+
print("[INFO] Post-processing TRIDIS HTR output for medieval features...")
|
| 532 |
+
processed = text
|
| 533 |
+
|
| 534 |
+
# Handle TRIDIS cancellation/correction markers
|
| 535 |
+
# TRIDIS uses $word$ to mark cancelled/corrected text
|
| 536 |
+
import re
|
| 537 |
+
|
| 538 |
+
# Count cancellations before processing
|
| 539 |
+
cancellation_count = processed.count('$') // 2
|
| 540 |
+
|
| 541 |
+
# Convert $word$ to editorial brackets [word] for scholarly display
|
| 542 |
+
processed = re.sub(r'\$([^$]*)\$', r'[\1]', processed)
|
| 543 |
+
|
| 544 |
+
if cancellation_count > 0:
|
| 545 |
+
print(f"[INFO] Processed {cancellation_count} scribal corrections/cancellations")
|
| 546 |
+
|
| 547 |
+
# Clean up multiple spaces and normalize whitespace
|
| 548 |
+
processed = ' '.join(processed.split())
|
| 549 |
+
|
| 550 |
+
# Detect and log TRIDIS abbreviation expansions
|
| 551 |
+
# Common medieval abbreviations that TRIDIS expands automatically
|
| 552 |
+
medieval_expansions = {
|
| 553 |
+
'domini': 'dñi/dni/dom̃',
|
| 554 |
+
'facimus': 'facim̃/facimꝰ',
|
| 555 |
+
'quod': 'qd/q̃d',
|
| 556 |
+
'enim': 'enim̃/en̄',
|
| 557 |
+
'pro': 'ꝓ/p̃',
|
| 558 |
+
'et': '⁊/et̃',
|
| 559 |
+
'cum': 'cũ/cum̃',
|
| 560 |
+
'per': 'p̃/ꝑ',
|
| 561 |
+
'sunt': 'sũt/sunt̃',
|
| 562 |
+
'omnia': 'om̃ia/omn̄a'
|
| 563 |
+
}
|
| 564 |
+
|
| 565 |
+
expansions_found = []
|
| 566 |
+
for expansion, abbreviations in medieval_expansions.items():
|
| 567 |
+
if expansion in processed.lower():
|
| 568 |
+
expansions_found.append(f"{abbreviations}→{expansion}")
|
| 569 |
+
|
| 570 |
+
if expansions_found:
|
| 571 |
+
print(f"[INFO] TRIDIS expanded abbreviations: {', '.join(expansions_found[:5])}")
|
| 572 |
+
if len(expansions_found) > 5:
|
| 573 |
+
print(f"[INFO] ... and {len(expansions_found) - 5} more abbreviations")
|
| 574 |
+
|
| 575 |
+
# Detect capitalization patterns (TRIDIS capitalizes named entities)
|
| 576 |
+
capitalized_words = re.findall(r'\b[A-Z][a-z]+', processed)
|
| 577 |
+
if capitalized_words:
|
| 578 |
+
unique_caps = list(set(capitalized_words))
|
| 579 |
+
print(f"[INFO] Named entities capitalized: {', '.join(unique_caps[:5])}")
|
| 580 |
+
if len(unique_caps) > 5:
|
| 581 |
+
print(f"[INFO] ... and {len(unique_caps) - 5} more entities")
|
| 582 |
+
|
| 583 |
+
return processed
|
| 584 |
+
|
| 585 |
+
except Exception as e:
|
| 586 |
+
print(f"[WARN] Medieval post-processing failed: {e}")
|
| 587 |
+
return text
|
| 588 |
+
|
| 589 |
+
def _analyze_medieval_features(self, text):
|
| 590 |
+
"""Analyze and identify medieval manuscript features in the text"""
|
| 591 |
+
features = []
|
| 592 |
+
|
| 593 |
+
if not text:
|
| 594 |
+
return features
|
| 595 |
+
|
| 596 |
+
try:
|
| 597 |
+
# Cancellation markers
|
| 598 |
+
if '[' in text and ']' in text:
|
| 599 |
+
features.append("scribal corrections")
|
| 600 |
+
|
| 601 |
+
# Expanded abbreviations
|
| 602 |
+
medieval_words = ['domini', 'facimus', 'quod', 'enim', 'pro', 'cum', 'per', 'sunt', 'omnia']
|
| 603 |
+
found_expansions = [word for word in medieval_words if word in text.lower()]
|
| 604 |
+
if found_expansions:
|
| 605 |
+
features.append(f"abbreviation expansions ({len(found_expansions)})")
|
| 606 |
+
|
| 607 |
+
# Named entity capitalization
|
| 608 |
+
import re
|
| 609 |
+
caps_count = len(re.findall(r'\b[A-Z][a-z]+', text))
|
| 610 |
+
if caps_count > 0:
|
| 611 |
+
features.append(f"capitalized entities ({caps_count})")
|
| 612 |
+
|
| 613 |
+
# Medieval punctuation patterns
|
| 614 |
+
if '.' in text or ',' in text or ':' in text:
|
| 615 |
+
features.append("punctuation normalization")
|
| 616 |
+
|
| 617 |
+
# Special medieval characters
|
| 618 |
+
medieval_chars = sum(1 for c in text if c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§")
|
| 619 |
+
if medieval_chars > 0:
|
| 620 |
+
features.append(f"medieval symbols ({medieval_chars})")
|
| 621 |
+
|
| 622 |
+
except Exception as e:
|
| 623 |
+
print(f"[WARN] Medieval feature analysis failed: {e}")
|
| 624 |
+
|
| 625 |
+
return features
|
| 626 |
+
|
| 627 |
+
def _extract_with_tesseract_enhanced(self, image_path):
|
| 628 |
+
"""Enhanced Tesseract extraction with multiple configurations"""
|
| 629 |
+
try:
|
| 630 |
+
import pytesseract
|
| 631 |
+
|
| 632 |
+
image = Image.open(image_path).convert("RGB")
|
| 633 |
+
|
| 634 |
+
# Multiple preprocessing approaches
|
| 635 |
+
preprocessed_images = {
|
| 636 |
+
'enhanced': self._preprocess_for_tesseract_enhanced(image),
|
| 637 |
+
'basic': self._preprocess_for_tesseract_basic(image),
|
| 638 |
+
'original': image
|
| 639 |
+
}
|
| 640 |
+
|
| 641 |
+
best_text = ""
|
| 642 |
+
best_score = 0
|
| 643 |
+
best_config = ""
|
| 644 |
+
best_preprocessing = ""
|
| 645 |
+
|
| 646 |
+
# Try different combinations of preprocessing and OCR configurations
|
| 647 |
+
for prep_name, prep_image in preprocessed_images.items():
|
| 648 |
+
for config_name, config in self.ocr_configs.items():
|
| 649 |
+
try:
|
| 650 |
+
# Try with Latin language first
|
| 651 |
+
text = pytesseract.image_to_string(
|
| 652 |
+
prep_image,
|
| 653 |
+
lang='lat',
|
| 654 |
+
config=config
|
| 655 |
+
).strip()
|
| 656 |
+
|
| 657 |
+
# If Latin fails or produces poor results, try English
|
| 658 |
+
if not text or len(text) < 5:
|
| 659 |
+
text = pytesseract.image_to_string(
|
| 660 |
+
prep_image,
|
| 661 |
+
lang='eng',
|
| 662 |
+
config=config
|
| 663 |
+
).strip()
|
| 664 |
+
|
| 665 |
+
# Score the result
|
| 666 |
+
score = self._score_tesseract_result(text)
|
| 667 |
+
|
| 668 |
+
if text and score > best_score:
|
| 669 |
+
best_text = text
|
| 670 |
+
best_score = score
|
| 671 |
+
best_config = config_name
|
| 672 |
+
best_preprocessing = prep_name
|
| 673 |
+
|
| 674 |
+
except Exception as e:
|
| 675 |
+
continue # Skip failed configurations
|
| 676 |
+
|
| 677 |
+
if best_text:
|
| 678 |
+
print(f"[INFO] Best Tesseract result: {best_preprocessing} + {best_config} (score: {best_score:.3f})")
|
| 679 |
+
return self._post_process_tesseract_text(best_text)
|
| 680 |
+
|
| 681 |
+
return ""
|
| 682 |
+
|
| 683 |
+
except Exception as e:
|
| 684 |
+
print(f"[ERROR] Enhanced Tesseract extraction failed: {e}")
|
| 685 |
+
return ""
|
| 686 |
+
|
| 687 |
+
def _extract_layout_aware_ocr(self, image_path):
|
| 688 |
+
"""Extract text by segmenting the page layout into lines first for improved readability order"""
|
| 689 |
+
try:
|
| 690 |
+
import pytesseract
|
| 691 |
+
print("[INFO] Running layout-aware line segmentation...")
|
| 692 |
+
layout = self.layout_parser.analyze_layout(image_path)
|
| 693 |
+
crops = self.layout_parser.crop_lines(image_path, layout)
|
| 694 |
+
|
| 695 |
+
if not crops:
|
| 696 |
+
print("[WARN] Layout parser returned no line crops")
|
| 697 |
+
return ""
|
| 698 |
+
|
| 699 |
+
print(f"[INFO] Layout-aware line parser cropped {len(crops)} lines")
|
| 700 |
+
line_texts = []
|
| 701 |
+
|
| 702 |
+
for idx, crop in enumerate(crops):
|
| 703 |
+
# Enhance line crop for OCR
|
| 704 |
+
crop_cv = cv2.cvtColor(np.array(crop), cv2.COLOR_RGB2BGR)
|
| 705 |
+
gray = cv2.cvtColor(crop_cv, cv2.COLOR_BGR2GRAY)
|
| 706 |
+
clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(4,4))
|
| 707 |
+
enhanced = clahe.apply(gray)
|
| 708 |
+
crop_pil = Image.fromarray(enhanced)
|
| 709 |
+
|
| 710 |
+
# Single line OCR configuration
|
| 711 |
+
config = '--oem 3 --psm 7'
|
| 712 |
+
|
| 713 |
+
# Try Latin OCR first
|
| 714 |
+
text = pytesseract.image_to_string(
|
| 715 |
+
crop_pil,
|
| 716 |
+
lang='lat',
|
| 717 |
+
config=config
|
| 718 |
+
).strip()
|
| 719 |
+
|
| 720 |
+
# Try English fallback
|
| 721 |
+
if not text or len(text) < 3:
|
| 722 |
+
text = pytesseract.image_to_string(
|
| 723 |
+
crop_pil,
|
| 724 |
+
lang='eng',
|
| 725 |
+
config=config
|
| 726 |
+
).strip()
|
| 727 |
+
|
| 728 |
+
if text:
|
| 729 |
+
line_texts.append(self._post_process_tesseract_text(text))
|
| 730 |
+
|
| 731 |
+
return "\n".join(line_texts)
|
| 732 |
+
except Exception as e:
|
| 733 |
+
print(f"[WARN] Layout aware Latin OCR failed: {e}")
|
| 734 |
+
return ""
|
| 735 |
+
|
| 736 |
+
def _preprocess_for_tesseract_enhanced(self, image):
|
| 737 |
+
"""Enhanced preprocessing for Tesseract OCR"""
|
| 738 |
+
try:
|
| 739 |
+
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 740 |
+
gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
|
| 741 |
+
|
| 742 |
+
# More aggressive enhancement for Tesseract
|
| 743 |
+
clahe = cv2.createCLAHE(clipLimit=4.0, tileGridSize=(8,8))
|
| 744 |
+
enhanced = clahe.apply(gray)
|
| 745 |
+
|
| 746 |
+
# Morphological operations to clean up characters
|
| 747 |
+
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
|
| 748 |
+
cleaned = cv2.morphologyEx(enhanced, cv2.MORPH_CLOSE, kernel)
|
| 749 |
+
|
| 750 |
+
return Image.fromarray(cleaned)
|
| 751 |
+
|
| 752 |
+
except Exception as e:
|
| 753 |
+
print(f"[WARN] Enhanced Tesseract preprocessing failed: {e}")
|
| 754 |
+
return image
|
| 755 |
+
|
| 756 |
+
def _preprocess_for_tesseract_basic(self, image):
|
| 757 |
+
"""Basic preprocessing for Tesseract OCR"""
|
| 758 |
+
try:
|
| 759 |
+
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 760 |
+
gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
|
| 761 |
+
|
| 762 |
+
# Simple contrast enhancement
|
| 763 |
+
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
|
| 764 |
+
enhanced = clahe.apply(gray)
|
| 765 |
+
|
| 766 |
+
return Image.fromarray(enhanced)
|
| 767 |
+
|
| 768 |
+
except Exception as e:
|
| 769 |
+
return image
|
| 770 |
+
|
| 771 |
+
def _score_tesseract_result(self, text):
|
| 772 |
+
"""Score Tesseract OCR result quality"""
|
| 773 |
+
if not text or len(text.strip()) < 2:
|
| 774 |
+
return 0.0
|
| 775 |
+
|
| 776 |
+
score = 0.0
|
| 777 |
+
words = text.split()
|
| 778 |
+
|
| 779 |
+
# Base length bonus
|
| 780 |
+
score += min(len(words) / 15.0, 0.25)
|
| 781 |
+
|
| 782 |
+
# Latin character ratio
|
| 783 |
+
latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
|
| 784 |
+
if len(text) > 0:
|
| 785 |
+
latin_ratio = latin_chars / len(text)
|
| 786 |
+
score += latin_ratio * 0.35
|
| 787 |
+
|
| 788 |
+
# Word formation bonus
|
| 789 |
+
if len(words) > 1:
|
| 790 |
+
score += 0.2
|
| 791 |
+
|
| 792 |
+
# Common Latin words bonus
|
| 793 |
+
common_latin = ['et', 'in', 'de', 'ad', 'cum', 'pro', 'per', 'ex', 'ab', 'post', 'ante', 'inter']
|
| 794 |
+
latin_matches = sum(1 for word in words if word.lower() in common_latin)
|
| 795 |
+
if latin_matches > 0:
|
| 796 |
+
score += latin_matches * 0.05
|
| 797 |
+
|
| 798 |
+
# Medieval symbols bonus
|
| 799 |
+
medieval_symbols = ['꜠', '꜡', 'ꜣ', 'ꜥ', 'ꝁ', 'ꝑ', 'ꝛ', 'ꞁ', 'ꞃ', '℞', '℟', '℣', '†', '‡', '¶', '§']
|
| 800 |
+
symbol_count = sum(1 for symbol in medieval_symbols if symbol in text)
|
| 801 |
+
if symbol_count > 0:
|
| 802 |
+
score += 0.15
|
| 803 |
+
|
| 804 |
+
# Penalize excessive garbage characters
|
| 805 |
+
garbage_chars = sum(1 for c in text if not c.isalnum() and c not in " .,;:!?()[]{}/-·&℞℟℣†‡¶§꜠꜡ꜢꜣꜤꜥꝀꝁ")
|
| 806 |
+
if len(text) > 0:
|
| 807 |
+
garbage_ratio = garbage_chars / len(text)
|
| 808 |
+
score -= garbage_ratio * 0.3
|
| 809 |
+
|
| 810 |
+
return max(0.0, min(1.0, score))
|
| 811 |
+
|
| 812 |
+
def _post_process_tesseract_text(self, text):
|
| 813 |
+
"""Post-process Tesseract OCR result"""
|
| 814 |
+
try:
|
| 815 |
+
# Clean up common OCR errors
|
| 816 |
+
corrections = {
|
| 817 |
+
'rn': 'm',
|
| 818 |
+
'cl': 'd',
|
| 819 |
+
'|': 'I',
|
| 820 |
+
'°': 'o',
|
| 821 |
+
'¢': 'c',
|
| 822 |
+
'£': 'E'
|
| 823 |
+
}
|
| 824 |
+
|
| 825 |
+
processed = text
|
| 826 |
+
for wrong, correct in corrections.items():
|
| 827 |
+
processed = processed.replace(wrong, correct)
|
| 828 |
+
|
| 829 |
+
# Normalize whitespace
|
| 830 |
+
processed = ' '.join(processed.split())
|
| 831 |
+
|
| 832 |
+
return processed
|
| 833 |
+
|
| 834 |
+
except Exception as e:
|
| 835 |
+
print(f"[WARN] Tesseract post-processing failed: {e}")
|
| 836 |
+
return text
|
| 837 |
+
|
| 838 |
+
def _validate_latin_text(self, text, style="any"):
|
| 839 |
+
"""Validate text with criteria appropriate for classical/printed or medieval Latin"""
|
| 840 |
+
if not text or len(text.strip()) < 3:
|
| 841 |
+
return False
|
| 842 |
+
|
| 843 |
+
try:
|
| 844 |
+
# Count Latin characters
|
| 845 |
+
latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
|
| 846 |
+
total_chars = len(text.replace(' ', ''))
|
| 847 |
+
|
| 848 |
+
if total_chars == 0:
|
| 849 |
+
return False
|
| 850 |
+
|
| 851 |
+
latin_ratio = latin_chars / max(total_chars, 1)
|
| 852 |
+
|
| 853 |
+
# For printed/classical Latin, we require a high ratio of standard alphabetical letters
|
| 854 |
+
if style == "printed":
|
| 855 |
+
return latin_chars >= 5 and latin_ratio >= 0.6
|
| 856 |
+
|
| 857 |
+
# For cursive/medieval Latin, we can be more generous and include medieval symbol weight
|
| 858 |
+
medieval_symbols = sum(1 for c in text if c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§[]")
|
| 859 |
+
medieval_words = ['domini', 'facimus', 'quod', 'enim', 'pro', 'cum', 'per', 'sunt']
|
| 860 |
+
word_bonus = sum(3 for word in medieval_words if word in text.lower())
|
| 861 |
+
|
| 862 |
+
total_meaningful = latin_chars + medieval_symbols + word_bonus
|
| 863 |
+
meaningful_ratio = total_meaningful / max(total_chars, 1)
|
| 864 |
+
|
| 865 |
+
if total_meaningful >= 10:
|
| 866 |
+
return True
|
| 867 |
+
elif meaningful_ratio >= 0.6:
|
| 868 |
+
return True
|
| 869 |
+
elif total_meaningful >= 5 and meaningful_ratio >= 0.3:
|
| 870 |
+
return True
|
| 871 |
+
else:
|
| 872 |
+
return False
|
| 873 |
+
|
| 874 |
+
except Exception as e:
|
| 875 |
+
print(f"[WARN] Text validation failed: {e}")
|
| 876 |
+
return len(text.strip()) >= 5 # Fallback validation
|
| 877 |
+
|
| 878 |
+
|
| 879 |
+
def process_text(self, latin_text):
    """Analyse extracted Latin text and collect statistics about it.

    Args:
        latin_text: OCR/HTR output to analyse.

    Returns:
        A dict with four keys:
          * ``text``          - the input, unchanged.
          * ``symbols``       - the input reduced to alphanumerics, medieval
                                symbols and the markers ``$ [ ]`` (a string;
                                note the empty-input result uses ``[]``).
          * ``char_analysis`` - counts and lists derived from the text.
          * ``validation``    - quality/provenance fields, partly delegated
                                to the scoring helpers on ``self``.
        Empty input yields empty containers for every key.
    """
    import re

    if not latin_text:
        return {"text": "", "symbols": [], "char_analysis": {}, "validation": {}}

    print("[INFO] Processing Latin text with medieval manuscript analysis...")

    symbol_chars = "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§"

    # Symbols including medieval markers and correction markers.
    symbols = ''.join(
        c for c in latin_text if c.isalnum() or c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§$[]"
    )

    medieval_symbols = [c for c in latin_text if c in symbol_chars]
    correction_markers = latin_text.count('[') + latin_text.count('$')

    # Expanded abbreviations, matched as WHOLE words: a substring test
    # reports 'cum' for "documents" and 'pro' for "improve".
    tokens = set(re.findall(r"[a-z]+", latin_text.lower()))
    medieval_abbreviations = ['domini', 'facimus', 'pro', 'quod', 'enim', 'cum', 'per', 'sunt', 'omnia']
    expansions_found = [word for word in medieval_abbreviations if word in tokens]

    # Capitalized words, de-duplicated in order of first appearance so the
    # output is stable from run to run (set order is not).
    unique_entities = list(dict.fromkeys(re.findall(r'\b[A-Z][a-z]+', latin_text)))

    words = latin_text.split()  # split once, reused below
    latin_letter_count = sum(
        1 for c in latin_text
        if c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz"
    )

    char_analysis = {
        "total_chars": len(latin_text),
        "alpha_chars": sum(c.isalpha() for c in latin_text),
        "unique_chars": len(set(latin_text)),
        "word_count": len(words),
        "medieval_symbols": len(medieval_symbols),
        "medieval_symbol_types": list(set(medieval_symbols)),
        "abbreviation_expansions": expansions_found,
        "expansion_count": len(expansions_found),
        "correction_markers": correction_markers,
        "capitalized_entities": unique_entities,
        "entity_count": len(unique_entities),
        "avg_word_length": sum(len(word) for word in words) / max(1, len(words)),
    }

    # Engine state is read defensively: these attributes are optional here.
    tridis_available = getattr(self, 'tridis_available', False)
    is_printed = getattr(self, 'active_style', '') == 'printed'
    default_method = (
        "TRIDIS HTR (Medieval Manuscript Specialist)" if tridis_available else "Tesseract OCR"
    )

    if is_printed:
        model_specialization = "General Latin text"
        manuscript_period = "Classical/Roman Monumental"
        text_type = "classical_inscription"
    else:
        model_specialization = (
            "13th-16th century manuscripts" if tridis_available else "General Latin text"
        )
        manuscript_period = (
            "Late Medieval (13th-16th centuries)"
            if (medieval_symbols or expansions_found)
            else "Classical/Modern"
        )
        text_type = self._determine_text_type(latin_text)

    validation = {
        "latin_ratio": latin_letter_count / max(1, len(latin_text)),
        "quality_score": self._calculate_comprehensive_quality_score(latin_text),
        "ocr_method": getattr(self, 'active_model', default_method),
        "model_specialization": model_specialization,
        "medieval_features_detected": bool(medieval_symbols or expansions_found or correction_markers),
        "tridis_used": getattr(self, 'active_model', '') == 'tridis_HTR',
        "manuscript_period": manuscript_period,
        "text_type": text_type,
        "abbreviations_expanded": len(expansions_found) > 0,
        "named_entities_detected": len(unique_entities) > 0,
        "scribal_corrections_found": correction_markers > 0,
        "confidence_level": self._determine_confidence_level(latin_text),
        "writing_style": getattr(self, 'active_style', 'cursive'),
    }

    return {
        "text": latin_text,
        "symbols": symbols,
        "char_analysis": char_analysis,
        "validation": validation,
    }
|
| 941 |
+
|
| 942 |
+
def _calculate_comprehensive_quality_score(self, text):
|
| 943 |
+
"""Calculate comprehensive quality score with medieval bonuses"""
|
| 944 |
+
if not text:
|
| 945 |
+
return 0.0
|
| 946 |
+
|
| 947 |
+
score = 0.0
|
| 948 |
+
words = text.split()
|
| 949 |
+
|
| 950 |
+
# Base metrics
|
| 951 |
+
score += min(len(words) / 15.0, 0.2) # Length bonus (max 0.2)
|
| 952 |
+
|
| 953 |
+
# Latin character ratio
|
| 954 |
+
latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
|
| 955 |
+
score += (latin_chars / max(1, len(text))) * 0.25
|
| 956 |
+
|
| 957 |
+
# TRIDIS Medieval bonuses (only if TRIDIS was used)
|
| 958 |
+
if self.tridis_available and getattr(self, 'active_model', '') == 'tridis_HTR':
|
| 959 |
+
# Expanded abbreviations (major quality indicator)
|
| 960 |
+
medieval_expansions = ['domini', 'facimus', 'pro', 'quod', 'enim', 'cum', 'per', 'sunt']
|
| 961 |
+
expansion_count = sum(1 for exp in medieval_expansions if exp in text.lower())
|
| 962 |
+
score += min(expansion_count * 0.05, 0.2) # Max 0.2 bonus
|
| 963 |
+
|
| 964 |
+
# Named entity capitalization (TRIDIS feature)
|
| 965 |
+
import re
|
| 966 |
+
caps_count = len(re.findall(r'\b[A-Z][a-z]+', text))
|
| 967 |
+
score += min(caps_count * 0.02, 0.15) # Max 0.15 bonus
|
| 968 |
+
|
| 969 |
+
# Correction markers (authenticity indicator)
|
| 970 |
+
corrections = text.count('[') + text.count('$')
|
| 971 |
+
score += min(corrections * 0.03, 0.1) # Max 0.1 bonus
|
| 972 |
+
|
| 973 |
+
# Medieval symbols (regardless of OCR method)
|
| 974 |
+
medieval_symbols = ['꜠', '꜡', 'ꜣ', 'ꜥ', 'ꝁ', 'ꝑ', 'ꝛ', 'ꞁ', 'ꞃ', '℞', '℟', '℣', '†', '‡', '¶', '§']
|
| 975 |
+
symbol_count = sum(1 for symbol in medieval_symbols if symbol in text)
|
| 976 |
+
score += min(symbol_count * 0.04, 0.15) # Max 0.15 bonus
|
| 977 |
+
|
| 978 |
+
|
| 979 |
+
# Word formation
|
| 980 |
+
if len(words) > 1:
|
| 981 |
+
score += 0.1
|
| 982 |
+
|
| 983 |
+
# Common Latin words
|
| 984 |
+
common_latin = ['et', 'in', 'de', 'ad', 'cum', 'pro', 'per', 'ex', 'ab']
|
| 985 |
+
latin_matches = sum(1 for word in words if word.lower() in common_latin)
|
| 986 |
+
score += min(latin_matches * 0.02, 0.1)
|
| 987 |
+
|
| 988 |
+
return max(0.0, min(1.0, score))
|
| 989 |
+
|
| 990 |
+
def _determine_text_type(self, text):
|
| 991 |
+
"""Determine the type of Latin text based on features"""
|
| 992 |
+
if not text:
|
| 993 |
+
return "unknown"
|
| 994 |
+
|
| 995 |
+
# Medieval indicators
|
| 996 |
+
medieval_expansions = ['domini', 'facimus', 'quod', 'enim']
|
| 997 |
+
has_expansions = any(exp in text.lower() for exp in medieval_expansions)
|
| 998 |
+
has_corrections = '[' in text or '$' in text
|
| 999 |
+
has_medieval_symbols = any(c in text for c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§")
|
| 1000 |
+
|
| 1001 |
+
if has_expansions and has_corrections:
|
| 1002 |
+
return "medieval_documentary_manuscript"
|
| 1003 |
+
elif has_expansions or has_medieval_symbols:
|
| 1004 |
+
return "medieval_manuscript"
|
| 1005 |
+
elif has_corrections:
|
| 1006 |
+
return "manuscript_with_corrections"
|
| 1007 |
+
else:
|
| 1008 |
+
return "classical_latin_text"
|
| 1009 |
+
|
| 1010 |
+
def _determine_confidence_level(self, text):
|
| 1011 |
+
"""Determine confidence level based on text characteristics"""
|
| 1012 |
+
score = self._calculate_comprehensive_quality_score(text)
|
| 1013 |
+
|
| 1014 |
+
if score >= 0.8:
|
| 1015 |
+
return "Very High"
|
| 1016 |
+
elif score >= 0.6:
|
| 1017 |
+
return "High"
|
| 1018 |
+
elif score >= 0.4:
|
| 1019 |
+
return "Medium"
|
| 1020 |
+
elif score >= 0.2:
|
| 1021 |
+
return "Low"
|
| 1022 |
+
else:
|
| 1023 |
+
return "Very Low"
|
| 1024 |
+
|
| 1025 |
+
def generate_historical_context(self, processed_result):
    """Assemble the historical-context payload for a processed Latin text.

    Args:
        processed_result: Dict as produced by ``process_text``; only its
            ``"text"`` entry is read here, the whole dict is forwarded to
            ``_build_enhanced_meaning_box``. ``None`` or a missing/``None``
            text is treated as empty text.

    Returns:
        A dict with ``uses_box`` (title + per-character notes),
        ``meaning_box`` and ``references`` (whatever
        ``rag_service.retrieve_grounding_list`` returns for the text's
        words and non-blank characters, limited to 6 results).
    """
    import re  # local import, matching the sibling methods

    processed_result = processed_result or {}
    # ``.get("text", "")`` still yields None when the key holds None.
    latin_text = processed_result.get("text") or ""

    groq_detail = self._generate_groq_context(latin_text)

    # Query terms: every word, followed by every non-blank character.
    query_terms = re.findall(r'\w+', latin_text)
    query_terms.extend(char for char in latin_text if char.strip())
    refs = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)

    return {
        "uses_box": {
            "title": "Medieval Latin manuscript analysis",
            "items": self._build_uses_list(latin_text),
        },
        "meaning_box": self._build_enhanced_meaning_box(latin_text, groq_detail, processed_result),
        "references": refs,
    }
|
| 1046 |
+
|
| 1047 |
+
def _generate_groq_context(self, latin_text):
|
| 1048 |
+
"""Generate contextual information using Groq with medieval awareness"""
|
| 1049 |
+
if not self.groq_client.is_available():
|
| 1050 |
+
return "(Groq unavailable) Historical context generation requires GROQ_API_KEY and groq package."
|
| 1051 |
+
|
| 1052 |
+
# Analyze medieval features for context
|
| 1053 |
+
has_expansions = any(word in latin_text.lower() for word in ['domini', 'facimus', 'quod', 'enim'])
|
| 1054 |
+
has_corrections = '[' in latin_text or '$' in latin_text
|
| 1055 |
+
has_caps = any(c.isupper() for c in latin_text)
|
| 1056 |
+
|
| 1057 |
+
if is_gibberish(latin_text):
|
| 1058 |
+
prompt = (
|
| 1059 |
+
"The following sequence appears to be fragmentary medieval Latin text, possibly with scribal abbreviations or corrections. "
|
| 1060 |
+
"Provide a concise, scholarly paragraph (6-10 sentences) covering possible meanings, historical context of medieval Latin manuscripts, "
|
| 1061 |
+
"common abbreviation practices, and typical documentary uses in 13th-16th century Europe."
|
| 1062 |
+
)
|
| 1063 |
+
else:
|
| 1064 |
+
context_note = ""
|
| 1065 |
+
if has_expansions:
|
| 1066 |
+
context_note += "The text contains expanded medieval abbreviations. "
|
| 1067 |
+
if has_corrections:
|
| 1068 |
+
context_note += "Scribal corrections or cancellations are present. "
|
| 1069 |
+
if has_caps:
|
| 1070 |
+
context_note += "Named entities appear to be properly capitalized. "
|
| 1071 |
+
|
| 1072 |
+
prompt = (
|
| 1073 |
+
f"Analyze this medieval Latin text: {latin_text}\n\n"
|
| 1074 |
+
f"Context: {context_note}This appears to be from a medieval manuscript (13th-16th centuries). "
|
| 1075 |
+
f"Provide a scholarly paragraph (6-10 sentences) on its historical significance, cultural context, "
|
| 1076 |
+
f"likely documentary purpose, and interpretations. Focus on medieval manuscript practices, "
|
| 1077 |
+
f"legal/administrative contexts, and paleographic significance."
|
| 1078 |
+
)
|
| 1079 |
+
|
| 1080 |
+
system_prompt = "You are a medieval Latin paleography specialist and historian. Provide accurate, concise scholarly analysis focusing on manuscript traditions, abbreviation practices, and documentary contexts of the late medieval period."
|
| 1081 |
+
enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, latin_text)
|
| 1082 |
+
|
| 1083 |
+
return self.groq_client.generate_response(
|
| 1084 |
+
system_prompt=enriched_system_prompt,
|
| 1085 |
+
user_prompt=prompt
|
| 1086 |
+
) or "(Historical context unavailable due to Groq error)"
|
| 1087 |
+
|
| 1088 |
+
def _build_uses_list(self, latin_text):
|
| 1089 |
+
"""Build enhanced list of character uses with TRIDIS context"""
|
| 1090 |
+
notes = self.references.get("latin_symbol_notes", {}) or {}
|
| 1091 |
+
default_hint = self.references.get("latin_hint",
|
| 1092 |
+
"Letters and symbols reflect phonetic values and scribal practices in medieval manuscripts.")
|
| 1093 |
+
|
| 1094 |
+
seen = set()
|
| 1095 |
+
items = []
|
| 1096 |
+
|
| 1097 |
+
# Add TRIDIS-specific information for medieval features
|
| 1098 |
+
tridis_notes = {
|
| 1099 |
+
'[': "Editorial bracket indicating scribal correction or cancellation (TRIDIS transcription standard)",
|
| 1100 |
+
'$': "Cancellation marker for struck-through text (TRIDIS notation)",
|
| 1101 |
+
}
|
| 1102 |
+
|
| 1103 |
+
for ch in latin_text:
|
| 1104 |
+
if ch in seen or not ch.strip():
|
| 1105 |
+
continue
|
| 1106 |
+
seen.add(ch)
|
| 1107 |
+
|
| 1108 |
+
# Check TRIDIS-specific notes first
|
| 1109 |
+
if ch in tridis_notes:
|
| 1110 |
+
note = tridis_notes[ch]
|
| 1111 |
+
elif ch in notes:
|
| 1112 |
+
note = notes[ch]
|
| 1113 |
+
else:
|
| 1114 |
+
note = default_hint
|
| 1115 |
+
|
| 1116 |
+
items.append(f"- {ch}: {note}")
|
| 1117 |
+
|
| 1118 |
+
if not items:
|
| 1119 |
+
items.append("- —: " + default_hint)
|
| 1120 |
+
|
| 1121 |
+
# Limit to prevent overwhelming output
|
| 1122 |
+
return items[:20]
|
| 1123 |
+
|
| 1124 |
+
def _build_enhanced_meaning_box(self, latin_text, groq_detail, processed_result):
|
| 1125 |
+
"""Build comprehensive meaning box with TRIDIS medieval analysis"""
|
| 1126 |
+
char_analysis = processed_result.get("char_analysis", {})
|
| 1127 |
+
validation = processed_result.get("validation", {})
|
| 1128 |
+
|
| 1129 |
+
# Enhanced introduction with TRIDIS context
|
| 1130 |
+
processing_method = validation.get("ocr_method", "Unknown OCR")
|
| 1131 |
+
text_type = validation.get("text_type", "unknown")
|
| 1132 |
+
confidence = validation.get("confidence_level", "Unknown")
|
| 1133 |
+
|
| 1134 |
+
intro_lines = [
|
| 1135 |
+
f"Text processed using {processing_method} with confidence level: {confidence}.",
|
| 1136 |
+
]
|
| 1137 |
+
|
| 1138 |
+
if self.tridis_available:
|
| 1139 |
+
intro_lines.extend([
|
| 1140 |
+
"TRIDIS HTR model trained on 245,000 lines of medieval manuscripts (13th-16th centuries).",
|
| 1141 |
+
"Specializes in Latin, Old French, Old Spanish documentary texts with automatic abbreviation expansion."
|
| 1142 |
+
])
|
| 1143 |
+
|
| 1144 |
+
# Medieval features summary
|
| 1145 |
+
medieval_features = []
|
| 1146 |
+
expansion_count = char_analysis.get("expansion_count", 0)
|
| 1147 |
+
if expansion_count > 0:
|
| 1148 |
+
medieval_features.append(f"{expansion_count} abbreviation expansions")
|
| 1149 |
+
|
| 1150 |
+
correction_count = char_analysis.get("correction_markers", 0)
|
| 1151 |
+
if correction_count > 0:
|
| 1152 |
+
medieval_features.append(f"{correction_count} scribal corrections")
|
| 1153 |
+
|
| 1154 |
+
entity_count = char_analysis.get("entity_count", 0)
|
| 1155 |
+
if entity_count > 0:
|
| 1156 |
+
medieval_features.append(f"{entity_count} named entities")
|
| 1157 |
+
|
| 1158 |
+
if medieval_features:
|
| 1159 |
+
intro_lines.append(f"Medieval features detected: {', '.join(medieval_features)}.")
|
| 1160 |
+
|
| 1161 |
+
# Key terms for frequent list
|
| 1162 |
+
expansions = char_analysis.get("abbreviation_expansions", [])
|
| 1163 |
+
entities = char_analysis.get("capitalized_entities", [])
|
| 1164 |
+
frequent_terms = expansions + entities
|
| 1165 |
+
|
| 1166 |
+
if not frequent_terms:
|
| 1167 |
+
frequent_terms = list(set(w for w in latin_text.split() if len(w) > 2))[:10]
|
| 1168 |
+
|
| 1169 |
+
# Enhanced analysis points
|
| 1170 |
+
points = []
|
| 1171 |
+
|
| 1172 |
+
if self.tridis_available:
|
| 1173 |
+
points.extend([
|
| 1174 |
+
"• TRIDIS HTR provides semi-diplomatic transcription following scholarly editorial standards.",
|
| 1175 |
+
"• Automatic abbreviation expansion: dom̃→domini, facimꝰ→facimus, ꝓ→pro, ⁊→et.",
|
| 1176 |
+
"• Named entity capitalization and punctuation normalization applied."
|
| 1177 |
+
])
|
| 1178 |
+
else:
|
| 1179 |
+
points.append("��� Tesseract OCR provides basic Latin character recognition with limited medieval symbol support.")
|
| 1180 |
+
|
| 1181 |
+
if correction_count > 0:
|
| 1182 |
+
points.append(f"• [{correction_count}] scribal corrections/cancellations indicate active manuscript editing process.")
|
| 1183 |
+
|
| 1184 |
+
if expansion_count > 0:
|
| 1185 |
+
expansions_list = ", ".join(char_analysis.get("abbreviation_expansions", [])[:5])
|
| 1186 |
+
points.append(f"• Expanded abbreviations suggest legal/administrative document: {expansions_list}.")
|
| 1187 |
+
|
| 1188 |
+
if validation.get("medieval_features_detected", False):
|
| 1189 |
+
manuscript_period = validation.get("manuscript_period", "Medieval")
|
| 1190 |
+
points.append(f"• {manuscript_period} characteristics indicate documentary manuscript tradition.")
|
| 1191 |
+
|
| 1192 |
+
if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
|
| 1193 |
+
points.append(f"• Historical analysis: {groq_detail.strip()}")
|
| 1194 |
+
|
| 1195 |
+
return {
|
| 1196 |
+
"title": "Medieval Latin manuscript analysis:",
|
| 1197 |
+
"intro_lines": intro_lines,
|
| 1198 |
+
"frequent_label": "Key medieval terms identified",
|
| 1199 |
+
"frequent": frequent_terms[:12],
|
| 1200 |
+
"points": points
|
| 1201 |
+
}
|
| 1202 |
+
|
| 1203 |
+
def generate_story(self, processed_result):
    """Generate a short historical narrative about the processed manuscript.

    Args:
        processed_result: Dict as produced by ``process_text``; the
            ``text``, ``char_analysis`` and ``validation`` entries are read.

    Returns:
        The narrative returned by the Groq client, or a fixed message when
        the client is unavailable or the response is empty or rejected by
        ``is_gibberish``.
    """
    import random

    latin_text = processed_result.get("text") or ""

    if not self.groq_client.is_available():
        return "Groq client unavailable, cannot generate historical narrative."

    # Features that steer the narrative.
    char_analysis = processed_result.get("char_analysis", {})
    validation = processed_result.get("validation", {})

    has_expansions = char_analysis.get("expansion_count", 0) > 0
    has_corrections = char_analysis.get("correction_markers", 0) > 0
    has_entities = char_analysis.get("entity_count", 0) > 0
    text_type = validation.get("text_type") or "unknown"
    used_tridis = validation.get("tridis_used", False)

    # Choose a narrative style family based on detected features.
    if "documentary" in text_type or has_expansions:
        styles = [
            "as a legal charter discovered in monastic archives",
            "as an administrative record from a medieval royal court",
            "as a property deed found in cathedral scriptorium",
            "as a guild register from a medieval trading city",
            "as a tax record from a 14th-century monastery",
        ]
    elif has_corrections or has_entities:
        styles = [
            "as a monk's working manuscript with personal annotations",
            "as a scholar's commentary on ancient texts",
            "as a chronicle being revised by a medieval historian",
            "as a theological treatise with scribal corrections",
            "as a copy of classical texts with medieval glosses",
        ]
    else:
        styles = [
            "as a sacred text illuminated by medieval scribes",
            "as a philosophical work from a cathedral school",
            "as a liturgical manuscript from a monastic library",
            "as a medical treatise translated in medieval Spain",
            "as an astronomical text from a medieval university",
        ]

    chosen_style = random.choice(styles)
    seed = random.randint(1000, 9999)

    processing_context = (
        "deciphered using advanced medieval manuscript AI"
        if used_tridis
        else "carefully transcribed from the original"
    )
    time_period = "13th-16th centuries" if (has_expansions or has_corrections) else "medieval period"

    # Describe only the features that were actually detected: the prompt
    # used to assert "scribal corrections" whenever expansions were present.
    detected = []
    if has_expansions:
        detected.append("expanded abbreviations")
    if has_corrections:
        detected.append("scribal corrections")
    if detected:
        feature_note = f"with {' and '.join(detected)} typical of documentary manuscripts"
    else:
        feature_note = "showing characteristics of medieval scholarly tradition"

    prompt = (
        f"This Latin manuscript text was {processing_context}: {latin_text}\n\n"
        f"Historical context: The text appears to be from the {time_period}, "
        f"{feature_note}.\n\n"
        f"Create a vivid, historically accurate narrative (250+ words) set in medieval Europe, "
        f"telling the story of this manuscript's creation and significance. "
        f"Write {chosen_style}.\n\n"
        f"Include: Medieval setting, authentic historical details, multiple characters, "
        f"the process of manuscript creation, and the document's importance to its community.\n"
        f"Narrative seed: {seed}"
    )

    system_prompt = (
        "You are a medieval historian and storyteller specializing in manuscript culture, "
        "paleography, and daily life in 13th-16th century Europe. Create authentic, "
        "engaging narratives that reflect accurate historical knowledge of medieval "
        "scriptoriums, legal practices, and scholarly traditions."
    )

    story = self.groq_client.generate_response(
        system_prompt=system_prompt,
        user_prompt=prompt,
    )

    if not story or is_gibberish(story):
        return "Failed to generate historical narrative; medieval story creation unavailable."

    return story
|
references.json
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"egypt_symbol_notes": {
|
| 3 |
+
"bow": "Warfare and hunting tool; often a phonogram (q/p/k depending on context) and a determinative for martial power, subjugation, or protection.",
|
| 4 |
+
"hoe": "Agricultural implement tied to cultivation and ritual ‘preparation’; used as a determinative and phonetic sign.",
|
| 5 |
+
"god_figure": "Anthropomorphic marker indicating divine agency; commonly a determinative for deities.",
|
| 6 |
+
"bread": "Offering loaf symbolizing food and sustenance; phonetic value 't'.",
|
| 7 |
+
"feather": "Represents the concepts of truth and balance; phonetic value 'm'.",
|
| 8 |
+
"eye": "Wedjat eye symbolizing protection and healing; phonetic for 'ir' or 'jr'.",
|
| 9 |
+
"owl": "Common phonogram 'm'; also signifies night and hidden knowledge.",
|
| 10 |
+
"reed": "Symbol for writing and speech; phonetic and determinative use.",
|
| 11 |
+
"scribe_tools": "Represents writing, record-keeping, and administration.",
|
| 12 |
+
"leg": "Indicates motion, going forth, phonetic sign.",
|
| 13 |
+
"lizard": "Represents reptiles; associated with protective symbolism.",
|
| 14 |
+
"woman_seated": "Determinative for female persons or roles.",
|
| 15 |
+
"jar": "Associated with offerings, fluids, and ritual context.",
|
| 16 |
+
"crown": "Symbol of royal authority and divine power.",
|
| 17 |
+
"man_seated": "Determinative for male persons or generic agents."
|
| 18 |
+
},
|
| 19 |
+
|
| 20 |
+
"greek_symbol_notes": {
|
| 21 |
+
"Κ": "Kappa: Tenth letter, consonant /k/ sound, equivalent to Latin C/K.",
|
| 22 |
+
"γ": "Lowercase gamma, consonant /g/ sound, or nasal /ng/ before γ, κ, χ, ξ sounds.",
|
| 23 |
+
"ι": "Lowercase iota, vowel /i/ sound, can form diphthongs with other vowels.",
|
| 24 |
+
"Ν": "Nu: Thirteenth letter, consonant /n/ sound, nasal consonant with grammatical functions.",
|
| 25 |
+
"Τ": "Tau: Nineteenth letter, consonant /t/ sound, common in grammatical endings.",
|
| 26 |
+
"ο": "Lowercase omicron, short vowel /o/ sound, frequent in grammatical morphemes.",
|
| 27 |
+
"λ": "Lowercase lambda, consonant /l/ sound, liquid consonant with metrical significance.",
|
| 28 |
+
"θ": "Lowercase theta, aspirated /tʰ/ sound, distinguishes words from similar tau forms.",
|
| 29 |
+
"υ": "Lowercase upsilon, vowel /y/ sound, forms diphthongs and appears in many endings.",
|
| 30 |
+
"Θ": "Theta: Eighth letter, aspirated /tʰ/ sound in ancient Greek, /θ/ (voiceless th) in modern.",
|
| 31 |
+
"Η": "Eta: Seventh letter, long vowel /ē/ sound in ancient Greek, /i/ in modern Greek.",
|
| 32 |
+
"ς": "Lowercase sigma (final form), consonant /s/ sound, used only at word endings.",
|
| 33 |
+
"ε": "Lowercase epsilon, short vowel /e/ sound, appears frequently in verb conjugations.",
|
| 34 |
+
"-": "Hyphen; marks word division or compound elements in Greek texts.",
|
| 35 |
+
"Ὶ": "Capital Iota with grave accent, indicates lowered pitch or unstressed position.",
|
| 36 |
+
"ῖ": "Lowercase iota with circumflex accent, indicates falling tone on long vowel /ī/.",
|
| 37 |
+
"ί": "Lowercase iota with acute accent, vowel /i/ with raised pitch indicating word stress.",
|
| 38 |
+
"Ἰ": "Capital Iota with smooth breathing, vowel /i/ without initial aspiration.",
|
| 39 |
+
"Ἑ": "Capital Epsilon with rough breathing, indicates /h/ sound before vowel.",
|
| 40 |
+
"'": "Apostrophe; indicates elision (omitted vowel) or contraction in Greek.",
|
| 41 |
+
"Π": "Pi: Sixteenth letter, consonant /p/ sound, appears in mathematical and scientific contexts.",
|
| 42 |
+
"Ο": "Omicron: Fifteenth letter, short vowel /o/ sound, distinct from omega (long o).",
|
| 43 |
+
"Μ": "Mu: Twelfth letter, consonant /m/ sound, nasal consonant often used in word formation.",
|
| 44 |
+
"[": "Opening square bracket; typically editorial reconstructions or uncertain readings.",
|
| 45 |
+
"Α": "Alpha: First letter of Greek alphabet, vowel /a/ sound, often marks beginnings or primacy.",
|
| 46 |
+
"μ": "Lowercase mu, consonant /m/ sound, nasal consonant often in prefixes and roots.",
|
| 47 |
+
"ὲ": "Lowercase epsilon with grave accent, short /e/ sound with lowered pitch.",
|
| 48 |
+
"Ᾱ": "Capital Alpha with macron (long mark), indicates long /ā/ vowel quantity.",
|
| 49 |
+
"Γ": "Gamma: Third letter, consonant /g/ sound, or /ng/ before γ, κ, χ, ξ sounds.",
|
| 50 |
+
"Υ": "Upsilon: Twentieth letter, vowel /y/ sound in ancient Greek, /i/ in modern pronunciation.",
|
| 51 |
+
"(": "Opening parenthesis; editorial or explanatory insertions.",
|
| 52 |
+
")": "Closing parenthesis; completes editorial or explanatory insertions.",
|
| 53 |
+
"ω": "Lowercase omega, long vowel /ō/ sound, often in verb endings and declensions.",
|
| 54 |
+
"ῑ": "Lowercase iota with macron, explicitly marks long vowel quantity /ī/.",
|
| 55 |
+
"·": "Middle dot (Greek semicolon); equivalent to modern semicolon, marks major pause.",
|
| 56 |
+
"ῐ": "Lowercase iota with breve, explicitly marks short vowel quantity /ĭ/.",
|
| 57 |
+
"Ξ": "Xi: Fourteenth letter, consonant cluster /ks/ sound, compound sound written as single letter.",
|
| 58 |
+
"ν": "Lowercase nu, consonant /n/ sound, assimilates before consonants in pronunciation.",
|
| 59 |
+
"Ε": "Epsilon: Fifth letter, short vowel /e/ sound, distinct from eta (long e).",
|
| 60 |
+
"η": "Lowercase eta, long vowel /ē/ sound in ancient Greek, /i/ in modern pronunciation.",
|
| 61 |
+
"]": "Closing square bracket; completes editorial reconstructions.",
|
| 62 |
+
"Ι": "Iota: Ninth letter, vowel /i/ sound, can form diphthongs with other vowels.",
|
| 63 |
+
"κ": "Lowercase kappa, consonant /k/ sound, common in word formation and inflection.",
|
| 64 |
+
"1": "Numeral one; manuscript numbering, line numbers, or verse citations.",
|
| 65 |
+
"ῃ": "Lowercase eta with iota subscript, indicates original diphthong /ēi/ sound.",
|
| 66 |
+
"ψ": "Lowercase psi, consonant cluster /ps/ sound, compound phoneme as single letter.",
|
| 67 |
+
"ἢ": "Lowercase eta with rough breathing and grave accent, aspirated long vowel with lowered tone.",
|
| 68 |
+
"Ὗ": "Capital Upsilon with rough breathing and circumflex, indicates aspiration and falling tone.",
|
| 69 |
+
"Ἱ": "Capital Iota with rough breathing, vowel /i/ with initial aspiration /h/.",
|
| 70 |
+
"Ᾰ": "Capital Alpha with breve (short mark), indicates short /ă/ vowel quantity.",
|
| 71 |
+
"Ί": "Capital Iota with acute accent, indicates raised pitch or primary word stress.",
|
| 72 |
+
"Λ": "Lambda: Eleventh letter, consonant /l/ sound, liquid consonant in Greek phonology.",
|
| 73 |
+
"\"": "Quotation mark; marks direct speech or citations in Greek texts.",
|
| 74 |
+
"σ": "Lowercase sigma (medial form), consonant /s/ sound, used within words.",
|
| 75 |
+
"Ἡ": "Capital Eta with rough breathing, long /ē/ sound with initial aspiration /h/.",
|
| 76 |
+
"Χ": "Chi: Twenty-second letter, aspirated /kʰ/ sound in ancient Greek, /x/ (voiceless velar fricative) in modern.",
|
| 77 |
+
"ζ": "Lowercase zeta, consonant cluster /zd/ sound, represents compound phoneme.",
|
| 78 |
+
"Ἷ": "Capital Iota with rough breathing and circumflex accent, complex tonal marking.",
|
| 79 |
+
"ὶ": "Lowercase iota with grave accent, vowel /i/ with lowered pitch or unstressed.",
|
| 80 |
+
"ἰ": "Lowercase iota with smooth breathing, vowel /i/ without initial aspiration.",
|
| 81 |
+
"α": "Lowercase alpha, vowel /a/ sound, fundamental vowel in Greek phonology.",
|
| 82 |
+
",": "Comma; punctuation for pauses, lists, or grammatical separation.",
|
| 83 |
+
"ᾗ": "Lowercase eta with rough breathing, circumflex accent, and iota subscript, complex phonetic marking.",
|
| 84 |
+
"τ": "Lowercase tau, consonant /t/ sound, appears in many grammatical suffixes.",
|
| 85 |
+
"<": "Less-than symbol; editorial mark for textual corrections or variants.",
|
| 86 |
+
"Σ": "Sigma: Eighteenth letter, consonant /s/ sound, has special final form (ς) at word end.",
|
| 87 |
+
"ρ": "Lowercase rho, consonant /r/ sound, when word-initial requires rough breathing mark.",
|
| 88 |
+
"ἡ": "Lowercase eta with rough breathing, long /ē/ sound with initial /h/.",
|
| 89 |
+
"Ω": "Omega: Twenty-fourth letter, long vowel /ō/ sound in ancient Greek, /o/ in modern.",
|
| 90 |
+
".": "Period (full stop); marks end of sentences in Greek texts.",
|
| 91 |
+
"Ῥ": "Capital Rho with rough breathing, indicates /hr/ sound at word beginning (all word-initial rhos are aspirated).",
|
| 92 |
+
"ἕ": "Lowercase epsilon with rough breathing and acute accent, aspirated short vowel with raised tone.",
|
| 93 |
+
"ῆ": "Lowercase eta with circumflex accent, falling tone on long vowel /ē/.",
|
| 94 |
+
"Ἶ": "Capital Iota with smooth breathing and circumflex accent, vowel /ī/ with falling tone, no initial aspiration.",
|
| 95 |
+
"β": "Lowercase beta, consonant /b/ sound in ancient Greek, /v/ sound in modern Greek pronunciation.",
|
| 96 |
+
"Ὁ": "Capital Omicron with rough breathing, short /o/ sound with initial aspiration /h/.",
|
| 97 |
+
"Ϊ": "Capital Iota with diaeresis (trema), indicates /i/ vowel pronounced separately, not as diphthong.",
|
| 98 |
+
"Φ": "Phi: Twenty-first letter, aspirated /pʰ/ sound in ancient Greek, /f/ in modern pronunciation.",
|
| 99 |
+
"ῗ": "Lowercase iota with diaeresis and circumflex, /ī/ vowel with falling tone, pronounced separately.",
|
| 100 |
+
"έ": "Lowercase epsilon with acute accent (modern Greek), short /e/ sound with stress marking.",
|
| 101 |
+
"ἷ": "Lowercase iota with rough breathing and circumflex accent, aspirated /ī/ vowel with falling tone."
|
| 102 |
+
|
| 103 |
+
}
|
| 104 |
+
,
|
| 105 |
+
"latin_symbol_notes": {
|
| 106 |
+
"꜠": "Modifier letter for stress and high tone, used in phonetic transcription and transliteration.",
|
| 107 |
+
"꜡": "Modifier letter for stress and low tone, common in linguistic notation.",
|
| 108 |
+
"Ꜣ": "Capital Letter Egyptological Alef, used in transliterating Egyptian hieroglyphs.",
|
| 109 |
+
"ꜣ": "Small Letter Egyptological Alef, counterpart to capital version.",
|
| 110 |
+
"Ꜥ": "Capital Letter Egyptological Ain, reflecting voiced pharyngeal sounds in transliteration.",
|
| 111 |
+
"ꜥ": "Small Letter Egyptological Ain, used in Semitic transliterations.",
|
| 112 |
+
"Ꝁ": "Capital Letter K with Stroke, scribal abbreviation mark for legal or medieval texts.",
|
| 113 |
+
"ꝁ": "Small Letter K with Stroke, similar abbreviation symbol.",
|
| 114 |
+
"ꝑ": "Small Letter P with Stroke Through Descender, abbreviation of 'per' in medieval Latin manuscripts.",
|
| 115 |
+
"ꝛ": "Small Letter R Rotunda, a stylistic medieval form of 'r' to save space.",
|
| 116 |
+
"Ꞁ": "Capital Letter Turned L, used in paleography to denote variant forms.",
|
| 117 |
+
"ꞁ": "Small Letter Turned L, lowercase variant in medieval scripts.",
|
| 118 |
+
"Ꞃ": "Capital Letter Insular R, found in Insular script manuscripts in medieval Britain and Ireland.",
|
| 119 |
+
"Ꝼ": "Capital Letter Insular F, distinct letter in Celtic Insular manuscripts.",
|
| 120 |
+
"ꟽ": "Epigraphic Letter Inverted M, used as a logogram for 'mulier' or 'matrona' in Roman inscriptions.",
|
| 121 |
+
"ꟿ": "Epigraphic Letter Archaic M, represents the praenomen 'Manius' in inscriptions."
|
| 122 |
+
},"cuneiform_symbol_notes": {
|
| 123 |
+
"𒀀": "Cuneiform sign A: vowel sound /a/ in Sumerian and Akkadian, fundamental vowel marker",
|
| 124 |
+
"𒀭": "Cuneiform sign AN/DINGIR: divine determinative, heaven, god concept in religious texts",
|
| 125 |
+
"𒈗": "Cuneiform sign LUGAL: king, ruler, sovereign used in royal inscriptions and titles",
|
| 126 |
+
"𒊕": "Cuneiform sign UD: day, sun, light, time marker in calendrical and chronological contexts",
|
| 127 |
+
"𒄿": "Cuneiform sign I: vowel /i/, often used in verbal forms and grammatical particles",
|
| 128 |
+
"𒂍": "Cuneiform sign E: house, temple, building in architectural and religious contexts",
|
| 129 |
+
"𒀸": "Cuneiform sign ARAD: servant, slave, worker in administrative and legal documents",
|
| 130 |
+
"𒁹": "Cuneiform sign DIRIG: to exceed, surplus, extra in mathematical and accounting texts",
|
| 131 |
+
"𒉋": "Cuneiform sign TI: life, to live, arrow in medical, military, and philosophical contexts",
|
| 132 |
+
"𒆠": "Cuneiform sign KI: earth, place, land in geographical and territorial designations",
|
| 133 |
+
"𒌓": "Cuneiform sign ZU: to know, knowledge, wisdom in educational and scribal contexts",
|
| 134 |
+
"𒈨": "Cuneiform sign ME: divine powers, cultural practices in mythological and religious texts",
|
| 135 |
+
"𒉿": "Cuneiform sign TUKU: to have, possess, hold in commercial and legal transactions",
|
| 136 |
+
"𒄩": "Cuneiform sign HA: fish, to catch in texts about fishing, food, and economy",
|
| 137 |
+
"𒁇": "Cuneiform sign DU: to go, walk, build in construction, travel, and action contexts",
|
| 138 |
+
"lugal": "ATF: lugal - Sumerian/Akkadian for 'king', royal title in administrative texts",
|
| 139 |
+
"an": "ATF: an - Sky god An/Anu, heaven concept in religious and mythological contexts",
|
| 140 |
+
"ki": "ATF: ki - Earth, place, land in geographical and cosmological descriptions",
|
| 141 |
+
"dingir": "ATF: dingir - God, divine being, deity in religious and ceremonial texts",
|
| 142 |
+
"sar": "ATF: sar - To write, inscription, totality in scribal and administrative contexts",
|
| 143 |
+
"{d}": "ATF: determinative for divine names, indicates following word refers to a deity"
|
| 144 |
+
},
|
| 145 |
+
"cuneiform_hint": "Cuneiform signs represent syllables, words, or concepts in ancient Mesopotamian languages (Sumerian, Akkadian, Hittite, etc.). ATF format uses Latin transliteration of cuneiform symbols.",
|
| 146 |
+
"greek_hint": "If no specific character note is found, treat as lexical marker considering diacriticals (breathing marks, accents, vowel quantity) which affect pronunciation, meaning, and grammatical function in ancient Greek texts.",
|
| 147 |
+
"latin_hint": "Letters and symbols reflect phonetic values and scribal practices in manuscripts."
|
| 148 |
+
}
|
| 149 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# DecipherAI Backend — Production Dependencies (Hugging Face Spaces)
|
| 2 |
+
# CPU-only torch for HF Spaces free tier
|
| 3 |
+
|
| 4 |
+
# Web framework
|
| 5 |
+
Flask==3.1.3
|
| 6 |
+
flask-cors==6.0.2
|
| 7 |
+
python-dotenv==1.2.2
|
| 8 |
+
|
| 9 |
+
# AI / ML
|
| 10 |
+
groq==1.2.0
|
| 11 |
+
transformers==5.9.0
|
| 12 |
+
safetensors==0.7.0
|
| 13 |
+
|
| 14 |
+
# PyTorch CPU-only (HF Spaces free tier does not have GPU)
|
| 15 |
+
--extra-index-url https://download.pytorch.org/whl/cpu
|
| 16 |
+
torch==2.12.0+cpu
|
| 17 |
+
torchvision==0.27.0+cpu
|
| 18 |
+
|
| 19 |
+
# Image processing
|
| 20 |
+
opencv-python-headless==4.13.0.92
|
| 21 |
+
pillow==12.2.0
|
| 22 |
+
pytesseract==0.3.13
|
| 23 |
+
|
| 24 |
+
# Utilities
|
| 25 |
+
numpy==2.4.4
|
| 26 |
+
regex==2026.5.9
|
| 27 |
+
tqdm==4.67.3
|
| 28 |
+
|
| 29 |
+
# Production WSGI server
|
| 30 |
+
gunicorn==23.0.0
|
services/__init__.py
ADDED
|
File without changes
|
services/context_generator.py
ADDED
|
File without changes
|
services/groq_vision_classifier.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from io import BytesIO
|
| 5 |
+
from PIL import Image
|
| 6 |
+
from groq import Groq
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class GroqVisionScriptClassifier:
    """Classify the ancient script shown in an image with a Groq-hosted vision model.

    The image is downscaled, JPEG-encoded and embedded as base64 in a chat
    completion request. The model is asked to answer with a JSON object naming
    one of the supported scripts (see ``get_supported_scripts``). The
    classification methods report failure by returning ``"unknown"`` instead
    of raising.
    """

    # Longest image edge (in pixels) sent to the API; larger images are downscaled.
    _MAX_DIMENSION = 1200
    # Base64 payload length above which the image is re-encoded at lower quality.
    _MAX_BASE64_CHARS = 4 * 1024 * 1024
    # Modes that are sent as-is; anything else (RGBA, P, LA, ...) is converted
    # to RGB first because the JPEG encoder rejects alpha/palette images.
    _JPEG_SAFE_MODES = ("RGB", "L")

    # Exact labels requested from the model, mapped to the internal script names.
    _LABEL_TO_SCRIPT = {
        "EGYPTIAN": "egyptian",
        "GREEK": "greek",
        "LATIN": "latin",
        "CUNEIFORM": "cuneiform",
    }
    # Keywords used by the free-text fallback; checked before the other scripts.
    _CUNEIFORM_KEYWORDS = (
        "CUNEIFORM", "WEDGE", "CLAY", "MESOPOTAMIAN", "AKKADIAN", "SUMERIAN", "BABYLONIAN",
    )

    def __init__(self, groq_api_key):
        """Create the Groq client and select the vision model.

        Args:
            groq_api_key: API key passed to the ``Groq`` client.
        """
        self.groq_client = Groq(api_key=groq_api_key)
        self.vision_model = "meta-llama/llama-4-scout-17b-16e-instruct"
        print(f"[INFO] Groq Vision Classifier initialized with {self.vision_model}")

    def classify_script(self, image_path):
        """Classify the script in the image at ``image_path``.

        Returns:
            One of the names from ``get_supported_scripts`` or ``"unknown"``
            when encoding, the API call, or response parsing fails.
        """
        try:
            base64_image = self._image_to_base64(image_path)
            if not base64_image:
                return "unknown"

            response = self._query_groq_vision(base64_image)
            script_type = self._parse_classification_response(response)

            print(f"[INFO] Llama Vision classified script as: {script_type}")
            return script_type.lower()

        except Exception as e:
            print(f"[ERROR] Groq Vision script classification failed: {e}")
            return "unknown"

    @staticmethod
    def _encode_jpeg(image, quality):
        """Return ``image`` encoded as JPEG at ``quality``, as a base64 string."""
        buffer = BytesIO()
        image.save(buffer, format="JPEG", quality=quality)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

    def _image_to_base64(self, image_path):
        """Load an image and return it as a base64-encoded JPEG string.

        The image is converted to RGB when its mode cannot be written as JPEG,
        shrunk so that its longest edge is at most ``_MAX_DIMENSION`` pixels,
        and re-encoded once at lower quality if the payload is still too big.

        Returns:
            The base64 string, or ``None`` if the image cannot be read or encoded.
        """
        try:
            # The context manager releases the underlying file handle.
            with Image.open(image_path) as source:
                # Convert before resizing: JPEG cannot store alpha/palette
                # data, so without this PNGs with transparency fail to encode.
                if source.mode in self._JPEG_SAFE_MODES:
                    image = source
                else:
                    image = source.convert("RGB")

                if max(image.size) > self._MAX_DIMENSION:
                    image.thumbnail(
                        (self._MAX_DIMENSION, self._MAX_DIMENSION),
                        Image.Resampling.LANCZOS,
                    )

                image_b64 = self._encode_jpeg(image, quality=90)
                if len(image_b64) > self._MAX_BASE64_CHARS:
                    # Payload still too large: trade quality for size.
                    image_b64 = self._encode_jpeg(image, quality=70)

            return image_b64

        except Exception as e:
            print(f"[ERROR] Image to base64 conversion failed: {e}")
            return None

    def _query_groq_vision(self, base64_image):
        """Send the encoded image and the classification prompt to the vision model.

        Args:
            base64_image: Base64-encoded JPEG data (without the data-URL prefix).

        Returns:
            The raw message content of the first completion choice, or ``None``
            if the API call fails.
        """
        try:
            prompt = """Analyze this image of ancient text/script as an expert paleographer.

Classify it as ONE of these ancient script types:

- EGYPTIAN: Hieroglyphic symbols (birds, eyes, human figures, cartouches)
- GREEK: Ancient/medieval Greek alphabet (α,β,γ,δ,ε,ζ,η,θ) with diacritics
- LATIN: Latin alphabet letters, Roman inscriptions, medieval manuscripts
- CUNEIFORM: Wedge-shaped impressions on clay tablets (triangular marks)

IMPORTANT: Cuneiform has geometric wedge patterns, NOT pictures like hieroglyphs.

Respond ONLY with JSON:
{"classification": "EGYPTIAN" or "GREEK" or "LATIN" or "CUNEIFORM", "confidence": 0.0-1.0}"""

            completion = self.groq_client.chat.completions.create(
                model=self.vision_model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                temperature=0.1,  # Low temperature for consistent classification
                max_completion_tokens=100,  # The expected answer is a tiny JSON object
                top_p=0.9,
                stream=False,
                response_format={"type": "json_object"}
            )

            return completion.choices[0].message.content

        except Exception as e:
            print(f"[ERROR] Groq Vision API call failed: {e}")
            return None

    @staticmethod
    def _log_confidence(raw_confidence):
        """Print the model's confidence without failing on non-numeric values."""
        try:
            print(f"[INFO] Vision model confidence: {float(raw_confidence):.3f}")
        except (TypeError, ValueError):
            print(f"[WARN] Vision model returned non-numeric confidence: {raw_confidence!r}")

    def _classify_from_text(self, response):
        """Guess the script from keywords in a response that is not usable JSON.

        Cuneiform keywords are checked first, then Egyptian, Greek and Latin.
        Returns ``"unknown"`` when no keyword matches.
        """
        response_upper = str(response).strip().upper()

        if any(keyword in response_upper for keyword in self._CUNEIFORM_KEYWORDS):
            return "cuneiform"
        if "EGYPTIAN" in response_upper or "HIEROGLYPH" in response_upper:
            return "egyptian"
        if "GREEK" in response_upper:
            return "greek"
        if "LATIN" in response_upper or "ROMAN" in response_upper:
            return "latin"
        return "unknown"

    def _parse_classification_response(self, response):
        """Turn the model's raw answer into a script name.

        A JSON object with a ``classification`` field is expected. A malformed
        ``confidence`` value is only logged and never discards an otherwise
        valid classification. Answers that are not a JSON object fall back to
        keyword matching on the raw text.

        Returns:
            ``"egyptian"``, ``"greek"``, ``"latin"``, ``"cuneiform"`` or ``"unknown"``.
        """
        if not response:
            return "unknown"

        try:
            try:
                data = json.loads(response)
            except (json.JSONDecodeError, TypeError):
                print(f"[WARN] Failed to parse JSON response, trying text parsing: {response}")
                return self._classify_from_text(response)

            if not isinstance(data, dict):
                # Valid JSON but not an object (e.g. a bare string or list).
                print(f"[WARN] Failed to parse JSON response, trying text parsing: {response}")
                return self._classify_from_text(response)

            raw_label = data.get('classification')
            classification = raw_label.strip().upper() if isinstance(raw_label, str) else ""
            self._log_confidence(data.get('confidence', 0.0))

            script = self._LABEL_TO_SCRIPT.get(classification)
            if script is None:
                print(f"[WARN] Unknown classification: {classification}")
                return "unknown"
            return script

        except Exception as e:
            print(f"[ERROR] Response parsing failed: {e}")

        return "unknown"

    def classify_with_fallback(self, image_path, max_retries=2):
        """Classify the image, retrying while the result is ``"unknown"``.

        Args:
            image_path: Path of the image to classify.
            max_retries: Number of additional attempts after the first one.

        Returns:
            The first result that is not ``"unknown"``, or ``"unknown"`` once
            all attempts are used up.
        """
        for attempt in range(max_retries + 1):
            is_last_attempt = attempt >= max_retries
            try:
                result = self.classify_script(image_path)
            except Exception as e:
                if is_last_attempt:
                    print(f"[ERROR] All classification attempts failed: {e}")
                    return "unknown"
                print(f"[WARN] Classification attempt {attempt + 1} failed: {e}, retrying...")
                continue

            if result != "unknown":
                return result
            if is_last_attempt:
                print(f"[WARN] All classification attempts returned unknown")
                return "unknown"
            print(f"[INFO] Classification attempt {attempt + 1} returned unknown, retrying...")

        # Only reached when max_retries is negative and no attempt was made.
        return "unknown"

    def get_supported_scripts(self):
        """Return the list of script names this classifier can produce."""
        return ["egyptian", "greek", "latin", "cuneiform"]

    def validate_classification(self, script_type, confidence_threshold=0.7):
        """Return ``True`` if ``script_type`` is one of the supported scripts.

        ``confidence_threshold`` is accepted for interface compatibility but is
        not used: any supported script name is treated as valid.
        """
        supported_scripts = self.get_supported_scripts()

        if script_type not in supported_scripts:
            print(f"[WARN] Unsupported script type: {script_type}")
            return False

        return True

    def get_model_info(self):
        """Return a dict describing the model, provider, scripts and features."""
        return {
            "model": self.vision_model,
            "provider": "Groq",
            "supported_scripts": self.get_supported_scripts(),
            "features": [
                "Ancient script classification",
                "Multi-script support",
                "Cuneiform wedge detection",
                "Clay tablet recognition",
                "High-resolution image processing"
            ]
        }

    def debug_classification(self, image_path, save_debug_info=False):
        """Run a classification while printing every intermediate value.

        Args:
            image_path: Path of the image to classify.
            save_debug_info: When true, also write the collected values to a
                ``debug_classification_*.json`` file in the working directory.

        Returns:
            The script name, or ``"unknown"`` on any failure.
        """
        try:
            print(f"[DEBUG] Starting classification for: {image_path}")

            # Read the properties and release the file handle right away.
            with Image.open(image_path) as image:
                image_size = image.size
                image_mode = image.mode
            print(f"[DEBUG] Image size: {image_size}")
            print(f"[DEBUG] Image mode: {image_mode}")

            base64_image = self._image_to_base64(image_path)
            if base64_image:
                print(f"[DEBUG] Base64 size: {len(base64_image)} characters")
                response = self._query_groq_vision(base64_image)
            else:
                # Nothing to send; avoid an API request that can only fail.
                response = None
            print(f"[DEBUG] Raw API response: {response}")

            result = self._parse_classification_response(response)
            print(f"[DEBUG] Final classification: {result}")

            if save_debug_info:
                debug_info = {
                    "image_path": image_path,
                    "image_size": image_size,
                    "base64_length": len(base64_image) if base64_image else 0,
                    "raw_response": response,
                    "classification": result
                }

                # NOTE: str hashes are salted per process, so the numeric
                # suffix is not stable between runs.
                debug_file = f"debug_classification_{result}_{hash(image_path) % 10000}.json"
                with open(debug_file, 'w', encoding='utf-8') as f:
                    json.dump(debug_info, f, indent=2)
                print(f"[DEBUG] Debug info saved to: {debug_file}")

            return result

        except Exception as e:
            print(f"[ERROR] Debug classification failed: {e}")
            return "unknown"
|
services/layout_parser.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import numpy as np
|
| 3 |
+
from PIL import Image
|
| 4 |
+
from typing import List, Dict, Tuple
|
| 5 |
+
|
| 6 |
+
class LayoutParser:
    """OpenCV-based layout analysis for document images.

    Detects text-line bounding boxes, groups them into columns, crops the
    lines in reading order, and guesses whether Latin text is printed or
    cursive.
    """

    def __init__(self):
        pass

    def analyze_layout(self, image_path: str) -> Dict:
        """Detect text lines in the image and group them into columns.

        Args:
            image_path: Path of the document image.

        Returns:
            A dict with ``width``, ``height``, ``column_count`` and ``columns``
            (the output of ``_group_lines_into_columns``). On any failure a
            zero-size layout with ``column_count`` 1 and no columns is returned.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                raise FileNotFoundError(f"Image not found: {image_path}")

            h_img, w_img = img.shape[:2]
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # Step 1: denoise with a Gaussian blur, then binarize with Otsu's
            # threshold (inverted, so ink becomes white foreground).
            blur = cv2.GaussianBlur(gray, (5, 5), 0)
            _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

            # Step 2: dilate with a wide, flat kernel so the words of one text
            # line merge into a single horizontal blob.
            line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 3))
            dilated = cv2.dilate(thresh, line_kernel, iterations=2)

            # Step 3: every external contour is a candidate text line.
            contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            lines = []
            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)

                # Drop specks of noise ...
                if w < 15 or h < 5:
                    continue
                # ... and blobs that span almost the entire page.
                if w > w_img * 0.98 or h > h_img * 0.98:
                    continue

                lines.append({
                    "box": (x, y, w, h),
                    "area": w * h
                })

            # Order by top coordinate; column grouping restores left-to-right order.
            lines = sorted(lines, key=lambda l: l["box"][1])

            columns = self._group_lines_into_columns(lines, w_img)

            structured_layout = {
                "width": w_img,
                "height": h_img,
                "column_count": len(columns),
                "columns": columns
            }

            print(f"[INFO] Layout parsing complete. Detected {len(columns)} text columns.")
            return structured_layout

        except Exception as e:
            print(f"[ERROR] Layout parsing failed: {e}")
            return {"width": 0, "height": 0, "column_count": 1, "columns": []}

    def _group_lines_into_columns(self, lines: List[Dict], page_width: int) -> List[Dict]:
        """Group text-line boxes into columns using a horizontal projection.

        A column is a horizontal run of pixels covered by at least two line
        boxes. Runs narrower than 10% of the page width are discarded,
        including a run that touches the right page edge. If no run
        qualifies, the whole page width is treated as a single column.

        Args:
            lines: Dicts with a ``"box"`` entry of ``(x, y, w, h)``.
            page_width: Width of the page in pixels.

        Returns:
            One dict per column with ``"x_range"`` ``(start, end)`` and
            ``"lines"``, the ``(x, y, w, h)`` boxes assigned to the column
            sorted by ``y``. An empty list when ``lines`` is empty.
        """
        if not lines:
            return []

        # Count, for every x position, how many line boxes cover it.
        hist = np.zeros(page_width, dtype=np.int32)
        for line in lines:
            x, _, w, _ = line["box"]
            hist[x:x+w] += 1

        min_col_width = int(page_width * 0.1)
        columns_x = []
        in_col = False
        start_x = 0

        for x, val in enumerate(hist):
            if val > 1 and not in_col:
                in_col = True
                start_x = x
            elif val <= 1 and in_col:
                in_col = False
                if (x - start_x) >= min_col_width:
                    columns_x.append((start_x, x))

        # A run that reaches the right edge is closed here; it must pass the
        # same minimum-width filter as interior runs so that a thin sliver at
        # the page edge is not promoted to a column of its own.
        if in_col and (page_width - start_x) >= min_col_width:
            columns_x.append((start_x, page_width))

        if not columns_x:
            columns_x = [(0, page_width)]

        cols_data = [{"x_range": rx, "lines": []} for rx in columns_x]

        for line in lines:
            x, y, w, h = line["box"]
            line_center_x = x + w / 2

            # Prefer the column containing the line's centre; otherwise take
            # the column whose nearest edge is closest to the centre.
            best_idx = 0
            min_dist = page_width
            for idx, col in enumerate(cols_data):
                cx_start, cx_end = col["x_range"]
                if cx_start <= line_center_x <= cx_end:
                    best_idx = idx
                    break
                dist = min(abs(line_center_x - cx_start), abs(line_center_x - cx_end))
                if dist < min_dist:
                    min_dist = dist
                    best_idx = idx

            cols_data[best_idx]["lines"].append((x, y, w, h))

        # Reading order inside a column is top to bottom.
        for col in cols_data:
            col["lines"] = sorted(col["lines"], key=lambda box: box[1])

        return cols_data

    def crop_lines(self, image_path: str, layout: Dict) -> "List[Image.Image]":
        """Crop every detected text line out of the image, in reading order.

        Args:
            image_path: Path of the document image.
            layout: A layout dict as returned by ``analyze_layout``.

        Returns:
            RGB PIL images, column by column and top to bottom within each
            column. An empty list if the image cannot be read or cropping fails.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                return []

            crops = []
            h_img, w_img = img.shape[:2]

            for col in layout.get("columns", []):
                for (x, y, w, h) in col["lines"]:
                    # Small margin around the line, clamped to the image bounds.
                    pad_y = int(h * 0.1) + 2
                    pad_x = int(w * 0.05) + 2

                    y0 = max(0, y - pad_y)
                    y1 = min(h_img, y + h + pad_y)
                    x0 = max(0, x - pad_x)
                    x1 = min(w_img, x + w + pad_x)

                    crop = img[y0:y1, x0:x1]
                    if crop.size > 0:
                        crops.append(Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)))

            return crops
        except Exception as e:
            print(f"[ERROR] Failed to crop layout lines: {e}")
            return []

    def detect_writing_style(self, image_path: str, clip_classifier=None) -> str:
        """Guess whether the image shows ``"printed"`` or ``"cursive"`` Latin text.

        A CLIP classifier is used when one is supplied with a loaded model and
        processor. If it is absent or fails, a connected-component heuristic is
        used: an average width/height ratio below 1.3 is taken as printed.

        Args:
            image_path: Path of the document image.
            clip_classifier: Optional object exposing ``model``, ``processor``
                and ``device`` attributes.

        Returns:
            ``"printed"`` or ``"cursive"``; ``"cursive"`` is the default
            whenever detection is not possible.
        """
        try:
            # 1. Zero-shot classification with CLIP, if available.
            if clip_classifier and clip_classifier.model and clip_classifier.processor:
                try:
                    image = Image.open(image_path).convert("RGB")

                    styles = ["printed", "cursive"]
                    descriptions = [
                        "classical printed Latin text or carved Roman stone monumental inscription with clean block capital letters",
                        "medieval handwritten Latin manuscript text written in ink on parchment with cursive handwriting"
                    ]

                    inputs = clip_classifier.processor(
                        text=descriptions,
                        images=image,
                        return_tensors="pt",
                        padding=True
                    ).to(clip_classifier.device)

                    # Imported lazily so torch is only loaded on the CLIP path.
                    import torch
                    with torch.no_grad():
                        outputs = clip_classifier.model(**inputs)
                        logits_per_image = outputs.logits_per_image
                        probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]

                    best_idx = np.argmax(probs)
                    style_label = styles[best_idx]
                    confidence = float(probs[best_idx])
                    print(f"[INFO] CLIP Latin style classification: {style_label} ({confidence:.3f})")
                    return style_label
                except Exception as e:
                    print(f"[WARN] CLIP Latin style detection failed: {e}. Falling back to heuristics.")

            # 2. Fallback: shape statistics of connected components.
            print("[INFO] Running computer vision heuristics for Latin style detection...")
            img = cv2.imread(image_path)
            if img is None:
                return "cursive"  # Safe default

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            blur = cv2.GaussianBlur(gray, (5, 5), 0)
            _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

            # No dilation here: components should stay at character level.
            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            if not contours:
                return "cursive"

            aspect_ratios = []
            widths = []

            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)
                # Filter noise
                if w < 5 or h < 5:
                    continue
                aspect_ratios.append(w / h)
                widths.append(w)

            if not aspect_ratios:
                return "cursive"

            avg_aspect_ratio = np.mean(aspect_ratios)
            median_width = np.median(widths)

            # Separate printed glyphs are roughly as wide as they are tall,
            # while joined cursive strokes form wider components.
            print(f"[DEBUG] Layout heuristics - connected components: {len(aspect_ratios)}, avg aspect ratio: {avg_aspect_ratio:.3f}, median width: {median_width:.1f}")

            if avg_aspect_ratio < 1.3:
                return "printed"
            else:
                return "cursive"

        except Exception as e:
            print(f"[WARN] Latin style detection failed completely: {e}. Defaulting to cursive.")
            return "cursive"
|
| 244 |
+
|
services/rag_service.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
from typing import Dict, List, Tuple
|
| 5 |
+
from config import Config
|
| 6 |
+
|
| 7 |
+
class RAGService:
    """Keyword retrieval over a JSON corpus of sign and abbreviation notes.

    The corpus is a flat list of records, each a dict with the keys
    ``category``, ``term``, ``definition`` and ``citation``. Records are
    built from the per-script sections of the references JSON file and are
    searched with a simple additive-free scoring scheme (see
    ``_score_record``).
    """

    # (JSON section key, category label, citation) for every indexed script.
    _SECTIONS = (
        (
            "egypt_symbol_notes",
            "Egyptian Hieroglyphic Sign",
            "Gardiner, A. H. (1957). Egyptian Grammar: Being an Introduction to the Study of Hieroglyphs. Oxford: Griffith Institute.",
        ),
        (
            "greek_symbol_notes",
            "Greek Paleography Mark",
            "Thompson, E. M. (1912). An Introduction to Greek and Latin Palaeography. Oxford: Clarendon Press.",
        ),
        (
            "latin_symbol_notes",
            "Latin Scribal Abbreviation",
            "Cappelli, A. (1928). Dizionario di Abbreviature Latine ed Italiane. Milan: Hoepli.",
        ),
        (
            "cuneiform_symbol_notes",
            "Mesopotamian Cuneiform Logogram",
            "Borger, R. (2004). Mesopotamisches Zeichenlexikon. Münster: Ugarit-Verlag.",
        ),
    )

    def __init__(self, references_path: str = None):
        """Create the service and index the reference file immediately.

        Args:
            references_path: Path to the references JSON file. When omitted,
                ``Config().REFERENCES_PATH`` is used.
        """
        self.config = Config()
        self.references_path = references_path or str(self.config.REFERENCES_PATH)
        self.corpus = []
        self.load_corpus()

    def load_corpus(self):
        """Load and index the historical reference document corpus.

        Every ``term -> note`` pair of each known section is appended to
        ``self.corpus``. Loading is best-effort: a missing file or any
        error while reading/parsing is reported on stdout and not raised.
        """
        try:
            if os.path.exists(self.references_path):
                with open(self.references_path, "r", encoding="utf-8") as f:
                    data = json.load(f)

                for section, category, citation in self._SECTIONS:
                    for term, note in data.get(section, {}).items():
                        self.corpus.append({
                            "category": category,
                            "term": term,
                            "definition": note,
                            "citation": citation,
                        })

                print(f"[INFO] RAG Service successfully indexed {len(self.corpus)} reference records.")
            else:
                print(f"[WARN] Reference path {self.references_path} not found for RAG index.")
        except Exception as e:
            print(f"[ERROR] Failed to initialize RAG index: {e}")

    @staticmethod
    def _score_record(clean_term: str, record: Dict) -> int:
        """Score one lowercase query term against one corpus record.

        Returns 10 for an exact term match, 5 when the query equals one of
        the record term's ``_``/space/``-`` separated parts, and for queries
        longer than three characters 5 for a substring hit in the term or 2
        for a substring hit in the definition. Returns 0 otherwise.
        """
        record_term = record["term"].lower()
        if clean_term == record_term:
            return 10

        # Whole-part match, e.g. "woman" in "woman_seated".
        if clean_term in re.split(r'[_ \-]', record_term):
            return 5

        # Substring matching is limited to longer queries to avoid noise.
        if len(clean_term) > 3:
            if clean_term in record_term:
                return 5
            if clean_term in record["definition"].lower():
                return 2
        return 0

    def _search(self, query_terms: List[str], max_results: int) -> List[Dict]:
        """Return up to ``max_results`` records ranked by their best score.

        A record matched by several query terms keeps the HIGHEST score any
        of them produced; previously the first matching term fixed the score,
        so a later exact match could not outrank an earlier weak hit. Ties
        keep the order in which records were first matched.
        """
        if not query_terms or not self.corpus:
            return []

        # record key -> (best score, record); dict order = first-match order.
        best = {}
        for term in query_terms:
            if not term:
                continue
            clean_term = term.lower().strip()
            if not clean_term:
                continue

            for record in self.corpus:
                score = self._score_record(clean_term, record)
                if score <= 0:
                    continue
                record_key = f"{record['category']}:{record['term']}"
                previous = best.get(record_key)
                if previous is None or score > previous[0]:
                    best[record_key] = (score, record)

        # sorted() is stable, so equal scores stay in first-match order.
        ranked = sorted(best.values(), key=lambda match: match[0], reverse=True)
        return [record for _, record in ranked[:max_results]]

    def retrieve_grounding_context(self, query_terms: List[str], max_results: int = 5) -> str:
        """Search reference records and build a grounding context string with academic citations.

        Returns:
            A markdown-style numbered list preceded by a heading, or an
            empty string when nothing matches.
        """
        top_matches = self._search(query_terms, max_results)
        if not top_matches:
            return ""

        context_lines = ["### Scholarly Grounding and Sign References:"]
        for idx, match in enumerate(top_matches, 1):
            context_lines.append(
                f"{idx}. **[{match['category']}] '{match['term']}'**: {match['definition']}\n"
                f" *Source Citation:* {match['citation']}"
            )

        return "\n".join(context_lines)

    def retrieve_grounding_list(self, query_terms: List[str], max_results: int = 5) -> List[Dict]:
        """Search reference records and return the raw list of matching reference dicts with citations."""
        return self._search(query_terms, max_results)

    def enrich_prompt(self, base_system_prompt: str, extracted_text: str, extracted_symbols: List[str] = None) -> str:
        """Enrich LLM prompts with RAG context and citation grounding instructions.

        Query terms are the given symbol labels followed by at most the
        first 15 word tokens of ``extracted_text``. When no reference record
        matches, ``base_system_prompt`` is returned unchanged.
        """
        query_terms = []
        if extracted_symbols:
            query_terms.extend(extracted_symbols)

        if extracted_text:
            words = re.findall(r'[a-zA-Z0-9\u0370-\u03FF\u1F00-\u1FFF\u4E00-\u9FFF]+', extracted_text)
            query_terms.extend(words[:15])  # Cap to prevent excessive token use

        grounding_context = self.retrieve_grounding_context(query_terms, max_results=6)
        if not grounding_context:
            return base_system_prompt

        return (
            f"{base_system_prompt}\n\n"
            f"Here is some verified historical and paleographical grounding information that you MUST use "
            f"in your analysis. Cite the specific sources (e.g. Gardiner, Cappelli, Thompson, Borger) "
            f"whenever discussing these symbols:\n\n"
            f"{grounding_context}\n\n"
            f"In your final output, append a short section titled 'References and Citations' detailing the relevant sources used."
        )
|
services/script_detector.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from processors.egyptian_processor import EgyptianProcessor
|
| 2 |
+
from processors.greek_processor import GreekProcessor
|
| 3 |
+
from processors.latin_processor import LatinProcessor
|
| 4 |
+
from processors.cuneiform_processor import CuneiformProcessor
|
| 5 |
+
from .groq_vision_classifier import GroqVisionScriptClassifier
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ScriptDetectionService:
    """Classify the script shown in an image and route it to a processor.

    Classification uses the Groq Vision classifier when an API key can be
    resolved, and the CLIP zero-shot classifier otherwise (or when the
    vision classifier fails or answers ``"unknown"``).
    """

    def __init__(self, groq_client, references, clip_classifier, translator_pipe, cuneiform_processor=None):
        """Build the per-script processors and the script classifiers.

        Args:
            groq_client: Client passed to every processor; also inspected
                for an API key.
            references: Reference data passed to every processor.
            clip_classifier: CLIP classifier, used by the processors and as
                the classification fallback.
            translator_pipe: Translator passed to the Egyptian processor.
            cuneiform_processor: Optional shared cuneiform processor; when
                omitted a new one is created (best-effort).
        """
        # Initialize processors including cuneiform
        self.egyptian_processor = EgyptianProcessor(groq_client, references, clip_classifier, translator_pipe)
        self.greek_processor = GreekProcessor(groq_client, references, clip_classifier)
        self.latin_processor = LatinProcessor(groq_client, references, clip_classifier)

        # Initialize cuneiform processor or use the shared instance
        if cuneiform_processor:
            self.cuneiform_processor = cuneiform_processor
            print("[INFO] Cuneiform processor shared from global app instance")
        else:
            try:
                print("[INFO] Initializing cuneiform processor in detection service...")
                self.cuneiform_processor = CuneiformProcessor(groq_client, references, clip_classifier)
                print("[INFO] Cuneiform processor initialized successfully")
            except Exception as e:
                print(f"[WARN] Failed to initialize cuneiform processor: {e}")
                self.cuneiform_processor = None

        api_key = self._resolve_api_key(groq_client)

        # Initialize Groq Vision script classifier if API key is present
        if api_key:
            try:
                self.vision_classifier = GroqVisionScriptClassifier(api_key)
                print("[INFO] Groq Vision Script Detection Service initialized")
            except Exception as e:
                print(f"[WARN] Failed to initialize Groq Vision script classifier: {e}")
                self.vision_classifier = None
        else:
            print("[WARN] GROQ_API_KEY not found! Groq Vision classifier disabled. Falling back to zero-shot CLIP classifier.")
            self.vision_classifier = None

        # Keep track of clip_classifier
        self.clip_classifier = clip_classifier

        # Enhanced processor mapping with cuneiform
        self.processors = {
            'egyptian': self.egyptian_processor,
            'greek': self.greek_processor,
            'latin': self.latin_processor,
            'cuneiform': self.cuneiform_processor
        }

        if self.cuneiform_processor:
            print("[INFO] Cuneiform support: ENABLED (praeclarum/cuneiform model)")
        else:
            print("[WARN] Cuneiform support: DISABLED (processor initialization failed)")

    @staticmethod
    def _resolve_api_key(groq_client):
        """Return the first non-empty Groq API key found, or ``None``.

        Sources are tried in order: ``groq_client.api_key``,
        ``groq_client.client.api_key``, ``Config().GROQ_API_KEY`` and the
        ``GROQ_API_KEY`` environment variable. An attribute that exists but
        is empty no longer stops the search.
        """
        api_key = getattr(groq_client, 'api_key', None)
        if api_key:
            return api_key

        api_key = getattr(getattr(groq_client, 'client', None), 'api_key', None)
        if api_key:
            return api_key

        try:
            from config import Config
            api_key = Config().GROQ_API_KEY
        except Exception:
            # Config is optional here; fall through to the environment.
            api_key = None
        if api_key:
            return api_key

        import os
        return os.getenv('GROQ_API_KEY')

    def _cuneiform_ready(self):
        """Tell whether a cuneiform processor exists and reports its model as available."""
        return self.cuneiform_processor is not None and bool(
            getattr(self.cuneiform_processor, 'cuneiform_available', False)
        )

    def _classify_script(self, image_path):
        """Return ``(script_type, classification_method, classification_confidence)``.

        Groq Vision is tried first when configured; CLIP is used when there
        is no vision classifier, when it raised, or when it answered
        ``"unknown"``. If CLIP also fails the result defaults to Egyptian.
        """
        script_type = "unknown"
        classification_method = "unknown"
        classification_confidence = 0.0

        if self.vision_classifier:
            try:
                script_type = self.vision_classifier.classify_script(image_path)
                classification_method = 'groq_vision'
                classification_confidence = 0.95
            except Exception as e:
                print(f"[WARN] Groq Vision classification failed: {e}. Falling back to CLIP.")

        if script_type == "unknown" or not self.vision_classifier:
            from PIL import Image
            try:
                # Context manager closes the image file handle after classification.
                with Image.open(image_path) as img:
                    script_type, classification_confidence = self.clip_classifier.classify_script_type(img)
                classification_method = 'clip_zero_shot'
                print(f"[INFO] CLIP fallback classification: {script_type} (conf={classification_confidence:.3f})")
            except Exception as ce:
                print(f"[ERROR] CLIP fallback classification failed: {ce}")
                script_type = "egyptian"  # default fallback
                classification_method = "default_fallback"
                classification_confidence = 0.5

        return script_type, classification_method, classification_confidence

    def _route(self, script_type, image_path):
        """Run the processor matching ``script_type`` and return its result.

        Unrecognised labels are processed as Egyptian. For cuneiform without
        an available model a placeholder error result is returned.
        """
        simple_routes = {
            "egyptian": ("Egyptian", self.egyptian_processor),
            "greek": ("Greek", self.greek_processor),
            "latin": ("Latin", self.latin_processor),
        }

        if isinstance(script_type, str) and script_type in simple_routes:
            label, processor = simple_routes[script_type]
            print(f"[INFO] Routing to {label} processor...")
            return processor.process_image(image_path)

        if script_type == "cuneiform":
            print("[INFO] Routing to Cuneiform processor...")
            if self._cuneiform_ready():
                return self.cuneiform_processor.process_image(image_path)

            print("[ERROR] Cuneiform processor not available!")
            # Create error result
            return {
                'script_type': 'cuneiform',
                'confidence': 0.0,
                'processed_result': {
                    'text': 'Cuneiform processor unavailable',
                    'validation': {'quality_score': 0.0, 'error': 'Model not loaded'}
                },
                'historical_context': {},
                'creative_story': 'Cuneiform processing failed - model not available'
            }

        print(f"[INFO] Unknown classification '{script_type}', defaulting to Egyptian...")
        return self.egyptian_processor.process_image(image_path)

    def detect_and_process(self, image_path):
        """Enhanced detection with cuneiform support - uses Groq Vision with CLIP fallback.

        Returns:
            The processor result dict with ``vision_classification``,
            ``classification_method`` and ``classification_confidence``
            added, or ``None`` when the processor returned nothing or any
            step raised.
        """
        try:
            # Step 1: Get script classification from Groq Vision or CLIP
            script_type, classification_method, classification_confidence = self._classify_script(image_path)
            print(f"[INFO] Final classification routed: {script_type} via {classification_method}")

            # Step 2: Route to appropriate processor including cuneiform
            result = self._route(script_type, image_path)

            # Step 3: Return result with classification metadata
            # str() keeps a non-string label from discarding a finished result.
            label = str(script_type).title()
            if result:
                result['vision_classification'] = script_type
                result['classification_method'] = classification_method
                result['classification_confidence'] = classification_confidence
                print(f"[INFO] {label} processing completed successfully")
                return result

            print(f"[ERROR] {label} processor returned None")
            return None

        except Exception as e:
            print(f"[ERROR] Classification and processing failed: {e}")
            import traceback
            traceback.print_exc()
            return None

    def get_processor_by_type(self, script_type):
        """Get processor by script type - now includes cuneiform.

        Returns ``None`` for unknown types and for cuneiform when its model
        is not reported as available.
        """
        key = script_type.lower()
        processor = self.processors.get(key)

        if key == 'cuneiform' and processor and not getattr(processor, 'cuneiform_available', False):
            print("[WARN] Cuneiform processor exists but model not available")
            return None

        return processor

    def get_supported_scripts(self):
        """Get list of supported script types."""
        scripts = ['egyptian', 'greek', 'latin']

        if self._cuneiform_ready():
            scripts.append('cuneiform')

        return scripts

    def get_processor_status(self):
        """Get status of all processors as a ``{script: bool}`` mapping."""
        return {
            'egyptian': self.egyptian_processor is not None,
            'greek': self.greek_processor is not None,
            'latin': self.latin_processor is not None,
            'cuneiform': self.cuneiform_processor is not None and getattr(self.cuneiform_processor, 'cuneiform_available', False)
        }

    def validate_script_detection(self, script_type, processed_result):
        """Validate script detection results - enhanced for cuneiform.

        The ``quality_score`` under ``processed_result['validation']`` must
        reach a per-script threshold (0.3 for unlisted scripts). Cuneiform
        additionally passes when ``cuneiform_ratio > 0.1`` or
        ``atf_ratio > 0.3``. Any error while validating yields ``False``.
        """
        try:
            validation = processed_result.get('validation', {})
            quality_score = validation.get('quality_score', 0.0)

            # Script-specific validation thresholds
            thresholds = {
                'egyptian': 0.3,
                'greek': 0.4,
                'latin': 0.4,
                'cuneiform': 0.2  # Lower threshold due to OCR challenges
            }
            threshold = thresholds.get(script_type, 0.3)

            # Additional cuneiform validation
            if script_type == 'cuneiform':
                cuneiform_ratio = validation.get('cuneiform_ratio', 0.0)
                atf_ratio = validation.get('atf_ratio', 0.0)

                # Accept if either Unicode cuneiform or ATF format detected
                if cuneiform_ratio > 0.1 or atf_ratio > 0.3:
                    print(f"[INFO] Cuneiform validation passed: cuneiform_ratio={cuneiform_ratio:.3f}, atf_ratio={atf_ratio:.3f}")
                    return True

            # Standard quality validation
            is_valid = quality_score >= threshold

            if is_valid:
                print(f"[INFO] {script_type.title()} validation passed: quality={quality_score:.3f} >= {threshold}")
            else:
                print(f"[WARN] {script_type.title()} validation failed: quality={quality_score:.3f} < {threshold}")

            return is_valid

        except Exception as e:
            print(f"[ERROR] Validation failed: {e}")
            return False
|
services/story_generator.py
ADDED
|
File without changes
|
utils/__init__.py
ADDED
|
File without changes
|
utils/gpu_diagnostics.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import gc
|
| 3 |
+
|
| 4 |
+
# Registry of processor instances keyed by name; filled by register_processor
# and walked by reclaim_vram_for when moving models off the GPU.
_active_processors = {}
|
| 5 |
+
|
| 6 |
+
def register_processor(name, processor_instance):
    """Store ``processor_instance`` under ``name`` in the module registry.

    A later registration with the same name replaces the earlier one. The
    registration is announced on stdout.
    """
    _active_processors.update({name: processor_instance})
    announcement = f"[VRAM MANAGER] Registered processor: {name}"
    print(announcement)
|
| 10 |
+
|
| 11 |
+
def reclaim_vram_for(target_processor_name):
    """Move every other registered processor's models from GPU to CPU.

    Does nothing when CUDA is unavailable. For each registered processor
    except ``target_processor_name``, the known model attributes are checked
    and any model whose first parameter lives on a CUDA device is moved to
    the CPU. A failure while handling one processor is logged and skips the
    rest of that processor only. If anything was moved, garbage collection
    runs and the CUDA cache is emptied.
    """
    if not torch.cuda.is_available():
        return

    # processor name -> ((model attribute, label used in the log line), ...)
    offload_plan = {
        "greek": (("trocr_model", "Greek TrOCR"),),
        "latin": (
            ("tridis_model", "Latin TRIDIS"),
            ("trocr_latin_model", "Latin TrOCR"),
        ),
        "cuneiform": (
            ("clip_model", "Cuneiform CLIP"),
            ("cuneiform_model", "Cuneiform Translator"),
        ),
        "egyptian": (("_model", "Egyptian T5"),),
    }

    print(f"[VRAM MANAGER] Reclaiming GPU VRAM for '{target_processor_name}'...")
    moved_any = False

    for name, proc in list(_active_processors.items()):
        if name == target_processor_name:
            continue

        try:
            for attribute, label in offload_plan.get(name, ()):
                model = getattr(proc, attribute, None)
                if model is None:
                    continue
                # The device of the first parameter stands in for the whole model.
                device = next(model.parameters()).device
                if str(device).startswith("cuda"):
                    print(f"[VRAM MANAGER] Offloading {label} to CPU...")
                    model.to("cpu")
                    moved_any = True
        except Exception as e:
            print(f"[WARN] Failed to offload '{name}' models: {e}")

    if moved_any:
        gc.collect()
        torch.cuda.empty_cache()
        print("[VRAM MANAGER] VRAM cache cleared successfully.")
|
| 76 |
+
|
| 77 |
+
def get_gpu_info():
    """Collect CUDA/GPU diagnostics for device 0 into a dict.

    Returns:
        A dict with the keys ``cuda_available``, ``gpu_name``,
        ``vram_total_gb``, ``vram_allocated_gb``, ``vram_cached_gb``,
        ``vram_free_gb``, ``cuda_version`` and ``device``. Without CUDA the
        placeholder values are returned. Errors while querying the GPU are
        printed and leave the remaining fields at their placeholders.
    """
    has_cuda = torch.cuda.is_available()
    info = {
        "cuda_available": has_cuda,
        "gpu_name": "N/A",
        "vram_total_gb": 0.0,
        "vram_allocated_gb": 0.0,
        "vram_cached_gb": 0.0,
        "vram_free_gb": 0.0,
        "cuda_version": torch.version.cuda if has_cuda else "N/A",
        "device": "cpu",
    }
    if not has_cuda:
        return info

    info["device"] = "cuda"
    bytes_per_gb = 1024**3
    try:
        info["gpu_name"] = torch.cuda.get_device_name(0)
        props = torch.cuda.get_device_properties(0)
        info["vram_total_gb"] = round(props.total_memory / bytes_per_gb, 2)

        allocated = torch.cuda.memory_allocated(0)
        reserved = torch.cuda.memory_reserved(0)
        info["vram_allocated_gb"] = round(allocated / bytes_per_gb, 3)
        info["vram_cached_gb"] = round(reserved / bytes_per_gb, 3)

        try:
            free_bytes, _total_bytes = torch.cuda.mem_get_info(0)
        except Exception:
            # Fall back to total minus what this process has allocated.
            free_bytes = props.total_memory - allocated
        info["vram_free_gb"] = round(free_bytes / bytes_per_gb, 3)
    except Exception as e:
        print(f"[WARN] Error gathering detailed GPU info: {e}")

    return info
|
| 111 |
+
|
| 112 |
+
def log_gpu_info():
    """Print a framed GPU/CUDA summary based on ``get_gpu_info()``."""
    info = get_gpu_info()
    rule = "=" * 60

    report = [
        rule,
        " NVIDIA GPU & CUDA INITIALIZATION DIAGNOSTICS",
        rule,
        f"CUDA Available: {info['cuda_available']}",
    ]
    if info["cuda_available"]:
        report.extend([
            f"CUDA Version: {info['cuda_version']}",
            f"GPU Model: {info['gpu_name']}",
            f"Total VRAM: {info['vram_total_gb']} GB",
            f"Free VRAM: {info['vram_free_gb']} GB",
            "Active Device: CUDA (Dynamic Offloading Enabled)",
        ])
    else:
        report.append("Active Device: CPU (GPU acceleration not available)")
    report.append(rule)

    for line in report:
        print(line)
|
| 128 |
+
|
| 129 |
+
def log_model_device(model_name, device):
    """Print which device ``model_name`` was assigned to (device shown upper-cased)."""
    device_label = str(device).upper()
    print(f"[DEVICE LOG] Model '{model_name}' -> Assigned to: {device_label}")
|
| 132 |
+
|
| 133 |
+
def clear_gpu_cache():
    """Run garbage collection and empty the CUDA cache; no-op without CUDA."""
    if not torch.cuda.is_available():
        return
    gc.collect()
    torch.cuda.empty_cache()
|
utils/image_utils.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import numpy as np
|
| 3 |
+
from PIL import Image
|
| 4 |
+
|
| 5 |
+
def segment_hieroglyphs(image_path):
    """Cut an image into per-glyph crops using thresholding and contours.

    Returns:
        A list of RGB PIL images, one per kept bounding box, ordered by
        50-pixel row band and then by x position. When no box survives the
        filters the whole image is returned as the only element. Any error
        (including an unreadable file) is printed and yields ``[]``.
    """
    def to_pil(bgr_pixels):
        return Image.fromarray(cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB))

    try:
        source = cv2.imread(image_path)
        if source is None:
            raise FileNotFoundError(f"Image not found or cannot be read: {image_path}")

        # Inverted adaptive threshold, then a 3x3 closing pass.
        binary = cv2.adaptiveThreshold(
            cv2.cvtColor(source, cv2.COLOR_BGR2GRAY),
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV,
            25,
            10,
        )
        binary = cv2.morphologyEx(
            binary,
            cv2.MORPH_CLOSE,
            cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)),
            iterations=1,
        )

        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        height, width = binary.shape

        # Keep boxes of at least 200 px^2 that do not span ~the whole image.
        kept = []
        for contour in contours:
            bx, by, bw, bh = cv2.boundingRect(contour)
            big_enough = bw * bh >= 200
            not_full_frame = bw <= 0.95 * width and bh <= 0.95 * height
            if big_enough and not_full_frame:
                kept.append((bx, by, bw, bh))

        if not kept:
            return [to_pil(source)]

        # Reading order: top to bottom in 50 px bands, left to right within.
        kept.sort(key=lambda box: (box[1] // 50, box[0]))

        margin = 6
        crops = []
        for bx, by, bw, bh in kept:
            top, bottom = max(0, by - margin), min(height, by + bh + margin)
            left, right = max(0, bx - margin), min(width, bx + bw + margin)
            crops.append(to_pil(source[top:bottom, left:right]))
        return crops

    except Exception as e:
        print(f"[ERROR] Hieroglyph segmentation failed: {e}")
        return []
|
| 63 |
+
|
| 64 |
+
def validate_image(file):
    """Validate uploaded image file.

    Checks, in order: size against ``Config().MAX_FILE_SIZE``, presence of
    an extension in the filename, membership of the extension in
    ``Config().ALLOWED_EXTENSIONS``, and that the stream can be opened and
    verified as an image. The stream is rewound to the start on success.

    Args:
        file: Upload object exposing ``filename``, ``stream`` and optionally
            ``content_length`` (looks like a Werkzeug ``FileStorage`` —
            TODO confirm against the caller).

    Returns:
        ``True`` when every check passes.

    Raises:
        ValueError: If any check fails.
    """
    from config import Config
    config = Config()

    # Check file size. A missing or zero declared length used to skip the
    # limit entirely, so measure the stream itself in that case.
    size = getattr(file, 'content_length', None) or 0
    if size <= 0:
        stream = getattr(file, 'stream', None)
        try:
            position = stream.tell()
            stream.seek(0, 2)  # 2 == io.SEEK_END
            size = stream.tell()
            stream.seek(position)
        except (AttributeError, OSError, ValueError):
            # Unseekable or absent stream: size stays unknown, as before.
            size = 0
    if size > config.MAX_FILE_SIZE:
        raise ValueError(f"File too large. Maximum size: {config.MAX_FILE_SIZE} bytes")

    # Check file extension
    if not file.filename or '.' not in file.filename:
        raise ValueError("Invalid filename")

    extension = file.filename.rsplit('.', 1)[1].lower()
    if extension not in config.ALLOWED_EXTENSIONS:
        raise ValueError(f"Invalid file type. Allowed: {', '.join(config.ALLOWED_EXTENSIONS)}")

    # Try to open as image
    try:
        image = Image.open(file.stream)
        image.verify()
        file.stream.seek(0)  # Reset stream for later use
        return True
    except Exception as exc:
        # Keep the underlying cause attached for debugging.
        raise ValueError("File is not a valid image") from exc
|
| 89 |
+
|
| 90 |
+
def preprocess_for_latin_ocr(image_path):
    """Binarize an image for Latin OCR.

    The image is converted to grayscale, smoothed with a bilateral filter
    and adaptively thresholded.

    Returns:
        The thresholded image array, or ``None`` when loading or processing
        fails (the error is printed).
    """
    try:
        loaded = cv2.imread(image_path)
        if loaded is None:
            raise ValueError(f"Cannot load image: {image_path}")

        # Bilateral filtering reduces noise while preserving edges.
        smoothed = cv2.bilateralFilter(
            cv2.cvtColor(loaded, cv2.COLOR_BGR2GRAY), 9, 75, 75
        )

        # Adaptive threshold copes with uneven lighting.
        return cv2.adaptiveThreshold(
            smoothed,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            11,
            2,
        )

    except Exception as e:
        print(f"[ERROR] Latin preprocessing failed: {e}")
        return None
|
| 115 |
+
|
| 116 |
+
def enhance_contrast_for_manuscripts(image):
    """Return ``image`` after CLAHE contrast enhancement.

    Uses Contrast Limited Adaptive Histogram Equalization with a clip limit
    of 3.0 on an 8x8 tile grid.
    """
    equalizer = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    return equalizer.apply(image)
|
utils/text_utils.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from collections import Counter
|
| 3 |
+
from itertools import groupby
|
| 4 |
+
|
| 5 |
+
def is_gibberish(text):
    """Heuristically decide whether ``text`` looks like gibberish.

    Text is considered gibberish when any of the following holds:

    * it is empty, ``None`` or not a ``str``;
    * it contains no word characters at all;
    * it is a single word shorter than three characters;
    * one word is repeated excessively: it occurs more than 12 times, or
      it occurs more than once and makes up over 40% of all words.

    Args:
        text: The candidate text (typically OCR or model output).

    Returns:
        ``True`` if the text looks like gibberish, ``False`` otherwise.
    """
    if not text or not isinstance(text, str):
        return True

    words = re.findall(r"\w+", text.lower())
    if not words:
        return True

    # A lone word of one or two characters carries no usable signal.
    if len(words) == 1 and len(words[0]) < 3:
        return True

    _, top_count = Counter(words).most_common(1)[0]

    # Only apply the repetition thresholds when a word actually repeats.
    # Without this guard the ratio test flags every one-word text (1/1)
    # and every pair of distinct words (1/2), since both exceed 0.4.
    if top_count > 1 and (top_count > 12 or top_count / len(words) > 0.4):
        return True

    return False
|
| 28 |
+
|
| 29 |
+
def build_description_from_codes(codes):
    """Turn a sequence of Gardiner codes into a readable description.

    Each code is mapped to its label through ``Config.CODE_TO_LABEL``;
    codes without an entry are kept as-is. Consecutive identical labels
    are collapsed into ``"label (xN)"``, and a label of ``"?"`` or
    ``None`` is rendered as ``"unknown"``.

    Returns:
        The labels joined with ``", "``.
    """
    from config import Config

    label_for = Config().CODE_TO_LABEL
    parts = []

    # groupby merges only adjacent equal labels, so ordering is preserved.
    for label, run in groupby(label_for.get(code, code) for code in codes):
        run_length = sum(1 for _ in run)
        if label == "?" or label is None:
            shown = "unknown"
        else:
            shown = label
        if run_length > 1:
            parts.append(f"{shown} (x{run_length})")
        else:
            parts.append(shown)

    return ", ".join(parts)
|
| 43 |
+
|
| 44 |
+
def clean_text(text):
    """Normalize whitespace in ``text``.

    Every run of whitespace is collapsed to a single space and leading
    and trailing whitespace is removed. Falsy input (``None``, ``""``)
    yields an empty string.
    """
    if text:
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return ""
|
| 56 |
+
|
| 57 |
+
def extract_words(text, min_length=2):
    """Return the words in ``text`` that have at least ``min_length`` characters.

    A word is a maximal run of ``\\w`` characters. Falsy input yields an
    empty list. Words are returned in order of appearance.
    """
    if not text:
        return []

    matches = re.finditer(r"\w+", text, flags=re.UNICODE)
    tokens = (match.group() for match in matches)
    return [token for token in tokens if len(token) >= min_length]
|
| 64 |
+
|
| 65 |
+
def calculate_text_stats(text):
    """Compute basic statistics for ``text``.

    Returns:
        A dict with the keys ``char_count``, ``word_count``,
        ``unique_chars`` and ``avg_word_length``. All values are ``0``
        for falsy input.
    """
    stats = {
        "char_count": 0,
        "word_count": 0,
        "unique_chars": 0,
        "avg_word_length": 0,
    }
    if not text:
        return stats

    # NOTE(review): extract_words defaults to min_length=2, so
    # one-character words are excluded from word_count and the average.
    tokens = extract_words(text)

    stats["char_count"] = len(text)
    stats["word_count"] = len(tokens)
    stats["unique_chars"] = len(set(text))
    # max(1, ...) avoids dividing by zero when no words qualify.
    stats["avg_word_length"] = sum(map(len, tokens)) / max(1, len(tokens))
    return stats
|
utils/validation.py
ADDED
|
File without changes
|