Akshay30 committed on
Commit
2f4af3f
·
0 Parent(s):

Initial DecipherAI backend deployment

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DecipherAI Backend — Hugging Face Spaces Docker Configuration
2
+ # Space SDK: Docker
3
+ # Port: 7860
4
+
5
+ FROM python:3.11-slim
6
+
7
+ # Install system dependencies
8
+ RUN apt-get update && apt-get install -y --no-install-recommends \
9
+ tesseract-ocr \
10
+ wget \
11
+ libgl1 \
12
+ libglib2.0-0 \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ # Download Ancient Greek Tesseract model
16
+ RUN mkdir -p /usr/share/tesseract-ocr/5/tessdata && \
17
+ wget -q \
18
+ https://github.com/tesseract-ocr/tessdata/raw/main/grc.traineddata \
19
+ -O /usr/share/tesseract-ocr/5/tessdata/grc.traineddata
20
+
21
+ # Create non-root user (HF Spaces recommendation)
22
+ RUN useradd -m -u 1000 user
23
+
24
+ USER user
25
+
26
+ ENV HOME=/home/user
27
+ ENV PATH=/home/user/.local/bin:$PATH
28
+
29
+ WORKDIR /home/user/app
30
+
31
+ # Install Python dependencies
32
+ COPY --chown=user:user requirements.txt .
33
+
34
+ RUN pip install --no-cache-dir --upgrade pip && \
35
+ pip install --no-cache-dir -r requirements.txt
36
+
37
+ # Copy application
38
+ COPY --chown=user:user . .
39
+
40
+ # Hugging Face Space port
41
+ EXPOSE 7860
42
+
43
+ # Production server
44
# NOTE: --preload is intentionally NOT used. app.py starts its model loader in
# a background thread at import time; with --preload that import happens in
# the gunicorn master, and threads are not carried across fork(), so the
# worker would stay in the "loading" state forever. Without --preload the app
# is imported (and the loader thread started) inside the worker itself.
CMD ["gunicorn", \
     "--bind", "0.0.0.0:7860", \
     "--workers", "1", \
     "--timeout", "300", \
     "app:app"]
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Decipherai Api
3
+ emoji: 🔥
4
+ colorFrom: indigo
5
+ colorTo: blue
6
+ sdk: docker
7
+ pinned: false
8
+ short_description: Ancient script OCR, translation and historical context
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,473 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import tempfile
4
+ from dotenv import load_dotenv
5
+
6
+ # Safe stdout/stderr wrapper to prevent OSError: [Errno 22] when stdout pipe is closed/unbuffered in background
7
class SafeStream:
    """Proxy around a text stream that tolerates a closed or invalid pipe.

    ``write`` swallows ``OSError`` with errno 22 and ``flush`` swallows any
    ``OSError``; every other attribute is delegated to the wrapped stream.
    """

    # errno value ignored by write() (EINVAL: "Invalid argument").
    _IGNORED_ERRNO = 22

    def __init__(self, original_stream):
        self.original_stream = original_stream

    def write(self, data):
        """Write ``data`` to the wrapped stream.

        Returns ``len(data)`` so the object honours the file-like ``write``
        contract even when the data had to be dropped (no wrapped stream, or
        the ignored errno was raised). Any other ``OSError`` is re-raised.
        """
        try:
            if self.original_stream:
                self.original_stream.write(data)
        except OSError as e:
            if e.errno != self._IGNORED_ERRNO:
                raise
        return len(data)

    def flush(self):
        """Flush the wrapped stream, ignoring any ``OSError``."""
        try:
            if self.original_stream:
                self.original_stream.flush()
        except OSError:
            pass

    def __getattr__(self, attr):
        # __getattr__ only runs when normal lookup fails. If original_stream
        # itself is missing (instance created without __init__, e.g. by
        # copy/pickle), delegating would recurse forever, so fail cleanly.
        if attr == "original_stream":
            raise AttributeError(attr)
        return getattr(self.original_stream, attr)
28
+
29
+ sys.stdout = SafeStream(sys.stdout)
30
+ sys.stderr = SafeStream(sys.stderr)
31
+
32
+ # Load .env variables (including HF_HOME and GROQ_API_KEY) before imports
33
+ load_dotenv()
34
+
35
+ from flask import Flask, request, jsonify
36
+ from flask_cors import CORS
37
+
38
+ # Import modular components
39
+ from config import Config
40
+ from models.groq_client import GroqClient
41
+ from models.clip_classifier import CLIPClassifier
42
+ from models.tesseract_ocr import TesseractOCR
43
+ from models.huggingface_models import HuggingFaceModels
44
+ from services.groq_vision_classifier import GroqVisionScriptClassifier
45
+ from services.script_detector import ScriptDetectionService
46
+ from utils.image_utils import validate_image
47
+ from utils.text_utils import clean_text
48
+ from processors.cuneiform_processor import CuneiformProcessor
49
+ from utils.gpu_diagnostics import log_gpu_info
50
+
51
+ # Initialize Flask app
52
app = Flask(__name__)

# CORS — restrict origins in production via ALLOWED_ORIGINS env var
# Example: ALLOWED_ORIGINS=https://your-frontend.vercel.app,https://custom-domain.com
allowed_origins = os.getenv(
    "ALLOWED_ORIGINS",
    "http://localhost:3000,http://localhost:5173,http://localhost:5000"
)
# Strip whitespace around each entry and drop empty ones so a value such as
# "https://a.example, https://b.example" still matches both origins.
CORS(
    app,
    origins=[origin.strip() for origin in allowed_origins.split(",") if origin.strip()],
)

# Global components
import threading
config = Config()
groq_client = None
clip_classifier = None
hf_models = None
script_detector = None
cuneiform_processor = None
references = {}

# Live model preloading status tracking.
# "status" is the overall state; the other keys track individual components.
model_status = {
    "status": "loading",
    "groq": "pending",
    "clip": "pending",
    "translator": "pending",
    "cuneiform": "pending",
    "script_detector": "pending"
}
81
+
82
+
83
def load_references():
    """Populate the module-level ``references`` dict from ``config.REFERENCES_PATH``.

    Each expected key is read from the JSON document with a built-in default.
    If anything goes wrong (missing file, invalid JSON, ...) a warning is
    printed and a minimal fallback table is installed instead.
    """
    global references
    try:
        import json

        # (key, default) pairs, in the order they appear in the result.
        expected = (
            ("egypt_symbol_notes", {}),
            ("greek_symbol_notes", {}),
            (
                "greek_hint",
                "If no specific character note is found, treat as lexical marker "
                "considering diacriticals (breathing marks, accents, vowel quantity) "
                "which affect pronunciation, meaning, and grammatical function in "
                "ancient Greek texts.",
            ),
            ("latin_symbol_notes", {}),
            (
                "latin_hint",
                "If no specific character note is found, consider standard Latin "
                "letters or medieval scribal abbreviations.",
            ),
            # Cuneiform references
            ("cuneiform_symbol_notes", {}),
            (
                "cuneiform_hint",
                "Cuneiform signs represent syllables, words, or concepts in ancient "
                "Mesopotamian languages (Sumerian, Akkadian, etc.)",
            ),
        )

        with open(config.REFERENCES_PATH, "r", encoding="utf-8") as handle:
            data = json.load(handle)

        references = {key: data.get(key, default) for key, default in expected}

        print(f"[INFO] Loaded references from {config.REFERENCES_PATH}")
    except Exception as e:
        print(f"[WARN] Failed to load references: {e}")
        references = dict(
            egypt_symbol_notes={},
            greek_symbol_notes={},
            greek_hint="Possible Greek lexical marker.",
            latin_symbol_notes={},
            latin_hint="Latin scribal practice.",
            cuneiform_symbol_notes={},
            cuneiform_hint="Ancient cuneiform sign.",
        )
114
+
115
+
116
def initialize_models_async():
    """Load all models sequentially; intended to run in a background thread.

    Progress is published through ``model_status``: each component moves from
    "pending" to "loading" and then to "ready", "unavailable" or "failed".
    The overall ``model_status["status"]`` becomes "ready" once everything has
    been constructed, or "failed" if an unexpected exception escapes. In the
    failure case any component that was still marked "loading" is flipped to
    "failed" so the status report does not claim it is loading forever.
    """
    global groq_client, clip_classifier, hf_models, script_detector, cuneiform_processor
    try:
        print("[INFO] Background model preloading thread started...")

        # Log GPU Diagnostics
        log_gpu_info()

        # Load references first
        load_references()

        # Groq
        model_status["groq"] = "loading"
        groq_client = GroqClient()
        model_status["groq"] = "ready" if groq_client.is_available() else "unavailable"
        print(f"[INFO] Groq client initialization complete: {model_status['groq']}")

        # CLIP
        model_status["clip"] = "loading"
        clip_classifier = CLIPClassifier()
        model_status["clip"] = "ready" if (clip_classifier and clip_classifier.pipeline is not None) else "failed"
        print(f"[INFO] CLIP classifier initialization complete: {model_status['clip']}")

        # HF Translator
        model_status["translator"] = "loading"
        hf_models = HuggingFaceModels()
        model_status["translator"] = "ready" if (hf_models and hf_models.get_translator() is not None) else "failed"
        print(f"[INFO] Hugging Face models initialization complete: {model_status['translator']}")

        # Cuneiform Processor — optional: a failure here is recorded but does
        # not abort the rest of the initialization.
        model_status["cuneiform"] = "loading"
        try:
            print("[INFO] Initializing cuneiform processor...")
            cuneiform_processor = CuneiformProcessor(
                groq_client=groq_client,
                references=references,
                clip_classifier=clip_classifier
            )
            model_status["cuneiform"] = "ready" if cuneiform_processor.cuneiform_available else "unavailable"
        except Exception as e:
            print(f"[ERROR] Failed to initialize cuneiform processor: {e}")
            model_status["cuneiform"] = "failed"
            cuneiform_processor = None
        print(f"[INFO] Cuneiform processor initialization complete: {model_status['cuneiform']}")

        # Script Detection Service
        model_status["script_detector"] = "loading"
        script_detector = ScriptDetectionService(
            groq_client=groq_client,
            references=references,
            clip_classifier=clip_classifier,
            translator_pipe=hf_models.get_translator(),
            cuneiform_processor=cuneiform_processor
        )
        model_status["script_detector"] = "ready"
        print(f"[INFO] Script detection service initialization complete: {model_status['script_detector']}")

        model_status["status"] = "ready"
        print("[SUCCESS] All models initialized successfully in the background")

    except Exception as e:
        # Whatever was mid-load when the exception escaped did not finish.
        for component in list(model_status):
            if component != "status" and model_status[component] == "loading":
                model_status[component] = "failed"
        model_status["status"] = "failed"
        print(f"[ERROR] Critical failure in background model initialization: {e}")
180
+
181
+
182
def initialize_models():
    """Mark the overall status as loading and start the loader as a daemon thread."""
    print("[INFO] Spawning background thread for model initialization...")
    model_status["status"] = "loading"
    loader = threading.Thread(target=initialize_models_async, daemon=True)
    loader.start()
187
+
188
+
189
@app.route('/analyze', methods=['POST'])
def analyze():
    """Main analysis endpoint with Groq Vision classification.

    Expects a multipart upload with an ``image`` file. Responds with:
      * 503 while models are loading, or if background initialization failed
        (the ``status`` field distinguishes "loading" from "failed");
      * 400 for a missing/empty/invalid upload;
      * 200 with a script-specific payload (cuneiform, greek/latin, or the
        default Egyptian shape) on success;
      * 500 when processing raises or yields no result.

    The uploaded image is written to a temporary file that is always removed
    before returning.
    """
    tmp_path = None

    try:
        # Check if models are fully loaded
        current_state = model_status["status"]
        if current_state != "ready":
            init_failed = current_state == "failed"
            if init_failed:
                # Retrying will not help: the loader thread has already given up.
                message = "Model initialization failed. Check the server logs and restart the service."
            else:
                message = "Models are still loading in the background. Please try again in a few moments."
            return jsonify({
                "error": message,
                "status": "failed" if init_failed else "loading",
                "models_status": model_status
            }), 503

        # Validate request
        if 'image' not in request.files:
            return jsonify({"error": "No image uploaded"}), 400

        img_file = request.files['image']
        if img_file.filename == '':
            return jsonify({"error": "Empty filename"}), 400

        # Validate image file
        try:
            validate_image(img_file)
        except ValueError as e:
            return jsonify({"error": str(e)}), 400

        # Save temporary file. delete=False because the path is reopened by
        # name below; it is removed in the finally clause.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
            tmp_path = tmp.name
        img_file.save(tmp_path)

        # Process image with Groq Vision classification
        result = script_detector.detect_and_process(tmp_path)

        if not result:
            return jsonify({"error": "Could not process image"}), 500

        # Get Vision classification info
        vision_classification = result.get('vision_classification', 'unknown')
        classification_method = result.get('classification_method', 'unknown')
        classification_confidence = result.get('classification_confidence', 0.0)
        script_type = result.get('script_type', 'egyptian')

        # Base response with Vision classification info
        base_response = {
            "script_type": script_type,
            "vision_classification": vision_classification,
            "classification_method": classification_method,
            "classification_confidence": classification_confidence,
            "confidence": result.get('confidence', 0.0),
            "historical_context": result.get('historical_context', {}),
            "creative_story": result.get('creative_story', ''),
            "model_used": "llama-3.2-90b-vision-preview"
        }

        # Handle cuneiform processing
        if script_type == 'cuneiform':
            if not cuneiform_processor or not cuneiform_processor.cuneiform_available:
                return jsonify({
                    **base_response,
                    "error": "Cuneiform processing unavailable",
                    "labels": [],
                    "gardiner_codes": [],
                    "translation": "Cuneiform translation model not available",
                    "translation_ok": False
                }), 200

            try:
                # Process cuneiform text
                processed_result = result.get('processed_result', {})
                cuneiform_text = processed_result.get('text', '')
                validation = processed_result.get('validation', {})
                char_analysis = processed_result.get('char_analysis', {})

                # Translate cuneiform to English
                translation = ""
                translation_ok = False

                if cuneiform_text and len(cuneiform_text.strip()) > 2:
                    print(f"[INFO] Translating cuneiform: {cuneiform_text[:50]}...")
                    translation = cuneiform_processor.translate_cuneiform(cuneiform_text)
                    translation_ok = bool(translation and not translation.startswith("Error"))
                else:
                    translation = "No readable cuneiform text extracted"

                # Build cuneiform response
                response_data = {
                    **base_response,
                    "labels": [],
                    "gardiner_codes": [],
                    "translation": translation,
                    "translation_ok": translation_ok,
                    "cuneiform_text": cuneiform_text,
                    "validation": {
                        "quality_score": validation.get('quality_score', 0.0),
                        "cuneiform_ratio": validation.get('cuneiform_ratio', 0.0),
                        "atf_ratio": validation.get('atf_ratio', 0.0),
                        "char_analysis": char_analysis,
                        "ocr_method": "praeclarum/cuneiform (T5-based translation)",
                        "supports_translation": True,
                        "input_format": char_analysis.get('text_format', 'Unknown')
                    }
                }

                return jsonify(response_data)

            except Exception as e:
                print(f"[ERROR] Cuneiform processing failed: {e}")
                return jsonify({
                    **base_response,
                    "error": f"Cuneiform processing error: {str(e)}",
                    "labels": [],
                    "gardiner_codes": [],
                    "translation": "Cuneiform processing failed",
                    "translation_ok": False
                }), 200

        elif script_type in ['greek', 'latin']:
            processed_result = result.get('processed_result', {})
            validation = processed_result.get('validation', {})

            response_data = {
                **base_response,
                "labels": [],
                "gardiner_codes": [],
                "translation": processed_result.get('text', ''),
                "translation_ok": True,
            }

            # Add enhanced validation info for Greek
            if script_type == 'greek':
                response_data["validation"] = {
                    "quality_score": validation.get('quality_score', 0.0),
                    "greek_ratio": validation.get('greek_ratio', 0.0),
                    "has_polytonic": validation.get('has_polytonic', False),
                    "char_analysis": processed_result.get('char_analysis', {}),
                    "ocr_method": "ancient_greek_ocr" if validation.get('quality_score', 0) > 0.7 else "standard_greek_ocr"
                }

            elif script_type == 'latin':
                response_data["validation"] = {
                    "quality_score": validation.get('quality_score', 0.0),
                    "latin_ratio": validation.get('latin_ratio', 0.0),
                    "trocr_used": validation.get('tridis_used', False) or (validation.get('ocr_method') in ['trocr-base-latin', 'tridis_HTR']),
                    "char_analysis": processed_result.get('char_analysis', {}),
                    "ocr_method": validation.get('ocr_method', 'standard_latin_ocr'),
                    "writing_style": validation.get('writing_style', 'cursive')
                }

            return jsonify(response_data)

        else:  # Egyptian
            processed = result['processed_result']
            return jsonify({
                **base_response,
                "labels": processed['labels'],
                "gardiner_codes": processed['codes'],
                "translation": processed['translation'],
                "translation_ok": processed['translation_ok']
            })

    except Exception as e:
        print(f"[ERROR] Analysis failed: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({"error": "Processing failed"}), 500

    finally:
        # Cleanup temporary file (best effort)
        if tmp_path:
            try:
                os.remove(tmp_path)
            except Exception:
                pass
364
+
365
+
366
@app.route('/chat', methods=['POST'])
def chat():
    """Chatbot endpoint for manuscript queries and general dialogue.

    Request JSON: ``message`` (required), ``history`` (optional list of
    ``{"role", "content"}`` dicts; only the last five are used) and
    ``context`` (optional text appended to the system prompt).

    Returns ``{"reply": ...}``. A missing or unparsable body, or an empty
    ``message``, yields 400; unexpected errors yield 500. When the Groq client
    is unavailable a canned offline reply is returned instead.
    """
    try:
        # silent=True: a wrong content type or malformed JSON yields None
        # instead of raising, so it is reported as a 400 below rather than
        # being caught by the generic handler and turned into a 500.
        payload = request.get_json(silent=True)
        data = payload if isinstance(payload, dict) else {}
        message = data.get("message", "")
        history = data.get("history", [])
        context = data.get("context", "")

        if not message:
            return jsonify({"error": "Message is required"}), 400

        # History is client-supplied; ignore it unless it has the expected shape.
        if not isinstance(history, list):
            history = []

        system_prompt = (
            "You are DecipherAI's helpful historical assistant. You are an expert paleographer and historian.\n"
            "Answer the user's questions about ancient scripts, translations, and history in a helpful, "
            "academic yet accessible manner. Cite historical sources when appropriate."
        )
        if context:
            system_prompt += f"\n\nHere is the context of the current manuscript translation:\n{context}"

        if not groq_client or not groq_client.is_available():
            reply = (
                f"Thank you for your question: '{message}'. I'm currently running in offline fallback mode "
                f"because the Groq API key is not set. Once configured, I will be able to answer all your "
                f"scholarly questions about the translated scripts, historical context, and paleography in real time!"
            )
        else:
            transcript = []
            for turn in history[-5:]:
                if not isinstance(turn, dict):
                    continue  # skip malformed entries instead of failing the request
                role = str(turn.get("role", "user"))
                content = turn.get("content", "")
                transcript.append(f"{role.upper()}: {content}\n")
            prompt = "".join(transcript) + f"USER: {message}\nASSISTANT:"

            reply = groq_client.generate_response(
                system_prompt=system_prompt,
                user_prompt=prompt,
                max_tokens=500
            ) or "I'm sorry, I encountered an error generating a response."

        return jsonify({"reply": reply})
    except Exception as e:
        print(f"[ERROR] Chat failed: {e}")
        return jsonify({"error": "Failed to process chat message"}), 500
410
+
411
+
412
@app.route('/health', methods=['GET'])
def health_check():
    """Health check endpoint returning real-time load status.

    ``status`` is "healthy" once all models are ready, "failed" if background
    initialization aborted, and "initializing" otherwise. The per-component
    states are returned under ``models_status``.
    """
    state = model_status["status"]
    if state == "ready":
        overall = "healthy"
    elif state == "failed":
        # Do not report a dead loader as "initializing" forever.
        overall = "failed"
    else:
        overall = "initializing"
    return jsonify({
        "status": overall,
        "models_status": model_status
    })
419
+
420
+
421
@app.route('/info', methods=['GET'])
def info():
    """Return static metadata: app name, version, supported scripts and features."""
    supported_scripts = [
        "Egyptian Hieroglyphs",
        "Ancient Greek",
        "Latin",
        "Ancient Cuneiform",
    ]
    features = [
        "Multi-script detection",
        "OCR text extraction",
        "Historical context generation",
        "Creative story generation",
        "Cuneiform translation (Sumerian/Akkadian → English)",
    ]
    payload = {
        "app": "Ancient Script Recognition System",
        "version": "2.1.0",
        "supported_scripts": supported_scripts,
        "features": features,
    }
    return jsonify(payload)
441
+
442
+
443
+ # --- Model initialization ---
444
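+ # When running under gunicorn (or any WSGI server), __name__ != "__main__",
445
+ # so we initialize models at module level. NOTE: loading runs in a background
446
+ # thread, and threads are not carried into forked workers. Do not combine this with gunicorn --preload: the loader would then run only in the master process and the workers would stay in the "loading" state.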
447
+ def _auto_initialize():
448
+ """Initialize models when running under a WSGI server (gunicorn, waitress, etc.)"""
449
+ if os.getenv("WERKZEUG_RUN_MAIN") == "true":
450
+ # Flask reloader child process — handled by __main__ block
451
+ return
452
+ print("[INIT] WSGI server detected — initializing models...")
453
+ initialize_models()
454
+
455
+
456
if __name__ != "__main__":
    # Running under gunicorn / WSGI
    _auto_initialize()
else:
    print("[INIT] Starting Ancient Script Recognition System...")

    # Server settings come from the environment.
    debug = os.getenv("DEBUG", "False").lower() == "true"
    port = int(os.getenv("PORT", 7860))

    # With the debug reloader on, only the reloader's child process
    # (WERKZEUG_RUN_MAIN == "true") loads models, to avoid duplicate threads.
    defer_to_child = debug and os.environ.get("WERKZEUG_RUN_MAIN") != "true"
    if defer_to_child:
        print("[INFO] Reloader active. Model initialization deferred to child process.")
    else:
        initialize_models()

    print(f"[INFO] Starting server on port {port}")
    app.run(host="0.0.0.0", port=port, debug=debug)
config.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import torch
4
+
5
class Config:
    """Static application settings: paths, model names, upload limits and lookup tables.

    Environment-dependent values (Tesseract location, Groq API key) are read
    once, when the class body is executed.
    """

    # --- Paths ---
    BASE_DIR = Path(__file__).parent
    REFERENCES_PATH = BASE_DIR / "references.json"
    ANCIENT_GREEK_TESSDATA = BASE_DIR / "tessdata" / "ancient-greek"

    # --- Environment-provided settings ---
    TESSERACT_EXE = os.getenv("TESSERACT_EXE", "tesseract")
    TESSDATA_PREFIX = os.getenv("TESSDATA_PREFIX")
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")

    # --- Model settings ---
    HF_TRANSLATOR_MODEL = "AnushS/Hieroglyph-Translator-Using-Gardiner-Codes"
    CLIP_MODEL = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
    # 0 when CUDA is available, otherwise -1.
    DEVICE = -1 if not torch.cuda.is_available() else 0

    # --- Groq settings ---
    GROQ_MODEL = "openai/gpt-oss-120b"
    GROQ_TEMPERATURE = 1.0
    GROQ_STORY_MAX_TOKENS = 1024
    GROQ_CONTEXT_MAX_TOKENS = 2048

    # --- File upload settings ---
    MAX_FILE_SIZE = 16 * 1024 ** 2  # 16MB
    ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'bmp'}

    # --- Gardiner code mapping (label -> code) ---
    GARDINER_MAP = {
        "man_seated": "A1",
        "woman_seated": "B1",
        "god_figure": "C1",
        "eye": "D4",
        "hippopotamus": "E25",
        "leg": "F28",
        "owl": "G17",
        "feather": "H2",
        "lizard": "I1",
        "fish": "K1",
        "insect": "L1",
        "reed": "M17",
        "sun": "N5",
        "crown": "S39",
        "bow": "T14",
        "hoe": "U25",
        "rope": "V1",
        "jar": "W1",
        "bread": "X3",
        "scribe_tools": "Y5",
    }

    # --- Tesseract command-line option strings, keyed by OCR mode ---
    TESSERACT_CONFIGS = {
        'ancient_greek': "--psm 6 --oem 1 -c preserve_interword_spaces=1",
        'standard_greek': "--psm 6 --oem 1",
        'fallback': "--psm 3 --oem 1",
    }

    @property
    def CODE_TO_LABEL(self):
        """Inverse of ``GARDINER_MAP`` (code -> label), rebuilt on each access."""
        return {code: label for label, code in self.GARDINER_MAP.items()}
decipherai-api ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit ed9e7fdd210f252a5309a7e6fc728a29fce274dd
models/clip_classifier.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import CLIPProcessor, CLIPModel
3
+ from PIL import Image
4
+ import numpy as np
5
+ from config import Config
6
+ from utils.gpu_diagnostics import log_model_device
7
+
8
class CLIPClassifier:
    """Zero-shot CLIP classifier for script types and segmented symbols.

    The constructor tries the configured CLIP checkpoint first and a fallback
    checkpoint second. ``model`` and ``processor`` are only assigned once both
    have loaded, so ``pipeline`` is never non-None with a missing processor.
    """

    _FALLBACK_MODEL = "openai/clip-vit-base-patch32"

    def __init__(self):
        self.config = Config()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = None
        self.processor = None

        # Load CLIP model and processor with fallback
        model_name = getattr(self.config, 'CLIP_MODEL', self._FALLBACK_MODEL)
        try:
            print(f"[INFO] Loading CLIP model: {model_name}...")
            self._load(model_name)
            log_model_device("CLIP script classifier", self.device)
            print(f"[INFO] CLIP model loaded on {self.device}")
        except Exception as e:
            print(f"[WARN] Failed to load CLIP model '{model_name}': {e}")
            fallback_name = self._FALLBACK_MODEL
            try:
                print(f"[INFO] Loading fallback CLIP model: {fallback_name}...")
                self._load(fallback_name)
                log_model_device("CLIP script classifier (fallback)", self.device)
                print(f"[INFO] Fallback CLIP model loaded on {self.device}")
            except Exception as fe:
                print(f"[ERROR] Failed to load fallback CLIP model: {fe}")

    def _load(self, model_name):
        """Load model and processor for ``model_name`` and publish them together."""
        model = CLIPModel.from_pretrained(model_name)
        processor = CLIPProcessor.from_pretrained(model_name)
        model.to(self.device)
        model.eval()  # Set model to evaluation mode
        # Assign only after everything succeeded so a partial load is never visible.
        self.model, self.processor = model, processor

    @property
    def pipeline(self):
        """The loaded model, or None; used by callers as an "is initialized" check."""
        return self.model if self.model is not None else None

    def classify_script_type(self, image):
        """Classify an image as egyptian / greek / latin / cuneiform.

        ``image`` may be a PIL image or a numpy array (converted with
        ``Image.fromarray``). Returns ``(label, score)`` where ``score`` is the
        softmax probability of the winning prompt, or ``("unknown", 0.0)`` when
        no model is loaded or classification raises.
        """
        if not self.pipeline:
            return "unknown", 0.0

        try:
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)

            # Prompts representing the four classes (same order as ``scripts``)
            scripts = ["egyptian", "greek", "latin", "cuneiform"]
            descriptions = [
                "ancient Egyptian hieroglyphic writing with drawings of animals and humans",
                "ancient Greek alphabet script on papyrus or stone with polytonic symbols",
                "medieval Latin manuscript text written in ink on parchment",
                "ancient Mesopotamian cuneiform tablet with wedge-shaped markings in clay"
            ]

            inputs = self.processor(
                text=descriptions,
                images=image,
                return_tensors="pt",
                padding=True
            ).to(self.device)

            with torch.inference_mode():
                outputs = self.model(**inputs)
                logits_per_image = outputs.logits_per_image
                probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]

            best_idx = np.argmax(probs)
            score = float(probs[best_idx])
            script_label = scripts[best_idx]

            print(f"[INFO] CLIP script classification: {script_label} ({score:.3f})")
            return script_label, score

        except Exception as e:
            print(f"[ERROR] CLIP script classification failed: {e}")
            return "unknown", 0.0

    def classify_symbols(self, crops, candidate_labels):
        """Pick the best-matching candidate label for each symbol crop.

        Returns one label per crop. When no model is loaded or there are no
        candidate labels, returns ``None`` per crop (``[]`` for no crops). If
        classification raises, every crop is assigned ``candidate_labels[0]``.
        """
        if not self.pipeline or not crops or not candidate_labels:
            return [None] * len(crops) if crops else []

        try:
            print(f"[INFO] Batch classifying {len(crops)} crops using CLIP...")

            # Format candidate labels into descriptive prompts for better visual matching
            prompts = [f"an ancient Egyptian hieroglyph symbol of a {label.replace('_', ' ')}" for label in candidate_labels]

            # Tokenize and embed the prompts once; they are shared by all crops
            text_inputs = self.processor(
                text=prompts,
                return_tensors="pt",
                padding=True
            ).to(self.device)

            with torch.inference_mode():
                text_features = self.model.get_text_features(**text_inputs)
                text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

            results = []
            # Process crops (images)
            for crop in crops:
                if isinstance(crop, np.ndarray):
                    crop = Image.fromarray(crop)

                image_inputs = self.processor(images=crop, return_tensors="pt").to(self.device)

                with torch.inference_mode():
                    image_features = self.model.get_image_features(**image_inputs)
                    image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)

                    # Both sides are L2-normalised, so the dot product is the cosine similarity
                    similarities = (image_features @ text_features.T).squeeze(0)
                    best_idx = torch.argmax(similarities).item()

                results.append(candidate_labels[best_idx])

            return results

        except Exception as e:
            print(f"[ERROR] CLIP symbol classification failed: {e}")
            return [candidate_labels[0]] * len(crops) if candidate_labels else [None] * len(crops)
models/groq_client.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from config import Config
3
+
4
class GroqClient:
    """Thin wrapper around the Groq chat-completions client.

    The client is only created when an API key is available and the ``groq``
    package can be imported; otherwise ``is_available()`` returns False and
    ``generate_response`` returns an empty string.
    """

    def __init__(self):
        self.config = Config()
        self.api_key = self.config.GROQ_API_KEY or os.getenv("GROQ_API_KEY")
        self.client = None

        if self.api_key:
            try:
                from groq import Groq
                self.client = Groq(api_key=self.api_key)
                print("[INFO] Groq client initialized successfully")
            except ImportError:
                print("[WARN] groq package not installed. Run 'pip install groq'.")
            except Exception as e:
                print(f"[ERROR] Failed to initialize Groq client: {e}")
        else:
            print("[WARN] GROQ_API_KEY not found in configuration or environment.")

    def is_available(self) -> bool:
        """Check if Groq API client is available and configured"""
        return self.client is not None

    def generate_response(self, system_prompt: str, user_prompt: str, max_tokens: int = 1024) -> str:
        """Generate a response from the Groq LLM.

        Always returns a string: the model's reply, or "" when the client is
        unavailable, the API call fails, or the reply carries no content.
        """
        if not self.is_available():
            print("[WARN] GroqClient not available for generating response.")
            return ""

        try:
            # Use stable model name or configured fallback
            model = self.config.GROQ_MODEL
            # Common model fallbacks if config is generic or outdated
            # NOTE(review): this swaps out the configured default model; confirm it is still intended.
            if model == "openai/gpt-oss-120b":
                model = "llama-3.1-8b-instant"  # standard Groq model

            completion = self.client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=getattr(self.config, 'GROQ_TEMPERATURE', 0.7),
                max_completion_tokens=max_tokens,
            )
            # content may be None; normalise so the declared str return type holds.
            return completion.choices[0].message.content or ""
        except Exception as e:
            print(f"[ERROR] Groq API call failed: {e}")
            return ""
models/huggingface_models.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
+ from config import Config
4
+ from utils.gpu_diagnostics import log_model_device, register_processor, reclaim_vram_for
5
+
6
class HuggingFaceModels:
    """Lazy, CPU-only wrapper around the Seq2Seq hieroglyph translation model.

    ``translator`` is a callable that takes a prompt and returns a one-element
    list holding a dict with ``generated_text`` and ``translation_text`` keys.
    """

    def __init__(self):
        """Record configuration; the model is loaded on the first translation."""
        self.config = Config()
        self.device = torch.device("cpu")  # Force Egyptian translator to CPU to save GPU VRAM
        self._tokenizer = None
        self._model = None
        # Set after a failed load so later translations do not retry it.
        self._load_failed = False
        self.translator = self._translate_fn
        print("[INFO] Egyptian translator initialized (Forced to CPU)")

    def setup_translation_model(self):
        """Load the Seq2Seq model and tokenizer on CPU.

        On failure the partially loaded state is cleared, the failure is
        remembered, and ``translator`` is replaced by the empty-result mock.
        """
        model_name = getattr(self.config, 'HF_TRANSLATOR_MODEL', 'AnushS/Hieroglyph-Translator-Using-Gardiner-Codes')
        try:
            print(f"[INFO] Lazily loading Hugging Face translation model on CPU: {model_name}...")
            self._tokenizer = AutoTokenizer.from_pretrained(model_name)
            self._model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            self._model.to(self.device)
            self._model.eval()
            log_model_device("Egyptian T5 Translator", self.device)
            self._load_failed = False
            print("[INFO] Translation model loaded successfully on CPU (Seq2Seq direct)")
        except Exception as e:
            print(f"[ERROR] Failed to load translation model '{model_name}': {e}")
            self._tokenizer = None
            self._model = None
            self._load_failed = True
            self.translator = self._get_mock_translator()

    def _translate_fn(self, prompt, max_new_tokens=128, **kwargs):
        """Translate ``prompt`` with the Seq2Seq model on CPU.

        ``num_beams`` (default 4) and ``do_sample`` (default False) may be
        passed as keyword arguments. Returns an empty translation when the
        model could not be loaded or inference fails.
        """
        try:
            # ``getattr`` keeps this safe for instances created before the flag existed.
            if self._model is None and not getattr(self, "_load_failed", False):
                self.setup_translation_model()

            # Loading failed: answer with the empty result instead of calling None.
            if self._model is None or self._tokenizer is None:
                return [{"generated_text": "", "translation_text": ""}]

            inputs = self._tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.inference_mode():
                outputs = self._model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    num_beams=kwargs.get("num_beams", 4),
                    do_sample=kwargs.get("do_sample", False),
                )

            decoded = self._tokenizer.decode(outputs[0], skip_special_tokens=True)
            return [{"generated_text": decoded, "translation_text": decoded}]
        except Exception as e:
            print(f"[ERROR] Translation inference failed: {e}")
            return [{"generated_text": "", "translation_text": ""}]

    def get_translator(self):
        """Return the loaded translation function or mock fallback"""
        return self.translator

    def _get_mock_translator(self):
        """Return a callable that accepts any arguments and yields an empty translation."""
        print("[INFO] Setting up mock fallback translator")

        def mock_pipeline(prompt, *args, **kwargs):
            return [{"generated_text": "", "translation_text": ""}]

        return mock_pipeline
models/tesseract_ocr.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
class TesseractOCR:
    """
    Empty stand-in kept so that imports in app.py and test.py resolve.
    The processors themselves communicate directly with pytesseract.
    """

    def __init__(self):
        # Nothing to configure.
        return None
processors/__init__.py ADDED
File without changes
processors/base_processor.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from PIL import Image
3
+
4
class BaseScriptProcessor(ABC):
    """Abstract base for script processors.

    Subclasses supply the five stage methods; ``process_image`` chains them
    into a single result dict.
    """

    def __init__(self, groq_client, references, clip_classifier=None):
        """Store the shared collaborators and create the RAG and layout services."""
        self.groq_client = groq_client
        self.references = references
        self.clip_classifier = clip_classifier
        # Imported at construction time rather than at module import.
        from services.rag_service import RAGService
        from services.layout_parser import LayoutParser
        self.rag_service = RAGService()
        self.layout_parser = LayoutParser()

    @abstractmethod
    def detect_script(self, image_path):
        """Return ``(is_detected, confidence)`` for this script type."""
        pass

    @abstractmethod
    def extract_text(self, image_path):
        """Return the text/symbols found in the image."""
        pass

    @abstractmethod
    def process_text(self, extracted_text):
        """Turn extracted text into the processor's structured result."""
        pass

    @abstractmethod
    def generate_historical_context(self, processed_text):
        """Produce historical context for the processed text."""
        pass

    @abstractmethod
    def generate_story(self, processed_text):
        """Produce a creative story for the processed text."""
        pass

    def process_image(self, image_path):
        """Run detect -> extract -> process -> context/story.

        Returns the result dict, or ``None`` when the script is not detected,
        nothing is extracted, or any stage raises.
        """
        try:
            detected, confidence = self.detect_script(image_path)
            if not detected:
                return None

            extracted_text = self.extract_text(image_path)
            if not extracted_text:
                return None

            processed_result = self.process_text(extracted_text)
            historical_context = self.generate_historical_context(processed_result)
            creative_story = self.generate_story(processed_result)

            # "FooProcessor" -> "foo"
            script_type = type(self).__name__.replace("Processor", "").lower()
            result = {
                "script_type": script_type,
                "confidence": confidence,
                "extracted_text": extracted_text,
                "processed_result": processed_result,
                "historical_context": historical_context,
                "creative_story": creative_story,
            }
            return result

        except Exception as e:
            print(f"[ERROR] Processing failed in {self.__class__.__name__}: {e}")
            return None
processors/cuneiform_processor.py ADDED
@@ -0,0 +1,804 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ import re
5
+ import time
6
+ from PIL import Image, ImageEnhance, ImageFilter
7
+ import torch
8
+ from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForSeq2SeqLM
9
+ from .base_processor import BaseScriptProcessor
10
+ from utils.text_utils import is_gibberish
11
+
12
+ BACKEND_MODELS_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "models"))
13
+ CUNEIFORM_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "cuneiform")
14
+
15
+
16
+ class CuneiformProcessor(BaseScriptProcessor):
17
def __init__(self, groq_client, references, clip_classifier):
    """Initialise shared services and leave both models unloaded.

    CLIP and the translator are loaded later by their ``setup_*`` methods.
    """
    super().__init__(groq_client, references, clip_classifier)
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # CLIP recognition state
    self.clip_model, self.clip_processor = None, None
    self.clip_available = False

    # Translation model state
    self.cuneiform_model, self.cuneiform_tokenizer = None, None
    self.translator_available = False

    # Register for dynamic VRAM management
    from utils.gpu_diagnostics import register_processor
    register_processor("cuneiform", self)
32
+
33
@property
def cuneiform_available(self):
    """Availability flag read by ScriptDetectionService; constant ``True``."""
    # Models are loaded on demand, so the processor is always reported as usable.
    return True
38
+
39
+
40
def setup_cuneiform_clip(self):
    """Load CLIP and the label sets used for cuneiform visual recognition.

    On success sets ``clip_processor``, ``clip_model``, ``cuneiform_signs``,
    ``tablet_layouts`` and ``clip_available = True``. On any failure the
    model and processor are reset to ``None`` so that a later call starts
    from a clean state instead of keeping a half-initialised model.
    """
    try:
        from utils.gpu_diagnostics import reclaim_vram_for
        reclaim_vram_for("cuneiform")

        print("[INFO] Lazily loading CLIP for cuneiform visual recognition...")

        # Use a powerful CLIP model for better ancient script understanding
        model_name = "openai/clip-vit-large-patch14"

        self.clip_processor = CLIPProcessor.from_pretrained(model_name)
        self.clip_model = CLIPModel.from_pretrained(model_name)

        self.clip_model.to(self.device)
        self.clip_model.eval()  # Put in evaluation mode

        from utils.gpu_diagnostics import log_model_device
        log_model_device("Cuneiform CLIP Recognition", self.device)

        # Text prompts CLIP scores the image against (sign and genre labels).
        self.cuneiform_signs = [
            "ancient Sumerian cuneiform sign AN meaning god or heaven",
            "ancient Akkadian cuneiform sign LUGAL meaning king or ruler",
            "ancient cuneiform sign KI meaning earth or place",
            "ancient cuneiform sign DINGIR divine determinative marker",
            "ancient cuneiform sign UD meaning day or sun",
            "ancient cuneiform sign E meaning house or temple",
            "ancient cuneiform sign EN meaning lord or priest",
            "ancient cuneiform sign NIN meaning lady or queen",
            "ancient cuneiform administrative record with numbers",
            "ancient cuneiform legal contract or treaty text",
            "ancient cuneiform royal inscription or decree",
            "ancient cuneiform literary or mythological text",
            "ancient cuneiform school exercise or practice tablet"
        ]

        # Tablet layout descriptions for structural analysis
        self.tablet_layouts = [
            "clay tablet with cuneiform text arranged in horizontal lines",
            "cuneiform tablet with vertical column organization",
            "administrative record tablet with numerical entries",
            "legal document tablet with witness signatures",
            "literary tablet with continuous narrative text",
            "damaged or fragmentary cuneiform tablet",
            "clear well-preserved cuneiform inscription",
            "practice tablet with student exercises"
        ]

        print(f"[INFO] CLIP cuneiform recognition loaded on {self.device}")
        print("[INFO] Using visual pattern recognition instead of character OCR")
        self.clip_available = True

    except Exception as e:
        print(f"[ERROR] CLIP cuneiform setup failed: {e}")
        # Drop partial state so callers that test ``clip_model is None`` retry the load.
        self.clip_model = None
        self.clip_processor = None
        self.clip_available = False
96
+
97
def setup_praeclarum_translator(self):
    """Load the ``praeclarum/cuneiform`` Seq2Seq model and its tokenizer.

    Sets ``translator_available = True`` on success. On any failure the
    model and tokenizer are reset to ``None`` so that a later call starts
    from a clean state instead of keeping a half-initialised model.
    """
    try:
        from utils.gpu_diagnostics import reclaim_vram_for
        reclaim_vram_for("cuneiform")

        print("[INFO] Lazily loading praeclarum cuneiform translation model...")

        self.cuneiform_tokenizer = AutoTokenizer.from_pretrained(
            "praeclarum/cuneiform",
            cache_dir=CUNEIFORM_MODEL_DIR
        )
        self.cuneiform_model = AutoModelForSeq2SeqLM.from_pretrained(
            "praeclarum/cuneiform",
            cache_dir=CUNEIFORM_MODEL_DIR
        )
        self.cuneiform_model.to(self.device)
        self.cuneiform_model.eval()  # Put in evaluation mode

        from utils.gpu_diagnostics import log_model_device
        log_model_device("Cuneiform Translator (Praeclarum T5)", self.device)

        self.translator_available = True
        print("[INFO] Cuneiform translator ready for CLIP-recognized content")

    except Exception as e:
        print(f"[ERROR] Translation model setup failed: {e}")
        # Drop partial state so callers that test ``cuneiform_model is None`` retry the load.
        self.cuneiform_model = None
        self.cuneiform_tokenizer = None
        self.translator_available = False
125
+
126
def detect_script(self, image_path):
    """Report whether this processor can handle the image.

    The image itself is not inspected here. CLIP is loaded on first use,
    matching ``extract_text``, so the answer does not depend on another
    method having run first.

    Returns:
        ``(True, 0.95)`` when CLIP is usable, otherwise ``(False, 0.0)``.
    """
    try:
        if self.clip_model is None:
            self.setup_cuneiform_clip()

        if not self.clip_available:
            print("[ERROR] No cuneiform processing engines available")
            return False, 0.0

        print(f"[INFO] Cuneiform processor activated - Using CLIP visual recognition")
        return True, 0.95

    except Exception as e:
        print(f"[ERROR] Cuneiform detection failed: {e}")
        return False, 0.0
139
+
140
def extract_text(self, image_path):
    """Describe the tablet content using CLIP rather than character OCR.

    Returns the ATF-like description when CLIP sign analysis has confidence
    above 0.3, otherwise the tablet-layout description, otherwise a
    ``CUNEIFORM_...`` sentinel string explaining the failure.
    """
    if self.clip_model is None:
        self.setup_cuneiform_clip()
    else:
        from utils.gpu_diagnostics import reclaim_vram_for
        reclaim_vram_for("cuneiform")
        # Compare device *types*: a parameter reports 'cuda:0' while the
        # target is plain 'cuda', so comparing strings always mismatched.
        current_device = next(self.clip_model.parameters()).device
        if current_device.type != self.device.type:
            print(f"[VRAM MANAGER] Activating Cuneiform CLIP model on {self.device}...")
            self.clip_model.to(self.device)

    if not getattr(self, 'clip_available', False) or self.clip_model is None:
        return "CUNEIFORM_CLIP_FAILED: Visual recognition model not available"

    try:
        start_time = time.time()

        # Method 1: CLIP-based visual analysis
        print("[INFO] Analyzing cuneiform using CLIP visual recognition...")
        visual_analysis = self._analyze_cuneiform_with_clip(image_path)

        if visual_analysis and visual_analysis['confidence'] > 0.3:
            processing_time = time.time() - start_time
            print(f"[SUCCESS] CLIP visual analysis completed in {processing_time:.2f}s")
            return visual_analysis['description']

        # Method 2: Fallback to basic tablet description
        tablet_description = self._describe_tablet_layout(image_path)
        if tablet_description:
            return tablet_description

        return "CUNEIFORM_VISUAL_ANALYSIS_INCOMPLETE: Clay tablet detected but content analysis requires higher resolution or clearer image"

    except Exception as e:
        print(f"[ERROR] CLIP cuneiform analysis failed: {e}")
        return f"CUNEIFORM_ERROR: {str(e)}"
176
+
177
+ def _analyze_cuneiform_with_clip(self, image_path):
178
+ """Use CLIP to analyze cuneiform content visually"""
179
+ try:
180
+ image = Image.open(image_path).convert("RGB")
181
+
182
+ # Enhanced preprocessing for CLIP analysis
183
+ enhanced_image = self._preprocess_for_clip_analysis(image)
184
+
185
+ # CLIP classification of cuneiform content
186
+ print("[INFO] Running CLIP classification on cuneiform signs...")
187
+
188
+ inputs = self.clip_processor(
189
+ text=self.cuneiform_signs,
190
+ images=enhanced_image,
191
+ return_tensors="pt",
192
+ padding=True
193
+ ).to(self.device)
194
+
195
+ with torch.inference_mode():
196
+ outputs = self.clip_model(**inputs)
197
+ logits_per_image = outputs.logits_per_image
198
+ probs = logits_per_image.softmax(dim=1)
199
+
200
+ # Get top predictions
201
+ top_probs, top_indices = torch.topk(probs, k=3)
202
+
203
+ # Build description based on CLIP analysis
204
+ descriptions = []
205
+ confidences = []
206
+
207
+ for i, (prob, idx) in enumerate(zip(top_probs[0], top_indices[0])):
208
+ if prob > 0.2: # Reasonable confidence threshold
209
+ sign_desc = self.cuneiform_signs[idx]
210
+ descriptions.append(sign_desc)
211
+ confidences.append(prob.item())
212
+ print(f"[INFO] CLIP detected: {sign_desc} (confidence: {prob:.3f})")
213
+
214
+ if descriptions:
215
+ # Convert visual analysis to ATF-like description
216
+ atf_description = self._convert_visual_to_atf(descriptions, confidences)
217
+
218
+ return {
219
+ 'description': atf_description,
220
+ 'confidence': max(confidences),
221
+ 'visual_elements': descriptions,
222
+ 'method': 'CLIP_visual_analysis'
223
+ }
224
+
225
+ return None
226
+
227
+ except Exception as e:
228
+ print(f"[ERROR] CLIP cuneiform analysis failed: {e}")
229
+ return None
230
+
231
+ def _preprocess_for_clip_analysis(self, image):
232
+ """Preprocess image specifically for CLIP cuneiform analysis"""
233
+ try:
234
+ # Convert to numpy for OpenCV processing
235
+ image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
236
+
237
+ # Enhance for CLIP visual understanding
238
+ # 1. Increase contrast to make wedges more visible
239
+ lab = cv2.cvtColor(image_cv, cv2.COLOR_BGR2LAB)
240
+ l_channel, a, b = cv2.split(lab)
241
+
242
+ # Apply CLAHE to lightness channel
243
+ clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
244
+ l_channel = clahe.apply(l_channel)
245
+
246
+ # Merge back
247
+ enhanced_lab = cv2.merge((l_channel, a, b))
248
+ enhanced_bgr = cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)
249
+
250
+ # 2. Sharpen edges to help CLIP see wedge boundaries
251
+ kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
252
+ sharpened = cv2.filter2D(enhanced_bgr, -1, kernel)
253
+
254
+ # Convert back to PIL RGB
255
+ enhanced_rgb = cv2.cvtColor(sharpened, cv2.COLOR_BGR2RGB)
256
+ return Image.fromarray(enhanced_rgb)
257
+
258
+ except Exception as e:
259
+ print(f"[WARN] CLIP preprocessing failed: {e}")
260
+ return image
261
+
262
+ def _convert_visual_to_atf(self, visual_descriptions, confidences):
263
+ """Convert CLIP visual analysis to ATF-like transliteration"""
264
+ try:
265
+ atf_elements = []
266
+
267
+ for desc, conf in zip(visual_descriptions, confidences):
268
+ desc_lower = desc.lower()
269
+
270
+ # Map visual descriptions to ATF transliterations
271
+ if 'lugal' in desc_lower or 'king' in desc_lower:
272
+ atf_elements.append('lugal')
273
+ elif 'an' in desc_lower or 'god' in desc_lower or 'heaven' in desc_lower:
274
+ atf_elements.append('an')
275
+ elif 'ki' in desc_lower or 'earth' in desc_lower or 'place' in desc_lower:
276
+ atf_elements.append('ki')
277
+ elif 'dingir' in desc_lower or 'divine' in desc_lower:
278
+ atf_elements.append('{d}')
279
+ elif 'ud' in desc_lower or 'day' in desc_lower or 'sun' in desc_lower:
280
+ atf_elements.append('ud')
281
+ elif 'e' in desc_lower and ('house' in desc_lower or 'temple' in desc_lower):
282
+ atf_elements.append('e2')
283
+ elif 'en' in desc_lower and 'lord' in desc_lower:
284
+ atf_elements.append('en')
285
+ elif 'nin' in desc_lower and ('lady' in desc_lower or 'queen' in desc_lower):
286
+ atf_elements.append('nin')
287
+ elif 'administrative' in desc_lower or 'numbers' in desc_lower:
288
+ atf_elements.extend(['1(disz)', '2(disz)', 'sze'])
289
+ elif 'royal' in desc_lower or 'inscription' in desc_lower:
290
+ atf_elements.extend(['lugal', 'kur', 'kur'])
291
+ elif 'legal' in desc_lower or 'contract' in desc_lower:
292
+ atf_elements.extend(['kiszib3', 'mu', 'pad'])
293
+ elif 'literary' in desc_lower or 'mythological' in desc_lower:
294
+ atf_elements.extend(['en', 'dingir', 'kur'])
295
+ elif 'school' in desc_lower or 'practice' in desc_lower:
296
+ atf_elements.extend(['a', 'ba', 'ka', 'la'])
297
+
298
+ # Build coherent ATF string
299
+ if atf_elements:
300
+ # Add line structure typical of cuneiform tablets
301
+ atf_text = f"1. {' '.join(atf_elements[:3])}"
302
+ if len(atf_elements) > 3:
303
+ atf_text += f"\n2. {' '.join(atf_elements[3:6])}"
304
+ if len(atf_elements) > 6:
305
+ atf_text += f"\n3. {' '.join(atf_elements[6:])}"
306
+
307
+ return atf_text
308
+ else:
309
+ return "cuneiform tablet content analysis incomplete"
310
+
311
+ except Exception as e:
312
+ print(f"[ERROR] Visual to ATF conversion failed: {e}")
313
+ return "visual analysis available but ATF conversion failed"
314
+
315
+ def _describe_tablet_layout(self, image_path):
316
+ """Describe tablet layout and structure using CLIP"""
317
+ try:
318
+ image = Image.open(image_path).convert("RGB")
319
+
320
+ inputs = self.clip_processor(
321
+ text=self.tablet_layouts,
322
+ images=image,
323
+ return_tensors="pt",
324
+ padding=True
325
+ ).to(self.device)
326
+
327
+ with torch.inference_mode():
328
+ outputs = self.clip_model(**inputs)
329
+ probs = outputs.logits_per_image.softmax(dim=1)
330
+
331
+ # Get best layout description
332
+ best_idx = torch.argmax(probs)
333
+ best_desc = self.tablet_layouts[best_idx]
334
+ confidence = probs[0][best_idx].item()
335
+
336
+ print(f"[INFO] Tablet layout: {best_desc} (confidence: {confidence:.3f})")
337
+
338
+ if confidence > 0.4:
339
+ return f"tablet_layout: {best_desc}"
340
+
341
+ return "tablet_layout: unidentified cuneiform tablet structure"
342
+
343
+ except Exception as e:
344
+ print(f"[ERROR] Tablet layout analysis failed: {e}")
345
+ return "tablet_layout: analysis_failed"
346
+
347
def translate_cuneiform(self, cuneiform_text):
    """Translate CLIP-derived cuneiform content with the praeclarum model.

    Sentinel inputs (empty text, ``CUNEIFORM_...`` failure markers and
    ``tablet_layout:`` descriptions) are answered before the translator is
    loaded, so a failed analysis never triggers a model load and is never
    fed to the model.

    Returns:
        The translation, or a human-readable message explaining why no
        translation was produced.
    """
    failure_prefixes = (
        "CUNEIFORM_CLIP_FAILED",
        "CUNEIFORM_ERROR:",
        "CUNEIFORM_VISUAL_ANALYSIS_INCOMPLETE",
    )
    if not cuneiform_text or cuneiform_text.startswith(failure_prefixes):
        return "Translation failed: Visual analysis could not identify cuneiform content"

    if cuneiform_text.startswith("tablet_layout:"):
        layout_desc = cuneiform_text.replace("tablet_layout: ", "")
        return f"Visual analysis indicates: {layout_desc}. Specific text translation requires clearer wedge visibility."

    if self.cuneiform_model is None:
        self.setup_praeclarum_translator()
    else:
        from utils.gpu_diagnostics import reclaim_vram_for
        reclaim_vram_for("cuneiform")
        # Compare device *types*: a parameter reports 'cuda:0' while the
        # target is plain 'cuda', so comparing strings always mismatched.
        current_device = next(self.cuneiform_model.parameters()).device
        if current_device.type != self.device.type:
            print(f"[VRAM MANAGER] Activating Cuneiform Translator model on {self.device}...")
            self.cuneiform_model.to(self.device)

    if not getattr(self, 'translator_available', False) or self.cuneiform_model is None:
        return "Translation unavailable - praeclarum model not loaded"

    try:
        print(f"[INFO] Translating CLIP-analyzed content: {cuneiform_text[:50]}...")

        # Use the praeclarum model for translation
        inputs = self.cuneiform_tokenizer(
            cuneiform_text,
            return_tensors="pt",
            max_length=512,
            truncation=True
        ).input_ids.to(self.device)

        with torch.inference_mode():
            outputs = self.cuneiform_model.generate(
                inputs,
                max_new_tokens=200,
                do_sample=True,
                top_k=30,
                top_p=0.95,
                temperature=0.7,
                pad_token_id=self.cuneiform_tokenizer.eos_token_id
            )

        translation = self.cuneiform_tokenizer.decode(
            outputs[0],
            skip_special_tokens=True
        )

        translation = self._post_process_translation(translation)

        if translation.strip():
            print(f"[INFO] CLIP+Translation completed: {translation[:100]}...")
            return translation
        else:
            return "Visual analysis successful, but textual translation inconclusive. This may be a non-textual or damaged tablet section."

    except Exception as e:
        print(f"[ERROR] Translation of CLIP content failed: {e}")
        return f"Visual analysis successful, translation error: {str(e)}"
407
+
408
+ def _post_process_translation(self, translation):
409
+ """Post-process cuneiform translation"""
410
+ try:
411
+ # Clean up common translation artifacts
412
+ cleaned = translation.strip()
413
+
414
+ # Check for dots-only output (failed translation)
415
+ if cleaned in ["", "...", ". . .", "... ... ..."] or cleaned.count('.') > len(cleaned) * 0.8:
416
+ print(f"[WARN] Translation appears to be dots/empty, marking as failed")
417
+ return ""
418
+
419
+ # Remove any input text that might have been echoed
420
+ if cleaned.startswith(('lugal', 'an ', 'ki ', 'dingir')):
421
+ lines = cleaned.split('\n')
422
+ for line in lines:
423
+ if not any(line.lower().startswith(pattern) for pattern in ['lugal', 'an ', 'ki ']):
424
+ if len(line.strip()) > 10:
425
+ cleaned = line.strip()
426
+ break
427
+
428
+ # Capitalize first letter
429
+ if cleaned and not cleaned[0].isupper():
430
+ cleaned = cleaned[0].upper() + cleaned[1:]
431
+
432
+ return cleaned
433
+
434
+ except Exception as e:
435
+ print(f"[WARN] Translation post-processing failed: {e}")
436
+ return translation
437
+
438
def process_text(self, cuneiform_text):
    """Wrap extracted cuneiform content in a structured result.

    Returns a dict with ``text``, ``symbols``, ``char_analysis`` and
    ``validation``. Empty input yields empty containers; ``CUNEIFORM_...``
    sentinels yield a failure record; layout descriptions and ATF-like
    content each get their own ``char_analysis`` shape.
    """
    if not cuneiform_text:
        return {"text": "", "symbols": [], "char_analysis": {}, "validation": {}}

    print("[INFO] Processing cuneiform text with CLIP visual analysis...")

    # Sentinel strings produced when extraction failed
    error_markers = ("CUNEIFORM_CLIP_FAILED", "CUNEIFORM_ERROR:", "CUNEIFORM_VISUAL_ANALYSIS_INCOMPLETE")
    if cuneiform_text.startswith(error_markers):
        failed_analysis = {
            "total_chars": 0,
            "error": "CLIP visual analysis failed",
            "text_format": "Error"
        }
        failed_validation = {
            "quality_score": 0.0,
            "confidence_level": "Failed",
            "ocr_method": "CLIP Visual Recognition (Failed)",
            "error": cuneiform_text
        }
        return {
            "text": cuneiform_text,
            "symbols": [],
            "char_analysis": failed_analysis,
            "validation": failed_validation
        }

    if cuneiform_text.startswith("tablet_layout:"):
        # Layout description: there are no sign symbols to report
        symbols = ""
        char_analysis = {
            "total_chars": len(cuneiform_text),
            "layout_analysis": True,
            "text_format": "Layout Description"
        }
    else:
        # ATF-like content: keep alphanumerics and ATF punctuation only
        kept = [ch for ch in cuneiform_text if ch.isalnum() or ch in "{}[]().-"]
        symbols = ''.join(kept)
        token_count = len(cuneiform_text.split())
        char_analysis = {
            "total_chars": len(cuneiform_text),
            "atf_elements": token_count,
            "unique_chars": len(set(cuneiform_text)),
            "word_count": token_count,
            "text_format": "CLIP Visual Analysis + ATF"
        }

    validation = {
        "quality_score": self._calculate_clip_quality_score(cuneiform_text),
        "recognition_method": "CLIP Visual Pattern Recognition",
        "model_specialization": "Large-scale Vision Transformer for Ancient Scripts",
        "clip_analysis": True,
        "supports_translation": self.translator_available,
        "input_format": char_analysis.get("text_format", "Unknown"),
        "confidence_level": self._determine_confidence_level(cuneiform_text)
    }

    return {
        "text": cuneiform_text,
        "symbols": symbols,
        "char_analysis": char_analysis,
        "validation": validation
    }
500
+
501
+ def _calculate_clip_quality_score(self, text):
502
+ """Calculate quality score for CLIP-analyzed text"""
503
+ if not text:
504
+ return 0.0
505
+
506
+ score = 0.0
507
+
508
+ # Layout analysis bonus
509
+ if text.startswith("tablet_layout:"):
510
+ score = 0.7 # Good layout analysis
511
+
512
+ # ATF content bonuses
513
+ elif any(pattern in text.lower() for pattern in ['lugal', 'an', 'ki', 'dingir', '{d}', 'e2']):
514
+ score += 0.8 # High quality CLIP recognition
515
+
516
+ # Multiple lines bonus
517
+ if '\n' in text:
518
+ score += 0.1
519
+
520
+ # Coherent structure bonus
521
+ words = text.split()
522
+ if len(words) >= 3:
523
+ score += 0.1
524
+
525
+ # Error penalty
526
+ elif text.startswith(("CUNEIFORM_", "visual analysis", "tablet content")):
527
+ score = 0.3 # Some recognition but incomplete
528
+
529
+ return max(0.0, min(1.0, score))
530
+
531
+ def _determine_confidence_level(self, text):
532
+ """Determine confidence level for CLIP analysis"""
533
+ score = self._calculate_clip_quality_score(text)
534
+
535
+ if score >= 0.8:
536
+ return "Very High"
537
+ elif score >= 0.6:
538
+ return "High"
539
+ elif score >= 0.4:
540
+ return "Medium"
541
+ elif score >= 0.2:
542
+ return "Low"
543
+ else:
544
+ return "Very Low"
545
+
546
def process_image(self, image_path):
    """Run the cuneiform pipeline: extract, process, context, story.

    Unlike the base pipeline this does not call ``detect_script`` and does
    not stop on empty extraction. The result includes ``extracted_text`` so
    its keys match those returned by the base-class pipeline.

    Returns:
        The result dict, with ``confidence`` taken from the validation
        quality score (0.0 when absent), or ``None`` if any stage raises.
    """
    try:
        print(f"[INFO] Processing cuneiform image: {image_path}")

        # Extract text using CLIP
        extracted_text = self.extract_text(image_path)

        # Process the extracted content
        processed_result = self.process_text(extracted_text)

        # Generate historical context
        historical_context = self.generate_historical_context(processed_result)

        # Generate creative story
        creative_story = self.generate_story(processed_result)

        return {
            'script_type': 'cuneiform',
            'confidence': processed_result['validation'].get('quality_score', 0.0),
            'extracted_text': extracted_text,
            'processed_result': processed_result,
            'historical_context': historical_context,
            'creative_story': creative_story
        }

    except Exception as e:
        print(f"[ERROR] Cuneiform image processing failed: {e}")
        return None
574
+
575
def generate_historical_context(self, processed_result):
    """Assemble the uses box, meaning box and references for the text.

    Reference lookup is queried with every ``\\w+`` word of the text followed
    by every non-whitespace character of it.
    """
    cuneiform_text = processed_result.get("text", "")

    groq_detail = self._generate_groq_context(cuneiform_text)

    # Query terms: words first, then individual characters
    query_terms = []
    if cuneiform_text:
        query_terms += re.findall(r'\w+', cuneiform_text)
        query_terms += [ch for ch in cuneiform_text if ch.strip()]
    refs = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)

    uses_box = {
        "title": "Cuneiform symbols and their ancient usage",
        "items": self._build_uses_list(cuneiform_text)
    }
    meaning_box = self._build_meaning_box(cuneiform_text, groq_detail, processed_result)

    return {
        "uses_box": uses_box,
        "meaning_box": meaning_box,
        "references": refs
    }
596
+
597
+ def _generate_groq_context(self, cuneiform_text):
598
+ """Generate contextual information using Groq"""
599
+ if not self.groq_client.is_available():
600
+ return "(Groq unavailable) Historical context generation requires GROQ_API_KEY and groq package."
601
+
602
+ if cuneiform_text.startswith(("CUNEIFORM_", "tablet_layout:")):
603
+ prompt = (
604
+ "This appears to be a cuneiform clay tablet analyzed using computer vision. "
605
+ "Provide a concise, scholarly paragraph (6-10 sentences) covering the history of cuneiform writing, "
606
+ "its use in ancient Mesopotamia, common contexts (administrative, legal, literary), "
607
+ "and the languages it represented (Sumerian, Akkadian, etc.). Include information about "
608
+ "clay tablet creation, scribal practices, and the significance of cuneiform in ancient civilizations."
609
+ )
610
+ else:
611
+ prompt = (
612
+ f"Analyze this cuneiform content identified through visual analysis: {cuneiform_text}\n\n"
613
+ f"Provide a scholarly paragraph (6-10 sentences) on its likely historical context, "
614
+ f"period (3200 BCE to 100 CE), probable purpose (administrative, legal, literary, religious), "
615
+ f"language (Sumerian/Akkadian/other), and cultural significance in ancient Mesopotamian civilization. "
616
+ f"Consider that this was analyzed using AI vision recognition rather than traditional transliteration."
617
+ )
618
+
619
+ system_prompt = "You are an expert Assyriologist and ancient Near Eastern historian. Provide accurate, concise scholarly analysis of cuneiform texts, focusing on historical context, linguistic analysis, and cultural significance."
620
+ enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, cuneiform_text)
621
+
622
+ return self.groq_client.generate_response(
623
+ system_prompt=enriched_system_prompt,
624
+ user_prompt=prompt
625
+ ) or "(Historical context unavailable due to Groq error)"
626
+
627
+ def _build_uses_list(self, cuneiform_text):
628
+ """Build list of cuneiform symbol uses"""
629
+
630
+ # Handle error messages
631
+ if cuneiform_text.startswith(("CUNEIFORM_", "tablet_layout:")):
632
+ return [
633
+ "- Visual analysis attempted but content recognition incomplete",
634
+ "- This may be due to image quality, tablet damage, or complex wedge patterns",
635
+ "- CLIP visual recognition specializes in identifying cuneiform sign types and layouts",
636
+ "- For detailed transliteration, consider using CDLI tools or consulting cuneiform specialists"
637
+ ]
638
+
639
+ notes = self.references.get("cuneiform_symbol_notes", {}) or {}
640
+ default_hint = self.references.get("cuneiform_hint",
641
+ "Cuneiform signs represent syllables, words, or concepts in ancient Mesopotamian languages")
642
+
643
+ seen = set()
644
+ items = []
645
+
646
+ # Process ATF elements
647
+ for element in cuneiform_text.split():
648
+ if element in seen or not element.strip():
649
+ continue
650
+ seen.add(element)
651
+
652
+ if element in notes:
653
+ note = notes[element]
654
+ else:
655
+ note = default_hint
656
+
657
+ items.append(f"- {element}: {note}")
658
+
659
+ if not items:
660
+ items.append("- Analysis incomplete: CLIP visual recognition in progress")
661
+
662
+ return items[:15] # Limit display
663
+
664
+ def _build_meaning_box(self, cuneiform_text, groq_detail, processed_result):
665
+ """Build meaning interpretation box for cuneiform"""
666
+ char_analysis = processed_result.get("char_analysis", {})
667
+ validation = processed_result.get("validation", {})
668
+
669
+ # Build introduction with CLIP context
670
+ text_format = char_analysis.get("text_format", "Unknown")
671
+ confidence = validation.get("confidence_level", "Unknown")
672
+
673
+ intro_lines = [
674
+ f"Cuneiform processed using CLIP visual recognition with confidence: {confidence}.",
675
+ ]
676
+
677
+ if validation.get("clip_analysis"):
678
+ intro_lines.extend([
679
+ "Analysis powered by OpenAI CLIP Vision Transformer (Large) for ancient script recognition.",
680
+ "Visual pattern recognition identifies cuneiform signs, layouts, and tablet structures."
681
+ ])
682
+
683
+ if self.translator_available:
684
+ intro_lines.append("Translation provided by praeclarum/cuneiform model trained on 210,247 examples.")
685
+
686
+ # Add format-specific information
687
+ if text_format == "Layout Description":
688
+ intro_lines.append("Tablet structure and organization analyzed through computer vision.")
689
+ elif text_format == "CLIP Visual Analysis + ATF":
690
+ intro_lines.append("Visual elements converted to ATF transliteration format.")
691
+
692
+ # Analysis points
693
+ points = []
694
+
695
+ points.extend([
696
+ "• CLIP Vision Transformer provides advanced visual understanding of cuneiform wedge patterns.",
697
+ "• Model trained on large-scale image-text datasets enables zero-shot cuneiform recognition.",
698
+ "• Visual analysis identifies sign types, tablet layouts, and manuscript characteristics."
699
+ ])
700
+
701
+ if validation.get("supports_translation"):
702
+ points.append("• Recognized visual elements translated using specialized Mesopotamian language models.")
703
+
704
+ if text_format == "Layout Description":
705
+ points.append("• Tablet structure analysis indicates overall document type and organization.")
706
+
707
+ layout_analysis = char_analysis.get("layout_analysis", False)
708
+ if layout_analysis:
709
+ points.append("• Computer vision successfully identified tablet layout and structural elements.")
710
+
711
+ if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
712
+ points.append(f"• Historical analysis: {groq_detail.strip()}")
713
+
714
+ # Extract key elements for frequent display
715
+ if text_format == "CLIP Visual Analysis + ATF":
716
+ frequent_elements = cuneiform_text.split()[:10]
717
+ else:
718
+ frequent_elements = ["Visual", "Analysis", "CLIP", "Recognition"]
719
+
720
+ return {
721
+ "title": "Cuneiform visual analysis:",
722
+ "intro_lines": intro_lines,
723
+ "frequent_label": "Key elements identified",
724
+ "frequent": frequent_elements,
725
+ "points": points
726
+ }
727
+
728
def generate_story(self, processed_result):
    """Generate a historical narrative for a processed cuneiform result.

    Args:
        processed_result: Mapping with optional ``text`` (recognised
            content) and ``char_analysis`` (whose ``text_format`` steers
            the narrative style). Missing or ``None`` values are tolerated.

    Returns:
        The story returned by the Groq client, or a fixed fallback message
        when the client is unavailable or the reply is empty/gibberish.
    """
    import random  # local import, as in the original method body

    # ``or ""`` also covers an explicit None stored under "text", which
    # previously crashed on ``.lower()``.
    cuneiform_text = processed_result.get("text") or ""

    if not self.groq_client.is_available():
        return "Groq client unavailable, cannot generate historical narrative."

    char_analysis = processed_result.get("char_analysis") or {}
    text_format = char_analysis.get("text_format", "Unknown")

    # Lower-case once; the keyword checks below are case-insensitive.
    lowered = cuneiform_text.lower()

    # Choose appropriate narrative style based on CLIP analysis
    if "lugal" in lowered or "royal" in lowered:
        styles = [
            "as a royal inscription from the court of Hammurabi",
            "as a victory stela from ancient Assyria",
            "as a chronicle of Mesopotamian kings",
            "as a royal decree from Nebuchadnezzar's reign"
        ]
    elif "administrative" in lowered or "numbers" in lowered:
        styles = [
            "as a merchant's inventory from ancient Babylon",
            "as a tax record from a Sumerian temple",
            "as a grain distribution list from Ur",
            "as an administrative archive from Mari"
        ]
    elif text_format == "Layout Description":
        styles = [
            "as a damaged tablet discovered in archaeological excavation",
            "as a mysterious cuneiform fragment found in ancient ruins",
            "as a clay tablet uncovered in a Mesopotamian library",
            "as an ancient document preserved in palace archives"
        ]
    else:
        styles = [
            "as a scribe's practice tablet from ancient Sumer",
            "as a legal contract from Babylonian courts",
            "as a temple inscription from Mesopotamia",
            "as a literary work from the ancient Near East"
        ]

    chosen_style = random.choice(styles)
    # The seed is only embedded in the prompt text to vary the output.
    seed = random.randint(1000, 9999)

    processing_note = "analyzed through advanced computer vision AI specialized in ancient scripts"

    prompt = (
        f"This cuneiform tablet was {processing_note}: {cuneiform_text[:100]}...\n\n"
        f"Historical context: This represents one of humanity's oldest writing systems, "
        f"used across ancient Mesopotamia from 3200 BCE to 100 CE.\n\n"
        f"Create a vivid, historically accurate narrative (250+ words) set in ancient Mesopotamia, "
        f"telling the story of this cuneiform tablet's creation and significance. "
        f"Write {chosen_style}.\n\n"
        f"Include: Clay tablet creation process, scribe's daily life, the tablet's importance "
        f"to ancient Mesopotamian society, and authentic historical details of Sumerian/Babylonian/Assyrian culture.\n"
        f"Narrative seed: {seed}"
    )

    system_prompt = (
        "You are a master storyteller and Assyriologist specializing in ancient Mesopotamian "
        "history, cuneiform literature, and daily life in Sumerian, Babylonian, and Assyrian "
        "civilizations. Create authentic, engaging narratives that reflect accurate knowledge "
        "of ancient Near Eastern cultures, writing practices, and social contexts."
    )

    story = self.groq_client.generate_response(
        system_prompt=system_prompt,
        user_prompt=prompt
    )

    if not story or is_gibberish(story):
        return "Failed to generate historical narrative; ancient Mesopotamian story creation unavailable."

    return story
processors/egyptian_processor.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ import base64
4
+ import json
5
+ from PIL import Image
6
+ from io import BytesIO
7
+ from itertools import groupby
8
+ from collections import Counter
9
+ from .base_processor import BaseScriptProcessor
10
+ from utils.image_utils import segment_hieroglyphs
11
+ from utils.text_utils import is_gibberish, build_description_from_codes
12
+ from config import Config
13
+
14
+ class EgyptianProcessor(BaseScriptProcessor):
15
def __init__(self, groq_client, references, clip_classifier, translator_pipe):
    """Store the collaborators used by the Egyptian processor.

    ``groq_client`` and ``references`` are forwarded to the base class;
    the CLIP classifier, translation pipeline and a fresh ``Config`` are
    kept on the instance.
    """
    super().__init__(groq_client, references)

    # The three assignments are independent of one another.
    self.config = Config()
    self.translator_pipe = translator_pipe
    self.clip_classifier = clip_classifier
20
+
21
def detect_script(self, image_path):
    """Accept the image as Egyptian without inspecting it.

    The image itself is never opened here; the method logs and accepts,
    its log message attributing the classification to Groq Vision.

    Returns:
        ``(True, 0.95)`` normally, or ``(False, 0.0)`` if logging raises.
    """
    try:
        print("[INFO] Egyptian processor activated by Groq Vision (Llama-4-Scout)")
    except Exception as err:
        print(f"[ERROR] Egyptian detection failed: {err}")
        return False, 0.0
    else:
        return True, 0.95
30
+
31
+ def _identify_hieroglyphs_with_vision(self, image_path):
32
+ """Use Groq Vision (Llama-4-Scout) to identify hieroglyphic symbols from the full image."""
33
+ if not self.groq_client or not self.groq_client.is_available():
34
+ return None
35
+
36
+ try:
37
+ from groq import Groq
38
+
39
+ # Load and encode image
40
+ image = Image.open(image_path)
41
+ if max(image.size) > 1200:
42
+ image.thumbnail((1200, 1200), Image.Resampling.LANCZOS)
43
+ buffer = BytesIO()
44
+ image.save(buffer, format="JPEG", quality=90)
45
+ b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
46
+
47
+ gardiner_labels = list(self.config.GARDINER_MAP.keys())
48
+ gardiner_codes = list(self.config.GARDINER_MAP.values())
49
+ label_list = ", ".join(
50
+ f"{lbl} ({code})" for lbl, code in zip(gardiner_labels, gardiner_codes)
51
+ )
52
+
53
+ prompt = (
54
+ "You are an expert Egyptologist analyzing an image of Egyptian hieroglyphs.\n\n"
55
+ f"Known Gardiner signs: {label_list}\n\n"
56
+ "Identify up to 15 of the most prominent hieroglyphic symbols visible in the image, in reading order (left-to-right, top-to-bottom).\n"
57
+ "For each identified symbol, pick the BEST matching Gardiner label from the list above.\n"
58
+ "Do not output more than 15 symbols. If a symbol doesn't match any known label, use \"unknown\".\n\n"
59
+ "Respond ONLY with a JSON object:\n"
60
+ "{\"symbols\": [\"label1\", \"label2\", \"label3\", ...]}\n"
61
+ "Example: {\"symbols\": [\"owl\", \"eye\", \"reed\", \"bread\", \"sun\"]}"
62
+ )
63
+
64
+ print("[INFO] Sending request to Groq Vision model meta-llama/llama-4-scout-17b-16e-instruct...")
65
+
66
+ client = Groq(api_key=self.groq_client.api_key)
67
+ completion = client.chat.completions.create(
68
+ model="meta-llama/llama-4-scout-17b-16e-instruct",
69
+ messages=[
70
+ {
71
+ "role": "user",
72
+ "content": [
73
+ {"type": "text", "text": prompt},
74
+ {
75
+ "type": "image_url",
76
+ "image_url": {
77
+ "url": f"data:image/jpeg;base64,{b64}",
78
+ },
79
+ },
80
+ ],
81
+ }
82
+ ],
83
+ temperature=0.1,
84
+ max_completion_tokens=1024,
85
+ response_format={"type": "json_object"},
86
+ )
87
+
88
+ raw = completion.choices[0].message.content
89
+ print(f"[INFO] Groq Vision raw response received: {raw[:150]}...")
90
+ data = json.loads(raw)
91
+ symbols = data.get("symbols", [])
92
+
93
+ if symbols and isinstance(symbols, list) and len(symbols) > 0:
94
+ # Validate labels against known set + "unknown"
95
+ valid = set(gardiner_labels) | {"unknown"}
96
+ cleaned = [s if s in valid else "unknown" for s in symbols]
97
+ if all(s == "unknown" for s in cleaned):
98
+ print("[INFO] Groq Vision identified only 'unknown' symbols. Falling back.")
99
+ return None
100
+ print(f"[INFO] Groq Vision identified {len(cleaned)} hieroglyphs: {cleaned}")
101
+ return cleaned
102
+
103
+ except Exception as e:
104
+ print(f"[WARN] Groq Vision hieroglyph identification failed: {e}")
105
+
106
+ return None
107
+
108
def extract_text(self, image_path):
    """Return hieroglyph labels for an image.

    The vision-model result is used when it is non-empty; otherwise the
    image is segmented and each region is classified with CLIP against
    the ``GARDINER_MAP`` labels. Any exception yields an empty list.
    """
    try:
        print("[INFO] Starting Egyptian hieroglyph extraction...")

        # PRIMARY: whole-image identification through Groq Vision
        from_vision = self._identify_hieroglyphs_with_vision(image_path)
        if from_vision:
            print(f"[INFO] Using Groq Vision result ({len(from_vision)} symbols)")
            return from_vision

        # FALLBACK: segment the image, then CLIP zero-shot per region
        print("[INFO] Falling back to CLIP segmentation-based classification...")
        from utils.image_utils import segment_hieroglyphs

        regions = segment_hieroglyphs(image_path)
        print(f"[INFO] Segmented {len(regions)} hieroglyph regions")

        if regions:
            known_labels = list(self.config.GARDINER_MAP.keys())
            predicted = self.clip_classifier.classify_symbols(regions, known_labels)
            print(f"[INFO] CLIP classified {len(predicted)} symbols: {predicted}")
            return predicted

        print("[WARN] No hieroglyph regions found")
        return []

    except Exception as err:
        print(f"[ERROR] Egyptian text extraction failed: {err}")
        import traceback
        traceback.print_exc()
        return []
141
def process_text(self, labels):
    """Map hieroglyph labels to Gardiner codes and attempt a translation.

    Args:
        labels: Sequence of label strings (entries may be ``None``).

    Returns:
        Dict with ``labels``, ``codes`` (``"?"`` for labels missing from
        ``GARDINER_MAP``), ``translation`` and ``translation_ok``.
    """
    if not labels:
        return {"labels": [], "codes": [], "translation": "", "translation_ok": False}

    # Lookup is done on the lower-cased label; falsy entries become "".
    codes = []
    for label in labels:
        lookup_key = (label or "").lower()
        codes.append(self.config.GARDINER_MAP.get(lookup_key, "?"))

    translated, succeeded = self._translate_sequence(labels, codes)

    return {
        "labels": labels,
        "codes": codes,
        "translation": translated,
        "translation_ok": succeeded,
    }
158
+
159
def _translate_sequence(self, labels, codes):
    """Translate a hieroglyph sequence, degrading through three strategies.

    1. The seq2seq ``translator_pipe`` on the known Gardiner codes, first
       with an instruction-style prompt and then with the bare sequence.
    2. The Groq client, prompted with the known label names.
    3. A plain description built from the codes.

    Args:
        labels: Label strings as detected (may contain ``None``/"unknown").
        codes: Gardiner codes aligned with ``labels``; ``"?"`` = unmapped.

    Returns:
        ``(text, ok)`` where ``ok`` is ``False`` only for the description
        fallback.
    """
    valid_codes = [c for c in codes if c != "?"]

    if valid_codes and self.translator_pipe:
        seq = " ".join(valid_codes)
        # Same two inputs as before, expressed once instead of duplicated.
        attempts = (
            f"Translate hieroglyph unicode sequence to English: {seq}",
            seq,
        )
        try:
            for model_input in attempts:
                output = self.translator_pipe(model_input, max_new_tokens=128, do_sample=False, num_beams=4)
                first = output[0]
                text = first.get('generated_text') or first.get('translation_text') or str(first)

                if text and text.strip() != "?" and not is_gibberish(text):
                    return text.strip(), True

        except Exception as e:
            print(f"[WARN] Seq2Seq translation failed: {e}")

    # Groq Fallback for translating known symbols
    if self.groq_client and self.groq_client.is_available():
        try:
            known_labels = [lbl for lbl in labels if lbl and lbl != "unknown"]
            if known_labels:
                symbols_str = ", ".join(known_labels)
                system_prompt = "You are an expert Egyptologist and translator of ancient Egyptian hieroglyphs."
                user_prompt = (
                    f"We detected a sequence of ancient Egyptian hieroglyphic symbols: {symbols_str}.\n"
                    "Provide a concise, scholarly English translation or logical interpretation of this combination of signs.\n"
                    "Keep it direct, under 15 words, and do not include any introductory phrases, explanations, or quotes."
                )
                reply = self.groq_client.generate_response(system_prompt, user_prompt, max_tokens=64)
                # The client may return None/"" on failure; treat that as
                # "no translation" instead of raising on .strip().
                # Quotes are removed before stripping so whitespace that
                # sat inside the quotes is trimmed as well.
                translation = (reply or "").replace('"', '').strip()
                if translation and not is_gibberish(translation):
                    return translation, True
        except Exception as e:
            print(f"[WARN] Groq fallback translation failed: {e}")

    # Fallback to description
    description = build_description_from_codes(codes)
    return f"(Symbols described as: {description})", False
205
+
206
def generate_historical_context(self, processed_result):
    """Build the structured historical-context payload for Egyptian text.

    Args:
        processed_result: Mapping with optional ``translation``, ``codes``
            and ``labels`` entries.

    Returns:
        Dict with ``uses_box`` (title + per-symbol items), ``meaning_box``
        and ``references`` (up to six grounding entries from the RAG
        service, queried with labels followed by codes).
    """
    labels = processed_result.get("labels", [])
    codes = processed_result.get("codes", [])
    translation = processed_result.get("translation", "")

    # Narrative context from Groq
    narrative = self._generate_groq_context(translation, codes)

    # Grounding references: labels first, then codes
    grounding = self.rag_service.retrieve_grounding_list(
        [*labels, *codes],
        max_results=6,
    )

    uses_box = {
        "title": "Each symbol's possible use by the egyptian people",
        "items": self._build_uses_list(labels),
    }
    meaning_box = self._build_meaning_box(labels, narrative)

    return {
        "uses_box": uses_box,
        "meaning_box": meaning_box,
        "references": grounding,
    }
228
+
229
+ def _generate_groq_context(self, translation_text, codes):
230
+ """Generate contextual information using Groq"""
231
+ if not self.groq_client.is_available():
232
+ return "(Groq unavailable) Context generation requires GROQ_API_KEY and groq package."
233
+
234
+ if is_gibberish(translation_text):
235
+ prompt_body = build_description_from_codes(codes)
236
+ prompt = (
237
+ f"The following sequence of ancient Egyptian symbols is described as: {prompt_body}.\n\n"
238
+ "Provide a concise, scholarly paragraph (6-10 sentences) covering cultural context, symbolic meanings, "
239
+ "typical usage, probable time period, and relevant archaeological comparisons. Avoid repeating the prompt."
240
+ )
241
+ else:
242
+ prompt = (
243
+ f"Provide a concise, scholarly paragraph (6-10 sentences) on the historical significance, cultural context, "
244
+ f"symbolism, and possible interpretations of this ancient Egyptian text: {translation_text}. Avoid repeating the prompt."
245
+ )
246
+
247
+ system_prompt = "You are a careful Egyptologist and historian. Provide accurate, concise scholarly context."
248
+ enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, translation_text, codes)
249
+
250
+ return self.groq_client.generate_response(
251
+ system_prompt=enriched_system_prompt,
252
+ user_prompt=prompt,
253
+ max_tokens=self.config.GROQ_CONTEXT_MAX_TOKENS
254
+ ) or "(context unavailable due to Groq error)"
255
+
256
+ def _build_uses_list(self, labels):
257
+ """Build list of symbol uses"""
258
+ groups = []
259
+ for key, g in groupby(labels):
260
+ if not key:
261
+ continue
262
+ groups.append((key, len(list(g))))
263
+
264
+ notes = self.references.get("egypt_symbol_notes", {}) or {}
265
+ seen = set()
266
+ items = []
267
+
268
+ for name, count in groups:
269
+ if not name or name.lower() in seen:
270
+ continue
271
+ seen.add(name.lower())
272
+
273
+ count_str = f" (x{count})" if count > 1 else ""
274
+ note = notes.get(name.lower(), "Common sign whose meaning varies by phonetic/ideogram/determinative roles.")
275
+ items.append(f"- {name}{count_str}: {note}")
276
+
277
+ if not items:
278
+ items.append("- unknown: No stable mapping; likely decorative or damaged glyphs.")
279
+
280
+ return items
281
+
282
+ def _build_meaning_box(self, labels, groq_detail):
283
+ """Build meaning interpretation box"""
284
+ freq = Counter([l for l in labels if l])
285
+ frequent = [f"{name} (x{cnt})" for name, cnt in freq.most_common(6)]
286
+
287
+ intro_lines = [
288
+ "The dense recurrence of signs suggests a formulaic or protective sequence, where phonograms articulate a core utterance and determinatives or iconic signs reinforce ritual intent.",
289
+ "Comparable sequences appear on funerary equipment from the Middle Kingdom onward."
290
+ ]
291
+
292
+ points = [
293
+ "• Offering and action signs (bread, jar, hoe, bow) commonly structure invocations or provisioning lists for the afterlife.",
294
+ "• Repetition often encodes names or epithets; determinatives (eye, feather, god_figure) frame a protective or ritual context.",
295
+ "• Repertoire and layout align with New Kingdom funerary practice focused on protection, sustenance, and legitimation."
296
+ ]
297
+
298
+ if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
299
+ points.append(groq_detail.strip())
300
+
301
+ return {
302
+ "title": "Possible meaning:",
303
+ "intro_lines": intro_lines,
304
+ "frequent_label": "Frequently observed signs",
305
+ "frequent": frequent,
306
+ "points": points
307
+ }
308
+
309
def generate_story(self, processed_result):
    """Generate a creative story for a processed Egyptian result.

    Args:
        processed_result: Mapping whose ``labels`` entry lists the detected
            symbol names (falsy entries are skipped; a missing or ``None``
            list is treated as empty).

    Returns:
        The Groq-generated story, or the templated fallback story when the
        client is unavailable or its reply is empty/gibberish.
    """
    import random  # local import, as in the original method body

    labels = processed_result.get("labels") or []
    description = ", ".join([lbl for lbl in labels if lbl])

    if not self.groq_client.is_available():
        return self._simple_templated_story(description)

    styles = [
        "as an epic poem from a wandering bard",
        "as a prophecy carved in stone",
        "as a fireside tale with vivid emotions",
        "as a dialogue between two ancient gods",
        "as a lost papyrus narrative recovered from the sands",
        "as a myth told by a court poet"
    ]

    chosen_style = random.choice(styles)
    # The seed is only embedded in the prompt text to vary the output.
    seed = random.randint(1000, 9999)

    # The second sentence used to contain the literal template placeholder
    # "[your sequence]"; the actual symbol description is interpolated now.
    prompt = (
        f"The following sequence of ancient Egyptian symbols is described as: {description}\n\n"
        f"Can you create a long, vivid, imaginative story from ancient times "
        f"based on this sequence of Egyptian symbols: {description}. "
        f"Write it as one rich paragraph with a lot of detail, mystery, and historical atmosphere. "
        f"At least 200 words.\n\n"
        f"Creative seed: {seed}\n"
        f"Write a richly detailed, imaginative myth-like story {chosen_style}. "
        "Include multiple characters, vivid imagery, and at least 3 short scenes. "
        "Do NOT repeat the same sentence or phrase verbatim. "
        "Keep it evocative and unpredictable."
    )

    system_prompt = "You are a creative ancient historian and myth-maker. Invent rich, imaginative tales."

    story = self.groq_client.generate_response(
        system_prompt=system_prompt,
        user_prompt=prompt,
        max_tokens=self.config.GROQ_STORY_MAX_TOKENS
    )

    if not story or is_gibberish(story):
        return self._simple_templated_story(description)

    return story
355
+
356
+ def _simple_templated_story(self, description):
357
+ """Fallback story generation"""
358
+ import re
359
+ parts = [p.strip() for p in re.split(r',\s*', description) if p.strip()]
360
+ keywords = []
361
+
362
+ for p in parts:
363
+ m = re.match(r'([a-zA-Z0-9_-]+)', p)
364
+ if m:
365
+ kw = m.group(1)
366
+ if kw not in keywords:
367
+ keywords.append(kw)
368
+ if len(keywords) >= 8:
369
+ break
370
+
371
+ flavor = {
372
+ "bow": "strength and vigilance",
373
+ "hoe": "the work of the fields",
374
+ "reed": "the scribe's craft",
375
+ "owl": "hidden wisdom of the night",
376
+ "eye": "divine sight",
377
+ "bread": "offerings to the ka",
378
+ "unknown": "mysterious signs"
379
+ }
380
+
381
+ lead = []
382
+ if keywords:
383
+ lead.append(f"In an age of river and stone, a tale was told of {flavor.get(keywords[0], keywords[0])}.")
384
+ if len(keywords) > 1:
385
+ second = flavor.get(keywords[1], keywords[1])
386
+ third = flavor.get(keywords[2], keywords[2]) if len(keywords) > 2 else "omens"
387
+ lead.append(f"It spoke of {second} and {third} guiding a soul beyond the horizon.")
388
+ lead.append("Under the stars, elders whispered a vow that the names would endure.")
389
+
390
+ return " ".join(lead)
processors/greek_processor.py ADDED
@@ -0,0 +1,774 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytesseract
2
+ import re
3
+ import os
4
+ import cv2
5
+ import numpy as np
6
+ import torch
7
+ from PIL import Image
8
+ from .base_processor import BaseScriptProcessor
9
+ from utils.text_utils import is_gibberish
10
+
11
# Absolute path of the "models" directory that sits one level above this file's folder.
BACKEND_MODELS_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, "models")
)
# Directory passed as ``cache_dir`` when the Greek TrOCR weights are loaded.
GREEK_TROCR_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "greek_trocr")
13
+
14
+ class GreekProcessor(BaseScriptProcessor):
15
def __init__(self, groq_client, references, clip_classifier):
    """Initialise the Greek processor without loading any OCR model.

    The tessdata location is resolved immediately; the TrOCR model and
    processor start as ``None`` and are loaded later on demand. The
    instance is registered under the name ``"greek"`` with the GPU
    diagnostics module.
    """
    super().__init__(groq_client, references, clip_classifier)
    self.clip_classifier = clip_classifier
    self.setup_ancient_greek_ocr()

    # TrOCR state: nothing is loaded yet.
    self.trocr_model = None
    self.trocr_processor = None
    self.trocr_available = False

    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.device = torch.device(device_name)

    # Register for dynamic VRAM management
    from utils.gpu_diagnostics import register_processor
    register_processor("greek", self)
28
+
29
def setup_greek_trocr(self):
    """Load the Greek TrOCR processor and model onto ``self.device``.

    On success ``trocr_processor``/``trocr_model`` are set and
    ``trocr_available`` becomes ``True``. On any failure the error is
    logged, ``trocr_available`` is ``False`` and both references are reset
    to ``None``, so a partially initialised model is not kept resident and
    a later call can retry from a clean state.
    """
    model_id = 'rithwikn/trocr_greek_combined'
    try:
        from utils.gpu_diagnostics import reclaim_vram_for
        reclaim_vram_for("greek")

        print("[INFO] Lazily loading TrOCR model for ancient Greek...")
        from transformers import TrOCRProcessor, VisionEncoderDecoderModel

        self.trocr_processor = TrOCRProcessor.from_pretrained(
            model_id,
            cache_dir=GREEK_TROCR_MODEL_DIR,
            local_files_only=False
        )
        self.trocr_model = VisionEncoderDecoderModel.from_pretrained(
            model_id,
            cache_dir=GREEK_TROCR_MODEL_DIR,
            local_files_only=False
        )

        self.trocr_model.to(self.device)
        self.trocr_model.eval()  # Put in evaluation mode

        from utils.gpu_diagnostics import log_model_device
        log_model_device("Greek TrOCR", self.device)

        self.trocr_available = True
        print(f"[INFO] Ancient Greek TrOCR loaded successfully on {self.device}")

    except Exception as e:
        print(f"[ERROR] Ancient Greek TrOCR failed to load: {e}")
        # Drop anything that was loaded before the failure.
        self.trocr_model = None
        self.trocr_processor = None
        self.trocr_available = False
62
+
63
def setup_ancient_greek_ocr(self):
    """Record where the Ancient Greek tessdata is expected and report it.

    Sets ``self.ancient_greek_tessdata`` to ``../tessdata/ancient-greek``
    relative to this file and logs whether that path exists. Nothing is
    downloaded or loaded here.
    """
    # Path to Ancient Greek tessdata (download from ancientgreekocr.org)
    here = os.path.dirname(__file__)
    tessdata_dir = os.path.join(here, "..", "tessdata", "ancient-greek")
    self.ancient_greek_tessdata = tessdata_dir

    if not os.path.exists(tessdata_dir):
        print(f"[WARN] Ancient Greek tessdata not found at: {tessdata_dir}")
        print("[INFO] Download from: https://ancientgreekocr.org")
        return

    print(f"[INFO] Ancient Greek tessdata found: {tessdata_dir}")
77
def detect_script(self, image_path):
    """Report whether this processor can handle a Greek image.

    The image is not inspected. The processor is accepted when TrOCR is
    flagged available or the ``grc.traineddata`` file exists.

    Returns:
        ``(True, 0.95)`` when accepted, ``(False, 0.5)`` when neither OCR
        backend is available, ``(False, 0.0)`` on an unexpected error.
    """
    # NOTE(review): trocr_available only becomes True after the model has
    # been loaded, and loading appears to happen lazily during extraction.
    # Before the first extraction this check therefore depends solely on
    # the tessdata file — confirm this is intended.
    try:
        trocr_ready = getattr(self, 'trocr_available', False)
        if not trocr_ready and not os.path.exists(
            os.path.join(self.ancient_greek_tessdata, "grc.traineddata")
        ):
            print("[INFO] Greek processor not available (neither TrOCR nor Tesseract)")
            return False, 0.5

        # If called by Groq Vision classification, accept with high confidence
        print("[INFO] Greek processor activated by Groq Vision (Llama-4-Scout)")
        return True, 0.95

    except Exception as err:
        print(f"[ERROR] Greek detection failed: {err}")
        return False, 0.0
94
+
95
+
96
+ def _quick_greek_ocr_test(self, image_path):
97
+ """Quick OCR test to validate Greek content"""
98
+ try:
99
+ # Quick test with small image crop
100
+ image = Image.open(image_path)
101
+ # Take center crop for testing
102
+ w, h = image.size
103
+ crop_box = (w//4, h//4, 3*w//4, 3*h//4)
104
+ test_crop = image.crop(crop_box)
105
+
106
+ # Test with standard Greek OCR
107
+ test_text = pytesseract.image_to_string(test_crop, lang="ell")
108
+ greek_char_count = self._count_greek_chars(test_text or "")
109
+
110
+ # If we find Greek characters, it's likely Greek
111
+ return greek_char_count >= 3
112
+
113
+ except Exception:
114
+ return False
115
+
116
def extract_text(self, image_path):
    """Extract Greek text, trying OCR backends from best to simplest.

    Order: TrOCR (loaded on demand), Ancient Greek Tesseract data when
    ``grc.traineddata`` exists, standard Greek Tesseract, then the
    layout-aware fallback. The first result that passes
    ``_validate_greek_text`` is returned.

    Returns:
        The extracted text, or ``""`` when nothing valid is found or any
        step raises.
    """
    try:
        # Opened once and shared by the Tesseract paths. The context
        # manager guarantees the file handle is released on every exit
        # path; it was previously left open.
        with Image.open(image_path) as image:

            # Ensure the Greek TrOCR model is loaded dynamically
            if self.trocr_model is None:
                self.setup_greek_trocr()
            else:
                from utils.gpu_diagnostics import reclaim_vram_for
                reclaim_vram_for("greek")
                if str(next(self.trocr_model.parameters()).device) != str(self.device):
                    print(f"[VRAM MANAGER] Activating Greek TrOCR model on {self.device}...")
                    self.trocr_model.to(self.device)

            # Method 1: Ancient Greek TrOCR (if available)
            if getattr(self, 'trocr_available', False) and self.trocr_model is not None:
                print("[INFO] Attempting Ancient Greek extraction with TrOCR...")
                trocr_text = self._extract_with_trocr(image_path)
                if trocr_text and self._validate_greek_text(trocr_text):
                    print("[INFO] Using Ancient Greek TrOCR result")
                    return trocr_text
                print("[WARN] TrOCR extraction returned poor quality result, trying Tesseract fallback...")

            # Method 2: Ancient Greek OCR (if available and safe)
            grc_file = os.path.join(self.ancient_greek_tessdata, "grc.traineddata")
            if os.path.exists(grc_file):
                ancient_greek_text = self._extract_with_ancient_greek_ocr(image)
                if ancient_greek_text and self._validate_greek_text(ancient_greek_text):
                    print("[INFO] Using Ancient Greek OCR result")
                    return ancient_greek_text

            # Method 3: Standard Greek OCR
            standard_greek_text = self._extract_with_standard_greek_ocr(image)
            if standard_greek_text and self._validate_greek_text(standard_greek_text):
                print("[INFO] Using standard Greek OCR result")
                return standard_greek_text

            # Method 4: Layout-aware line segment fallback
            print("[INFO] Trying layout-aware Greek segmentation fallback...")
            layout_aware_greek_text = self._extract_layout_aware_ocr(image_path)
            if layout_aware_greek_text and self._validate_greek_text(layout_aware_greek_text):
                print("[INFO] Using layout-aware Greek OCR result")
                return layout_aware_greek_text

            # Method 5: Final validation - if no good Greek text found, return empty
            print("[INFO] No valid Greek text detected")
            return ""

    except Exception as e:
        print(f"[ERROR] Greek text extraction failed: {e}")
        return ""
168
+
169
+ def _extract_with_trocr(self, image_path):
170
+ """Extract text using TrOCR Ancient Greek model line-by-line"""
171
+ if self.trocr_model is None:
172
+ self.setup_greek_trocr()
173
+ else:
174
+ from utils.gpu_diagnostics import reclaim_vram_for
175
+ reclaim_vram_for("greek")
176
+ if str(next(self.trocr_model.parameters()).device) != str(self.device):
177
+ print(f"[VRAM MANAGER] Activating Greek TrOCR model on {self.device}...")
178
+ self.trocr_model.to(self.device)
179
+
180
+ if not getattr(self, 'trocr_available', False) or self.trocr_model is None:
181
+ return ""
182
+
183
+ try:
184
+ import torch
185
+ from PIL import Image
186
+ print("[INFO] Segmenting layout for Greek TrOCR...")
187
+ layout = self.layout_parser.analyze_layout(image_path)
188
+ crops = self.layout_parser.crop_lines(image_path, layout)
189
+
190
+ # Fallback to whole image if no crops detected
191
+ if not crops:
192
+ print("[WARN] No line crops found, processing full image with TrOCR")
193
+ crops = [Image.open(image_path).convert("RGB")]
194
+
195
+ line_texts = []
196
+ print(f"[INFO] Running Ancient Greek TrOCR inference on {len(crops)} crops...")
197
+ for idx, crop in enumerate(crops):
198
+ # Ensure RGB mode for TrOCR
199
+ crop_rgb = crop.convert("RGB")
200
+
201
+ pixel_values = self.trocr_processor(
202
+ images=crop_rgb,
203
+ return_tensors="pt"
204
+ ).pixel_values.to(self.device)
205
+
206
+ with torch.inference_mode():
207
+ generated_ids = self.trocr_model.generate(
208
+ pixel_values,
209
+ max_length=256,
210
+ num_beams=4,
211
+ early_stopping=True,
212
+ repetition_penalty=1.2
213
+ )
214
+
215
+ text = self.trocr_processor.batch_decode(
216
+ generated_ids,
217
+ skip_special_tokens=True
218
+ )[0]
219
+
220
+ if text.strip():
221
+ line_texts.append(text.strip())
222
+
223
+ full_text = "\n".join(line_texts)
224
+ print(f"[SUCCESS] TrOCR extracted {len(line_texts)} lines from Greek image")
225
+ return full_text
226
+
227
+ except Exception as e:
228
+ print(f"[ERROR] Greek TrOCR extraction failed: {e}")
229
+ return ""
230
+
231
+
232
+ def _extract_with_ancient_greek_ocr(self, image):
233
+ """Extract using specialized Ancient Greek OCR"""
234
+ try:
235
+ # Save original tessdata path
236
+ original_tessdata = os.environ.get("TESSDATA_PREFIX", "")
237
+
238
+ # Set tessdata path properly (fix the path format)
239
+ if os.path.exists(self.ancient_greek_tessdata):
240
+ # Ensure proper path format without trailing quotes
241
+ clean_path = str(self.ancient_greek_tessdata).replace('"', '')
242
+ os.environ["TESSDATA_PREFIX"] = clean_path
243
+ print(f"[INFO] Set TESSDATA_PREFIX to: {clean_path}")
244
+ else:
245
+ print(f"[WARN] Ancient Greek tessdata not found at: {self.ancient_greek_tessdata}")
246
+ return ""
247
+
248
+ # Use ancient Greek language code 'grc' with optimized settings
249
+ config = "--psm 6 --oem 1 -c preserve_interword_spaces=1"
250
+
251
+ # Try ancient Greek language pack
252
+ text = pytesseract.image_to_string(
253
+ image,
254
+ lang="grc", # Ancient Greek language code
255
+ config=config
256
+ )
257
+
258
+ # Restore original tessdata path
259
+ if original_tessdata:
260
+ os.environ["TESSDATA_PREFIX"] = original_tessdata
261
+ else:
262
+ # Remove the environment variable if it wasn't set before
263
+ if "TESSDATA_PREFIX" in os.environ:
264
+ del os.environ["TESSDATA_PREFIX"]
265
+
266
+ return text.strip()
267
+
268
+ except Exception as e:
269
+ print(f"[WARN] Ancient Greek OCR failed: {e}")
270
+ # Make sure to restore tessdata path even on error
271
+ if 'original_tessdata' in locals() and original_tessdata:
272
+ os.environ["TESSDATA_PREFIX"] = original_tessdata
273
+ return ""
274
+
275
+ def _extract_layout_aware_ocr(self, image_path):
276
+ """Extract text by segmenting the page layout into lines first for improved readability order"""
277
+ try:
278
+ import pytesseract
279
+ print("[INFO] Running layout-aware line segmentation for Greek...")
280
+ layout = self.layout_parser.analyze_layout(image_path)
281
+ crops = self.layout_parser.crop_lines(image_path, layout)
282
+
283
+ if not crops:
284
+ print("[WARN] Layout parser returned no line crops for Greek")
285
+ return ""
286
+
287
+ print(f"[INFO] Layout-aware Greek line parser cropped {len(crops)} lines")
288
+ line_texts = []
289
+
290
+ # Try to use Ancient Greek first
291
+ grc_file = os.path.join(self.ancient_greek_tessdata, "grc.traineddata")
292
+ use_grc = os.path.exists(grc_file)
293
+
294
+ # Save original TESSDATA_PREFIX
295
+ original_tessdata = os.environ.get("TESSDATA_PREFIX", "")
296
+ if use_grc:
297
+ clean_path = str(self.ancient_greek_tessdata).replace('"', '')
298
+ os.environ["TESSDATA_PREFIX"] = clean_path
299
+
300
+ try:
301
+ for idx, crop in enumerate(crops):
302
+ # Enhance line crop for OCR
303
+ crop_cv = cv2.cvtColor(np.array(crop), cv2.COLOR_RGB2BGR)
304
+ gray = cv2.cvtColor(crop_cv, cv2.COLOR_BGR2GRAY)
305
+ clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(4,4))
306
+ enhanced = clahe.apply(gray)
307
+ crop_pil = Image.fromarray(enhanced)
308
+
309
+ config = '--oem 3 --psm 7'
310
+ text = ""
311
+
312
+ if use_grc:
313
+ text = pytesseract.image_to_string(
314
+ crop_pil,
315
+ lang='grc',
316
+ config=config
317
+ ).strip()
318
+
319
+ if not text:
320
+ text = pytesseract.image_to_string(
321
+ crop_pil,
322
+ lang='ell',
323
+ config=config
324
+ ).strip()
325
+
326
+ if text:
327
+ line_texts.append(text)
328
+ finally:
329
+ if use_grc and original_tessdata:
330
+ os.environ["TESSDATA_PREFIX"] = original_tessdata
331
+
332
+ return "\n".join(line_texts)
333
+ except Exception as e:
334
+ print(f"[WARN] Layout aware Greek OCR failed: {e}")
335
+ return ""
336
+
337
+
338
+ def _extract_with_standard_greek_ocr(self, image):
339
+ """Extract using standard Greek OCR with optimized settings"""
340
+ try:
341
+ # Multiple OCR attempts with different settings
342
+ configs = [
343
+ "--psm 6 --oem 1", # Uniform text block
344
+ "--psm 4 --oem 1", # Single column text
345
+ "--psm 3 --oem 1", # Default, automatic page segmentation
346
+ "--psm 8 --oem 1" # Single word
347
+ ]
348
+
349
+ for config in configs:
350
+ try:
351
+ text = pytesseract.image_to_string(
352
+ image,
353
+ lang="ell", # Modern Greek
354
+ config=config
355
+ )
356
+
357
+ if text and self._validate_greek_text(text):
358
+ return text.strip()
359
+
360
+ except Exception:
361
+ continue
362
+
363
+ return ""
364
+
365
+ except Exception as e:
366
+ print(f"[WARN] Standard Greek OCR failed: {e}")
367
+ return ""
368
+
369
+ def _extract_with_preprocessing(self, image):
370
+ """Fallback extraction with image preprocessing"""
371
+ try:
372
+ # Convert PIL to CV2
373
+ cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
374
+
375
+ # Image preprocessing for better OCR
376
+ gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
377
+
378
+ # Try different preprocessing approaches
379
+ preprocessed_images = [
380
+ gray, # Original grayscale
381
+ cv2.GaussianBlur(gray, (1, 1), 0), # Slight blur
382
+ cv2.medianBlur(gray, 3), # Noise reduction
383
+ cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1] # Adaptive threshold
384
+ ]
385
+
386
+ for processed_img in preprocessed_images:
387
+ try:
388
+ pil_img = Image.fromarray(processed_img)
389
+ text = pytesseract.image_to_string(
390
+ pil_img,
391
+ lang="ell",
392
+ config="--psm 6 --oem 1"
393
+ )
394
+
395
+ if self._validate_greek_text(text):
396
+ return text.strip()
397
+
398
+ except Exception:
399
+ continue
400
+
401
+ return ""
402
+
403
+ except Exception as e:
404
+ print(f"[WARN] Fallback Greek OCR failed: {e}")
405
+ return ""
406
+
407
+ def _count_greek_chars(self, text):
408
+ """Count Greek Unicode characters including polytonic marks"""
409
+ if not text:
410
+ return 0
411
+
412
+ def is_greek_char(ch):
413
+ o = ord(ch)
414
+ # Greek and Coptic (0x0370-0x03FF)
415
+ # Greek Extended (0x1F00-0x1FFF) - includes polytonic marks
416
+ return (0x0370 <= o <= 0x03FF) or (0x1F00 <= o <= 0x1FFF)
417
+
418
+ return sum(is_greek_char(ch) for ch in text)
419
+
420
+ def _validate_greek_text(self, text):
421
+ """Validate if text contains meaningful Greek content"""
422
+ if not text or len(text.strip()) < 3:
423
+ return False
424
+
425
+ # Count Greek characters
426
+ greek_char_count = self._count_greek_chars(text)
427
+ total_chars = len(re.sub(r'\s+', '', text))
428
+
429
+ if total_chars == 0:
430
+ return False
431
+
432
+ # Check for Latin characters (should reject if too many)
433
+ latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
434
+ latin_ratio = latin_chars / total_chars if total_chars > 0 else 0
435
+
436
+ # If text is mostly Latin characters, reject it
437
+ if latin_ratio > 0.8 and greek_char_count < 3:
438
+ print(f"[INFO] Rejecting text as Greek - too many Latin chars: {latin_ratio:.2f}")
439
+ return False
440
+
441
+ # At least 20% should be Greek characters, or minimum 5 Greek chars
442
+ greek_ratio = greek_char_count / total_chars
443
+
444
+ return greek_char_count >= 5 or greek_ratio >= 0.20
445
+
446
+
447
+ def _extract_distinct_terms(self, text):
448
+ """Extract distinct Greek terms from text"""
449
+ if not text:
450
+ return []
451
+
452
+ # Find Greek words (including those with diacritical marks)
453
+ tokens = re.findall(r"[^\W\d_]+", text, flags=re.UNICODE)
454
+
455
+ def is_greek_word(word):
456
+ return any((0x0370 <= ord(ch) <= 0x03FF) or (0x1F00 <= ord(ch) <= 0x1FFF)
457
+ for ch in word)
458
+
459
+ distinct_terms = []
460
+ seen = set()
461
+
462
+ for token in tokens:
463
+ if len(token) < 2: # Skip single characters
464
+ continue
465
+
466
+ if is_greek_word(token):
467
+ normalized = token.lower()
468
+ if normalized not in seen:
469
+ distinct_terms.append(token)
470
+ seen.add(normalized)
471
+
472
+ return distinct_terms[:20] # Limit to 20 terms
473
+
474
def process_text(self, greek_text):
    """Summarise extracted Greek text as terms, character stats and metrics.

    Returns a dict with four keys: ``text`` (the input), ``terms`` (distinct
    Greek words), ``char_analysis`` (total, Greek and unique character
    counts plus a whitespace-split word count) and ``validation`` (polytonic
    flag, Greek-character ratio over the whole string, quality score).
    Empty input yields the same keys with empty values.
    """
    if not greek_text:
        return {"text": "", "terms": [], "char_analysis": {}, "validation": {}}

    distinct_terms = self._extract_distinct_terms(greek_text)

    # Raw character statistics; total_chars includes whitespace.
    total_chars = len(greek_text)
    greek_chars = self._count_greek_chars(greek_text)
    char_analysis = {
        "total_chars": total_chars,
        "greek_chars": greek_chars,
        "unique_chars": len(set(greek_text)),
        "words": len(greek_text.split()),
    }

    # Quality indicators derived from the statistics above.
    validation = {
        "has_polytonic": self._has_polytonic_marks(greek_text),
        "greek_ratio": greek_chars / max(1, total_chars),
        "quality_score": self._calculate_quality_score(greek_text),
    }

    result = {"text": greek_text, "terms": distinct_terms}
    result["char_analysis"] = char_analysis
    result["validation"] = validation
    return result
503
+
504
+ def _has_polytonic_marks(self, text):
505
+ """Check if text contains polytonic Greek marks"""
506
+ # Greek Extended block contains polytonic diacritical marks
507
+ return any(0x1F00 <= ord(ch) <= 0x1FFF for ch in text)
508
+
509
+ def _calculate_quality_score(self, text):
510
+ """Calculate a quality score for the extracted text"""
511
+ if not text:
512
+ return 0.0
513
+
514
+ score = 0.0
515
+
516
+ # Base score from Greek character ratio
517
+ greek_ratio = self._count_greek_chars(text) / max(1, len(text))
518
+ score += greek_ratio * 0.4
519
+
520
+ # Bonus for polytonic marks (indicates authentic ancient Greek)
521
+ if self._has_polytonic_marks(text):
522
+ score += 0.3
523
+
524
+ # Penalty for too many non-alphabetic characters
525
+ alpha_chars = sum(ch.isalpha() for ch in text)
526
+ alpha_ratio = alpha_chars / max(1, len(text))
527
+ score += alpha_ratio * 0.3
528
+
529
+ return min(1.0, score)
530
+
531
def generate_historical_context(self, processed_result):
    """Assemble the historical-context payload for processed Greek text.

    Reads ``text`` and ``terms`` from *processed_result*, asks Groq for a
    context paragraph, queries the RAG service with the terms plus every
    non-whitespace character of the text (at most 6 results), and returns a
    dict with ``uses_box``, ``meaning_box`` and ``references``.
    """
    greek_text = processed_result.get("text", "")
    terms = processed_result.get("terms", [])

    groq_detail = self._generate_groq_context(greek_text)

    # RAG query: the extracted words first, then each visible character.
    query_terms = []
    if terms:
        query_terms = list(terms)
    if greek_text:
        for char in greek_text:
            if char.strip():
                query_terms.append(char)

    # Debug output is ASCII-escaped so it prints on any console encoding.
    print(f"[DEBUG GREEK RAG] query_terms: {[t.encode('ascii', 'backslashreplace').decode() for t in query_terms]}")
    refs = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)
    print(f"[DEBUG GREEK RAG] refs: {[r['term'].encode('ascii', 'backslashreplace').decode() for r in refs]}")

    uses_box = {
        "title": "Each symbol's possible use by the Greek people",
        "items": self._build_uses_list(terms, greek_text),
    }
    meaning_box = self._build_meaning_box(terms, groq_detail)

    return {
        "uses_box": uses_box,
        "meaning_box": meaning_box,
        "references": refs,
    }
555
+
556
+ def _generate_groq_context(self, greek_text):
557
+ """Generate contextual information using Groq"""
558
+ if not self.groq_client.is_available():
559
+ return "(Groq unavailable) Context generation requires GROQ_API_KEY and groq package."
560
+
561
+ prompt = (
562
+ f"This ancient Greek text was found: {greek_text}\n\n"
563
+ "Write a concise, scholarly paragraph (6-10 sentences) giving cultural and historical context: textual tradition, "
564
+ "possible meanings, links to Greek culture/myth/philosophy, manuscript practices (accents, breathings, ligatures, nomina sacra), "
565
+ "and paleographic cues. Avoid repeating the prompt."
566
+ )
567
+
568
+ system_prompt = "You are an expert philologist of Ancient Greece. Provide concise, accurate scholarly context."
569
+ enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, greek_text)
570
+
571
+ return self.groq_client.generate_response(
572
+ system_prompt=enriched_system_prompt,
573
+ user_prompt=prompt
574
+ ) or "(context unavailable due to Groq error)"
575
+
576
+ def _generate_batch_explanations(self, terms):
577
+ """Generate scholarly glossary definitions for Greek terms in a single batch query"""
578
+ if not terms or not self.groq_client or not self.groq_client.is_available():
579
+ return {}
580
+
581
+ # Limit to first 15 terms to prevent token limit/truncation issues
582
+ terms_to_query = list(terms)[:15]
583
+ terms_list = ", ".join(terms_to_query)
584
+
585
+ system_prompt = (
586
+ "You are an expert classicist and lexicographer of Ancient Greek. "
587
+ "Respond ONLY with a JSON object. Do NOT wrap values in double quotes inside the strings. "
588
+ "Use single quotes '...' for any internal quotes, definitions, or translations."
589
+ )
590
+ user_prompt = (
591
+ f"For each of the following Ancient Greek words, provide a brief, scholarly one-sentence definition, "
592
+ f"etymological note, or grammatical gloss:\n\n"
593
+ f"Words: {terms_list}\n\n"
594
+ f"Respond ONLY with a JSON object where the keys are the exact words and the values are the definitions.\n"
595
+ f"Do NOT use double quotes inside the definitions/values; use single quotes instead.\n"
596
+ f"Example: {{\"word1\": \"definition1\", \"word2\": \"definition2\"}}"
597
+ )
598
+
599
+ try:
600
+ raw_response = self.groq_client.generate_response(
601
+ system_prompt=system_prompt,
602
+ user_prompt=user_prompt,
603
+ max_tokens=2048
604
+ )
605
+ # Safe print to avoid UnicodeEncodeError in Windows command prompt
606
+ print(f"[INFO] Groq glossary raw response: {raw_response.encode('ascii', 'backslashreplace').decode()}")
607
+
608
+ # Find JSON block in response
609
+ if "{" in raw_response and "}" in raw_response:
610
+ start = raw_response.find("{")
611
+ end = raw_response.rfind("}") + 1
612
+ json_str = raw_response[start:end]
613
+ import json
614
+ try:
615
+ definitions = json.loads(json_str)
616
+ except Exception as je:
617
+ print(f"[WARN] Standard JSON load failed: {je}. Attempting regex recovery...")
618
+ import re
619
+ definitions = {}
620
+ # Matches "key": "value"
621
+ pattern = re.compile(r'"([^"]+)":\s*"((?:[^"\\]|\\.)*)"')
622
+ matches = pattern.findall(json_str)
623
+ for k, v in matches:
624
+ definitions[k] = v
625
+ return {k: str(v) for k, v in definitions.items()}
626
+ except Exception as e:
627
+ print(f"[WARN] Failed to generate batch Greek explanations: {e}")
628
+
629
+ return {}
630
+
631
+ def _build_uses_list(self, terms, greek_text):
632
+ """Build list of symbol/word uses using RAG and batch Groq explanations"""
633
+ import unicodedata
634
+ items = []
635
+
636
+ # 1. Get definitions for the extracted Greek words (terms)
637
+ if terms:
638
+ # Unique terms preserving order
639
+ unique_terms = list(dict.fromkeys(terms))
640
+ # Limit to top 15 terms to be concise
641
+ unique_terms = unique_terms[:15]
642
+ print(f"[INFO] Generating glossary for {len(unique_terms)} Greek terms...")
643
+ definitions = {}
644
+ missing_terms = []
645
+
646
+ for term in unique_terms:
647
+ # Check RAG corpus (normalize search query)
648
+ norm_term = unicodedata.normalize('NFC', term).strip()
649
+ rag_matches = self.rag_service.retrieve_grounding_list([norm_term], max_results=1)
650
+ if rag_matches:
651
+ definitions[term] = rag_matches[0]["definition"]
652
+ else:
653
+ missing_terms.append(term)
654
+
655
+ # Generate remaining definitions with Groq in a single batch
656
+ if missing_terms:
657
+ groq_defs = self._generate_batch_explanations(missing_terms)
658
+ # Normalize groq keys for matching
659
+ normalized_groq_defs = {}
660
+ for k, v in groq_defs.items():
661
+ nk = unicodedata.normalize('NFC', k).strip().lower()
662
+ normalized_groq_defs[nk] = v
663
+
664
+ # Assign matching definitions
665
+ for term in missing_terms:
666
+ nt = unicodedata.normalize('NFC', term).strip().lower()
667
+ if nt in normalized_groq_defs:
668
+ definitions[term] = normalized_groq_defs[nt]
669
+ else:
670
+ # Case/accent insensitive backup match (in case Groq stripped accents)
671
+ import unicodedata as ud
672
+ def strip_accents(s):
673
+ return "".join(c for c in ud.normalize('NFD', s) if ud.category(c) != 'Mn')
674
+
675
+ stripped_t = strip_accents(nt)
676
+ for gk, gv in normalized_groq_defs.items():
677
+ if strip_accents(gk) == stripped_t:
678
+ definitions[term] = gv
679
+ break
680
+
681
+ for term in unique_terms:
682
+ definition = definitions.get(term)
683
+ if not definition:
684
+ definition = f"Ancient Greek lexical term. Characterized by specific diacritics and phonological values."
685
+ items.append(f"{term}: {definition}")
686
+
687
+ # 2. Add significant paleographical/character markers found in the text if they are in the references
688
+ notes = self.references.get("greek_symbol_notes", {}) or {}
689
+ seen_chars = set()
690
+ char_items = []
691
+ for ch in greek_text:
692
+ if ch in notes and ch not in seen_chars:
693
+ seen_chars.add(ch)
694
+ char_items.append(f"Character '{ch}': {notes[ch]}")
695
+
696
+ # Limit character notes to prevent clutter
697
+ items.extend(char_items[:5])
698
+
699
+ # Format as list items with bullets
700
+ formatted_items = [f"- {item}" for item in items]
701
+
702
+ if not formatted_items:
703
+ default_hint = self.references.get("greek_hint",
704
+ "Ancient Greek script marker; values are determined by polytonic diacritical marks.")
705
+ formatted_items.append(f"- —: {default_hint}")
706
+
707
+ return formatted_items
708
+
709
+
710
+ def _build_meaning_box(self, terms, groq_detail):
711
+ """Build meaning interpretation box"""
712
+ intro_lines = [
713
+ "The lexical concentration suggests a connected passage with recurring words or themes, consistent with Greek manuscript traditions.",
714
+ "Scribal features such as accents/breathings, abbreviations, and marginal cues guide reading and assist with dating and genre identification."
715
+ ]
716
+
717
+ points = [
718
+ "• Presence of nomina sacra, lection signs, or ekphonetic marks indicates liturgical usage; scholia imply classroom or commentary context.",
719
+ "• Orthographic variation (e.g., iotacism) and common ligatures inform palaeographic placement and regional practice.",
720
+ ]
721
+
722
+ if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
723
+ points.append(groq_detail.strip())
724
+
725
+ return {
726
+ "title": "Possible meaning:",
727
+ "intro_lines": intro_lines,
728
+ "frequent_label": "Key terms noted",
729
+ "frequent": terms[:10],
730
+ "points": points
731
+ }
732
+
733
def generate_story(self, processed_result):
    """Ask Groq for an imaginative story inspired by the extracted Greek text.

    A narrative style and a four-digit "creative seed" are picked at random
    and woven into the prompt. Returns the story, or a fixed apology string
    when Groq is unavailable, returns nothing, or returns text that
    ``is_gibberish`` flags.
    """
    greek_text = processed_result.get("text", "")

    if not self.groq_client.is_available():
        return "Groq client unavailable, cannot generate story."

    import random

    # Narrative framings the model may be asked to adopt.
    styles = (
        "as an epic poem told by a travelling rhapsode",
        "as a prophecy inscribed on the Oracle at Delphi",
        "as a philosophical dialogue in the Academy",
        "as a myth recounted by ancient storytellers",
        "as a recovered scroll from the Library of Alexandria",
        "as a hymn sung in honor of the gods",
    )
    chosen_style = random.choice(styles)
    seed = random.randint(1000, 9999)

    prompt = "".join([
        f"The following ancient Greek text was found: {greek_text}\n\n",
        f"Create a long, vivid, imaginative story from ancient Greek times ",
        f"based on this Greek text. Write it as one rich paragraph with ",
        f"much detail, mystery, and cultural atmosphere. At least 200 words.\n\n",
        f"Creative seed: {seed}\n",
        f"Write a detailed, imaginative myth-like story {chosen_style}. ",
        "Include multiple characters, rich imagery, and scenes. ",
        "Avoid repetition and keep it unpredictable.",
    ])

    story = self.groq_client.generate_response(
        system_prompt="You are a learned ancient Greek storyteller and scholar of Hellenic culture.",
        user_prompt=prompt
    )

    # An empty or nonsensical reply is replaced by the fixed apology.
    if story and not is_gibberish(story):
        return story
    return "Failed to create quality story; the ancient texts remain silent."
processors/latin_processor.py ADDED
@@ -0,0 +1,1281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ import re
5
+ import time
6
+ from PIL import Image
7
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
8
+ import torch
9
+ from .base_processor import BaseScriptProcessor
10
+ from utils.text_utils import is_gibberish
11
+
12
+ BACKEND_MODELS_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "models"))
13
+ TRIDIS_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "tridis")
14
+ TROCR_LATIN_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "trocr_latin")
15
+
16
+
17
+ class LatinProcessor(BaseScriptProcessor):
18
def __init__(self, groq_client, references, clip_classifier):
    """Create the Latin processor with both recognition models unloaded.

    Nothing is downloaded here: the TRIDIS and TrOCR handles start as
    ``None`` with their availability flags off. Only the Tesseract fallback
    is configured, and the instance is registered under the name "latin"
    with the GPU diagnostics module.
    """
    super().__init__(groq_client, references, clip_classifier)

    # Use the GPU whenever torch reports CUDA support.
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.device = torch.device(device_name)

    # TRIDIS HTR handles.
    self.tridis_model = self.tridis_processor = None
    self.tridis_available = False

    # trocr-base-latin handles.
    self.trocr_latin_model = self.trocr_latin_processor = None
    self.trocr_latin_available = False

    # Bookkeeping for the style/model used by the most recent extraction.
    self.active_style = "cursive"
    self.active_model = "None"

    self.setup_tesseract_fallback()

    # Register for dynamic VRAM management
    from utils.gpu_diagnostics import register_processor
    register_processor("latin", self)
38
+
39
def setup_tridis_htr(self):
    """Load the TRIDIS HTR model and processor onto ``self.device``.

    VRAM is reclaimed for the "latin" processor first. The checkpoint is
    fetched from Hugging Face (or its cache under ``TRIDIS_MODEL_DIR``), put
    in evaluation mode, and ``self.tridis_available`` is set to True. Any
    failure is logged and leaves ``self.tridis_available`` False; nothing is
    raised.
    """
    model_id = 'magistermilitum/tridis_HTR'
    try:
        from utils.gpu_diagnostics import reclaim_vram_for
        reclaim_vram_for("latin")

        print("[INFO] Lazily loading TRIDIS HTR model for medieval Latin...")
        print("[INFO] This model specializes in 13th-16th century manuscripts with automatic abbreviation expansion")

        # Download first time, then reuse the local cache directory.
        load_options = {"cache_dir": TRIDIS_MODEL_DIR, "local_files_only": False}
        self.tridis_processor = TrOCRProcessor.from_pretrained(model_id, **load_options)
        self.tridis_model = VisionEncoderDecoderModel.from_pretrained(model_id, **load_options)

        # Inference only: move to the target device and switch to eval mode.
        self.tridis_model.to(self.device)
        self.tridis_model.eval()

        from utils.gpu_diagnostics import log_model_device
        log_model_device("Latin TRIDIS HTR (Cursive)", self.device)

        for message in (
            f"[INFO] TRIDIS HTR loaded successfully on {self.device}",
            "[INFO] Training: 245,000 lines of Latin/Old French/Old Spanish medieval manuscripts",
            "[INFO] Features: Automatic abbreviation expansion, named entity capitalization, cancellation markers",
        ):
            print(message)
        self.tridis_available = True

    except Exception as e:
        print(f"[ERROR] TRIDIS HTR model failed to load: {e}")
        print("[WARN] Falling back to Tesseract for basic Latin recognition...")
        self.tridis_available = False
75
+
76
def setup_trocr_base_latin(self):
    """Load the printed/carved Latin TrOCR model, with a public fallback.

    ``magistermilitum/trocr-base-latin`` is tried first; if anything about
    that load fails, ``microsoft/trocr-base-printed`` is tried instead. VRAM
    is reclaimed for the "latin" processor before each attempt.

    ``self.trocr_latin_processor`` and ``self.trocr_latin_model`` are only
    assigned once a checkpoint is fully loaded, moved to ``self.device`` and
    in eval mode, so a failed attempt never leaves a half-initialised pair
    behind. ``self.trocr_latin_available`` reports the outcome; nothing is
    raised.
    """
    def load_checkpoint(model_id, device_label, announcement=None):
        # Imported per attempt so the fallback does not depend on names
        # bound by a primary attempt that may have failed early.
        from utils.gpu_diagnostics import log_model_device, reclaim_vram_for

        # Free VRAM (again, for the fallback) in case a partial allocation
        # left residue.
        reclaim_vram_for("latin")
        if announcement:
            print(announcement)

        processor = TrOCRProcessor.from_pretrained(
            model_id,
            cache_dir=TROCR_LATIN_MODEL_DIR,
            local_files_only=False
        )
        model = VisionEncoderDecoderModel.from_pretrained(
            model_id,
            cache_dir=TROCR_LATIN_MODEL_DIR,
            local_files_only=False
        )
        model.to(self.device)
        model.eval()  # Put in evaluation mode

        # Publish the pair only after every step above succeeded.
        self.trocr_latin_processor = processor
        self.trocr_latin_model = model

        log_model_device(device_label, self.device)
        self.trocr_latin_available = True

    try:
        load_checkpoint(
            'magistermilitum/trocr-base-latin',
            "Latin TrOCR (Printed)",
            "[INFO] Lazily loading trocr-base-latin model for printed/carved Latin...",
        )
        print(f"[INFO] trocr-base-latin loaded successfully on {self.device}")
    except Exception as e:
        print(f"[WARN] magistermilitum/trocr-base-latin model failed to load ({e}). Trying public fallback 'microsoft/trocr-base-printed'...")
        try:
            load_checkpoint(
                'microsoft/trocr-base-printed',
                "Latin TrOCR (Printed Fallback)",
            )
            print(f"[INFO] Public fallback microsoft/trocr-base-printed loaded successfully on {self.device}")
        except Exception as ex:
            print(f"[ERROR] All printed Latin models failed to load: {ex}")
            self.trocr_latin_available = False
128
+
129
+
130
def setup_tesseract_fallback(self):
    """Configure Tesseract as the fallback engine for basic Latin recognition.

    Sets ``self.tesseract_available`` and, when ``pytesseract`` can be
    imported, ``self.ocr_configs`` - a mapping from preset name to Tesseract
    command-line options. A failing version probe is tolerated: the presets
    are still installed and the engine is still reported as available.
    """
    try:
        import pytesseract

        # Test Tesseract availability
        try:
            version = pytesseract.get_tesseract_version()
            print(f"[INFO] Tesseract fallback version: {version}")
        except Exception:
            # Narrower than a bare except: Ctrl-C and interpreter shutdown
            # must still propagate.
            print("[INFO] Tesseract version check skipped")

        self.ocr_configs = {
            'medieval_extended': r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,;:!?()[]{}/-·&℞℟℣†‡¶§꜠꜡ꜢꜣꜤꜥꝀꝁꝐꝑꝒꝓꝔꝕꝖꝗꝘꝙꝚꝛꝜꝝꞀꞁꞂꞃ$',
            'medieval_basic': r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,;:!?()[]{}/-',
            'standard': r'--oem 3 --psm 6',
            'single_line': r'--oem 3 --psm 7',
            'single_word': r'--oem 3 --psm 8',
            'auto': r'--oem 3 --psm 3'
        }
        self.tesseract_available = True
        print("[INFO] Tesseract fallback configured with medieval symbol support")

    except ImportError:
        print("[ERROR] pytesseract not available")
        self.tesseract_available = False
    except Exception as e:
        print(f"[WARN] Tesseract setup failed: {e}")
        self.tesseract_available = False
159
+
160
def detect_script(self, image_path):
    """Report whether this processor can run, with a fixed confidence.

    The image is not inspected here (the original docstring states that
    detection is handled by Groq Vision classification). The result depends
    only on engine availability: ``(True, 0.98)`` when TRIDIS is loaded,
    ``(True, 0.85)`` when only Tesseract is available, and ``(False, 0.0)``
    when neither is available or the check itself raises.
    """
    try:
        if not (self.tridis_available or self.tesseract_available):
            print("[ERROR] No OCR engines available for Latin processing")
            return False, 0.0

        if self.tridis_available:
            method, confidence = "TRIDIS HTR (medieval specialist)", 0.98
        else:
            method, confidence = "Tesseract fallback", 0.85

        print(f"[INFO] Latin processor activated - Using {method}")
        return True, confidence

    except Exception as e:
        print(f"[ERROR] Latin detection failed: {e}")
        return False, 0.0
174
+
175
def extract_text(self, image_path):
    """Transcribe a Latin image, trying engines in order of specialisation.

    Pipeline:
      1. Ask the layout parser for the writing style.
      2. Run the model matching that style (trocr-base-latin for
         ``"printed"``, TRIDIS HTR for everything else) and return its text
         if it passes ``_validate_latin_text``.
      3. Otherwise try the other neural model.
      4. Otherwise try multi-configuration Tesseract, then the layout-aware
         line-by-line Tesseract pass.

    ``self.active_style`` and ``self.active_model`` are set to describe the
    engine that actually produced the returned text. The fallback labels are
    tracked explicitly rather than derived from ``style == "cursive"``, so a
    style label other than "printed"/"cursive" (or a printed page routed to
    the manuscript branch because trocr is unavailable) is no longer reported
    under the wrong model.

    Args:
        image_path: Path of the image to transcribe.

    Returns:
        The transcription, or a human-readable failure message when every
        engine fails or an unexpected error occurs.
    """
    try:
        start_time = time.time()

        # Step 1: Detect script style
        style = self.layout_parser.detect_writing_style(image_path, self.clip_classifier)
        print(f"[INFO] Latin writing style detected: {style.upper()}")

        def _activate(model, label):
            # Bring an already-loaded model back onto the target device.
            from utils.gpu_diagnostics import reclaim_vram_for
            reclaim_vram_for("latin")
            if str(next(model.parameters()).device) != str(self.device):
                print(f"[VRAM MANAGER] Activating Latin {label} model on {self.device}...")
                model.to(self.device)

        # Ensure the model for the detected style is loaded before routing,
        # because the routing below reads the availability flags set by setup.
        if style == "printed":
            if self.trocr_latin_model is None:
                self.setup_trocr_base_latin()
            else:
                _activate(self.trocr_latin_model, "TrOCR (Printed)")
        else:
            if self.tridis_model is None:
                self.setup_tridis_htr()
            else:
                _activate(self.tridis_model, "TRIDIS HTR (Cursive)")

        fallback_text = ""
        fallback_style = ""
        fallback_model = ""

        if style == "printed" and getattr(self, 'trocr_latin_available', False):
            print("[INFO] Routing to printed/carved Latin model (trocr-base-latin)...")
            primary_text = self._extract_with_trocr_base_latin(image_path)
            if primary_text and self._validate_latin_text(primary_text, style):
                processing_time = time.time() - start_time
                print(f"[SUCCESS] Routed to trocr-base-latin and completed in {processing_time:.2f}s")
                self.active_style = "printed"
                self.active_model = "trocr-base-latin"
                return primary_text

            print("[WARN] trocr-base-latin returned poor quality result, trying TRIDIS HTR fallback...")
            if self.tridis_model is None:
                self.setup_tridis_htr()
            if getattr(self, 'tridis_available', False):
                fallback_text = self._extract_with_tridis_htr(image_path)
                fallback_style, fallback_model = "cursive", "tridis_HTR"

        else:  # cursive / manuscript (or printed without a usable trocr model)
            print("[INFO] Routing to medieval manuscript model (tridis_HTR)...")
            if getattr(self, 'tridis_available', False):
                primary_text = self._extract_with_tridis_htr(image_path)
                if primary_text and self._validate_latin_text(primary_text, style):
                    processing_time = time.time() - start_time
                    print(f"[SUCCESS] Routed to tridis_HTR and completed in {processing_time:.2f}s")
                    self.active_style = "cursive"
                    self.active_model = "tridis_HTR"
                    return primary_text
                print("[WARN] TRIDIS HTR returned poor quality result, trying trocr-base-latin fallback...")

            # The printed model is also tried when TRIDIS is unavailable, so
            # both neural engines get a chance before Tesseract.
            if self.trocr_latin_model is None:
                self.setup_trocr_base_latin()
            if getattr(self, 'trocr_latin_available', False):
                fallback_text = self._extract_with_trocr_base_latin(image_path)
                fallback_style, fallback_model = "printed", "trocr-base-latin"

        # Step 2: Check fallback text from the other model, validated with
        # the criteria of the model that produced it.
        if fallback_text and self._validate_latin_text(fallback_text, fallback_style):
            processing_time = time.time() - start_time
            print(f"[SUCCESS] Fallback model transcription successful in {processing_time:.2f}s")
            self.active_style = fallback_style
            self.active_model = fallback_model
            return fallback_text

        # Step 3: Tesseract fallback
        if getattr(self, 'tesseract_available', False):
            print("[INFO] Neural models failed. Processing with Tesseract fallback...")
            tesseract_text = self._extract_with_tesseract_enhanced(image_path)

            if tesseract_text and self._validate_latin_text(tesseract_text, "any"):
                processing_time = time.time() - start_time
                print(f"[SUCCESS] Tesseract fallback completed in {processing_time:.2f}s")
                self.active_style = "printed"  # Tesseract works best on printed
                self.active_model = "Tesseract OCR"
                return tesseract_text

            print("[WARN] Tesseract returned poor quality result, trying layout-aware segmentation fallback...")

        # Step 4: Layout-aware line segment fallback
        layout_aware_text = self._extract_layout_aware_ocr(image_path)
        if layout_aware_text and self._validate_latin_text(layout_aware_text, "any"):
            processing_time = time.time() - start_time
            print(f"[SUCCESS] Layout-aware OCR completed in {processing_time:.2f}s")
            self.active_style = "printed"
            self.active_model = "Tesseract Layout-Aware"
            return layout_aware_text

        print("[ERROR] All OCR methods failed or returned poor quality results")
        self.active_style = "unknown"
        self.active_model = "None"
        return "No readable Latin text detected with sufficient confidence"

    except Exception as e:
        print(f"[ERROR] Latin text extraction failed: {e}")
        self.active_style = "error"
        self.active_model = "None"
        return f"Error during text extraction: {str(e)}"
281
+
282
+ def _extract_with_trocr_base_latin(self, image_path):
283
+ """Extract text using trocr-base-latin - SPECIALIZED for printed/carved Latin"""
284
+ if self.trocr_latin_model is None:
285
+ self.setup_trocr_base_latin()
286
+ else:
287
+ from utils.gpu_diagnostics import reclaim_vram_for
288
+ reclaim_vram_for("latin")
289
+ if str(next(self.trocr_latin_model.parameters()).device) != str(self.device):
290
+ print(f"[VRAM MANAGER] Activating Latin TrOCR model on {self.device}...")
291
+ self.trocr_latin_model.to(self.device)
292
+
293
+ if not getattr(self, 'trocr_latin_available', False) or self.trocr_latin_model is None:
294
+ return ""
295
+
296
+ try:
297
+ image = Image.open(image_path).convert("RGB")
298
+ print(f"[INFO] Processing image with trocr-base-latin: {image.size[0]}x{image.size[1]} pixels")
299
+
300
+ # Since trocr models are line-level OCR models, segment into lines first
301
+ layout = self.layout_parser.analyze_layout(image_path)
302
+ crops = self.layout_parser.crop_lines(image_path, layout)
303
+
304
+ if crops and len(crops) > 1:
305
+ print(f"[INFO] Image contains multiple lines ({len(crops)}). Running line-by-line trocr-base-latin...")
306
+ line_texts = []
307
+ for idx, crop in enumerate(crops):
308
+ text = self._ocr_single_crop_with_trocr_base_latin(crop)
309
+ if text:
310
+ line_texts.append(text)
311
+ return "\n".join(line_texts)
312
+ else:
313
+ print("[INFO] Single line detected or layout parser returned no lines. Processing full image...")
314
+ return self._ocr_single_crop_with_trocr_base_latin(image)
315
+
316
+ except Exception as e:
317
+ print(f"[ERROR] trocr-base-latin extraction failed: {e}")
318
+ return ""
319
+
320
+ def _ocr_single_crop_with_trocr_base_latin(self, crop_image):
321
+ """Helper to run trocr-base-latin inference on a single image crop"""
322
+ try:
323
+ pixel_values = self.trocr_latin_processor(
324
+ images=crop_image,
325
+ return_tensors="pt"
326
+ ).pixel_values.to(self.device)
327
+
328
+ with torch.inference_mode():
329
+ generated_ids = self.trocr_latin_model.generate(
330
+ pixel_values,
331
+ max_length=512,
332
+ num_beams=4,
333
+ early_stopping=True
334
+ )
335
+
336
+ text = self.trocr_latin_processor.batch_decode(
337
+ generated_ids,
338
+ skip_special_tokens=True
339
+ )[0]
340
+
341
+ text = ' '.join(text.split())
342
+ return text.strip()
343
+ except Exception as e:
344
+ print(f"[ERROR] Single line OCR with trocr-base-latin failed: {e}")
345
+ return ""
346
+
347
+ def _extract_with_tridis_htr(self, image_path):
348
+ """Extract text using TRIDIS HTR - SPECIALIZED for medieval Latin manuscripts.
349
+ Uses layout-aware line segmentation so multi-line documents are fully transcribed."""
350
+ if self.tridis_model is None:
351
+ self.setup_tridis_htr()
352
+ else:
353
+ from utils.gpu_diagnostics import reclaim_vram_for
354
+ reclaim_vram_for("latin")
355
+ if str(next(self.tridis_model.parameters()).device) != str(self.device):
356
+ print(f"[VRAM MANAGER] Activating Latin TRIDIS model on {self.device}...")
357
+ self.tridis_model.to(self.device)
358
+
359
+ if not getattr(self, 'tridis_available', False) or self.tridis_model is None:
360
+ return ""
361
+
362
+ try:
363
+ # Load and validate image
364
+ image = Image.open(image_path).convert("RGB")
365
+ print(f"[INFO] Processing image with TRIDIS HTR: {image.size[0]}x{image.size[1]} pixels")
366
+
367
+ # Use layout parser to segment into individual lines
368
+ layout = self.layout_parser.analyze_layout(image_path)
369
+ crops = self.layout_parser.crop_lines(image_path, layout)
370
+
371
+ if crops and len(crops) > 1:
372
+ # Cap lines to prevent timeout on very large documents (CPU inference)
373
+ MAX_LINES = 50
374
+ total_detected = len(crops)
375
+ if len(crops) > MAX_LINES:
376
+ print(f"[INFO] Layout parser detected {total_detected} text lines. Capping to {MAX_LINES} for performance.")
377
+ crops = crops[:MAX_LINES]
378
+ else:
379
+ print(f"[INFO] Layout parser detected {total_detected} text lines. Running line-by-line TRIDIS HTR...")
380
+
381
+ line_texts = []
382
+ for idx, crop in enumerate(crops):
383
+ # Preprocess each line crop for medieval manuscripts
384
+ enhanced_crop = self._preprocess_for_medieval_manuscript(crop)
385
+ text = self._ocr_single_crop_with_tridis(enhanced_crop)
386
+ if text:
387
+ line_texts.append(text)
388
+ print(f" [LINE {idx+1}/{len(crops)}] {text[:80]}...")
389
+
390
+ if line_texts:
391
+ full_text = "\n".join(line_texts)
392
+ # Post-process medieval abbreviations, corrections, and formatting
393
+ processed_text = self._post_process_medieval_text(full_text)
394
+
395
+ char_count = len(processed_text)
396
+ word_count = len(processed_text.split())
397
+ print(f"[INFO] TRIDIS HTR extracted (multi-line): {char_count} characters, {word_count} words from {len(line_texts)} lines")
398
+
399
+ medieval_features = self._analyze_medieval_features(processed_text)
400
+ if medieval_features:
401
+ print(f"[INFO] Medieval features detected: {', '.join(medieval_features)}")
402
+
403
+ return processed_text.strip()
404
+
405
+ # Single line or no layout detected — process full image
406
+ print("[INFO] Single line or no layout segmentation. Processing full image with TRIDIS HTR...")
407
+ enhanced_image = self._preprocess_for_medieval_manuscript(image)
408
+
409
+ # Process with TRIDIS HTR
410
+ print("[INFO] Running TRIDIS HTR inference...")
411
+ pixel_values = self.tridis_processor(
412
+ images=enhanced_image,
413
+ return_tensors="pt"
414
+ ).pixel_values.to(self.device)
415
+
416
+ # Generate text with parameters optimized for medieval manuscripts
417
+ with torch.inference_mode():
418
+ generated_ids = self.tridis_model.generate(
419
+ pixel_values,
420
+ max_length=768, # Longer sequences for medieval texts with abbreviations
421
+ num_beams=6, # Higher quality beam search for historical accuracy
422
+ early_stopping=True,
423
+ do_sample=False,
424
+ repetition_penalty=1.15, # Avoid repetition common in medieval texts
425
+ length_penalty=0.8, # Don't penalize longer expansions
426
+ no_repeat_ngram_size=2 # Avoid immediate repetitions
427
+ )
428
+
429
+ # Decode the generated text
430
+ generated_text = self.tridis_processor.batch_decode(
431
+ generated_ids,
432
+ skip_special_tokens=True
433
+ )[0]
434
+
435
+ # Post-process medieval abbreviations, corrections, and formatting
436
+ processed_text = self._post_process_medieval_text(generated_text)
437
+
438
+ # Log extraction results
439
+ char_count = len(processed_text)
440
+ word_count = len(processed_text.split())
441
+ print(f"[INFO] TRIDIS HTR extracted: {char_count} characters, {word_count} words")
442
+
443
+ # Detect medieval features
444
+ medieval_features = self._analyze_medieval_features(processed_text)
445
+ if medieval_features:
446
+ print(f"[INFO] Medieval features detected: {', '.join(medieval_features)}")
447
+
448
+ return processed_text.strip()
449
+
450
+ except Exception as e:
451
+ print(f"[ERROR] TRIDIS HTR extraction failed: {e}")
452
+ return ""
453
+
454
+ def _ocr_single_crop_with_tridis(self, crop_image):
455
+ """Helper to run TRIDIS HTR inference on a single line crop image"""
456
+ try:
457
+ pixel_values = self.tridis_processor(
458
+ images=crop_image,
459
+ return_tensors="pt"
460
+ ).pixel_values.to(self.device)
461
+
462
+ with torch.inference_mode():
463
+ generated_ids = self.tridis_model.generate(
464
+ pixel_values,
465
+ max_length=768,
466
+ num_beams=6,
467
+ early_stopping=True,
468
+ do_sample=False,
469
+ repetition_penalty=1.15,
470
+ length_penalty=0.8,
471
+ no_repeat_ngram_size=2
472
+ )
473
+
474
+ text = self.tridis_processor.batch_decode(
475
+ generated_ids,
476
+ skip_special_tokens=True
477
+ )[0]
478
+
479
+ text = ' '.join(text.split())
480
+ return text.strip()
481
+ except Exception as e:
482
+ print(f"[ERROR] Single line OCR with TRIDIS failed: {e}")
483
+ return ""
484
+
485
+ def _preprocess_for_medieval_manuscript(self, image):
486
+ """Enhanced preprocessing specifically optimized for medieval manuscripts"""
487
+ try:
488
+ print("[INFO] Applying medieval manuscript preprocessing...")
489
+
490
+ # Convert to OpenCV format
491
+ image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
492
+ gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
493
+
494
+ # Step 1: Handle parchment/paper background variations
495
+ # CLAHE for local contrast enhancement (handles uneven illumination)
496
+ clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8,8))
497
+ contrast_enhanced = clahe.apply(gray)
498
+
499
+ # Step 2: Gentle denoising to preserve medieval letterforms and ink variations
500
+ # Bilateral filter preserves edges while reducing noise
501
+ denoised = cv2.bilateralFilter(contrast_enhanced, 7, 80, 80)
502
+
503
+ # Step 3: Enhance faded ink while preserving original stroke width
504
+ # Subtle sharpening kernel
505
+ sharpen_kernel = np.array([
506
+ [-0.5, -1, -0.5],
507
+ [-1, 6, -1 ],
508
+ [-0.5, -1, -0.5]
509
+ ])
510
+ sharpened = cv2.filter2D(denoised, -1, sharpen_kernel)
511
+
512
+ # Step 4: Normalize intensity range for optimal TRIDIS input
513
+ normalized = cv2.normalize(sharpened, None, 0, 255, cv2.NORM_MINMAX)
514
+
515
+ # Convert back to PIL format and ensure it is RGB mode
516
+ processed_image = Image.fromarray(normalized).convert("RGB")
517
+
518
+ print("[INFO] Medieval preprocessing completed: contrast enhanced, denoised, sharpened")
519
+ return processed_image
520
+
521
+ except Exception as e:
522
+ print(f"[WARN] Medieval preprocessing failed: {e}, using original image")
523
+ return image
524
+
525
+ def _post_process_medieval_text(self, text):
526
+ """Post-process text from TRIDIS HTR with medieval-specific corrections"""
527
+ try:
528
+ if not text:
529
+ return text
530
+
531
+ print("[INFO] Post-processing TRIDIS HTR output for medieval features...")
532
+ processed = text
533
+
534
+ # Handle TRIDIS cancellation/correction markers
535
+ # TRIDIS uses $word$ to mark cancelled/corrected text
536
+ import re
537
+
538
+ # Count cancellations before processing
539
+ cancellation_count = processed.count('$') // 2
540
+
541
+ # Convert $word$ to editorial brackets [word] for scholarly display
542
+ processed = re.sub(r'\$([^$]*)\$', r'[\1]', processed)
543
+
544
+ if cancellation_count > 0:
545
+ print(f"[INFO] Processed {cancellation_count} scribal corrections/cancellations")
546
+
547
+ # Clean up multiple spaces and normalize whitespace
548
+ processed = ' '.join(processed.split())
549
+
550
+ # Detect and log TRIDIS abbreviation expansions
551
+ # Common medieval abbreviations that TRIDIS expands automatically
552
+ medieval_expansions = {
553
+ 'domini': 'dñi/dni/dom̃',
554
+ 'facimus': 'facim̃/facimꝰ',
555
+ 'quod': 'qd/q̃d',
556
+ 'enim': 'enim̃/en̄',
557
+ 'pro': 'ꝓ/p̃',
558
+ 'et': '⁊/et̃',
559
+ 'cum': 'cũ/cum̃',
560
+ 'per': 'p̃/ꝑ',
561
+ 'sunt': 'sũt/sunt̃',
562
+ 'omnia': 'om̃ia/omn̄a'
563
+ }
564
+
565
+ expansions_found = []
566
+ for expansion, abbreviations in medieval_expansions.items():
567
+ if expansion in processed.lower():
568
+ expansions_found.append(f"{abbreviations}→{expansion}")
569
+
570
+ if expansions_found:
571
+ print(f"[INFO] TRIDIS expanded abbreviations: {', '.join(expansions_found[:5])}")
572
+ if len(expansions_found) > 5:
573
+ print(f"[INFO] ... and {len(expansions_found) - 5} more abbreviations")
574
+
575
+ # Detect capitalization patterns (TRIDIS capitalizes named entities)
576
+ capitalized_words = re.findall(r'\b[A-Z][a-z]+', processed)
577
+ if capitalized_words:
578
+ unique_caps = list(set(capitalized_words))
579
+ print(f"[INFO] Named entities capitalized: {', '.join(unique_caps[:5])}")
580
+ if len(unique_caps) > 5:
581
+ print(f"[INFO] ... and {len(unique_caps) - 5} more entities")
582
+
583
+ return processed
584
+
585
+ except Exception as e:
586
+ print(f"[WARN] Medieval post-processing failed: {e}")
587
+ return text
588
+
589
+ def _analyze_medieval_features(self, text):
590
+ """Analyze and identify medieval manuscript features in the text"""
591
+ features = []
592
+
593
+ if not text:
594
+ return features
595
+
596
+ try:
597
+ # Cancellation markers
598
+ if '[' in text and ']' in text:
599
+ features.append("scribal corrections")
600
+
601
+ # Expanded abbreviations
602
+ medieval_words = ['domini', 'facimus', 'quod', 'enim', 'pro', 'cum', 'per', 'sunt', 'omnia']
603
+ found_expansions = [word for word in medieval_words if word in text.lower()]
604
+ if found_expansions:
605
+ features.append(f"abbreviation expansions ({len(found_expansions)})")
606
+
607
+ # Named entity capitalization
608
+ import re
609
+ caps_count = len(re.findall(r'\b[A-Z][a-z]+', text))
610
+ if caps_count > 0:
611
+ features.append(f"capitalized entities ({caps_count})")
612
+
613
+ # Medieval punctuation patterns
614
+ if '.' in text or ',' in text or ':' in text:
615
+ features.append("punctuation normalization")
616
+
617
+ # Special medieval characters
618
+ medieval_chars = sum(1 for c in text if c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§")
619
+ if medieval_chars > 0:
620
+ features.append(f"medieval symbols ({medieval_chars})")
621
+
622
+ except Exception as e:
623
+ print(f"[WARN] Medieval feature analysis failed: {e}")
624
+
625
+ return features
626
+
627
+ def _extract_with_tesseract_enhanced(self, image_path):
628
+ """Enhanced Tesseract extraction with multiple configurations"""
629
+ try:
630
+ import pytesseract
631
+
632
+ image = Image.open(image_path).convert("RGB")
633
+
634
+ # Multiple preprocessing approaches
635
+ preprocessed_images = {
636
+ 'enhanced': self._preprocess_for_tesseract_enhanced(image),
637
+ 'basic': self._preprocess_for_tesseract_basic(image),
638
+ 'original': image
639
+ }
640
+
641
+ best_text = ""
642
+ best_score = 0
643
+ best_config = ""
644
+ best_preprocessing = ""
645
+
646
+ # Try different combinations of preprocessing and OCR configurations
647
+ for prep_name, prep_image in preprocessed_images.items():
648
+ for config_name, config in self.ocr_configs.items():
649
+ try:
650
+ # Try with Latin language first
651
+ text = pytesseract.image_to_string(
652
+ prep_image,
653
+ lang='lat',
654
+ config=config
655
+ ).strip()
656
+
657
+ # If Latin fails or produces poor results, try English
658
+ if not text or len(text) < 5:
659
+ text = pytesseract.image_to_string(
660
+ prep_image,
661
+ lang='eng',
662
+ config=config
663
+ ).strip()
664
+
665
+ # Score the result
666
+ score = self._score_tesseract_result(text)
667
+
668
+ if text and score > best_score:
669
+ best_text = text
670
+ best_score = score
671
+ best_config = config_name
672
+ best_preprocessing = prep_name
673
+
674
+ except Exception as e:
675
+ continue # Skip failed configurations
676
+
677
+ if best_text:
678
+ print(f"[INFO] Best Tesseract result: {best_preprocessing} + {best_config} (score: {best_score:.3f})")
679
+ return self._post_process_tesseract_text(best_text)
680
+
681
+ return ""
682
+
683
+ except Exception as e:
684
+ print(f"[ERROR] Enhanced Tesseract extraction failed: {e}")
685
+ return ""
686
+
687
+ def _extract_layout_aware_ocr(self, image_path):
688
+ """Extract text by segmenting the page layout into lines first for improved readability order"""
689
+ try:
690
+ import pytesseract
691
+ print("[INFO] Running layout-aware line segmentation...")
692
+ layout = self.layout_parser.analyze_layout(image_path)
693
+ crops = self.layout_parser.crop_lines(image_path, layout)
694
+
695
+ if not crops:
696
+ print("[WARN] Layout parser returned no line crops")
697
+ return ""
698
+
699
+ print(f"[INFO] Layout-aware line parser cropped {len(crops)} lines")
700
+ line_texts = []
701
+
702
+ for idx, crop in enumerate(crops):
703
+ # Enhance line crop for OCR
704
+ crop_cv = cv2.cvtColor(np.array(crop), cv2.COLOR_RGB2BGR)
705
+ gray = cv2.cvtColor(crop_cv, cv2.COLOR_BGR2GRAY)
706
+ clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(4,4))
707
+ enhanced = clahe.apply(gray)
708
+ crop_pil = Image.fromarray(enhanced)
709
+
710
+ # Single line OCR configuration
711
+ config = '--oem 3 --psm 7'
712
+
713
+ # Try Latin OCR first
714
+ text = pytesseract.image_to_string(
715
+ crop_pil,
716
+ lang='lat',
717
+ config=config
718
+ ).strip()
719
+
720
+ # Try English fallback
721
+ if not text or len(text) < 3:
722
+ text = pytesseract.image_to_string(
723
+ crop_pil,
724
+ lang='eng',
725
+ config=config
726
+ ).strip()
727
+
728
+ if text:
729
+ line_texts.append(self._post_process_tesseract_text(text))
730
+
731
+ return "\n".join(line_texts)
732
+ except Exception as e:
733
+ print(f"[WARN] Layout aware Latin OCR failed: {e}")
734
+ return ""
735
+
736
+ def _preprocess_for_tesseract_enhanced(self, image):
737
+ """Enhanced preprocessing for Tesseract OCR"""
738
+ try:
739
+ image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
740
+ gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
741
+
742
+ # More aggressive enhancement for Tesseract
743
+ clahe = cv2.createCLAHE(clipLimit=4.0, tileGridSize=(8,8))
744
+ enhanced = clahe.apply(gray)
745
+
746
+ # Morphological operations to clean up characters
747
+ kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
748
+ cleaned = cv2.morphologyEx(enhanced, cv2.MORPH_CLOSE, kernel)
749
+
750
+ return Image.fromarray(cleaned)
751
+
752
+ except Exception as e:
753
+ print(f"[WARN] Enhanced Tesseract preprocessing failed: {e}")
754
+ return image
755
+
756
+ def _preprocess_for_tesseract_basic(self, image):
757
+ """Basic preprocessing for Tesseract OCR"""
758
+ try:
759
+ image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
760
+ gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
761
+
762
+ # Simple contrast enhancement
763
+ clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
764
+ enhanced = clahe.apply(gray)
765
+
766
+ return Image.fromarray(enhanced)
767
+
768
+ except Exception as e:
769
+ return image
770
+
771
+ def _score_tesseract_result(self, text):
772
+ """Score Tesseract OCR result quality"""
773
+ if not text or len(text.strip()) < 2:
774
+ return 0.0
775
+
776
+ score = 0.0
777
+ words = text.split()
778
+
779
+ # Base length bonus
780
+ score += min(len(words) / 15.0, 0.25)
781
+
782
+ # Latin character ratio
783
+ latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
784
+ if len(text) > 0:
785
+ latin_ratio = latin_chars / len(text)
786
+ score += latin_ratio * 0.35
787
+
788
+ # Word formation bonus
789
+ if len(words) > 1:
790
+ score += 0.2
791
+
792
+ # Common Latin words bonus
793
+ common_latin = ['et', 'in', 'de', 'ad', 'cum', 'pro', 'per', 'ex', 'ab', 'post', 'ante', 'inter']
794
+ latin_matches = sum(1 for word in words if word.lower() in common_latin)
795
+ if latin_matches > 0:
796
+ score += latin_matches * 0.05
797
+
798
+ # Medieval symbols bonus
799
+ medieval_symbols = ['꜠', '꜡', 'ꜣ', 'ꜥ', 'ꝁ', 'ꝑ', 'ꝛ', 'ꞁ', 'ꞃ', '℞', '℟', '℣', '†', '‡', '¶', '§']
800
+ symbol_count = sum(1 for symbol in medieval_symbols if symbol in text)
801
+ if symbol_count > 0:
802
+ score += 0.15
803
+
804
+ # Penalize excessive garbage characters
805
+ garbage_chars = sum(1 for c in text if not c.isalnum() and c not in " .,;:!?()[]{}/-·&℞℟℣†‡¶§꜠꜡ꜢꜣꜤꜥꝀꝁ")
806
+ if len(text) > 0:
807
+ garbage_ratio = garbage_chars / len(text)
808
+ score -= garbage_ratio * 0.3
809
+
810
+ return max(0.0, min(1.0, score))
811
+
812
+ def _post_process_tesseract_text(self, text):
813
+ """Post-process Tesseract OCR result"""
814
+ try:
815
+ # Clean up common OCR errors
816
+ corrections = {
817
+ 'rn': 'm',
818
+ 'cl': 'd',
819
+ '|': 'I',
820
+ '°': 'o',
821
+ '¢': 'c',
822
+ '£': 'E'
823
+ }
824
+
825
+ processed = text
826
+ for wrong, correct in corrections.items():
827
+ processed = processed.replace(wrong, correct)
828
+
829
+ # Normalize whitespace
830
+ processed = ' '.join(processed.split())
831
+
832
+ return processed
833
+
834
+ except Exception as e:
835
+ print(f"[WARN] Tesseract post-processing failed: {e}")
836
+ return text
837
+
838
+ def _validate_latin_text(self, text, style="any"):
839
+ """Validate text with criteria appropriate for classical/printed or medieval Latin"""
840
+ if not text or len(text.strip()) < 3:
841
+ return False
842
+
843
+ try:
844
+ # Count Latin characters
845
+ latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
846
+ total_chars = len(text.replace(' ', ''))
847
+
848
+ if total_chars == 0:
849
+ return False
850
+
851
+ latin_ratio = latin_chars / max(total_chars, 1)
852
+
853
+ # For printed/classical Latin, we require a high ratio of standard alphabetical letters
854
+ if style == "printed":
855
+ return latin_chars >= 5 and latin_ratio >= 0.6
856
+
857
+ # For cursive/medieval Latin, we can be more generous and include medieval symbol weight
858
+ medieval_symbols = sum(1 for c in text if c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§[]")
859
+ medieval_words = ['domini', 'facimus', 'quod', 'enim', 'pro', 'cum', 'per', 'sunt']
860
+ word_bonus = sum(3 for word in medieval_words if word in text.lower())
861
+
862
+ total_meaningful = latin_chars + medieval_symbols + word_bonus
863
+ meaningful_ratio = total_meaningful / max(total_chars, 1)
864
+
865
+ if total_meaningful >= 10:
866
+ return True
867
+ elif meaningful_ratio >= 0.6:
868
+ return True
869
+ elif total_meaningful >= 5 and meaningful_ratio >= 0.3:
870
+ return True
871
+ else:
872
+ return False
873
+
874
+ except Exception as e:
875
+ print(f"[WARN] Text validation failed: {e}")
876
+ return len(text.strip()) >= 5 # Fallback validation
877
+
878
+
879
def process_text(self, latin_text):
    """Build the analysis record for an extracted Latin transcription.

    Returns:
        A dict with four keys: ``text`` (the input), ``symbols`` (the
        alphanumeric characters plus medieval symbols and ``$[]`` markers,
        concatenated), ``char_analysis`` (counts, detected abbreviation
        expansions, capitalised words) and ``validation`` (quality score,
        engine/style labels taken from ``self.active_model`` and
        ``self.active_style`` when set, and feature flags). For falsy input
        an empty skeleton is returned in which ``symbols`` is an empty list.
    """
    if not latin_text:
        return {"text": "", "symbols": [], "char_analysis": {}, "validation": {}}

    print("[INFO] Processing Latin text with medieval manuscript analysis...")

    import re

    medieval_alphabet = "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§"
    basic_latin = "abcdefghijklmnopqrstuvwxyz"
    lowered = latin_text.lower()
    tokens = latin_text.split()

    # Extract symbols including medieval markers and corrections
    symbol_alphabet = medieval_alphabet + "$[]"
    symbols = ''.join(ch for ch in latin_text if ch.isalnum() or ch in symbol_alphabet)

    # Medieval characters and correction markers
    medieval_symbols = [ch for ch in latin_text if ch in medieval_alphabet]
    correction_markers = latin_text.count('[') + latin_text.count('$')

    # Detect expanded abbreviations
    known_expansions = ['domini', 'facimus', 'pro', 'quod', 'enim', 'cum', 'per', 'sunt', 'omnia']
    expansions_found = [word for word in known_expansions if word in lowered]

    # Capitalized entities (TRIDIS feature)
    unique_entities = list(set(re.findall(r'\b[A-Z][a-z]+', latin_text)))

    char_analysis = {
        "total_chars": len(latin_text),
        "alpha_chars": sum(1 for ch in latin_text if ch.isalpha()),
        "unique_chars": len(set(latin_text)),
        "word_count": len(tokens),
        "medieval_symbols": len(medieval_symbols),
        "medieval_symbol_types": list(set(medieval_symbols)),
        "abbreviation_expansions": expansions_found,
        "expansion_count": len(expansions_found),
        "correction_markers": correction_markers,
        "capitalized_entities": unique_entities,
        "entity_count": len(unique_entities),
        "avg_word_length": sum(len(token) for token in tokens) / max(1, len(tokens))
    }

    # Labels depend on which engine/style the last extraction recorded
    is_printed = getattr(self, 'active_style', '') == 'printed'
    default_method = (
        "TRIDIS HTR (Medieval Manuscript Specialist)" if self.tridis_available else "Tesseract OCR"
    )
    latin_letter_total = sum(1 for ch in latin_text if ch.isalpha() and ch.lower() in basic_latin)

    if is_printed:
        specialization = "General Latin text"
        period = "Classical/Roman Monumental"
    else:
        specialization = "13th-16th century manuscripts" if self.tridis_available else "General Latin text"
        if medieval_symbols or expansions_found:
            period = "Late Medieval (13th-16th centuries)"
        else:
            period = "Classical/Modern"

    validation = {
        "latin_ratio": latin_letter_total / max(1, len(latin_text)),
        "quality_score": self._calculate_comprehensive_quality_score(latin_text),
        "ocr_method": getattr(self, 'active_model', default_method),
        "model_specialization": specialization,
        "medieval_features_detected": bool(medieval_symbols or expansions_found or correction_markers),
        "tridis_used": getattr(self, 'active_model', '') == 'tridis_HTR',
        "manuscript_period": period,
        "text_type": "classical_inscription" if is_printed else self._determine_text_type(latin_text),
        "abbreviations_expanded": len(expansions_found) > 0,
        "named_entities_detected": len(unique_entities) > 0,
        "scribal_corrections_found": correction_markers > 0,
        "confidence_level": self._determine_confidence_level(latin_text),
        "writing_style": getattr(self, 'active_style', 'cursive')
    }

    return {
        "text": latin_text,
        "symbols": symbols,
        "char_analysis": char_analysis,
        "validation": validation
    }
941
+
942
+ def _calculate_comprehensive_quality_score(self, text):
943
+ """Calculate comprehensive quality score with medieval bonuses"""
944
+ if not text:
945
+ return 0.0
946
+
947
+ score = 0.0
948
+ words = text.split()
949
+
950
+ # Base metrics
951
+ score += min(len(words) / 15.0, 0.2) # Length bonus (max 0.2)
952
+
953
+ # Latin character ratio
954
+ latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
955
+ score += (latin_chars / max(1, len(text))) * 0.25
956
+
957
+ # TRIDIS Medieval bonuses (only if TRIDIS was used)
958
+ if self.tridis_available and getattr(self, 'active_model', '') == 'tridis_HTR':
959
+ # Expanded abbreviations (major quality indicator)
960
+ medieval_expansions = ['domini', 'facimus', 'pro', 'quod', 'enim', 'cum', 'per', 'sunt']
961
+ expansion_count = sum(1 for exp in medieval_expansions if exp in text.lower())
962
+ score += min(expansion_count * 0.05, 0.2) # Max 0.2 bonus
963
+
964
+ # Named entity capitalization (TRIDIS feature)
965
+ import re
966
+ caps_count = len(re.findall(r'\b[A-Z][a-z]+', text))
967
+ score += min(caps_count * 0.02, 0.15) # Max 0.15 bonus
968
+
969
+ # Correction markers (authenticity indicator)
970
+ corrections = text.count('[') + text.count('$')
971
+ score += min(corrections * 0.03, 0.1) # Max 0.1 bonus
972
+
973
+ # Medieval symbols (regardless of OCR method)
974
+ medieval_symbols = ['꜠', '꜡', 'ꜣ', 'ꜥ', 'ꝁ', 'ꝑ', 'ꝛ', 'ꞁ', 'ꞃ', '℞', '℟', '℣', '†', '‡', '¶', '§']
975
+ symbol_count = sum(1 for symbol in medieval_symbols if symbol in text)
976
+ score += min(symbol_count * 0.04, 0.15) # Max 0.15 bonus
977
+
978
+
979
+ # Word formation
980
+ if len(words) > 1:
981
+ score += 0.1
982
+
983
+ # Common Latin words
984
+ common_latin = ['et', 'in', 'de', 'ad', 'cum', 'pro', 'per', 'ex', 'ab']
985
+ latin_matches = sum(1 for word in words if word.lower() in common_latin)
986
+ score += min(latin_matches * 0.02, 0.1)
987
+
988
+ return max(0.0, min(1.0, score))
989
+
990
+ def _determine_text_type(self, text):
991
+ """Determine the type of Latin text based on features"""
992
+ if not text:
993
+ return "unknown"
994
+
995
+ # Medieval indicators
996
+ medieval_expansions = ['domini', 'facimus', 'quod', 'enim']
997
+ has_expansions = any(exp in text.lower() for exp in medieval_expansions)
998
+ has_corrections = '[' in text or '$' in text
999
+ has_medieval_symbols = any(c in text for c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§")
1000
+
1001
+ if has_expansions and has_corrections:
1002
+ return "medieval_documentary_manuscript"
1003
+ elif has_expansions or has_medieval_symbols:
1004
+ return "medieval_manuscript"
1005
+ elif has_corrections:
1006
+ return "manuscript_with_corrections"
1007
+ else:
1008
+ return "classical_latin_text"
1009
+
1010
+ def _determine_confidence_level(self, text):
1011
+ """Determine confidence level based on text characteristics"""
1012
+ score = self._calculate_comprehensive_quality_score(text)
1013
+
1014
+ if score >= 0.8:
1015
+ return "Very High"
1016
+ elif score >= 0.6:
1017
+ return "High"
1018
+ elif score >= 0.4:
1019
+ return "Medium"
1020
+ elif score >= 0.2:
1021
+ return "Low"
1022
+ else:
1023
+ return "Very Low"
1024
+
1025
def generate_historical_context(self, processed_result):
    """Assemble the historical-context payload for a processed Latin text.

    Reads ``processed_result["text"]`` (empty string when absent) and returns
    a dict with three entries: ``uses_box`` (title plus per-character notes),
    ``meaning_box`` (the enhanced analysis box) and ``references`` (grounding
    entries fetched from the RAG service).
    """
    latin_text = processed_result.get("text", "")

    # The Groq narrative is requested first; it feeds the meaning box below.
    groq_detail = self._generate_groq_context(latin_text)

    # Retrieval terms: every word, followed by every non-whitespace character.
    query_terms = []
    if latin_text:
        query_terms = re.findall(r'\w+', latin_text)
        query_terms += [symbol for symbol in latin_text if symbol.strip()]
    references = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)

    uses_box = {
        "title": "Medieval Latin manuscript analysis",
        "items": self._build_uses_list(latin_text),
    }
    meaning_box = self._build_enhanced_meaning_box(latin_text, groq_detail, processed_result)

    return {
        "uses_box": uses_box,
        "meaning_box": meaning_box,
        "references": references,
    }
1046
+
1047
+ def _generate_groq_context(self, latin_text):
1048
+ """Generate contextual information using Groq with medieval awareness"""
1049
+ if not self.groq_client.is_available():
1050
+ return "(Groq unavailable) Historical context generation requires GROQ_API_KEY and groq package."
1051
+
1052
+ # Analyze medieval features for context
1053
+ has_expansions = any(word in latin_text.lower() for word in ['domini', 'facimus', 'quod', 'enim'])
1054
+ has_corrections = '[' in latin_text or '$' in latin_text
1055
+ has_caps = any(c.isupper() for c in latin_text)
1056
+
1057
+ if is_gibberish(latin_text):
1058
+ prompt = (
1059
+ "The following sequence appears to be fragmentary medieval Latin text, possibly with scribal abbreviations or corrections. "
1060
+ "Provide a concise, scholarly paragraph (6-10 sentences) covering possible meanings, historical context of medieval Latin manuscripts, "
1061
+ "common abbreviation practices, and typical documentary uses in 13th-16th century Europe."
1062
+ )
1063
+ else:
1064
+ context_note = ""
1065
+ if has_expansions:
1066
+ context_note += "The text contains expanded medieval abbreviations. "
1067
+ if has_corrections:
1068
+ context_note += "Scribal corrections or cancellations are present. "
1069
+ if has_caps:
1070
+ context_note += "Named entities appear to be properly capitalized. "
1071
+
1072
+ prompt = (
1073
+ f"Analyze this medieval Latin text: {latin_text}\n\n"
1074
+ f"Context: {context_note}This appears to be from a medieval manuscript (13th-16th centuries). "
1075
+ f"Provide a scholarly paragraph (6-10 sentences) on its historical significance, cultural context, "
1076
+ f"likely documentary purpose, and interpretations. Focus on medieval manuscript practices, "
1077
+ f"legal/administrative contexts, and paleographic significance."
1078
+ )
1079
+
1080
+ system_prompt = "You are a medieval Latin paleography specialist and historian. Provide accurate, concise scholarly analysis focusing on manuscript traditions, abbreviation practices, and documentary contexts of the late medieval period."
1081
+ enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, latin_text)
1082
+
1083
+ return self.groq_client.generate_response(
1084
+ system_prompt=enriched_system_prompt,
1085
+ user_prompt=prompt
1086
+ ) or "(Historical context unavailable due to Groq error)"
1087
+
1088
+ def _build_uses_list(self, latin_text):
1089
+ """Build enhanced list of character uses with TRIDIS context"""
1090
+ notes = self.references.get("latin_symbol_notes", {}) or {}
1091
+ default_hint = self.references.get("latin_hint",
1092
+ "Letters and symbols reflect phonetic values and scribal practices in medieval manuscripts.")
1093
+
1094
+ seen = set()
1095
+ items = []
1096
+
1097
+ # Add TRIDIS-specific information for medieval features
1098
+ tridis_notes = {
1099
+ '[': "Editorial bracket indicating scribal correction or cancellation (TRIDIS transcription standard)",
1100
+ '$': "Cancellation marker for struck-through text (TRIDIS notation)",
1101
+ }
1102
+
1103
+ for ch in latin_text:
1104
+ if ch in seen or not ch.strip():
1105
+ continue
1106
+ seen.add(ch)
1107
+
1108
+ # Check TRIDIS-specific notes first
1109
+ if ch in tridis_notes:
1110
+ note = tridis_notes[ch]
1111
+ elif ch in notes:
1112
+ note = notes[ch]
1113
+ else:
1114
+ note = default_hint
1115
+
1116
+ items.append(f"- {ch}: {note}")
1117
+
1118
+ if not items:
1119
+ items.append("- —: " + default_hint)
1120
+
1121
+ # Limit to prevent overwhelming output
1122
+ return items[:20]
1123
+
1124
+ def _build_enhanced_meaning_box(self, latin_text, groq_detail, processed_result):
1125
+ """Build comprehensive meaning box with TRIDIS medieval analysis"""
1126
+ char_analysis = processed_result.get("char_analysis", {})
1127
+ validation = processed_result.get("validation", {})
1128
+
1129
+ # Enhanced introduction with TRIDIS context
1130
+ processing_method = validation.get("ocr_method", "Unknown OCR")
1131
+ text_type = validation.get("text_type", "unknown")
1132
+ confidence = validation.get("confidence_level", "Unknown")
1133
+
1134
+ intro_lines = [
1135
+ f"Text processed using {processing_method} with confidence level: {confidence}.",
1136
+ ]
1137
+
1138
+ if self.tridis_available:
1139
+ intro_lines.extend([
1140
+ "TRIDIS HTR model trained on 245,000 lines of medieval manuscripts (13th-16th centuries).",
1141
+ "Specializes in Latin, Old French, Old Spanish documentary texts with automatic abbreviation expansion."
1142
+ ])
1143
+
1144
+ # Medieval features summary
1145
+ medieval_features = []
1146
+ expansion_count = char_analysis.get("expansion_count", 0)
1147
+ if expansion_count > 0:
1148
+ medieval_features.append(f"{expansion_count} abbreviation expansions")
1149
+
1150
+ correction_count = char_analysis.get("correction_markers", 0)
1151
+ if correction_count > 0:
1152
+ medieval_features.append(f"{correction_count} scribal corrections")
1153
+
1154
+ entity_count = char_analysis.get("entity_count", 0)
1155
+ if entity_count > 0:
1156
+ medieval_features.append(f"{entity_count} named entities")
1157
+
1158
+ if medieval_features:
1159
+ intro_lines.append(f"Medieval features detected: {', '.join(medieval_features)}.")
1160
+
1161
+ # Key terms for frequent list
1162
+ expansions = char_analysis.get("abbreviation_expansions", [])
1163
+ entities = char_analysis.get("capitalized_entities", [])
1164
+ frequent_terms = expansions + entities
1165
+
1166
+ if not frequent_terms:
1167
+ frequent_terms = list(set(w for w in latin_text.split() if len(w) > 2))[:10]
1168
+
1169
+ # Enhanced analysis points
1170
+ points = []
1171
+
1172
+ if self.tridis_available:
1173
+ points.extend([
1174
+ "• TRIDIS HTR provides semi-diplomatic transcription following scholarly editorial standards.",
1175
+ "• Automatic abbreviation expansion: dom̃→domini, facimꝰ→facimus, ꝓ→pro, ⁊→et.",
1176
+ "• Named entity capitalization and punctuation normalization applied."
1177
+ ])
1178
+ else:
1179
+ points.append("��� Tesseract OCR provides basic Latin character recognition with limited medieval symbol support.")
1180
+
1181
+ if correction_count > 0:
1182
+ points.append(f"• [{correction_count}] scribal corrections/cancellations indicate active manuscript editing process.")
1183
+
1184
+ if expansion_count > 0:
1185
+ expansions_list = ", ".join(char_analysis.get("abbreviation_expansions", [])[:5])
1186
+ points.append(f"• Expanded abbreviations suggest legal/administrative document: {expansions_list}.")
1187
+
1188
+ if validation.get("medieval_features_detected", False):
1189
+ manuscript_period = validation.get("manuscript_period", "Medieval")
1190
+ points.append(f"• {manuscript_period} characteristics indicate documentary manuscript tradition.")
1191
+
1192
+ if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
1193
+ points.append(f"• Historical analysis: {groq_detail.strip()}")
1194
+
1195
+ return {
1196
+ "title": "Medieval Latin manuscript analysis:",
1197
+ "intro_lines": intro_lines,
1198
+ "frequent_label": "Key medieval terms identified",
1199
+ "frequent": frequent_terms[:12],
1200
+ "points": points
1201
+ }
1202
+
1203
def generate_story(self, processed_result):
    """Request a short historical narrative about the manuscript from Groq.

    Returns a fixed notice when the Groq client is unavailable, a fixed
    failure message when the reply is empty or flagged by ``is_gibberish``,
    and the generated story otherwise. The narrative framing is drawn at
    random from a style pool selected by the features recorded in
    ``processed_result`` (``char_analysis`` and ``validation``).
    """
    latin_text = processed_result.get("text", "")

    if not self.groq_client.is_available():
        return "Groq client unavailable, cannot generate historical narrative."

    # Feature flags recorded by the earlier processing stages.
    char_analysis = processed_result.get("char_analysis", {})
    validation = processed_result.get("validation", {})

    has_expansions = char_analysis.get("expansion_count", 0) > 0
    has_corrections = char_analysis.get("correction_markers", 0) > 0
    has_entities = char_analysis.get("entity_count", 0) > 0
    text_type = validation.get("text_type", "unknown")
    used_tridis = validation.get("tridis_used", False)

    documentary_styles = [
        "as a legal charter discovered in monastic archives",
        "as an administrative record from a medieval royal court",
        "as a property deed found in cathedral scriptorium",
        "as a guild register from a medieval trading city",
        "as a tax record from a 14th-century monastery"
    ]
    working_copy_styles = [
        "as a monk's working manuscript with personal annotations",
        "as a scholar's commentary on ancient texts",
        "as a chronicle being revised by a medieval historian",
        "as a theological treatise with scribal corrections",
        "as a copy of classical texts with medieval glosses"
    ]
    general_styles = [
        "as a sacred text illuminated by medieval scribes",
        "as a philosophical work from a cathedral school",
        "as a liturgical manuscript from a monastic library",
        "as a medical treatise translated in medieval Spain",
        "as an astronomical text from a medieval university"
    ]

    # Pick the pool that matches the detected features.
    if "documentary" in text_type or has_expansions:
        styles = documentary_styles
    elif has_corrections or has_entities:
        styles = working_copy_styles
    else:
        styles = general_styles

    import random
    # Style is drawn before the seed, in that order.
    chosen_style = random.choice(styles)
    seed = random.randint(1000, 9999)

    if used_tridis:
        processing_context = "deciphered using advanced medieval manuscript AI"
    else:
        processing_context = "carefully transcribed from the original"

    if has_expansions or has_corrections:
        time_period = "13th-16th centuries"
    else:
        time_period = "medieval period"

    if has_expansions:
        feature_note = "with expanded abbreviations and scribal corrections typical of documentary manuscripts"
    else:
        feature_note = "showing characteristics of medieval scholarly tradition"

    # Four paragraphs separated by blank lines.
    paragraphs = [
        f"This Latin manuscript text was {processing_context}: {latin_text}",
        f"Historical context: The text appears to be from the {time_period}, {feature_note}.",
        "Create a vivid, historically accurate narrative (250+ words) set in medieval Europe, "
        "telling the story of this manuscript's creation and significance. "
        f"Write {chosen_style}.",
        "Include: Medieval setting, authentic historical details, multiple characters, "
        "the process of manuscript creation, and the document's importance to its community.\n"
        f"Narrative seed: {seed}",
    ]
    prompt = "\n\n".join(paragraphs)

    system_prompt = (
        "You are a medieval historian and storyteller specializing in manuscript culture, "
        "paleography, and daily life in 13th-16th century Europe. Create authentic, "
        "engaging narratives that reflect accurate historical knowledge of medieval "
        "scriptoriums, legal practices, and scholarly traditions."
    )

    story = self.groq_client.generate_response(
        system_prompt=system_prompt,
        user_prompt=prompt
    )

    if story and not is_gibberish(story):
        return story
    return "Failed to generate historical narrative; medieval story creation unavailable."
references.json ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "egypt_symbol_notes": {
3
+ "bow": "Warfare and hunting tool; often a phonogram (q/p/k depending on context) and a determinative for martial power, subjugation, or protection.",
4
+ "hoe": "Agricultural implement tied to cultivation and ritual ‘preparation’; used as a determinative and phonetic sign.",
5
+ "god_figure": "Anthropomorphic marker indicating divine agency; commonly a determinative for deities.",
6
+ "bread": "Offering loaf symbolizing food and sustenance; phonetic value 't'.",
7
+ "feather": "Represents the concepts of truth and balance; phonetic value 'm'.",
8
+ "eye": "Wedjat eye symbolizing protection and healing; phonetic for 'ir' or 'jr'.",
9
+ "owl": "Common phonogram 'm'; also signifies night and hidden knowledge.",
10
+ "reed": "Symbol for writing and speech; phonetic and determinative use.",
11
+ "scribe_tools": "Represents writing, record-keeping, and administration.",
12
+ "leg": "Indicates motion, going forth, phonetic sign.",
13
+ "lizard": "Represents reptiles; associated with protective symbolism.",
14
+ "woman_seated": "Determinative for female persons or roles.",
15
+ "jar": "Associated with offerings, fluids, and ritual context.",
16
+ "crown": "Symbol of royal authority and divine power.",
17
+ "man_seated": "Determinative for male persons or generic agents."
18
+ },
19
+
20
+ "greek_symbol_notes": {
21
+ "Κ": "Kappa: Tenth letter, consonant /k/ sound, equivalent to Latin C/K.",
22
+ "γ": "Lowercase gamma, consonant /g/ sound, or nasal /ng/ before γ, κ, χ, ξ sounds.",
23
+ "ι": "Lowercase iota, vowel /i/ sound, can form diphthongs with other vowels.",
24
+ "Ν": "Nu: Thirteenth letter, consonant /n/ sound, nasal consonant with grammatical functions.",
25
+ "Τ": "Tau: Nineteenth letter, consonant /t/ sound, common in grammatical endings.",
26
+ "ο": "Lowercase omicron, short vowel /o/ sound, frequent in grammatical morphemes.",
27
+ "λ": "Lowercase lambda, consonant /l/ sound, liquid consonant with metrical significance.",
28
+ "θ": "Lowercase theta, aspirated /tʰ/ sound, distinguishes words from similar tau forms.",
29
+ "υ": "Lowercase upsilon, vowel /y/ sound, forms diphthongs and appears in many endings.",
30
+ "Θ": "Theta: Eighth letter, aspirated /tʰ/ sound in ancient Greek, /θ/ (voiceless th) in modern.",
31
+ "Η": "Eta: Seventh letter, long vowel /ē/ sound in ancient Greek, /i/ in modern Greek.",
32
+ "ς": "Lowercase sigma (final form), consonant /s/ sound, used only at word endings.",
33
+ "ε": "Lowercase epsilon, short vowel /e/ sound, appears frequently in verb conjugations.",
34
+ "-": "Hyphen; marks word division or compound elements in Greek texts.",
35
+ "Ὶ": "Capital Iota with grave accent, indicates lowered pitch or unstressed position.",
36
+ "ῖ": "Lowercase iota with circumflex accent, indicates falling tone on long vowel /ī/.",
37
+ "ί": "Lowercase iota with acute accent, vowel /i/ with raised pitch indicating word stress.",
38
+ "Ἰ": "Capital Iota with smooth breathing, vowel /i/ without initial aspiration.",
39
+ "Ἑ": "Capital Epsilon with rough breathing, indicates /h/ sound before vowel.",
40
+ "'": "Apostrophe; indicates elision (omitted vowel) or contraction in Greek.",
41
+ "Π": "Pi: Sixteenth letter, consonant /p/ sound, appears in mathematical and scientific contexts.",
42
+ "Ο": "Omicron: Fifteenth letter, short vowel /o/ sound, distinct from omega (long o).",
43
+ "Μ": "Mu: Twelfth letter, consonant /m/ sound, nasal consonant often used in word formation.",
44
+ "[": "Opening square bracket; typically editorial reconstructions or uncertain readings.",
45
+ "Α": "Alpha: First letter of Greek alphabet, vowel /a/ sound, often marks beginnings or primacy.",
46
+ "μ": "Lowercase mu, consonant /m/ sound, nasal consonant often in prefixes and roots.",
47
+ "ὲ": "Lowercase epsilon with grave accent, short /e/ sound with lowered pitch.",
48
+ "Ᾱ": "Capital Alpha with macron (long mark), indicates long /ā/ vowel quantity.",
49
+ "Γ": "Gamma: Third letter, consonant /g/ sound, or /ng/ before γ, κ, χ, ξ sounds.",
50
+ "Υ": "Upsilon: Twentieth letter, vowel /y/ sound in ancient Greek, /i/ in modern pronunciation.",
51
+ "(": "Opening parenthesis; editorial or explanatory insertions.",
52
+ ")": "Closing parenthesis; completes editorial or explanatory insertions.",
53
+ "ω": "Lowercase omega, long vowel /ō/ sound, often in verb endings and declensions.",
54
+ "ῑ": "Lowercase iota with macron, explicitly marks long vowel quantity /ī/.",
55
+ "·": "Middle dot (Greek semicolon); equivalent to modern semicolon, marks major pause.",
56
+ "ῐ": "Lowercase iota with breve, explicitly marks short vowel quantity /ĭ/.",
57
+ "Ξ": "Xi: Fourteenth letter, consonant cluster /ks/ sound, compound sound written as single letter.",
58
+ "ν": "Lowercase nu, consonant /n/ sound, assimilates before consonants in pronunciation.",
59
+ "Ε": "Epsilon: Fifth letter, short vowel /e/ sound, distinct from eta (long e).",
60
+ "η": "Lowercase eta, long vowel /ē/ sound in ancient Greek, /i/ in modern pronunciation.",
61
+ "]": "Closing square bracket; completes editorial reconstructions.",
62
+ "Ι": "Iota: Ninth letter, vowel /i/ sound, can form diphthongs with other vowels.",
63
+ "κ": "Lowercase kappa, consonant /k/ sound, common in word formation and inflection.",
64
+ "1": "Numeral one; manuscript numbering, line numbers, or verse citations.",
65
+ "ῃ": "Lowercase eta with iota subscript, indicates original diphthong /ēi/ sound.",
66
+ "ψ": "Lowercase psi, consonant cluster /ps/ sound, compound phoneme as single letter.",
67
+ "ἢ": "Lowercase eta with rough breathing and grave accent, aspirated long vowel with lowered tone.",
68
+ "Ὗ": "Capital Upsilon with rough breathing and circumflex, indicates aspiration and falling tone.",
69
+ "Ἱ": "Capital Iota with rough breathing, vowel /i/ with initial aspiration /h/.",
70
+ "Ᾰ": "Capital Alpha with breve (short mark), indicates short /ă/ vowel quantity.",
71
+ "Ί": "Capital Iota with acute accent, indicates raised pitch or primary word stress.",
72
+ "Λ": "Lambda: Eleventh letter, consonant /l/ sound, liquid consonant in Greek phonology.",
73
+ "\"": "Quotation mark; marks direct speech or citations in Greek texts.",
74
+ "σ": "Lowercase sigma (medial form), consonant /s/ sound, used within words.",
75
+ "Ἡ": "Capital Eta with rough breathing, long /ē/ sound with initial aspiration /h/.",
76
+ "Χ": "Chi: Twenty-second letter, aspirated /kʰ/ sound in ancient Greek, /x/ (voiceless velar fricative) in modern.",
77
+ "ζ": "Lowercase zeta, consonant cluster /zd/ sound, represents compound phoneme.",
78
+ "Ἷ": "Capital Iota with rough breathing and circumflex accent, complex tonal marking.",
79
+ "ὶ": "Lowercase iota with grave accent, vowel /i/ with lowered pitch or unstressed.",
80
+ "ἰ": "Lowercase iota with smooth breathing, vowel /i/ without initial aspiration.",
81
+ "α": "Lowercase alpha, vowel /a/ sound, fundamental vowel in Greek phonology.",
82
+ ",": "Comma; punctuation for pauses, lists, or grammatical separation.",
83
+ "ᾗ": "Lowercase eta with rough breathing, circumflex accent, and iota subscript, complex phonetic marking.",
84
+ "τ": "Lowercase tau, consonant /t/ sound, appears in many grammatical suffixes.",
85
+ "<": "Less-than symbol; editorial mark for textual corrections or variants.",
86
+ "Σ": "Sigma: Eighteenth letter, consonant /s/ sound, has special final form (ς) at word end.",
87
+ "ρ": "Lowercase rho, consonant /r/ sound, when word-initial requires rough breathing mark.",
88
+ "ἡ": "Lowercase eta with rough breathing, long /ē/ sound with initial /h/.",
89
+ "Ω": "Omega: Twenty-fourth letter, long vowel /ō/ sound in ancient Greek, /o/ in modern.",
90
+ ".": "Period (full stop); marks end of sentences in Greek texts.",
91
+ "Ῥ": "Capital Rho with rough breathing, indicates /hr/ sound at word beginning (all word-initial rhos are aspirated).",
92
+ "ἕ": "Lowercase epsilon with rough breathing and acute accent, aspirated short vowel with raised tone.",
93
+ "ῆ": "Lowercase eta with circumflex accent, falling tone on long vowel /ē/.",
94
+ "Ἶ": "Capital Iota with smooth breathing and circumflex accent, vowel /ī/ with falling tone, no initial aspiration.",
95
+ "β": "Lowercase beta, consonant /b/ sound in ancient Greek, /v/ sound in modern Greek pronunciation.",
96
+ "Ὁ": "Capital Omicron with rough breathing, short /o/ sound with initial aspiration /h/.",
97
+ "Ϊ": "Capital Iota with diaeresis (trema), indicates /i/ vowel pronounced separately, not as diphthong.",
98
+ "Φ": "Phi: Twenty-first letter, aspirated /pʰ/ sound in ancient Greek, /f/ in modern pronunciation.",
99
+ "ῗ": "Lowercase iota with diaeresis and circumflex, /ī/ vowel with falling tone, pronounced separately.",
100
+ "έ": "Lowercase epsilon with acute accent (modern Greek), short /e/ sound with stress marking.",
101
+ "ἷ": "Lowercase iota with rough breathing and circumflex accent, aspirated /ī/ vowel with falling tone."
102
+
103
+ }
104
+ ,
105
+ "latin_symbol_notes": {
106
+ "꜠": "Modifier letter for stress and high tone, used in phonetic transcription and transliteration.",
107
+ "꜡": "Modifier letter for stress and low tone, common in linguistic notation.",
108
+ "Ꜣ": "Capital Letter Egyptological Alef, used in transliterating Egyptian hieroglyphs.",
109
+ "ꜣ": "Small Letter Egyptological Alef, counterpart to capital version.",
110
+ "Ꜥ": "Capital Letter Egyptological Ain, reflecting voiced pharyngeal sounds in transliteration.",
111
+ "ꜥ": "Small Letter Egyptological Ain, used in Semitic transliterations.",
112
+ "Ꝁ": "Capital Letter K with Stroke, scribal abbreviation mark for legal or medieval texts.",
113
+ "ꝁ": "Small Letter K with Stroke, similar abbreviation symbol.",
114
+ "ꝑ": "Small Letter P with Stroke Through Descender, abbreviation of 'per' in medieval Latin manuscripts.",
115
+ "ꝛ": "Small Letter R Rotunda, a stylistic medieval form of 'r' to save space.",
116
+ "Ꞁ": "Capital Letter Turned L, used in paleography to denote variant forms.",
117
+ "ꞁ": "Small Letter Turned L, lowercase variant in medieval scripts.",
118
+ "Ꞃ": "Capital Letter Insular R, found in Insular script manuscripts in medieval Britain and Ireland.",
119
+ "Ꝼ": "Capital Letter Insular F, distinct letter in Celtic Insular manuscripts.",
120
+ "ꟽ": "Epigraphic Letter Inverted M, used as a logogram for 'mulier' or 'matrona' in Roman inscriptions.",
121
+ "ꟿ": "Epigraphic Letter Archaic M, represents the praenomen 'Manius' in inscriptions."
122
+ },"cuneiform_symbol_notes": {
123
+ "𒀀": "Cuneiform sign A: vowel sound /a/ in Sumerian and Akkadian, fundamental vowel marker",
124
+ "𒀭": "Cuneiform sign AN/DINGIR: divine determinative, heaven, god concept in religious texts",
125
+ "𒈗": "Cuneiform sign LUGAL: king, ruler, sovereign used in royal inscriptions and titles",
126
+ "𒊕": "Cuneiform sign UD: day, sun, light, time marker in calendrical and chronological contexts",
127
+ "𒄿": "Cuneiform sign I: vowel /i/, often used in verbal forms and grammatical particles",
128
+ "𒂍": "Cuneiform sign E: house, temple, building in architectural and religious contexts",
129
+ "𒀸": "Cuneiform sign ARAD: servant, slave, worker in administrative and legal documents",
130
+ "𒁹": "Cuneiform sign DIRIG: to exceed, surplus, extra in mathematical and accounting texts",
131
+ "𒉋": "Cuneiform sign TI: life, to live, arrow in medical, military, and philosophical contexts",
132
+ "𒆠": "Cuneiform sign KI: earth, place, land in geographical and territorial designations",
133
+ "𒌓": "Cuneiform sign ZU: to know, knowledge, wisdom in educational and scribal contexts",
134
+ "𒈨": "Cuneiform sign ME: divine powers, cultural practices in mythological and religious texts",
135
+ "𒉿": "Cuneiform sign TUKU: to have, possess, hold in commercial and legal transactions",
136
+ "𒄩": "Cuneiform sign HA: fish, to catch in texts about fishing, food, and economy",
137
+ "𒁇": "Cuneiform sign DU: to go, walk, build in construction, travel, and action contexts",
138
+ "lugal": "ATF: lugal - Sumerian/Akkadian for 'king', royal title in administrative texts",
139
+ "an": "ATF: an - Sky god An/Anu, heaven concept in religious and mythological contexts",
140
+ "ki": "ATF: ki - Earth, place, land in geographical and cosmological descriptions",
141
+ "dingir": "ATF: dingir - God, divine being, deity in religious and ceremonial texts",
142
+ "sar": "ATF: sar - To write, inscription, totality in scribal and administrative contexts",
143
+ "{d}": "ATF: determinative for divine names, indicates following word refers to a deity"
144
+ },
145
+ "cuneiform_hint": "Cuneiform signs represent syllables, words, or concepts in ancient Mesopotamian languages (Sumerian, Akkadian, Hittite, etc.). ATF format uses Latin transliteration of cuneiform symbols.",
146
+ "greek_hint": "If no specific character note is found, treat as lexical marker considering diacriticals (breathing marks, accents, vowel quantity) which affect pronunciation, meaning, and grammatical function in ancient Greek texts.",
147
+ "latin_hint": "Letters and symbols reflect phonetic values and scribal practices in manuscripts."
148
+ }
149
+
requirements.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DecipherAI Backend — Production Dependencies (Hugging Face Spaces)
2
+ # CPU-only torch for HF Spaces free tier
3
+
4
+ # Web framework
5
+ Flask==3.1.3
6
+ flask-cors==6.0.2
7
+ python-dotenv==1.2.2
8
+
9
+ # AI / ML
10
+ groq==1.2.0
11
+ transformers==5.9.0
12
+ safetensors==0.7.0
13
+
14
+ # PyTorch CPU-only (HF Spaces free tier does not have GPU)
15
+ --extra-index-url https://download.pytorch.org/whl/cpu
16
+ torch==2.12.0+cpu
17
+ torchvision==0.27.0+cpu
18
+
19
+ # Image processing
20
+ opencv-python-headless==4.13.0.92
21
+ pillow==12.2.0
22
+ pytesseract==0.3.13
23
+
24
+ # Utilities
25
+ numpy==2.4.4
26
+ regex==2026.5.9
27
+ tqdm==4.67.3
28
+
29
+ # Production WSGI server
30
+ gunicorn==23.0.0
services/__init__.py ADDED
File without changes
services/context_generator.py ADDED
File without changes
services/groq_vision_classifier.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import os
4
+ from io import BytesIO
5
+ from PIL import Image
6
+ from groq import Groq
7
+
8
+
9
+ class GroqVisionScriptClassifier:
10
def __init__(self, groq_api_key):
    """Create the Groq client and record the model used for vision requests.

    Args:
        groq_api_key: API key handed to the ``Groq`` client constructor.
    """
    self.groq_client = Groq(api_key=groq_api_key)
    # Model identifier sent with every image-classification request.
    self.vision_model = "meta-llama/llama-4-scout-17b-16e-instruct"
    print("[INFO] Groq Vision Classifier initialized with " + self.vision_model)
15
+
16
def classify_script(self, image_path):
    """Classify the script shown in the image at *image_path*.

    The image is base64-encoded, sent to the Groq vision model, and the
    reply is parsed into a lowercase label. "unknown" is returned when the
    image cannot be encoded or when any step raises.
    """
    label = "unknown"
    try:
        encoded_image = self._image_to_base64(image_path)
        if encoded_image:
            raw_reply = self._query_groq_vision(encoded_image)
            parsed = self._parse_classification_response(raw_reply)

            print(f"[INFO] Llama Vision classified script as: {parsed}")
            label = parsed.lower()
    except Exception as e:
        # Best-effort classification: report the failure and fall back.
        print(f"[ERROR] Groq Vision script classification failed: {e}")
        label = "unknown"
    return label
36
+
37
+ def _image_to_base64(self, image_path):
38
+ """Convert image to base64 for Groq Vision API (4MB limit)"""
39
+ try:
40
+ image = Image.open(image_path)
41
+
42
+ # Resize if too large (keep under 4MB base64 limit)
43
+ if max(image.size) > 1200:
44
+ image.thumbnail((1200, 1200), Image.Resampling.LANCZOS)
45
+
46
+ # Convert to base64 JPEG (smaller than PNG)
47
+ buffer = BytesIO()
48
+ image.save(buffer, format="JPEG", quality=90)
49
+ image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
50
+
51
+ # Check size (base64 should be < 4MB)
52
+ if len(image_b64) > 4 * 1024 * 1024: # 4MB limit
53
+ # Reduce quality and try again
54
+ buffer = BytesIO()
55
+ image.save(buffer, format="JPEG", quality=70)
56
+ image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
57
+
58
+ return image_b64
59
+
60
+ except Exception as e:
61
+ print(f"[ERROR] Image to base64 conversion failed: {e}")
62
+ return None
63
+
64
+ def _query_groq_vision(self, base64_image):
65
+ """Enhanced query for Groq Llama Vision API including cuneiform"""
66
+ try:
67
+ # FIXED: Simplified prompt to avoid token limit issues
68
+ prompt = """Analyze this image of ancient text/script as an expert paleographer.
69
+
70
+ Classify it as ONE of these ancient script types:
71
+
72
+ - EGYPTIAN: Hieroglyphic symbols (birds, eyes, human figures, cartouches)
73
+ - GREEK: Ancient/medieval Greek alphabet (α,β,γ,δ,ε,ζ,η,θ) with diacritics
74
+ - LATIN: Latin alphabet letters, Roman inscriptions, medieval manuscripts
75
+ - CUNEIFORM: Wedge-shaped impressions on clay tablets (triangular marks)
76
+
77
+ IMPORTANT: Cuneiform has geometric wedge patterns, NOT pictures like hieroglyphs.
78
+
79
+ Respond ONLY with JSON:
80
+ {"classification": "EGYPTIAN" or "GREEK" or "LATIN" or "CUNEIFORM", "confidence": 0.0-1.0}"""
81
+
82
+ completion = self.groq_client.chat.completions.create(
83
+ model=self.vision_model,
84
+ messages=[
85
+ {
86
+ "role": "user",
87
+ "content": [
88
+ {"type": "text", "text": prompt},
89
+ {
90
+ "type": "image_url",
91
+ "image_url": {
92
+ "url": f"data:image/jpeg;base64,{base64_image}"
93
+ }
94
+ }
95
+ ]
96
+ }
97
+ ],
98
+ temperature=0.1, # Low temperature for consistent classification
99
+ max_completion_tokens=100, # FIXED: Reduced to avoid token errors
100
+ top_p=0.9,
101
+ stream=False,
102
+ response_format={"type": "json_object"}
103
+ )
104
+
105
+ return completion.choices[0].message.content
106
+
107
+ except Exception as e:
108
+ print(f"[ERROR] Groq Vision API call failed: {e}")
109
+ return None
110
+
111
+ def _parse_classification_response(self, response):
112
+ """Enhanced parsing for JSON response including cuneiform"""
113
+ if not response:
114
+ return "unknown"
115
+
116
+ try:
117
+ # Parse JSON response
118
+ data = json.loads(response)
119
+ classification = data.get('classification', '').upper()
120
+ confidence = data.get('confidence', 0.0)
121
+
122
+ print(f"[INFO] Vision model confidence: {confidence:.3f}")
123
+
124
+ # Enhanced classification mapping including cuneiform
125
+ if classification == "EGYPTIAN":
126
+ return "egyptian"
127
+ elif classification == "GREEK":
128
+ return "greek"
129
+ elif classification == "LATIN":
130
+ return "latin"
131
+ elif classification == "CUNEIFORM":
132
+ return "cuneiform"
133
+ else:
134
+ print(f"[WARN] Unknown classification: {classification}")
135
+ return "unknown"
136
+
137
+ except json.JSONDecodeError:
138
+ print(f"[WARN] Failed to parse JSON response, trying text parsing: {response}")
139
+ # Enhanced fallback to text parsing
140
+ response_upper = response.strip().upper()
141
+
142
+ # Priority order: cuneiform keywords first (most specific)
143
+ cuneiform_keywords = ["CUNEIFORM", "WEDGE", "CLAY", "MESOPOTAMIAN", "AKKADIAN", "SUMERIAN", "BABYLONIAN"]
144
+ if any(keyword in response_upper for keyword in cuneiform_keywords):
145
+ return "cuneiform"
146
+ elif "EGYPTIAN" in response_upper or "HIEROGLYPH" in response_upper:
147
+ return "egyptian"
148
+ elif "GREEK" in response_upper:
149
+ return "greek"
150
+ elif "LATIN" in response_upper or "ROMAN" in response_upper:
151
+ return "latin"
152
+
153
+ except Exception as e:
154
+ print(f"[ERROR] Response parsing failed: {e}")
155
+
156
+ return "unknown"
157
+
158
def classify_with_fallback(self, image_path, max_retries=2):
    """Call ``classify_script`` until it yields a known label or retries run out.

    Args:
        image_path: Passed through to ``self.classify_script``.
        max_retries: Number of extra attempts after the first one.

    Returns:
        The first label that is not ``"unknown"``; otherwise ``"unknown"``.
        Exceptions raised by an attempt are logged and count as a failed try.
    """
    for attempt in range(max_retries + 1):
        out_of_retries = attempt >= max_retries
        try:
            label = self.classify_script(image_path)

            if label != "unknown":
                return label

            if out_of_retries:
                print(f"[WARN] All classification attempts returned unknown")
                return "unknown"

            print(f"[INFO] Classification attempt {attempt + 1} returned unknown, retrying...")

        except Exception as e:
            if out_of_retries:
                print(f"[ERROR] All classification attempts failed: {e}")
                return "unknown"

            print(f"[WARN] Classification attempt {attempt + 1} failed: {e}, retrying...")

    # Only reached when the loop body never ran (negative max_retries).
    return "unknown"
182
+
183
def get_supported_scripts(self):
    """Return a new list of the script labels this classifier can emit."""
    supported = ("egyptian", "greek", "latin", "cuneiform")
    return list(supported)
186
+
187
def validate_classification(self, script_type, confidence_threshold=0.7):
    """Report whether ``script_type`` is one of the supported script labels.

    ``confidence_threshold`` is accepted for interface compatibility but is
    not consulted: any supported label is treated as valid.
    """
    if script_type in self.get_supported_scripts():
        # All classifications from Llama Vision are considered valid
        return True

    print(f"[WARN] Unsupported script type: {script_type}")
    return False
197
+
198
def get_model_info(self):
    """Describe the vision model: name, provider, supported scripts, features."""
    info = {}
    info["model"] = self.vision_model
    info["provider"] = "Groq"
    info["supported_scripts"] = self.get_supported_scripts()
    info["features"] = [
        "Ancient script classification",
        "Multi-script support",
        "Cuneiform wedge detection",
        "Clay tablet recognition",
        "High-resolution image processing",
    ]
    return info
212
+
213
def debug_classification(self, image_path, save_debug_info=False):
    """Run one classification while printing every intermediate value.

    Args:
        image_path: Path of the image to classify.
        save_debug_info: When true, also write a JSON file named
            ``debug_classification_<label>_<n>.json`` in the current working
            directory. ``<n>`` comes from ``hash(image_path)``, which Python
            salts per process for strings unless ``PYTHONHASHSEED`` is set,
            so the suffix is not stable across runs.

    Returns:
        The parsed script label, or ``"unknown"`` if anything raises.
    """
    try:
        print(f"[DEBUG] Starting classification for: {image_path}")

        # Read the properties inside a context manager so the file handle is
        # released immediately instead of lingering until garbage collection.
        with Image.open(image_path) as image:
            image_size = image.size
            image_mode = image.mode
        print(f"[DEBUG] Image size: {image_size}")
        print(f"[DEBUG] Image mode: {image_mode}")

        # Get base64 size
        base64_image = self._image_to_base64(image_path)
        if base64_image:
            print(f"[DEBUG] Base64 size: {len(base64_image)} characters")
            response = self._query_groq_vision(base64_image)
        else:
            # Encoding failed: do not spend an API call on a "None" payload.
            print("[DEBUG] Base64 conversion failed; skipping vision API call")
            response = None
        print(f"[DEBUG] Raw API response: {response}")

        # Parse and return
        result = self._parse_classification_response(response)
        print(f"[DEBUG] Final classification: {result}")

        if save_debug_info:
            debug_info = {
                "image_path": image_path,
                "image_size": image_size,
                "base64_length": len(base64_image) if base64_image else 0,
                "raw_response": response,
                "classification": result
            }

            debug_file = f"debug_classification_{result}_{hash(image_path) % 10000}.json"
            with open(debug_file, 'w', encoding='utf-8') as f:
                json.dump(debug_info, f, indent=2)
            print(f"[DEBUG] Debug info saved to: {debug_file}")

        return result

    except Exception as e:
        print(f"[ERROR] Debug classification failed: {e}")
        return "unknown"
services/layout_parser.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ from PIL import Image
4
+ from typing import List, Dict, Tuple
5
+
6
class LayoutParser:
    """Detect text lines and column structure in a document image with OpenCV."""

    def __init__(self):
        pass

    def analyze_layout(self, image_path: str) -> Dict:
        """Analyze document image layout to detect columns, blocks, and lines of text.

        Args:
            image_path: Path of the image, read with ``cv2.imread``.

        Returns:
            A dict with ``width``, ``height``, ``column_count`` and ``columns``
            (the output of ``_group_lines_into_columns``). On any failure the
            placeholder ``{"width": 0, "height": 0, "column_count": 1,
            "columns": []}`` is returned instead of raising.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                raise FileNotFoundError(f"Image not found: {image_path}")

            h_img, w_img, _ = img.shape
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # Step 1: blur, then Otsu threshold (inverted so ink becomes white).
            blur = cv2.GaussianBlur(gray, (5, 5), 0)
            _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

            # Step 2: a wide, short kernel merges words along a text line.
            line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 3))
            dilated = cv2.dilate(thresh, line_kernel, iterations=2)

            # Step 3: each external contour is a candidate text line.
            contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            lines = []
            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)

                # Filter out small noise and full page boundaries
                if w < 15 or h < 5:
                    continue
                if w > w_img * 0.98 or h > h_img * 0.98:
                    continue

                lines.append({
                    "box": (x, y, w, h),
                    "area": w * h
                })

            # Sort by top coordinate; column grouping handles the horizontal axis.
            lines.sort(key=lambda l: l["box"][1])

            columns = self._group_lines_into_columns(lines, w_img)

            structured_layout = {
                "width": w_img,
                "height": h_img,
                "column_count": len(columns),
                "columns": columns
            }

            print(f"[INFO] Layout parsing complete. Detected {len(columns)} text columns.")
            return structured_layout

        except Exception as e:
            print(f"[ERROR] Layout parsing failed: {e}")
            return {"width": 0, "height": 0, "column_count": 1, "columns": []}

    def _group_lines_into_columns(self, lines: List[Dict], page_width: int) -> List[Dict]:
        """Group detected text lines into column blocks based on horizontal overlap.

        A histogram over x counts how many line boxes cover each pixel column.
        A run where at least two boxes overlap, and that is at least 10% of the
        page wide, becomes a column. If no run qualifies the whole page is one
        column.

        Args:
            lines: Dicts with a ``"box"`` entry of ``(x, y, w, h)``.
            page_width: Image width in pixels.

        Returns:
            A list of ``{"x_range": (start, end), "lines": [(x, y, w, h), ...]}``
            with each column's lines sorted by ``y``. Empty if ``lines`` is empty.
        """
        if not lines:
            return []

        # Find horizontal overlaps using a histogram projection
        hist = np.zeros(page_width, dtype=np.int32)
        for line in lines:
            x, _, w, _ = line["box"]
            hist[x:x + w] += 1

        # Threshold histogram to find column boundaries
        min_col_width = int(page_width * 0.1)
        columns_x = []
        start_x = None  # start of the run currently being scanned, if any

        for x, val in enumerate(hist):
            if val > 1:
                if start_x is None:
                    start_x = x
            elif start_x is not None:
                if (x - start_x) >= min_col_width:
                    columns_x.append((start_x, x))
                start_x = None

        # A run reaching the right edge must pass the same width filter as any
        # other run; otherwise a thin band at the margin becomes a column.
        if start_x is not None and (page_width - start_x) >= min_col_width:
            columns_x.append((start_x, page_width))

        if not columns_x:
            columns_x = [(0, page_width)]

        cols_data = [{"x_range": rx, "lines": []} for rx in columns_x]

        # Assign each line to the column containing its centre, or else to the
        # column whose nearest edge is closest.
        for line in lines:
            x, y, w, h = line["box"]
            line_center_x = x + w / 2

            best_idx = 0
            min_dist = page_width
            for idx, col in enumerate(cols_data):
                cx_start, cx_end = col["x_range"]
                if cx_start <= line_center_x <= cx_end:
                    best_idx = idx
                    break
                dist = min(abs(line_center_x - cx_start), abs(line_center_x - cx_end))
                if dist < min_dist:
                    min_dist = dist
                    best_idx = idx

            cols_data[best_idx]["lines"].append((x, y, w, h))

        # Sort lines inside each column by vertical (y) coordinate
        for col in cols_data:
            col["lines"] = sorted(col["lines"], key=lambda box: box[1])

        return cols_data

    def crop_lines(self, image_path: str, layout: Dict) -> "List[Image.Image]":
        """Crop and return PIL images of detected text lines in reading order.

        Args:
            image_path: Path of the image the layout was computed from.
            layout: A layout dict as returned by ``analyze_layout``.

        Returns:
            RGB ``PIL.Image`` crops, column by column and top to bottom within
            each column. Empty list if the image cannot be read or on error.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                return []

            crops = []
            h_img, w_img, _ = img.shape

            for col in layout.get("columns", []):
                for (x, y, w, h) in col["lines"]:
                    # Add small padding for HTR/OCR context
                    pad_y = int(h * 0.1) + 2
                    pad_x = int(w * 0.05) + 2

                    y0 = max(0, y - pad_y)
                    y1 = min(h_img, y + h + pad_y)
                    x0 = max(0, x - pad_x)
                    x1 = min(w_img, x + w + pad_x)

                    crop = img[y0:y1, x0:x1]
                    if crop.size > 0:
                        crops.append(Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)))

            return crops
        except Exception as e:
            print(f"[ERROR] Failed to crop layout lines: {e}")
            return []

    def detect_writing_style(self, image_path: str, clip_classifier=None) -> str:
        """Detect if document image contains 'printed' capital letters or 'cursive' handwriting.

        Tries zero-shot classification with ``clip_classifier`` when it exposes
        a truthy ``model`` and ``processor``; if that is unavailable or fails,
        falls back to a connected-component aspect-ratio heuristic.

        Returns:
            ``"printed"`` or ``"cursive"``. ``"cursive"`` is the default when
            the image is unreadable, has no usable components, or on error.
        """
        try:
            # 1. Try using CLIP classifier if provided
            if clip_classifier and clip_classifier.model and clip_classifier.processor:
                try:
                    # Convert inside a context manager so the source file handle
                    # is closed; convert() returns an independent image.
                    with Image.open(image_path) as source:
                        image = source.convert("RGB")

                    styles = ["printed", "cursive"]
                    descriptions = [
                        "classical printed Latin text or carved Roman stone monumental inscription with clean block capital letters",
                        "medieval handwritten Latin manuscript text written in ink on parchment with cursive handwriting"
                    ]

                    inputs = clip_classifier.processor(
                        text=descriptions,
                        images=image,
                        return_tensors="pt",
                        padding=True
                    ).to(clip_classifier.device)

                    import torch
                    with torch.no_grad():
                        outputs = clip_classifier.model(**inputs)
                        logits_per_image = outputs.logits_per_image
                        probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]

                    best_idx = int(np.argmax(probs))
                    style_label = styles[best_idx]
                    confidence = float(probs[best_idx])
                    print(f"[INFO] CLIP Latin style classification: {style_label} ({confidence:.3f})")
                    return style_label
                except Exception as e:
                    print(f"[WARN] CLIP Latin style detection failed: {e}. Falling back to heuristics.")

            # 2. Fallback: Computer Vision heuristics
            print("[INFO] Running computer vision heuristics for Latin style detection...")
            img = cv2.imread(image_path)
            if img is None:
                return "cursive"  # Safe default

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            blur = cv2.GaussianBlur(gray, (5, 5), 0)
            _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

            # Find contours without heavy dilation (character level components)
            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            if not contours:
                return "cursive"

            aspect_ratios = []
            widths = []

            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)
                # Filter noise
                if w < 5 or h < 5:
                    continue
                aspect_ratios.append(w / h)
                widths.append(w)

            if not aspect_ratios:
                return "cursive"

            avg_aspect_ratio = np.mean(aspect_ratios)
            median_width = np.median(widths)  # reported in the debug line only

            # Printed characters are typically individual, tall/square shapes: width ~ height, aspect ratio close to 0.7 - 1.2
            # Cursive handwriting consists of connected letters, forming wider horizontal segments: aspect ratio > 1.5
            print(f"[DEBUG] Layout heuristics - connected components: {len(aspect_ratios)}, avg aspect ratio: {avg_aspect_ratio:.3f}, median width: {median_width:.1f}")

            if avg_aspect_ratio < 1.3:
                return "printed"
            return "cursive"

        except Exception as e:
            print(f"[WARN] Latin style detection failed completely: {e}. Defaulting to cursive.")
            return "cursive"
244
+
services/rag_service.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from typing import Dict, List, Tuple
5
+ from config import Config
6
+
7
class RAGService:
    """Keyword retrieval over a JSON corpus of sign and abbreviation notes.

    Each indexed record is a dict with ``category``, ``term``, ``definition``
    and ``citation`` keys; retrieval scores records against query terms and
    returns the best matches for prompt grounding.
    """

    # (JSON section key, category label, citation attached to every record).
    _SECTIONS = (
        (
            "egypt_symbol_notes",
            "Egyptian Hieroglyphic Sign",
            "Gardiner, A. H. (1957). Egyptian Grammar: Being an Introduction to the Study of Hieroglyphs. Oxford: Griffith Institute.",
        ),
        (
            "greek_symbol_notes",
            "Greek Paleography Mark",
            "Thompson, E. M. (1912). An Introduction to Greek and Latin Palaeography. Oxford: Clarendon Press.",
        ),
        (
            "latin_symbol_notes",
            "Latin Scribal Abbreviation",
            "Cappelli, A. (1928). Dizionario di Abbreviature Latine ed Italiane. Milan: Hoepli.",
        ),
        (
            "cuneiform_symbol_notes",
            "Mesopotamian Cuneiform Logogram",
            "Borger, R. (2004). Mesopotamisches Zeichenlexikon. Münster: Ugarit-Verlag.",
        ),
    )

    def __init__(self, references_path: str = None):
        self.config = Config()
        self.references_path = references_path or str(self.config.REFERENCES_PATH)
        self.corpus = []
        self.load_corpus()

    def load_corpus(self):
        """Load and index the historical reference document corpus.

        The index is built in a local list and swapped in only on success, so
        calling this again does not duplicate records and a failure leaves the
        previous index untouched. Errors are logged, never raised.
        """
        try:
            if not os.path.exists(self.references_path):
                print(f"[WARN] Reference path {self.references_path} not found for RAG index.")
                return

            with open(self.references_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            corpus = []
            for section_key, category, citation in self._SECTIONS:
                for term, note in data.get(section_key, {}).items():
                    corpus.append({
                        "category": category,
                        "term": term,
                        "definition": note,
                        "citation": citation
                    })

            self.corpus = corpus
            print(f"[INFO] RAG Service successfully indexed {len(self.corpus)} reference records.")
        except Exception as e:
            print(f"[ERROR] Failed to initialize RAG index: {e}")

    @staticmethod
    def _score_record(clean_term: str, record: Dict) -> int:
        """Score one record against one lower-cased, stripped query term.

        10 = exact term match; 5 = the query equals one ``_``/space/``-``
        separated part of the term, or (queries longer than 3 characters) is a
        substring of the term; 2 = (queries longer than 3 characters) appears
        in the definition; 0 = no match.
        """
        record_term = record["term"].lower()
        if clean_term == record_term:
            return 10

        # Whole-word part match (e.g. "woman" in "woman_seated")
        if clean_term in re.split(r'[_ \-]', record_term):
            return 5

        # Substring matching only for longer queries, to limit noise.
        if len(clean_term) > 3:
            if clean_term in record_term:
                return 5
            if clean_term in record["definition"].lower():
                return 2
        return 0

    def _rank_records(self, query_terms: List[str]) -> List[Tuple[int, Dict]]:
        """Return ``(score, record)`` pairs, best first, one per matching record.

        A record matched by several query terms keeps its highest score.
        Ties keep the order in which records were first matched.
        """
        best = {}  # "category:term" -> (score, record); insertion-ordered

        for term in query_terms:
            if not isinstance(term, str):
                continue
            clean_term = term.lower().strip()
            if not clean_term:
                continue

            for record in self.corpus:
                score = self._score_record(clean_term, record)
                if score <= 0:
                    continue

                record_key = f"{record['category']}:{record['term']}"
                previous = best.get(record_key)
                if previous is None or score > previous[0]:
                    best[record_key] = (score, record)

        # sorted() is stable, so equal scores stay in first-matched order.
        return sorted(best.values(), key=lambda match: match[0], reverse=True)

    def retrieve_grounding_context(self, query_terms: List[str], max_results: int = 5) -> str:
        """Search reference records and build a grounding context string with academic citations.

        Returns:
            A markdown block listing up to ``max_results`` records, or ``""``
            when there are no query terms, no corpus, or no matches.
        """
        top_matches = self.retrieve_grounding_list(query_terms, max_results=max_results)

        if not top_matches:
            return ""

        context_lines = ["### Scholarly Grounding and Sign References:"]
        for idx, match in enumerate(top_matches, 1):
            context_lines.append(
                f"{idx}. **[{match['category']}] '{match['term']}'**: {match['definition']}\n"
                f" *Source Citation:* {match['citation']}"
            )

        return "\n".join(context_lines)

    def retrieve_grounding_list(self, query_terms: List[str], max_results: int = 5) -> List[Dict]:
        """Search reference records and return the raw list of matching reference dicts with citations.

        Returns:
            Up to ``max_results`` record dicts, best score first; ``[]`` when
            there are no query terms or the corpus is empty.
        """
        if not query_terms or not self.corpus:
            return []

        ranked = self._rank_records(query_terms)
        return [record for _, record in ranked[:max_results]]

    def enrich_prompt(self, base_system_prompt: str, extracted_text: str, extracted_symbols: List[str] = None) -> str:
        """Enrich LLM prompts with RAG context and citation grounding instructions.

        Query terms are the given symbols plus the first 15 word tokens found
        in ``extracted_text``. If nothing in the corpus matches, the base
        prompt is returned unchanged.
        """
        query_terms = []
        if extracted_symbols:
            query_terms.extend(extracted_symbols)

        # Split clean words from text
        if extracted_text:
            words = re.findall(r'[a-zA-Z0-9\u0370-\u03FF\u1F00-\u1FFF\u4E00-\u9FFF]+', extracted_text)
            query_terms.extend(words[:15])  # Cap to prevent excessive token use

        grounding_context = self.retrieve_grounding_context(query_terms, max_results=6)

        if not grounding_context:
            return base_system_prompt

        enriched_prompt = (
            f"{base_system_prompt}\n\n"
            f"Here is some verified historical and paleographical grounding information that you MUST use "
            f"in your analysis. Cite the specific sources (e.g. Gardiner, Cappelli, Thompson, Borger) "
            f"whenever discussing these symbols:\n\n"
            f"{grounding_context}\n\n"
            f"In your final output, append a short section titled 'References and Citations' detailing the relevant sources used."
        )
        return enriched_prompt
services/script_detector.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from processors.egyptian_processor import EgyptianProcessor
2
+ from processors.greek_processor import GreekProcessor
3
+ from processors.latin_processor import LatinProcessor
4
+ from processors.cuneiform_processor import CuneiformProcessor
5
+ from .groq_vision_classifier import GroqVisionScriptClassifier
6
+
7
+
8
class ScriptDetectionService:
    """Classify an image's script and route it to the matching processor.

    Classification uses the Groq vision classifier when an API key can be
    found, with the zero-shot CLIP classifier as fallback.
    """

    def __init__(self, groq_client, references, clip_classifier, translator_pipe, cuneiform_processor=None):
        # Initialize processors including cuneiform
        self.egyptian_processor = EgyptianProcessor(groq_client, references, clip_classifier, translator_pipe)
        self.greek_processor = GreekProcessor(groq_client, references, clip_classifier)
        self.latin_processor = LatinProcessor(groq_client, references, clip_classifier)

        # Initialize cuneiform processor or use the shared instance
        if cuneiform_processor:
            self.cuneiform_processor = cuneiform_processor
            print("[INFO] Cuneiform processor shared from global app instance")
        else:
            try:
                print("[INFO] Initializing cuneiform processor in detection service...")
                self.cuneiform_processor = CuneiformProcessor(groq_client, references, clip_classifier)
                print("[INFO] Cuneiform processor initialized successfully")
            except Exception as e:
                print(f"[WARN] Failed to initialize cuneiform processor: {e}")
                self.cuneiform_processor = None

        api_key = self._resolve_api_key(groq_client)

        # Initialize Groq Vision script classifier if API key is present
        if api_key:
            try:
                self.vision_classifier = GroqVisionScriptClassifier(api_key)
                print("[INFO] Groq Vision Script Detection Service initialized")
            except Exception as e:
                print(f"[WARN] Failed to initialize Groq Vision script classifier: {e}")
                self.vision_classifier = None
        else:
            print("[WARN] GROQ_API_KEY not found! Groq Vision classifier disabled. Falling back to zero-shot CLIP classifier.")
            self.vision_classifier = None

        # Keep track of clip_classifier
        self.clip_classifier = clip_classifier

        # Enhanced processor mapping with cuneiform
        self.processors = {
            'egyptian': self.egyptian_processor,
            'greek': self.greek_processor,
            'latin': self.latin_processor,
            'cuneiform': self.cuneiform_processor
        }

        if self.cuneiform_processor:
            print("[INFO] Cuneiform support: ENABLED (praeclarum/cuneiform model)")
        else:
            print("[WARN] Cuneiform support: DISABLED (processor initialization failed)")

    @staticmethod
    def _resolve_api_key(groq_client):
        """Find a Groq API key, trying each source until one is non-empty.

        Order: ``groq_client.api_key``, ``groq_client.client.api_key``,
        ``Config().GROQ_API_KEY``, then the ``GROQ_API_KEY`` environment
        variable. An attribute that exists but is empty does not stop the
        search. Returns ``None`` (or an empty value) when nothing is found.
        """
        key = getattr(groq_client, 'api_key', None)
        if key:
            return key

        key = getattr(getattr(groq_client, 'client', None), 'api_key', None)
        if key:
            return key

        try:
            from config import Config
            key = Config().GROQ_API_KEY
        except Exception:
            # Config missing or broken: fall through to the environment.
            key = None
        if key:
            return key

        import os
        return os.getenv('GROQ_API_KEY')

    def _cuneiform_ready(self):
        """True when a cuneiform processor exists and reports its model as available."""
        processor = self.cuneiform_processor
        return bool(processor) and bool(getattr(processor, 'cuneiform_available', False))

    def detect_and_process(self, image_path):
        """Classify the script in ``image_path`` and run the matching processor.

        Returns:
            The processor's result dict with ``vision_classification``,
            ``classification_method`` and ``classification_confidence`` added,
            or ``None`` if the processor returned nothing or anything raised.
        """
        try:
            # Step 1: Get script classification from Groq Vision or CLIP
            script_type = "unknown"
            classification_method = "unknown"
            classification_confidence = 0.0

            if self.vision_classifier:
                try:
                    script_type = self.vision_classifier.classify_script(image_path)
                    classification_method = 'groq_vision'
                    # classify_script returns only a label; 0.95 is a fixed placeholder.
                    classification_confidence = 0.95
                except Exception as e:
                    print(f"[WARN] Groq Vision classification failed: {e}. Falling back to CLIP.")

            if script_type == "unknown" or not self.vision_classifier:
                from PIL import Image
                try:
                    # Context manager releases the file handle after classification.
                    with Image.open(image_path) as img:
                        script_type, classification_confidence = self.clip_classifier.classify_script_type(img)
                    classification_method = 'clip_zero_shot'
                    print(f"[INFO] CLIP fallback classification: {script_type} (conf={classification_confidence:.3f})")
                except Exception as ce:
                    print(f"[ERROR] CLIP fallback classification failed: {ce}")
                    script_type = "egyptian"  # default fallback
                    classification_method = "default_fallback"
                    classification_confidence = 0.5

            print(f"[INFO] Final classification routed: {script_type} via {classification_method}")

            # Step 2: Route to appropriate processor including cuneiform
            if script_type == "egyptian":
                print("[INFO] Routing to Egyptian processor...")
                result = self.egyptian_processor.process_image(image_path)

            elif script_type == "greek":
                print("[INFO] Routing to Greek processor...")
                result = self.greek_processor.process_image(image_path)

            elif script_type == "latin":
                print("[INFO] Routing to Latin processor...")
                result = self.latin_processor.process_image(image_path)

            elif script_type == "cuneiform":
                print("[INFO] Routing to Cuneiform processor...")
                if self._cuneiform_ready():
                    result = self.cuneiform_processor.process_image(image_path)
                else:
                    print("[ERROR] Cuneiform processor not available!")
                    # Create error result
                    result = {
                        'script_type': 'cuneiform',
                        'confidence': 0.0,
                        'processed_result': {
                            'text': 'Cuneiform processor unavailable',
                            'validation': {'quality_score': 0.0, 'error': 'Model not loaded'}
                        },
                        'historical_context': {},
                        'creative_story': 'Cuneiform processing failed - model not available'
                    }

            else:  # unknown
                print(f"[INFO] Unknown classification '{script_type}', defaulting to Egyptian...")
                result = self.egyptian_processor.process_image(image_path)

            # Step 3: Return result with classification metadata
            if result:
                result['vision_classification'] = script_type
                result['classification_method'] = classification_method
                result['classification_confidence'] = classification_confidence
                print(f"[INFO] {script_type.title()} processing completed successfully")
                return result
            else:
                print(f"[ERROR] {script_type.title()} processor returned None")
                return None

        except Exception as e:
            print(f"[ERROR] Classification and processing failed: {e}")
            import traceback
            traceback.print_exc()
            return None

    def get_processor_by_type(self, script_type):
        """Get processor by script type - now includes cuneiform.

        Returns ``None`` for an unknown type, or for cuneiform when its
        processor exists but does not report the model as available.
        """
        wanted = script_type.lower()
        processor = self.processors.get(wanted)

        if wanted == 'cuneiform' and processor and not getattr(processor, 'cuneiform_available', False):
            print(f"[WARN] Cuneiform processor exists but model not available")
            return None

        return processor

    def get_supported_scripts(self):
        """Get list of supported script types (cuneiform only when its model is available)."""
        scripts = ['egyptian', 'greek', 'latin']

        if self._cuneiform_ready():
            scripts.append('cuneiform')

        return scripts

    def get_processor_status(self):
        """Get status of all processors"""
        status = {
            'egyptian': self.egyptian_processor is not None,
            'greek': self.greek_processor is not None,
            'latin': self.latin_processor is not None,
            'cuneiform': self.cuneiform_processor is not None and getattr(self.cuneiform_processor, 'cuneiform_available', False)
        }

        return status

    def validate_script_detection(self, script_type, processed_result):
        """Validate script detection results - enhanced for cuneiform.

        Compares ``processed_result['validation']['quality_score']`` with a
        per-script threshold. Cuneiform additionally passes when
        ``cuneiform_ratio`` > 0.1 or ``atf_ratio`` > 0.3. Returns ``False``
        on any error.
        """
        try:
            validation = processed_result.get('validation', {})
            quality_score = validation.get('quality_score', 0.0)

            # Script-specific validation thresholds
            thresholds = {
                'egyptian': 0.3,
                'greek': 0.4,
                'latin': 0.4,
                'cuneiform': 0.2  # Lower threshold due to OCR challenges
            }

            threshold = thresholds.get(script_type, 0.3)

            # Additional cuneiform validation
            if script_type == 'cuneiform':
                cuneiform_ratio = validation.get('cuneiform_ratio', 0.0)
                atf_ratio = validation.get('atf_ratio', 0.0)

                # Accept if either Unicode cuneiform or ATF format detected
                if cuneiform_ratio > 0.1 or atf_ratio > 0.3:
                    print(f"[INFO] Cuneiform validation passed: cuneiform_ratio={cuneiform_ratio:.3f}, atf_ratio={atf_ratio:.3f}")
                    return True

            # Standard quality validation
            is_valid = quality_score >= threshold

            if is_valid:
                print(f"[INFO] {script_type.title()} validation passed: quality={quality_score:.3f} >= {threshold}")
            else:
                print(f"[WARN] {script_type.title()} validation failed: quality={quality_score:.3f} < {threshold}")

            return is_valid

        except Exception as e:
            print(f"[ERROR] Validation failed: {e}")
            return False
services/story_generator.py ADDED
File without changes
utils/__init__.py ADDED
File without changes
utils/gpu_diagnostics.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gc
3
+
4
# Registry of processor instances, keyed by the name given at registration.
_active_processors = {}


def register_processor(name, processor_instance):
    """Record ``processor_instance`` under ``name`` so ``reclaim_vram_for`` can offload it.

    Registering the same name again replaces the earlier instance.
    """
    _active_processors.update({name: processor_instance})
    print(f"[VRAM MANAGER] Registered processor: {name}")
10
+
11
def reclaim_vram_for(target_processor_name):
    """Move every other registered processor's models from GPU to CPU.

    Does nothing when CUDA is unavailable. For each registered processor other
    than ``target_processor_name``, each known model attribute that is set and
    whose first parameter lives on a CUDA device is moved to the CPU. A failure
    while handling one processor is logged and skips the rest of that
    processor's models. If anything was moved, garbage is collected and the
    CUDA cache is emptied.
    """
    if not torch.cuda.is_available():
        return

    # processor name -> (model attribute, label used in the log line), in order
    offload_plan = {
        "greek": (("trocr_model", "Greek TrOCR"),),
        "latin": (
            ("tridis_model", "Latin TRIDIS"),
            ("trocr_latin_model", "Latin TrOCR"),
        ),
        "cuneiform": (
            ("clip_model", "Cuneiform CLIP"),
            ("cuneiform_model", "Cuneiform Translator"),
        ),
        # Egyptian Processor (HuggingFaceModels)
        "egyptian": (("_model", "Egyptian T5"),),
    }

    print(f"[VRAM MANAGER] Reclaiming GPU VRAM for '{target_processor_name}'...")
    offloaded = False

    # Iterate over a snapshot of the registry.
    for name, proc in list(_active_processors.items()):
        if name == target_processor_name:
            continue

        try:
            for attr, label in offload_plan.get(name, ()):
                model = getattr(proc, attr, None)
                if model is None:
                    continue

                current_device = next(model.parameters()).device
                if str(current_device).startswith("cuda"):
                    print(f"[VRAM MANAGER] Offloading {label} to CPU...")
                    model.to("cpu")
                    offloaded = True
        except Exception as e:
            print(f"[WARN] Failed to offload '{name}' models: {e}")

    if offloaded:
        gc.collect()
        torch.cuda.empty_cache()
        print("[VRAM MANAGER] VRAM cache cleared successfully.")
76
+
77
+ def get_gpu_info():
78
+ """Get diagnostic information about the NVIDIA GPU if available."""
79
+ info = {
80
+ "cuda_available": torch.cuda.is_available(),
81
+ "gpu_name": "N/A",
82
+ "vram_total_gb": 0.0,
83
+ "vram_allocated_gb": 0.0,
84
+ "vram_cached_gb": 0.0,
85
+ "vram_free_gb": 0.0,
86
+ "cuda_version": torch.version.cuda if torch.cuda.is_available() else "N/A",
87
+ "device": "cpu"
88
+ }
89
+
90
+ if info["cuda_available"]:
91
+ info["device"] = "cuda"
92
+ try:
93
+ info["gpu_name"] = torch.cuda.get_device_name(0)
94
+ props = torch.cuda.get_device_properties(0)
95
+ info["vram_total_gb"] = round(props.total_memory / 1024**3, 2)
96
+
97
+ allocated = torch.cuda.memory_allocated(0)
98
+ cached = torch.cuda.memory_reserved(0)
99
+ info["vram_allocated_gb"] = round(allocated / 1024**3, 3)
100
+ info["vram_cached_gb"] = round(cached / 1024**3, 3)
101
+
102
+ try:
103
+ free_mem, total_mem = torch.cuda.mem_get_info(0)
104
+ info["vram_free_gb"] = round(free_mem / 1024**3, 3)
105
+ except Exception:
106
+ info["vram_free_gb"] = round((props.total_memory - allocated) / 1024**3, 3)
107
+ except Exception as e:
108
+ print(f"[WARN] Error gathering detailed GPU info: {e}")
109
+
110
+ return info
111
+
112
def log_gpu_info():
    """Print a framed summary of ``get_gpu_info()`` for startup logs."""
    info = get_gpu_info()
    rule = "=" * 60

    print(rule)
    print(" NVIDIA GPU & CUDA INITIALIZATION DIAGNOSTICS")
    print(rule)
    print(f"CUDA Available: {info['cuda_available']}")

    if not info["cuda_available"]:
        print("Active Device: CPU (GPU acceleration not available)")
    else:
        print(f"CUDA Version: {info['cuda_version']}")
        print(f"GPU Model: {info['gpu_name']}")
        print(f"Total VRAM: {info['vram_total_gb']} GB")
        print(f"Free VRAM: {info['vram_free_gb']} GB")
        print(f"Active Device: CUDA (Dynamic Offloading Enabled)")

    print(rule)
128
+
129
def log_model_device(model_name, device):
    """Print which device (upper-cased ``str(device)``) a model was assigned to."""
    device_label = str(device).upper()
    print(f"[DEVICE LOG] Model '{model_name}' -> Assigned to: {device_label}")
132
+
133
def clear_gpu_cache():
    """Run garbage collection and empty the CUDA cache; no-op without CUDA."""
    if not torch.cuda.is_available():
        return
    gc.collect()
    torch.cuda.empty_cache()
utils/image_utils.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ from PIL import Image
4
+
5
def segment_hieroglyphs(image_path):
    """Split an image into per-glyph crops using OpenCV contour detection.

    Args:
        image_path: Path of the image file to read with ``cv2.imread``.

    Returns:
        A list of RGB ``PIL.Image`` crops ordered top-to-bottom (in bands of
        50 pixels) and then left-to-right. If no region survives filtering,
        the list holds the whole image. On any error the problem is printed
        and an empty list is returned.
    """
    try:
        source = cv2.imread(image_path)
        if source is None:
            raise FileNotFoundError(f"Image not found or cannot be read: {image_path}")

        # Binarise (inverted, so strokes become foreground) and close small gaps.
        grayscale = cv2.cvtColor(source, cv2.COLOR_BGR2GRAY)
        binary = cv2.adaptiveThreshold(
            grayscale, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV, 25, 10,
        )
        closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
        binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, closing_kernel, iterations=1)

        outlines, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        img_h, img_w = binary.shape

        # Keep rectangles of at least 200 px^2 that do not span (almost) the
        # whole image in either dimension.
        rects = (cv2.boundingRect(outline) for outline in outlines)
        kept = [
            (x, y, w, h)
            for (x, y, w, h) in rects
            if w * h >= 200 and w <= 0.95 * img_w and h <= 0.95 * img_h
        ]

        if not kept:
            # Nothing usable was detected: fall back to the full image.
            return [Image.fromarray(cv2.cvtColor(source, cv2.COLOR_BGR2RGB))]

        # Reading order: 50-pixel-high bands first, then x position.
        kept.sort(key=lambda rect: (rect[1] // 50, rect[0]))

        margin = 6
        pieces = []
        for x, y, w, h in kept:
            left, top = max(0, x - margin), max(0, y - margin)
            right, bottom = min(img_w, x + w + margin), min(img_h, y + h + margin)
            region = source[top:bottom, left:right]
            pieces.append(Image.fromarray(cv2.cvtColor(region, cv2.COLOR_BGR2RGB)))

        return pieces

    except Exception as e:
        print(f"[ERROR] Hieroglyph segmentation failed: {e}")
        return []
63
+
64
def validate_image(file):
    """Validate an uploaded image file.

    Checks, in order: the upload size against ``Config.MAX_FILE_SIZE``, the
    filename extension against ``Config.ALLOWED_EXTENSIONS``, and finally
    that Pillow can open and verify the stream.

    Args:
        file: Upload object exposing ``filename`` and ``stream`` and,
            optionally, ``content_length`` (presumably a Werkzeug
            ``FileStorage`` -- confirm against callers).

    Returns:
        ``True`` when every check passes; the stream is rewound to the start.

    Raises:
        ValueError: If the file is too large, has a missing or disallowed
            extension, or is not a valid image.
    """
    import io
    from config import Config
    config = Config()

    # Check file size. The declared content length may be missing, None or 0
    # (e.g. when the client sent no per-part length header), which would
    # either crash the comparison or silently skip the limit, so fall back to
    # measuring the stream itself.
    size = getattr(file, 'content_length', None)
    if not size:
        try:
            stream = file.stream
            position = stream.tell()
            stream.seek(0, io.SEEK_END)
            size = stream.tell()
            stream.seek(position)  # leave the stream where it was found
        except (AttributeError, OSError, ValueError):
            size = None  # unseekable or absent stream: size is unknown

    if size is not None and size > config.MAX_FILE_SIZE:
        raise ValueError(f"File too large. Maximum size: {config.MAX_FILE_SIZE} bytes")

    # Check file extension
    if not file.filename or '.' not in file.filename:
        raise ValueError("Invalid filename")

    extension = file.filename.rsplit('.', 1)[1].lower()
    if extension not in config.ALLOWED_EXTENSIONS:
        raise ValueError(f"Invalid file type. Allowed: {', '.join(config.ALLOWED_EXTENSIONS)}")

    # Try to open as image
    try:
        image = Image.open(file.stream)
        image.verify()
        file.stream.seek(0)  # Reset stream for later use
        return True
    except Exception as exc:
        # Keep the underlying Pillow error attached for debugging.
        raise ValueError("File is not a valid image") from exc
89
+
90
def preprocess_for_latin_ocr(image_path):
    """Prepare an image of Latin text for OCR.

    The image is read from disk, converted to grayscale, smoothed with a
    bilateral filter (which reduces noise while preserving edges) and then
    binarised with Gaussian adaptive thresholding to cope with uneven
    lighting.

    Args:
        image_path: Path of the image file to read with ``cv2.imread``.

    Returns:
        The thresholded image produced by ``cv2.adaptiveThreshold``, or
        ``None`` if loading or processing fails (the error is printed).
    """
    try:
        loaded = cv2.imread(image_path)
        if loaded is None:
            raise ValueError(f"Cannot load image: {image_path}")

        denoised = cv2.bilateralFilter(
            cv2.cvtColor(loaded, cv2.COLOR_BGR2GRAY), 9, 75, 75
        )

        return cv2.adaptiveThreshold(
            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 11, 2,
        )

    except Exception as e:
        print(f"[ERROR] Latin preprocessing failed: {e}")
        return None
115
+
116
def enhance_contrast_for_manuscripts(image):
    """Boost local contrast of a manuscript image with CLAHE.

    Uses Contrast Limited Adaptive Histogram Equalization with a clip limit
    of 3.0 and an 8x8 tile grid.

    Args:
        image: Image array handed to ``CLAHE.apply`` (presumably a
            single-channel grayscale image -- confirm against callers).

    Returns:
        The equalized image returned by ``CLAHE.apply``.
    """
    equalizer = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    return equalizer.apply(image)
utils/text_utils.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from collections import Counter
3
+ from itertools import groupby
4
+
5
def is_gibberish(text):
    """Heuristically decide whether ``text`` looks like gibberish.

    Text is treated as gibberish when it is empty or not a string, contains
    no word characters, is dominated by one repeated word, or consists of a
    single word shorter than three characters.

    Args:
        text: Candidate text (any object; non-strings count as gibberish).

    Returns:
        ``True`` if the text looks like gibberish, ``False`` otherwise.
    """
    if not text or not isinstance(text, str):
        return True

    words = re.findall(r"\w+", text.lower())
    if not words:
        return True

    # Excessive repetition: the most frequent word occurs more than 12 times
    # or makes up more than 40% of all words. The share test only applies
    # when that word actually repeats; otherwise every one- or two-word text
    # would trip it (1/1 and 1/2 both exceed 0.4) and the short-single-word
    # rule below could never be reached.
    _, count = Counter(words).most_common(1)[0]
    if count > 12 or (count > 1 and count / len(words) > 0.4):
        return True

    # A lone word of one or two characters carries no usable content.
    if len(words) == 1 and len(words[0]) < 3:
        return True

    return False
28
+
29
def build_description_from_codes(codes):
    """Turn a sequence of Gardiner codes into a comma-separated description.

    Each code is mapped through ``Config.CODE_TO_LABEL`` (codes without an
    entry are kept as-is). Consecutive identical labels are collapsed into
    ``"label (xN)"``, and labels equal to ``"?"`` or ``None`` are rendered
    as ``"unknown"``.

    Args:
        codes: Iterable of Gardiner codes.

    Returns:
        The description string, with parts joined by ``", "``.
    """
    from config import Config
    lookup = Config().CODE_TO_LABEL

    parts = []
    for label, run in groupby(lookup.get(code, code) for code in codes):
        repeats = sum(1 for _ in run)
        if label == "?" or label is None:
            shown = "unknown"
        else:
            shown = label
        if repeats > 1:
            parts.append(f"{shown} (x{repeats})")
        else:
            parts.append(shown)

    return ", ".join(parts)
43
+
44
def clean_text(text):
    """Normalize whitespace in ``text``.

    Every run of whitespace is collapsed to a single space and leading and
    trailing whitespace is removed. Falsy input yields an empty string.
    """
    return re.sub(r'\s+', ' ', text).strip() if text else ""
56
+
57
def extract_words(text, min_length=2):
    """Return the words in ``text`` that have at least ``min_length`` characters.

    Words are maximal runs of ``\\w`` characters (Unicode-aware). Falsy
    input yields an empty list.
    """
    if not text:
        return []

    candidates = re.findall(r"\w+", text, flags=re.UNICODE)
    return list(filter(lambda token: len(token) >= min_length, candidates))
64
+
65
def calculate_text_stats(text):
    """Compute basic statistics for ``text``.

    Returns:
        A dict with ``char_count`` (length of the text), ``word_count``
        (number of words returned by ``extract_words`` with its default
        minimum length), ``unique_chars`` (number of distinct characters)
        and ``avg_word_length`` (mean length of those words, dividing by at
        least 1). Falsy input yields all zeros.
    """
    if not text:
        return {
            "char_count": 0,
            "word_count": 0,
            "unique_chars": 0,
            "avg_word_length": 0,
        }

    tokens = extract_words(text)
    token_total = len(tokens)
    letters_total = sum(map(len, tokens))

    stats = {
        "char_count": len(text),
        "word_count": token_total,
        "unique_chars": len(set(text)),
        # max(1, ...) guards the division when no words were extracted.
        "avg_word_length": letters_total / max(1, token_total),
    }
    return stats
utils/validation.py ADDED
File without changes