import json
import logging
import os
import random
import re
import unicodedata

import cv2
import numpy as np
import pytesseract
import torch
from PIL import Image

from .base_processor import BaseScriptProcessor
from utils.text_utils import is_gibberish

BACKEND_MODELS_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "models"))
GREEK_TROCR_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "greek_trocr")

# Lower-case basic Latin alphabet used to measure how "Latin" an OCR result is.
_LATIN_LOWERCASE = "abcdefghijklmnopqrstuvwxyz"


def _is_greek_char(ch):
    """Return True if *ch* is in the Greek and Coptic or Greek Extended block."""
    code_point = ord(ch)
    # Greek and Coptic (0x0370-0x03FF), Greek Extended (0x1F00-0x1FFF).
    return (0x0370 <= code_point <= 0x03FF) or (0x1F00 <= code_point <= 0x1FFF)


def _strip_accents(text):
    """Return *text* with all combining marks (category ``Mn``) removed."""
    decomposed = unicodedata.normalize('NFD', text)
    return "".join(c for c in decomposed if unicodedata.category(c) != 'Mn')


def _as_clean_text(value):
    """Coerce a glossary field to a stripped string; ``None`` becomes ``""``."""
    if value is None:
        return ""
    return str(value).strip()


class GreekProcessor(BaseScriptProcessor):
    """Script processor for Greek images: OCR, term extraction and Groq context.

    Text extraction tries a TrOCR model first and then several Tesseract
    configurations. ``layout_parser``, ``rag_service``, ``references`` and
    ``groq_client`` are read from ``self`` but not assigned in this class.
    NOTE(review): presumably they are set by ``BaseScriptProcessor`` - confirm.
    """

    def __init__(self, groq_client, references, clip_classifier):
        """Initialise OCR availability flags, TrOCR state and glossary metrics.

        Args:
            groq_client: Forwarded to ``BaseScriptProcessor.__init__``.
            references: Forwarded to ``BaseScriptProcessor.__init__``.
            clip_classifier: Forwarded to the base class and stored on ``self``.
        """
        super().__init__(groq_client, references, clip_classifier)
        self.clip_classifier = clip_classifier
        self.setup_ancient_greek_ocr()

        # TrOCR is loaded lazily by setup_greek_trocr(); nothing is loaded here.
        self.trocr_model = None
        self.trocr_processor = None
        self.trocr_available = False
        # True only after a load attempt has actually failed. This lets
        # detect_script() tell "not loaded yet" apart from "cannot be loaded".
        self._trocr_load_failed = False
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Register for dynamic VRAM management
        from utils.gpu_diagnostics import register_processor
        register_processor("greek", self)

        # Metrics for Greek Glossary
        self.glossary_success_count = 0
        self.glossary_json_failure_count = 0
        # Incremented each time the regex recovery path is attempted.
        self.regex_recovery_count = 0

    def setup_greek_trocr(self):
        """Setup TrOCR model — BEST for ancient Greek manuscripts

        Loads the processor and model, moves the model to ``self.device`` and
        switches it to eval mode. Any exception is caught and logged; the
        outcome is recorded in ``trocr_available`` and ``_trocr_load_failed``.
        """
        try:
            from utils.gpu_diagnostics import reclaim_vram_for
            reclaim_vram_for("greek")

            print("[INFO] Lazily loading TrOCR model for ancient Greek...")
            # Imported here so transformers is only needed when TrOCR is used.
            from transformers import TrOCRProcessor, VisionEncoderDecoderModel

            hf_token = os.getenv("HF_TOKEN")

            self.trocr_processor = TrOCRProcessor.from_pretrained(
                'rithwikn/trocr_greek_combined',
                cache_dir=GREEK_TROCR_MODEL_DIR,
                local_files_only=False,
                token=hf_token
            )
            self.trocr_model = VisionEncoderDecoderModel.from_pretrained(
                'rithwikn/trocr_greek_combined',
                cache_dir=GREEK_TROCR_MODEL_DIR,
                local_files_only=False,
                token=hf_token
            )

            self.trocr_model.to(self.device)
            self.trocr_model.eval()  # Put in evaluation mode

            from utils.gpu_diagnostics import log_model_device
            log_model_device("Greek TrOCR", self.device)

            self.trocr_available = True
            self._trocr_load_failed = False
            print(f"[INFO] Ancient Greek TrOCR loaded successfully on {self.device}")
        except Exception as e:
            print(f"[ERROR] Ancient Greek TrOCR failed to load: {e}")
            self.trocr_available = False
            self._trocr_load_failed = True

    def setup_ancient_greek_ocr(self):
        """Setup Ancient Greek OCR with Tesseract language check

        Sets ``self.grc_available`` to whether Tesseract reports the ``grc``
        language; it is False if the language query itself fails.
        """
        try:
            langs = pytesseract.get_languages(config='')
            self.grc_available = "grc" in langs
            if self.grc_available:
                print("[INFO] Ancient Greek Tesseract language pack 'grc' is available")
            else:
                print("[WARN] Ancient Greek Tesseract language pack 'grc' is NOT available")
        except Exception as e:
            print(f"[ERROR] Failed to check Tesseract languages: {e}")
            self.grc_available = False

    def detect_script(self, image_path):
        """Simplified detection - Groq Vision handles main classification

        Args:
            image_path: Unused; the image itself is not inspected here.

        Returns:
            ``(True, 0.95)`` when an OCR backend may be usable, ``(False, 0.5)``
            when TrOCR has failed to load and the ``grc`` pack is missing, and
            ``(False, 0.0)`` on an unexpected error.
        """
        try:
            trocr_loaded = getattr(self, 'trocr_available', False)
            trocr_load_failed = getattr(self, '_trocr_load_failed', False)

            # TrOCR is loaded lazily, so "not loaded yet" must not count as
            # "unavailable": only a failed load attempt rules it out.
            if not trocr_loaded and trocr_load_failed:
                # Check if Ancient Greek OCR is available as fallback
                if not getattr(self, 'grc_available', False):
                    print("[INFO] Greek processor not available (neither TrOCR nor Tesseract)")
                    return False, 0.5

            # If called by Groq Vision classification, accept with high confidence
            print("[INFO] Greek processor activated by Groq Vision (Llama-4-Scout)")
            return True, 0.95
        except Exception as e:
            print(f"[ERROR] Greek detection failed: {e}")
            return False, 0.0

    def _quick_greek_ocr_test(self, image_path):
        """Quick OCR test to validate Greek content

        Runs Tesseract (``ell``) on the central crop of the image and returns
        True when at least three Greek characters are found. Any error yields
        False.
        """
        try:
            # The context manager closes the underlying file handle.
            with Image.open(image_path) as image:
                # Take center crop for testing
                w, h = image.size
                crop_box = (w // 4, h // 4, 3 * w // 4, 3 * h // 4)
                test_crop = image.crop(crop_box)

                # Test with standard Greek OCR
                test_text = pytesseract.image_to_string(test_crop, lang="ell")

            greek_char_count = self._count_greek_chars(test_text or "")
            # If we find Greek characters, it's likely Greek
            return greek_char_count >= 3
        except Exception:
            return False

    def _ensure_trocr_ready(self):
        """Load the TrOCR model if needed, or move it back to ``self.device``.

        Returns:
            True when TrOCR is available and loaded afterwards, else False.
            Errors are logged rather than raised so callers can still fall
            back to Tesseract.
        """
        try:
            if self.trocr_model is None:
                self.setup_greek_trocr()
            else:
                from utils.gpu_diagnostics import reclaim_vram_for
                reclaim_vram_for("greek")
                # NOTE(review): the model may have been moved off self.device
                # elsewhere (the processor is registered for VRAM management
                # in __init__) - confirm against utils.gpu_diagnostics.
                if str(next(self.trocr_model.parameters()).device) != str(self.device):
                    print(f"[VRAM MANAGER] Activating Greek TrOCR model on {self.device}...")
                    self.trocr_model.to(self.device)
        except Exception as e:
            print(f"[ERROR] Greek TrOCR activation failed: {e}")
            return False
        return bool(getattr(self, 'trocr_available', False)) and self.trocr_model is not None

    def extract_text(self, image_path):
        """Enhanced Greek text extraction with TrOCR primary, Tesseract fallback

        Stages are tried in order and the first result accepted by
        ``_validate_greek_text`` is returned: TrOCR, Tesseract ``grc``,
        Tesseract ``ell``, then layout-aware line OCR.

        Returns:
            The extracted text, or ``""`` when no stage yields valid Greek or
            an unexpected error occurs.
        """
        try:
            # Keep the image open for all stages and close it on every exit.
            with Image.open(image_path) as image:
                # Method 1: Ancient Greek TrOCR (if available)
                if self._ensure_trocr_ready():
                    print("[INFO] Attempting Ancient Greek extraction with TrOCR...")
                    trocr_text = self._extract_with_trocr(image_path)
                    if trocr_text and self._validate_greek_text(trocr_text):
                        print("[INFO] Using Ancient Greek TrOCR result")
                        return trocr_text
                    print("[WARN] TrOCR extraction returned poor quality result, trying Tesseract fallback...")

                # Method 2: Ancient Greek OCR (if available and safe)
                if getattr(self, 'grc_available', False):
                    ancient_greek_text = self._extract_with_ancient_greek_ocr(image)
                    if ancient_greek_text and self._validate_greek_text(ancient_greek_text):
                        print("[INFO] Using Ancient Greek OCR result")
                        return ancient_greek_text

                # Method 3: Standard Greek OCR
                standard_greek_text = self._extract_with_standard_greek_ocr(image)
                if standard_greek_text and self._validate_greek_text(standard_greek_text):
                    print("[INFO] Using standard Greek OCR result")
                    return standard_greek_text

                # Method 4: Layout-aware line segment fallback
                print("[INFO] Trying layout-aware Greek segmentation fallback...")
                layout_aware_greek_text = self._extract_layout_aware_ocr(image_path)
                if layout_aware_greek_text and self._validate_greek_text(layout_aware_greek_text):
                    print("[INFO] Using layout-aware Greek OCR result")
                    return layout_aware_greek_text

            # Method 5: Final validation - if no good Greek text found, return empty
            print("[INFO] No valid Greek text detected")
            return ""
        except Exception as e:
            print(f"[ERROR] Greek text extraction failed: {e}")
            return ""

    def _extract_with_trocr(self, image_path):
        """Extract text using TrOCR Ancient Greek model line-by-line

        The page is split into line crops by ``self.layout_parser``; when no
        crops are returned the whole image is used as a single crop.

        Returns:
            Non-empty decoded lines joined with newlines, or ``""`` when TrOCR
            is not usable or inference fails.
        """
        if not self._ensure_trocr_ready():
            return ""

        try:
            print("[INFO] Segmenting layout for Greek TrOCR...")
            layout = self.layout_parser.analyze_layout(image_path)
            crops = self.layout_parser.crop_lines(image_path, layout)

            # Fallback to whole image if no crops detected
            if not crops:
                print("[WARN] No line crops found, processing full image with TrOCR")
                # convert() returns a new image, so the file can be closed
                # as soon as the conversion is done.
                with Image.open(image_path) as full_page:
                    crops = [full_page.convert("RGB")]

            line_texts = []
            print(f"[INFO] Running Ancient Greek TrOCR inference on {len(crops)} crops...")

            for crop in crops:
                # Ensure RGB mode for TrOCR
                crop_rgb = crop.convert("RGB")
                pixel_values = self.trocr_processor(
                    images=crop_rgb,
                    return_tensors="pt"
                ).pixel_values.to(self.device)

                with torch.inference_mode():
                    generated_ids = self.trocr_model.generate(
                        pixel_values,
                        max_length=256,
                        num_beams=4,
                        early_stopping=True,
                        repetition_penalty=1.2
                    )

                text = self.trocr_processor.batch_decode(
                    generated_ids,
                    skip_special_tokens=True
                )[0]

                stripped = text.strip()
                if stripped:
                    line_texts.append(stripped)

            full_text = "\n".join(line_texts)
            print(f"[SUCCESS] TrOCR extracted {len(line_texts)} lines from Greek image")
            return full_text
        except Exception as e:
            print(f"[ERROR] Greek TrOCR extraction failed: {e}")
            return ""

    def _extract_with_ancient_greek_ocr(self, image):
        """Extract using specialized Ancient Greek OCR

        Returns the stripped Tesseract ``grc`` output for *image*, or ``""``
        when the ``grc`` pack is unavailable or Tesseract raises.
        """
        try:
            if not getattr(self, 'grc_available', False):
                return ""

            # Use ancient Greek language code 'grc' with optimized settings
            config = "--psm 6 --oem 1 -c preserve_interword_spaces=1"

            # Try ancient Greek language pack
            text = pytesseract.image_to_string(
                image,
                lang="grc",  # Ancient Greek language code
                config=config
            )
            return text.strip()
        except Exception as e:
            print(f"[WARN] Ancient Greek OCR failed: {e}")
            return ""

    def _extract_layout_aware_ocr(self, image_path):
        """Extract text by segmenting the page layout into lines first for improved readability order

        Each line crop is contrast-enhanced with CLAHE and OCR'd with
        ``grc`` (when available), falling back to ``ell`` if that is empty.

        Returns:
            Non-empty line results joined with newlines, or ``""`` when there
            are no crops or an error occurs.
        """
        try:
            print("[INFO] Running layout-aware line segmentation for Greek...")
            layout = self.layout_parser.analyze_layout(image_path)
            crops = self.layout_parser.crop_lines(image_path, layout)

            if not crops:
                print("[WARN] Layout parser returned no line crops for Greek")
                return ""

            print(f"[INFO] Layout-aware Greek line parser cropped {len(crops)} lines")

            # Try to use Ancient Greek first
            use_grc = getattr(self, 'grc_available', False)
            config = '--oem 3 --psm 7'
            # The CLAHE operator is loop-invariant, so build it once.
            clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(4, 4))

            line_texts = []
            for crop in crops:
                # Enhance line crop for OCR. Converting to RGB first means
                # crops in other modes do not make the colour conversion fail.
                gray = cv2.cvtColor(np.array(crop.convert("RGB")), cv2.COLOR_RGB2GRAY)
                crop_pil = Image.fromarray(clahe.apply(gray))

                text = ""
                if use_grc:
                    text = pytesseract.image_to_string(
                        crop_pil,
                        lang='grc',
                        config=config
                    ).strip()

                if not text:
                    text = pytesseract.image_to_string(
                        crop_pil,
                        lang='ell',
                        config=config
                    ).strip()

                if text:
                    line_texts.append(text)

            return "\n".join(line_texts)
        except Exception as e:
            print(f"[WARN] Layout aware Greek OCR failed: {e}")
            return ""

    def _extract_with_standard_greek_ocr(self, image):
        """Extract using standard Greek OCR with optimized settings

        Tries several page-segmentation modes with the ``ell`` language and
        returns the first stripped result accepted by ``_validate_greek_text``,
        or ``""`` if none is.
        """
        try:
            # Multiple OCR attempts with different settings
            configs = [
                "--psm 6 --oem 1",  # Uniform text block
                "--psm 4 --oem 1",  # Single column text
                "--psm 3 --oem 1",  # Default, automatic page segmentation
                "--psm 8 --oem 1"   # Single word
            ]

            for config in configs:
                try:
                    text = pytesseract.image_to_string(
                        image,
                        lang="ell",  # Modern Greek
                        config=config
                    )
                    if text and self._validate_greek_text(text):
                        return text.strip()
                except Exception:
                    # Best effort: a failing configuration just moves on.
                    continue

            return ""
        except Exception as e:
            print(f"[WARN] Standard Greek OCR failed: {e}")
            return ""

    def _extract_with_preprocessing(self, image):
        """Fallback extraction with image preprocessing

        OCRs (``ell``) several preprocessed variants of *image* and returns
        the first stripped result accepted by ``_validate_greek_text``, or
        ``""`` if none is.
        """
        try:
            # Convert PIL to a grayscale array; RGB conversion first so other
            # image modes are accepted too.
            gray = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2GRAY)

            # Try different preprocessing approaches
            preprocessed_images = [
                gray,  # Original grayscale
                # A 1x1 kernel is an identity filter, so 3x3 is used to get
                # an actual slight blur instead of repeating the first image.
                cv2.GaussianBlur(gray, (3, 3), 0),  # Slight blur
                cv2.medianBlur(gray, 3),  # Noise reduction
                cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]  # Otsu threshold
            ]

            for processed_img in preprocessed_images:
                try:
                    pil_img = Image.fromarray(processed_img)
                    text = pytesseract.image_to_string(
                        pil_img,
                        lang="ell",
                        config="--psm 6 --oem 1"
                    )
                    if self._validate_greek_text(text):
                        return text.strip()
                except Exception:
                    # Best effort: a failing variant just moves on.
                    continue

            return ""
        except Exception as e:
            print(f"[WARN] Fallback Greek OCR failed: {e}")
            return ""

    def _count_greek_chars(self, text):
        """Count Greek Unicode characters including polytonic marks

        Counts characters in the Greek and Coptic (0x0370-0x03FF) and Greek
        Extended (0x1F00-0x1FFF) blocks; returns 0 for empty input.
        """
        if not text:
            return 0
        return sum(_is_greek_char(ch) for ch in text)

    def _validate_greek_text(self, text):
        """Validate if text contains meaningful Greek content

        Returns:
            False for text shorter than three stripped characters, for text
            that is more than 80% basic Latin letters with fewer than three
            Greek characters, and otherwise True when there are at least five
            Greek characters or they make up at least 20% of the
            non-whitespace characters.
        """
        if not text or len(text.strip()) < 3:
            return False

        # Count Greek characters
        greek_char_count = self._count_greek_chars(text)
        total_chars = len(re.sub(r'\s+', '', text))

        if total_chars == 0:
            return False

        # Check for Latin characters (should reject if too many)
        latin_chars = sum(c.isalpha() and c.lower() in _LATIN_LOWERCASE for c in text)
        latin_ratio = latin_chars / total_chars

        # If text is mostly Latin characters, reject it
        if latin_ratio > 0.8 and greek_char_count < 3:
            print(f"[INFO] Rejecting text as Greek - too many Latin chars: {latin_ratio:.2f}")
            return False

        # At least 20% should be Greek characters, or minimum 5 Greek chars
        greek_ratio = greek_char_count / total_chars
        return greek_char_count >= 5 or greek_ratio >= 0.20

    def _extract_distinct_terms(self, text):
        """Extract distinct Greek terms from text

        Returns:
            Up to 20 words of at least two characters that contain a Greek
            character, in first-seen order and de-duplicated
            case-insensitively. Words are returned in NFC form.
        """
        if not text:
            return []

        # Compose first: in decomposed (NFD) text the combining accents are
        # not letters, so the pattern below would split words at every mark.
        composed = unicodedata.normalize('NFC', text)

        # Find Greek words (including those with diacritical marks)
        tokens = re.findall(r"[^\W\d_]+", composed, flags=re.UNICODE)

        distinct_terms = []
        seen = set()
        for token in tokens:
            if len(token) < 2:  # Skip single characters
                continue
            if not any(_is_greek_char(ch) for ch in token):
                continue
            normalized = token.lower()
            if normalized not in seen:
                distinct_terms.append(token)
                seen.add(normalized)

        return distinct_terms[:20]  # Limit to 20 terms

    def process_text(self, greek_text):
        """Process extracted Greek text

        Returns:
            A dict with keys ``text``, ``terms``, ``char_analysis`` and
            ``validation``; for empty input all values are empty.
        """
        if not greek_text:
            return {"text": "", "terms": [], "char_analysis": {}, "validation": {}}

        # Extract distinct terms
        terms = self._extract_distinct_terms(greek_text)

        # Character analysis
        char_analysis = {
            "total_chars": len(greek_text),
            "greek_chars": self._count_greek_chars(greek_text),
            "unique_chars": len(set(greek_text)),
            "words": len(greek_text.split())
        }

        # Validation metrics
        validation = {
            "has_polytonic": self._has_polytonic_marks(greek_text),
            "greek_ratio": char_analysis["greek_chars"] / max(1, char_analysis["total_chars"]),
            "quality_score": self._calculate_quality_score(greek_text)
        }

        return {
            "text": greek_text,
            "terms": terms,
            "char_analysis": char_analysis,
            "validation": validation
        }

    def _has_polytonic_marks(self, text):
        """Check if text contains polytonic Greek marks

        True when any character lies in the Greek Extended block
        (0x1F00-0x1FFF).
        """
        # Greek Extended block contains polytonic diacritical marks
        return any(0x1F00 <= ord(ch) <= 0x1FFF for ch in text)

    def _calculate_quality_score(self, text):
        """Calculate a quality score for the extracted text

        Returns:
            A float capped at 1.0: 0.4 x Greek-character ratio, plus 0.3 when
            polytonic characters are present, plus 0.3 x alphabetic ratio.
            Empty text scores 0.0.
        """
        if not text:
            return 0.0

        length = max(1, len(text))
        score = 0.0

        # Base score from Greek character ratio
        greek_ratio = self._count_greek_chars(text) / length
        score += greek_ratio * 0.4

        # Bonus for polytonic marks (indicates authentic ancient Greek)
        if self._has_polytonic_marks(text):
            score += 0.3

        # Bonus proportional to the share of alphabetic characters, so text
        # full of digits, punctuation or whitespace scores lower.
        alpha_chars = sum(ch.isalpha() for ch in text)
        alpha_ratio = alpha_chars / length
        score += alpha_ratio * 0.3

        return min(1.0, score)

    def generate_historical_context(self, processed_result):
        """Generate historical context for Greek text

        Args:
            processed_result: Dict read for its ``text`` and ``terms`` keys
                (the shape returned by ``process_text``).

        Returns:
            A dict with keys ``uses_box``, ``meaning_box`` and ``references``.
        """
        greek_text = processed_result.get("text", "")
        terms = processed_result.get("terms", [])

        # Generate Groq context
        groq_detail = self._generate_groq_context(greek_text)

        # Build references - query both words and individual characters
        query_terms = list(terms) if terms else []
        if greek_text:
            # NOTE(review): every non-whitespace character is added, duplicates
            # included; whether repeats affect retrieval is not visible here.
            query_terms.extend([char for char in greek_text if char.strip()])

        # ASCII-escaped so printing cannot fail on consoles that lack Greek.
        print(f"[DEBUG GREEK RAG] query_terms: {[t.encode('ascii', 'backslashreplace').decode() for t in query_terms]}")
        refs = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)
        print(f"[DEBUG GREEK RAG] refs: {[r['term'].encode('ascii', 'backslashreplace').decode() for r in refs]}")

        return {
            "uses_box": {
                "title": "Each symbol's possible use by the Greek people",
                "items": self._build_uses_list(terms, greek_text)
            },
            "meaning_box": self._build_meaning_box(terms, groq_detail),
            "references": refs
        }

    def _generate_groq_context(self, greek_text):
        """Generate contextual information using Groq

        Returns the Groq response, or a fixed placeholder string when the
        client is unavailable or returns a falsy result.
        """
        if not self.groq_client.is_available():
            return "(Groq unavailable) Context generation requires GROQ_API_KEY and groq package."

        prompt = (
            f"This ancient Greek text was found: {greek_text}\n\n"
            "Write a concise, scholarly paragraph (6-10 sentences) giving cultural and historical context: textual tradition, "
            "possible meanings, links to Greek culture/myth/philosophy, manuscript practices (accents, breathings, ligatures, nomina sacra), "
            "and paleographic cues. Avoid repeating the prompt."
        )

        system_prompt = "You are an expert philologist of Ancient Greece. Provide concise, accurate scholarly context."
        enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, greek_text)

        return self.groq_client.generate_response(
            system_prompt=enriched_system_prompt,
            user_prompt=prompt
        ) or "(context unavailable due to Groq error)"

    def _generate_batch_explanations(self, terms):
        """Generate scholarly glossary definitions for Greek terms in a single batch query

        Args:
            terms: Greek words; only the first 15 are sent.

        Returns:
            A dict mapping each returned term to its parsed entry (expected to
            hold ``definition``, ``gloss`` and ``etymology``). Returns ``{}``
            when there are no terms, Groq is unavailable, the response is
            empty, or an unexpected error occurs.
        """
        if not terms or not self.groq_client or not self.groq_client.is_available():
            return {}

        # Limit to first 15 terms to prevent token limit/truncation issues
        terms_to_query = list(terms)[:15]
        terms_list = ", ".join(terms_to_query)

        system_prompt = (
            "You are an expert classicist and lexicographer of Ancient Greek. "
            "Return ONLY valid JSON matching the requested schema. "
            "No markdown, no code fences (like ```json), no explanations, no prose."
        )

        user_prompt = (
            f"For each of the following Ancient Greek words, provide a scholarly definition, "
            f"etymological note, and grammatical gloss:\n\n"
            f"Words: {terms_list}\n\n"
            f"You MUST format the output as a single JSON object where the keys are the exact words "
            f"and the values are objects containing 'definition', 'gloss', and 'etymology' keys.\n\n"
            f"Output schema:\n"
            f"{{\n"
            f" \"TERM\": {{\n"
            f" \"definition\": \"...\",\n"
            f" \"gloss\": \"...\",\n"
            f" \"etymology\": \"...\"\n"
            f" }}\n"
            f"}}\n"
        )

        try:
            raw_response = self.groq_client.generate_response(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                max_tokens=2048,
                response_format={"type": "json_object"}
            )

            # Other callers in this class treat a falsy response as a failed
            # call, so handle it explicitly instead of failing on .encode().
            if not raw_response or not isinstance(raw_response, str):
                print("[WARN] Groq glossary response was empty; skipping glossary generation")
                return {}

            # Safe print to avoid UnicodeEncodeError in Windows command prompt
            print(f"[INFO] Groq glossary raw response: {raw_response.encode('ascii', 'backslashreplace').decode()}")

            # Find JSON block in response
            json_str = raw_response.strip()
            if "{" in json_str and "}" in json_str:
                start = json_str.find("{")
                end = json_str.rfind("}") + 1
                json_str = json_str[start:end]

            definitions = {}
            try:
                parsed = json.loads(json_str)
                # Callers iterate .items(), so anything but an object is a
                # malformed answer and takes the recovery path.
                if not isinstance(parsed, dict):
                    raise ValueError("glossary JSON is not an object")
                definitions = parsed
                self.glossary_success_count += 1
            except Exception as je:
                self.glossary_json_failure_count += 1
                logging.getLogger(__name__).warning(
                    "Malformed Greek glossary JSON",
                    extra={"response": raw_response[:2000]}
                )
                print(f"[WARN] Standard JSON load failed: {je}. Attempting regex recovery...")

                # Regex recovery fallback: salvage every '"term": {...}' block.
                self.regex_recovery_count += 1
                term_blocks = re.findall(r'"([^"]+)"\s*:\s*\{([^}]+)\}', json_str)
                for term, block in term_blocks:
                    def_match = re.search(r'"definition"\s*:\s*["\']([^"\']+)["\']', block)
                    gloss_match = re.search(r'"gloss"\s*:\s*["\']([^"\']+)["\']', block)
                    ety_match = re.search(r'"etymology"\s*:\s*["\']([^"\']+)["\']', block)

                    definitions[term] = {
                        "definition": def_match.group(1) if def_match else "",
                        "gloss": gloss_match.group(1) if gloss_match else "",
                        "etymology": ety_match.group(1) if ety_match else ""
                    }

            return definitions
        except Exception as e:
            print(f"[WARN] Failed to generate batch Greek explanations: {e}")
            return {}

    def _build_uses_list(self, terms, greek_text):
        """Build list of symbol/word uses using RAG and batch Groq explanations

        Args:
            terms: Greek words to define; only the first 15 unique ones are used.
            greek_text: Source text, scanned for characters that have an entry
                in ``self.references["greek_symbol_notes"]``.

        Returns:
            A list of ``"- ..."`` strings: one per term, then up to five
            character notes, or a single default hint when both are empty.
        """
        items = []

        # 1. Get definitions for the extracted Greek words (terms)
        if terms:
            # Unique terms preserving order, limited to the top 15 to be concise
            unique_terms = list(dict.fromkeys(terms))[:15]

            print(f"[INFO] Generating glossary for {len(unique_terms)} Greek terms...")
            definitions = {}
            missing_terms = []

            for term in unique_terms:
                # Check RAG corpus (normalize search query)
                norm_term = unicodedata.normalize('NFC', term).strip()
                rag_matches = self.rag_service.retrieve_grounding_list([norm_term], max_results=1)
                if rag_matches:
                    definitions[term] = rag_matches[0]["definition"]
                else:
                    missing_terms.append(term)

            # Generate remaining definitions with Groq in a single batch
            if missing_terms:
                groq_defs = self._generate_batch_explanations(missing_terms)
                if not isinstance(groq_defs, dict):
                    groq_defs = {}

                # Normalize groq keys for matching
                normalized_groq_defs = {}
                for key, value in groq_defs.items():
                    normalized_key = unicodedata.normalize('NFC', key).strip().lower()
                    normalized_groq_defs[normalized_key] = value

                # Accent-insensitive index built once; setdefault keeps the
                # first entry when several keys collapse to the same form.
                accentless_defs = {}
                for key, value in normalized_groq_defs.items():
                    accentless_defs.setdefault(_strip_accents(key), value)

                # Assign matching definitions
                for term in missing_terms:
                    normalized_term = unicodedata.normalize('NFC', term).strip().lower()
                    if normalized_term in normalized_groq_defs:
                        definitions[term] = normalized_groq_defs[normalized_term]
                        continue
                    # Case/accent insensitive backup match (in case Groq stripped accents)
                    accentless_term = _strip_accents(normalized_term)
                    if accentless_term in accentless_defs:
                        definitions[term] = accentless_defs[accentless_term]

            for term in unique_terms:
                definition = definitions.get(term)
                if not definition:
                    definition = "Ancient Greek lexical term. Characterized by specific diacritics and phonological values."
                elif isinstance(definition, dict):
                    # Fields may be missing, null or non-string in the model's
                    # answer, so coerce them before stripping.
                    d_val = _as_clean_text(definition.get("definition"))
                    g_val = _as_clean_text(definition.get("gloss"))
                    e_val = _as_clean_text(definition.get("etymology"))

                    parts = []
                    if d_val:
                        parts.append(d_val)
                    if g_val:
                        parts.append(f"Gloss: {g_val}")
                    if e_val:
                        parts.append(f"Etymology: {e_val}")

                    definition = " | ".join(parts) if parts else "Ancient Greek lexical term."

                items.append(f"{term}: {definition}")

        # 2. Add significant paleographical/character markers found in the text if they are in the references
        notes = self.references.get("greek_symbol_notes", {}) or {}
        seen_chars = set()
        char_items = []
        for ch in greek_text:
            if ch in notes and ch not in seen_chars:
                seen_chars.add(ch)
                char_items.append(f"Character '{ch}': {notes[ch]}")

        # Limit character notes to prevent clutter
        items.extend(char_items[:5])

        # Format as list items with bullets
        formatted_items = [f"- {item}" for item in items]

        if not formatted_items:
            default_hint = self.references.get("greek_hint", "Ancient Greek script marker; values are determined by polytonic diacritical marks.")
            formatted_items.append(f"- —: {default_hint}")

        return formatted_items

    def _build_meaning_box(self, terms, groq_detail):
        """Build meaning interpretation box

        Args:
            terms: Extracted terms; the first ten are listed. ``None`` is
                treated as no terms.
            groq_detail: Optional text appended to the fixed points when it is
                a non-blank string.

        Returns:
            A dict with keys ``title``, ``intro_lines``, ``frequent_label``,
            ``frequent`` and ``points``.
        """
        intro_lines = [
            "The lexical concentration suggests a connected passage with recurring words or themes, consistent with Greek manuscript traditions.",
            "Scribal features such as accents/breathings, abbreviations, and marginal cues guide reading and assist with dating and genre identification."
        ]

        points = [
            "• Presence of nomina sacra, lection signs, or ekphonetic marks indicates liturgical usage; scholia imply classroom or commentary context.",
            "• Orthographic variation (e.g., iotacism) and common ligatures inform palaeographic placement and regional practice.",
        ]

        if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
            points.append(groq_detail.strip())

        return {
            "title": "Possible meaning:",
            "intro_lines": intro_lines,
            "frequent_label": "Key terms noted",
            "frequent": (terms or [])[:10],
            "points": points
        }

    def generate_story(self, processed_result):
        """Generate creative story for Greek text

        Args:
            processed_result: Dict read for its ``text`` key.

        Returns:
            The Groq-generated story, or a fixed fallback sentence when the
            client is unavailable, the response is empty, or ``is_gibberish``
            flags it.
        """
        greek_text = processed_result.get("text", "")

        if not self.groq_client.is_available():
            return "Groq client unavailable, cannot generate story."

        styles = [
            "as an epic poem told by a travelling rhapsode",
            "as a prophecy inscribed on the Oracle at Delphi",
            "as a philosophical dialogue in the Academy",
            "as a myth recounted by ancient storytellers",
            "as a recovered scroll from the Library of Alexandria",
            "as a hymn sung in honor of the gods"
        ]

        # A random style and seed are put in the prompt to vary the output.
        chosen_style = random.choice(styles)
        seed = random.randint(1000, 9999)

        prompt = (
            f"The following ancient Greek text was found: {greek_text}\n\n"
            f"Create a long, vivid, imaginative story from ancient Greek times "
            f"based on this Greek text. Write it as one rich paragraph with "
            f"much detail, mystery, and cultural atmosphere. At least 200 words.\n\n"
            f"Creative seed: {seed}\n"
            f"Write a detailed, imaginative myth-like story {chosen_style}. "
            "Include multiple characters, rich imagery, and scenes. "
            "Avoid repetition and keep it unpredictable."
        )

        system_prompt = "You are a learned ancient Greek storyteller and scholar of Hellenic culture."

        story = self.groq_client.generate_response(
            system_prompt=system_prompt,
            user_prompt=prompt
        )

        if not story or is_gibberish(story):
            return "Failed to create quality story; the ancient texts remain silent."

        return story