Spaces:
Sleeping
Sleeping
| import pytesseract | |
| import re | |
| import os | |
| import cv2 | |
| import numpy as np | |
| import torch | |
| from PIL import Image | |
| from .base_processor import BaseScriptProcessor | |
| from utils.text_utils import is_gibberish | |
# Absolute path of the "models" folder located one level above this module's
# own directory.
BACKEND_MODELS_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, "models")
)
# Cache directory handed to Hugging Face when loading the Greek TrOCR checkpoint.
GREEK_TROCR_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "greek_trocr")
| class GreekProcessor(BaseScriptProcessor): | |
def __init__(self, groq_client, references, clip_classifier):
    """Initialise the Greek processor.

    Forwards the three collaborators to the base class, probes Tesseract for
    the Ancient Greek language pack, sets up the (still unloaded) TrOCR state,
    registers this instance under the name "greek" with the GPU diagnostics
    helper and zeroes the glossary metrics.
    """
    super().__init__(groq_client, references, clip_classifier)
    self.clip_classifier = clip_classifier
    # Sets self.grc_available (True only when Tesseract reports the 'grc' pack).
    self.setup_ancient_greek_ocr()
    # TrOCR is not loaded here; setup_greek_trocr() fills these in on demand.
    self.trocr_model = None
    self.trocr_processor = None
    self.trocr_available = False
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Register for dynamic VRAM management
    from utils.gpu_diagnostics import register_processor
    register_processor("greek", self)
    # Metrics for Greek Glossary (updated by _generate_batch_explanations)
    self.glossary_success_count = 0
    self.glossary_json_failure_count = 0
    self.regex_recovery_count = 0
def setup_greek_trocr(self):
    """Load the Greek TrOCR processor and model and move the model to ``self.device``.

    On success ``self.trocr_processor``, ``self.trocr_model`` and
    ``self.trocr_available`` are updated together. On any failure the error is
    printed, ``self.trocr_available`` is set to False and the processor/model
    attributes are left as they were, so a half-finished load never leaves the
    instance with a model attached but flagged unavailable.
    """
    model_id = 'rithwikn/trocr_greek_combined'
    try:
        from utils.gpu_diagnostics import reclaim_vram_for
        reclaim_vram_for("greek")
        print("[INFO] Lazily loading TrOCR model for ancient Greek...")
        from transformers import TrOCRProcessor, VisionEncoderDecoderModel

        # Optional access token read from the environment (None when unset).
        hf_token = os.getenv("HF_TOKEN")
        processor = TrOCRProcessor.from_pretrained(
            model_id,
            cache_dir=GREEK_TROCR_MODEL_DIR,
            local_files_only=False,
            token=hf_token,
        )
        model = VisionEncoderDecoderModel.from_pretrained(
            model_id,
            cache_dir=GREEK_TROCR_MODEL_DIR,
            local_files_only=False,
            token=hf_token,
        )
        model.to(self.device)
        model.eval()  # Put in evaluation mode

        from utils.gpu_diagnostics import log_model_device
        log_model_device("Greek TrOCR", self.device)

        # Commit to the instance only after every step above succeeded.
        self.trocr_processor = processor
        self.trocr_model = model
        self.trocr_available = True
        print(f"[INFO] Ancient Greek TrOCR loaded successfully on {self.device}")
    except Exception as e:
        print(f"[ERROR] Ancient Greek TrOCR failed to load: {e}")
        self.trocr_available = False
def setup_ancient_greek_ocr(self):
    """Store in ``self.grc_available`` whether Tesseract lists the 'grc' language.

    Any error while querying Tesseract is printed and treated as "not available".
    """
    try:
        self.grc_available = "grc" in pytesseract.get_languages(config='')
        status_line = (
            "[INFO] Ancient Greek Tesseract language pack 'grc' is available"
            if self.grc_available
            else "[WARN] Ancient Greek Tesseract language pack 'grc' is NOT available"
        )
        print(status_line)
    except Exception as e:
        print(f"[ERROR] Failed to check Tesseract languages: {e}")
        self.grc_available = False
def detect_script(self, image_path):
    """Report whether this processor can handle an image already routed to it.

    ``image_path`` is not inspected. Returns ``(False, 0.5)`` when neither the
    TrOCR flag nor the Tesseract 'grc' flag is set, ``(True, 0.95)`` otherwise,
    and ``(False, 0.0)`` if the check itself raises.
    """
    try:
        backend_ready = (
            getattr(self, 'trocr_available', False)
            or getattr(self, 'grc_available', False)
        )
        if backend_ready:
            # If called by Groq Vision classification, accept with high confidence
            print("[INFO] Greek processor activated by Groq Vision (Llama-4-Scout)")
            return True, 0.95
        print("[INFO] Greek processor not available (neither TrOCR nor Tesseract)")
        return False, 0.5
    except Exception as e:
        print(f"[ERROR] Greek detection failed: {e}")
        return False, 0.0
def _quick_greek_ocr_test(self, image_path):
    """Run a quick OCR pass on the centre of the image to check for Greek.

    The middle half of the image (a quarter trimmed from each side) is OCR'd
    with the Tesseract 'ell' language. Returns True when at least three Greek
    characters are recognised, False otherwise or on any error.
    """
    try:
        # The context manager releases the file handle even if OCR raises.
        with Image.open(image_path) as image:
            w, h = image.size
            center_crop = image.crop((w // 4, h // 4, 3 * w // 4, 3 * h // 4))
            test_text = pytesseract.image_to_string(center_crop, lang="ell")
        # If we find Greek characters, it's likely Greek
        return self._count_greek_chars(test_text or "") >= 3
    except Exception:
        return False
def extract_text(self, image_path):
    """Extract Greek text from an image, trying progressively simpler engines.

    Order of attempts, each accepted only if ``_validate_greek_text`` passes:
      1. TrOCR (``_extract_with_trocr`` loads/activates the model itself and
         returns "" when TrOCR is unavailable);
      2. Tesseract 'grc', when ``self.grc_available`` is set;
      3. Tesseract 'ell' with several page-segmentation modes;
      4. layout-aware per-line Tesseract OCR.

    Returns the first accepted text, or "" when nothing validates or an
    unexpected error occurs.
    """
    try:
        # Method 1: Ancient Greek TrOCR
        print("[INFO] Attempting Ancient Greek extraction with TrOCR...")
        trocr_text = self._extract_with_trocr(image_path)
        if trocr_text and self._validate_greek_text(trocr_text):
            print("[INFO] Using Ancient Greek TrOCR result")
            return trocr_text
        if getattr(self, 'trocr_available', False):
            print("[WARN] TrOCR extraction returned poor quality result, trying Tesseract fallback...")

        # The image is only needed by the whole-page Tesseract attempts; the
        # context manager closes the file handle on every exit path.
        with Image.open(image_path) as image:
            # Method 2: Ancient Greek OCR (if available and safe)
            if getattr(self, 'grc_available', False):
                ancient_greek_text = self._extract_with_ancient_greek_ocr(image)
                if ancient_greek_text and self._validate_greek_text(ancient_greek_text):
                    print("[INFO] Using Ancient Greek OCR result")
                    return ancient_greek_text

            # Method 3: Standard Greek OCR
            standard_greek_text = self._extract_with_standard_greek_ocr(image)
            if standard_greek_text and self._validate_greek_text(standard_greek_text):
                print("[INFO] Using standard Greek OCR result")
                return standard_greek_text

        # Method 4: Layout-aware line segment fallback
        print("[INFO] Trying layout-aware Greek segmentation fallback...")
        layout_aware_greek_text = self._extract_layout_aware_ocr(image_path)
        if layout_aware_greek_text and self._validate_greek_text(layout_aware_greek_text):
            print("[INFO] Using layout-aware Greek OCR result")
            return layout_aware_greek_text

        # Nothing produced valid Greek text.
        print("[INFO] No valid Greek text detected")
        return ""
    except Exception as e:
        print(f"[ERROR] Greek text extraction failed: {e}")
        return ""
def _extract_with_trocr(self, image_path):
    """Extract text with the Greek TrOCR model, one layout line at a time.

    Loads the model on first use (``setup_greek_trocr``); on later calls asks
    the VRAM manager to make room and moves the model back to ``self.device``
    if it currently lives on a different device type. Line crops come from
    ``self.layout_parser``; when none are found the whole image is used.

    Returns the recognised lines joined by newlines, or "" when TrOCR is
    unavailable or any step fails.
    """
    try:
        if self.trocr_model is None:
            self.setup_greek_trocr()
        else:
            from utils.gpu_diagnostics import reclaim_vram_for
            reclaim_vram_for("greek")
            # Compare device *types*: a parameter reports e.g. "cuda:0" while
            # torch.device('cuda') prints as "cuda", so a string comparison
            # would never match on GPU.
            current_device = next(self.trocr_model.parameters()).device
            if current_device.type != self.device.type:
                print(f"[VRAM MANAGER] Activating Greek TrOCR model on {self.device}...")
                self.trocr_model.to(self.device)

        if not getattr(self, 'trocr_available', False) or self.trocr_model is None:
            return ""

        print("[INFO] Segmenting layout for Greek TrOCR...")
        layout = self.layout_parser.analyze_layout(image_path)
        crops = self.layout_parser.crop_lines(image_path, layout)

        # Fallback to whole image if no crops detected
        if not crops:
            print("[WARN] No line crops found, processing full image with TrOCR")
            with Image.open(image_path) as full_page:
                # convert() returns an independent image, so it stays usable
                # after the file handle is closed.
                crops = [full_page.convert("RGB")]

        line_texts = []
        print(f"[INFO] Running Ancient Greek TrOCR inference on {len(crops)} crops...")
        for crop in crops:
            # Ensure RGB mode for TrOCR
            crop_rgb = crop.convert("RGB")
            pixel_values = self.trocr_processor(
                images=crop_rgb,
                return_tensors="pt"
            ).pixel_values.to(self.device)
            with torch.inference_mode():
                generated_ids = self.trocr_model.generate(
                    pixel_values,
                    max_length=256,
                    num_beams=4,
                    early_stopping=True,
                    repetition_penalty=1.2
                )
            text = self.trocr_processor.batch_decode(
                generated_ids,
                skip_special_tokens=True
            )[0].strip()
            if text:
                line_texts.append(text)

        full_text = "\n".join(line_texts)
        print(f"[SUCCESS] TrOCR extracted {len(line_texts)} lines from Greek image")
        return full_text
    except Exception as e:
        print(f"[ERROR] Greek TrOCR extraction failed: {e}")
        return ""
| def _extract_with_ancient_greek_ocr(self, image): | |
| """Extract using specialized Ancient Greek OCR""" | |
| try: | |
| if not getattr(self, 'grc_available', False): | |
| return "" | |
| # Use ancient Greek language code 'grc' with optimized settings | |
| config = "--psm 6 --oem 1 -c preserve_interword_spaces=1" | |
| # Try ancient Greek language pack | |
| text = pytesseract.image_to_string( | |
| image, | |
| lang="grc", # Ancient Greek language code | |
| config=config | |
| ) | |
| return text.strip() | |
| except Exception as e: | |
| print(f"[WARN] Ancient Greek OCR failed: {e}") | |
| return "" | |
def _extract_layout_aware_ocr(self, image_path):
    """OCR the page line by line, using the layout parser's reading order.

    Each line crop is converted to grayscale, contrast-enhanced with CLAHE and
    passed to Tesseract in single-line mode (``--psm 7``): first with 'grc'
    when ``self.grc_available`` is set, then with 'ell' if that produced
    nothing. Returns the non-empty lines joined by newlines, or "" when no
    crops are found or an error occurs.
    """
    try:
        print("[INFO] Running layout-aware line segmentation for Greek...")
        layout = self.layout_parser.analyze_layout(image_path)
        crops = self.layout_parser.crop_lines(image_path, layout)
        if not crops:
            print("[WARN] Layout parser returned no line crops for Greek")
            return ""
        print(f"[INFO] Layout-aware Greek line parser cropped {len(crops)} lines")

        # Try to use Ancient Greek first
        use_grc = getattr(self, 'grc_available', False)
        config = '--oem 3 --psm 7'
        # One CLAHE instance is enough for every line.
        clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(4, 4))

        line_texts = []
        for crop in crops:
            # Normalise to RGB first so grayscale/RGBA/palette crops do not
            # make the colour conversion raise and abort the whole page.
            gray = cv2.cvtColor(np.array(crop.convert("RGB")), cv2.COLOR_RGB2GRAY)
            crop_pil = Image.fromarray(clahe.apply(gray))

            text = ""
            if use_grc:
                text = pytesseract.image_to_string(
                    crop_pil,
                    lang='grc',
                    config=config
                ).strip()
            if not text:
                text = pytesseract.image_to_string(
                    crop_pil,
                    lang='ell',
                    config=config
                ).strip()
            if text:
                line_texts.append(text)
        return "\n".join(line_texts)
    except Exception as e:
        print(f"[WARN] Layout aware Greek OCR failed: {e}")
        return ""
def _extract_with_standard_greek_ocr(self, image):
    """OCR ``image`` with Tesseract's modern Greek ('ell') model.

    Several page-segmentation modes are tried in order; the first result that
    passes ``_validate_greek_text`` is returned stripped. Returns "" when no
    attempt validates. If attempts raised and none succeeded, the last error
    is printed once so that, for example, a missing language pack is visible.
    """
    configs = (
        "--psm 6 --oem 1",  # Uniform text block
        "--psm 4 --oem 1",  # Single column text
        "--psm 3 --oem 1",  # Default, automatic page segmentation
        "--psm 8 --oem 1",  # Single word
    )
    last_error = None
    for config in configs:
        try:
            text = pytesseract.image_to_string(
                image,
                lang="ell",  # Modern Greek
                config=config
            )
            if text and self._validate_greek_text(text):
                return text.strip()
        except Exception as e:
            # Best effort: remember the failure and try the next mode.
            last_error = e
    if last_error is not None:
        print(f"[WARN] Standard Greek OCR failed: {last_error}")
    return ""
def _extract_with_preprocessing(self, image):
    """OCR ``image`` with Tesseract 'ell' after several preprocessing variants.

    Variants are tried in order: plain grayscale, 3x3 Gaussian blur, 3x3
    median blur, Otsu binarisation. Each is computed only when reached. The
    first text that passes ``_validate_greek_text`` is returned stripped;
    "" is returned when none validates or the image cannot be converted.
    """
    try:
        # Normalise to RGB so grayscale/RGBA/palette inputs convert cleanly.
        gray = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2GRAY)
        variants = (
            lambda: gray,  # Original grayscale
            # A (1, 1) kernel leaves the image unchanged; (3, 3) gives the
            # intended slight blur.
            lambda: cv2.GaussianBlur(gray, (3, 3), 0),
            lambda: cv2.medianBlur(gray, 3),  # Noise reduction
            lambda: cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )[1],  # Otsu threshold
        )
        for make_variant in variants:
            try:
                pil_img = Image.fromarray(make_variant())
                text = pytesseract.image_to_string(
                    pil_img,
                    lang="ell",
                    config="--psm 6 --oem 1"
                )
                if self._validate_greek_text(text):
                    return text.strip()
            except Exception:
                # Best effort: a failing variant must not stop the others.
                continue
        return ""
    except Exception as e:
        print(f"[WARN] Fallback Greek OCR failed: {e}")
        return ""
| def _count_greek_chars(self, text): | |
| """Count Greek Unicode characters including polytonic marks""" | |
| if not text: | |
| return 0 | |
| def is_greek_char(ch): | |
| o = ord(ch) | |
| # Greek and Coptic (0x0370-0x03FF) | |
| # Greek Extended (0x1F00-0x1FFF) - includes polytonic marks | |
| return (0x0370 <= o <= 0x03FF) or (0x1F00 <= o <= 0x1FFF) | |
| return sum(is_greek_char(ch) for ch in text) | |
| def _validate_greek_text(self, text): | |
| """Validate if text contains meaningful Greek content""" | |
| if not text or len(text.strip()) < 3: | |
| return False | |
| # Count Greek characters | |
| greek_char_count = self._count_greek_chars(text) | |
| total_chars = len(re.sub(r'\s+', '', text)) | |
| if total_chars == 0: | |
| return False | |
| # Check for Latin characters (should reject if too many) | |
| latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text) | |
| latin_ratio = latin_chars / total_chars if total_chars > 0 else 0 | |
| # If text is mostly Latin characters, reject it | |
| if latin_ratio > 0.8 and greek_char_count < 3: | |
| print(f"[INFO] Rejecting text as Greek - too many Latin chars: {latin_ratio:.2f}") | |
| return False | |
| # At least 20% should be Greek characters, or minimum 5 Greek chars | |
| greek_ratio = greek_char_count / total_chars | |
| return greek_char_count >= 5 or greek_ratio >= 0.20 | |
| def _extract_distinct_terms(self, text): | |
| """Extract distinct Greek terms from text""" | |
| if not text: | |
| return [] | |
| # Find Greek words (including those with diacritical marks) | |
| tokens = re.findall(r"[^\W\d_]+", text, flags=re.UNICODE) | |
| def is_greek_word(word): | |
| return any((0x0370 <= ord(ch) <= 0x03FF) or (0x1F00 <= ord(ch) <= 0x1FFF) | |
| for ch in word) | |
| distinct_terms = [] | |
| seen = set() | |
| for token in tokens: | |
| if len(token) < 2: # Skip single characters | |
| continue | |
| if is_greek_word(token): | |
| normalized = token.lower() | |
| if normalized not in seen: | |
| distinct_terms.append(token) | |
| seen.add(normalized) | |
| return distinct_terms[:20] # Limit to 20 terms | |
def process_text(self, greek_text):
    """Bundle the extracted text with its terms and simple statistics.

    Returns a dict with the keys "text", "terms", "char_analysis" and
    "validation". For empty input the text is "" and the other values are
    empty containers.
    """
    if not greek_text:
        return {"text": "", "terms": [], "char_analysis": {}, "validation": {}}

    distinct_terms = self._extract_distinct_terms(greek_text)

    total_len = len(greek_text)
    greek_len = self._count_greek_chars(greek_text)
    char_analysis = {
        "total_chars": total_len,
        "greek_chars": greek_len,
        "unique_chars": len(set(greek_text)),
        "words": len(greek_text.split()),
    }

    validation = {
        "has_polytonic": self._has_polytonic_marks(greek_text),
        "greek_ratio": greek_len / max(1, total_len),
        "quality_score": self._calculate_quality_score(greek_text),
    }

    return {
        "text": greek_text,
        "terms": distinct_terms,
        "char_analysis": char_analysis,
        "validation": validation,
    }
| def _has_polytonic_marks(self, text): | |
| """Check if text contains polytonic Greek marks""" | |
| # Greek Extended block contains polytonic diacritical marks | |
| return any(0x1F00 <= ord(ch) <= 0x1FFF for ch in text) | |
| def _calculate_quality_score(self, text): | |
| """Calculate a quality score for the extracted text""" | |
| if not text: | |
| return 0.0 | |
| score = 0.0 | |
| # Base score from Greek character ratio | |
| greek_ratio = self._count_greek_chars(text) / max(1, len(text)) | |
| score += greek_ratio * 0.4 | |
| # Bonus for polytonic marks (indicates authentic ancient Greek) | |
| if self._has_polytonic_marks(text): | |
| score += 0.3 | |
| # Penalty for too many non-alphabetic characters | |
| alpha_chars = sum(ch.isalpha() for ch in text) | |
| alpha_ratio = alpha_chars / max(1, len(text)) | |
| score += alpha_ratio * 0.3 | |
| return min(1.0, score) | |
def generate_historical_context(self, processed_result):
    """Assemble the historical-context payload for a processed Greek text.

    Reads "text" and "terms" from ``processed_result``, asks Groq for a
    context paragraph, queries the RAG service with the terms plus every
    non-whitespace character of the text, and returns a dict with the keys
    "uses_box", "meaning_box" and "references".
    """
    def ascii_safe(value):
        # Escape non-ASCII so the debug print cannot hit an encoding error.
        return value.encode('ascii', 'backslashreplace').decode()

    greek_text = processed_result.get("text", "")
    terms = processed_result.get("terms", [])

    groq_detail = self._generate_groq_context(greek_text)

    # Query both the words and the individual characters.
    query_terms = [*terms] if terms else []
    if greek_text:
        query_terms += [symbol for symbol in greek_text if symbol.strip()]
    print(f"[DEBUG GREEK RAG] query_terms: {[ascii_safe(t) for t in query_terms]}")

    refs = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)
    print(f"[DEBUG GREEK RAG] refs: {[ascii_safe(r['term']) for r in refs]}")

    uses_items = self._build_uses_list(terms, greek_text)
    meaning_box = self._build_meaning_box(terms, groq_detail)
    return {
        "uses_box": {
            "title": "Each symbol's possible use by the Greek people",
            "items": uses_items,
        },
        "meaning_box": meaning_box,
        "references": refs,
    }
def _generate_groq_context(self, greek_text):
    """Ask Groq for a short scholarly context paragraph about ``greek_text``.

    The system prompt is passed through ``self.rag_service.enrich_prompt``
    before the request. Returns a fixed placeholder string when the client
    is unavailable or when the response is empty.
    """
    client = self.groq_client
    if not client.is_available():
        return "(Groq unavailable) Context generation requires GROQ_API_KEY and groq package."

    user_prompt = (
        f"This ancient Greek text was found: {greek_text}\n\n"
        "Write a concise, scholarly paragraph (6-10 sentences) giving cultural and historical context: textual tradition, "
        "possible meanings, links to Greek culture/myth/philosophy, manuscript practices (accents, breathings, ligatures, nomina sacra), "
        "and paleographic cues. Avoid repeating the prompt."
    )
    base_system_prompt = "You are an expert philologist of Ancient Greece. Provide concise, accurate scholarly context."
    enriched_system_prompt = self.rag_service.enrich_prompt(base_system_prompt, greek_text)

    answer = self.groq_client.generate_response(
        system_prompt=enriched_system_prompt,
        user_prompt=user_prompt,
    )
    if answer:
        return answer
    return "(context unavailable due to Groq error)"
| def _generate_batch_explanations(self, terms): | |
| """Generate scholarly glossary definitions for Greek terms in a single batch query""" | |
| if not terms or not self.groq_client or not self.groq_client.is_available(): | |
| return {} | |
| # Limit to first 15 terms to prevent token limit/truncation issues | |
| terms_to_query = list(terms)[:15] | |
| terms_list = ", ".join(terms_to_query) | |
| system_prompt = ( | |
| "You are an expert classicist and lexicographer of Ancient Greek. " | |
| "Return ONLY valid JSON matching the requested schema. " | |
| "No markdown, no code fences (like ```json), no explanations, no prose." | |
| ) | |
| user_prompt = ( | |
| f"For each of the following Ancient Greek words, provide a scholarly definition, " | |
| f"etymological note, and grammatical gloss:\n\n" | |
| f"Words: {terms_list}\n\n" | |
| f"You MUST format the output as a single JSON object where the keys are the exact words " | |
| f"and the values are objects containing 'definition', 'gloss', and 'etymology' keys.\n\n" | |
| f"Output schema:\n" | |
| f"{{\n" | |
| f" \"TERM\": {{\n" | |
| f" \"definition\": \"...\",\n" | |
| f" \"gloss\": \"...\",\n" | |
| f" \"etymology\": \"...\"\n" | |
| f" }}\n" | |
| f"}}\n" | |
| ) | |
| try: | |
| raw_response = self.groq_client.generate_response( | |
| system_prompt=system_prompt, | |
| user_prompt=user_prompt, | |
| max_tokens=2048, | |
| response_format={"type": "json_object"} | |
| ) | |
| # Safe print to avoid UnicodeEncodeError in Windows command prompt | |
| print(f"[INFO] Groq glossary raw response: {raw_response.encode('ascii', 'backslashreplace').decode()}") | |
| # Find JSON block in response | |
| json_str = raw_response.strip() | |
| if "{" in json_str and "}" in json_str: | |
| start = json_str.find("{") | |
| end = json_str.rfind("}") + 1 | |
| json_str = json_str[start:end] | |
| import json | |
| definitions = {} | |
| try: | |
| definitions = json.loads(json_str) | |
| self.glossary_success_count += 1 | |
| except Exception as je: | |
| self.glossary_json_failure_count += 1 | |
| import logging | |
| logger = logging.getLogger(__name__) | |
| logger.warning( | |
| "Malformed Greek glossary JSON", | |
| extra={"response": raw_response[:2000]} | |
| ) | |
| print(f"[WARN] Standard JSON load failed: {je}. Attempting regex recovery...") | |
| # Regex recovery fallback | |
| import re | |
| self.regex_recovery_count += 1 | |
| term_blocks = re.findall(r'"([^"]+)"\s*:\s*\{([^}]+)\}', json_str) | |
| for term, block in term_blocks: | |
| def_match = re.search(r'"definition"\s*:\s*["\']([^"\']+)["\']', block) | |
| gloss_match = re.search(r'"gloss"\s*:\s*["\']([^"\']+)["\']', block) | |
| ety_match = re.search(r'"etymology"\s*:\s*["\']([^"\']+)["\']', block) | |
| definitions[term] = { | |
| "definition": def_match.group(1) if def_match else "", | |
| "gloss": gloss_match.group(1) if gloss_match else "", | |
| "etymology": ety_match.group(1) if ety_match else "" | |
| } | |
| return definitions | |
| except Exception as e: | |
| print(f"[WARN] Failed to generate batch Greek explanations: {e}") | |
| return {} | |
| def _build_uses_list(self, terms, greek_text): | |
| """Build list of symbol/word uses using RAG and batch Groq explanations""" | |
| import unicodedata | |
| items = [] | |
| # 1. Get definitions for the extracted Greek words (terms) | |
| if terms: | |
| # Unique terms preserving order | |
| unique_terms = list(dict.fromkeys(terms)) | |
| # Limit to top 15 terms to be concise | |
| unique_terms = unique_terms[:15] | |
| print(f"[INFO] Generating glossary for {len(unique_terms)} Greek terms...") | |
| definitions = {} | |
| missing_terms = [] | |
| for term in unique_terms: | |
| # Check RAG corpus (normalize search query) | |
| norm_term = unicodedata.normalize('NFC', term).strip() | |
| rag_matches = self.rag_service.retrieve_grounding_list([norm_term], max_results=1) | |
| if rag_matches: | |
| definitions[term] = rag_matches[0]["definition"] | |
| else: | |
| missing_terms.append(term) | |
| # Generate remaining definitions with Groq in a single batch | |
| if missing_terms: | |
| groq_defs = self._generate_batch_explanations(missing_terms) | |
| # Normalize groq keys for matching | |
| normalized_groq_defs = {} | |
| for k, v in groq_defs.items(): | |
| nk = unicodedata.normalize('NFC', k).strip().lower() | |
| normalized_groq_defs[nk] = v | |
| # Assign matching definitions | |
| for term in missing_terms: | |
| nt = unicodedata.normalize('NFC', term).strip().lower() | |
| if nt in normalized_groq_defs: | |
| definitions[term] = normalized_groq_defs[nt] | |
| else: | |
| # Case/accent insensitive backup match (in case Groq stripped accents) | |
| import unicodedata as ud | |
| def strip_accents(s): | |
| return "".join(c for c in ud.normalize('NFD', s) if ud.category(c) != 'Mn') | |
| stripped_t = strip_accents(nt) | |
| for gk, gv in normalized_groq_defs.items(): | |
| if strip_accents(gk) == stripped_t: | |
| definitions[term] = gv | |
| break | |
| for term in unique_terms: | |
| definition = definitions.get(term) | |
| if not definition: | |
| definition = f"Ancient Greek lexical term. Characterized by specific diacritics and phonological values." | |
| elif isinstance(definition, dict): | |
| parts = [] | |
| d_val = definition.get("definition", "").strip() | |
| g_val = definition.get("gloss", "").strip() | |
| e_val = definition.get("etymology", "").strip() | |
| if d_val: | |
| parts.append(d_val) | |
| if g_val: | |
| parts.append(f"Gloss: {g_val}") | |
| if e_val: | |
| parts.append(f"Etymology: {e_val}") | |
| definition = " | ".join(parts) if parts else "Ancient Greek lexical term." | |
| items.append(f"{term}: {definition}") | |
| # 2. Add significant paleographical/character markers found in the text if they are in the references | |
| notes = self.references.get("greek_symbol_notes", {}) or {} | |
| seen_chars = set() | |
| char_items = [] | |
| for ch in greek_text: | |
| if ch in notes and ch not in seen_chars: | |
| seen_chars.add(ch) | |
| char_items.append(f"Character '{ch}': {notes[ch]}") | |
| # Limit character notes to prevent clutter | |
| items.extend(char_items[:5]) | |
| # Format as list items with bullets | |
| formatted_items = [f"- {item}" for item in items] | |
| if not formatted_items: | |
| default_hint = self.references.get("greek_hint", | |
| "Ancient Greek script marker; values are determined by polytonic diacritical marks.") | |
| formatted_items.append(f"- —: {default_hint}") | |
| return formatted_items | |
| def _build_meaning_box(self, terms, groq_detail): | |
| """Build meaning interpretation box""" | |
| intro_lines = [ | |
| "The lexical concentration suggests a connected passage with recurring words or themes, consistent with Greek manuscript traditions.", | |
| "Scribal features such as accents/breathings, abbreviations, and marginal cues guide reading and assist with dating and genre identification." | |
| ] | |
| points = [ | |
| "• Presence of nomina sacra, lection signs, or ekphonetic marks indicates liturgical usage; scholia imply classroom or commentary context.", | |
| "• Orthographic variation (e.g., iotacism) and common ligatures inform palaeographic placement and regional practice.", | |
| ] | |
| if groq_detail and isinstance(groq_detail, str) and groq_detail.strip(): | |
| points.append(groq_detail.strip()) | |
| return { | |
| "title": "Possible meaning:", | |
| "intro_lines": intro_lines, | |
| "frequent_label": "Key terms noted", | |
| "frequent": terms[:10], | |
| "points": points | |
| } | |
def generate_story(self, processed_result):
    """Ask Groq for an imaginative story inspired by the extracted Greek text.

    A narrative style and a four-digit "creative seed" are picked at random
    and embedded in the prompt. Returns a fixed message when the client is
    unavailable, and another when the response is empty or flagged by
    ``is_gibberish``.
    """
    import random

    greek_text = processed_result.get("text", "")
    if not self.groq_client.is_available():
        return "Groq client unavailable, cannot generate story."

    narrative_styles = [
        "as an epic poem told by a travelling rhapsode",
        "as a prophecy inscribed on the Oracle at Delphi",
        "as a philosophical dialogue in the Academy",
        "as a myth recounted by ancient storytellers",
        "as a recovered scroll from the Library of Alexandria",
        "as a hymn sung in honor of the gods",
    ]
    # Style is drawn before the seed.
    chosen_style = random.choice(narrative_styles)
    seed = random.randint(1000, 9999)

    user_prompt = (
        f"The following ancient Greek text was found: {greek_text}\n\n"
        f"Create a long, vivid, imaginative story from ancient Greek times "
        f"based on this Greek text. Write it as one rich paragraph with "
        f"much detail, mystery, and cultural atmosphere. At least 200 words.\n\n"
        f"Creative seed: {seed}\n"
        f"Write a detailed, imaginative myth-like story {chosen_style}. "
        "Include multiple characters, rich imagery, and scenes. "
        "Avoid repetition and keep it unpredictable."
    )
    story = self.groq_client.generate_response(
        system_prompt="You are a learned ancient Greek storyteller and scholar of Hellenic culture.",
        user_prompt=user_prompt,
    )

    if story and not is_gibberish(story):
        return story
    return "Failed to create quality story; the ancient texts remain silent."