Spaces:

Akshay30
/

decipherai-api

Sleeping

App Files Files Community

decipherai-api / processors /greek_processor.py

Akshay30

Fix Greek OCR and update Latin OCR model

62db04d 3 days ago

raw

history blame contribute delete

33.8 kB

	import pytesseract
	import re
	import os
	import cv2
	import numpy as np
	import torch
	from PIL import Image
	from .base_processor import BaseScriptProcessor
	from utils.text_utils import is_gibberish

	# Absolute path of the "models" directory that sits one level above the
	# directory containing this file.
	BACKEND_MODELS_DIR = os.path.abspath(
	    os.path.join(os.path.dirname(__file__), os.pardir, "models")
	)
	# Cache directory handed to Hugging Face when loading the Greek TrOCR weights.
	GREEK_TROCR_MODEL_DIR = os.path.join(BACKEND_MODELS_DIR, "greek_trocr")

	# Script processor for Greek manuscripts: OCR via a lazily loaded TrOCR model
	# with Tesseract ("grc" / "ell") fallbacks, plus Groq-backed glossary,
	# historical-context and story generation.
	class GreekProcessor(BaseScriptProcessor):
	def __init__(self, groq_client, references, clip_classifier):
	    """Create the processor without loading any heavy model.

	    Args:
	        groq_client: Forwarded to the base class; used later for LLM calls.
	        references: Forwarded to the base class; read later as a mapping
	            of reference notes.
	        clip_classifier: Forwarded to the base class and also kept on
	            this instance.
	    """
	    super().__init__(groq_client, references, clip_classifier)
	    self.clip_classifier = clip_classifier

	    # Probe Tesseract once up front; this sets self.grc_available.
	    self.setup_ancient_greek_ocr()

	    # TrOCR state stays empty until setup_greek_trocr() is called on demand.
	    self.trocr_model = self.trocr_processor = None
	    self.trocr_available = False
	    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
	    self.device = torch.device(device_name)

	    # Counters describing how glossary JSON parsing has gone so far.
	    self.glossary_success_count = 0
	    self.glossary_json_failure_count = 0
	    self.regex_recovery_count = 0

	    # Make this processor known to the shared VRAM manager.
	    from utils.gpu_diagnostics import register_processor
	    register_processor("greek", self)

	def setup_greek_trocr(self):
	    """Load the Greek TrOCR processor and model onto ``self.device``.

	    The weights are fetched (or read from the ``GREEK_TROCR_MODEL_DIR``
	    cache) using the optional ``HF_TOKEN`` environment variable.

	    On success ``self.trocr_processor`` and ``self.trocr_model`` are set and
	    ``self.trocr_available`` becomes True. This method never raises: on any
	    failure the error is printed and all three attributes are reset, so the
	    instance is never left half-initialised (model present but flagged
	    unavailable) and a later call can retry from a clean state.
	    """
	    model_id = 'rithwikn/trocr_greek_combined'
	    try:
	        from utils.gpu_diagnostics import log_model_device, reclaim_vram_for
	        reclaim_vram_for("greek")

	        print("[INFO] Lazily loading TrOCR model for ancient Greek...")
	        from transformers import TrOCRProcessor, VisionEncoderDecoderModel

	        load_kwargs = {
	            "cache_dir": GREEK_TROCR_MODEL_DIR,
	            "local_files_only": False,
	            "token": os.getenv("HF_TOKEN"),
	        }
	        # Build everything in locals first; publish to self only once the
	        # model is fully loaded, moved and switched to eval mode.
	        processor = TrOCRProcessor.from_pretrained(model_id, **load_kwargs)
	        model = VisionEncoderDecoderModel.from_pretrained(model_id, **load_kwargs)
	        model.to(self.device)
	        model.eval()  # inference only

	        log_model_device("Greek TrOCR", self.device)

	        self.trocr_processor = processor
	        self.trocr_model = model
	        self.trocr_available = True
	        print(f"[INFO] Ancient Greek TrOCR loaded successfully on {self.device}")

	    except Exception as e:
	        print(f"[ERROR] Ancient Greek TrOCR failed to load: {e}")
	        self.trocr_processor = None
	        self.trocr_model = None
	        self.trocr_available = False

	def setup_ancient_greek_ocr(self):
	    """Record in ``self.grc_available`` whether Tesseract has the 'grc' pack.

	    Any error while querying Tesseract is printed and treated as
	    "not available".
	    """
	    try:
	        has_grc = "grc" in pytesseract.get_languages(config='')
	        self.grc_available = has_grc
	        message = (
	            "[INFO] Ancient Greek Tesseract language pack 'grc' is available"
	            if has_grc
	            else "[WARN] Ancient Greek Tesseract language pack 'grc' is NOT available"
	        )
	        print(message)
	    except Exception as e:
	        print(f"[ERROR] Failed to check Tesseract languages: {e}")
	        self.grc_available = False
	def detect_script(self, image_path):
	"""Simplified detection - Groq Vision handles main classification"""
	try:
	if not getattr(self, 'trocr_available', False):
	# Check if Ancient Greek OCR is available as fallback
	if not getattr(self, 'grc_available', False):
	print("[INFO] Greek processor not available (neither TrOCR nor Tesseract)")
	return False, 0.5

	# If called by Groq Vision classification, accept with high confidence
	print("[INFO] Greek processor activated by Groq Vision (Llama-4-Scout)")
	return True, 0.95

	except Exception as e:
	print(f"[ERROR] Greek detection failed: {e}")
	return False, 0.0


	def _quick_greek_ocr_test(self, image_path):
	    """Cheaply check whether the centre of the image contains Greek text.

	    The middle half of the image (in both dimensions) is OCR'd with the
	    modern Greek ('ell') Tesseract pack.

	    Args:
	        image_path: Path of the image file to probe.

	    Returns:
	        True if at least three Greek characters are recognised, False
	        otherwise or if anything fails.
	    """
	    try:
	        # Close the file as soon as the crop has been materialised.
	        with Image.open(image_path) as image:
	            w, h = image.size
	            # Centre box spanning from 1/4 to 3/4 of each dimension.
	            crop_box = (w // 4, h // 4, 3 * w // 4, 3 * h // 4)
	            test_crop = image.crop(crop_box)
	            test_crop.load()  # force pixel data while the file is open

	        test_text = pytesseract.image_to_string(test_crop, lang="ell")
	        greek_char_count = self._count_greek_chars(test_text or "")

	        # A handful of Greek characters is enough to call it Greek.
	        return greek_char_count >= 3

	    except Exception:
	        # Best-effort probe: any failure simply means "not confirmed".
	        return False

	def extract_text(self, image_path):
	    """Extract Greek text from an image, trying OCR engines in priority order.

	    Order of attempts: TrOCR (loaded on first use), Tesseract 'grc' (only
	    if that pack is installed), Tesseract 'ell', then layout-aware
	    line-by-line OCR. The first candidate accepted by
	    ``_validate_greek_text`` is returned.

	    Args:
	        image_path: Path of the image file to read.

	    Returns:
	        The extracted text, or "" when no engine produced valid Greek or
	        an error occurred.
	    """
	    try:
	        # The context manager releases the file handle even when TrOCR
	        # succeeds and the PIL image is never decoded.
	        with Image.open(image_path) as image:
	            # Make sure the TrOCR model exists and sits on the target device.
	            if self.trocr_model is None:
	                self.setup_greek_trocr()
	            else:
	                from utils.gpu_diagnostics import reclaim_vram_for
	                reclaim_vram_for("greek")
	                # Compare device types: str() forms differ ('cuda:0' vs
	                # 'cuda') even when the model is already on the GPU.
	                current = next(self.trocr_model.parameters()).device
	                if current.type != self.device.type:
	                    print(f"[VRAM MANAGER] Activating Greek TrOCR model on {self.device}...")
	                    self.trocr_model.to(self.device)

	            # Method 1: Ancient Greek TrOCR (if available)
	            if getattr(self, 'trocr_available', False) and self.trocr_model is not None:
	                print("[INFO] Attempting Ancient Greek extraction with TrOCR...")
	                trocr_text = self._extract_with_trocr(image_path)
	                if trocr_text and self._validate_greek_text(trocr_text):
	                    print("[INFO] Using Ancient Greek TrOCR result")
	                    return trocr_text
	                print("[WARN] TrOCR extraction returned poor quality result, trying Tesseract fallback...")

	            # Method 2: Tesseract with the Ancient Greek pack
	            if getattr(self, 'grc_available', False):
	                ancient_greek_text = self._extract_with_ancient_greek_ocr(image)
	                if ancient_greek_text and self._validate_greek_text(ancient_greek_text):
	                    print("[INFO] Using Ancient Greek OCR result")
	                    return ancient_greek_text

	            # Method 3: Tesseract with the modern Greek pack
	            standard_greek_text = self._extract_with_standard_greek_ocr(image)
	            if standard_greek_text and self._validate_greek_text(standard_greek_text):
	                print("[INFO] Using standard Greek OCR result")
	                return standard_greek_text

	            # Method 4: Layout-aware line segment fallback
	            print("[INFO] Trying layout-aware Greek segmentation fallback...")
	            layout_aware_greek_text = self._extract_layout_aware_ocr(image_path)
	            if layout_aware_greek_text and self._validate_greek_text(layout_aware_greek_text):
	                print("[INFO] Using layout-aware Greek OCR result")
	                return layout_aware_greek_text

	            # Nothing passed validation.
	            print("[INFO] No valid Greek text detected")
	            return ""

	    except Exception as e:
	        print(f"[ERROR] Greek text extraction failed: {e}")
	        return ""

	def _extract_with_trocr(self, image_path):
	    """Run the Greek TrOCR model line by line over the image.

	    The page is segmented through ``self.layout_parser``; each line crop is
	    decoded with beam search and non-empty results are joined with
	    newlines. If no line crops are found the whole image is used as a
	    single crop.

	    Args:
	        image_path: Path of the image file to read.

	    Returns:
	        The recognised text, or "" when TrOCR is unavailable or any step
	        fails. This method never raises.
	    """
	    try:
	        # Model activation lives inside the try so that a failed device
	        # move degrades to "" instead of escaping to the caller.
	        if self.trocr_model is None:
	            self.setup_greek_trocr()
	        else:
	            from utils.gpu_diagnostics import reclaim_vram_for
	            reclaim_vram_for("greek")
	            # Compare device types: str() forms differ ('cuda:0' vs 'cuda')
	            # even when the model is already on the GPU.
	            current = next(self.trocr_model.parameters()).device
	            if current.type != self.device.type:
	                print(f"[VRAM MANAGER] Activating Greek TrOCR model on {self.device}...")
	                self.trocr_model.to(self.device)

	        if not getattr(self, 'trocr_available', False) or self.trocr_model is None:
	            return ""

	        print("[INFO] Segmenting layout for Greek TrOCR...")
	        layout = self.layout_parser.analyze_layout(image_path)
	        crops = self.layout_parser.crop_lines(image_path, layout)

	        # Fallback to whole image if no crops detected
	        if not crops:
	            print("[WARN] No line crops found, processing full image with TrOCR")
	            # convert() returns an independent image, so the file can close.
	            with Image.open(image_path) as full_page:
	                crops = [full_page.convert("RGB")]

	        line_texts = []
	        print(f"[INFO] Running Ancient Greek TrOCR inference on {len(crops)} crops...")
	        for crop in crops:
	            # TrOCR expects RGB input.
	            crop_rgb = crop.convert("RGB")

	            pixel_values = self.trocr_processor(
	                images=crop_rgb,
	                return_tensors="pt"
	            ).pixel_values.to(self.device)

	            with torch.inference_mode():
	                generated_ids = self.trocr_model.generate(
	                    pixel_values,
	                    max_length=256,
	                    num_beams=4,
	                    early_stopping=True,
	                    repetition_penalty=1.2
	                )

	            text = self.trocr_processor.batch_decode(
	                generated_ids,
	                skip_special_tokens=True
	            )[0].strip()

	            if text:
	                line_texts.append(text)

	        full_text = "\n".join(line_texts)
	        print(f"[SUCCESS] TrOCR extracted {len(line_texts)} lines from Greek image")
	        return full_text

	    except Exception as e:
	        print(f"[ERROR] Greek TrOCR extraction failed: {e}")
	        return ""


	def _extract_with_ancient_greek_ocr(self, image):
	"""Extract using specialized Ancient Greek OCR"""
	try:
	if not getattr(self, 'grc_available', False):
	return ""

	# Use ancient Greek language code 'grc' with optimized settings
	config = "--psm 6 --oem 1 -c preserve_interword_spaces=1"

	# Try ancient Greek language pack
	text = pytesseract.image_to_string(
	image,
	lang="grc", # Ancient Greek language code
	config=config
	)
	return text.strip()

	except Exception as e:
	print(f"[WARN] Ancient Greek OCR failed: {e}")
	return ""

	def _extract_layout_aware_ocr(self, image_path):
	    """OCR the page one detected line at a time with Tesseract.

	    Lines come from ``self.layout_parser``. Each crop is converted to
	    grayscale, contrast-enhanced with CLAHE and read in single-line mode,
	    first with 'grc' (when that pack is available) and then with 'ell' if
	    nothing was recognised.

	    Args:
	        image_path: Path of the image file to read.

	    Returns:
	        Recognised lines joined with newlines, or "" when no lines were
	        found or an error occurred.
	    """
	    try:
	        print("[INFO] Running layout-aware line segmentation for Greek...")
	        layout = self.layout_parser.analyze_layout(image_path)
	        crops = self.layout_parser.crop_lines(image_path, layout)

	        if not crops:
	            print("[WARN] Layout parser returned no line crops for Greek")
	            return ""

	        print(f"[INFO] Layout-aware Greek line parser cropped {len(crops)} lines")

	        use_grc = getattr(self, 'grc_available', False)
	        config = '--oem 3 --psm 7'  # default engine, single text line
	        # One CLAHE operator is enough; it is reused for every line.
	        clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(4, 4))

	        line_texts = []
	        for crop in crops:
	            # Normalise to RGB first so grayscale/RGBA crops do not break
	            # the colour conversion.
	            gray = cv2.cvtColor(np.array(crop.convert("RGB")), cv2.COLOR_RGB2GRAY)
	            crop_pil = Image.fromarray(clahe.apply(gray))

	            text = ""
	            if use_grc:
	                text = pytesseract.image_to_string(
	                    crop_pil,
	                    lang='grc',
	                    config=config
	                ).strip()

	            # Fall back to modern Greek when 'grc' is missing or found nothing.
	            if not text:
	                text = pytesseract.image_to_string(
	                    crop_pil,
	                    lang='ell',
	                    config=config
	                ).strip()

	            if text:
	                line_texts.append(text)

	        return "\n".join(line_texts)
	    except Exception as e:
	        print(f"[WARN] Layout aware Greek OCR failed: {e}")
	        return ""


	def _extract_with_standard_greek_ocr(self, image):
	    """OCR the image with Tesseract's modern Greek ('ell') pack.

	    Several page-segmentation modes are tried in turn and the first output
	    accepted by ``_validate_greek_text`` is returned, stripped.

	    Args:
	        image: Image object handed straight to ``pytesseract``.

	    Returns:
	        The first valid OCR result, or "" if none qualifies.
	    """
	    try:
	        tesseract_configs = (
	            "--psm 6 --oem 1",  # uniform text block
	            "--psm 4 --oem 1",  # single column
	            "--psm 3 --oem 1",  # automatic page segmentation
	            "--psm 8 --oem 1",  # single word
	        )

	        for tess_config in tesseract_configs:
	            try:
	                candidate = pytesseract.image_to_string(
	                    image,
	                    lang="ell",
	                    config=tess_config,
	                )
	                if candidate and self._validate_greek_text(candidate):
	                    return candidate.strip()
	            except Exception:
	                # A failing mode is skipped; the next one is tried.
	                continue

	        return ""

	    except Exception as e:
	        print(f"[WARN] Standard Greek OCR failed: {e}")
	        return ""

	def _extract_with_preprocessing(self, image):
	    """OCR the image after several OpenCV clean-up variants.

	    Variants tried in order: plain grayscale, 3x3 Gaussian blur, 3x3 median
	    blur, and Otsu binarisation. Each is read with Tesseract 'ell' and the
	    first output accepted by ``_validate_greek_text`` is returned.

	    Args:
	        image: PIL image to process (any mode; it is converted to RGB).

	    Returns:
	        The first valid OCR result (stripped), or "" if none qualifies or
	        an error occurred.
	    """
	    try:
	        # Convert via RGB so grayscale/RGBA/palette inputs are accepted too.
	        gray = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2GRAY)

	        # Variants are built lazily: later filters only run when earlier
	        # ones did not yield valid text, and one failing filter does not
	        # stop the others from being tried.
	        variants = (
	            lambda: gray,
	            # A 1x1 kernel would leave the image unchanged, so use 3x3.
	            lambda: cv2.GaussianBlur(gray, (3, 3), 0),
	            lambda: cv2.medianBlur(gray, 3),
	            # Global Otsu threshold (not an adaptive threshold).
	            lambda: cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1],
	        )

	        for make_variant in variants:
	            try:
	                pil_img = Image.fromarray(make_variant())
	                text = pytesseract.image_to_string(
	                    pil_img,
	                    lang="ell",
	                    config="--psm 6 --oem 1"
	                )

	                if self._validate_greek_text(text):
	                    return text.strip()

	            except Exception as variant_error:
	                # Best-effort: report and move on to the next variant.
	                print(f"[WARN] Greek OCR preprocessing variant failed: {variant_error}")
	                continue

	        return ""

	    except Exception as e:
	        print(f"[WARN] Fallback Greek OCR failed: {e}")
	        return ""

	def _count_greek_chars(self, text):
	"""Count Greek Unicode characters including polytonic marks"""
	if not text:
	return 0

	def is_greek_char(ch):
	o = ord(ch)
	# Greek and Coptic (0x0370-0x03FF)
	# Greek Extended (0x1F00-0x1FFF) - includes polytonic marks
	return (0x0370 <= o <= 0x03FF) or (0x1F00 <= o <= 0x1FFF)

	return sum(is_greek_char(ch) for ch in text)

	def _validate_greek_text(self, text):
	"""Validate if text contains meaningful Greek content"""
	if not text or len(text.strip()) < 3:
	return False

	# Count Greek characters
	greek_char_count = self._count_greek_chars(text)
	total_chars = len(re.sub(r'\s+', '', text))

	if total_chars == 0:
	return False

	# Check for Latin characters (should reject if too many)
	latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
	latin_ratio = latin_chars / total_chars if total_chars > 0 else 0

	# If text is mostly Latin characters, reject it
	if latin_ratio > 0.8 and greek_char_count < 3:
	print(f"[INFO] Rejecting text as Greek - too many Latin chars: {latin_ratio:.2f}")
	return False

	# At least 20% should be Greek characters, or minimum 5 Greek chars
	greek_ratio = greek_char_count / total_chars

	return greek_char_count >= 5 or greek_ratio >= 0.20


	def _extract_distinct_terms(self, text):
	"""Extract distinct Greek terms from text"""
	if not text:
	return []

	# Find Greek words (including those with diacritical marks)
	tokens = re.findall(r"[^\W\d_]+", text, flags=re.UNICODE)

	def is_greek_word(word):
	return any((0x0370 <= ord(ch) <= 0x03FF) or (0x1F00 <= ord(ch) <= 0x1FFF)
	for ch in word)

	distinct_terms = []
	seen = set()

	for token in tokens:
	if len(token) < 2: # Skip single characters
	continue

	if is_greek_word(token):
	normalized = token.lower()
	if normalized not in seen:
	distinct_terms.append(token)
	seen.add(normalized)

	return distinct_terms[:20] # Limit to 20 terms

	def process_text(self, greek_text):
	"""Process extracted Greek text"""
	if not greek_text:
	return {"text": "", "terms": [], "char_analysis": {}, "validation": {}}

	# Extract distinct terms
	terms = self._extract_distinct_terms(greek_text)

	# Character analysis
	char_analysis = {
	"total_chars": len(greek_text),
	"greek_chars": self._count_greek_chars(greek_text),
	"unique_chars": len(set(greek_text)),
	"words": len(greek_text.split())
	}

	# Validation metrics
	validation = {
	"has_polytonic": self._has_polytonic_marks(greek_text),
	"greek_ratio": char_analysis["greek_chars"] / max(1, char_analysis["total_chars"]),
	"quality_score": self._calculate_quality_score(greek_text)
	}

	return {
	"text": greek_text,
	"terms": terms,
	"char_analysis": char_analysis,
	"validation": validation
	}

	def _has_polytonic_marks(self, text):
	"""Check if text contains polytonic Greek marks"""
	# Greek Extended block contains polytonic diacritical marks
	return any(0x1F00 <= ord(ch) <= 0x1FFF for ch in text)

	def _calculate_quality_score(self, text):
	"""Calculate a quality score for the extracted text"""
	if not text:
	return 0.0

	score = 0.0

	# Base score from Greek character ratio
	greek_ratio = self._count_greek_chars(text) / max(1, len(text))
	score += greek_ratio * 0.4

	# Bonus for polytonic marks (indicates authentic ancient Greek)
	if self._has_polytonic_marks(text):
	score += 0.3

	# Penalty for too many non-alphabetic characters
	alpha_chars = sum(ch.isalpha() for ch in text)
	alpha_ratio = alpha_chars / max(1, len(text))
	score += alpha_ratio * 0.3

	return min(1.0, score)

	def generate_historical_context(self, processed_result):
	"""Generate historical context for Greek text"""
	greek_text = processed_result.get("text", "")
	terms = processed_result.get("terms", [])

	# Generate Groq context
	groq_detail = self._generate_groq_context(greek_text)

	# Build references - query both words and individual characters
	query_terms = list(terms) if terms else []
	if greek_text:
	query_terms.extend([char for char in greek_text if char.strip()])
	print(f"[DEBUG GREEK RAG] query_terms: {[t.encode('ascii', 'backslashreplace').decode() for t in query_terms]}")
	refs = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)
	print(f"[DEBUG GREEK RAG] refs: {[r['term'].encode('ascii', 'backslashreplace').decode() for r in refs]}")

	return {
	"uses_box": {
	"title": "Each symbol's possible use by the Greek people",
	"items": self._build_uses_list(terms, greek_text)
	},
	"meaning_box": self._build_meaning_box(terms, groq_detail),
	"references": refs
	}

	def _generate_groq_context(self, greek_text):
	"""Generate contextual information using Groq"""
	if not self.groq_client.is_available():
	return "(Groq unavailable) Context generation requires GROQ_API_KEY and groq package."

	prompt = (
	f"This ancient Greek text was found: {greek_text}\n\n"
	"Write a concise, scholarly paragraph (6-10 sentences) giving cultural and historical context: textual tradition, "
	"possible meanings, links to Greek culture/myth/philosophy, manuscript practices (accents, breathings, ligatures, nomina sacra), "
	"and paleographic cues. Avoid repeating the prompt."
	)

	system_prompt = "You are an expert philologist of Ancient Greece. Provide concise, accurate scholarly context."
	enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, greek_text)

	return self.groq_client.generate_response(
	system_prompt=enriched_system_prompt,
	user_prompt=prompt
	) or "(context unavailable due to Groq error)"

	def _generate_batch_explanations(self, terms):
	"""Generate scholarly glossary definitions for Greek terms in a single batch query"""
	if not terms or not self.groq_client or not self.groq_client.is_available():
	return {}

	# Limit to first 15 terms to prevent token limit/truncation issues
	terms_to_query = list(terms)[:15]
	terms_list = ", ".join(terms_to_query)

	system_prompt = (
	"You are an expert classicist and lexicographer of Ancient Greek. "
	"Return ONLY valid JSON matching the requested schema. "
	"No markdown, no code fences (like ```json), no explanations, no prose."
	)
	user_prompt = (
	f"For each of the following Ancient Greek words, provide a scholarly definition, "
	f"etymological note, and grammatical gloss:\n\n"
	f"Words: {terms_list}\n\n"
	f"You MUST format the output as a single JSON object where the keys are the exact words "
	f"and the values are objects containing 'definition', 'gloss', and 'etymology' keys.\n\n"
	f"Output schema:\n"
	f"{{\n"
	f" \"TERM\": {{\n"
	f" \"definition\": \"...\",\n"
	f" \"gloss\": \"...\",\n"
	f" \"etymology\": \"...\"\n"
	f" }}\n"
	f"}}\n"
	)

	try:
	raw_response = self.groq_client.generate_response(
	system_prompt=system_prompt,
	user_prompt=user_prompt,
	max_tokens=2048,
	response_format={"type": "json_object"}
	)
	# Safe print to avoid UnicodeEncodeError in Windows command prompt
	print(f"[INFO] Groq glossary raw response: {raw_response.encode('ascii', 'backslashreplace').decode()}")

	# Find JSON block in response
	json_str = raw_response.strip()
	if "{" in json_str and "}" in json_str:
	start = json_str.find("{")
	end = json_str.rfind("}") + 1
	json_str = json_str[start:end]

	import json
	definitions = {}
	try:
	definitions = json.loads(json_str)
	self.glossary_success_count += 1
	except Exception as je:
	self.glossary_json_failure_count += 1
	import logging
	logger = logging.getLogger(__name__)
	logger.warning(
	"Malformed Greek glossary JSON",
	extra={"response": raw_response[:2000]}
	)
	print(f"[WARN] Standard JSON load failed: {je}. Attempting regex recovery...")

	# Regex recovery fallback
	import re
	self.regex_recovery_count += 1
	term_blocks = re.findall(r'"([^"]+)"\s:\s\{([^}]+)\}', json_str)
	for term, block in term_blocks:
	def_match = re.search(r'"definition"\s:\s["\']([^"\']+)["\']', block)
	gloss_match = re.search(r'"gloss"\s:\s["\']([^"\']+)["\']', block)
	ety_match = re.search(r'"etymology"\s:\s["\']([^"\']+)["\']', block)
	definitions[term] = {
	"definition": def_match.group(1) if def_match else "",
	"gloss": gloss_match.group(1) if gloss_match else "",
	"etymology": ety_match.group(1) if ety_match else ""
	}

	return definitions
	except Exception as e:
	print(f"[WARN] Failed to generate batch Greek explanations: {e}")

	return {}

	def _build_uses_list(self, terms, greek_text):
	"""Build list of symbol/word uses using RAG and batch Groq explanations"""
	import unicodedata
	items = []

	# 1. Get definitions for the extracted Greek words (terms)
	if terms:
	# Unique terms preserving order
	unique_terms = list(dict.fromkeys(terms))
	# Limit to top 15 terms to be concise
	unique_terms = unique_terms[:15]
	print(f"[INFO] Generating glossary for {len(unique_terms)} Greek terms...")
	definitions = {}
	missing_terms = []

	for term in unique_terms:
	# Check RAG corpus (normalize search query)
	norm_term = unicodedata.normalize('NFC', term).strip()
	rag_matches = self.rag_service.retrieve_grounding_list([norm_term], max_results=1)
	if rag_matches:
	definitions[term] = rag_matches[0]["definition"]
	else:
	missing_terms.append(term)

	# Generate remaining definitions with Groq in a single batch
	if missing_terms:
	groq_defs = self._generate_batch_explanations(missing_terms)
	# Normalize groq keys for matching
	normalized_groq_defs = {}
	for k, v in groq_defs.items():
	nk = unicodedata.normalize('NFC', k).strip().lower()
	normalized_groq_defs[nk] = v

	# Assign matching definitions
	for term in missing_terms:
	nt = unicodedata.normalize('NFC', term).strip().lower()
	if nt in normalized_groq_defs:
	definitions[term] = normalized_groq_defs[nt]
	else:
	# Case/accent insensitive backup match (in case Groq stripped accents)
	import unicodedata as ud
	def strip_accents(s):
	return "".join(c for c in ud.normalize('NFD', s) if ud.category(c) != 'Mn')

	stripped_t = strip_accents(nt)
	for gk, gv in normalized_groq_defs.items():
	if strip_accents(gk) == stripped_t:
	definitions[term] = gv
	break

	for term in unique_terms:
	definition = definitions.get(term)
	if not definition:
	definition = f"Ancient Greek lexical term. Characterized by specific diacritics and phonological values."
	elif isinstance(definition, dict):
	parts = []
	d_val = definition.get("definition", "").strip()
	g_val = definition.get("gloss", "").strip()
	e_val = definition.get("etymology", "").strip()
	if d_val:
	parts.append(d_val)
	if g_val:
	parts.append(f"Gloss: {g_val}")
	if e_val:
	parts.append(f"Etymology: {e_val}")
	definition = " \| ".join(parts) if parts else "Ancient Greek lexical term."
	items.append(f"{term}: {definition}")

	# 2. Add significant paleographical/character markers found in the text if they are in the references
	notes = self.references.get("greek_symbol_notes", {}) or {}
	seen_chars = set()
	char_items = []
	for ch in greek_text:
	if ch in notes and ch not in seen_chars:
	seen_chars.add(ch)
	char_items.append(f"Character '{ch}': {notes[ch]}")

	# Limit character notes to prevent clutter
	items.extend(char_items[:5])

	# Format as list items with bullets
	formatted_items = [f"- {item}" for item in items]

	if not formatted_items:
	default_hint = self.references.get("greek_hint",
	"Ancient Greek script marker; values are determined by polytonic diacritical marks.")
	formatted_items.append(f"- —: {default_hint}")

	return formatted_items


	def _build_meaning_box(self, terms, groq_detail):
	"""Build meaning interpretation box"""
	intro_lines = [
	"The lexical concentration suggests a connected passage with recurring words or themes, consistent with Greek manuscript traditions.",
	"Scribal features such as accents/breathings, abbreviations, and marginal cues guide reading and assist with dating and genre identification."
	]

	points = [
	"• Presence of nomina sacra, lection signs, or ekphonetic marks indicates liturgical usage; scholia imply classroom or commentary context.",
	"• Orthographic variation (e.g., iotacism) and common ligatures inform palaeographic placement and regional practice.",
	]

	if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
	points.append(groq_detail.strip())

	return {
	"title": "Possible meaning:",
	"intro_lines": intro_lines,
	"frequent_label": "Key terms noted",
	"frequent": terms[:10],
	"points": points
	}

	def generate_story(self, processed_result):
	"""Generate creative story for Greek text"""
	greek_text = processed_result.get("text", "")

	if not self.groq_client.is_available():
	return "Groq client unavailable, cannot generate story."

	styles = [
	"as an epic poem told by a travelling rhapsode",
	"as a prophecy inscribed on the Oracle at Delphi",
	"as a philosophical dialogue in the Academy",
	"as a myth recounted by ancient storytellers",
	"as a recovered scroll from the Library of Alexandria",
	"as a hymn sung in honor of the gods"
	]

	import random
	chosen_style = random.choice(styles)
	seed = random.randint(1000, 9999)

	prompt = (
	f"The following ancient Greek text was found: {greek_text}\n\n"
	f"Create a long, vivid, imaginative story from ancient Greek times "
	f"based on this Greek text. Write it as one rich paragraph with "
	f"much detail, mystery, and cultural atmosphere. At least 200 words.\n\n"
	f"Creative seed: {seed}\n"
	f"Write a detailed, imaginative myth-like story {chosen_style}. "
	"Include multiple characters, rich imagery, and scenes. "
	"Avoid repetition and keep it unpredictable."
	)

	system_prompt = "You are a learned ancient Greek storyteller and scholar of Hellenic culture."

	story = self.groq_client.generate_response(
	system_prompt=system_prompt,
	user_prompt=prompt
	)

	if not story or is_gibberish(story):
	return "Failed to create quality story; the ancient texts remain silent."

	return story