Spaces:

Akshay30
/

decipherai-api

Sleeping

App Files Files Community

decipherai-api / processors /latin_processor.py

Akshay30

Fix Greek OCR and update Latin OCR model

36331c6 4 days ago

raw

history blame contribute delete

61.7 kB

	import os
	import cv2
	import numpy as np
	import re
	import time
	from PIL import Image
	from transformers import TrOCRProcessor, VisionEncoderDecoderModel
	import torch
	from .base_processor import BaseScriptProcessor
	from utils.text_utils import is_gibberish

	# Absolute path of the "models" directory located one level above the
	# directory that contains this file.
	BACKEND_MODELS_DIR = os.path.abspath(
	    os.path.join(os.path.dirname(__file__), os.pardir, "models")
	)
	# Per-checkpoint cache folders, passed as ``cache_dir`` when loading models.
	TRIDIS_MODEL_DIR, TROCR_LATIN_MODEL_DIR = (
	    os.path.join(BACKEND_MODELS_DIR, folder) for folder in ("tridis", "trocr_latin")
	)


	class LatinProcessor(BaseScriptProcessor):
	def __init__(self, groq_client, references, clip_classifier):
	    """Create the processor with empty model slots and a configured Tesseract fallback.

	    The three arguments are forwarded unchanged to the base-class
	    constructor. Neither transformer checkpoint is loaded here: the model
	    and processor attributes start as ``None`` and the ``setup_*`` methods
	    populate them later.
	    """
	    super().__init__(groq_client, references, clip_classifier)

	    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
	    self.device = torch.device(backend)

	    # Cursive / manuscript checkpoint (filled by setup_tridis_htr).
	    self.tridis_model = self.tridis_processor = None
	    self.tridis_available = False

	    # Printed checkpoint (filled by setup_trocr_base_latin).
	    self.trocr_latin_model = self.trocr_latin_processor = None
	    self.trocr_latin_available = False

	    # Bookkeeping about the engine that produced the most recent transcription.
	    self.active_style = "cursive"
	    self.active_model = "None"

	    # Tesseract is the only engine prepared eagerly.
	    self.setup_tesseract_fallback()

	    # Register for dynamic VRAM management
	    from utils.gpu_diagnostics import register_processor
	    register_processor("latin", self)

	def setup_tridis_htr(self):
	    """Load the ``magistermilitum/tridis_HTR`` processor and model onto ``self.device``.

	    Sets ``tridis_available`` to True on success. Any exception is logged
	    and leaves ``tridis_available`` False; nothing is raised to the caller.
	    """
	    try:
	        from utils.gpu_diagnostics import reclaim_vram_for
	        reclaim_vram_for("latin")

	        print("[INFO] Lazily loading TRIDIS HTR model for medieval Latin...")
	        print("[INFO] This model specializes in 13th-16th century manuscripts with automatic abbreviation expansion")

	        # TRIDIS model from Hugging Face - runs locally after download
	        repo_id = 'magistermilitum/tridis_HTR'
	        load_options = {
	            "cache_dir": TRIDIS_MODEL_DIR,
	            "local_files_only": False,
	            "token": os.getenv("HF_TOKEN"),
	        }
	        self.tridis_processor = TrOCRProcessor.from_pretrained(repo_id, **load_options)
	        self.tridis_model = VisionEncoderDecoderModel.from_pretrained(repo_id, **load_options)

	        self.tridis_model.to(self.device)
	        self.tridis_model.eval()  # inference only

	        from utils.gpu_diagnostics import log_model_device
	        log_model_device("Latin TRIDIS HTR (Cursive)", self.device)

	        print(f"[INFO] TRIDIS HTR loaded successfully on {self.device}")
	        print("[INFO] Training: 245,000 lines of Latin/Old French/Old Spanish medieval manuscripts")
	        print("[INFO] Features: Automatic abbreviation expansion, named entity capitalization, cancellation markers")
	        self.tridis_available = True

	    except Exception as load_error:
	        print(f"[ERROR] TRIDIS HTR model failed to load: {load_error}")
	        print("[WARN] Falling back to Tesseract for basic Latin recognition...")
	        self.tridis_available = False

	def setup_trocr_base_latin(self):
	    """Load the line-level model used for printed/carved Latin.

	    ``magistermilitum/tridis_v2_HTR_historical_manuscripts`` is tried first;
	    if anything in that attempt fails, ``microsoft/trocr-base-printed`` is
	    loaded instead. On success the method sets ``trocr_latin_processor``,
	    ``trocr_latin_model``, ``trocr_latin_available = True`` and
	    ``loaded_printed_model_name``. If both attempts fail, the model slots
	    are reset to ``None`` and ``trocr_latin_available`` is False. Nothing is
	    raised to the caller.
	    """
	    # Import before the load attempts: the fallback branch needs
	    # reclaim_vram_for too, so it must not depend on the first try block
	    # having reached its import line.
	    try:
	        from utils.gpu_diagnostics import log_model_device, reclaim_vram_for
	    except ImportError as import_error:
	        print(f"[ERROR] All printed Latin models failed to load: {import_error}")
	        self.trocr_latin_available = False
	        return

	    hf_token = os.getenv("HF_TOKEN")

	    def load_checkpoint(repo_id):
	        """Return (processor, model) for repo_id, model on self.device in eval mode."""
	        options = {
	            "cache_dir": TROCR_LATIN_MODEL_DIR,
	            "local_files_only": False,
	            "token": hf_token,
	        }
	        processor = TrOCRProcessor.from_pretrained(repo_id, **options)
	        model = VisionEncoderDecoderModel.from_pretrained(repo_id, **options)
	        model.to(self.device)
	        model.eval()  # inference only
	        return processor, model

	    try:
	        reclaim_vram_for("latin")
	        print("[LATIN] Loading TRIDIS v2 model...")
	        processor, model = load_checkpoint('magistermilitum/tridis_v2_HTR_historical_manuscripts')
	        log_model_device("Latin TRIDIS v2 HTR", self.device)
	        model_name = "tridis_v2_HTR_historical_manuscripts"
	        success_message = "[LATIN] TRIDIS v2 model loaded successfully"
	    except Exception as primary_error:
	        # Report why the primary checkpoint was rejected instead of dropping the error.
	        print(f"[LATIN] TRIDIS unavailable ({primary_error}), using microsoft/trocr-base-printed")
	        try:
	            # Free VRAM again in case partial allocation left residue
	            reclaim_vram_for("latin")
	            processor, model = load_checkpoint('microsoft/trocr-base-printed')
	            log_model_device("Latin TrOCR (Printed Fallback)", self.device)
	            model_name = "trocr-base-printed"
	            success_message = f"[INFO] Public fallback microsoft/trocr-base-printed loaded successfully on {self.device}"
	        except Exception as fallback_error:
	            print(f"[ERROR] All printed Latin models failed to load: {fallback_error}")
	            # Leave a coherent "nothing loaded" state so a later call can retry.
	            self.trocr_latin_processor = None
	            self.trocr_latin_model = None
	            self.trocr_latin_available = False
	            return

	    # Publish the pair only once both halves loaded successfully.
	    self.trocr_latin_processor = processor
	    self.trocr_latin_model = model
	    self.trocr_latin_available = True
	    self.loaded_printed_model_name = model_name

	    print(success_message)
	    print(f"processor class: {type(processor).__name__}")
	    print(f"model class: {type(model).__name__}")
	    print(f"device: {self.device}")
	    print(f"parameter count: {sum(p.numel() for p in model.parameters())}")


	def setup_tesseract_fallback(self):
	"""Setup Tesseract as fallback for basic Latin recognition"""
	try:
	import pytesseract

	# Test Tesseract availability
	try:
	version = pytesseract.get_tesseract_version()
	print(f"[INFO] Tesseract fallback version: {version}")
	except:
	print("[INFO] Tesseract version check skipped")

	self.ocr_configs = {
	'medieval_extended': r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,;:!?()[]{}/-·&℞℟℣†‡¶§꜠꜡ꜢꜣꜤꜥꝀꝁꝐꝑꝒꝓꝔꝕꝖꝗꝘꝙꝚꝛꝜꝝꞀꞁꞂꞃ$',
	'medieval_basic': r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,;:!?()[]{}/-',
	'standard': r'--oem 3 --psm 6',
	'single_line': r'--oem 3 --psm 7',
	'single_word': r'--oem 3 --psm 8',
	'auto': r'--oem 3 --psm 3'
	}
	self.tesseract_available = True
	print("[INFO] Tesseract fallback configured with medieval symbol support")

	except ImportError:
	print("[ERROR] pytesseract not available")
	self.tesseract_available = False
	except Exception as e:
	print(f"[WARN] Tesseract setup failed: {e}")
	self.tesseract_available = False

	def detect_script(self, image_path):
	"""Detection handled by Groq Vision classification"""
	try:
	if not self.tridis_available and not self.tesseract_available:
	print("[ERROR] No OCR engines available for Latin processing")
	return False, 0.0

	method = "TRIDIS HTR (medieval specialist)" if self.tridis_available else "Tesseract fallback"
	print(f"[INFO] Latin processor activated - Using {method}")
	return True, 0.98 if self.tridis_available else 0.85

	except Exception as e:
	print(f"[ERROR] Latin detection failed: {e}")
	return False, 0.0

	def extract_text(self, image_path):
	"""Extract text using dual-mode routing: trocr-base-latin for printed, tridis_HTR for cursive"""
	try:
	start_time = time.time()

	# Step 1: Detect script style
	style = self.layout_parser.detect_writing_style(image_path, self.clip_classifier)
	print(f"[INFO] Latin writing style detected: {style.upper()}")

	primary_text = ""
	fallback_text = ""

	# Ensure the required model is loaded dynamically
	if style == "printed":
	if self.trocr_latin_model is None:
	self.setup_trocr_base_latin()
	else:
	from utils.gpu_diagnostics import reclaim_vram_for
	reclaim_vram_for("latin")
	if str(next(self.trocr_latin_model.parameters()).device) != str(self.device):
	print(f"[VRAM MANAGER] Activating Latin TrOCR (Printed) model on {self.device}...")
	self.trocr_latin_model.to(self.device)
	else:
	if self.tridis_model is None:
	self.setup_tridis_htr()
	else:
	from utils.gpu_diagnostics import reclaim_vram_for
	reclaim_vram_for("latin")
	if str(next(self.tridis_model.parameters()).device) != str(self.device):
	print(f"[VRAM MANAGER] Activating Latin TRIDIS HTR (Cursive) model on {self.device}...")
	self.tridis_model.to(self.device)

	if style == "printed" and self.trocr_latin_available:
	print("[INFO] Routing to printed/carved Latin model (trocr-base-latin)...")
	primary_text = self._extract_with_trocr_base_latin(image_path)
	if primary_text and self._validate_latin_text(primary_text, style):
	processing_time = time.time() - start_time
	print(f"[SUCCESS] Routed to trocr-base-latin and completed in {processing_time:.2f}s")
	self.active_style = "printed"
	self.active_model = getattr(self, "loaded_printed_model_name", "tridis_v2_HTR_historical_manuscripts")
	return primary_text
	else:
	print("[WARN] trocr-base-latin returned poor quality result, trying TRIDIS HTR fallback...")
	if self.tridis_model is None:
	self.setup_tridis_htr()
	if self.tridis_available:
	fallback_text = self._extract_with_tridis_htr(image_path)

	else: # cursive / manuscript
	print("[INFO] Routing to medieval manuscript model (tridis_HTR)...")
	if self.tridis_available:
	primary_text = self._extract_with_tridis_htr(image_path)
	if primary_text and self._validate_latin_text(primary_text, style):
	processing_time = time.time() - start_time
	print(f"[SUCCESS] Routed to tridis_HTR and completed in {processing_time:.2f}s")
	self.active_style = "cursive"
	self.active_model = "tridis_HTR"
	return primary_text
	else:
	print("[WARN] TRIDIS HTR returned poor quality result, trying trocr-base-latin fallback...")
	if self.trocr_latin_model is None:
	self.setup_trocr_base_latin()
	if self.trocr_latin_available:
	fallback_text = self._extract_with_trocr_base_latin(image_path)

	# Step 2: Check fallback text from the other model
	if fallback_text and self._validate_latin_text(fallback_text, "printed" if style == "cursive" else "cursive"):
	processing_time = time.time() - start_time
	print(f"[SUCCESS] Fallback model transcription successful in {processing_time:.2f}s")
	self.active_style = "printed" if style == "cursive" else "cursive"
	self.active_model = getattr(self, "loaded_printed_model_name", "tridis_v2_HTR_historical_manuscripts") if style == "cursive" else "tridis_HTR"
	return fallback_text

	# Step 3: Tesseract fallback
	if self.tesseract_available:
	print("[INFO] Neural models failed. Processing with Tesseract fallback...")
	tesseract_text = self._extract_with_tesseract_enhanced(image_path)

	if tesseract_text and self._validate_latin_text(tesseract_text, "any"):
	processing_time = time.time() - start_time
	print(f"[SUCCESS] Tesseract fallback completed in {processing_time:.2f}s")
	self.active_style = "printed" # Tesseract works best on printed
	self.active_model = "Tesseract OCR"
	return tesseract_text
	else:
	print("[WARN] Tesseract returned poor quality result, trying layout-aware segmentation fallback...")

	# Method 3: Layout-aware line segment fallback
	layout_aware_text = self._extract_layout_aware_ocr(image_path)
	if layout_aware_text and self._validate_latin_text(layout_aware_text, "any"):
	processing_time = time.time() - start_time
	print(f"[SUCCESS] Layout-aware OCR completed in {processing_time:.2f}s")
	self.active_style = "printed"
	self.active_model = "Tesseract Layout-Aware"
	return layout_aware_text

	print("[ERROR] All OCR methods failed or returned poor quality results")
	self.active_style = "unknown"
	self.active_model = "None"
	return "No readable Latin text detected with sufficient confidence"

	except Exception as e:
	print(f"[ERROR] Latin text extraction failed: {e}")
	self.active_style = "error"
	self.active_model = "None"
	return f"Error during text extraction: {str(e)}"

	def _extract_with_trocr_base_latin(self, image_path):
	    """Transcribe an image with the printed-Latin model, line by line when possible.

	    Loads the model if it is not loaded yet; otherwise asks the VRAM
	    manager for room and moves the model back to ``self.device`` if needed.
	    Returns the lines joined with newlines, the full-image transcription
	    when fewer than two line crops are found, or ``""`` when the model is
	    unavailable or extraction fails.
	    """
	    if self.trocr_latin_model is not None:
	        from utils.gpu_diagnostics import reclaim_vram_for
	        reclaim_vram_for("latin")
	        current_device = next(self.trocr_latin_model.parameters()).device
	        if str(current_device) != str(self.device):
	            print(f"[VRAM MANAGER] Activating Latin TrOCR model on {self.device}...")
	            self.trocr_latin_model.to(self.device)
	    else:
	        self.setup_trocr_base_latin()

	    model_ready = getattr(self, 'trocr_latin_available', False) and self.trocr_latin_model is not None
	    if not model_ready:
	        return ""

	    try:
	        page = Image.open(image_path).convert("RGB")
	        print(f"[INFO] Processing image with trocr-base-latin: {page.size[0]}x{page.size[1]} pixels")

	        # Since trocr models are line-level OCR models, segment into lines first
	        layout = self.layout_parser.analyze_layout(image_path)
	        line_crops = self.layout_parser.crop_lines(image_path, layout)

	        if not (line_crops and len(line_crops) > 1):
	            print("[INFO] Single line detected or layout parser returned no lines. Processing full image...")
	            return self._ocr_single_crop_with_trocr_base_latin(page)

	        print(f"[INFO] Image contains multiple lines ({len(line_crops)}). Running line-by-line trocr-base-latin...")
	        recognised = (self._ocr_single_crop_with_trocr_base_latin(crop) for crop in line_crops)
	        # Lines that produced no text are dropped.
	        return "\n".join(line for line in recognised if line)

	    except Exception as extraction_error:
	        print(f"[ERROR] trocr-base-latin extraction failed: {extraction_error}")
	        return ""

	def _ocr_single_crop_with_trocr_base_latin(self, crop_image):
	"""Helper to run trocr-base-latin inference on a single image crop"""
	try:
	pixel_values = self.trocr_latin_processor(
	images=crop_image,
	return_tensors="pt"
	).pixel_values.to(self.device)

	with torch.inference_mode():
	generated_ids = self.trocr_latin_model.generate(
	pixel_values,
	max_length=512,
	num_beams=4,
	early_stopping=True
	)

	text = self.trocr_latin_processor.batch_decode(
	generated_ids,
	skip_special_tokens=True
	)[0]

	text = ' '.join(text.split())
	return text.strip()
	except Exception as e:
	print(f"[ERROR] Single line OCR with trocr-base-latin failed: {e}")
	return ""

	def _extract_with_tridis_htr(self, image_path):
	    """Transcribe an image with TRIDIS HTR, line by line when the layout parser finds lines.

	    Loads the model if it is not loaded yet; otherwise asks the VRAM
	    manager for room and moves the model back to ``self.device`` if needed.
	    When more than one line crop is found, at most 50 crops are
	    preprocessed and recognised individually and their texts joined with
	    newlines. If that yields nothing, or fewer than two crops exist, the
	    whole image is recognised instead. The text is passed through
	    ``_post_process_medieval_text`` before being returned.

	    Returns the processed text, or ``""`` when the model is unavailable or
	    extraction fails.
	    """
	    if self.tridis_model is None:
	        self.setup_tridis_htr()
	    else:
	        from utils.gpu_diagnostics import reclaim_vram_for
	        reclaim_vram_for("latin")
	        if str(next(self.tridis_model.parameters()).device) != str(self.device):
	            print(f"[VRAM MANAGER] Activating Latin TRIDIS model on {self.device}...")
	            self.tridis_model.to(self.device)

	    if not getattr(self, 'tridis_available', False) or self.tridis_model is None:
	        return ""

	    def finalize(raw_text, line_count=None):
	        """Post-process raw model output, log statistics, return stripped text."""
	        processed_text = self._post_process_medieval_text(raw_text)

	        char_count = len(processed_text)
	        word_count = len(processed_text.split())
	        if line_count is None:
	            print(f"[INFO] TRIDIS HTR extracted: {char_count} characters, {word_count} words")
	        else:
	            print(f"[INFO] TRIDIS HTR extracted (multi-line): {char_count} characters, {word_count} words from {line_count} lines")

	        medieval_features = self._analyze_medieval_features(processed_text)
	        if medieval_features:
	            print(f"[INFO] Medieval features detected: {', '.join(medieval_features)}")

	        return processed_text.strip()

	    try:
	        # Load and validate image
	        image = Image.open(image_path).convert("RGB")
	        print(f"[INFO] Processing image with TRIDIS HTR: {image.size[0]}x{image.size[1]} pixels")

	        # Use layout parser to segment into individual lines
	        layout = self.layout_parser.analyze_layout(image_path)
	        crops = self.layout_parser.crop_lines(image_path, layout)

	        if crops and len(crops) > 1:
	            # Cap lines to prevent timeout on very large documents (CPU inference)
	            MAX_LINES = 50
	            total_detected = len(crops)
	            if total_detected > MAX_LINES:
	                print(f"[INFO] Layout parser detected {total_detected} text lines. Capping to {MAX_LINES} for performance.")
	                crops = crops[:MAX_LINES]
	            else:
	                print(f"[INFO] Layout parser detected {total_detected} text lines. Running line-by-line TRIDIS HTR...")

	            line_texts = []
	            for idx, crop in enumerate(crops):
	                # Preprocess each line crop for medieval manuscripts
	                enhanced_crop = self._preprocess_for_medieval_manuscript(crop)
	                text = self._ocr_single_crop_with_tridis(enhanced_crop)
	                if text:
	                    line_texts.append(text)
	                    print(f" [LINE {idx+1}/{len(crops)}] {text[:80]}...")

	            if line_texts:
	                return finalize("\n".join(line_texts), line_count=len(line_texts))

	        # Single line, no layout, or no line produced text: process the full image
	        print("[INFO] Single line or no layout segmentation. Processing full image with TRIDIS HTR...")
	        enhanced_image = self._preprocess_for_medieval_manuscript(image)

	        # Same generation settings as the per-line path, so reuse the helper
	        # instead of repeating the processor/generate/decode sequence.
	        print("[INFO] Running TRIDIS HTR inference...")
	        return finalize(self._ocr_single_crop_with_tridis(enhanced_image))

	    except Exception as e:
	        print(f"[ERROR] TRIDIS HTR extraction failed: {e}")
	        return ""

	def _ocr_single_crop_with_tridis(self, crop_image):
	"""Helper to run TRIDIS HTR inference on a single line crop image"""
	try:
	pixel_values = self.tridis_processor(
	images=crop_image,
	return_tensors="pt"
	).pixel_values.to(self.device)

	with torch.inference_mode():
	generated_ids = self.tridis_model.generate(
	pixel_values,
	max_length=768,
	num_beams=6,
	early_stopping=True,
	do_sample=False,
	repetition_penalty=1.15,
	length_penalty=0.8,
	no_repeat_ngram_size=2
	)

	text = self.tridis_processor.batch_decode(
	generated_ids,
	skip_special_tokens=True
	)[0]

	text = ' '.join(text.split())
	return text.strip()
	except Exception as e:
	print(f"[ERROR] Single line OCR with TRIDIS failed: {e}")
	return ""

	def _preprocess_for_medieval_manuscript(self, image):
	    """Enhance a manuscript image before TRIDIS inference.

	    Pipeline: grayscale -> CLAHE local contrast -> bilateral denoise ->
	    sharpen -> min/max normalise to 0-255 -> back to an RGB PIL image.

	    ``image`` must convert to a 3-channel array via ``np.array``. On any
	    failure the original ``image`` is returned unchanged.
	    """
	    try:
	        print("[INFO] Applying medieval manuscript preprocessing...")

	        # Convert to OpenCV format
	        image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
	        gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)

	        # Step 1: CLAHE for local contrast enhancement (handles uneven illumination)
	        clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8,8))
	        contrast_enhanced = clahe.apply(gray)

	        # Step 2: Bilateral filter reduces noise while preserving stroke edges
	        denoised = cv2.bilateralFilter(contrast_enhanced, 7, 80, 80)

	        # Step 3: Sharpen. The weights must sum to 1 so flat regions keep
	        # their brightness. The previous centre weight of 6 made the kernel
	        # sum to 0, which is a high-pass filter: it discarded the page and
	        # kept only stroke edges.
	        sharpen_kernel = np.array([
	            [-0.5, -1, -0.5],
	            [-1,    7, -1  ],
	            [-0.5, -1, -0.5]
	        ])
	        sharpened = cv2.filter2D(denoised, -1, sharpen_kernel)

	        # Step 4: Normalize intensity range for the model input
	        normalized = cv2.normalize(sharpened, None, 0, 255, cv2.NORM_MINMAX)

	        # Convert back to PIL format and ensure it is RGB mode
	        processed_image = Image.fromarray(normalized).convert("RGB")

	        print("[INFO] Medieval preprocessing completed: contrast enhanced, denoised, sharpened")
	        return processed_image

	    except Exception as e:
	        print(f"[WARN] Medieval preprocessing failed: {e}, using original image")
	        return image

	def _post_process_medieval_text(self, text):
	"""Post-process text from TRIDIS HTR with medieval-specific corrections"""
	try:
	if not text:
	return text

	print("[INFO] Post-processing TRIDIS HTR output for medieval features...")
	processed = text

	# Handle TRIDIS cancellation/correction markers
	# TRIDIS uses $word$ to mark cancelled/corrected text
	import re

	# Count cancellations before processing
	cancellation_count = processed.count('$') // 2

	# Convert $word$ to editorial brackets [word] for scholarly display
	processed = re.sub(r'\$([^$]*)\$', r'[\1]', processed)

	if cancellation_count > 0:
	print(f"[INFO] Processed {cancellation_count} scribal corrections/cancellations")

	# Clean up multiple spaces and normalize whitespace
	processed = ' '.join(processed.split())

	# Detect and log TRIDIS abbreviation expansions
	# Common medieval abbreviations that TRIDIS expands automatically
	medieval_expansions = {
	'domini': 'dñi/dni/dom̃',
	'facimus': 'facim̃/facimꝰ',
	'quod': 'qd/q̃d',
	'enim': 'enim̃/en̄',
	'pro': 'ꝓ/p̃',
	'et': '⁊/et̃',
	'cum': 'cũ/cum̃',
	'per': 'p̃/ꝑ',
	'sunt': 'sũt/sunt̃',
	'omnia': 'om̃ia/omn̄a'
	}

	expansions_found = []
	for expansion, abbreviations in medieval_expansions.items():
	if expansion in processed.lower():
	expansions_found.append(f"{abbreviations}→{expansion}")

	if expansions_found:
	print(f"[INFO] TRIDIS expanded abbreviations: {', '.join(expansions_found[:5])}")
	if len(expansions_found) > 5:
	print(f"[INFO] ... and {len(expansions_found) - 5} more abbreviations")

	# Detect capitalization patterns (TRIDIS capitalizes named entities)
	capitalized_words = re.findall(r'\b[A-Z][a-z]+', processed)
	if capitalized_words:
	unique_caps = list(set(capitalized_words))
	print(f"[INFO] Named entities capitalized: {', '.join(unique_caps[:5])}")
	if len(unique_caps) > 5:
	print(f"[INFO] ... and {len(unique_caps) - 5} more entities")

	return processed

	except Exception as e:
	print(f"[WARN] Medieval post-processing failed: {e}")
	return text

	def _analyze_medieval_features(self, text):
	"""Analyze and identify medieval manuscript features in the text"""
	features = []

	if not text:
	return features

	try:
	# Cancellation markers
	if '[' in text and ']' in text:
	features.append("scribal corrections")

	# Expanded abbreviations
	medieval_words = ['domini', 'facimus', 'quod', 'enim', 'pro', 'cum', 'per', 'sunt', 'omnia']
	found_expansions = [word for word in medieval_words if word in text.lower()]
	if found_expansions:
	features.append(f"abbreviation expansions ({len(found_expansions)})")

	# Named entity capitalization
	import re
	caps_count = len(re.findall(r'\b[A-Z][a-z]+', text))
	if caps_count > 0:
	features.append(f"capitalized entities ({caps_count})")

	# Medieval punctuation patterns
	if '.' in text or ',' in text or ':' in text:
	features.append("punctuation normalization")

	# Special medieval characters
	medieval_chars = sum(1 for c in text if c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§")
	if medieval_chars > 0:
	features.append(f"medieval symbols ({medieval_chars})")

	except Exception as e:
	print(f"[WARN] Medieval feature analysis failed: {e}")

	return features

	def _extract_with_tesseract_enhanced(self, image_path):
	    """Run Tesseract over every preprocessing/config combination and keep the best result.

	    Each combination is tried with the ``lat`` language first. When that
	    yields fewer than 5 characters, or the call fails (for example because
	    the ``lat`` language data is missing), ``eng`` is tried instead.
	    Results are ranked with ``_score_tesseract_result``.

	    Returns the post-processed best text, or ``""`` when nothing scored
	    above zero or an error occurs.
	    """
	    try:
	        import pytesseract

	        image = Image.open(image_path).convert("RGB")

	        # Multiple preprocessing approaches
	        preprocessed_images = {
	            'enhanced': self._preprocess_for_tesseract_enhanced(image),
	            'basic': self._preprocess_for_tesseract_basic(image),
	            'original': image
	        }

	        failures = []

	        def recognise(prep_image, lang, config):
	            """OCR one image; a failed call counts as 'no text' so the next language still runs."""
	            try:
	                return pytesseract.image_to_string(prep_image, lang=lang, config=config).strip()
	            except Exception as ocr_error:
	                failures.append(ocr_error)
	                return ""

	        best_text = ""
	        best_score = 0
	        best_config = ""
	        best_preprocessing = ""

	        # Try different combinations of preprocessing and OCR configurations
	        for prep_name, prep_image in preprocessed_images.items():
	            for config_name, config in self.ocr_configs.items():
	                # Try with Latin language first
	                text = recognise(prep_image, 'lat', config)

	                # If Latin fails or produces poor results, try English
	                if not text or len(text) < 5:
	                    text = recognise(prep_image, 'eng', config)

	                score = self._score_tesseract_result(text)
	                if text and score > best_score:
	                    best_text = text
	                    best_score = score
	                    best_config = config_name
	                    best_preprocessing = prep_name

	        if best_text:
	            print(f"[INFO] Best Tesseract result: {best_preprocessing} + {best_config} (score: {best_score:.3f})")
	            return self._post_process_tesseract_text(best_text)

	        if failures:
	            # Report once rather than per attempt, so the cause is visible.
	            print(f"[WARN] {len(failures)} Tesseract call(s) failed; last error: {failures[-1]}")

	        return ""

	    except Exception as e:
	        print(f"[ERROR] Enhanced Tesseract extraction failed: {e}")
	        return ""

	def _extract_layout_aware_ocr(self, image_path):
	    """OCR the page line by line with Tesseract, in layout-parser order.

	    Each line crop is contrast-enhanced with CLAHE and recognised in
	    single-line mode, first with ``lat`` and then with ``eng`` when the
	    Latin attempt yields fewer than 3 characters or fails.

	    Returns the post-processed lines joined with newlines, or ``""`` when
	    no crops are found or an error occurs.
	    """
	    try:
	        import pytesseract
	        print("[INFO] Running layout-aware line segmentation...")
	        layout = self.layout_parser.analyze_layout(image_path)
	        crops = self.layout_parser.crop_lines(image_path, layout)

	        if not crops:
	            print("[WARN] Layout parser returned no line crops")
	            return ""

	        print(f"[INFO] Layout-aware line parser cropped {len(crops)} lines")

	        # Single line OCR configuration
	        config = '--oem 3 --psm 7'
	        # Same settings for every line, so build the CLAHE object once.
	        clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(4,4))
	        failures = []

	        def recognise(line_image, lang):
	            """OCR one line; a failed call counts as 'no text' so the next language still runs."""
	            try:
	                return pytesseract.image_to_string(line_image, lang=lang, config=config).strip()
	            except Exception as ocr_error:
	                failures.append(ocr_error)
	                return ""

	        line_texts = []
	        for crop in crops:
	            # Enhance line crop for OCR
	            crop_cv = cv2.cvtColor(np.array(crop), cv2.COLOR_RGB2BGR)
	            gray = cv2.cvtColor(crop_cv, cv2.COLOR_BGR2GRAY)
	            crop_pil = Image.fromarray(clahe.apply(gray))

	            # Try Latin OCR first, then the English fallback
	            text = recognise(crop_pil, 'lat')
	            if not text or len(text) < 3:
	                text = recognise(crop_pil, 'eng')

	            if text:
	                line_texts.append(self._post_process_tesseract_text(text))

	        if failures and not line_texts:
	            print(f"[WARN] {len(failures)} Tesseract call(s) failed; last error: {failures[-1]}")

	        return "\n".join(line_texts)
	    except Exception as e:
	        print(f"[WARN] Layout aware Latin OCR failed: {e}")
	        return ""

	def _preprocess_for_tesseract_enhanced(self, image):
	    """Return a grayscale, strongly contrast-enhanced copy of ``image`` for Tesseract.

	    Applies CLAHE (clip limit 4.0) followed by a morphological close. On
	    failure a warning is printed and the input image is returned unchanged.
	    """
	    try:
	        bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
	        grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

	        # More aggressive enhancement for Tesseract
	        equalizer = cv2.createCLAHE(clipLimit=4.0, tileGridSize=(8,8))
	        boosted = equalizer.apply(grayscale)

	        # Morphological close to clean up characters.
	        # NOTE(review): with a 1x1 structuring element this step should
	        # leave the image unchanged - confirm the intended kernel size.
	        structuring_element = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
	        closed = cv2.morphologyEx(boosted, cv2.MORPH_CLOSE, structuring_element)

	        return Image.fromarray(closed)

	    except Exception as preprocess_error:
	        print(f"[WARN] Enhanced Tesseract preprocessing failed: {preprocess_error}")
	        return image

	def _preprocess_for_tesseract_basic(self, image):
	    """Return a grayscale, mildly contrast-enhanced copy of ``image`` for Tesseract.

	    Applies CLAHE with clip limit 2.0. On failure a warning is printed and
	    the input image is returned unchanged.
	    """
	    try:
	        image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
	        gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)

	        # Simple contrast enhancement
	        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
	        enhanced = clahe.apply(gray)

	        return Image.fromarray(enhanced)

	    except Exception as e:
	        # Report the failure like the enhanced variant does, rather than
	        # silently handing back the unprocessed image.
	        print(f"[WARN] Basic Tesseract preprocessing failed: {e}")
	        return image

	def _score_tesseract_result(self, text):
	"""Score Tesseract OCR result quality"""
	if not text or len(text.strip()) < 2:
	return 0.0

	score = 0.0
	words = text.split()

	# Base length bonus
	score += min(len(words) / 15.0, 0.25)

	# Latin character ratio
	latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
	if len(text) > 0:
	latin_ratio = latin_chars / len(text)
	score += latin_ratio * 0.35

	# Word formation bonus
	if len(words) > 1:
	score += 0.2

	# Common Latin words bonus
	common_latin = ['et', 'in', 'de', 'ad', 'cum', 'pro', 'per', 'ex', 'ab', 'post', 'ante', 'inter']
	latin_matches = sum(1 for word in words if word.lower() in common_latin)
	if latin_matches > 0:
	score += latin_matches * 0.05

	# Medieval symbols bonus
	medieval_symbols = ['꜠', '꜡', 'ꜣ', 'ꜥ', 'ꝁ', 'ꝑ', 'ꝛ', 'ꞁ', 'ꞃ', '℞', '℟', '℣', '†', '‡', '¶', '§']
	symbol_count = sum(1 for symbol in medieval_symbols if symbol in text)
	if symbol_count > 0:
	score += 0.15

	# Penalize excessive garbage characters
	garbage_chars = sum(1 for c in text if not c.isalnum() and c not in " .,;:!?()[]{}/-·&℞℟℣†‡¶§꜠꜡ꜢꜣꜤꜥꝀꝁ")
	if len(text) > 0:
	garbage_ratio = garbage_chars / len(text)
	score -= garbage_ratio * 0.3

	return max(0.0, min(1.0, score))

	def _post_process_tesseract_text(self, text):
	"""Post-process Tesseract OCR result"""
	try:
	# Clean up common OCR errors
	corrections = {
	'rn': 'm',
	'cl': 'd',
	'\|': 'I',
	'°': 'o',
	'¢': 'c',
	'£': 'E'
	}

	processed = text
	for wrong, correct in corrections.items():
	processed = processed.replace(wrong, correct)

	# Normalize whitespace
	processed = ' '.join(processed.split())

	return processed

	except Exception as e:
	print(f"[WARN] Tesseract post-processing failed: {e}")
	return text

	def _validate_latin_text(self, text, style="any"):
	"""Validate text with criteria appropriate for classical/printed or medieval Latin"""
	if not text or len(text.strip()) < 3:
	return False

	try:
	# Count Latin characters
	latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
	total_chars = len(text.replace(' ', ''))

	if total_chars == 0:
	return False

	latin_ratio = latin_chars / max(total_chars, 1)

	# For printed/classical Latin, we require a high ratio of standard alphabetical letters
	if style == "printed":
	return latin_chars >= 5 and latin_ratio >= 0.6

	# For cursive/medieval Latin, we can be more generous and include medieval symbol weight
	medieval_symbols = sum(1 for c in text if c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§[]")
	medieval_words = ['domini', 'facimus', 'quod', 'enim', 'pro', 'cum', 'per', 'sunt']
	word_bonus = sum(3 for word in medieval_words if word in text.lower())

	total_meaningful = latin_chars + medieval_symbols + word_bonus
	meaningful_ratio = total_meaningful / max(total_chars, 1)

	if total_meaningful >= 10:
	return True
	elif meaningful_ratio >= 0.6:
	return True
	elif total_meaningful >= 5 and meaningful_ratio >= 0.3:
	return True
	else:
	return False

	except Exception as e:
	print(f"[WARN] Text validation failed: {e}")
	return len(text.strip()) >= 5 # Fallback validation


	def process_text(self, latin_text):
	    """Build the analysis dictionary for an extracted transcription.

	    Returns a dict with keys ``text``, ``symbols``, ``char_analysis`` and
	    ``validation``. Empty input yields empty placeholders (``symbols`` is
	    an empty list in that case and a string otherwise). The validation
	    section reports the engine and style recorded in ``active_model`` /
	    ``active_style``.
	    """
	    if not latin_text:
	        return {"text": "", "symbols": [], "char_analysis": {}, "validation": {}}

	    print("[INFO] Processing Latin text with medieval manuscript analysis...")

	    # Keep alphanumerics plus medieval markers and correction characters
	    kept_characters = "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§$[]"
	    symbols = ''.join(ch for ch in latin_text if ch.isalnum() or ch in kept_characters)

	    # Medieval characters and correction markers
	    medieval_symbols = [ch for ch in latin_text if ch in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§"]
	    correction_markers = latin_text.count('[') + latin_text.count('$')

	    # Listed Latin words found as substrings
	    lowered = latin_text.lower()
	    expansions_found = [
	        word
	        for word in ('domini', 'facimus', 'pro', 'quod', 'enim', 'cum', 'per', 'sunt', 'omnia')
	        if word in lowered
	    ]

	    # Capitalised words
	    unique_entities = list(set(re.findall(r'\b[A-Z][a-z]+', latin_text)))

	    tokens = latin_text.split()
	    char_analysis = {
	        "total_chars": len(latin_text),
	        "alpha_chars": sum(ch.isalpha() for ch in latin_text),
	        "unique_chars": len(set(latin_text)),
	        "word_count": len(tokens),
	        "medieval_symbols": len(medieval_symbols),
	        "medieval_symbol_types": list(set(medieval_symbols)),
	        "abbreviation_expansions": expansions_found,
	        "expansion_count": len(expansions_found),
	        "correction_markers": correction_markers,
	        "capitalized_entities": unique_entities,
	        "entity_count": len(unique_entities),
	        "avg_word_length": sum(len(token) for token in tokens) / max(1, len(tokens))
	    }

	    printed_style = getattr(self, 'active_style', '') == 'printed'
	    latin_letter_total = sum(
	        ch.isalpha() and ch.lower() in "abcdefghijklmnopqrstuvwxyz" for ch in latin_text
	    )

	    validation = {
	        "latin_ratio": latin_letter_total / max(1, len(latin_text)),
	        "quality_score": self._calculate_comprehensive_quality_score(latin_text),
	        "ocr_method": getattr(self, 'active_model', "TRIDIS HTR (Medieval Manuscript Specialist)" if self.tridis_available else "Tesseract OCR"),
	        "model_specialization": "General Latin text" if printed_style else ("13th-16th century manuscripts" if self.tridis_available else "General Latin text"),
	        "medieval_features_detected": bool(medieval_symbols or expansions_found or correction_markers),
	        "tridis_used": getattr(self, 'active_model', '') == 'tridis_HTR',
	        "manuscript_period": "Classical/Roman Monumental" if printed_style else ("Late Medieval (13th-16th centuries)" if (medieval_symbols or expansions_found) else "Classical/Modern"),
	        "text_type": "classical_inscription" if printed_style else self._determine_text_type(latin_text),
	        "abbreviations_expanded": len(expansions_found) > 0,
	        "named_entities_detected": len(unique_entities) > 0,
	        "scribal_corrections_found": correction_markers > 0,
	        "confidence_level": self._determine_confidence_level(latin_text),
	        "writing_style": getattr(self, 'active_style', 'cursive')
	    }

	    return {
	        "text": latin_text,
	        "symbols": symbols,
	        "char_analysis": char_analysis,
	        "validation": validation
	    }

	def _calculate_comprehensive_quality_score(self, text):
	"""Calculate comprehensive quality score with medieval bonuses"""
	if not text:
	return 0.0

	score = 0.0
	words = text.split()

	# Base metrics
	score += min(len(words) / 15.0, 0.2) # Length bonus (max 0.2)

	# Latin character ratio
	latin_chars = sum(c.isalpha() and c.lower() in "abcdefghijklmnopqrstuvwxyz" for c in text)
	score += (latin_chars / max(1, len(text))) * 0.25

	# TRIDIS Medieval bonuses (only if TRIDIS was used)
	if self.tridis_available and getattr(self, 'active_model', '') == 'tridis_HTR':
	# Expanded abbreviations (major quality indicator)
	medieval_expansions = ['domini', 'facimus', 'pro', 'quod', 'enim', 'cum', 'per', 'sunt']
	expansion_count = sum(1 for exp in medieval_expansions if exp in text.lower())
	score += min(expansion_count * 0.05, 0.2) # Max 0.2 bonus

	# Named entity capitalization (TRIDIS feature)
	import re
	caps_count = len(re.findall(r'\b[A-Z][a-z]+', text))
	score += min(caps_count * 0.02, 0.15) # Max 0.15 bonus

	# Correction markers (authenticity indicator)
	corrections = text.count('[') + text.count('$')
	score += min(corrections * 0.03, 0.1) # Max 0.1 bonus

	# Medieval symbols (regardless of OCR method)
	medieval_symbols = ['꜠', '꜡', 'ꜣ', 'ꜥ', 'ꝁ', 'ꝑ', 'ꝛ', 'ꞁ', 'ꞃ', '℞', '℟', '℣', '†', '‡', '¶', '§']
	symbol_count = sum(1 for symbol in medieval_symbols if symbol in text)
	score += min(symbol_count * 0.04, 0.15) # Max 0.15 bonus


	# Word formation
	if len(words) > 1:
	score += 0.1

	# Common Latin words
	common_latin = ['et', 'in', 'de', 'ad', 'cum', 'pro', 'per', 'ex', 'ab']
	latin_matches = sum(1 for word in words if word.lower() in common_latin)
	score += min(latin_matches * 0.02, 0.1)

	return max(0.0, min(1.0, score))

	def _determine_text_type(self, text):
	"""Determine the type of Latin text based on features"""
	if not text:
	return "unknown"

	# Medieval indicators
	medieval_expansions = ['domini', 'facimus', 'quod', 'enim']
	has_expansions = any(exp in text.lower() for exp in medieval_expansions)
	has_corrections = '[' in text or '$' in text
	has_medieval_symbols = any(c in text for c in "꜠꜡ꜣꜥꝁꝑꝛꞁꞃ℞℟℣†‡¶§")

	if has_expansions and has_corrections:
	return "medieval_documentary_manuscript"
	elif has_expansions or has_medieval_symbols:
	return "medieval_manuscript"
	elif has_corrections:
	return "manuscript_with_corrections"
	else:
	return "classical_latin_text"

	def _determine_confidence_level(self, text):
	"""Determine confidence level based on text characteristics"""
	score = self._calculate_comprehensive_quality_score(text)

	if score >= 0.8:
	return "Very High"
	elif score >= 0.6:
	return "High"
	elif score >= 0.4:
	return "Medium"
	elif score >= 0.2:
	return "Low"
	else:
	return "Very Low"

	def generate_historical_context(self, processed_result):
	"""Generate comprehensive historical context for Latin text"""
	latin_text = processed_result.get("text", "")

	groq_detail = self._generate_groq_context(latin_text)

	# Build references using words/symbols in Latin text
	words = re.findall(r'\w+', latin_text) if latin_text else []
	query_terms = list(words)
	if latin_text:
	query_terms.extend([char for char in latin_text if char.strip()])
	refs = self.rag_service.retrieve_grounding_list(query_terms, max_results=6)

	return {
	"uses_box": {
	"title": "Medieval Latin manuscript analysis",
	"items": self._build_uses_list(latin_text)
	},
	"meaning_box": self._build_enhanced_meaning_box(latin_text, groq_detail, processed_result),
	"references": refs
	}

	def _generate_groq_context(self, latin_text):
	"""Generate contextual information using Groq with medieval awareness"""
	if not self.groq_client.is_available():
	return "(Groq unavailable) Historical context generation requires GROQ_API_KEY and groq package."

	# Analyze medieval features for context
	has_expansions = any(word in latin_text.lower() for word in ['domini', 'facimus', 'quod', 'enim'])
	has_corrections = '[' in latin_text or '$' in latin_text
	has_caps = any(c.isupper() for c in latin_text)

	if is_gibberish(latin_text):
	prompt = (
	"The following sequence appears to be fragmentary medieval Latin text, possibly with scribal abbreviations or corrections. "
	"Provide a concise, scholarly paragraph (6-10 sentences) covering possible meanings, historical context of medieval Latin manuscripts, "
	"common abbreviation practices, and typical documentary uses in 13th-16th century Europe."
	)
	else:
	context_note = ""
	if has_expansions:
	context_note += "The text contains expanded medieval abbreviations. "
	if has_corrections:
	context_note += "Scribal corrections or cancellations are present. "
	if has_caps:
	context_note += "Named entities appear to be properly capitalized. "

	prompt = (
	f"Analyze this medieval Latin text: {latin_text}\n\n"
	f"Context: {context_note}This appears to be from a medieval manuscript (13th-16th centuries). "
	f"Provide a scholarly paragraph (6-10 sentences) on its historical significance, cultural context, "
	f"likely documentary purpose, and interpretations. Focus on medieval manuscript practices, "
	f"legal/administrative contexts, and paleographic significance."
	)

	system_prompt = "You are a medieval Latin paleography specialist and historian. Provide accurate, concise scholarly analysis focusing on manuscript traditions, abbreviation practices, and documentary contexts of the late medieval period."
	enriched_system_prompt = self.rag_service.enrich_prompt(system_prompt, latin_text)

	return self.groq_client.generate_response(
	system_prompt=enriched_system_prompt,
	user_prompt=prompt
	) or "(Historical context unavailable due to Groq error)"

	def _build_uses_list(self, latin_text):
	"""Build enhanced list of character uses with TRIDIS context"""
	notes = self.references.get("latin_symbol_notes", {}) or {}
	default_hint = self.references.get("latin_hint",
	"Letters and symbols reflect phonetic values and scribal practices in medieval manuscripts.")

	seen = set()
	items = []

	# Add TRIDIS-specific information for medieval features
	tridis_notes = {
	'[': "Editorial bracket indicating scribal correction or cancellation (TRIDIS transcription standard)",
	'$': "Cancellation marker for struck-through text (TRIDIS notation)",
	}

	for ch in latin_text:
	if ch in seen or not ch.strip():
	continue
	seen.add(ch)

	# Check TRIDIS-specific notes first
	if ch in tridis_notes:
	note = tridis_notes[ch]
	elif ch in notes:
	note = notes[ch]
	else:
	note = default_hint

	items.append(f"- {ch}: {note}")

	if not items:
	items.append("- —: " + default_hint)

	# Limit to prevent overwhelming output
	return items[:20]

	def _build_enhanced_meaning_box(self, latin_text, groq_detail, processed_result):
	"""Build comprehensive meaning box with TRIDIS medieval analysis"""
	char_analysis = processed_result.get("char_analysis", {})
	validation = processed_result.get("validation", {})

	# Enhanced introduction with TRIDIS context
	processing_method = validation.get("ocr_method", "Unknown OCR")
	text_type = validation.get("text_type", "unknown")
	confidence = validation.get("confidence_level", "Unknown")

	intro_lines = [
	f"Text processed using {processing_method} with confidence level: {confidence}.",
	]

	if self.tridis_available:
	intro_lines.extend([
	"TRIDIS HTR model trained on 245,000 lines of medieval manuscripts (13th-16th centuries).",
	"Specializes in Latin, Old French, Old Spanish documentary texts with automatic abbreviation expansion."
	])

	# Medieval features summary
	medieval_features = []
	expansion_count = char_analysis.get("expansion_count", 0)
	if expansion_count > 0:
	medieval_features.append(f"{expansion_count} abbreviation expansions")

	correction_count = char_analysis.get("correction_markers", 0)
	if correction_count > 0:
	medieval_features.append(f"{correction_count} scribal corrections")

	entity_count = char_analysis.get("entity_count", 0)
	if entity_count > 0:
	medieval_features.append(f"{entity_count} named entities")

	if medieval_features:
	intro_lines.append(f"Medieval features detected: {', '.join(medieval_features)}.")

	# Key terms for frequent list
	expansions = char_analysis.get("abbreviation_expansions", [])
	entities = char_analysis.get("capitalized_entities", [])
	frequent_terms = expansions + entities

	if not frequent_terms:
	frequent_terms = list(set(w for w in latin_text.split() if len(w) > 2))[:10]

	# Enhanced analysis points
	points = []

	if self.tridis_available:
	points.extend([
	"• TRIDIS HTR provides semi-diplomatic transcription following scholarly editorial standards.",
	"• Automatic abbreviation expansion: dom̃→domini, facimꝰ→facimus, ꝓ→pro, ⁊→et.",
	"• Named entity capitalization and punctuation normalization applied."
	])
	else:
	points.append("• Tesseract OCR provides basic Latin character recognition with limited medieval symbol support.")

	if correction_count > 0:
	points.append(f"• [{correction_count}] scribal corrections/cancellations indicate active manuscript editing process.")

	if expansion_count > 0:
	expansions_list = ", ".join(char_analysis.get("abbreviation_expansions", [])[:5])
	points.append(f"• Expanded abbreviations suggest legal/administrative document: {expansions_list}.")

	if validation.get("medieval_features_detected", False):
	manuscript_period = validation.get("manuscript_period", "Medieval")
	points.append(f"• {manuscript_period} characteristics indicate documentary manuscript tradition.")

	if groq_detail and isinstance(groq_detail, str) and groq_detail.strip():
	points.append(f"• Historical analysis: {groq_detail.strip()}")

	return {
	"title": "Medieval Latin manuscript analysis:",
	"intro_lines": intro_lines,
	"frequent_label": "Key medieval terms identified",
	"frequent": frequent_terms[:12],
	"points": points
	}

	def generate_story(self, processed_result):
	"""Generate creative story with medieval manuscript context"""
	latin_text = processed_result.get("text", "")

	if not self.groq_client.is_available():
	return "Groq client unavailable, cannot generate historical narrative."

	# Analyze text features for story context
	char_analysis = processed_result.get("char_analysis", {})
	validation = processed_result.get("validation", {})

	has_expansions = char_analysis.get("expansion_count", 0) > 0
	has_corrections = char_analysis.get("correction_markers", 0) > 0
	has_entities = char_analysis.get("entity_count", 0) > 0
	text_type = validation.get("text_type", "unknown")
	used_tridis = validation.get("tridis_used", False)

	# Choose appropriate narrative style based on detected features
	if "documentary" in text_type or has_expansions:
	styles = [
	"as a legal charter discovered in monastic archives",
	"as an administrative record from a medieval royal court",
	"as a property deed found in cathedral scriptorium",
	"as a guild register from a medieval trading city",
	"as a tax record from a 14th-century monastery"
	]
	elif has_corrections or has_entities:
	styles = [
	"as a monk's working manuscript with personal annotations",
	"as a scholar's commentary on ancient texts",
	"as a chronicle being revised by a medieval historian",
	"as a theological treatise with scribal corrections",
	"as a copy of classical texts with medieval glosses"
	]
	else:
	styles = [
	"as a sacred text illuminated by medieval scribes",
	"as a philosophical work from a cathedral school",
	"as a liturgical manuscript from a monastic library",
	"as a medical treatise translated in medieval Spain",
	"as an astronomical text from a medieval university"
	]

	import random
	chosen_style = random.choice(styles)
	seed = random.randint(1000, 9999)

	# Craft historically-informed prompt
	processing_context = "deciphered using advanced medieval manuscript AI" if used_tridis else "carefully transcribed from the original"
	time_period = "13th-16th centuries" if (has_expansions or has_corrections) else "medieval period"

	prompt = (
	f"This Latin manuscript text was {processing_context}: {latin_text}\n\n"
	f"Historical context: The text appears to be from the {time_period}, "
	f"{'with expanded abbreviations and scribal corrections typical of documentary manuscripts' if has_expansions else 'showing characteristics of medieval scholarly tradition'}.\n\n"
	f"Create a vivid, historically accurate narrative (250+ words) set in medieval Europe, "
	f"telling the story of this manuscript's creation and significance. "
	f"Write {chosen_style}.\n\n"
	f"Include: Medieval setting, authentic historical details, multiple characters, "
	f"the process of manuscript creation, and the document's importance to its community.\n"
	f"Narrative seed: {seed}"
	)

	system_prompt = (
	"You are a medieval historian and storyteller specializing in manuscript culture, "
	"paleography, and daily life in 13th-16th century Europe. Create authentic, "
	"engaging narratives that reflect accurate historical knowledge of medieval "
	"scriptoriums, legal practices, and scholarly traditions."
	)

	story = self.groq_client.generate_response(
	system_prompt=system_prompt,
	user_prompt=prompt
	)

	if not story or is_gibberish(story):
	return "Failed to generate historical narrative; medieval story creation unavailable."

	return story