Spaces:

Norelad
/

coptic-translation-interface

Sleeping

Rogaton

Fix Gradio 6.0 compatibility issues

84b26b8 about 2 months ago

23.7 kB

	#!/usr/bin/env python3
	"""
	Coptic Translation Interface - Hugging Face Space
	Supports Coptic↔English translation using megalaa models
	"""

	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
	import torch

	# Coptic alphabet for virtual keyboard.
	# 31 letters, in the order the keyboard buttons are created (the UI lays
	# them out in rows of 8). 'ⲋ' has an entry in COPTIC_TO_GREEK but is not
	# offered on the keyboard.
	COPTIC_LETTERS = [
	'ⲁ', 'ⲃ', 'ⲅ', 'ⲇ', 'ⲉ', 'ⲍ', 'ⲏ', 'ⲑ', 'ⲓ', 'ⲕ', 'ⲗ', 'ⲙ',
	'ⲛ', 'ⲝ', 'ⲟ', 'ⲡ', 'ⲣ', 'ⲥ', 'ⲧ', 'ⲩ', 'ⲫ', 'ⲭ', 'ⲯ', 'ⲱ',
	'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ'
	]

	# Coptic-Greek character mappings (from handler.py)
	# Keys are single Coptic characters; values are the transcription symbols
	# used by greekify(). All values are one character long except "ti" (for ϯ),
	# which is why degreekify() has to look at two characters at a time.
	COPTIC_TO_GREEK = {
	"ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε", "ⲋ": "ϛ",
	"ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι", "ⲕ": "κ", "ⲗ": "λ",
	"ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ", "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ",
	"ⲥ": "σ", "ⲧ": "τ", "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ", "ⲱ": "ω",
	"ϣ": "ʃ", "ϥ": "f", "ϧ": "x", "ϩ": "h", "ϫ": "ɟ", "ϭ": "c", "ϯ": "ti"
	}

	# Reverse lookup (transcription symbol -> Coptic character). The values
	# above are all distinct, so inverting the dict drops no entries.
	GREEK_TO_COPTIC = {v: k for k, v in COPTIC_TO_GREEK.items()}

	def greekify(coptic_text):
	    """Transliterate Coptic Unicode text into the Greek-based transcription.

	    Each character is lower-cased on its own and then looked up in
	    ``COPTIC_TO_GREEK``. A character without a mapping is kept in its
	    lower-cased form, so spaces and punctuation pass through unchanged.
	    """
	    lowered_chars = (symbol.lower() for symbol in coptic_text)
	    return "".join(COPTIC_TO_GREEK.get(symbol, symbol) for symbol in lowered_chars)

	def degreekify(greek_text):
	    """Map a Greek-based transcription back to Coptic Unicode.

	    The text is scanned left to right. The two-character sequence "ti"
	    (matched case-insensitively) is replaced as a unit; every other
	    character is looked up on its own, exactly as written, in
	    ``GREEK_TO_COPTIC``. Anything without a mapping is copied through.
	    """
	    pieces = []
	    pos = 0
	    total = len(greek_text)
	    while pos < total:
	        pair = greek_text[pos:pos + 2]
	        # A full two-character window is needed before testing for "ti".
	        if len(pair) == 2 and pair.lower() == 'ti':
	            pieces.append(GREEK_TO_COPTIC.get('ti', pair))
	            pos += 2
	            continue
	        symbol = greek_text[pos]
	        pieces.append(GREEK_TO_COPTIC.get(symbol, symbol))
	        pos += 1
	    return ''.join(pieces)

	# Model caching
	# Each slot stays None until the matching load_*() function is first called,
	# after which it holds a (tokenizer, model) tuple that later calls reuse.
	coptic_to_english_model = None
	english_to_coptic_model = None
	# Chosen once at import time: use the GPU when torch reports CUDA is
	# available, otherwise the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	def load_coptic_to_english():
	    """Return the Coptic → English ``(tokenizer, model)`` pair, loading it once.

	    The first call loads the tokenizer and model from
	    ``megalaa/coptic-english-translator``, moves the model to ``device`` and
	    stores the pair in the module-level ``coptic_to_english_model``. Later
	    calls return that stored pair.
	    """
	    global coptic_to_english_model
	    if coptic_to_english_model is not None:
	        return coptic_to_english_model

	    repo_id = "megalaa/coptic-english-translator"
	    loaded_tokenizer = AutoTokenizer.from_pretrained(repo_id)
	    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(repo_id)
	    loaded_model = loaded_model.to(device)
	    coptic_to_english_model = (loaded_tokenizer, loaded_model)
	    return coptic_to_english_model

	def load_english_to_coptic():
	    """Return the English → Coptic ``(tokenizer, model)`` pair, loading it once.

	    The first call loads the tokenizer and model from
	    ``megalaa/english-coptic-translator``, moves the model to ``device`` and
	    stores the pair in the module-level ``english_to_coptic_model``. Later
	    calls return that stored pair.
	    """
	    global english_to_coptic_model
	    if english_to_coptic_model is not None:
	        return english_to_coptic_model

	    repo_id = "megalaa/english-coptic-translator"
	    loaded_tokenizer = AutoTokenizer.from_pretrained(repo_id)
	    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(repo_id)
	    loaded_model = loaded_model.to(device)
	    english_to_coptic_model = (loaded_tokenizer, loaded_model)
	    return english_to_coptic_model

	def translate_coptic_to_english(text, dialect):
	    """Translate Coptic text to English and return the result as a string.

	    Args:
	        text: Coptic input. Empty, whitespace-only or ``None`` input yields a
	            prompt message instead of a translation.
	        dialect: ``"Bohairic"`` selects the Bohairic tag; any other value is
	            treated as Sahidic.

	    Returns:
	        The decoded translation, or a ``"Translation error: ..."`` message if
	        anything raises while loading the model or generating.
	    """
	    if not (text and text.strip()):
	        return "Please enter Coptic text to translate."

	    try:
	        tokenizer, model = load_coptic_to_english()

	        # The model input is the Greek transcription prefixed by a dialect
	        # tag (the tag convention comes from handler.py).
	        transcribed = greekify(text)
	        dialect_tag = "б " if dialect == "Bohairic" else "з "
	        model_input = dialect_tag + transcribed

	        encoded = tokenizer(model_input, return_tensors="pt", padding=True).to(device)
	        generated = model.generate(
	            **encoded,
	            max_new_tokens=128,
	            num_beams=5,
	            early_stopping=True,
	        )

	        # Only the first returned sequence is decoded.
	        return tokenizer.decode(generated[0], skip_special_tokens=True)

	    except Exception as err:
	        # UI boundary: report the failure in the output box instead of raising.
	        return f"Translation error: {err!s}"

	def translate_english_to_coptic(text, dialect):
	    """Translate English text to Coptic and return the result as a string.

	    Args:
	        text: English input. Empty, whitespace-only or ``None`` input yields
	            a prompt message instead of a translation.
	        dialect: ``"Bohairic"`` selects the Bohairic tag; any other value is
	            treated as Sahidic.

	    Returns:
	        The model output converted back to Coptic Unicode with
	        ``degreekify``, or a ``"Translation error: ..."`` message if anything
	        raises while loading the model or generating.
	    """
	    if not (text and text.strip()):
	        return "Please enter English text to translate."

	    try:
	        tokenizer, model = load_english_to_coptic()

	        # The dialect tag is prepended to the raw English text.
	        dialect_tag = "б " if dialect == "Bohairic" else "з "
	        model_input = dialect_tag + text

	        encoded = tokenizer(model_input, return_tensors="pt", padding=True).to(device)
	        generated = model.generate(
	            **encoded,
	            max_new_tokens=128,
	            num_beams=5,
	            early_stopping=True,
	        )

	        # The decoded output is a Greek-based transcription; map it back to
	        # Coptic characters before returning.
	        transcription = tokenizer.decode(generated[0], skip_special_tokens=True)
	        return degreekify(transcription)

	    except Exception as err:
	        # UI boundary: report the failure in the output box instead of raising.
	        return f"Translation error: {err!s}"

	def add_letter(current_text, letter):
	    """Return ``current_text`` with ``letter`` appended.

	    A falsy ``current_text`` (empty string or ``None``) is treated as empty,
	    so the result is just ``letter``.
	    """
	    existing = current_text or ""
	    return existing + letter

	def add_space(current_text):
	    """Return ``current_text`` with one space appended.

	    A falsy ``current_text`` (empty string or ``None``) yields a single space.
	    """
	    existing = current_text or ""
	    return existing + " "

	def backspace(current_text):
	    """Return ``current_text`` without its last character.

	    A falsy ``current_text`` (empty string or ``None``) yields ``""``.
	    """
	    if not current_text:
	        return ""
	    return current_text[:-1]

	def clear_text():
	    """Return the empty string that resets the Coptic input box."""
	    cleared = ""
	    return cleared

	# Load comprehensive test corpus
	import json
	from pathlib import Path

	def load_test_corpus():
	    """Load ``coptic_test_corpus.json`` from the directory of this file.

	    Returns:
	        The parsed JSON content, or ``None`` when the file does not exist.
	    """
	    corpus_path = Path(__file__).parent / "coptic_test_corpus.json"
	    if not corpus_path.exists():
	        return None
	    with corpus_path.open('r', encoding='utf-8') as corpus_file:
	        return json.load(corpus_file)

	# Example texts organized by category
	# Every entry is a [text, dialect] pair, in the same order as the
	# inputs=[cop_input, cop_dialect] lists passed to gr.Examples in the UI.
	# The trailing comments are the author's English glosses.
	# SAHIDIC EXAMPLES
	COPTIC_EXAMPLES_SIMPLE = [
	["ⲁⲩⲱ ⲁϥⲙⲟⲩⲧⲉ ⲉⲣⲟϥ", "Sahidic"], # and he called him
	["ⲁⲛⲟⲕ ⲡⲉ ⲡⲛⲟⲩⲧⲉ ⲙⲡⲉⲕⲉⲓⲱⲧ", "Sahidic"], # I am the God of your father
	["ⲙⲡⲣⲣ ϩⲟⲧⲉ", "Sahidic"], # Do not be afraid
	["ⲡϫⲟⲉⲓⲥ ⲡⲉ ⲡⲁⲛⲟⲩⲧⲉ", "Sahidic"], # The Lord is my God
	["ⲁϥⲃⲱⲕ ⲉϩⲣⲁⲓ ⲉⲡⲉⲣⲡⲉ", "Sahidic"], # he went up to the temple
	]

	COPTIC_EXAMPLES_COMPLEX = [
	["ⲁⲩⲱ ⲛⲧⲉⲣⲉϥⲛⲁⲩ ⲉⲡⲙⲏⲏϣⲉ ⲁϥϣⲡϩⲧⲏϥ ⲉϩⲣⲁⲓ ⲉϫⲱⲟⲩ", "Sahidic"], # when he saw the crowd
	["ⲉϣⲱⲡⲉ ⲇⲉ ⲁⲩⲛⲁⲩ ⲉⲣⲟϥ ⲉϥⲙⲟⲟϣⲉ ϩⲓϫⲛ ⲧⲉⲑⲁⲗⲁⲥⲥⲁ ⲁⲩϣⲧⲟⲣⲧⲣ", "Sahidic"], # when they saw him walking
	["ⲁⲓⲉⲓ ⲅⲁⲣ ⲉⲙⲟⲩⲧⲉ ⲁⲛ ⲉⲛⲇⲓⲕⲁⲓⲟⲥ ⲁⲗⲗⲁ ⲛⲣⲉϥⲣⲛⲟⲃⲉ", "Sahidic"], # I came not to call the righteous
	]

	COPTIC_EXAMPLES_TEXTS = [
	["ⲛⲉⲩⲛⲟⲩⲙⲏⲏϣⲉ ⲇⲉ ⲛϣⲱⲛⲉ ⲉⲩⲛⲕⲟⲧⲕ ϩⲙ ⲡⲙⲁ ⲉⲧⲙⲙⲁⲩ· ⲛϩⲁⲛⲃⲗⲗⲉ ⲙⲛ ⲛϩⲁⲛϭⲁⲗⲉ ⲙⲛ ⲛϣⲟⲩⲱⲟⲩ·", "Sahidic"], # Healing at the pool
	["ⲉⲓⲥ ⲡⲉⲧϫⲟ ⲁϥⲉⲓ ⲉⲃⲟⲗ ⲉϫⲟ· ⲁⲩⲱ ⲛⲧⲉⲣⲉϥϫⲟ ϩⲟⲓⲛⲉ ⲙⲉⲛ ⲁⲩϩⲉ ϩⲁⲧⲏ ⲧⲉϩⲓⲏ·", "Sahidic"], # The Sower parable
	]

	# BOHAIRIC EXAMPLES
	# Same [text, dialect] layout as the Sahidic lists; the dialect column is
	# "Bohairic" so selecting an example also switches the dialect radio.
	BOHAIRIC_EXAMPLES_SIMPLE = [
	["ⲟⲩⲟϩ ⲁϥⲙⲟⲩϯ ⲉⲣⲟϥ", "Bohairic"], # and he called him
	["ⲁⲛⲟⲕ ⲡⲉ ⲫϯ ⲛⲧⲉ ⲡⲉⲕⲓⲱⲧ", "Bohairic"], # I am the God of your father
	["ⲙⲡⲉⲣⲉⲣϩⲟϯ", "Bohairic"], # Do not be afraid
	["ⲡϭⲟⲓⲥ ⲡⲉ ⲡⲁⲛⲟⲩϯ", "Bohairic"], # The Lord is my God
	["ⲁϥϣⲉⲛⲁϥ ⲉⲡϣⲱⲓ ⲉⲡⲓⲉⲣⲫⲉⲓ", "Bohairic"], # he went up to the temple
	]

	BOHAIRIC_EXAMPLES_COMPLEX = [
	["ⲟⲩⲟϩ ⲉⲧⲁϥⲛⲁⲩ ⲉⲡⲓⲙⲏϣ ⲁϥϣⲉⲛϩⲏⲧ ϧⲁⲣⲱⲟⲩ", "Bohairic"], # when he saw the crowd
	["ⲡϭⲟⲓⲥ ⲡⲉⲧⲁⲙⲟⲛⲓ", "Bohairic"], # The Lord is my shepherd (Psalm 23:1)
	]

	BOHAIRIC_EXAMPLES_TEXTS = [
	["ⲛⲉ ⲟⲩⲟⲛ ⲟⲩⲙⲏϣ ⲛϣⲱⲛⲓ ⲉⲩⲉⲛⲕⲟⲧ ϧⲉⲛ ⲡⲓⲙⲁ ⲉⲧⲉⲙⲙⲁⲩ· ϩⲁⲛⲃⲉⲗⲗⲉⲩ ⲛⲉⲙ ϩⲁⲛϭⲁⲗⲉⲩ ⲛⲉⲙ ϩⲁⲛϣⲁⲩⲟⲩⲱⲟⲩ·", "Bohairic"], # Healing at the pool (Bohairic)
	]

	# English example inputs for the English → Coptic tab. Each entry is
	# [text, target dialect]; every example here targets Sahidic.
	ENGLISH_EXAMPLES = [
	["The Lord is good", "Sahidic"],
	["I am a teacher", "Sahidic"],
	["We give thanks to God", "Sahidic"],
	["Do not be afraid", "Sahidic"],
	["He went to the house", "Sahidic"],
	]

	# Create Gradio Interface
	# `demo` is the top-level Blocks app; it is launched from the __main__ guard
	# at the end of the file. The first component is the page header below.
	with gr.Blocks() as demo:
	gr.Markdown("""
	# 🔮 Coptic Translation Interface

	Translate between Coptic and English using specialized models from [megalaa](https://huggingface.co/megalaa):
	- Coptic → English: `megalaa/coptic-english-translator`
	- English → Coptic: `megalaa/english-coptic-translator`

	Based on neural machine translation models trained on Coptic-English parallel corpus.
	""")

	# Three tabs follow: Coptic → English, English → Coptic, Dependency Analysis.
	with gr.Tabs():
	# Tab 1: Coptic → English
	with gr.TabItem("Coptic → English"):
	gr.Markdown("### Translate Coptic text to English")

	with gr.Row():
	with gr.Column(scale=1):
	# Editable source box; the virtual keyboard below writes into it as well.
	cop_input = gr.Textbox(
	label="Coptic Text",
	placeholder="Enter Coptic text or use the virtual keyboard below...",
	lines=8,
	max_lines=15
	)

	# The selected value is passed to translate_coptic_to_english as `dialect`.
	cop_dialect = gr.Radio(
	choices=["Sahidic", "Bohairic"],
	value="Sahidic",
	label="Coptic Dialect"
	)

	# Virtual Coptic Keyboard
	with gr.Group():
	gr.Markdown("Virtual Coptic Keyboard")

	# Create keyboard in rows of 8
	# (31 letters: three rows of 8 followed by one row of 7)
	for i in range(0, len(COPTIC_LETTERS), 8):
	with gr.Row():
	for letter in COPTIC_LETTERS[i:i+8]:
	btn = gr.Button(letter, size="sm", scale=1)
	btn.click(
	# `l=letter` binds the current letter as a default argument, so each
	# button keeps its own letter instead of the loop's final value.
	fn=lambda current, l=letter: add_letter(current, l),
	inputs=[cop_input],
	outputs=[cop_input]
	)

	with gr.Row():
	space_btn = gr.Button("Space", size="sm", scale=2)
	back_btn = gr.Button("⌫ Backspace", size="sm", scale=2)
	clear_btn = gr.Button("Clear", size="sm", scale=1)

	# Editing keys: each handler takes the current textbox value and returns
	# the new one; clear_text takes no inputs.
	space_btn.click(fn=add_space, inputs=[cop_input], outputs=[cop_input])
	back_btn.click(fn=backspace, inputs=[cop_input], outputs=[cop_input])
	clear_btn.click(fn=clear_text, outputs=[cop_input])

	cop_translate_btn = gr.Button("🔄 Translate to English", variant="primary", size="lg")

	with gr.Column(scale=1):
	# Read-only box that receives the translation (or an error message).
	cop_output = gr.Textbox(
	label="English Translation",
	lines=8,
	max_lines=15,
	interactive=False
	)

	# Six example tables, three per dialect. All share the same inputs, output
	# and handler, and none are pre-computed (cache_examples=False).
	# NOTE(review): presumably clicking an example only fills the inputs and the
	# user still has to press Translate — confirm against the Gradio version in use.
	with gr.Accordion("📖 Example Texts", open=True):
	gr.Markdown("### Sahidic Dialect (Literary Standard)")

	gr.Markdown("Simple Sentences: Basic grammatical structures")
	gr.Examples(
	examples=COPTIC_EXAMPLES_SIMPLE,
	inputs=[cop_input, cop_dialect],
	outputs=cop_output,
	fn=translate_coptic_to_english,
	cache_examples=False,
	label="Sahidic Simple"
	)

	gr.Markdown("Complex Sentences: Multi-clause with subordination")
	gr.Examples(
	examples=COPTIC_EXAMPLES_COMPLEX,
	inputs=[cop_input, cop_dialect],
	outputs=cop_output,
	fn=translate_coptic_to_english,
	cache_examples=False,
	label="Sahidic Complex"
	)

	gr.Markdown("Full Texts: Connected discourse (paragraphs)")
	gr.Examples(
	examples=COPTIC_EXAMPLES_TEXTS,
	inputs=[cop_input, cop_dialect],
	outputs=cop_output,
	fn=translate_coptic_to_english,
	cache_examples=False,
	label="Sahidic Texts"
	)

	gr.Markdown("---")
	gr.Markdown("### Bohairic Dialect (Northern/Liturgical)")

	gr.Markdown("Simple Sentences: Basic grammatical structures")
	gr.Examples(
	examples=BOHAIRIC_EXAMPLES_SIMPLE,
	inputs=[cop_input, cop_dialect],
	outputs=cop_output,
	fn=translate_coptic_to_english,
	cache_examples=False,
	label="Bohairic Simple"
	)

	gr.Markdown("Complex Sentences: Multi-clause constructions")
	gr.Examples(
	examples=BOHAIRIC_EXAMPLES_COMPLEX,
	inputs=[cop_input, cop_dialect],
	outputs=cop_output,
	fn=translate_coptic_to_english,
	cache_examples=False,
	label="Bohairic Complex"
	)

	gr.Markdown("Full Texts: Connected discourse")
	gr.Examples(
	examples=BOHAIRIC_EXAMPLES_TEXTS,
	inputs=[cop_input, cop_dialect],
	outputs=cop_output,
	fn=translate_coptic_to_english,
	cache_examples=False,
	label="Bohairic Texts"
	)

	# Main action of this tab: translate the box content with the chosen dialect.
	cop_translate_btn.click(
	fn=translate_coptic_to_english,
	inputs=[cop_input, cop_dialect],
	outputs=cop_output
	)

	# Tab 2: English → Coptic
	# Same structure as the first tab but without a virtual keyboard.
	with gr.TabItem("English → Coptic"):
	gr.Markdown("### Translate English text to Coptic")

	with gr.Row():
	with gr.Column(scale=1):
	eng_input = gr.Textbox(
	label="English Text",
	placeholder="Enter English text...",
	lines=8,
	max_lines=15
	)

	# The selected value is passed to translate_english_to_coptic as `dialect`.
	eng_dialect = gr.Radio(
	choices=["Sahidic", "Bohairic"],
	value="Sahidic",
	label="Target Coptic Dialect"
	)

	eng_translate_btn = gr.Button("🔄 Translate to Coptic", variant="primary", size="lg")

	with gr.Column(scale=1):
	# Read-only box that receives the Coptic output (or an error message).
	eng_output = gr.Textbox(
	label="Coptic Translation",
	lines=8,
	max_lines=15,
	interactive=False
	)

	# Example rows are [text, dialect]; nothing is pre-computed (cache_examples=False).
	gr.Examples(
	examples=ENGLISH_EXAMPLES,
	inputs=[eng_input, eng_dialect],
	outputs=eng_output,
	fn=translate_english_to_coptic,
	cache_examples=False,
	label="📖 Example English Texts"
	)

	# Main action of this tab.
	eng_translate_btn.click(
	fn=translate_english_to_coptic,
	inputs=[eng_input, eng_dialect],
	outputs=eng_output
	)

	# Tab 3: Dependency Parsing (Neural-Symbolic)
	# Layout only; the handler (parse_coptic_text) and its button wiring are
	# defined right after the example tables.
	with gr.TabItem("📊 Dependency Analysis"):
	gr.Markdown("""
	### Neural-Symbolic Coptic Parser

	Hybrid architecture combining:
	- Neural: Stanza + DiaParser for dependency parsing
	- Symbolic: Prolog rules implementing Walter Till's grammar
	- Lexicon: Crum's Coptic Dictionary integration
	""")

	with gr.Row():
	with gr.Column(scale=1):
	parse_input = gr.Textbox(
	label="Coptic Text to Parse",
	placeholder="Enter Coptic text for grammatical analysis...",
	lines=6,
	max_lines=10
	)

	parse_btn = gr.Button("🔍 Parse & Validate", variant="primary", size="lg")

	with gr.Column(scale=1):
	# First of the three outputs filled by parse_coptic_text.
	parse_output = gr.Markdown(
	label="Dependency Parse Results",
	value="Parse results will appear here..."
	)

	# Second output: the Prolog validation report, collapsed by default.
	with gr.Accordion("Prolog Validation Results", open=False):
	prolog_output = gr.Markdown(
	value="Grammatical validation results will appear here..."
	)

	# Third output: path of the generated CoNLL-U file.
	# NOTE(review): the component is created with visible=False and the handler
	# returns only a file path — confirm the download actually becomes visible.
	with gr.Accordion("Download Options", open=False):
	conllu_download = gr.File(
	label="Download CoNLL-U Format",
	visible=False
	)

	# These example tables pass no fn/outputs; each row is wrapped in a
	# one-element list because gr.Examples is given a single input component.
	with gr.Accordion("📖 Example Texts for Parsing", open=True):
	gr.Markdown("Simple Structures - Test basic dependency relations")
	simple_parse_examples = [
	"ⲁⲩⲱ ⲁϥⲙⲟⲩⲧⲉ ⲉⲣⲟϥ", # and he called him
	"ⲁⲛⲟⲕ ⲡⲉ ⲡⲛⲟⲩⲧⲉ ⲙⲡⲉⲕⲉⲓⲱⲧ", # Tripartite nominal
	"ⲡϫⲟⲉⲓⲥ ⲡⲉ ⲡⲁⲛⲟⲩⲧⲉ", # The Lord is my God
	]
	gr.Examples(
	examples=[[ex] for ex in simple_parse_examples],
	inputs=parse_input,
	label="Simple"
	)

	gr.Markdown("Complex Structures - Test subordination and coordination")
	complex_parse_examples = [
	"ⲁⲩⲱ ⲛⲧⲉⲣⲉϥⲛⲁⲩ ⲉⲡⲙⲏⲏϣⲉ ⲁϥϣⲡϩⲧⲏϥ ⲉϩⲣⲁⲓ ⲉϫⲱⲟⲩ", # Temporal clause
	"ⲁⲓⲉⲓ ⲅⲁⲣ ⲉⲙⲟⲩⲧⲉ ⲁⲛ ⲉⲛⲇⲓⲕⲁⲓⲟⲥ ⲁⲗⲗⲁ ⲛⲣⲉϥⲣⲛⲟⲃⲉ", # Purpose with negation
	]
	gr.Examples(
	examples=[[ex] for ex in complex_parse_examples],
	inputs=parse_input,
	label="Complex"
	)

	gr.Markdown("Full Texts - Test discourse-level parsing")
	text_parse_examples = [
	"ⲛⲉⲩⲛⲟⲩⲙⲏⲏϣⲉ ⲇⲉ ⲛϣⲱⲛⲉ ⲉⲩⲛⲕⲟⲧⲕ ϩⲙ ⲡⲙⲁ ⲉⲧⲙⲙⲁⲩ· ⲛϩⲁⲛⲃⲗⲗⲉ ⲙⲛ ⲛϩⲁⲛϭⲁⲗⲉ ⲙⲛ ⲛϣⲟⲩⲱⲟⲩ·",
	]
	gr.Examples(
	examples=[[ex] for ex in text_parse_examples],
	inputs=parse_input,
	label="Texts"
	)

	def _format_prolog_validation(validation):
	    """Render the Prolog validation mapping as a Markdown report string."""
	    detected = validation.get('patterns_detected')
	    warning_list = validation.get('warnings')
	    parts = ["## 🔍 Prolog Validation (Walter Till Grammar)\n\n"]

	    if detected:
	        parts.append("### ✅ Detected Grammatical Patterns\n\n")
	        for pattern in detected:
	            # Only dict entries flagged as tripartite get the detailed
	            # layout; everything else is printed as-is.
	            if isinstance(pattern, dict) and pattern.get('is_tripartite'):
	                parts.append(f"- Tripartite Sentence: {pattern.get('description', '')}\n")
	                parts.append(f" ```\n {pattern.get('pattern', '')}\n ```\n")
	            else:
	                parts.append(f"- {pattern}\n")

	    if warning_list:
	        parts.append("\n### ⚠️ Grammatical Warnings\n\n")
	        parts.extend(f"- {warning}\n" for warning in warning_list)

	    if not warning_list and not detected:
	        parts.append("✓ No grammatical issues detected\n")

	    return "".join(parts)

	def parse_coptic_text(text):
	    """Parse Coptic text and return the three values shown in the parsing tab.

	    Args:
	        text: Coptic text to analyse. Empty, whitespace-only or ``None``
	            input returns a prompt message without touching the parser.

	    Returns:
	        A ``(main_markdown, prolog_markdown, conllu_path)`` tuple. On any
	        failure the first element carries the message, the second is ``""``
	        and the third is ``None``.
	    """
	    if not text or not text.strip():
	        return "Please enter Coptic text to parse.", "", None

	    try:
	        import os
	        import tempfile

	        from coptic_parser_core import CopticParserCore

	        # NOTE(review): a new CopticParserCore is built and load_parser() is
	        # called on every request; any caching would have to happen inside
	        # coptic_parser_core itself — confirm.
	        parser = CopticParserCore()
	        parser.load_parser()

	        result = parser.parse_text(text)
	        if not result:
	            return "❌ Parsing failed. Please check input.", "", None

	        # Built line by line so the Markdown headings start at column 0
	        # regardless of how this function is indented.
	        main_output = "\n".join([
	            "",
	            "## Parse Results",
	            "",
	            f"Total Sentences: {result['total_sentences']}",
	            f"Total Tokens: {result['total_tokens']}",
	            "",
	            "### Dependency Structure",
	            "",
	            f"{parser.format_table(result)}",
	            "",
	        ])

	        if 'prolog_validation' in result and result['prolog_validation']:
	            prolog_output_text = _format_prolog_validation(result['prolog_validation'])
	        else:
	            prolog_output_text = "ℹ️ Prolog validation not available (requires SWI-Prolog)"

	        # Write the export into a fresh directory per request. A single fixed
	        # path in /tmp would let concurrent requests overwrite each other's
	        # download. The file name itself stays "coptic_parse.conllu".
	        # The directories are not cleaned up here.
	        conllu_content = parser.format_conllu(result)
	        export_dir = tempfile.mkdtemp(prefix="coptic_parse_")
	        conllu_path = os.path.join(export_dir, "coptic_parse.conllu")
	        with open(conllu_path, 'w', encoding='utf-8') as f:
	            f.write(conllu_content)

	        # NOTE(review): the File component receiving conllu_path is created
	        # with visible=False; confirm the download becomes visible in the UI.
	        return main_output, prolog_output_text, conllu_path

	    except Exception as e:
	        # UI boundary: show the failure in the results pane instead of raising.
	        return f"❌ Error: {str(e)}", "", None

	# parse_coptic_text returns three values, mapped in order onto the parse
	# pane, the Prolog pane and the CoNLL-U file component.
	parse_btn.click(
	fn=parse_coptic_text,
	inputs=parse_input,
	outputs=[parse_output, prolog_output, conllu_download]
	)

	# Static "About" footer text.
	gr.Markdown("""
	---
	### About This Research Interface

	Translation Models:
	- [megalaa/coptic-english-translator](https://huggingface.co/megalaa/coptic-english-translator) & [megalaa/english-coptic-translator](https://huggingface.co/megalaa/english-coptic-translator)
	- Based on work by Enis & Megalaa (2024)

	Dependency Parser (Neural-Symbolic Hybrid):
	- Neural: Stanza NLP pipeline + DiaParser for Coptic
	- Symbolic: Prolog implementation of Walter Till's Coptic grammar
	- Lexicon: Integration with Crum's Coptic Dictionary
	- Error Detection: Prolog validation catches neural parser hallucinations

	Research Features:
	- CoNLL-U format export for corpus analysis
	- Grammatical pattern detection (tripartite sentences, etc.)
	- Dialect-aware processing (Sahidic/Bohairic)
	""")

	if __name__ == "__main__":
	    # Launch only when run as a script: listen on every interface on port
	    # 7860, with Gradio's show_error option switched on.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "show_error": True,
	    }
	    demo.launch(**launch_options)