Spaces:

al1808th
/

macronizer

Running

App Files Files Community

macronizer / app.py

al1808th

Switch UI palette from brown to black theme

aaf8c40 7 days ago

raw

history blame contribute delete

20.3 kB

	import html
	import re
	from typing import Dict, List, Tuple

	import gradio as gr
	import torch
	import torch.nn.functional as F
	from transformers import AutoModelForTokenClassification, AutoTokenizer

	from grc_utils import lower_grc, normalize_word, heavy, vowel, only_bases

	from syllabify import syllabify_joined
	from preprocess import process_word

	# UI label of the model that is pre-selected and used as the fallback.
	DEFAULT_MODEL_LABEL = "SyllaMoBert (current)"

	# Human-readable model labels (shown in the UI) mapped to the model id that
	# is handed to ``from_pretrained``.
	MODEL_OPTIONS: Dict[str, str] = {
	    DEFAULT_MODEL_LABEL: "Ericu950/SyllaMoBert-grc-macronizer-v1",
	    "Macronizer Mini": "Ericu950/macronizer_mini",
	}
	DEFAULT_MODEL_ID = MODEL_OPTIONS[DEFAULT_MODEL_LABEL]

	# Maximum tokenized sequence length; the tokenizer truncates beyond this.
	MAX_LENGTH = 512

	# Use the GPU when torch reports one, otherwise fall back to the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

	# Loaded (tokenizer, model, id2label) bundles keyed by model id.
	_MODEL_CACHE: Dict[str, Tuple[AutoTokenizer, AutoModelForTokenClassification, Dict[int, str]]] = {}


	def _get_model_bundle(model_id: str) -> Tuple[AutoTokenizer, AutoModelForTokenClassification, Dict[int, str]]:
	    """Return the ``(tokenizer, model, id2label)`` bundle for *model_id*.

	    The first request for a model id loads the tokenizer and the
	    token-classification model with ``from_pretrained``, moves the model to
	    the module-level ``device``, puts it in eval mode and stores the bundle
	    in ``_MODEL_CACHE``. Later requests return the stored bundle.
	    """
	    bundle = _MODEL_CACHE.get(model_id)
	    if bundle is not None:
	        return bundle

	    tokenizer = AutoTokenizer.from_pretrained(model_id)
	    model = AutoModelForTokenClassification.from_pretrained(model_id)
	    model.to(device)
	    model.eval()

	    bundle = (tokenizer, model, model.config.id2label)
	    _MODEL_CACHE[model_id] = bundle
	    return bundle


	def preprocess_greek_line(line: str) -> List[str]:
	    """Turn one raw input line into a flat list of tokens.

	    The line goes through ``normalize_word`` and ``lower_grc``, is split on
	    whitespace, and every word is expanded by ``process_word``. The per-word
	    token lists are concatenated in word order.
	    """
	    flat_tokens: List[str] = []
	    for word in lower_grc(normalize_word(line)).split():
	        flat_tokens.extend(process_word(word))
	    return flat_tokens


	def _normalize_label(raw_label: str) -> int:
	text = raw_label.lower()
	if "long" in text:
	return 1
	if "short" in text:
	return 2
	if text.endswith("_1") or text == "1":
	return 1
	if text.endswith("_2") or text == "2":
	return 2
	return 0


	def preprocess_and_syllabify(line: str):
	    """Tokenize *line* with ``preprocess_greek_line`` and return what
	    ``syllabify_joined`` produces for those tokens."""
	    return syllabify_joined(preprocess_greek_line(line))


	def classify_line(line: str, model_id: str):
	    """Classify each syllable of *line* with the model named by *model_id*.

	    Returns a list of ``(syllable, label)`` pairs, where ``label`` is the
	    code returned by ``_normalize_label`` for the predicted class name. A
	    line that yields no syllables returns ``[]`` without loading a model.
	    """
	    syllables = preprocess_and_syllabify(line)
	    if not syllables:
	        return []

	    tokenizer, model, id2label = _get_model_bundle(model_id)

	    batch = tokenizer(
	        syllables,
	        is_split_into_words=True,
	        return_tensors="pt",
	        truncation=True,
	        max_length=MAX_LENGTH,
	    )

	    # Token position -> index of the syllable it came from (None when the
	    # token has no source syllable).
	    token_to_syllable = batch.word_ids(batch_index=0)

	    # token_type_ids are removed before the forward pass.
	    # NOTE(review): presumably the model's forward() does not take them - confirm.
	    if "token_type_ids" in batch:
	        del batch["token_type_ids"]

	    inputs_on_device = {name: tensor.to(device) for name, tensor in batch.items()}

	    with torch.no_grad():
	        logits = model(**inputs_on_device).logits
	        class_probs = F.softmax(logits, dim=-1)
	        best_ids = torch.argmax(class_probs, dim=-1).squeeze(0).cpu().tolist()

	    syllable_count = len(syllables)
	    labelled = []
	    handled = set()

	    for position, syl_idx in enumerate(token_to_syllable):
	        # Only the first token of each syllable contributes a prediction.
	        if syl_idx is None or syl_idx in handled:
	            continue
	        if syl_idx >= syllable_count:
	            break

	        handled.add(syl_idx)
	        class_id = int(best_ids[position])
	        # Unknown class ids fall back to their numeric string.
	        label_text = str(id2label.get(class_id, str(class_id)))
	        labelled.append((syllables[syl_idx], _normalize_label(label_text)))

	    return labelled


	def _syllable_chip(syllable: str, label_id: int) -> str:
	escaped = html.escape(syllable)
	if label_id == 1:
	return f'<span class="chip long">{escaped}<small>long</small></span>'
	if label_id == 2:
	return f'<span class="chip short">{escaped}<small>short</small></span>'
	return f'<span class="chip clear">{escaped}</span>'


	def _mark_syllable_plain(syllable: str, label_id: int) -> str:
	if label_id not in (1, 2):
	return syllable

	marker = "_" if label_id == 1 else "^"
	chars = list(syllable)

	for i in range(len(chars) - 1, -1, -1):
	if vowel(chars[i]):
	return "".join(chars[: i + 1]) + marker + "".join(chars[i + 1 :])

	return syllable + marker


	def _to_final_sigma(text: str) -> str:
	# Step 3: in rendered output, only word-final sigmas become final-sigma.
	def _convert_word(token: str) -> str:
	if not token.strip():
	return token

	chars = list(token)
	last_greek_idx = -1
	for i, ch in enumerate(chars):
	if "\u0370" <= ch <= "\u03ff" or "\u1f00" <= ch <= "\u1fff":
	last_greek_idx = i

	if last_greek_idx != -1 and chars[last_greek_idx] == "σ":
	chars[last_greek_idx] = "ς"

	return "".join(chars)

	return "".join(_convert_word(tok) for tok in re.findall(r"\S+\|\s+", text))


	def _restore_expanded_word(marked_word: str, reference_word: str) -> str:
	    """Collapse expanded consonant pairs in *marked_word* and fix its sigma.

	    The pairs ``δσ``, ``κσ`` and ``πσ`` are replaced by ``ζ``, ``ξ`` and ``ψ``.
	    When the normalized, lower-cased *reference_word* contains ``ῥ``, the
	    first ``ρ`` of the result is replaced by ``ῥ``. The result is finally
	    passed through ``_to_final_sigma``.
	    """
	    collapsed = marked_word
	    for expanded, single in (("δσ", "ζ"), ("κσ", "ξ"), ("πσ", "ψ")):
	        collapsed = collapsed.replace(expanded, single)

	    if "ῥ" in lower_grc(normalize_word(reference_word)):
	        # Only the first rho is touched; a word without rho is left as is.
	        collapsed = collapsed.replace("ρ", "ῥ", 1)

	    return _to_final_sigma(collapsed)


	def _consume_word_alignment(
	aligned: List[Tuple[str, int]],
	start_idx: int,
	expected_syllables: List[str],
	) -> Tuple[List[Tuple[str, int]], int]:
	if start_idx >= len(aligned):
	return [], start_idx

	expected_bases = only_bases("".join(expected_syllables))
	if expected_bases:
	taken: List[Tuple[str, int]] = []
	i = start_idx
	while i < len(aligned):
	taken.append(aligned[i])
	current_bases = only_bases("".join(s for s, _ in taken))
	if current_bases == expected_bases:
	return taken, i + 1
	if len(current_bases) > len(expected_bases) and not current_bases.startswith(expected_bases):
	break
	i += 1

	fallback_count = len(expected_syllables)
	if fallback_count <= 0:
	return [], start_idx

	end_idx = min(len(aligned), start_idx + fallback_count)
	return aligned[start_idx:end_idx], end_idx


	def _render_plain_line_with_spacing(line: str, aligned: List[Tuple[str, int]]) -> str:
	# Step 1: normalize input final sigma to medial sigma for matching only.
	line_for_matching = line.replace("ς", "σ")
	parts = re.findall(r"\S+\|\s+", line)
	parts_for_matching = re.findall(r"\S+\|\s+", line_for_matching)
	out_parts: List[str] = []
	cursor = 0

	for part, part_for_matching in zip(parts, parts_for_matching):
	if part_for_matching.isspace():
	# Step 2: preserve original spacing exactly.
	out_parts.append(part_for_matching)
	continue

	normalized_word = lower_grc(normalize_word(part_for_matching)).replace("ς", "σ")
	expected_tokens = process_word(normalized_word)
	expected_syllables = syllabify_joined(expected_tokens)

	taken, cursor = _consume_word_alignment(aligned, cursor, expected_syllables)
	if not taken:
	out_parts.append(part_for_matching)
	continue

	marked = "".join(_mark_syllable_plain(syl, label) for syl, label in taken)
	restored = _restore_expanded_word(marked, part)
	out_parts.append(restored)

	if cursor < len(aligned):
	tail = "".join(_mark_syllable_plain(syl, label) for syl, label in aligned[cursor:])
	out_parts.append(_to_final_sigma(tail))

	return "".join(out_parts)


	def render_results(text: str, model_label: str):
	    """Classify every non-blank line of *text* and format the results.

	    Returns a pair ``(html_result, plain_export)``. The HTML part holds a
	    legend followed by one card per line with its syllable chips. The plain
	    part starts with a model header and then lists each source line and its
	    marked-up rendering. When *text* has no non-blank line, a placeholder
	    message and an empty export string are returned. An unknown
	    *model_label* falls back to ``DEFAULT_MODEL_ID``.
	    """
	    lines = [stripped for stripped in (raw.strip() for raw in text.splitlines()) if stripped]
	    if not lines:
	        return "<div class='empty'>Enter one or more Greek lines to classify syllables.</div>", ""

	    model_id = MODEL_OPTIONS.get(model_label, DEFAULT_MODEL_ID)

	    cards = []
	    # The export starts with the model header followed by a blank line.
	    export_rows = [f"Model: {model_label} ({model_id})", ""]

	    for idx, line in enumerate(lines, start=1):
	        aligned = classify_line(line, model_id)
	        plain_line = _render_plain_line_with_spacing(line, aligned)

	        chips = "".join(_syllable_chip(syl, label) for syl, label in aligned)
	        chips_html = chips or '<span class="chip clear">(no syllables found)</span>'
	        source_html = html.escape(line)

	        cards.append(
	            f"""
	<section class="card">
	<div class="line-number">Line {idx}</div>
	<div class="source">{source_html}</div>
	<div class="chips">{chips_html}</div>
	</section>
	"""
	        )

	        export_rows.append(f"Line {idx}: {line}")
	        if plain_line:
	            export_rows.append(f" {plain_line}")
	        else:
	            export_rows.append(" (no syllables found)")

	    legend = (
	        "<div class='legend'><span class='dot long'></span>Long"
	        "<span class='dot short'></span>Short"
	        "<span class='dot clear'></span>Unmarked</div>"
	    )
	    return legend + "".join(cards), "\n".join(export_rows)


	# Sample inputs offered through the "Try examples" widget below. Each entry
	# is one textbox value and may hold several newline-separated lines.
	examples = [
	"νεανίας ἀάατός ἐστιν καὶ καλός. τὰ παῖδες τὰ καλά\nκαλὰ μὲν ἠέξευ, καλὰ δ᾽ ἔτραφες, οὐράνιε Ζεῦ,",
	"Ἆρες, Ἄρες βροτολοιγὲ μιαιφόνε τειχεσιπλῆτα\nἈτρεΐδαι τε καὶ ἄλλοι ἐϋκνήμιδες Ἀχαιοί",
	"ἢ τυφλὸς ἤ τις σκνιπὸς ἢ λέγα βλέπων\nψάμμου θαλασσῶν ἢ σκνιπῶν Αἰγυπτίων",
	]


	# Stylesheet for the whole UI, passed to Gradio as ``css`` when the app is
	# launched. It defines the theme colour variables (default, system dark
	# preference, and the ``dark-mode`` class toggled by the page button), the
	# fixed dark-mode toggle, the input/output panels, the legend, the per-line
	# result cards and the long/short/clear syllable chips.
	# NOTE(review): ``.empty`` combines a translucent white background with the
	# light ``--ink`` text colour outside ``.dark-mode`` - check the contrast.
	CSS = """
	@import url('https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@500;600;700&family=Space+Grotesk:wght@400;500;700&display=swap');

	:root {
	--bg-start: #0b0b0d;
	--bg-end: #15151b;
	--ink: #f0f0f5;
	--long: #ff7868;
	--short: #66dbd8;
	--clear: #a0a0ab;
	--paper: rgba(22, 22, 28, 0.9);
	--chip-long-color: var(--long);
	--chip-short-color: var(--short);
	--chip-clear-color: #c9c9d3;
	--source-text: var(--ink);
	}

	@media (prefers-color-scheme: dark) {
	:root {
	--bg-start: #050506;
	--bg-end: #101015;
	--ink: #f3f3f8;
	--long: #ff7f70;
	--short: #69e2de;
	--clear: #b5b5c2;
	--paper: rgba(16, 16, 22, 0.94);
	--chip-long-color: #ff9b8d;
	--chip-short-color: #7cebe7;
	--chip-clear-color: #d4d4de;
	--source-text: #fcfcff;
	}
	body.dark-mode {
	--bg-start: #050506;
	--bg-end: #101015;
	--ink: #f3f3f8;
	--long: #ff7f70;
	--short: #69e2de;
	--clear: #b5b5c2;
	--paper: rgba(16, 16, 22, 0.94);
	--chip-long-color: #ff9b8d;
	--chip-short-color: #7cebe7;
	--chip-clear-color: #d4d4de;
	--source-text: #fcfcff;
	}
	}

	body.dark-mode {
	--bg-start: #050506;
	--bg-end: #101015;
	--ink: #f3f3f8;
	--long: #ff7f70;
	--short: #69e2de;
	--clear: #b5b5c2;
	--paper: rgba(16, 16, 22, 0.94);
	--chip-long-color: #ff9b8d;
	--chip-short-color: #7cebe7;
	--chip-clear-color: #d4d4de;
	--source-text: #fcfcff;
	}

	html.dark-mode {
	--bg-start: #050506;
	--bg-end: #101015;
	--ink: #f3f3f8;
	--long: #ff7f70;
	--short: #69e2de;
	--clear: #b5b5c2;
	--paper: rgba(16, 16, 22, 0.94);
	--chip-long-color: #ff9b8d;
	--chip-short-color: #7cebe7;
	--chip-clear-color: #d4d4de;
	--source-text: #fcfcff;
	}

	.gradio-container {
	font-family: 'Space Grotesk', sans-serif;
	background: radial-gradient(circle at top left, var(--bg-start), var(--bg-end));
	color: var(--ink);
	transition: background-color 0.3s, color 0.3s;
	}

	.dark-mode-toggle {
	position: fixed;
	top: 20px;
	right: 20px;
	background: var(--paper);
	border: 2px solid var(--ink);
	color: var(--ink);
	padding: 0.6rem 1.2rem;
	border-radius: 999px;
	cursor: pointer;
	font-weight: 600;
	font-family: 'Space Grotesk', sans-serif;
	font-size: 0.95rem;
	z-index: 1000;
	transition: all 0.3s;
	}

	.dark-mode-toggle:hover {
	transform: scale(1.05);
	opacity: 0.9;
	}

	.title h1 {
	font-family: 'Cormorant Garamond', serif;
	font-size: 3rem;
	letter-spacing: 0.02em;
	margin-bottom: 0.2rem;
	}

	.title p {
	opacity: 0.82;
	}

	.panel {
	backdrop-filter: blur(8px);
	background: var(--paper);
	border: 1px solid rgba(255, 255, 255, 0.16);
	border-radius: 18px;
	padding: 0.9rem;
	}

	.dark-mode .panel {
	border-color: rgba(232, 228, 220, 0.22);
	}

	.panel label,
	.panel .gr-markdown,
	.panel .gradio-markdown,
	.panel .gr-form label,
	.panel .gr-form span {
	color: var(--ink) !important;
	}

	.panel textarea,
	.panel input,
	.panel .gr-textbox,
	.panel .gr-textbox textarea,
	.panel .gr-textbox input,
	.panel .gr-radio,
	.panel .gr-radio label,
	.panel .gr-box,
	.panel .gr-form {
	color: var(--ink) !important;
	}

	.dark-mode .panel textarea,
	.dark-mode .panel input,
	.dark-mode .panel .gr-textbox,
	.dark-mode .panel .gr-textbox textarea,
	.dark-mode .panel .gr-textbox input,
	.dark-mode .panel .gr-radio,
	.dark-mode .panel .gr-box,
	.dark-mode .panel .gr-form {
	background: rgba(10, 10, 14, 0.9) !important;
	border-color: rgba(232, 228, 220, 0.22) !important;
	}

	.dark-mode .panel .gr-button,
	.dark-mode .panel button {
	color: #f6f2e8 !important;
	border-color: rgba(232, 228, 220, 0.28) !important;
	}

	.dark-mode .panel .gr-button.gr-button-primary,
	.dark-mode .panel button.primary {
	background: #3e74f2 !important;
	color: #f7f9ff !important;
	}

	.legend {
	display: flex;
	align-items: center;
	gap: 0.9rem;
	font-weight: 600;
	margin-bottom: 0.8rem;
	}

	.dot {
	display: inline-block;
	width: 10px;
	height: 10px;
	border-radius: 999px;
	margin-left: 0.7rem;
	margin-right: 0.25rem;
	}

	.dot.long { background: var(--long); }
	.dot.short { background: var(--short); }
	.dot.clear { background: var(--clear); }

	.card {
	background: rgba(24, 24, 32, 0.84);
	border-radius: 14px;
	padding: 0.9rem;
	margin: 0.8rem 0;
	border: 1px solid rgba(255, 255, 255, 0.14);
	animation: rise 420ms ease both;
	color: var(--ink);
	}

	.dark-mode .card {
	background: rgba(14, 14, 20, 0.9);
	border: 1px solid rgba(232, 228, 220, 0.15);
	}

	.line-number {
	font-size: 0.8rem;
	font-weight: 700;
	text-transform: uppercase;
	letter-spacing: 0.06em;
	color: #afb0bc;
	}

	.dark-mode .line-number {
	color: #d3d3df;
	}

	.source {
	font-family: 'Cormorant Garamond', serif;
	font-size: 1.45rem;
	margin: 0.25rem 0 0.7rem;
	color: var(--source-text);
	}

	.chips {
	display: flex;
	flex-wrap: wrap;
	gap: 0.45rem;
	}

	.chip {
	display: inline-flex;
	align-items: baseline;
	gap: 0.35rem;
	border-radius: 999px;
	padding: 0.28rem 0.65rem;
	font-family: 'Cormorant Garamond', serif;
	font-size: 1.1rem;
	border: 1px solid transparent;
	}

	.chip small {
	font-size: 0.75rem;
	font-family: 'Space Grotesk', sans-serif;
	text-transform: uppercase;
	letter-spacing: 0.04em;
	}

	.chip.long {
	color: var(--chip-long-color);
	background: rgba(186, 58, 41, 0.15);
	border-color: rgba(186, 58, 41, 0.3);
	}

	.chip.long:before {
	content: '';
	}

	.dark-mode .chip.long {
	background: rgba(255, 107, 90, 0.2);
	border-color: rgba(255, 107, 90, 0.4);
	}

	.chip.short {
	color: var(--chip-short-color);
	background: rgba(31, 111, 109, 0.15);
	border-color: rgba(31, 111, 109, 0.3);
	}

	.dark-mode .chip.short {
	background: rgba(77, 217, 213, 0.2);
	border-color: rgba(77, 217, 213, 0.4);
	}

	.chip.clear {
	color: var(--chip-clear-color);
	background: rgba(116, 108, 95, 0.12);
	border-color: rgba(116, 108, 95, 0.25);
	}

	.dark-mode .chip.clear {
	color: #c8c0b0;
	background: rgba(170, 160, 144, 0.15);
	border-color: rgba(170, 160, 144, 0.3);
	}

	.empty {
	padding: 1rem;
	border-radius: 12px;
	background: rgba(255, 255, 255, 0.6);
	border: 1px dashed rgba(47, 43, 38, 0.2);
	color: var(--ink);
	}

	.dark-mode .empty {
	background: rgba(40, 35, 28, 0.7);
	border: 1px dashed rgba(232, 228, 220, 0.15);
	}

	@keyframes rise {
	from { transform: translateY(8px); opacity: 0; }
	to { transform: translateY(0); opacity: 1; }
	}

	@media (max-width: 820px) {
	.title h1 { font-size: 2.2rem; }
	.source { font-size: 1.25rem; }
	.dark-mode-toggle {
	position: relative;
	top: auto;
	right: auto;
	margin-bottom: 1rem;
	}
	}
	"""


	# Build the Gradio UI: a dark-mode toggle, a title, an input panel (model
	# choice, text box, buttons, examples) and an output panel (styled HTML plus
	# plain text).
	with gr.Blocks() as demo:
	    # Toggle button and the script that applies the stored ("darkMode" in
	    # localStorage) or system colour-scheme preference.
	    # The JS logical OR must be written as a plain "||"; a backslash-escaped
	    # form is a JavaScript syntax error (and an invalid Python escape).
	    # NOTE(review): confirm that <script> content passed through gr.HTML is
	    # actually executed by the Gradio version in use; the button's inline
	    # onclick handler does not depend on it.
	    gr.HTML("""
	<script>
	// Detect system dark mode preference and apply on load
	function applyDarkModePreference() {
	const darkModeToggle = document.getElementById('dark-mode-toggle');
	const isDarkMode = localStorage.getItem('darkMode') === 'true' ||
	(!localStorage.getItem('darkMode') && window.matchMedia('(prefers-color-scheme: dark)').matches);

	if (isDarkMode) {
	document.body.classList.add('dark-mode');
	document.documentElement.classList.add('dark-mode');
	if (darkModeToggle) darkModeToggle.textContent = '☀️ Light Mode';
	} else {
	document.body.classList.remove('dark-mode');
	document.documentElement.classList.remove('dark-mode');
	if (darkModeToggle) darkModeToggle.textContent = '🌙 Dark Mode';
	}
	}

	// Apply preference on page load
	window.addEventListener('load', applyDarkModePreference);
	setTimeout(applyDarkModePreference, 100);

	// Listen for system dark mode changes
	window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', (e) => {
	if (!localStorage.getItem('darkMode')) {
	if (e.matches) {
	document.body.classList.add('dark-mode');
	document.documentElement.classList.add('dark-mode');
	document.getElementById('dark-mode-toggle').textContent = '☀️ Light Mode';
	} else {
	document.body.classList.remove('dark-mode');
	document.documentElement.classList.remove('dark-mode');
	document.getElementById('dark-mode-toggle').textContent = '🌙 Dark Mode';
	}
	}
	});
	</script>
	<button id="dark-mode-toggle" class="dark-mode-toggle" onclick="
	document.body.classList.toggle('dark-mode');
	document.documentElement.classList.toggle('dark-mode');
	const isDark = document.body.classList.contains('dark-mode');
	localStorage.setItem('darkMode', isDark);
	document.getElementById('dark-mode-toggle').textContent = isDark ? '☀️ Light Mode' : '🌙 Dark Mode';
	">🌙 Dark Mode</button>
	""")

	    gr.Markdown(
	        """
	<div class="title">
	<h1>Ancient Greek Macronizer</h1>
	<p>Syllable-level long/short classification with a modern, readable presentation.</p>
	</div>
	"""
	    )

	    with gr.Column():
	        # Input panel.
	        with gr.Column(elem_classes=["panel"]):
	            model_choice = gr.Radio(
	                label="Model",
	                choices=list(MODEL_OPTIONS.keys()),
	                value=DEFAULT_MODEL_LABEL,
	            )
	            text_input = gr.Textbox(
	                label="Greek Lines",
	                lines=8,
	                placeholder="Paste one or multiple lines; each line is processed separately.",
	            )
	            with gr.Row():
	                classify_btn = gr.Button("Classify", variant="primary")
	                clear_btn = gr.Button("Clear")
	            # NOTE(review): nesting was reconstructed; the examples are assumed
	            # to sit in the input panel below the button row - confirm.
	            gr.Examples(examples=examples, inputs=text_input, label="Try examples")

	        # Output panel.
	        with gr.Column(elem_classes=["panel"]):
	            html_output = gr.HTML(label="Styled Results")
	            text_output = gr.Textbox(label="Plain Output", lines=12)

	    # "Classify" renders both outputs; "Clear" empties the input and both outputs.
	    classify_btn.click(render_results, inputs=[text_input, model_choice], outputs=[html_output, text_output])
	    clear_btn.click(lambda: ("", "", ""), outputs=[text_input, html_output, text_output])


	if __name__ == "__main__":
	    # NOTE(review): passing ``css`` to launch() relies on a Gradio version
	    # whose launch() accepts it - confirm against the pinned version.
	    demo.launch(css=CSS)