"""
Voynich Manuscript Linguistic Analyzer

Gradio app for statistical analysis of EVA-transliterated Voynich text,
compared against Latin, Arabic, Hebrew, Medieval Welsh, and Georgian corpora.
"""

import io
import math
import re
import os
import collections
import tempfile
import atexit
from typing import Optional

import gradio as gr
import numpy as np
import matplotlib
# The backend is selected before pyplot is imported.
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from PIL import Image as PILImage

# ─────────────────────────────────────────────
# SAMPLE VOYNICH EVA TEXT (small illustrative corpus)
# ─────────────────────────────────────────────

# Built-in fallback text used when no file is uploaded.
# NOTE(review): the opening passage ("fachys ykal ... yteedy okeedy") appears
# to occur twice verbatim, which inflates repetition-sensitive statistics —
# confirm whether the duplication is intentional.
SAMPLE_EVA = """
fachys ykal ar ataiin shol shory cth res y kor sholdy
qokaiin qokey qokaiin ytaiin yteor qokain qokal y tain ytaiin
daiin okaiin chedy qokeedy dain shedy daiin ol cheds
daiin okeedy chedal okeedy dar ar aiin daiin daiin
oteedy keedy chal chol dy kaiin dar shey qodar ytedy
cheds chol dain kaiin oteedy sheedy chedal dar ytedy
okaiin chedy kaiin shedy chol qokeol dar yteedy cheds
dar chol qokeedy okeedy chedal dar daiin yteedy qokain
chedy dar chol daiin okaiin chedy kaiin shedy chol
oteedy chedal qodar yteol dar daiin okaiin chedy qokeol
chedal dar chol daiin oteedy chedal dar yteedy okeedy
fachys ykal ar ataiin shol shory cth res y kor sholdy
qokaiin qokey qokaiin ytaiin yteor qokain qokal y tain ytaiin
daiin okaiin chedy qokeedy dain shedy daiin ol cheds
daiin okeedy chedal okeedy dar ar aiin daiin daiin
oteedy keedy chal chol dy kaiin dar shey qodar ytedy
cheds chol dain kaiin oteedy sheedy chedal dar ytedy
okaiin chedy kaiin shedy chol qokeol dar yteedy cheds
dar chol qokeedy okeedy chedal dar daiin yteedy qokain
chedy dar chol daiin okaiin chedy kaiin shedy chol
oteedy chedal qodar yteol dar daiin okaiin chedy qokeol
chedal dar chol daiin oteedy chedal dar yteedy okeedy
daiin okaiin shedy chol daiin oteedy qokain chedal
cheds chol shory chol daiin keedy okaiin chedal dar
daiin shedy kaiin qokeedy chedal yteedy dar chol
okeedy daiin chedal shedy
dar ytaiin qokain daiin chedy qodar shedy okaiin cheds chol daiin
"""

# ─────────────────────────────────────────────
# REFERENCE LANGUAGE CORPORA (romanized / transliterated samples)
# These are small illustrative samples — real analysis needs larger corpora
# ─────────────────────────────────────────────

# Maps a display name to a romanized sample text. The keys are what the UI
# offers as comparison languages and what the analysis looks corpora up by.
# NOTE(review): the Georgian sample mixes upper- and lower-case letters inside
# words, while the tokenizers in this file lowercase their input — confirm
# that collapsing case is intended for that romanization.
LANGUAGE_CORPORA = {
    "Latin": """
arma virumque cano troiae qui primus ab oris
italiam fato profugus laviniaque venit litora
multum ille et terris iactatus et alto vi superum
saevae memorem iunonis ob iram multa quoque et bello passus
dum conderet urbem inferretque deos latio genus unde latinum
albanique patres atque altae moenia romae
musa mihi causas memora quo numine laeso
quidve dolens regina deum tot volvere casus
insignem pietate virum tot adire labores impulerit
tantaene animis caelestibus irae
urbs antiqua fuit tyrii tenuere coloni carthago
italiam contra tiberinaque longe ostia dives opum
studiisque asperrima belli quam iuno fertur terris
magis omnibus unam posthabita coluisse samo
hic illius arma hic currus fuit hoc regnum dea gentibus esse
si qua fata sinant iam tum tenditque fovetque
progeniem sed enim troiano a sanguine duci
audierat tyrias olim quae verteret arces
""",
    "Arabic": """
bismi allahi alrrahmani alrrahimi
alhamdu lillahi rabbi alealamina alrrahmani alrrahimi
maliki yawmi alddini iyyaka nabudu waiyyaka nastainu
ihdina alssirata almustaqima sirata alladhina anamta ealayhim
ghayri almaghdubi ealayhim wala alddalina
qul huwa allahu ahadun allahu alssamadu
lam yalid walam yulad walam yakun lahu kufuan ahadun
inna anzalnahu fi laylati alqadri wama adraka ma laylatu alqadri
laylatu alqadri khayrun min alfi shahrin
tanazzalu almalaaikatu waalrruhu fiha bidni rabbihim min kulli amrin
salamun hiya hatta matlaei alfajri
alam nashrah laka sadraka wawadaena anka wizraka
allathee anqada zahraka warafaena laka dhikraka
fainna maea aleusri yusran inna maea aleusri yusran
faitha faraghta
""",
    "Hebrew": """
bereshit bara elohim et hashamayim veet haaretz
vehaaretz hayta tohu vavohu vechoshech al pney tehom
veruach elohim merachefet al pney hamayim
vayomer elohim yehi or vayehi or
vayar elohim et haor ki tov vayavdel elohim beyn haor uveyn hachoshech
vayikra elohim laor yom velachoshech kara layla
vayehi erev vayehi voker yom echad
vayomer elohim yehi rakia betoch hamayim vihi mavdil beyn mayim lammayim
vayaas elohim et harakia vayavdel beyn hamayim asher mitachat larakia
uveyn hamayim asher meal larakia vayehi chen
vayikra elohim larakia shamayim vayehi erev vayehi voker yom sheni
vayomer elohim yikavu hamayim mitachat hashamayim el makom echad
vetera hayabashah vayehi chen
""",
    "Medieval Welsh": """
yn y dechreuad y creodd duw y nefoedd ar ddaear
yr oedd y ddaear yn adfeilion ac yn wag
ac yr oedd tywyllwch ar wyneb y dyfnder
ac ysbryd duw yn symud ar wyneb y dyfroedd
a duw a ddywedodd bydded goleuni a bu goleuni
a duw a welodd y goleuni ei fod yn dda
a duw a wahanodd y goleuni oddi wrth y tywyllwch
ac a alwodd duw y goleuni yn ddydd a galwodd y tywyllwch yn nos
ac aeth yr hwyr ar bore dydd cyntaf
a duw a ddywedodd bydded ffurfafen yng nghanol y dyfroedd
a bydded yn gwahanu dyfroedd oddi wrth ddyfroedd
a gwnaeth duw y ffurfafen ac a wahanodd rhwng y dyfroedd
oedd tan y ffurfafen ar dyfroedd oedd uwch
""",
    "Georgian": """
tavisupali pirovneba arsebobs rasac unda iqos da rasac unda aketebs
piradi tanxmobis gareSe arc erTi pirovneba verc aaketebs
samarTliani sazogadoeba romelic TiToeul wevrze mzrunvelobas iCens
Tavisuflebas da Tanasworobas uzrunvelyofs
yvela moqalaqe kanonis winaSe Tanasworad aris
arc erTi diskriminacia ar aris Semosvla pirovnebis uflebebSi
samarTliani da Tavisufali sazogadoeba unda aRiardes
Tavisufali arCevnis ufleba
TiToeuli adamiani ibadeba TavisuflebiT da Tanasworad
RirsebiT da uflebiT isini jildosulia gonebisa da sindisisa
da urTierTobaSi erTmaneTis mimarT ZmobisduliT unda moiqcnen
""",
}

# ─────────────────────────────────────────────
# DEFAULT BIGRAPH SPLIT RULES (EVA-based)
# ─────────────────────────────────────────────

# One rule per line, in the form "xy -> x y"; this is the initial content of
# the rules textbox in the UI.
DEFAULT_BIGRAPH_RULES = """qo -> q o
ch -> c h
sh -> s h
ee -> e e
ai -> a i
ol -> o l
or -> o r
ar -> a r
al -> a l
"""

# ─────────────────────────────────────────────
# CORE ANALYSIS FUNCTIONS
# ─────────────────────────────────────────────


def parse_bigraph_rules(rules_text: str) -> list[tuple[str, str]]:
    """Parse bigraph split rules written as 'xy -> x y', one per line.

    Lines that are blank, lack '->', or have an empty left-hand side are
    ignored. Returns (bigraph, replacement) pairs in input order.
    """
    rules = []
    for raw_line in rules_text.strip().splitlines():
        lhs, arrow, rhs = raw_line.strip().partition("->")
        bigraph = lhs.strip()
        # An empty bigraph would make str.replace insert the replacement
        # between every character of the text, so such rules are dropped.
        if not arrow or not bigraph:
            continue
        rules.append((bigraph, rhs.strip()))
    return rules


def apply_bigraph_splits(text: str, rules: list[tuple[str, str]]) -> str:
    """Apply each (bigraph, replacement) rule to *text*, in order.

    Rules are applied sequentially, so a later rule also sees the output of
    earlier ones. Rules with an empty bigraph are skipped.
    """
    for bigraph, replacement in rules:
        if bigraph:
            text = text.replace(bigraph, replacement)
    return text


def tokenize(text: str) -> list[str]:
    """Extract clean character tokens (letters only, lowercase)."""
    return [c for c in text.lower() if c.isalpha()]


def tokenize_words(text: str) -> list[str]:
    """Extract lowercase word tokens (maximal runs of ASCII letters)."""
    return re.findall(r"[a-zA-Z]+", text.lower())


def ioc(tokens: list[str]) -> float:
    """Index of Coincidence; 0.0 when fewer than two tokens are given."""
    n = len(tokens)
    if n < 2:
        return 0.0
    freq = collections.Counter(tokens)
    return sum(f * (f - 1) for f in freq.values()) / (n * (n - 1))


def entropy_order0(tokens: list[str]) -> float:
    """Unigram (order-0) entropy in bits; 0.0 for empty input."""
    if not tokens:
        return 0.0
    n = len(tokens)
    freq = collections.Counter(tokens)
    return -sum((c / n) * math.log2(c / n) for c in freq.values())


def entropy_order1(tokens: list[str]) -> float:
    """Bigram conditional entropy in bits: next token given the previous one.

    Returns 0.0 when fewer than two tokens are given.
    """
    if len(tokens) < 2:
        return 0.0
    bigrams = list(zip(tokens[:-1], tokens[1:]))
    bigram_counts = collections.Counter(bigrams)
    # Context counts exclude the final token, which never starts a bigram.
    context_counts = collections.Counter(tokens[:-1])
    total_bigrams = len(bigrams)
    h = 0.0
    for (a, _b), cnt in bigram_counts.items():
        p_ab = cnt / total_bigrams
        p_b_given_a = cnt / context_counts[a]
        h -= p_ab * math.log2(p_b_given_a)
    return h


def entropy_order2(tokens: list[str]) -> float:
    """Trigram conditional entropy in bits: next token given the previous two.

    Returns 0.0 when fewer than three tokens are given.
    """
    if len(tokens) < 3:
        return 0.0
    trigrams = list(zip(tokens[:-2], tokens[1:-1], tokens[2:]))
    trigram_counts = collections.Counter(trigrams)
    # Only pairs that are followed by a third token serve as contexts.
    context_counts = collections.Counter(zip(tokens[:-2], tokens[1:-1]))
    total_trigrams = len(trigrams)
    h = 0.0
    for (a, b, _c), cnt in trigram_counts.items():
        p_abc = cnt / total_trigrams
        p_c_given_ab = cnt / context_counts[(a, b)]
        h -= p_abc * math.log2(p_c_given_ab)
    return h


def zipf_slope(tokens: list[str]) -> float:
    """Slope of the least-squares line through the log-log rank/frequency plot.

    Natural language is expected to be near -1. Returns 0.0 when there are
    fewer than two distinct tokens.
    """
    counts = sorted(collections.Counter(tokens).values(), reverse=True)
    if len(counts) < 2:
        return 0.0
    log_ranks = np.log(np.arange(1, len(counts) + 1))
    log_counts = np.log(np.array(counts, dtype=float))
    slope, _ = np.polyfit(log_ranks, log_counts, 1)
    return float(slope)


def type_token_ratio(words: list[str]) -> float:
    """Distinct words divided by total words; 0.0 for empty input."""
    if not words:
        return 0.0
    return len(set(words)) / len(words)


def hapax_ratio(words: list[str]) -> float:
    """Share of distinct words that occur exactly once; 0.0 for empty input."""
    if not words:
        return 0.0
    freq = collections.Counter(words)
    hapax = sum(1 for v in freq.values() if v == 1)
    return hapax / len(freq)


def pmi_top_pairs(tokens: list[str], top_n: int = 10) -> list[tuple[tuple, float]]:
    """Return the *top_n* adjacent token pairs by pointwise mutual information.

    Each item is ((a, b), pmi_in_bits), sorted from highest to lowest PMI.
    Returns an empty list when fewer than two tokens are given.
    """
    if len(tokens) < 2:
        return []
    bigrams = list(zip(tokens[:-1], tokens[1:]))
    bg_counts = collections.Counter(bigrams)
    ug_counts = collections.Counter(tokens)
    n = len(tokens)
    n_bigrams = len(bigrams)
    pmi_scores = {}
    for (a, b), cnt in bg_counts.items():
        p_ab = cnt / n_bigrams
        p_a = ug_counts[a] / n
        p_b = ug_counts[b] / n
        if p_a > 0 and p_b > 0 and p_ab > 0:
            pmi_scores[(a, b)] = math.log2(p_ab / (p_a * p_b))
    return sorted(pmi_scores.items(), key=lambda item: -item[1])[:top_n]


def compute_perplexity(test_tokens: list[str], train_tokens: list[str]) -> float:
    """Perplexity of *test_tokens* under a bigram model fit on *train_tokens*.

    Returns infinity when either sequence has fewer than two tokens.
    """
    if len(test_tokens) < 2 or len(train_tokens) < 2:
        return float("inf")
    bg_counts = collections.Counter(zip(train_tokens[:-1], train_tokens[1:]))
    ug_counts = collections.Counter(train_tokens[:-1])
    vocab_size = len(set(train_tokens))
    k = 0.5  # add-k smoothing constant
    test_bigrams = list(zip(test_tokens[:-1], test_tokens[1:]))
    log_prob = 0.0
    for a, b in test_bigrams:
        numerator = bg_counts.get((a, b), 0) + k
        denominator = ug_counts.get(a, 0) + k * vocab_size
        log_prob += math.log2(numerator / denominator)
    cross_entropy = -log_prob / len(test_bigrams)
    return 2 ** cross_entropy


def build_stats(tokens: list[str], words: list[str], label: str) -> dict:
    """Bundle every per-corpus metric, plus the raw tokens and words, in a dict."""
    return {
        "label": label,
        "n_chars": len(tokens),
        "n_words": len(words),
        "vocab_chars": len(set(tokens)),
        "vocab_words": len(set(words)),
        "ioc": ioc(tokens),
        "h0": entropy_order0(tokens),
        "h1": entropy_order1(tokens),
        "h2": entropy_order2(tokens),
        "ttr": type_token_ratio(words),
        "hapax": hapax_ratio(words),
        # The fit is only attempted when there are more than 10 words.
        "zipf_slope": zipf_slope(words) if len(words) > 10 else 0.0,
        "tokens": tokens,
        "words": words,
    }


def distance_vector(stats: dict, voynich_stats: dict) -> float:
    """Euclidean distance between two stats dicts over six raw metrics.

    The features (ioc, h0, h1, h2, ttr, hapax) are used unscaled.
    """
    features = ["ioc", "h0", "h1", "h2", "ttr", "hapax"]
    v_vec = np.array([voynich_stats[f] for f in features])
    l_vec = np.array([stats[f] for f in features])
    return float(np.linalg.norm(v_vec - l_vec))


# ─────────────────────────────────────────────
# PLOTTING FUNCTIONS
# ─────────────────────────────────────────────

VOYNICH_COLOR = "#e8c97a"
LANG_COLORS = ["#7ab8e8", "#e87a7a", "#7ae8a5", "#c87ae8", "#e8a57a"]
BG_COLOR = "#0f0f14"
PANEL_COLOR = "#16161f"
TEXT_COLOR = "#d4cfc8"
GRID_COLOR = "#2a2a38"


def _lang_color(index: int) -> str:
    """Colour for the index-th comparison language, cycling through the palette."""
    return LANG_COLORS[index % len(LANG_COLORS)]


def fig_to_pil(fig) -> "PILImage.Image":
    """Render a matplotlib figure to a PIL Image, then close the figure."""
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format="png", dpi=140, bbox_inches="tight",
                    facecolor=BG_COLOR, edgecolor="none")
    finally:
        # Close even when rendering fails so figures do not accumulate.
        plt.close(fig)
    buf.seek(0)
    # copy() detaches the image from the in-memory buffer.
    return PILImage.open(buf).copy()


def style_ax(ax, title=""):
    """Apply the dark theme to *ax* and, if given, set its title."""
    ax.set_facecolor(PANEL_COLOR)
    ax.tick_params(colors=TEXT_COLOR, labelsize=8)
    ax.xaxis.label.set_color(TEXT_COLOR)
    ax.yaxis.label.set_color(TEXT_COLOR)
    for spine in ax.spines.values():
        spine.set_edgecolor(GRID_COLOR)
    ax.grid(True, color=GRID_COLOR, linewidth=0.5, alpha=0.7)
    if title:
        ax.set_title(title, color=TEXT_COLOR, fontsize=9, fontweight="bold", pad=6)


def plot_ioc_comparison(voynich_stats, lang_stats_list, selected_langs):
    """Horizontal bar chart of IoC for Voynich, each language and two fixed references."""
    fig, ax = plt.subplots(figsize=(9, 4), facecolor=BG_COLOR)
    labels = ["Voynich"] + selected_langs + ["Random", "English ref"]
    values = [voynich_stats["ioc"]] + [ls["ioc"] for ls in lang_stats_list] + [0.038, 0.065]
    colors = ([VOYNICH_COLOR]
              + [_lang_color(i) for i in range(len(selected_langs))]
              + ["#555566", "#445544"])
    bars = ax.barh(labels, values, color=colors, height=0.55, edgecolor="none")
    ax.axvline(0.038, color="#555566", lw=1, ls="--", alpha=0.6, label="Random (0.038)")
    ax.axvline(0.065, color="#445544", lw=1, ls="--", alpha=0.6, label="English (0.065)")
    ax.set_xlabel("Index of Coincidence", color=TEXT_COLOR)
    style_ax(ax, "Index of Coincidence")
    ax.legend(fontsize=7, labelcolor=TEXT_COLOR, facecolor=PANEL_COLOR, edgecolor=GRID_COLOR)
    for bar, val in zip(bars, values):
        ax.text(val + 0.001, bar.get_y() + bar.get_height() / 2,
                f"{val:.4f}", va="center", color=TEXT_COLOR, fontsize=7.5)
    fig.tight_layout(pad=1.2)
    return fig_to_pil(fig)


def plot_entropy_curves(voynich_stats, lang_stats_list, selected_langs):
    """Line plot of H0, H1 and H2 for Voynich and each comparison language."""
    fig, ax = plt.subplots(figsize=(9, 4.5), facecolor=BG_COLOR)
    orders = [0, 1, 2]
    order_labels = ["H₀ (unigram)", "H₁ (bigram)", "H₂ (trigram)"]
    vy = [voynich_stats["h0"], voynich_stats["h1"], voynich_stats["h2"]]
    ax.plot(orders, vy, "o-", color=VOYNICH_COLOR, lw=2.2, ms=7, label="Voynich", zorder=5)
    for i, (ls, lang) in enumerate(zip(lang_stats_list, selected_langs)):
        lv = [ls["h0"], ls["h1"], ls["h2"]]
        ax.plot(orders, lv, "o--", color=_lang_color(i), lw=1.6, ms=5, label=lang, alpha=0.85)
    ax.set_xticks(orders)
    ax.set_xticklabels(order_labels, color=TEXT_COLOR, fontsize=8)
    ax.set_ylabel("Entropy (bits)", color=TEXT_COLOR)
    style_ax(ax, "Entropy Curves (H₀ → H₁ → H₂)")
    ax.legend(fontsize=7.5, labelcolor=TEXT_COLOR, facecolor=PANEL_COLOR, edgecolor=GRID_COLOR)
    fig.tight_layout(pad=1.2)
    return fig_to_pil(fig)


def plot_zipf(voynich_stats, lang_stats_list, selected_langs):
    """One log-log rank/frequency panel per corpus, with a fitted line when possible."""
    n_plots = len(selected_langs) + 1
    fig, axes = plt.subplots(1, n_plots, figsize=(3 * n_plots, 4), facecolor=BG_COLOR)
    if n_plots == 1:
        # plt.subplots returns a bare Axes rather than an array for one panel.
        axes = [axes]

    def _draw_zipf(ax, words, color, label):
        counts = sorted(collections.Counter(words).values(), reverse=True)
        title = label
        if counts:
            ranks = np.arange(1, len(counts) + 1)
            ax.loglog(ranks, counts, ".", color=color, ms=3, alpha=0.7)
            # The fitted line and slope are only shown for more than two points.
            if len(counts) > 2:
                lr = np.log(ranks)
                lc = np.log(np.array(counts, dtype=float))
                slope, intercept = np.polyfit(lr, lc, 1)
                fit = np.exp(intercept + slope * lr)
                ax.loglog(ranks, fit, "-", color=color, lw=1.5, alpha=0.5)
                title = f"{label}\nslope={slope:.2f}"
        ax.set_title(title, color=TEXT_COLOR, fontsize=8, pad=4)
        style_ax(ax)

    _draw_zipf(axes[0], voynich_stats["words"], VOYNICH_COLOR, "Voynich")
    for i, (ls, lang) in enumerate(zip(lang_stats_list, selected_langs)):
        _draw_zipf(axes[i + 1], ls["words"], _lang_color(i), lang)
    fig.suptitle("Zipf Word-Frequency Plots (log-log)", color=TEXT_COLOR, fontsize=9, y=1.01)
    fig.tight_layout(pad=1.2)
    return fig_to_pil(fig)


def plot_distance_radar(voynich_stats, lang_stats_list, selected_langs):
    """Radar chart of six metrics, min-max normalized across the shown corpora."""
    features = ["ioc", "h0", "h1", "h2", "ttr", "hapax"]
    feat_labels = ["IoC", "H₀", "H₁", "H₂", "TTR", "Hapax"]
    n_features = len(features)
    angles = [n / float(n_features) * 2 * math.pi for n in range(n_features)]
    angles += angles[:1]  # repeat the first angle to close the polygon

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw={"polar": True}, facecolor=BG_COLOR)
    ax.set_facecolor(PANEL_COLOR)

    all_stats = [voynich_stats] + lang_stats_list
    mins = {f: min(s[f] for s in all_stats) for f in features}
    # The epsilon keeps the denominator non-zero when all corpora agree.
    maxs = {f: max(s[f] for s in all_stats) + 1e-10 for f in features}

    def norm(stats, f):
        return (stats[f] - mins[f]) / (maxs[f] - mins[f])

    vy = [norm(voynich_stats, f) for f in features]
    vy += vy[:1]
    ax.plot(angles, vy, "-", color=VOYNICH_COLOR, lw=2.2, label="Voynich")
    ax.fill(angles, vy, color=VOYNICH_COLOR, alpha=0.12)

    for i, (ls, lang) in enumerate(zip(lang_stats_list, selected_langs)):
        lv = [norm(ls, f) for f in features]
        lv += lv[:1]
        ax.plot(angles, lv, "--", color=_lang_color(i), lw=1.6, label=lang, alpha=0.85)
        ax.fill(angles, lv, color=_lang_color(i), alpha=0.05)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(feat_labels, color=TEXT_COLOR, fontsize=9)
    ax.tick_params(colors=TEXT_COLOR)
    ax.yaxis.set_tick_params(colors=GRID_COLOR)
    ax.grid(color=GRID_COLOR, linewidth=0.5)
    ax.spines["polar"].set_color(GRID_COLOR)
    ax.set_title("Metric Radar (normalized)", color=TEXT_COLOR, fontsize=9, pad=15)
    ax.legend(loc="upper right", bbox_to_anchor=(1.35, 1.15), fontsize=7.5,
              labelcolor=TEXT_COLOR, facecolor=PANEL_COLOR, edgecolor=GRID_COLOR)
    fig.tight_layout()
    return fig_to_pil(fig)


def plot_perplexity(voynich_tokens, lang_stats_list, selected_langs):
    """Bar chart of Voynich perplexity under each language model, capped at 9999."""
    perplexities = [
        min(compute_perplexity(voynich_tokens, ls["tokens"]), 9999)
        for ls in lang_stats_list
    ]
    fig, ax = plt.subplots(figsize=(8, 4), facecolor=BG_COLOR)
    bars = ax.bar(selected_langs, perplexities,
                  color=[_lang_color(i) for i in range(len(selected_langs))],
                  edgecolor="none", width=0.55)
    ax.set_ylabel("Perplexity (lower = more similar)", color=TEXT_COLOR)
    style_ax(ax, "Cross-Entropy Perplexity of Voynich under Each Language Model")
    for bar, val in zip(bars, perplexities):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 5,
                f"{val:.1f}", ha="center", color=TEXT_COLOR, fontsize=8)
    fig.tight_layout(pad=1.2)
    return fig_to_pil(fig)


def plot_char_freq(voynich_stats, lang_stats_list, selected_langs):
    """Character frequency distribution comparison (top 20 characters per corpus).

    The grid has six panels; corpora beyond the sixth are not drawn.
    """
    fig, axes = plt.subplots(2, 3, figsize=(14, 7), facecolor=BG_COLOR)
    axes = axes.flatten()
    all_entries = [("Voynich", voynich_stats, VOYNICH_COLOR)] + [
        (lang, ls, _lang_color(i))
        for i, (lang, ls) in enumerate(zip(selected_langs, lang_stats_list))
    ]
    for ax, (label, stats, color) in zip(axes, all_entries):
        top = collections.Counter(stats["tokens"]).most_common(20)
        chars, counts = zip(*top) if top else ([], [])
        ax.bar(chars, counts, color=color, alpha=0.85, edgecolor="none")
        style_ax(ax, f"{label} — top 20 chars")
        ax.set_xlabel("Character", color=TEXT_COLOR)
        ax.set_ylabel("Count", color=TEXT_COLOR)
    # Hide the panels that received no corpus.
    for ax in axes[len(all_entries):]:
        ax.set_visible(False)
    fig.suptitle("Character Frequency Distributions", color=TEXT_COLOR, fontsize=10, y=1.01)
    fig.tight_layout(pad=1.5)
    return fig_to_pil(fig)


# ─────────────────────────────────────────────
# SUMMARY TABLE
# ─────────────────────────────────────────────

def build_summary_html(voynich_stats, lang_stats_list, selected_langs):
    """Return an HTML table with one row of metrics per corpus.

    The Voynich row comes first and is highlighted; its Distance and
    Perplexity cells show a dash because both are measured against Voynich.
    """
    rows = []
    all_entries = [("Voynich", voynich_stats)] + list(zip(selected_langs, lang_stats_list))
    for position, (label, stats) in enumerate(all_entries):
        if position == 0:
            dist_str = perp_str = "—"
        else:
            dist_str = f"{distance_vector(stats, voynich_stats):.4f}"
            perp = compute_perplexity(voynich_stats["tokens"], stats["tokens"])
            perp_str = f"{min(perp, 9999):.1f}"
        rows.append({
            "Corpus": label,
            "Chars": stats["n_chars"],
            "Words": stats["n_words"],
            "IoC": f"{stats['ioc']:.4f}",
            "H₀": f"{stats['h0']:.3f}",
            "H₁": f"{stats['h1']:.3f}",
            "H₂": f"{stats['h2']:.3f}",
            "TTR": f"{stats['ttr']:.3f}",
            "Hapax": f"{stats['hapax']:.3f}",
            "Zipf slope": f"{stats['zipf_slope']:.3f}",
            "Distance": dist_str,
            "Perplexity": perp_str,
        })

    cols = list(rows[0].keys())
    th_style = ("background:#1e1e2e;color:#c9a84c;padding:7px 12px;"
                "border:1px solid #2a2a38;font-size:12px;")
    td_style = ("padding:6px 12px;border:1px solid #2a2a38;color:#d4cfc8;"
                "font-size:11px;text-align:center;")
    td_voynich = ("padding:6px 12px;border:1px solid #2a2a38;color:#e8c97a;"
                  "font-size:11px;text-align:center;font-weight:bold;background:#1a1a22;")

    parts = ['<table style="border-collapse:collapse;width:100%;font-family:monospace;">']
    header_cells = "".join(f'<th style="{th_style}">{c}</th>' for c in cols)
    parts.append("<thead><tr>" + header_cells + "</tr></thead>")
    parts.append("<tbody>")
    for position, row in enumerate(rows):
        td = td_voynich if position == 0 else td_style
        cells = "".join(f'<td style="{td}">{row[c]}</td>' for c in cols)
        parts.append("<tr>" + cells + "</tr>")
    parts.append("</tbody></table>")
    return "".join(parts)


def build_ranking_html(voynich_stats, lang_stats_list, selected_langs):
    """Return two side-by-side HTML rankings of the languages, closest first.

    One column ranks by metric distance to Voynich, the other by perplexity
    (capped at 9999).
    """
    ranked = []
    for lang, ls in zip(selected_langs, lang_stats_list):
        dist = distance_vector(ls, voynich_stats)
        perp = min(compute_perplexity(voynich_stats["tokens"], ls["tokens"]), 9999)
        ranked.append((lang, dist, perp))

    ranked_by_dist = sorted(ranked, key=lambda item: item[1])
    ranked_by_perp = sorted(ranked, key=lambda item: item[2])

    def medal(i):
        return ["🥇", "🥈", "🥉", "4th", "5th"][i] if i < 5 else str(i + 1)

    column_open = ('<div style="flex:1;min-width:220px;background:#16161f;'
                   'border:1px solid #2a2a38;border-radius:4px;padding:12px 16px;">')
    heading_style = "color:#e8c97a;margin:0 0 8px 0;font-size:13px;"
    item_style = "color:#d4cfc8;font-size:12px;padding:3px 0;font-family:monospace;"

    parts = ['<div style="display:flex;gap:16px;flex-wrap:wrap;margin-bottom:12px;">']
    parts.append(column_open)
    parts.append(f'<h4 style="{heading_style}">Closest by Metric Distance</h4>')
    for i, (lang, dist, _) in enumerate(ranked_by_dist):
        parts.append(f'<div style="{item_style}">{medal(i)} {lang} — dist={dist:.4f}</div>')
    parts.append("</div>")
    parts.append(column_open)
    parts.append(f'<h4 style="{heading_style}">Closest by Perplexity</h4>')
    for i, (lang, _, perp) in enumerate(ranked_by_perp):
        parts.append(f'<div style="{item_style}">{medal(i)} {lang} — perp={perp:.1f}</div>')
    parts.append("</div>")
    parts.append("</div>")
    return "".join(parts)


# ─────────────────────────────────────────────
# MAIN ANALYSIS PIPELINE
# ─────────────────────────────────────────────

def _error_outputs(message: str) -> list:
    """Build the 8-item output list for a failed run: no images, one message."""
    return [None] * 6 + [f"<p style='color:#e87a7a;padding:20px;'>{message}</p>", ""]


def run_analysis(
    uploaded_file,
    use_sample: bool,
    bigraph_rules_text: str,
    selected_langs: list[str],
    apply_splits: bool,
):
    """Run the full analysis and return the eight UI outputs.

    Args:
        uploaded_file: Path to an uploaded text file, or None.
        use_sample: Fall back to SAMPLE_EVA when no file is uploaded.
        bigraph_rules_text: Rules in 'xy -> x y' form, one per line.
        selected_langs: Keys of LANGUAGE_CORPORA to compare against.
        apply_splits: Whether the bigraph rules are applied at all.

    Returns:
        A list of six PIL images (IoC, entropy, Zipf, radar, perplexity,
        character frequencies), the summary-table HTML, and the ranking + PMI
        HTML. On invalid input the images are None and the seventh item
        carries an error message.
    """
    if uploaded_file is None and not use_sample:
        return _error_outputs("Please upload a file or enable the sample corpus.")
    if not selected_langs:
        return _error_outputs("Please select at least one comparison language.")

    # 1. Load Voynich text
    if uploaded_file is not None:
        with open(uploaded_file, "r", encoding="utf-8", errors="replace") as f:
            raw_voynich = f.read()
    else:
        raw_voynich = SAMPLE_EVA

    # 2. Apply bigraph splits (optionally)
    # NOTE(review): replacements such as "q o" contain a space, so they break
    # words apart before tokenize_words runs, while tokenize() drops the space
    # and sees the same letters either way — confirm this is intended.
    rules = parse_bigraph_rules(bigraph_rules_text) if apply_splits else []
    processed_voynich = apply_bigraph_splits(raw_voynich, rules) if rules else raw_voynich

    # 3. Tokenize
    vy_tokens = tokenize(processed_voynich)
    vy_words = tokenize_words(processed_voynich)
    if not vy_tokens:
        return _error_outputs("Could not extract tokens from the text. Check input format.")

    voynich_stats = build_stats(vy_tokens, vy_words, "Voynich")

    # 4. Process each selected language
    lang_stats_list = []
    for lang in selected_langs:
        corpus = LANGUAGE_CORPORA.get(lang, "")
        lang_stats_list.append(build_stats(tokenize(corpus), tokenize_words(corpus), lang))

    # 5. Produce all plots — each returns a PIL Image (no temp files needed)
    ioc_img = plot_ioc_comparison(voynich_stats, lang_stats_list, selected_langs)
    entropy_img = plot_entropy_curves(voynich_stats, lang_stats_list, selected_langs)
    zipf_img = plot_zipf(voynich_stats, lang_stats_list, selected_langs)
    radar_img = plot_distance_radar(voynich_stats, lang_stats_list, selected_langs)
    perp_img = plot_perplexity(vy_tokens, lang_stats_list, selected_langs)
    freq_img = plot_char_freq(voynich_stats, lang_stats_list, selected_langs)

    # 6. Summary table + ranking
    summary_html = build_summary_html(voynich_stats, lang_stats_list, selected_langs)
    ranking_html = build_ranking_html(voynich_stats, lang_stats_list, selected_langs)

    # 7. PMI info block
    pmi_pairs = pmi_top_pairs(vy_tokens, top_n=15)
    chips = "".join(
        '<span style="background:#1e1e2e;border:1px solid #2a2a38;border-radius:3px;'
        'padding:3px 8px;font-family:monospace;font-size:11px;color:#d4cfc8;">'
        f'{a}+{b} <b style="color:#e8c97a;">{score:.2f}</b></span>'
        for (a, b), score in pmi_pairs
    )
    pmi_html = (
        '<h4 style="color:#e8c97a;margin:0 0 8px 0;font-size:13px;">'
        'Top PMI Bigram Pairs (Voynich)</h4>'
        '<div style="display:flex;flex-wrap:wrap;gap:6px;">' + chips + "</div>"
    )

    return [ioc_img, entropy_img, zipf_img, radar_img, perp_img, freq_img,
            summary_html, ranking_html + "<br>" + pmi_html]


# ─────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────

CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=IM+Fell+English:ital@0;1&family=JetBrains+Mono:wght@400;600&display=swap');

body, .gradio-container {
    background: #0f0f14 !important;
    color: #d4cfc8 !important;
    font-family: 'IM Fell English', serif !important;
}
h1, h2, h3 {
    color: #e8c97a !important;
    letter-spacing: 0.04em;
}
.gr-panel, .gr-box, .gr-form {
    background: #13131b !important;
    border-color: #2a2a38 !important;
}
.gr-button {
    background: #c9a84c !important;
    color: #0f0f14 !important;
    border: none !important;
    font-family: 'JetBrains Mono', monospace !important;
    font-weight: 600 !important;
    letter-spacing: 0.05em;
    border-radius: 3px !important;
}
.gr-button:hover {
    background: #e8c97a !important;
}
.gr-check-radio {
    accent-color: #c9a84c !important;
}
textarea, input[type=text] {
    background: #16161f !important;
    color: #d4cfc8 !important;
    border: 1px solid #2a2a38 !important;
    font-family: 'JetBrains Mono', monospace !important;
    font-size: 12px !important;
}
label {
    color: #b8b0a8 !important;
    font-size: 13px !important;
}
.gr-tab-item {
    color: #c9a84c !important;
    border-color: #2a2a38 !important;
}
.gr-tab-item.selected {
    background: #1e1e2e !important;
}
.gr-image img {
    border-radius: 4px;
    border: 1px solid #2a2a38;
}
#header-block {
    text-align: center;
    padding: 24px 0 12px;
    border-bottom: 1px solid #2a2a38;
    margin-bottom: 16px;
}
#header-block h1 {
    font-size: 2rem;
    margin-bottom: 4px;
}
#header-block p {
    color: #888;
    font-size: 0.9rem;
    font-style: italic;
}
"""

# The id matches the #header-block rules in CUSTOM_CSS.
HEADER_HTML = """
<div id="header-block">
  <h1>⚗ Voynich Linguistic Analyzer</h1>
  <p>EVA transliteration · statistical cryptolinguistics · language comparison</p>
</div>
"""


def create_app():
    """Build the Gradio Blocks UI and wire the run button to run_analysis."""
    with gr.Blocks(css=CUSTOM_CSS, title="Voynich Analyzer") as demo:
        gr.HTML(HEADER_HTML)

        with gr.Row():
            # ── LEFT PANEL: Controls ──────────────────────────────
            with gr.Column(scale=1, min_width=280):
                gr.Markdown("### 📂 Input")
                uploaded_file = gr.File(
                    label="Upload EVA transliteration (.txt)",
                    file_types=[".txt"],
                    type="filepath",
                )
                use_sample = gr.Checkbox(
                    label="Use built-in sample corpus (fallback if no upload)",
                    value=True,
                )

                gr.Markdown("### ✂️ Bigraph Splitting")
                apply_splits = gr.Checkbox(label="Apply bigraph splitting", value=True)
                bigraph_rules = gr.Textbox(
                    label="Rules (format: xy -> x y, one per line)",
                    value=DEFAULT_BIGRAPH_RULES,
                    lines=10,
                )

                gr.Markdown("### 🌍 Comparison Languages")
                selected_langs = gr.CheckboxGroup(
                    choices=list(LANGUAGE_CORPORA.keys()),
                    value=list(LANGUAGE_CORPORA.keys()),
                    label="Languages to compare",
                )

                run_btn = gr.Button("▶ Run Analysis", variant="primary")

            # ── RIGHT PANEL: Results ─────────────────────────────
            with gr.Column(scale=3):
                with gr.Tabs():
                    with gr.Tab("📊 Summary"):
                        ranking_out = gr.HTML(label="Ranking")
                        summary_out = gr.HTML(label="Metrics Table")
                    with gr.Tab("📈 Index of Coincidence"):
                        # Plots are passed as PIL images, so no temp files are written.
                        ioc_out = gr.Image(label="IoC Comparison", type="pil")
                    with gr.Tab("🌀 Entropy Curves"):
                        entropy_out = gr.Image(label="Entropy Curves", type="pil")
                    with gr.Tab("📉 Zipf Plots"):
                        zipf_out = gr.Image(label="Zipf Word-Frequency", type="pil")
                    with gr.Tab("🎯 Radar Chart"):
                        radar_out = gr.Image(label="Metric Radar", type="pil")
                    with gr.Tab("🔮 Perplexity"):
                        perp_out = gr.Image(label="Cross-Entropy Perplexity", type="pil")
                    with gr.Tab("🔤 Char Frequencies"):
                        freq_out = gr.Image(label="Character Frequency Distributions", type="pil")

        # The order of outputs must match the 8-item list run_analysis returns.
        run_btn.click(
            fn=run_analysis,
            inputs=[uploaded_file, use_sample, bigraph_rules, selected_langs, apply_splits],
            outputs=[ioc_out, entropy_out, zipf_out, radar_out, perp_out, freq_out,
                     summary_out, ranking_out],
        )

    return demo


if __name__ == "__main__":
    app = create_app()
    # No host/port is passed; launch() uses its defaults.
    app.launch()