| """ |
| Voynich Manuscript Linguistic Analyzer |
| Gradio app for statistical analysis of EVA-transliterated Voynich text, |
| compared against Latin, Arabic, Hebrew, Medieval Welsh, and Georgian corpora. |
| """ |
|
|
| import io |
| import math |
| import re |
| import os |
| import collections |
| import tempfile |
| import atexit |
| from typing import Optional |
|
|
| import gradio as gr |
| import numpy as np |
| import matplotlib |
| matplotlib.use("Agg") |
| import matplotlib.pyplot as plt |
| from PIL import Image as PILImage |
|
|
| |
| |
| |
# Built-in EVA sample text; run_analysis() falls back to it when no file is
# uploaded.
# NOTE(review): the first ten lines appear twice in a row. Repeating text
# doubles token counts without adding word types, which lowers the
# type/token and hapax ratios computed from this sample -- confirm the
# repetition is intentional padding.
SAMPLE_EVA = """
fachys ykal ar ataiin shol shory cth res y kor sholdy
qokaiin qokey qokaiin ytaiin yteor qokain qokal y tain ytaiin
daiin okaiin chedy qokeedy dain shedy daiin ol cheds daiin
okeedy chedal okeedy dar ar aiin daiin daiin oteedy keedy chal
chol dy kaiin dar shey qodar ytedy cheds chol dain kaiin
oteedy sheedy chedal dar ytedy okaiin chedy kaiin shedy chol
qokeol dar yteedy cheds dar chol qokeedy okeedy chedal dar
daiin yteedy qokain chedy dar chol daiin okaiin chedy kaiin
shedy chol oteedy chedal qodar yteol dar daiin okaiin chedy
qokeol chedal dar chol daiin oteedy chedal dar yteedy okeedy
fachys ykal ar ataiin shol shory cth res y kor sholdy
qokaiin qokey qokaiin ytaiin yteor qokain qokal y tain ytaiin
daiin okaiin chedy qokeedy dain shedy daiin ol cheds daiin
okeedy chedal okeedy dar ar aiin daiin daiin oteedy keedy chal
chol dy kaiin dar shey qodar ytedy cheds chol dain kaiin
oteedy sheedy chedal dar ytedy okaiin chedy kaiin shedy chol
qokeol dar yteedy cheds dar chol qokeedy okeedy chedal dar
daiin yteedy qokain chedy dar chol daiin okaiin chedy kaiin
shedy chol oteedy chedal qodar yteol dar daiin okaiin chedy
qokeol chedal dar chol daiin oteedy chedal dar yteedy okeedy
daiin okaiin shedy chol daiin oteedy qokain chedal cheds chol
shory chol daiin keedy okaiin chedal dar daiin shedy kaiin
qokeedy chedal yteedy dar chol okeedy daiin chedal shedy dar
ytaiin qokain daiin chedy qodar shedy okaiin cheds chol daiin
"""
|
|
| |
| |
| |
| |
# Romanized comparison samples keyed by display name. The keys populate the
# language checkbox group in create_app() and are looked up by name in
# run_analysis().
# NOTE(review): the Georgian sample mixes upper- and lower-case letters;
# tokenize() and tokenize_words() lowercase their input, so any distinction
# carried by case is discarded before analysis. Presumably the capitals
# encode distinct Georgian letters -- confirm and, if so, re-transliterate.
LANGUAGE_CORPORA = {
    "Latin": """
arma virumque cano troiae qui primus ab oris italiam fato profugus laviniaque venit
litora multum ille et terris iactatus et alto vi superum saevae memorem iunonis ob iram
multa quoque et bello passus dum conderet urbem inferretque deos latio genus unde latinum
albanique patres atque altae moenia romae musa mihi causas memora quo numine laeso
quidve dolens regina deum tot volvere casus insignem pietate virum tot adire labores
impulerit tantaene animis caelestibus irae urbs antiqua fuit tyrii tenuere coloni
carthago italiam contra tiberinaque longe ostia dives opum studiisque asperrima belli
quam iuno fertur terris magis omnibus unam posthabita coluisse samo hic illius arma
hic currus fuit hoc regnum dea gentibus esse si qua fata sinant iam tum tenditque fovetque
progeniem sed enim troiano a sanguine duci audierat tyrias olim quae verteret arces
""",
    "Arabic": """
bismi allahi alrrahmani alrrahimi alhamdu lillahi rabbi alealamina alrrahmani alrrahimi
maliki yawmi alddini iyyaka nabudu waiyyaka nastainu ihdina alssirata almustaqima
sirata alladhina anamta ealayhim ghayri almaghdubi ealayhim wala alddalina
qul huwa allahu ahadun allahu alssamadu lam yalid walam yulad walam yakun lahu
kufuan ahadun inna anzalnahu fi laylati alqadri wama adraka ma laylatu alqadri
laylatu alqadri khayrun min alfi shahrin tanazzalu almalaaikatu waalrruhu fiha
bidni rabbihim min kulli amrin salamun hiya hatta matlaei alfajri alam nashrah
laka sadraka wawadaena anka wizraka allathee anqada zahraka warafaena laka
dhikraka fainna maea aleusri yusran inna maea aleusri yusran faitha faraghta
""",
    "Hebrew": """
bereshit bara elohim et hashamayim veet haaretz vehaaretz hayta tohu vavohu
vechoshech al pney tehom veruach elohim merachefet al pney hamayim vayomer
elohim yehi or vayehi or vayar elohim et haor ki tov vayavdel elohim beyn haor
uveyn hachoshech vayikra elohim laor yom velachoshech kara layla vayehi erev
vayehi voker yom echad vayomer elohim yehi rakia betoch hamayim vihi mavdil
beyn mayim lammayim vayaas elohim et harakia vayavdel beyn hamayim asher
mitachat larakia uveyn hamayim asher meal larakia vayehi chen vayikra elohim
larakia shamayim vayehi erev vayehi voker yom sheni vayomer elohim yikavu
hamayim mitachat hashamayim el makom echad vetera hayabashah vayehi chen
""",
    "Medieval Welsh": """
yn y dechreuad y creodd duw y nefoedd ar ddaear yr oedd y ddaear yn adfeilion
ac yn wag ac yr oedd tywyllwch ar wyneb y dyfnder ac ysbryd duw yn symud ar
wyneb y dyfroedd a duw a ddywedodd bydded goleuni a bu goleuni a duw a welodd
y goleuni ei fod yn dda a duw a wahanodd y goleuni oddi wrth y tywyllwch
ac a alwodd duw y goleuni yn ddydd a galwodd y tywyllwch yn nos ac aeth
yr hwyr ar bore dydd cyntaf a duw a ddywedodd bydded ffurfafen yng nghanol y dyfroedd
a bydded yn gwahanu dyfroedd oddi wrth ddyfroedd a gwnaeth duw y ffurfafen
ac a wahanodd rhwng y dyfroedd oedd tan y ffurfafen ar dyfroedd oedd uwch
""",
    "Georgian": """
tavisupali pirovneba arsebobs rasac unda iqos da rasac unda aketebs piradi
tanxmobis gareSe arc erTi pirovneba verc aaketebs samarTliani sazogadoeba
romelic TiToeul wevrze mzrunvelobas iCens Tavisuflebas da Tanasworobas
uzrunvelyofs yvela moqalaqe kanonis winaSe Tanasworad aris arc erTi
diskriminacia ar aris Semosvla pirovnebis uflebebSi samarTliani
da Tavisufali sazogadoeba unda aRiardes Tavisufali arCevnis ufleba
TiToeuli adamiani ibadeba TavisuflebiT da Tanasworad RirsebiT da
uflebiT isini jildosulia gonebisa da sindisisa da urTierTobaSi
erTmaneTis mimarT ZmobisduliT unda moiqcnen
""",
}
|
|
| |
| |
| |
# Initial contents of the rules textbox in create_app(): one rule per line in
# the "pattern -> replacement" form understood by parse_bigraph_rules().
DEFAULT_BIGRAPH_RULES = """qo -> q o
ch -> c h
sh -> s h
ee -> e e
ai -> a i
ol -> o l
or -> o r
ar -> a r
al -> a l
"""
|
|
| |
| |
| |
|
|
def parse_bigraph_rules(rules_text: str) -> list[tuple[str, str]]:
    """Parse substitution rules written one per line as ``xy -> x y``.

    Args:
        rules_text: Free-form text. Lines without ``->`` and blank lines are
            ignored. Only the first ``->`` on a line separates pattern from
            replacement.

    Returns:
        ``(pattern, replacement)`` pairs in input order, both stripped of
        surrounding whitespace. The replacement may be empty; the pattern
        never is.
    """
    rules = []
    for raw_line in rules_text.splitlines():
        lhs, arrow, rhs = raw_line.partition("->")
        pattern = lhs.strip()
        # A missing arrow means the line is not a rule. An empty pattern must
        # be rejected too: str.replace("", r) would insert r between every
        # character of the text.
        if not arrow or not pattern:
            continue
        rules.append((pattern, rhs.strip()))
    return rules
|
|
|
|
def apply_bigraph_splits(text: str, rules: list[tuple[str, str]]) -> str:
    """Apply ``(pattern, replacement)`` rules to *text* and return the result.

    Rules run sequentially in list order, so a later rule also sees the
    output of earlier ones. Rules with an empty pattern are skipped, because
    ``str.replace("", r)`` would insert ``r`` between every character.
    """
    for bigraph, replacement in rules:
        if not bigraph:
            continue
        text = text.replace(bigraph, replacement)
    return text
|
|
|
|
def tokenize(text: str) -> list[str]:
    """Return the alphabetic characters of *text*, lowercased, in order.

    Everything that is not a letter (digits, whitespace, punctuation) is
    dropped.
    """
    return list(filter(str.isalpha, text.lower()))
|
|
|
|
def tokenize_words(text: str) -> list[str]:
    """Return the maximal runs of ASCII letters in *text*, lowercased."""
    lowered = text.lower()
    # The pattern needs at least one letter, so no match can be empty.
    return [match.group(0) for match in re.finditer(r"[a-zA-Z]+", lowered)]
|
|
|
|
def ioc(tokens: list[str]) -> float:
    """Return the Index of Coincidence of *tokens*.

    This is the probability that two tokens drawn without replacement are
    equal. Sequences shorter than two tokens yield 0.0.
    """
    total = len(tokens)
    if total < 2:
        return 0.0
    coincidences = 0
    for count in collections.Counter(tokens).values():
        coincidences += count * (count - 1)
    return coincidences / (total * (total - 1))
|
|
|
|
def entropy_order0(tokens: list[str]) -> float:
    """Return the Shannon entropy, in bits, of the unigram distribution.

    An empty sequence yields 0.0.
    """
    if not tokens:
        return 0.0
    total = len(tokens)
    probabilities = [
        count / total for count in collections.Counter(tokens).values()
    ]
    return -sum(p * math.log2(p) for p in probabilities)
|
|
|
|
def entropy_order1(tokens: list[str]) -> float:
    """Return the entropy, in bits, of a token given the preceding token.

    Estimated from adjacent-pair counts. Sequences shorter than two tokens
    yield 0.0.
    """
    if len(tokens) < 2:
        return 0.0
    pair_counts = collections.Counter(zip(tokens, tokens[1:]))
    # Contexts are counted over every position that has a successor.
    context_counts = collections.Counter(tokens[:-1])
    n_pairs = len(tokens) - 1
    entropy = 0.0
    for (prev, _nxt), count in pair_counts.items():
        joint = count / n_pairs
        conditional = count / context_counts[prev]
        entropy -= joint * math.log2(conditional)
    return entropy
|
|
|
|
def entropy_order2(tokens: list[str]) -> float:
    """Return the entropy, in bits, of a token given the two preceding tokens.

    Estimated from trigram counts. Sequences shorter than three tokens yield
    0.0.
    """
    if len(tokens) < 3:
        return 0.0
    triple_counts = collections.Counter(zip(tokens, tokens[1:], tokens[2:]))
    # Two-token contexts are counted over every position that has a successor.
    context_counts = collections.Counter(zip(tokens[:-2], tokens[1:-1]))
    n_triples = len(tokens) - 2
    entropy = 0.0
    for (first, second, _third), count in triple_counts.items():
        joint = count / n_triples
        conditional = count / context_counts[(first, second)]
        entropy -= joint * math.log2(conditional)
    return entropy
|
|
|
|
def zipf_slope(tokens: list[str]) -> float:
    """Return the least-squares slope of log(frequency) against log(rank).

    Frequencies are sorted in descending order and ranked from 1. Natural
    language is typically near -1. Fewer than two distinct tokens yield 0.0,
    because a line cannot be fitted.

    The result is converted to a built-in ``float`` so callers never receive
    a NumPy scalar, matching the annotation.
    """
    counts = sorted(collections.Counter(tokens).values(), reverse=True)
    if len(counts) < 2:
        return 0.0
    log_ranks = np.log(np.arange(1, len(counts) + 1))
    log_counts = np.log(np.array(counts, dtype=float))
    slope, _intercept = np.polyfit(log_ranks, log_counts, 1)
    return float(slope)
|
|
|
|
def type_token_ratio(words: list[str]) -> float:
    """Return distinct words divided by total words (0.0 for no words)."""
    n_tokens = len(words)
    return len(set(words)) / n_tokens if n_tokens else 0.0
|
|
|
|
def hapax_ratio(words: list[str]) -> float:
    """Return the share of distinct words that occur exactly once.

    The denominator is the number of distinct words, not the total number of
    words. An empty list yields 0.0.
    """
    if not words:
        return 0.0
    counts = collections.Counter(words)
    singletons = [word for word, count in counts.items() if count == 1]
    return len(singletons) / len(counts)
|
|
|
|
def pmi_top_pairs(tokens: list[str], top_n: int = 10) -> list[tuple[tuple, float]]:
    """Return the *top_n* adjacent token pairs ranked by pointwise mutual
    information (in bits), highest first.

    Pair probabilities are taken over the adjacent pairs and token
    probabilities over all tokens. Fewer than two tokens yield an empty list.
    """
    if len(tokens) < 2:
        return []
    total = len(tokens)
    pairs = list(zip(tokens[:-1], tokens[1:]))
    n_pairs = len(pairs)
    pair_freq = collections.Counter(pairs)
    token_freq = collections.Counter(tokens)

    scored = {}
    for pair, count in pair_freq.items():
        left, right = pair
        joint = count / n_pairs
        p_left = token_freq[left] / total
        p_right = token_freq[right] / total
        if not (p_left > 0 and p_right > 0 and joint > 0):
            continue
        scored[pair] = math.log2(joint / (p_left * p_right))

    ranked = sorted(scored.items(), key=lambda item: -item[1])
    return ranked[:top_n]
|
|
|
|
def compute_perplexity(test_tokens: list[str], train_tokens: list[str]) -> float:
    """Return the perplexity of *test_tokens* under an add-k smoothed bigram
    model estimated from *train_tokens*.

    Args:
        test_tokens: Sequence to score.
        train_tokens: Sequence the bigram counts are taken from.

    Returns:
        ``2 ** cross_entropy`` where cross-entropy is the mean negative log2
        probability per test bigram, or ``inf`` when either sequence has
        fewer than two tokens.
    """
    if len(test_tokens) < 2 or len(train_tokens) < 2:
        return float("inf")

    pair_counts = collections.Counter(zip(train_tokens[:-1], train_tokens[1:]))
    context_counts = collections.Counter(train_tokens[:-1])
    # The smoothing vocabulary must cover every symbol the model can be asked
    # about. With training symbols only, unseen test symbols still get k/denom
    # each, the conditional distribution sums to more than 1, and perplexity
    # is understated for small training alphabets.
    vocab_size = len(set(train_tokens) | set(test_tokens))
    k = 0.5

    test_pairs = list(zip(test_tokens[:-1], test_tokens[1:]))
    log_prob = 0.0
    for prev, nxt in test_pairs:
        numerator = pair_counts.get((prev, nxt), 0) + k
        denominator = context_counts.get(prev, 0) + k * vocab_size
        log_prob += math.log2(numerator / denominator)

    cross_entropy = -log_prob / len(test_pairs)
    return 2 ** cross_entropy
|
|
|
|
def build_stats(tokens: list[str], words: list[str], label: str) -> dict:
    """Collect every per-corpus metric into one dictionary.

    The result holds sizes, vocabulary sizes, IoC, the three entropies, the
    type/token and hapax ratios, the Zipf slope, and the raw *tokens* and
    *words* lists for later plotting.
    """
    # The Zipf fit is only attempted with more than ten words; otherwise the
    # slope is reported as 0.0.
    slope = zipf_slope(words) if len(words) > 10 else 0.0

    stats = {
        "label": label,
        "n_chars": len(tokens),
        "n_words": len(words),
        "vocab_chars": len(set(tokens)),
        "vocab_words": len(set(words)),
    }
    stats["ioc"] = ioc(tokens)
    stats["h0"] = entropy_order0(tokens)
    stats["h1"] = entropy_order1(tokens)
    stats["h2"] = entropy_order2(tokens)
    stats["ttr"] = type_token_ratio(words)
    stats["hapax"] = hapax_ratio(words)
    stats["zipf_slope"] = slope
    stats["tokens"] = tokens
    stats["words"] = words
    return stats
|
|
|
|
def distance_vector(stats: dict, voynich_stats: dict) -> float:
    """Return the Euclidean distance between two stats dicts.

    The distance is taken over the raw, unscaled features ioc, h0, h1, h2,
    ttr and hapax.
    """
    feature_names = ("ioc", "h0", "h1", "h2", "ttr", "hapax")
    reference = np.array([voynich_stats[name] for name in feature_names])
    candidate = np.array([stats[name] for name in feature_names])
    difference = reference - candidate
    return float(np.linalg.norm(difference))
|
|
|
|
| |
| |
| |
|
|
# Dark-theme palette shared by the plotting helpers.
VOYNICH_COLOR = "#e8c97a"  # series colour for the Voynich corpus
# One colour per comparison language, assigned by position in the selection.
LANG_COLORS = ["#7ab8e8", "#e87a7a", "#7ae8a5", "#c87ae8", "#e8a57a"]
BG_COLOR = "#0f0f14"  # figure background
PANEL_COLOR = "#16161f"  # axes background
TEXT_COLOR = "#d4cfc8"  # titles, labels and tick text
GRID_COLOR = "#2a2a38"  # grid lines and spines
|
|
|
|
def fig_to_pil(fig) -> PILImage.Image:
    """Render a matplotlib figure to a PIL image and close the figure.

    The figure is closed even when rendering fails, so a failed plot does not
    leave the figure registered with pyplot.

    Args:
        fig: The matplotlib figure to render.

    Returns:
        An in-memory copy of the PNG rendering at 140 dpi.
    """
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format="png", dpi=140, bbox_inches="tight",
                    facecolor=BG_COLOR, edgecolor="none")
    finally:
        plt.close(fig)
    buf.seek(0)
    # copy() returns an image that no longer depends on the local buffer.
    return PILImage.open(buf).copy()
|
|
|
|
def style_ax(ax, title=""):
    """Apply the dark theme to *ax*; set a bold title when *title* is given."""
    ax.set_facecolor(PANEL_COLOR)
    ax.tick_params(colors=TEXT_COLOR, labelsize=8)
    for axis in (ax.xaxis, ax.yaxis):
        axis.label.set_color(TEXT_COLOR)
    for spine in ax.spines.values():
        spine.set_edgecolor(GRID_COLOR)
    ax.grid(True, color=GRID_COLOR, linewidth=0.5, alpha=0.7)
    if not title:
        return
    ax.set_title(title, color=TEXT_COLOR, fontsize=9, fontweight="bold", pad=6)
|
|
|
|
def plot_ioc_comparison(voynich_stats, lang_stats_list, selected_langs):
    """Draw a horizontal bar chart of the Index of Coincidence.

    Bars are drawn for Voynich, each selected language, and two fixed
    reference values (random text and English), which are also marked with
    dashed vertical lines. Returns the chart as a PIL image.
    """
    random_ref, english_ref = 0.038, 0.065
    random_color, english_color = "#555566", "#445544"

    labels = ["Voynich", *selected_langs, "Random", "English ref"]
    values = [
        voynich_stats["ioc"],
        *(lang_stats["ioc"] for lang_stats in lang_stats_list),
        random_ref,
        english_ref,
    ]
    colors = [
        VOYNICH_COLOR,
        *LANG_COLORS[:len(selected_langs)],
        random_color,
        english_color,
    ]

    fig, ax = plt.subplots(figsize=(9, 4), facecolor=BG_COLOR)
    bars = ax.barh(labels, values, color=colors, height=0.55, edgecolor="none")
    reference_lines = (
        (random_ref, random_color, "Random (0.038)"),
        (english_ref, english_color, "English (0.065)"),
    )
    for position, line_color, legend_text in reference_lines:
        ax.axvline(position, color=line_color, lw=1, ls="--", alpha=0.6,
                   label=legend_text)
    ax.set_xlabel("Index of Coincidence", color=TEXT_COLOR)
    style_ax(ax, "Index of Coincidence")
    ax.legend(fontsize=7, labelcolor=TEXT_COLOR, facecolor=PANEL_COLOR,
              edgecolor=GRID_COLOR)

    # Print each value just to the right of its bar.
    for bar, value in zip(bars, values):
        y_mid = bar.get_y() + bar.get_height() / 2
        ax.text(value + 0.001, y_mid, f"{value:.4f}",
                va="center", color=TEXT_COLOR, fontsize=7.5)

    fig.tight_layout(pad=1.2)
    return fig_to_pil(fig)
|
|
|
|
def plot_entropy_curves(voynich_stats, lang_stats_list, selected_langs):
    """Plot order-0/1/2 entropy for Voynich and each selected language.

    Voynich is drawn as a solid line and each language as a dashed line.
    Returns the chart as a PIL image.

    The tick labels and title had been mis-decoded, so all three ticks read
    the same garbled text. They are restored here with Unicode escapes, which
    keep this source ASCII-only and immune to further re-encoding.
    """
    orders = [0, 1, 2]
    keys = ("h0", "h1", "h2")
    order_labels = [
        "H\u2080 (unigram)",
        "H\u2081 (bigram)",
        "H\u2082 (trigram)",
    ]

    fig, ax = plt.subplots(figsize=(9, 4.5), facecolor=BG_COLOR)
    ax.plot(orders, [voynich_stats[k] for k in keys], "o-",
            color=VOYNICH_COLOR, lw=2.2, ms=7, label="Voynich", zorder=5)

    for i, (lang_stats, lang) in enumerate(zip(lang_stats_list, selected_langs)):
        # Cycle the palette so more languages than colours cannot raise.
        color = LANG_COLORS[i % len(LANG_COLORS)]
        ax.plot(orders, [lang_stats[k] for k in keys], "o--",
                color=color, lw=1.6, ms=5, label=lang, alpha=0.85)

    ax.set_xticks(orders)
    ax.set_xticklabels(order_labels, color=TEXT_COLOR, fontsize=8)
    ax.set_ylabel("Entropy (bits)", color=TEXT_COLOR)
    style_ax(ax, "Entropy Curves (H\u2080 \u2192 H\u2081 \u2192 H\u2082)")
    ax.legend(fontsize=7.5, labelcolor=TEXT_COLOR, facecolor=PANEL_COLOR,
              edgecolor=GRID_COLOR)
    fig.tight_layout(pad=1.2)
    return fig_to_pil(fig)
|
|
|
|
def _draw_zipf_panel(ax, words, color, label):
    """Draw one rank/frequency scatter (plus fitted line) for *words* on *ax*."""
    counts = sorted(collections.Counter(words).values(), reverse=True)
    title = label
    if counts:
        ranks = np.arange(1, len(counts) + 1)
        ax.loglog(ranks, counts, ".", color=color, ms=3, alpha=0.7)
        # A fitted line is only drawn with more than two word types.
        if len(counts) > 2:
            log_ranks = np.log(ranks)
            log_counts = np.log(np.array(counts, dtype=float))
            slope, intercept = np.polyfit(log_ranks, log_counts, 1)
            fit = np.exp(intercept + slope * log_ranks)
            ax.loglog(ranks, fit, "-", color=color, lw=1.5, alpha=0.5)
            title = f"{label}\nslope={slope:.2f}"
    # Title and theme are applied on every path so that sparse or empty
    # corpora still produce a labelled, themed panel.
    ax.set_title(title, color=TEXT_COLOR, fontsize=8, pad=4)
    style_ax(ax)


def plot_zipf(voynich_stats, lang_stats_list, selected_langs):
    """Draw side-by-side log-log word rank/frequency plots.

    The first panel is Voynich, followed by one panel per selected language.
    Returns the chart as a PIL image.
    """
    n_plots = len(selected_langs) + 1
    # squeeze=False always returns a 2-D array, so a single panel needs no
    # special case.
    fig, axes = plt.subplots(1, n_plots,
                             figsize=(3 * n_plots, 4),
                             facecolor=BG_COLOR,
                             squeeze=False)
    axes = axes.ravel()

    _draw_zipf_panel(axes[0], voynich_stats["words"], VOYNICH_COLOR, "Voynich")
    for i, (lang_stats, lang) in enumerate(zip(lang_stats_list, selected_langs)):
        # Cycle the palette so more languages than colours cannot raise.
        color = LANG_COLORS[i % len(LANG_COLORS)]
        _draw_zipf_panel(axes[i + 1], lang_stats["words"], color, lang)

    fig.suptitle("Zipf Word-Frequency Plots (log-log)", color=TEXT_COLOR,
                 fontsize=9, y=1.01)
    fig.tight_layout(pad=1.2)
    return fig_to_pil(fig)
|
|
|
|
def plot_distance_radar(voynich_stats, lang_stats_list, selected_langs):
    """Draw a radar chart of six metrics, min-max normalised per metric.

    Normalisation is done across all plotted corpora. Returns the chart as a
    PIL image.

    The three entropy axis labels had been mis-decoded into the same garbled
    text. They are restored here with Unicode escapes so the source stays
    ASCII-only.
    """
    features = ["ioc", "h0", "h1", "h2", "ttr", "hapax"]
    feat_labels = ["IoC", "H\u2080", "H\u2081", "H\u2082", "TTR", "Hapax"]
    n_features = len(features)
    angles = [i / float(n_features) * 2 * math.pi for i in range(n_features)]
    angles += angles[:1]  # repeat the first angle to close the polygon

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw={"polar": True},
                           facecolor=BG_COLOR)
    ax.set_facecolor(PANEL_COLOR)

    all_stats = [voynich_stats] + lang_stats_list
    mins = {f: min(s[f] for s in all_stats) for f in features}
    # The epsilon keeps the range non-zero when every corpus shares a value.
    maxs = {f: max(s[f] for s in all_stats) + 1e-10 for f in features}

    def closed_polygon(stats):
        """Return normalised feature values with the first one repeated."""
        values = [(stats[f] - mins[f]) / (maxs[f] - mins[f]) for f in features]
        return values + values[:1]

    voynich_values = closed_polygon(voynich_stats)
    ax.plot(angles, voynich_values, "-", color=VOYNICH_COLOR, lw=2.2,
            label="Voynich")
    ax.fill(angles, voynich_values, color=VOYNICH_COLOR, alpha=0.12)

    for i, (lang_stats, lang) in enumerate(zip(lang_stats_list, selected_langs)):
        # Cycle the palette so more languages than colours cannot raise.
        color = LANG_COLORS[i % len(LANG_COLORS)]
        lang_values = closed_polygon(lang_stats)
        ax.plot(angles, lang_values, "--", color=color, lw=1.6, label=lang,
                alpha=0.85)
        ax.fill(angles, lang_values, color=color, alpha=0.05)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(feat_labels, color=TEXT_COLOR, fontsize=9)
    ax.tick_params(colors=TEXT_COLOR)
    ax.yaxis.set_tick_params(colors=GRID_COLOR)
    ax.grid(color=GRID_COLOR, linewidth=0.5)
    ax.spines["polar"].set_color(GRID_COLOR)
    ax.set_title("Metric Radar (normalized)", color=TEXT_COLOR, fontsize=9, pad=15)
    ax.legend(loc="upper right", bbox_to_anchor=(1.35, 1.15),
              fontsize=7.5, labelcolor=TEXT_COLOR, facecolor=PANEL_COLOR,
              edgecolor=GRID_COLOR)
    fig.tight_layout()
    return fig_to_pil(fig)
|
|
|
|
def plot_perplexity(voynich_tokens, lang_stats_list, selected_langs):
    """Draw a bar chart of Voynich perplexity under each language model.

    Each value comes from compute_perplexity() and is capped at 9999 so that
    an infinite result can still be drawn. Returns the chart as a PIL image.
    """
    perplexities = [
        min(compute_perplexity(voynich_tokens, lang_stats["tokens"]), 9999)
        for lang_stats in lang_stats_list
    ]
    # Cycle the palette so more languages than colours cannot misalign.
    colors = [LANG_COLORS[i % len(LANG_COLORS)]
              for i in range(len(selected_langs))]

    fig, ax = plt.subplots(figsize=(8, 4), facecolor=BG_COLOR)
    bars = ax.bar(selected_langs, perplexities,
                  color=colors, edgecolor="none", width=0.55)
    ax.set_ylabel("Perplexity (lower = more similar)", color=TEXT_COLOR)
    style_ax(ax, "Cross-Entropy Perplexity of Voynich under Each Language Model")

    for bar, value in zip(bars, perplexities):
        # Offset in points, not data units: a fixed data offset lands far
        # above short bars and on top of tall ones.
        ax.annotate(f"{value:.1f}",
                    xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                    xytext=(0, 3), textcoords="offset points",
                    ha="center", va="bottom", color=TEXT_COLOR, fontsize=8)

    fig.tight_layout(pad=1.2)
    return fig_to_pil(fig)
|
|
|
|
def plot_char_freq(voynich_stats, lang_stats_list, selected_langs):
    """Draw one bar chart per corpus of its 20 most frequent characters.

    Panels are laid out three per row with at least two rows, so up to six
    corpora render as a 2x3 grid and further corpora add rows instead of
    being dropped. Unused panels are hidden. Returns the chart as a PIL image.
    """
    entries = [("Voynich", voynich_stats, VOYNICH_COLOR)]
    for i, (lang, lang_stats) in enumerate(zip(selected_langs, lang_stats_list)):
        # Cycle the palette so more languages than colours cannot raise.
        entries.append((lang, lang_stats, LANG_COLORS[i % len(LANG_COLORS)]))

    n_cols = 3
    n_rows = max(2, math.ceil(len(entries) / n_cols))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 3.5 * n_rows),
                             facecolor=BG_COLOR)
    axes = axes.flatten()

    for ax, (label, stats, color) in zip(axes, entries):
        top = collections.Counter(stats["tokens"]).most_common(20)
        chars, counts = zip(*top) if top else ([], [])
        ax.bar(chars, counts, color=color, alpha=0.85, edgecolor="none")
        # \u2014 is an em dash; the literal had been mis-decoded.
        style_ax(ax, f"{label} \u2014 top 20 chars")
        ax.set_xlabel("Character", color=TEXT_COLOR)
        ax.set_ylabel("Count", color=TEXT_COLOR)

    for ax in axes[len(entries):]:
        ax.set_visible(False)

    fig.suptitle("Character Frequency Distributions", color=TEXT_COLOR,
                 fontsize=10, y=1.01)
    fig.tight_layout(pad=1.5)
    return fig_to_pil(fig)
|
|
|
|
| |
| |
| |
|
|
def build_summary_html(voynich_stats, lang_stats_list, selected_langs):
    """Build the HTML metrics table: one row for Voynich, one per language.

    Columns are Corpus, Chars, Words, IoC, H0, H1, H2, TTR, Hapax, Zipf slope,
    Distance and Perplexity. Distance and Perplexity are measured against
    Voynich, so the Voynich row shows a dash for both. Perplexity is capped
    at 9999.

    The three entropy column keys had been mis-decoded into the same string,
    which collapsed them into one dict entry and dropped the H0 and H1
    columns. They are now distinct ASCII keys.
    """
    placeholder = "&mdash;"
    entries = [("Voynich", voynich_stats, True)]
    entries += [(lang, stats, False)
                for lang, stats in zip(selected_langs, lang_stats_list)]

    rows = []
    for label, stats, is_voynich in entries:
        if is_voynich:
            dist_str = perp_str = placeholder
        else:
            dist_str = f"{distance_vector(stats, voynich_stats):.4f}"
            perp = compute_perplexity(voynich_stats["tokens"], stats["tokens"])
            perp_str = f"{min(perp, 9999):.1f}"
        row = {
            "Corpus": label,
            "Chars": stats["n_chars"],
            "Words": stats["n_words"],
            "IoC": f"{stats['ioc']:.4f}",
            "H0": f"{stats['h0']:.3f}",
            "H1": f"{stats['h1']:.3f}",
            "H2": f"{stats['h2']:.3f}",
            "TTR": f"{stats['ttr']:.3f}",
            "Hapax": f"{stats['hapax']:.3f}",
            "Zipf slope": f"{stats['zipf_slope']:.3f}",
            "Distance": dist_str,
            "Perplexity": perp_str,
        }
        rows.append((row, is_voynich))

    cols = list(rows[0][0].keys())
    th_style = "background:#1e1e2e;color:#c9a84c;padding:7px 12px;border:1px solid #2a2a38;font-size:12px;"
    td_style = "padding:6px 12px;border:1px solid #2a2a38;color:#d4cfc8;font-size:11px;text-align:center;"
    td_voynich = "padding:6px 12px;border:1px solid #2a2a38;color:#e8c97a;font-size:11px;text-align:center;font-weight:bold;background:#1a1a22;"

    header = "".join(f"<th style='{th_style}'>{c}</th>" for c in cols)
    body = []
    for row, is_voynich in rows:
        td = td_voynich if is_voynich else td_style
        cells = "".join(f"<td style='{td}'>{row[c]}</td>" for c in cols)
        body.append(f"<tr>{cells}</tr>")

    return (
        '<table style="border-collapse:collapse;width:100%;background:#0f0f14;">'
        f"<thead><tr>{header}</tr></thead>"
        f"<tbody>{''.join(body)}</tbody></table>"
    )
|
|
|
|
def build_ranking_html(voynich_stats, lang_stats_list, selected_langs):
    """Build two ranked HTML lists of the languages closest to Voynich.

    One list is ordered by metric distance and the other by perplexity
    (capped at 9999), both ascending.

    The first three rank badges had been mis-decoded into the same garbled
    text. They are restored as HTML numeric entities for the gold, silver and
    bronze medal emoji, which keeps the source ASCII-only.
    """
    badges = ("&#129351;", "&#129352;", "&#129353;", "4th", "5th")

    def badge(position):
        """Return the badge for a zero-based rank; plain numbers past fifth."""
        return badges[position] if position < len(badges) else str(position + 1)

    ranked = []
    for lang, lang_stats in zip(selected_langs, lang_stats_list):
        dist = distance_vector(lang_stats, voynich_stats)
        perp = min(compute_perplexity(voynich_stats["tokens"],
                                      lang_stats["tokens"]), 9999)
        ranked.append((lang, dist, perp))

    item_style = "margin:4px 0;color:#d4cfc8;font-size:12px;"
    heading_style = "color:#c9a84c;font-size:13px;margin-bottom:8px;"

    def column(title, items):
        """Wrap pre-formatted item strings in one titled flex column."""
        lines = "".join(
            f'<div style="{item_style}">{badge(i)} {text}</div>'
            for i, text in enumerate(items)
        )
        return (
            '<div style="flex:1;min-width:240px;">'
            f'<h3 style="{heading_style}">{title}</h3>'
            f"{lines}</div>"
        )

    by_dist = [f"<b>{lang}</b> &mdash; dist={dist:.4f}"
               for lang, dist, _ in sorted(ranked, key=lambda r: r[1])]
    by_perp = [f"<b>{lang}</b> &mdash; perp={perp:.1f}"
               for lang, _, perp in sorted(ranked, key=lambda r: r[2])]

    return (
        '<div style="display:flex;gap:24px;flex-wrap:wrap;">'
        + column("Closest by Metric Distance", by_dist)
        + column("Closest by Perplexity", by_perp)
        + "</div>"
    )
|
|
|
|
| |
| |
| |
|
|
def _analysis_error(message: str) -> list:
    """Return the 8-slot output list for a failed run: no images, red message."""
    return [None, None, None, None, None, None,
            f"<p style='color:#e87a7a'>{message}</p>", ""]


def _build_pmi_html(pmi_pairs) -> str:
    """Render (pair, score) PMI results as a row of small HTML badges."""
    badges = "".join(
        f'<span style="background:#1e1e2e;border:1px solid #2a2a38;padding:3px 8px;'
        f'border-radius:4px;color:#d4cfc8;font-size:11px;">'
        f'{a}+{b} <b style="color:#e8c97a">{score:.2f}</b></span>'
        for (a, b), score in pmi_pairs
    )
    return (
        '<h3 style="color:#c9a84c;font-size:13px;">Top PMI Bigram Pairs (Voynich)</h3>'
        f'<div style="display:flex;flex-wrap:wrap;gap:8px;">{badges}</div>'
    )


def run_analysis(
    uploaded_file,
    use_sample: bool,
    bigraph_rules_text: str,
    selected_langs: list[str],
    apply_splits: bool,
):
    """Run the full analysis and return the values for the eight UI outputs.

    Args:
        uploaded_file: Path of the uploaded transliteration, or None. An
            upload takes precedence over the sample.
        use_sample: Whether to fall back to SAMPLE_EVA when nothing is
            uploaded.
        bigraph_rules_text: Rules text for parse_bigraph_rules().
        selected_langs: Keys of LANGUAGE_CORPORA to compare against.
        apply_splits: Whether the bigraph rules are applied at all.

    Returns:
        A list of eight items: the IoC, entropy, Zipf, radar, perplexity and
        character-frequency images, then the summary table HTML, then the
        ranking HTML followed by the PMI badges. On invalid input or an
        unreadable file the six images are None, the seventh item is a red
        error message, and the eighth is an empty string.
    """
    if uploaded_file is None and not use_sample:
        return _analysis_error("Please upload a file or enable the sample corpus.")
    if not selected_langs:
        return _analysis_error("Please select at least one comparison language.")

    if uploaded_file is not None:
        # Report an unreadable upload the same way as other input problems
        # instead of letting the exception escape to the UI framework.
        try:
            with open(uploaded_file, "r", encoding="utf-8", errors="replace") as f:
                raw_voynich = f.read()
        except OSError:
            return _analysis_error("Could not read the uploaded file.")
    else:
        raw_voynich = SAMPLE_EVA

    rules = parse_bigraph_rules(bigraph_rules_text) if apply_splits else []
    processed_voynich = apply_bigraph_splits(raw_voynich, rules) if rules else raw_voynich

    vy_tokens = tokenize(processed_voynich)
    vy_words = tokenize_words(processed_voynich)
    if not vy_tokens:
        return _analysis_error(
            "Could not extract tokens from the text. Check input format.")

    voynich_stats = build_stats(vy_tokens, vy_words, "Voynich")

    lang_stats_list = []
    for lang in selected_langs:
        # An unknown name falls back to an empty corpus.
        corpus = LANGUAGE_CORPORA.get(lang, "")
        lang_stats_list.append(
            build_stats(tokenize(corpus), tokenize_words(corpus), lang))

    ioc_img = plot_ioc_comparison(voynich_stats, lang_stats_list, selected_langs)
    entropy_img = plot_entropy_curves(voynich_stats, lang_stats_list, selected_langs)
    zipf_img = plot_zipf(voynich_stats, lang_stats_list, selected_langs)
    radar_img = plot_distance_radar(voynich_stats, lang_stats_list, selected_langs)
    perp_img = plot_perplexity(vy_tokens, lang_stats_list, selected_langs)
    freq_img = plot_char_freq(voynich_stats, lang_stats_list, selected_langs)

    summary_html = build_summary_html(voynich_stats, lang_stats_list, selected_langs)
    ranking_html = build_ranking_html(voynich_stats, lang_stats_list, selected_langs)
    pmi_html = _build_pmi_html(pmi_top_pairs(vy_tokens, top_n=15))

    return [ioc_img, entropy_img, zipf_img, radar_img, perp_img, freq_img,
            summary_html, ranking_html + "<br>" + pmi_html]
|
|
|
|
| |
| |
| |
|
|
# Dark-theme stylesheet handed to gr.Blocks(css=...) in create_app().
# The @import pulls both web fonts from fonts.googleapis.com.
# NOTE(review): the .gr-* selectors presumably target class names emitted by
# the Gradio front end -- confirm they still match the installed version.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=IM+Fell+English:ital@0;1&family=JetBrains+Mono:wght@400;600&display=swap');

body, .gradio-container {
background: #0f0f14 !important;
color: #d4cfc8 !important;
font-family: 'IM Fell English', serif !important;
}

h1, h2, h3 { color: #e8c97a !important; letter-spacing: 0.04em; }

.gr-panel, .gr-box, .gr-form { background: #13131b !important; border-color: #2a2a38 !important; }

.gr-button {
background: #c9a84c !important;
color: #0f0f14 !important;
border: none !important;
font-family: 'JetBrains Mono', monospace !important;
font-weight: 600 !important;
letter-spacing: 0.05em;
border-radius: 3px !important;
}
.gr-button:hover { background: #e8c97a !important; }

.gr-check-radio { accent-color: #c9a84c !important; }

textarea, input[type=text] {
background: #16161f !important;
color: #d4cfc8 !important;
border: 1px solid #2a2a38 !important;
font-family: 'JetBrains Mono', monospace !important;
font-size: 12px !important;
}

label { color: #b8b0a8 !important; font-size: 13px !important; }

.gr-tab-item { color: #c9a84c !important; border-color: #2a2a38 !important; }
.gr-tab-item.selected { background: #1e1e2e !important; }

.gr-image img { border-radius: 4px; border: 1px solid #2a2a38; }

#header-block {
text-align: center;
padding: 24px 0 12px;
border-bottom: 1px solid #2a2a38;
margin-bottom: 16px;
}
#header-block h1 { font-size: 2rem; margin-bottom: 4px; }
#header-block p { color: #888; font-size: 0.9rem; font-style: italic; }
"""
|
|
# Page banner rendered with gr.HTML() at the top of the app; styled by the
# #header-block rules in CUSTOM_CSS.
# The separators had been mis-decoded and are restored as &middot; entities so
# the markup stays ASCII-only. The glyph that preceded the heading could not
# be recovered from the damaged text and has been dropped.
HEADER_HTML = """
<div id="header-block">
<h1>Voynich Linguistic Analyzer</h1>
<p>EVA transliteration &middot; statistical cryptolinguistics &middot; language comparison</p>
</div>
"""
|
|
|
|
def create_app():
    """Build and return the Gradio Blocks UI (not yet launched).

    The left column holds the inputs: file upload, sample toggle, bigraph
    rules, language selection and the run button. The right column holds one
    tab per output. Clicking the button calls run_analysis() and routes its
    eight return values to the outputs in the order listed in ``outputs``.

    The heading, tab and button labels carried decorative glyphs that had
    been mis-decoded into unrelated characters. Most could not be recovered,
    so labels are now plain text; only the run button's play triangle is
    restored, via an escape.
    """
    with gr.Blocks(css=CUSTOM_CSS, title="Voynich Analyzer") as demo:
        gr.HTML(HEADER_HTML)

        with gr.Row():
            with gr.Column(scale=1, min_width=280):
                gr.Markdown("### Input")
                uploaded_file = gr.File(
                    label="Upload EVA transliteration (.txt)",
                    file_types=[".txt"],
                    type="filepath",
                )
                use_sample = gr.Checkbox(
                    label="Use built-in sample corpus (fallback if no upload)",
                    value=True,
                )

                gr.Markdown("### Bigraph Splitting")
                apply_splits = gr.Checkbox(label="Apply bigraph splitting", value=True)
                bigraph_rules = gr.Textbox(
                    label="Rules (format: xy -> x y, one per line)",
                    value=DEFAULT_BIGRAPH_RULES,
                    lines=10,
                )

                gr.Markdown("### Comparison Languages")
                selected_langs = gr.CheckboxGroup(
                    choices=list(LANGUAGE_CORPORA.keys()),
                    value=list(LANGUAGE_CORPORA.keys()),
                    label="Languages to compare",
                )

                run_btn = gr.Button("\u25b6 Run Analysis", variant="primary")

            with gr.Column(scale=3):
                with gr.Tabs():
                    with gr.Tab("Summary"):
                        ranking_out = gr.HTML(label="Ranking")
                        summary_out = gr.HTML(label="Metrics Table")

                    with gr.Tab("Index of Coincidence"):
                        ioc_out = gr.Image(label="IoC Comparison", type="pil")

                    with gr.Tab("Entropy Curves"):
                        entropy_out = gr.Image(label="Entropy Curves", type="pil")

                    with gr.Tab("Zipf Plots"):
                        zipf_out = gr.Image(label="Zipf Word-Frequency", type="pil")

                    with gr.Tab("Radar Chart"):
                        radar_out = gr.Image(label="Metric Radar", type="pil")

                    with gr.Tab("Perplexity"):
                        perp_out = gr.Image(label="Cross-Entropy Perplexity", type="pil")

                    with gr.Tab("Char Frequencies"):
                        freq_out = gr.Image(label="Character Frequency Distributions",
                                            type="pil")

        # The outputs order must match the list returned by run_analysis().
        run_btn.click(
            fn=run_analysis,
            inputs=[uploaded_file, use_sample, bigraph_rules, selected_langs, apply_splits],
            outputs=[ioc_out, entropy_out, zipf_out, radar_out, perp_out, freq_out,
                     summary_out, ranking_out],
        )

    return demo
|
|
|
|
# Script entry point: build the UI and start the Gradio server with its
# default launch settings.
if __name__ == "__main__":
    app = create_app()

    app.launch()