V4 / app.py
kambris's picture
Create app.py
2187547 verified
"""
Voynich Manuscript Linguistic Analyzer
Gradio app for statistical analysis of EVA-transliterated Voynich text,
compared against Latin, Arabic, Hebrew, Medieval Welsh, and Georgian corpora.
"""
import io
import math
import re
import os
import collections
import tempfile
import atexit
from typing import Optional
import gradio as gr
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from PIL import Image as PILImage
# ─────────────────────────────────────────────
# SAMPLE VOYNICH EVA TEXT (small illustrative corpus)
# ─────────────────────────────────────────────
# Built-in EVA-transliterated sample, used by run_analysis() when no file is uploaded.
# Words are whitespace-separated, one text line per source line.
# The first ten lines appear twice verbatim (the block starting "fachys ykal ..." is
# repeated), so frequency statistics computed on this sample are inflated by duplication.
SAMPLE_EVA = """
fachys ykal ar ataiin shol shory cth res y kor sholdy
qokaiin qokey qokaiin ytaiin yteor qokain qokal y tain ytaiin
daiin okaiin chedy qokeedy dain shedy daiin ol cheds daiin
okeedy chedal okeedy dar ar aiin daiin daiin oteedy keedy chal
chol dy kaiin dar shey qodar ytedy cheds chol dain kaiin
oteedy sheedy chedal dar ytedy okaiin chedy kaiin shedy chol
qokeol dar yteedy cheds dar chol qokeedy okeedy chedal dar
daiin yteedy qokain chedy dar chol daiin okaiin chedy kaiin
shedy chol oteedy chedal qodar yteol dar daiin okaiin chedy
qokeol chedal dar chol daiin oteedy chedal dar yteedy okeedy
fachys ykal ar ataiin shol shory cth res y kor sholdy
qokaiin qokey qokaiin ytaiin yteor qokain qokal y tain ytaiin
daiin okaiin chedy qokeedy dain shedy daiin ol cheds daiin
okeedy chedal okeedy dar ar aiin daiin daiin oteedy keedy chal
chol dy kaiin dar shey qodar ytedy cheds chol dain kaiin
oteedy sheedy chedal dar ytedy okaiin chedy kaiin shedy chol
qokeol dar yteedy cheds dar chol qokeedy okeedy chedal dar
daiin yteedy qokain chedy dar chol daiin okaiin chedy kaiin
shedy chol oteedy chedal qodar yteol dar daiin okaiin chedy
qokeol chedal dar chol daiin oteedy chedal dar yteedy okeedy
daiin okaiin shedy chol daiin oteedy qokain chedal cheds chol
shory chol daiin keedy okaiin chedal dar daiin shedy kaiin
qokeedy chedal yteedy dar chol okeedy daiin chedal shedy dar
ytaiin qokain daiin chedy qodar shedy okaiin cheds chol daiin
"""
# ─────────────────────────────────────────────
# REFERENCE LANGUAGE CORPORA (romanized / transliterated samples)
# These are small illustrative samples — real analysis needs larger corpora
# ─────────────────────────────────────────────
# Maps a language name to a short romanized text sample.
# The keys double as the choices of the "Languages to compare" checkbox group in
# create_app(), and run_analysis() looks corpora up by these names.
# Every sample goes through tokenize() / tokenize_words(), which lowercase the text
# and keep letters only, so case and punctuation carry no information downstream.
# NOTE(review): the Georgian sample mixes upper- and lowercase letters inside words
# (e.g. "gareSe", "TiToeul"); presumably case distinguishes consonants in this
# transliteration, and lowercasing would then merge them — confirm.
LANGUAGE_CORPORA = {
    "Latin": """
arma virumque cano troiae qui primus ab oris italiam fato profugus laviniaque venit
litora multum ille et terris iactatus et alto vi superum saevae memorem iunonis ob iram
multa quoque et bello passus dum conderet urbem inferretque deos latio genus unde latinum
albanique patres atque altae moenia romae musa mihi causas memora quo numine laeso
quidve dolens regina deum tot volvere casus insignem pietate virum tot adire labores
impulerit tantaene animis caelestibus irae urbs antiqua fuit tyrii tenuere coloni
carthago italiam contra tiberinaque longe ostia dives opum studiisque asperrima belli
quam iuno fertur terris magis omnibus unam posthabita coluisse samo hic illius arma
hic currus fuit hoc regnum dea gentibus esse si qua fata sinant iam tum tenditque fovetque
progeniem sed enim troiano a sanguine duci audierat tyrias olim quae verteret arces
""",
    "Arabic": """
bismi allahi alrrahmani alrrahimi alhamdu lillahi rabbi alealamina alrrahmani alrrahimi
maliki yawmi alddini iyyaka nabudu waiyyaka nastainu ihdina alssirata almustaqima
sirata alladhina anamta ealayhim ghayri almaghdubi ealayhim wala alddalina
qul huwa allahu ahadun allahu alssamadu lam yalid walam yulad walam yakun lahu
kufuan ahadun inna anzalnahu fi laylati alqadri wama adraka ma laylatu alqadri
laylatu alqadri khayrun min alfi shahrin tanazzalu almalaaikatu waalrruhu fiha
bidni rabbihim min kulli amrin salamun hiya hatta matlaei alfajri alam nashrah
laka sadraka wawadaena anka wizraka allathee anqada zahraka warafaena laka
dhikraka fainna maea aleusri yusran inna maea aleusri yusran faitha faraghta
""",
    "Hebrew": """
bereshit bara elohim et hashamayim veet haaretz vehaaretz hayta tohu vavohu
vechoshech al pney tehom veruach elohim merachefet al pney hamayim vayomer
elohim yehi or vayehi or vayar elohim et haor ki tov vayavdel elohim beyn haor
uveyn hachoshech vayikra elohim laor yom velachoshech kara layla vayehi erev
vayehi voker yom echad vayomer elohim yehi rakia betoch hamayim vihi mavdil
beyn mayim lammayim vayaas elohim et harakia vayavdel beyn hamayim asher
mitachat larakia uveyn hamayim asher meal larakia vayehi chen vayikra elohim
larakia shamayim vayehi erev vayehi voker yom sheni vayomer elohim yikavu
hamayim mitachat hashamayim el makom echad vetera hayabashah vayehi chen
""",
    "Medieval Welsh": """
yn y dechreuad y creodd duw y nefoedd ar ddaear yr oedd y ddaear yn adfeilion
ac yn wag ac yr oedd tywyllwch ar wyneb y dyfnder ac ysbryd duw yn symud ar
wyneb y dyfroedd a duw a ddywedodd bydded goleuni a bu goleuni a duw a welodd
y goleuni ei fod yn dda a duw a wahanodd y goleuni oddi wrth y tywyllwch
ac a alwodd duw y goleuni yn ddydd a galwodd y tywyllwch yn nos ac aeth
yr hwyr ar bore dydd cyntaf a duw a ddywedodd bydded ffurfafen yng nghanol y dyfroedd
a bydded yn gwahanu dyfroedd oddi wrth ddyfroedd a gwnaeth duw y ffurfafen
ac a wahanodd rhwng y dyfroedd oedd tan y ffurfafen ar dyfroedd oedd uwch
""",
    "Georgian": """
tavisupali pirovneba arsebobs rasac unda iqos da rasac unda aketebs piradi
tanxmobis gareSe arc erTi pirovneba verc aaketebs samarTliani sazogadoeba
romelic TiToeul wevrze mzrunvelobas iCens Tavisuflebas da Tanasworobas
uzrunvelyofs yvela moqalaqe kanonis winaSe Tanasworad aris arc erTi
diskriminacia ar aris Semosvla pirovnebis uflebebSi samarTliani
da Tavisufali sazogadoeba unda aRiardes Tavisufali arCevnis ufleba
TiToeuli adamiani ibadeba TavisuflebiT da Tanasworad RirsebiT da
uflebiT isini jildosulia gonebisa da sindisisa da urTierTobaSi
erTmaneTis mimarT ZmobisduliT unda moiqcnen
""",
}
# ─────────────────────────────────────────────
# DEFAULT BIGRAPH SPLIT RULES (EVA-based)
# ─────────────────────────────────────────────
# Initial content of the "Rules" textbox in the UI, one rule per line in the
# "xy -> x y" format understood by parse_bigraph_rules().
# Every default replacement is the bigraph's two letters separated by a space.
DEFAULT_BIGRAPH_RULES = """qo -> q o
ch -> c h
sh -> s h
ee -> e e
ai -> a i
ol -> o l
or -> o r
ar -> a r
al -> a l
"""
# ─────────────────────────────────────────────
# CORE ANALYSIS FUNCTIONS
# ─────────────────────────────────────────────
def parse_bigraph_rules(rules_text: str) -> list[tuple[str, str]]:
    """Parse bigraph split rules written one per line as ``xy -> x y``.

    Blank lines and lines without ``->`` are ignored. Each remaining line is
    split on the first ``->`` and both sides are stripped of surrounding
    whitespace.

    Rules with an empty left-hand side (for example ``-> x``) are skipped:
    passed on to ``str.replace("", ...)`` they would insert the replacement
    between every character of the text.

    Args:
        rules_text: Raw multi-line rule text, typically from the UI textbox.

    Returns:
        ``(bigraph, replacement)`` pairs in the order the rules appear.
    """
    rules = []
    for line in rules_text.splitlines():
        lhs, arrow, rhs = line.partition("->")
        if not arrow:
            # No separator on this line (this also covers blank lines).
            continue
        bigraph = lhs.strip()
        if not bigraph:
            # Nothing to search for; see the docstring for why this is dropped.
            continue
        rules.append((bigraph, rhs.strip()))
    return rules
def apply_bigraph_splits(text: str, rules: list[tuple[str, str]]) -> str:
    """Rewrite *text* by applying each ``(bigraph, replacement)`` rule in turn.

    Rules are applied sequentially with ``str.replace``, so a later rule sees
    the output of every earlier one.
    """
    result = text
    for rule in rules:
        source, target = rule
        result = result.replace(source, target)
    return result
def tokenize(text: str) -> list[str]:
    """Return the alphabetic characters of *text*, lowercased, in order.

    Digits, whitespace and punctuation are dropped.
    """
    return list(filter(str.isalpha, text.lower()))
def tokenize_words(text: str) -> list[str]:
    """Return the lowercase words of *text*.

    A word is a maximal run of ASCII letters; anything else separates words.
    """
    lowered = text.lower()
    # The pattern requires at least one letter, so no match is ever empty.
    return re.findall(r"[a-zA-Z]+", lowered)
def ioc(tokens: list[str]) -> float:
    """Index of Coincidence of *tokens*.

    Computed as sum(f * (f - 1)) / (n * (n - 1)) over the symbol frequencies
    ``f``, where ``n`` is the token count. Returns 0.0 for fewer than two tokens.
    """
    total = len(tokens)
    if total < 2:
        return 0.0
    coincidences = 0
    for count in collections.Counter(tokens).values():
        coincidences += count * (count - 1)
    return coincidences / (total * (total - 1))
def entropy_order0(tokens: list[str]) -> float:
    """Shannon entropy (bits per symbol) of the unigram distribution of *tokens*.

    Returns 0.0 for an empty sequence.
    """
    total = len(tokens)
    if total == 0:
        return 0.0
    probabilities = [count / total for count in collections.Counter(tokens).values()]
    return -sum(p * math.log2(p) for p in probabilities)
def entropy_order1(tokens: list[str]) -> float:
    """Conditional entropy (bits) of a symbol given the symbol before it.

    Joint probabilities are relative bigram frequencies; conditionals divide
    each bigram count by the count of its first symbol as a context (the last
    token is never a context). Returns 0.0 for fewer than two tokens.
    """
    if len(tokens) < 2:
        return 0.0
    pair_counts = collections.Counter(zip(tokens, tokens[1:]))
    context_counts = collections.Counter(tokens[:-1])
    n_pairs = len(tokens) - 1
    entropy = 0.0
    for pair, count in pair_counts.items():
        joint = count / n_pairs
        conditional = count / context_counts[pair[0]]
        entropy -= joint * math.log2(conditional)
    return entropy
def entropy_order2(tokens: list[str]) -> float:
    """Conditional entropy (bits) of a symbol given the two symbols before it.

    Joint probabilities are relative trigram frequencies; conditionals divide
    each trigram count by the count of its leading bigram as a context.
    Returns 0.0 for fewer than three tokens.
    """
    if len(tokens) < 3:
        return 0.0
    triple_counts = collections.Counter(zip(tokens, tokens[1:], tokens[2:]))
    context_counts = collections.Counter(zip(tokens[:-2], tokens[1:-1]))
    n_triples = len(tokens) - 2
    entropy = 0.0
    for triple, count in triple_counts.items():
        joint = count / n_triples
        conditional = count / context_counts[triple[:2]]
        entropy -= joint * math.log2(conditional)
    return entropy
def zipf_slope(tokens: list[str]) -> float:
    """Slope of a least-squares line through the log-log rank/frequency curve.

    Natural language is expected to give a slope near -1. Returns 0.0 when
    there are fewer than two distinct tokens.
    """
    ordered = sorted(collections.Counter(tokens).values(), reverse=True)
    n_types = len(ordered)
    if n_types < 2:
        return 0.0
    log_rank = np.log(np.arange(1, n_types + 1))
    log_freq = np.log(np.array(ordered, dtype=float))
    # polyfit returns coefficients highest degree first: [slope, intercept].
    coefficients = np.polyfit(log_rank, log_freq, 1)
    return coefficients[0]
def type_token_ratio(words: list[str]) -> float:
    """Number of distinct words divided by the number of words (0.0 if empty)."""
    return len(set(words)) / len(words) if words else 0.0
def hapax_ratio(words: list[str]) -> float:
    """Fraction of distinct words that occur exactly once (0.0 if empty).

    The denominator is the number of word types, not the number of tokens.
    """
    if not words:
        return 0.0
    counts = collections.Counter(words)
    singletons = [word for word, count in counts.items() if count == 1]
    return len(singletons) / len(counts)
def pmi_top_pairs(tokens: list[str], top_n: int = 10) -> list[tuple[tuple, float]]:
    """Return the *top_n* adjacent symbol pairs with the highest PMI.

    PMI is log2(p(a, b) / (p(a) * p(b))), with p(a, b) taken over adjacent
    pairs and p(a), p(b) over all tokens. The result is a list of
    ``((a, b), score)`` entries sorted by descending score; ties keep
    first-seen order. Returns an empty list for fewer than two tokens.
    """
    if len(tokens) < 2:
        return []
    pair_counts = collections.Counter(zip(tokens, tokens[1:]))
    symbol_counts = collections.Counter(tokens)
    n_tokens = len(tokens)
    n_pairs = n_tokens - 1
    scored = []
    for (first, second), count in pair_counts.items():
        joint = count / n_pairs
        p_first = symbol_counts[first] / n_tokens
        p_second = symbol_counts[second] / n_tokens
        if joint > 0 and p_first > 0 and p_second > 0:
            scored.append(((first, second), math.log2(joint / (p_first * p_second))))
    # A stable descending sort keeps equal scores in insertion order.
    scored.sort(key=lambda entry: entry[1], reverse=True)
    return scored[:top_n]
def compute_perplexity(test_tokens: list[str], train_tokens: list[str]) -> float:
    """Perplexity of *test_tokens* under an add-k smoothed bigram model of *train_tokens*.

    The model estimates p(b | a) = (count(a, b) + k) / (count(a) + k * V) with
    k = 0.5 (add-k / Lidstone smoothing). V is the number of distinct symbols
    in the training and test sequences together, so that the probabilities of
    every symbol the model is asked to score sum to 1 for each context. Using
    the training alphabet alone would hand extra mass to test symbols unseen
    in training and understate the perplexity.

    Args:
        test_tokens: Sequence to score.
        train_tokens: Sequence the bigram counts are collected from.

    Returns:
        2 ** cross-entropy (bits per bigram) of the test sequence, or
        ``float("inf")`` when either sequence has fewer than two tokens.
    """
    if len(test_tokens) < 2 or len(train_tokens) < 2:
        return float("inf")

    bigram_counts = collections.Counter(zip(train_tokens, train_tokens[1:]))
    # The last training token never acts as a context, so it is excluded here.
    context_counts = collections.Counter(train_tokens[:-1])
    vocab_size = len(set(train_tokens) | set(test_tokens))
    k = 0.5  # add-k smoothing constant

    log_prob = 0.0
    n_test_bigrams = len(test_tokens) - 1
    for a, b in zip(test_tokens, test_tokens[1:]):
        numerator = bigram_counts.get((a, b), 0) + k
        denominator = context_counts.get(a, 0) + k * vocab_size
        log_prob += math.log2(numerator / denominator)

    cross_entropy = -log_prob / n_test_bigrams
    return 2 ** cross_entropy
def build_stats(tokens: list[str], words: list[str], label: str) -> dict:
    """Collect every corpus-level metric for one text into a single dict.

    Character metrics (IoC, H0-H2) come from *tokens*, word metrics (TTR,
    hapax ratio, Zipf slope) from *words*. The raw token and word lists are
    stored as well so that plotting code can reuse them.
    """
    # The Zipf fit is only attempted with more than ten words; otherwise 0.0.
    slope = zipf_slope(words) if len(words) > 10 else 0.0
    return dict(
        label=label,
        n_chars=len(tokens),
        n_words=len(words),
        vocab_chars=len(set(tokens)),
        vocab_words=len(set(words)),
        ioc=ioc(tokens),
        h0=entropy_order0(tokens),
        h1=entropy_order1(tokens),
        h2=entropy_order2(tokens),
        ttr=type_token_ratio(words),
        hapax=hapax_ratio(words),
        zipf_slope=slope,
        tokens=tokens,
        words=words,
    )
def distance_vector(stats: dict, voynich_stats: dict) -> float:
    """Euclidean distance between two stats dicts.

    The distance is taken over the unscaled features ioc, h0, h1, h2, ttr
    and hapax.
    """
    feature_names = ("ioc", "h0", "h1", "h2", "ttr", "hapax")
    reference = np.array([voynich_stats[name] for name in feature_names])
    candidate = np.array([stats[name] for name in feature_names])
    difference = reference - candidate
    return float(np.linalg.norm(difference))
# ─────────────────────────────────────────────
# PLOTTING FUNCTIONS
# ─────────────────────────────────────────────
# Shared dark-theme palette used by every plot.
VOYNICH_COLOR = "#e8c97a"  # series colour for the Voynich corpus
# One colour per comparison language, indexed by position in the selected list.
# There are five entries, matching the five corpora in LANGUAGE_CORPORA.
LANG_COLORS = ["#7ab8e8", "#e87a7a", "#7ae8a5", "#c87ae8", "#e8a57a"]
BG_COLOR = "#0f0f14"  # figure background
PANEL_COLOR = "#16161f"  # axes / legend background
TEXT_COLOR = "#d4cfc8"  # titles, tick labels, annotations
GRID_COLOR = "#2a2a38"  # grid lines and spines
def fig_to_pil(fig) -> PILImage.Image:
    """Render a matplotlib figure to an in-memory PIL image and close the figure.

    The figure is saved as PNG (dpi 140, tight bounding box, BG_COLOR
    background) into a memory buffer. It is closed even when saving raises,
    so a failed render does not leave the figure registered with pyplot.

    Returns:
        A fully loaded copy of the rendered image, independent of the buffer.
    """
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format="png", dpi=140, bbox_inches="tight",
                    facecolor=BG_COLOR, edgecolor="none")
    finally:
        # pyplot keeps a reference to every open figure until close() is called.
        plt.close(fig)
    buf.seek(0)
    # copy() forces the pixel data to load; the lazily-opened image is then released.
    with PILImage.open(buf) as rendered:
        return rendered.copy()
def style_ax(ax, title=""):
    """Apply the dark theme to *ax*, and set *title* when one is given.

    Sets the panel background, tick and axis-label colours, spine colours and
    a light grid. An empty *title* leaves any existing title untouched.
    """
    ax.set_facecolor(PANEL_COLOR)
    ax.tick_params(colors=TEXT_COLOR, labelsize=8)
    for axis in (ax.xaxis, ax.yaxis):
        axis.label.set_color(TEXT_COLOR)
    for border in ax.spines.values():
        border.set_edgecolor(GRID_COLOR)
    ax.grid(True, color=GRID_COLOR, linewidth=0.5, alpha=0.7)
    if not title:
        return
    ax.set_title(title, color=TEXT_COLOR, fontsize=9, fontweight="bold", pad=6)
def plot_ioc_comparison(voynich_stats, lang_stats_list, selected_langs):
    """Horizontal bar chart of the Index of Coincidence for every corpus.

    Bars are drawn for Voynich, each selected language, and two fixed
    reference values (0.038 labelled "Random", 0.065 labelled "English ref");
    the reference values are also marked with dashed vertical lines.
    Returns the rendered chart as a PIL image.
    """
    random_ioc = 0.038
    english_ioc = 0.065
    random_color = "#555566"
    english_color = "#445544"

    bar_labels = ["Voynich", *selected_langs, "Random", "English ref"]
    bar_values = [
        voynich_stats["ioc"],
        *(entry["ioc"] for entry in lang_stats_list),
        random_ioc,
        english_ioc,
    ]
    bar_colors = [
        VOYNICH_COLOR,
        *LANG_COLORS[:len(selected_langs)],
        random_color,
        english_color,
    ]

    fig, ax = plt.subplots(figsize=(9, 4), facecolor=BG_COLOR)
    bars = ax.barh(bar_labels, bar_values, color=bar_colors, height=0.55, edgecolor="none")
    ax.axvline(random_ioc, color=random_color, lw=1, ls="--", alpha=0.6, label="Random (0.038)")
    ax.axvline(english_ioc, color=english_color, lw=1, ls="--", alpha=0.6, label="English (0.065)")
    ax.set_xlabel("Index of Coincidence", color=TEXT_COLOR)
    style_ax(ax, "Index of Coincidence")
    ax.legend(fontsize=7, labelcolor=TEXT_COLOR, facecolor=PANEL_COLOR, edgecolor=GRID_COLOR)

    # Print each value just to the right of its bar, vertically centred.
    for bar, value in zip(bars, bar_values):
        y_mid = bar.get_y() + bar.get_height() / 2
        ax.text(value + 0.001, y_mid, f"{value:.4f}",
                va="center", color=TEXT_COLOR, fontsize=7.5)

    fig.tight_layout(pad=1.2)
    return fig_to_pil(fig)
def plot_entropy_curves(voynich_stats, lang_stats_list, selected_langs):
    """Line chart of the order-0/1/2 entropies for Voynich and each language.

    Voynich is drawn as a solid line on top; each language is a dashed line
    coloured by its position in *selected_langs*. Returns a PIL image.
    """
    entropy_keys = ("h0", "h1", "h2")
    x_positions = [0, 1, 2]
    tick_labels = ["Hโ‚€ (unigram)", "Hโ‚ (bigram)", "Hโ‚‚ (trigram)"]

    fig, ax = plt.subplots(figsize=(9, 4.5), facecolor=BG_COLOR)

    voynich_curve = [voynich_stats[key] for key in entropy_keys]
    ax.plot(x_positions, voynich_curve, "o-", color=VOYNICH_COLOR, lw=2.2, ms=7,
            label="Voynich", zorder=5)

    for index, (entry, lang) in enumerate(zip(lang_stats_list, selected_langs)):
        lang_curve = [entry[key] for key in entropy_keys]
        ax.plot(x_positions, lang_curve, "o--", color=LANG_COLORS[index], lw=1.6, ms=5,
                label=lang, alpha=0.85)

    ax.set_xticks(x_positions)
    ax.set_xticklabels(tick_labels, color=TEXT_COLOR, fontsize=8)
    ax.set_ylabel("Entropy (bits)", color=TEXT_COLOR)
    style_ax(ax, "Entropy Curves (Hโ‚€ โ†’ Hโ‚ โ†’ Hโ‚‚)")
    ax.legend(fontsize=7.5, labelcolor=TEXT_COLOR, facecolor=PANEL_COLOR, edgecolor=GRID_COLOR)
    fig.tight_layout(pad=1.2)
    return fig_to_pil(fig)
def plot_zipf(voynich_stats, lang_stats_list, selected_langs):
    """Draw one log-log rank/frequency panel per corpus and return a PIL image.

    The first panel is Voynich, followed by one panel per selected language.
    Every panel is themed and titled with its corpus name, including panels
    whose corpus has no words or too few word types for a line fit. The
    fitted line and the ``slope=`` suffix in the title are added only when
    there are more than two distinct words.
    """
    n_plots = len(selected_langs) + 1
    # squeeze=False always yields a 2-D axes array, so a single panel needs
    # no special casing.
    fig, axes_grid = plt.subplots(1, n_plots,
                                  figsize=(3 * n_plots, 4),
                                  facecolor=BG_COLOR,
                                  squeeze=False)
    axes = axes_grid[0]

    def _draw_zipf(ax, words, color, label):
        """Plot the rank/frequency points (and fit, if possible) for one corpus."""
        counts = sorted(collections.Counter(words).values(), reverse=True)
        title = label
        if counts:
            ranks = np.arange(1, len(counts) + 1)
            ax.loglog(ranks, counts, ".", color=color, ms=3, alpha=0.7)
            if len(counts) > 2:
                log_ranks = np.log(ranks)
                log_counts = np.log(np.array(counts, dtype=float))
                slope, intercept = np.polyfit(log_ranks, log_counts, 1)
                fit = np.exp(intercept + slope * log_ranks)
                ax.loglog(ranks, fit, "-", color=color, lw=1.5, alpha=0.5)
                title = f"{label}\nslope={slope:.2f}"
        ax.set_title(title, color=TEXT_COLOR, fontsize=8, pad=4)
        # style_ax() without a title argument leaves the title set above in place.
        style_ax(ax)

    _draw_zipf(axes[0], voynich_stats["words"], VOYNICH_COLOR, "Voynich")
    for i, (ls, lang) in enumerate(zip(lang_stats_list, selected_langs)):
        _draw_zipf(axes[i + 1], ls["words"], LANG_COLORS[i], lang)

    fig.suptitle("Zipf Word-Frequency Plots (log-log)", color=TEXT_COLOR, fontsize=9, y=1.01)
    fig.tight_layout(pad=1.2)
    return fig_to_pil(fig)
def plot_distance_radar(voynich_stats, lang_stats_list, selected_langs):
    """Radar chart of six metrics, min-max normalized across all corpora.

    Each feature is rescaled across Voynich plus the selected languages; a
    tiny constant is added to each maximum so that a feature with identical
    values everywhere does not divide by zero. Returns a PIL image.
    """
    features = ["ioc", "h0", "h1", "h2", "ttr", "hapax"]
    feat_labels = ["IoC", "Hโ‚€", "Hโ‚", "Hโ‚‚", "TTR", "Hapax"]
    n_features = len(features)

    # One spoke per feature; the first angle is repeated to close the polygon.
    spoke_angles = [step / float(n_features) * 2 * math.pi for step in range(n_features)]
    closed_angles = spoke_angles + spoke_angles[:1]

    corpora = [voynich_stats] + lang_stats_list
    lows = {name: min(entry[name] for entry in corpora) for name in features}
    highs = {name: max(entry[name] for entry in corpora) + 1e-10 for name in features}

    def closed_profile(entry):
        """Normalized feature values for *entry*, with the first value repeated."""
        scaled = [(entry[name] - lows[name]) / (highs[name] - lows[name]) for name in features]
        return scaled + scaled[:1]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw={"polar": True}, facecolor=BG_COLOR)
    ax.set_facecolor(PANEL_COLOR)

    voynich_profile = closed_profile(voynich_stats)
    ax.plot(closed_angles, voynich_profile, "-", color=VOYNICH_COLOR, lw=2.2, label="Voynich")
    ax.fill(closed_angles, voynich_profile, color=VOYNICH_COLOR, alpha=0.12)

    for index, (entry, lang) in enumerate(zip(lang_stats_list, selected_langs)):
        lang_profile = closed_profile(entry)
        ax.plot(closed_angles, lang_profile, "--", color=LANG_COLORS[index], lw=1.6,
                label=lang, alpha=0.85)
        ax.fill(closed_angles, lang_profile, color=LANG_COLORS[index], alpha=0.05)

    ax.set_xticks(closed_angles[:-1])
    ax.set_xticklabels(feat_labels, color=TEXT_COLOR, fontsize=9)
    ax.tick_params(colors=TEXT_COLOR)
    ax.yaxis.set_tick_params(colors=GRID_COLOR)
    ax.grid(color=GRID_COLOR, linewidth=0.5)
    ax.spines["polar"].set_color(GRID_COLOR)
    ax.set_title("Metric Radar (normalized)", color=TEXT_COLOR, fontsize=9, pad=15)
    ax.legend(loc="upper right", bbox_to_anchor=(1.35, 1.15),
              fontsize=7.5, labelcolor=TEXT_COLOR, facecolor=PANEL_COLOR, edgecolor=GRID_COLOR)
    fig.tight_layout()
    return fig_to_pil(fig)
def plot_perplexity(voynich_tokens, lang_stats_list, selected_langs):
    """Bar chart of the Voynich perplexity under each language's bigram model.

    Values are capped at 9999 so that an infinite perplexity still plots.
    Returns a PIL image.
    """
    capped = [
        min(compute_perplexity(voynich_tokens, entry["tokens"]), 9999)
        for entry in lang_stats_list
    ]

    fig, ax = plt.subplots(figsize=(8, 4), facecolor=BG_COLOR)
    bar_colors = LANG_COLORS[:len(selected_langs)]
    bars = ax.bar(selected_langs, capped, color=bar_colors, edgecolor="none", width=0.55)
    ax.set_ylabel("Perplexity (lower = more similar)", color=TEXT_COLOR)
    style_ax(ax, "Cross-Entropy Perplexity of Voynich under Each Language Model")

    # Annotate each bar with its value, centred above the bar top.
    for bar, value in zip(bars, capped):
        x_mid = bar.get_x() + bar.get_width() / 2
        ax.text(x_mid, bar.get_height() + 5, f"{value:.1f}",
                ha="center", color=TEXT_COLOR, fontsize=8)

    fig.tight_layout(pad=1.2)
    return fig_to_pil(fig)
def plot_char_freq(voynich_stats, lang_stats_list, selected_langs):
    """Bar charts of the 20 most frequent characters of each corpus.

    Panels sit on a fixed 2x3 grid (Voynich first, then the selected
    languages); grid cells beyond the number of corpora are hidden.
    Returns a PIL image.
    """
    fig, grid = plt.subplots(2, 3, figsize=(14, 7), facecolor=BG_COLOR)
    panels = grid.flatten()

    entries = [("Voynich", voynich_stats, VOYNICH_COLOR)]
    for index, (lang, lang_stats) in enumerate(zip(selected_langs, lang_stats_list)):
        entries.append((lang, lang_stats, LANG_COLORS[index]))

    for ax, (label, stats, color) in zip(panels, entries):
        most_common = collections.Counter(stats["tokens"]).most_common(20)
        if most_common:
            chars, counts = zip(*most_common)
        else:
            chars, counts = [], []
        ax.bar(chars, counts, color=color, alpha=0.85, edgecolor="none")
        style_ax(ax, f"{label} โ€” top 20 chars")
        ax.set_xlabel("Character", color=TEXT_COLOR)
        ax.set_ylabel("Count", color=TEXT_COLOR)

    for unused in panels[len(entries):]:
        unused.set_visible(False)

    fig.suptitle("Character Frequency Distributions", color=TEXT_COLOR, fontsize=10, y=1.01)
    fig.tight_layout(pad=1.5)
    return fig_to_pil(fig)
# ─────────────────────────────────────────────
# SUMMARY TABLE
# ─────────────────────────────────────────────
def build_summary_html(voynich_stats, lang_stats_list, selected_langs):
    """Render the metrics table for Voynich and every selected language as HTML.

    The Voynich row comes first and is highlighted; its Distance and
    Perplexity cells hold a placeholder because both are measured relative
    to Voynich. Perplexity is capped at 9999 for display.
    """
    entries = [("Voynich", voynich_stats), *zip(selected_langs, lang_stats_list)]

    rows = []
    for label, stats in entries:
        if label == "Voynich":
            dist = perp = "โ€”"
        else:
            dist = distance_vector(stats, voynich_stats)
            perp = compute_perplexity(voynich_stats["tokens"], stats["tokens"])
        # Only numeric values are formatted; the placeholder string passes through.
        dist_str = f"{dist:.4f}" if isinstance(dist, float) else dist
        perp_str = f"{min(perp, 9999):.1f}" if isinstance(perp, float) else perp
        rows.append({
            "Corpus": label,
            "Chars": stats["n_chars"],
            "Words": stats["n_words"],
            "IoC": f"{stats['ioc']:.4f}",
            "Hโ‚€": f"{stats['h0']:.3f}",
            "Hโ‚": f"{stats['h1']:.3f}",
            "Hโ‚‚": f"{stats['h2']:.3f}",
            "TTR": f"{stats['ttr']:.3f}",
            "Hapax": f"{stats['hapax']:.3f}",
            "Zipf slope": f"{stats['zipf_slope']:.3f}",
            "Distance": dist_str,
            "Perplexity": perp_str,
        })

    columns = list(rows[0].keys())
    th_style = "background:#1e1e2e;color:#c9a84c;padding:7px 12px;border:1px solid #2a2a38;font-size:12px;"
    td_style = "padding:6px 12px;border:1px solid #2a2a38;color:#d4cfc8;font-size:11px;text-align:center;"
    td_voynich = "padding:6px 12px;border:1px solid #2a2a38;color:#e8c97a;font-size:11px;text-align:center;font-weight:bold;background:#1a1a22;"

    header_cells = "".join(f"<th style='{th_style}'>{name}</th>" for name in columns)
    parts = [
        '<table style="border-collapse:collapse;width:100%;background:#0f0f14;">',
        "<thead><tr>" + header_cells + "</tr></thead>",
        "<tbody>",
    ]
    for row in rows:
        cell_style = td_voynich if row["Corpus"] == "Voynich" else td_style
        cells = "".join(f"<td style='{cell_style}'>{row[name]}</td>" for name in columns)
        parts.append("<tr>" + cells + "</tr>")
    parts.append("</tbody></table>")
    return "".join(parts)
def build_ranking_html(voynich_stats, lang_stats_list, selected_langs):
    """Render two side-by-side rankings of the selected languages as HTML.

    One list orders languages by metric distance to Voynich, the other by
    perplexity (capped at 9999); smaller values rank first in both.
    """
    scores = [
        (
            lang,
            distance_vector(entry, voynich_stats),
            min(compute_perplexity(voynich_stats["tokens"], entry["tokens"]), 9999),
        )
        for lang, entry in zip(selected_langs, lang_stats_list)
    ]
    by_distance = sorted(scores, key=lambda item: item[1])
    by_perplexity = sorted(scores, key=lambda item: item[2])

    # NOTE(review): the first three badge strings look mis-encoded in this
    # file; they are kept as found. Confirm against the original source.
    badges = ["๐Ÿฅ‡", "๐Ÿฅˆ", "๐Ÿฅ‰", "4th", "5th"]

    def medal(position):
        """Badge for a zero-based rank; plain 1-based number past the fifth."""
        if position < 5:
            return badges[position]
        return str(position + 1)

    heading_style = "color:#c9a84c;font-size:13px;margin-bottom:8px;"
    parts = ['<div style="display:flex;gap:24px;flex-wrap:wrap;">']

    parts.append('<div style="flex:1;min-width:240px;">')
    parts.append(f'<h3 style="{heading_style}">Closest by Metric Distance</h3>')
    for position, (lang, dist, _) in enumerate(by_distance):
        parts.append(
            f'<div style="margin:4px 0;color:#d4cfc8;font-size:12px;">{medal(position)} <b>{lang}</b> โ€” dist={dist:.4f}</div>'
        )
    parts.append("</div>")

    parts.append('<div style="flex:1;min-width:240px;">')
    parts.append(f'<h3 style="{heading_style}">Closest by Perplexity</h3>')
    for position, (lang, _, perp) in enumerate(by_perplexity):
        parts.append(
            f'<div style="margin:4px 0;color:#d4cfc8;font-size:12px;">{medal(position)} <b>{lang}</b> โ€” perp={perp:.1f}</div>'
        )
    parts.append("</div>")

    parts.append("</div>")
    return "".join(parts)
# ─────────────────────────────────────────────
# MAIN ANALYSIS PIPELINE
# ─────────────────────────────────────────────
def run_analysis(
    uploaded_file,
    use_sample: bool,
    bigraph_rules_text: str,
    selected_langs: list[str],
    apply_splits: bool,
):
    """Run the full pipeline and return the eight values bound to the UI outputs.

    The result is always a list of eight items in this order: IoC plot,
    entropy plot, Zipf plot, radar plot, perplexity plot, character-frequency
    plot, summary-table HTML, ranking-plus-PMI HTML. On a validation failure
    the six plots are ``None``, the seventh item is an error paragraph and
    the eighth is an empty string.

    An uploaded file takes precedence over the built-in sample; the sample is
    used only when nothing is uploaded and *use_sample* is set.
    """
    def failure(message):
        """Eight-item result carrying only an error message."""
        return [None, None, None, None, None, None,
                f"<p style='color:#e87a7a'>{message}</p>", ""]

    if uploaded_file is None and not use_sample:
        return failure("Please upload a file or enable the sample corpus.")
    if not selected_langs:
        return failure("Please select at least one comparison language.")

    # Load the Voynich text: the upload wins, otherwise fall back to the sample.
    if uploaded_file is None:
        raw_voynich = SAMPLE_EVA
    else:
        with open(uploaded_file, "r", encoding="utf-8", errors="replace") as handle:
            raw_voynich = handle.read()

    # Optional bigraph splitting.
    # NOTE(review): the default rules only insert spaces. tokenize() discards
    # spaces and tokenize_words() treats them as word breaks, so splitting
    # leaves the character statistics unchanged while fragmenting words.
    # Confirm this is the intended effect.
    rules = parse_bigraph_rules(bigraph_rules_text) if apply_splits else []
    processed_voynich = apply_bigraph_splits(raw_voynich, rules) if rules else raw_voynich

    vy_tokens = tokenize(processed_voynich)
    vy_words = tokenize_words(processed_voynich)
    if not vy_tokens:
        return failure("Could not extract tokens from the text. Check input format.")

    voynich_stats = build_stats(vy_tokens, vy_words, "Voynich")

    # Statistics for each comparison language; an unknown name yields an empty corpus.
    lang_stats_list = []
    for lang in selected_langs:
        corpus = LANGUAGE_CORPORA.get(lang, "")
        lang_stats_list.append(build_stats(tokenize(corpus), tokenize_words(corpus), lang))

    # Plots, each rendered to a PIL image.
    plot_args = (voynich_stats, lang_stats_list, selected_langs)
    ioc_img = plot_ioc_comparison(*plot_args)
    entropy_img = plot_entropy_curves(*plot_args)
    zipf_img = plot_zipf(*plot_args)
    radar_img = plot_distance_radar(*plot_args)
    perp_img = plot_perplexity(vy_tokens, lang_stats_list, selected_langs)
    freq_img = plot_char_freq(*plot_args)

    summary_html = build_summary_html(*plot_args)
    ranking_html = build_ranking_html(*plot_args)

    # PMI block: one badge per top-scoring adjacent character pair.
    pmi_parts = [
        '<h3 style="color:#c9a84c;font-size:13px;">Top PMI Bigram Pairs (Voynich)</h3>',
        '<div style="display:flex;flex-wrap:wrap;gap:8px;">',
    ]
    for (a, b), score in pmi_top_pairs(vy_tokens, top_n=15):
        pmi_parts.append(
            f'<span style="background:#1e1e2e;border:1px solid #2a2a38;padding:3px 8px;'
            f'border-radius:4px;color:#d4cfc8;font-size:11px;">'
            f'{a}+{b} <b style="color:#e8c97a">{score:.2f}</b></span>'
        )
    pmi_parts.append("</div>")
    pmi_html = "".join(pmi_parts)

    return [ioc_img, entropy_img, zipf_img, radar_img, perp_img, freq_img,
            summary_html, ranking_html + "<br>" + pmi_html]
# ─────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────
# Stylesheet passed to gr.Blocks(css=...) in create_app(): dark background, gold
# accents, and two web fonts loaded from Google Fonts via @import.
# NOTE(review): the .gr-* selectors depend on the class names Gradio generates —
# confirm they match the installed Gradio version, otherwise these rules are inert.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=IM+Fell+English:ital@0;1&family=JetBrains+Mono:wght@400;600&display=swap');
body, .gradio-container {
background: #0f0f14 !important;
color: #d4cfc8 !important;
font-family: 'IM Fell English', serif !important;
}
h1, h2, h3 { color: #e8c97a !important; letter-spacing: 0.04em; }
.gr-panel, .gr-box, .gr-form { background: #13131b !important; border-color: #2a2a38 !important; }
.gr-button {
background: #c9a84c !important;
color: #0f0f14 !important;
border: none !important;
font-family: 'JetBrains Mono', monospace !important;
font-weight: 600 !important;
letter-spacing: 0.05em;
border-radius: 3px !important;
}
.gr-button:hover { background: #e8c97a !important; }
.gr-check-radio { accent-color: #c9a84c !important; }
textarea, input[type=text] {
background: #16161f !important;
color: #d4cfc8 !important;
border: 1px solid #2a2a38 !important;
font-family: 'JetBrains Mono', monospace !important;
font-size: 12px !important;
}
label { color: #b8b0a8 !important; font-size: 13px !important; }
.gr-tab-item { color: #c9a84c !important; border-color: #2a2a38 !important; }
.gr-tab-item.selected { background: #1e1e2e !important; }
.gr-image img { border-radius: 4px; border: 1px solid #2a2a38; }
#header-block {
text-align: center;
padding: 24px 0 12px;
border-bottom: 1px solid #2a2a38;
margin-bottom: 16px;
}
#header-block h1 { font-size: 2rem; margin-bottom: 4px; }
#header-block p { color: #888; font-size: 0.9rem; font-style: italic; }
"""
# Page banner rendered at the top of the app; styled by the #header-block rules above.
# NOTE(review): the symbol before the title and the separators in the subtitle look
# mis-encoded in this file — confirm against the original source.
HEADER_HTML = """
<div id="header-block">
<h1>โš— Voynich Linguistic Analyzer</h1>
<p>EVA transliteration ยท statistical cryptolinguistics ยท language comparison</p>
</div>
"""
def create_app() -> gr.Blocks:
    """Build the Gradio Blocks UI and wire the Run button to run_analysis().

    Layout: a header, then one row with a narrow controls column (file upload,
    sample toggle, bigraph rules, language selection, run button) and a wide
    results column holding one tab per output.

    NOTE(review): several labels below contain characters that look
    mis-encoded in this file (probably emoji/symbols); they are left exactly
    as found. Confirm against the original source.

    Returns:
        The constructed, not yet launched, ``gr.Blocks`` app.
    """
    with gr.Blocks(css=CUSTOM_CSS, title="Voynich Analyzer") as demo:
        gr.HTML(HEADER_HTML)
        with gr.Row():
            # ── LEFT PANEL: Controls ──
            with gr.Column(scale=1, min_width=280):
                gr.Markdown("### ๐Ÿ“‚ Input")
                # type="filepath": run_analysis() receives a path and opens the file itself.
                uploaded_file = gr.File(
                    label="Upload EVA transliteration (.txt)",
                    file_types=[".txt"],
                    type="filepath",
                )
                use_sample = gr.Checkbox(
                    label="Use built-in sample corpus (fallback if no upload)",
                    value=True,
                )
                gr.Markdown("### โœ‚๏ธ Bigraph Splitting")
                apply_splits = gr.Checkbox(label="Apply bigraph splitting", value=True)
                bigraph_rules = gr.Textbox(
                    label="Rules (format: xy -> x y, one per line)",
                    value=DEFAULT_BIGRAPH_RULES,
                    lines=10,
                )
                gr.Markdown("### ๐ŸŒ Comparison Languages")
                # All languages are offered and pre-selected.
                selected_langs = gr.CheckboxGroup(
                    choices=list(LANGUAGE_CORPORA.keys()),
                    value=list(LANGUAGE_CORPORA.keys()),
                    label="Languages to compare",
                )
                run_btn = gr.Button("โ–ถ Run Analysis", variant="primary")
            # ── RIGHT PANEL: Results ──
            with gr.Column(scale=3):
                with gr.Tabs():
                    with gr.Tab("๐Ÿ“Š Summary"):
                        ranking_out = gr.HTML(label="Ranking")
                        summary_out = gr.HTML(label="Metrics Table")
                    with gr.Tab("๐Ÿ“ˆ Index of Coincidence"):
                        # Image outputs use type="pil": the plot functions return PIL
                        # images directly, so no temporary files are written.
                        ioc_out = gr.Image(label="IoC Comparison", type="pil")
                    with gr.Tab("๐ŸŒ€ Entropy Curves"):
                        entropy_out = gr.Image(label="Entropy Curves", type="pil")
                    with gr.Tab("๐Ÿ“‰ Zipf Plots"):
                        zipf_out = gr.Image(label="Zipf Word-Frequency", type="pil")
                    with gr.Tab("๐ŸŽฏ Radar Chart"):
                        radar_out = gr.Image(label="Metric Radar", type="pil")
                    with gr.Tab("๐Ÿ”ฎ Perplexity"):
                        perp_out = gr.Image(label="Cross-Entropy Perplexity", type="pil")
                    with gr.Tab("๐Ÿ”ค Char Frequencies"):
                        freq_out = gr.Image(label="Character Frequency Distributions", type="pil")
        # The order of `outputs` must match the eight-item list returned by
        # run_analysis(): six images, then the summary HTML, then the ranking HTML.
        run_btn.click(
            fn=run_analysis,
            inputs=[uploaded_file, use_sample, bigraph_rules, selected_langs, apply_splits],
            outputs=[ioc_out, entropy_out, zipf_out, radar_out, perp_out, freq_out,
                     summary_out, ranking_out],
        )
    return demo
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    app = create_app()
    # launch() is called without host/port arguments; the original author's note
    # says the hosting environment (HF Spaces) supplies its own server config.
    app.launch()