|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
import numpy as np |
|
|
from collections import defaultdict, Counter |
|
|
import base64 |
|
|
from sklearn.manifold import MDS |
|
|
import networkx as nx |
|
|
from streamlit_float import * |
|
|
|
|
|
# --- Page setup -----------------------------------------------------------
# Wide layout: the app renders many tables and figures.
st.set_page_config(layout="wide")

# NOTE(review): float_init is presumably provided by the star import from
# streamlit_float -- confirm.
float_init()

# CSS override applied to the Streamlit sidebar container.
_SIDEBAR_CSS = """
<style>
[data-testid="stSidebar"] {
    position: fixed;
}
</style>
"""
st.markdown(_SIDEBAR_CSS, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
# Characters kept by parse_voynich_word: every ASCII digit, every uppercase
# ASCII letter, and the four characters "(", ")", "n" and "v".
# NOTE(review): membership is tested one character at a time, so sequences
# such as "(n)" or "(v)" are accepted only as three separate characters --
# confirm that this is the intended tokenisation.
ALLOWED_CHARS = set(
    "0123456789"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "()nv"
)
|
|
|
|
|
def parse_voynich_word(word):
    """Reduce a raw cell value to the characters listed in ALLOWED_CHARS.

    Args:
        word: Raw text of one cell; may be empty or None.

    Returns:
        A ``(filtered_word, chars)`` pair where ``chars`` is the list of kept
        characters in their original order and ``filtered_word`` is their
        concatenation. ``(None, None)`` when the input is empty, only
        whitespace, or contains no allowed character.
    """
    text = word.strip() if word else ''
    if text == '':
        return None, None

    kept = [glyph for glyph in text if glyph in ALLOWED_CHARS]
    if len(kept) == 0:
        return None, None

    return ''.join(kept), kept
|
|
|
|
|
def analyze_csv(df):
    """Scan every cell of *df* and collect word- and character-level data.

    Each non-empty cell is passed through parse_voynich_word; cells that
    yield no allowed characters are ignored.

    Args:
        df: DataFrame whose rows are lines and whose cells are words.

    Returns:
        A 6-tuple ``(words, chars_list, char_positions, char_connections,
        word_positions, line_word_map)``:

        * words -- filtered words in reading order.
        * chars_list -- the character list of each word, parallel to words.
        * char_positions -- character -> list of 0-based indices at which it
          occurred inside words.
        * char_connections -- character -> Counter of the character that
          immediately follows it within a word.
        * word_positions -- one ``{'line': index, 'words': [(word, column,
          chars), ...]}`` dict per row that produced at least one word.
        * line_word_map -- row index -> Counter of the words on that row.
    """
    words, chars_list, word_positions = [], [], []
    char_positions = defaultdict(list)
    char_connections = defaultdict(Counter)
    line_word_map = defaultdict(Counter)

    for line_idx, row in df.iterrows():
        parsed_cells = []

        for col_idx, cell_value in enumerate(row):
            if pd.isna(cell_value):
                continue
            raw_text = str(cell_value)
            if not raw_text.strip():
                continue

            word, chars = parse_voynich_word(raw_text)
            if not (word and chars):
                continue

            words.append(word)
            chars_list.append(chars)
            parsed_cells.append((word, col_idx, chars))
            line_word_map[line_idx][word] += 1

            for offset, glyph in enumerate(chars):
                char_positions[glyph].append(offset)

            # Adjacent pairs inside the word: (c0, c1), (c1, c2), ...
            for current, following in zip(chars, chars[1:]):
                char_connections[current][following] += 1

        if parsed_cells:
            word_positions.append({'line': line_idx, 'words': parsed_cells})

    return (
        words,
        chars_list,
        char_positions,
        char_connections,
        word_positions,
        line_word_map,
    )
|
|
|
|
|
def analyze_trigrams(words, chars_list):
    """Count three-item windows over characters and over words.

    Args:
        words: Sequence of words in reading order.
        chars_list: One character list per word.

    Returns:
        ``(char_trigrams, word_trigrams)``: two Counters keyed by 3-tuples.
        Character trigrams never span two words; word trigrams slide over
        the whole ``words`` sequence.
    """
    char_trigrams = Counter(
        window
        for chars in chars_list
        for window in zip(chars, chars[1:], chars[2:])
    )
    word_trigrams = Counter(zip(words, words[1:], words[2:]))
    return char_trigrams, word_trigrams
|
|
|
|
|
def create_12_slot_table(chars_list, num_slots=12):
    """Tabulate how often each character occurs at each leading word position.

    Args:
        chars_list: Iterable of character lists, one per word.
        num_slots: Number of leading positions to tabulate (default 12).
            Characters beyond this position are ignored.

    Returns:
        A DataFrame with a ``Character`` column followed by ``Slot_1`` ..
        ``Slot_<num_slots>`` count columns, one row per character seen in
        the tabulated positions, sorted by character. When no characters are
        seen the frame is empty but still carries these columns.

    Raises:
        ValueError: If ``num_slots`` is negative.
    """
    if num_slots < 0:
        raise ValueError("num_slots must be non-negative")

    slot_frequencies = [Counter() for _ in range(num_slots)]
    for chars in chars_list:
        for slot, char in enumerate(chars[:num_slots]):
            slot_frequencies[slot][char] += 1

    slot_columns = [f'Slot_{slot + 1}' for slot in range(num_slots)]
    seen_chars = sorted(set().union(*slot_frequencies))

    rows = []
    for char in seen_chars:
        row = {'Character': char}
        # Counter returns 0 for characters never seen in a slot.
        for column, counter in zip(slot_columns, slot_frequencies):
            row[column] = counter[char]
        rows.append(row)

    # Explicit columns keep the header stable even when there are no rows.
    return pd.DataFrame(rows, columns=['Character'] + slot_columns)
|
|
|
|
|
def analyze_slot_structure(chars_list):
    """Summarise the most frequent characters at every word position.

    Args:
        chars_list: Iterable of character lists, one per word.

    Returns:
        ``(slot_summary, max_slots)`` where ``max_slots`` is the length of
        the longest word (0 for empty input) and ``slot_summary`` maps each
        0-based position to its ten most common ``(character, count)``
        pairs.
    """
    per_slot = defaultdict(Counter)
    max_slots = 0

    for chars in chars_list:
        max_slots = max(max_slots, len(chars))
        for position, glyph in enumerate(chars):
            per_slot[position][glyph] += 1

    slot_summary = {
        position: per_slot[position].most_common(10)
        for position in range(max_slots)
        if position in per_slot
    }
    return slot_summary, max_slots
|
|
|
|
|
def create_line_word_scatter(line_word_map):
    """Plot lines in 2-D so that lines using similar words sit close together.

    A line-by-word count matrix is built from *line_word_map* and reduced to
    two dimensions with MDS (fixed ``random_state`` of 42). Each point is
    annotated with ``L<line key>``.

    Args:
        line_word_map: Mapping of line key -> Counter of the words on that
            line.

    Returns:
        The Matplotlib figure. When *line_word_map* is empty the figure
        contains only a placeholder message, because there is nothing to
        embed.
    """
    lines = sorted(line_word_map.keys())
    # Sorted vocabulary gives a reproducible column order (set order is not).
    vocabulary = sorted(set().union(*line_word_map.values()))

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title('Line Similarity based on Word Usage')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')

    if not lines:
        # MDS cannot be fitted on a matrix with zero rows.
        ax.text(0.5, 0.5, 'No lines to plot',
                ha='center', va='center', transform=ax.transAxes)
        return fig

    word_freq_matrix = np.zeros((len(lines), len(vocabulary)))
    for row_idx, line in enumerate(lines):
        counts = line_word_map[line]
        for col_idx, word in enumerate(vocabulary):
            word_freq_matrix[row_idx, col_idx] = counts[word]

    mds = MDS(n_components=2, random_state=42)
    line_coords = mds.fit_transform(word_freq_matrix)

    ax.scatter(line_coords[:, 0], line_coords[:, 1])
    for row_idx, line in enumerate(lines):
        ax.annotate(f"L{line}", (line_coords[row_idx, 0], line_coords[row_idx, 1]))

    return fig
|
|
|
|
|
def get_download_link_csv(df, filename):
    """Return an HTML anchor that downloads *df* as a CSV file.

    The CSV (written without the index) is base64-encoded and embedded in
    the link's ``data:`` URL.

    Args:
        df: Object exposing ``to_csv(index=False)``, e.g. a DataFrame.
        filename: Name placed in the anchor's ``download`` attribute.

    Returns:
        The ``<a>`` element as a string.
    """
    encoded = base64.b64encode(df.to_csv(index=False).encode())
    return '<a href="data:file/csv;base64,{}" download="{}">Download CSV</a>'.format(
        encoded.decode(), filename
    )
|
|
|
|
|
# --- Page header and upload widgets ---------------------------------------
st.title("Voynich Manuscript Analyzer")
st.write("Upload your CSV file.")

# Optional image upload in the sidebar; when present it is displayed there
# with the caption "Legend".
floating_image_file = st.sidebar.file_uploader(
    label="Upload an image",
    type=['png', 'jpg', 'jpeg', 'gif'],
    key="floating_image",
)
if floating_image_file is not None:
    st.sidebar.image(floating_image_file, caption="Legend", width=150)

# CSV upload that drives the analysis below.
uploaded_file = st.file_uploader(label="Choose a CSV file", type="csv")
|
|
|
|
|
# --- Main analysis: runs only once a CSV has been uploaded -----------------
if uploaded_file is not None:
    # ---- Load ------------------------------------------------------------
    # Rewind before reading so the whole upload is always read from the start.
    uploaded_file.seek(0)
    try:
        content = uploaded_file.read().decode('utf-8')
    except UnicodeDecodeError:
        st.error("The uploaded file could not be decoded as UTF-8.")
        st.stop()

    # Normalise all newline styles, drop blank lines, split cells on commas.
    lines = content.replace('\r\n', '\n').replace('\r', '\n').strip().split('\n')
    lines = [line for line in lines if line.strip()]
    data = [line.split(',') for line in lines]
    df = pd.DataFrame(data)

    (words, chars_list, char_positions, char_connections,
     word_positions, line_word_map) = analyze_csv(df)

    # Everything below takes max() over words/lines, so bail out early when
    # the file contained nothing usable.
    if not words:
        st.warning("No words made of allowed characters were found in the uploaded file.")
        st.stop()

    # ---- Basic statistics --------------------------------------------------
    st.subheader("Basic Statistics")
    st.write(f"Total words: {len(words)}")
    st.write(f"Total unique words: {len(set(words))}")
    unique_chars = sorted({char for chars in chars_list for char in chars})
    st.write(f"Total unique characters: {len(unique_chars)}")
    st.write("Unique characters:", ", ".join(unique_chars))

    st.subheader("Sample Words (Character-by-Character)")
    sample_df = pd.DataFrame([
        {'Word': word, 'Characters': ' | '.join(chars), 'Length': len(chars)}
        for word, chars in zip(words[:20], chars_list[:20])
    ])
    st.dataframe(sample_df)

    # ---- Character bigrams -------------------------------------------------
    st.subheader("Character Bigram Analysis")
    st.write("This reveals which character pairs occur most frequently - potential digraphs emerge from the data")

    # Pairs are taken within each word only; they never span two words.
    char_bigrams = Counter(
        pair for chars in chars_list for pair in zip(chars, chars[1:])
    )
    char_bigram_df = pd.DataFrame([
        {'Bigram': ''.join(str(c) for c in bigram),
         'Char1': str(bigram[0]),
         'Char2': str(bigram[1]),
         'Count': int(count)}
        for bigram, count in char_bigrams.most_common(30)
    ])
    st.dataframe(char_bigram_df)
    st.markdown(get_download_link_csv(char_bigram_df, "char_bigrams.csv"), unsafe_allow_html=True)

    # ---- Character trigrams ------------------------------------------------
    st.subheader("Character Trigram Analysis")
    st.write("Three-character sequences - looking for common patterns")

    char_trigrams, word_trigrams = analyze_trigrams(words, chars_list)
    char_trigram_df = pd.DataFrame([
        {'Trigram': ''.join(str(c) for c in trigram), 'Count': int(count)}
        for trigram, count in char_trigrams.most_common(30)
    ])
    st.dataframe(char_trigram_df)
    st.markdown(get_download_link_csv(char_trigram_df, "char_trigrams.csv"), unsafe_allow_html=True)

    # ---- Word bigrams ------------------------------------------------------
    st.subheader("Word Bigram Analysis")
    # Words are one flat sequence here, so pairs may span a line break.
    word_bigrams = Counter(zip(words, words[1:]))
    word_bigram_df = pd.DataFrame([
        {'Word1': str(bigram[0]), 'Word2': str(bigram[1]), 'Count': int(count)}
        for bigram, count in word_bigrams.most_common(20)
    ])
    st.dataframe(word_bigram_df)
    st.markdown(get_download_link_csv(word_bigram_df, "word_bigrams.csv"), unsafe_allow_html=True)

    # ---- Word trigrams -----------------------------------------------------
    st.subheader("Word Trigram Analysis")
    word_trigram_df = pd.DataFrame([
        {'Word1': str(trigram[0]),
         'Word2': str(trigram[1]),
         'Word3': str(trigram[2]),
         'Count': int(count)}
        for trigram, count in word_trigrams.most_common(20)
    ])
    st.dataframe(word_trigram_df)
    st.markdown(get_download_link_csv(word_trigram_df, "word_trigrams.csv"), unsafe_allow_html=True)

    # ---- Character frequency by slot --------------------------------------
    st.subheader("Character Frequency by Position")
    slot_freq_df = create_12_slot_table(chars_list)
    st.dataframe(slot_freq_df)
    st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)

    # ---- Words grouped by length ------------------------------------------
    st.subheader("Words by Length Analysis")

    # Words longer than 20 characters are left out of this view.
    length_groups = defaultdict(list)
    for word, chars in zip(words, chars_list):
        if len(chars) <= 20:
            length_groups[len(chars)].append((word, chars))

    selected_length = st.selectbox("Select word length to analyze:",
                                   sorted(length_groups.keys()))

    # selectbox yields None when there are no options to choose from.
    if selected_length is not None:
        words_of_length = length_groups[selected_length]

        position_chars = [Counter() for _ in range(selected_length)]
        for _, chars in words_of_length:
            for i, char in enumerate(chars):
                position_chars[i][char] += 1

        st.write(f"Found {len(words_of_length)} words of length {selected_length}")

        freq_data = []
        for char in unique_chars:
            row = {'Character': char}
            for pos in range(selected_length):
                row[f'Pos_{pos+1}'] = position_chars[pos][char]
            freq_data.append(row)

        freq_df = pd.DataFrame(freq_data)
        st.dataframe(freq_df)
        st.markdown(get_download_link_csv(freq_df, f"length_{selected_length}_analysis.csv"),
                    unsafe_allow_html=True)

        st.write("Sample words of this length:")
        sample_df = pd.DataFrame([
            {'Word': word, 'Characters': ' | '.join(chars)}
            for word, chars in words_of_length[:30]
        ])
        st.dataframe(sample_df)

    # ---- Line similarity scatter ------------------------------------------
    st.subheader("Word Distribution Across Lines")
    line_scatter = create_line_word_scatter(line_word_map)
    st.pyplot(line_scatter)
    # Close every figure after rendering so figures do not pile up in memory.
    plt.close(line_scatter)

    # ---- Character context -------------------------------------------------
    st.subheader("Character Context Analysis")
    st.write("Select a character to see what comes before and after it")

    selected_char = st.selectbox("Select a character to analyze:", unique_chars)

    if selected_char:
        before_counter = Counter()
        after_counter = Counter()

        for chars in chars_list:
            for i, char in enumerate(chars):
                if char != selected_char:
                    continue
                if i > 0:
                    before_counter[chars[i-1]] += 1
                if i < len(chars) - 1:
                    after_counter[chars[i+1]] += 1

        col1, col2 = st.columns(2)
        panels = (
            (col1, before_counter,
             f"Characters that commonly PRECEDE '{selected_char}':",
             f"Characters before '{selected_char}'"),
            (col2, after_counter,
             f"Characters that commonly FOLLOW '{selected_char}':",
             f"Characters after '{selected_char}'"),
        )
        for column, counter, heading, chart_title in panels:
            with column:
                st.write(heading)
                neighbour_df = pd.DataFrame(counter.most_common(15),
                                            columns=['Character', 'Count'])
                st.dataframe(neighbour_df)

                fig_ctx, ax_ctx = plt.subplots(figsize=(8, 6))
                ax_ctx.bar(neighbour_df['Character'], neighbour_df['Count'])
                ax_ctx.set_title(chart_title)
                ax_ctx.tick_params(axis='x', rotation=45)
                st.pyplot(fig_ctx)
                plt.close(fig_ctx)

    # ---- Line viewer -------------------------------------------------------
    st.subheader("Line Viewer")

    available_lines = sorted({line_data['line'] for line_data in word_positions})
    selected_line = st.selectbox("Select Line:", [''] + [f"Line {line}" for line in available_lines])

    if selected_line:
        line_num = int(selected_line.replace('Line ', ''))

        line_words = next((line_data['words']
                           for line_data in word_positions
                           if line_data['line'] == line_num), [])

        # Kept on one line with no leading indentation: Markdown treats lines
        # indented by four or more spaces as a code block.
        cell_style = (
            "width: 40px; height: 40px; border: 2px solid #ccc; "
            "display: flex; align-items: center; justify-content: center; "
            "font-size: 16px; font-weight: bold; "
            "background-color: #e6f3ff; margin: 2px;"
        )

        for word, _, chars in line_words:
            st.write(f"**Word: {word}** ({len(chars)} characters)")
            # At least 12 and at most 20 columns per row; longer words wrap
            # onto further rows instead of indexing past the last column.
            cells_per_row = min(20, max(12, len(chars)))
            for row_start in range(0, len(chars), cells_per_row):
                cols = st.columns(cells_per_row)
                row_chars = chars[row_start:row_start + cells_per_row]
                for col, char in zip(cols, row_chars):
                    with col:
                        st.markdown(f"<div style='{cell_style}'>{char}</div>",
                                    unsafe_allow_html=True)

    # ---- Word length distribution -----------------------------------------
    st.subheader("Language Structure Analysis")

    word_lengths = [len(chars) for chars in chars_list]
    fig_len, ax_len = plt.subplots(figsize=(12, 6))
    sns.histplot(word_lengths, bins=range(1, max(word_lengths)+2), ax=ax_len)
    ax_len.set_title("Word Length Distribution")
    ax_len.set_xlabel("Word Length (number of characters)")
    ax_len.set_ylabel("Frequency")
    st.pyplot(fig_len)
    plt.close(fig_len)

    # ---- Overall character frequency --------------------------------------
    st.subheader("Overall Character Frequency")
    all_chars_flat = [char for chars in chars_list for char in chars]
    char_freq = Counter(all_chars_flat)
    total_chars = len(all_chars_flat)

    char_freq_df = pd.DataFrame(char_freq.most_common(), columns=['Character', 'Count'])
    char_freq_df['Percentage'] = (char_freq_df['Count'] / total_chars * 100).round(2)

    fig_freq, ax_freq = plt.subplots(figsize=(12, 6))
    ax_freq.bar(char_freq_df['Character'], char_freq_df['Count'])
    ax_freq.set_title("Character Frequency Distribution")
    ax_freq.set_xlabel("Character")
    ax_freq.set_ylabel("Frequency")
    ax_freq.tick_params(axis='x', rotation=45)
    st.pyplot(fig_freq)
    plt.close(fig_freq)
    st.dataframe(char_freq_df)
    st.markdown(get_download_link_csv(char_freq_df, "character_frequency.csv"), unsafe_allow_html=True)

    # ---- Character position heatmap ---------------------------------------
    st.subheader("Character Position Heatmap")
    st.write("Shows which characters appear at which positions in words")

    # Only the first 20 positions are shown.
    shown_positions = min(max(word_lengths), 20)
    # Row lookup by dict instead of list.index for every character.
    row_of_char = {char: row for row, char in enumerate(unique_chars)}
    char_pos_matrix = np.zeros((len(unique_chars), shown_positions))
    for chars in chars_list:
        for i, char in enumerate(chars[:20]):
            char_pos_matrix[row_of_char[char], i] += 1

    fig_heat, ax_heat = plt.subplots(figsize=(15, 10))
    sns.heatmap(char_pos_matrix,
                xticklabels=list(range(1, shown_positions + 1)),
                yticklabels=unique_chars,
                cmap='YlOrRd',
                cbar_kws={'label': 'Frequency'},
                ax=ax_heat)
    ax_heat.set_title("Character Position Preferences")
    ax_heat.set_xlabel("Position in Word")
    ax_heat.set_ylabel("Character")
    st.pyplot(fig_heat)
    plt.close(fig_heat)

    # ---- Character bigram network -----------------------------------------
    st.subheader("Character Bigram Network")
    st.write("Visual representation of which characters commonly follow each other")

    G = nx.DiGraph()
    for (char1, char2), count in char_bigrams.most_common(50):
        G.add_edge(char1, char2, weight=count)

    fig_net, ax_net = plt.subplots(figsize=(14, 14))
    pos = nx.spring_layout(G, k=2, iterations=50, seed=42)

    edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
    max_weight = max(edge_weights) if edge_weights else 1

    nx.draw(G, pos, ax=ax_net, with_labels=True,
            node_color='lightblue',
            node_size=2000,
            font_size=11,
            font_weight='bold',
            arrows=True,
            arrowsize=15,
            # Edge width scales with bigram count; the heaviest edge is 4.
            width=[weight / max_weight * 4 for weight in edge_weights],
            edge_color='gray',
            connectionstyle='arc3,rad=0.1')
    ax_net.set_title("Character Sequence Network (Directed)")
    st.pyplot(fig_net)
    plt.close(fig_net)

    # ---- Words per line ----------------------------------------------------
    st.subheader("Line Structure Analysis")
    line_lengths = [len(line_data['words']) for line_data in word_positions]

    fig_lines, ax_lines = plt.subplots(figsize=(10, 6))
    sns.histplot(line_lengths, bins=range(1, max(line_lengths)+2), ax=ax_lines)
    ax_lines.set_title("Words per Line Distribution")
    ax_lines.set_xlabel("Number of Words in Line")
    ax_lines.set_ylabel("Frequency")
    st.pyplot(fig_lines)
    plt.close(fig_lines)

    # ---- Word boundaries ---------------------------------------------------
    st.subheader("Word Boundary Analysis")
    first_chars = Counter(chars[0] for chars in chars_list)
    last_chars = Counter(chars[-1] for chars in chars_list)

    fig_bounds, (ax_first, ax_last) = plt.subplots(1, 2, figsize=(16, 6))

    first_df = pd.DataFrame(first_chars.most_common(15),
                            columns=['Character', 'Count'])
    sns.barplot(data=first_df, x='Character', y='Count', ax=ax_first)
    ax_first.set_title("Most Common Word-Initial Characters")
    ax_first.tick_params(axis='x', rotation=45)

    last_df = pd.DataFrame(last_chars.most_common(15),
                           columns=['Character', 'Count'])
    sns.barplot(data=last_df, x='Character', y='Count', ax=ax_last)
    ax_last.set_title("Most Common Word-Final Characters")
    ax_last.tick_params(axis='x', rotation=45)
    st.pyplot(fig_bounds)
    plt.close(fig_bounds)

    # ---- N-gram pattern discovery -----------------------------------------
    st.subheader("N-gram Pattern Discovery")
    st.write("Discover recurring character sequences of different lengths")

    ngram_length = st.slider("Select n-gram length:", 2, 6, 3)

    ngrams = Counter(
        tuple(chars[i:i+ngram_length])
        for chars in chars_list
        for i in range(len(chars) - ngram_length + 1)
    )

    # 'Percentage' is the n-gram count relative to the number of words.
    ngram_df = pd.DataFrame([
        {'Pattern': ''.join(str(c) for c in ngram),
         'Count': int(count),
         'Percentage': f"{count/len(chars_list)*100:.2f}%"}
        for ngram, count in ngrams.most_common(30)
    ])
    st.dataframe(ngram_df)
    st.markdown(get_download_link_csv(ngram_df, f"{ngram_length}gram_patterns.csv"), unsafe_allow_html=True)