Update app.py
Browse files
app.py
CHANGED
|
@@ -1,66 +1,240 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
line_words = []
|
| 32 |
-
for col in
|
| 33 |
-
|
| 34 |
-
if pd.notna(
|
| 35 |
-
|
|
|
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import plotly.express as px
|
| 5 |
+
import plotly.graph_objects as go
|
| 6 |
+
import seaborn as sns
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
from collections import defaultdict, Counter
|
| 9 |
+
import base64
|
| 10 |
+
from sklearn.manifold import MDS
|
| 11 |
+
import networkx as nx
|
| 12 |
+
|
| 13 |
+
# Set page configuration
|
| 14 |
+
st.set_page_config(layout="wide", page_title="Voynich Manuscript Analyzer", page_icon="📜")
|
| 15 |
+
|
| 16 |
+
# Caching expensive computations
|
| 17 |
+
@st.cache_data
def load_data(uploaded_file):
    """Parse the uploaded CSV into a DataFrame (cached per upload)."""
    frame = pd.read_csv(uploaded_file)
    return frame
|
| 20 |
+
|
| 21 |
+
@st.cache_data
def extract_word_and_chars(token):
    """Decode one comma-delimited token into (word, char_list).

    Sentinel tokens ('$', '<START>', '<END>') yield (None, None).
    A '?' inside a glyph marks an uncertain reading: the glyph (minus the
    '?') is kept, followed by a single standalone '?' marker.
    """
    if token == '$' or '<START>' in token or '<END>' in token:
        return None, None

    chars = []
    for piece in token.split(','):
        if '?' not in piece:
            chars.append(piece)
            continue
        stripped = piece.replace('?', '')
        if stripped:
            chars.append(stripped)
        chars.append('?')

    return ''.join(chars), chars
|
| 40 |
+
|
| 41 |
+
@st.cache_data
def analyze_csv(df):
    """Walk every token cell of *df* and accumulate corpus statistics.

    Returns a 6-tuple:
      words            -- every decoded word, in reading order
      chars_list       -- per-word character lists (parallel to *words*)
      char_positions   -- char -> list of word-internal indices where it occurs
      char_connections -- char -> Counter of the chars that follow it
      word_positions   -- per-line records (folio/par/line + (word, pos, chars))
      folio_word_map   -- folio -> Counter of word frequencies on that folio
    """
    words = []
    chars_list = []
    char_positions = defaultdict(list)
    char_connections = defaultdict(Counter)
    word_positions = []
    folio_word_map = defaultdict(Counter)
    # Token cells live in columns whose names start with 't' (t1, t2, ...).
    token_columns = [c for c in df.columns if c.startswith('t')]

    for _, row in df.iterrows():
        line_words = []
        for pos, column in enumerate(token_columns):
            token = row[column]
            if pd.isna(token) or token == '$':
                continue
            # Strip the surrounding double quotes some cells carry.
            if token.startswith('"') and token.endswith('"'):
                token = token[1:-1]

            word, chars = extract_word_and_chars(token)
            if not word:
                continue
            words.append(word)
            chars_list.append(chars)
            line_words.append((word, pos, chars))
            folio_word_map[row['folio']][word] += 1

            for idx, ch in enumerate(chars):
                char_positions[ch].append(idx)
            for left, right in zip(chars, chars[1:]):
                char_connections[left][right] += 1

        if line_words:
            word_positions.append({
                'folio': row['folio'],
                'par': row['par'],
                'line': row['line'],
                'words': line_words,
            })

    return words, chars_list, char_positions, char_connections, word_positions, folio_word_map
|
| 81 |
+
|
| 82 |
+
@st.cache_data
def analyze_trigrams(words, chars_list):
    """Count character trigrams (within each word) and word trigrams (corpus-wide)."""
    char_trigrams = Counter()
    word_trigrams = Counter()

    # zip over three staggered views yields every length-3 sliding window.
    for chars in chars_list:
        char_trigrams.update(zip(chars, chars[1:], chars[2:]))

    word_trigrams.update(zip(words, words[1:], words[2:]))

    return char_trigrams, word_trigrams
|
| 97 |
+
|
| 98 |
+
@st.cache_data
def create_12_slot_table(chars_list):
    """Tabulate character frequencies over the first 12 slots of every word.

    Returns a DataFrame with one row per character and columns Slot_1..Slot_12.
    """
    slot_frequencies = [Counter() for _ in range(12)]

    for chars in chars_list:
        # zip truncates to the shorter operand, so only slots 0..11 are counted.
        for counter, ch in zip(slot_frequencies, chars):
            counter[ch] += 1

    all_chars = sorted({ch for counter in slot_frequencies for ch in counter})
    data = [
        {'Character': ch, **{f'Slot_{i+1}': slot_frequencies[i][ch] for i in range(12)}}
        for ch in all_chars
    ]

    return pd.DataFrame(data)
|
| 116 |
+
|
| 117 |
+
@st.cache_data
def analyze_slot_structure(chars_list):
    """Summarise which characters occupy each word slot.

    Returns (slot_summary, max_slots): slot_summary maps a slot index to that
    slot's ten most common characters as (char, count) pairs; max_slots is the
    longest word length seen.
    """
    max_slots = max((len(chars) for chars in chars_list), default=0)

    slot_contents = defaultdict(Counter)
    for chars in chars_list:
        for slot, ch in enumerate(chars):
            slot_contents[slot][ch] += 1

    slot_summary = {
        slot: slot_contents[slot].most_common(10)
        for slot in range(max_slots)
        if slot in slot_contents
    }

    return slot_summary, max_slots
|
| 136 |
+
|
| 137 |
+
@st.cache_data
def create_folio_word_scatter(folio_word_map):
    """Embed folios in 2-D by similarity of their word-frequency profiles.

    Builds a (folio x word) count matrix, projects it with metric MDS
    (fixed random_state for reproducibility), and returns a labelled
    plotly scatter figure.
    """
    # Fix: collect and SORT the vocabulary. The previous version enumerated a
    # raw set, so the matrix column order depended on hash/set-iteration order
    # and was not reproducible across runs.
    all_words = sorted({word
                        for word_counter in folio_word_map.values()
                        for word in word_counter})

    folios = sorted(folio_word_map.keys())
    word_freq_matrix = np.zeros((len(folios), len(all_words)))

    for i, folio in enumerate(folios):
        counter = folio_word_map[folio]  # hoisted: one lookup per folio, not per cell
        for j, word in enumerate(all_words):
            word_freq_matrix[i, j] = counter[word]  # Counter returns 0 for absent words

    mds = MDS(n_components=2, random_state=42)
    folio_coords = mds.fit_transform(word_freq_matrix)

    fig = px.scatter(x=folio_coords[:, 0], y=folio_coords[:, 1], text=folios)
    fig.update_traces(textposition='top center')
    fig.update_layout(title='Folio Similarity based on Word Usage',
                      xaxis_title='Dimension 1',
                      yaxis_title='Dimension 2')

    return fig
|
| 160 |
+
|
| 161 |
+
@st.cache_data
def plot_char_positions(char_positions, max_slots):
    """Render a heatmap of how often each character occurs at each word position.

    Positions at or beyond *max_slots* are ignored.
    """
    records = []
    for ch, pos_list in char_positions.items():
        for pos, count in Counter(pos_list).items():
            if pos < max_slots:
                records.append({'Character': ch, 'Position': pos, 'Count': count})

    df = pd.DataFrame(records, columns=['Character', 'Position', 'Count'])

    # Wide form: one row per character, one column per position; absent
    # (character, position) pairs become zero counts.
    pivot_df = df.pivot(index='Character', columns='Position', values='Count').fillna(0)

    fig = px.imshow(
        pivot_df,
        color_continuous_scale='YlGnBu',
        labels=dict(x="Position in Word", y="Character", color="Frequency"),
    )
    fig.update_layout(title='Character Position Heatmap')
    return fig
|
| 186 |
+
|
| 187 |
+
@st.cache_data
def get_download_link_csv(df, filename):
    """Build an HTML anchor that downloads *df* as a CSV named *filename*.

    The frame is serialised without its index and embedded as a base64 data
    URI, so the link needs no server-side file.
    """
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    # Bug fix: the download attribute previously held a hard-coded "(unknown)"
    # placeholder and *filename* was never used, so every exported CSV was
    # saved under the same wrong name. Interpolate the caller's filename.
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
    return href
|
| 193 |
+
|
| 194 |
+
# Main App: page layout and wiring — each expander renders one analysis view.
st.title("Voynich Manuscript Analyzer")
st.write("Upload your CSV file to discover potential patterns and character distributions.")

uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

# Everything below runs only once a CSV has been uploaded.
if uploaded_file is not None:
    df = load_data(uploaded_file)
    # One pass over the CSV yields all the derived corpus statistics.
    words, chars_list, char_positions, char_connections, word_positions, folio_word_map = analyze_csv(df)

    with st.expander("Basic Statistics"):
        # Corpus-level counts: total/unique words and the character inventory.
        st.write(f"Total words: {len(words)}")
        st.write(f"Total unique words: {len(set(words))}")
        unique_chars = set(char for chars in chars_list for char in chars)
        st.write(f"Total unique characters: {len(unique_chars)}")
        st.write("Unique characters:", ", ".join(sorted(unique_chars)))

    with st.expander("Trigram Analysis"):
        char_trigrams, word_trigrams = analyze_trigrams(words, chars_list)

        # Character-level trigrams, most frequent first.
        st.write("Top 20 Character Trigrams")
        char_trigram_df = pd.DataFrame([
            {'Trigram': ' - '.join(trigram), 'Count': count}
            for trigram, count in char_trigrams.most_common(20)
        ])
        st.dataframe(char_trigram_df)
        st.markdown(get_download_link_csv(char_trigram_df, "char_trigrams.csv"), unsafe_allow_html=True)

        # Word-level trigrams, most frequent first.
        st.write("Top 20 Word Trigrams")
        word_trigram_df = pd.DataFrame([
            {'Trigram': ' - '.join(trigram), 'Count': count}
            for trigram, count in word_trigrams.most_common(20)
        ])
        st.dataframe(word_trigram_df)
        st.markdown(get_download_link_csv(word_trigram_df, "word_trigrams.csv"), unsafe_allow_html=True)

    with st.expander("Character Position Analysis"):
        # max_slots bounds the heatmap's x-axis; slot_summary itself is unused here.
        slot_summary, max_slots = analyze_slot_structure(chars_list)
        st.plotly_chart(plot_char_positions(char_positions, max_slots))

    with st.expander("Folio Similarity Analysis"):
        st.plotly_chart(create_folio_word_scatter(folio_word_map))

    with st.expander("12-Slot Character Frequency Table"):
        slot_freq_df = create_12_slot_table(chars_list)
        st.dataframe(slot_freq_df)
        st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
|