Update app.py
app.py
CHANGED
@@ -191,9 +191,32 @@ def get_download_link_csv(df, filename):
     href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
     return href

+# New Features
+@st.cache_data
+def analyze_word_lengths(words, chars_list):
+    word_lengths = [len(chars) for chars in chars_list]
+    length_counter = Counter(word_lengths)
+
+    # Group words by length
+    words_by_length = defaultdict(list)
+    for word, chars in zip(words, chars_list):
+        words_by_length[len(chars)].append((word, chars))
+
+    return word_lengths, length_counter, words_by_length
+
+@st.cache_data
+def analyze_symbol_transitions(char_connections):
+    G = nx.DiGraph()
+
+    for symbol1, connections in char_connections.items():
+        for symbol2, count in connections.items():
+            G.add_edge(symbol1, symbol2, weight=count)
+
+    return G
+
 # Main App
 st.title("Voynich Manuscript Analyzer")
-st.write("Upload your CSV file to discover potential patterns and
+st.write("Upload your CSV file to discover potential patterns and symbol distributions.")

 uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

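A note on the new helpers: the diff never shows how analyze_csv builds char_connections, but the loop in analyze_symbol_transitions implies a mapping from each symbol to the counts of the symbols that follow it. A minimal sketch under that assumption, with made-up sample data (not from the manuscript):

    from collections import Counter, defaultdict

    import networkx as nx

    # Assumed shape: symbol -> Counter of successor symbols.
    char_connections = defaultdict(Counter)
    for chars in [["q", "o", "k"], ["o", "k", "y"], ["q", "o", "y"]]:
        for a, b in zip(chars, chars[1:]):
            char_connections[a][b] += 1

    # Same construction as the new analyze_symbol_transitions helper:
    # one weighted directed edge per observed symbol transition.
    G = nx.DiGraph()
    for symbol1, connections in char_connections.items():
        for symbol2, count in connections.items():
            G.add_edge(symbol1, symbol2, weight=count)

    print(G["o"]["k"]["weight"])  # 2: "o" is followed by "k" twice in the sample
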
@@ -201,59 +224,35 @@ if uploaded_file is not None:
     df = load_data(uploaded_file)
     words, chars_list, char_positions, char_connections, word_positions, folio_word_map = analyze_csv(df)

+    # Basic Statistics
     with st.expander("Basic Statistics"):
         st.write(f"Total words: {len(words)}")
         st.write(f"Total unique words: {len(set(words))}")
         unique_chars = set(char for chars in chars_list for char in chars)
-        st.write(f"Total unique characters: {len(unique_chars)}")
-        st.write("Unique characters:", ", ".join(sorted(unique_chars)))
+        st.write(f"Total unique symbols: {len(unique_chars)}")
+        st.write("Unique symbols:", ", ".join(sorted(unique_chars)))

-
-
-
-        st.write("Top 20 Character Trigrams")
-        char_trigram_df = pd.DataFrame([
-            {'Trigram': ' - '.join(trigram), 'Count': count}
-            for trigram, count in char_trigrams.most_common(20)
-        ])
-        st.dataframe(char_trigram_df)
-        st.markdown(get_download_link_csv(char_trigram_df, "char_trigrams.csv"), unsafe_allow_html=True)
+    # Word Length Analysis
+    with st.expander("Word Length Analysis"):
+        word_lengths, length_counter, words_by_length = analyze_word_lengths(words, chars_list)

-        st.write("Top 20 Word Trigrams")
-        word_trigram_df = pd.DataFrame([
-            {'Trigram': ' - '.join(trigram), 'Count': count}
-            for trigram, count in word_trigrams.most_common(20)
-        ])
-        st.dataframe(word_trigram_df)
-        st.markdown(get_download_link_csv(word_trigram_df, "word_trigrams.csv"), unsafe_allow_html=True)
-
-    with st.expander("Character Position Analysis"):
-        slot_summary, max_slots = analyze_slot_structure(chars_list)
-        st.plotly_chart(plot_char_positions(char_positions, max_slots))
-
-    with st.expander("Folio Similarity Analysis"):
-        st.plotly_chart(create_folio_word_scatter(folio_word_map))
-
-    with st.expander("12-Slot Character Frequency Table"):
-        slot_freq_df = create_12_slot_table(chars_list)
-        st.dataframe(slot_freq_df)
-        st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
-
-    with st.expander("Word Length Distribution"):
-        word_lengths = [len(chars) for chars in chars_list]
+        st.write("Word Length Distribution")
         fig = px.histogram(word_lengths, nbins=20, labels={'value': 'Word Length', 'count': 'Frequency'})
         fig.update_layout(title="Word Length Distribution")
         st.plotly_chart(fig)
-
-    with st.expander("Character Bigram Network"):
-        char_bigrams = Counter()
-        for chars in chars_list:
-            for i in range(len(chars)-1):
-                char_bigrams[tuple(chars[i:i+2])] += 1

-
-
-
+        st.write("Most Common Words by Length")
+        selected_length = st.selectbox("Select word length", sorted(words_by_length.keys()))
+        if selected_length:
+            words_of_length = words_by_length[selected_length]
+            st.write(f"Top 10 {selected_length}-symbol words:")
+            top_words = Counter([word for word, _ in words_of_length]).most_common(10)
+            top_words_df = pd.DataFrame([{'Word': word, 'Count': count} for word, count in top_words])
+            st.dataframe(top_words_df)
+
+    # Symbol Transition Network
+    with st.expander("Symbol Transition Network"):
+        G = analyze_symbol_transitions(char_connections)

        pos = nx.spring_layout(G)
        edge_trace = []

@@ -272,9 +271,25 @@ if uploaded_file is not None:
        )

        fig = go.Figure(data=edge_trace + [node_trace])
-        fig.update_layout(title="Character Bigram Network")
+        fig.update_layout(title="Symbol Transition Network", showlegend=False)
        st.plotly_chart(fig)
-
+
+    # 12-Slot Symbol Frequency Table
+    with st.expander("12-Slot Symbol Frequency Table"):
+        slot_freq_df = create_12_slot_table(chars_list)
+        st.dataframe(slot_freq_df)
+        st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
+
+    # Symbol Position Heatmap
+    with st.expander("Symbol Position Heatmap"):
+        slot_summary, max_slots = analyze_slot_structure(chars_list)
+        st.plotly_chart(plot_char_positions(char_positions, max_slots))
+
+    # Folio Similarity Analysis
+    with st.expander("Folio Similarity Analysis"):
+        st.plotly_chart(create_folio_word_scatter(folio_word_map))
+
+    # Line Viewer
    with st.expander("Line Viewer"):
        available_folios = sorted(set(line_data['folio'] for line_data in word_positions))
        selected_folio = st.selectbox("Select Folio:", [''] + available_folios)

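Both versions leave the trace-building lines between edge_trace = [] and go.Figure(...) unchanged, so they sit outside the hunks above. For orientation only, a typical Plotly rendering of a networkx spring layout looks like the sketch below; the variable names mirror the app's, but the body is illustrative, not the app's actual code:

    import networkx as nx
    import plotly.graph_objects as go

    G = nx.DiGraph([("a", "b"), ("b", "c"), ("a", "c")])  # hypothetical sample graph
    pos = nx.spring_layout(G)

    # One line-segment trace per edge.
    edge_trace = []
    for u, v in G.edges():
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        edge_trace.append(go.Scatter(x=[x0, x1], y=[y0, y1], mode="lines",
                                     line=dict(width=1, color="#888")))

    # A single trace for all nodes, labelled with the symbol names.
    node_trace = go.Scatter(x=[pos[n][0] for n in G.nodes()],
                            y=[pos[n][1] for n in G.nodes()],
                            mode="markers+text", text=list(G.nodes()),
                            textposition="top center")

    fig = go.Figure(data=edge_trace + [node_trace])
    fig.update_layout(title="Symbol Transition Network", showlegend=False)
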
@@ -316,4 +331,52 @@ if uploaded_file is not None:
                '>
                {char}
                </div>
-                """, unsafe_allow_html=True)
+                """, unsafe_allow_html=True)
+
+    # First/Last Symbol Analysis
+    with st.expander("First/Last Symbol Analysis"):
+        first_chars = Counter(chars[0] for chars in chars_list)
+        last_chars = Counter(chars[-1] for chars in chars_list)
+
+        col1, col2 = st.columns(2)
+
+        with col1:
+            st.write("Most Common Initial Symbols")
+            first_df = pd.DataFrame(first_chars.most_common(10), columns=['Symbol', 'Count'])
+            st.dataframe(first_df)
+
+        with col2:
+            st.write("Most Common Final Symbols")
+            last_df = pd.DataFrame(last_chars.most_common(10), columns=['Symbol', 'Count'])
+            st.dataframe(last_df)
+
+    # Symbol Trigram Patterns
+    with st.expander("Symbol Trigram Patterns"):
+        char_trigrams = Counter()
+        for chars in chars_list:
+            if len(chars) >= 3:
+                for i in range(len(chars)-2):
+                    char_trigrams[tuple(chars[i:i+3])] += 1
+
+        st.write("Top 20 Symbol Trigrams")
+        trigram_df = pd.DataFrame([{'Trigram': ' - '.join(trigram), 'Count': count}
+                                   for trigram, count in char_trigrams.most_common(20)])
+        st.dataframe(trigram_df)
+
+    # Word Length Correlation Matrix
+    with st.expander("Word Length Correlation Matrix"):
+        word_lengths_by_line = []
+        for line_data in word_positions:
+            line_word_lengths = [len(chars) for _, _, chars in line_data['words']]
+            if len(line_word_lengths) >= 5:  # Only lines with 5+ words
+                word_lengths_by_line.append(line_word_lengths[:5])  # First 5 words
+
+        if word_lengths_by_line:
+            length_corr = np.corrcoef(np.array(word_lengths_by_line).T)
+            fig = px.imshow(length_corr,
+                            labels=dict(x="Position", y="Position", color="Correlation"),
+                            x=[f"Pos {i+1}" for i in range(5)],
+                            y=[f"Pos {i+1}" for i in range(5)],
+                            color_continuous_scale='RdBu')  # note: Plotly has no 'coolwarm' scale; 'RdBu' is its diverging equivalent
+            fig.update_layout(title="Word Length Correlations by Position")
+            st.plotly_chart(fig)

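On the new correlation matrix: np.corrcoef treats each row of its input as one variable, so transposing the (lines x 5) array makes each word position a variable and yields a 5x5 matrix of position-to-position correlations. A self-contained check with made-up lengths:

    import numpy as np

    # Hypothetical lengths of the first 5 words on 4 lines (rows = lines).
    word_lengths_by_line = [
        [5, 4, 6, 3, 7],
        [6, 4, 5, 3, 6],
        [4, 3, 6, 2, 7],
        [7, 5, 7, 4, 8],
    ]

    # Transpose so each row is one word position across all lines.
    length_corr = np.corrcoef(np.array(word_lengths_by_line).T)
    print(length_corr.shape)                        # (5, 5)
    print(np.allclose(np.diag(length_corr), 1.0))   # True: self-correlation is 1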