Update app.py
Browse files
app.py
CHANGED
|
@@ -16,6 +16,23 @@ import networkx as nx
|
|
| 16 |
# Set page configuration
|
| 17 |
st.set_page_config(layout="wide", page_title="Voynich Manuscript Analyzer", page_icon="📜")
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
# Caching expensive computations
|
| 20 |
@st.cache_data
|
| 21 |
def load_data(uploaded_file):
|
|
@@ -228,7 +245,7 @@ if uploaded_file is not None:
|
|
| 228 |
words, chars_list, char_positions, char_connections, word_positions, folio_word_map = analyze_csv(df)
|
| 229 |
|
| 230 |
# Basic Statistics
|
| 231 |
-
with st.expander("Basic Statistics"):
|
| 232 |
st.write(f"Total words: {len(words)}")
|
| 233 |
st.write(f"Total unique words: {len(set(words))}")
|
| 234 |
unique_chars = set(char for chars in chars_list for char in chars)
|
|
@@ -236,7 +253,7 @@ if uploaded_file is not None:
|
|
| 236 |
st.write("Unique symbols:", ", ".join(sorted(unique_chars)))
|
| 237 |
|
| 238 |
# Word Length Analysis
|
| 239 |
-
with st.expander("Word Length Analysis"):
|
| 240 |
word_lengths, length_counter, words_by_length = analyze_word_lengths(words, chars_list)
|
| 241 |
|
| 242 |
st.write("Word Length Distribution")
|
|
@@ -254,7 +271,7 @@ if uploaded_file is not None:
|
|
| 254 |
st.dataframe(top_words_df)
|
| 255 |
|
| 256 |
# Symbol Transition Network
|
| 257 |
-
with st.expander("Symbol Transition Network"):
|
| 258 |
G = analyze_symbol_transitions(char_connections)
|
| 259 |
|
| 260 |
pos = nx.spring_layout(G)
|
|
@@ -278,22 +295,22 @@ if uploaded_file is not None:
|
|
| 278 |
st.plotly_chart(fig)
|
| 279 |
|
| 280 |
# 12-Slot Character Frequency Table
|
| 281 |
-
with st.expander("12-Slot Symbol Frequency Table"):
|
| 282 |
slot_freq_df = create_12_slot_table(chars_list)
|
| 283 |
st.dataframe(slot_freq_df)
|
| 284 |
st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
|
| 285 |
|
| 286 |
# Character Position Heatmap
|
| 287 |
-
with st.expander("Symbol Position Heatmap"):
|
| 288 |
slot_summary, max_slots = analyze_slot_structure(chars_list)
|
| 289 |
st.plotly_chart(plot_char_positions(char_positions, max_slots))
|
| 290 |
|
| 291 |
# Folio Similarity Analysis
|
| 292 |
-
with st.expander("Folio Similarity Analysis"):
|
| 293 |
st.plotly_chart(create_folio_word_scatter(folio_word_map))
|
| 294 |
|
| 295 |
# Word Length Visualization
|
| 296 |
-
with st.expander("Word Length Visualization"):
|
| 297 |
# Group words by length
|
| 298 |
words_by_length = defaultdict(list)
|
| 299 |
for word, chars in zip(words, chars_list):
|
|
@@ -337,7 +354,7 @@ if uploaded_file is not None:
|
|
| 337 |
""", unsafe_allow_html=True)
|
| 338 |
|
| 339 |
# Line Viewer
|
| 340 |
-
with st.expander("Line Viewer"):
|
| 341 |
available_folios = sorted(set(line_data['folio'] for line_data in word_positions))
|
| 342 |
selected_folio = st.selectbox("Select Folio:", [''] + available_folios, key="folio_select")
|
| 343 |
|
|
@@ -381,7 +398,7 @@ if uploaded_file is not None:
|
|
| 381 |
""", unsafe_allow_html=True)
|
| 382 |
|
| 383 |
# First/Last Symbol Analysis
|
| 384 |
-
with st.expander("First/Last Symbol Analysis"):
|
| 385 |
first_chars = Counter(chars[0] for chars in chars_list)
|
| 386 |
last_chars = Counter(chars[-1] for chars in chars_list)
|
| 387 |
|
|
@@ -398,7 +415,7 @@ if uploaded_file is not None:
|
|
| 398 |
st.dataframe(last_df)
|
| 399 |
|
| 400 |
# Symbol Trigram Patterns
|
| 401 |
-
with st.expander("Symbol Trigram Patterns"):
|
| 402 |
char_trigrams = Counter()
|
| 403 |
for chars in chars_list:
|
| 404 |
if len(chars) >= 3:
|
|
@@ -411,7 +428,7 @@ if uploaded_file is not None:
|
|
| 411 |
st.dataframe(trigram_df)
|
| 412 |
|
| 413 |
# Word Length Correlation Matrix
|
| 414 |
-
with st.expander("Word Length Correlation Matrix"):
|
| 415 |
word_lengths_by_line = []
|
| 416 |
for line_data in word_positions:
|
| 417 |
line_word_lengths = [len(chars) for _, _, chars in line_data['words']]
|
|
@@ -429,45 +446,53 @@ if uploaded_file is not None:
|
|
| 429 |
st.plotly_chart(fig)
|
| 430 |
|
| 431 |
# Folio Clustering Section
|
| 432 |
-
with st.expander("Folio Clustering Based on Word Usage Patterns"):
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
# Dimensionality Reduction Option
|
| 449 |
-
dim_reduction_method = st.selectbox("Select Dimensionality Reduction Method", ["PCA", "t-SNE"], key="dim_reduction_method")
|
| 450 |
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
n_clusters = st.slider("Select Number of Clusters", 2, 10, 3, key="n_clusters")
|
| 460 |
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
|
| 461 |
clusters = kmeans.fit_predict(word_freq_matrix)
|
| 462 |
-
|
| 463 |
-
#
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
# Set page configuration
|
| 17 |
st.set_page_config(layout="wide", page_title="Voynich Manuscript Analyzer", page_icon="📜")
|
| 18 |
|
| 19 |
+
# Initialize session state for multiple expanders
if 'expander_states' not in st.session_state:
    # One boolean per analysis section, listed in page order. Each value is
    # read back by the matching st.expander(..., expanded=...) call, and all
    # of them start out True.
    st.session_state.expander_states = dict.fromkeys(
        (
            'basic_statistics',                # Expander 1: Basic Statistics
            'word_length_analysis',            # Expander 2: Word Length Analysis
            'symbol_transition_network',       # Expander 3: Symbol Transition Network
            'slot_frequency_table',            # Expander 4: 12-Slot Symbol Frequency Table
            'symbol_position_heatmap',         # Expander 5: Symbol Position Heatmap
            'folio_similarity_analysis',       # Expander 6: Folio Similarity Analysis
            'word_length_visualization',       # Expander 7: Word Length Visualization
            'line_viewer',                     # Expander 8: Line Viewer
            'first_last_symbol_analysis',      # Expander 9: First/Last Symbol Analysis
            'symbol_trigram_patterns',         # Expander 10: Symbol Trigram Patterns
            'word_length_correlation_matrix',  # Expander 11: Word Length Correlation Matrix
            'folio_clustering',                # Expander 12: Folio Clustering
        ),
        True,
    )
|
| 35 |
+
|
| 36 |
# Caching expensive computations
|
| 37 |
@st.cache_data
|
| 38 |
def load_data(uploaded_file):
|
|
|
|
| 245 |
words, chars_list, char_positions, char_connections, word_positions, folio_word_map = analyze_csv(df)
|
| 246 |
|
| 247 |
# Basic Statistics
|
| 248 |
+
with st.expander("Basic Statistics", expanded=st.session_state.expander_states['basic_statistics']):
|
| 249 |
st.write(f"Total words: {len(words)}")
|
| 250 |
st.write(f"Total unique words: {len(set(words))}")
|
| 251 |
unique_chars = set(char for chars in chars_list for char in chars)
|
|
|
|
| 253 |
st.write("Unique symbols:", ", ".join(sorted(unique_chars)))
|
| 254 |
|
| 255 |
# Word Length Analysis
|
| 256 |
+
with st.expander("Word Length Analysis", expanded=st.session_state.expander_states['word_length_analysis']):
|
| 257 |
word_lengths, length_counter, words_by_length = analyze_word_lengths(words, chars_list)
|
| 258 |
|
| 259 |
st.write("Word Length Distribution")
|
|
|
|
| 271 |
st.dataframe(top_words_df)
|
| 272 |
|
| 273 |
# Symbol Transition Network
|
| 274 |
+
with st.expander("Symbol Transition Network", expanded=st.session_state.expander_states['symbol_transition_network']):
|
| 275 |
G = analyze_symbol_transitions(char_connections)
|
| 276 |
|
| 277 |
pos = nx.spring_layout(G)
|
|
|
|
| 295 |
st.plotly_chart(fig)
|
| 296 |
|
| 297 |
# 12-Slot Character Frequency Table
|
| 298 |
+
with st.expander("12-Slot Symbol Frequency Table", expanded=st.session_state.expander_states['slot_frequency_table']):
|
| 299 |
slot_freq_df = create_12_slot_table(chars_list)
|
| 300 |
st.dataframe(slot_freq_df)
|
| 301 |
st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
|
| 302 |
|
| 303 |
# Character Position Heatmap
|
| 304 |
+
with st.expander("Symbol Position Heatmap", expanded=st.session_state.expander_states['symbol_position_heatmap']):
|
| 305 |
slot_summary, max_slots = analyze_slot_structure(chars_list)
|
| 306 |
st.plotly_chart(plot_char_positions(char_positions, max_slots))
|
| 307 |
|
| 308 |
# Folio Similarity Analysis
|
| 309 |
+
with st.expander("Folio Similarity Analysis", expanded=st.session_state.expander_states['folio_similarity_analysis']):
|
| 310 |
st.plotly_chart(create_folio_word_scatter(folio_word_map))
|
| 311 |
|
| 312 |
# Word Length Visualization
|
| 313 |
+
with st.expander("Word Length Visualization", expanded=st.session_state.expander_states['word_length_visualization']):
|
| 314 |
# Group words by length
|
| 315 |
words_by_length = defaultdict(list)
|
| 316 |
for word, chars in zip(words, chars_list):
|
|
|
|
| 354 |
""", unsafe_allow_html=True)
|
| 355 |
|
| 356 |
# Line Viewer
|
| 357 |
+
with st.expander("Line Viewer", expanded=st.session_state.expander_states['line_viewer']):
|
| 358 |
available_folios = sorted(set(line_data['folio'] for line_data in word_positions))
|
| 359 |
selected_folio = st.selectbox("Select Folio:", [''] + available_folios, key="folio_select")
|
| 360 |
|
|
|
|
| 398 |
""", unsafe_allow_html=True)
|
| 399 |
|
| 400 |
# First/Last Symbol Analysis
|
| 401 |
+
with st.expander("First/Last Symbol Analysis", expanded=st.session_state.expander_states['first_last_symbol_analysis']):
|
| 402 |
first_chars = Counter(chars[0] for chars in chars_list)
|
| 403 |
last_chars = Counter(chars[-1] for chars in chars_list)
|
| 404 |
|
|
|
|
| 415 |
st.dataframe(last_df)
|
| 416 |
|
| 417 |
# Symbol Trigram Patterns
|
| 418 |
+
with st.expander("Symbol Trigram Patterns", expanded=st.session_state.expander_states['symbol_trigram_patterns']):
|
| 419 |
char_trigrams = Counter()
|
| 420 |
for chars in chars_list:
|
| 421 |
if len(chars) >= 3:
|
|
|
|
| 428 |
st.dataframe(trigram_df)
|
| 429 |
|
| 430 |
# Word Length Correlation Matrix
|
| 431 |
+
with st.expander("Word Length Correlation Matrix", expanded=st.session_state.expander_states['word_length_correlation_matrix']):
|
| 432 |
word_lengths_by_line = []
|
| 433 |
for line_data in word_positions:
|
| 434 |
line_word_lengths = [len(chars) for _, _, chars in line_data['words']]
|
|
|
|
| 446 |
st.plotly_chart(fig)
|
| 447 |
|
| 448 |
# Folio Clustering Section
with st.expander("Folio Clustering Based on Word Usage Patterns", expanded=st.session_state.expander_states['folio_clustering']):
    st.write("""
    This section groups folios into clusters based on their word usage patterns.
    - **PCA**: Reduces the data to 2D using Principal Component Analysis.
    - **t-SNE**: Reduces the data to 2D using t-Distributed Stochastic Neighbor Embedding.
    - **K-Means**: Groups folios into clusters based on their word frequencies.
    - **DBSCAN**: Groups folios by density using the cosine distance between their word-frequency vectors; folios labelled -1 are noise.
    """)

    # Feature Extraction
    # Build the folio x word count matrix in a single constructor call: one
    # row per folio, one column per distinct word, absent words filled with 0.
    # (Per-cell .loc assignment is very slow and this script reruns on every
    # widget interaction.)
    folio_labels = list(folio_word_map.keys())
    word_freq_matrix = pd.DataFrame(
        [dict(word_counter) for word_counter in folio_word_map.values()],
        index=folio_labels,
    ).fillna(0)
    # Plain float array for scikit-learn, so column labels never matter.
    features = word_freq_matrix.to_numpy(dtype=float)
    n_folios, n_words = features.shape

    if n_folios < 3 or n_words < 2:
        # A 2-component projection needs >= 2 samples and >= 2 features, and
        # the cluster sliders below need a non-empty range starting at 2.
        st.warning("Folio clustering needs at least 3 folios and 2 distinct words.")
    else:
        # Dimensionality Reduction Option
        dim_reduction_method = st.selectbox("Select Dimensionality Reduction Method", ["PCA", "t-SNE"], key="dim_reduction_method")

        if dim_reduction_method == "PCA":
            reducer = PCA(n_components=2)
        else:
            # t-SNE requires perplexity < number of samples; its default of 30
            # would fail for manuscripts with 30 or fewer folios.
            reducer = TSNE(n_components=2, random_state=42, perplexity=min(30, n_folios - 1))
        folio_coords = reducer.fit_transform(features)

        # Clustering Algorithm Option
        clustering_method = st.selectbox("Select Clustering Algorithm", ["K-Means", "DBSCAN"], key="clustering_method")

        if clustering_method == "K-Means":
            # K-Means Clustering
            # Cap the slider at the folio count: K-Means cannot form more
            # clusters than there are samples.
            n_clusters = st.slider("Select Number of Clusters", 2, min(10, n_folios), 3, key="n_clusters")
            # n_init is pinned so results do not depend on the installed
            # scikit-learn version's default.
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            clusters = kmeans.fit_predict(features)
        else:
            # DBSCAN Clustering
            # Raw count vectors of two different folios are at least 1.0 apart
            # in Euclidean distance, so a Euclidean eps of 0.5 marks nearly
            # every folio as noise. Cosine distance is bounded in [0, 1] for
            # count data, which makes eps meaningful and user-tunable.
            eps = st.slider("DBSCAN eps (cosine distance)", 0.05, 1.0, 0.5, 0.05, key="dbscan_eps")
            min_samples = st.slider("DBSCAN min_samples", 2, min(10, n_folios), min(5, n_folios), key="dbscan_min_samples")
            dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric="cosine")
            clusters = dbscan.fit_predict(features)

        # Visualization
        plot_data = pd.DataFrame({
            'Folio': folio_labels,
            'Dim1': folio_coords[:, 0],
            'Dim2': folio_coords[:, 1],
            # Cluster ids are categories, not magnitudes: string labels make
            # Plotly use a discrete colour per cluster instead of a gradient.
            'Cluster': clusters.astype(str),
        })

        fig = px.scatter(plot_data, x='Dim1', y='Dim2', color='Cluster',
                         hover_name='Folio', title=f"Folio Clustering ({dim_reduction_method}, {clustering_method})")
        st.plotly_chart(fig)
|