kambris committed on
Commit
f57ddb1
·
verified ·
1 Parent(s): f7c8af2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -48
app.py CHANGED
@@ -16,6 +16,23 @@ import networkx as nx
16
  # Set page configuration
17
  st.set_page_config(layout="wide", page_title="Voynich Manuscript Analyzer", page_icon="📜")
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # Caching expensive computations
20
  @st.cache_data
21
  def load_data(uploaded_file):
@@ -228,7 +245,7 @@ if uploaded_file is not None:
228
  words, chars_list, char_positions, char_connections, word_positions, folio_word_map = analyze_csv(df)
229
 
230
  # Basic Statistics
231
- with st.expander("Basic Statistics"):
232
  st.write(f"Total words: {len(words)}")
233
  st.write(f"Total unique words: {len(set(words))}")
234
  unique_chars = set(char for chars in chars_list for char in chars)
@@ -236,7 +253,7 @@ if uploaded_file is not None:
236
  st.write("Unique symbols:", ", ".join(sorted(unique_chars)))
237
 
238
  # Word Length Analysis
239
- with st.expander("Word Length Analysis"):
240
  word_lengths, length_counter, words_by_length = analyze_word_lengths(words, chars_list)
241
 
242
  st.write("Word Length Distribution")
@@ -254,7 +271,7 @@ if uploaded_file is not None:
254
  st.dataframe(top_words_df)
255
 
256
  # Symbol Transition Network
257
- with st.expander("Symbol Transition Network"):
258
  G = analyze_symbol_transitions(char_connections)
259
 
260
  pos = nx.spring_layout(G)
@@ -278,22 +295,22 @@ if uploaded_file is not None:
278
  st.plotly_chart(fig)
279
 
280
  # 12-Slot Character Frequency Table
281
- with st.expander("12-Slot Symbol Frequency Table"):
282
  slot_freq_df = create_12_slot_table(chars_list)
283
  st.dataframe(slot_freq_df)
284
  st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
285
 
286
  # Character Position Heatmap
287
- with st.expander("Symbol Position Heatmap"):
288
  slot_summary, max_slots = analyze_slot_structure(chars_list)
289
  st.plotly_chart(plot_char_positions(char_positions, max_slots))
290
 
291
  # Folio Similarity Analysis
292
- with st.expander("Folio Similarity Analysis"):
293
  st.plotly_chart(create_folio_word_scatter(folio_word_map))
294
 
295
  # Word Length Visualization
296
- with st.expander("Word Length Visualization"):
297
  # Group words by length
298
  words_by_length = defaultdict(list)
299
  for word, chars in zip(words, chars_list):
@@ -337,7 +354,7 @@ if uploaded_file is not None:
337
  """, unsafe_allow_html=True)
338
 
339
  # Line Viewer
340
- with st.expander("Line Viewer"):
341
  available_folios = sorted(set(line_data['folio'] for line_data in word_positions))
342
  selected_folio = st.selectbox("Select Folio:", [''] + available_folios, key="folio_select")
343
 
@@ -381,7 +398,7 @@ if uploaded_file is not None:
381
  """, unsafe_allow_html=True)
382
 
383
  # First/Last Symbol Analysis
384
- with st.expander("First/Last Symbol Analysis"):
385
  first_chars = Counter(chars[0] for chars in chars_list)
386
  last_chars = Counter(chars[-1] for chars in chars_list)
387
 
@@ -398,7 +415,7 @@ if uploaded_file is not None:
398
  st.dataframe(last_df)
399
 
400
  # Symbol Trigram Patterns
401
- with st.expander("Symbol Trigram Patterns"):
402
  char_trigrams = Counter()
403
  for chars in chars_list:
404
  if len(chars) >= 3:
@@ -411,7 +428,7 @@ if uploaded_file is not None:
411
  st.dataframe(trigram_df)
412
 
413
  # Word Length Correlation Matrix
414
- with st.expander("Word Length Correlation Matrix"):
415
  word_lengths_by_line = []
416
  for line_data in word_positions:
417
  line_word_lengths = [len(chars) for _, _, chars in line_data['words']]
@@ -429,45 +446,53 @@ if uploaded_file is not None:
429
  st.plotly_chart(fig)
430
 
431
  # Folio Clustering Section
432
- with st.expander("Folio Clustering Based on Word Usage Patterns"):
433
- st.write("""
434
- This section groups folios into clusters based on their word usage patterns.
435
- - **PCA**: Reduces the data to 2D using Principal Component Analysis.
436
- - **t-SNE**: Reduces the data to 2D using t-Distributed Stochastic Neighbor Embedding.
437
- - **K-Means**: Groups folios into clusters based on their word frequencies.
438
- """)
439
-
440
- # Feature Extraction
441
- all_words = set(word for folio in folio_word_map for word in folio_word_map[folio])
442
- word_freq_matrix = pd.DataFrame(index=folio_word_map.keys(), columns=list(all_words), data=0) # Convert set to list
443
 
444
- for folio, word_counter in folio_word_map.items():
445
- for word, count in word_counter.items():
446
- word_freq_matrix.loc[folio, word] = count
447
-
448
- # Dimensionality Reduction Option
449
- dim_reduction_method = st.selectbox("Select Dimensionality Reduction Method", ["PCA", "t-SNE"], key="dim_reduction_method")
450
 
451
- if dim_reduction_method == "PCA":
452
- reducer = PCA(n_components=2)
453
- folio_coords = reducer.fit_transform(word_freq_matrix)
454
- else:
455
- reducer = TSNE(n_components=2, random_state=42)
456
- folio_coords = reducer.fit_transform(word_freq_matrix)
457
-
458
- # Clustering (K-Means)
 
 
 
 
 
 
 
459
  n_clusters = st.slider("Select Number of Clusters", 2, 10, 3, key="n_clusters")
460
  kmeans = KMeans(n_clusters=n_clusters, random_state=42)
461
  clusters = kmeans.fit_predict(word_freq_matrix)
462
-
463
- # Visualization
464
- plot_data = pd.DataFrame({
465
- 'Folio': word_freq_matrix.index,
466
- 'Dim1': folio_coords[:, 0],
467
- 'Dim2': folio_coords[:, 1],
468
- 'Cluster': clusters
469
- })
470
-
471
- fig = px.scatter(plot_data, x='Dim1', y='Dim2', color='Cluster',
472
- hover_name='Folio', title=f"Folio Clustering ({dim_reduction_method})")
473
- st.plotly_chart(fig)
 
 
 
 
 
16
  # Set page configuration
17
  st.set_page_config(layout="wide", page_title="Voynich Manuscript Analyzer", page_icon="📜")
18
 
19
+ # Initialize session state for multiple expanders
20
+ if 'expander_states' not in st.session_state:
21
+ st.session_state.expander_states = {
22
+ 'basic_statistics': True, # Expander 1: Basic Statistics
23
+ 'word_length_analysis': True, # Expander 2: Word Length Analysis
24
+ 'symbol_transition_network': True, # Expander 3: Symbol Transition Network
25
+ 'slot_frequency_table': True, # Expander 4: 12-Slot Symbol Frequency Table
26
+ 'symbol_position_heatmap': True, # Expander 5: Symbol Position Heatmap
27
+ 'folio_similarity_analysis': True, # Expander 6: Folio Similarity Analysis
28
+ 'word_length_visualization': True, # Expander 7: Word Length Visualization
29
+ 'line_viewer': True, # Expander 8: Line Viewer
30
+ 'first_last_symbol_analysis': True, # Expander 9: First/Last Symbol Analysis
31
+ 'symbol_trigram_patterns': True, # Expander 10: Symbol Trigram Patterns
32
+ 'word_length_correlation_matrix': True, # Expander 11: Word Length Correlation Matrix
33
+ 'folio_clustering': True # Expander 12: Folio Clustering
34
+ }
35
+
36
  # Caching expensive computations
37
  @st.cache_data
38
  def load_data(uploaded_file):
 
245
  words, chars_list, char_positions, char_connections, word_positions, folio_word_map = analyze_csv(df)
246
 
247
  # Basic Statistics
248
+ with st.expander("Basic Statistics", expanded=st.session_state.expander_states['basic_statistics']):
249
  st.write(f"Total words: {len(words)}")
250
  st.write(f"Total unique words: {len(set(words))}")
251
  unique_chars = set(char for chars in chars_list for char in chars)
 
253
  st.write("Unique symbols:", ", ".join(sorted(unique_chars)))
254
 
255
  # Word Length Analysis
256
+ with st.expander("Word Length Analysis", expanded=st.session_state.expander_states['word_length_analysis']):
257
  word_lengths, length_counter, words_by_length = analyze_word_lengths(words, chars_list)
258
 
259
  st.write("Word Length Distribution")
 
271
  st.dataframe(top_words_df)
272
 
273
  # Symbol Transition Network
274
+ with st.expander("Symbol Transition Network", expanded=st.session_state.expander_states['symbol_transition_network']):
275
  G = analyze_symbol_transitions(char_connections)
276
 
277
  pos = nx.spring_layout(G)
 
295
  st.plotly_chart(fig)
296
 
297
  # 12-Slot Character Frequency Table
298
+ with st.expander("12-Slot Symbol Frequency Table", expanded=st.session_state.expander_states['slot_frequency_table']):
299
  slot_freq_df = create_12_slot_table(chars_list)
300
  st.dataframe(slot_freq_df)
301
  st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
302
 
303
  # Character Position Heatmap
304
+ with st.expander("Symbol Position Heatmap", expanded=st.session_state.expander_states['symbol_position_heatmap']):
305
  slot_summary, max_slots = analyze_slot_structure(chars_list)
306
  st.plotly_chart(plot_char_positions(char_positions, max_slots))
307
 
308
  # Folio Similarity Analysis
309
+ with st.expander("Folio Similarity Analysis", expanded=st.session_state.expander_states['folio_similarity_analysis']):
310
  st.plotly_chart(create_folio_word_scatter(folio_word_map))
311
 
312
  # Word Length Visualization
313
+ with st.expander("Word Length Visualization", expanded=st.session_state.expander_states['word_length_visualization']):
314
  # Group words by length
315
  words_by_length = defaultdict(list)
316
  for word, chars in zip(words, chars_list):
 
354
  """, unsafe_allow_html=True)
355
 
356
  # Line Viewer
357
+ with st.expander("Line Viewer", expanded=st.session_state.expander_states['line_viewer']):
358
  available_folios = sorted(set(line_data['folio'] for line_data in word_positions))
359
  selected_folio = st.selectbox("Select Folio:", [''] + available_folios, key="folio_select")
360
 
 
398
  """, unsafe_allow_html=True)
399
 
400
  # First/Last Symbol Analysis
401
+ with st.expander("First/Last Symbol Analysis", expanded=st.session_state.expander_states['first_last_symbol_analysis']):
402
  first_chars = Counter(chars[0] for chars in chars_list)
403
  last_chars = Counter(chars[-1] for chars in chars_list)
404
 
 
415
  st.dataframe(last_df)
416
 
417
  # Symbol Trigram Patterns
418
+ with st.expander("Symbol Trigram Patterns", expanded=st.session_state.expander_states['symbol_trigram_patterns']):
419
  char_trigrams = Counter()
420
  for chars in chars_list:
421
  if len(chars) >= 3:
 
428
  st.dataframe(trigram_df)
429
 
430
  # Word Length Correlation Matrix
431
+ with st.expander("Word Length Correlation Matrix", expanded=st.session_state.expander_states['word_length_correlation_matrix']):
432
  word_lengths_by_line = []
433
  for line_data in word_positions:
434
  line_word_lengths = [len(chars) for _, _, chars in line_data['words']]
 
446
  st.plotly_chart(fig)
447
 
448
  # Folio Clustering Section
449
+ with st.expander("Folio Clustering Based on Word Usage Patterns", expanded=st.session_state.expander_states['folio_clustering']):
450
+ st.write("""
451
+ This section groups folios into clusters based on their word usage patterns.
452
+ - **PCA**: Reduces the data to 2D using Principal Component Analysis.
453
+ - **t-SNE**: Reduces the data to 2D using t-Distributed Stochastic Neighbor Embedding.
454
+ - **K-Means**: Groups folios into clusters based on their word frequencies.
455
+ """)
456
+
457
+ # Feature Extraction
458
+ all_words = set(word for folio in folio_word_map for word in folio_word_map[folio])
459
+ word_freq_matrix = pd.DataFrame(index=folio_word_map.keys(), columns=list(all_words), data=0) # Convert set to list
460
 
461
+ for folio, word_counter in folio_word_map.items():
462
+ for word, count in word_counter.items():
463
+ word_freq_matrix.loc[folio, word] = count
 
 
 
464
 
465
+ # Dimensionality Reduction Option
466
+ dim_reduction_method = st.selectbox("Select Dimensionality Reduction Method", ["PCA", "t-SNE"], key="dim_reduction_method")
467
+
468
+ if dim_reduction_method == "PCA":
469
+ reducer = PCA(n_components=2)
470
+ folio_coords = reducer.fit_transform(word_freq_matrix)
471
+ else:
472
+ reducer = TSNE(n_components=2, random_state=42)
473
+ folio_coords = reducer.fit_transform(word_freq_matrix)
474
+
475
+ # Clustering Algorithm Option
476
+ clustering_method = st.selectbox("Select Clustering Algorithm", ["K-Means", "DBSCAN"], key="clustering_method")
477
+
478
+ if clustering_method == "K-Means":
479
+ # K-Means Clustering
480
  n_clusters = st.slider("Select Number of Clusters", 2, 10, 3, key="n_clusters")
481
  kmeans = KMeans(n_clusters=n_clusters, random_state=42)
482
  clusters = kmeans.fit_predict(word_freq_matrix)
483
+ else:
484
+ # DBSCAN Clustering
485
+ dbscan = DBSCAN(eps=0.5, min_samples=5) # Adjust parameters as needed
486
+ clusters = dbscan.fit_predict(word_freq_matrix)
487
+
488
+ # Visualization
489
+ plot_data = pd.DataFrame({
490
+ 'Folio': word_freq_matrix.index,
491
+ 'Dim1': folio_coords[:, 0],
492
+ 'Dim2': folio_coords[:, 1],
493
+ 'Cluster': clusters
494
+ })
495
+
496
+ fig = px.scatter(plot_data, x='Dim1', y='Dim2', color='Cluster',
497
+ hover_name='Folio', title=f"Folio Clustering ({dim_reduction_method}, {clustering_method})")
498
+ st.plotly_chart(fig)