kambris committed
Commit 25bbeb7 · verified · 1 Parent(s): c2b141b

Update app.py

Files changed (1)
  1. app.py +110 -47
app.py CHANGED
@@ -191,9 +191,32 @@ def get_download_link_csv(df, filename):
     href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
     return href
 
+# New Features
+@st.cache_data
+def analyze_word_lengths(words, chars_list):
+    word_lengths = [len(chars) for chars in chars_list]
+    length_counter = Counter(word_lengths)
+
+    # Group words by length
+    words_by_length = defaultdict(list)
+    for word, chars in zip(words, chars_list):
+        words_by_length[len(chars)].append((word, chars))
+
+    return word_lengths, length_counter, words_by_length
+
+@st.cache_data
+def analyze_symbol_transitions(char_connections):
+    G = nx.DiGraph()
+
+    for symbol1, connections in char_connections.items():
+        for symbol2, count in connections.items():
+            G.add_edge(symbol1, symbol2, weight=count)
+
+    return G
+
 # Main App
 st.title("Voynich Manuscript Analyzer")
-st.write("Upload your CSV file to discover potential patterns and character distributions.")
+st.write("Upload your CSV file to discover potential patterns and symbol distributions.")
 
 uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
 
@@ -201,59 +224,35 @@ if uploaded_file is not None:
     df = load_data(uploaded_file)
     words, chars_list, char_positions, char_connections, word_positions, folio_word_map = analyze_csv(df)
 
+    # Basic Statistics
     with st.expander("Basic Statistics"):
         st.write(f"Total words: {len(words)}")
         st.write(f"Total unique words: {len(set(words))}")
         unique_chars = set(char for chars in chars_list for char in chars)
-        st.write(f"Total unique characters: {len(unique_chars)}")
-        st.write("Unique characters:", ", ".join(sorted(unique_chars)))
+        st.write(f"Total unique symbols: {len(unique_chars)}")
+        st.write("Unique symbols:", ", ".join(sorted(unique_chars)))
 
-    with st.expander("Trigram Analysis"):
-        char_trigrams, word_trigrams = analyze_trigrams(words, chars_list)
-
-        st.write("Top 20 Character Trigrams")
-        char_trigram_df = pd.DataFrame([
-            {'Trigram': ' - '.join(trigram), 'Count': count}
-            for trigram, count in char_trigrams.most_common(20)
-        ])
-        st.dataframe(char_trigram_df)
-        st.markdown(get_download_link_csv(char_trigram_df, "char_trigrams.csv"), unsafe_allow_html=True)
+    # Word Length Analysis
+    with st.expander("Word Length Analysis"):
+        word_lengths, length_counter, words_by_length = analyze_word_lengths(words, chars_list)
 
-        st.write("Top 20 Word Trigrams")
-        word_trigram_df = pd.DataFrame([
-            {'Trigram': ' - '.join(trigram), 'Count': count}
-            for trigram, count in word_trigrams.most_common(20)
-        ])
-        st.dataframe(word_trigram_df)
-        st.markdown(get_download_link_csv(word_trigram_df, "word_trigrams.csv"), unsafe_allow_html=True)
-
-    with st.expander("Character Position Analysis"):
-        slot_summary, max_slots = analyze_slot_structure(chars_list)
-        st.plotly_chart(plot_char_positions(char_positions, max_slots))
-
-    with st.expander("Folio Similarity Analysis"):
-        st.plotly_chart(create_folio_word_scatter(folio_word_map))
-
-    with st.expander("12-Slot Character Frequency Table"):
-        slot_freq_df = create_12_slot_table(chars_list)
-        st.dataframe(slot_freq_df)
-        st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
-
-    with st.expander("Word Length Distribution"):
-        word_lengths = [len(chars) for chars in chars_list]
+        st.write("Word Length Distribution")
         fig = px.histogram(word_lengths, nbins=20, labels={'value': 'Word Length', 'count': 'Frequency'})
         fig.update_layout(title="Word Length Distribution")
         st.plotly_chart(fig)
-
-    with st.expander("Character Bigram Network"):
-        char_bigrams = Counter()
-        for chars in chars_list:
-            for i in range(len(chars)-1):
-                char_bigrams[tuple(chars[i:i+2])] += 1
 
-        G = nx.Graph()
-        for (char1, char2), count in char_bigrams.most_common(20):
-            G.add_edge(char1, char2, weight=count)
+        st.write("Most Common Words by Length")
+        selected_length = st.selectbox("Select word length", sorted(words_by_length.keys()))
+        if selected_length:
+            words_of_length = words_by_length[selected_length]
+            st.write(f"Top 10 {selected_length}-symbol words:")
+            top_words = Counter([word for word, _ in words_of_length]).most_common(10)
+            top_words_df = pd.DataFrame([{'Word': word, 'Count': count} for word, count in top_words])
+            st.dataframe(top_words_df)
+
+    # Symbol Transition Network
+    with st.expander("Symbol Transition Network"):
+        G = analyze_symbol_transitions(char_connections)
 
         pos = nx.spring_layout(G)
         edge_trace = []
@@ -272,9 +271,25 @@ if uploaded_file is not None:
         )
 
         fig = go.Figure(data=edge_trace + [node_trace])
-        fig.update_layout(title="Character Bigram Network", showlegend=False)
+        fig.update_layout(title="Symbol Transition Network", showlegend=False)
         st.plotly_chart(fig)
-
+
+    # 12-Slot Character Frequency Table
+    with st.expander("12-Slot Symbol Frequency Table"):
+        slot_freq_df = create_12_slot_table(chars_list)
+        st.dataframe(slot_freq_df)
+        st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
+
+    # Character Position Heatmap
+    with st.expander("Symbol Position Heatmap"):
+        slot_summary, max_slots = analyze_slot_structure(chars_list)
+        st.plotly_chart(plot_char_positions(char_positions, max_slots))
+
+    # Folio Similarity Analysis
+    with st.expander("Folio Similarity Analysis"):
+        st.plotly_chart(create_folio_word_scatter(folio_word_map))
+
+    # Line Viewer
     with st.expander("Line Viewer"):
         available_folios = sorted(set(line_data['folio'] for line_data in word_positions))
         selected_folio = st.selectbox("Select Folio:", [''] + available_folios)
@@ -316,4 +331,52 @@ if uploaded_file is not None:
                 '>
                 {char}
                 </div>
-                """, unsafe_allow_html=True)
+                """, unsafe_allow_html=True)
+
+    # First/Last Symbol Analysis
+    with st.expander("First/Last Symbol Analysis"):
+        first_chars = Counter(chars[0] for chars in chars_list)
+        last_chars = Counter(chars[-1] for chars in chars_list)
+
+        col1, col2 = st.columns(2)
+
+        with col1:
+            st.write("Most Common Initial Symbols")
+            first_df = pd.DataFrame(first_chars.most_common(10), columns=['Symbol', 'Count'])
+            st.dataframe(first_df)
+
+        with col2:
+            st.write("Most Common Final Symbols")
+            last_df = pd.DataFrame(last_chars.most_common(10), columns=['Symbol', 'Count'])
+            st.dataframe(last_df)
+
+    # Symbol Trigram Patterns
+    with st.expander("Symbol Trigram Patterns"):
+        char_trigrams = Counter()
+        for chars in chars_list:
+            if len(chars) >= 3:
+                for i in range(len(chars)-2):
+                    char_trigrams[tuple(chars[i:i+3])] += 1
+
+        st.write("Top 20 Symbol Trigrams")
+        trigram_df = pd.DataFrame([{'Trigram': ' - '.join(trigram), 'Count': count}
+                                   for trigram, count in char_trigrams.most_common(20)])
+        st.dataframe(trigram_df)
+
+    # Word Length Correlation Matrix
+    with st.expander("Word Length Correlation Matrix"):
+        word_lengths_by_line = []
+        for line_data in word_positions:
+            line_word_lengths = [len(chars) for _, _, chars in line_data['words']]
+            if len(line_word_lengths) >= 5:  # Only lines with 5+ words
+                word_lengths_by_line.append(line_word_lengths[:5])  # First 5 words
+
+        if word_lengths_by_line:
+            length_corr = np.corrcoef(np.array(word_lengths_by_line).T)
+            fig = px.imshow(length_corr,
+                            labels=dict(x="Position", y="Position", color="Correlation"),
+                            x=[f"Pos {i+1}" for i in range(5)],
+                            y=[f"Pos {i+1}" for i in range(5)],
+                            color_continuous_scale='coolwarm')
+            fig.update_layout(title="Word Length Correlations by Position")
+            st.plotly_chart(fig)
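For reference, a minimal sketch (not part of the commit) of the input shapes the two new cached helpers appear to expect. `analyze_csv()` is not shown in this diff, so the nested layout of `char_connections` below is an assumption inferred from how `analyze_symbol_transitions()` iterates it; `words` and `chars_list` are illustrative toy values.

```python
# Sketch only: assumed input shapes for the new cached helpers in this commit.
from collections import Counter, defaultdict

import networkx as nx

# words: flat list of transliterated words; chars_list: per-word symbol sequences
words = ["daiin", "chedy", "daiin"]
chars_list = [["d", "a", "ii", "n"], ["ch", "e", "d", "y"], ["d", "a", "ii", "n"]]

# char_connections: symbol -> Counter of following symbols (assumed layout, since
# analyze_csv() is not part of this diff)
char_connections = defaultdict(Counter)
for chars in chars_list:
    for a, b in zip(chars, chars[1:]):
        char_connections[a][b] += 1

# Mirrors analyze_symbol_transitions(): one weighted directed edge per observed transition
G = nx.DiGraph()
for symbol1, connections in char_connections.items():
    for symbol2, count in connections.items():
        G.add_edge(symbol1, symbol2, weight=count)

print(G.edges(data=True))  # e.g. ('d', 'a', {'weight': 2}), ('a', 'ii', {'weight': 2}), ...
```

Decorating both helpers with `@st.cache_data` keeps the word-length grouping and the transition graph from being recomputed on every Streamlit rerun.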
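Likewise, a small toy example (an assumption, not taken from the commit) of what the new "Word Length Correlation Matrix" expander computes: each row holds the lengths of the first five words of one line, and `np.corrcoef` on the transposed array yields a 5×5 matrix of correlations between word positions.

```python
# Toy illustration of the position-wise word-length correlation (hypothetical data).
import numpy as np

# One row per line: lengths of the first five words of that line
word_lengths_by_line = [
    [5, 4, 6, 3, 5],
    [4, 4, 5, 4, 6],
    [6, 5, 7, 3, 4],
    [5, 3, 6, 4, 5],
]

# Transpose so each row is a word position; corrcoef then correlates positions
length_corr = np.corrcoef(np.array(word_lengths_by_line).T)  # shape (5, 5)
print(length_corr.round(2))
```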