kambris committed on
Commit
8f5e990
·
verified ·
1 Parent(s): e7e7a90

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -55
app.py CHANGED
@@ -22,18 +22,32 @@ st.markdown("""
22
  </style>
23
  """, unsafe_allow_html=True)
24
 
 
 
 
25
  def parse_voynich_word(word):
26
- """Parse a Voynich word into individual characters - NO assumptions about digraphs"""
27
  if not word or word.strip() == '':
28
  return None, None
29
 
30
  word = word.strip()
31
- # Simply convert to list of individual characters
32
- chars = list(word)
 
 
 
 
33
 
34
- return word, chars
 
 
 
35
 
36
- def analyze_csv(df):
 
 
 
 
37
  words = []
38
  chars_list = []
39
  char_positions = defaultdict(list)
@@ -68,20 +82,15 @@ def analyze_csv(df):
68
 
69
  return words, chars_list, char_positions, char_connections, word_positions, line_word_map
70
 
71
- def analyze_trigrams(words, chars_list):
72
- char_trigrams = Counter()
73
- word_trigrams = Counter()
74
-
75
- for chars in chars_list:
76
- for i in range(len(chars)-2):
77
- trigram = tuple(chars[i:i+3])
78
- char_trigrams[trigram] += 1
79
-
80
- for i in range(len(words)-2):
81
- trigram = tuple(words[i:i+3])
82
- word_trigrams[trigram] += 1
83
-
84
- return char_trigrams, word_trigrams
85
 
86
  def create_12_slot_table(chars_list):
87
  slot_frequencies = [Counter() for _ in range(12)]
@@ -89,17 +98,32 @@ def create_12_slot_table(chars_list):
89
  for chars in chars_list:
90
  for i, char in enumerate(chars[:12]):
91
  slot_frequencies[i][char] += 1
92
-
 
 
 
93
  data = []
94
  all_chars = sorted(set(char for counter in slot_frequencies for char in counter))
95
 
96
  for char in all_chars:
97
  row = {'Character': char}
98
  for i in range(12):
99
- row[f'Slot_{i+1}'] = slot_frequencies[i][char]
 
 
 
 
 
100
  data.append(row)
101
-
102
- return pd.DataFrame(data)
 
 
 
 
 
 
 
103
 
104
  def analyze_slot_structure(chars_list):
105
  slot_contents = defaultdict(Counter)
@@ -154,10 +178,10 @@ def get_download_link_csv(df, filename):
154
  return href
155
 
156
  st.title("Voynich Manuscript Analyzer")
157
- st.write("Upload your CSV file to discover potential patterns and character distributions.")
 
158
 
159
- # Upload eva legend
160
- # Add image uploader in sidebar
161
  floating_image_file = st.sidebar.file_uploader("Upload an image",
162
  type=['png', 'jpg', 'jpeg', 'gif'],
163
  key="floating_image")
@@ -181,7 +205,15 @@ if uploaded_file is not None:
181
  # Create DataFrame from parsed data
182
  df = pd.DataFrame(data)
183
 
184
- words, chars_list, char_positions, char_connections, word_positions, line_word_map = analyze_csv(df)
 
 
 
 
 
 
 
 
185
 
186
  st.subheader("Basic Statistics")
187
  st.write(f"Total words: {len(words)}")
@@ -208,11 +240,13 @@ if uploaded_file is not None:
208
  bigram = tuple(chars[i:i+2])
209
  char_bigrams[bigram] += 1
210
 
 
211
  char_bigram_df = pd.DataFrame([
212
  {'Bigram': ''.join(str(c) for c in bigram),
213
  'Char1': str(bigram[0]),
214
  'Char2': str(bigram[1]),
215
- 'Count': int(count)}
 
216
  for bigram, count in char_bigrams.most_common(30)
217
  ])
218
  st.dataframe(char_bigram_df)
@@ -227,8 +261,11 @@ if uploaded_file is not None:
227
  trigram = tuple(chars[i:i+3])
228
  char_trigrams[trigram] += 1
229
 
 
230
  char_trigram_df = pd.DataFrame([
231
- {'Trigram': ''.join(str(c) for c in trigram), 'Count': int(count)}
 
 
232
  for trigram, count in char_trigrams.most_common(30)
233
  ])
234
  st.dataframe(char_trigram_df)
@@ -239,9 +276,13 @@ if uploaded_file is not None:
239
  for i in range(len(words)-1):
240
  bigram = tuple(words[i:i+2])
241
  word_bigrams[bigram] += 1
242
-
 
243
  word_bigram_df = pd.DataFrame([
244
- {'Word1': str(bigram[0]), 'Word2': str(bigram[1]), 'Count': int(count)}
 
 
 
245
  for bigram, count in word_bigrams.most_common(20)
246
  ])
247
  st.dataframe(word_bigram_df)
@@ -252,12 +293,14 @@ if uploaded_file is not None:
252
  for i in range(len(words)-2):
253
  trigram = tuple(words[i:i+3])
254
  word_trigrams[trigram] += 1
255
-
 
256
  word_trigram_df = pd.DataFrame([
257
  {'Word1': str(trigram[0]),
258
  'Word2': str(trigram[1]),
259
  'Word3': str(trigram[2]),
260
- 'Count': int(count)}
 
261
  for trigram, count in word_trigrams.most_common(20)
262
  ])
263
  st.dataframe(word_trigram_df)
@@ -272,14 +315,9 @@ if uploaded_file is not None:
272
 
273
  st.subheader("Words by Length Analysis")
274
 
275
- length_groups = defaultdict(list)
276
- for word, chars in zip(words, chars_list):
277
- length = len(chars)
278
- if length <= 20: # Extended range
279
- length_groups[length].append((word, chars))
280
-
281
  selected_length = st.selectbox("Select word length to analyze:",
282
- sorted(length_groups.keys()))
 
283
 
284
  if selected_length:
285
  words_of_length = length_groups[selected_length]
@@ -289,16 +327,31 @@ if uploaded_file is not None:
289
  for i, char in enumerate(chars):
290
  position_chars[i][char] += 1
291
 
 
 
 
292
  st.write(f"Found {len(words_of_length)} words of length {selected_length}")
293
 
294
  freq_data = []
295
  for char in sorted(unique_chars):
296
  row = {'Character': char}
297
  for pos in range(selected_length):
298
- row[f'Pos_{pos+1}'] = position_chars[pos][char]
 
 
 
 
 
299
  freq_data.append(row)
300
 
301
  freq_df = pd.DataFrame(freq_data)
 
 
 
 
 
 
 
302
  st.dataframe(freq_df)
303
  st.markdown(get_download_link_csv(freq_df, f"length_{selected_length}_analysis.csv"),
304
  unsafe_allow_html=True)
@@ -317,8 +370,10 @@ if uploaded_file is not None:
317
  st.subheader("Character Context Analysis")
318
  st.write("Select a character to see what comes before and after it")
319
 
320
- unique_chars = sorted(set(char for chars in chars_list for char in chars))
321
- selected_char = st.selectbox("Select a character to analyze:", unique_chars)
 
 
322
 
323
  if selected_char:
324
  before_counter = Counter()
@@ -336,8 +391,14 @@ if uploaded_file is not None:
336
 
337
  with col1:
338
  st.write(f"Characters that commonly PRECEDE '{selected_char}':")
339
- before_df = pd.DataFrame(before_counter.most_common(15),
340
- columns=['Character', 'Count'])
 
 
 
 
 
 
341
  st.dataframe(before_df)
342
 
343
  fig1, ax1 = plt.subplots(figsize=(8, 6))
@@ -348,8 +409,14 @@ if uploaded_file is not None:
348
 
349
  with col2:
350
  st.write(f"Characters that commonly FOLLOW '{selected_char}':")
351
- after_df = pd.DataFrame(after_counter.most_common(15),
352
- columns=['Character', 'Count'])
 
 
 
 
 
 
353
  st.dataframe(after_df)
354
 
355
  fig2, ax2 = plt.subplots(figsize=(8, 6))
@@ -361,7 +428,9 @@ if uploaded_file is not None:
361
  st.subheader("Line Viewer")
362
 
363
  available_lines = sorted(set(line_data['line'] for line_data in word_positions))
364
- selected_line = st.selectbox("Select Line:", [''] + [f"Line {line}" for line in available_lines])
 
 
365
 
366
  if selected_line:
367
  line_num = int(selected_line.replace('Line ', ''))
@@ -413,7 +482,9 @@ if uploaded_file is not None:
413
  fig_freq = plt.figure(figsize=(12, 6))
414
  char_freq_df = pd.DataFrame(char_freq.most_common(), columns=['Character', 'Count'])
415
  char_freq_df['Percentage'] = (char_freq_df['Count'] / total_chars * 100).round(2)
416
- plt.bar(char_freq_df['Character'], char_freq_df['Count'])
 
 
417
  plt.title("Character Frequency Distribution")
418
  plt.xlabel("Character")
419
  plt.ylabel("Frequency")
@@ -490,26 +561,46 @@ if uploaded_file is not None:
490
  first_chars = Counter(chars[0] for chars in chars_list)
491
  last_chars = Counter(chars[-1] for chars in chars_list)
492
 
 
 
 
493
  fig6, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
494
 
495
- first_df = pd.DataFrame(first_chars.most_common(15),
496
- columns=['Character', 'Count'])
 
 
 
 
497
  sns.barplot(data=first_df, x='Character', y='Count', ax=ax1)
498
  ax1.set_title("Most Common Word-Initial Characters")
499
  ax1.tick_params(axis='x', rotation=45)
500
 
501
- last_df = pd.DataFrame(last_chars.most_common(15),
502
- columns=['Character', 'Count'])
 
 
 
 
503
  sns.barplot(data=last_df, x='Character', y='Count', ax=ax2)
504
  ax2.set_title("Most Common Word-Final Characters")
505
  ax2.tick_params(axis='x', rotation=45)
506
  st.pyplot(fig6)
507
 
 
 
 
 
 
 
 
 
 
508
  # N-gram Pattern Discovery
509
  st.subheader("N-gram Pattern Discovery")
510
  st.write("Discover recurring character sequences of different lengths")
511
 
512
- ngram_length = st.slider("Select n-gram length:", 2, 6, 3)
513
 
514
  ngrams = Counter()
515
  for chars in chars_list:
@@ -517,10 +608,11 @@ if uploaded_file is not None:
517
  ngram = tuple(chars[i:i+ngram_length])
518
  ngrams[ngram] += 1
519
 
 
520
  ngram_df = pd.DataFrame([
521
  {'Pattern': ''.join(str(c) for c in ngram),
522
  'Count': int(count),
523
- 'Percentage': f"{count/len(chars_list)*100:.2f}%"}
524
  for ngram, count in ngrams.most_common(30)
525
  ])
526
  st.dataframe(ngram_df)
 
22
  </style>
23
  """, unsafe_allow_html=True)
24
 
25
# Define allowed characters
# NOTE: set() of a string yields individual characters, so the trailing
# '(n)(v)' contributes '(', ')', 'n', and 'v' as separate allowed glyphs.
ALLOWED_CHARS = set('4O892ERSZPBFVQWXYACIGH1TU0DNM3JKL567(n)(v)')

def parse_voynich_word(word):
    """Parse a Voynich word into individual characters, keeping only
    characters present in ALLOWED_CHARS.

    Args:
        word: raw word string; may be None, empty, or whitespace-only.

    Returns:
        (filtered_word, chars): the word with disallowed characters
        removed and its list of characters, or (None, None) when the
        input is empty or nothing survives the filter.
    """
    if not word or word.strip() == '':
        return None, None

    word = word.strip()
    # Filter to only allowed characters.
    # (Iterate the string directly — the original wrapped it in a
    # redundant list(word) call.)
    chars = [c for c in word if c in ALLOWED_CHARS]

    # If no valid characters remain, return None
    if not chars:
        return None, None

    # Reconstruct the filtered word
    return ''.join(chars), chars
45
 
46
+ @st.cache_data
47
+ def analyze_csv(df_hash):
48
+ """Cached analysis function - only recalculates when CSV changes"""
49
+ df = st.session_state.df_data
50
+
51
  words = []
52
  chars_list = []
53
  char_positions = defaultdict(list)
 
82
 
83
  return words, chars_list, char_positions, char_connections, word_positions, line_word_map
84
 
85
@st.cache_data
def create_length_groups(words, chars_list):
    """Bucket (word, chars) pairs by character count — cached for performance.

    Words longer than 20 characters are ignored. Returns a mapping of
    length -> list of (word, chars) tuples.
    """
    groups = defaultdict(list)
    for token, glyphs in zip(words, chars_list):
        # Skip anything longer than 20 characters.
        if len(glyphs) > 20:
            continue
        groups[len(glyphs)].append((token, glyphs))
    return groups
 
 
 
 
 
94
 
95
  def create_12_slot_table(chars_list):
96
  slot_frequencies = [Counter() for _ in range(12)]
 
98
  for chars in chars_list:
99
  for i, char in enumerate(chars[:12]):
100
  slot_frequencies[i][char] += 1
101
+
102
+ # Calculate totals for each slot
103
+ slot_totals = [sum(counter.values()) for counter in slot_frequencies]
104
+
105
  data = []
106
  all_chars = sorted(set(char for counter in slot_frequencies for char in counter))
107
 
108
  for char in all_chars:
109
  row = {'Character': char}
110
  for i in range(12):
111
+ count = slot_frequencies[i][char]
112
+ row[f'Slot_{i+1}'] = count
113
+ if slot_totals[i] > 0:
114
+ row[f'Slot_{i+1}_Pct'] = f"{(count / slot_totals[i] * 100):.2f}%"
115
+ else:
116
+ row[f'Slot_{i+1}_Pct'] = "0.00%"
117
  data.append(row)
118
+
119
+ # Reorder columns to alternate count and percentage
120
+ df = pd.DataFrame(data)
121
+ ordered_cols = ['Character']
122
+ for i in range(12):
123
+ ordered_cols.append(f'Slot_{i+1}')
124
+ ordered_cols.append(f'Slot_{i+1}_Pct')
125
+
126
+ return df[ordered_cols]
127
 
128
  def analyze_slot_structure(chars_list):
129
  slot_contents = defaultdict(Counter)
 
178
  return href
179
 
180
  st.title("Voynich Manuscript Analyzer")
181
+ st.write("Upload your CSV file.")
182
+ st.info(f"**Filtered Character Set:** {' '.join(sorted(ALLOWED_CHARS))}")
183
 
184
+ # Upload eva legend to sidebar
 
185
  floating_image_file = st.sidebar.file_uploader("Upload an image",
186
  type=['png', 'jpg', 'jpeg', 'gif'],
187
  key="floating_image")
 
205
  # Create DataFrame from parsed data
206
  df = pd.DataFrame(data)
207
 
208
+ # Store in session state and create hash for caching
209
+ st.session_state.df_data = df
210
+ df_hash = hash(content)
211
+
212
+ # Use cached analysis
213
+ words, chars_list, char_positions, char_connections, word_positions, line_word_map = analyze_csv(df_hash)
214
+
215
+ # Pre-calculate length groups (cached)
216
+ length_groups = create_length_groups(words, chars_list)
217
 
218
  st.subheader("Basic Statistics")
219
  st.write(f"Total words: {len(words)}")
 
240
  bigram = tuple(chars[i:i+2])
241
  char_bigrams[bigram] += 1
242
 
243
+ total_char_bigrams = sum(char_bigrams.values())
244
  char_bigram_df = pd.DataFrame([
245
  {'Bigram': ''.join(str(c) for c in bigram),
246
  'Char1': str(bigram[0]),
247
  'Char2': str(bigram[1]),
248
+ 'Count': int(count),
249
+ 'Percentage': f"{(count / total_char_bigrams * 100):.2f}%"}
250
  for bigram, count in char_bigrams.most_common(30)
251
  ])
252
  st.dataframe(char_bigram_df)
 
261
  trigram = tuple(chars[i:i+3])
262
  char_trigrams[trigram] += 1
263
 
264
+ total_char_trigrams = sum(char_trigrams.values())
265
  char_trigram_df = pd.DataFrame([
266
+ {'Trigram': ''.join(str(c) for c in trigram),
267
+ 'Count': int(count),
268
+ 'Percentage': f"{(count / total_char_trigrams * 100):.2f}%"}
269
  for trigram, count in char_trigrams.most_common(30)
270
  ])
271
  st.dataframe(char_trigram_df)
 
276
  for i in range(len(words)-1):
277
  bigram = tuple(words[i:i+2])
278
  word_bigrams[bigram] += 1
279
+
280
+ total_word_bigrams = sum(word_bigrams.values())
281
  word_bigram_df = pd.DataFrame([
282
+ {'Word1': str(bigram[0]),
283
+ 'Word2': str(bigram[1]),
284
+ 'Count': int(count),
285
+ 'Percentage': f"{(count / total_word_bigrams * 100):.2f}%"}
286
  for bigram, count in word_bigrams.most_common(20)
287
  ])
288
  st.dataframe(word_bigram_df)
 
293
  for i in range(len(words)-2):
294
  trigram = tuple(words[i:i+3])
295
  word_trigrams[trigram] += 1
296
+
297
+ total_word_trigrams = sum(word_trigrams.values())
298
  word_trigram_df = pd.DataFrame([
299
  {'Word1': str(trigram[0]),
300
  'Word2': str(trigram[1]),
301
  'Word3': str(trigram[2]),
302
+ 'Count': int(count),
303
+ 'Percentage': f"{(count / total_word_trigrams * 100):.2f}%"}
304
  for trigram, count in word_trigrams.most_common(20)
305
  ])
306
  st.dataframe(word_trigram_df)
 
315
 
316
  st.subheader("Words by Length Analysis")
317
 
 
 
 
 
 
 
318
  selected_length = st.selectbox("Select word length to analyze:",
319
+ sorted(length_groups.keys()),
320
+ key="length_selector")
321
 
322
  if selected_length:
323
  words_of_length = length_groups[selected_length]
 
327
  for i, char in enumerate(chars):
328
  position_chars[i][char] += 1
329
 
330
+ # Calculate totals for each position
331
+ position_totals = [sum(counter.values()) for counter in position_chars]
332
+
333
  st.write(f"Found {len(words_of_length)} words of length {selected_length}")
334
 
335
  freq_data = []
336
  for char in sorted(unique_chars):
337
  row = {'Character': char}
338
  for pos in range(selected_length):
339
+ count = position_chars[pos][char]
340
+ row[f'Pos_{pos+1}'] = count
341
+ if position_totals[pos] > 0:
342
+ row[f'Pos_{pos+1}_Pct'] = f"{(count / position_totals[pos] * 100):.2f}%"
343
+ else:
344
+ row[f'Pos_{pos+1}_Pct'] = "0.00%"
345
  freq_data.append(row)
346
 
347
  freq_df = pd.DataFrame(freq_data)
348
+ # Reorder columns to alternate count and percentage
349
+ ordered_cols = ['Character']
350
+ for pos in range(selected_length):
351
+ ordered_cols.append(f'Pos_{pos+1}')
352
+ ordered_cols.append(f'Pos_{pos+1}_Pct')
353
+ freq_df = freq_df[ordered_cols]
354
+
355
  st.dataframe(freq_df)
356
  st.markdown(get_download_link_csv(freq_df, f"length_{selected_length}_analysis.csv"),
357
  unsafe_allow_html=True)
 
370
  st.subheader("Character Context Analysis")
371
  st.write("Select a character to see what comes before and after it")
372
 
373
+ unique_chars_sorted = sorted(set(char for chars in chars_list for char in chars))
374
+ selected_char = st.selectbox("Select a character to analyze:",
375
+ unique_chars_sorted,
376
+ key="char_selector")
377
 
378
  if selected_char:
379
  before_counter = Counter()
 
391
 
392
  with col1:
393
  st.write(f"Characters that commonly PRECEDE '{selected_char}':")
394
+ total_before = sum(before_counter.values())
395
+ before_data = [
396
+ {'Character': char,
397
+ 'Count': count,
398
+ 'Percentage': f"{(count / total_before * 100):.2f}%"}
399
+ for char, count in before_counter.most_common(15)
400
+ ]
401
+ before_df = pd.DataFrame(before_data)
402
  st.dataframe(before_df)
403
 
404
  fig1, ax1 = plt.subplots(figsize=(8, 6))
 
409
 
410
  with col2:
411
  st.write(f"Characters that commonly FOLLOW '{selected_char}':")
412
+ total_after = sum(after_counter.values())
413
+ after_data = [
414
+ {'Character': char,
415
+ 'Count': count,
416
+ 'Percentage': f"{(count / total_after * 100):.2f}%"}
417
+ for char, count in after_counter.most_common(15)
418
+ ]
419
+ after_df = pd.DataFrame(after_data)
420
  st.dataframe(after_df)
421
 
422
  fig2, ax2 = plt.subplots(figsize=(8, 6))
 
428
  st.subheader("Line Viewer")
429
 
430
  available_lines = sorted(set(line_data['line'] for line_data in word_positions))
431
+ selected_line = st.selectbox("Select Line:",
432
+ [''] + [f"Line {line}" for line in available_lines],
433
+ key="line_selector")
434
 
435
  if selected_line:
436
  line_num = int(selected_line.replace('Line ', ''))
 
482
  fig_freq = plt.figure(figsize=(12, 6))
483
  char_freq_df = pd.DataFrame(char_freq.most_common(), columns=['Character', 'Count'])
484
  char_freq_df['Percentage'] = (char_freq_df['Count'] / total_chars * 100).round(2)
485
+ char_freq_df['Percentage'] = char_freq_df['Percentage'].apply(lambda x: f"{x:.2f}%")
486
+ plt.bar([row['Character'] for _, row in char_freq_df.iterrows()],
487
+ [int(row['Count']) for _, row in char_freq_df.iterrows()])
488
  plt.title("Character Frequency Distribution")
489
  plt.xlabel("Character")
490
  plt.ylabel("Frequency")
 
561
  first_chars = Counter(chars[0] for chars in chars_list)
562
  last_chars = Counter(chars[-1] for chars in chars_list)
563
 
564
+ total_first = sum(first_chars.values())
565
+ total_last = sum(last_chars.values())
566
+
567
  fig6, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
568
 
569
+ first_df = pd.DataFrame([
570
+ {'Character': char,
571
+ 'Count': count,
572
+ 'Percentage': f"{(count / total_first * 100):.2f}%"}
573
+ for char, count in first_chars.most_common(15)
574
+ ])
575
  sns.barplot(data=first_df, x='Character', y='Count', ax=ax1)
576
  ax1.set_title("Most Common Word-Initial Characters")
577
  ax1.tick_params(axis='x', rotation=45)
578
 
579
+ last_df = pd.DataFrame([
580
+ {'Character': char,
581
+ 'Count': count,
582
+ 'Percentage': f"{(count / total_last * 100):.2f}%"}
583
+ for char, count in last_chars.most_common(15)
584
+ ])
585
  sns.barplot(data=last_df, x='Character', y='Count', ax=ax2)
586
  ax2.set_title("Most Common Word-Final Characters")
587
  ax2.tick_params(axis='x', rotation=45)
588
  st.pyplot(fig6)
589
 
590
+ # Display the dataframes with percentages
591
+ col1, col2 = st.columns(2)
592
+ with col1:
593
+ st.write("Word-Initial Character Statistics:")
594
+ st.dataframe(first_df)
595
+ with col2:
596
+ st.write("Word-Final Character Statistics:")
597
+ st.dataframe(last_df)
598
+
599
  # N-gram Pattern Discovery
600
  st.subheader("N-gram Pattern Discovery")
601
  st.write("Discover recurring character sequences of different lengths")
602
 
603
+ ngram_length = st.slider("Select n-gram length:", 2, 6, 3, key="ngram_slider")
604
 
605
  ngrams = Counter()
606
  for chars in chars_list:
 
608
  ngram = tuple(chars[i:i+ngram_length])
609
  ngrams[ngram] += 1
610
 
611
+ total_ngrams = sum(ngrams.values())
612
  ngram_df = pd.DataFrame([
613
  {'Pattern': ''.join(str(c) for c in ngram),
614
  'Count': int(count),
615
+ 'Percentage': f"{(count / total_ngrams * 100):.2f}%"}
616
  for ngram, count in ngrams.most_common(30)
617
  ])
618
  st.dataframe(ngram_df)