kambris committed on
Commit
3d15a21
·
verified ·
1 Parent(s): e8e3bd0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +153 -143
app.py CHANGED
@@ -11,30 +11,13 @@ import networkx as nx
11
  st.set_page_config(layout="wide")
12
 
13
  def parse_voynich_word(word):
14
- """Parse a Voynich word into its component characters"""
15
  if not word or word.strip() == '':
16
  return None, None
17
 
18
  word = word.strip()
19
- chars = []
20
- i = 0
21
-
22
- while i < len(word):
23
- # Handle multi-character sequences
24
- if i < len(word) - 1:
25
- two_char = word[i:i+2]
26
- # Common Voynich digraphs
27
- if two_char in ['CH', 'SH', 'EE', 'II', 'AI', 'OE', 'OR', 'AR',
28
- 'AM', 'AN', 'AL', 'OD', 'OL', 'OT', 'DZ', 'PZ',
29
- 'HZ', 'FZ', 'TZ', 'GZ', 'SO', 'DO', 'TO', 'HO',
30
- 'SC', 'TC', 'HC', 'FC', 'GC', 'PC', 'DC']:
31
- chars.append(two_char)
32
- i += 2
33
- continue
34
-
35
- # Single character
36
- chars.append(word[i])
37
- i += 1
38
 
39
  return word, chars
40
 
@@ -160,6 +143,7 @@ def get_download_link_csv(df, filename):
160
 
161
  st.title("Voynich Manuscript Analyzer")
162
  st.write("Upload your CSV file to discover potential patterns and character distributions.")
 
163
 
164
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
165
 
@@ -176,70 +160,72 @@ if uploaded_file is not None:
176
  st.write(f"Total unique characters: {len(unique_chars)}")
177
  st.write("Unique characters:", ", ".join(sorted(unique_chars)))
178
 
179
- st.subheader("Sample Words")
180
  sample_df = pd.DataFrame([
181
- {'Word': word, 'Characters': ' + '.join(chars), 'Length': len(chars)}
182
  for word, chars in zip(words[:20], chars_list[:20])
183
  ])
184
  st.dataframe(sample_df)
185
 
186
- st.subheader("Trigram Analysis")
187
- char_trigrams, word_trigrams = analyze_trigrams(words, chars_list)
188
 
189
- col1, col2 = st.columns(2)
 
 
 
 
190
 
191
- with col1:
192
- st.write("Top 20 Character Trigrams")
193
- char_trigram_df = pd.DataFrame([
194
- {'Trigram': ' - '.join(trigram), 'Count': count}
195
- for trigram, count in char_trigrams.most_common(20)
196
- ])
197
- st.dataframe(char_trigram_df)
198
- st.markdown(get_download_link_csv(char_trigram_df, "char_trigrams.csv"), unsafe_allow_html=True)
199
-
200
- with col2:
201
- st.write("Top 20 Word Trigrams")
202
- word_trigram_df = pd.DataFrame([
203
- {'Trigram': ' - '.join(trigram), 'Count': count}
204
- for trigram, count in word_trigrams.most_common(20)
205
- ])
206
- st.dataframe(word_trigram_df)
207
- st.markdown(get_download_link_csv(word_trigram_df, "word_trigrams.csv"), unsafe_allow_html=True)
208
-
209
- st.subheader("Bigram Analysis")
210
 
211
- col1, col2 = st.columns(2)
 
212
 
213
- with col1:
214
- st.write("Character Bigrams")
215
- char_bigrams = Counter()
216
- for chars in chars_list:
217
- for i in range(len(chars)-1):
218
- bigram = tuple(chars[i:i+2])
219
- char_bigrams[bigram] += 1
220
-
221
- char_bigram_df = pd.DataFrame([
222
- {'Bigram': ' - '.join(bigram), 'Count': count}
223
- for bigram, count in char_bigrams.most_common(20)
224
- ])
225
- st.dataframe(char_bigram_df)
226
- st.markdown(get_download_link_csv(char_bigram_df, "char_bigrams.csv"), unsafe_allow_html=True)
227
-
228
- with col2:
229
- st.write("Word Bigrams")
230
- word_bigrams = Counter()
231
- for i in range(len(words)-1):
232
- bigram = tuple(words[i:i+2])
233
- word_bigrams[bigram] += 1
234
-
235
- word_bigram_df = pd.DataFrame([
236
- {'Bigram': ' - '.join(bigram), 'Count': count}
237
- for bigram, count in word_bigrams.most_common(20)
238
- ])
239
- st.dataframe(word_bigram_df)
240
- st.markdown(get_download_link_csv(word_bigram_df, "word_bigrams.csv"), unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
241
 
242
- st.subheader("12-Slot Character Frequency Table")
243
  slot_freq_df = create_12_slot_table(chars_list)
244
  st.dataframe(slot_freq_df)
245
  st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
@@ -251,7 +237,7 @@ if uploaded_file is not None:
251
  length_groups = defaultdict(list)
252
  for word, chars in zip(words, chars_list):
253
  length = len(chars)
254
- if length <= 12:
255
  length_groups[length].append((word, chars))
256
 
257
  selected_length = st.selectbox("Select word length to analyze:",
@@ -268,7 +254,7 @@ if uploaded_file is not None:
268
  st.write(f"Found {len(words_of_length)} words of length {selected_length}")
269
 
270
  freq_data = []
271
- for char in unique_chars:
272
  row = {'Character': char}
273
  for pos in range(selected_length):
274
  row[f'Pos_{pos+1}'] = position_chars[pos][char]
@@ -281,8 +267,8 @@ if uploaded_file is not None:
281
 
282
  st.write("Sample words of this length:")
283
  sample_df = pd.DataFrame([
284
- {'Word': word, 'Characters': ' + '.join(chars)}
285
- for word, chars in words_of_length[:20]
286
  ])
287
  st.dataframe(sample_df)
288
 
@@ -290,7 +276,8 @@ if uploaded_file is not None:
290
  line_scatter = create_line_word_scatter(line_word_map)
291
  st.pyplot(line_scatter)
292
 
293
- st.subheader("Character Pattern Analysis")
 
294
 
295
  unique_chars = sorted(set(char for chars in chars_list for char in chars))
296
  selected_char = st.selectbox("Select a character to analyze:", unique_chars)
@@ -311,11 +298,11 @@ if uploaded_file is not None:
311
 
312
  with col1:
313
  st.write(f"Characters that commonly PRECEDE '{selected_char}':")
314
- before_df = pd.DataFrame(before_counter.most_common(10),
315
  columns=['Character', 'Count'])
316
  st.dataframe(before_df)
317
 
318
- fig1, ax1 = plt.subplots()
319
  plt.bar(before_df['Character'], before_df['Count'])
320
  plt.title(f"Characters before '{selected_char}'")
321
  plt.xticks(rotation=45)
@@ -323,11 +310,11 @@ if uploaded_file is not None:
323
 
324
  with col2:
325
  st.write(f"Characters that commonly FOLLOW '{selected_char}':")
326
- after_df = pd.DataFrame(after_counter.most_common(10),
327
  columns=['Character', 'Count'])
328
  st.dataframe(after_df)
329
 
330
- fig2, ax2 = plt.subplots()
331
  plt.bar(after_df['Character'], after_df['Count'])
332
  plt.title(f"Characters after '{selected_char}'")
333
  plt.xticks(rotation=45)
@@ -346,11 +333,11 @@ if uploaded_file is not None:
346
  if line_data['line'] == line_num), [])
347
 
348
  for word, _, chars in line_words:
349
- st.write(f"**Word: {word}**")
350
- cols = st.columns(12)
351
- for i in range(12):
352
  with cols[i]:
353
- char = chars[i] if i < len(chars) else ""
354
  st.markdown(f"""
355
  <div style='
356
  width: 40px;
@@ -360,7 +347,8 @@ if uploaded_file is not None:
360
  align-items: center;
361
  justify-content: center;
362
  font-size: 16px;
363
- background-color: {"#e6f3ff" if char else "white"};
 
364
  margin: 2px;
365
  '>
366
  {char}
@@ -370,105 +358,127 @@ if uploaded_file is not None:
370
  st.subheader("Language Structure Analysis")
371
 
372
  # Word Length Distribution
373
- fig1 = plt.figure(figsize=(10, 6))
374
  word_lengths = [len(chars) for chars in chars_list]
375
  sns.histplot(word_lengths, bins=range(1, max(word_lengths)+2))
376
  plt.title("Word Length Distribution")
377
- plt.xlabel("Word Length")
378
  plt.ylabel("Frequency")
379
  st.pyplot(fig1)
380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  # Character Position Heatmap
382
- char_pos_matrix = np.zeros((len(unique_chars), 12))
 
 
 
 
 
 
383
  for chars in chars_list:
384
  for i, char in enumerate(chars):
385
- if i < 12:
386
- char_idx = list(unique_chars).index(char)
387
  char_pos_matrix[char_idx, i] += 1
388
 
389
- fig2 = plt.figure(figsize=(12, 8))
390
  sns.heatmap(char_pos_matrix,
391
- xticklabels=range(1, 13),
392
- yticklabels=sorted(unique_chars),
393
- cmap='YlOrRd')
 
394
  plt.title("Character Position Preferences")
395
  plt.xlabel("Position in Word")
396
  plt.ylabel("Character")
397
  st.pyplot(fig2)
398
 
399
- # Word Position in Line Analysis
400
- st.subheader("Word Position Analysis")
401
-
402
- word_positions_in_lines = []
403
- line_lengths = []
404
-
405
- for line_data in word_positions:
406
- line_len = len(line_data['words'])
407
- line_lengths.append(line_len)
408
- for pos, (word, _, chars) in enumerate(line_data['words']):
409
- word_positions_in_lines.append({
410
- 'position': pos + 1,
411
- 'word_length': len(chars),
412
- 'line_length': line_len
413
- })
414
-
415
- pos_df = pd.DataFrame(word_positions_in_lines)
416
-
417
- fig3 = plt.figure(figsize=(10, 6))
418
- sns.boxplot(data=pos_df, x='position', y='word_length')
419
- plt.title("Word Length by Position in Line")
420
- plt.xlabel("Position in Line")
421
- plt.ylabel("Word Length")
422
- plt.xticks(rotation=45)
423
- st.pyplot(fig3)
424
-
425
  # Character Bigram Network
426
- char_bigrams = Counter()
427
- for chars in chars_list:
428
- for i in range(len(chars)-1):
429
- char_bigrams[tuple(chars[i:i+2])] += 1
430
 
431
- G = nx.Graph()
432
- for (char1, char2), count in char_bigrams.most_common(30):
433
  G.add_edge(char1, char2, weight=count)
434
 
435
- fig4 = plt.figure(figsize=(12, 12))
436
- pos = nx.spring_layout(G, k=1, seed=42)
437
 
438
  edge_weights = [G[u][v]['weight'] for u,v in G.edges()]
439
  max_weight = max(edge_weights) if edge_weights else 1
440
 
441
  nx.draw(G, pos, with_labels=True,
442
  node_color='lightblue',
443
- node_size=1500,
444
- font_size=10,
445
- width=[G[u][v]['weight']/max_weight * 5 for u,v in G.edges()])
446
- plt.title("Top Character Connections")
 
 
 
 
 
447
  st.pyplot(fig4)
448
 
449
- # Line Length Distribution
 
 
 
450
  fig5 = plt.figure(figsize=(10, 6))
451
- sns.histplot(line_lengths)
452
  plt.title("Words per Line Distribution")
453
  plt.xlabel("Number of Words in Line")
454
  plt.ylabel("Frequency")
455
  st.pyplot(fig5)
456
 
457
  # First/Last Character Analysis
 
458
  first_chars = Counter(chars[0] for chars in chars_list)
459
  last_chars = Counter(chars[-1] for chars in chars_list)
460
 
461
- fig6, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
462
 
463
- first_df = pd.DataFrame(first_chars.most_common(10),
464
  columns=['Character', 'Count'])
465
  sns.barplot(data=first_df, x='Character', y='Count', ax=ax1)
466
- ax1.set_title("Most Common Initial Characters")
467
  ax1.tick_params(axis='x', rotation=45)
468
 
469
- last_df = pd.DataFrame(last_chars.most_common(10),
470
  columns=['Character', 'Count'])
471
  sns.barplot(data=last_df, x='Character', y='Count', ax=ax2)
472
- ax2.set_title("Most Common Final Characters")
473
  ax2.tick_params(axis='x', rotation=45)
474
- st.pyplot(fig6)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  st.set_page_config(layout="wide")
12
 
13
  def parse_voynich_word(word):
14
+ """Parse a Voynich word into individual characters - NO assumptions about digraphs"""
15
  if not word or word.strip() == '':
16
  return None, None
17
 
18
  word = word.strip()
19
+ # Simply convert to list of individual characters
20
+ chars = list(word)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  return word, chars
23
 
 
143
 
144
  st.title("Voynich Manuscript Analyzer")
145
  st.write("Upload your CSV file to discover potential patterns and character distributions.")
146
+ st.write("**Bottom-up analysis**: Each character is treated independently - no assumptions about digraphs")
147
 
148
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
149
 
 
160
  st.write(f"Total unique characters: {len(unique_chars)}")
161
  st.write("Unique characters:", ", ".join(sorted(unique_chars)))
162
 
163
+ st.subheader("Sample Words (Character-by-Character)")
164
  sample_df = pd.DataFrame([
165
+ {'Word': word, 'Characters': ' | '.join(chars), 'Length': len(chars)}
166
  for word, chars in zip(words[:20], chars_list[:20])
167
  ])
168
  st.dataframe(sample_df)
169
 
170
+ st.subheader("Character Bigram Analysis")
171
+ st.write("This reveals which character pairs occur most frequently - potential digraphs emerge from the data")
172
 
173
+ char_bigrams = Counter()
174
+ for chars in chars_list:
175
+ for i in range(len(chars)-1):
176
+ bigram = tuple(chars[i:i+2])
177
+ char_bigrams[bigram] += 1
178
 
179
+ char_bigram_df = pd.DataFrame([
180
+ {'Bigram': ''.join(bigram), 'Char1': bigram[0], 'Char2': bigram[1], 'Count': count}
181
+ for bigram, count in char_bigrams.most_common(30)
182
+ ])
183
+ st.dataframe(char_bigram_df)
184
+ st.markdown(get_download_link_csv(char_bigram_df, "char_bigrams.csv"), unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
+ st.subheader("Character Trigram Analysis")
187
+ st.write("Three-character sequences - looking for common patterns")
188
 
189
+ char_trigrams = Counter()
190
+ for chars in chars_list:
191
+ for i in range(len(chars)-2):
192
+ trigram = tuple(chars[i:i+3])
193
+ char_trigrams[trigram] += 1
194
+
195
+ char_trigram_df = pd.DataFrame([
196
+ {'Trigram': ''.join(trigram), 'Count': count}
197
+ for trigram, count in char_trigrams.most_common(30)
198
+ ])
199
+ st.dataframe(char_trigram_df)
200
+ st.markdown(get_download_link_csv(char_trigram_df, "char_trigrams.csv"), unsafe_allow_html=True)
201
+
202
+ st.subheader("Word Bigram Analysis")
203
+ word_bigrams = Counter()
204
+ for i in range(len(words)-1):
205
+ bigram = tuple(words[i:i+2])
206
+ word_bigrams[bigram] += 1
207
+
208
+ word_bigram_df = pd.DataFrame([
209
+ {'Word1': bigram[0], 'Word2': bigram[1], 'Count': count}
210
+ for bigram, count in word_bigrams.most_common(20)
211
+ ])
212
+ st.dataframe(word_bigram_df)
213
+ st.markdown(get_download_link_csv(word_bigram_df, "word_bigrams.csv"), unsafe_allow_html=True)
214
+
215
+ st.subheader("Word Trigram Analysis")
216
+ word_trigrams = Counter()
217
+ for i in range(len(words)-2):
218
+ trigram = tuple(words[i:i+3])
219
+ word_trigrams[trigram] += 1
220
+
221
+ word_trigram_df = pd.DataFrame([
222
+ {'Word1': trigram[0], 'Word2': trigram[1], 'Word3': trigram[2], 'Count': count}
223
+ for trigram, count in word_trigrams.most_common(20)
224
+ ])
225
+ st.dataframe(word_trigram_df)
226
+ st.markdown(get_download_link_csv(word_trigram_df, "word_trigrams.csv"), unsafe_allow_html=True)
227
 
228
+ st.subheader("Character Frequency by Position")
229
  slot_freq_df = create_12_slot_table(chars_list)
230
  st.dataframe(slot_freq_df)
231
  st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
 
237
  length_groups = defaultdict(list)
238
  for word, chars in zip(words, chars_list):
239
  length = len(chars)
240
+ if length <= 20: # Extended range
241
  length_groups[length].append((word, chars))
242
 
243
  selected_length = st.selectbox("Select word length to analyze:",
 
254
  st.write(f"Found {len(words_of_length)} words of length {selected_length}")
255
 
256
  freq_data = []
257
+ for char in sorted(unique_chars):
258
  row = {'Character': char}
259
  for pos in range(selected_length):
260
  row[f'Pos_{pos+1}'] = position_chars[pos][char]
 
267
 
268
  st.write("Sample words of this length:")
269
  sample_df = pd.DataFrame([
270
+ {'Word': word, 'Characters': ' | '.join(chars)}
271
+ for word, chars in words_of_length[:30]
272
  ])
273
  st.dataframe(sample_df)
274
 
 
276
  line_scatter = create_line_word_scatter(line_word_map)
277
  st.pyplot(line_scatter)
278
 
279
+ st.subheader("Character Context Analysis")
280
+ st.write("Select a character to see what comes before and after it")
281
 
282
  unique_chars = sorted(set(char for chars in chars_list for char in chars))
283
  selected_char = st.selectbox("Select a character to analyze:", unique_chars)
 
298
 
299
  with col1:
300
  st.write(f"Characters that commonly PRECEDE '{selected_char}':")
301
+ before_df = pd.DataFrame(before_counter.most_common(15),
302
  columns=['Character', 'Count'])
303
  st.dataframe(before_df)
304
 
305
+ fig1, ax1 = plt.subplots(figsize=(8, 6))
306
  plt.bar(before_df['Character'], before_df['Count'])
307
  plt.title(f"Characters before '{selected_char}'")
308
  plt.xticks(rotation=45)
 
310
 
311
  with col2:
312
  st.write(f"Characters that commonly FOLLOW '{selected_char}':")
313
+ after_df = pd.DataFrame(after_counter.most_common(15),
314
  columns=['Character', 'Count'])
315
  st.dataframe(after_df)
316
 
317
+ fig2, ax2 = plt.subplots(figsize=(8, 6))
318
  plt.bar(after_df['Character'], after_df['Count'])
319
  plt.title(f"Characters after '{selected_char}'")
320
  plt.xticks(rotation=45)
 
333
  if line_data['line'] == line_num), [])
334
 
335
  for word, _, chars in line_words:
336
+ st.write(f"**Word: {word}** ({len(chars)} characters)")
337
+ cols = st.columns(min(20, max(12, len(chars))))
338
+ for i in range(len(chars)):
339
  with cols[i]:
340
+ char = chars[i]
341
  st.markdown(f"""
342
  <div style='
343
  width: 40px;
 
347
  align-items: center;
348
  justify-content: center;
349
  font-size: 16px;
350
+ font-weight: bold;
351
+ background-color: #e6f3ff;
352
  margin: 2px;
353
  '>
354
  {char}
 
358
  st.subheader("Language Structure Analysis")
359
 
360
  # Word Length Distribution
361
+ fig1 = plt.figure(figsize=(12, 6))
362
  word_lengths = [len(chars) for chars in chars_list]
363
  sns.histplot(word_lengths, bins=range(1, max(word_lengths)+2))
364
  plt.title("Word Length Distribution")
365
+ plt.xlabel("Word Length (number of characters)")
366
  plt.ylabel("Frequency")
367
  st.pyplot(fig1)
368
 
369
+ # Character Frequency Overall
370
+ st.subheader("Overall Character Frequency")
371
+ all_chars_flat = [char for chars in chars_list for char in chars]
372
+ char_freq = Counter(all_chars_flat)
373
+
374
+ fig_freq = plt.figure(figsize=(12, 6))
375
+ char_freq_df = pd.DataFrame(char_freq.most_common(), columns=['Character', 'Count'])
376
+ plt.bar(char_freq_df['Character'], char_freq_df['Count'])
377
+ plt.title("Character Frequency Distribution")
378
+ plt.xlabel("Character")
379
+ plt.ylabel("Frequency")
380
+ plt.xticks(rotation=45)
381
+ st.pyplot(fig_freq)
382
+ st.dataframe(char_freq_df)
383
+
384
  # Character Position Heatmap
385
+ st.subheader("Character Position Heatmap")
386
+ st.write("Shows which characters appear at which positions in words")
387
+
388
+ max_len = max(word_lengths)
389
+ char_pos_matrix = np.zeros((len(unique_chars), min(max_len, 20)))
390
+ unique_chars_list = sorted(unique_chars)
391
+
392
  for chars in chars_list:
393
  for i, char in enumerate(chars):
394
+ if i < 20:
395
+ char_idx = unique_chars_list.index(char)
396
  char_pos_matrix[char_idx, i] += 1
397
 
398
+ fig2 = plt.figure(figsize=(15, 10))
399
  sns.heatmap(char_pos_matrix,
400
+ xticklabels=range(1, min(max_len, 20)+1),
401
+ yticklabels=unique_chars_list,
402
+ cmap='YlOrRd',
403
+ cbar_kws={'label': 'Frequency'})
404
  plt.title("Character Position Preferences")
405
  plt.xlabel("Position in Word")
406
  plt.ylabel("Character")
407
  st.pyplot(fig2)
408
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  # Character Bigram Network
410
+ st.subheader("Character Bigram Network")
411
+ st.write("Visual representation of which characters commonly follow each other")
 
 
412
 
413
+ G = nx.DiGraph() # Directed graph to show flow
414
+ for (char1, char2), count in char_bigrams.most_common(50):
415
  G.add_edge(char1, char2, weight=count)
416
 
417
+ fig4 = plt.figure(figsize=(14, 14))
418
+ pos = nx.spring_layout(G, k=2, iterations=50, seed=42)
419
 
420
  edge_weights = [G[u][v]['weight'] for u,v in G.edges()]
421
  max_weight = max(edge_weights) if edge_weights else 1
422
 
423
  nx.draw(G, pos, with_labels=True,
424
  node_color='lightblue',
425
+ node_size=2000,
426
+ font_size=11,
427
+ font_weight='bold',
428
+ arrows=True,
429
+ arrowsize=15,
430
+ width=[G[u][v]['weight']/max_weight * 4 for u,v in G.edges()],
431
+ edge_color='gray',
432
+ connectionstyle='arc3,rad=0.1')
433
+ plt.title("Character Sequence Network (Directed)")
434
  st.pyplot(fig4)
435
 
436
+ # Words per Line Distribution
437
+ st.subheader("Line Structure Analysis")
438
+ line_lengths = [len(line_data['words']) for line_data in word_positions]
439
+
440
  fig5 = plt.figure(figsize=(10, 6))
441
+ sns.histplot(line_lengths, bins=range(1, max(line_lengths)+2))
442
  plt.title("Words per Line Distribution")
443
  plt.xlabel("Number of Words in Line")
444
  plt.ylabel("Frequency")
445
  st.pyplot(fig5)
446
 
447
  # First/Last Character Analysis
448
+ st.subheader("Word Boundary Analysis")
449
  first_chars = Counter(chars[0] for chars in chars_list)
450
  last_chars = Counter(chars[-1] for chars in chars_list)
451
 
452
+ fig6, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
453
 
454
+ first_df = pd.DataFrame(first_chars.most_common(15),
455
  columns=['Character', 'Count'])
456
  sns.barplot(data=first_df, x='Character', y='Count', ax=ax1)
457
+ ax1.set_title("Most Common Word-Initial Characters")
458
  ax1.tick_params(axis='x', rotation=45)
459
 
460
+ last_df = pd.DataFrame(last_chars.most_common(15),
461
  columns=['Character', 'Count'])
462
  sns.barplot(data=last_df, x='Character', y='Count', ax=ax2)
463
+ ax2.set_title("Most Common Word-Final Characters")
464
  ax2.tick_params(axis='x', rotation=45)
465
+ st.pyplot(fig6)
466
+
467
+ # N-gram Pattern Discovery
468
+ st.subheader("N-gram Pattern Discovery")
469
+ st.write("Discover recurring character sequences of different lengths")
470
+
471
+ ngram_length = st.slider("Select n-gram length:", 2, 6, 3)
472
+
473
+ ngrams = Counter()
474
+ for chars in chars_list:
475
+ for i in range(len(chars) - ngram_length + 1):
476
+ ngram = tuple(chars[i:i+ngram_length])
477
+ ngrams[ngram] += 1
478
+
479
+ ngram_df = pd.DataFrame([
480
+ {'Pattern': ''.join(ngram), 'Count': count, 'Percentage': f"{count/len(chars_list)*100:.2f}%"}
481
+ for ngram, count in ngrams.most_common(30)
482
+ ])
483
+ st.dataframe(ngram_df)
484
+ st.markdown(get_download_link_csv(ngram_df, f"{ngram_length}gram_patterns.csv"), unsafe_allow_html=True)