Update app.py
Browse files
app.py
CHANGED
|
@@ -11,30 +11,13 @@ import networkx as nx
|
|
| 11 |
st.set_page_config(layout="wide")
|
| 12 |
|
| 13 |
def parse_voynich_word(word):
|
| 14 |
-
"""Parse a Voynich word into
|
| 15 |
if not word or word.strip() == '':
|
| 16 |
return None, None
|
| 17 |
|
| 18 |
word = word.strip()
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
while i < len(word):
|
| 23 |
-
# Handle multi-character sequences
|
| 24 |
-
if i < len(word) - 1:
|
| 25 |
-
two_char = word[i:i+2]
|
| 26 |
-
# Common Voynich digraphs
|
| 27 |
-
if two_char in ['CH', 'SH', 'EE', 'II', 'AI', 'OE', 'OR', 'AR',
|
| 28 |
-
'AM', 'AN', 'AL', 'OD', 'OL', 'OT', 'DZ', 'PZ',
|
| 29 |
-
'HZ', 'FZ', 'TZ', 'GZ', 'SO', 'DO', 'TO', 'HO',
|
| 30 |
-
'SC', 'TC', 'HC', 'FC', 'GC', 'PC', 'DC']:
|
| 31 |
-
chars.append(two_char)
|
| 32 |
-
i += 2
|
| 33 |
-
continue
|
| 34 |
-
|
| 35 |
-
# Single character
|
| 36 |
-
chars.append(word[i])
|
| 37 |
-
i += 1
|
| 38 |
|
| 39 |
return word, chars
|
| 40 |
|
|
@@ -160,6 +143,7 @@ def get_download_link_csv(df, filename):
|
|
| 160 |
|
| 161 |
st.title("Voynich Manuscript Analyzer")
|
| 162 |
st.write("Upload your CSV file to discover potential patterns and character distributions.")
|
|
|
|
| 163 |
|
| 164 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
| 165 |
|
|
@@ -176,70 +160,72 @@ if uploaded_file is not None:
|
|
| 176 |
st.write(f"Total unique characters: {len(unique_chars)}")
|
| 177 |
st.write("Unique characters:", ", ".join(sorted(unique_chars)))
|
| 178 |
|
| 179 |
-
st.subheader("Sample Words")
|
| 180 |
sample_df = pd.DataFrame([
|
| 181 |
-
{'Word': word, 'Characters': '
|
| 182 |
for word, chars in zip(words[:20], chars_list[:20])
|
| 183 |
])
|
| 184 |
st.dataframe(sample_df)
|
| 185 |
|
| 186 |
-
st.subheader("
|
| 187 |
-
|
| 188 |
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
st.dataframe(char_trigram_df)
|
| 198 |
-
st.markdown(get_download_link_csv(char_trigram_df, "char_trigrams.csv"), unsafe_allow_html=True)
|
| 199 |
-
|
| 200 |
-
with col2:
|
| 201 |
-
st.write("Top 20 Word Trigrams")
|
| 202 |
-
word_trigram_df = pd.DataFrame([
|
| 203 |
-
{'Trigram': ' - '.join(trigram), 'Count': count}
|
| 204 |
-
for trigram, count in word_trigrams.most_common(20)
|
| 205 |
-
])
|
| 206 |
-
st.dataframe(word_trigram_df)
|
| 207 |
-
st.markdown(get_download_link_csv(word_trigram_df, "word_trigrams.csv"), unsafe_allow_html=True)
|
| 208 |
-
|
| 209 |
-
st.subheader("Bigram Analysis")
|
| 210 |
|
| 211 |
-
|
|
|
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
word_bigrams =
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
-
st.subheader("
|
| 243 |
slot_freq_df = create_12_slot_table(chars_list)
|
| 244 |
st.dataframe(slot_freq_df)
|
| 245 |
st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
|
|
@@ -251,7 +237,7 @@ if uploaded_file is not None:
|
|
| 251 |
length_groups = defaultdict(list)
|
| 252 |
for word, chars in zip(words, chars_list):
|
| 253 |
length = len(chars)
|
| 254 |
-
if length <=
|
| 255 |
length_groups[length].append((word, chars))
|
| 256 |
|
| 257 |
selected_length = st.selectbox("Select word length to analyze:",
|
|
@@ -268,7 +254,7 @@ if uploaded_file is not None:
|
|
| 268 |
st.write(f"Found {len(words_of_length)} words of length {selected_length}")
|
| 269 |
|
| 270 |
freq_data = []
|
| 271 |
-
for char in unique_chars:
|
| 272 |
row = {'Character': char}
|
| 273 |
for pos in range(selected_length):
|
| 274 |
row[f'Pos_{pos+1}'] = position_chars[pos][char]
|
|
@@ -281,8 +267,8 @@ if uploaded_file is not None:
|
|
| 281 |
|
| 282 |
st.write("Sample words of this length:")
|
| 283 |
sample_df = pd.DataFrame([
|
| 284 |
-
{'Word': word, 'Characters': '
|
| 285 |
-
for word, chars in words_of_length[:
|
| 286 |
])
|
| 287 |
st.dataframe(sample_df)
|
| 288 |
|
|
@@ -290,7 +276,8 @@ if uploaded_file is not None:
|
|
| 290 |
line_scatter = create_line_word_scatter(line_word_map)
|
| 291 |
st.pyplot(line_scatter)
|
| 292 |
|
| 293 |
-
st.subheader("Character
|
|
|
|
| 294 |
|
| 295 |
unique_chars = sorted(set(char for chars in chars_list for char in chars))
|
| 296 |
selected_char = st.selectbox("Select a character to analyze:", unique_chars)
|
|
@@ -311,11 +298,11 @@ if uploaded_file is not None:
|
|
| 311 |
|
| 312 |
with col1:
|
| 313 |
st.write(f"Characters that commonly PRECEDE '{selected_char}':")
|
| 314 |
-
before_df = pd.DataFrame(before_counter.most_common(
|
| 315 |
columns=['Character', 'Count'])
|
| 316 |
st.dataframe(before_df)
|
| 317 |
|
| 318 |
-
fig1, ax1 = plt.subplots()
|
| 319 |
plt.bar(before_df['Character'], before_df['Count'])
|
| 320 |
plt.title(f"Characters before '{selected_char}'")
|
| 321 |
plt.xticks(rotation=45)
|
|
@@ -323,11 +310,11 @@ if uploaded_file is not None:
|
|
| 323 |
|
| 324 |
with col2:
|
| 325 |
st.write(f"Characters that commonly FOLLOW '{selected_char}':")
|
| 326 |
-
after_df = pd.DataFrame(after_counter.most_common(
|
| 327 |
columns=['Character', 'Count'])
|
| 328 |
st.dataframe(after_df)
|
| 329 |
|
| 330 |
-
fig2, ax2 = plt.subplots()
|
| 331 |
plt.bar(after_df['Character'], after_df['Count'])
|
| 332 |
plt.title(f"Characters after '{selected_char}'")
|
| 333 |
plt.xticks(rotation=45)
|
|
@@ -346,11 +333,11 @@ if uploaded_file is not None:
|
|
| 346 |
if line_data['line'] == line_num), [])
|
| 347 |
|
| 348 |
for word, _, chars in line_words:
|
| 349 |
-
st.write(f"**Word: {word}**")
|
| 350 |
-
cols = st.columns(12)
|
| 351 |
-
for i in range(
|
| 352 |
with cols[i]:
|
| 353 |
-
char = chars[i]
|
| 354 |
st.markdown(f"""
|
| 355 |
<div style='
|
| 356 |
width: 40px;
|
|
@@ -360,7 +347,8 @@ if uploaded_file is not None:
|
|
| 360 |
align-items: center;
|
| 361 |
justify-content: center;
|
| 362 |
font-size: 16px;
|
| 363 |
-
|
|
|
|
| 364 |
margin: 2px;
|
| 365 |
'>
|
| 366 |
{char}
|
|
@@ -370,105 +358,127 @@ if uploaded_file is not None:
|
|
| 370 |
st.subheader("Language Structure Analysis")
|
| 371 |
|
| 372 |
# Word Length Distribution
|
| 373 |
-
fig1 = plt.figure(figsize=(
|
| 374 |
word_lengths = [len(chars) for chars in chars_list]
|
| 375 |
sns.histplot(word_lengths, bins=range(1, max(word_lengths)+2))
|
| 376 |
plt.title("Word Length Distribution")
|
| 377 |
-
plt.xlabel("Word Length")
|
| 378 |
plt.ylabel("Frequency")
|
| 379 |
st.pyplot(fig1)
|
| 380 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
# Character Position Heatmap
|
| 382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
for chars in chars_list:
|
| 384 |
for i, char in enumerate(chars):
|
| 385 |
-
if i <
|
| 386 |
-
char_idx =
|
| 387 |
char_pos_matrix[char_idx, i] += 1
|
| 388 |
|
| 389 |
-
fig2 = plt.figure(figsize=(
|
| 390 |
sns.heatmap(char_pos_matrix,
|
| 391 |
-
xticklabels=range(1,
|
| 392 |
-
yticklabels=
|
| 393 |
-
cmap='YlOrRd'
|
|
|
|
| 394 |
plt.title("Character Position Preferences")
|
| 395 |
plt.xlabel("Position in Word")
|
| 396 |
plt.ylabel("Character")
|
| 397 |
st.pyplot(fig2)
|
| 398 |
|
| 399 |
-
# Word Position in Line Analysis
|
| 400 |
-
st.subheader("Word Position Analysis")
|
| 401 |
-
|
| 402 |
-
word_positions_in_lines = []
|
| 403 |
-
line_lengths = []
|
| 404 |
-
|
| 405 |
-
for line_data in word_positions:
|
| 406 |
-
line_len = len(line_data['words'])
|
| 407 |
-
line_lengths.append(line_len)
|
| 408 |
-
for pos, (word, _, chars) in enumerate(line_data['words']):
|
| 409 |
-
word_positions_in_lines.append({
|
| 410 |
-
'position': pos + 1,
|
| 411 |
-
'word_length': len(chars),
|
| 412 |
-
'line_length': line_len
|
| 413 |
-
})
|
| 414 |
-
|
| 415 |
-
pos_df = pd.DataFrame(word_positions_in_lines)
|
| 416 |
-
|
| 417 |
-
fig3 = plt.figure(figsize=(10, 6))
|
| 418 |
-
sns.boxplot(data=pos_df, x='position', y='word_length')
|
| 419 |
-
plt.title("Word Length by Position in Line")
|
| 420 |
-
plt.xlabel("Position in Line")
|
| 421 |
-
plt.ylabel("Word Length")
|
| 422 |
-
plt.xticks(rotation=45)
|
| 423 |
-
st.pyplot(fig3)
|
| 424 |
-
|
| 425 |
# Character Bigram Network
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
for i in range(len(chars)-1):
|
| 429 |
-
char_bigrams[tuple(chars[i:i+2])] += 1
|
| 430 |
|
| 431 |
-
G = nx.
|
| 432 |
-
for (char1, char2), count in char_bigrams.most_common(
|
| 433 |
G.add_edge(char1, char2, weight=count)
|
| 434 |
|
| 435 |
-
fig4 = plt.figure(figsize=(
|
| 436 |
-
pos = nx.spring_layout(G, k=
|
| 437 |
|
| 438 |
edge_weights = [G[u][v]['weight'] for u,v in G.edges()]
|
| 439 |
max_weight = max(edge_weights) if edge_weights else 1
|
| 440 |
|
| 441 |
nx.draw(G, pos, with_labels=True,
|
| 442 |
node_color='lightblue',
|
| 443 |
-
node_size=
|
| 444 |
-
font_size=
|
| 445 |
-
|
| 446 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
st.pyplot(fig4)
|
| 448 |
|
| 449 |
-
#
|
|
|
|
|
|
|
|
|
|
| 450 |
fig5 = plt.figure(figsize=(10, 6))
|
| 451 |
-
sns.histplot(line_lengths)
|
| 452 |
plt.title("Words per Line Distribution")
|
| 453 |
plt.xlabel("Number of Words in Line")
|
| 454 |
plt.ylabel("Frequency")
|
| 455 |
st.pyplot(fig5)
|
| 456 |
|
| 457 |
# First/Last Character Analysis
|
|
|
|
| 458 |
first_chars = Counter(chars[0] for chars in chars_list)
|
| 459 |
last_chars = Counter(chars[-1] for chars in chars_list)
|
| 460 |
|
| 461 |
-
fig6, (ax1, ax2) = plt.subplots(1, 2, figsize=(
|
| 462 |
|
| 463 |
-
first_df = pd.DataFrame(first_chars.most_common(
|
| 464 |
columns=['Character', 'Count'])
|
| 465 |
sns.barplot(data=first_df, x='Character', y='Count', ax=ax1)
|
| 466 |
-
ax1.set_title("Most Common Initial Characters")
|
| 467 |
ax1.tick_params(axis='x', rotation=45)
|
| 468 |
|
| 469 |
-
last_df = pd.DataFrame(last_chars.most_common(
|
| 470 |
columns=['Character', 'Count'])
|
| 471 |
sns.barplot(data=last_df, x='Character', y='Count', ax=ax2)
|
| 472 |
-
ax2.set_title("Most Common Final Characters")
|
| 473 |
ax2.tick_params(axis='x', rotation=45)
|
| 474 |
-
st.pyplot(fig6)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
st.set_page_config(layout="wide")
|
| 12 |
|
| 13 |
def parse_voynich_word(word):
    """Split a Voynich word into its individual characters.

    No assumptions are made about digraphs: every character of the
    stripped word becomes its own list element.

    Args:
        word: Raw word text. Leading and trailing whitespace is ignored.

    Returns:
        A ``(word, chars)`` tuple where ``word`` is the stripped text and
        ``chars`` is the list of its characters, or ``(None, None)`` when
        the input is not a string, is empty, or contains only whitespace.
    """
    # Non-string values (None, numbers, float NaN) have no .strip();
    # report them as missing instead of raising AttributeError.
    if not isinstance(word, str):
        return None, None

    word = word.strip()
    if not word:
        return None, None

    # Bottom-up tokenisation: one element per character.
    return word, list(word)
|
| 23 |
|
|
|
|
| 143 |
|
| 144 |
st.title("Voynich Manuscript Analyzer")
|
| 145 |
st.write("Upload your CSV file to discover potential patterns and character distributions.")
|
| 146 |
+
st.write("**Bottom-up analysis**: Each character is treated independently - no assumptions about digraphs")
|
| 147 |
|
| 148 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
| 149 |
|
|
|
|
| 160 |
st.write(f"Total unique characters: {len(unique_chars)}")
|
| 161 |
st.write("Unique characters:", ", ".join(sorted(unique_chars)))
|
| 162 |
|
| 163 |
+
st.subheader("Sample Words (Character-by-Character)")
|
| 164 |
sample_df = pd.DataFrame([
|
| 165 |
+
{'Word': word, 'Characters': ' | '.join(chars), 'Length': len(chars)}
|
| 166 |
for word, chars in zip(words[:20], chars_list[:20])
|
| 167 |
])
|
| 168 |
st.dataframe(sample_df)
|
| 169 |
|
| 170 |
+
st.subheader("Character Bigram Analysis")
|
| 171 |
+
st.write("This reveals which character pairs occur most frequently - potential digraphs emerge from the data")
|
| 172 |
|
| 173 |
+
char_bigrams = Counter()
|
| 174 |
+
for chars in chars_list:
|
| 175 |
+
for i in range(len(chars)-1):
|
| 176 |
+
bigram = tuple(chars[i:i+2])
|
| 177 |
+
char_bigrams[bigram] += 1
|
| 178 |
|
| 179 |
+
char_bigram_df = pd.DataFrame([
|
| 180 |
+
{'Bigram': ''.join(bigram), 'Char1': bigram[0], 'Char2': bigram[1], 'Count': count}
|
| 181 |
+
for bigram, count in char_bigrams.most_common(30)
|
| 182 |
+
])
|
| 183 |
+
st.dataframe(char_bigram_df)
|
| 184 |
+
st.markdown(get_download_link_csv(char_bigram_df, "char_bigrams.csv"), unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
+
st.subheader("Character Trigram Analysis")
|
| 187 |
+
st.write("Three-character sequences - looking for common patterns")
|
| 188 |
|
| 189 |
+
char_trigrams = Counter()
|
| 190 |
+
for chars in chars_list:
|
| 191 |
+
for i in range(len(chars)-2):
|
| 192 |
+
trigram = tuple(chars[i:i+3])
|
| 193 |
+
char_trigrams[trigram] += 1
|
| 194 |
+
|
| 195 |
+
char_trigram_df = pd.DataFrame([
|
| 196 |
+
{'Trigram': ''.join(trigram), 'Count': count}
|
| 197 |
+
for trigram, count in char_trigrams.most_common(30)
|
| 198 |
+
])
|
| 199 |
+
st.dataframe(char_trigram_df)
|
| 200 |
+
st.markdown(get_download_link_csv(char_trigram_df, "char_trigrams.csv"), unsafe_allow_html=True)
|
| 201 |
+
|
| 202 |
+
st.subheader("Word Bigram Analysis")
|
| 203 |
+
word_bigrams = Counter()
|
| 204 |
+
for i in range(len(words)-1):
|
| 205 |
+
bigram = tuple(words[i:i+2])
|
| 206 |
+
word_bigrams[bigram] += 1
|
| 207 |
+
|
| 208 |
+
word_bigram_df = pd.DataFrame([
|
| 209 |
+
{'Word1': bigram[0], 'Word2': bigram[1], 'Count': count}
|
| 210 |
+
for bigram, count in word_bigrams.most_common(20)
|
| 211 |
+
])
|
| 212 |
+
st.dataframe(word_bigram_df)
|
| 213 |
+
st.markdown(get_download_link_csv(word_bigram_df, "word_bigrams.csv"), unsafe_allow_html=True)
|
| 214 |
+
|
| 215 |
+
st.subheader("Word Trigram Analysis")
|
| 216 |
+
word_trigrams = Counter()
|
| 217 |
+
for i in range(len(words)-2):
|
| 218 |
+
trigram = tuple(words[i:i+3])
|
| 219 |
+
word_trigrams[trigram] += 1
|
| 220 |
+
|
| 221 |
+
word_trigram_df = pd.DataFrame([
|
| 222 |
+
{'Word1': trigram[0], 'Word2': trigram[1], 'Word3': trigram[2], 'Count': count}
|
| 223 |
+
for trigram, count in word_trigrams.most_common(20)
|
| 224 |
+
])
|
| 225 |
+
st.dataframe(word_trigram_df)
|
| 226 |
+
st.markdown(get_download_link_csv(word_trigram_df, "word_trigrams.csv"), unsafe_allow_html=True)
|
| 227 |
|
| 228 |
+
st.subheader("Character Frequency by Position")
|
| 229 |
slot_freq_df = create_12_slot_table(chars_list)
|
| 230 |
st.dataframe(slot_freq_df)
|
| 231 |
st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
|
|
|
|
| 237 |
length_groups = defaultdict(list)
|
| 238 |
for word, chars in zip(words, chars_list):
|
| 239 |
length = len(chars)
|
| 240 |
+
if length <= 20: # Extended range
|
| 241 |
length_groups[length].append((word, chars))
|
| 242 |
|
| 243 |
selected_length = st.selectbox("Select word length to analyze:",
|
|
|
|
| 254 |
st.write(f"Found {len(words_of_length)} words of length {selected_length}")
|
| 255 |
|
| 256 |
freq_data = []
|
| 257 |
+
for char in sorted(unique_chars):
|
| 258 |
row = {'Character': char}
|
| 259 |
for pos in range(selected_length):
|
| 260 |
row[f'Pos_{pos+1}'] = position_chars[pos][char]
|
|
|
|
| 267 |
|
| 268 |
st.write("Sample words of this length:")
|
| 269 |
sample_df = pd.DataFrame([
|
| 270 |
+
{'Word': word, 'Characters': ' | '.join(chars)}
|
| 271 |
+
for word, chars in words_of_length[:30]
|
| 272 |
])
|
| 273 |
st.dataframe(sample_df)
|
| 274 |
|
|
|
|
| 276 |
line_scatter = create_line_word_scatter(line_word_map)
|
| 277 |
st.pyplot(line_scatter)
|
| 278 |
|
| 279 |
+
st.subheader("Character Context Analysis")
|
| 280 |
+
st.write("Select a character to see what comes before and after it")
|
| 281 |
|
| 282 |
unique_chars = sorted(set(char for chars in chars_list for char in chars))
|
| 283 |
selected_char = st.selectbox("Select a character to analyze:", unique_chars)
|
|
|
|
| 298 |
|
| 299 |
with col1:
|
| 300 |
st.write(f"Characters that commonly PRECEDE '{selected_char}':")
|
| 301 |
+
before_df = pd.DataFrame(before_counter.most_common(15),
|
| 302 |
columns=['Character', 'Count'])
|
| 303 |
st.dataframe(before_df)
|
| 304 |
|
| 305 |
+
fig1, ax1 = plt.subplots(figsize=(8, 6))
|
| 306 |
plt.bar(before_df['Character'], before_df['Count'])
|
| 307 |
plt.title(f"Characters before '{selected_char}'")
|
| 308 |
plt.xticks(rotation=45)
|
|
|
|
| 310 |
|
| 311 |
with col2:
|
| 312 |
st.write(f"Characters that commonly FOLLOW '{selected_char}':")
|
| 313 |
+
after_df = pd.DataFrame(after_counter.most_common(15),
|
| 314 |
columns=['Character', 'Count'])
|
| 315 |
st.dataframe(after_df)
|
| 316 |
|
| 317 |
+
fig2, ax2 = plt.subplots(figsize=(8, 6))
|
| 318 |
plt.bar(after_df['Character'], after_df['Count'])
|
| 319 |
plt.title(f"Characters after '{selected_char}'")
|
| 320 |
plt.xticks(rotation=45)
|
|
|
|
| 333 |
if line_data['line'] == line_num), [])
|
| 334 |
|
| 335 |
for word, _, chars in line_words:
|
| 336 |
+
st.write(f"**Word: {word}** ({len(chars)} characters)")
|
| 337 |
+
cols = st.columns(min(20, max(12, len(chars))))
|
| 338 |
+
for i in range(len(chars)):
|
| 339 |
with cols[i]:
|
| 340 |
+
char = chars[i]
|
| 341 |
st.markdown(f"""
|
| 342 |
<div style='
|
| 343 |
width: 40px;
|
|
|
|
| 347 |
align-items: center;
|
| 348 |
justify-content: center;
|
| 349 |
font-size: 16px;
|
| 350 |
+
font-weight: bold;
|
| 351 |
+
background-color: #e6f3ff;
|
| 352 |
margin: 2px;
|
| 353 |
'>
|
| 354 |
{char}
|
|
|
|
| 358 |
st.subheader("Language Structure Analysis")
|
| 359 |
|
| 360 |
# Word Length Distribution
|
| 361 |
+
fig1 = plt.figure(figsize=(12, 6))
|
| 362 |
word_lengths = [len(chars) for chars in chars_list]
|
| 363 |
sns.histplot(word_lengths, bins=range(1, max(word_lengths)+2))
|
| 364 |
plt.title("Word Length Distribution")
|
| 365 |
+
plt.xlabel("Word Length (number of characters)")
|
| 366 |
plt.ylabel("Frequency")
|
| 367 |
st.pyplot(fig1)
|
| 368 |
|
| 369 |
+
# Character Frequency Overall
|
| 370 |
+
st.subheader("Overall Character Frequency")
|
| 371 |
+
all_chars_flat = [char for chars in chars_list for char in chars]
|
| 372 |
+
char_freq = Counter(all_chars_flat)
|
| 373 |
+
|
| 374 |
+
fig_freq = plt.figure(figsize=(12, 6))
|
| 375 |
+
char_freq_df = pd.DataFrame(char_freq.most_common(), columns=['Character', 'Count'])
|
| 376 |
+
plt.bar(char_freq_df['Character'], char_freq_df['Count'])
|
| 377 |
+
plt.title("Character Frequency Distribution")
|
| 378 |
+
plt.xlabel("Character")
|
| 379 |
+
plt.ylabel("Frequency")
|
| 380 |
+
plt.xticks(rotation=45)
|
| 381 |
+
st.pyplot(fig_freq)
|
| 382 |
+
st.dataframe(char_freq_df)
|
| 383 |
+
|
| 384 |
# Character Position Heatmap
|
| 385 |
+
st.subheader("Character Position Heatmap")
|
| 386 |
+
st.write("Shows which characters appear at which positions in words")
|
| 387 |
+
|
| 388 |
+
max_len = max(word_lengths)
|
| 389 |
+
char_pos_matrix = np.zeros((len(unique_chars), min(max_len, 20)))
|
| 390 |
+
unique_chars_list = sorted(unique_chars)
|
| 391 |
+
|
| 392 |
for chars in chars_list:
|
| 393 |
for i, char in enumerate(chars):
|
| 394 |
+
if i < 20:
|
| 395 |
+
char_idx = unique_chars_list.index(char)
|
| 396 |
char_pos_matrix[char_idx, i] += 1
|
| 397 |
|
| 398 |
+
fig2 = plt.figure(figsize=(15, 10))
|
| 399 |
sns.heatmap(char_pos_matrix,
|
| 400 |
+
xticklabels=range(1, min(max_len, 20)+1),
|
| 401 |
+
yticklabels=unique_chars_list,
|
| 402 |
+
cmap='YlOrRd',
|
| 403 |
+
cbar_kws={'label': 'Frequency'})
|
| 404 |
plt.title("Character Position Preferences")
|
| 405 |
plt.xlabel("Position in Word")
|
| 406 |
plt.ylabel("Character")
|
| 407 |
st.pyplot(fig2)
|
| 408 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
# Character Bigram Network
|
| 410 |
+
st.subheader("Character Bigram Network")
|
| 411 |
+
st.write("Visual representation of which characters commonly follow each other")
|
|
|
|
|
|
|
| 412 |
|
| 413 |
+
G = nx.DiGraph() # Directed graph to show flow
|
| 414 |
+
for (char1, char2), count in char_bigrams.most_common(50):
|
| 415 |
G.add_edge(char1, char2, weight=count)
|
| 416 |
|
| 417 |
+
fig4 = plt.figure(figsize=(14, 14))
|
| 418 |
+
pos = nx.spring_layout(G, k=2, iterations=50, seed=42)
|
| 419 |
|
| 420 |
edge_weights = [G[u][v]['weight'] for u,v in G.edges()]
|
| 421 |
max_weight = max(edge_weights) if edge_weights else 1
|
| 422 |
|
| 423 |
nx.draw(G, pos, with_labels=True,
|
| 424 |
node_color='lightblue',
|
| 425 |
+
node_size=2000,
|
| 426 |
+
font_size=11,
|
| 427 |
+
font_weight='bold',
|
| 428 |
+
arrows=True,
|
| 429 |
+
arrowsize=15,
|
| 430 |
+
width=[G[u][v]['weight']/max_weight * 4 for u,v in G.edges()],
|
| 431 |
+
edge_color='gray',
|
| 432 |
+
connectionstyle='arc3,rad=0.1')
|
| 433 |
+
plt.title("Character Sequence Network (Directed)")
|
| 434 |
st.pyplot(fig4)
|
| 435 |
|
| 436 |
+
# Words per Line Distribution
|
| 437 |
+
st.subheader("Line Structure Analysis")
|
| 438 |
+
line_lengths = [len(line_data['words']) for line_data in word_positions]
|
| 439 |
+
|
| 440 |
fig5 = plt.figure(figsize=(10, 6))
|
| 441 |
+
sns.histplot(line_lengths, bins=range(1, max(line_lengths)+2))
|
| 442 |
plt.title("Words per Line Distribution")
|
| 443 |
plt.xlabel("Number of Words in Line")
|
| 444 |
plt.ylabel("Frequency")
|
| 445 |
st.pyplot(fig5)
|
| 446 |
|
| 447 |
# First/Last Character Analysis
|
| 448 |
+
st.subheader("Word Boundary Analysis")
|
| 449 |
first_chars = Counter(chars[0] for chars in chars_list)
|
| 450 |
last_chars = Counter(chars[-1] for chars in chars_list)
|
| 451 |
|
| 452 |
+
fig6, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
|
| 453 |
|
| 454 |
+
first_df = pd.DataFrame(first_chars.most_common(15),
|
| 455 |
columns=['Character', 'Count'])
|
| 456 |
sns.barplot(data=first_df, x='Character', y='Count', ax=ax1)
|
| 457 |
+
ax1.set_title("Most Common Word-Initial Characters")
|
| 458 |
ax1.tick_params(axis='x', rotation=45)
|
| 459 |
|
| 460 |
+
last_df = pd.DataFrame(last_chars.most_common(15),
|
| 461 |
columns=['Character', 'Count'])
|
| 462 |
sns.barplot(data=last_df, x='Character', y='Count', ax=ax2)
|
| 463 |
+
ax2.set_title("Most Common Word-Final Characters")
|
| 464 |
ax2.tick_params(axis='x', rotation=45)
|
| 465 |
+
st.pyplot(fig6)
|
| 466 |
+
|
| 467 |
+
# N-gram Pattern Discovery
|
| 468 |
+
st.subheader("N-gram Pattern Discovery")
|
| 469 |
+
st.write("Discover recurring character sequences of different lengths")
|
| 470 |
+
|
| 471 |
+
ngram_length = st.slider("Select n-gram length:", 2, 6, 3)
|
| 472 |
+
|
| 473 |
+
ngrams = Counter()
|
| 474 |
+
for chars in chars_list:
|
| 475 |
+
for i in range(len(chars) - ngram_length + 1):
|
| 476 |
+
ngram = tuple(chars[i:i+ngram_length])
|
| 477 |
+
ngrams[ngram] += 1
|
| 478 |
+
|
| 479 |
+
ngram_df = pd.DataFrame([
|
| 480 |
+
{'Pattern': ''.join(ngram), 'Count': count, 'Percentage': f"{count/len(chars_list)*100:.2f}%"}
|
| 481 |
+
for ngram, count in ngrams.most_common(30)
|
| 482 |
+
])
|
| 483 |
+
st.dataframe(ngram_df)
|
| 484 |
+
st.markdown(get_download_link_csv(ngram_df, f"{ngram_length}gram_patterns.csv"), unsafe_allow_html=True)
|