Spaces:

kambris
/

V3

Sleeping

App Files Community

kambris committed on Dec 17, 2025

Commit

3d15a21

verified ·

1 Parent(s): e8e3bd0

Update app.py

Browse files

Files changed (1) hide show

app.py +153 -143

app.py CHANGED Viewed

@@ -11,30 +11,13 @@ import networkx as nx
 st.set_page_config(layout="wide")
 def parse_voynich_word(word):
-    """Parse a Voynich word into its component characters"""
     if not word or word.strip() == '':
         return None, None
     word = word.strip()
-    chars = []
-    i = 0
-    while i < len(word):
-        # Handle multi-character sequences
-        if i < len(word) - 1:
-            two_char = word[i:i+2]
-            # Common Voynich digraphs
-            if two_char in ['CH', 'SH', 'EE', 'II', 'AI', 'OE', 'OR', 'AR',
-                           'AM', 'AN', 'AL', 'OD', 'OL', 'OT', 'DZ', 'PZ',
-                           'HZ', 'FZ', 'TZ', 'GZ', 'SO', 'DO', 'TO', 'HO',
-                           'SC', 'TC', 'HC', 'FC', 'GC', 'PC', 'DC']:
-                chars.append(two_char)
-                i += 2
-                continue
-        # Single character
-        chars.append(word[i])
-        i += 1
     return word, chars
@@ -160,6 +143,7 @@ def get_download_link_csv(df, filename):
 st.title("Voynich Manuscript Analyzer")
 st.write("Upload your CSV file to discover potential patterns and character distributions.")
 uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
@@ -176,70 +160,72 @@ if uploaded_file is not None:
     st.write(f"Total unique characters: {len(unique_chars)}")
     st.write("Unique characters:", ", ".join(sorted(unique_chars)))
-    st.subheader("Sample Words")
     sample_df = pd.DataFrame([
-        {'Word': word, 'Characters': ' + '.join(chars), 'Length': len(chars)}
         for word, chars in zip(words[:20], chars_list[:20])
     ])
     st.dataframe(sample_df)
-    st.subheader("Trigram Analysis")
-    char_trigrams, word_trigrams = analyze_trigrams(words, chars_list)
-    col1, col2 = st.columns(2)
-    with col1:
-        st.write("Top 20 Character Trigrams")
-        char_trigram_df = pd.DataFrame([
-            {'Trigram': ' - '.join(trigram), 'Count': count}
-            for trigram, count in char_trigrams.most_common(20)
-        ])
-        st.dataframe(char_trigram_df)
-        st.markdown(get_download_link_csv(char_trigram_df, "char_trigrams.csv"), unsafe_allow_html=True)
-    with col2:
-        st.write("Top 20 Word Trigrams")
-        word_trigram_df = pd.DataFrame([
-            {'Trigram': ' - '.join(trigram), 'Count': count}
-            for trigram, count in word_trigrams.most_common(20)
-        ])
-        st.dataframe(word_trigram_df)
-        st.markdown(get_download_link_csv(word_trigram_df, "word_trigrams.csv"), unsafe_allow_html=True)
-    st.subheader("Bigram Analysis")
-    col1, col2 = st.columns(2)
-    with col1:
-        st.write("Character Bigrams")
-        char_bigrams = Counter()
-        for chars in chars_list:
-            for i in range(len(chars)-1):
-                bigram = tuple(chars[i:i+2])
-                char_bigrams[bigram] += 1
-        char_bigram_df = pd.DataFrame([
-            {'Bigram': ' - '.join(bigram), 'Count': count}
-            for bigram, count in char_bigrams.most_common(20)
-        ])
-        st.dataframe(char_bigram_df)
-        st.markdown(get_download_link_csv(char_bigram_df, "char_bigrams.csv"), unsafe_allow_html=True)
-    with col2:
-        st.write("Word Bigrams")
-        word_bigrams = Counter()
-        for i in range(len(words)-1):
-            bigram = tuple(words[i:i+2])
-            word_bigrams[bigram] += 1
-        word_bigram_df = pd.DataFrame([
-            {'Bigram': ' - '.join(bigram), 'Count': count}
-            for bigram, count in word_bigrams.most_common(20)
-        ])
-        st.dataframe(word_bigram_df)
-        st.markdown(get_download_link_csv(word_bigram_df, "word_bigrams.csv"), unsafe_allow_html=True)
-    st.subheader("12-Slot Character Frequency Table")
     slot_freq_df = create_12_slot_table(chars_list)
     st.dataframe(slot_freq_df)
     st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
@@ -251,7 +237,7 @@ if uploaded_file is not None:
     length_groups = defaultdict(list)
     for word, chars in zip(words, chars_list):
         length = len(chars)
-        if length <= 12:
             length_groups[length].append((word, chars))
     selected_length = st.selectbox("Select word length to analyze:",
@@ -268,7 +254,7 @@ if uploaded_file is not None:
         st.write(f"Found {len(words_of_length)} words of length {selected_length}")
         freq_data = []
-        for char in unique_chars:
             row = {'Character': char}
             for pos in range(selected_length):
                 row[f'Pos_{pos+1}'] = position_chars[pos][char]
@@ -281,8 +267,8 @@ if uploaded_file is not None:
         st.write("Sample words of this length:")
         sample_df = pd.DataFrame([
-            {'Word': word, 'Characters': ' + '.join(chars)}
-            for word, chars in words_of_length[:20]
         ])
         st.dataframe(sample_df)
@@ -290,7 +276,8 @@ if uploaded_file is not None:
     line_scatter = create_line_word_scatter(line_word_map)
     st.pyplot(line_scatter)
-    st.subheader("Character Pattern Analysis")
     unique_chars = sorted(set(char for chars in chars_list for char in chars))
     selected_char = st.selectbox("Select a character to analyze:", unique_chars)
@@ -311,11 +298,11 @@ if uploaded_file is not None:
         with col1:
             st.write(f"Characters that commonly PRECEDE '{selected_char}':")
-            before_df = pd.DataFrame(before_counter.most_common(10),
                                    columns=['Character', 'Count'])
             st.dataframe(before_df)
-            fig1, ax1 = plt.subplots()
             plt.bar(before_df['Character'], before_df['Count'])
             plt.title(f"Characters before '{selected_char}'")
             plt.xticks(rotation=45)
@@ -323,11 +310,11 @@ if uploaded_file is not None:
         with col2:
             st.write(f"Characters that commonly FOLLOW '{selected_char}':")
-            after_df = pd.DataFrame(after_counter.most_common(10),
                                   columns=['Character', 'Count'])
             st.dataframe(after_df)
-            fig2, ax2 = plt.subplots()
             plt.bar(after_df['Character'], after_df['Count'])
             plt.title(f"Characters after '{selected_char}'")
             plt.xticks(rotation=45)
@@ -346,11 +333,11 @@ if uploaded_file is not None:
                           if line_data['line'] == line_num), [])
         for word, _, chars in line_words:
-            st.write(f"**Word: {word}**")
-            cols = st.columns(12)
-            for i in range(12):
                 with cols[i]:
-                    char = chars[i] if i < len(chars) else ""
                     st.markdown(f"""
                         <div style='
                             width: 40px;
@@ -360,7 +347,8 @@ if uploaded_file is not None:
                             align-items: center;
                             justify-content: center;
                             font-size: 16px;
-                            background-color: {"#e6f3ff" if char else "white"};
                             margin: 2px;
                         '>
                             {char}
@@ -370,105 +358,127 @@ if uploaded_file is not None:
     st.subheader("Language Structure Analysis")
     # Word Length Distribution
-    fig1 = plt.figure(figsize=(10, 6))
     word_lengths = [len(chars) for chars in chars_list]
     sns.histplot(word_lengths, bins=range(1, max(word_lengths)+2))
     plt.title("Word Length Distribution")
-    plt.xlabel("Word Length")
     plt.ylabel("Frequency")
     st.pyplot(fig1)
     # Character Position Heatmap
-    char_pos_matrix = np.zeros((len(unique_chars), 12))
     for chars in chars_list:
         for i, char in enumerate(chars):
-            if i < 12:
-                char_idx = list(unique_chars).index(char)
                 char_pos_matrix[char_idx, i] += 1
-    fig2 = plt.figure(figsize=(12, 8))
     sns.heatmap(char_pos_matrix,
-                xticklabels=range(1, 13),
-                yticklabels=sorted(unique_chars),
-                cmap='YlOrRd')
     plt.title("Character Position Preferences")
     plt.xlabel("Position in Word")
     plt.ylabel("Character")
     st.pyplot(fig2)
-    # Word Position in Line Analysis
-    st.subheader("Word Position Analysis")
-    word_positions_in_lines = []
-    line_lengths = []
-    for line_data in word_positions:
-        line_len = len(line_data['words'])
-        line_lengths.append(line_len)
-        for pos, (word, _, chars) in enumerate(line_data['words']):
-            word_positions_in_lines.append({
-                'position': pos + 1,
-                'word_length': len(chars),
-                'line_length': line_len
-            })
-    pos_df = pd.DataFrame(word_positions_in_lines)
-    fig3 = plt.figure(figsize=(10, 6))
-    sns.boxplot(data=pos_df, x='position', y='word_length')
-    plt.title("Word Length by Position in Line")
-    plt.xlabel("Position in Line")
-    plt.ylabel("Word Length")
-    plt.xticks(rotation=45)
-    st.pyplot(fig3)
     # Character Bigram Network
-    char_bigrams = Counter()
-    for chars in chars_list:
-        for i in range(len(chars)-1):
-            char_bigrams[tuple(chars[i:i+2])] += 1
-    G = nx.Graph()
-    for (char1, char2), count in char_bigrams.most_common(30):
         G.add_edge(char1, char2, weight=count)
-    fig4 = plt.figure(figsize=(12, 12))
-    pos = nx.spring_layout(G, k=1, seed=42)
     edge_weights = [G[u][v]['weight'] for u,v in G.edges()]
     max_weight = max(edge_weights) if edge_weights else 1
     nx.draw(G, pos, with_labels=True,
             node_color='lightblue',
-            node_size=1500,
-            font_size=10,
-            width=[G[u][v]['weight']/max_weight * 5 for u,v in G.edges()])
-    plt.title("Top Character Connections")
     st.pyplot(fig4)
-    # Line Length Distribution
     fig5 = plt.figure(figsize=(10, 6))
-    sns.histplot(line_lengths)
     plt.title("Words per Line Distribution")
     plt.xlabel("Number of Words in Line")
     plt.ylabel("Frequency")
     st.pyplot(fig5)
     # First/Last Character Analysis
     first_chars = Counter(chars[0] for chars in chars_list)
     last_chars = Counter(chars[-1] for chars in chars_list)
-    fig6, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
-    first_df = pd.DataFrame(first_chars.most_common(10),
                            columns=['Character', 'Count'])
     sns.barplot(data=first_df, x='Character', y='Count', ax=ax1)
-    ax1.set_title("Most Common Initial Characters")
     ax1.tick_params(axis='x', rotation=45)
-    last_df = pd.DataFrame(last_chars.most_common(10),
                           columns=['Character', 'Count'])
     sns.barplot(data=last_df, x='Character', y='Count', ax=ax2)
-    ax2.set_title("Most Common Final Characters")
     ax2.tick_params(axis='x', rotation=45)
-    st.pyplot(fig6)

 st.set_page_config(layout="wide")
 def parse_voynich_word(word):
+    """Parse a Voynich word into individual characters - NO assumptions about digraphs"""
     if not word or word.strip() == '':
         return None, None
     word = word.strip()
+    # Simply convert to list of individual characters
+    chars = list(word)
     return word, chars
 st.title("Voynich Manuscript Analyzer")
 st.write("Upload your CSV file to discover potential patterns and character distributions.")
+st.write("**Bottom-up analysis**: Each character is treated independently - no assumptions about digraphs")
 uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
     st.write(f"Total unique characters: {len(unique_chars)}")
     st.write("Unique characters:", ", ".join(sorted(unique_chars)))
+    st.subheader("Sample Words (Character-by-Character)")
     sample_df = pd.DataFrame([
+        {'Word': word, 'Characters': ' | '.join(chars), 'Length': len(chars)}
         for word, chars in zip(words[:20], chars_list[:20])
     ])
     st.dataframe(sample_df)
+    st.subheader("Character Bigram Analysis")
+    st.write("This reveals which character pairs occur most frequently - potential digraphs emerge from the data")
+    char_bigrams = Counter()
+    for chars in chars_list:
+        for i in range(len(chars)-1):
+            bigram = tuple(chars[i:i+2])
+            char_bigrams[bigram] += 1
+    char_bigram_df = pd.DataFrame([
+        {'Bigram': ''.join(bigram), 'Char1': bigram[0], 'Char2': bigram[1], 'Count': count}
+        for bigram, count in char_bigrams.most_common(30)
+    ])
+    st.dataframe(char_bigram_df)
+    st.markdown(get_download_link_csv(char_bigram_df, "char_bigrams.csv"), unsafe_allow_html=True)
+    st.subheader("Character Trigram Analysis")
+    st.write("Three-character sequences - looking for common patterns")
+    char_trigrams = Counter()
+    for chars in chars_list:
+        for i in range(len(chars)-2):
+            trigram = tuple(chars[i:i+3])
+            char_trigrams[trigram] += 1
+    char_trigram_df = pd.DataFrame([
+        {'Trigram': ''.join(trigram), 'Count': count}
+        for trigram, count in char_trigrams.most_common(30)
+    ])
+    st.dataframe(char_trigram_df)
+    st.markdown(get_download_link_csv(char_trigram_df, "char_trigrams.csv"), unsafe_allow_html=True)
+    st.subheader("Word Bigram Analysis")
+    word_bigrams = Counter()
+    for i in range(len(words)-1):
+        bigram = tuple(words[i:i+2])
+        word_bigrams[bigram] += 1
+    word_bigram_df = pd.DataFrame([
+        {'Word1': bigram[0], 'Word2': bigram[1], 'Count': count}
+        for bigram, count in word_bigrams.most_common(20)
+    ])
+    st.dataframe(word_bigram_df)
+    st.markdown(get_download_link_csv(word_bigram_df, "word_bigrams.csv"), unsafe_allow_html=True)
+    st.subheader("Word Trigram Analysis")
+    word_trigrams = Counter()
+    for i in range(len(words)-2):
+        trigram = tuple(words[i:i+3])
+        word_trigrams[trigram] += 1
+    word_trigram_df = pd.DataFrame([
+        {'Word1': trigram[0], 'Word2': trigram[1], 'Word3': trigram[2], 'Count': count}
+        for trigram, count in word_trigrams.most_common(20)
+    ])
+    st.dataframe(word_trigram_df)
+    st.markdown(get_download_link_csv(word_trigram_df, "word_trigrams.csv"), unsafe_allow_html=True)
+    st.subheader("Character Frequency by Position")
     slot_freq_df = create_12_slot_table(chars_list)
     st.dataframe(slot_freq_df)
     st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
     length_groups = defaultdict(list)
     for word, chars in zip(words, chars_list):
         length = len(chars)
+        if length <= 20:  # Extended range
             length_groups[length].append((word, chars))
     selected_length = st.selectbox("Select word length to analyze:",
         st.write(f"Found {len(words_of_length)} words of length {selected_length}")
         freq_data = []
+        for char in sorted(unique_chars):
             row = {'Character': char}
             for pos in range(selected_length):
                 row[f'Pos_{pos+1}'] = position_chars[pos][char]
         st.write("Sample words of this length:")
         sample_df = pd.DataFrame([
+            {'Word': word, 'Characters': ' | '.join(chars)}
+            for word, chars in words_of_length[:30]
         ])
         st.dataframe(sample_df)
     line_scatter = create_line_word_scatter(line_word_map)
     st.pyplot(line_scatter)
+    st.subheader("Character Context Analysis")
+    st.write("Select a character to see what comes before and after it")
     unique_chars = sorted(set(char for chars in chars_list for char in chars))
     selected_char = st.selectbox("Select a character to analyze:", unique_chars)
         with col1:
             st.write(f"Characters that commonly PRECEDE '{selected_char}':")
+            before_df = pd.DataFrame(before_counter.most_common(15),
                                    columns=['Character', 'Count'])
             st.dataframe(before_df)
+            fig1, ax1 = plt.subplots(figsize=(8, 6))
             plt.bar(before_df['Character'], before_df['Count'])
             plt.title(f"Characters before '{selected_char}'")
             plt.xticks(rotation=45)
         with col2:
             st.write(f"Characters that commonly FOLLOW '{selected_char}':")
+            after_df = pd.DataFrame(after_counter.most_common(15),
                                   columns=['Character', 'Count'])
             st.dataframe(after_df)
+            fig2, ax2 = plt.subplots(figsize=(8, 6))
             plt.bar(after_df['Character'], after_df['Count'])
             plt.title(f"Characters after '{selected_char}'")
             plt.xticks(rotation=45)
                           if line_data['line'] == line_num), [])
         for word, _, chars in line_words:
+            st.write(f"**Word: {word}** ({len(chars)} characters)")
+            cols = st.columns(min(20, max(12, len(chars))))
+            for i in range(len(chars)):
                 with cols[i]:
+                    char = chars[i]
                     st.markdown(f"""
                         <div style='
                             width: 40px;
                             align-items: center;
                             justify-content: center;
                             font-size: 16px;
+                            font-weight: bold;
+                            background-color: #e6f3ff;
                             margin: 2px;
                         '>
                             {char}
     st.subheader("Language Structure Analysis")
     # Word Length Distribution
+    fig1 = plt.figure(figsize=(12, 6))
     word_lengths = [len(chars) for chars in chars_list]
     sns.histplot(word_lengths, bins=range(1, max(word_lengths)+2))
     plt.title("Word Length Distribution")
+    plt.xlabel("Word Length (number of characters)")
     plt.ylabel("Frequency")
     st.pyplot(fig1)
+    # Character Frequency Overall
+    st.subheader("Overall Character Frequency")
+    all_chars_flat = [char for chars in chars_list for char in chars]
+    char_freq = Counter(all_chars_flat)
+    fig_freq = plt.figure(figsize=(12, 6))
+    char_freq_df = pd.DataFrame(char_freq.most_common(), columns=['Character', 'Count'])
+    plt.bar(char_freq_df['Character'], char_freq_df['Count'])
+    plt.title("Character Frequency Distribution")
+    plt.xlabel("Character")
+    plt.ylabel("Frequency")
+    plt.xticks(rotation=45)
+    st.pyplot(fig_freq)
+    st.dataframe(char_freq_df)
     # Character Position Heatmap
+    st.subheader("Character Position Heatmap")
+    st.write("Shows which characters appear at which positions in words")
+    max_len = max(word_lengths)
+    char_pos_matrix = np.zeros((len(unique_chars), min(max_len, 20)))
+    unique_chars_list = sorted(unique_chars)
     for chars in chars_list:
         for i, char in enumerate(chars):
+            if i < 20:
+                char_idx = unique_chars_list.index(char)
                 char_pos_matrix[char_idx, i] += 1
+    fig2 = plt.figure(figsize=(15, 10))
     sns.heatmap(char_pos_matrix,
+                xticklabels=range(1, min(max_len, 20)+1),
+                yticklabels=unique_chars_list,
+                cmap='YlOrRd',
+                cbar_kws={'label': 'Frequency'})
     plt.title("Character Position Preferences")
     plt.xlabel("Position in Word")
     plt.ylabel("Character")
     st.pyplot(fig2)
     # Character Bigram Network
+    st.subheader("Character Bigram Network")
+    st.write("Visual representation of which characters commonly follow each other")
+    G = nx.DiGraph()  # Directed graph to show flow
+    for (char1, char2), count in char_bigrams.most_common(50):
         G.add_edge(char1, char2, weight=count)
+    fig4 = plt.figure(figsize=(14, 14))
+    pos = nx.spring_layout(G, k=2, iterations=50, seed=42)
     edge_weights = [G[u][v]['weight'] for u,v in G.edges()]
     max_weight = max(edge_weights) if edge_weights else 1
     nx.draw(G, pos, with_labels=True,
             node_color='lightblue',
+            node_size=2000,
+            font_size=11,
+            font_weight='bold',
+            arrows=True,
+            arrowsize=15,
+            width=[G[u][v]['weight']/max_weight * 4 for u,v in G.edges()],
+            edge_color='gray',
+            connectionstyle='arc3,rad=0.1')
+    plt.title("Character Sequence Network (Directed)")
     st.pyplot(fig4)
+    # Words per Line Distribution
+    st.subheader("Line Structure Analysis")
+    line_lengths = [len(line_data['words']) for line_data in word_positions]
     fig5 = plt.figure(figsize=(10, 6))
+    sns.histplot(line_lengths, bins=range(1, max(line_lengths)+2))
     plt.title("Words per Line Distribution")
     plt.xlabel("Number of Words in Line")
     plt.ylabel("Frequency")
     st.pyplot(fig5)
     # First/Last Character Analysis
+    st.subheader("Word Boundary Analysis")
     first_chars = Counter(chars[0] for chars in chars_list)
     last_chars = Counter(chars[-1] for chars in chars_list)
+    fig6, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
+    first_df = pd.DataFrame(first_chars.most_common(15),
                            columns=['Character', 'Count'])
     sns.barplot(data=first_df, x='Character', y='Count', ax=ax1)
+    ax1.set_title("Most Common Word-Initial Characters")
     ax1.tick_params(axis='x', rotation=45)
+    last_df = pd.DataFrame(last_chars.most_common(15),
                           columns=['Character', 'Count'])
     sns.barplot(data=last_df, x='Character', y='Count', ax=ax2)
+    ax2.set_title("Most Common Word-Final Characters")
     ax2.tick_params(axis='x', rotation=45)
+    st.pyplot(fig6)
+    # N-gram Pattern Discovery
+    st.subheader("N-gram Pattern Discovery")
+    st.write("Discover recurring character sequences of different lengths")
+    ngram_length = st.slider("Select n-gram length:", 2, 6, 3)
+    ngrams = Counter()
+    for chars in chars_list:
+        for i in range(len(chars) - ngram_length + 1):
+            ngram = tuple(chars[i:i+ngram_length])
+            ngrams[ngram] += 1
+    ngram_df = pd.DataFrame([
+        {'Pattern': ''.join(ngram), 'Count': count, 'Percentage': f"{count/len(chars_list)*100:.2f}%"}
+        for ngram, count in ngrams.most_common(30)
+    ])
+    st.dataframe(ngram_df)
+    st.markdown(get_download_link_csv(ngram_df, f"{ngram_length}gram_patterns.csv"), unsafe_allow_html=True)