Update app.py
app.py
CHANGED
@@ -191,9 +191,32 @@ def get_download_link_csv(df, filename):
     href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
     return href

+# New Features
+@st.cache_data
+def analyze_word_lengths(words, chars_list):
+    word_lengths = [len(chars) for chars in chars_list]
+    length_counter = Counter(word_lengths)
+
+    # Group words by length
+    words_by_length = defaultdict(list)
+    for word, chars in zip(words, chars_list):
+        words_by_length[len(chars)].append((word, chars))
+
+    return word_lengths, length_counter, words_by_length
+
+@st.cache_data
+def analyze_symbol_transitions(char_connections):
+    G = nx.DiGraph()
+
+    for symbol1, connections in char_connections.items():
+        for symbol2, count in connections.items():
+            G.add_edge(symbol1, symbol2, weight=count)
+
+    return G
+
 # Main App
 st.title("Voynich Manuscript Analyzer")
-st.write("Upload your CSV file to discover potential patterns and
+st.write("Upload your CSV file to discover potential patterns and symbol distributions.")

 uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

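A note on the new helpers: the diff never shows how analyze_csv builds char_connections, but the loop in analyze_symbol_transitions implies a mapping from each symbol to the counts of the symbols that follow it. A minimal sketch under that assumption, with made-up sample data (not from the manuscript):

    from collections import Counter, defaultdict

    import networkx as nx

    # Assumed shape: symbol -> Counter of successor symbols.
    char_connections = defaultdict(Counter)
    for chars in [["q", "o", "k"], ["o", "k", "y"], ["q", "o", "y"]]:
        for a, b in zip(chars, chars[1:]):
            char_connections[a][b] += 1

    # Same construction as the new analyze_symbol_transitions helper:
    # one weighted directed edge per observed symbol transition.
    G = nx.DiGraph()
    for symbol1, connections in char_connections.items():
        for symbol2, count in connections.items():
            G.add_edge(symbol1, symbol2, weight=count)

    print(G["o"]["k"]["weight"])  # 2: "o" is followed by "k" twice in the sample
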
@@ -201,59 +224,35 @@ if uploaded_file is not None:
     df = load_data(uploaded_file)
     words, chars_list, char_positions, char_connections, word_positions, folio_word_map = analyze_csv(df)

+    # Basic Statistics
     with st.expander("Basic Statistics"):
         st.write(f"Total words: {len(words)}")
         st.write(f"Total unique words: {len(set(words))}")
         unique_chars = set(char for chars in chars_list for char in chars)
-        st.write(f"Total unique characters: {len(unique_chars)}")
-        st.write("Unique characters:", ", ".join(sorted(unique_chars)))
+        st.write(f"Total unique symbols: {len(unique_chars)}")
+        st.write("Unique symbols:", ", ".join(sorted(unique_chars)))

-
-
-
-        st.write("Top 20 Character Trigrams")
-        char_trigram_df = pd.DataFrame([
-            {'Trigram': ' - '.join(trigram), 'Count': count}
-            for trigram, count in char_trigrams.most_common(20)
-        ])
-        st.dataframe(char_trigram_df)
-        st.markdown(get_download_link_csv(char_trigram_df, "char_trigrams.csv"), unsafe_allow_html=True)
+    # Word Length Analysis
+    with st.expander("Word Length Analysis"):
+        word_lengths, length_counter, words_by_length = analyze_word_lengths(words, chars_list)

-        st.write("Top 20 Word Trigrams")
-        word_trigram_df = pd.DataFrame([
-            {'Trigram': ' - '.join(trigram), 'Count': count}
-            for trigram, count in word_trigrams.most_common(20)
-        ])
-        st.dataframe(word_trigram_df)
-        st.markdown(get_download_link_csv(word_trigram_df, "word_trigrams.csv"), unsafe_allow_html=True)
-
-    with st.expander("Character Position Analysis"):
-        slot_summary, max_slots = analyze_slot_structure(chars_list)
-        st.plotly_chart(plot_char_positions(char_positions, max_slots))
-
-    with st.expander("Folio Similarity Analysis"):
-        st.plotly_chart(create_folio_word_scatter(folio_word_map))
-
-    with st.expander("12-Slot Character Frequency Table"):
-        slot_freq_df = create_12_slot_table(chars_list)
-        st.dataframe(slot_freq_df)
-        st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
-
-    with st.expander("Word Length Distribution"):
-        word_lengths = [len(chars) for chars in chars_list]
+        st.write("Word Length Distribution")
         fig = px.histogram(word_lengths, nbins=20, labels={'value': 'Word Length', 'count': 'Frequency'})
         fig.update_layout(title="Word Length Distribution")
         st.plotly_chart(fig)
-
-    with st.expander("Character Bigram Network"):
-        char_bigrams = Counter()
-        for chars in chars_list:
-            for i in range(len(chars)-1):
-                char_bigrams[tuple(chars[i:i+2])] += 1

-
-
-
+        st.write("Most Common Words by Length")
+        selected_length = st.selectbox("Select word length", sorted(words_by_length.keys()))
+        if selected_length:
+            words_of_length = words_by_length[selected_length]
+            st.write(f"Top 10 {selected_length}-symbol words:")
+            top_words = Counter([word for word, _ in words_of_length]).most_common(10)
+            top_words_df = pd.DataFrame([{'Word': word, 'Count': count} for word, count in top_words])
+            st.dataframe(top_words_df)
+
+    # Symbol Transition Network
+    with st.expander("Symbol Transition Network"):
+        G = analyze_symbol_transitions(char_connections)

        pos = nx.spring_layout(G)
        edge_trace = []

@@ -272,9 +271,25 @@ if uploaded_file is not None:
        )

        fig = go.Figure(data=edge_trace + [node_trace])
-        fig.update_layout(title="Character Bigram Network")
+        fig.update_layout(title="Symbol Transition Network", showlegend=False)
        st.plotly_chart(fig)
-
+
+    # 12-Slot Symbol Frequency Table
+    with st.expander("12-Slot Symbol Frequency Table"):
+        slot_freq_df = create_12_slot_table(chars_list)
+        st.dataframe(slot_freq_df)
+        st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)
+
+    # Symbol Position Heatmap
+    with st.expander("Symbol Position Heatmap"):
+        slot_summary, max_slots = analyze_slot_structure(chars_list)
+        st.plotly_chart(plot_char_positions(char_positions, max_slots))
+
+    # Folio Similarity Analysis
+    with st.expander("Folio Similarity Analysis"):
+        st.plotly_chart(create_folio_word_scatter(folio_word_map))
+
+    # Line Viewer
    with st.expander("Line Viewer"):
        available_folios = sorted(set(line_data['folio'] for line_data in word_positions))
        selected_folio = st.selectbox("Select Folio:", [''] + available_folios)

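Both versions leave the trace-building lines between edge_trace = [] and go.Figure(...) unchanged, so they sit outside the hunks above. For orientation only, a typical Plotly rendering of a networkx spring layout looks like the sketch below; the variable names mirror the app's, but the body is illustrative, not the app's actual code:

    import networkx as nx
    import plotly.graph_objects as go

    G = nx.DiGraph([("a", "b"), ("b", "c"), ("a", "c")])  # hypothetical sample graph
    pos = nx.spring_layout(G)

    # One line-segment trace per edge.
    edge_trace = []
    for u, v in G.edges():
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        edge_trace.append(go.Scatter(x=[x0, x1], y=[y0, y1], mode="lines",
                                     line=dict(width=1, color="#888")))

    # A single trace for all nodes, labelled with the symbol names.
    node_trace = go.Scatter(x=[pos[n][0] for n in G.nodes()],
                            y=[pos[n][1] for n in G.nodes()],
                            mode="markers+text", text=list(G.nodes()),
                            textposition="top center")

    fig = go.Figure(data=edge_trace + [node_trace])
    fig.update_layout(title="Symbol Transition Network", showlegend=False)
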
@@ -316,4 +331,52 @@ if uploaded_file is not None:
                '>
                {char}
                </div>
-                """, unsafe_allow_html=True)
+                """, unsafe_allow_html=True)
+
+    # First/Last Symbol Analysis
+    with st.expander("First/Last Symbol Analysis"):
+        first_chars = Counter(chars[0] for chars in chars_list)
+        last_chars = Counter(chars[-1] for chars in chars_list)
+
+        col1, col2 = st.columns(2)
+
+        with col1:
+            st.write("Most Common Initial Symbols")
+            first_df = pd.DataFrame(first_chars.most_common(10), columns=['Symbol', 'Count'])
+            st.dataframe(first_df)
+
+        with col2:
+            st.write("Most Common Final Symbols")
+            last_df = pd.DataFrame(last_chars.most_common(10), columns=['Symbol', 'Count'])
+            st.dataframe(last_df)
+
+    # Symbol Trigram Patterns
+    with st.expander("Symbol Trigram Patterns"):
+        char_trigrams = Counter()
+        for chars in chars_list:
+            if len(chars) >= 3:
+                for i in range(len(chars)-2):
+                    char_trigrams[tuple(chars[i:i+3])] += 1
+
+        st.write("Top 20 Symbol Trigrams")
+        trigram_df = pd.DataFrame([{'Trigram': ' - '.join(trigram), 'Count': count}
+                                   for trigram, count in char_trigrams.most_common(20)])
+        st.dataframe(trigram_df)
+
+    # Word Length Correlation Matrix
+    with st.expander("Word Length Correlation Matrix"):
+        word_lengths_by_line = []
+        for line_data in word_positions:
+            line_word_lengths = [len(chars) for _, _, chars in line_data['words']]
+            if len(line_word_lengths) >= 5:  # Only lines with 5+ words
+                word_lengths_by_line.append(line_word_lengths[:5])  # First 5 words
+
+        if word_lengths_by_line:
+            length_corr = np.corrcoef(np.array(word_lengths_by_line).T)
+            fig = px.imshow(length_corr,
+                            labels=dict(x="Position", y="Position", color="Correlation"),
+                            x=[f"Pos {i+1}" for i in range(5)],
+                            y=[f"Pos {i+1}" for i in range(5)],
+                            color_continuous_scale='RdBu')  # note: Plotly has no 'coolwarm' scale; 'RdBu' is its diverging equivalent
+            fig.update_layout(title="Word Length Correlations by Position")
+            st.plotly_chart(fig)

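On the new correlation matrix: np.corrcoef treats each row of its input as one variable, so transposing the (lines x 5) array makes each word position a variable and yields a 5x5 matrix of position-to-position correlations. A self-contained check with made-up lengths:

    import numpy as np

    # Hypothetical lengths of the first 5 words on 4 lines (rows = lines).
    word_lengths_by_line = [
        [5, 4, 6, 3, 7],
        [6, 4, 5, 3, 6],
        [4, 3, 6, 2, 7],
        [7, 5, 7, 4, 8],
    ]

    # Transpose so each row is one word position across all lines.
    length_corr = np.corrcoef(np.array(word_lengths_by_line).T)
    print(length_corr.shape)                        # (5, 5)
    print(np.allclose(np.diag(length_corr), 1.0))   # True: self-correlation is 1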