kambris committed on
Commit
8f5e990
·
verified ·
1 Parent(s): e7e7a90

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -55
app.py CHANGED
@@ -22,18 +22,32 @@ st.markdown("""
22
  </style>
23
  """, unsafe_allow_html=True)
24
 
 
 
 
25
  def parse_voynich_word(word):
26
- """Parse a Voynich word into individual characters - NO assumptions about digraphs"""
27
  if not word or word.strip() == '':
28
  return None, None
29
 
30
  word = word.strip()
31
- # Simply convert to list of individual characters
32
- chars = list(word)
 
 
 
 
33
 
34
- return word, chars
 
 
 
35
 
36
- def analyze_csv(df):
 
 
 
 
37
  words = []
38
  chars_list = []
39
  char_positions = defaultdict(list)
@@ -68,20 +82,15 @@ def analyze_csv(df):
68
 
69
  return words, chars_list, char_positions, char_connections, word_positions, line_word_map
70
 
71
- def analyze_trigrams(words, chars_list):
72
- char_trigrams = Counter()
73
- word_trigrams = Counter()
74
-
75
- for chars in chars_list:
76
- for i in range(len(chars)-2):
77
- trigram = tuple(chars[i:i+3])
78
- char_trigrams[trigram] += 1
79
-
80
- for i in range(len(words)-2):
81
- trigram = tuple(words[i:i+3])
82
- word_trigrams[trigram] += 1
83
-
84
- return char_trigrams, word_trigrams
85
 
86
  def create_12_slot_table(chars_list):
87
  slot_frequencies = [Counter() for _ in range(12)]
@@ -89,17 +98,32 @@ def create_12_slot_table(chars_list):
89
  for chars in chars_list:
90
  for i, char in enumerate(chars[:12]):
91
  slot_frequencies[i][char] += 1
92
-
 
 
 
93
  data = []
94
  all_chars = sorted(set(char for counter in slot_frequencies for char in counter))
95
 
96
  for char in all_chars:
97
  row = {'Character': char}
98
  for i in range(12):
99
- row[f'Slot_{i+1}'] = slot_frequencies[i][char]
 
 
 
 
 
100
  data.append(row)
101
-
102
- return pd.DataFrame(data)
 
 
 
 
 
 
 
103
 
104
  def analyze_slot_structure(chars_list):
105
  slot_contents = defaultdict(Counter)
@@ -154,10 +178,10 @@ def get_download_link_csv(df, filename):
154
  return href
155
 
156
  st.title("Voynich Manuscript Analyzer")
157
- st.write("Upload your CSV file to discover potential patterns and character distributions.")
 
158
 
159
- # Upload eva legend
160
- # Add image uploader in sidebar
161
  floating_image_file = st.sidebar.file_uploader("Upload an image",
162
  type=['png', 'jpg', 'jpeg', 'gif'],
163
  key="floating_image")
@@ -181,7 +205,15 @@ if uploaded_file is not None:
181
  # Create DataFrame from parsed data
182
  df = pd.DataFrame(data)
183
 
184
- words, chars_list, char_positions, char_connections, word_positions, line_word_map = analyze_csv(df)
 
 
 
 
 
 
 
 
185
 
186
  st.subheader("Basic Statistics")
187
  st.write(f"Total words: {len(words)}")
@@ -208,11 +240,13 @@ if uploaded_file is not None:
208
  bigram = tuple(chars[i:i+2])
209
  char_bigrams[bigram] += 1
210
 
 
211
  char_bigram_df = pd.DataFrame([
212
  {'Bigram': ''.join(str(c) for c in bigram),
213
  'Char1': str(bigram[0]),
214
  'Char2': str(bigram[1]),
215
- 'Count': int(count)}
 
216
  for bigram, count in char_bigrams.most_common(30)
217
  ])
218
  st.dataframe(char_bigram_df)
@@ -227,8 +261,11 @@ if uploaded_file is not None:
227
  trigram = tuple(chars[i:i+3])
228
  char_trigrams[trigram] += 1
229
 
 
230
  char_trigram_df = pd.DataFrame([
231
- {'Trigram': ''.join(str(c) for c in trigram), 'Count': int(count)}
 
 
232
  for trigram, count in char_trigrams.most_common(30)
233
  ])
234
  st.dataframe(char_trigram_df)
@@ -239,9 +276,13 @@ if uploaded_file is not None:
239
  for i in range(len(words)-1):
240
  bigram = tuple(words[i:i+2])
241
  word_bigrams[bigram] += 1
242
-
 
243
  word_bigram_df = pd.DataFrame([
244
- {'Word1': str(bigram[0]), 'Word2': str(bigram[1]), 'Count': int(count)}
 
 
 
245
  for bigram, count in word_bigrams.most_common(20)
246
  ])
247
  st.dataframe(word_bigram_df)
@@ -252,12 +293,14 @@ if uploaded_file is not None:
252
  for i in range(len(words)-2):
253
  trigram = tuple(words[i:i+3])
254
  word_trigrams[trigram] += 1
255
-
 
256
  word_trigram_df = pd.DataFrame([
257
  {'Word1': str(trigram[0]),
258
  'Word2': str(trigram[1]),
259
  'Word3': str(trigram[2]),
260
- 'Count': int(count)}
 
261
  for trigram, count in word_trigrams.most_common(20)
262
  ])
263
  st.dataframe(word_trigram_df)
@@ -272,14 +315,9 @@ if uploaded_file is not None:
272
 
273
  st.subheader("Words by Length Analysis")
274
 
275
- length_groups = defaultdict(list)
276
- for word, chars in zip(words, chars_list):
277
- length = len(chars)
278
- if length <= 20: # Extended range
279
- length_groups[length].append((word, chars))
280
-
281
  selected_length = st.selectbox("Select word length to analyze:",
282
- sorted(length_groups.keys()))
 
283
 
284
  if selected_length:
285
  words_of_length = length_groups[selected_length]
@@ -289,16 +327,31 @@ if uploaded_file is not None:
289
  for i, char in enumerate(chars):
290
  position_chars[i][char] += 1
291
 
 
 
 
292
  st.write(f"Found {len(words_of_length)} words of length {selected_length}")
293
 
294
  freq_data = []
295
  for char in sorted(unique_chars):
296
  row = {'Character': char}
297
  for pos in range(selected_length):
298
- row[f'Pos_{pos+1}'] = position_chars[pos][char]
 
 
 
 
 
299
  freq_data.append(row)
300
 
301
  freq_df = pd.DataFrame(freq_data)
 
 
 
 
 
 
 
302
  st.dataframe(freq_df)
303
  st.markdown(get_download_link_csv(freq_df, f"length_{selected_length}_analysis.csv"),
304
  unsafe_allow_html=True)
@@ -317,8 +370,10 @@ if uploaded_file is not None:
317
  st.subheader("Character Context Analysis")
318
  st.write("Select a character to see what comes before and after it")
319
 
320
- unique_chars = sorted(set(char for chars in chars_list for char in chars))
321
- selected_char = st.selectbox("Select a character to analyze:", unique_chars)
 
 
322
 
323
  if selected_char:
324
  before_counter = Counter()
@@ -336,8 +391,14 @@ if uploaded_file is not None:
336
 
337
  with col1:
338
  st.write(f"Characters that commonly PRECEDE '{selected_char}':")
339
- before_df = pd.DataFrame(before_counter.most_common(15),
340
- columns=['Character', 'Count'])
 
 
 
 
 
 
341
  st.dataframe(before_df)
342
 
343
  fig1, ax1 = plt.subplots(figsize=(8, 6))
@@ -348,8 +409,14 @@ if uploaded_file is not None:
348
 
349
  with col2:
350
  st.write(f"Characters that commonly FOLLOW '{selected_char}':")
351
- after_df = pd.DataFrame(after_counter.most_common(15),
352
- columns=['Character', 'Count'])
 
 
 
 
 
 
353
  st.dataframe(after_df)
354
 
355
  fig2, ax2 = plt.subplots(figsize=(8, 6))
@@ -361,7 +428,9 @@ if uploaded_file is not None:
361
  st.subheader("Line Viewer")
362
 
363
  available_lines = sorted(set(line_data['line'] for line_data in word_positions))
364
- selected_line = st.selectbox("Select Line:", [''] + [f"Line {line}" for line in available_lines])
 
 
365
 
366
  if selected_line:
367
  line_num = int(selected_line.replace('Line ', ''))
@@ -413,7 +482,9 @@ if uploaded_file is not None:
413
  fig_freq = plt.figure(figsize=(12, 6))
414
  char_freq_df = pd.DataFrame(char_freq.most_common(), columns=['Character', 'Count'])
415
  char_freq_df['Percentage'] = (char_freq_df['Count'] / total_chars * 100).round(2)
416
- plt.bar(char_freq_df['Character'], char_freq_df['Count'])
 
 
417
  plt.title("Character Frequency Distribution")
418
  plt.xlabel("Character")
419
  plt.ylabel("Frequency")
@@ -490,26 +561,46 @@ if uploaded_file is not None:
490
  first_chars = Counter(chars[0] for chars in chars_list)
491
  last_chars = Counter(chars[-1] for chars in chars_list)
492
 
 
 
 
493
  fig6, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
494
 
495
- first_df = pd.DataFrame(first_chars.most_common(15),
496
- columns=['Character', 'Count'])
 
 
 
 
497
  sns.barplot(data=first_df, x='Character', y='Count', ax=ax1)
498
  ax1.set_title("Most Common Word-Initial Characters")
499
  ax1.tick_params(axis='x', rotation=45)
500
 
501
- last_df = pd.DataFrame(last_chars.most_common(15),
502
- columns=['Character', 'Count'])
 
 
 
 
503
  sns.barplot(data=last_df, x='Character', y='Count', ax=ax2)
504
  ax2.set_title("Most Common Word-Final Characters")
505
  ax2.tick_params(axis='x', rotation=45)
506
  st.pyplot(fig6)
507
 
 
 
 
 
 
 
 
 
 
508
  # N-gram Pattern Discovery
509
  st.subheader("N-gram Pattern Discovery")
510
  st.write("Discover recurring character sequences of different lengths")
511
 
512
- ngram_length = st.slider("Select n-gram length:", 2, 6, 3)
513
 
514
  ngrams = Counter()
515
  for chars in chars_list:
@@ -517,10 +608,11 @@ if uploaded_file is not None:
517
  ngram = tuple(chars[i:i+ngram_length])
518
  ngrams[ngram] += 1
519
 
 
520
  ngram_df = pd.DataFrame([
521
  {'Pattern': ''.join(str(c) for c in ngram),
522
  'Count': int(count),
523
- 'Percentage': f"{count/len(chars_list)*100:.2f}%"}
524
  for ngram, count in ngrams.most_common(30)
525
  ])
526
  st.dataframe(ngram_df)
 
22
  </style>
23
  """, unsafe_allow_html=True)
24
 
25
# Define allowed characters
# NOTE: set() of a string yields individual characters, so the trailing
# '(n)(v)' contributes '(', ')', 'n', and 'v' as separate allowed glyphs.
ALLOWED_CHARS = set('4O892ERSZPBFVQWXYACIGH1TU0DNM3JKL567(n)(v)')

def parse_voynich_word(word):
    """Parse a Voynich word into individual characters, keeping only
    characters present in ALLOWED_CHARS.

    Args:
        word: raw word string; may be None, empty, or whitespace-only.

    Returns:
        (filtered_word, chars): the word with disallowed characters
        removed and its list of characters, or (None, None) when the
        input is empty or nothing survives the filter.
    """
    if not word or word.strip() == '':
        return None, None

    word = word.strip()
    # Filter to only allowed characters.
    # (Iterate the string directly — the original wrapped it in a
    # redundant list(word) call.)
    chars = [c for c in word if c in ALLOWED_CHARS]

    # If no valid characters remain, return None
    if not chars:
        return None, None

    # Reconstruct the filtered word
    return ''.join(chars), chars
45
 
46
+ @st.cache_data
47
+ def analyze_csv(df_hash):
48
+ """Cached analysis function - only recalculates when CSV changes"""
49
+ df = st.session_state.df_data
50
+
51
  words = []
52
  chars_list = []
53
  char_positions = defaultdict(list)
 
82
 
83
  return words, chars_list, char_positions, char_connections, word_positions, line_word_map
84
 
85
@st.cache_data
def create_length_groups(words, chars_list):
    """Bucket (word, chars) pairs by character count — cached for performance.

    Words longer than 20 characters are ignored. Returns a mapping of
    length -> list of (word, chars) tuples.
    """
    groups = defaultdict(list)
    for token, glyphs in zip(words, chars_list):
        # Skip anything longer than 20 characters.
        if len(glyphs) > 20:
            continue
        groups[len(glyphs)].append((token, glyphs))
    return groups
 
 
 
 
 
94
 
95
  def create_12_slot_table(chars_list):
96
  slot_frequencies = [Counter() for _ in range(12)]
 
98
  for chars in chars_list:
99
  for i, char in enumerate(chars[:12]):
100
  slot_frequencies[i][char] += 1
101
+
102
+ # Calculate totals for each slot
103
+ slot_totals = [sum(counter.values()) for counter in slot_frequencies]
104
+
105
  data = []
106
  all_chars = sorted(set(char for counter in slot_frequencies for char in counter))
107
 
108
  for char in all_chars:
109
  row = {'Character': char}
110
  for i in range(12):
111
+ count = slot_frequencies[i][char]
112
+ row[f'Slot_{i+1}'] = count
113
+ if slot_totals[i] > 0:
114
+ row[f'Slot_{i+1}_Pct'] = f"{(count / slot_totals[i] * 100):.2f}%"
115
+ else:
116
+ row[f'Slot_{i+1}_Pct'] = "0.00%"
117
  data.append(row)
118
+
119
+ # Reorder columns to alternate count and percentage
120
+ df = pd.DataFrame(data)
121
+ ordered_cols = ['Character']
122
+ for i in range(12):
123
+ ordered_cols.append(f'Slot_{i+1}')
124
+ ordered_cols.append(f'Slot_{i+1}_Pct')
125
+
126
+ return df[ordered_cols]
127
 
128
  def analyze_slot_structure(chars_list):
129
  slot_contents = defaultdict(Counter)
 
178
  return href
179
 
180
  st.title("Voynich Manuscript Analyzer")
181
+ st.write("Upload your CSV file.")
182
+ st.info(f"**Filtered Character Set:** {' '.join(sorted(ALLOWED_CHARS))}")
183
 
184
+ # Upload eva legend to sidebar
 
185
  floating_image_file = st.sidebar.file_uploader("Upload an image",
186
  type=['png', 'jpg', 'jpeg', 'gif'],
187
  key="floating_image")
 
205
  # Create DataFrame from parsed data
206
  df = pd.DataFrame(data)
207
 
208
+ # Store in session state and create hash for caching
209
+ st.session_state.df_data = df
210
+ df_hash = hash(content)
211
+
212
+ # Use cached analysis
213
+ words, chars_list, char_positions, char_connections, word_positions, line_word_map = analyze_csv(df_hash)
214
+
215
+ # Pre-calculate length groups (cached)
216
+ length_groups = create_length_groups(words, chars_list)
217
 
218
  st.subheader("Basic Statistics")
219
  st.write(f"Total words: {len(words)}")
 
240
  bigram = tuple(chars[i:i+2])
241
  char_bigrams[bigram] += 1
242
 
243
+ total_char_bigrams = sum(char_bigrams.values())
244
  char_bigram_df = pd.DataFrame([
245
  {'Bigram': ''.join(str(c) for c in bigram),
246
  'Char1': str(bigram[0]),
247
  'Char2': str(bigram[1]),
248
+ 'Count': int(count),
249
+ 'Percentage': f"{(count / total_char_bigrams * 100):.2f}%"}
250
  for bigram, count in char_bigrams.most_common(30)
251
  ])
252
  st.dataframe(char_bigram_df)
 
261
  trigram = tuple(chars[i:i+3])
262
  char_trigrams[trigram] += 1
263
 
264
+ total_char_trigrams = sum(char_trigrams.values())
265
  char_trigram_df = pd.DataFrame([
266
+ {'Trigram': ''.join(str(c) for c in trigram),
267
+ 'Count': int(count),
268
+ 'Percentage': f"{(count / total_char_trigrams * 100):.2f}%"}
269
  for trigram, count in char_trigrams.most_common(30)
270
  ])
271
  st.dataframe(char_trigram_df)
 
276
  for i in range(len(words)-1):
277
  bigram = tuple(words[i:i+2])
278
  word_bigrams[bigram] += 1
279
+
280
+ total_word_bigrams = sum(word_bigrams.values())
281
  word_bigram_df = pd.DataFrame([
282
+ {'Word1': str(bigram[0]),
283
+ 'Word2': str(bigram[1]),
284
+ 'Count': int(count),
285
+ 'Percentage': f"{(count / total_word_bigrams * 100):.2f}%"}
286
  for bigram, count in word_bigrams.most_common(20)
287
  ])
288
  st.dataframe(word_bigram_df)
 
293
  for i in range(len(words)-2):
294
  trigram = tuple(words[i:i+3])
295
  word_trigrams[trigram] += 1
296
+
297
+ total_word_trigrams = sum(word_trigrams.values())
298
  word_trigram_df = pd.DataFrame([
299
  {'Word1': str(trigram[0]),
300
  'Word2': str(trigram[1]),
301
  'Word3': str(trigram[2]),
302
+ 'Count': int(count),
303
+ 'Percentage': f"{(count / total_word_trigrams * 100):.2f}%"}
304
  for trigram, count in word_trigrams.most_common(20)
305
  ])
306
  st.dataframe(word_trigram_df)
 
315
 
316
  st.subheader("Words by Length Analysis")
317
 
 
 
 
 
 
 
318
  selected_length = st.selectbox("Select word length to analyze:",
319
+ sorted(length_groups.keys()),
320
+ key="length_selector")
321
 
322
  if selected_length:
323
  words_of_length = length_groups[selected_length]
 
327
  for i, char in enumerate(chars):
328
  position_chars[i][char] += 1
329
 
330
+ # Calculate totals for each position
331
+ position_totals = [sum(counter.values()) for counter in position_chars]
332
+
333
  st.write(f"Found {len(words_of_length)} words of length {selected_length}")
334
 
335
  freq_data = []
336
  for char in sorted(unique_chars):
337
  row = {'Character': char}
338
  for pos in range(selected_length):
339
+ count = position_chars[pos][char]
340
+ row[f'Pos_{pos+1}'] = count
341
+ if position_totals[pos] > 0:
342
+ row[f'Pos_{pos+1}_Pct'] = f"{(count / position_totals[pos] * 100):.2f}%"
343
+ else:
344
+ row[f'Pos_{pos+1}_Pct'] = "0.00%"
345
  freq_data.append(row)
346
 
347
  freq_df = pd.DataFrame(freq_data)
348
+ # Reorder columns to alternate count and percentage
349
+ ordered_cols = ['Character']
350
+ for pos in range(selected_length):
351
+ ordered_cols.append(f'Pos_{pos+1}')
352
+ ordered_cols.append(f'Pos_{pos+1}_Pct')
353
+ freq_df = freq_df[ordered_cols]
354
+
355
  st.dataframe(freq_df)
356
  st.markdown(get_download_link_csv(freq_df, f"length_{selected_length}_analysis.csv"),
357
  unsafe_allow_html=True)
 
370
  st.subheader("Character Context Analysis")
371
  st.write("Select a character to see what comes before and after it")
372
 
373
+ unique_chars_sorted = sorted(set(char for chars in chars_list for char in chars))
374
+ selected_char = st.selectbox("Select a character to analyze:",
375
+ unique_chars_sorted,
376
+ key="char_selector")
377
 
378
  if selected_char:
379
  before_counter = Counter()
 
391
 
392
  with col1:
393
  st.write(f"Characters that commonly PRECEDE '{selected_char}':")
394
+ total_before = sum(before_counter.values())
395
+ before_data = [
396
+ {'Character': char,
397
+ 'Count': count,
398
+ 'Percentage': f"{(count / total_before * 100):.2f}%"}
399
+ for char, count in before_counter.most_common(15)
400
+ ]
401
+ before_df = pd.DataFrame(before_data)
402
  st.dataframe(before_df)
403
 
404
  fig1, ax1 = plt.subplots(figsize=(8, 6))
 
409
 
410
  with col2:
411
  st.write(f"Characters that commonly FOLLOW '{selected_char}':")
412
+ total_after = sum(after_counter.values())
413
+ after_data = [
414
+ {'Character': char,
415
+ 'Count': count,
416
+ 'Percentage': f"{(count / total_after * 100):.2f}%"}
417
+ for char, count in after_counter.most_common(15)
418
+ ]
419
+ after_df = pd.DataFrame(after_data)
420
  st.dataframe(after_df)
421
 
422
  fig2, ax2 = plt.subplots(figsize=(8, 6))
 
428
  st.subheader("Line Viewer")
429
 
430
  available_lines = sorted(set(line_data['line'] for line_data in word_positions))
431
+ selected_line = st.selectbox("Select Line:",
432
+ [''] + [f"Line {line}" for line in available_lines],
433
+ key="line_selector")
434
 
435
  if selected_line:
436
  line_num = int(selected_line.replace('Line ', ''))
 
482
  fig_freq = plt.figure(figsize=(12, 6))
483
  char_freq_df = pd.DataFrame(char_freq.most_common(), columns=['Character', 'Count'])
484
  char_freq_df['Percentage'] = (char_freq_df['Count'] / total_chars * 100).round(2)
485
+ char_freq_df['Percentage'] = char_freq_df['Percentage'].apply(lambda x: f"{x:.2f}%")
486
+ plt.bar([row['Character'] for _, row in char_freq_df.iterrows()],
487
+ [int(row['Count']) for _, row in char_freq_df.iterrows()])
488
  plt.title("Character Frequency Distribution")
489
  plt.xlabel("Character")
490
  plt.ylabel("Frequency")
 
561
  first_chars = Counter(chars[0] for chars in chars_list)
562
  last_chars = Counter(chars[-1] for chars in chars_list)
563
 
564
+ total_first = sum(first_chars.values())
565
+ total_last = sum(last_chars.values())
566
+
567
  fig6, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
568
 
569
+ first_df = pd.DataFrame([
570
+ {'Character': char,
571
+ 'Count': count,
572
+ 'Percentage': f"{(count / total_first * 100):.2f}%"}
573
+ for char, count in first_chars.most_common(15)
574
+ ])
575
  sns.barplot(data=first_df, x='Character', y='Count', ax=ax1)
576
  ax1.set_title("Most Common Word-Initial Characters")
577
  ax1.tick_params(axis='x', rotation=45)
578
 
579
+ last_df = pd.DataFrame([
580
+ {'Character': char,
581
+ 'Count': count,
582
+ 'Percentage': f"{(count / total_last * 100):.2f}%"}
583
+ for char, count in last_chars.most_common(15)
584
+ ])
585
  sns.barplot(data=last_df, x='Character', y='Count', ax=ax2)
586
  ax2.set_title("Most Common Word-Final Characters")
587
  ax2.tick_params(axis='x', rotation=45)
588
  st.pyplot(fig6)
589
 
590
+ # Display the dataframes with percentages
591
+ col1, col2 = st.columns(2)
592
+ with col1:
593
+ st.write("Word-Initial Character Statistics:")
594
+ st.dataframe(first_df)
595
+ with col2:
596
+ st.write("Word-Final Character Statistics:")
597
+ st.dataframe(last_df)
598
+
599
  # N-gram Pattern Discovery
600
  st.subheader("N-gram Pattern Discovery")
601
  st.write("Discover recurring character sequences of different lengths")
602
 
603
+ ngram_length = st.slider("Select n-gram length:", 2, 6, 3, key="ngram_slider")
604
 
605
  ngrams = Counter()
606
  for chars in chars_list:
 
608
  ngram = tuple(chars[i:i+ngram_length])
609
  ngrams[ngram] += 1
610
 
611
+ total_ngrams = sum(ngrams.values())
612
  ngram_df = pd.DataFrame([
613
  {'Pattern': ''.join(str(c) for c in ngram),
614
  'Count': int(count),
615
+ 'Percentage': f"{(count / total_ngrams * 100):.2f}%"}
616
  for ngram, count in ngrams.most_common(30)
617
  ])
618
  st.dataframe(ngram_df)