kambris commited on
Commit
8637d3a
·
verified ·
1 Parent(s): 8841e8f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +237 -63
app.py CHANGED
@@ -1,66 +1,240 @@
1
- def analyze_character_positions(df):
2
- # Character position analysis
3
- char_positions = {}
4
- char_neighbors = {}
5
-
6
- for row in df.itertuples():
7
- for col in [c for c in df.columns if c.startswith('t')]:
8
- word = getattr(row, col)
9
- if pd.notna(word) and word != '$':
10
- chars = word.strip('"').split(',')
11
- for i, char in enumerate(chars):
12
- # Track positions
13
- if char not in char_positions:
14
- char_positions[char] = []
15
- char_positions[char].append(i)
16
-
17
- # Track neighbors
18
- if char not in char_neighbors:
19
- char_neighbors[char] = {'before': [], 'after': []}
20
- if i > 0:
21
- char_neighbors[char]['before'].append(chars[i-1])
22
- if i < len(chars) - 1:
23
- char_neighbors[char]['after'].append(chars[i+1])
24
-
25
- def analyze_word_positions(df):
26
- # Word position and neighbor analysis
27
- word_positions = {}
28
- word_neighbors = {}
29
-
30
- for row in df.itertuples():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  line_words = []
32
- for col in [c for c in df.columns if c.startswith('t')]:
33
- word = getattr(row, col)
34
- if pd.notna(word) and word != '$':
35
- line_words.append(word)
 
36
 
37
- for i, word in enumerate(line_words):
38
- # Track positions
39
- if word not in word_positions:
40
- word_positions[word] = []
41
- word_positions[word].append(i)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- # Track neighbors
44
- if word not in word_neighbors:
45
- word_neighbors[word] = {'before': [], 'after': []}
46
- if i > 0:
47
- word_neighbors[word]['before'].append(line_words[i-1])
48
- if i < len(line_words) - 1:
49
- word_neighbors[word]['after'].append(line_words[i+1])
50
-
51
- return word_positions, word_neighbors
52
-
53
- def display_analysis(char_positions, char_neighbors, word_positions, word_neighbors):
54
- st.subheader("Character Analysis")
55
- selected_char = st.selectbox("Select character to analyze", sorted(char_positions.keys()))
56
-
57
- st.write("Positions:", Counter(char_positions[selected_char]))
58
- st.write("Most common previous characters:", Counter(char_neighbors[selected_char]['before']).most_common(5))
59
- st.write("Most common following characters:", Counter(char_neighbors[selected_char]['after']).most_common(5))
60
-
61
- st.subheader("Word Analysis")
62
- selected_word = st.selectbox("Select word to analyze", sorted(word_positions.keys()))
63
-
64
- st.write("Positions in line:", Counter(word_positions[selected_word]))
65
- st.write("Most common previous words:", Counter(word_neighbors[selected_word]['before']).most_common(5))
66
- st.write("Most common following words:", Counter(word_neighbors[selected_word]['after']).most_common(5))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ import seaborn as sns
7
+ import matplotlib.pyplot as plt
8
+ from collections import defaultdict, Counter
9
+ import base64
10
+ from sklearn.manifold import MDS
11
+ import networkx as nx
12
+
13
# Set page configuration: wide layout so the dataframes/plots use the full
# browser width; title and icon appear in the browser tab.
# NOTE: st.set_page_config must be the first Streamlit call in the script.
st.set_page_config(layout="wide", page_title="Voynich Manuscript Analyzer", page_icon="📜")
15
+
16
+ # Caching expensive computations
17
+ @st.cache_data
18
def load_data(uploaded_file):
    """Parse the uploaded CSV file into a pandas DataFrame.

    Args:
        uploaded_file: a file-like object (e.g. Streamlit's UploadedFile).

    Returns:
        pd.DataFrame with the parsed contents.
    """
    frame = pd.read_csv(uploaded_file)
    return frame
20
+
21
+ @st.cache_data
22
def extract_word_and_chars(token):
    """Split a comma-delimited token into its word form and character list.

    Tokens equal to '$' or containing the '<START>'/'<END>' markers are
    treated as non-words. A '?' inside a character is pulled out into its
    own slot, placed immediately after the remaining base character.

    Args:
        token: comma-separated character string, e.g. "a,b?,c".

    Returns:
        (word, chars) where word is the characters joined into one string,
        or (None, None) for marker/placeholder tokens.
    """
    if token == '$' or '<START>' in token or '<END>' in token:
        return None, None

    glyphs = []
    for piece in token.split(','):
        if '?' not in piece:
            glyphs.append(piece)
            continue
        # Separate the uncertainty marker from the base character.
        stripped = piece.replace('?', '')
        if stripped:
            glyphs.append(stripped)
        glyphs.append('?')

    return ''.join(glyphs), glyphs
40
+
41
@st.cache_data
def analyze_csv(df):
    """Scan every token cell of the transcription DataFrame and collect stats.

    Token cells are the columns whose names start with 't'. Cells that are
    NaN or '$' are skipped; surrounding double quotes are stripped before
    decoding the token via extract_word_and_chars.

    Returns:
        words: flat list of every decoded word, in reading order.
        chars_list: parallel list of each word's character list.
        char_positions: char -> list of 0-based positions it occupied.
        char_connections: char -> Counter of the characters that follow it.
        word_positions: per-line records {'folio', 'par', 'line', 'words'}
            where 'words' is a list of (word, column_index, chars) tuples.
        folio_word_map: folio -> Counter of word frequencies on that folio.
    """
    words = []
    chars_list = []
    char_positions = defaultdict(list)
    char_connections = defaultdict(Counter)
    word_positions = []
    folio_word_map = defaultdict(Counter)
    token_columns = [name for name in df.columns if name.startswith('t')]

    for _, row in df.iterrows():
        line_words = []
        for pos, col in enumerate(token_columns):
            token = row[col]
            if pd.isna(token) or token == '$':
                continue
            # Strip CSV-export quoting around the token, if present.
            if token.startswith('"') and token.endswith('"'):
                token = token[1:-1]

            word, chars = extract_word_and_chars(token)
            if not word:
                continue

            words.append(word)
            chars_list.append(chars)
            line_words.append((word, pos, chars))
            folio_word_map[row['folio']][word] += 1

            for idx, ch in enumerate(chars):
                char_positions[ch].append(idx)
            # Adjacent-character transitions within the word.
            for idx in range(len(chars) - 1):
                char_connections[chars[idx]][chars[idx + 1]] += 1

        if line_words:
            word_positions.append({
                'folio': row['folio'],
                'par': row['par'],
                'line': row['line'],
                'words': line_words
            })

    return words, chars_list, char_positions, char_connections, word_positions, folio_word_map
81
+
82
+ @st.cache_data
83
def analyze_trigrams(words, chars_list):
    """Count character trigrams and word trigrams.

    Character trigrams are taken within each word; word trigrams slide over
    the flat word list (so they may cross line boundaries).

    Args:
        words: flat list of words in reading order.
        chars_list: list of per-word character lists.

    Returns:
        (char_trigrams, word_trigrams): two Counters keyed by 3-tuples.
    """
    char_trigrams = Counter(
        tuple(chars[i:i + 3])
        for chars in chars_list
        for i in range(len(chars) - 2)
    )
    word_trigrams = Counter(
        tuple(words[i:i + 3])
        for i in range(len(words) - 2)
    )
    return char_trigrams, word_trigrams
97
+
98
+ @st.cache_data
99
def create_12_slot_table(chars_list):
    """Tabulate character frequencies over the first 12 positional slots.

    Args:
        chars_list: list of per-word character lists; characters beyond
            position 12 are ignored.

    Returns:
        pd.DataFrame with a 'Character' column plus Slot_1..Slot_12 counts,
        one row per character, sorted by character.
    """
    slots = [Counter() for _ in range(12)]
    for chars in chars_list:
        for idx, ch in enumerate(chars[:12]):
            slots[idx][ch] += 1

    alphabet = sorted({ch for counter in slots for ch in counter})

    rows = []
    for ch in alphabet:
        entry = {'Character': ch}
        for idx in range(12):
            entry[f'Slot_{idx + 1}'] = slots[idx][ch]
        rows.append(entry)

    return pd.DataFrame(rows)
116
+
117
+ @st.cache_data
118
def analyze_slot_structure(chars_list):
    """Summarise which characters occupy each positional slot across words.

    Args:
        chars_list: list of per-word character lists.

    Returns:
        (slot_summary, max_slots): slot_summary maps slot index -> up to the
        10 most common (char, count) pairs at that slot; max_slots is the
        length of the longest word seen.
    """
    per_slot = defaultdict(Counter)
    longest = 0

    for chars in chars_list:
        longest = max(longest, len(chars))
        for idx, ch in enumerate(chars):
            per_slot[idx][ch] += 1

    summary = {}
    for slot in range(longest):
        if slot in per_slot:
            summary[slot] = per_slot[slot].most_common(10)

    return summary, longest
136
+
137
@st.cache_data
def create_folio_word_scatter(folio_word_map):
    """Project folios into 2-D via MDS over their word-frequency vectors.

    Builds a folios x vocabulary count matrix, embeds it with metric MDS
    (fixed random_state for repeatability), and returns a labelled Plotly
    scatter of the folio coordinates.

    Args:
        folio_word_map: folio -> Counter of word frequencies on that folio.

    Returns:
        plotly Figure with one point per folio, labelled by folio id.
    """
    vocab = set()
    for counter in folio_word_map.values():
        vocab.update(counter)
    # Fix: sort the vocabulary. Enumerating a raw set made the column order
    # depend on hash randomization, so the matrix (and any cached/derived
    # artifacts) differed from run to run.
    vocab = sorted(vocab)

    folios = sorted(folio_word_map.keys())
    word_freq_matrix = np.zeros((len(folios), len(vocab)))
    for i, folio in enumerate(folios):
        counts = folio_word_map[folio]
        for j, word in enumerate(vocab):
            word_freq_matrix[i, j] = counts[word]

    mds = MDS(n_components=2, random_state=42)
    folio_coords = mds.fit_transform(word_freq_matrix)

    fig = px.scatter(x=folio_coords[:, 0], y=folio_coords[:, 1], text=folios)
    fig.update_traces(textposition='top center')
    fig.update_layout(title='Folio Similarity based on Word Usage',
                      xaxis_title='Dimension 1',
                      yaxis_title='Dimension 2')

    return fig
160
+
161
@st.cache_data
def plot_char_positions(char_positions, max_slots):
    """Render a heatmap of character frequency by position within words.

    Args:
        char_positions: char -> list of 0-based positions it occupied.
        max_slots: positions >= this value are excluded from the plot.

    Returns:
        plotly Figure: characters on the y-axis, positions on the x-axis,
        cell colour = occurrence count (missing cells filled with 0).
    """
    records = []
    for ch, pos_list in char_positions.items():
        for pos, count in Counter(pos_list).items():
            if pos < max_slots:
                records.append({'Character': ch, 'Position': pos, 'Count': count})

    table = pd.DataFrame(records)
    pivot = table.pivot(index='Character', columns='Position', values='Count').fillna(0)

    fig = px.imshow(pivot, color_continuous_scale='YlGnBu',
                    labels=dict(x="Position in Word", y="Character", color="Frequency"))
    fig.update_layout(title='Character Position Heatmap')
    return fig
186
+
187
+ @st.cache_data
188
def get_download_link_csv(df, filename):
    """Return an HTML anchor that downloads *df* as a CSV named *filename*.

    The CSV text is embedded in the link as a base64 data URI, so the
    download needs no server-side file.

    Args:
        df: DataFrame to serialise (index is excluded).
        filename: name the browser should save the file under.

    Returns:
        str: an '<a ...>Download CSV</a>' HTML fragment.
    """
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    # Bug fix: the download attribute previously hard-coded a placeholder
    # and ignored the *filename* argument, so every link in the app saved
    # under the same name.
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
    return href
193
+
194
# Main App
# Streamlit page flow: upload a transcription CSV, run the cached analyses,
# and expose each analysis in its own collapsible expander.
st.title("Voynich Manuscript Analyzer")
st.write("Upload your CSV file to discover potential patterns and character distributions.")

uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    df = load_data(uploaded_file)
    # NOTE(review): char_connections and word_positions are computed but not
    # used anywhere in the UI below — candidates for a future view.
    words, chars_list, char_positions, char_connections, word_positions, folio_word_map = analyze_csv(df)

    with st.expander("Basic Statistics"):
        st.write(f"Total words: {len(words)}")
        st.write(f"Total unique words: {len(set(words))}")
        unique_chars = set(char for chars in chars_list for char in chars)
        st.write(f"Total unique characters: {len(unique_chars)}")
        st.write("Unique characters:", ", ".join(sorted(unique_chars)))

    with st.expander("Trigram Analysis"):
        char_trigrams, word_trigrams = analyze_trigrams(words, chars_list)

        # Top character trigrams as a downloadable table.
        st.write("Top 20 Character Trigrams")
        char_trigram_df = pd.DataFrame([
            {'Trigram': ' - '.join(trigram), 'Count': count}
            for trigram, count in char_trigrams.most_common(20)
        ])
        st.dataframe(char_trigram_df)
        st.markdown(get_download_link_csv(char_trigram_df, "char_trigrams.csv"), unsafe_allow_html=True)

        # Top word trigrams as a downloadable table.
        st.write("Top 20 Word Trigrams")
        word_trigram_df = pd.DataFrame([
            {'Trigram': ' - '.join(trigram), 'Count': count}
            for trigram, count in word_trigrams.most_common(20)
        ])
        st.dataframe(word_trigram_df)
        st.markdown(get_download_link_csv(word_trigram_df, "word_trigrams.csv"), unsafe_allow_html=True)

    with st.expander("Character Position Analysis"):
        # max_slots bounds the heatmap's x-axis to the longest word seen.
        # NOTE(review): slot_summary is unused here.
        slot_summary, max_slots = analyze_slot_structure(chars_list)
        st.plotly_chart(plot_char_positions(char_positions, max_slots))

    with st.expander("Folio Similarity Analysis"):
        st.plotly_chart(create_folio_word_scatter(folio_word_map))

    with st.expander("12-Slot Character Frequency Table"):
        slot_freq_df = create_12_slot_table(chars_list)
        st.dataframe(slot_freq_df)
        st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)