egumasa committed on
Commit
dcb572b
·
1 Parent(s): a33296e
web_app/components/comparison_functions.py CHANGED
@@ -117,28 +117,204 @@ def display_visual_comparison(results_a, results_b):
117
  st.write("No detailed data available for this measure.")
118
  continue
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  # Create plotly figure
121
  fig = go.Figure()
122
 
123
- # Add histogram for Text A
124
  fig.add_trace(go.Histogram(
125
  x=data_a,
126
  name="Text A",
127
  opacity=0.5,
128
  marker_color="blue",
129
- nbinsx=min(30, len(data_a)),
130
- histnorm='probability density'
 
 
 
 
 
 
131
  ))
132
 
133
-
134
- # Add histogram for Text B
135
  fig.add_trace(go.Histogram(
136
  x=data_b,
137
  name="Text B",
138
  opacity=0.5,
139
  marker_color="red",
140
- nbinsx=min(30, len(data_b)),
141
- histnorm='probability density'
 
 
 
 
 
 
142
  ))
143
 
144
  # Calculate and add KDE (kernel density estimation) curve
@@ -151,7 +327,7 @@ def display_visual_comparison(results_a, results_b):
151
  x=x_range_a,
152
  y=kde_values_a,
153
  mode='lines',
154
- name='Density',
155
  line=dict(color='blue', width=2)
156
  ))
157
 
@@ -165,9 +341,33 @@ def display_visual_comparison(results_a, results_b):
165
  x=x_range_b,
166
  y=kde_values_b,
167
  mode='lines',
168
- name='Density',
169
  line=dict(color='red', width=2)
170
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  # Update layout
173
  fig.update_layout(
@@ -321,4 +521,4 @@ def display_token_comparison(results_a, results_b):
321
  data=csv_data_b,
322
  file_name="text_b_tokens.csv",
323
  mime="text/csv"
324
- )
 
117
  st.write("No detailed data available for this measure.")
118
  continue
119
 
120
+ # Create word-to-score mapping for both texts
121
+ word_score_map_a = {}
122
+ word_score_map_b = {}
123
+
124
+ # Build word mappings for Text A
125
+ if '_bigram_' in measure:
126
+ if 'bigram_details' in results_a and results_a['bigram_details']:
127
+ idx = measure.rfind('_bigram')
128
+ index_measure_col = measure[:idx] + measure[idx+7:] if idx != -1 else measure
129
+ for bigram_detail in results_a['bigram_details']:
130
+ if index_measure_col in bigram_detail and bigram_detail[index_measure_col] is not None:
131
+ bigram_text = bigram_detail.get('bigram', '')
132
+ word_score_map_a[bigram_text] = bigram_detail[index_measure_col]
133
+ elif '_trigram_' in measure:
134
+ if 'trigram_details' in results_a and results_a['trigram_details']:
135
+ idx = measure.rfind('_trigram')
136
+ index_measure_col = measure[:idx] + measure[idx+8:] if idx != -1 else measure
137
+ for trigram_detail in results_a['trigram_details']:
138
+ if index_measure_col in trigram_detail and trigram_detail[index_measure_col] is not None:
139
+ trigram_text = trigram_detail.get('trigram', '')
140
+ word_score_map_a[trigram_text] = trigram_detail[index_measure_col]
141
+ else:
142
+ if 'token_details' in results_a:
143
+ matching_column = None
144
+ if any(measure in token for token in results_a['token_details']):
145
+ matching_column = measure
146
+ else:
147
+ base_key = measure
148
+ for suffix in ['_CW', '_FW']:
149
+ if measure.endswith(suffix):
150
+ base_key = measure[:-len(suffix)]
151
+ break
152
+ if any(base_key in token for token in results_a['token_details']):
153
+ matching_column = base_key
154
+ else:
155
+ for token in results_a['token_details']:
156
+ for col_name in token.keys():
157
+ if col_name not in ['id', 'token', 'lemma', 'pos', 'tag', 'word_type']:
158
+ if col_name in measure or measure.startswith(col_name):
159
+ matching_column = col_name
160
+ break
161
+ if matching_column:
162
+ break
163
+
164
+ if matching_column:
165
+ for token in results_a['token_details']:
166
+ if matching_column in token and token[matching_column] is not None:
167
+ word_score_map_a[token['token']] = token[matching_column]
168
+
169
+ # Build word mappings for Text B (same logic)
170
+ if '_bigram_' in measure:
171
+ if 'bigram_details' in results_b and results_b['bigram_details']:
172
+ idx = measure.rfind('_bigram')
173
+ index_measure_col = measure[:idx] + measure[idx+7:] if idx != -1 else measure
174
+ for bigram_detail in results_b['bigram_details']:
175
+ if index_measure_col in bigram_detail and bigram_detail[index_measure_col] is not None:
176
+ bigram_text = bigram_detail.get('bigram', '')
177
+ word_score_map_b[bigram_text] = bigram_detail[index_measure_col]
178
+ elif '_trigram_' in measure:
179
+ if 'trigram_details' in results_b and results_b['trigram_details']:
180
+ idx = measure.rfind('_trigram')
181
+ index_measure_col = measure[:idx] + measure[idx+8:] if idx != -1 else measure
182
+ for trigram_detail in results_b['trigram_details']:
183
+ if index_measure_col in trigram_detail and trigram_detail[index_measure_col] is not None:
184
+ trigram_text = trigram_detail.get('trigram', '')
185
+ word_score_map_b[trigram_text] = trigram_detail[index_measure_col]
186
+ else:
187
+ if 'token_details' in results_b:
188
+ matching_column = None
189
+ if any(measure in token for token in results_b['token_details']):
190
+ matching_column = measure
191
+ else:
192
+ base_key = measure
193
+ for suffix in ['_CW', '_FW']:
194
+ if measure.endswith(suffix):
195
+ base_key = measure[:-len(suffix)]
196
+ break
197
+ if any(base_key in token for token in results_b['token_details']):
198
+ matching_column = base_key
199
+ else:
200
+ for token in results_b['token_details']:
201
+ for col_name in token.keys():
202
+ if col_name not in ['id', 'token', 'lemma', 'pos', 'tag', 'word_type']:
203
+ if col_name in measure or measure.startswith(col_name):
204
+ matching_column = col_name
205
+ break
206
+ if matching_column:
207
+ break
208
+
209
+ if matching_column:
210
+ for token in results_b['token_details']:
211
+ if matching_column in token and token[matching_column] is not None:
212
+ word_score_map_b[token['token']] = token[matching_column]
213
+
214
+ # Calculate bins for consistent binning
215
+ all_data = data_a + data_b
216
+ nbins = min(30, len(all_data))
217
+ data_min, data_max = min(all_data), max(all_data)
218
+ data_range = data_max - data_min
219
+ padding = data_range * 0.02
220
+ adjusted_min = data_min - padding
221
+ adjusted_max = data_max + padding
222
+ bin_edges = np.linspace(adjusted_min, adjusted_max, nbins + 1)
223
+
224
+ # Assign words to bins for both texts
225
+ bin_examples_a = {}
226
+ bin_examples_b = {}
227
+
228
+ if word_score_map_a:
229
+ import random
230
+ for word, score in word_score_map_a.items():
231
+ bin_idx = np.digitize(score, bin_edges) - 1
232
+ bin_idx = max(0, min(bin_idx, len(bin_edges) - 2))
233
+ if bin_idx not in bin_examples_a:
234
+ bin_examples_a[bin_idx] = []
235
+ bin_examples_a[bin_idx].append(word)
236
+
237
+ for bin_idx in bin_examples_a:
238
+ if len(bin_examples_a[bin_idx]) > 3:
239
+ bin_examples_a[bin_idx] = random.sample(bin_examples_a[bin_idx], 3)
240
+
241
+ if word_score_map_b:
242
+ import random
243
+ for word, score in word_score_map_b.items():
244
+ bin_idx = np.digitize(score, bin_edges) - 1
245
+ bin_idx = max(0, min(bin_idx, len(bin_edges) - 2))
246
+ if bin_idx not in bin_examples_b:
247
+ bin_examples_b[bin_idx] = []
248
+ bin_examples_b[bin_idx].append(word)
249
+
250
+ for bin_idx in bin_examples_b:
251
+ if len(bin_examples_b[bin_idx]) > 3:
252
+ bin_examples_b[bin_idx] = random.sample(bin_examples_b[bin_idx], 3)
253
+
254
+ # Create hover text for each bin
255
+ hist_data_a, _ = np.histogram(data_a, bins=bin_edges)
256
+ hist_data_b, _ = np.histogram(data_b, bins=bin_edges)
257
+
258
+ hover_texts_a = []
259
+ hover_texts_b = []
260
+
261
+ for i in range(len(bin_edges) - 1):
262
+ bin_start = bin_edges[i]
263
+ bin_end = bin_edges[i + 1]
264
+ examples_a = bin_examples_a.get(i, [])
265
+ examples_b = bin_examples_b.get(i, [])
266
+
267
+ # Hover text for Text A
268
+ hover_text_a = f"Text A<br>Range: {bin_start:.3f} - {bin_end:.3f}<br>"
269
+ hover_text_a += f"Count: {hist_data_a[i]}<br>"
270
+ if examples_a:
271
+ hover_text_a += f"Examples: {', '.join(examples_a)}"
272
+ else:
273
+ hover_text_a += "Examples: none"
274
+ hover_texts_a.append(hover_text_a)
275
+
276
+ # Hover text for Text B
277
+ hover_text_b = f"Text B<br>Range: {bin_start:.3f} - {bin_end:.3f}<br>"
278
+ hover_text_b += f"Count: {hist_data_b[i]}<br>"
279
+ if examples_b:
280
+ hover_text_b += f"Examples: {', '.join(examples_b)}"
281
+ else:
282
+ hover_text_b += "Examples: none"
283
+ hover_texts_b.append(hover_text_b)
284
+
285
  # Create plotly figure
286
  fig = go.Figure()
287
 
288
+ # Add histogram for Text A with custom hover
289
  fig.add_trace(go.Histogram(
290
  x=data_a,
291
  name="Text A",
292
  opacity=0.5,
293
  marker_color="blue",
294
+ xbins=dict(
295
+ start=bin_edges[0],
296
+ end=bin_edges[-1],
297
+ size=(bin_edges[-1] - bin_edges[0]) / nbins
298
+ ),
299
+ histnorm='probability density',
300
+ hovertemplate='%{customdata}<extra></extra>',
301
+ customdata=hover_texts_a
302
  ))
303
 
304
+ # Add histogram for Text B with custom hover
 
305
  fig.add_trace(go.Histogram(
306
  x=data_b,
307
  name="Text B",
308
  opacity=0.5,
309
  marker_color="red",
310
+ xbins=dict(
311
+ start=bin_edges[0],
312
+ end=bin_edges[-1],
313
+ size=(bin_edges[-1] - bin_edges[0]) / nbins
314
+ ),
315
+ histnorm='probability density',
316
+ hovertemplate='%{customdata}<extra></extra>',
317
+ customdata=hover_texts_b
318
  ))
319
 
320
  # Calculate and add KDE (kernel density estimation) curve
 
327
  x=x_range_a,
328
  y=kde_values_a,
329
  mode='lines',
330
+ name='Text A Density',
331
  line=dict(color='blue', width=2)
332
  ))
333
 
 
341
  x=x_range_b,
342
  y=kde_values_b,
343
  mode='lines',
344
+ name='Text B Density',
345
  line=dict(color='red', width=2)
346
+ ))
347
+
348
+ # Add vertical mean lines
349
+ mean_a = np.mean(data_a)
350
+ mean_b = np.mean(data_b)
351
+
352
+ # Add mean line for Text A
353
+ fig.add_vline(
354
+ x=mean_a,
355
+ line_dash="dash",
356
+ line_color="blue",
357
+ line_width=2,
358
+ annotation_text=f"Text A Mean: {mean_a:.3f}",
359
+ annotation_position="top left"
360
+ )
361
+
362
+ # Add mean line for Text B
363
+ fig.add_vline(
364
+ x=mean_b,
365
+ line_dash="dash",
366
+ line_color="red",
367
+ line_width=2,
368
+ annotation_text=f"Text B Mean: {mean_b:.3f}",
369
+ annotation_position="top right"
370
+ )
371
 
372
  # Update layout
373
  fig.update_layout(
 
521
  data=csv_data_b,
522
  file_name="text_b_tokens.csv",
523
  mime="text/csv"
524
+ )