Spaces:

egumasa
/

simple-text-analyzer

Building

App Files Files Community

egumasa committed on Aug 5, 2025

Commit

dcb572b

1 Parent(s): a33296e

plot

Browse files

Files changed (1) hide show

web_app/components/comparison_functions.py +211 -11

web_app/components/comparison_functions.py CHANGED Viewed

@@ -117,28 +117,204 @@ def display_visual_comparison(results_a, results_b):
                 st.write("No detailed data available for this measure.")
                 continue
             # Create plotly figure
             fig = go.Figure()
-            # Add histogram for Text A
             fig.add_trace(go.Histogram(
                 x=data_a,
                 name="Text A",
                 opacity=0.5,
                 marker_color="blue",
-                nbinsx=min(30, len(data_a)),
-                histnorm='probability density'
             ))
-            # Add histogram for Text B
             fig.add_trace(go.Histogram(
                 x=data_b,
                 name="Text B",
                 opacity=0.5,
                 marker_color="red",
-                nbinsx=min(30, len(data_b)),
-                histnorm='probability density'
             ))
             # Calculate and add KDE (kernel density estimation) curve
@@ -151,7 +327,7 @@ def display_visual_comparison(results_a, results_b):
                 x=x_range_a,
                 y=kde_values_a,
                 mode='lines',
-                name='Density',
                 line=dict(color='blue', width=2)
             ))
@@ -165,9 +341,33 @@ def display_visual_comparison(results_a, results_b):
                 x=x_range_b,
                 y=kde_values_b,
                 mode='lines',
-                name='Density',
                 line=dict(color='red', width=2)
-            ))
             # Update layout
             fig.update_layout(
@@ -321,4 +521,4 @@ def display_token_comparison(results_a, results_b):
             data=csv_data_b,
             file_name="text_b_tokens.csv",
             mime="text/csv"
-        )

                 st.write("No detailed data available for this measure.")
                 continue
+            # Create word-to-score mapping for both texts
+            word_score_map_a = {}
+            word_score_map_b = {}
+            # Build word mappings for Text A
+            if '_bigram_' in measure:
+                if 'bigram_details' in results_a and results_a['bigram_details']:
+                    idx = measure.rfind('_bigram')
+                    index_measure_col = measure[:idx] + measure[idx+7:] if idx != -1 else measure
+                    for bigram_detail in results_a['bigram_details']:
+                        if index_measure_col in bigram_detail and bigram_detail[index_measure_col] is not None:
+                            bigram_text = bigram_detail.get('bigram', '')
+                            word_score_map_a[bigram_text] = bigram_detail[index_measure_col]
+            elif '_trigram_' in measure:
+                if 'trigram_details' in results_a and results_a['trigram_details']:
+                    idx = measure.rfind('_trigram')
+                    index_measure_col = measure[:idx] + measure[idx+8:] if idx != -1 else measure
+                    for trigram_detail in results_a['trigram_details']:
+                        if index_measure_col in trigram_detail and trigram_detail[index_measure_col] is not None:
+                            trigram_text = trigram_detail.get('trigram', '')
+                            word_score_map_a[trigram_text] = trigram_detail[index_measure_col]
+            else:
+                if 'token_details' in results_a:
+                    matching_column = None
+                    if any(measure in token for token in results_a['token_details']):
+                        matching_column = measure
+                    else:
+                        base_key = measure
+                        for suffix in ['_CW', '_FW']:
+                            if measure.endswith(suffix):
+                                base_key = measure[:-len(suffix)]
+                                break
+                        if any(base_key in token for token in results_a['token_details']):
+                            matching_column = base_key
+                        else:
+                            for token in results_a['token_details']:
+                                for col_name in token.keys():
+                                    if col_name not in ['id', 'token', 'lemma', 'pos', 'tag', 'word_type']:
+                                        if col_name in measure or measure.startswith(col_name):
+                                            matching_column = col_name
+                                            break
+                                if matching_column:
+                                    break
+                    if matching_column:
+                        for token in results_a['token_details']:
+                            if matching_column in token and token[matching_column] is not None:
+                                word_score_map_a[token['token']] = token[matching_column]
+            # Build word mappings for Text B (same logic)
+            if '_bigram_' in measure:
+                if 'bigram_details' in results_b and results_b['bigram_details']:
+                    idx = measure.rfind('_bigram')
+                    index_measure_col = measure[:idx] + measure[idx+7:] if idx != -1 else measure
+                    for bigram_detail in results_b['bigram_details']:
+                        if index_measure_col in bigram_detail and bigram_detail[index_measure_col] is not None:
+                            bigram_text = bigram_detail.get('bigram', '')
+                            word_score_map_b[bigram_text] = bigram_detail[index_measure_col]
+            elif '_trigram_' in measure:
+                if 'trigram_details' in results_b and results_b['trigram_details']:
+                    idx = measure.rfind('_trigram')
+                    index_measure_col = measure[:idx] + measure[idx+8:] if idx != -1 else measure
+                    for trigram_detail in results_b['trigram_details']:
+                        if index_measure_col in trigram_detail and trigram_detail[index_measure_col] is not None:
+                            trigram_text = trigram_detail.get('trigram', '')
+                            word_score_map_b[trigram_text] = trigram_detail[index_measure_col]
+            else:
+                if 'token_details' in results_b:
+                    matching_column = None
+                    if any(measure in token for token in results_b['token_details']):
+                        matching_column = measure
+                    else:
+                        base_key = measure
+                        for suffix in ['_CW', '_FW']:
+                            if measure.endswith(suffix):
+                                base_key = measure[:-len(suffix)]
+                                break
+                        if any(base_key in token for token in results_b['token_details']):
+                            matching_column = base_key
+                        else:
+                            for token in results_b['token_details']:
+                                for col_name in token.keys():
+                                    if col_name not in ['id', 'token', 'lemma', 'pos', 'tag', 'word_type']:
+                                        if col_name in measure or measure.startswith(col_name):
+                                            matching_column = col_name
+                                            break
+                                if matching_column:
+                                    break
+                    if matching_column:
+                        for token in results_b['token_details']:
+                            if matching_column in token and token[matching_column] is not None:
+                                word_score_map_b[token['token']] = token[matching_column]
+            # Calculate bins for consistent binning
+            all_data = data_a + data_b
+            nbins = min(30, len(all_data))
+            data_min, data_max = min(all_data), max(all_data)
+            data_range = data_max - data_min
+            padding = data_range * 0.02
+            adjusted_min = data_min - padding
+            adjusted_max = data_max + padding
+            bin_edges = np.linspace(adjusted_min, adjusted_max, nbins + 1)
+            # Assign words to bins for both texts
+            bin_examples_a = {}
+            bin_examples_b = {}
+            if word_score_map_a:
+                import random
+                for word, score in word_score_map_a.items():
+                    bin_idx = np.digitize(score, bin_edges) - 1
+                    bin_idx = max(0, min(bin_idx, len(bin_edges) - 2))
+                    if bin_idx not in bin_examples_a:
+                        bin_examples_a[bin_idx] = []
+                    bin_examples_a[bin_idx].append(word)
+                for bin_idx in bin_examples_a:
+                    if len(bin_examples_a[bin_idx]) > 3:
+                        bin_examples_a[bin_idx] = random.sample(bin_examples_a[bin_idx], 3)
+            if word_score_map_b:
+                import random
+                for word, score in word_score_map_b.items():
+                    bin_idx = np.digitize(score, bin_edges) - 1
+                    bin_idx = max(0, min(bin_idx, len(bin_edges) - 2))
+                    if bin_idx not in bin_examples_b:
+                        bin_examples_b[bin_idx] = []
+                    bin_examples_b[bin_idx].append(word)
+                for bin_idx in bin_examples_b:
+                    if len(bin_examples_b[bin_idx]) > 3:
+                        bin_examples_b[bin_idx] = random.sample(bin_examples_b[bin_idx], 3)
+            # Create hover text for each bin
+            hist_data_a, _ = np.histogram(data_a, bins=bin_edges)
+            hist_data_b, _ = np.histogram(data_b, bins=bin_edges)
+            hover_texts_a = []
+            hover_texts_b = []
+            for i in range(len(bin_edges) - 1):
+                bin_start = bin_edges[i]
+                bin_end = bin_edges[i + 1]
+                examples_a = bin_examples_a.get(i, [])
+                examples_b = bin_examples_b.get(i, [])
+                # Hover text for Text A
+                hover_text_a = f"Text A<br>Range: {bin_start:.3f} - {bin_end:.3f}<br>"
+                hover_text_a += f"Count: {hist_data_a[i]}<br>"
+                if examples_a:
+                    hover_text_a += f"Examples: {', '.join(examples_a)}"
+                else:
+                    hover_text_a += "Examples: none"
+                hover_texts_a.append(hover_text_a)
+                # Hover text for Text B
+                hover_text_b = f"Text B<br>Range: {bin_start:.3f} - {bin_end:.3f}<br>"
+                hover_text_b += f"Count: {hist_data_b[i]}<br>"
+                if examples_b:
+                    hover_text_b += f"Examples: {', '.join(examples_b)}"
+                else:
+                    hover_text_b += "Examples: none"
+                hover_texts_b.append(hover_text_b)
             # Create plotly figure
             fig = go.Figure()
+            # Add histogram for Text A with custom hover
             fig.add_trace(go.Histogram(
                 x=data_a,
                 name="Text A",
                 opacity=0.5,
                 marker_color="blue",
+                xbins=dict(
+                    start=bin_edges[0],
+                    end=bin_edges[-1],
+                    size=(bin_edges[-1] - bin_edges[0]) / nbins
+                ),
+                histnorm='probability density',
+                hovertemplate='%{customdata}<extra></extra>',
+                customdata=hover_texts_a
             ))
+            # Add histogram for Text B with custom hover
             fig.add_trace(go.Histogram(
                 x=data_b,
                 name="Text B",
                 opacity=0.5,
                 marker_color="red",
+                xbins=dict(
+                    start=bin_edges[0],
+                    end=bin_edges[-1],
+                    size=(bin_edges[-1] - bin_edges[0]) / nbins
+                ),
+                histnorm='probability density',
+                hovertemplate='%{customdata}<extra></extra>',
+                customdata=hover_texts_b
             ))
             # Calculate and add KDE (kernel density estimation) curve
                 x=x_range_a,
                 y=kde_values_a,
                 mode='lines',
+                name='Text A Density',
                 line=dict(color='blue', width=2)
             ))
                 x=x_range_b,
                 y=kde_values_b,
                 mode='lines',
+                name='Text B Density',
                 line=dict(color='red', width=2)
+            ))
+            # Add vertical mean lines
+            mean_a = np.mean(data_a)
+            mean_b = np.mean(data_b)
+            # Add mean line for Text A
+            fig.add_vline(
+                x=mean_a,
+                line_dash="dash",
+                line_color="blue",
+                line_width=2,
+                annotation_text=f"Text A Mean: {mean_a:.3f}",
+                annotation_position="top left"
+            )
+            # Add mean line for Text B
+            fig.add_vline(
+                x=mean_b,
+                line_dash="dash",
+                line_color="red",
+                line_width=2,
+                annotation_text=f"Text B Mean: {mean_b:.3f}",
+                annotation_position="top right"
+            )
             # Update layout
             fig.update_layout(
             data=csv_data_b,
             file_name="text_b_tokens.csv",
             mime="text/csv"
+        )