# Source: Hugging Face Spaces - egumasa/simple-text-analyzer
# (Space status when captured: Building; file size: 22,378 bytes)

"""
Additional functions for two-text comparison feature.
These functions should be added to the main app.py file.
"""

import streamlit as st
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from scipy import stats
from web_app.utils import MemoryFileHandler


def get_text_input(label, key_suffix):
    """Collect one text from the user, either pasted or uploaded as a .txt file.

    Args:
        label: Display name of the text; used in the text-area caption and
            (lower-cased) in its placeholder.
        key_suffix: Suffix appended to every widget key created here.

    Returns:
        The pasted text, or the uploaded file's content as produced by
        ``MemoryFileHandler.process_uploaded_file``. An empty string is
        returned when no file has been uploaded yet or when reading fails
        (an error message is shown in that case).
    """
    chosen_method = st.radio(
        "Input Method",
        options=['Paste Text', 'Upload File'],
        horizontal=True,
        key=f"input_method_{key_suffix}"
    )

    # Paste mode: the text area's value is the result.
    if chosen_method != 'Upload File':
        return st.text_area(
            f"Enter {label}",
            height=200,
            placeholder=f"Paste your {label.lower()} here...",
            key=f"text_area_{key_suffix}"
        )

    upload = st.file_uploader(
        "Upload Text File",
        type=['txt'],
        accept_multiple_files=False,
        key=f"file_upload_{key_suffix}"
    )
    if not upload:
        return ""

    try:
        # Use memory-based approach to avoid filesystem restrictions
        contents = MemoryFileHandler.process_uploaded_file(upload, as_text=True)
        if contents:
            return contents
        st.error("Failed to read uploaded file. Please try again.")
    except Exception as exc:
        st.error(f"Error reading uploaded file: {exc}")
    return ""


def display_comparison_results(results_a, results_b):
    """Render the full two-text comparison view.

    Shows three columns of basic counts (Text A, Text B, and B minus A) read
    from ``results['text_stats']``, then delegates to the visual, statistical
    and token-level comparison sections in that order.

    Args:
        results_a: Analysis results for Text A.
        results_b: Analysis results for Text B.
    """
    st.subheader("📊 Comparison Results")

    # Basic text statistics comparison
    st.write("**Text Statistics Comparison**")
    col_a, col_b, col_diff = st.columns(3)

    # (metric label, key inside results['text_stats'])
    metric_fields = (
        ("Total Tokens", 'total_tokens'),
        ("Unique Tokens", 'unique_tokens'),
        ("Content Words", 'content_words'),
        ("Function Words", 'function_words'),
    )

    for column, heading, results in (
        (col_a, "**Text A**", results_a),
        (col_b, "**Text B**", results_b),
    ):
        with column:
            st.write(heading)
            for metric_label, field in metric_fields:
                st.metric(metric_label, results['text_stats'][field])

    with col_diff:
        st.write("**Difference**")
        # Compute every difference (B - A) before rendering any of them.
        deltas = [
            (metric_label,
             results_b['text_stats'][field] - results_a['text_stats'][field])
            for metric_label, field in metric_fields
        ]
        for metric_label, delta in deltas:
            st.metric(metric_label, f"{delta:+d}")

    # Visual comparison
    display_visual_comparison(results_a, results_b)

    # Statistical significance testing
    display_statistical_comparison(results_a, results_b)

    # Token-level comparison
    display_token_comparison(results_a, results_b)


# Token-detail keys that describe the token itself rather than a score column.
_NON_SCORE_TOKEN_KEYS = ('id', 'token', 'lemma', 'pos', 'tag', 'word_type')


def _is_numeric_score(value):
    """Return True when *value* is a finite int/float usable for binning."""
    if not isinstance(value, (int, float, np.integer, np.floating)):
        return False
    return bool(np.isfinite(value))


def _finite_scores(values):
    """Return the finite numeric entries of *values* as a 1-D float array."""
    if values is None:
        return np.array([], dtype=float)
    return np.array([v for v in values if _is_numeric_score(v)], dtype=float)


def _find_token_column(tokens, measure):
    """Return the token-detail key holding scores for *measure*, or None.

    Tries, in order: the measure name itself, the measure name without a
    trailing ``_CW``/``_FW``, and finally the first score key that occurs as
    a substring of the measure name.
    """
    if any(measure in token for token in tokens):
        return measure

    base_key = measure
    for suffix in ('_CW', '_FW'):
        if measure.endswith(suffix):
            base_key = measure[:-len(suffix)]
            break
    if any(base_key in token for token in tokens):
        return base_key

    for token in tokens:
        for col_name in token:
            if col_name not in _NON_SCORE_TOKEN_KEYS and col_name in measure:
                return col_name
    return None


def _build_score_map(results, measure):
    """Map each word / bigram / trigram text to its numeric score for *measure*.

    Entries whose score is missing or not a finite number (for example the
    string 'NA') are left out so they cannot break the binning step.
    """
    for kind in ('bigram', 'trigram'):
        if f'_{kind}_' in measure:
            marker = f'_{kind}'
            # The n-gram detail column is the measure name with the last
            # '_bigram' / '_trigram' marker removed.
            idx = measure.rfind(marker)
            column = measure[:idx] + measure[idx + len(marker):]
            details = results.get(f'{kind}_details') or []
            return {
                detail.get(kind, ''): detail[column]
                for detail in details
                if _is_numeric_score(detail.get(column))
            }

    tokens = results.get('token_details') or []
    column = _find_token_column(tokens, measure)
    if column is None:
        return {}
    return {
        token.get('token', ''): token[column]
        for token in tokens
        if _is_numeric_score(token.get(column))
    }


def _shared_bin_edges(data_a, data_b, max_bins=30):
    """Return common histogram bin edges covering both samples."""
    combined = np.concatenate([data_a, data_b])
    nbins = min(max_bins, combined.size)
    data_min, data_max = combined.min(), combined.max()
    data_range = data_max - data_min
    if data_range > 0:
        padding = data_range * 0.02
    else:
        # All scores identical: widen artificially so bins have non-zero width.
        padding = abs(data_min) * 0.02 or 0.5
    return np.linspace(data_min - padding, data_max + padding, nbins + 1)


def _sample_bin_examples(score_map, bin_edges, max_examples=3):
    """Return {bin index: up to *max_examples* randomly chosen example texts}."""
    import random

    examples = {}
    last_bin = len(bin_edges) - 2
    for word, score in score_map.items():
        bin_idx = int(np.digitize(score, bin_edges)) - 1
        bin_idx = max(0, min(bin_idx, last_bin))
        examples.setdefault(bin_idx, []).append(word)

    return {
        bin_idx: random.sample(words, max_examples) if len(words) > max_examples else words
        for bin_idx, words in examples.items()
    }


def _bin_hover_texts(label, bin_edges, counts, examples):
    """Build one hover string per bin: range, count and example texts."""
    texts = []
    for i, (bin_start, bin_end) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
        sample = examples.get(i)
        shown = ', '.join(str(item) for item in sample) if sample else 'none'
        texts.append(
            f"{label}<br>Range: {bin_start:.3f} - {bin_end:.3f}<br>"
            f"Count: {counts[i]}<br>"
            f"Examples: {shown}"
        )
    return texts


def _add_density_curve(fig, data, name, color):
    """Add a Gaussian KDE line for *data*; skipped when a KDE cannot be fitted."""
    # gaussian_kde needs at least two points and non-zero variance.
    if data.size < 2 or data.max() == data.min():
        return
    try:
        kde = stats.gaussian_kde(data)
    except (ValueError, np.linalg.LinAlgError):
        return
    x_range = np.linspace(data.min(), data.max(), 100)
    fig.add_trace(go.Scatter(
        x=x_range,
        y=kde(x_range),
        mode='lines',
        name=name,
        line=dict(color=color, width=2)
    ))


def display_visual_comparison(results_a, results_b):
    """Display overlaid score-distribution charts for every shared measure.

    For each measure present in both ``results['summary']`` dicts, the raw
    scores from ``results['raw_scores']`` are binned on a shared set of edges
    and drawn as overlaid probability-density bars, with a KDE curve and a
    dashed mean line per text. Hovering a bar shows the bin range, the count
    and up to three example words / n-grams taken from ``token_details``,
    ``bigram_details`` or ``trigram_details``.

    The histogram is pre-computed with NumPy and drawn with ``go.Bar`` so the
    per-bin hover texts line up with the bars (``customdata`` on
    ``go.Histogram`` is per sample, not per bin).

    Args:
        results_a: Analysis results for Text A.
        results_b: Analysis results for Text B.
    """
    st.subheader("📈 Visual Comparison")

    if not results_a.get('summary') or not results_b.get('summary'):
        st.warning("No sophistication scores available for visual comparison.")
        return

    raw_a = results_a.get('raw_scores') or {}
    raw_b = results_b.get('raw_scores') or {}

    for measure in list(results_a['summary'].keys()):
        if measure not in results_b['summary']:
            continue

        st.write(f"**{measure} Distribution Comparison**")

        data_a = _finite_scores(raw_a.get(measure))
        data_b = _finite_scores(raw_b.get(measure))
        if data_a.size == 0 or data_b.size == 0:
            st.write("No detailed data available for this measure.")
            continue

        # Shared edges keep the two histograms directly comparable.
        bin_edges = _shared_bin_edges(data_a, data_b)
        bin_widths = np.diff(bin_edges)
        bin_centers = bin_edges[:-1] + bin_widths / 2

        series = (
            ("Text A", "blue", data_a, results_a, "top left"),
            ("Text B", "red", data_b, results_b, "top right"),
        )

        fig = go.Figure()

        # Histogram bars with per-bin hover text
        for label, color, data, results, _ in series:
            counts, _edges = np.histogram(data, bins=bin_edges)
            # Probability density: bar areas sum to 1 for each text.
            density = counts / (counts.sum() * bin_widths)
            examples = _sample_bin_examples(_build_score_map(results, measure), bin_edges)
            fig.add_trace(go.Bar(
                x=bin_centers,
                y=density,
                width=bin_widths,
                name=label,
                opacity=0.5,
                marker_color=color,
                hovertemplate='%{customdata}<extra></extra>',
                customdata=_bin_hover_texts(label, bin_edges, counts, examples)
            ))

        # KDE (kernel density estimation) curves
        for label, color, data, _, _ in series:
            _add_density_curve(fig, data, f"{label} Density", color)

        # Vertical mean lines
        for label, color, data, _, position in series:
            mean_value = np.mean(data)
            fig.add_vline(
                x=mean_value,
                line_dash="dash",
                line_color=color,
                line_width=2,
                annotation_text=f"{label} Mean: {mean_value:.3f}",
                annotation_position=position
            )

        fig.update_layout(
            title=f"{measure} Distribution Comparison",
            xaxis_title="Score",
            # Bars and curves are probability densities, not raw counts.
            yaxis_title="Density",
            barmode='overlay',
            height=400,
            showlegend=True
        )

        st.plotly_chart(fig, use_container_width=True)


def _numeric_sample(values):
    """Return the finite int/float entries of *values* as a 1-D float array."""
    if values is None:
        return np.array([], dtype=float)
    kept = [
        v for v in values
        if isinstance(v, (int, float, np.integer, np.floating)) and np.isfinite(v)
    ]
    return np.array(kept, dtype=float)


def _effect_size_label(cohens_d):
    """Translate Cohen's d into the conventional verbal category."""
    magnitude = abs(cohens_d)
    if magnitude < 0.2:
        return "Negligible"
    if magnitude < 0.5:
        return "Small"
    if magnitude < 0.8:
        return "Medium"
    return "Large"


def _significance_label(p_value):
    """Return the star notation for *p_value* ('n/a' when it is NaN)."""
    if np.isnan(p_value):
        # The t-test is undefined, e.g. when both samples are constant.
        return "n/a"
    if p_value < 0.001:
        return "***"
    if p_value < 0.01:
        return "**"
    if p_value < 0.05:
        return "*"
    return "ns"


def _compare_measure(measure, data_a, data_b):
    """Return one table row comparing two score samples, or None if too small.

    Runs an independent two-sample t-test (scipy default: equal variances)
    and computes Cohen's d (A minus B) with the pooled standard deviation.
    """
    n_a, n_b = data_a.size, data_b.size
    if n_a < 2 or n_b < 2:
        return None

    t_stat, p_value = stats.ttest_ind(data_a, data_b)

    pooled_std = np.sqrt(
        ((n_a - 1) * np.var(data_a, ddof=1) + (n_b - 1) * np.var(data_b, ddof=1))
        / (n_a + n_b - 2)
    )
    cohens_d = (np.mean(data_a) - np.mean(data_b)) / pooled_std if pooled_std > 0 else 0

    return {
        'Measure': measure,
        't-statistic': round(float(t_stat), 3),
        'p-value': f"{p_value:.6f}",
        'Significance': _significance_label(p_value),
        "Cohen's d": round(float(cohens_d), 3),
        'Effect Size': _effect_size_label(cohens_d)
    }


def display_statistical_comparison(results_a, results_b):
    """Display statistical significance testing results.

    For every measure present in both ``results['summary']`` dicts, the raw
    scores in ``results['raw_scores']`` are compared with a t-test and
    Cohen's d. Measures with fewer than two usable scores in either text are
    skipped. The resulting table is shown in a collapsed expander below the
    completion message.

    Args:
        results_a: Analysis results for Text A.
        results_b: Analysis results for Text B.
    """
    st.subheader("📊 Statistical Analysis")

    if not results_a.get('summary') or not results_b.get('summary'):
        st.warning("No sophistication scores available for statistical analysis.")
        return

    raw_a = results_a.get('raw_scores') or {}
    raw_b = results_b.get('raw_scores') or {}

    # Statistical comparison table
    stat_data = []
    for measure in list(results_a['summary'].keys()):
        if measure not in results_b['summary']:
            continue
        row = _compare_measure(
            measure,
            _numeric_sample(raw_a.get(measure)),
            _numeric_sample(raw_b.get(measure)),
        )
        if row is not None:
            stat_data.append(row)

    if stat_data:
        st.write("Statistical analysis completed - results available in detailed outputs.")
        # Make the computed table reachable instead of discarding it.
        with st.expander("Detailed statistical results"):
            st.dataframe(pd.DataFrame(stat_data), use_container_width=True)


def display_token_comparison(results_a, results_b):
    """Show per-token details for both texts side by side, with CSV downloads.

    Each entry of ``results['token_details']`` becomes one table row: the
    basic fields (token, lemma, POS, tag, word type) followed by every other
    key of the entry as a score column, with the value 'NA' shown as 'N/A'.
    Only the first 100 rows of each table are rendered; the CSV downloads
    contain all rows.

    Args:
        results_a: Analysis results for Text A.
        results_b: Analysis results for Text B.
    """
    st.subheader("🔍 Token-Level Comparison")

    tokens_a = results_a.get('token_details')
    tokens_b = results_b.get('token_details')
    if not (tokens_a and tokens_b):
        st.warning("No token-level data available for comparison.")
        return

    basic_fields = ('id', 'token', 'lemma', 'pos', 'tag', 'word_type')

    def build_frame(tokens):
        """Turn a list of token dicts into a display dataframe."""
        rows = []
        for entry in tokens:
            record = {
                'Token': entry.get('token', ''),
                'Lemma': entry.get('lemma', ''),
                'POS': entry.get('pos', ''),
                "TAG": entry.get('tag', ''),
                'Type': entry.get('word_type', '')
            }
            # Everything that is not a basic field is a score column.
            record.update(
                (name, score if score != 'NA' else 'N/A')
                for name, score in entry.items()
                if name not in basic_fields
            )
            rows.append(record)
        return pd.DataFrame(rows)

    # (table heading, download label, download file name, dataframe)
    panels = (
        ("**Text A Token Details**", "Download Text A Tokens (CSV)",
         "text_a_tokens.csv", build_frame(tokens_a)),
        ("**Text B Token Details**", "Download Text B Tokens (CSV)",
         "text_b_tokens.csv", build_frame(tokens_b)),
    )

    # Tables side by side
    for column, (heading, _, _, frame) in zip(st.columns(2), panels):
        with column:
            st.write(heading)
            row_count = len(frame)
            if row_count > 100:
                st.write(f"(showing first 100 of {row_count} tokens)")
                visible = frame.head(100)
            else:
                st.write(f"({row_count} tokens)")
                visible = frame
            st.dataframe(visible, use_container_width=True)

    # Download options
    st.write("**Download Options**")
    for column, (_, button_label, csv_name, frame) in zip(st.columns(2), panels):
        with column:
            st.download_button(
                label=button_label,
                data=frame.to_csv(index=False),
                file_name=csv_name,
                mime="text/csv"
            )