egumasa commited on
Commit
864b9a2
·
1 Parent(s): 385ead1

plot function update

Browse files
pyproject.toml CHANGED
@@ -6,7 +6,7 @@ readme = "README.md"
6
  requires-python = ">=3.12"
7
  dependencies = [
8
  "streamlit>=1.28.0",
9
- "spacy[cuda12x]>=3.7.0",
10
  "pandas>=2.0.0",
11
  "numpy>=1.24.0,<2.0",
12
  "plotly>=5.15.0",
 
6
  requires-python = ">=3.12"
7
  dependencies = [
8
  "streamlit>=1.28.0",
9
+ "spacy>=3.7.0",
10
  "pandas>=2.0.0",
11
  "numpy>=1.24.0,<2.0",
12
  "plotly>=5.15.0",
text_analyzer/corpus_visualizer.py CHANGED
@@ -16,6 +16,9 @@ import re
16
  from io import StringIO
17
  import natsort
18
  import csv
 
 
 
19
 
20
  logger = logging.getLogger(__name__)
21
 
@@ -399,9 +402,9 @@ class CorpusVisualizer:
399
 
400
  def create_boxplot(self, x_column: str, y_column: str, color_column: Optional[str] = None,
401
  title: Optional[str] = None, height: int = 600,
402
- category_orders: Optional[Dict[str, List[str]]] = None) -> go.Figure:
403
  """
404
- Create a box plot visualization using Plotly.
405
 
406
  Args:
407
  x_column: Categorical column for x-axis
@@ -412,7 +415,7 @@ class CorpusVisualizer:
412
  category_orders: Optional custom category orders
413
 
414
  Returns:
415
- Plotly figure object
416
  """
417
  if self.merged_df is None:
418
  raise ValueError("Must perform merge before creating visualizations")
@@ -446,13 +449,27 @@ class CorpusVisualizer:
446
  category_orders=plot_category_orders)
447
 
448
  fig.update_layout(template="plotly_white")
449
- return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
450
 
451
  def create_scatterplot(self, x_column: str, y_column: str, color_column: Optional[str] = None,
452
  title: Optional[str] = None, height: int = 600,
453
- category_orders: Optional[Dict[str, List[str]]] = None) -> go.Figure:
 
454
  """
455
- Create a scatter plot visualization using Plotly.
456
 
457
  Args:
458
  x_column: Numeric column for x-axis
@@ -461,9 +478,11 @@ class CorpusVisualizer:
461
  title: Plot title
462
  height: Plot height
463
  category_orders: Optional custom category orders
 
 
464
 
465
  Returns:
466
- Plotly figure object
467
  """
468
  if self.merged_df is None:
469
  raise ValueError("Must perform merge before creating visualizations")
@@ -482,18 +501,83 @@ class CorpusVisualizer:
482
  else:
483
  plot_category_orders[color_column] = self.get_category_order(color_column, plot_df)
484
 
485
- # Create the plot
486
- fig = px.scatter(plot_df, x=x_column, y=y_column, color=color_column,
487
- title=title or f"Scatter Plot: {y_column} vs {x_column}", height=height,
488
- category_orders=plot_category_orders if plot_category_orders else None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
 
490
  fig.update_layout(template="plotly_white")
491
- return fig
492
 
493
  def export_merged_data(self) -> pd.DataFrame:
494
  """
495
  Export merged dataframe.
496
-
497
  Returns:
498
  pd.DataFrame: DataFrame ready for export
499
  """
@@ -501,3 +585,422 @@ class CorpusVisualizer:
501
  raise ValueError("Must perform merge before exporting")
502
 
503
  return self.merged_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  from io import StringIO
17
  import natsort
18
  import csv
19
+ from scipy import stats
20
+ from scipy.stats import f_oneway
21
+ import warnings
22
 
23
  logger = logging.getLogger(__name__)
24
 
 
402
 
403
  def create_boxplot(self, x_column: str, y_column: str, color_column: Optional[str] = None,
404
  title: Optional[str] = None, height: int = 600,
405
+ category_orders: Optional[Dict[str, List[str]]] = None) -> Tuple[go.Figure, Optional[Dict[str, Any]]]:
406
  """
407
+ Create a box plot visualization using Plotly with statistical analysis.
408
 
409
  Args:
410
  x_column: Categorical column for x-axis
 
415
  category_orders: Optional custom category orders
416
 
417
  Returns:
418
+ Tuple of (Plotly figure object, Statistical results dict)
419
  """
420
  if self.merged_df is None:
421
  raise ValueError("Must perform merge before creating visualizations")
 
449
  category_orders=plot_category_orders)
450
 
451
  fig.update_layout(template="plotly_white")
452
+
453
+ # Perform statistical analysis
454
+ stats_results = None
455
+ try:
456
+ if color_column:
457
+ # Two-way ANOVA
458
+ stats_results = self.perform_two_way_anova(plot_df, x_column, y_column, color_column)
459
+ else:
460
+ # One-way ANOVA
461
+ stats_results = self.perform_one_way_anova(plot_df, x_column, y_column)
462
+ except Exception as e:
463
+ stats_results = {"error": f"Statistical analysis failed: {str(e)}"}
464
+
465
+ return fig, stats_results
466
 
467
  def create_scatterplot(self, x_column: str, y_column: str, color_column: Optional[str] = None,
468
  title: Optional[str] = None, height: int = 600,
469
+ category_orders: Optional[Dict[str, List[str]]] = None,
470
+ add_trendline: bool = True, add_confidence_interval: bool = True) -> Tuple[go.Figure, Optional[Dict[str, Any]]]:
471
  """
472
+ Create a scatter plot visualization using Plotly with statistical analysis.
473
 
474
  Args:
475
  x_column: Numeric column for x-axis
 
478
  title: Plot title
479
  height: Plot height
480
  category_orders: Optional custom category orders
481
+ add_trendline: Whether to add regression line (default True)
482
+ add_confidence_interval: Whether to add confidence interval around trendline (default True)
483
 
484
  Returns:
485
+ Tuple of (Plotly figure object, Statistical results dict)
486
  """
487
  if self.merged_df is None:
488
  raise ValueError("Must perform merge before creating visualizations")
 
501
  else:
502
  plot_category_orders[color_column] = self.get_category_order(color_column, plot_df)
503
 
504
+ # Create the base scatter plot
505
+ if color_column:
506
+ fig = px.scatter(plot_df, x=x_column, y=y_column, color=color_column,
507
+ title=title or f"Scatter Plot: {y_column} vs {x_column}", height=height,
508
+ category_orders=plot_category_orders if plot_category_orders else None)
509
+ else:
510
+ fig = px.scatter(plot_df, x=x_column, y=y_column,
511
+ title=title or f"Scatter Plot: {y_column} vs {x_column}", height=height)
512
+
513
+ # Perform statistical analysis
514
+ stats_results = None
515
+ try:
516
+ stats_results = self.perform_simple_regression(plot_df, x_column, y_column)
517
+
518
+ # Add trendline and confidence interval if requested and regression successful
519
+ if add_trendline and 'error' not in stats_results:
520
+ clean_df = plot_df[[x_column, y_column]].dropna()
521
+ x_vals = clean_df[x_column].values
522
+ y_vals = clean_df[y_column].values
523
+
524
+ # Get regression parameters
525
+ slope = stats_results['regression']['slope']
526
+ intercept = stats_results['regression']['intercept']
527
+
528
+ # Create more detailed x range for smooth curves
529
+ x_min, x_max = x_vals.min(), x_vals.max()
530
+ x_range = np.linspace(x_min, x_max, 100)
531
+ y_range = slope * x_range + intercept
532
+
533
+ # Calculate confidence intervals if requested
534
+ if add_confidence_interval:
535
+ n = len(x_vals)
536
+ mean_x = np.mean(x_vals)
537
+ ss_x = np.sum((x_vals - mean_x) ** 2)
538
+ mse = np.sum((y_vals - (slope * x_vals + intercept)) ** 2) / (n - 2)
539
+
540
+ # Standard error for each prediction point
541
+ se_y = np.sqrt(mse * (1/n + (x_range - mean_x)**2 / ss_x))
542
+
543
+ # 95% confidence interval (t-distribution for small samples)
544
+ from scipy.stats import t
545
+ t_val = t.ppf(0.975, n - 2) # 95% confidence
546
+
547
+ y_upper = y_range + t_val * se_y
548
+ y_lower = y_range - t_val * se_y
549
+
550
+ # Add confidence interval as filled area
551
+ fig.add_trace(go.Scatter(
552
+ x=np.concatenate([x_range, x_range[::-1]]),
553
+ y=np.concatenate([y_upper, y_lower[::-1]]),
554
+ fill='toself',
555
+ fillcolor='rgba(255, 0, 0, 0.2)',
556
+ line=dict(color='rgba(255,255,255,0)'),
557
+ hoverinfo="skip",
558
+ showlegend=True,
559
+ name='95% Confidence Interval'
560
+ ))
561
+
562
+ # Add trendline to the plot
563
+ fig.add_trace(go.Scatter(
564
+ x=x_range,
565
+ y=y_range,
566
+ mode='lines',
567
+ name=f'Trendline (R² = {stats_results["regression"]["r_squared"]:.3f})',
568
+ line=dict(color='red', dash='dash', width=2)
569
+ ))
570
+
571
+ except Exception as e:
572
+ stats_results = {"error": f"Statistical analysis failed: {str(e)}"}
573
 
574
  fig.update_layout(template="plotly_white")
575
+ return fig, stats_results
576
 
577
  def export_merged_data(self) -> pd.DataFrame:
578
  """
579
  Export merged dataframe.
580
+
581
  Returns:
582
  pd.DataFrame: DataFrame ready for export
583
  """
 
585
  raise ValueError("Must perform merge before exporting")
586
 
587
  return self.merged_df
588
+
589
+ # Statistical Analysis Methods
590
+
591
def calculate_eta_squared(self, ss_between: float, ss_total: float) -> float:
    """
    Compute the eta-squared effect size for a one-way ANOVA.

    Eta-squared is the proportion of the total variance accounted for
    by the grouping factor: SS_between / SS_total.

    Args:
        ss_between: Sum of squares between groups
        ss_total: Total sum of squares

    Returns:
        float: Eta-squared value, or 0.0 when there is no variance at all
    """
    # Guard against division by zero when the outcome is constant.
    return ss_between / ss_total if ss_total != 0 else 0.0
605
+
606
def calculate_partial_eta_squared(self, ss_effect: float, ss_error: float) -> float:
    """
    Compute the partial eta-squared effect size for a factorial ANOVA.

    Partial eta-squared relates an effect's sum of squares to the effect
    plus error only: SS_effect / (SS_effect + SS_error).

    Args:
        ss_effect: Sum of squares for the effect
        ss_error: Sum of squares for error

    Returns:
        float: Partial eta-squared value, or 0.0 when the denominator is zero
    """
    denominator = ss_effect + ss_error
    if denominator == 0:
        return 0.0
    return ss_effect / denominator
620
+
621
def calculate_cohens_d(self, group1: np.ndarray, group2: np.ndarray) -> float:
    """
    Compute Cohen's d effect size between two independent groups.

    Standardizes the mean difference by the pooled sample (ddof=1)
    standard deviation of the two groups.

    Args:
        group1: Data for first group
        group2: Data for second group

    Returns:
        float: Cohen's d; 0.0 when either group has fewer than two
        observations or when the pooled standard deviation is zero
    """
    size1, size2 = len(group1), len(group2)
    # Sample variances (ddof=1) need at least two observations per group.
    if min(size1, size2) < 2:
        return 0.0

    pooled_variance = (
        (size1 - 1) * np.var(group1, ddof=1)
        + (size2 - 1) * np.var(group2, ddof=1)
    ) / (size1 + size2 - 2)
    pooled_sd = np.sqrt(pooled_variance)

    # Identical constant groups give zero spread; avoid dividing by it.
    if pooled_sd == 0:
        return 0.0

    return (np.mean(group1) - np.mean(group2)) / pooled_sd
644
+
645
def calculate_cohens_f_squared(self, r_squared: float) -> float:
    """
    Compute Cohen's f-squared effect size from a regression R².

    f² = R² / (1 - R²).

    Args:
        r_squared: R-squared value from regression

    Returns:
        float: Cohen's f²; 0.0 for out-of-range inputs (R² < 0 or R² >= 1,
        the latter avoiding a zero denominator)
    """
    out_of_range = r_squared >= 1.0 or r_squared < 0
    return 0.0 if out_of_range else r_squared / (1 - r_squared)
658
+
659
def interpret_effect_size(self, value: float, metric_type: str) -> str:
    """
    Provide a qualitative interpretation for an effect size.

    The original implementation had two consecutive branches per metric
    that both returned "Small" (dead duplicate code); they are collapsed
    here into a single threshold table with identical output for every
    input.

    Args:
        value: Effect size value (sign is ignored for Cohen's d)
        metric_type: Type of effect size ('eta_squared',
            'partial_eta_squared', 'cohens_d', 'r_squared', 'cohens_f')

    Returns:
        str: "Small", "Medium", or "Large"; "Unknown" for an
        unrecognized metric_type
    """
    # (small_upper, medium_upper) per metric: values below small_upper are
    # "Small", below medium_upper "Medium", otherwise "Large".
    thresholds = {
        'eta_squared': (0.06, 0.14),
        'partial_eta_squared': (0.06, 0.14),
        'cohens_d': (0.5, 0.8),
        'r_squared': (0.09, 0.25),
        'cohens_f': (0.15, 0.35),
    }

    if metric_type not in thresholds:
        return "Unknown"

    # Cohen's d is a signed difference; only its magnitude matters here.
    magnitude = abs(value) if metric_type == 'cohens_d' else value
    small_upper, medium_upper = thresholds[metric_type]

    if magnitude < small_upper:
        return "Small"
    if magnitude < medium_upper:
        return "Medium"
    return "Large"
709
+
710
def perform_one_way_anova(self, df: pd.DataFrame, x_column: str, y_column: str) -> Dict[str, Any]:
    """
    Perform a one-way ANOVA of y_column across the levels of x_column.

    Rows with a missing value in either column are dropped before the
    analysis. If the omnibus test is significant (p < .05) and more than
    two groups are present, uncorrected pairwise t-tests with Cohen's d
    are added under the "posthoc" key.

    Fix vs. previous version: the groups were collected with two separate
    groupby passes over the same data; they are now collected once.

    Args:
        df: DataFrame containing the data
        x_column: Categorical column (groups)
        y_column: Numeric column (dependent variable)

    Returns:
        Dict containing ANOVA results and effect sizes, or a dict with an
        "error" key describing why the analysis could not be run.
    """
    try:
        # Remove missing values
        clean_df = df[[x_column, y_column]].dropna()

        if len(clean_df) < 3:
            return {"error": "Insufficient data for ANOVA (need at least 3 observations)"}

        # Collect groups in a single pass; skip empty groups (possible when
        # x_column is Categorical with unused categories).
        group_names = []
        group_data = []
        for name, group in clean_df.groupby(x_column):
            values = group[y_column].values
            if len(values) > 0:
                group_names.append(name)
                group_data.append(values)

        if len(group_data) < 2:
            return {"error": "Need at least 2 groups for ANOVA"}

        # Omnibus F test
        f_stat, p_value = f_oneway(*group_data)

        # Effect size (eta-squared) from between/total sums of squares.
        grand_mean = clean_df[y_column].mean()
        ss_total = np.sum((clean_df[y_column] - grand_mean) ** 2)
        ss_between = sum(
            len(group) * (np.mean(group) - grand_mean) ** 2
            for group in group_data
        )
        eta_squared = self.calculate_eta_squared(ss_between, ss_total)

        # Degrees of freedom
        df_between = len(group_data) - 1
        df_within = len(clean_df) - len(group_data)

        results = {
            "test_type": "One-way ANOVA",
            "f_statistic": f_stat,
            "p_value": p_value,
            "df_between": df_between,
            "df_within": df_within,
            "eta_squared": eta_squared,
            "eta_squared_interpretation": self.interpret_effect_size(eta_squared, "eta_squared"),
            "sample_size": len(clean_df),
            "groups": group_names,
            "group_means": [np.mean(group) for group in group_data],
            "group_sizes": [len(group) for group in group_data]
        }

        # Post hoc analysis if significant and more than 2 groups.
        # NOTE: these are plain pairwise t-tests with NO multiple-comparison
        # correction (not Tukey HSD); interpret the p-values accordingly.
        if p_value < 0.05 and len(group_data) > 2:
            try:
                posthoc_results = []
                for i in range(len(group_data)):
                    for j in range(i + 1, len(group_data)):
                        # Cohen's d for this pair
                        cohens_d = self.calculate_cohens_d(group_data[i], group_data[j])

                        # Uncorrected independent-samples t-test (p-value only)
                        t_stat, t_p = stats.ttest_ind(group_data[i], group_data[j])

                        posthoc_results.append({
                            "group1": group_names[i],
                            "group2": group_names[j],
                            "cohens_d": cohens_d,
                            "cohens_d_interpretation": self.interpret_effect_size(cohens_d, "cohens_d"),
                            "p_value": t_p,
                            "mean_diff": np.mean(group_data[i]) - np.mean(group_data[j])
                        })

                results["posthoc"] = posthoc_results

            except Exception as e:
                results["posthoc_error"] = f"Error in post hoc analysis: {str(e)}"

        return results

    except Exception as e:
        return {"error": f"Error performing ANOVA: {str(e)}"}
806
+
807
def perform_two_way_anova(self, df: pd.DataFrame, x_column: str, y_column: str, color_column: str) -> Dict[str, Any]:
    """
    Perform two-way ANOVA analysis (hand-rolled sums-of-squares version).

    Rows with a missing value in any of the three columns are dropped
    first. Main-effect and interaction sums of squares are computed
    additively from marginal and cell means, and the error SS is taken
    as the remainder of the total.

    NOTE(review): this additive decomposition is only exact for balanced
    designs; with unbalanced cell sizes ss_error can come out negative —
    confirm whether unbalanced input is expected here.

    Args:
        df: DataFrame containing the data
        x_column: First factor (categorical)
        y_column: Dependent variable (numeric)
        color_column: Second factor (categorical)

    Returns:
        Dict containing two-way ANOVA results and effect sizes, or a dict
        with an "error" key when the analysis could not be run.
    """
    try:
        # Remove missing values
        clean_df = df[[x_column, y_column, color_column]].dropna()

        if len(clean_df) < 6:  # Need minimum samples for 2-way ANOVA
            return {"error": "Insufficient data for two-way ANOVA (need at least 6 observations)"}

        # Get factor levels
        factor1_levels = clean_df[x_column].unique()
        factor2_levels = clean_df[color_column].unique()

        if len(factor1_levels) < 2 or len(factor2_levels) < 2:
            return {"error": "Need at least 2 levels per factor for two-way ANOVA"}

        # Manual two-way ANOVA calculation
        grand_mean = clean_df[y_column].mean()
        n_total = len(clean_df)

        # Total sum of squares around the grand mean
        ss_total = np.sum((clean_df[y_column] - grand_mean) ** 2)

        # Factor A (x_column) effect: weighted squared deviation of each
        # level's marginal mean from the grand mean
        ss_a = 0
        for level in factor1_levels:
            group_data = clean_df[clean_df[x_column] == level][y_column]
            if len(group_data) > 0:
                ss_a += len(group_data) * (np.mean(group_data) - grand_mean) ** 2

        # Factor B (color_column) effect, computed the same way
        ss_b = 0
        for level in factor2_levels:
            group_data = clean_df[clean_df[color_column] == level][y_column]
            if len(group_data) > 0:
                ss_b += len(group_data) * (np.mean(group_data) - grand_mean) ** 2

        # Interaction effect: cell mean deviation beyond the two marginal
        # effects (cell - A marginal - B marginal + grand mean)
        ss_ab = 0
        for a_level in factor1_levels:
            for b_level in factor2_levels:
                cell_data = clean_df[(clean_df[x_column] == a_level) & (clean_df[color_column] == b_level)][y_column]
                if len(cell_data) > 0:
                    # Cell mean
                    cell_mean = np.mean(cell_data)
                    # Marginal means
                    a_mean = np.mean(clean_df[clean_df[x_column] == a_level][y_column])
                    b_mean = np.mean(clean_df[clean_df[color_column] == b_level][y_column])
                    # Interaction sum of squares
                    ss_ab += len(cell_data) * (cell_mean - a_mean - b_mean + grand_mean) ** 2

        # Error sum of squares (remainder; see NOTE in the docstring about
        # unbalanced designs)
        ss_error = ss_total - ss_a - ss_b - ss_ab

        # Degrees of freedom
        df_a = len(factor1_levels) - 1
        df_b = len(factor2_levels) - 1
        df_ab = df_a * df_b
        df_error = n_total - len(factor1_levels) * len(factor2_levels)

        # Mean squares; ms_error falls back to 1 so the F ratios below stay
        # defined when df_error <= 0
        ms_a = ss_a / df_a if df_a > 0 else 0
        ms_b = ss_b / df_b if df_b > 0 else 0
        ms_ab = ss_ab / df_ab if df_ab > 0 else 0
        ms_error = ss_error / df_error if df_error > 0 else 1

        # F statistics
        f_a = ms_a / ms_error if ms_error > 0 else 0
        f_b = ms_b / ms_error if ms_error > 0 else 0
        f_ab = ms_ab / ms_error if ms_error > 0 else 0

        # P values from the F distribution's upper tail; a zero F statistic
        # is reported as p = 1
        p_a = 1 - stats.f.cdf(f_a, df_a, df_error) if f_a > 0 else 1
        p_b = 1 - stats.f.cdf(f_b, df_b, df_error) if f_b > 0 else 1
        p_ab = 1 - stats.f.cdf(f_ab, df_ab, df_error) if f_ab > 0 else 1

        # Effect sizes (partial eta squared)
        eta_squared_a = self.calculate_partial_eta_squared(ss_a, ss_error)
        eta_squared_b = self.calculate_partial_eta_squared(ss_b, ss_error)
        eta_squared_ab = self.calculate_partial_eta_squared(ss_ab, ss_error)

        results = {
            "test_type": "Two-way ANOVA",
            "factor_a": {
                "name": x_column,
                "f_statistic": f_a,
                "p_value": p_a,
                "df": df_a,
                "partial_eta_squared": eta_squared_a,
                "interpretation": self.interpret_effect_size(eta_squared_a, "partial_eta_squared")
            },
            "factor_b": {
                "name": color_column,
                "f_statistic": f_b,
                "p_value": p_b,
                "df": df_b,
                "partial_eta_squared": eta_squared_b,
                "interpretation": self.interpret_effect_size(eta_squared_b, "partial_eta_squared")
            },
            "interaction": {
                "name": f"{x_column} × {color_column}",
                "f_statistic": f_ab,
                "p_value": p_ab,
                "df": df_ab,
                "partial_eta_squared": eta_squared_ab,
                "interpretation": self.interpret_effect_size(eta_squared_ab, "partial_eta_squared")
            },
            "df_error": df_error,
            "sample_size": n_total,
            "factor_a_levels": list(factor1_levels),
            "factor_b_levels": list(factor2_levels)
        }

        return results

    except Exception as e:
        return {"error": f"Error performing two-way ANOVA: {str(e)}"}
935
+
936
def perform_simple_regression(self, df: pd.DataFrame, x_column: str, y_column: str) -> Dict[str, Any]:
    """
    Perform simple linear regression analysis of y_column on x_column.

    Rows with a missing value in either column are dropped first.

    Fixes vs. previous version: the correlation interpretation compared
    |r| against r-squared benchmarks (wrong scale) — it now interprets
    r² itself; and the slope's standard error is taken directly from
    linregress instead of being recomputed by hand.

    Args:
        df: DataFrame containing the data
        x_column: Independent variable (numeric)
        y_column: Dependent variable (numeric)

    Returns:
        Dict containing correlation and regression results with effect
        sizes, or a dict with an "error" key when the analysis cannot run.
    """
    try:
        # Remove missing values
        clean_df = df[[x_column, y_column]].dropna()

        if len(clean_df) < 3:
            return {"error": "Insufficient data for regression (need at least 3 observations)"}

        x = clean_df[x_column].values
        y = clean_df[y_column].values

        # Pearson correlation with its own p-value
        correlation, corr_p = stats.pearsonr(x, y)

        # Simple linear regression; std_err is the standard error of the slope
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

        n = len(clean_df)
        r_squared = r_value ** 2
        adjusted_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - 2)

        # Effect size
        cohens_f_squared = self.calculate_cohens_f_squared(r_squared)

        # t-statistic for the slope (slope / its standard error)
        t_stat = slope / std_err if std_err > 0 else 0

        results = {
            "test_type": "Simple Linear Regression",
            "correlation": {
                "pearson_r": correlation,
                "p_value": corr_p,
                # Interpret variance explained (r²) against the r² benchmarks
                "interpretation": self.interpret_effect_size(r_squared, "r_squared")
            },
            "regression": {
                "slope": slope,
                "intercept": intercept,
                "r_squared": r_squared,
                "adjusted_r_squared": adjusted_r_squared,
                "p_value": p_value,
                "standard_error": std_err,
                "t_statistic": t_stat,
                "cohens_f_squared": cohens_f_squared,
                "f_squared_interpretation": self.interpret_effect_size(cohens_f_squared, "cohens_f")
            },
            "sample_size": n,
            "variance_explained": f"{r_squared * 100:.1f}%"
        }

        return results

    except Exception as e:
        return {"error": f"Error performing regression: {str(e)}"}
uv.lock CHANGED
@@ -1751,7 +1751,7 @@ requires-dist = [
1751
  { name = "plotly", specifier = ">=5.15.0" },
1752
  { name = "pyyaml", specifier = ">=6.0" },
1753
  { name = "scipy", specifier = ">=1.11.0" },
1754
- { name = "spacy", extras = ["cuda11", "cuda12"], specifier = ">=3.7.0" },
1755
  { name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
1756
  { name = "spacy-transformers", specifier = ">=1.3.0" },
1757
  { name = "streamlit", specifier = ">=1.28.0" },
 
1751
  { name = "plotly", specifier = ">=5.15.0" },
1752
  { name = "pyyaml", specifier = ">=6.0" },
1753
  { name = "scipy", specifier = ">=1.11.0" },
1754
+ { name = "spacy", specifier = ">=3.7.0" },
1755
  { name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
1756
  { name = "spacy-transformers", specifier = ">=1.3.0" },
1757
  { name = "streamlit", specifier = ">=1.28.0" },
web_app/app.py CHANGED
@@ -41,7 +41,7 @@ st.set_page_config(
41
 
42
  def main():
43
  """Main application entry point."""
44
- st.title("�� Linguistic Data Analysis I - Text Analysis Tools")
45
  st.markdown("*Educational tools for lexical sophistication analysis, POS/dependency parsing, and word frequency visualization*")
46
 
47
  # GPU status is already initialized in gpu_init module
 
41
 
42
  def main():
43
  """Main application entry point."""
44
+ st.title("📊 Linguistic Data Analysis I - Text Analysis Tools")
45
  st.markdown("*Educational tools for lexical sophistication analysis, POS/dependency parsing, and word frequency visualization*")
46
 
47
  # GPU status is already initialized in gpu_init module
web_app/handlers/corpus_viz_handlers.py CHANGED
@@ -695,7 +695,7 @@ class CorpusVizHandlers:
695
  # Get custom category orders if set
696
  category_orders = st.session_state.corpus_viz_category_orders
697
 
698
- fig = visualizer.create_boxplot(
699
  x_column=x_column,
700
  y_column=y_column,
701
  color_column=color_column,
@@ -710,7 +710,8 @@ class CorpusVizHandlers:
710
  'title': title,
711
  'x_column': x_column,
712
  'y_column': y_column,
713
- 'color_column': color_column
 
714
  }
715
 
716
  # Add to plots list
@@ -763,12 +764,6 @@ class CorpusVizHandlers:
763
  if size_column == "None":
764
  size_column = None
765
 
766
- add_trendline = st.checkbox(
767
- "Add Trendline",
768
- value=False,
769
- help="Add a linear trendline to the scatter plot"
770
- )
771
-
772
  title = st.text_input(
773
  "Plot Title",
774
  value=f"Scatter Plot: {y_column} vs {x_column}",
@@ -782,13 +777,15 @@ class CorpusVizHandlers:
782
  # Get custom category orders if set
783
  category_orders = st.session_state.corpus_viz_category_orders
784
 
785
- # Note: Current implementation only supports basic parameters
786
- fig = visualizer.create_scatterplot(
787
  x_column=x_column,
788
  y_column=y_column,
789
  color_column=color_column,
790
  title=title,
791
- category_orders=category_orders
 
 
792
  )
793
 
794
  # Store in session state
@@ -800,7 +797,9 @@ class CorpusVizHandlers:
800
  'y_column': y_column,
801
  'color_column': color_column,
802
  'size_column': size_column,
803
- 'add_trendline': add_trendline
 
 
804
  }
805
 
806
  # Add to plots list
@@ -813,10 +812,124 @@ class CorpusVizHandlers:
813
  except Exception as e:
814
  st.error(f"Error creating scatter plot: {str(e)}")
815
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816
  @staticmethod
817
  def render_visualization_results():
818
  """
819
- Render visualization results.
820
  """
821
  plots = st.session_state.corpus_viz_plots
822
 
@@ -831,6 +944,12 @@ class CorpusVizHandlers:
831
  # Display the plot
832
  st.plotly_chart(plot_config['figure'], use_container_width=True)
833
 
 
 
 
 
 
 
834
  # Plot details
835
  col1, col2, col3 = st.columns(3)
836
 
@@ -848,7 +967,7 @@ class CorpusVizHandlers:
848
  with col3:
849
  if plot_config['color_column']:
850
  st.write(f"**Color By:** {plot_config['color_column']}")
851
- if plot_config['type'] == 'scatter' and plot_config['size_column']:
852
  st.write(f"**Size By:** {plot_config['size_column']}")
853
 
854
  # Remove plot button
 
695
  # Get custom category orders if set
696
  category_orders = st.session_state.corpus_viz_category_orders
697
 
698
+ fig, stats_results = visualizer.create_boxplot(
699
  x_column=x_column,
700
  y_column=y_column,
701
  color_column=color_column,
 
710
  'title': title,
711
  'x_column': x_column,
712
  'y_column': y_column,
713
+ 'color_column': color_column,
714
+ 'stats_results': stats_results
715
  }
716
 
717
  # Add to plots list
 
764
  if size_column == "None":
765
  size_column = None
766
 
 
 
 
 
 
 
767
  title = st.text_input(
768
  "Plot Title",
769
  value=f"Scatter Plot: {y_column} vs {x_column}",
 
777
  # Get custom category orders if set
778
  category_orders = st.session_state.corpus_viz_category_orders
779
 
780
+ # Create scatter plot with statistical analysis (trendline and confidence intervals enabled by default)
781
+ fig, stats_results = visualizer.create_scatterplot(
782
  x_column=x_column,
783
  y_column=y_column,
784
  color_column=color_column,
785
  title=title,
786
+ category_orders=category_orders,
787
+ add_trendline=True,
788
+ add_confidence_interval=True
789
  )
790
 
791
  # Store in session state
 
797
  'y_column': y_column,
798
  'color_column': color_column,
799
  'size_column': size_column,
800
+ 'add_trendline': True,
801
+ 'add_confidence_interval': True,
802
+ 'stats_results': stats_results
803
  }
804
 
805
  # Add to plots list
 
812
  except Exception as e:
813
  st.error(f"Error creating scatter plot: {str(e)}")
814
 
815
@staticmethod
def render_statistical_results(stats_results: Dict[str, Any]):
    """
    Render statistical analysis results in a formatted (APA-style) way.

    Fixes vs. previous version: the post-hoc table was labelled
    "Tukey HSD" although the backend computes uncorrected pairwise
    t-tests — the label now states what is actually computed; and the
    post-hoc p-value display now uses "< .001" consistently with the
    other tables.

    Args:
        stats_results: Statistical results dictionary produced by
            CorpusVisualizer (one-way/two-way ANOVA or simple linear
            regression), or a dict with an "error" key.
    """
    # Nothing to render, or an upstream failure to surface to the user
    if not stats_results or 'error' in stats_results:
        if stats_results and 'error' in stats_results:
            st.error(f"Statistical analysis error: {stats_results['error']}")
        return

    st.write("### 📊 Statistical Analysis")

    test_type = stats_results.get('test_type', 'Unknown')

    if test_type == "One-way ANOVA":
        # One-way ANOVA results in APA table format
        f_stat = stats_results.get('f_statistic', 0)
        p_val = stats_results.get('p_value', 1)
        p_display = f"{p_val:.3f}" if p_val >= 0.001 else "< .001"
        eta_sq = stats_results.get('eta_squared', 0)
        interpretation = stats_results.get('eta_squared_interpretation', 'Unknown')
        df_between = stats_results.get('df_between', 0)
        df_within = stats_results.get('df_within', 0)

        # Create APA-style table
        anova_table = f"""
| Statistic | Value | df | p | η² | Effect Size |
|-----------|-------|----|----|----|-----------|
| F | {f_stat:.3f} | ({df_between}, {df_within}) | {p_display} | {eta_sq:.3f} | {interpretation} |
"""
        st.markdown(anova_table)

        # Sample size
        sample_size = stats_results.get('sample_size', 0)
        st.write(f"**Model:** F({df_between}, {df_within}) = {f_stat:.3f}, p = {p_display}, N = {sample_size}")

        # Post hoc results (uncorrected pairwise t-tests, matching the
        # backend's perform_one_way_anova — NOT Tukey HSD)
        if 'posthoc' in stats_results and stats_results['posthoc']:
            st.write("**Post Hoc Comparisons (pairwise t-tests, uncorrected):**")

            posthoc_data = []
            for comparison in stats_results['posthoc']:
                group1 = comparison.get('group1', '')
                group2 = comparison.get('group2', '')
                cohens_d = comparison.get('cohens_d', 0)
                d_interp = comparison.get('cohens_d_interpretation', '')
                p_val = comparison.get('p_value', 1)
                mean_diff = comparison.get('mean_diff', 0)

                # Consistent APA-style p display ("< .001", as above)
                p_display = f"{p_val:.3f}" if p_val >= 0.001 else "< .001"

                posthoc_data.append({
                    'Comparison': f"{group1} vs {group2}",
                    'Mean Diff': f"{mean_diff:.3f}",
                    "Cohen's d": f"{cohens_d:.3f}",
                    'Effect Size': d_interp,
                    'p-value': p_display
                })

            if posthoc_data:
                st.dataframe(pd.DataFrame(posthoc_data), use_container_width=True)

    elif test_type == "Two-way ANOVA":
        # Two-way ANOVA results in APA table format
        st.write("**Main Effects and Interaction:**")

        factor_a = stats_results.get('factor_a', {})
        factor_b = stats_results.get('factor_b', {})
        interaction = stats_results.get('interaction', {})
        df_error = stats_results.get('df_error', 0)

        # Create APA-style table, one row per effect
        twoway_table = "| Source | F | df | p | ηp² | Effect Size |\n|--------|---|----|----|-----|-------------|\n"

        for effect_name, effect_data in [("Factor A", factor_a), ("Factor B", factor_b), ("A × B", interaction)]:
            f_stat = effect_data.get('f_statistic', 0)
            p_val = effect_data.get('p_value', 1)
            df = effect_data.get('df', 0)
            eta_sq = effect_data.get('partial_eta_squared', 0)
            interpretation = effect_data.get('interpretation', '')

            p_display = f"{p_val:.3f}" if p_val >= 0.001 else "< .001"

            twoway_table += f"| {effect_name} | {f_stat:.3f} | ({df}, {df_error}) | {p_display} | {eta_sq:.3f} | {interpretation} |\n"

        st.markdown(twoway_table)

        sample_size = stats_results.get('sample_size', 0)
        st.write(f"**Sample Size:** N = {sample_size}, df_error = {df_error}")

    elif test_type == "Simple Linear Regression":
        # Bivariate correlation results in APA table format
        correlation = stats_results.get('correlation', {})
        regression = stats_results.get('regression', {})

        pearson_r = correlation.get('pearson_r', 0)
        corr_p = correlation.get('p_value', 1)
        corr_p_display = f"{corr_p:.3f}" if corr_p >= 0.001 else "< .001"
        corr_interp = correlation.get('interpretation', 'Unknown')
        r_squared = regression.get('r_squared', 0)

        # Create APA-style table for bivariate correlation
        correlation_table = f"""
| Statistic | Value | p | Effect Size |
|-----------|-------|---|-------------|
| Pearson r | {pearson_r:.3f} | {corr_p_display} | {corr_interp} |
| R² | {r_squared:.3f} | {corr_p_display} | - |
"""
        st.markdown(correlation_table)
928
+
929
  @staticmethod
930
  def render_visualization_results():
931
  """
932
+ Render visualization results with statistical analysis.
933
  """
934
  plots = st.session_state.corpus_viz_plots
935
 
 
944
  # Display the plot
945
  st.plotly_chart(plot_config['figure'], use_container_width=True)
946
 
947
+ # Display statistical results if available
948
+ if 'stats_results' in plot_config and plot_config['stats_results']:
949
+ CorpusVizHandlers.render_statistical_results(plot_config['stats_results'])
950
+
951
+ st.markdown("---")
952
+
953
  # Plot details
954
  col1, col2, col3 = st.columns(3)
955
 
 
967
  with col3:
968
  if plot_config['color_column']:
969
  st.write(f"**Color By:** {plot_config['color_column']}")
970
+ if plot_config['type'] == 'scatter' and plot_config.get('size_column'):
971
  st.write(f"**Size By:** {plot_config['size_column']}")
972
 
973
  # Remove plot button