egumasa commited on
Commit
864b9a2
·
1 Parent(s): 385ead1

plot function update

Browse files
pyproject.toml CHANGED
@@ -6,7 +6,7 @@ readme = "README.md"
6
  requires-python = ">=3.12"
7
  dependencies = [
8
  "streamlit>=1.28.0",
9
- "spacy[cuda12x]>=3.7.0",
10
  "pandas>=2.0.0",
11
  "numpy>=1.24.0,<2.0",
12
  "plotly>=5.15.0",
 
6
  requires-python = ">=3.12"
7
  dependencies = [
8
  "streamlit>=1.28.0",
9
+ "spacy>=3.7.0",
10
  "pandas>=2.0.0",
11
  "numpy>=1.24.0,<2.0",
12
  "plotly>=5.15.0",
text_analyzer/corpus_visualizer.py CHANGED
@@ -16,6 +16,9 @@ import re
16
  from io import StringIO
17
  import natsort
18
  import csv
 
 
 
19
 
20
  logger = logging.getLogger(__name__)
21
 
@@ -399,9 +402,9 @@ class CorpusVisualizer:
399
 
400
  def create_boxplot(self, x_column: str, y_column: str, color_column: Optional[str] = None,
401
  title: Optional[str] = None, height: int = 600,
402
- category_orders: Optional[Dict[str, List[str]]] = None) -> go.Figure:
403
  """
404
- Create a box plot visualization using Plotly.
405
 
406
  Args:
407
  x_column: Categorical column for x-axis
@@ -412,7 +415,7 @@ class CorpusVisualizer:
412
  category_orders: Optional custom category orders
413
 
414
  Returns:
415
- Plotly figure object
416
  """
417
  if self.merged_df is None:
418
  raise ValueError("Must perform merge before creating visualizations")
@@ -446,13 +449,27 @@ class CorpusVisualizer:
446
  category_orders=plot_category_orders)
447
 
448
  fig.update_layout(template="plotly_white")
449
- return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
450
 
451
  def create_scatterplot(self, x_column: str, y_column: str, color_column: Optional[str] = None,
452
  title: Optional[str] = None, height: int = 600,
453
- category_orders: Optional[Dict[str, List[str]]] = None) -> go.Figure:
 
454
  """
455
- Create a scatter plot visualization using Plotly.
456
 
457
  Args:
458
  x_column: Numeric column for x-axis
@@ -461,9 +478,11 @@ class CorpusVisualizer:
461
  title: Plot title
462
  height: Plot height
463
  category_orders: Optional custom category orders
 
 
464
 
465
  Returns:
466
- Plotly figure object
467
  """
468
  if self.merged_df is None:
469
  raise ValueError("Must perform merge before creating visualizations")
@@ -482,18 +501,83 @@ class CorpusVisualizer:
482
  else:
483
  plot_category_orders[color_column] = self.get_category_order(color_column, plot_df)
484
 
485
- # Create the plot
486
- fig = px.scatter(plot_df, x=x_column, y=y_column, color=color_column,
487
- title=title or f"Scatter Plot: {y_column} vs {x_column}", height=height,
488
- category_orders=plot_category_orders if plot_category_orders else None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
 
490
  fig.update_layout(template="plotly_white")
491
- return fig
492
 
493
  def export_merged_data(self) -> pd.DataFrame:
494
  """
495
  Export merged dataframe.
496
-
497
  Returns:
498
  pd.DataFrame: DataFrame ready for export
499
  """
@@ -501,3 +585,422 @@ class CorpusVisualizer:
501
  raise ValueError("Must perform merge before exporting")
502
 
503
  return self.merged_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  from io import StringIO
17
  import natsort
18
  import csv
19
+ from scipy import stats
20
+ from scipy.stats import f_oneway
21
+ import warnings
22
 
23
  logger = logging.getLogger(__name__)
24
 
 
402
 
403
  def create_boxplot(self, x_column: str, y_column: str, color_column: Optional[str] = None,
404
  title: Optional[str] = None, height: int = 600,
405
+ category_orders: Optional[Dict[str, List[str]]] = None) -> Tuple[go.Figure, Optional[Dict[str, Any]]]:
406
  """
407
+ Create a box plot visualization using Plotly with statistical analysis.
408
 
409
  Args:
410
  x_column: Categorical column for x-axis
 
415
  category_orders: Optional custom category orders
416
 
417
  Returns:
418
+ Tuple of (Plotly figure object, Statistical results dict)
419
  """
420
  if self.merged_df is None:
421
  raise ValueError("Must perform merge before creating visualizations")
 
449
  category_orders=plot_category_orders)
450
 
451
  fig.update_layout(template="plotly_white")
452
+
453
+ # Perform statistical analysis
454
+ stats_results = None
455
+ try:
456
+ if color_column:
457
+ # Two-way ANOVA
458
+ stats_results = self.perform_two_way_anova(plot_df, x_column, y_column, color_column)
459
+ else:
460
+ # One-way ANOVA
461
+ stats_results = self.perform_one_way_anova(plot_df, x_column, y_column)
462
+ except Exception as e:
463
+ stats_results = {"error": f"Statistical analysis failed: {str(e)}"}
464
+
465
+ return fig, stats_results
466
 
467
  def create_scatterplot(self, x_column: str, y_column: str, color_column: Optional[str] = None,
468
  title: Optional[str] = None, height: int = 600,
469
+ category_orders: Optional[Dict[str, List[str]]] = None,
470
+ add_trendline: bool = True, add_confidence_interval: bool = True) -> Tuple[go.Figure, Optional[Dict[str, Any]]]:
471
  """
472
+ Create a scatter plot visualization using Plotly with statistical analysis.
473
 
474
  Args:
475
  x_column: Numeric column for x-axis
 
478
  title: Plot title
479
  height: Plot height
480
  category_orders: Optional custom category orders
481
+ add_trendline: Whether to add regression line (default True)
482
+ add_confidence_interval: Whether to add confidence interval around trendline (default True)
483
 
484
  Returns:
485
+ Tuple of (Plotly figure object, Statistical results dict)
486
  """
487
  if self.merged_df is None:
488
  raise ValueError("Must perform merge before creating visualizations")
 
501
  else:
502
  plot_category_orders[color_column] = self.get_category_order(color_column, plot_df)
503
 
504
+ # Create the base scatter plot
505
+ if color_column:
506
+ fig = px.scatter(plot_df, x=x_column, y=y_column, color=color_column,
507
+ title=title or f"Scatter Plot: {y_column} vs {x_column}", height=height,
508
+ category_orders=plot_category_orders if plot_category_orders else None)
509
+ else:
510
+ fig = px.scatter(plot_df, x=x_column, y=y_column,
511
+ title=title or f"Scatter Plot: {y_column} vs {x_column}", height=height)
512
+
513
+ # Perform statistical analysis
514
+ stats_results = None
515
+ try:
516
+ stats_results = self.perform_simple_regression(plot_df, x_column, y_column)
517
+
518
+ # Add trendline and confidence interval if requested and regression successful
519
+ if add_trendline and 'error' not in stats_results:
520
+ clean_df = plot_df[[x_column, y_column]].dropna()
521
+ x_vals = clean_df[x_column].values
522
+ y_vals = clean_df[y_column].values
523
+
524
+ # Get regression parameters
525
+ slope = stats_results['regression']['slope']
526
+ intercept = stats_results['regression']['intercept']
527
+
528
+ # Create more detailed x range for smooth curves
529
+ x_min, x_max = x_vals.min(), x_vals.max()
530
+ x_range = np.linspace(x_min, x_max, 100)
531
+ y_range = slope * x_range + intercept
532
+
533
+ # Calculate confidence intervals if requested
534
+ if add_confidence_interval:
535
+ n = len(x_vals)
536
+ mean_x = np.mean(x_vals)
537
+ ss_x = np.sum((x_vals - mean_x) ** 2)
538
+ mse = np.sum((y_vals - (slope * x_vals + intercept)) ** 2) / (n - 2)
539
+
540
+ # Standard error for each prediction point
541
+ se_y = np.sqrt(mse * (1/n + (x_range - mean_x)**2 / ss_x))
542
+
543
+ # 95% confidence interval (t-distribution for small samples)
544
+ from scipy.stats import t
545
+ t_val = t.ppf(0.975, n - 2) # 95% confidence
546
+
547
+ y_upper = y_range + t_val * se_y
548
+ y_lower = y_range - t_val * se_y
549
+
550
+ # Add confidence interval as filled area
551
+ fig.add_trace(go.Scatter(
552
+ x=np.concatenate([x_range, x_range[::-1]]),
553
+ y=np.concatenate([y_upper, y_lower[::-1]]),
554
+ fill='toself',
555
+ fillcolor='rgba(255, 0, 0, 0.2)',
556
+ line=dict(color='rgba(255,255,255,0)'),
557
+ hoverinfo="skip",
558
+ showlegend=True,
559
+ name='95% Confidence Interval'
560
+ ))
561
+
562
+ # Add trendline to the plot
563
+ fig.add_trace(go.Scatter(
564
+ x=x_range,
565
+ y=y_range,
566
+ mode='lines',
567
+ name=f'Trendline (R² = {stats_results["regression"]["r_squared"]:.3f})',
568
+ line=dict(color='red', dash='dash', width=2)
569
+ ))
570
+
571
+ except Exception as e:
572
+ stats_results = {"error": f"Statistical analysis failed: {str(e)}"}
573
 
574
  fig.update_layout(template="plotly_white")
575
+ return fig, stats_results
576
 
577
  def export_merged_data(self) -> pd.DataFrame:
578
  """
579
  Export merged dataframe.
580
+
581
  Returns:
582
  pd.DataFrame: DataFrame ready for export
583
  """
 
585
  raise ValueError("Must perform merge before exporting")
586
 
587
  return self.merged_df
588
+
589
+ # Statistical Analysis Methods
590
+
591
def calculate_eta_squared(self, ss_between: float, ss_total: float) -> float:
    """
    Compute the eta-squared effect size for a one-way ANOVA.

    Eta-squared is the proportion of the total variance accounted for
    by the grouping factor: SS_between / SS_total.

    Args:
        ss_between: Sum of squares between groups
        ss_total: Total sum of squares

    Returns:
        float: Eta-squared value, or 0.0 when there is no variance at all
    """
    # Guard against division by zero when the outcome is constant.
    return ss_between / ss_total if ss_total != 0 else 0.0
605
+
606
def calculate_partial_eta_squared(self, ss_effect: float, ss_error: float) -> float:
    """
    Compute the partial eta-squared effect size for a factorial ANOVA.

    Partial eta-squared relates an effect's sum of squares to the effect
    plus error only: SS_effect / (SS_effect + SS_error).

    Args:
        ss_effect: Sum of squares for the effect
        ss_error: Sum of squares for error

    Returns:
        float: Partial eta-squared value, or 0.0 when the denominator is zero
    """
    denominator = ss_effect + ss_error
    if denominator == 0:
        return 0.0
    return ss_effect / denominator
620
+
621
def calculate_cohens_d(self, group1: np.ndarray, group2: np.ndarray) -> float:
    """
    Compute Cohen's d effect size between two independent groups.

    Standardizes the mean difference by the pooled sample (ddof=1)
    standard deviation of the two groups.

    Args:
        group1: Data for first group
        group2: Data for second group

    Returns:
        float: Cohen's d; 0.0 when either group has fewer than two
        observations or when the pooled standard deviation is zero
    """
    size1, size2 = len(group1), len(group2)
    # Sample variances (ddof=1) need at least two observations per group.
    if min(size1, size2) < 2:
        return 0.0

    pooled_variance = (
        (size1 - 1) * np.var(group1, ddof=1)
        + (size2 - 1) * np.var(group2, ddof=1)
    ) / (size1 + size2 - 2)
    pooled_sd = np.sqrt(pooled_variance)

    # Identical constant groups give zero spread; avoid dividing by it.
    if pooled_sd == 0:
        return 0.0

    return (np.mean(group1) - np.mean(group2)) / pooled_sd
644
+
645
def calculate_cohens_f_squared(self, r_squared: float) -> float:
    """
    Compute Cohen's f-squared effect size from a regression R².

    f² = R² / (1 - R²).

    Args:
        r_squared: R-squared value from regression

    Returns:
        float: Cohen's f²; 0.0 for out-of-range inputs (R² < 0 or R² >= 1,
        the latter avoiding a zero denominator)
    """
    out_of_range = r_squared >= 1.0 or r_squared < 0
    return 0.0 if out_of_range else r_squared / (1 - r_squared)
658
+
659
def interpret_effect_size(self, value: float, metric_type: str) -> str:
    """
    Provide a qualitative interpretation for an effect size.

    The original implementation had two consecutive branches per metric
    that both returned "Small" (dead duplicate code); they are collapsed
    here into a single threshold table with identical output for every
    input.

    Args:
        value: Effect size value (sign is ignored for Cohen's d)
        metric_type: Type of effect size ('eta_squared',
            'partial_eta_squared', 'cohens_d', 'r_squared', 'cohens_f')

    Returns:
        str: "Small", "Medium", or "Large"; "Unknown" for an
        unrecognized metric_type
    """
    # (small_upper, medium_upper) per metric: values below small_upper are
    # "Small", below medium_upper "Medium", otherwise "Large".
    thresholds = {
        'eta_squared': (0.06, 0.14),
        'partial_eta_squared': (0.06, 0.14),
        'cohens_d': (0.5, 0.8),
        'r_squared': (0.09, 0.25),
        'cohens_f': (0.15, 0.35),
    }

    if metric_type not in thresholds:
        return "Unknown"

    # Cohen's d is a signed difference; only its magnitude matters here.
    magnitude = abs(value) if metric_type == 'cohens_d' else value
    small_upper, medium_upper = thresholds[metric_type]

    if magnitude < small_upper:
        return "Small"
    if magnitude < medium_upper:
        return "Medium"
    return "Large"
709
+
710
def perform_one_way_anova(self, df: pd.DataFrame, x_column: str, y_column: str) -> Dict[str, Any]:
    """
    Perform a one-way ANOVA of y_column across the levels of x_column.

    Rows with a missing value in either column are dropped before the
    analysis. If the omnibus test is significant (p < .05) and more than
    two groups are present, uncorrected pairwise t-tests with Cohen's d
    are added under the "posthoc" key.

    Fix vs. previous version: the groups were collected with two separate
    groupby passes over the same data; they are now collected once.

    Args:
        df: DataFrame containing the data
        x_column: Categorical column (groups)
        y_column: Numeric column (dependent variable)

    Returns:
        Dict containing ANOVA results and effect sizes, or a dict with an
        "error" key describing why the analysis could not be run.
    """
    try:
        # Remove missing values
        clean_df = df[[x_column, y_column]].dropna()

        if len(clean_df) < 3:
            return {"error": "Insufficient data for ANOVA (need at least 3 observations)"}

        # Collect groups in a single pass; skip empty groups (possible when
        # x_column is Categorical with unused categories).
        group_names = []
        group_data = []
        for name, group in clean_df.groupby(x_column):
            values = group[y_column].values
            if len(values) > 0:
                group_names.append(name)
                group_data.append(values)

        if len(group_data) < 2:
            return {"error": "Need at least 2 groups for ANOVA"}

        # Omnibus F test
        f_stat, p_value = f_oneway(*group_data)

        # Effect size (eta-squared) from between/total sums of squares.
        grand_mean = clean_df[y_column].mean()
        ss_total = np.sum((clean_df[y_column] - grand_mean) ** 2)
        ss_between = sum(
            len(group) * (np.mean(group) - grand_mean) ** 2
            for group in group_data
        )
        eta_squared = self.calculate_eta_squared(ss_between, ss_total)

        # Degrees of freedom
        df_between = len(group_data) - 1
        df_within = len(clean_df) - len(group_data)

        results = {
            "test_type": "One-way ANOVA",
            "f_statistic": f_stat,
            "p_value": p_value,
            "df_between": df_between,
            "df_within": df_within,
            "eta_squared": eta_squared,
            "eta_squared_interpretation": self.interpret_effect_size(eta_squared, "eta_squared"),
            "sample_size": len(clean_df),
            "groups": group_names,
            "group_means": [np.mean(group) for group in group_data],
            "group_sizes": [len(group) for group in group_data]
        }

        # Post hoc analysis if significant and more than 2 groups.
        # NOTE: these are plain pairwise t-tests with NO multiple-comparison
        # correction (not Tukey HSD); interpret the p-values accordingly.
        if p_value < 0.05 and len(group_data) > 2:
            try:
                posthoc_results = []
                for i in range(len(group_data)):
                    for j in range(i + 1, len(group_data)):
                        # Cohen's d for this pair
                        cohens_d = self.calculate_cohens_d(group_data[i], group_data[j])

                        # Uncorrected independent-samples t-test (p-value only)
                        t_stat, t_p = stats.ttest_ind(group_data[i], group_data[j])

                        posthoc_results.append({
                            "group1": group_names[i],
                            "group2": group_names[j],
                            "cohens_d": cohens_d,
                            "cohens_d_interpretation": self.interpret_effect_size(cohens_d, "cohens_d"),
                            "p_value": t_p,
                            "mean_diff": np.mean(group_data[i]) - np.mean(group_data[j])
                        })

                results["posthoc"] = posthoc_results

            except Exception as e:
                results["posthoc_error"] = f"Error in post hoc analysis: {str(e)}"

        return results

    except Exception as e:
        return {"error": f"Error performing ANOVA: {str(e)}"}
806
+
807
def perform_two_way_anova(self, df: pd.DataFrame, x_column: str, y_column: str, color_column: str) -> Dict[str, Any]:
    """
    Perform two-way ANOVA analysis (hand-rolled sums-of-squares version).

    Rows with a missing value in any of the three columns are dropped
    first. Main-effect and interaction sums of squares are computed
    additively from marginal and cell means, and the error SS is taken
    as the remainder of the total.

    NOTE(review): this additive decomposition is only exact for balanced
    designs; with unbalanced cell sizes ss_error can come out negative —
    confirm whether unbalanced input is expected here.

    Args:
        df: DataFrame containing the data
        x_column: First factor (categorical)
        y_column: Dependent variable (numeric)
        color_column: Second factor (categorical)

    Returns:
        Dict containing two-way ANOVA results and effect sizes, or a dict
        with an "error" key when the analysis could not be run.
    """
    try:
        # Remove missing values
        clean_df = df[[x_column, y_column, color_column]].dropna()

        if len(clean_df) < 6:  # Need minimum samples for 2-way ANOVA
            return {"error": "Insufficient data for two-way ANOVA (need at least 6 observations)"}

        # Get factor levels
        factor1_levels = clean_df[x_column].unique()
        factor2_levels = clean_df[color_column].unique()

        if len(factor1_levels) < 2 or len(factor2_levels) < 2:
            return {"error": "Need at least 2 levels per factor for two-way ANOVA"}

        # Manual two-way ANOVA calculation
        grand_mean = clean_df[y_column].mean()
        n_total = len(clean_df)

        # Total sum of squares around the grand mean
        ss_total = np.sum((clean_df[y_column] - grand_mean) ** 2)

        # Factor A (x_column) effect: weighted squared deviation of each
        # level's marginal mean from the grand mean
        ss_a = 0
        for level in factor1_levels:
            group_data = clean_df[clean_df[x_column] == level][y_column]
            if len(group_data) > 0:
                ss_a += len(group_data) * (np.mean(group_data) - grand_mean) ** 2

        # Factor B (color_column) effect, computed the same way
        ss_b = 0
        for level in factor2_levels:
            group_data = clean_df[clean_df[color_column] == level][y_column]
            if len(group_data) > 0:
                ss_b += len(group_data) * (np.mean(group_data) - grand_mean) ** 2

        # Interaction effect: cell mean deviation beyond the two marginal
        # effects (cell - A marginal - B marginal + grand mean)
        ss_ab = 0
        for a_level in factor1_levels:
            for b_level in factor2_levels:
                cell_data = clean_df[(clean_df[x_column] == a_level) & (clean_df[color_column] == b_level)][y_column]
                if len(cell_data) > 0:
                    # Cell mean
                    cell_mean = np.mean(cell_data)
                    # Marginal means
                    a_mean = np.mean(clean_df[clean_df[x_column] == a_level][y_column])
                    b_mean = np.mean(clean_df[clean_df[color_column] == b_level][y_column])
                    # Interaction sum of squares
                    ss_ab += len(cell_data) * (cell_mean - a_mean - b_mean + grand_mean) ** 2

        # Error sum of squares (remainder; see NOTE in the docstring about
        # unbalanced designs)
        ss_error = ss_total - ss_a - ss_b - ss_ab

        # Degrees of freedom
        df_a = len(factor1_levels) - 1
        df_b = len(factor2_levels) - 1
        df_ab = df_a * df_b
        df_error = n_total - len(factor1_levels) * len(factor2_levels)

        # Mean squares; ms_error falls back to 1 so the F ratios below stay
        # defined when df_error <= 0
        ms_a = ss_a / df_a if df_a > 0 else 0
        ms_b = ss_b / df_b if df_b > 0 else 0
        ms_ab = ss_ab / df_ab if df_ab > 0 else 0
        ms_error = ss_error / df_error if df_error > 0 else 1

        # F statistics
        f_a = ms_a / ms_error if ms_error > 0 else 0
        f_b = ms_b / ms_error if ms_error > 0 else 0
        f_ab = ms_ab / ms_error if ms_error > 0 else 0

        # P values from the F distribution's upper tail; a zero F statistic
        # is reported as p = 1
        p_a = 1 - stats.f.cdf(f_a, df_a, df_error) if f_a > 0 else 1
        p_b = 1 - stats.f.cdf(f_b, df_b, df_error) if f_b > 0 else 1
        p_ab = 1 - stats.f.cdf(f_ab, df_ab, df_error) if f_ab > 0 else 1

        # Effect sizes (partial eta squared)
        eta_squared_a = self.calculate_partial_eta_squared(ss_a, ss_error)
        eta_squared_b = self.calculate_partial_eta_squared(ss_b, ss_error)
        eta_squared_ab = self.calculate_partial_eta_squared(ss_ab, ss_error)

        results = {
            "test_type": "Two-way ANOVA",
            "factor_a": {
                "name": x_column,
                "f_statistic": f_a,
                "p_value": p_a,
                "df": df_a,
                "partial_eta_squared": eta_squared_a,
                "interpretation": self.interpret_effect_size(eta_squared_a, "partial_eta_squared")
            },
            "factor_b": {
                "name": color_column,
                "f_statistic": f_b,
                "p_value": p_b,
                "df": df_b,
                "partial_eta_squared": eta_squared_b,
                "interpretation": self.interpret_effect_size(eta_squared_b, "partial_eta_squared")
            },
            "interaction": {
                "name": f"{x_column} × {color_column}",
                "f_statistic": f_ab,
                "p_value": p_ab,
                "df": df_ab,
                "partial_eta_squared": eta_squared_ab,
                "interpretation": self.interpret_effect_size(eta_squared_ab, "partial_eta_squared")
            },
            "df_error": df_error,
            "sample_size": n_total,
            "factor_a_levels": list(factor1_levels),
            "factor_b_levels": list(factor2_levels)
        }

        return results

    except Exception as e:
        return {"error": f"Error performing two-way ANOVA: {str(e)}"}
935
+
936
def perform_simple_regression(self, df: pd.DataFrame, x_column: str, y_column: str) -> Dict[str, Any]:
    """
    Perform simple linear regression analysis of y_column on x_column.

    Rows with a missing value in either column are dropped first.

    Fixes vs. previous version: the correlation interpretation compared
    |r| against r-squared benchmarks (wrong scale) — it now interprets
    r² itself; and the slope's standard error is taken directly from
    linregress instead of being recomputed by hand.

    Args:
        df: DataFrame containing the data
        x_column: Independent variable (numeric)
        y_column: Dependent variable (numeric)

    Returns:
        Dict containing correlation and regression results with effect
        sizes, or a dict with an "error" key when the analysis cannot run.
    """
    try:
        # Remove missing values
        clean_df = df[[x_column, y_column]].dropna()

        if len(clean_df) < 3:
            return {"error": "Insufficient data for regression (need at least 3 observations)"}

        x = clean_df[x_column].values
        y = clean_df[y_column].values

        # Pearson correlation with its own p-value
        correlation, corr_p = stats.pearsonr(x, y)

        # Simple linear regression; std_err is the standard error of the slope
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

        n = len(clean_df)
        r_squared = r_value ** 2
        adjusted_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - 2)

        # Effect size
        cohens_f_squared = self.calculate_cohens_f_squared(r_squared)

        # t-statistic for the slope (slope / its standard error)
        t_stat = slope / std_err if std_err > 0 else 0

        results = {
            "test_type": "Simple Linear Regression",
            "correlation": {
                "pearson_r": correlation,
                "p_value": corr_p,
                # Interpret variance explained (r²) against the r² benchmarks
                "interpretation": self.interpret_effect_size(r_squared, "r_squared")
            },
            "regression": {
                "slope": slope,
                "intercept": intercept,
                "r_squared": r_squared,
                "adjusted_r_squared": adjusted_r_squared,
                "p_value": p_value,
                "standard_error": std_err,
                "t_statistic": t_stat,
                "cohens_f_squared": cohens_f_squared,
                "f_squared_interpretation": self.interpret_effect_size(cohens_f_squared, "cohens_f")
            },
            "sample_size": n,
            "variance_explained": f"{r_squared * 100:.1f}%"
        }

        return results

    except Exception as e:
        return {"error": f"Error performing regression: {str(e)}"}
uv.lock CHANGED
@@ -1751,7 +1751,7 @@ requires-dist = [
1751
  { name = "plotly", specifier = ">=5.15.0" },
1752
  { name = "pyyaml", specifier = ">=6.0" },
1753
  { name = "scipy", specifier = ">=1.11.0" },
1754
- { name = "spacy", extras = ["cuda11", "cuda12"], specifier = ">=3.7.0" },
1755
  { name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
1756
  { name = "spacy-transformers", specifier = ">=1.3.0" },
1757
  { name = "streamlit", specifier = ">=1.28.0" },
 
1751
  { name = "plotly", specifier = ">=5.15.0" },
1752
  { name = "pyyaml", specifier = ">=6.0" },
1753
  { name = "scipy", specifier = ">=1.11.0" },
1754
+ { name = "spacy", specifier = ">=3.7.0" },
1755
  { name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
1756
  { name = "spacy-transformers", specifier = ">=1.3.0" },
1757
  { name = "streamlit", specifier = ">=1.28.0" },
web_app/app.py CHANGED
@@ -41,7 +41,7 @@ st.set_page_config(
41
 
42
  def main():
43
  """Main application entry point."""
44
- st.title("�� Linguistic Data Analysis I - Text Analysis Tools")
45
  st.markdown("*Educational tools for lexical sophistication analysis, POS/dependency parsing, and word frequency visualization*")
46
 
47
  # GPU status is already initialized in gpu_init module
 
41
 
42
  def main():
43
  """Main application entry point."""
44
+ st.title("📊 Linguistic Data Analysis I - Text Analysis Tools")
45
  st.markdown("*Educational tools for lexical sophistication analysis, POS/dependency parsing, and word frequency visualization*")
46
 
47
  # GPU status is already initialized in gpu_init module
web_app/handlers/corpus_viz_handlers.py CHANGED
@@ -695,7 +695,7 @@ class CorpusVizHandlers:
695
  # Get custom category orders if set
696
  category_orders = st.session_state.corpus_viz_category_orders
697
 
698
- fig = visualizer.create_boxplot(
699
  x_column=x_column,
700
  y_column=y_column,
701
  color_column=color_column,
@@ -710,7 +710,8 @@ class CorpusVizHandlers:
710
  'title': title,
711
  'x_column': x_column,
712
  'y_column': y_column,
713
- 'color_column': color_column
 
714
  }
715
 
716
  # Add to plots list
@@ -763,12 +764,6 @@ class CorpusVizHandlers:
763
  if size_column == "None":
764
  size_column = None
765
 
766
- add_trendline = st.checkbox(
767
- "Add Trendline",
768
- value=False,
769
- help="Add a linear trendline to the scatter plot"
770
- )
771
-
772
  title = st.text_input(
773
  "Plot Title",
774
  value=f"Scatter Plot: {y_column} vs {x_column}",
@@ -782,13 +777,15 @@ class CorpusVizHandlers:
782
  # Get custom category orders if set
783
  category_orders = st.session_state.corpus_viz_category_orders
784
 
785
- # Note: Current implementation only supports basic parameters
786
- fig = visualizer.create_scatterplot(
787
  x_column=x_column,
788
  y_column=y_column,
789
  color_column=color_column,
790
  title=title,
791
- category_orders=category_orders
 
 
792
  )
793
 
794
  # Store in session state
@@ -800,7 +797,9 @@ class CorpusVizHandlers:
800
  'y_column': y_column,
801
  'color_column': color_column,
802
  'size_column': size_column,
803
- 'add_trendline': add_trendline
 
 
804
  }
805
 
806
  # Add to plots list
@@ -813,10 +812,124 @@ class CorpusVizHandlers:
813
  except Exception as e:
814
  st.error(f"Error creating scatter plot: {str(e)}")
815
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816
  @staticmethod
817
  def render_visualization_results():
818
  """
819
- Render visualization results.
820
  """
821
  plots = st.session_state.corpus_viz_plots
822
 
@@ -831,6 +944,12 @@ class CorpusVizHandlers:
831
  # Display the plot
832
  st.plotly_chart(plot_config['figure'], use_container_width=True)
833
 
 
 
 
 
 
 
834
  # Plot details
835
  col1, col2, col3 = st.columns(3)
836
 
@@ -848,7 +967,7 @@ class CorpusVizHandlers:
848
  with col3:
849
  if plot_config['color_column']:
850
  st.write(f"**Color By:** {plot_config['color_column']}")
851
- if plot_config['type'] == 'scatter' and plot_config['size_column']:
852
  st.write(f"**Size By:** {plot_config['size_column']}")
853
 
854
  # Remove plot button
 
695
  # Get custom category orders if set
696
  category_orders = st.session_state.corpus_viz_category_orders
697
 
698
+ fig, stats_results = visualizer.create_boxplot(
699
  x_column=x_column,
700
  y_column=y_column,
701
  color_column=color_column,
 
710
  'title': title,
711
  'x_column': x_column,
712
  'y_column': y_column,
713
+ 'color_column': color_column,
714
+ 'stats_results': stats_results
715
  }
716
 
717
  # Add to plots list
 
764
  if size_column == "None":
765
  size_column = None
766
 
 
 
 
 
 
 
767
  title = st.text_input(
768
  "Plot Title",
769
  value=f"Scatter Plot: {y_column} vs {x_column}",
 
777
  # Get custom category orders if set
778
  category_orders = st.session_state.corpus_viz_category_orders
779
 
780
+ # Create scatter plot with statistical analysis (trendline and confidence intervals enabled by default)
781
+ fig, stats_results = visualizer.create_scatterplot(
782
  x_column=x_column,
783
  y_column=y_column,
784
  color_column=color_column,
785
  title=title,
786
+ category_orders=category_orders,
787
+ add_trendline=True,
788
+ add_confidence_interval=True
789
  )
790
 
791
  # Store in session state
 
797
  'y_column': y_column,
798
  'color_column': color_column,
799
  'size_column': size_column,
800
+ 'add_trendline': True,
801
+ 'add_confidence_interval': True,
802
+ 'stats_results': stats_results
803
  }
804
 
805
  # Add to plots list
 
812
  except Exception as e:
813
  st.error(f"Error creating scatter plot: {str(e)}")
814
 
815
@staticmethod
def render_statistical_results(stats_results: Dict[str, Any]):
    """
    Render statistical analysis results in a formatted (APA-style) way.

    Fixes vs. previous version: the post-hoc table was labelled
    "Tukey HSD" although the backend computes uncorrected pairwise
    t-tests — the label now states what is actually computed; and the
    post-hoc p-value display now uses "< .001" consistently with the
    other tables.

    Args:
        stats_results: Statistical results dictionary produced by
            CorpusVisualizer (one-way/two-way ANOVA or simple linear
            regression), or a dict with an "error" key.
    """
    # Nothing to render, or an upstream failure to surface to the user
    if not stats_results or 'error' in stats_results:
        if stats_results and 'error' in stats_results:
            st.error(f"Statistical analysis error: {stats_results['error']}")
        return

    st.write("### 📊 Statistical Analysis")

    test_type = stats_results.get('test_type', 'Unknown')

    if test_type == "One-way ANOVA":
        # One-way ANOVA results in APA table format
        f_stat = stats_results.get('f_statistic', 0)
        p_val = stats_results.get('p_value', 1)
        p_display = f"{p_val:.3f}" if p_val >= 0.001 else "< .001"
        eta_sq = stats_results.get('eta_squared', 0)
        interpretation = stats_results.get('eta_squared_interpretation', 'Unknown')
        df_between = stats_results.get('df_between', 0)
        df_within = stats_results.get('df_within', 0)

        # Create APA-style table
        anova_table = f"""
| Statistic | Value | df | p | η² | Effect Size |
|-----------|-------|----|----|----|-----------|
| F | {f_stat:.3f} | ({df_between}, {df_within}) | {p_display} | {eta_sq:.3f} | {interpretation} |
"""
        st.markdown(anova_table)

        # Sample size
        sample_size = stats_results.get('sample_size', 0)
        st.write(f"**Model:** F({df_between}, {df_within}) = {f_stat:.3f}, p = {p_display}, N = {sample_size}")

        # Post hoc results (uncorrected pairwise t-tests, matching the
        # backend's perform_one_way_anova — NOT Tukey HSD)
        if 'posthoc' in stats_results and stats_results['posthoc']:
            st.write("**Post Hoc Comparisons (pairwise t-tests, uncorrected):**")

            posthoc_data = []
            for comparison in stats_results['posthoc']:
                group1 = comparison.get('group1', '')
                group2 = comparison.get('group2', '')
                cohens_d = comparison.get('cohens_d', 0)
                d_interp = comparison.get('cohens_d_interpretation', '')
                p_val = comparison.get('p_value', 1)
                mean_diff = comparison.get('mean_diff', 0)

                # Consistent APA-style p display ("< .001", as above)
                p_display = f"{p_val:.3f}" if p_val >= 0.001 else "< .001"

                posthoc_data.append({
                    'Comparison': f"{group1} vs {group2}",
                    'Mean Diff': f"{mean_diff:.3f}",
                    "Cohen's d": f"{cohens_d:.3f}",
                    'Effect Size': d_interp,
                    'p-value': p_display
                })

            if posthoc_data:
                st.dataframe(pd.DataFrame(posthoc_data), use_container_width=True)

    elif test_type == "Two-way ANOVA":
        # Two-way ANOVA results in APA table format
        st.write("**Main Effects and Interaction:**")

        factor_a = stats_results.get('factor_a', {})
        factor_b = stats_results.get('factor_b', {})
        interaction = stats_results.get('interaction', {})
        df_error = stats_results.get('df_error', 0)

        # Create APA-style table, one row per effect
        twoway_table = "| Source | F | df | p | ηp² | Effect Size |\n|--------|---|----|----|-----|-------------|\n"

        for effect_name, effect_data in [("Factor A", factor_a), ("Factor B", factor_b), ("A × B", interaction)]:
            f_stat = effect_data.get('f_statistic', 0)
            p_val = effect_data.get('p_value', 1)
            df = effect_data.get('df', 0)
            eta_sq = effect_data.get('partial_eta_squared', 0)
            interpretation = effect_data.get('interpretation', '')

            p_display = f"{p_val:.3f}" if p_val >= 0.001 else "< .001"

            twoway_table += f"| {effect_name} | {f_stat:.3f} | ({df}, {df_error}) | {p_display} | {eta_sq:.3f} | {interpretation} |\n"

        st.markdown(twoway_table)

        sample_size = stats_results.get('sample_size', 0)
        st.write(f"**Sample Size:** N = {sample_size}, df_error = {df_error}")

    elif test_type == "Simple Linear Regression":
        # Bivariate correlation results in APA table format
        correlation = stats_results.get('correlation', {})
        regression = stats_results.get('regression', {})

        pearson_r = correlation.get('pearson_r', 0)
        corr_p = correlation.get('p_value', 1)
        corr_p_display = f"{corr_p:.3f}" if corr_p >= 0.001 else "< .001"
        corr_interp = correlation.get('interpretation', 'Unknown')
        r_squared = regression.get('r_squared', 0)

        # Create APA-style table for bivariate correlation
        correlation_table = f"""
| Statistic | Value | p | Effect Size |
|-----------|-------|---|-------------|
| Pearson r | {pearson_r:.3f} | {corr_p_display} | {corr_interp} |
| R² | {r_squared:.3f} | {corr_p_display} | - |
"""
        st.markdown(correlation_table)
928
+
929
  @staticmethod
930
  def render_visualization_results():
931
  """
932
+ Render visualization results with statistical analysis.
933
  """
934
  plots = st.session_state.corpus_viz_plots
935
 
 
944
  # Display the plot
945
  st.plotly_chart(plot_config['figure'], use_container_width=True)
946
 
947
+ # Display statistical results if available
948
+ if 'stats_results' in plot_config and plot_config['stats_results']:
949
+ CorpusVizHandlers.render_statistical_results(plot_config['stats_results'])
950
+
951
+ st.markdown("---")
952
+
953
  # Plot details
954
  col1, col2, col3 = st.columns(3)
955
 
 
967
  with col3:
968
  if plot_config['color_column']:
969
  st.write(f"**Color By:** {plot_config['color_column']}")
970
+ if plot_config['type'] == 'scatter' and plot_config.get('size_column'):
971
  st.write(f"**Size By:** {plot_config['size_column']}")
972
 
973
  # Remove plot button