Spaces:
Building
Building
plot function update
Browse files- pyproject.toml +1 -1
- text_analyzer/corpus_visualizer.py +516 -13
- uv.lock +1 -1
- web_app/app.py +1 -1
- web_app/handlers/corpus_viz_handlers.py +133 -14
pyproject.toml
CHANGED
|
@@ -6,7 +6,7 @@ readme = "README.md"
|
|
| 6 |
requires-python = ">=3.12"
|
| 7 |
dependencies = [
|
| 8 |
"streamlit>=1.28.0",
|
| 9 |
-
"spacy
|
| 10 |
"pandas>=2.0.0",
|
| 11 |
"numpy>=1.24.0,<2.0",
|
| 12 |
"plotly>=5.15.0",
|
|
|
|
| 6 |
requires-python = ">=3.12"
|
| 7 |
dependencies = [
|
| 8 |
"streamlit>=1.28.0",
|
| 9 |
+
"spacy>=3.7.0",
|
| 10 |
"pandas>=2.0.0",
|
| 11 |
"numpy>=1.24.0,<2.0",
|
| 12 |
"plotly>=5.15.0",
|
text_analyzer/corpus_visualizer.py
CHANGED
|
@@ -16,6 +16,9 @@ import re
|
|
| 16 |
from io import StringIO
|
| 17 |
import natsort
|
| 18 |
import csv
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
logger = logging.getLogger(__name__)
|
| 21 |
|
|
@@ -399,9 +402,9 @@ class CorpusVisualizer:
|
|
| 399 |
|
| 400 |
def create_boxplot(self, x_column: str, y_column: str, color_column: Optional[str] = None,
|
| 401 |
title: Optional[str] = None, height: int = 600,
|
| 402 |
-
category_orders: Optional[Dict[str, List[str]]] = None) -> go.Figure:
|
| 403 |
"""
|
| 404 |
-
Create a box plot visualization using Plotly.
|
| 405 |
|
| 406 |
Args:
|
| 407 |
x_column: Categorical column for x-axis
|
|
@@ -412,7 +415,7 @@ class CorpusVisualizer:
|
|
| 412 |
category_orders: Optional custom category orders
|
| 413 |
|
| 414 |
Returns:
|
| 415 |
-
Plotly figure object
|
| 416 |
"""
|
| 417 |
if self.merged_df is None:
|
| 418 |
raise ValueError("Must perform merge before creating visualizations")
|
|
@@ -446,13 +449,27 @@ class CorpusVisualizer:
|
|
| 446 |
category_orders=plot_category_orders)
|
| 447 |
|
| 448 |
fig.update_layout(template="plotly_white")
|
| 449 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
|
| 451 |
def create_scatterplot(self, x_column: str, y_column: str, color_column: Optional[str] = None,
|
| 452 |
title: Optional[str] = None, height: int = 600,
|
| 453 |
-
category_orders: Optional[Dict[str, List[str]]] = None
|
|
|
|
| 454 |
"""
|
| 455 |
-
Create a scatter plot visualization using Plotly.
|
| 456 |
|
| 457 |
Args:
|
| 458 |
x_column: Numeric column for x-axis
|
|
@@ -461,9 +478,11 @@ class CorpusVisualizer:
|
|
| 461 |
title: Plot title
|
| 462 |
height: Plot height
|
| 463 |
category_orders: Optional custom category orders
|
|
|
|
|
|
|
| 464 |
|
| 465 |
Returns:
|
| 466 |
-
Plotly figure object
|
| 467 |
"""
|
| 468 |
if self.merged_df is None:
|
| 469 |
raise ValueError("Must perform merge before creating visualizations")
|
|
@@ -482,18 +501,83 @@ class CorpusVisualizer:
|
|
| 482 |
else:
|
| 483 |
plot_category_orders[color_column] = self.get_category_order(color_column, plot_df)
|
| 484 |
|
| 485 |
-
# Create the plot
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
|
| 490 |
fig.update_layout(template="plotly_white")
|
| 491 |
-
return fig
|
| 492 |
|
| 493 |
def export_merged_data(self) -> pd.DataFrame:
|
| 494 |
"""
|
| 495 |
Export merged dataframe.
|
| 496 |
-
|
| 497 |
Returns:
|
| 498 |
pd.DataFrame: DataFrame ready for export
|
| 499 |
"""
|
|
@@ -501,3 +585,422 @@ class CorpusVisualizer:
|
|
| 501 |
raise ValueError("Must perform merge before exporting")
|
| 502 |
|
| 503 |
return self.merged_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
from io import StringIO
|
| 17 |
import natsort
|
| 18 |
import csv
|
| 19 |
+
from scipy import stats
|
| 20 |
+
from scipy.stats import f_oneway
|
| 21 |
+
import warnings
|
| 22 |
|
| 23 |
logger = logging.getLogger(__name__)
|
| 24 |
|
|
|
|
| 402 |
|
| 403 |
def create_boxplot(self, x_column: str, y_column: str, color_column: Optional[str] = None,
|
| 404 |
title: Optional[str] = None, height: int = 600,
|
| 405 |
+
category_orders: Optional[Dict[str, List[str]]] = None) -> Tuple[go.Figure, Optional[Dict[str, Any]]]:
|
| 406 |
"""
|
| 407 |
+
Create a box plot visualization using Plotly with statistical analysis.
|
| 408 |
|
| 409 |
Args:
|
| 410 |
x_column: Categorical column for x-axis
|
|
|
|
| 415 |
category_orders: Optional custom category orders
|
| 416 |
|
| 417 |
Returns:
|
| 418 |
+
Tuple of (Plotly figure object, Statistical results dict)
|
| 419 |
"""
|
| 420 |
if self.merged_df is None:
|
| 421 |
raise ValueError("Must perform merge before creating visualizations")
|
|
|
|
| 449 |
category_orders=plot_category_orders)
|
| 450 |
|
| 451 |
fig.update_layout(template="plotly_white")
|
| 452 |
+
|
| 453 |
+
# Perform statistical analysis
|
| 454 |
+
stats_results = None
|
| 455 |
+
try:
|
| 456 |
+
if color_column:
|
| 457 |
+
# Two-way ANOVA
|
| 458 |
+
stats_results = self.perform_two_way_anova(plot_df, x_column, y_column, color_column)
|
| 459 |
+
else:
|
| 460 |
+
# One-way ANOVA
|
| 461 |
+
stats_results = self.perform_one_way_anova(plot_df, x_column, y_column)
|
| 462 |
+
except Exception as e:
|
| 463 |
+
stats_results = {"error": f"Statistical analysis failed: {str(e)}"}
|
| 464 |
+
|
| 465 |
+
return fig, stats_results
|
| 466 |
|
| 467 |
def create_scatterplot(self, x_column: str, y_column: str, color_column: Optional[str] = None,
|
| 468 |
title: Optional[str] = None, height: int = 600,
|
| 469 |
+
category_orders: Optional[Dict[str, List[str]]] = None,
|
| 470 |
+
add_trendline: bool = True, add_confidence_interval: bool = True) -> Tuple[go.Figure, Optional[Dict[str, Any]]]:
|
| 471 |
"""
|
| 472 |
+
Create a scatter plot visualization using Plotly with statistical analysis.
|
| 473 |
|
| 474 |
Args:
|
| 475 |
x_column: Numeric column for x-axis
|
|
|
|
| 478 |
title: Plot title
|
| 479 |
height: Plot height
|
| 480 |
category_orders: Optional custom category orders
|
| 481 |
+
add_trendline: Whether to add regression line (default True)
|
| 482 |
+
add_confidence_interval: Whether to add confidence interval around trendline (default True)
|
| 483 |
|
| 484 |
Returns:
|
| 485 |
+
Tuple of (Plotly figure object, Statistical results dict)
|
| 486 |
"""
|
| 487 |
if self.merged_df is None:
|
| 488 |
raise ValueError("Must perform merge before creating visualizations")
|
|
|
|
| 501 |
else:
|
| 502 |
plot_category_orders[color_column] = self.get_category_order(color_column, plot_df)
|
| 503 |
|
| 504 |
+
# Create the base scatter plot
|
| 505 |
+
if color_column:
|
| 506 |
+
fig = px.scatter(plot_df, x=x_column, y=y_column, color=color_column,
|
| 507 |
+
title=title or f"Scatter Plot: {y_column} vs {x_column}", height=height,
|
| 508 |
+
category_orders=plot_category_orders if plot_category_orders else None)
|
| 509 |
+
else:
|
| 510 |
+
fig = px.scatter(plot_df, x=x_column, y=y_column,
|
| 511 |
+
title=title or f"Scatter Plot: {y_column} vs {x_column}", height=height)
|
| 512 |
+
|
| 513 |
+
# Perform statistical analysis
|
| 514 |
+
stats_results = None
|
| 515 |
+
try:
|
| 516 |
+
stats_results = self.perform_simple_regression(plot_df, x_column, y_column)
|
| 517 |
+
|
| 518 |
+
# Add trendline and confidence interval if requested and regression successful
|
| 519 |
+
if add_trendline and 'error' not in stats_results:
|
| 520 |
+
clean_df = plot_df[[x_column, y_column]].dropna()
|
| 521 |
+
x_vals = clean_df[x_column].values
|
| 522 |
+
y_vals = clean_df[y_column].values
|
| 523 |
+
|
| 524 |
+
# Get regression parameters
|
| 525 |
+
slope = stats_results['regression']['slope']
|
| 526 |
+
intercept = stats_results['regression']['intercept']
|
| 527 |
+
|
| 528 |
+
# Create more detailed x range for smooth curves
|
| 529 |
+
x_min, x_max = x_vals.min(), x_vals.max()
|
| 530 |
+
x_range = np.linspace(x_min, x_max, 100)
|
| 531 |
+
y_range = slope * x_range + intercept
|
| 532 |
+
|
| 533 |
+
# Calculate confidence intervals if requested
|
| 534 |
+
if add_confidence_interval:
|
| 535 |
+
n = len(x_vals)
|
| 536 |
+
mean_x = np.mean(x_vals)
|
| 537 |
+
ss_x = np.sum((x_vals - mean_x) ** 2)
|
| 538 |
+
mse = np.sum((y_vals - (slope * x_vals + intercept)) ** 2) / (n - 2)
|
| 539 |
+
|
| 540 |
+
# Standard error for each prediction point
|
| 541 |
+
se_y = np.sqrt(mse * (1/n + (x_range - mean_x)**2 / ss_x))
|
| 542 |
+
|
| 543 |
+
# 95% confidence interval (t-distribution for small samples)
|
| 544 |
+
from scipy.stats import t
|
| 545 |
+
t_val = t.ppf(0.975, n - 2) # 95% confidence
|
| 546 |
+
|
| 547 |
+
y_upper = y_range + t_val * se_y
|
| 548 |
+
y_lower = y_range - t_val * se_y
|
| 549 |
+
|
| 550 |
+
# Add confidence interval as filled area
|
| 551 |
+
fig.add_trace(go.Scatter(
|
| 552 |
+
x=np.concatenate([x_range, x_range[::-1]]),
|
| 553 |
+
y=np.concatenate([y_upper, y_lower[::-1]]),
|
| 554 |
+
fill='toself',
|
| 555 |
+
fillcolor='rgba(255, 0, 0, 0.2)',
|
| 556 |
+
line=dict(color='rgba(255,255,255,0)'),
|
| 557 |
+
hoverinfo="skip",
|
| 558 |
+
showlegend=True,
|
| 559 |
+
name='95% Confidence Interval'
|
| 560 |
+
))
|
| 561 |
+
|
| 562 |
+
# Add trendline to the plot
|
| 563 |
+
fig.add_trace(go.Scatter(
|
| 564 |
+
x=x_range,
|
| 565 |
+
y=y_range,
|
| 566 |
+
mode='lines',
|
| 567 |
+
name=f'Trendline (R² = {stats_results["regression"]["r_squared"]:.3f})',
|
| 568 |
+
line=dict(color='red', dash='dash', width=2)
|
| 569 |
+
))
|
| 570 |
+
|
| 571 |
+
except Exception as e:
|
| 572 |
+
stats_results = {"error": f"Statistical analysis failed: {str(e)}"}
|
| 573 |
|
| 574 |
fig.update_layout(template="plotly_white")
|
| 575 |
+
return fig, stats_results
|
| 576 |
|
| 577 |
def export_merged_data(self) -> pd.DataFrame:
|
| 578 |
"""
|
| 579 |
Export merged dataframe.
|
| 580 |
+
|
| 581 |
Returns:
|
| 582 |
pd.DataFrame: DataFrame ready for export
|
| 583 |
"""
|
|
|
|
| 585 |
raise ValueError("Must perform merge before exporting")
|
| 586 |
|
| 587 |
return self.merged_df
|
| 588 |
+
|
| 589 |
+
# Statistical Analysis Methods
|
| 590 |
+
|
| 591 |
+
def calculate_eta_squared(self, ss_between: float, ss_total: float) -> float:
    """
    Calculate eta-squared effect size for ANOVA.

    Eta-squared is the share of total variability attributed to the
    grouping factor: ss_between / ss_total.

    Args:
        ss_between: Sum of squares between groups
        ss_total: Total sum of squares

    Returns:
        float: Eta-squared value (0.0 when ss_total is zero)
    """
    # No variability at all means there is nothing to explain; avoid dividing by zero.
    if ss_total == 0:
        return 0.0
    return ss_between / ss_total
|
| 605 |
+
|
| 606 |
+
def calculate_partial_eta_squared(self, ss_effect: float, ss_error: float) -> float:
    """
    Calculate partial eta-squared effect size for factorial ANOVA.

    Partial eta-squared relates one effect to that effect plus error:
    ss_effect / (ss_effect + ss_error).

    Args:
        ss_effect: Sum of squares for the effect
        ss_error: Sum of squares for error

    Returns:
        float: Partial eta-squared value (0.0 when the denominator is zero)
    """
    denominator = ss_effect + ss_error
    # Guard against 0/0 when neither the effect nor the error carries any variability.
    if denominator == 0:
        return 0.0
    return ss_effect / denominator
|
| 620 |
+
|
| 621 |
+
def calculate_cohens_d(self, group1: np.ndarray, group2: np.ndarray) -> float:
    """
    Calculate Cohen's d effect size for two groups.

    The mean difference (group1 - group2) is divided by the pooled standard
    deviation, which is built from the two sample variances (ddof=1).

    Args:
        group1: Data for first group
        group2: Data for second group

    Returns:
        float: Cohen's d value; 0.0 when either group has fewer than two
        observations or when the pooled standard deviation is zero
    """
    n1, n2 = len(group1), len(group2)
    # A sample variance needs at least two observations per group.
    if n1 < 2 or n2 < 2:
        return 0.0

    # Pooled variance: each sample variance weighted by its degrees of freedom.
    weighted_variance = (n1 - 1) * np.var(group1, ddof=1) + (n2 - 1) * np.var(group2, ddof=1)
    pooled_std = np.sqrt(weighted_variance / (n1 + n2 - 2))

    # Identical constant groups give zero spread; report no effect instead of dividing by zero.
    if pooled_std == 0:
        return 0.0

    return (np.mean(group1) - np.mean(group2)) / pooled_std
|
| 644 |
+
|
| 645 |
+
def calculate_cohens_f_squared(self, r_squared: float) -> float:
    """
    Calculate Cohen's f² effect size for regression.

    f² = R² / (1 - R²).

    Args:
        r_squared: R-squared value from regression

    Returns:
        float: Cohen's f² value. A negative (invalid) R² yields 0.0. A
        perfect fit (R² >= 1) yields +inf, the limit of the formula, so
        that it is not reported as a zero-sized effect.
    """
    # A negative R² is not a valid proportion of variance; report no effect.
    if r_squared < 0:
        return 0.0
    # The denominator vanishes for a perfect fit; the effect size is unbounded there.
    if r_squared >= 1.0:
        return float("inf")
    return r_squared / (1 - r_squared)
|
| 658 |
+
|
| 659 |
+
def interpret_effect_size(self, value: float, metric_type: str) -> str:
    """
    Provide interpretation for effect sizes.

    Every value below the metric's "Medium" cutoff is labelled "Small",
    including values below the conventional lower bound for a small effect.
    For 'cohens_d' the absolute value is classified; the other metrics are
    classified as given.

    Args:
        value: Effect size value
        metric_type: Type of effect size ('eta_squared', 'partial_eta_squared',
            'cohens_d', 'r_squared', 'cohens_f')

    Returns:
        str: Interpretation ("Small", "Medium", "Large"), or "Unknown" for an
        unrecognised metric_type
    """
    # metric -> (lower bound of "Medium", lower bound of "Large")
    cutoffs = {
        'eta_squared': (0.06, 0.14),
        'partial_eta_squared': (0.06, 0.14),
        'cohens_d': (0.5, 0.8),
        'r_squared': (0.09, 0.25),
        'cohens_f': (0.15, 0.35),
    }

    if metric_type not in cutoffs:
        return "Unknown"

    medium_cutoff, large_cutoff = cutoffs[metric_type]
    # Cohen's d is signed (direction of the difference); only its size matters here.
    magnitude = abs(value) if metric_type == 'cohens_d' else value

    if magnitude < medium_cutoff:
        return "Small"
    if magnitude < large_cutoff:
        return "Medium"
    return "Large"
|
| 709 |
+
|
| 710 |
+
def perform_one_way_anova(self, df: pd.DataFrame, x_column: str, y_column: str) -> Dict[str, Any]:
    """
    Perform one-way ANOVA analysis.

    Rows with a missing value in either column are dropped before testing.
    When the omnibus test is significant (p < 0.05) and there are more than
    two groups, every pair of groups is compared with an independent-samples
    t-test and Cohen's d; those comparisons are returned under "posthoc".

    Args:
        df: DataFrame containing the data
        x_column: Categorical column (groups)
        y_column: Numeric column (dependent variable)

    Returns:
        Dict containing ANOVA results and effect sizes. On failure the dict
        holds a single "error" key instead; a failure confined to the
        post hoc step is reported under "posthoc_error" alongside the
        omnibus results.
    """
    try:
        # Remove missing values
        clean_df = df[[x_column, y_column]].dropna()

        if len(clean_df) < 3:
            return {"error": "Insufficient data for ANOVA (need at least 3 observations)"}

        # Collect every non-empty group once; names and values stay aligned by position.
        group_names = []
        group_data = []
        for name, group in clean_df.groupby(x_column):
            if len(group) > 0:
                group_names.append(name)
                group_data.append(group[y_column].values)

        n_groups = len(group_data)
        if n_groups < 2:
            return {"error": "Need at least 2 groups for ANOVA"}

        # Omnibus test
        f_stat, p_value = f_oneway(*group_data)

        # Sums of squares for the eta-squared effect size
        grand_mean = clean_df[y_column].mean()
        ss_total = np.sum((clean_df[y_column] - grand_mean) ** 2)

        ss_between = 0
        for values in group_data:
            ss_between += len(values) * (np.mean(values) - grand_mean) ** 2

        eta_squared = self.calculate_eta_squared(ss_between, ss_total)

        results = {
            "test_type": "One-way ANOVA",
            "f_statistic": f_stat,
            "p_value": p_value,
            "df_between": n_groups - 1,
            "df_within": len(clean_df) - n_groups,
            "eta_squared": eta_squared,
            "eta_squared_interpretation": self.interpret_effect_size(eta_squared, "eta_squared"),
            "sample_size": len(clean_df),
            "groups": group_names,
            "group_means": [np.mean(values) for values in group_data],
            "group_sizes": [len(values) for values in group_data]
        }

        # Post hoc analysis if significant and more than 2 groups
        if p_value < 0.05 and n_groups > 2:
            try:
                # Number of pairwise comparisons, used for the Bonferroni adjustment.
                n_comparisons = n_groups * (n_groups - 1) // 2
                posthoc_results = []
                for i in range(n_groups):
                    for j in range(i + 1, n_groups):
                        first, second = group_data[i], group_data[j]

                        cohens_d = self.calculate_cohens_d(first, second)

                        # Simple t-test for this pair (for p-value)
                        t_stat, t_p = stats.ttest_ind(first, second)

                        posthoc_results.append({
                            "group1": group_names[i],
                            "group2": group_names[j],
                            "cohens_d": cohens_d,
                            "cohens_d_interpretation": self.interpret_effect_size(cohens_d, "cohens_d"),
                            # Unadjusted p-value, kept under its original key.
                            "p_value": t_p,
                            # Running many pairwise tests inflates the false-positive rate;
                            # np.minimum (unlike min) lets a NaN p-value stay NaN.
                            "p_value_bonferroni": float(np.minimum(1.0, t_p * n_comparisons)),
                            "mean_diff": np.mean(first) - np.mean(second)
                        })

                results["posthoc"] = posthoc_results

            except Exception as e:
                # Best effort: the omnibus result is still useful without the pairwise table.
                results["posthoc_error"] = f"Error in post hoc analysis: {str(e)}"

        return results

    except Exception as e:
        # Callers expect an "error" dict rather than an exception.
        return {"error": f"Error performing ANOVA: {str(e)}"}
|
| 806 |
+
|
| 807 |
+
def perform_two_way_anova(self, df: pd.DataFrame, x_column: str, y_column: str, color_column: str) -> Dict[str, Any]:
    """
    Perform two-way ANOVA analysis.

    Sums of squares are computed by hand from marginal and cell means, and
    the error term is obtained by subtraction from the total.
    NOTE(review): this decomposition is the textbook one for balanced
    designs; with unequal or empty cells the terms need not add up, so
    results there should be treated with caution — confirm whether
    unbalanced data is expected.

    Args:
        df: DataFrame containing the data
        x_column: First factor (categorical)
        y_column: Dependent variable (numeric)
        color_column: Second factor (categorical)

    Returns:
        Dict containing two-way ANOVA results and effect sizes, or a dict
        with a single "error" key when the analysis cannot be run
    """
    try:
        # Remove missing values
        clean_df = df[[x_column, y_column, color_column]].dropna()

        if len(clean_df) < 6:  # Need minimum samples for 2-way ANOVA
            return {"error": "Insufficient data for two-way ANOVA (need at least 6 observations)"}

        # Get factor levels
        factor1_levels = clean_df[x_column].unique()
        factor2_levels = clean_df[color_column].unique()

        if len(factor1_levels) < 2 or len(factor2_levels) < 2:
            return {"error": "Need at least 2 levels per factor for two-way ANOVA"}

        n_total = len(clean_df)

        # Degrees of freedom. Both factors have at least 2 levels here, so the
        # effect degrees of freedom are all >= 1.
        df_a = len(factor1_levels) - 1
        df_b = len(factor2_levels) - 1
        df_ab = df_a * df_b
        df_error = n_total - len(factor1_levels) * len(factor2_levels)

        # Without residual degrees of freedom there is no error variance to
        # test against; report that instead of inventing F and p values.
        if df_error <= 0:
            return {"error": "Insufficient data for two-way ANOVA "
                             "(need more observations than factor-level combinations)"}

        grand_mean = clean_df[y_column].mean()
        ss_total = np.sum((clean_df[y_column] - grand_mean) ** 2)

        # Marginal groups and means, computed once per level instead of once per cell.
        a_groups = {level: clean_df[clean_df[x_column] == level][y_column] for level in factor1_levels}
        b_groups = {level: clean_df[clean_df[color_column] == level][y_column] for level in factor2_levels}
        a_means = {level: np.mean(values) for level, values in a_groups.items()}
        b_means = {level: np.mean(values) for level, values in b_groups.items()}

        # Factor A (x_column) effect
        ss_a = 0
        for level, values in a_groups.items():
            ss_a += len(values) * (a_means[level] - grand_mean) ** 2

        # Factor B (color_column) effect
        ss_b = 0
        for level, values in b_groups.items():
            ss_b += len(values) * (b_means[level] - grand_mean) ** 2

        # Interaction effect: deviation of each cell mean from the additive prediction.
        ss_ab = 0
        for a_level in factor1_levels:
            in_a = clean_df[x_column] == a_level
            for b_level in factor2_levels:
                cell_data = clean_df[in_a & (clean_df[color_column] == b_level)][y_column]
                if len(cell_data) > 0:
                    cell_mean = np.mean(cell_data)
                    ss_ab += len(cell_data) * (cell_mean - a_means[a_level] - b_means[b_level] + grand_mean) ** 2

        # Error sum of squares
        ss_error = ss_total - ss_a - ss_b - ss_ab

        # Mean squares
        ms_a = ss_a / df_a
        ms_b = ss_b / df_b
        ms_ab = ss_ab / df_ab
        ms_error = ss_error / df_error

        def f_and_p(ms_effect, df_effect):
            """Return (F, p) for one effect against the shared error term."""
            f_value = ms_effect / ms_error if ms_error > 0 else 0
            # sf is the upper tail directly; 1 - cdf loses precision for tiny p-values.
            p = stats.f.sf(f_value, df_effect, df_error) if f_value > 0 else 1
            return f_value, p

        f_a, p_a = f_and_p(ms_a, df_a)
        f_b, p_b = f_and_p(ms_b, df_b)
        f_ab, p_ab = f_and_p(ms_ab, df_ab)

        # Effect sizes (partial eta squared)
        eta_squared_a = self.calculate_partial_eta_squared(ss_a, ss_error)
        eta_squared_b = self.calculate_partial_eta_squared(ss_b, ss_error)
        eta_squared_ab = self.calculate_partial_eta_squared(ss_ab, ss_error)

        results = {
            "test_type": "Two-way ANOVA",
            "factor_a": {
                "name": x_column,
                "f_statistic": f_a,
                "p_value": p_a,
                "df": df_a,
                "partial_eta_squared": eta_squared_a,
                "interpretation": self.interpret_effect_size(eta_squared_a, "partial_eta_squared")
            },
            "factor_b": {
                "name": color_column,
                "f_statistic": f_b,
                "p_value": p_b,
                "df": df_b,
                "partial_eta_squared": eta_squared_b,
                "interpretation": self.interpret_effect_size(eta_squared_b, "partial_eta_squared")
            },
            "interaction": {
                "name": f"{x_column} × {color_column}",
                "f_statistic": f_ab,
                "p_value": p_ab,
                "df": df_ab,
                "partial_eta_squared": eta_squared_ab,
                "interpretation": self.interpret_effect_size(eta_squared_ab, "partial_eta_squared")
            },
            "df_error": df_error,
            "sample_size": n_total,
            "factor_a_levels": list(factor1_levels),
            "factor_b_levels": list(factor2_levels)
        }

        return results

    except Exception as e:
        # Callers expect an "error" dict rather than an exception.
        return {"error": f"Error performing two-way ANOVA: {str(e)}"}
|
| 935 |
+
|
| 936 |
+
def perform_simple_regression(self, df: pd.DataFrame, x_column: str, y_column: str) -> Dict[str, Any]:
    """
    Perform simple linear regression analysis.

    Rows with a missing value in either column are dropped, then a Pearson
    correlation and an ordinary least-squares line (y on x) are computed.

    Args:
        df: DataFrame containing the data
        x_column: Independent variable (numeric)
        y_column: Dependent variable (numeric)

    Returns:
        Dict containing regression results and effect sizes, or a dict with
        a single "error" key when the analysis cannot be run
    """
    try:
        # Remove missing values
        clean_df = df[[x_column, y_column]].dropna()
        n = len(clean_df)

        # With fewer than 3 points the (n - 2) denominator below would be zero or negative.
        if n < 3:
            return {"error": "Insufficient data for regression (need at least 3 observations)"}

        x = clean_df[x_column].values
        y = clean_df[y_column].values

        # Calculate correlation
        correlation, corr_p = stats.pearsonr(x, y)

        # Simple linear regression; std_err is the standard error of the slope.
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

        # Calculate additional statistics
        r_squared = r_value ** 2
        adjusted_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - 2)

        # Effect sizes
        cohens_f_squared = self.calculate_cohens_f_squared(r_squared)

        # t-statistic for slope, from the standard error linregress already
        # computed rather than re-deriving it by hand.
        t_stat = slope / std_err if std_err > 0 else 0

        results = {
            "test_type": "Simple Linear Regression",
            "correlation": {
                "pearson_r": correlation,
                "p_value": corr_p,
                # The "r_squared" cutoffs are on the squared scale, so square r
                # before classifying it; passing |r| would overstate the effect.
                "interpretation": self.interpret_effect_size(correlation ** 2, "r_squared")
            },
            "regression": {
                "slope": slope,
                "intercept": intercept,
                "r_squared": r_squared,
                "adjusted_r_squared": adjusted_r_squared,
                "p_value": p_value,
                "standard_error": std_err,
                "t_statistic": t_stat,
                "cohens_f_squared": cohens_f_squared,
                "f_squared_interpretation": self.interpret_effect_size(cohens_f_squared, "cohens_f")
            },
            "sample_size": n,
            "variance_explained": f"{r_squared * 100:.1f}%"
        }

        return results

    except Exception as e:
        # Callers expect an "error" dict rather than an exception.
        return {"error": f"Error performing regression: {str(e)}"}
|
uv.lock
CHANGED
|
@@ -1751,7 +1751,7 @@ requires-dist = [
|
|
| 1751 |
{ name = "plotly", specifier = ">=5.15.0" },
|
| 1752 |
{ name = "pyyaml", specifier = ">=6.0" },
|
| 1753 |
{ name = "scipy", specifier = ">=1.11.0" },
|
| 1754 |
-
{ name = "spacy",
|
| 1755 |
{ name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
|
| 1756 |
{ name = "spacy-transformers", specifier = ">=1.3.0" },
|
| 1757 |
{ name = "streamlit", specifier = ">=1.28.0" },
|
|
|
|
| 1751 |
{ name = "plotly", specifier = ">=5.15.0" },
|
| 1752 |
{ name = "pyyaml", specifier = ">=6.0" },
|
| 1753 |
{ name = "scipy", specifier = ">=1.11.0" },
|
| 1754 |
+
{ name = "spacy", specifier = ">=3.7.0" },
|
| 1755 |
{ name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
|
| 1756 |
{ name = "spacy-transformers", specifier = ">=1.3.0" },
|
| 1757 |
{ name = "streamlit", specifier = ">=1.28.0" },
|
web_app/app.py
CHANGED
|
@@ -41,7 +41,7 @@ st.set_page_config(
|
|
| 41 |
|
| 42 |
def main():
|
| 43 |
"""Main application entry point."""
|
| 44 |
-
st.title("
|
| 45 |
st.markdown("*Educational tools for lexical sophistication analysis, POS/dependency parsing, and word frequency visualization*")
|
| 46 |
|
| 47 |
# GPU status is already initialized in gpu_init module
|
|
|
|
| 41 |
|
| 42 |
def main():
|
| 43 |
"""Main application entry point."""
|
| 44 |
+
st.title("📊 Linguistic Data Analysis I - Text Analysis Tools")
|
| 45 |
st.markdown("*Educational tools for lexical sophistication analysis, POS/dependency parsing, and word frequency visualization*")
|
| 46 |
|
| 47 |
# GPU status is already initialized in gpu_init module
|
web_app/handlers/corpus_viz_handlers.py
CHANGED
|
@@ -695,7 +695,7 @@ class CorpusVizHandlers:
|
|
| 695 |
# Get custom category orders if set
|
| 696 |
category_orders = st.session_state.corpus_viz_category_orders
|
| 697 |
|
| 698 |
-
fig = visualizer.create_boxplot(
|
| 699 |
x_column=x_column,
|
| 700 |
y_column=y_column,
|
| 701 |
color_column=color_column,
|
|
@@ -710,7 +710,8 @@ class CorpusVizHandlers:
|
|
| 710 |
'title': title,
|
| 711 |
'x_column': x_column,
|
| 712 |
'y_column': y_column,
|
| 713 |
-
'color_column': color_column
|
|
|
|
| 714 |
}
|
| 715 |
|
| 716 |
# Add to plots list
|
|
@@ -763,12 +764,6 @@ class CorpusVizHandlers:
|
|
| 763 |
if size_column == "None":
|
| 764 |
size_column = None
|
| 765 |
|
| 766 |
-
add_trendline = st.checkbox(
|
| 767 |
-
"Add Trendline",
|
| 768 |
-
value=False,
|
| 769 |
-
help="Add a linear trendline to the scatter plot"
|
| 770 |
-
)
|
| 771 |
-
|
| 772 |
title = st.text_input(
|
| 773 |
"Plot Title",
|
| 774 |
value=f"Scatter Plot: {y_column} vs {x_column}",
|
|
@@ -782,13 +777,15 @@ class CorpusVizHandlers:
|
|
| 782 |
# Get custom category orders if set
|
| 783 |
category_orders = st.session_state.corpus_viz_category_orders
|
| 784 |
|
| 785 |
-
#
|
| 786 |
-
fig = visualizer.create_scatterplot(
|
| 787 |
x_column=x_column,
|
| 788 |
y_column=y_column,
|
| 789 |
color_column=color_column,
|
| 790 |
title=title,
|
| 791 |
-
category_orders=category_orders
|
|
|
|
|
|
|
| 792 |
)
|
| 793 |
|
| 794 |
# Store in session state
|
|
@@ -800,7 +797,9 @@ class CorpusVizHandlers:
|
|
| 800 |
'y_column': y_column,
|
| 801 |
'color_column': color_column,
|
| 802 |
'size_column': size_column,
|
| 803 |
-
'add_trendline':
|
|
|
|
|
|
|
| 804 |
}
|
| 805 |
|
| 806 |
# Add to plots list
|
|
@@ -813,10 +812,124 @@ class CorpusVizHandlers:
|
|
| 813 |
except Exception as e:
|
| 814 |
st.error(f"Error creating scatter plot: {str(e)}")
|
| 815 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 816 |
@staticmethod
|
| 817 |
def render_visualization_results():
|
| 818 |
"""
|
| 819 |
-
Render visualization results.
|
| 820 |
"""
|
| 821 |
plots = st.session_state.corpus_viz_plots
|
| 822 |
|
|
@@ -831,6 +944,12 @@ class CorpusVizHandlers:
|
|
| 831 |
# Display the plot
|
| 832 |
st.plotly_chart(plot_config['figure'], use_container_width=True)
|
| 833 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 834 |
# Plot details
|
| 835 |
col1, col2, col3 = st.columns(3)
|
| 836 |
|
|
@@ -848,7 +967,7 @@ class CorpusVizHandlers:
|
|
| 848 |
with col3:
|
| 849 |
if plot_config['color_column']:
|
| 850 |
st.write(f"**Color By:** {plot_config['color_column']}")
|
| 851 |
-
if plot_config['type'] == 'scatter' and plot_config
|
| 852 |
st.write(f"**Size By:** {plot_config['size_column']}")
|
| 853 |
|
| 854 |
# Remove plot button
|
|
|
|
| 695 |
# Get custom category orders if set
|
| 696 |
category_orders = st.session_state.corpus_viz_category_orders
|
| 697 |
|
| 698 |
+
fig, stats_results = visualizer.create_boxplot(
|
| 699 |
x_column=x_column,
|
| 700 |
y_column=y_column,
|
| 701 |
color_column=color_column,
|
|
|
|
| 710 |
'title': title,
|
| 711 |
'x_column': x_column,
|
| 712 |
'y_column': y_column,
|
| 713 |
+
'color_column': color_column,
|
| 714 |
+
'stats_results': stats_results
|
| 715 |
}
|
| 716 |
|
| 717 |
# Add to plots list
|
|
|
|
| 764 |
if size_column == "None":
|
| 765 |
size_column = None
|
| 766 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 767 |
title = st.text_input(
|
| 768 |
"Plot Title",
|
| 769 |
value=f"Scatter Plot: {y_column} vs {x_column}",
|
|
|
|
| 777 |
# Get custom category orders if set
|
| 778 |
category_orders = st.session_state.corpus_viz_category_orders
|
| 779 |
|
| 780 |
+
# Create scatter plot with statistical analysis (trendline and confidence intervals enabled by default)
|
| 781 |
+
fig, stats_results = visualizer.create_scatterplot(
|
| 782 |
x_column=x_column,
|
| 783 |
y_column=y_column,
|
| 784 |
color_column=color_column,
|
| 785 |
title=title,
|
| 786 |
+
category_orders=category_orders,
|
| 787 |
+
add_trendline=True,
|
| 788 |
+
add_confidence_interval=True
|
| 789 |
)
|
| 790 |
|
| 791 |
# Store in session state
|
|
|
|
| 797 |
'y_column': y_column,
|
| 798 |
'color_column': color_column,
|
| 799 |
'size_column': size_column,
|
| 800 |
+
'add_trendline': True,
|
| 801 |
+
'add_confidence_interval': True,
|
| 802 |
+
'stats_results': stats_results
|
| 803 |
}
|
| 804 |
|
| 805 |
# Add to plots list
|
|
|
|
| 812 |
except Exception as e:
|
| 813 |
st.error(f"Error creating scatter plot: {str(e)}")
|
| 814 |
|
| 815 |
+
@staticmethod
|
| 816 |
+
def render_statistical_results(stats_results: Dict[str, Any]):
|
| 817 |
+
"""
|
| 818 |
+
Render statistical analysis results in a formatted way.
|
| 819 |
+
|
| 820 |
+
Args:
|
| 821 |
+
stats_results: Statistical results dictionary
|
| 822 |
+
"""
|
| 823 |
+
if not stats_results or 'error' in stats_results:
|
| 824 |
+
if stats_results and 'error' in stats_results:
|
| 825 |
+
st.error(f"Statistical analysis error: {stats_results['error']}")
|
| 826 |
+
return
|
| 827 |
+
|
| 828 |
+
st.write("### 📊 Statistical Analysis")
|
| 829 |
+
|
| 830 |
+
test_type = stats_results.get('test_type', 'Unknown')
|
| 831 |
+
|
| 832 |
+
if test_type == "One-way ANOVA":
|
| 833 |
+
# One-way ANOVA results in APA table format
|
| 834 |
+
f_stat = stats_results.get('f_statistic', 0)
|
| 835 |
+
p_val = stats_results.get('p_value', 1)
|
| 836 |
+
p_display = f"{p_val:.3f}" if p_val >= 0.001 else "< .001"
|
| 837 |
+
eta_sq = stats_results.get('eta_squared', 0)
|
| 838 |
+
interpretation = stats_results.get('eta_squared_interpretation', 'Unknown')
|
| 839 |
+
df_between = stats_results.get('df_between', 0)
|
| 840 |
+
df_within = stats_results.get('df_within', 0)
|
| 841 |
+
|
| 842 |
+
# Create APA-style table
|
| 843 |
+
anova_table = f"""
|
| 844 |
+
| Statistic | Value | df | p | η² | Effect Size |
|
| 845 |
+
|-----------|-------|----|----|----|-----------|
|
| 846 |
+
| F | {f_stat:.3f} | ({df_between}, {df_within}) | {p_display} | {eta_sq:.3f} | {interpretation} |
|
| 847 |
+
"""
|
| 848 |
+
st.markdown(anova_table)
|
| 849 |
+
|
| 850 |
+
# Sample size
|
| 851 |
+
sample_size = stats_results.get('sample_size', 0)
|
| 852 |
+
st.write(f"**Model:** F({df_between}, {df_within}) = {f_stat:.3f}, p = {p_display}, N = {sample_size}")
|
| 853 |
+
|
| 854 |
+
# Post hoc results
|
| 855 |
+
if 'posthoc' in stats_results and stats_results['posthoc']:
|
| 856 |
+
st.write("**Post Hoc Comparisons (Tukey HSD):**")
|
| 857 |
+
|
| 858 |
+
posthoc_data = []
|
| 859 |
+
for comparison in stats_results['posthoc']:
|
| 860 |
+
group1 = comparison.get('group1', '')
|
| 861 |
+
group2 = comparison.get('group2', '')
|
| 862 |
+
cohens_d = comparison.get('cohens_d', 0)
|
| 863 |
+
d_interp = comparison.get('cohens_d_interpretation', '')
|
| 864 |
+
p_val = comparison.get('p_value', 1)
|
| 865 |
+
mean_diff = comparison.get('mean_diff', 0)
|
| 866 |
+
|
| 867 |
+
p_display = f"{p_val:.3f}" if p_val >= 0.001 else "< 0.001"
|
| 868 |
+
|
| 869 |
+
posthoc_data.append({
|
| 870 |
+
'Comparison': f"{group1} vs {group2}",
|
| 871 |
+
'Mean Diff': f"{mean_diff:.3f}",
|
| 872 |
+
"Cohen's d": f"{cohens_d:.3f}",
|
| 873 |
+
'Effect Size': d_interp,
|
| 874 |
+
'p-value': p_display
|
| 875 |
+
})
|
| 876 |
+
|
| 877 |
+
if posthoc_data:
|
| 878 |
+
st.dataframe(pd.DataFrame(posthoc_data), use_container_width=True)
|
| 879 |
+
|
| 880 |
+
elif test_type == "Two-way ANOVA":
|
| 881 |
+
# Two-way ANOVA results in APA table format
|
| 882 |
+
st.write("**Main Effects and Interaction:**")
|
| 883 |
+
|
| 884 |
+
# Factor A
|
| 885 |
+
factor_a = stats_results.get('factor_a', {})
|
| 886 |
+
factor_b = stats_results.get('factor_b', {})
|
| 887 |
+
interaction = stats_results.get('interaction', {})
|
| 888 |
+
df_error = stats_results.get('df_error', 0)
|
| 889 |
+
|
| 890 |
+
# Create APA-style table
|
| 891 |
+
twoway_table = "| Source | F | df | p | ηp² | Effect Size |\n|--------|---|----|----|-----|-------------|\n"
|
| 892 |
+
|
| 893 |
+
for effect_name, effect_data in [("Factor A", factor_a), ("Factor B", factor_b), ("A × B", interaction)]:
|
| 894 |
+
f_stat = effect_data.get('f_statistic', 0)
|
| 895 |
+
p_val = effect_data.get('p_value', 1)
|
| 896 |
+
df = effect_data.get('df', 0)
|
| 897 |
+
eta_sq = effect_data.get('partial_eta_squared', 0)
|
| 898 |
+
interpretation = effect_data.get('interpretation', '')
|
| 899 |
+
|
| 900 |
+
p_display = f"{p_val:.3f}" if p_val >= 0.001 else "< .001"
|
| 901 |
+
|
| 902 |
+
twoway_table += f"| {effect_name} | {f_stat:.3f} | ({df}, {df_error}) | {p_display} | {eta_sq:.3f} | {interpretation} |\n"
|
| 903 |
+
|
| 904 |
+
st.markdown(twoway_table)
|
| 905 |
+
|
| 906 |
+
sample_size = stats_results.get('sample_size', 0)
|
| 907 |
+
st.write(f"**Sample Size:** N = {sample_size}, df_error = {df_error}")
|
| 908 |
+
|
| 909 |
+
elif test_type == "Simple Linear Regression":
|
| 910 |
+
# Bivariate correlation results in APA table format
|
| 911 |
+
correlation = stats_results.get('correlation', {})
|
| 912 |
+
regression = stats_results.get('regression', {})
|
| 913 |
+
|
| 914 |
+
pearson_r = correlation.get('pearson_r', 0)
|
| 915 |
+
corr_p = correlation.get('p_value', 1)
|
| 916 |
+
corr_p_display = f"{corr_p:.3f}" if corr_p >= 0.001 else "< .001"
|
| 917 |
+
corr_interp = correlation.get('interpretation', 'Unknown')
|
| 918 |
+
r_squared = regression.get('r_squared', 0)
|
| 919 |
+
|
| 920 |
+
# Create APA-style table for bivariate correlation
|
| 921 |
+
correlation_table = f"""
|
| 922 |
+
| Statistic | Value | p | Effect Size |
|
| 923 |
+
|-----------|-------|---|-------------|
|
| 924 |
+
| Pearson r | {pearson_r:.3f} | {corr_p_display} | {corr_interp} |
|
| 925 |
+
| R² | {r_squared:.3f} | {corr_p_display} | - |
|
| 926 |
+
"""
|
| 927 |
+
st.markdown(correlation_table)
|
| 928 |
+
|
| 929 |
@staticmethod
|
| 930 |
def render_visualization_results():
|
| 931 |
"""
|
| 932 |
+
Render visualization results with statistical analysis.
|
| 933 |
"""
|
| 934 |
plots = st.session_state.corpus_viz_plots
|
| 935 |
|
|
|
|
| 944 |
# Display the plot
|
| 945 |
st.plotly_chart(plot_config['figure'], use_container_width=True)
|
| 946 |
|
| 947 |
+
# Display statistical results if available
|
| 948 |
+
if 'stats_results' in plot_config and plot_config['stats_results']:
|
| 949 |
+
CorpusVizHandlers.render_statistical_results(plot_config['stats_results'])
|
| 950 |
+
|
| 951 |
+
st.markdown("---")
|
| 952 |
+
|
| 953 |
# Plot details
|
| 954 |
col1, col2, col3 = st.columns(3)
|
| 955 |
|
|
|
|
| 967 |
with col3:
|
| 968 |
if plot_config['color_column']:
|
| 969 |
st.write(f"**Color By:** {plot_config['color_column']}")
|
| 970 |
+
if plot_config['type'] == 'scatter' and plot_config.get('size_column'):
|
| 971 |
st.write(f"**Size By:** {plot_config['size_column']}")
|
| 972 |
|
| 973 |
# Remove plot button
|