File size: 4,299 Bytes
5b6c556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pandas as pd
from scipy.stats import kruskal, mannwhitneyu, spearmanr
import os

def load_and_preprocess_data(filepath='user_study/data/user_data.csv'):
    """Load the user study CSV and derive the ease-of-use score.

    Args:
        filepath: Path to the CSV file to read.

    Returns:
        The DataFrame read from ``filepath``. When the file has an
        ``attr_q_cognitive_load`` column, an ``attr_q_ease_of_use`` column
        holding ``6 - attr_q_cognitive_load`` is added.

    Raises:
        FileNotFoundError: If nothing exists at ``filepath``.
    """
    if os.path.exists(filepath):
        frame = pd.read_csv(filepath)
        # Ease of use is stored as the reversed cognitive-load rating.
        if 'attr_q_cognitive_load' in frame:
            reversed_load = 6 - frame['attr_q_cognitive_load']
            frame['attr_q_ease_of_use'] = reversed_load
        return frame

    raise FileNotFoundError(f"The data file was not found at {filepath}")

def run_ux_ratings_test(df):
    """Compare overall UX ratings across the three pages and print the result.

    For each page, its three questionnaire columns are averaged row-wise and
    rows whose average is NaN are dropped. The three resulting samples are
    compared with a Kruskal-Wallis H-test at the 0.05 level.

    Args:
        df: DataFrame containing the nine ``attr_q_*``, ``fv_q_*`` and
            ``ct_q_*`` rating columns listed below.
    """
    print("\n--- 1. UX Ratings Comparison Across Pages ---")

    # Questionnaire items that make up each page's overall rating.
    question_columns = {
        'Attribution': [
            'attr_q_visual_clarity',
            'attr_q_ease_of_use',
            'attr_q_influencer_plausibility',
        ],
        'Function Vectors': [
            'fv_q_pca_clarity',
            'fv_q_type_attribution_clarity',
            'fv_q_layer_evolution_plausibility',
        ],
        'Circuit Trace': [
            'ct_q_main_graph_clarity',
            'ct_q_feature_explorer_usefulness',
            'ct_q_subnetwork_clarity',
        ],
    }
    samples = [
        df[columns].mean(axis=1).dropna()
        for columns in question_columns.values()
    ]

    # Every page needs at least one usable score for the test to run.
    if any(len(sample) == 0 for sample in samples):
        print("Could not perform Kruskal-Wallis test due to insufficient data.")
        return

    h_stat, p_value = kruskal(*samples)
    print("Kruskal-Wallis test for overall UX ratings across the three pages:")
    print(f"H-statistic: {h_stat:.4f}, p-value: {p_value:.4f}")
    print(
        "Result: There is a statistically significant difference in UX ratings between the pages."
        if p_value < 0.05
        else "Result: There is no statistically significant difference in UX ratings between the pages."
    )

def run_language_comparison_test(df):
    """Compare Attribution-page ease of use between 'en' and 'de' rows.

    Rows are grouped by the ``language`` column. The non-NaN
    ``attr_q_ease_of_use`` values of the two groups are compared with a
    two-sided Mann-Whitney U test at the 0.05 level, and the outcome is
    printed.

    Args:
        df: DataFrame with ``language`` and ``attr_q_ease_of_use`` columns.
    """
    print("\n--- 2. Language Comparison for Attribution Page Ease of Use ---")

    def ease_scores_for(lang_code):
        # Non-missing ease-of-use ratings for one language group.
        in_group = df['language'] == lang_code
        return df.loc[in_group, 'attr_q_ease_of_use'].dropna()

    english = ease_scores_for('en')
    german = ease_scores_for('de')

    # Both groups need at least one rating for the comparison to run.
    if english.empty or german.empty:
        print("Could not perform Mann-Whitney U test due to insufficient data.")
        return

    u_stat, p_value = mannwhitneyu(english, german, alternative='two-sided')
    print("Mann-Whitney U test for 'Ease of Use' on Attribution page (English vs. German):")
    print(f"U-statistic: {u_stat:.4f}, p-value: {p_value:.4f}")
    print(
        "Result: There is a statistically significant difference between the language groups."
        if p_value < 0.05
        else "Result: There is no statistically significant difference between the language groups."
    )

def run_experience_correctness_test(df):
    """Test for a correlation between LLM experience and correctness.

    ``llm_experience`` is mapped to an ordinal scale (novice=1,
    intermediate=2, expert=3); values outside that mapping become NaN. Every
    column whose name contains ``'correct'`` is averaged row-wise into an
    overall correctness score. Rows with both values present are passed to a
    Spearman rank correlation, evaluated at the 0.05 level, and the outcome
    is printed.

    Side effect: adds the ``llm_experience_ordinal`` and
    ``overall_correctness`` columns to ``df`` in place.

    Args:
        df: DataFrame with an ``llm_experience`` column and one or more
            columns whose name contains ``'correct'``.
    """
    print("\n--- 3. Correlation between LLM Experience and Comprehension Correctness ---")

    experience_map = {'novice': 1, 'intermediate': 2, 'expert': 3}
    df['llm_experience_ordinal'] = df['llm_experience'].map(experience_map)

    correct_cols = [col for col in df.columns if 'correct' in col]
    df['overall_correctness'] = df[correct_cols].mean(axis=1)

    corr_df = df[['llm_experience_ordinal', 'overall_correctness']].dropna()

    # Spearman's rho is undefined when either variable has fewer than two
    # distinct values (this also covers zero or one usable row). spearmanr
    # would return NaN, and `nan < 0.05` is False, which would be reported
    # as "no significant correlation" even though no test was performed.
    experience_varies = corr_df['llm_experience_ordinal'].nunique() > 1
    correctness_varies = corr_df['overall_correctness'].nunique() > 1
    if not (experience_varies and correctness_varies):
        print("Could not perform Spearman correlation due to insufficient data.")
        return

    corr, p = spearmanr(corr_df['llm_experience_ordinal'], corr_df['overall_correctness'])
    print("Spearman correlation between LLM experience and overall comprehension correctness:")
    print(f"Rho: {corr:.4f}, p-value: {p:.4f}")
    if p < 0.05:
        print("Result: There is a statistically significant correlation.")
    else:
        print("Result: There is no statistically significant correlation.")

if __name__ == '__main__':
    # Script entry point: load the study data once, then run each analysis
    # in order. Any failure is reported as a single printed message.
    try:
        data = load_and_preprocess_data('../../user_study/data/user_data.csv')
        for analysis in (
            run_ux_ratings_test,
            run_language_comparison_test,
            run_experience_correctness_test,
        ):
            analysis(data)
    except Exception as e:
        print(f"An error occurred: {e}")