# Source: compositeScores / src/streamlit_app.py (Hugging Face Space, uploaded by hfariborzi)
# Last commit: "Update src/streamlit_app.py" (52d845e, verified)
"""
COMPOSITE CORRELATION CALCULATOR - COMPLETE EXPLANATION
========================================================
This module implements unit-weighted composite correlation calculation from a correlation matrix.
It uses the classical test theory formula to compute correlations between a composite (sum of items)
and all other variables, without needing raw data.
Author: HubMeta Team
Date: February 2026
"""
import os
# Environment overrides for hosted deployment. These assignments mutate the
# process environment as a module-level side effect.
# NOTE(review): they are placed before `import streamlit`, presumably so that
# Streamlit sees them when it is imported/initialised - keep this ordering.
# On Huggingface Spaces the home directory may be unwritable; override it to the current working directory
os.environ['HOME'] = os.getcwd()
# Disable Streamlit usage stats to avoid write attempts
os.environ['STREAMLIT_BROWSER_GATHER_USAGE_STATS'] = 'false'
import streamlit as st
import pandas as pd
import numpy as np
def composite_correlations(R, composite_idx, var_names=None, augment=False):
    """
    Compute unit-weighted composite correlations from a correlation matrix.

    The composite is the unweighted sum of the selected (standardized) items.
    Its correlation with every variable in ``R`` is obtained with the
    "correlation of sums" formula, so no raw data are required.

    Mathematical Background
    -----------------------
    For a unit-weighted composite Y = X₁ + X₂ + ... + Xₖ of standardized
    items, the correlation between Y and any variable Z is:

        r(Y, Z) = Σᵢ r(Xᵢ, Z) / σ_Y,    σ_Y = sqrt(k + k(k-1)×r̄)

    where:
    - k = number of items in the composite
    - r̄ = mean of the off-diagonal correlations among the composite items
    - Σᵢ r(Xᵢ, Z) = sum of the correlations between each item and Z

    Var(Y) = k + 2×Σᵢ<ⱼ r(Xᵢ, Xⱼ) = k + k(k-1)×r̄ because each standardized
    item has variance 1 and there are k(k-1)/2 distinct item pairs.

    Parameters
    ----------
    R : array-like, shape (n_vars, n_vars)
        Full correlation matrix (numpy array or pandas DataFrame). Expected to
        be symmetric with 1s on the diagonal; this is not enforced.
    composite_idx : sequence of int
        Positions of the variables that form the composite,
        e.g. ``[0, 2, 5]``. Must contain at least one index. Negative
        positions follow numpy indexing rules.
    var_names : sequence of str, optional
        Names of all variables in ``R`` (length ``n_vars``). If given, the
        outputs are labelled pandas objects.
    augment : bool, default=False
        If True, also return ``R`` extended by the composite as the last
        row/column.

    Returns
    -------
    r_comp : numpy.ndarray or pandas.Series
        Correlation of the composite with each variable. A Series named
        ``"Composite"`` indexed by ``var_names`` when names are given,
        otherwise an array of shape (n_vars,).
    R_aug : numpy.ndarray or pandas.DataFrame
        Only returned when ``augment=True``. The (n_vars+1, n_vars+1) matrix
        with the composite appended; labelled ``"Composite"`` when
        ``var_names`` is given.

    Raises
    ------
    ValueError
        If ``R`` is not a square 2-D matrix, ``composite_idx`` is empty or is
        not a 1-D sequence of integers, ``var_names`` has the wrong length, or
        the composite variance ``k + k(k-1)×r̄`` is non-positive or undefined
        (e.g. missing values among the composite items), in which case the
        correlations cannot be computed.
    IndexError
        If an entry of ``composite_idx`` is out of range for ``R``.

    Examples
    --------
    >>> R = np.array([
    ...     [1.0, 0.5, 0.6, 0.3, 0.4],
    ...     [0.5, 1.0, 0.7, 0.2, 0.3],
    ...     [0.6, 0.7, 1.0, 0.4, 0.5],
    ...     [0.3, 0.2, 0.4, 1.0, 0.8],
    ...     [0.4, 0.3, 0.5, 0.8, 1.0]
    ... ])
    >>> # Composite of the first 3 variables: r̄ = 0.6, σ_Y = sqrt(6.6) ≈ 2.57
    >>> r_comp = composite_correlations(R, composite_idx=[0, 1, 2])
    >>> print(np.round(r_comp, 2))
    [0.82 0.86 0.9  0.35 0.47]
    >>> var_names = ['Item1', 'Item2', 'Item3', 'Outcome1', 'Outcome2']
    >>> r_comp, R_aug = composite_correlations(
    ...     R, composite_idx=[0, 1, 2], var_names=var_names, augment=True
    ... )
    >>> print(r_comp.round(2))
    Item1       0.82
    Item2       0.86
    Item3       0.90
    Outcome1    0.35
    Outcome2    0.47
    Name: Composite, dtype: float64

    Notes
    -----
    - Items that belong to the composite correlate with it partly by
      construction (part-whole overlap); how high depends on k and on the
      inter-item correlations.
    - Unit weighting is assumed (all items weighted equally).
    - A single-item "composite" (k = 1) simply reproduces that item's row.

    References
    ----------
    - Nunnally, J. C., & Bernstein, I. H. (1994). Psychometric Theory (3rd ed.).
      McGraw-Hill. Chapter 6: The Assessment of Reliability.
    - Schmidt, F. L., & Hunter, J. E. (2015). Methods of Meta-Analysis (3rd ed.).
      Sage Publications. Chapter 3: Correlational Artifacts.
    """
    # Convert input to a float array (handles both arrays and DataFrames)
    R_mat = np.asarray(R, dtype=float)
    if R_mat.ndim != 2 or R_mat.shape[0] != R_mat.shape[1]:
        raise ValueError(
            f"R must be a square 2-D correlation matrix, got shape {R_mat.shape}"
        )
    n_all = R_mat.shape[0]  # Total number of variables

    # Validate the composite definition up front so that bad input fails with
    # a clear message instead of silently producing NaN (0/0 for an empty list).
    idx = np.asarray(composite_idx)
    if idx.size == 0:
        raise ValueError("composite_idx must contain at least one index")
    if idx.ndim != 1 or not np.issubdtype(idx.dtype, np.integer):
        raise ValueError("composite_idx must be a 1-D sequence of integer positions")
    if idx.min() < -n_all or idx.max() >= n_all:
        raise IndexError(
            f"composite_idx entries must be valid positions for {n_all} variables"
        )
    if var_names is not None and len(var_names) != n_all:
        raise ValueError(
            f"var_names has length {len(var_names)} but R has {n_all} variables"
        )
    k = idx.size  # Number of items in the composite

    # STEP 1: k×k sub-matrix of correlations among the composite items
    R_yy = R_mat[np.ix_(idx, idx)]

    # STEP 2: average inter-item correlation (upper triangle, diagonal excluded);
    # defined as 0 for a single item, where there are no item pairs.
    rbar = R_yy[np.triu_indices(k, k=1)].mean() if k > 1 else 0.0

    # STEP 3: composite variance and standard deviation.
    # Strongly negative inter-item correlations (or NaNs among the items) make
    # the variance non-positive/undefined; sqrt would then yield NaN or the
    # division would blow up, so reject that case explicitly.
    var_y = k + k * (k - 1) * rbar
    if not np.isfinite(var_y) or var_y <= 0:
        raise ValueError(
            "Composite variance k + k(k-1)*rbar is non-positive or undefined "
            f"(k={k}, rbar={rbar}); composite correlations cannot be computed"
        )
    denom = np.sqrt(var_y)

    # STEP 4: for every variable, sum its correlations with all composite items
    # (rows = composite items, summed down each column).
    numer = R_mat[idx, :].sum(axis=0)

    # STEP 5: correlation between the composite and each variable
    r_values = numer / denom
    if var_names is not None:
        r_comp = pd.Series(r_values, index=var_names, name="Composite")
    else:
        r_comp = r_values

    if not augment:
        return r_comp

    # STEP 6 (optional): append the composite as the last row/column.
    R_aug = np.empty((n_all + 1, n_all + 1))
    R_aug[:n_all, :n_all] = R_mat       # original matrix in the top-left block
    R_aug[n_all, :n_all] = r_values     # last row: composite vs. all variables
    R_aug[:n_all, n_all] = r_values     # last column: all variables vs. composite
    R_aug[n_all, n_all] = 1.0           # composite with itself
    if var_names is not None:
        labels = list(var_names) + ["Composite"]
        R_aug = pd.DataFrame(R_aug, index=labels, columns=labels)
    return r_comp, R_aug
# =============================================================================
# STREAMLIT WEB APPLICATION
# =============================================================================
# Streamlit UI
# Flow: upload CSV -> validate and symmetrize -> choose composite items ->
# compute composite correlations -> display and offer CSV downloads.
st.title("Composite-Correlation Calculator")
st.markdown("""
### What This Tool Does
This calculator computes **unit-weighted composite correlations** from a correlation matrix.
**Use Case:** You have a correlation matrix and want to:
1. Combine multiple items into a composite score (e.g., sum of survey items)
2. See how the composite correlates with other variables
3. Add the composite to your correlation matrix for further analysis
**How It Works:**
- Upload a correlation matrix (CSV file)
- Select which variables form the composite
- Get correlations between the composite and all variables
- Optionally get an augmented matrix with the composite included
**Formula:** Uses the psychometric formula `r(Composite, X) = Σr(item, X) / sqrt(k + k(k-1)×r̄)`
---
### Instructions
1. **Upload a CSV file** containing your correlation matrix
- First column should contain row labels (variable names)
- Can be lower-triangular (missing values will be filled by symmetry)
- Diagonal values will be set to 1.0 if missing
2. **Select variables** to include in the composite (minimum 2)
3. **Click "Compute"** to see results
""")
uploaded = st.file_uploader("Upload correlation matrix (CSV)", type=["csv"])
if uploaded is not None:
    # 1) Read and label the correlation matrix
    try:
        df = pd.read_csv(uploaded, index_col=0)
    except Exception as e:
        st.error(f"Failed to read CSV: {e}")
        st.info("Make sure the first column contains row labels (variable names).")
        st.stop()
    # Validate square matrix
    if df.shape[0] != df.shape[1]:
        st.error(f"Matrix must be square. Got {df.shape[0]} rows and {df.shape[1]} columns.")
        st.stop()
    # Validate that every cell is numeric before reporting success; a stray
    # text cell would otherwise surface as an unhandled exception.
    try:
        mat = df.values.astype(float)
    except (ValueError, TypeError) as e:
        st.error(f"Matrix contains non-numeric values: {e}")
        st.info("Every cell except the row/column labels must be a number or empty.")
        st.stop()
    st.success(f"✅ Loaded a {df.shape[0]}×{df.shape[1]} correlation matrix.")
    # Symmetrization below pairs cell (i, j) with cell (j, i), which is only
    # meaningful when rows and columns list the variables in the same order.
    if [str(label) for label in df.index] != [str(label) for label in df.columns]:
        st.warning("⚠️ Row labels and column labels differ; make sure rows and columns are in the same order.")
    # 2) Symmetrize and fill diagonal
    # Many correlation matrices are stored as lower-triangular to save space
    # This fills in the missing triangle by copying from the other triangle
    mat = np.where(np.isnan(mat), mat.T, mat)  # Fill missing cells by transpose (symmetry)
    np.fill_diagonal(mat, 1.0)  # Ensure diagonal = 1.0 (self-correlation)
    # Cells missing in BOTH triangles cannot be recovered and stay NaN.
    if np.isnan(mat).any():
        st.warning("⚠️ Some correlations are missing in both triangles; results that depend on them will be NaN.")
    # NaN comparisons evaluate to False, so missing cells are not flagged here.
    if (np.abs(mat) > 1.0 + 1e-8).any():
        st.warning("⚠️ Some values lie outside [-1, 1]; check that the file contains correlations.")
    df_sym = pd.DataFrame(mat, index=df.index, columns=df.columns)
    with st.expander("📊 View symmetrized correlation matrix"):
        st.dataframe(df_sym.style.format("{:.3f}"))
    # 3) Select composite variables
    all_vars = list(df_sym.columns)
    st.subheader("Select Composite Items")
    st.markdown("Choose which variables to combine into a unit-weighted composite:")
    composite_vars = st.multiselect(
        "Variables in composite",
        options=all_vars,
        default=all_vars[: min(3, len(all_vars))],  # Default to first 3 variables
        help="Select at least 2 variables to form a composite"
    )
    if len(composite_vars) < 2:
        st.warning("⚠️ Please select at least 2 variables to form a composite.")
    else:
        st.info(f"Selected {len(composite_vars)} items for composite: {', '.join(composite_vars)}")
        # NOTE(review): in Streamlit versions where clicking a download button
        # reruns the script, st.button() returns False on that rerun and this
        # results section disappears; consider persisting the results in
        # st.session_state if that matters.
        if st.button("🧮 Compute Composite Correlations", type="primary"):
            # Map the selected names to their positions in the matrix
            position = {name: pos for pos, name in enumerate(all_vars)}
            idx = [position[v] for v in composite_vars]
            # Compute composite correlations with augmented matrix
            try:
                r_comp, R_aug = composite_correlations(
                    df_sym.values,
                    composite_idx=idx,
                    var_names=all_vars,
                    augment=True
                )
            except ValueError as e:
                st.error(f"Could not compute composite correlations: {e}")
                st.stop()
            # Display results
            if np.all(np.isfinite(np.asarray(r_comp, dtype=float))):
                st.success("✅ Computation complete!")
            else:
                # Happens with missing cells or a non-positive composite
                # variance (strongly negative inter-item correlations).
                st.warning("⚠️ Some composite correlations are undefined (NaN/inf). Check for missing values or strongly negative correlations among the selected items.")
            # Show composite correlations
            st.subheader("📈 Composite Correlations")
            st.markdown("""
These are the correlations between your composite (sum of selected items)
and each variable in the matrix.
""")
            # Create a styled dataframe
            result_df = r_comp.to_frame()
            result_df.columns = ['Correlation with Composite']
            selected = set(composite_vars)

            # Highlight composite items
            def highlight_composite(row):
                if row.name in selected:
                    return ['background-color: #e3f2fd'] * len(row)
                return [''] * len(row)

            st.dataframe(
                result_df.style
                .format("{:.4f}")
                .apply(highlight_composite, axis=1)
                .bar(subset=['Correlation with Composite'], color='#1f77b4', vmin=-1, vmax=1)
            )
            st.caption("💡 Composite items (highlighted) typically have high correlations (0.8-1.0) with the composite.")
            # Show augmented matrix
            st.subheader("📊 Augmented Correlation Matrix")
            st.markdown("""
This matrix includes your composite as a new variable (last row/column).
You can use this for further analyses.
""")
            with st.expander("View full augmented matrix"):
                st.dataframe(R_aug.style.format("{:.3f}"))
            # Download options
            st.subheader("💾 Download Results")
            col1, col2 = st.columns(2)
            with col1:
                # Download composite correlations
                csv1 = result_df.to_csv()
                st.download_button(
                    label="Download Composite Correlations (CSV)",
                    data=csv1,
                    file_name="composite_correlations.csv",
                    mime="text/csv"
                )
            with col2:
                # Download augmented matrix
                csv2 = R_aug.to_csv()
                st.download_button(
                    label="Download Augmented Matrix (CSV)",
                    data=csv2,
                    file_name="augmented_correlation_matrix.csv",
                    mime="text/csv"
                )
            # Show interpretation guide
            with st.expander("📖 How to Interpret Results"):
                st.markdown("""
**Composite Correlations:**
- **High (0.7-1.0):** Strong relationship with composite
- **Moderate (0.3-0.7):** Moderate relationship
- **Low (0.0-0.3):** Weak relationship
- **Negative:** Inverse relationship
**Composite Items:**
- Should correlate highly (0.8-1.0) with the composite
- Lower values suggest the item doesn't fit well
- Consider removing items with r < 0.7
**Other Variables:**
- Correlation shows how well they relate to the composite
- Use for criterion validity, predictive validity, etc.
**Formula Used:**
```
r(Composite, X) = Σr(item, X) / sqrt(k + k(k-1)×r̄)
```
where k = number of items, r̄ = average inter-item correlation
""")
else:
    st.info("👆 Upload a CSV file containing your correlation matrix to get started.")
    # Show example
    with st.expander("📝 Example CSV Format"):
        st.markdown("""
Your CSV should look like this:
```
,Item1,Item2,Item3,Outcome1,Outcome2
Item1,1.0,0.5,0.6,0.3,0.4
Item2,0.5,1.0,0.7,0.2,0.3
Item3,0.6,0.7,1.0,0.4,0.5
Outcome1,0.3,0.2,0.4,1.0,0.8
Outcome2,0.4,0.3,0.5,0.8,1.0
```
Or lower-triangular (missing values will be filled):
```
,Item1,Item2,Item3,Outcome1,Outcome2
Item1,1.0,,,,
Item2,0.5,1.0,,,
Item3,0.6,0.7,1.0,,
Outcome1,0.3,0.2,0.4,1.0,
Outcome2,0.4,0.3,0.5,0.8,1.0
```
""")