"""
Gradio App for Entropy-Conserving Transformations
This app demonstrates how divergence-free vector fields can transform
arbitrary distributions towards Gaussian form while conserving entropy.
"""
import os
import tempfile

import gradio as gr
import matplotlib

matplotlib.use("Agg")  # select the non-interactive backend before pyplot is imported

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from entra import DataFrameTransformer, VectorSampler
def generate_uniform_data(n_per_dim: int = 20, dimensions: int = 2) -> pd.DataFrame:
    """Generate a uniform grid of points using VectorSampler."""
    center = [0.0] * dimensions  # grid centered at the origin (2D or 3D)
sampler = VectorSampler(
center=center,
delta_x=1,
num_points_per_dim=n_per_dim,
distribution="uniform",
)
points = sampler.sample()
    # Name the columns x, y (and z in 3D)
    df = pd.DataFrame(points, columns=["x", "y", "z"][:dimensions])
    return df
def generate_sample_csv(n_per_dim: int, dimensions: int):
"""Generate sample CSV and return as downloadable file."""
df = generate_uniform_data(n_per_dim, dimensions)
    # Save to a temporary file for download (portable across platforms)
    temp_path = os.path.join(tempfile.gettempdir(), "generated_uniform_data.csv")
    df.to_csv(temp_path, index=False)
n_points = len(df)
cols = list(df.columns)
preview = df.head(10).to_string()
return (
temp_path,
f"Generated {n_points} points with columns: {cols}\n\nPreview:\n{preview}",
df,
)
def load_csv_file(file):
    """Load an uploaded CSV file."""
    if file is None:
        return None, "No file uploaded", None
    # Gradio passes a filepath string or a file-like object depending on version
    path = file if isinstance(file, str) else file.name
    df = pd.read_csv(path)
    n_points = len(df)
    cols = list(df.columns)
    preview = df.head(10).to_string()
    return (
        path,
        f"Loaded {n_points} points with columns: {cols}\n\nPreview:\n{preview}",
        df,
    )
def run_transformation(
df_state,
columns_str: str,
sigma: float,
max_iterations: int,
progress=gr.Progress(),
):
"""Run the LM optimization and return results."""
if df_state is None:
return (
None,
None,
None,
"Error: No data loaded. Please upload or generate data first.",
)
df = df_state
    # Parse the comma-separated column list, ignoring empty entries
    columns = [c.strip() for c in columns_str.split(",") if c.strip()]
    if not columns:
        return None, None, None, "Error: No columns specified."
    # Validate that the requested columns exist
    missing = [c for c in columns if c not in df.columns]
    if missing:
        return (
            None,
            None,
            None,
            f"Error: Columns not found: {missing}. Available: {list(df.columns)}",
        )
# Progress callback for the transformer
def progress_callback(iteration, max_iter, det_val, entropy_val):
progress(
iteration / max_iter,
desc=f"Iter {iteration}/{max_iter} | Det: {det_val:.2e} | H: {entropy_val:.4f}",
)
# Create transformer with progress callback
transformer = DataFrameTransformer(
sigma=sigma,
max_iterations=max_iterations,
verbose=False,
progress_callback=progress_callback,
)
# Run transformation
df_transformed = transformer.fit_transform(df, columns=columns)
# Get entropy comparison
entropy = transformer.get_entropy_comparison(df, df_transformed)
target_entropy = entropy["original"]["uniform_entropy"]
# Create plots
fig_scatter = create_scatter_plot(df, df_transformed, columns)
fig_hist = create_histogram_plot(df, df_transformed, columns)
fig_history = create_history_plot(transformer.history_, target_entropy=target_entropy)
# Create results text
results_text = format_results(entropy, transformer.history_)
return fig_scatter, fig_hist, fig_history, results_text
def create_scatter_plot(df_orig, df_trans, columns):
"""Create before/after scatter plot."""
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
if len(columns) >= 2:
x_col, y_col = columns[0], columns[1]
axes[0].scatter(df_orig[x_col], df_orig[y_col], c="blue", alpha=0.5, s=10)
axes[0].set_xlabel(x_col)
axes[0].set_ylabel(y_col)
axes[0].set_title("Original Distribution")
axes[0].set_aspect("equal")
axes[0].grid(True, alpha=0.3)
axes[1].scatter(df_trans[x_col], df_trans[y_col], c="red", alpha=0.5, s=10)
axes[1].set_xlabel(x_col)
axes[1].set_ylabel(y_col)
axes[1].set_title("Transformed (Towards Gaussian)")
axes[1].set_aspect("equal")
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
return fig
def create_histogram_plot(df_orig, df_trans, columns):
    """Create marginal histogram plots (one row per plotted column, up to 3)."""
    n_rows = min(len(columns), 3)
    fig, axes = plt.subplots(n_rows, 2, figsize=(12, 4 * n_rows))
    if n_rows == 1:
        axes = axes.reshape(1, -1)
    for i, col in enumerate(columns[:n_rows]):
# Original
axes[i, 0].hist(df_orig[col], bins=30, density=True, alpha=0.7, color="blue")
axes[i, 0].set_xlabel(col)
axes[i, 0].set_ylabel("Density")
axes[i, 0].set_title(f"Original {col} Marginal")
# Transformed with Gaussian overlay
axes[i, 1].hist(df_trans[col], bins=30, density=True, alpha=0.7, color="red")
x_range = np.linspace(df_trans[col].min(), df_trans[col].max(), 100)
mu = df_trans[col].mean()
std = df_trans[col].std()
gaussian = (1 / (std * np.sqrt(2 * np.pi))) * np.exp(
-0.5 * ((x_range - mu) / std) ** 2
)
axes[i, 1].plot(x_range, gaussian, "k--", linewidth=2, label="Gaussian fit")
axes[i, 1].set_xlabel(col)
axes[i, 1].set_ylabel("Density")
axes[i, 1].set_title(f"Transformed {col} Marginal")
axes[i, 1].legend()
plt.tight_layout()
return fig
def create_history_plot(history, target_entropy=None):
"""Create optimization history plot."""
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Determinant
axes[0].semilogy(history["iteration"], history["determinant"], "b-o", markersize=4)
axes[0].set_xlabel("Iteration")
axes[0].set_ylabel("Covariance Determinant")
axes[0].set_title("Determinant Minimization")
axes[0].grid(True, alpha=0.3)
# Gaussian entropy
axes[1].plot(history["iteration"], history["gaussian_entropy"], "r-o", markersize=4)
if target_entropy is not None:
axes[1].axhline(
target_entropy,
color="green",
linestyle="--",
linewidth=2,
label=f"Target H(uniform) = {target_entropy:.4f}",
)
axes[1].legend()
axes[1].set_xlabel("Iteration")
axes[1].set_ylabel("H(Gaussian)")
axes[1].set_title("Gaussian Entropy → Target Uniform Entropy")
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
return fig
def format_results(entropy, history):
"""Format results as text."""
det_reduction = (
entropy["original"]["determinant"] / entropy["transformed"]["determinant"]
)
target_entropy = entropy["original"]["uniform_entropy"]
final_entropy = entropy["transformed"]["gaussian_entropy"]
entropy_gap = final_entropy - target_entropy
text = f"""
TRANSFORMATION RESULTS
{'=' * 50}
Target Entropy (Uniform Distribution):
H(uniform) = {target_entropy:.6f} nats
This is the true entropy we want to reach.
Gaussian Entropy of Transformed Data:
H(Gaussian) = {final_entropy:.6f} nats
This assumes the transformed data is Gaussian with the
current covariance. When H(Gaussian) = H(uniform), the
distribution is perfectly Gaussian.
Gap to Target:
H(Gaussian) - H(uniform) = {entropy_gap:.6f} nats
(Should approach 0 for perfect Gaussianization)
Covariance Determinant:
Original: {entropy['original']['determinant']:.6e}
Transformed: {entropy['transformed']['determinant']:.6e}
Reduction: {det_reduction:.2f}x
Optimization:
Iterations with improvement: {len(history['iteration'])}
Final determinant: {history['determinant'][-1]:.6e}
Final H(Gaussian): {history['gaussian_entropy'][-1]:.6f}
"""
return text
# Markdown explanation of Levenberg-Marquardt
LM_EXPLANATION = """
## How the Levenberg-Marquardt Algorithm Works
The **Levenberg-Marquardt (LM) algorithm** is used to minimize the covariance determinant. Unlike gradient descent, **LM has no learning rate** - here's why:
### The Key Insight
LM is designed for **least-squares problems** where you minimize a sum of squared residuals. Instead of taking steps proportional to the gradient (like gradient descent), LM solves a **local linear approximation** of the problem at each step.
### How It Works
1. **Compute the Jacobian** `J` - the matrix of partial derivatives of residuals with respect to parameters
2. **Solve the normal equations**:
```
(J^T J + λI) δ = -J^T r
```
where `r` is the residual vector and `λ` is a damping parameter
3. **The damping parameter λ replaces the learning rate**:
- When `λ` is **large**: The step is small and in the gradient direction (like gradient descent with small learning rate)
- When `λ` is **small**: The step approaches the Gauss-Newton step (a direct jump to the local minimum of the quadratic approximation)
4. **Adaptive adjustment**:
- If a step **decreases** the objective: Accept it and **decrease λ** (take bigger steps)
- If a step **increases** the objective: Reject it and **increase λ** (take smaller, safer steps)
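The loop below is a minimal sketch of steps 1-4 in NumPy (illustrative only, not this app's implementation; `residuals` and `jacobian` are placeholder callables you would supply):
```
import numpy as np

def lm_step(params, residuals, jacobian, lam):
    """One Levenberg-Marquardt step with adaptive damping."""
    r = residuals(params)                       # current residual vector
    J = jacobian(params)                        # Jacobian at current params
    # Solve (J^T J + lam*I) delta = -J^T r for the proposed step
    A = J.T @ J + lam * np.eye(len(params))
    delta = np.linalg.solve(A, -J.T @ r)
    trial = params + delta
    if np.sum(residuals(trial) ** 2) < np.sum(r ** 2):
        return trial, lam / 10.0                # accept: decrease damping
    return params, lam * 10.0                   # reject: increase damping
```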
### Why No Learning Rate?
The LM algorithm **automatically adapts** its step size through the damping parameter λ:
- It starts cautious (large λ, small steps)
- As it finds a good direction, it becomes more aggressive (small λ, large steps)
- If it overshoots, it backs off automatically
This makes LM much more robust than gradient descent - you don't need to tune a learning rate!
### In This Application
We minimize `log(det(Cov))` where `Cov` is the covariance matrix of the transformed points. The transformation is parameterized by coefficients of divergence-free basis functions, ensuring the transformation is **volume-preserving** and thus **entropy-conserving**.
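As a sketch of the objective alone (assuming `points` is an `(n, d)` NumPy array; the actual parameterization and optimizer live in `entra`):
```
import numpy as np

def log_det_cov(points):
    # log-determinant of the sample covariance of the transformed points;
    # slogdet is used for numerical stability
    sign, logdet = np.linalg.slogdet(np.cov(points, rowvar=False))
    return logdet
```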
"""
THEORY_EXPLANATION = """
## Theoretical Background
### Maximum Entropy Principle
A fundamental theorem states: **Among all distributions with a given covariance matrix, the Gaussian has maximum entropy.**
This means for any distribution with entropy `H₀` and covariance `Σ`:
- The Gaussian with the same covariance has entropy `H_Gaussian(Σ) ≥ H₀`
- Equality holds only when the distribution is Gaussian
### The Key Insight
If we apply a **volume-preserving transformation**:
1. The entropy stays fixed at `H₀` (entropy is conserved)
2. But the covariance changes
By **minimizing the covariance determinant** while preserving entropy:
- We reduce `H_Gaussian(Σ)` (the Gaussian entropy bound)
- When `H_Gaussian(Σ) = H₀`, the distribution must be Gaussian!
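Concretely, the bound depends on `Σ` only through its determinant, `H_Gaussian(Σ) = ½ log((2πe)^d det Σ)`, so shrinking the determinant closes the gap to `H₀`. A minimal NumPy check (illustrative, not this app's code):
```
import numpy as np

def gaussian_entropy(cov):
    # Differential entropy (nats) of a Gaussian with covariance `cov`:
    # H = 0.5 * log((2*pi*e)^d * det(cov))
    d = cov.shape[0]
    return 0.5 * np.log((2 * np.pi * np.e) ** d * np.linalg.det(cov))
```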
### Why Divergence-Free?
Divergence-free vector fields define **volume-preserving** transformations:
- The Jacobian determinant equals 1 everywhere
- Total probability volume is conserved
- **Entropy is conserved** under the transformation
This is the incompressibility condition from fluid dynamics: `∇·v = 0`
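For example, the rigid rotation field `v = (-y, x)` satisfies this condition; a quick check with SymPy (used here purely for illustration):
```
import sympy as sp

x, y = sp.symbols("x y")
v = sp.Matrix([-y, x])                      # rigid rotation field
print(sp.diff(v[0], x) + sp.diff(v[1], y))  # 0: the field is divergence-free
```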
### The Operator
We construct divergence-free basis functions using Lowitzsch's operator:
**Ô = -I∇² + ∇∇ᵀ**
Applied to Gaussian RBFs, this produces matrix-valued functions where each column is a divergence-free vector field.
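A small SymPy sketch of the construction in 2D (illustrative; the app's actual basis is built inside `entra`):
```
import sympy as sp

x1, x2 = sp.symbols("x1 x2")
s = sp.symbols("sigma", positive=True)
phi = sp.exp(-(x1**2 + x2**2) / (2 * s**2))         # Gaussian RBF
lap = sp.diff(phi, x1, 2) + sp.diff(phi, x2, 2)     # scalar Laplacian of phi
Phi = -lap * sp.eye(2) + sp.hessian(phi, (x1, x2))  # apply O = -I∇² + ∇∇ᵀ
col = Phi[:, 0]                                     # one column = one vector field
print(sp.simplify(sp.diff(col[0], x1) + sp.diff(col[1], x2)))  # 0
```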
"""
def create_app():
"""Create the Gradio interface."""
with gr.Blocks(
title="Entropy-Conserving Transformations", theme=gr.themes.Soft()
) as app:
gr.Markdown(
"""
# Entropy-Conserving Transformations Using Divergence-Free Vector Fields
Transform arbitrary distributions towards Gaussian form while **conserving entropy**.
This demo uses divergence-free basis functions to create volume-preserving transformations,
then minimizes the covariance determinant using the Levenberg-Marquardt algorithm.
"""
)
gr.HTML("""
<div style="text-align: center; margin: 10px 0;">
<a href="https://www.paypal.com/donate?business=varun.kapoor@kapoorlabs.org&currency_code=EUR" target="_blank" style="text-decoration: none;">
<button style="background-color: #0070ba; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer; font-size: 14px; font-weight: bold;">
☕ Buy me a coffee (PayPal)
</button>
</a>
</div>
""")
# State to hold the dataframe
df_state = gr.State(None)
with gr.Tabs():
with gr.Tab("Transform Data"):
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### Step 1: Load or Generate Data")
gr.Markdown(
"""
**No CSV file?** Use "Generate Sample Data" below to create a uniform grid.
**Have your own CSV?** Format requirements:
- Header row with column names
- Numeric columns for coordinates (e.g., `x`, `y`, `z`)
- Example:
```
x,y
-9.5,-9.5
-9.5,-8.5
...
```
"""
)
with gr.Accordion(
"Generate Sample Data (no CSV needed)", open=True
):
gr.Markdown(
"*Creates a uniform grid using VectorSampler - perfect for testing*"
)
n_per_dim = gr.Slider(
minimum=5,
maximum=500,
value=20,
step=1,
label="Points per dimension",
)
dimensions = gr.Radio(
choices=[2, 3], value=2, label="Dimensions"
)
generate_btn = gr.Button(
"Generate Uniform Distribution",
variant="primary",
)
download_file = gr.File(label="Download generated CSV")
with gr.Accordion("Upload Your Own CSV", open=False):
file_upload = gr.File(
label="Upload CSV file", file_types=[".csv"]
)
upload_btn = gr.Button("Load CSV", variant="secondary")
data_info = gr.Textbox(
label="Data Info", lines=8, interactive=False
)
gr.Markdown("### Step 2: Configure Transformation")
columns_input = gr.Textbox(
value="x, y",
label="Columns to transform (comma-separated)",
lines=3,
)
sigma = gr.Slider(
minimum=0.1,
maximum=200.0,
value=5.0,
step=0.1,
label="Sigma (RBF width)",
)
max_iterations = gr.Slider(
minimum=10,
maximum=5000,
value=100,
step=10,
label="Max iterations",
)
transform_btn = gr.Button(
"Run Transformation", variant="primary", size="lg"
)
with gr.Column(scale=2):
gr.Markdown("### Results")
results_text = gr.Textbox(
label="Transformation Results",
lines=20,
interactive=False,
)
with gr.Row():
scatter_plot = gr.Plot(label="Before/After Scatter")
with gr.Row():
hist_plot = gr.Plot(label="Marginal Distributions")
with gr.Row():
history_plot = gr.Plot(label="Optimization History")
with gr.Tab("How LM Works"):
gr.Markdown(LM_EXPLANATION)
with gr.Tab("Theory"):
gr.Markdown(THEORY_EXPLANATION)
# Event handlers
def on_generate(n, dims):
path, info, df = generate_sample_csv(n, dims)
return path, info, df
def on_upload(file):
path, info, df = load_csv_file(file)
return info, df
generate_btn.click(
fn=on_generate,
inputs=[n_per_dim, dimensions],
outputs=[download_file, data_info, df_state],
)
upload_btn.click(
fn=on_upload, inputs=[file_upload], outputs=[data_info, df_state]
)
transform_btn.click(
fn=run_transformation,
inputs=[
df_state,
columns_input,
sigma,
max_iterations,
],
outputs=[scatter_plot, hist_plot, history_plot, results_text],
)
return app
if __name__ == "__main__":
app = create_app()
app.launch()