File size: 17,677 Bytes
8e1b079
 
 
 
 
 
 
 
 
 
 
 
 
f03b2d1
8e1b079
 
 
 
 
f03b2d1
8e1b079
f03b2d1
8e1b079
f03b2d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e1b079
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df9a12c
8e1b079
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df9a12c
 
 
 
 
 
 
 
8e1b079
 
 
 
df9a12c
8e1b079
 
 
 
 
 
 
64a8ef6
8e1b079
 
 
 
64a8ef6
8e1b079
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64a8ef6
8e1b079
 
 
 
 
 
 
 
 
 
 
 
64a8ef6
 
 
 
 
 
 
 
 
8e1b079
 
64a8ef6
8e1b079
 
 
 
 
 
 
 
 
 
 
64a8ef6
 
 
8e1b079
 
 
 
 
64a8ef6
 
8e1b079
64a8ef6
8e1b079
 
64a8ef6
 
 
 
 
8e1b079
64a8ef6
 
 
8e1b079
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260d576
 
 
 
 
 
 
 
 
 
8e1b079
 
 
 
 
 
 
 
 
f03b2d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e1b079
f03b2d1
 
 
 
 
 
8e1b079
 
a268f81
8e1b079
 
 
 
 
 
 
 
 
f03b2d1
8e1b079
 
 
f03b2d1
 
 
 
 
 
8e1b079
 
 
 
 
 
 
 
 
c71d126
8e1b079
 
 
a268f81
8e1b079
 
 
 
 
 
a268f81
8e1b079
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
"""
Gradio App for Entropy-Conserving Transformations

This app demonstrates how divergence-free vector fields can transform
arbitrary distributions towards Gaussian form while conserving entropy.
"""

import gradio as gr
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from entra import DataFrameTransformer, VectorSampler

# Select Matplotlib's non-interactive "Agg" backend for the figures built below.
matplotlib.use("Agg")


def generate_uniform_data(n_per_dim: int = 20, dimensions: int = 2) -> pd.DataFrame:
    """Generate uniform grid data using VectorSampler.

    Args:
        n_per_dim: Forwarded to ``VectorSampler`` as ``num_points_per_dim``.
        dimensions: Number of coordinate axes; must be 2 or 3.

    Returns:
        A DataFrame with columns ``x`` and ``y`` (plus ``z`` when
        ``dimensions`` is 3) holding the sampled coordinates.

    Raises:
        ValueError: If ``dimensions`` is neither 2 nor 3.
    """
    # Reject unsupported values explicitly; otherwise anything other than 2
    # would silently be treated as 3-D.
    if dimensions not in (2, 3):
        raise ValueError(f"dimensions must be 2 or 3, got {dimensions!r}")

    axis_names = ("x", "y", "z")[: int(dimensions)]

    sampler = VectorSampler(
        center=[0.0] * len(axis_names),
        delta_x=1,
        num_points_per_dim=n_per_dim,
        distribution="uniform",
    )
    # NOTE(review): assumes sample() returns a 2-D array indexed as
    # [point, axis] -- confirm against VectorSampler.
    points = sampler.sample()

    return pd.DataFrame(
        {name: points[:, axis] for axis, name in enumerate(axis_names)}
    )


def generate_sample_csv(n_per_dim: int, dimensions: int):
    """Generate sample data, save it as a CSV file and describe it.

    Args:
        n_per_dim: Forwarded to ``generate_uniform_data``.
        dimensions: Forwarded to ``generate_uniform_data``.

    Returns:
        A tuple ``(csv_path, info_text, df)``: the path of the written CSV
        file, a human-readable summary with a 10-row preview, and the
        generated DataFrame.
    """
    import tempfile

    df = generate_uniform_data(n_per_dim, dimensions)

    # Write to a unique file in the platform's temp directory instead of a
    # fixed "/tmp/..." path: the fixed path does not exist on every OS and
    # would be overwritten when two sessions generate data at the same time.
    # delete=False keeps the file available for download after this returns.
    with tempfile.NamedTemporaryFile(
        mode="w",
        prefix="generated_uniform_data_",
        suffix=".csv",
        delete=False,
        newline="",
        encoding="utf-8",
    ) as handle:
        df.to_csv(handle, index=False)
        temp_path = handle.name

    n_points = len(df)
    cols = list(df.columns)
    preview = df.head(10).to_string()

    return (
        temp_path,
        f"Generated {n_points} points with columns: {cols}\n\nPreview:\n{preview}",
        df,
    )


def load_csv_file(file):
    """Load an uploaded CSV file into a DataFrame.

    Args:
        file: ``None``, a filesystem path (``str`` or ``os.PathLike``), or an
            object exposing the path as its ``name`` attribute.

    Returns:
        A tuple ``(path, info_text, df)``. When nothing was uploaded, or the
        file is empty or cannot be parsed as CSV, ``path`` and ``df`` are
        ``None`` and ``info_text`` explains why.
    """
    import os

    if file is None:
        return None, "No file uploaded", None

    # Accept plain paths as well as upload wrappers carrying ``.name``.
    # Path objects are handled first because their ``.name`` is only the
    # final path component, not the full path.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    try:
        df = pd.read_csv(path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # Report unreadable input through the info text, like the
        # "No file uploaded" case, rather than raising.
        return None, f"Error: Could not read CSV: {exc}", None

    n_points = len(df)
    cols = list(df.columns)
    preview = df.head(10).to_string()

    return (
        path,
        f"Loaded {n_points} points with columns: {cols}\n\nPreview:\n{preview}",
        df,
    )


def run_transformation(
    df_state,
    columns_str: str,
    sigma: float,
    max_iterations: int,
    progress=gr.Progress(),
):
    """Run the LM optimization and return results.

    Args:
        df_state: The DataFrame previously loaded or generated, or ``None``.
        columns_str: Comma-separated names of the columns to transform.
        sigma: Passed to ``DataFrameTransformer`` as ``sigma``.
        max_iterations: Passed to ``DataFrameTransformer`` as
            ``max_iterations``.
        progress: Gradio progress tracker, updated from the transformer's
            progress callback.

    Returns:
        A tuple ``(scatter_fig, hist_fig, history_fig, results_text)``. On a
        validation error the three figures are ``None`` and ``results_text``
        carries the error message.
    """
    if df_state is None:
        return (
            None,
            None,
            None,
            "Error: No data loaded. Please upload or generate data first.",
        )

    df = df_state

    # Parse columns, dropping empty tokens so a trailing or doubled comma
    # ("x, y,") is not reported as a missing column named ''.
    columns = [
        name for name in (part.strip() for part in columns_str.split(",")) if name
    ]
    if not columns:
        return (
            None,
            None,
            None,
            f"Error: No columns specified. Available: {list(df.columns)}",
        )

    # Validate columns exist
    missing = [c for c in columns if c not in df.columns]
    if missing:
        return (
            None,
            None,
            None,
            f"Error: Columns not found: {missing}. Available: {list(df.columns)}",
        )

    # Progress callback for the transformer
    def progress_callback(iteration, max_iter, det_val, entropy_val):
        # Guard the fraction so a zero iteration budget cannot divide by zero.
        fraction = iteration / max_iter if max_iter else 0.0
        progress(
            fraction,
            desc=f"Iter {iteration}/{max_iter} | Det: {det_val:.2e} | H: {entropy_val:.4f}",
        )

    # Create transformer with progress callback
    transformer = DataFrameTransformer(
        sigma=sigma,
        max_iterations=max_iterations,
        verbose=False,
        progress_callback=progress_callback,
    )

    # Run transformation
    df_transformed = transformer.fit_transform(df, columns=columns)

    # Get entropy comparison; the original data's uniform entropy is the
    # target line drawn on the history plot.
    entropy = transformer.get_entropy_comparison(df, df_transformed)
    target_entropy = entropy["original"]["uniform_entropy"]

    # Create plots
    fig_scatter = create_scatter_plot(df, df_transformed, columns)
    fig_hist = create_histogram_plot(df, df_transformed, columns)
    fig_history = create_history_plot(transformer.history_, target_entropy=target_entropy)

    # Create results text
    results_text = format_results(entropy, transformer.history_)

    return fig_scatter, fig_hist, fig_history, results_text


def create_scatter_plot(df_orig, df_trans, columns):
    """Create before/after scatter plot.

    Args:
        df_orig: DataFrame with the original points.
        df_trans: DataFrame with the transformed points.
        columns: Column names; the first two are used as the x and y axes.
            With fewer than two names both panels are left empty.

    Returns:
        The Matplotlib figure with the original data on the left panel and
        the transformed data on the right panel.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    if len(columns) >= 2:
        x_col, y_col = columns[0], columns[1]

        panels = (
            (axes[0], df_orig, "blue", "Original Distribution"),
            (axes[1], df_trans, "red", "Transformed (Towards Gaussian)"),
        )
        for ax, frame, color, title in panels:
            ax.scatter(frame[x_col], frame[y_col], c=color, alpha=0.5, s=10)
            ax.set_xlabel(x_col)
            ax.set_ylabel(y_col)
            ax.set_title(title)
            ax.set_aspect("equal")
            ax.grid(True, alpha=0.3)

    # Lay out this figure explicitly rather than pyplot's "current" figure.
    fig.tight_layout()
    # Drop the figure from pyplot's global registry so repeated runs do not
    # accumulate open figures; the returned Figure object stays usable.
    plt.close(fig)
    return fig


def create_histogram_plot(df_orig, df_trans, columns):
    """Create marginal histogram plots.

    One row is drawn per column (at most the first three): the original
    marginal on the left and the transformed marginal, overlaid with a
    Gaussian of the same mean and standard deviation, on the right.

    Args:
        df_orig: DataFrame with the original points.
        df_trans: DataFrame with the transformed points.
        columns: Names of the columns to plot.

    Returns:
        The Matplotlib figure holding the histogram grid.
    """
    n_cols = min(len(columns), 3)
    # squeeze=False keeps ``axes`` two-dimensional even when there is a
    # single row, so no manual reshape is needed.
    fig, axes = plt.subplots(n_cols, 2, figsize=(12, 4 * n_cols), squeeze=False)

    for (ax_orig, ax_trans), col in zip(axes, columns[:n_cols]):
        # Original
        ax_orig.hist(df_orig[col], bins=30, density=True, alpha=0.7, color="blue")
        ax_orig.set_xlabel(col)
        ax_orig.set_ylabel("Density")
        ax_orig.set_title(f"Original {col} Marginal")

        # Transformed with Gaussian overlay
        ax_trans.hist(df_trans[col], bins=30, density=True, alpha=0.7, color="red")
        mu = df_trans[col].mean()
        std = df_trans[col].std()
        # Skip the overlay for a zero or NaN spread: the density formula
        # divides by ``std``.
        if std > 0:
            x_range = np.linspace(df_trans[col].min(), df_trans[col].max(), 100)
            gaussian = (1 / (std * np.sqrt(2 * np.pi))) * np.exp(
                -0.5 * ((x_range - mu) / std) ** 2
            )
            ax_trans.plot(x_range, gaussian, "k--", linewidth=2, label="Gaussian fit")
            ax_trans.legend()
        ax_trans.set_xlabel(col)
        ax_trans.set_ylabel("Density")
        ax_trans.set_title(f"Transformed {col} Marginal")

    # Lay out this figure explicitly rather than pyplot's "current" figure.
    fig.tight_layout()
    # Drop the figure from pyplot's global registry so repeated runs do not
    # accumulate open figures; the returned Figure object stays usable.
    plt.close(fig)
    return fig


def create_history_plot(history, target_entropy=None):
    """Create optimization history plot.

    Args:
        history: Mapping with the sequences ``"iteration"``,
            ``"determinant"`` and ``"gaussian_entropy"``.
        target_entropy: Optional entropy value drawn as a dashed horizontal
            reference line on the entropy panel.

    Returns:
        The Matplotlib figure: the determinant on a log-scaled y axis on the
        left, the Gaussian entropy per iteration on the right.
    """
    fig, (ax_det, ax_entropy) = plt.subplots(1, 2, figsize=(12, 4))
    iterations = history["iteration"]

    # Determinant
    ax_det.semilogy(iterations, history["determinant"], "b-o", markersize=4)
    ax_det.set_xlabel("Iteration")
    ax_det.set_ylabel("Covariance Determinant")
    ax_det.set_title("Determinant Minimization")
    ax_det.grid(True, alpha=0.3)

    # Gaussian entropy
    ax_entropy.plot(iterations, history["gaussian_entropy"], "r-o", markersize=4)
    if target_entropy is not None:
        ax_entropy.axhline(
            target_entropy,
            color="green",
            linestyle="--",
            linewidth=2,
            label=f"Target H(uniform) = {target_entropy:.4f}",
        )
        ax_entropy.legend()
    ax_entropy.set_xlabel("Iteration")
    ax_entropy.set_ylabel("H(Gaussian)")
    ax_entropy.set_title("Gaussian Entropy → Target Uniform Entropy")
    ax_entropy.grid(True, alpha=0.3)

    # Lay out this figure explicitly rather than pyplot's "current" figure.
    fig.tight_layout()
    # Drop the figure from pyplot's global registry so repeated runs do not
    # accumulate open figures; the returned Figure object stays usable.
    plt.close(fig)
    return fig


def format_results(entropy, history):
    """Format results as text.

    Args:
        entropy: Mapping with ``"original"`` and ``"transformed"`` entries.
            Reads ``entropy["original"]["determinant"]``,
            ``entropy["original"]["uniform_entropy"]``,
            ``entropy["transformed"]["determinant"]`` and
            ``entropy["transformed"]["gaussian_entropy"]``.
        history: Mapping with the sequences ``"iteration"``,
            ``"determinant"`` and ``"gaussian_entropy"``; they may be empty.

    Returns:
        A multi-line report. The final determinant and entropy are shown as
        ``n/a`` when the history is empty, and the reduction factor is shown
        as ``inf`` when the transformed determinant is zero.
    """
    det_original = entropy["original"]["determinant"]
    det_transformed = entropy["transformed"]["determinant"]
    # Dividing by a zero determinant raises ZeroDivisionError for plain
    # floats, so report an unbounded reduction instead.
    det_reduction = (
        det_original / det_transformed if det_transformed else float("inf")
    )
    target_entropy = entropy["original"]["uniform_entropy"]
    final_entropy = entropy["transformed"]["gaussian_entropy"]
    entropy_gap = final_entropy - target_entropy

    # Indexing [-1] into an empty history raises IndexError, so fall back to
    # a placeholder when nothing was recorded.
    determinants = history["determinant"]
    entropies = history["gaussian_entropy"]
    last_det = f"{determinants[-1]:.6e}" if len(determinants) > 0 else "n/a"
    last_entropy = f"{entropies[-1]:.6f}" if len(entropies) > 0 else "n/a"

    text = f"""
TRANSFORMATION RESULTS
{'=' * 50}

Target Entropy (Uniform Distribution):
  H(uniform) = {target_entropy:.6f} nats

  This is the true entropy we want to reach.

Gaussian Entropy of Transformed Data:
  H(Gaussian) = {final_entropy:.6f} nats

  This assumes the transformed data is Gaussian with the
  current covariance. When H(Gaussian) = H(uniform), the
  distribution is perfectly Gaussian.

Gap to Target:
  H(Gaussian) - H(uniform) = {entropy_gap:.6f} nats
  (Should approach 0 for perfect Gaussianization)

Covariance Determinant:
  Original:    {det_original:.6e}
  Transformed: {det_transformed:.6e}
  Reduction:   {det_reduction:.2f}x

Optimization:
  Iterations with improvement: {len(history['iteration'])}
  Final determinant: {last_det}
  Final H(Gaussian): {last_entropy}
"""
    return text


# Markdown explanation of Levenberg-Marquardt.
# Rendered verbatim with gr.Markdown in the "How LM Works" tab built by
# create_app(); editing this text changes what the app displays.
LM_EXPLANATION = """
## How the Levenberg-Marquardt Algorithm Works

The **Levenberg-Marquardt (LM) algorithm** is used to minimize the covariance determinant. Unlike gradient descent, **LM has no learning rate** - here's why:

### The Key Insight

LM is designed for **least-squares problems** where you minimize a sum of squared residuals. Instead of taking steps proportional to the gradient (like gradient descent), LM solves a **local linear approximation** of the problem at each step.

### How It Works

1. **Compute the Jacobian** `J` - the matrix of partial derivatives of residuals with respect to parameters

2. **Solve the normal equations**:
   ```
   (J^T J + λI) δ = -J^T r
   ```
   where `r` is the residual vector and `λ` is a damping parameter

3. **The damping parameter λ replaces the learning rate**:
   - When `λ` is **large**: The step is small and in the gradient direction (like gradient descent with small learning rate)
   - When `λ` is **small**: The step approaches the Gauss-Newton step (a direct jump to the local minimum of the quadratic approximation)

4. **Adaptive adjustment**:
   - If a step **decreases** the objective: Accept it and **decrease λ** (take bigger steps)
   - If a step **increases** the objective: Reject it and **increase λ** (take smaller, safer steps)

### Why No Learning Rate?

The LM algorithm **automatically adapts** its step size through the damping parameter λ:
- It starts cautious (large λ, small steps)
- As it finds a good direction, it becomes more aggressive (small λ, large steps)
- If it overshoots, it backs off automatically

This makes LM much more robust than gradient descent - you don't need to tune a learning rate!

### In This Application

We minimize `log(det(Cov))` where `Cov` is the covariance matrix of the transformed points. The transformation is parameterized by coefficients of divergence-free basis functions, ensuring the transformation is **volume-preserving** and thus **entropy-conserving**.
"""

# Markdown summary of the theoretical background.
# Rendered verbatim with gr.Markdown in the "Theory" tab built by
# create_app(); editing this text changes what the app displays.
THEORY_EXPLANATION = """
## Theoretical Background

### Maximum Entropy Principle

A fundamental theorem states: **Among all distributions with a given covariance matrix, the Gaussian has maximum entropy.**

This means for any distribution with entropy `H₀` and covariance `Σ`:
- The Gaussian with the same covariance has entropy `H_Gaussian(Σ) ≥ H₀`
- Equality holds only when the distribution is Gaussian

### The Key Insight

If we apply a **volume-preserving transformation**:
1. The entropy stays fixed at `H₀` (entropy is conserved)
2. But the covariance changes

By **minimizing the covariance determinant** while preserving entropy:
- We reduce `H_Gaussian(Σ)` (the Gaussian entropy bound)
- When `H_Gaussian(Σ) = H₀`, the distribution must be Gaussian!

### Why Divergence-Free?

Divergence-free vector fields define **volume-preserving** transformations:
- The Jacobian determinant equals 1 everywhere
- Total probability volume is conserved
- **Entropy is conserved** under the transformation

This is the incompressibility condition from fluid dynamics: `∇·v = 0`

### The Operator

We construct divergence-free basis functions using Lowitzsch's operator:

**Ô = -I∇² + ∇∇ᵀ**

Applied to Gaussian RBFs, this produces matrix-valued functions where each column is a divergence-free vector field.
"""


def create_app():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` layout with three tabs ("Transform Data",
    "How LM Works", "Theory") and wires the three buttons to their handlers.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    with gr.Blocks(
        title="Entropy-Conserving Transformations", theme=gr.themes.Soft()
    ) as app:
        # Page header.
        gr.Markdown(
            """
        # Entropy-Conserving Transformations Using Divergence-Free Vector Fields

        Transform arbitrary distributions towards Gaussian form while **conserving entropy**.

        This demo uses divergence-free basis functions to create volume-preserving transformations,
        then minimizes the covariance determinant using the Levenberg-Marquardt algorithm.
        """
        )

        # Donation button (plain HTML link to PayPal).
        gr.HTML("""
        <div style="text-align: center; margin: 10px 0;">
            <a href="https://www.paypal.com/donate?business=varun.kapoor@kapoorlabs.org&currency_code=EUR" target="_blank" style="text-decoration: none;">
                <button style="background-color: #0070ba; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer; font-size: 14px; font-weight: bold;">
                    ☕ Buy me a coffee (PayPal)
                </button>
            </a>
        </div>
        """)

        # State to hold the dataframe; written by the generate/upload
        # handlers and read by the transformation handler.
        df_state = gr.State(None)

        with gr.Tabs():
            with gr.Tab("Transform Data"):
                with gr.Row():
                    # Left column: data loading and transformation settings.
                    with gr.Column(scale=1):
                        gr.Markdown("### Step 1: Load or Generate Data")

                        gr.Markdown(
                            """
**No CSV file?** Use "Generate Sample Data" below to create a uniform grid.

**Have your own CSV?** Format requirements:
- Header row with column names
- Numeric columns for coordinates (e.g., `x`, `y`, `z`)
- Example:
```
x,y
-9.5,-9.5
-9.5,-8.5
...
```
"""
                        )

                        # Option A: generate a uniform grid.
                        with gr.Accordion(
                            "Generate Sample Data (no CSV needed)", open=True
                        ):
                            gr.Markdown(
                                "*Creates a uniform grid using VectorSampler - perfect for testing*"
                            )
                            n_per_dim = gr.Slider(
                                minimum=5,
                                maximum=500,
                                value=20,
                                step=1,
                                label="Points per dimension",
                            )
                            dimensions = gr.Radio(
                                choices=[2, 3], value=2, label="Dimensions"
                            )
                            generate_btn = gr.Button(
                                "Generate Uniform Distribution",
                                variant="primary",
                            )
                            download_file = gr.File(label="Download generated CSV")

                        # Option B: upload an existing CSV.
                        with gr.Accordion("Upload Your Own CSV", open=False):
                            file_upload = gr.File(
                                label="Upload CSV file", file_types=[".csv"]
                            )
                            upload_btn = gr.Button("Load CSV", variant="secondary")

                        # Read-only summary filled in by both data handlers.
                        data_info = gr.Textbox(
                            label="Data Info", lines=8, interactive=False
                        )

                        gr.Markdown("### Step 2: Configure Transformation")

                        columns_input = gr.Textbox(
                            value="x, y",
                            label="Columns to transform (comma-separated)",
                            lines=3,
                        )
                        sigma = gr.Slider(
                            minimum=0.1,
                            maximum=200.0,
                            value=5.0,
                            step=0.1,
                            label="Sigma (RBF width)",
                        )
                        max_iterations = gr.Slider(
                            minimum=10,
                            maximum=5000,
                            value=100,
                            step=10,
                            label="Max iterations",
                        )

                        transform_btn = gr.Button(
                            "Run Transformation", variant="primary", size="lg"
                        )

                    # Right column: textual results followed by the three plots.
                    with gr.Column(scale=2):
                        gr.Markdown("### Results")

                        results_text = gr.Textbox(
                            label="Transformation Results",
                            lines=20,
                            interactive=False,
                        )

                        with gr.Row():
                            scatter_plot = gr.Plot(label="Before/After Scatter")

                        with gr.Row():
                            hist_plot = gr.Plot(label="Marginal Distributions")

                        with gr.Row():
                            history_plot = gr.Plot(label="Optimization History")

            # Static documentation tabs.
            with gr.Tab("How LM Works"):
                gr.Markdown(LM_EXPLANATION)

            with gr.Tab("Theory"):
                gr.Markdown(THEORY_EXPLANATION)

        # Event handlers
        def on_generate(n, dims):
            """Generate sample data and return (csv path, info text, DataFrame)."""
            path, info, df = generate_sample_csv(n, dims)
            return path, info, df

        def on_upload(file):
            """Load the uploaded CSV and return (info text, DataFrame)."""
            # The path returned by load_csv_file is not shown anywhere, so it
            # is dropped here.
            path, info, df = load_csv_file(file)
            return info, df

        # Generated data feeds the download widget, the info box and the state.
        generate_btn.click(
            fn=on_generate,
            inputs=[n_per_dim, dimensions],
            outputs=[download_file, data_info, df_state],
        )

        # Uploaded data feeds the info box and the state.
        upload_btn.click(
            fn=on_upload, inputs=[file_upload], outputs=[data_info, df_state]
        )

        # The output order must match the tuple returned by run_transformation:
        # scatter figure, histogram figure, history figure, results text.
        transform_btn.click(
            fn=run_transformation,
            inputs=[
                df_state,
                columns_input,
                sigma,
                max_iterations,
            ],
            outputs=[scatter_plot, hist_plot, history_plot, results_text],
        )

    return app


# Build and launch the interface only when this file is run as a script,
# not when it is imported.
if __name__ == "__main__":
    app = create_app()
    app.launch()