Spaces:

vkapoor
/

Entra

Sleeping

App Files Files Community

Entra / app.py

vkapoor

add donate button

260d576 about 1 month ago

raw

history blame contribute delete

17.7 kB

	"""
	Gradio App for Entropy-Conserving Transformations

	This app demonstrates how divergence-free vector fields can transform
	arbitrary distributions towards Gaussian form while conserving entropy.
	"""

	import gradio as gr
	import matplotlib
	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd

	from entra import DataFrameTransformer, VectorSampler

	matplotlib.use("Agg")


	def generate_uniform_data(n_per_dim: int = 20, dimensions: int = 2) -> pd.DataFrame:
	    """Build a DataFrame of uniformly distributed points via ``VectorSampler``.

	    ``dimensions == 2`` produces the columns ``x`` and ``y``; any other value
	    is handled as the 3D case and produces ``x``, ``y`` and ``z``. The sampler
	    is centred on the origin and asked for ``n_per_dim`` points per dimension.
	    """
	    planar = dimensions == 2
	    axis_names = ("x", "y") if planar else ("x", "y", "z")
	    origin = [0.0, 0.0] if planar else [0.0, 0.0, 0.0]

	    grid = VectorSampler(
	        center=origin,
	        delta_x=1,
	        num_points_per_dim=n_per_dim,
	        distribution="uniform",
	    ).sample()

	    # One DataFrame column per coordinate axis, in x/y/z order.
	    return pd.DataFrame({axis: grid[:, idx] for idx, axis in enumerate(axis_names)})


	def generate_sample_csv(n_per_dim: int, dimensions: int):
	    """Generate uniform sample data, write it to a CSV file, and describe it.

	    Args:
	        n_per_dim: Number of points per dimension, forwarded to
	            ``generate_uniform_data``.
	        dimensions: Dimensionality, forwarded to ``generate_uniform_data``.

	    Returns:
	        A 3-tuple ``(csv_path, info_text, dataframe)`` where ``csv_path`` is
	        the file the data was written to, ``info_text`` is a summary with a
	        10-row preview, and ``dataframe`` is the generated data itself.
	    """
	    import tempfile

	    df = generate_uniform_data(n_per_dim, dimensions)

	    # Write to a unique temporary file per call. A single fixed path would be
	    # shared by every concurrent session of the app, so one user's download
	    # could be overwritten by another's, and "/tmp" does not exist everywhere.
	    with tempfile.NamedTemporaryFile(
	        mode="w",
	        prefix="generated_uniform_data_",
	        suffix=".csv",
	        delete=False,  # the file must outlive this function so it can be downloaded
	        newline="",
	        encoding="utf-8",
	    ) as handle:
	        df.to_csv(handle, index=False)
	        temp_path = handle.name

	    n_points = len(df)
	    cols = list(df.columns)
	    preview = df.head(10).to_string()

	    return (
	        temp_path,
	        f"Generated {n_points} points with columns: {cols}\n\nPreview:\n{preview}",
	        df,
	    )


	def load_csv_file(file):
	    """Read an uploaded CSV into a DataFrame and summarise it.

	    Args:
	        file: ``None``, a filesystem path (``str`` or ``os.PathLike``), or an
	            upload object that exposes the path as its ``name`` attribute.

	    Returns:
	        A 3-tuple ``(path, info_text, dataframe)``. When no file is given, or
	        the file cannot be read or parsed, ``path`` and ``dataframe`` are
	        ``None`` and ``info_text`` explains why.
	    """
	    import os

	    if file is None:
	        return None, "No file uploaded", None

	    # Accept plain paths as well as upload wrappers that carry ``.name``.
	    if isinstance(file, (str, os.PathLike)):
	        path = os.fspath(file)
	    else:
	        path = file.name

	    try:
	        df = pd.read_csv(path)
	    except (OSError, ValueError) as exc:
	        # ValueError covers pandas' EmptyDataError/ParserError and decoding
	        # errors; report them as text, like the other handlers in this file.
	        return None, f"Error: could not read CSV file: {exc}", None

	    n_points = len(df)
	    cols = list(df.columns)
	    preview = df.head(10).to_string()

	    return (
	        path,
	        f"Loaded {n_points} points with columns: {cols}\n\nPreview:\n{preview}",
	        df,
	    )


	def run_transformation(
	    df_state,
	    columns_str: str,
	    sigma: float,
	    max_iterations: int,
	    progress=gr.Progress(),
	):
	    """Run the transformation on the loaded data and build plots and a report.

	    Args:
	        df_state: The currently loaded DataFrame, or ``None`` if nothing has
	            been loaded yet.
	        columns_str: Comma-separated names of the columns to transform.
	        sigma: Forwarded to ``DataFrameTransformer`` as ``sigma``.
	        max_iterations: Forwarded to ``DataFrameTransformer`` as
	            ``max_iterations``.
	        progress: Gradio progress tracker, updated from the transformer's
	            progress callback.

	    Returns:
	        A 4-tuple ``(scatter_fig, hist_fig, history_fig, results_text)``. When
	        validation fails the three figures are ``None`` and ``results_text``
	        holds an ``"Error: ..."`` message.
	    """
	    if df_state is None:
	        return (
	            None,
	            None,
	            None,
	            "Error: No data loaded. Please upload or generate data first.",
	        )

	    df = df_state

	    # Parse columns; drop blank entries so a trailing comma is not reported
	    # as a missing column named ''.
	    columns = [name.strip() for name in (columns_str or "").split(",")]
	    columns = [name for name in columns if name]
	    if not columns:
	        return (
	            None,
	            None,
	            None,
	            f"Error: No columns specified. Available: {list(df.columns)}",
	        )

	    # Validate columns exist
	    missing = [c for c in columns if c not in df.columns]
	    if missing:
	        return (
	            None,
	            None,
	            None,
	            f"Error: Columns not found: {missing}. Available: {list(df.columns)}",
	        )

	    # Progress callback for the transformer
	    def progress_callback(iteration, max_iter, det_val, entropy_val):
	        # Guard the fraction so a zero iteration budget cannot divide by zero.
	        fraction = iteration / max_iter if max_iter else 0.0
	        progress(
	            fraction,
	            desc=f"Iter {iteration}/{max_iter} | Det: {det_val:.2e} | H: {entropy_val:.4f}",
	        )

	    # Create transformer with progress callback
	    transformer = DataFrameTransformer(
	        sigma=sigma,
	        max_iterations=max_iterations,
	        verbose=False,
	        progress_callback=progress_callback,
	    )

	    # Run transformation
	    df_transformed = transformer.fit_transform(df, columns=columns)

	    # Get entropy comparison; the original data's uniform entropy is the
	    # target line drawn on the history plot.
	    entropy = transformer.get_entropy_comparison(df, df_transformed)
	    target_entropy = entropy["original"]["uniform_entropy"]

	    # Create plots
	    fig_scatter = create_scatter_plot(df, df_transformed, columns)
	    fig_hist = create_histogram_plot(df, df_transformed, columns)
	    fig_history = create_history_plot(
	        transformer.history_, target_entropy=target_entropy
	    )

	    # Create results text
	    results_text = format_results(entropy, transformer.history_)

	    return fig_scatter, fig_hist, fig_history, results_text


	def create_scatter_plot(df_orig, df_trans, columns):
	    """Draw side-by-side scatter plots of the data before and after.

	    The left panel shows ``df_orig`` in blue and the right panel shows
	    ``df_trans`` in red. Only ``columns[0]`` and ``columns[1]`` are plotted;
	    with fewer than two columns both panels are left empty.

	    Returns:
	        The Matplotlib figure, already released from pyplot's figure registry.
	    """
	    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

	    if len(columns) >= 2:
	        x_col, y_col = columns[0], columns[1]
	        panels = (
	            (axes[0], df_orig, "blue", "Original Distribution"),
	            (axes[1], df_trans, "red", "Transformed (Towards Gaussian)"),
	        )
	        for ax, frame, color, title in panels:
	            ax.scatter(frame[x_col], frame[y_col], c=color, alpha=0.5, s=10)
	            ax.set_xlabel(x_col)
	            ax.set_ylabel(y_col)
	            ax.set_title(title)
	            ax.set_aspect("equal")
	            ax.grid(True, alpha=0.3)

	    fig.tight_layout()
	    # pyplot keeps every figure it creates alive until it is closed; in a
	    # long-running server that accumulates. The Figure object itself stays
	    # usable after being closed.
	    plt.close(fig)
	    return fig


	def create_histogram_plot(df_orig, df_trans, columns):
	    """Draw marginal histograms for up to the first three columns.

	    Each row corresponds to one column: the left panel is the histogram of
	    the original data, the right panel the histogram of the transformed data
	    with a Gaussian density (using the transformed column's mean and standard
	    deviation) drawn on top. The overlay is skipped when the standard
	    deviation is not a positive finite number.

	    Returns:
	        The Matplotlib figure, already released from pyplot's figure registry.
	    """
	    n_cols = min(len(columns), 3)
	    # squeeze=False always yields a 2-D axes array, even for a single row.
	    fig, axes = plt.subplots(n_cols, 2, figsize=(12, 4 * n_cols), squeeze=False)

	    for i, col in enumerate(columns[:n_cols]):
	        # Original
	        ax_orig = axes[i, 0]
	        ax_orig.hist(df_orig[col], bins=30, density=True, alpha=0.7, color="blue")
	        ax_orig.set_xlabel(col)
	        ax_orig.set_ylabel("Density")
	        ax_orig.set_title(f"Original {col} Marginal")

	        # Transformed with Gaussian overlay
	        ax_trans = axes[i, 1]
	        values = df_trans[col]
	        ax_trans.hist(values, bins=30, density=True, alpha=0.7, color="red")
	        mu = values.mean()
	        std = values.std()
	        # A constant column (std == 0) or a single row (std is NaN) has no
	        # meaningful Gaussian fit and would divide by zero below.
	        if np.isfinite(std) and std > 0:
	            x_range = np.linspace(values.min(), values.max(), 100)
	            gaussian = (1 / (std * np.sqrt(2 * np.pi))) * np.exp(
	                -0.5 * ((x_range - mu) / std) ** 2
	            )
	            ax_trans.plot(
	                x_range, gaussian, "k--", linewidth=2, label="Gaussian fit"
	            )
	            ax_trans.legend()
	        ax_trans.set_xlabel(col)
	        ax_trans.set_ylabel("Density")
	        ax_trans.set_title(f"Transformed {col} Marginal")

	    fig.tight_layout()
	    # pyplot keeps every figure it creates alive until it is closed; in a
	    # long-running server that accumulates. The Figure object itself stays
	    # usable after being closed.
	    plt.close(fig)
	    return fig


	def create_history_plot(history, target_entropy=None):
	    """Plot the optimization history as two panels.

	    Args:
	        history: Mapping with the keys ``"iteration"``, ``"determinant"`` and
	            ``"gaussian_entropy"``, each holding one value per recorded step.
	        target_entropy: Optional value drawn as a dashed horizontal reference
	            line on the entropy panel.

	    Returns:
	        The Matplotlib figure, already released from pyplot's figure registry.
	    """
	    fig, (det_ax, entropy_ax) = plt.subplots(1, 2, figsize=(12, 4))
	    iterations = history["iteration"]

	    # Determinant (log-scaled y axis)
	    det_ax.semilogy(iterations, history["determinant"], "b-o", markersize=4)
	    det_ax.set_xlabel("Iteration")
	    det_ax.set_ylabel("Covariance Determinant")
	    det_ax.set_title("Determinant Minimization")
	    det_ax.grid(True, alpha=0.3)

	    # Gaussian entropy
	    entropy_ax.plot(iterations, history["gaussian_entropy"], "r-o", markersize=4)
	    if target_entropy is not None:
	        entropy_ax.axhline(
	            target_entropy,
	            color="green",
	            linestyle="--",
	            linewidth=2,
	            label=f"Target H(uniform) = {target_entropy:.4f}",
	        )
	        entropy_ax.legend()
	    entropy_ax.set_xlabel("Iteration")
	    entropy_ax.set_ylabel("H(Gaussian)")
	    entropy_ax.set_title("Gaussian Entropy → Target Uniform Entropy")
	    entropy_ax.grid(True, alpha=0.3)

	    fig.tight_layout()
	    # pyplot keeps every figure it creates alive until it is closed; in a
	    # long-running server that accumulates. The Figure object itself stays
	    # usable after being closed.
	    plt.close(fig)
	    return fig


	def format_results(entropy, history):
	    """Render the entropy comparison and optimization history as a text report.

	    Args:
	        entropy: Mapping with ``"original"`` (keys ``"determinant"`` and
	            ``"uniform_entropy"``) and ``"transformed"`` (keys
	            ``"determinant"`` and ``"gaussian_entropy"``) entries.
	        history: Mapping with the keys ``"iteration"``, ``"determinant"`` and
	            ``"gaussian_entropy"``, each a sequence with one value per step.

	    Returns:
	        The multi-line report string. The determinant reduction is shown as
	        ``n/a`` when the transformed determinant is zero, and the final
	        optimization values are shown as ``n/a`` when the history is empty.
	    """
	    original_det = entropy["original"]["determinant"]
	    transformed_det = entropy["transformed"]["determinant"]
	    target_entropy = entropy["original"]["uniform_entropy"]
	    final_entropy = entropy["transformed"]["gaussian_entropy"]
	    entropy_gap = final_entropy - target_entropy

	    # Avoid dividing by a zero determinant (degenerate transformed covariance).
	    if transformed_det == 0:
	        reduction = "n/a (transformed determinant is zero)"
	    else:
	        reduction = f"{original_det / transformed_det:.2f}x"

	    # Indexing [-1] on an empty history would raise, so report "n/a" instead.
	    n_recorded = len(history["iteration"])
	    if n_recorded:
	        final_det = f"{history['determinant'][-1]:.6e}"
	        final_h = f"{history['gaussian_entropy'][-1]:.6f}"
	    else:
	        final_det = final_h = "n/a"

	    text = f"""
	TRANSFORMATION RESULTS
	{'=' * 50}

	Target Entropy (Uniform Distribution):
	H(uniform) = {target_entropy:.6f} nats

	This is the true entropy we want to reach.

	Gaussian Entropy of Transformed Data:
	H(Gaussian) = {final_entropy:.6f} nats

	This assumes the transformed data is Gaussian with the
	current covariance. When H(Gaussian) = H(uniform), the
	distribution is perfectly Gaussian.

	Gap to Target:
	H(Gaussian) - H(uniform) = {entropy_gap:.6f} nats
	(Should approach 0 for perfect Gaussianization)

	Covariance Determinant:
	Original: {original_det:.6e}
	Transformed: {transformed_det:.6e}
	Reduction: {reduction}

	Optimization:
	Iterations with improvement: {n_recorded}
	Final determinant: {final_det}
	Final H(Gaussian): {final_h}
	"""
	    return text


	# Markdown text explaining the Levenberg-Marquardt algorithm; rendered
	# verbatim in the "How LM Works" tab built by create_app().
	LM_EXPLANATION = """
	## How the Levenberg-Marquardt Algorithm Works

	The Levenberg-Marquardt (LM) algorithm is used to minimize the covariance determinant. Unlike gradient descent, LM has no learning rate - here's why:

	### The Key Insight

	LM is designed for least-squares problems where you minimize a sum of squared residuals. Instead of taking steps proportional to the gradient (like gradient descent), LM solves a local linear approximation of the problem at each step.

	### How It Works

	1. Compute the Jacobian `J` - the matrix of partial derivatives of residuals with respect to parameters

	2. Solve the normal equations:
	```
	(J^T J + λI) δ = -J^T r
	```
	where `r` is the residual vector and `λ` is a damping parameter

	3. The damping parameter λ replaces the learning rate:
	- When `λ` is large: The step is small and in the gradient direction (like gradient descent with small learning rate)
	- When `λ` is small: The step approaches the Gauss-Newton step (a direct jump to the local minimum of the quadratic approximation)

	4. Adaptive adjustment:
	- If a step decreases the objective: Accept it and decrease λ (take bigger steps)
	- If a step increases the objective: Reject it and increase λ (take smaller, safer steps)

	### Why No Learning Rate?

	The LM algorithm automatically adapts its step size through the damping parameter λ:
	- It starts cautious (large λ, small steps)
	- As it finds a good direction, it becomes more aggressive (small λ, large steps)
	- If it overshoots, it backs off automatically

	This makes LM much more robust than gradient descent - you don't need to tune a learning rate!

	### In This Application

	We minimize `log(det(Cov))` where `Cov` is the covariance matrix of the transformed points. The transformation is parameterized by coefficients of divergence-free basis functions, ensuring the transformation is volume-preserving and thus entropy-conserving.
	"""

	# Markdown text covering the theoretical background; rendered verbatim in
	# the "Theory" tab built by create_app().
	THEORY_EXPLANATION = """
	## Theoretical Background

	### Maximum Entropy Principle

	A fundamental theorem states: Among all distributions with a given covariance matrix, the Gaussian has maximum entropy.

	This means for any distribution with entropy `H₀` and covariance `Σ`:
	- The Gaussian with the same covariance has entropy `H_Gaussian(Σ) ≥ H₀`
	- Equality holds only when the distribution is Gaussian

	### The Key Insight

	If we apply a volume-preserving transformation:
	1. The entropy stays fixed at `H₀` (entropy is conserved)
	2. But the covariance changes

	By minimizing the covariance determinant while preserving entropy:
	- We reduce `H_Gaussian(Σ)` (the Gaussian entropy bound)
	- When `H_Gaussian(Σ) = H₀`, the distribution must be Gaussian!

	### Why Divergence-Free?

	Divergence-free vector fields define volume-preserving transformations:
	- The Jacobian determinant equals 1 everywhere
	- Total probability volume is conserved
	- Entropy is conserved under the transformation

	This is the incompressibility condition from fluid dynamics: `∇·v = 0`

	### The Operator

	We construct divergence-free basis functions using Lowitzsch's operator:

	Ô = -I∇² + ∇∇ᵀ

	Applied to Gaussian RBFs, this produces matrix-valued functions where each column is a divergence-free vector field.
	"""


	def create_app():
	    """Build and return the Gradio ``Blocks`` interface for the demo.

	    The interface has three tabs: "Transform Data", "How LM Works" and
	    "Theory". The first tab holds the data-loading and configuration
	    controls in one column and the results (a text report plus three plots)
	    in a second, wider column. The three buttons are wired to
	    ``generate_sample_csv``, ``load_csv_file`` and ``run_transformation``.
	    """
	    # NOTE(review): the nesting of the ``with`` blocks below follows the
	    # apparent grouping of the widgets - confirm against the rendered layout.
	    with gr.Blocks(
	        title="Entropy-Conserving Transformations", theme=gr.themes.Soft()
	    ) as app:
	        # Page header.
	        gr.Markdown(
	            """
	# Entropy-Conserving Transformations Using Divergence-Free Vector Fields

	Transform arbitrary distributions towards Gaussian form while conserving entropy.

	This demo uses divergence-free basis functions to create volume-preserving transformations,
	then minimizes the covariance determinant using the Levenberg-Marquardt algorithm.
	"""
	        )

	        # Donation button, emitted as raw HTML.
	        gr.HTML("""
	<div style="text-align: center; margin: 10px 0;">
	<a href="https://www.paypal.com/donate?business=varun.kapoor@kapoorlabs.org&currency_code=EUR" target="_blank" style="text-decoration: none;">
	<button style="background-color: #0070ba; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer; font-size: 14px; font-weight: bold;">
	☕ Buy me a coffee (PayPal)
	</button>
	</a>
	</div>
	""")

	        # State to hold the dataframe: written by the generate/upload
	        # handlers below and read by run_transformation.
	        df_state = gr.State(None)

	        with gr.Tabs():
	            with gr.Tab("Transform Data"):
	                with gr.Row():
	                    # Controls column.
	                    with gr.Column(scale=1):
	                        gr.Markdown("### Step 1: Load or Generate Data")

	                        gr.Markdown(
	                            """
	No CSV file? Use "Generate Sample Data" below to create a uniform grid.

	Have your own CSV? Format requirements:
	- Header row with column names
	- Numeric columns for coordinates (e.g., `x`, `y`, `z`)
	- Example:
	```
	x,y
	-9.5,-9.5
	-9.5,-8.5
	...
	```
	"""
	                        )

	                        with gr.Accordion(
	                            "Generate Sample Data (no CSV needed)", open=True
	                        ):
	                            gr.Markdown(
	                                "Creates a uniform grid using VectorSampler - perfect for testing"
	                            )
	                            n_per_dim = gr.Slider(
	                                minimum=5,
	                                maximum=500,
	                                value=20,
	                                step=1,
	                                label="Points per dimension",
	                            )
	                            dimensions = gr.Radio(
	                                choices=[2, 3], value=2, label="Dimensions"
	                            )
	                            generate_btn = gr.Button(
	                                "Generate Uniform Distribution",
	                                variant="primary",
	                            )
	                            download_file = gr.File(label="Download generated CSV")

	                        with gr.Accordion("Upload Your Own CSV", open=False):
	                            file_upload = gr.File(
	                                label="Upload CSV file", file_types=[".csv"]
	                            )
	                            upload_btn = gr.Button("Load CSV", variant="secondary")

	                        # Summary text shared by both loading paths.
	                        data_info = gr.Textbox(
	                            label="Data Info", lines=8, interactive=False
	                        )

	                        gr.Markdown("### Step 2: Configure Transformation")

	                        columns_input = gr.Textbox(
	                            value="x, y",
	                            label="Columns to transform (comma-separated)",
	                            lines=3,
	                        )
	                        sigma = gr.Slider(
	                            minimum=0.1,
	                            maximum=200.0,
	                            value=5.0,
	                            step=0.1,
	                            label="Sigma (RBF width)",
	                        )
	                        max_iterations = gr.Slider(
	                            minimum=10,
	                            maximum=5000,
	                            value=100,
	                            step=10,
	                            label="Max iterations",
	                        )

	                        transform_btn = gr.Button(
	                            "Run Transformation", variant="primary", size="lg"
	                        )

	                    # Results column.
	                    with gr.Column(scale=2):
	                        gr.Markdown("### Results")

	                        results_text = gr.Textbox(
	                            label="Transformation Results",
	                            lines=20,
	                            interactive=False,
	                        )

	                        with gr.Row():
	                            scatter_plot = gr.Plot(label="Before/After Scatter")

	                        with gr.Row():
	                            hist_plot = gr.Plot(label="Marginal Distributions")

	                        with gr.Row():
	                            history_plot = gr.Plot(label="Optimization History")

	            with gr.Tab("How LM Works"):
	                gr.Markdown(LM_EXPLANATION)

	            with gr.Tab("Theory"):
	                gr.Markdown(THEORY_EXPLANATION)

	        # Event handlers
	        def on_generate(n, dims):
	            # Returns (csv path, info text, DataFrame) for the three outputs.
	            path, info, df = generate_sample_csv(n, dims)
	            return path, info, df

	        def on_upload(file):
	            # The path from load_csv_file has no output component here, so
	            # only the info text and the DataFrame are passed on.
	            path, info, df = load_csv_file(file)
	            return info, df

	        generate_btn.click(
	            fn=on_generate,
	            inputs=[n_per_dim, dimensions],
	            outputs=[download_file, data_info, df_state],
	        )

	        upload_btn.click(
	            fn=on_upload, inputs=[file_upload], outputs=[data_info, df_state]
	        )

	        # The output order matches run_transformation's return tuple.
	        transform_btn.click(
	            fn=run_transformation,
	            inputs=[
	                df_state,
	                columns_input,
	                sigma,
	                max_iterations,
	            ],
	            outputs=[scatter_plot, hist_plot, history_plot, results_text],
	        )

	    return app


	# Script entry point: build the interface and launch it. The module-level
	# name ``app`` is bound only when the file is run directly.
	if __name__ == "__main__":
	    app = create_app()
	    app.launch()