Add scripts/generate_report.py

03ac562 verified 19 days ago

14 kB

	#!/usr/bin/env python3
	"""Generate results.md and latex_results_section.tex from computed outputs."""
	import os
	import sys
	import json
	import glob
	import argparse
	import pandas as pd
	import numpy as np
	from datetime import datetime

	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	from src.utils import ensure_dir


	def _md_asset_path(directory, filename):
	    """Join *directory* and *filename* using forward slashes.

	    Forward slashes keep the result usable as a Markdown link target
	    regardless of the platform's native path separator.
	    """
	    return os.path.join(directory, filename).replace(os.sep, '/')


	def _md_figure_lines(alt_text, fig_path, with_source=False, missing_note=None):
	    """Return the Markdown lines that embed *fig_path* if the file exists.

	    When the file is absent the result is ``[missing_note]`` if a note was
	    supplied and an empty list otherwise.
	    """
	    if not os.path.exists(fig_path):
	        return [] if missing_note is None else [missing_note]
	    lines = [f"![{alt_text}]({fig_path})\n"]
	    if with_source:
	        lines.append(f"Source: {fig_path}\n")
	    return lines


	def generate_results_md(proc_csv, tables_dir, fig_dir):
	    """Build the Markdown text of the main results.md report.

	    Args:
	        proc_csv: Path to the processed results CSV. A missing file is
	            treated as an empty result set.
	        tables_dir: Directory searched for pre-rendered ``table_*.md``
	            files, which are inlined verbatim.
	        fig_dir: Directory searched for ``*.png`` figures. A figure is
	            linked only when its file exists.

	    Returns:
	        The report as a single string (fragments joined with newlines).
	    """
	    df = pd.read_csv(proc_csv) if os.path.exists(proc_csv) else pd.DataFrame()
	    has_rows = len(df) > 0

	    report = [
	        "# Experimental Results\n",
	        f"Generated: {datetime.now().isoformat()}\n",
	        f"Total records: {len(df)}\n",
	    ]

	    # 1. Goals
	    report.append("\n## 1. Goals\n")
	    report.append(
	        "\n"
	        "Validate the weighted Dobrushin locality theory for variational inference\n"
	        "in matrix factorization models. The core theoretical chain is:\n"
	        "\n"
	        "- Small weighted interaction mass → deletion influence decay → accurate local unlearning → low gradient interference.\n"
	        "\n"
	        "We test this on synthetic data (Gamma-Poisson, Gaussian-Gaussian, Gaussian-Gamma MAP)\n"
	        "and real data (Last.fm, MovieLens).\n"
	    )

	    # 2. Setup
	    report.append("\n## 2. Setup\n")
	    if has_rows:
	        if 'dataset_type' in df.columns:
	            syn_df = df[df['dataset_type'] == 'synthetic']
	            real_df = df[df['dataset_type'] == 'real']
	        else:
	            # Without a dataset_type column every record is counted as synthetic.
	            syn_df, real_df = df, pd.DataFrame()

	        report.append(f"- Synthetic experiments: {len(syn_df)} deletion records\n")
	        report.append(f"- Real-data experiments: {len(real_df)} deletion records\n")

	        if 'model_family' in df.columns:
	            for family in df['model_family'].unique():
	                count = len(df[df['model_family'] == family])
	                report.append(f"- Model family `{family}`: {count} records\n")

	        if 'graph_type' in df.columns:
	            # str() keeps the join from failing on non-string graph labels.
	            graph_types = ', '.join(map(str, df['graph_type'].dropna().unique()))
	            report.append(f"- Graph types: {graph_types}\n")

	    # 3. Synthetic Experiments
	    report.append("\n## 3. Synthetic Experiments\n")

	    report.append("\n### 3.1 Influence Decay\n")
	    report.extend(_md_figure_lines(
	        "Influence vs Distance",
	        _md_asset_path(fig_dir, 'synthetic_influence_vs_distance.png'),
	        with_source=True,
	        missing_note="Figure not generated.\n",
	    ))

	    if has_rows and 'dataset_type' in df.columns:
	        syn_mask = df['dataset_type'] == 'synthetic'
	        # Rows are narrowed to the Poisson-Gamma family only when the
	        # model_family column is present.
	        if 'model_family' in df.columns:
	            syn_mask = syn_mask & (df['model_family'] == 'poisson_gamma')
	        syn_pg = df[syn_mask]
	        if 'empirical_decay_mu' in syn_pg.columns:
	            valid_mu = syn_pg['empirical_decay_mu'].dropna()
	            if len(valid_mu) > 0:
	                report.append(
	                    f"\nEmpirical decay rate μ_emp: mean={valid_mu.mean():.4f}, "
	                    f"median={valid_mu.median():.4f}, std={valid_mu.std():.4f}\n"
	                )

	    for heading, alt_text, stem in [
	        ("3.2 Error vs Radius", "Error vs Radius", 'synthetic_error_vs_radius'),
	        ("3.3 Theory Proxy χ(z)", "Chi vs Error", 'synthetic_chi_vs_error'),
	        ("3.4 Interference", "Interference vs Chi", 'synthetic_interference_vs_chi'),
	        ("3.5 Runtime", "Runtime vs Error", 'synthetic_runtime_vs_error'),
	    ]:
	        report.append(f"\n### {heading}\n")
	        report.extend(_md_figure_lines(
	            alt_text, _md_asset_path(fig_dir, f'{stem}.png'), with_source=True))

	    # 4. Real-World Experiments
	    report.append("\n## 4. Real-World Experiments\n")

	    report.append("\n### 4.1 Dataset Summary\n")
	    table_path = _md_asset_path(tables_dir, 'table_real_datasets.md')
	    if os.path.exists(table_path):
	        with open(table_path, encoding='utf-8') as f:
	            report.append(f.read())

	    for heading, alt_text, stem in [
	        ("4.2 Influence Decay", "Real Influence vs Distance", 'real_influence_vs_distance'),
	        ("4.3 Error vs Radius", "Real Error vs Radius", 'real_error_vs_radius'),
	        ("4.4 χ(z) Proxy", "Real Chi vs Error", 'real_chi_vs_error'),
	    ]:
	        report.append(f"\n### {heading}\n")
	        report.extend(_md_figure_lines(alt_text, _md_asset_path(fig_dir, f'{stem}.png')))

	    # 5. Model-Family Ablation
	    report.append("\n## 5. Model-Family Ablation\n")
	    report.append(
	        "\n"
	        "We compare Gaussian-Gaussian, Gaussian-Gamma MAP, and Poisson-Gamma MF\n"
	        "to test whether locality is specific to the conjugate count model or appears more broadly.\n"
	    )

	    for figname, label in [
	        ('model_family_influence_vs_distance', 'M1: Influence by Model Family'),
	        ('model_family_decay_mu', 'M2: Decay Rate by Model Family'),
	        ('model_family_error_vs_radius', 'M3: Error vs Radius by Model Family'),
	        ('model_family_proxy_vs_error', 'M4: Proxy vs Error Across Models'),
	        ('model_family_prior_noise_ablation', 'M5: Prior/Noise Ablation'),
	    ]:
	        fig_path = _md_asset_path(fig_dir, f'{figname}.png')
	        # Unlike sections 3 and 4, the heading itself is skipped when the
	        # figure is missing.
	        if os.path.exists(fig_path):
	            report.append(f"\n### {label}\n")
	            report.append(f"![{label}]({fig_path})\n")

	    # 6. Tables
	    report.append("\n## 6. Tables\n")
	    for tbl_name in [
	        'table_synthetic_regimes',
	        'table_correlations',
	        'table_method_comparison',
	        'table_model_family_summary',
	        'table_model_family_correlations',
	    ]:
	        md_path = _md_asset_path(tables_dir, f'{tbl_name}.md')
	        if os.path.exists(md_path):
	            report.append(f"\n### {tbl_name.replace('_', ' ').title()}\n")
	            with open(md_path, encoding='utf-8') as f:
	                report.append(f.read())

	    # 7. Main Findings
	    report.append("\n## 7. Main Findings\n")
	    if has_rows:
	        if 'empirical_decay_mu' in df.columns:
	            mu_vals = df['empirical_decay_mu'].dropna()
	            if len(mu_vals) > 0:
	                pct_positive = (mu_vals > 0).mean() * 100
	                report.append(
	                    f"- {pct_positive:.1f}% of deletion experiments show positive decay rate "
	                    "(influence decreases with distance)\n"
	                )

	        if 'rel_error_R2' in df.columns and 'rel_error_R4' in df.columns:
	            r2_vals = df['rel_error_R2'].dropna()
	            r4_vals = df['rel_error_R4'].dropna()
	            if len(r2_vals) > 0 and len(r4_vals) > 0:
	                report.append(
	                    f"- Mean relative error at R=2: {r2_vals.mean():.4f}, "
	                    f"at R=4: {r4_vals.mean():.4f}\n"
	                )
	                if r4_vals.mean() < r2_vals.mean():
	                    report.append(
	                        "- Local approximation error decreases with radius, "
	                        "consistent with locality theory\n"
	                    )

	        if 'chi_seed_max' in df.columns and 'rel_error_R2' in df.columns:
	            # Keep only rows where both values are present and finite.
	            pairs = (
	                df[['chi_seed_max', 'rel_error_R2']]
	                .replace([np.inf, -np.inf], np.nan)
	                .dropna()
	            )
	            if len(pairs) > 5:
	                # Imported lazily so scipy is needed only when the
	                # correlation is actually computed.
	                from scipy import stats as scipy_stats
	                r, p = scipy_stats.spearmanr(pairs['chi_seed_max'], pairs['rel_error_R2'])
	                # spearmanr yields NaN for constant input; skip that line
	                # rather than report "ρ=nan".
	                if np.isfinite(r):
	                    report.append(
	                        f"- Spearman correlation between χ_max(z) and local error (R=2): "
	                        f"ρ={r:.3f} (p={p:.2e})\n"
	                    )

	    report.append("\n## 8. Limitations and Failure Cases\n")
	    report.append("See `debug.md` for numerical issues, convergence failures, and excluded runs.\n")

	    return '\n'.join(report)


	def _latex_two_panel_figure(fig_dir, left_stem, right_stem, caption, label):
	    """Return the lines of a two-panel ``figure`` environment plus a blank line.

	    Both panels are PDFs named ``<stem>.pdf`` inside *fig_dir*; paths are
	    written with forward slashes.
	    """
	    def include(stem):
	        pdf_path = os.path.join(fig_dir, f"{stem}.pdf").replace(os.sep, '/')
	        return r"\includegraphics[width=0.48\textwidth]{" + pdf_path + "}"

	    return [
	        r"\begin{figure}[t]",
	        r"\centering",
	        include(left_stem),
	        r"\hfill",
	        include(right_stem),
	        r"\caption{" + caption + "}",
	        r"\label{" + label + "}",
	        r"\end{figure}",
	        "",
	    ]


	def generate_latex(proc_csv, tables_dir, fig_dir):
	    """Build the text of latex_results_section.tex.

	    Args:
	        proc_csv: Path to the processed results CSV. A missing file is
	            treated as an empty result set, in which case only the static
	            text and figure environments are produced.
	        tables_dir: Accepted for signature parity with
	            ``generate_results_md``; no tables are emitted here.
	        fig_dir: Directory prefix written into every ``\\includegraphics``
	            path. Figures are referenced unconditionally (their existence
	            is not checked).

	    Returns:
	        The LaTeX source as a single newline-joined string.
	    """
	    df = pd.read_csv(proc_csv) if os.path.exists(proc_csv) else pd.DataFrame()
	    has_rows = len(df) > 0

	    latex = [
	        r"\section{Experimental Results}",
	        "",
	        r"\subsection{Synthetic Validation}",
	        "",
	        r"We validate the weighted Dobrushin locality theory on synthetic Gamma--Poisson matrix factorization data across bounded-degree, Erd\H{o}s--R\'{e}nyi, and power-law bipartite graphs.",
	        "",
	    ]

	    # Influence decay figure
	    latex.extend(_latex_two_panel_figure(
	        fig_dir,
	        'synthetic_influence_vs_distance',
	        'synthetic_error_vs_radius',
	        r"Left: Mean deletion influence vs.\ graph distance from the seed set. Right: Local approximation error vs.\ unlearning radius $R$.",
	        "fig:synthetic_main",
	    ))

	    # Theory proxy
	    latex.extend(_latex_two_panel_figure(
	        fig_dir,
	        'synthetic_chi_vs_error',
	        'synthetic_interference_vs_chi',
	        r"Left: Weighted interaction proxy $\chi_{\max}(z)$ vs.\ local unlearning error. Right: Gradient interference vs.\ $\chi_{\max}(z)$.",
	        "fig:synthetic_theory",
	    ))

	    # Data-driven text
	    if has_rows and 'empirical_decay_mu' in df.columns:
	        # The sentence below speaks about synthetic configurations, so
	        # real-data rows are excluded whenever they can be told apart.
	        decay_df = df
	        if 'dataset_type' in df.columns:
	            decay_df = df[df['dataset_type'] == 'synthetic']
	        mu_vals = decay_df['empirical_decay_mu'].dropna()
	        if len(mu_vals) > 0:
	            pct = (mu_vals > 0).mean() * 100
	            latex.append(f"Across all synthetic configurations, {pct:.0f}\\% of deletions exhibit positive empirical decay rates, indicating that deletion influence decreases with graph distance from the seed set.")
	            latex.append("")

	    # Real data
	    latex.extend([
	        r"\subsection{Real-Data Experiments}",
	        "",
	        r"We test the theory on two public datasets: Last.fm user--artist listening counts and MovieLens movie ratings converted to integer counts.",
	        "",
	    ])
	    latex.extend(_latex_two_panel_figure(
	        fig_dir,
	        'real_influence_vs_distance',
	        'real_chi_vs_error',
	        r"Left: Deletion influence vs.\ distance on real datasets. Right: Interaction proxy $\chi_{\max}(z)$ vs.\ local error on real data.",
	        "fig:real_main",
	    ))

	    # Model family ablation
	    latex.extend([
	        r"\subsection{Ablation across likelihood--prior families}",
	        "",
	        r"To test whether the locality phenomenon is specific to the Gamma--Poisson model, we compare three matrix factorization families: Gaussian--Gaussian, Gaussian--Gamma MAP, and Poisson--Gamma.",
	        "",
	    ])
	    latex.extend(_latex_two_panel_figure(
	        fig_dir,
	        'model_family_influence_vs_distance',
	        'model_family_error_vs_radius',
	        r"Left: Influence decay across model families. Right: Error vs.\ radius across model families.",
	        "fig:model_family",
	    ))

	    # Runtime
	    latex.extend([
	        r"\subsection{Runtime and Algorithmic Implications}",
	        "",
	    ])
	    latex.extend(_latex_two_panel_figure(
	        fig_dir,
	        'synthetic_runtime_vs_error',
	        'real_runtime_vs_error',
	        r"Runtime vs.\ approximation error trade-off for different unlearning methods.",
	        "fig:runtime",
	    ))

	    if has_rows and 'runtime_exact' in df.columns:
	        for R in [2, 4]:
	            rt_col = f'runtime_local_R{R}'
	            if rt_col not in df.columns:
	                continue
	            # Compare the two runtimes on the same records only, so both
	            # means are taken over one population.
	            paired = df[['runtime_exact', rt_col]].dropna()
	            if paired.empty:
	                continue
	            # The 1e-6 floor guards against division by a zero mean runtime.
	            speedup = paired['runtime_exact'].mean() / max(paired[rt_col].mean(), 1e-6)
	            if not np.isfinite(speedup):
	                continue
	            latex.append(f"Local unlearning at radius $R={R}$ achieves a mean speedup of {speedup:.1f}$\\times$ over exact retraining.")
	            latex.append("")

	    return '\n'.join(latex)


	def main():
	    """Write results.md and latex_results_section.tex to the working directory.

	    Inputs are read from the fixed ``results/`` layout below. ``--config``
	    is accepted on the command line but its value is not used by this
	    script.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument('--config', type=str, default='config/default.yaml')
	    # Still parsed so that --help works and unknown flags are rejected.
	    parser.parse_args()

	    proc_csv = 'results/processed/all_results.csv'
	    tables_dir = 'results/tables'
	    fig_dir = 'results/figures'

	    # Generate results.md
	    print("Generating results.md...")
	    report = generate_results_md(proc_csv, tables_dir, fig_dir)
	    # The report contains non-ASCII symbols (χ, μ, ρ, →); an explicit
	    # UTF-8 encoding avoids UnicodeEncodeError where the platform default
	    # encoding cannot represent them.
	    with open('results.md', 'w', encoding='utf-8') as f:
	        f.write(report)
	    print(f" Saved results.md ({len(report)} chars)")

	    # Generate LaTeX
	    print("Generating latex_results_section.tex...")
	    latex = generate_latex(proc_csv, tables_dir, fig_dir)
	    with open('latex_results_section.tex', 'w', encoding='utf-8') as f:
	        f.write(latex)
	    print(f" Saved latex_results_section.tex ({len(latex)} chars)")


	if __name__ == '__main__':
	    main()