File size: 13,997 Bytes
03ac562
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
#!/usr/bin/env python3
"""Generate results.md and latex_results_section.tex from computed outputs."""
import os
import sys
import json
import glob
import argparse
import pandas as pd
import numpy as np
from datetime import datetime

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.utils import ensure_dir


def _md_asset_path(directory, filename):
    """Join *directory* and *filename* into a forward-slash relative path.

    Forward slashes keep the path valid inside Markdown links on every
    platform; ``os.path.exists`` accepts them on Windows as well.
    """
    return os.path.join(directory, filename).replace(os.sep, '/')


def _append_md_figure(report, fig_dir, stem, alt_text, show_source=False):
    """Embed ``<fig_dir>/<stem>.png`` in *report* if the file exists.

    Args:
        report: list of Markdown fragments, appended to in place.
        fig_dir: directory that holds the PNG figures.
        stem: figure file name without the ``.png`` extension.
        alt_text: alt text for the Markdown image.
        show_source: also append an italic ``*Source: ...*`` line.

    Returns:
        True when the figure file was found and embedded, False otherwise.
    """
    fig_path = _md_asset_path(fig_dir, f'{stem}.png')
    if not os.path.exists(fig_path):
        return False
    report.append(f"![{alt_text}]({fig_path})\n")
    if show_source:
        report.append(f"*Source: {fig_path}*\n")
    return True


def _read_md_table(tables_dir, stem):
    """Return the text of ``<tables_dir>/<stem>.md``, or None if it is missing."""
    table_path = _md_asset_path(tables_dir, f'{stem}.md')
    if not os.path.exists(table_path):
        return None
    # The report mixes in non-ASCII symbols, so read tables as UTF-8
    # explicitly instead of relying on the platform default encoding.
    with open(table_path, encoding='utf-8') as f:
        return f.read()


def generate_results_md(proc_csv, tables_dir, fig_dir):
    """Generate the main results.md report.

    Args:
        proc_csv: path to the processed results CSV. A missing or completely
            empty file is treated as "no records" rather than an error.
        tables_dir: directory searched for pre-rendered ``*.md`` tables.
        fig_dir: directory searched for ``*.png`` figures.

    Returns:
        The full Markdown report as one string (fragments joined by newlines).
        Figures and tables that do not exist on disk are left out.
    """
    if os.path.exists(proc_csv):
        try:
            df = pd.read_csv(proc_csv)
        except pd.errors.EmptyDataError:
            # A zero-byte CSV carries no more information than a missing one.
            df = pd.DataFrame()
    else:
        df = pd.DataFrame()
    has_rows = len(df) > 0

    report = []
    report.append("# Experimental Results\n")
    report.append(f"Generated: {datetime.now().isoformat()}\n")
    report.append(f"Total records: {len(df)}\n")

    # 1. Goals
    report.append("\n## 1. Goals\n")
    report.append("""
Validate the weighted Dobrushin locality theory for variational inference
in matrix factorization models. The core theoretical chain is:

- Small weighted interaction mass → deletion influence decay → accurate local unlearning → low gradient interference.

We test this on synthetic data (Gamma-Poisson, Gaussian-Gaussian, Gaussian-Gamma MAP)
and real data (Last.fm, MovieLens).
""")

    # 2. Setup
    report.append("\n## 2. Setup\n")
    if has_rows:
        if 'dataset_type' in df.columns:
            syn_df = df[df['dataset_type'] == 'synthetic']
            real_df = df[df['dataset_type'] == 'real']
        else:
            # Without a dataset_type column every record is counted as synthetic.
            syn_df = df
            real_df = pd.DataFrame()

        report.append(f"- Synthetic experiments: {len(syn_df)} deletion records\n")
        report.append(f"- Real-data experiments: {len(real_df)} deletion records\n")

        if 'model_family' in df.columns:
            for mf in df['model_family'].unique():
                n = len(df[df['model_family'] == mf])
                report.append(f"- Model family `{mf}`: {n} records\n")

        if 'graph_type' in df.columns:
            # str() guards against non-string labels, which would break join().
            graph_types = ', '.join(map(str, df['graph_type'].dropna().unique()))
            report.append(f"- Graph types: {graph_types}\n")

    # 3. Synthetic Experiments
    report.append("\n## 3. Synthetic Experiments\n")

    report.append("\n### 3.1 Influence Decay\n")
    if not _append_md_figure(report, fig_dir, 'synthetic_influence_vs_distance',
                             'Influence vs Distance', show_source=True):
        report.append("*Figure not generated.*\n")

    if has_rows and 'dataset_type' in df.columns:
        # Restrict to synthetic Poisson-Gamma runs; when there is no
        # model_family column, all synthetic rows are kept.
        is_syn_pg = df['dataset_type'] == 'synthetic'
        if 'model_family' in df.columns:
            is_syn_pg = is_syn_pg & (df['model_family'] == 'poisson_gamma')
        syn_pg = df[is_syn_pg]
        if 'empirical_decay_mu' in syn_pg.columns:
            valid_mu = syn_pg['empirical_decay_mu'].dropna()
            if len(valid_mu) > 0:
                report.append(f"\nEmpirical decay rate μ_emp: mean={valid_mu.mean():.4f}, median={valid_mu.median():.4f}, std={valid_mu.std():.4f}\n")

    for heading, stem, alt_text in [
        ('3.2 Error vs Radius', 'synthetic_error_vs_radius', 'Error vs Radius'),
        ('3.3 Theory Proxy χ(z)', 'synthetic_chi_vs_error', 'Chi vs Error'),
        ('3.4 Interference', 'synthetic_interference_vs_chi', 'Interference vs Chi'),
        ('3.5 Runtime', 'synthetic_runtime_vs_error', 'Runtime vs Error'),
    ]:
        report.append(f"\n### {heading}\n")
        _append_md_figure(report, fig_dir, stem, alt_text, show_source=True)

    # 4. Real-World Experiments
    report.append("\n## 4. Real-World Experiments\n")

    report.append("\n### 4.1 Dataset Summary\n")
    dataset_table = _read_md_table(tables_dir, 'table_real_datasets')
    if dataset_table is not None:
        report.append(dataset_table)

    for heading, stem, alt_text in [
        ('4.2 Influence Decay', 'real_influence_vs_distance', 'Real Influence vs Distance'),
        ('4.3 Error vs Radius', 'real_error_vs_radius', 'Real Error vs Radius'),
        ('4.4 χ(z) Proxy', 'real_chi_vs_error', 'Real Chi vs Error'),
    ]:
        report.append(f"\n### {heading}\n")
        _append_md_figure(report, fig_dir, stem, alt_text)

    # 5. Model-Family Ablation
    report.append("\n## 5. Model-Family Ablation\n")
    report.append("""
We compare Gaussian-Gaussian, Gaussian-Gamma MAP, and Poisson-Gamma MF
to test whether locality is specific to the conjugate count model or appears more broadly.
""")

    for figname, label in [
        ('model_family_influence_vs_distance', 'M1: Influence by Model Family'),
        ('model_family_decay_mu', 'M2: Decay Rate by Model Family'),
        ('model_family_error_vs_radius', 'M3: Error vs Radius by Model Family'),
        ('model_family_proxy_vs_error', 'M4: Proxy vs Error Across Models'),
        ('model_family_prior_noise_ablation', 'M5: Prior/Noise Ablation'),
    ]:
        # Here the sub-heading itself is only emitted when the figure exists.
        fig_path = _md_asset_path(fig_dir, f'{figname}.png')
        if os.path.exists(fig_path):
            report.append(f"\n### {label}\n")
            report.append(f"![{label}]({fig_path})\n")

    # 6. Tables
    report.append("\n## 6. Tables\n")
    for tbl_name in ['table_synthetic_regimes', 'table_correlations', 'table_method_comparison',
                     'table_model_family_summary', 'table_model_family_correlations']:
        table_text = _read_md_table(tables_dir, tbl_name)
        if table_text is not None:
            report.append(f"\n### {tbl_name.replace('_', ' ').title()}\n")
            report.append(table_text)

    # 7. Main Findings
    report.append("\n## 7. Main Findings\n")
    if has_rows:
        if 'empirical_decay_mu' in df.columns:
            mu_vals = df['empirical_decay_mu'].dropna()
            if len(mu_vals) > 0:
                pct_positive = (mu_vals > 0).mean() * 100
                report.append(f"- {pct_positive:.1f}% of deletion experiments show positive decay rate (influence decreases with distance)\n")

        if 'rel_error_R2' in df.columns and 'rel_error_R4' in df.columns:
            r2_vals = df['rel_error_R2'].dropna()
            r4_vals = df['rel_error_R4'].dropna()
            if len(r2_vals) > 0 and len(r4_vals) > 0:
                report.append(f"- Mean relative error at R=2: {r2_vals.mean():.4f}, at R=4: {r4_vals.mean():.4f}\n")
                if r4_vals.mean() < r2_vals.mean():
                    report.append("- Local approximation error decreases with radius, consistent with locality theory\n")

        if 'chi_seed_max' in df.columns and 'rel_error_R2' in df.columns:
            # Imported lazily so scipy is only needed when the correlation is computed.
            from scipy import stats as scipy_stats
            # Keep rows where both quantities are present and finite.
            pairs = df[['chi_seed_max', 'rel_error_R2']].dropna()
            x, y = pairs['chi_seed_max'], pairs['rel_error_R2']
            mask = np.isfinite(x) & np.isfinite(y)
            if mask.sum() > 5:
                r, p = scipy_stats.spearmanr(x[mask], y[mask])
                report.append(f"- Spearman correlation between χ_max(z) and local error (R=2): ρ={r:.3f} (p={p:.2e})\n")

    report.append("\n## 8. Limitations and Failure Cases\n")
    report.append("See `debug.md` for numerical issues, convergence failures, and excluded runs.\n")

    return '\n'.join(report)


def _latex_figure_path(fig_dir, stem):
    """Return ``<fig_dir>/<stem>.pdf`` using forward slashes, as LaTeX expects."""
    return os.path.join(fig_dir, f'{stem}.pdf').replace(os.sep, '/')


def _latex_figure_pair(fig_dir, left_stem, right_stem, caption, label):
    """Build the lines of a two-panel ``figure`` environment.

    The two PDFs are placed side by side at 0.48\\textwidth each. The returned
    list ends with an empty string so a blank line follows the environment.
    """
    include = r"\includegraphics[width=0.48\textwidth]{"
    return [
        r"\begin{figure}[t]",
        r"\centering",
        include + _latex_figure_path(fig_dir, left_stem) + "}",
        r"\hfill",
        include + _latex_figure_path(fig_dir, right_stem) + "}",
        r"\caption{" + caption + "}",
        r"\label{" + label + "}",
        r"\end{figure}",
        "",
    ]


def generate_latex(proc_csv, tables_dir, fig_dir):
    """Generate latex_results_section.tex.

    Args:
        proc_csv: path to the processed results CSV. A missing or completely
            empty file is treated as "no records"; the data-driven sentences
            are then omitted.
        tables_dir: accepted for signature parity with ``generate_results_md``;
            this section does not include any tables, so it is not read.
        fig_dir: directory prefix used in the ``\\includegraphics`` paths.

    Returns:
        The LaTeX source of the results section as one newline-joined string.
        Figure PDFs are referenced unconditionally; their existence on disk
        is not checked here.
    """
    if os.path.exists(proc_csv):
        try:
            df = pd.read_csv(proc_csv)
        except pd.errors.EmptyDataError:
            # A zero-byte CSV carries no more information than a missing one.
            df = pd.DataFrame()
    else:
        df = pd.DataFrame()

    latex = []
    latex.append(r"\section{Experimental Results}")
    latex.append("")

    latex.append(r"\subsection{Synthetic Validation}")
    latex.append("")
    latex.append(r"We validate the weighted Dobrushin locality theory on synthetic Gamma--Poisson matrix factorization data across bounded-degree, Erd\H{o}s--R\'{e}nyi, and power-law bipartite graphs.")
    latex.append("")

    # Influence decay figure
    latex.extend(_latex_figure_pair(
        fig_dir,
        'synthetic_influence_vs_distance',
        'synthetic_error_vs_radius',
        r"Left: Mean deletion influence vs.\ graph distance from the seed set. Right: Local approximation error vs.\ unlearning radius $R$.",
        "fig:synthetic_main",
    ))

    # Theory proxy
    latex.extend(_latex_figure_pair(
        fig_dir,
        'synthetic_chi_vs_error',
        'synthetic_interference_vs_chi',
        r"Left: Weighted interaction proxy $\chi_{\max}(z)$ vs.\ local unlearning error. Right: Gradient interference vs.\ $\chi_{\max}(z)$.",
        "fig:synthetic_theory",
    ))

    # Add data-driven text
    if len(df) > 0 and 'empirical_decay_mu' in df.columns:
        # The sentence below is about synthetic runs, so drop real-data rows
        # whenever the CSV lets us tell them apart.
        if 'dataset_type' in df.columns:
            synthetic_rows = df[df['dataset_type'] == 'synthetic']
        else:
            synthetic_rows = df
        mu_vals = synthetic_rows['empirical_decay_mu'].dropna()
        if len(mu_vals) > 0:
            pct = (mu_vals > 0).mean() * 100
            latex.append(f"Across all synthetic configurations, {pct:.0f}\\% of deletions exhibit positive empirical decay rates, indicating that deletion influence decreases with graph distance from the seed set.")
            latex.append("")

    # Real data
    latex.append(r"\subsection{Real-Data Experiments}")
    latex.append("")
    latex.append(r"We test the theory on two public datasets: Last.fm user--artist listening counts and MovieLens movie ratings converted to integer counts.")
    latex.append("")

    latex.extend(_latex_figure_pair(
        fig_dir,
        'real_influence_vs_distance',
        'real_chi_vs_error',
        r"Left: Deletion influence vs.\ distance on real datasets. Right: Interaction proxy $\chi_{\max}(z)$ vs.\ local error on real data.",
        "fig:real_main",
    ))

    # Model family ablation
    latex.append(r"\subsection{Ablation across likelihood--prior families}")
    latex.append("")
    latex.append(r"To test whether the locality phenomenon is specific to the Gamma--Poisson model, we compare three matrix factorization families: Gaussian--Gaussian, Gaussian--Gamma MAP, and Poisson--Gamma.")
    latex.append("")

    latex.extend(_latex_figure_pair(
        fig_dir,
        'model_family_influence_vs_distance',
        'model_family_error_vs_radius',
        r"Left: Influence decay across model families. Right: Error vs.\ radius across model families.",
        "fig:model_family",
    ))

    # Runtime
    latex.append(r"\subsection{Runtime and Algorithmic Implications}")
    latex.append("")

    latex.extend(_latex_figure_pair(
        fig_dir,
        'synthetic_runtime_vs_error',
        'real_runtime_vs_error',
        r"Runtime vs.\ approximation error trade-off for different unlearning methods.",
        "fig:runtime",
    ))

    if len(df) > 0 and 'runtime_exact' in df.columns:
        exact_mean = df['runtime_exact'].mean()
        for R in [2, 4]:
            rt_col = f'runtime_local_R{R}'
            if rt_col not in df.columns:
                continue
            local_mean = df[rt_col].mean()
            # An all-NaN column has a NaN mean; skip it rather than print "nan".
            if not (np.isfinite(exact_mean) and np.isfinite(local_mean)):
                continue
            # Ratio of mean runtimes; the floor avoids division by zero.
            speedup = exact_mean / max(local_mean, 1e-6)
            latex.append(f"Local unlearning at radius $R={R}$ achieves a mean speedup of {speedup:.1f}$\\times$ over exact retraining.")
            latex.append("")

    return '\n'.join(latex)


def main():
    """Command-line entry point: write ``results.md`` and ``latex_results_section.tex``.

    Both files are written to the current working directory. Inputs are read
    from the fixed locations ``results/processed/all_results.csv``,
    ``results/tables`` and ``results/figures``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default='config/default.yaml')
    # --config is accepted on the command line, but nothing below reads it.
    parser.parse_args()

    proc_csv = 'results/processed/all_results.csv'
    tables_dir = 'results/tables'
    fig_dir = 'results/figures'

    # Generate results.md
    print("Generating results.md...")
    report = generate_results_md(proc_csv, tables_dir, fig_dir)
    # The report contains non-ASCII symbols, so the encoding is pinned to
    # UTF-8; the platform default can fail to encode them.
    with open('results.md', 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"  Saved results.md ({len(report)} chars)")

    # Generate LaTeX
    print("Generating latex_results_section.tex...")
    latex = generate_latex(proc_csv, tables_dir, fig_dir)
    with open('latex_results_section.tex', 'w', encoding='utf-8') as f:
        f.write(latex)
    print(f"  Saved latex_results_section.tex ({len(latex)} chars)")


if __name__ == '__main__':
    main()