"""Figure metadata used by the v2 plotting system."""
from __future__ import annotations


# Per-figure metadata, keyed by figure id (the output file stem).
#
# Every entry carries the same five fields:
#   paper_position -- section of the paper the figure belongs to
#   main_text      -- True for main-text figures, False for appendix figures
#   purpose        -- one-sentence statement of what the figure demonstrates
#   source         -- files or documents the plotted values come from
#   caption        -- caption text; long captions are written as adjacent
#                     string literals, so each fragment keeps its own trailing
#                     space to join cleanly
FIGURE_SPECS = {
    "fig1_task_graph": {
        "paper_position": "Introduction / task definition",
        "main_text": True,
        "purpose": "Define author-paper link prediction on the heterogeneous academic graph.",
        "source": "Schematic; dataset sizes from project docs and README.",
        "caption": (
            "Heterogeneous author-paper graph and link-prediction task. Authors, papers, "
            "historical author-paper interactions, coauthor links, and directed paper-paper "
            "citations define the observed graph; each test author-paper pair is ranked for a "
            "binary recommendation decision."
        ),
    },
    "fig2_dataset_sparsity": {
        "paper_position": "Dataset",
        "main_text": True,
        "purpose": "Show sparsity, long tails, and cold-start pressure in the official graph.",
        "source": "data_and_docs/author_file_ann.txt, paper_file_ann.txt, bipartite_train_ann.txt",
        "caption": (
            "Dataset sparsity and long-tail structure. Log-log CCDFs show heavy-tailed "
            "coauthor, citation, and author-paper degrees, while the low-degree panel shows "
            "the mass of cold-start nodes that motivates structural and high-order features."
        ),
    },
    "fig3_performance_evolution": {
        "paper_position": "Results overview",
        "main_text": True,
        "purpose": "Summarize the method evolution from LightGCN to the final high-order stack.",
        "source": "README.md, reports, validation summaries, and figures_v2/data/manual_metrics.csv",
        "caption": (
            "Performance evolution across model stages. LightGCN provides the collaborative "
            "filtering backbone, graph/meta-path stacking supplies the largest jump, random-walk "
            "blocks add complementary high-order proximity, and citation-aware propagation gives "
            "the final lift to public F1 = 0.96626."
        ),
    },
    "fig4_method_pipeline": {
        "paper_position": "Method",
        "main_text": True,
        "purpose": "Explain the final two-stage LightGBM stacking pipeline.",
        "source": "README.md, CLAUDE.md, code/high_order_graph_stack.py",
        "caption": (
            "Final two-stage stacking pipeline. The first stage produces collaborative, graph, "
            "content, random-walk, and citation-propagation signals; the second-stage LightGBM "
            "stacker fuses roughly 259 features and uses a rank cutoff rather than a transferred "
            "probability threshold for submission generation."
        ),
    },
    "fig5_highorder_ablation": {
        "paper_position": "Ablation",
        "main_text": True,
        "purpose": "Quantify the value of high-order citation propagation without dual axes.",
        "source": "validation_runs/dynamic_seed202/high_order_graph_stack/validation_summary.csv",
        "caption": (
            "High-order propagation ablation. F1 and AUC are shown in separate panels to avoid "
            "dual-axis ambiguity. Rich content and random-walk blocks improve the stack, "
            "undirected high-order features add the largest late-stage gain, and directed "
            "citation propagation gives the final improvement."
        ),
    },
    "fig6_calibration_rank_cutoff": {
        "paper_position": "Decision rule / results",
        "main_text": True,
        "purpose": "Explain why rank cutoff is more robust than transferring a probability threshold.",
        "source": "validation_runs/stack_ratio_analysis.csv and high_order threshold summaries",
        "caption": (
            "Rank cutoff versus probability-threshold transfer. The validation split is "
            "artificially balanced, so validation probabilities are not calibrated for test; "
            "a rank cutoff keeps the predicted-positive ratio fixed while the transferred "
            "probability threshold drifts to about 0.524 on test."
        ),
    },
    "figA1_lightgcn_sweep": {
        "paper_position": "Appendix",
        "main_text": False,
        "purpose": "Document the LightGCN layer/dimension sweep.",
        "source": "validation_runs/dynamic_summary.csv",
        "caption": "LightGCN validation sweep over propagation depth and embedding dimension.",
    },
    "figA2_rw_ensemble": {
        "paper_position": "Appendix",
        "main_text": False,
        "purpose": "Show random-walk ensemble-size ablation.",
        "source": "validation_runs/dynamic_seed202/randomwalk_systematic/*.csv",
        "caption": "Random-walk ensemble-size ablation from the best single block to 5 and 7 blocks.",
    },
    "figA3_feature_group_contribution": {
        "paper_position": "Appendix",
        "main_text": False,
        "purpose": "Summarize incremental feature-group contributions from recorded ablations.",
        "source": "reports and figures_v2/data/manual_metrics.csv",
        "caption": "Feature-group contribution measured as recorded incremental validation-F1 gains.",
    },
    "figA4_error_buckets": {
        "paper_position": "Appendix",
        "main_text": False,
        "purpose": "Localize remaining weak regimes without a single overlong heatmap.",
        "source": "validation_runs/dynamic_seed202/error_group_calibration/error_analysis_buckets.csv",
        "caption": (
            "Error buckets reveal cold-start and weak-evidence regimes. The panels separate "
            "degree, rank/score, and local-evidence buckets and highlight the lowest-F1 rows."
        ),
    },
    "figA5_oof_pr_score": {
        "paper_position": "Appendix",
        "main_text": False,
        "purpose": "Show OOF discrimination and readable final-score distributions.",
        "source": "validation_runs/dynamic_seed202/*_oof.npy and val_labels_seed202.npy",
        "caption": (
            "OOF precision-recall curves and final-score ECDFs. The ECDF view avoids density "
            "spikes and makes positive/negative separation readable."
        ),
    },
    "figA6_feature_importance": {
        "paper_position": "Appendix optional",
        "main_text": False,
        "purpose": "LightGBM model feature importance if model metadata is reliably loadable.",
        "source": "cached_scores/lgb_model.pkl or lgb_v2_model.pkl",
        "caption": "Skipped unless LightGBM and feature names are available.",
    },
}


# Figure plan table: one seven-field tuple per figure, in presentation order.
#
# The first field is the output PDF filename. The remaining columns appear to
# be: paper section, message conveyed, chart type, data source, main-text flag
# ("Yes" / "No" / "Optional"), and a redesign note -- TODO confirm the column
# headers against whatever consumes this table.
PLAN_ROWS = [
    (
        "fig1_task_graph.pdf",
        "Introduction",
        "Task formalization",
        "Schematic",
        "Project docs",
        "Yes",
        "Redraw as compact heterogeneous-graph diagram",
    ),
    (
        "fig2_dataset_sparsity.pdf",
        "Dataset",
        "Sparse long-tail graph",
        "2x2 CCDF/bar",
        "Official edge files",
        "Yes",
        "Recompute degrees from real data",
    ),
    (
        "fig3_performance_evolution.pdf",
        "Results",
        "Stage-wise improvement",
        "Line/step plot",
        "Recorded metrics",
        "Yes",
        "Use clean dual-line plot and only three annotations",
    ),
    (
        "fig4_method_pipeline.pdf",
        "Method",
        "Two-stage stacker",
        "Architecture",
        "Code/docs",
        "Yes",
        "Aligned three-column schematic",
    ),
    (
        "fig5_highorder_ablation.pdf",
        "Ablation",
        "High-order citation lift",
        "Two-panel line plot",
        "validation_summary.csv",
        "Yes",
        "Separate F1 and AUC panels",
    ),
    (
        "fig6_calibration_rank_cutoff.pdf",
        "Decision rule",
        "Rank cutoff robustness",
        "Line + grouped bars",
        "ratio/threshold CSVs",
        "Yes",
        "Show ratio sweep and test drift",
    ),
    (
        "figA1_lightgcn_sweep.pdf",
        "Appendix",
        "LightGCN config",
        "Heatmap",
        "dynamic_summary.csv",
        "No",
        "Compact heatmap with NA cells",
    ),
    (
        "figA2_rw_ensemble.pdf",
        "Appendix",
        "RW ensemble benefit",
        "Line plot",
        "RW ablation CSVs",
        "No",
        "Small labels, no large annotations",
    ),
    (
        "figA3_feature_group_contribution.pdf",
        "Appendix",
        "Feature groups",
        "Horizontal bars",
        "Recorded metrics",
        "No",
        "Short labels and bounded x-axis",
    ),
    (
        "figA4_error_buckets.pdf",
        "Appendix",
        "Weak regimes",
        "1x3 heatmap",
        "error buckets CSV",
        "No",
        "Split long heatmap into three panels",
    ),
    (
        "figA5_oof_pr_score.pdf",
        "Appendix",
        "OOF discrimination",
        "PR + ECDF",
        "OOF NPY files",
        "No",
        "Use ECDF instead of fragile density spike",
    ),
    (
        "figA6_feature_importance.pdf",
        "Appendix",
        "Feature importance",
        "Bar chart",
        "LightGBM model",
        "Optional",
        "Skip unless reliable feature names and LightGBM are available",
    ),
]