Samarth0710 committed
Commit 572d3da (verified) · 1 Parent(s): 9943870

Deploy PaperBanana app

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.

Files changed (50)
  1. .gitattributes +37 -0
  2. Dockerfile +27 -0
  3. README.md +6 -5
  4. aesthetic_guidelines.py +64 -0
  5. agents/__init__.py +17 -0
  6. agents/critic.py +234 -0
  7. agents/planner.py +117 -0
  8. agents/retriever.py +151 -0
  9. agents/stylist.py +104 -0
  10. agents/visualizer.py +199 -0
  11. app.py +283 -0
  12. config.py +22 -0
  13. data/spotlight_reference_images/ref_0001_00232_GraphMaster_Automated_Graph_Synthesis_via_LLM_Agents_in_Data-Limited_Environments__fdf13132133da88f7ce9ae4d0a22c29da1f05f75072f95010a29b1392696ea70.jpg +0 -0
  14. data/spotlight_reference_images/ref_0002_00279_VoxDet_Rethinking_3D_Semantic_Scene_Completion_as_Dense_Object_Detection__7b55d87bf0fcf6d787a440d59bf4617e6d73f10f5b1bcc1b45736ad0a7a57911.jpg +0 -0
  15. data/spotlight_reference_images/ref_0003_00491_SQS_Enhancing_Sparse_Perception_Models_via_Query-based_Splatting_in_Autonomous_Driving__f7c6f154dc3e45e2f58f5a9111ebf57ead0d19f9e77340cd3838d881af17a916.jpg +0 -0
  16. data/spotlight_reference_images/ref_0004_00691_ProtInvTree_Deliberate_Protein_Inverse_Folding_with_Reward-guided_Tree_Search__713bbbec11cbef0f1d2c8901e17165fa7db3d1fcf6cfa0d4a8b803d2dccb2ca0.jpg +0 -0
  17. data/spotlight_reference_images/ref_0005_00738_Mulberry_Empowering_MLLM_with_o1-like_Reasoning_and_Reflection_via_Collective_Monte_Carlo_Tree_Search__7c987fc8cf213eb47a038117cc38e5a170938289a390de4b8d7b6cd88512d505.jpg +0 -0
  18. data/spotlight_reference_images/ref_0006_00981_MesaTask_Towards_Task-Driven_Tabletop_Scene_Generation_via_3D_Spatial_Reasoning__ddce4bdb68a1689579a491b5a31349db83e5046fbb5424a45dc0883d4115e7d5.jpg +3 -0
  19. data/spotlight_reference_images/ref_0007_01003_OmniSync_Towards_Universal_Lip_Synchronization_via_Diffusion_Transformers__28d606ccd79ed54496343219767701efb0f445058d39887c1cf629a800942f77.jpg +3 -0
  20. data/spotlight_reference_images/ref_0008_01041_Enhancing_Time_Series_Forecasting_through_Selective_Representation_Spaces_A_Patch_Perspective__ea910fb78f4d4027ff7fe9a63cbbc68a7038ee2e9f4250c2f773c631f265b079.jpg +0 -0
  21. data/spotlight_reference_images/ref_0009_01041_Enhancing_Time_Series_Forecasting_through_Selective_Representation_Spaces_A_Patch_Perspective__4cc6ca3e3fa15065b6a1781e9a7f814d67a46649325289b63c9bea4072020f4c.jpg +0 -0
  22. data/spotlight_reference_images/ref_0010_01112_DiCo_Revitalizing_ConvNets_for_Scalable_and_Efficient_Diffusion_Modeling__589d9f3ec341480c16ec00bf41076999cd0ce6c1526b97e71eb9c8ffe33b1a1b.jpg +0 -0
  23. data/spotlight_reference_images/ref_0011_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__b501bcad7830654e421726b37ea0d89207d68c72f0f6aa9179fec65e0c71b205.jpg +3 -0
  24. data/spotlight_reference_images/ref_0012_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__fb4694d88a5097ee72a936634a832ced85d11e90079a4fabc3f1b8c28f24e5d6.jpg +0 -0
  25. data/spotlight_reference_images/ref_0013_01620_FutureSightDrive_Thinking_Visually_with_Spatio-Temporal_CoT_for_Autonomous_Driving__8a4bffe9a69ef0d2bc0ced518bccdb3146d38727d7fc7402a418b6227a78bcfb.jpg +0 -0
  26. data/spotlight_reference_images/ref_0014_01659_G-Memory_Tracing_Hierarchical_Memory_for_Multi-Agent_Systems__48772f699bccd9ecf7285d9f2c4af85d34d60b7ee6b2cbd681278611869db12b.jpg +3 -0
  27. data/spotlight_reference_images/ref_0015_01839_Mesh-RFT_Enhancing_Mesh_Generation_via_Fine-grained_Reinforcement_Fine-Tuning__4d93550a61cdae636652e7cd1ca974c21c7fca1e8eb6eaa6fa2e03100f1d0f68.jpg +0 -0
  28. data/spotlight_reference_images/ref_0016_01864_Jacobian-Based_Interpretation_of_Nonlinear_Neural_Encoding_Model__c9ac984c977d21aff284dcfbacf7bdfea345cc73096a3f28d624b026654e1740.jpg +0 -0
  29. data/spotlight_reference_images/ref_0017_02109_OnlineSplatter_Pose-Free_Online_3D_Reconstruction_for_Free-Moving_Objects__abe4e12f0fc9a7487c6c5774f5ead6b391ede9f832960b5ce462ceb786534a0d.jpg +0 -0
  30. data/spotlight_reference_images/ref_0018_02239_RobustMerge_Parameter-Efficient_Model_Merging_for_MLLMs_with_Direction_Robustness__45b74fc957d305067e6f46f00bc32c9ee0889b7dc7e18b4c46913d263c1d8c16.jpg +0 -0
  31. data/spotlight_reference_images/ref_0019_02373_MDReID_Modality-Decoupled_Learning_for_Any-to-Any_Multi-Modal_Object_Re-Identification__517a2ed3f7dd16e048d526da7807ac8b1b73cdb062b79021b7047dd0b467ba9a.jpg +3 -0
  32. data/spotlight_reference_images/ref_0020_03077_Toward_Relative_Positional_Encoding_in_Spiking_Transformers__bdb178031ec8d263c3a5388d900e974b8ce39ae5759a6611f688a57e22173fa5.jpg +3 -0
  33. data/spotlight_reference_images/ref_0021_03670_Neural_Atlas_Graphs_for_Dynamic_Scene_Decomposition_and_Editing__16f8fc865baed696e798502de56f3b473dbf2b0b6aa3c1286f384de53c524b97.jpg +0 -0
  34. data/spotlight_reference_images/ref_0022_03671_STITCH-OPE_Trajectory_Stitching_with_Guided_Diffusion_for_Off-Policy_Evaluation__6e9c875dbc74a8b39bc947de3b928f8a29f7c6db9b03d726a62a961ba0c2fdd3.jpg +0 -0
  35. data/spotlight_reference_images/ref_0023_04013_scMRDR_A_scalable_and_flexible_framework_for_unpaired_single-cell_multi-omics_data_integration__85cde1b5a410b0d3275a5c0fa81dfe69e2c83c485c9917c296502a6485e2f68b.jpg +0 -0
  36. data/spotlight_reference_images/ref_0024_04165_Transformer_Copilot_Learning_from_The_Mistake_Log_in_LLM_Fine-tuning__8d1e804f51825d9760a37b8d1ca027a61deecc5e33fc172f0c1789814527c37e.jpg +0 -0
  37. data/spotlight_reference_images/ref_0025_04571_GeRaF_Neural_Geometry_Reconstruction_from_Radio_Frequency_Signals__e096c21c76e1eb88c8d865f73e832e8e9246cddec17f62b5d85bc051caa55165.jpg +0 -0
  38. data/spotlight_reference_images/ref_0026_04647_HopaDIFF_Holistic-Partial_Aware_Fourier_Conditioned_Diffusion_for_Referring_Human_Action_Segmentation_in_Multi-Person_Sc__0e15e7e99b52ccbc157e3ee04a6de2238b23ccd60543102fc2a70eddb12e5e41.jpg +0 -0
  39. data/spotlight_reference_images/ref_0027_04717_CSBrain_A_Cross-scale_Spatiotemporal_Brain_Foundation_Model_for_EEG_Decoding__327e1ef5f11138cef78e3dd270f09eea700aedb3c5a64cf848b125d13e4e5f08.jpg +0 -0
  40. data/spotlight_reference_images/ref_0028_05129_Learning_to_Factorize_Spatio-Temporal_Foundation_Models__ef11d1775e9863839bc4dfaf8711bf474ef80365303220ba92781b579879e7a5.jpg +0 -0
  41. data/spotlight_reference_images/ref_0029_05428_EDELINE_Enhancing_Memory_in_Diffusion-based_World_Models_via_Linear-Time_Sequence_Modeling__57df86687acccd0bad49997d26a4a442d6d09e2381e5a570d1d1e7efd02cf303.jpg +0 -0
  42. data/spotlight_reference_images/ref_0030_05467_Vision-centric_Token_Compression_in_Large_Language_Model__056bbf059f83c91ea896c610cef2927606ab780d910996e6cdb293dfaca40ddd.jpg +0 -0
  43. data/spotlight_reference_images/ref_0031_05610_Repo2Run_Automated_Building_Executable_Environment_for_Code_Repository_at_Scale__ea96f359e23ff3f0427d48dd3247314967bb531150d725a7680376b940324680.jpg +0 -0
  44. data/spotlight_reference_images/ref_0032_05774_Shallow_Diffuse_Robust_and_Invisible_Watermarking_through_Low-Dim_Subspaces_in_Diffusion_Models__703f7602f642aa354858ee8cf929888d672a45001a93ac0cb937cd0f4f1b62de.jpg +0 -0
  45. data/spotlight_reference_images/ref_0033_05814_Mozart_Modularized_and_Efficient_MoE_Training_on_35D_Wafer-Scale_Chiplet_Architectures__dc6b73dd98f93241717e3b658c319f70ab6ad6188c24a147cd052fb2153d656d.jpg +0 -0
  46. data/spotlight_reference_images/ref_0034_06044_Theory-Driven_Label-Specific_Representation_for_Incomplete_Multi-View_Multi-Label_Learning__4d4bbb3c5cd4edb56f73502b7eac2b526f30bfc883b337d949cb38ee0747ee22.jpg +0 -0
  47. data/spotlight_reference_images/ref_0035_06067_EAG3R_Event-Augmented_3D_Geometry_Estimation_for_Dynamic_and_Extreme-Lighting_Scenes__8c0d4df1862409d631870861dc2f047a0cd2572e87267d0d3ea58b6c245408fe.jpg +0 -0
  48. data/spotlight_reference_images/ref_0036_06067_EAG3R_Event-Augmented_3D_Geometry_Estimation_for_Dynamic_and_Extreme-Lighting_Scenes__a50d002b445446ba0c687045ba485e4c96aaf49765198b3f53fb954327df6f4f.jpg +0 -0
  49. data/spotlight_reference_images/ref_0037_06507_CausalPFN_Amortized_Causal_Effect_Estimation_via_In-Context_Learning__8c660a9d9ad153e854bc67151e7df9977e2244fa4ce9bd649c2b08b58db2e30c.jpg +0 -0
  50. data/spotlight_reference_images/ref_0038_06527_Robust_Graph_Condensation_via_Classification_Complexity_Mitigation__dedb33c198673910da24a5a2a5794a8228afce42d7b46ff594dab0cac9ee61e0.jpg +3 -0
.gitattributes CHANGED
@@ -33,3 +33,40 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0006_00981_MesaTask_Towards_Task-Driven_Tabletop_Scene_Generation_via_3D_Spatial_Reasoning__ddce4bdb68a1689579a491b5a31349db83e5046fbb5424a45dc0883d4115e7d5.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0007_01003_OmniSync_Towards_Universal_Lip_Synchronization_via_Diffusion_Transformers__28d606ccd79ed54496343219767701efb0f445058d39887c1cf629a800942f77.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0011_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__b501bcad7830654e421726b37ea0d89207d68c72f0f6aa9179fec65e0c71b205.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0014_01659_G-Memory_Tracing_Hierarchical_Memory_for_Multi-Agent_Systems__48772f699bccd9ecf7285d9f2c4af85d34d60b7ee6b2cbd681278611869db12b.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0019_02373_MDReID_Modality-Decoupled_Learning_for_Any-to-Any_Multi-Modal_Object_Re-Identification__517a2ed3f7dd16e048d526da7807ac8b1b73cdb062b79021b7047dd0b467ba9a.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0020_03077_Toward_Relative_Positional_Encoding_in_Spiking_Transformers__bdb178031ec8d263c3a5388d900e974b8ce39ae5759a6611f688a57e22173fa5.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0038_06527_Robust_Graph_Condensation_via_Classification_Complexity_Mitigation__dedb33c198673910da24a5a2a5794a8228afce42d7b46ff594dab0cac9ee61e0.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0040_06606_Disentangled_Concepts_Speak_Louder_Than_Words_Explainable_Video_Action_Recognition__fe96a76b160d3861e188cfe5511fee2d4f07eada1ebf92ade017048a3362d5b8.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0041_07858_HYPERION_Fine-Grained_Hypersphere_Alignment_for_Robust_Federated_Graph_Learning__be29a99497ec2dd4d3a8993ce3edc85c87505a1cabfadc2df234b7e326633ebc.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0043_08772_RepoMaster_Autonomous_Exploration_and_Understanding_of_GitHub_Repositories_for_Complex_Task_Solving__c5102f7309c920d53df2307418ef99304d083aa3f54140e3fa2e55c6b259378b.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0046_09315_Personalized_Decision_Modeling_Utility_Optimization_or_Textualized-Symbolic_Reasoning__56df140d7973f4f1a6286c7cebec84068dc6828711cea2e455416a0d9d381a99.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0048_09629_4DGT_Learning_a_4D_Gaussian_Transformer_Using_Real-World_Monocular_Videos__64516a621163af326843f0152bd0cdb8f798d2df70242271249a95e572c7a300.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0052_10520_TrajMamba_An_Efficient_and_Semantic-rich_Vehicle_Trajectory_Pre-training_Model__02d2a7b7aa1ad60cce35445c46fdcb453afa273d8a170ce0abe33ab2c8c6f245.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0058_13240_A_machine_learning_approach_that_beats_Rubiks_cubes__aec92a7999c868664250d8e9aad60b03dbacabd440355bec73af28c512c9d18a.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0060_14126_Deno-IF_Unsupervised_Noisy_Visible_and_Infrared_Image_Fusion_Method__685d5064d5b82a4e2e38976afb4b02e3359ccff154e8231646e76cb16970b7a0.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0070_15841_Co-Reinforcement_Learning_for_Unified_Multimodal_Understanding_and_Generation__8d1bdeb48a8ecdf31ace6493caea90ec34e8e10428d5b91397cd5531c2b33b09.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0074_16584_IA-GGAD_Zero-shot_Generalist_Graph_Anomaly_Detection_via_Invariant_and_Affinity_Learning__2be0842b017b1925d07b25f0276d02ffabc8a00fe4e73cc930b0ee0096fcfd40.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0079_19455_Wide-Horizon_Thinking_and_Simulation-Based_Evaluation_for_Real-World_LLM_Planning_with_Multifaceted_Constraints__198c85b432b1ddbf1afc76ab8c98e057973c4763acbaa0f16a4da51d97460935.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0087_22755_Breaking_the_Batch_Barrier_B3_of_Contrastive_Learning_via_Smart_Batch_Mining__588e6566b5416ccc18d5f5733612cbc3caa0c11fe2e3bc5c57c32c88a9ec2e41.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0095_26919_FUDOKI_Discrete_Flow-based_Unified_Understanding_and_Generation_via_Kinetic-Optimal_Velocities__9c0a452656594ea3134b9cdcb16988663e9015013c42419254bc35661139b69f.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0096_26975_LogicTree_Improving_Complex_Reasoning_of_LLMs_via_Instantiated_Multi-step_Synthetic_Logical_Data__75918e90d782aa4c0011abbf0fe69a93e2595315a84e5dd3ba23b1cddfb672b5.jpg filter=lfs diff=lfs merge=lfs -text
+ data/spotlight_reference_images/ref_0098_27155_DexFlyWheel_A_Scalable_and_Self-improving_Data_Generation_Framework_for_Dexterous_Manipulation__6cb5d9f8f05d11ff6e6bf4f69015de5ad79051d577633793092cf0b753f0d1aa.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/basic_example_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/basic_example_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/basic_example_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/neurips_refs_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/neurips_refs_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/neurips_refs_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/readme/ddpm_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/readme/ddpm_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/readme/ddpm_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/readme/resnet_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/readme/resnet_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/readme/resnet_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/readme/transformer_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/readme/transformer_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/readme/transformer_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,27 @@
+ FROM python:3.10-slim
+
+ # System deps
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create non-root user (HF Spaces requirement)
+ RUN useradd -m -u 1000 user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR /app
+
+ # Install Python deps first (cache layer)
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Copy app code
+ COPY --chown=user . .
+
+ USER user
+
+ EXPOSE 7860
+
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,10 +1,11 @@
---
title: PaperBanana
- emoji: 🌍
- colorFrom: purple
+ emoji: 🍌
+ colorFrom: yellow
colorTo: yellow
sdk: docker
- pinned: false
+ app_file: app.py
+ pinned: true
+ license: mit
+ short_description: Methodology text to architecture diagrams
---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
aesthetic_guidelines.py ADDED
@@ -0,0 +1,64 @@
+ """
+ Aesthetic Guidelines (G) for academic illustration styling.
+ Based on Appendix F of the PaperBanana paper.
+ """
+
+ AESTHETIC_GUIDELINE = """
+ # Academic Illustration Style Guide (NeurIPS Style)
+
+ ## Color Palette
+ - **Overall Aesthetic:** Soft Tech & Scientific Pastels ("NeurIPS Look")
+ - **Background Colors:** Cream (#FFF8E7), Pale Blue (#E3F2FD), Mint (#E8F5E9)
+ - **Accent Colors:**
+   - Soft Blue (#64B5F6) for primary processes
+   - Soft Orange (#FFB74D) for secondary/iterative processes
+   - Soft Purple (#9575CD) for highlighting key components
+   - Soft Green (#81C784) for success/outputs
+ - **Use color to group logical components**
+
+ ## Shapes and Components
+ - **Process Boxes:** Rounded rectangles with subtle shadows
+ - **Data/Tensors:** 3D stacks or layered rectangles
+ - **Databases/Storage:** Cylinders or drum shapes
+ - **Agents/Models:** Robot or brain icons with labels
+ - **Inputs/Outputs:** Parallelograms or cloud shapes
+
+ ## Lines and Arrows
+ - **Network/Architecture Diagrams:** Orthogonal/Elbow connectors
+ - **Logic Flow:** Curved arrows for feedback loops
+ - **Data Flow:** Straight arrows with clear directionality
+ - **Arrow Styles:** Solid for primary flow, dashed for optional/conditional
+
+ ## Typography
+ - **Labels:** Sans-serif fonts (Arial, Roboto, Helvetica)
+ - **Mathematical Variables:** Serif Italic (Times New Roman) - use LaTeX notation (e.g., $P$, $P^*$)
+ - **Font Sizes:**
+   - Main labels: 12-14pt
+   - Subscript/technical: 10pt
+   - Section headers: 16pt bold
+
+ ## Layout Principles
+ - **Hierarchy:** Left-to-right or top-to-bottom flow
+ - **Grouping:** Use containers/boxes with subtle backgrounds to group related components
+ - **Spacing:** Generous whitespace, consistent padding
+ - **Alignment:** Grid-based layout, aligned elements
+ - **Balance:** Visual weight distributed evenly
+
+ ## Technical Details
+ - **Line Weight:** 1.5-2pt for main elements, 1pt for details
+ - **Corner Radius:** 8-12px for rounded rectangles
+ - **Shadow:** Subtle drop shadow (opacity 10-20%)
+ - **Icons:** Simple, consistent style throughout
+
+ ## Diagram-Specific Guidelines
+ ### Architecture Diagrams
+ - Show clear input → process → output flow
+ - Use containers to separate phases/stages
+ - Include feedback loops where applicable
+
+ ### Methodology Diagrams
+ - Emphasize the pipeline structure
+ - Show agent interactions clearly
+ - Use consistent icons for similar components
+ - Annotate with mathematical notation where relevant
+ """
agents/__init__.py ADDED
@@ -0,0 +1,17 @@
+ """
+ Agents package for PaperBanana framework.
+ """
+
+ from .retriever import RetrieverAgent
+ from .planner import PlannerAgent
+ from .stylist import StylistAgent
+ from .visualizer import VisualizerAgent
+ from .critic import CriticAgent
+
+ __all__ = [
+     'RetrieverAgent',
+     'PlannerAgent',
+     'StylistAgent',
+     'VisualizerAgent',
+     'CriticAgent'
+ ]
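
Taken together, these exports support a minimal end-to-end run of the pipeline, sketched below (assumes a valid GEMINI_API_KEY in config; the methodology text, caption, and output path are illustrative, and the empty reference set simply makes the Retriever skip ranking):

# Minimal pipeline sketch using the five agents exported above.
from agents import (
    RetrieverAgent, PlannerAgent, StylistAgent, VisualizerAgent, CriticAgent
)

methodology = "Our encoder stacks N self-attention layers over patch embeddings."
caption = "Architecture of our proposed method"

refs = RetrieverAgent(reference_set=[]).retrieve(methodology, caption)  # [] -> skipped
plan = PlannerAgent().plan(methodology, caption, refs)
styled = StylistAgent().refine(plan)
image_path = VisualizerAgent(mode="diagram").visualize(styled, "diagram")
verdict = CriticAgent().critique(methodology, caption, styled, image_path, iteration=1)
print(verdict["should_continue"], verdict["issues"])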
agents/critic.py ADDED
@@ -0,0 +1,234 @@
+ """
+ Critic Agent for PaperBanana framework.
+
+ Forms closed-loop refinement mechanism by identifying factual misalignments
+ or visual glitches and providing feedback for iterative improvement.
+ """
+ import os
+ from typing import Dict, List
+ from google import genai
+ from google.genai import types
+ import config
+
+
+ class CriticAgent:
+     """
+     Critic Agent: Provides iterative feedback for refinement.
+
+     Identifies factual misalignments, visual glitches, and areas for improvement
+     in generated illustrations, enabling closed-loop refinement.
+     """
+
+     def __init__(self):
+         """Initialize Critic Agent."""
+         self.client = genai.Client(api_key=config.GEMINI_API_KEY)
+         self.model = config.VLM_MODEL
+
+     def critique(self,
+                  methodology_text: str,
+                  caption: str,
+                  current_description: str,
+                  generated_image_path: str = None,
+                  iteration: int = 1) -> Dict[str, any]:
+         """
+         Provide critique and feedback on current illustration.
+
+         Args:
+             methodology_text: Original methodology description
+             caption: Target diagram caption
+             current_description: Current textual description
+             generated_image_path: Path to generated image (if available)
+             iteration: Current iteration number
+
+         Returns:
+             Dictionary containing:
+             - 'feedback': Textual feedback
+             - 'issues': List of identified issues
+             - 'suggestions': List of improvement suggestions
+             - 'should_continue': Boolean indicating if refinement should continue
+         """
+         prompt = self._create_critique_prompt(
+             methodology_text,
+             caption,
+             current_description,
+             iteration
+         )
+
+         contents = [
+             types.Content(
+                 role="user",
+                 parts=[types.Part.from_text(text=prompt)]
+             )
+         ]
+
+         # If we have an image, we could add it to the critique (future enhancement)
+         # For now, we critique based on the description
+
+         generate_config = types.GenerateContentConfig(
+             thinking_config=types.ThinkingConfig(
+                 thinking_level=config.THINKING_LEVEL
+             )
+         )
+
+         critique_text = ""
+         for chunk in self.client.models.generate_content_stream(
+             model=self.model,
+             contents=contents,
+             config=generate_config
+         ):
+             critique_text += chunk.text
+
+         # Parse critique into structured feedback
+         result = self._parse_critique(critique_text, iteration)
+
+         return result
+
+     def _create_critique_prompt(self,
+                                 methodology_text: str,
+                                 caption: str,
+                                 current_description: str,
+                                 iteration: int) -> str:
+         """Create prompt for critique generation."""
+         prompt = f"""You are an expert reviewer of academic illustrations, specializing in methodology diagrams.
+
+ Your task is to critically evaluate a textual description for an academic diagram and provide constructive feedback.
+
+ ORIGINAL METHODOLOGY:
+ {methodology_text}
+
+ TARGET CAPTION:
+ {caption}
+
+ CURRENT ILLUSTRATION DESCRIPTION (Iteration {iteration}):
+ {current_description}
+
+ EVALUATION CRITERIA:
+
+ 1. **Faithfulness**: Does the description accurately represent all key aspects of the methodology?
+    - Are all important components mentioned?
+    - Is the flow/logic correctly represented?
+    - Are there any factual errors or misrepresentations?
+
+ 2. **Conciseness**: Is the description appropriately detailed without being cluttered?
+    - Is information density appropriate?
+    - Are there redundant elements?
+    - Is anything unnecessarily complex?
+
+ 3. **Readability**: Will the resulting diagram be easy to understand?
+    - Is the layout logical?
+    - Are labels clear and informative?
+    - Is visual hierarchy appropriate?
+
+ 4. **Aesthetics**: Does the description specify professional visual design?
+    - Are colors, shapes, and typography well-defined?
+    - Is there visual consistency?
+    - Does it match academic publication standards?
+
+ YOUR TASK:
+ Provide a structured critique covering:
+
+ ISSUES FOUND:
+ - List specific problems (e.g., "Missing connection between X and Y")
+ - Rate severity: CRITICAL, MAJOR, or MINOR
+
+ SUGGESTIONS FOR IMPROVEMENT:
+ - Provide concrete, actionable suggestions
+ - Prioritize by impact
+
+ OVERALL ASSESSMENT:
+ - Is this ready for visualization, or does it need refinement?
+ - If iteration {iteration} < 3, should we continue refining?
+
+ OUTPUT FORMAT:
+ Structure your response as:
+
+ ISSUES:
+ 1. [SEVERITY] Issue description
+ 2. [SEVERITY] Issue description
+ ...
+
+ SUGGESTIONS:
+ 1. Specific suggestion
+ 2. Specific suggestion
+ ...
+
+ DECISION: [READY / NEEDS_REFINEMENT]
+ REASONING: Brief explanation of the decision
+ """
+         return prompt
+
+     def _parse_critique(self, critique_text: str, iteration: int) -> Dict:
+         """Parse critique text into structured format."""
+         issues = []
+         suggestions = []
+         should_continue = True
+
+         # Simple parsing - look for key sections
+         lines = critique_text.split('\n')
+         current_section = None
+
+         for line in lines:
+             line_upper = line.upper().strip()
+
+             if 'ISSUES:' in line_upper:
+                 current_section = 'issues'
+                 continue
+             elif 'SUGGESTIONS:' in line_upper or 'SUGGESTION' in line_upper:
+                 current_section = 'suggestions'
+                 continue
+             elif 'DECISION:' in line_upper:
+                 current_section = 'decision'
+                 if 'READY' in line_upper and 'NEEDS_REFINEMENT' not in line_upper:
+                     should_continue = False
+                 continue
+
+             # Parse content
+             line = line.strip()
+             if not line or line.startswith('#'):
+                 continue
+
+             if current_section == 'issues' and (line.startswith('-') or line[0].isdigit()):
+                 issues.append(line.lstrip('-').lstrip('0123456789.').strip())
+             elif current_section == 'suggestions' and (line.startswith('-') or line[0].isdigit()):
+                 suggestions.append(line.lstrip('-').lstrip('0123456789.').strip())
+
+         # Don't continue past max iterations
+         if iteration >= config.MAX_REFINEMENT_ITERATIONS:
+             should_continue = False
+
+         return {
+             'feedback': critique_text,
+             'issues': issues,
+             'suggestions': suggestions,
+             'should_continue': should_continue
+         }
+
+     def generate_refinement_prompt(self,
+                                    original_description: str,
+                                    critique: Dict) -> str:
+         """
+         Generate prompt for refinement based on critique.
+
+         Args:
+             original_description: Current description
+             critique: Critique dictionary from critique()
+
+         Returns:
+             Prompt for Planner to refine the description
+         """
+         issues_text = "\n".join([f"- {issue}" for issue in critique['issues']])
+         suggestions_text = "\n".join([f"- {sug}" for sug in critique['suggestions']])
+
+         refinement_prompt = f"""CURRENT DESCRIPTION:
+ {original_description}
+
+ IDENTIFIED ISSUES:
+ {issues_text}
+
+ SUGGESTIONS FOR IMPROVEMENT:
+ {suggestions_text}
+
+ Please revise the description to address these issues and incorporate the suggestions.
+ Maintain all correct elements while fixing the identified problems.
+ """
+         return refinement_prompt
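
For reference, here is roughly what _parse_critique extracts from a response in the requested format (a sketch; the critique text is hypothetical model output, and calling the method unbound with None for self only works because the parser never touches self, which holds in the code above):

# Hypothetical critique text in the ISSUES/SUGGESTIONS/DECISION format.
from agents.critic import CriticAgent

sample = """ISSUES:
1. [MAJOR] Missing arrow from encoder output to cross-attention
SUGGESTIONS:
1. Add a labeled arrow from the encoder stack to each decoder layer
DECISION: NEEDS_REFINEMENT
REASONING: Structural gap affects faithfulness."""

result = CriticAgent._parse_critique(None, sample, iteration=1)
print(result["issues"])           # ['[MAJOR] Missing arrow ...']
print(result["should_continue"])  # True, while iteration < MAX_REFINEMENT_ITERATIONS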
agents/planner.py ADDED
@@ -0,0 +1,117 @@
+ """
+ Planner Agent for PaperBanana framework.
+
+ Serves as the cognitive core. Translates unstructured methodology data
+ into comprehensive textual description of the target illustration.
+ """
+ import os
+ from typing import List, Dict, Any
+ from google import genai
+ from google.genai import types
+ import config
+
+
+ class PlannerAgent:
+     """
+     Planner Agent: Translates methodology into comprehensive illustration description.
+
+     The cognitive core that interprets source context S and communicative intent C,
+     then produces detailed textual description P of the target illustration.
+     """
+
+     def __init__(self):
+         """Initialize Planner Agent."""
+         self.client = genai.Client(api_key=config.GEMINI_API_KEY)
+         self.model = config.VLM_MODEL
+
+     def plan(self,
+              methodology_text: str,
+              caption: str,
+              reference_examples: List[Dict[str, Any]] = None) -> str:
+         """
+         Generate comprehensive textual description of target illustration.
+
+         Args:
+             methodology_text: Source methodology description (S)
+             caption: Diagram caption (part of C)
+             reference_examples: Retrieved reference examples (E)
+
+         Returns:
+             Detailed textual description P of the illustration
+         """
+         prompt = self._create_planning_prompt(methodology_text, caption, reference_examples)
+
+         contents = [
+             types.Content(
+                 role="user",
+                 parts=[types.Part.from_text(text=prompt)]
+             )
+         ]
+
+         generate_config = types.GenerateContentConfig(
+             thinking_config=types.ThinkingConfig(
+                 thinking_level=config.THINKING_LEVEL
+             )
+         )
+
+         description = ""
+         for chunk in self.client.models.generate_content_stream(
+             model=self.model,
+             contents=contents,
+             config=generate_config
+         ):
+             description += chunk.text
+
+         return description.strip()
+
+     def _create_planning_prompt(self,
+                                 methodology_text: str,
+                                 caption: str,
+                                 reference_examples: List[Dict[str, Any]] = None) -> str:
+         """Create prompt for generating illustration description."""
+
+         # Include reference examples if available
+         reference_context = ""
+         if reference_examples:
+             reference_context = "\n\nREFERENCE EXAMPLES (for inspiration):\n"
+             for i, ref in enumerate(reference_examples[:3], 1):  # Use top 3
+                 reference_context += f"\nExample {i}:\n"
+                 reference_context += f"Domain: {ref.get('domain', 'N/A')}\n"
+                 reference_context += f"Type: {ref.get('diagram_type', 'N/A')}\n"
+                 reference_context += f"Description: {ref.get('description', 'N/A')}\n"
+
+         prompt = f"""You are an expert at designing academic methodology diagrams for scientific publications.
+
+ Your task is to create a COMPREHENSIVE and DETAILED textual description of an illustration that would
+ effectively visualize the given methodology. This description will be used to generate the actual diagram.
+
+ METHODOLOGY TO VISUALIZE:
+ {methodology_text}
+
+ TARGET DIAGRAM CAPTION:
+ {caption}
+ {reference_context}
+
+ REQUIREMENTS:
+ 1. **Layout Structure**: Specify the overall layout (left-to-right, top-to-bottom, circular, etc.)
+ 2. **Components**: List all visual elements needed (boxes, arrows, icons, labels, etc.)
+ 3. **Content**: What text/symbols should appear in each component
+ 4. **Connections**: How components connect (arrows, lines, groupings)
+ 5. **Hierarchy**: Which elements are primary vs secondary
+ 6. **Grouping**: How to group related components (containers, background colors)
+ 7. **Flow**: The logical flow of information through the diagram
+ 8. **Key Details**: Important technical details, equations, or annotations
+
+ IMPORTANT GUIDELINES:
+ - Be specific about spatial relationships and positioning
+ - Describe the logical flow clearly (input → process → output)
+ - Include any mathematical notation or technical terminology
+ - Consider the target audience (academic researchers)
+ - Focus on clarity and information density
+ - Think about how this supports the paper's narrative
+
+ OUTPUT FORMAT:
+ Provide a detailed paragraph-form description that covers all aspects above.
+ Be thorough - this description should be sufficient for someone to create the diagram without seeing the original methodology.
+ """
+         return prompt
agents/retriever.py ADDED
@@ -0,0 +1,151 @@
+ """
+ Retriever Agent for PaperBanana framework.
+
+ Identifies the N most relevant examples from a reference set using VLM ranking.
+ Matches based on research domain and diagram type.
+ """
+ import os
+ from typing import List, Dict, Any
+ from google import genai
+ from google.genai import types
+ import config
+
+
+ class RetrieverAgent:
+     """
+     Retriever Agent: Identifies relevant reference examples from a fixed reference set.
+
+     Uses generative retrieval approach where VLM ranks candidates by matching
+     research domain and diagram type.
+     """
+
+     def __init__(self, reference_set: List[Dict[str, Any]] = None):
+         """
+         Initialize Retriever Agent.
+
+         Args:
+             reference_set: List of reference examples with metadata
+                 Each example should have: {
+                     'id': str,
+                     'domain': str,
+                     'diagram_type': str,
+                     'description': str,
+                     'image_path': str (optional)
+                 }
+         """
+         self.client = genai.Client(api_key=config.GEMINI_API_KEY)
+         self.model = config.VLM_MODEL
+         self.reference_set = reference_set or []
+
+     def retrieve(self,
+                  methodology_text: str,
+                  caption: str,
+                  n: int = config.NUM_REFERENCE_EXAMPLES) -> List[Dict[str, Any]]:
+         """
+         Retrieve the N most relevant reference examples.
+
+         Args:
+             methodology_text: Source methodology description
+             caption: Target diagram caption
+             n: Number of examples to retrieve
+
+         Returns:
+             List of N most relevant reference examples
+         """
+         if not self.reference_set:
+             print("Warning: No reference set provided. Skipping retrieval.")
+             return []
+
+         # Create retrieval prompt
+         prompt = self._create_retrieval_prompt(methodology_text, caption, n)
+
+         # Query VLM for ranking
+         contents = [
+             types.Content(
+                 role="user",
+                 parts=[types.Part.from_text(text=prompt)]
+             )
+         ]
+
+         generate_config = types.GenerateContentConfig(
+             thinking_config=types.ThinkingConfig(
+                 thinking_level=config.THINKING_LEVEL
+             )
+         )
+
+         response_text = ""
+         for chunk in self.client.models.generate_content_stream(
+             model=self.model,
+             contents=contents,
+             config=generate_config
+         ):
+             response_text += chunk.text
+
+         # Parse the response to extract selected example IDs
+         selected_examples = self._parse_retrieval_response(response_text, n)
+
+         return selected_examples
+
+     def _create_retrieval_prompt(self, methodology_text: str, caption: str, n: int) -> str:
+         """Create prompt for retrieving relevant examples."""
+         # Create a summary of available references
+         reference_summary = "\n".join([
+             f"ID: {ref['id']}\nDomain: {ref['domain']}\nType: {ref['diagram_type']}\nDescription: {ref['description']}\n"
+             for ref in self.reference_set
+         ])
+
+         prompt = f"""You are an expert at identifying relevant academic illustration examples.
+
+ Given a methodology description and diagram caption, select the {n} most relevant reference examples
+ from the provided set. Consider:
+ 1. Research domain similarity (e.g., NLP, Computer Vision, Reinforcement Learning)
+ 2. Diagram type similarity (e.g., architecture diagram, flowchart, pipeline)
+ 3. Conceptual similarity in the methodology
+
+ METHODOLOGY:
+ {methodology_text}
+
+ TARGET CAPTION:
+ {caption}
+
+ AVAILABLE REFERENCE EXAMPLES:
+ {reference_summary}
+
+ OUTPUT FORMAT:
+ Return only the IDs of the {n} most relevant examples, one per line, ranked from most to least relevant.
+ Example output:
+ ref_001
+ ref_005
+ ref_012
+ """
+         return prompt
+
+     def _parse_retrieval_response(self, response_text: str, n: int) -> List[Dict[str, Any]]:
+         """Parse VLM response to extract selected examples."""
+         # Extract IDs from response
+         lines = response_text.strip().split('\n')
+         selected_ids = []
+
+         for line in lines:
+             line = line.strip()
+             # Look for reference IDs
+             for ref in self.reference_set:
+                 if ref['id'] in line:
+                     selected_ids.append(ref['id'])
+                     break
+             if len(selected_ids) >= n:
+                 break
+
+         # Get full reference objects
+         selected_examples = []
+         for ref_id in selected_ids:
+             for ref in self.reference_set:
+                 if ref['id'] == ref_id:
+                     selected_examples.append(ref)
+                     break
+
+         # If we didn't get enough, just take the first n
+         if len(selected_examples) < n:
+             selected_examples = self.reference_set[:n]
+
+         return selected_examples[:n]
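
The reference set itself is plain JSON, so loading it and querying the closest matches might look like the sketch below (assumes data/spotlight_reference_set.json exists, as app.py expects, plus a valid GEMINI_API_KEY; the query strings are illustrative):

# Load the JSON reference set and rank the three closest examples.
import json
from pathlib import Path
from agents import RetrieverAgent

refs = json.loads(Path("data/spotlight_reference_set.json").read_text())
retriever = RetrieverAgent(reference_set=refs)
top3 = retriever.retrieve(
    "We propose a two-stage detector with a region proposal network.",
    "Overview of our detection pipeline",
    n=3,
)
for ref in top3:
    print(ref["id"], ref["domain"], ref["diagram_type"])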
agents/stylist.py ADDED
@@ -0,0 +1,104 @@
+ """
+ Stylist Agent for PaperBanana framework.
+
+ Acts as a design consultant. Uses automatically synthesized aesthetic
+ guidelines to refine initial description into stylistically optimized version.
+ """
+ import os
+ from google import genai
+ from google.genai import types
+ import config
+ from aesthetic_guidelines import AESTHETIC_GUIDELINE
+
+
+ class StylistAgent:
+     """
+     Stylist Agent: Refines illustration descriptions using aesthetic guidelines.
+
+     Takes initial description P and enhances it with style guidance G
+     to produce stylistically optimized description P*.
+     """
+
+     def __init__(self, custom_guidelines: str = None):
+         """
+         Initialize Stylist Agent.
+
+         Args:
+             custom_guidelines: Optional custom aesthetic guidelines.
+                 If None, uses default NeurIPS-style guidelines.
+         """
+         self.client = genai.Client(api_key=config.GEMINI_API_KEY)
+         self.model = config.VLM_MODEL
+         self.guidelines = custom_guidelines or AESTHETIC_GUIDELINE
+
+     def refine(self, initial_description: str) -> str:
+         """
+         Refine initial description with aesthetic styling.
+
+         Args:
+             initial_description: Initial textual description P
+
+         Returns:
+             Stylistically optimized description P*
+         """
+         prompt = self._create_styling_prompt(initial_description)
+
+         contents = [
+             types.Content(
+                 role="user",
+                 parts=[types.Part.from_text(text=prompt)]
+             )
+         ]
+
+         generate_config = types.GenerateContentConfig(
+             thinking_config=types.ThinkingConfig(
+                 thinking_level=config.THINKING_LEVEL
+             )
+         )
+
+         refined_description = ""
+         for chunk in self.client.models.generate_content_stream(
+             model=self.model,
+             contents=contents,
+             config=generate_config
+         ):
+             refined_description += chunk.text
+
+         return refined_description.strip()
+
+     def _create_styling_prompt(self, initial_description: str) -> str:
+         """Create prompt for aesthetic refinement."""
+         prompt = f"""You are an expert design consultant specializing in academic publication illustrations.
+
+ Your task is to take an initial diagram description and enhance it with specific aesthetic and design details
+ to create a polished, publication-ready illustration that follows academic standards.
+
+ INITIAL DESCRIPTION:
+ {initial_description}
+
+ AESTHETIC GUIDELINES TO FOLLOW:
+ {self.guidelines}
+
+ YOUR TASK:
+ Refine the initial description by adding specific visual design details:
+
+ 1. **Color Specifications**: Add specific color choices from the palette (e.g., "soft blue #64B5F6 for the main process boxes")
+ 2. **Shape Details**: Specify exact shapes and their styling (e.g., "rounded rectangles with 10px radius and subtle shadow")
+ 3. **Typography**: Define font choices for different text elements
+ 4. **Visual Hierarchy**: Enhance descriptions of size, weight, and emphasis relationships
+ 5. **Spacing & Layout**: Add details about padding, margins, and alignment
+ 6. **Professional Polish**: Include finishing touches like shadows, borders, gradients
+
+ IMPORTANT:
+ - Preserve ALL content and structural information from the initial description
+ - Add aesthetic details WITHOUT changing the fundamental design or information flow
+ - Be specific with measurements, colors (hex codes), and styling parameters
+ - Ensure the result maintains academic professionalism and clarity
+ - The output should be suitable for direct input to an image generation model
+
+ OUTPUT FORMAT:
+ Provide the enhanced description as a detailed, flowing paragraph that seamlessly integrates
+ the original content with the aesthetic specifications. Make it vivid and precise enough that
+ an image generation model can render it accurately.
+ """
+         return prompt
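
Since the guideline string is injected verbatim into the prompt, swapping in a custom guide is a one-liner, as in this sketch (the monochrome override and input description are illustrative; a valid GEMINI_API_KEY is assumed):

# Override the default NeurIPS-style guide with a custom palette.
from agents import StylistAgent

mono = "## Color Palette\n- Monochrome: greys only, no accent colors\n"
stylist = StylistAgent(custom_guidelines=mono)
styled = stylist.refine("A three-stage pipeline: input, encoder, classifier.")
print(styled[:200])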
agents/visualizer.py ADDED
@@ -0,0 +1,199 @@
+ """
+ Visualizer Agent for PaperBanana framework.
+
+ Renders academic illustrations using image generation models.
+ Supports both diagram generation and statistical plot generation.
+ """
+ import os
+ import mimetypes
+ from typing import Optional
+ from google import genai
+ from google.genai import types
+ import config
+ from utils import save_binary_file
+
+
+ class VisualizerAgent:
+     """
+     Visualizer Agent: Renders illustrations from textual descriptions.
+
+     Supports two modes:
+     1. Diagram mode: Uses image generation model (Nano-Banana-Pro / Gemini Image)
+     2. Plot mode: Generates Python Matplotlib code for statistical plots
+     """
+
+     def __init__(self, mode: str = "diagram"):
+         """
+         Initialize Visualizer Agent.
+
+         Args:
+             mode: Generation mode - "diagram" or "plot"
+         """
+         self.client = genai.Client(api_key=config.GEMINI_API_KEY)
+         self.mode = mode
+
+         if mode == "diagram":
+             self.model = config.IMAGE_MODEL
+         elif mode == "plot":
+             self.model = config.VLM_MODEL  # Use VLM for code generation
+         else:
+             raise ValueError(f"Invalid mode: {mode}. Use 'diagram' or 'plot'")
+
+     def visualize(self,
+                   description: str,
+                   output_path: str = "output",
+                   data: dict = None) -> str:
+         """
+         Generate visualization from description.
+
+         Args:
+             description: Textual description of the illustration
+             output_path: Base path for output file (without extension)
+             data: Optional data dict for plot mode
+
+         Returns:
+             Path to generated image file or code file
+         """
+         if self.mode == "diagram":
+             return self._generate_diagram(description, output_path)
+         elif self.mode == "plot":
+             return self._generate_plot(description, output_path, data)
+
+     def _generate_diagram(self, description: str, output_path: str) -> str:
+         """
+         Generate diagram image using image generation model.
+
+         Args:
+             description: Detailed visual description
+             output_path: Base path for output file
+
+         Returns:
+             Path to generated image
+         """
+         # Create prompt for image generation
+         prompt = f"""Generate a high-quality academic methodology diagram with the following specifications:
+
+ {description}
+
+ Requirements:
+ - Professional academic publication quality
+ - Clear, readable text and labels
+ - Consistent styling throughout
+ - Appropriate use of colors and shapes
+ - Publication-ready resolution
+ """
+
+         contents = [
+             types.Content(
+                 role="user",
+                 parts=[types.Part.from_text(text=prompt)]
+             )
+         ]
+
+         generate_config = types.GenerateContentConfig(
+             response_modalities=["IMAGE", "TEXT"],
+             image_config=types.ImageConfig(
+                 image_size=config.IMAGE_SIZE
+             )
+         )
+
+         file_index = 0
+         saved_path = None
+
+         for chunk in self.client.models.generate_content_stream(
+             model=self.model,
+             contents=contents,
+             config=generate_config
+         ):
+             if (chunk.candidates is None or
+                     chunk.candidates[0].content is None or
+                     chunk.candidates[0].content.parts is None):
+                 continue
+
+             # Check for inline image data
+             part = chunk.candidates[0].content.parts[0]
+             if part.inline_data and part.inline_data.data:
+                 inline_data = part.inline_data
+                 data_buffer = inline_data.data
+                 file_extension = mimetypes.guess_extension(inline_data.mime_type)
+
+                 if file_extension:
+                     file_name = f"{output_path}_{file_index}{file_extension}"
+                     saved_path = save_binary_file(file_name, data_buffer)
+                     file_index += 1
+             else:
+                 # Print any text output
+                 if chunk.text:
+                     print(chunk.text)
+
+         return saved_path or f"{output_path}_0.png"
+
+     def _generate_plot(self, description: str, output_path: str, data: dict = None) -> str:
+         """
+         Generate statistical plot by creating Matplotlib code.
+
+         Args:
+             description: Description of desired plot
+             output_path: Base path for output code file
+             data: Optional data dictionary
+
+         Returns:
+             Path to generated Python code file
+         """
+         data_context = ""
+         if data:
+             data_context = f"\n\nDATA PROVIDED:\n{str(data)}\n"
+
+         prompt = f"""You are an expert at creating publication-quality statistical plots using Matplotlib.
+
+ Generate complete, executable Python code using Matplotlib to create the following plot:
+
+ {description}
+ {data_context}
+
+ Requirements:
+ 1. Use professional academic styling (seaborn-paper style or similar)
+ 2. Include clear axis labels with units
+ 3. Add legend if multiple series
+ 4. Use appropriate colors and markers
+ 5. Set figure size for publication (e.g., 6x4 inches)
+ 6. Save as high-resolution PNG (300 dpi minimum)
+ 7. Include error bars if applicable
+ 8. Follow best practices for data visualization
+
+ OUTPUT FORMAT:
+ Provide ONLY the complete Python code, ready to execute.
+ Start with necessary imports and end with plt.savefig().
+ Do not include any explanations outside the code comments.
+ """
+
+         contents = [
+             types.Content(
+                 role="user",
+                 parts=[types.Part.from_text(text=prompt)]
+             )
+         ]
+
+         generate_config = types.GenerateContentConfig(
+             thinking_config=types.ThinkingConfig(
+                 thinking_level="MEDIUM"
+             )
+         )
+
+         code = ""
+         for chunk in self.client.models.generate_content_stream(
+             model=self.model,
+             contents=contents,
+             config=generate_config
+         ):
+             code += chunk.text
+
+         # Save code to file
+         code_file = f"{output_path}.py"
+         with open(code_file, 'w') as f:
+             f.write(code.strip())
+
+         print(f"Plot code saved to: {code_file}")
+         print("Run the code to generate the plot image.")
+
+         return code_file
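
Note that plot mode returns a path to generated Matplotlib code rather than an image, so usage looks like this sketch (assumes a valid GEMINI_API_KEY; the description and data dict are illustrative):

# Generate Matplotlib code for a plot, then run it to produce the PNG.
from agents import VisualizerAgent

viz = VisualizerAgent(mode="plot")
code_file = viz.visualize(
    "Line plot of validation accuracy vs. epoch for two models",
    output_path="accuracy_plot",
    data={"epoch": [1, 2, 3],
          "model_a": [0.71, 0.78, 0.81],
          "model_b": [0.69, 0.75, 0.80]},
)
# code_file == "accuracy_plot.py"; running it saves the figure:
#   python accuracy_plot.py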
app.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PaperBanana — Gradio app for HuggingFace Spaces.
3
+
4
+ Turns methodology text into publication-ready architecture diagrams
5
+ using a 5-agent pipeline (Retriever → Planner → Stylist → Visualizer → Critic).
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import tempfile
11
+ import mimetypes
12
+ from pathlib import Path
13
+ from typing import List, Dict, Any, Optional
14
+
15
+ import gradio as gr
16
+ from google import genai
17
+ from google.genai import types
18
+
19
+ from agents import RetrieverAgent, PlannerAgent, StylistAgent, VisualizerAgent, CriticAgent
20
+ from aesthetic_guidelines import AESTHETIC_GUIDELINE
21
+ import config
22
+
23
+ # ── Load reference set at startup ───────────────────────────────────────────
24
+ REF_SET_PATH = Path("data/spotlight_reference_set.json")
25
+ REFERENCE_SET: List[Dict[str, Any]] = []
26
+ if REF_SET_PATH.exists():
27
+ with open(REF_SET_PATH) as f:
28
+ REFERENCE_SET = json.load(f)
29
+ print(f"Loaded {len(REFERENCE_SET)} reference examples")
30
+
31
+ # ── Example gallery images ──────────────────────────────────────────────────
32
+ EXAMPLE_IMAGES = {
33
+ "Transformer": "examples/readme/transformer_iter3_0.jpg",
34
+ "ResNet": "examples/readme/resnet_iter3_0.jpg",
35
+ "DDPM": "examples/readme/ddpm_iter3_0.jpg",
36
+ }
37
+
38
+ # ── Preset examples ─────────────────────────────────────────────────────────
39
+ PRESET_EXAMPLES = [
40
+ [
41
+ # Transformer
42
+ """The Transformer model follows an encoder-decoder structure using stacked self-attention and fully connected layers.
43
+
44
+ Encoder: Stack of N=6 identical layers. Each layer has two sub-layers: (1) multi-head self-attention, and (2) position-wise feed-forward network. Residual connections around each sub-layer, followed by layer normalization.
45
+
46
+ Decoder: Stack of N=6 identical layers. In addition to the two encoder sub-layers, the decoder inserts a third sub-layer for multi-head cross-attention over the encoder output. Masked self-attention prevents attending to subsequent positions.
47
+
48
+ Multi-Head Attention: Linearly project queries, keys, values h times, perform scaled dot-product attention in parallel, concatenate and project again.
49
+
50
+ Positional Encoding: Sinusoidal positional encodings added to input embeddings.""",
51
+ "The Transformer — model architecture (Vaswani et al., 2017)",
52
+ 2,
53
+ ],
54
+ [
55
+ # ResNet
56
+ """We present a residual learning framework. Instead of learning H(x) directly, layers fit a residual mapping F(x) = H(x) - x. The building block is y = F(x, {W_i}) + x via identity shortcut connections.
57
+
58
+ Architecture: Input 224×224 → 7×7 conv, 64, stride 2 → BN → ReLU → 3×3 max pool → Stage 1: 3 blocks, 64 filters → Stage 2: 4 blocks, 128 filters → Stage 3: 6 blocks, 256 filters → Stage 4: 3 blocks, 512 filters → Global avg pool → 1000-d FC → softmax.
59
+
60
+ For deeper networks (50/101/152), bottleneck blocks: 1×1 conv (reduce) → 3×3 conv → 1×1 conv (restore), with shortcut bypassing all three layers.""",
61
+ "Architecture of ResNet with residual learning building blocks (He et al., 2016)",
62
+ 2,
63
+ ],
64
+ [
65
+ # DDPM
66
+ """Denoising diffusion probabilistic models (DDPMs): Forward process gradually adds Gaussian noise over T timesteps: q(x_t|x_{t-1}) = N(x_t; √(1-β_t)x_{t-1}, β_tI). After T steps, x_T ≈ N(0,I).
67
+
68
+ Reverse process learns to denoise: p_θ(x_{t-1}|x_t) = N(x_{t-1}; μ_θ(x_t,t), Σ_θ(x_t,t)). Starting from x_T ~ N(0,I), iteratively produces clean x_0.
69
+
70
+ Denoising network ε_θ(x_t,t) is a U-Net: downsampling with ResNet blocks + self-attention at 16×16, bottleneck with self-attention, upsampling with skip connections. Timestep conditioning via sinusoidal embeddings. Training minimizes L = E[||ε - ε_θ(x_t,t)||²].""",
71
+ "Overview of the denoising diffusion probabilistic model (Ho et al., 2020)",
72
+ 2,
73
+ ],
74
+ ]
75
+
76
+
77
+ # ── Core generation logic (streaming-friendly) ─────────────────────────────
78
+ def generate_diagram(
79
+ methodology_text: str,
80
+ caption: str,
81
+ num_iterations: int,
82
+ api_key: str | None = None,
83
+ progress=gr.Progress(track_tqdm=True),
84
+ ):
85
+ """Run the full PaperBanana pipeline and yield intermediate results."""
86
+
87
+ # Resolve API key: user input > env var
88
+ gemini_key = (api_key or "").strip() or config.GEMINI_API_KEY
89
+ if not gemini_key:
90
+ raise gr.Error(
91
+ "No Gemini API key found. Paste one in the field above, "
92
+ "or set GEMINI_API_KEY as a Space secret."
93
+ )
94
+
95
+ # Patch config so all agents pick it up
96
+ config.GEMINI_API_KEY = gemini_key
97
+
98
+ num_iterations = int(num_iterations)
99
+ logs: list[str] = []
100
+
101
+ def log(msg: str):
102
+ logs.append(msg)
103
+ return "\n".join(logs)
104
+
105
+ # ── 1. Retriever ────────────────────────────────────────────────────────
106
+ yield None, log("🔍 [1/5] Retriever: finding relevant references…")
107
+ retriever = RetrieverAgent(REFERENCE_SET)
108
+ reference_examples = []
109
+ if REFERENCE_SET:
110
+ reference_examples = retriever.retrieve(
111
+ methodology_text, caption, n=config.NUM_REFERENCE_EXAMPLES
112
+ )
113
+ yield None, log(f" ✓ Retrieved {len(reference_examples)} references")
114
+ else:
115
+ yield None, log(" ⏭ Skipped (no reference set loaded)")
116
+
117
+ # ── 2. Planner ──────────────────────────────────────────────────────────
118
+ yield None, log("📝 [2/5] Planner: creating visual description…")
119
+ planner = PlannerAgent()
120
+ current_description = planner.plan(methodology_text, caption, reference_examples)
121
+ yield None, log(f" ✓ Description ready ({len(current_description)} chars)")
122
+
123
+ # ── 3. Stylist ──────────────────────────────────────────────────────────
124
+ yield None, log("🎨 [3/5] Stylist: applying aesthetic guidelines…")
125
+ stylist = StylistAgent()
126
+ current_description = stylist.refine(current_description)
127
+ yield None, log(f" ✓ Styled ({len(current_description)} chars)")
128
+
129
+ # ── 4/5. Visualize → Critique loop ──────────────────────────────────────
130
+ latest_image_path = None
131
+ critic = CriticAgent()
132
+
133
+ for i in range(1, num_iterations + 1):
134
+ yield latest_image_path, log(
135
+ f"🖼️ [4/5] Visualizer: generating image (iteration {i}/{num_iterations})…"
136
+ )
137
+
138
+ with tempfile.TemporaryDirectory() as tmpdir:
139
+ out_base = os.path.join(tmpdir, f"iter{i}")
140
+ visualizer = VisualizerAgent(mode="diagram")
141
+ img_path = visualizer.visualize(current_description, out_base)
142
+
143
+ if img_path and os.path.exists(img_path):
144
+ # Copy to a persistent temp file so Gradio can serve it
145
+ import shutil
146
+
147
+ ext = Path(img_path).suffix or ".jpg"
148
+ persist = tempfile.NamedTemporaryFile(
149
+ suffix=ext, delete=False, dir=tempfile.gettempdir()
150
+ )
151
+ shutil.copy2(img_path, persist.name)
152
+ latest_image_path = persist.name
153
+
154
+ yield latest_image_path, log(f" ✓ Image generated (iteration {i})")
155
+
156
+ # Skip critique on last iteration
157
+ if i >= num_iterations:
158
+ break
159
+
160
+ yield latest_image_path, log(
161
+ f"🔬 [5/5] Critic: evaluating (iteration {i})…"
162
+ )
163
+ critique = critic.critique(
164
+ methodology_text, caption, current_description, latest_image_path, i
165
+ )
166
+ n_issues = len(critique["issues"])
167
+ yield latest_image_path, log(f" ✓ {n_issues} issues found")
168
+
169
+ if not critique["should_continue"]:
170
+ yield latest_image_path, log(" ✓ Quality threshold reached — done!")
171
+ break
172
+
+        # Refine: fold the critique back into the description via the VLM
+        yield latest_image_path, log("📝 [2/5] Planner: refining description…")
+        refinement_prompt = critic.generate_refinement_prompt(
+            current_description, critique
+        )
+        client = genai.Client(api_key=gemini_key)
+        contents = [
+            types.Content(
+                role="user",
+                parts=[types.Part.from_text(text=refinement_prompt)],
+            )
+        ]
+        refined = ""
+        for chunk in client.models.generate_content_stream(
+            model=config.VLM_MODEL,
+            contents=contents,
+            config=types.GenerateContentConfig(
+                thinking_config=types.ThinkingConfig(thinking_level=config.THINKING_LEVEL)
+            ),
+        ):
+            if chunk.text:  # thought-only chunks may carry no text
+                refined += chunk.text
+        current_description = refined.strip()
+        yield latest_image_path, log(
+            f" ✓ Refined ({len(current_description)} chars)"
+        )
+
+        # Re-style
+        yield latest_image_path, log("🎨 [3/5] Stylist: re-applying style…")
+        current_description = stylist.refine(current_description)
+        yield latest_image_path, log(f" ✓ Styled ({len(current_description)} chars)")
+
+    yield latest_image_path, log("\n✅ Generation complete!")
+
+
+# ── Gradio UI ───────────────────────────────────────────────────────────────
+DESCRIPTION_MD = """\
+# 🍌 PaperBanana
+
+**Turn methodology text into publication-ready architecture diagrams.**
+
+Paste your paper's methodology section + a caption, and PaperBanana's 5-agent pipeline
+(Retriever → Planner → Stylist → Visualizer → Critic) will generate a diagram for you.
+
+> Based on [*PaperBanana: Automating Academic Illustration for AI Scientists*](https://arxiv.org/abs/2505.23894) (Zhu et al., NeurIPS 2025).
+"""
+
+with gr.Blocks(
+    title="PaperBanana",
+    theme=gr.themes.Soft(primary_hue="amber", secondary_hue="blue"),
+    css="footer { display: none !important; }",
+) as demo:
+    gr.Markdown(DESCRIPTION_MD)
+
+    # ── Example gallery ─────────────────────────────────────────────────────
+    with gr.Accordion("📸 Example outputs (click to expand)", open=False):
+        existing = {k: v for k, v in EXAMPLE_IMAGES.items() if Path(v).exists()}
+        if existing:
+            with gr.Row():
+                for name, path in existing.items():
+                    with gr.Column(min_width=200):
+                        gr.Image(value=path, label=name)
+
+    # ── Inputs ──────────────────────────────────────────────────────────────
+    with gr.Row():
+        with gr.Column(scale=1):
+            methodology_input = gr.Textbox(
+                label="Methodology text",
+                placeholder="Paste your methodology / model description here…",
+                lines=12,
+            )
+            caption_input = gr.Textbox(
+                label="Diagram caption",
+                placeholder='e.g. "Architecture of our proposed method"',
+                lines=2,
+            )
+            iterations_slider = gr.Slider(
+                minimum=1,
+                maximum=3,
+                value=2,
+                step=1,
+                label="Refinement iterations",
+                info="More iterations = better quality, slower",
+            )
+            api_key_input = gr.Textbox(
+                label="Gemini API key (optional if set as Space secret)",
+                type="password",
+                placeholder="AIza…",
+            )
+            generate_btn = gr.Button("🍌 Generate diagram", variant="primary", size="lg")
+
+        # ── Outputs ─────────────────────────────────────────────────────────
+        with gr.Column(scale=1):
+            output_image = gr.Image(label="Generated diagram", type="filepath")
+            output_log = gr.Textbox(label="Pipeline log", lines=18, interactive=False)
+
+    # ── Examples table ──────────────────────────────────────────────────────
+    gr.Examples(
+        examples=PRESET_EXAMPLES,
+        inputs=[methodology_input, caption_input, iterations_slider],
+        label="Try a classic paper",
+    )
+
+    # ── Wiring ──────────────────────────────────────────────────────────────
+    generate_btn.click(
+        fn=generate_diagram,
+        inputs=[methodology_input, caption_input, iterations_slider, api_key_input],
+        outputs=[output_image, output_log],
+    )
+
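+# queue() enables streaming: each yield from generate_diagram reaches the
+# browser as a live update to the image and log components.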
+if __name__ == "__main__":
+    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
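
Because `generate_diagram` is an ordinary Python generator, the same pipeline can be driven without the UI. A minimal sketch, assuming the positional signature implied by the `generate_btn.click` wiring above (methodology text, caption, iteration count, API key) and the `(image_path, log_text)` yield contract; the argument order is an assumption, not a documented API:

```python
# Hypothetical headless driver for the pipeline above.
from app import generate_diagram  # safe: launch() is guarded by __main__

image_path = None
for image_path, log_text in generate_diagram(
    "We propose a two-stage encoder-decoder with cross-attention ...",  # methodology
    "Architecture of our proposed method",                              # caption
    2,          # refinement iterations (the UI slider allows 1-3)
    "AIza...",  # Gemini API key placeholder
):
    print(log_text.splitlines()[-1])  # newest status line
print("final image:", image_path)
```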
config.py ADDED
@@ -0,0 +1,22 @@
+"""
+Configuration for PaperBanana framework.
+"""
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Gemini API Configuration
+GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
+
+# Model Configuration
+VLM_MODEL = "gemini-3-pro-preview"  # For Retriever, Planner, Stylist, Critic
+IMAGE_MODEL = "gemini-3-pro-image-preview"  # For Visualizer (referred to as Nano-Banana-Pro in the paper)
+
+# Generation Configuration
+MAX_REFINEMENT_ITERATIONS = 3  # As per the paper's ablation study
+IMAGE_SIZE = "1K"  # Image resolution
+THINKING_LEVEL = "HIGH"  # For complex reasoning tasks
+
+# Number of reference examples to retrieve
+NUM_REFERENCE_EXAMPLES = 10
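
Note that `GEMINI_API_KEY` is read once at import time, so changing the environment afterwards has no effect; that is why `app.py` patches the module attribute directly per request. A minimal sketch of that override pattern (the specific values are illustrative):

```python
# Config values are plain module attributes, so patching them at runtime
# (as app.py does) affects any agent constructed afterwards.
import config

config.GEMINI_API_KEY = "AIza..."    # e.g. a key pasted into the UI
config.NUM_REFERENCE_EXAMPLES = 5    # optional: retrieve fewer references
```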
data/spotlight_reference_images/ref_0001_00232_GraphMaster_Automated_Graph_Synthesis_via_LLM_Agents_in_Data-Limited_Environments__fdf13132133da88f7ce9ae4d0a22c29da1f05f75072f95010a29b1392696ea70.jpg ADDED
data/spotlight_reference_images/ref_0002_00279_VoxDet_Rethinking_3D_Semantic_Scene_Completion_as_Dense_Object_Detection__7b55d87bf0fcf6d787a440d59bf4617e6d73f10f5b1bcc1b45736ad0a7a57911.jpg ADDED
data/spotlight_reference_images/ref_0003_00491_SQS_Enhancing_Sparse_Perception_Models_via_Query-based_Splatting_in_Autonomous_Driving__f7c6f154dc3e45e2f58f5a9111ebf57ead0d19f9e77340cd3838d881af17a916.jpg ADDED
data/spotlight_reference_images/ref_0004_00691_ProtInvTree_Deliberate_Protein_Inverse_Folding_with_Reward-guided_Tree_Search__713bbbec11cbef0f1d2c8901e17165fa7db3d1fcf6cfa0d4a8b803d2dccb2ca0.jpg ADDED
data/spotlight_reference_images/ref_0005_00738_Mulberry_Empowering_MLLM_with_o1-like_Reasoning_and_Reflection_via_Collective_Monte_Carlo_Tree_Search__7c987fc8cf213eb47a038117cc38e5a170938289a390de4b8d7b6cd88512d505.jpg ADDED
data/spotlight_reference_images/ref_0006_00981_MesaTask_Towards_Task-Driven_Tabletop_Scene_Generation_via_3D_Spatial_Reasoning__ddce4bdb68a1689579a491b5a31349db83e5046fbb5424a45dc0883d4115e7d5.jpg ADDED

Git LFS Details

  • SHA256: d04a1104d131abcf53056b5e5617593091f53bbf131d55ab2ce9b5f788b877d3
  • Pointer size: 131 Bytes
  • Size of remote file: 131 kB
data/spotlight_reference_images/ref_0007_01003_OmniSync_Towards_Universal_Lip_Synchronization_via_Diffusion_Transformers__28d606ccd79ed54496343219767701efb0f445058d39887c1cf629a800942f77.jpg ADDED

Git LFS Details

  • SHA256: b34adeb1b8b47ab83bc6644a64894d538cd7418ba6657a72ccdcdfebd2d6ad56
  • Pointer size: 131 Bytes
  • Size of remote file: 119 kB
data/spotlight_reference_images/ref_0008_01041_Enhancing_Time_Series_Forecasting_through_Selective_Representation_Spaces_A_Patch_Perspective__ea910fb78f4d4027ff7fe9a63cbbc68a7038ee2e9f4250c2f773c631f265b079.jpg ADDED
data/spotlight_reference_images/ref_0009_01041_Enhancing_Time_Series_Forecasting_through_Selective_Representation_Spaces_A_Patch_Perspective__4cc6ca3e3fa15065b6a1781e9a7f814d67a46649325289b63c9bea4072020f4c.jpg ADDED
data/spotlight_reference_images/ref_0010_01112_DiCo_Revitalizing_ConvNets_for_Scalable_and_Efficient_Diffusion_Modeling__589d9f3ec341480c16ec00bf41076999cd0ce6c1526b97e71eb9c8ffe33b1a1b.jpg ADDED
data/spotlight_reference_images/ref_0011_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__b501bcad7830654e421726b37ea0d89207d68c72f0f6aa9179fec65e0c71b205.jpg ADDED

Git LFS Details

  • SHA256: 2640c46c7a0d30a577808fbf2b7ed24c2821c1d1991fbec98f91c8c4e6c34556
  • Pointer size: 131 Bytes
  • Size of remote file: 109 kB
data/spotlight_reference_images/ref_0012_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__fb4694d88a5097ee72a936634a832ced85d11e90079a4fabc3f1b8c28f24e5d6.jpg ADDED
data/spotlight_reference_images/ref_0013_01620_FutureSightDrive_Thinking_Visually_with_Spatio-Temporal_CoT_for_Autonomous_Driving__8a4bffe9a69ef0d2bc0ced518bccdb3146d38727d7fc7402a418b6227a78bcfb.jpg ADDED
data/spotlight_reference_images/ref_0014_01659_G-Memory_Tracing_Hierarchical_Memory_for_Multi-Agent_Systems__48772f699bccd9ecf7285d9f2c4af85d34d60b7ee6b2cbd681278611869db12b.jpg ADDED

Git LFS Details

  • SHA256: a81e26be68336192dda8116ef0df8001d34ee61398a633266559e1f780cc2e0b
  • Pointer size: 131 Bytes
  • Size of remote file: 170 kB
data/spotlight_reference_images/ref_0015_01839_Mesh-RFT_Enhancing_Mesh_Generation_via_Fine-grained_Reinforcement_Fine-Tuning__4d93550a61cdae636652e7cd1ca974c21c7fca1e8eb6eaa6fa2e03100f1d0f68.jpg ADDED
data/spotlight_reference_images/ref_0016_01864_Jacobian-Based_Interpretation_of_Nonlinear_Neural_Encoding_Model__c9ac984c977d21aff284dcfbacf7bdfea345cc73096a3f28d624b026654e1740.jpg ADDED
data/spotlight_reference_images/ref_0017_02109_OnlineSplatter_Pose-Free_Online_3D_Reconstruction_for_Free-Moving_Objects__abe4e12f0fc9a7487c6c5774f5ead6b391ede9f832960b5ce462ceb786534a0d.jpg ADDED
data/spotlight_reference_images/ref_0018_02239_RobustMerge_Parameter-Efficient_Model_Merging_for_MLLMs_with_Direction_Robustness__45b74fc957d305067e6f46f00bc32c9ee0889b7dc7e18b4c46913d263c1d8c16.jpg ADDED
data/spotlight_reference_images/ref_0019_02373_MDReID_Modality-Decoupled_Learning_for_Any-to-Any_Multi-Modal_Object_Re-Identification__517a2ed3f7dd16e048d526da7807ac8b1b73cdb062b79021b7047dd0b467ba9a.jpg ADDED

Git LFS Details

  • SHA256: 3350f54f56d42f88c3d4e8da2fe5b3ce9b1b178d2c6a8a1d3097a1a34f630569
  • Pointer size: 131 Bytes
  • Size of remote file: 122 kB
data/spotlight_reference_images/ref_0020_03077_Toward_Relative_Positional_Encoding_in_Spiking_Transformers__bdb178031ec8d263c3a5388d900e974b8ce39ae5759a6611f688a57e22173fa5.jpg ADDED

Git LFS Details

  • SHA256: e924e108575f4d47a892f436788c8fa5669282ed5bad2882568222daf634c1a5
  • Pointer size: 131 Bytes
  • Size of remote file: 118 kB
data/spotlight_reference_images/ref_0021_03670_Neural_Atlas_Graphs_for_Dynamic_Scene_Decomposition_and_Editing__16f8fc865baed696e798502de56f3b473dbf2b0b6aa3c1286f384de53c524b97.jpg ADDED
data/spotlight_reference_images/ref_0022_03671_STITCH-OPE_Trajectory_Stitching_with_Guided_Diffusion_for_Off-Policy_Evaluation__6e9c875dbc74a8b39bc947de3b928f8a29f7c6db9b03d726a62a961ba0c2fdd3.jpg ADDED
data/spotlight_reference_images/ref_0023_04013_scMRDR_A_scalable_and_flexible_framework_for_unpaired_single-cell_multi-omics_data_integration__85cde1b5a410b0d3275a5c0fa81dfe69e2c83c485c9917c296502a6485e2f68b.jpg ADDED
data/spotlight_reference_images/ref_0024_04165_Transformer_Copilot_Learning_from_The_Mistake_Log_in_LLM_Fine-tuning__8d1e804f51825d9760a37b8d1ca027a61deecc5e33fc172f0c1789814527c37e.jpg ADDED
data/spotlight_reference_images/ref_0025_04571_GeRaF_Neural_Geometry_Reconstruction_from_Radio_Frequency_Signals__e096c21c76e1eb88c8d865f73e832e8e9246cddec17f62b5d85bc051caa55165.jpg ADDED
data/spotlight_reference_images/ref_0026_04647_HopaDIFF_Holistic-Partial_Aware_Fourier_Conditioned_Diffusion_for_Referring_Human_Action_Segmentation_in_Multi-Person_Sc__0e15e7e99b52ccbc157e3ee04a6de2238b23ccd60543102fc2a70eddb12e5e41.jpg ADDED
data/spotlight_reference_images/ref_0027_04717_CSBrain_A_Cross-scale_Spatiotemporal_Brain_Foundation_Model_for_EEG_Decoding__327e1ef5f11138cef78e3dd270f09eea700aedb3c5a64cf848b125d13e4e5f08.jpg ADDED
data/spotlight_reference_images/ref_0028_05129_Learning_to_Factorize_Spatio-Temporal_Foundation_Models__ef11d1775e9863839bc4dfaf8711bf474ef80365303220ba92781b579879e7a5.jpg ADDED
data/spotlight_reference_images/ref_0029_05428_EDELINE_Enhancing_Memory_in_Diffusion-based_World_Models_via_Linear-Time_Sequence_Modeling__57df86687acccd0bad49997d26a4a442d6d09e2381e5a570d1d1e7efd02cf303.jpg ADDED
data/spotlight_reference_images/ref_0030_05467_Vision-centric_Token_Compression_in_Large_Language_Model__056bbf059f83c91ea896c610cef2927606ab780d910996e6cdb293dfaca40ddd.jpg ADDED
data/spotlight_reference_images/ref_0031_05610_Repo2Run_Automated_Building_Executable_Environment_for_Code_Repository_at_Scale__ea96f359e23ff3f0427d48dd3247314967bb531150d725a7680376b940324680.jpg ADDED
data/spotlight_reference_images/ref_0032_05774_Shallow_Diffuse_Robust_and_Invisible_Watermarking_through_Low-Dim_Subspaces_in_Diffusion_Models__703f7602f642aa354858ee8cf929888d672a45001a93ac0cb937cd0f4f1b62de.jpg ADDED
data/spotlight_reference_images/ref_0033_05814_Mozart_Modularized_and_Efficient_MoE_Training_on_35D_Wafer-Scale_Chiplet_Architectures__dc6b73dd98f93241717e3b658c319f70ab6ad6188c24a147cd052fb2153d656d.jpg ADDED
data/spotlight_reference_images/ref_0034_06044_Theory-Driven_Label-Specific_Representation_for_Incomplete_Multi-View_Multi-Label_Learning__4d4bbb3c5cd4edb56f73502b7eac2b526f30bfc883b337d949cb38ee0747ee22.jpg ADDED
data/spotlight_reference_images/ref_0035_06067_EAG3R_Event-Augmented_3D_Geometry_Estimation_for_Dynamic_and_Extreme-Lighting_Scenes__8c0d4df1862409d631870861dc2f047a0cd2572e87267d0d3ea58b6c245408fe.jpg ADDED
data/spotlight_reference_images/ref_0036_06067_EAG3R_Event-Augmented_3D_Geometry_Estimation_for_Dynamic_and_Extreme-Lighting_Scenes__a50d002b445446ba0c687045ba485e4c96aaf49765198b3f53fb954327df6f4f.jpg ADDED
data/spotlight_reference_images/ref_0037_06507_CausalPFN_Amortized_Causal_Effect_Estimation_via_In-Context_Learning__8c660a9d9ad153e854bc67151e7df9977e2244fa4ce9bd649c2b08b58db2e30c.jpg ADDED
data/spotlight_reference_images/ref_0038_06527_Robust_Graph_Condensation_via_Classification_Complexity_Mitigation__dedb33c198673910da24a5a2a5794a8228afce42d7b46ff594dab0cac9ee61e0.jpg ADDED

Git LFS Details

  • SHA256: 0130a54d1dacffb6ae73bee7e2cc8694b25045adc6bcf1e4f1f5af6e41480187
  • Pointer size: 131 Bytes
  • Size of remote file: 104 kB