Deploy PaperBanana app

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full changeset.
- .gitattributes +37 -0
- Dockerfile +27 -0
- README.md +6 -5
- aesthetic_guidelines.py +64 -0
- agents/__init__.py +17 -0
- agents/critic.py +234 -0
- agents/planner.py +117 -0
- agents/retriever.py +151 -0
- agents/stylist.py +104 -0
- agents/visualizer.py +199 -0
- app.py +283 -0
- config.py +22 -0
- data/spotlight_reference_images/ref_0001_00232_GraphMaster_Automated_Graph_Synthesis_via_LLM_Agents_in_Data-Limited_Environments__fdf13132133da88f7ce9ae4d0a22c29da1f05f75072f95010a29b1392696ea70.jpg +0 -0
- data/spotlight_reference_images/ref_0002_00279_VoxDet_Rethinking_3D_Semantic_Scene_Completion_as_Dense_Object_Detection__7b55d87bf0fcf6d787a440d59bf4617e6d73f10f5b1bcc1b45736ad0a7a57911.jpg +0 -0
- data/spotlight_reference_images/ref_0003_00491_SQS_Enhancing_Sparse_Perception_Models_via_Query-based_Splatting_in_Autonomous_Driving__f7c6f154dc3e45e2f58f5a9111ebf57ead0d19f9e77340cd3838d881af17a916.jpg +0 -0
- data/spotlight_reference_images/ref_0004_00691_ProtInvTree_Deliberate_Protein_Inverse_Folding_with_Reward-guided_Tree_Search__713bbbec11cbef0f1d2c8901e17165fa7db3d1fcf6cfa0d4a8b803d2dccb2ca0.jpg +0 -0
- data/spotlight_reference_images/ref_0005_00738_Mulberry_Empowering_MLLM_with_o1-like_Reasoning_and_Reflection_via_Collective_Monte_Carlo_Tree_Search__7c987fc8cf213eb47a038117cc38e5a170938289a390de4b8d7b6cd88512d505.jpg +0 -0
- data/spotlight_reference_images/ref_0006_00981_MesaTask_Towards_Task-Driven_Tabletop_Scene_Generation_via_3D_Spatial_Reasoning__ddce4bdb68a1689579a491b5a31349db83e5046fbb5424a45dc0883d4115e7d5.jpg +3 -0
- data/spotlight_reference_images/ref_0007_01003_OmniSync_Towards_Universal_Lip_Synchronization_via_Diffusion_Transformers__28d606ccd79ed54496343219767701efb0f445058d39887c1cf629a800942f77.jpg +3 -0
- data/spotlight_reference_images/ref_0008_01041_Enhancing_Time_Series_Forecasting_through_Selective_Representation_Spaces_A_Patch_Perspective__ea910fb78f4d4027ff7fe9a63cbbc68a7038ee2e9f4250c2f773c631f265b079.jpg +0 -0
- data/spotlight_reference_images/ref_0009_01041_Enhancing_Time_Series_Forecasting_through_Selective_Representation_Spaces_A_Patch_Perspective__4cc6ca3e3fa15065b6a1781e9a7f814d67a46649325289b63c9bea4072020f4c.jpg +0 -0
- data/spotlight_reference_images/ref_0010_01112_DiCo_Revitalizing_ConvNets_for_Scalable_and_Efficient_Diffusion_Modeling__589d9f3ec341480c16ec00bf41076999cd0ce6c1526b97e71eb9c8ffe33b1a1b.jpg +0 -0
- data/spotlight_reference_images/ref_0011_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__b501bcad7830654e421726b37ea0d89207d68c72f0f6aa9179fec65e0c71b205.jpg +3 -0
- data/spotlight_reference_images/ref_0012_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__fb4694d88a5097ee72a936634a832ced85d11e90079a4fabc3f1b8c28f24e5d6.jpg +0 -0
- data/spotlight_reference_images/ref_0013_01620_FutureSightDrive_Thinking_Visually_with_Spatio-Temporal_CoT_for_Autonomous_Driving__8a4bffe9a69ef0d2bc0ced518bccdb3146d38727d7fc7402a418b6227a78bcfb.jpg +0 -0
- data/spotlight_reference_images/ref_0014_01659_G-Memory_Tracing_Hierarchical_Memory_for_Multi-Agent_Systems__48772f699bccd9ecf7285d9f2c4af85d34d60b7ee6b2cbd681278611869db12b.jpg +3 -0
- data/spotlight_reference_images/ref_0015_01839_Mesh-RFT_Enhancing_Mesh_Generation_via_Fine-grained_Reinforcement_Fine-Tuning__4d93550a61cdae636652e7cd1ca974c21c7fca1e8eb6eaa6fa2e03100f1d0f68.jpg +0 -0
- data/spotlight_reference_images/ref_0016_01864_Jacobian-Based_Interpretation_of_Nonlinear_Neural_Encoding_Model__c9ac984c977d21aff284dcfbacf7bdfea345cc73096a3f28d624b026654e1740.jpg +0 -0
- data/spotlight_reference_images/ref_0017_02109_OnlineSplatter_Pose-Free_Online_3D_Reconstruction_for_Free-Moving_Objects__abe4e12f0fc9a7487c6c5774f5ead6b391ede9f832960b5ce462ceb786534a0d.jpg +0 -0
- data/spotlight_reference_images/ref_0018_02239_RobustMerge_Parameter-Efficient_Model_Merging_for_MLLMs_with_Direction_Robustness__45b74fc957d305067e6f46f00bc32c9ee0889b7dc7e18b4c46913d263c1d8c16.jpg +0 -0
- data/spotlight_reference_images/ref_0019_02373_MDReID_Modality-Decoupled_Learning_for_Any-to-Any_Multi-Modal_Object_Re-Identification__517a2ed3f7dd16e048d526da7807ac8b1b73cdb062b79021b7047dd0b467ba9a.jpg +3 -0
- data/spotlight_reference_images/ref_0020_03077_Toward_Relative_Positional_Encoding_in_Spiking_Transformers__bdb178031ec8d263c3a5388d900e974b8ce39ae5759a6611f688a57e22173fa5.jpg +3 -0
- data/spotlight_reference_images/ref_0021_03670_Neural_Atlas_Graphs_for_Dynamic_Scene_Decomposition_and_Editing__16f8fc865baed696e798502de56f3b473dbf2b0b6aa3c1286f384de53c524b97.jpg +0 -0
- data/spotlight_reference_images/ref_0022_03671_STITCH-OPE_Trajectory_Stitching_with_Guided_Diffusion_for_Off-Policy_Evaluation__6e9c875dbc74a8b39bc947de3b928f8a29f7c6db9b03d726a62a961ba0c2fdd3.jpg +0 -0
- data/spotlight_reference_images/ref_0023_04013_scMRDR_A_scalable_and_flexible_framework_for_unpaired_single-cell_multi-omics_data_integration__85cde1b5a410b0d3275a5c0fa81dfe69e2c83c485c9917c296502a6485e2f68b.jpg +0 -0
- data/spotlight_reference_images/ref_0024_04165_Transformer_Copilot_Learning_from_The_Mistake_Log_in_LLM_Fine-tuning__8d1e804f51825d9760a37b8d1ca027a61deecc5e33fc172f0c1789814527c37e.jpg +0 -0
- data/spotlight_reference_images/ref_0025_04571_GeRaF_Neural_Geometry_Reconstruction_from_Radio_Frequency_Signals__e096c21c76e1eb88c8d865f73e832e8e9246cddec17f62b5d85bc051caa55165.jpg +0 -0
- data/spotlight_reference_images/ref_0026_04647_HopaDIFF_Holistic-Partial_Aware_Fourier_Conditioned_Diffusion_for_Referring_Human_Action_Segmentation_in_Multi-Person_Sc__0e15e7e99b52ccbc157e3ee04a6de2238b23ccd60543102fc2a70eddb12e5e41.jpg +0 -0
- data/spotlight_reference_images/ref_0027_04717_CSBrain_A_Cross-scale_Spatiotemporal_Brain_Foundation_Model_for_EEG_Decoding__327e1ef5f11138cef78e3dd270f09eea700aedb3c5a64cf848b125d13e4e5f08.jpg +0 -0
- data/spotlight_reference_images/ref_0028_05129_Learning_to_Factorize_Spatio-Temporal_Foundation_Models__ef11d1775e9863839bc4dfaf8711bf474ef80365303220ba92781b579879e7a5.jpg +0 -0
- data/spotlight_reference_images/ref_0029_05428_EDELINE_Enhancing_Memory_in_Diffusion-based_World_Models_via_Linear-Time_Sequence_Modeling__57df86687acccd0bad49997d26a4a442d6d09e2381e5a570d1d1e7efd02cf303.jpg +0 -0
- data/spotlight_reference_images/ref_0030_05467_Vision-centric_Token_Compression_in_Large_Language_Model__056bbf059f83c91ea896c610cef2927606ab780d910996e6cdb293dfaca40ddd.jpg +0 -0
- data/spotlight_reference_images/ref_0031_05610_Repo2Run_Automated_Building_Executable_Environment_for_Code_Repository_at_Scale__ea96f359e23ff3f0427d48dd3247314967bb531150d725a7680376b940324680.jpg +0 -0
- data/spotlight_reference_images/ref_0032_05774_Shallow_Diffuse_Robust_and_Invisible_Watermarking_through_Low-Dim_Subspaces_in_Diffusion_Models__703f7602f642aa354858ee8cf929888d672a45001a93ac0cb937cd0f4f1b62de.jpg +0 -0
- data/spotlight_reference_images/ref_0033_05814_Mozart_Modularized_and_Efficient_MoE_Training_on_35D_Wafer-Scale_Chiplet_Architectures__dc6b73dd98f93241717e3b658c319f70ab6ad6188c24a147cd052fb2153d656d.jpg +0 -0
- data/spotlight_reference_images/ref_0034_06044_Theory-Driven_Label-Specific_Representation_for_Incomplete_Multi-View_Multi-Label_Learning__4d4bbb3c5cd4edb56f73502b7eac2b526f30bfc883b337d949cb38ee0747ee22.jpg +0 -0
- data/spotlight_reference_images/ref_0035_06067_EAG3R_Event-Augmented_3D_Geometry_Estimation_for_Dynamic_and_Extreme-Lighting_Scenes__8c0d4df1862409d631870861dc2f047a0cd2572e87267d0d3ea58b6c245408fe.jpg +0 -0
- data/spotlight_reference_images/ref_0036_06067_EAG3R_Event-Augmented_3D_Geometry_Estimation_for_Dynamic_and_Extreme-Lighting_Scenes__a50d002b445446ba0c687045ba485e4c96aaf49765198b3f53fb954327df6f4f.jpg +0 -0
- data/spotlight_reference_images/ref_0037_06507_CausalPFN_Amortized_Causal_Effect_Estimation_via_In-Context_Learning__8c660a9d9ad153e854bc67151e7df9977e2244fa4ce9bd649c2b08b58db2e30c.jpg +0 -0
- data/spotlight_reference_images/ref_0038_06527_Robust_Graph_Condensation_via_Classification_Complexity_Mitigation__dedb33c198673910da24a5a2a5794a8228afce42d7b46ff594dab0cac9ee61e0.jpg +3 -0
.gitattributes
CHANGED

@@ -33,3 +33,40 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0006_00981_MesaTask_Towards_Task-Driven_Tabletop_Scene_Generation_via_3D_Spatial_Reasoning__ddce4bdb68a1689579a491b5a31349db83e5046fbb5424a45dc0883d4115e7d5.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0007_01003_OmniSync_Towards_Universal_Lip_Synchronization_via_Diffusion_Transformers__28d606ccd79ed54496343219767701efb0f445058d39887c1cf629a800942f77.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0011_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__b501bcad7830654e421726b37ea0d89207d68c72f0f6aa9179fec65e0c71b205.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0014_01659_G-Memory_Tracing_Hierarchical_Memory_for_Multi-Agent_Systems__48772f699bccd9ecf7285d9f2c4af85d34d60b7ee6b2cbd681278611869db12b.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0019_02373_MDReID_Modality-Decoupled_Learning_for_Any-to-Any_Multi-Modal_Object_Re-Identification__517a2ed3f7dd16e048d526da7807ac8b1b73cdb062b79021b7047dd0b467ba9a.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0020_03077_Toward_Relative_Positional_Encoding_in_Spiking_Transformers__bdb178031ec8d263c3a5388d900e974b8ce39ae5759a6611f688a57e22173fa5.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0038_06527_Robust_Graph_Condensation_via_Classification_Complexity_Mitigation__dedb33c198673910da24a5a2a5794a8228afce42d7b46ff594dab0cac9ee61e0.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0040_06606_Disentangled_Concepts_Speak_Louder_Than_Words_Explainable_Video_Action_Recognition__fe96a76b160d3861e188cfe5511fee2d4f07eada1ebf92ade017048a3362d5b8.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0041_07858_HYPERION_Fine-Grained_Hypersphere_Alignment_for_Robust_Federated_Graph_Learning__be29a99497ec2dd4d3a8993ce3edc85c87505a1cabfadc2df234b7e326633ebc.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0043_08772_RepoMaster_Autonomous_Exploration_and_Understanding_of_GitHub_Repositories_for_Complex_Task_Solving__c5102f7309c920d53df2307418ef99304d083aa3f54140e3fa2e55c6b259378b.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0046_09315_Personalized_Decision_Modeling_Utility_Optimization_or_Textualized-Symbolic_Reasoning__56df140d7973f4f1a6286c7cebec84068dc6828711cea2e455416a0d9d381a99.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0048_09629_4DGT_Learning_a_4D_Gaussian_Transformer_Using_Real-World_Monocular_Videos__64516a621163af326843f0152bd0cdb8f798d2df70242271249a95e572c7a300.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0052_10520_TrajMamba_An_Efficient_and_Semantic-rich_Vehicle_Trajectory_Pre-training_Model__02d2a7b7aa1ad60cce35445c46fdcb453afa273d8a170ce0abe33ab2c8c6f245.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0058_13240_A_machine_learning_approach_that_beats_Rubiks_cubes__aec92a7999c868664250d8e9aad60b03dbacabd440355bec73af28c512c9d18a.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0060_14126_Deno-IF_Unsupervised_Noisy_Visible_and_Infrared_Image_Fusion_Method__685d5064d5b82a4e2e38976afb4b02e3359ccff154e8231646e76cb16970b7a0.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0070_15841_Co-Reinforcement_Learning_for_Unified_Multimodal_Understanding_and_Generation__8d1bdeb48a8ecdf31ace6493caea90ec34e8e10428d5b91397cd5531c2b33b09.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0074_16584_IA-GGAD_Zero-shot_Generalist_Graph_Anomaly_Detection_via_Invariant_and_Affinity_Learning__2be0842b017b1925d07b25f0276d02ffabc8a00fe4e73cc930b0ee0096fcfd40.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0079_19455_Wide-Horizon_Thinking_and_Simulation-Based_Evaluation_for_Real-World_LLM_Planning_with_Multifaceted_Constraints__198c85b432b1ddbf1afc76ab8c98e057973c4763acbaa0f16a4da51d97460935.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0087_22755_Breaking_the_Batch_Barrier_B3_of_Contrastive_Learning_via_Smart_Batch_Mining__588e6566b5416ccc18d5f5733612cbc3caa0c11fe2e3bc5c57c32c88a9ec2e41.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0095_26919_FUDOKI_Discrete_Flow-based_Unified_Understanding_and_Generation_via_Kinetic-Optimal_Velocities__9c0a452656594ea3134b9cdcb16988663e9015013c42419254bc35661139b69f.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0096_26975_LogicTree_Improving_Complex_Reasoning_of_LLMs_via_Instantiated_Multi-step_Synthetic_Logical_Data__75918e90d782aa4c0011abbf0fe69a93e2595315a84e5dd3ba23b1cddfb672b5.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0098_27155_DexFlyWheel_A_Scalable_and_Self-improving_Data_Generation_Framework_for_Dexterous_Manipulation__6cb5d9f8f05d11ff6e6bf4f69015de5ad79051d577633793092cf0b753f0d1aa.jpg filter=lfs diff=lfs merge=lfs -text
+examples/basic_example_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/basic_example_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/basic_example_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/neurips_refs_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/neurips_refs_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/neurips_refs_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/ddpm_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/ddpm_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/ddpm_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/resnet_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/resnet_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/resnet_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/transformer_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/transformer_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/transformer_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED

@@ -0,0 +1,27 @@
+FROM python:3.10-slim
+
+# System deps
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create non-root user (HF Spaces requirement)
+RUN useradd -m -u 1000 user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+WORKDIR /app
+
+# Install Python deps first (cache layer)
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+
+# Copy app code
+COPY --chown=user . .
+
+USER user
+
+EXPOSE 7860
+
+CMD ["python", "app.py"]
README.md
CHANGED

@@ -1,10 +1,11 @@
 ---
 title: PaperBanana
-emoji:
-colorFrom:
+emoji: 🍌
+colorFrom: yellow
 colorTo: yellow
 sdk: docker
-
+app_file: app.py
+pinned: true
+license: mit
+short_description: Methodology text to architecture diagrams
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
aesthetic_guidelines.py
ADDED

@@ -0,0 +1,64 @@
+"""
+Aesthetic Guidelines (G) for academic illustration styling.
+Based on Appendix F of the PaperBanana paper.
+"""
+
+AESTHETIC_GUIDELINE = """
+# Academic Illustration Style Guide (NeurIPS Style)
+
+## Color Palette
+- **Overall Aesthetic:** Soft Tech & Scientific Pastels ("NeurIPS Look")
+- **Background Colors:** Cream (#FFF8E7), Pale Blue (#E3F2FD), Mint (#E8F5E9)
+- **Accent Colors:**
+  - Soft Blue (#64B5F6) for primary processes
+  - Soft Orange (#FFB74D) for secondary/iterative processes
+  - Soft Purple (#9575CD) for highlighting key components
+  - Soft Green (#81C784) for success/outputs
+- **Use color to group logical components**
+
+## Shapes and Components
+- **Process Boxes:** Rounded rectangles with subtle shadows
+- **Data/Tensors:** 3D stacks or layered rectangles
+- **Databases/Storage:** Cylinders or drum shapes
+- **Agents/Models:** Robot or brain icons with labels
+- **Inputs/Outputs:** Parallelograms or cloud shapes
+
+## Lines and Arrows
+- **Network/Architecture Diagrams:** Orthogonal/Elbow connectors
+- **Logic Flow:** Curved arrows for feedback loops
+- **Data Flow:** Straight arrows with clear directionality
+- **Arrow Styles:** Solid for primary flow, dashed for optional/conditional
+
+## Typography
+- **Labels:** Sans-serif fonts (Arial, Roboto, Helvetica)
+- **Mathematical Variables:** Serif Italic (Times New Roman) - use LaTeX notation (e.g., $P$, $P^*$)
+- **Font Sizes:**
+  - Main labels: 12-14pt
+  - Subscript/technical: 10pt
+  - Section headers: 16pt bold
+
+## Layout Principles
+- **Hierarchy:** Left-to-right or top-to-bottom flow
+- **Grouping:** Use containers/boxes with subtle backgrounds to group related components
+- **Spacing:** Generous whitespace, consistent padding
+- **Alignment:** Grid-based layout, aligned elements
+- **Balance:** Visual weight distributed evenly
+
+## Technical Details
+- **Line Weight:** 1.5-2pt for main elements, 1pt for details
+- **Corner Radius:** 8-12px for rounded rectangles
+- **Shadow:** Subtle drop shadow (opacity 10-20%)
+- **Icons:** Simple, consistent style throughout
+
+## Diagram-Specific Guidelines
+### Architecture Diagrams
+- Show clear input → process → output flow
+- Use containers to separate phases/stages
+- Include feedback loops where applicable
+
+### Methodology Diagrams
+- Emphasize the pipeline structure
+- Show agent interactions clearly
+- Use consistent icons for similar components
+- Annotate with mathematical notation where relevant
+"""
agents/__init__.py
ADDED

@@ -0,0 +1,17 @@
+"""
+Agents package for PaperBanana framework.
+"""
+
+from .retriever import RetrieverAgent
+from .planner import PlannerAgent
+from .stylist import StylistAgent
+from .visualizer import VisualizerAgent
+from .critic import CriticAgent
+
+__all__ = [
+    'RetrieverAgent',
+    'PlannerAgent',
+    'StylistAgent',
+    'VisualizerAgent',
+    'CriticAgent'
+]
agents/critic.py
ADDED

@@ -0,0 +1,234 @@
+"""
+Critic Agent for PaperBanana framework.
+
+Forms closed-loop refinement mechanism by identifying factual misalignments
+or visual glitches and providing feedback for iterative improvement.
+"""
+import os
+from typing import Any, Dict, List
+from google import genai
+from google.genai import types
+import config
+
+
+class CriticAgent:
+    """
+    Critic Agent: Provides iterative feedback for refinement.
+
+    Identifies factual misalignments, visual glitches, and areas for improvement
+    in generated illustrations, enabling closed-loop refinement.
+    """
+
+    def __init__(self):
+        """Initialize Critic Agent."""
+        self.client = genai.Client(api_key=config.GEMINI_API_KEY)
+        self.model = config.VLM_MODEL
+
+    def critique(self,
+                 methodology_text: str,
+                 caption: str,
+                 current_description: str,
+                 generated_image_path: str = None,
+                 iteration: int = 1) -> Dict[str, Any]:
+        """
+        Provide critique and feedback on current illustration.
+
+        Args:
+            methodology_text: Original methodology description
+            caption: Target diagram caption
+            current_description: Current textual description
+            generated_image_path: Path to generated image (if available)
+            iteration: Current iteration number
+
+        Returns:
+            Dictionary containing:
+            - 'feedback': Textual feedback
+            - 'issues': List of identified issues
+            - 'suggestions': List of improvement suggestions
+            - 'should_continue': Boolean indicating if refinement should continue
+        """
+        prompt = self._create_critique_prompt(
+            methodology_text,
+            caption,
+            current_description,
+            iteration
+        )
+
+        contents = [
+            types.Content(
+                role="user",
+                parts=[types.Part.from_text(text=prompt)]
+            )
+        ]
+
+        # If we have an image, we could add it to the critique (future enhancement)
+        # For now, we critique based on the description
+
+        generate_config = types.GenerateContentConfig(
+            thinking_config=types.ThinkingConfig(
+                thinking_level=config.THINKING_LEVEL
+            )
+        )
+
+        critique_text = ""
+        for chunk in self.client.models.generate_content_stream(
+            model=self.model,
+            contents=contents,
+            config=generate_config
+        ):
+            critique_text += chunk.text
+
+        # Parse critique into structured feedback
+        result = self._parse_critique(critique_text, iteration)
+
+        return result
+
+    def _create_critique_prompt(self,
+                                methodology_text: str,
+                                caption: str,
+                                current_description: str,
+                                iteration: int) -> str:
+        """Create prompt for critique generation."""
+        prompt = f"""You are an expert reviewer of academic illustrations, specializing in methodology diagrams.
+
+Your task is to critically evaluate a textual description for an academic diagram and provide constructive feedback.
+
+ORIGINAL METHODOLOGY:
+{methodology_text}
+
+TARGET CAPTION:
+{caption}
+
+CURRENT ILLUSTRATION DESCRIPTION (Iteration {iteration}):
+{current_description}
+
+EVALUATION CRITERIA:
+
+1. **Faithfulness**: Does the description accurately represent all key aspects of the methodology?
+   - Are all important components mentioned?
+   - Is the flow/logic correctly represented?
+   - Are there any factual errors or misrepresentations?
+
+2. **Conciseness**: Is the description appropriately detailed without being cluttered?
+   - Is information density appropriate?
+   - Are there redundant elements?
+   - Is anything unnecessarily complex?
+
+3. **Readability**: Will the resulting diagram be easy to understand?
+   - Is the layout logical?
+   - Are labels clear and informative?
+   - Is visual hierarchy appropriate?
+
+4. **Aesthetics**: Does the description specify professional visual design?
+   - Are colors, shapes, and typography well-defined?
+   - Is there visual consistency?
+   - Does it match academic publication standards?
+
+YOUR TASK:
+Provide a structured critique covering:
+
+ISSUES FOUND:
+- List specific problems (e.g., "Missing connection between X and Y")
+- Rate severity: CRITICAL, MAJOR, or MINOR
+
+SUGGESTIONS FOR IMPROVEMENT:
+- Provide concrete, actionable suggestions
+- Prioritize by impact
+
+OVERALL ASSESSMENT:
+- Is this ready for visualization, or does it need refinement?
+- If iteration {iteration} < 3, should we continue refining?
+
+OUTPUT FORMAT:
+Structure your response as:
+
+ISSUES:
+1. [SEVERITY] Issue description
+2. [SEVERITY] Issue description
+...
+
+SUGGESTIONS:
+1. Specific suggestion
+2. Specific suggestion
+...
+
+DECISION: [READY / NEEDS_REFINEMENT]
+REASONING: Brief explanation of the decision
+"""
+        return prompt
+
+    def _parse_critique(self, critique_text: str, iteration: int) -> Dict:
+        """Parse critique text into structured format."""
+        issues = []
+        suggestions = []
+        should_continue = True
+
+        # Simple parsing - look for key sections
+        lines = critique_text.split('\n')
+        current_section = None
+
+        for line in lines:
+            line_upper = line.upper().strip()
+
+            if 'ISSUES:' in line_upper:
+                current_section = 'issues'
+                continue
+            elif 'SUGGESTIONS:' in line_upper or 'SUGGESTION' in line_upper:
+                current_section = 'suggestions'
+                continue
+            elif 'DECISION:' in line_upper:
+                current_section = 'decision'
+                if 'READY' in line_upper and 'NEEDS_REFINEMENT' not in line_upper:
+                    should_continue = False
+                continue
+
+            # Parse content
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+
+            if current_section == 'issues' and (line.startswith('-') or line[0].isdigit()):
+                issues.append(line.lstrip('-').lstrip('0123456789.').strip())
+            elif current_section == 'suggestions' and (line.startswith('-') or line[0].isdigit()):
+                suggestions.append(line.lstrip('-').lstrip('0123456789.').strip())
+
+        # Don't continue past max iterations
+        if iteration >= config.MAX_REFINEMENT_ITERATIONS:
+            should_continue = False
+
+        return {
+            'feedback': critique_text,
+            'issues': issues,
+            'suggestions': suggestions,
+            'should_continue': should_continue
+        }
+
+    def generate_refinement_prompt(self,
+                                   original_description: str,
+                                   critique: Dict) -> str:
+        """
+        Generate prompt for refinement based on critique.
+
+        Args:
+            original_description: Current description
+            critique: Critique dictionary from critique()
+
+        Returns:
+            Prompt for Planner to refine the description
+        """
+        issues_text = "\n".join([f"- {issue}" for issue in critique['issues']])
+        suggestions_text = "\n".join([f"- {sug}" for sug in critique['suggestions']])
+
+        refinement_prompt = f"""CURRENT DESCRIPTION:
+{original_description}
+
+IDENTIFIED ISSUES:
+{issues_text}
+
+SUGGESTIONS FOR IMPROVEMENT:
+{suggestions_text}
+
+Please revise the description to address these issues and incorporate the suggestions.
+Maintain all correct elements while fixing the identified problems.
+"""
+        return refinement_prompt
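To make the closed loop concrete, here is a minimal sketch of how CriticAgent might drive refinement together with PlannerAgent. It assumes a valid GEMINI_API_KEY and MAX_REFINEMENT_ITERATIONS in config; the methodology and caption strings are placeholders, and feeding the refinement prompt back through PlannerAgent.plan() is one plausible wiring, not necessarily what app.py does:

from agents import CriticAgent, PlannerAgent

planner = PlannerAgent()
critic = CriticAgent()

methodology = "Stage one encodes the input; stage two decodes with attention."  # placeholder
caption = "Figure 1: Overview of the proposed two-stage pipeline."              # placeholder

description = planner.plan(methodology, caption)
iteration = 1
while True:
    review = critic.critique(methodology, caption, description, iteration=iteration)
    if not review['should_continue']:  # READY, or max iterations reached
        break
    # Turn the structured feedback into a revision request for the Planner
    description = planner.plan(critic.generate_refinement_prompt(description, review), caption)
    iteration += 1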
agents/planner.py
ADDED

@@ -0,0 +1,117 @@
+"""
+Planner Agent for PaperBanana framework.
+
+Serves as the cognitive core. Translates unstructured methodology data
+into comprehensive textual description of the target illustration.
+"""
+import os
+from typing import List, Dict, Any
+from google import genai
+from google.genai import types
+import config
+
+
+class PlannerAgent:
+    """
+    Planner Agent: Translates methodology into comprehensive illustration description.
+
+    The cognitive core that interprets source context S and communicative intent C,
+    then produces detailed textual description P of the target illustration.
+    """
+
+    def __init__(self):
+        """Initialize Planner Agent."""
+        self.client = genai.Client(api_key=config.GEMINI_API_KEY)
+        self.model = config.VLM_MODEL
+
+    def plan(self,
+             methodology_text: str,
+             caption: str,
+             reference_examples: List[Dict[str, Any]] = None) -> str:
+        """
+        Generate comprehensive textual description of target illustration.
+
+        Args:
+            methodology_text: Source methodology description (S)
+            caption: Diagram caption (part of C)
+            reference_examples: Retrieved reference examples (E)
+
+        Returns:
+            Detailed textual description P of the illustration
+        """
+        prompt = self._create_planning_prompt(methodology_text, caption, reference_examples)
+
+        contents = [
+            types.Content(
+                role="user",
+                parts=[types.Part.from_text(text=prompt)]
+            )
+        ]
+
+        generate_config = types.GenerateContentConfig(
+            thinking_config=types.ThinkingConfig(
+                thinking_level=config.THINKING_LEVEL
+            )
+        )
+
+        description = ""
+        for chunk in self.client.models.generate_content_stream(
+            model=self.model,
+            contents=contents,
+            config=generate_config
+        ):
+            description += chunk.text
+
+        return description.strip()
+
+    def _create_planning_prompt(self,
+                                methodology_text: str,
+                                caption: str,
+                                reference_examples: List[Dict[str, Any]] = None) -> str:
+        """Create prompt for generating illustration description."""
+
+        # Include reference examples if available
+        reference_context = ""
+        if reference_examples:
+            reference_context = "\n\nREFERENCE EXAMPLES (for inspiration):\n"
+            for i, ref in enumerate(reference_examples[:3], 1):  # Use top 3
+                reference_context += f"\nExample {i}:\n"
+                reference_context += f"Domain: {ref.get('domain', 'N/A')}\n"
+                reference_context += f"Type: {ref.get('diagram_type', 'N/A')}\n"
+                reference_context += f"Description: {ref.get('description', 'N/A')}\n"
+
+        prompt = f"""You are an expert at designing academic methodology diagrams for scientific publications.
+
+Your task is to create a COMPREHENSIVE and DETAILED textual description of an illustration that would
+effectively visualize the given methodology. This description will be used to generate the actual diagram.
+
+METHODOLOGY TO VISUALIZE:
+{methodology_text}
+
+TARGET DIAGRAM CAPTION:
+{caption}
+{reference_context}
+
+REQUIREMENTS:
+1. **Layout Structure**: Specify the overall layout (left-to-right, top-to-bottom, circular, etc.)
+2. **Components**: List all visual elements needed (boxes, arrows, icons, labels, etc.)
+3. **Content**: What text/symbols should appear in each component
+4. **Connections**: How components connect (arrows, lines, groupings)
+5. **Hierarchy**: Which elements are primary vs secondary
+6. **Grouping**: How to group related components (containers, background colors)
+7. **Flow**: The logical flow of information through the diagram
+8. **Key Details**: Important technical details, equations, or annotations
+
+IMPORTANT GUIDELINES:
+- Be specific about spatial relationships and positioning
+- Describe the logical flow clearly (input → process → output)
+- Include any mathematical notation or technical terminology
+- Consider the target audience (academic researchers)
+- Focus on clarity and information density
+- Think about how this supports the paper's narrative
+
+OUTPUT FORMAT:
+Provide a detailed paragraph-form description that covers all aspects above.
+Be thorough - this description should be sufficient for someone to create the diagram without seeing the original methodology.
+"""
+        return prompt
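A minimal call sketch, assuming GEMINI_API_KEY and VLM_MODEL are set in config; the input strings are placeholders:

from agents.planner import PlannerAgent

planner = PlannerAgent()
description = planner.plan(
    methodology_text="We pretrain a graph encoder, then fine-tune it with reinforcement learning.",  # placeholder
    caption="Figure 1: Overview of the training framework.",                                          # placeholder
    reference_examples=None,  # optional; normally supplied by RetrieverAgent
)
print(description)  # the textual description P consumed by the Stylist/Visualizer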
agents/retriever.py
ADDED

@@ -0,0 +1,151 @@
+"""
+Retriever Agent for PaperBanana framework.
+
+Identifies the N most relevant examples from a reference set using VLM ranking.
+Matches based on research domain and diagram type.
+"""
+import os
+from typing import List, Dict, Any
+from google import genai
+from google.genai import types
+import config
+
+
+class RetrieverAgent:
+    """
+    Retriever Agent: Identifies relevant reference examples from a fixed reference set.
+
+    Uses generative retrieval approach where VLM ranks candidates by matching
+    research domain and diagram type.
+    """
+
+    def __init__(self, reference_set: List[Dict[str, Any]] = None):
+        """
+        Initialize Retriever Agent.
+
+        Args:
+            reference_set: List of reference examples with metadata.
+                Each example should have: {
+                    'id': str,
+                    'domain': str,
+                    'diagram_type': str,
+                    'description': str,
+                    'image_path': str (optional)
+                }
+        """
+        self.client = genai.Client(api_key=config.GEMINI_API_KEY)
+        self.model = config.VLM_MODEL
+        self.reference_set = reference_set or []
+
+    def retrieve(self,
+                 methodology_text: str,
+                 caption: str,
+                 n: int = config.NUM_REFERENCE_EXAMPLES) -> List[Dict[str, Any]]:
+        """
+        Retrieve the N most relevant reference examples.
+
+        Args:
+            methodology_text: Source methodology description
+            caption: Target diagram caption
+            n: Number of examples to retrieve
+
+        Returns:
+            List of N most relevant reference examples
+        """
+        if not self.reference_set:
+            print("Warning: No reference set provided. Skipping retrieval.")
+            return []
+
+        # Create retrieval prompt
+        prompt = self._create_retrieval_prompt(methodology_text, caption, n)
+
+        # Query VLM for ranking
+        contents = [
+            types.Content(
+                role="user",
+                parts=[types.Part.from_text(text=prompt)]
+            )
+        ]
+
+        generate_config = types.GenerateContentConfig(
+            thinking_config=types.ThinkingConfig(
+                thinking_level=config.THINKING_LEVEL
+            )
+        )
+
+        response_text = ""
+        for chunk in self.client.models.generate_content_stream(
+            model=self.model,
+            contents=contents,
+            config=generate_config
+        ):
+            response_text += chunk.text
+
+        # Parse the response to extract selected example IDs
+        selected_examples = self._parse_retrieval_response(response_text, n)
+
+        return selected_examples
+
+    def _create_retrieval_prompt(self, methodology_text: str, caption: str, n: int) -> str:
+        """Create prompt for retrieving relevant examples."""
+        # Create a summary of available references
+        reference_summary = "\n".join([
+            f"ID: {ref['id']}\nDomain: {ref['domain']}\nType: {ref['diagram_type']}\nDescription: {ref['description']}\n"
+            for ref in self.reference_set
+        ])
+
+        prompt = f"""You are an expert at identifying relevant academic illustration examples.
+
+Given a methodology description and diagram caption, select the {n} most relevant reference examples
+from the provided set. Consider:
+1. Research domain similarity (e.g., NLP, Computer Vision, Reinforcement Learning)
+2. Diagram type similarity (e.g., architecture diagram, flowchart, pipeline)
+3. Conceptual similarity in the methodology
+
+METHODOLOGY:
+{methodology_text}
+
+TARGET CAPTION:
+{caption}
+
+AVAILABLE REFERENCE EXAMPLES:
+{reference_summary}
+
+OUTPUT FORMAT:
+Return only the IDs of the {n} most relevant examples, one per line, ranked from most to least relevant.
+Example output:
+ref_001
+ref_005
+ref_012
+"""
+        return prompt
+
+    def _parse_retrieval_response(self, response_text: str, n: int) -> List[Dict[str, Any]]:
+        """Parse VLM response to extract selected examples."""
+        # Extract IDs from response
+        lines = response_text.strip().split('\n')
+        selected_ids = []
+
+        for line in lines:
+            line = line.strip()
+            # Look for reference IDs
+            for ref in self.reference_set:
+                if ref['id'] in line:
+                    selected_ids.append(ref['id'])
+                    break
+            if len(selected_ids) >= n:
+                break
+
+        # Get full reference objects
+        selected_examples = []
+        for ref_id in selected_ids:
+            for ref in self.reference_set:
+                if ref['id'] == ref_id:
+                    selected_examples.append(ref)
+                    break
+
+        # If we didn't get enough, just take the first n
+        if len(selected_examples) < n:
+            selected_examples = self.reference_set[:n]
+
+        return selected_examples[:n]
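A small sketch of the metadata contract the retriever expects, using a toy two-entry reference set (the real set ships under data/; these entries are made up for illustration):

from agents.retriever import RetrieverAgent

toy_refs = [
    {'id': 'ref_001', 'domain': 'NLP', 'diagram_type': 'pipeline',
     'description': 'LLM agent pipeline with a feedback loop'},
    {'id': 'ref_002', 'domain': 'Computer Vision', 'diagram_type': 'architecture',
     'description': 'Encoder-decoder backbone with skip connections'},
]

retriever = RetrieverAgent(reference_set=toy_refs)
top = retriever.retrieve(
    methodology_text="A multi-agent LLM system with iterative critique.",  # placeholder
    caption="Figure 1: System overview.",                                  # placeholder
    n=1,
)
# Note: _parse_retrieval_response falls back to the first n entries
# if the model's ranking cannot be matched against the set.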
agents/stylist.py
ADDED

@@ -0,0 +1,104 @@
+"""
+Stylist Agent for PaperBanana framework.
+
+Acts as a design consultant. Uses automatically synthesized aesthetic
+guidelines to refine initial description into stylistically optimized version.
+"""
+import os
+from google import genai
+from google.genai import types
+import config
+from aesthetic_guidelines import AESTHETIC_GUIDELINE
+
+
+class StylistAgent:
+    """
+    Stylist Agent: Refines illustration descriptions using aesthetic guidelines.
+
+    Takes initial description P and enhances it with style guidance G
+    to produce stylistically optimized description P*.
+    """
+
+    def __init__(self, custom_guidelines: str = None):
+        """
+        Initialize Stylist Agent.
+
+        Args:
+            custom_guidelines: Optional custom aesthetic guidelines.
+                If None, uses default NeurIPS-style guidelines.
+        """
+        self.client = genai.Client(api_key=config.GEMINI_API_KEY)
+        self.model = config.VLM_MODEL
+        self.guidelines = custom_guidelines or AESTHETIC_GUIDELINE
+
+    def refine(self, initial_description: str) -> str:
+        """
+        Refine initial description with aesthetic styling.
+
+        Args:
+            initial_description: Initial textual description P
+
+        Returns:
+            Stylistically optimized description P*
+        """
+        prompt = self._create_styling_prompt(initial_description)
+
+        contents = [
+            types.Content(
+                role="user",
+                parts=[types.Part.from_text(text=prompt)]
+            )
+        ]
+
+        generate_config = types.GenerateContentConfig(
+            thinking_config=types.ThinkingConfig(
+                thinking_level=config.THINKING_LEVEL
+            )
+        )
+
+        refined_description = ""
+        for chunk in self.client.models.generate_content_stream(
+            model=self.model,
+            contents=contents,
+            config=generate_config
+        ):
+            refined_description += chunk.text
+
+        return refined_description.strip()
+
+    def _create_styling_prompt(self, initial_description: str) -> str:
+        """Create prompt for aesthetic refinement."""
+        prompt = f"""You are an expert design consultant specializing in academic publication illustrations.
+
+Your task is to take an initial diagram description and enhance it with specific aesthetic and design details
+to create a polished, publication-ready illustration that follows academic standards.
+
+INITIAL DESCRIPTION:
+{initial_description}
+
+AESTHETIC GUIDELINES TO FOLLOW:
+{self.guidelines}
+
+YOUR TASK:
+Refine the initial description by adding specific visual design details:
+
+1. **Color Specifications**: Add specific color choices from the palette (e.g., "soft blue #64B5F6 for the main process boxes")
+2. **Shape Details**: Specify exact shapes and their styling (e.g., "rounded rectangles with 10px radius and subtle shadow")
+3. **Typography**: Define font choices for different text elements
+4. **Visual Hierarchy**: Enhance descriptions of size, weight, and emphasis relationships
+5. **Spacing & Layout**: Add details about padding, margins, and alignment
+6. **Professional Polish**: Include finishing touches like shadows, borders, gradients
+
+IMPORTANT:
+- Preserve ALL content and structural information from the initial description
+- Add aesthetic details WITHOUT changing the fundamental design or information flow
+- Be specific with measurements, colors (hex codes), and styling parameters
+- Ensure the result maintains academic professionalism and clarity
+- The output should be suitable for direct input to an image generation model
+
+OUTPUT FORMAT:
+Provide the enhanced description as a detailed, flowing paragraph that seamlessly integrates
+the original content with the aesthetic specifications. Make it vivid and precise enough that
+an image generation model can render it accurately.
+"""
+        return prompt
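Usage sketch: the agent defaults to the bundled NeurIPS-style AESTHETIC_GUIDELINE, and custom_guidelines lets a caller swap in a different style document. The custom guideline string below is a made-up example:

from agents.stylist import StylistAgent

stylist = StylistAgent()  # uses AESTHETIC_GUIDELINE by default
polished = stylist.refine("A left-to-right pipeline: input box, three stages, output box.")  # P -> P*

venue_style = "Use a monochrome palette and serif labels throughout."  # hypothetical custom guide
strict_stylist = StylistAgent(custom_guidelines=venue_style)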
agents/visualizer.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Visualizer Agent for PaperBanana framework.
|
| 3 |
+
|
| 4 |
+
Renders academic illustrations using image generation models.
|
| 5 |
+
Supports both diagram generation and statistical plot generation.
|
| 6 |
+
"""
|
| 7 |
+
import os
|
| 8 |
+
import mimetypes
|
| 9 |
+
from typing import Optional
|
| 10 |
+
from google import genai
|
| 11 |
+
from google.genai import types
|
| 12 |
+
import config
|
| 13 |
+
from utils import save_binary_file
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class VisualizerAgent:
|
| 17 |
+
"""
|
| 18 |
+
Visualizer Agent: Renders illustrations from textual descriptions.
|
| 19 |
+
|
| 20 |
+
Supports two modes:
|
| 21 |
+
1. Diagram mode: Uses image generation model (Nano-Banana-Pro / Gemini Image)
|
| 22 |
+
2. Plot mode: Generates Python Matplotlib code for statistical plots
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
def __init__(self, mode: str = "diagram"):
|
| 26 |
+
"""
|
| 27 |
+
Initialize Visualizer Agent.
|
| 28 |
+
|
| 29 |
+
Args:
|
| 30 |
+
mode: Generation mode - "diagram" or "plot"
|
| 31 |
+
"""
|
| 32 |
+
self.client = genai.Client(api_key=config.GEMINI_API_KEY)
|
| 33 |
+
self.mode = mode
|
| 34 |
+
|
| 35 |
+
if mode == "diagram":
|
| 36 |
+
self.model = config.IMAGE_MODEL
|
| 37 |
+
elif mode == "plot":
|
| 38 |
+
self.model = config.VLM_MODEL # Use VLM for code generation
|
| 39 |
+
else:
|
| 40 |
+
raise ValueError(f"Invalid mode: {mode}. Use 'diagram' or 'plot'")
|
| 41 |
+
|
| 42 |
+
def visualize(self,
|
| 43 |
+
description: str,
|
| 44 |
+
output_path: str = "output",
|
| 45 |
+
data: dict = None) -> str:
|
| 46 |
+
"""
|
| 47 |
+
Generate visualization from description.
|
| 48 |
+
|
| 49 |
+
Args:
|
| 50 |
+
description: Textual description of the illustration
|
| 51 |
+
output_path: Base path for output file (without extension)
|
| 52 |
+
data: Optional data dict for plot mode
|
| 53 |
+
|
| 54 |
+
Returns:
|
| 55 |
+
Path to generated image file or code file
|
| 56 |
+
"""
|
| 57 |
+
if self.mode == "diagram":
|
| 58 |
+
return self._generate_diagram(description, output_path)
|
| 59 |
+
elif self.mode == "plot":
|
| 60 |
+
return self._generate_plot(description, output_path, data)
|
| 61 |
+
|
| 62 |
+
def _generate_diagram(self, description: str, output_path: str) -> str:
|
| 63 |
+
"""
|
| 64 |
+
Generate diagram image using image generation model.
|
| 65 |
+
|
| 66 |
+
Args:
|
| 67 |
+
description: Detailed visual description
|
| 68 |
+
output_path: Base path for output file
|
| 69 |
+
|
| 70 |
+
Returns:
|
| 71 |
+
Path to generated image
|
| 72 |
+
"""
|
| 73 |
+
# Create prompt for image generation
|
| 74 |
+
prompt = f"""Generate a high-quality academic methodology diagram with the following specifications:
|
| 75 |
+
|
| 76 |
+
{description}
|
| 77 |
+
|
| 78 |
+
Requirements:
|
| 79 |
+
- Professional academic publication quality
|
| 80 |
+
- Clear, readable text and labels
|
| 81 |
+
- Consistent styling throughout
|
| 82 |
+
- Appropriate use of colors and shapes
|
| 83 |
+
- Publication-ready resolution
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
+
contents = [
|
| 87 |
+
types.Content(
|
| 88 |
+
role="user",
|
| 89 |
+
parts=[types.Part.from_text(text=prompt)]
|
| 90 |
+
)
|
| 91 |
+
]
|
| 92 |
+
|
| 93 |
+
generate_config = types.GenerateContentConfig(
|
| 94 |
+
response_modalities=["IMAGE", "TEXT"],
|
| 95 |
+
image_config=types.ImageConfig(
|
| 96 |
+
image_size=config.IMAGE_SIZE
|
| 97 |
+
)
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
file_index = 0
|
| 101 |
+
saved_path = None
|
| 102 |
+
|
| 103 |
+
for chunk in self.client.models.generate_content_stream(
|
| 104 |
+
model=self.model,
|
| 105 |
+
contents=contents,
|
| 106 |
+
config=generate_config
|
| 107 |
+
):
|
| 108 |
+
if (chunk.candidates is None or
|
| 109 |
+
chunk.candidates[0].content is None or
|
| 110 |
+
chunk.candidates[0].content.parts is None):
|
| 111 |
+
continue
|
| 112 |
+
|
| 113 |
+
# Check for inline image data
|
| 114 |
+
part = chunk.candidates[0].content.parts[0]
|
| 115 |
+
if part.inline_data and part.inline_data.data:
|
| 116 |
+
inline_data = part.inline_data
|
| 117 |
+
data_buffer = inline_data.data
|
| 118 |
+
file_extension = mimetypes.guess_extension(inline_data.mime_type)
|
| 119 |
+
|
| 120 |
+
if file_extension:
|
| 121 |
+
file_name = f"{output_path}_{file_index}{file_extension}"
|
| 122 |
+
saved_path = save_binary_file(file_name, data_buffer)
|
| 123 |
+
file_index += 1
|
| 124 |
+
else:
|
| 125 |
+
# Print any text output
|
| 126 |
+
if chunk.text:
|
| 127 |
+
print(chunk.text)
|
| 128 |
+
|
| 129 |
+
return saved_path or f"{output_path}_0.png"
|
| 130 |
+
|
    def _generate_plot(self, description: str, output_path: str, data: dict = None) -> str:
        """
        Generate a statistical plot by creating Matplotlib code.

        Args:
            description: Description of the desired plot
            output_path: Base path for the output code file
            data: Optional data dictionary

        Returns:
            Path to the generated Python code file
        """
        data_context = ""
        if data:
            data_context = f"\n\nDATA PROVIDED:\n{str(data)}\n"

        prompt = f"""You are an expert at creating publication-quality statistical plots using Matplotlib.

Generate complete, executable Python code using Matplotlib to create the following plot:

{description}
{data_context}

Requirements:
1. Use professional academic styling (seaborn-paper style or similar)
2. Include clear axis labels with units
3. Add a legend if there are multiple series
4. Use appropriate colors and markers
5. Set the figure size for publication (e.g., 6x4 inches)
6. Save as a high-resolution PNG (300 dpi minimum)
7. Include error bars if applicable
8. Follow best practices for data visualization

OUTPUT FORMAT:
Provide ONLY the complete Python code, ready to execute.
Start with the necessary imports and end with plt.savefig().
Do not include any explanations outside the code comments.
"""

        contents = [
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=prompt)]
            )
        ]

        generate_config = types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(
                thinking_level="MEDIUM"
            )
        )

        code = ""
        for chunk in self.client.models.generate_content_stream(
            model=self.model,
            contents=contents,
            config=generate_config
        ):
            if chunk.text:  # chunk.text can be None for non-text chunks
                code += chunk.text

        # Save code to file
        code_file = f"{output_path}.py"
        with open(code_file, 'w') as f:
            f.write(code.strip())

        print(f"Plot code saved to: {code_file}")
        print("Run the code to generate the plot image.")

        return code_file
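Note that _generate_plot deliberately returns a path to generated Matplotlib code rather than an image, so the caller decides when and where to execute it. A minimal caller-side sketch of running that file in a subprocess — the file name run_plot_code.py and helper run_generated_plot are hypothetical, not part of this repo, and it assumes the generated script writes its own PNG via plt.savefig():

# run_plot_code.py — hypothetical caller-side sketch, not part of the repo.
import subprocess
import sys

def run_generated_plot(code_file: str, timeout: int = 120) -> bool:
    """Execute a generated Matplotlib script in a subprocess.

    Returns True if the script exited cleanly; the script itself is
    expected to save the figure via plt.savefig().
    """
    result = subprocess.run(
        [sys.executable, code_file],
        capture_output=True, text=True, timeout=timeout,
    )
    if result.returncode != 0:
        print(f"Plot script failed:\n{result.stderr}", file=sys.stderr)
        return False
    return True

if __name__ == "__main__":
    # e.g. after: code_file = visualizer._generate_plot(desc, "out/plot")
    run_generated_plot("out/plot.py")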
app.py
ADDED
@@ -0,0 +1,283 @@
"""
PaperBanana — Gradio app for HuggingFace Spaces.

Turns methodology text into publication-ready architecture diagrams
using a 5-agent pipeline (Retriever → Planner → Stylist → Visualizer → Critic).
"""

import os
import json
import shutil
import tempfile
from pathlib import Path
from typing import List, Dict, Any

import gradio as gr
from google import genai
from google.genai import types

from agents import RetrieverAgent, PlannerAgent, StylistAgent, VisualizerAgent, CriticAgent
from aesthetic_guidelines import AESTHETIC_GUIDELINE
import config

# ── Load reference set at startup ───────────────────────────────────────────
REF_SET_PATH = Path("data/spotlight_reference_set.json")
REFERENCE_SET: List[Dict[str, Any]] = []
if REF_SET_PATH.exists():
    with open(REF_SET_PATH) as f:
        REFERENCE_SET = json.load(f)
    print(f"Loaded {len(REFERENCE_SET)} reference examples")

# ── Example gallery images ──────────────────────────────────────────────────
EXAMPLE_IMAGES = {
    "Transformer": "examples/readme/transformer_iter3_0.jpg",
    "ResNet": "examples/readme/resnet_iter3_0.jpg",
    "DDPM": "examples/readme/ddpm_iter3_0.jpg",
}

# ── Preset examples ─────────────────────────────────────────────────────────
PRESET_EXAMPLES = [
    [
        # Transformer
        """The Transformer model follows an encoder-decoder structure using stacked self-attention and fully connected layers.

Encoder: Stack of N=6 identical layers. Each layer has two sub-layers: (1) multi-head self-attention, and (2) position-wise feed-forward network. Residual connections around each sub-layer, followed by layer normalization.

Decoder: Stack of N=6 identical layers. In addition to the two encoder sub-layers, the decoder inserts a third sub-layer for multi-head cross-attention over the encoder output. Masked self-attention prevents attending to subsequent positions.

Multi-Head Attention: Linearly project queries, keys, values h times, perform scaled dot-product attention in parallel, concatenate and project again.

Positional Encoding: Sinusoidal positional encodings added to input embeddings.""",
        "The Transformer — model architecture (Vaswani et al., 2017)",
        2,
    ],
    [
        # ResNet
        """We present a residual learning framework. Instead of learning H(x) directly, layers fit a residual mapping F(x) = H(x) - x. The building block is y = F(x, {W_i}) + x via identity shortcut connections.

Architecture: Input 224×224 → 7×7 conv, 64, stride 2 → BN → ReLU → 3×3 max pool → Stage 1: 3 blocks, 64 filters → Stage 2: 4 blocks, 128 filters → Stage 3: 6 blocks, 256 filters → Stage 4: 3 blocks, 512 filters → Global avg pool → 1000-d FC → softmax.

For deeper networks (50/101/152), bottleneck blocks: 1×1 conv (reduce) → 3×3 conv → 1×1 conv (restore), with shortcut bypassing all three layers.""",
        "Architecture of ResNet with residual learning building blocks (He et al., 2016)",
        2,
    ],
    [
        # DDPM
        """Denoising diffusion probabilistic models (DDPMs): Forward process gradually adds Gaussian noise over T timesteps: q(x_t|x_{t-1}) = N(x_t; √(1-β_t)x_{t-1}, β_tI). After T steps, x_T ≈ N(0,I).

Reverse process learns to denoise: p_θ(x_{t-1}|x_t) = N(x_{t-1}; μ_θ(x_t,t), Σ_θ(x_t,t)). Starting from x_T ~ N(0,I), iteratively produces clean x_0.

Denoising network ε_θ(x_t,t) is a U-Net: downsampling with ResNet blocks + self-attention at 16×16, bottleneck with self-attention, upsampling with skip connections. Timestep conditioning via sinusoidal embeddings. Training minimizes L = E[||ε - ε_θ(x_t,t)||²].""",
        "Overview of the denoising diffusion probabilistic model (Ho et al., 2020)",
        2,
    ],
]


# ── Core generation logic (streaming-friendly) ─────────────────────────────
def generate_diagram(
    methodology_text: str,
    caption: str,
    num_iterations: int,
    api_key: str | None = None,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the full PaperBanana pipeline and yield intermediate results."""

    # Resolve API key: user input > env var
    gemini_key = (api_key or "").strip() or config.GEMINI_API_KEY
    if not gemini_key:
        raise gr.Error(
            "No Gemini API key found. Paste one in the field above, "
            "or set GEMINI_API_KEY as a Space secret."
        )

    # Patch config so all agents pick it up
    config.GEMINI_API_KEY = gemini_key

    num_iterations = int(num_iterations)
    logs: list[str] = []

    def log(msg: str):
        logs.append(msg)
        return "\n".join(logs)

    # ── 1. Retriever ────────────────────────────────────────────────────────
    yield None, log("🔍 [1/5] Retriever: finding relevant references…")
    retriever = RetrieverAgent(REFERENCE_SET)
    reference_examples = []
    if REFERENCE_SET:
        reference_examples = retriever.retrieve(
            methodology_text, caption, n=config.NUM_REFERENCE_EXAMPLES
        )
        yield None, log(f" ✓ Retrieved {len(reference_examples)} references")
    else:
        yield None, log(" ⏭ Skipped (no reference set loaded)")

    # ── 2. Planner ──────────────────────────────────────────────────────────
    yield None, log("📝 [2/5] Planner: creating visual description…")
    planner = PlannerAgent()
    current_description = planner.plan(methodology_text, caption, reference_examples)
    yield None, log(f" ✓ Description ready ({len(current_description)} chars)")

    # ── 3. Stylist ──────────────────────────────────────────────────────────
    yield None, log("🎨 [3/5] Stylist: applying aesthetic guidelines…")
    stylist = StylistAgent()
    current_description = stylist.refine(current_description)
    yield None, log(f" ✓ Styled ({len(current_description)} chars)")

    # ── 4/5. Visualize → Critique loop ──────────────────────────────────────
    latest_image_path = None
    critic = CriticAgent()

    for i in range(1, num_iterations + 1):
        yield latest_image_path, log(
            f"🖼️ [4/5] Visualizer: generating image (iteration {i}/{num_iterations})…"
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            out_base = os.path.join(tmpdir, f"iter{i}")
            visualizer = VisualizerAgent(mode="diagram")
            img_path = visualizer.visualize(current_description, out_base)

            if img_path and os.path.exists(img_path):
                # Copy to a persistent temp file so Gradio can serve it
                ext = Path(img_path).suffix or ".jpg"
                persist = tempfile.NamedTemporaryFile(
                    suffix=ext, delete=False, dir=tempfile.gettempdir()
                )
                shutil.copy2(img_path, persist.name)
                latest_image_path = persist.name

        yield latest_image_path, log(f" ✓ Image generated (iteration {i})")

        # Skip critique on the last iteration
        if i >= num_iterations:
            break

        yield latest_image_path, log(
            f"🔬 [5/5] Critic: evaluating (iteration {i})…"
        )
        critique = critic.critique(
            methodology_text, caption, current_description, latest_image_path, i
        )
        n_issues = len(critique["issues"])
        yield latest_image_path, log(f" ✓ {n_issues} issues found")

        if not critique["should_continue"]:
            yield latest_image_path, log(" ✓ Quality threshold reached — done!")
            break

        # Refine
        yield latest_image_path, log("📝 [2/5] Planner: refining description…")
        refinement_prompt = critic.generate_refinement_prompt(
            current_description, critique
        )
        client = genai.Client(api_key=gemini_key)
        contents = [
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=refinement_prompt)],
            )
        ]
        refined = ""
        for chunk in client.models.generate_content_stream(
            model=config.VLM_MODEL,
            contents=contents,
            config=types.GenerateContentConfig(
                thinking_config=types.ThinkingConfig(thinking_level="HIGH")
            ),
        ):
            if chunk.text:  # chunk.text can be None for non-text chunks
                refined += chunk.text
        current_description = refined.strip()
        yield latest_image_path, log(
            f" ✓ Refined ({len(current_description)} chars)"
        )

        # Re-style
        yield latest_image_path, log("🎨 [3/5] Stylist: re-applying style…")
        current_description = stylist.refine(current_description)
        yield latest_image_path, log(f" ✓ Styled ({len(current_description)} chars)")

    yield latest_image_path, log("\n✅ Generation complete!")


# ── Gradio UI ───────────────────────────────────────────────────────────────
DESCRIPTION_MD = """\
# 🍌 PaperBanana

**Turn methodology text into publication-ready architecture diagrams.**

Paste your paper's methodology section + a caption, and PaperBanana's 5-agent pipeline
(Retriever → Planner → Stylist → Visualizer → Critic) will generate a diagram for you.

> Based on [*PaperBanana: Automating Academic Illustration for AI Scientists*](https://arxiv.org/abs/2505.23894) (Zhu et al., NeurIPS 2025).
"""

with gr.Blocks(
    title="PaperBanana",
    theme=gr.themes.Soft(primary_hue="amber", secondary_hue="blue"),
    css="footer { display: none !important; }",
) as demo:
    gr.Markdown(DESCRIPTION_MD)

    # ── Example gallery ─────────────────────────────────────────────────────
    with gr.Accordion("📸 Example outputs (click to expand)", open=False):
        existing = {k: v for k, v in EXAMPLE_IMAGES.items() if Path(v).exists()}
        if existing:
            with gr.Row():
                for name, path in existing.items():
                    with gr.Column(min_width=200):
                        gr.Image(value=path, label=name)

    # ── Inputs ──────────────────────────────────────────────────────────────
    with gr.Row():
        with gr.Column(scale=1):
            methodology_input = gr.Textbox(
                label="Methodology text",
                placeholder="Paste your methodology / model description here…",
                lines=12,
            )
            caption_input = gr.Textbox(
                label="Diagram caption",
                placeholder='e.g. "Architecture of our proposed method"',
                lines=2,
            )
            iterations_slider = gr.Slider(
                minimum=1,
                maximum=3,
                value=2,
                step=1,
                label="Refinement iterations",
                info="More iterations = better quality, slower",
            )
            api_key_input = gr.Textbox(
                label="Gemini API key (optional if set as Space secret)",
                type="password",
                placeholder="AIza…",
            )
            generate_btn = gr.Button("🍌 Generate diagram", variant="primary", size="lg")

        # ── Outputs ─────────────────────────────────────────────────────
        with gr.Column(scale=1):
            output_image = gr.Image(label="Generated diagram", type="filepath")
            output_log = gr.Textbox(label="Pipeline log", lines=18, interactive=False)

    # ── Examples table ──────────────────────────────────────────────────────
    gr.Examples(
        examples=PRESET_EXAMPLES,
        inputs=[methodology_input, caption_input, iterations_slider],
        label="Try a classic paper",
    )

    # ── Wiring ──────────────────────────────────────────────────────────────
    generate_btn.click(
        fn=generate_diagram,
        inputs=[methodology_input, caption_input, iterations_slider, api_key_input],
        outputs=[output_image, output_log],
    )

if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
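Because generate_diagram is a plain Python generator, it can be smoke-tested without launching the Gradio UI. A minimal sketch of such a driver — the file name smoke_test.py is hypothetical and not part of this repo, and it assumes GEMINI_API_KEY is set in the environment and the agents' dependencies are installed:

# smoke_test.py — hypothetical local check, not part of the repo.
import app

last_image = None
for image_path, log_text in app.generate_diagram(
    methodology_text="A toy encoder-decoder model with attention.",
    caption="Toy architecture",
    num_iterations=1,
    api_key=None,  # falls back to the GEMINI_API_KEY env var
):
    last_image = image_path
    print(log_text.splitlines()[-1])  # print only the newest log line

print("Final image:", last_image)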
config.py
ADDED
@@ -0,0 +1,22 @@
"""
Configuration for PaperBanana framework.
"""
import os
from dotenv import load_dotenv

load_dotenv()

# Gemini API Configuration
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Model Configuration
VLM_MODEL = "gemini-3-pro-preview"  # For Retriever, Planner, Stylist, Critic
IMAGE_MODEL = "gemini-3-pro-image-preview"  # For Visualizer (referred to as Nano-Banana-Pro in paper)

# Generation Configuration
MAX_REFINEMENT_ITERATIONS = 3  # As per ablation study
IMAGE_SIZE = "1K"  # Image resolution
THINKING_LEVEL = "HIGH"  # For complex reasoning tasks

# Number of reference examples to retrieve
NUM_REFERENCE_EXAMPLES = 10
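Since config.py calls load_dotenv(), the API key can also live in a local .env file instead of the shell environment or a Space secret. A minimal example with a placeholder value (never commit a real key):

# .env — placeholder value for local runs
GEMINI_API_KEY=your-key-here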
data/spotlight_reference_images/ref_0001_00232_GraphMaster_Automated_Graph_Synthesis_via_LLM_Agents_in_Data-Limited_Environments__fdf13132133da88f7ce9ae4d0a22c29da1f05f75072f95010a29b1392696ea70.jpg
ADDED
data/spotlight_reference_images/ref_0002_00279_VoxDet_Rethinking_3D_Semantic_Scene_Completion_as_Dense_Object_Detection__7b55d87bf0fcf6d787a440d59bf4617e6d73f10f5b1bcc1b45736ad0a7a57911.jpg
ADDED
data/spotlight_reference_images/ref_0003_00491_SQS_Enhancing_Sparse_Perception_Models_via_Query-based_Splatting_in_Autonomous_Driving__f7c6f154dc3e45e2f58f5a9111ebf57ead0d19f9e77340cd3838d881af17a916.jpg
ADDED
data/spotlight_reference_images/ref_0004_00691_ProtInvTree_Deliberate_Protein_Inverse_Folding_with_Reward-guided_Tree_Search__713bbbec11cbef0f1d2c8901e17165fa7db3d1fcf6cfa0d4a8b803d2dccb2ca0.jpg
ADDED
data/spotlight_reference_images/ref_0005_00738_Mulberry_Empowering_MLLM_with_o1-like_Reasoning_and_Reflection_via_Collective_Monte_Carlo_Tree_Search__7c987fc8cf213eb47a038117cc38e5a170938289a390de4b8d7b6cd88512d505.jpg
ADDED
data/spotlight_reference_images/ref_0006_00981_MesaTask_Towards_Task-Driven_Tabletop_Scene_Generation_via_3D_Spatial_Reasoning__ddce4bdb68a1689579a491b5a31349db83e5046fbb5424a45dc0883d4115e7d5.jpg
ADDED (Git LFS)
data/spotlight_reference_images/ref_0007_01003_OmniSync_Towards_Universal_Lip_Synchronization_via_Diffusion_Transformers__28d606ccd79ed54496343219767701efb0f445058d39887c1cf629a800942f77.jpg
ADDED (Git LFS)
data/spotlight_reference_images/ref_0008_01041_Enhancing_Time_Series_Forecasting_through_Selective_Representation_Spaces_A_Patch_Perspective__ea910fb78f4d4027ff7fe9a63cbbc68a7038ee2e9f4250c2f773c631f265b079.jpg
ADDED
data/spotlight_reference_images/ref_0009_01041_Enhancing_Time_Series_Forecasting_through_Selective_Representation_Spaces_A_Patch_Perspective__4cc6ca3e3fa15065b6a1781e9a7f814d67a46649325289b63c9bea4072020f4c.jpg
ADDED
data/spotlight_reference_images/ref_0010_01112_DiCo_Revitalizing_ConvNets_for_Scalable_and_Efficient_Diffusion_Modeling__589d9f3ec341480c16ec00bf41076999cd0ce6c1526b97e71eb9c8ffe33b1a1b.jpg
ADDED
data/spotlight_reference_images/ref_0011_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__b501bcad7830654e421726b37ea0d89207d68c72f0f6aa9179fec65e0c71b205.jpg
ADDED (Git LFS)
data/spotlight_reference_images/ref_0012_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__fb4694d88a5097ee72a936634a832ced85d11e90079a4fabc3f1b8c28f24e5d6.jpg
ADDED
data/spotlight_reference_images/ref_0013_01620_FutureSightDrive_Thinking_Visually_with_Spatio-Temporal_CoT_for_Autonomous_Driving__8a4bffe9a69ef0d2bc0ced518bccdb3146d38727d7fc7402a418b6227a78bcfb.jpg
ADDED
data/spotlight_reference_images/ref_0014_01659_G-Memory_Tracing_Hierarchical_Memory_for_Multi-Agent_Systems__48772f699bccd9ecf7285d9f2c4af85d34d60b7ee6b2cbd681278611869db12b.jpg
ADDED (Git LFS)
data/spotlight_reference_images/ref_0015_01839_Mesh-RFT_Enhancing_Mesh_Generation_via_Fine-grained_Reinforcement_Fine-Tuning__4d93550a61cdae636652e7cd1ca974c21c7fca1e8eb6eaa6fa2e03100f1d0f68.jpg
ADDED
data/spotlight_reference_images/ref_0016_01864_Jacobian-Based_Interpretation_of_Nonlinear_Neural_Encoding_Model__c9ac984c977d21aff284dcfbacf7bdfea345cc73096a3f28d624b026654e1740.jpg
ADDED
data/spotlight_reference_images/ref_0017_02109_OnlineSplatter_Pose-Free_Online_3D_Reconstruction_for_Free-Moving_Objects__abe4e12f0fc9a7487c6c5774f5ead6b391ede9f832960b5ce462ceb786534a0d.jpg
ADDED
data/spotlight_reference_images/ref_0018_02239_RobustMerge_Parameter-Efficient_Model_Merging_for_MLLMs_with_Direction_Robustness__45b74fc957d305067e6f46f00bc32c9ee0889b7dc7e18b4c46913d263c1d8c16.jpg
ADDED
data/spotlight_reference_images/ref_0019_02373_MDReID_Modality-Decoupled_Learning_for_Any-to-Any_Multi-Modal_Object_Re-Identification__517a2ed3f7dd16e048d526da7807ac8b1b73cdb062b79021b7047dd0b467ba9a.jpg
ADDED (Git LFS)
data/spotlight_reference_images/ref_0020_03077_Toward_Relative_Positional_Encoding_in_Spiking_Transformers__bdb178031ec8d263c3a5388d900e974b8ce39ae5759a6611f688a57e22173fa5.jpg
ADDED (Git LFS)
data/spotlight_reference_images/ref_0021_03670_Neural_Atlas_Graphs_for_Dynamic_Scene_Decomposition_and_Editing__16f8fc865baed696e798502de56f3b473dbf2b0b6aa3c1286f384de53c524b97.jpg
ADDED
data/spotlight_reference_images/ref_0022_03671_STITCH-OPE_Trajectory_Stitching_with_Guided_Diffusion_for_Off-Policy_Evaluation__6e9c875dbc74a8b39bc947de3b928f8a29f7c6db9b03d726a62a961ba0c2fdd3.jpg
ADDED
data/spotlight_reference_images/ref_0023_04013_scMRDR_A_scalable_and_flexible_framework_for_unpaired_single-cell_multi-omics_data_integration__85cde1b5a410b0d3275a5c0fa81dfe69e2c83c485c9917c296502a6485e2f68b.jpg
ADDED
data/spotlight_reference_images/ref_0024_04165_Transformer_Copilot_Learning_from_The_Mistake_Log_in_LLM_Fine-tuning__8d1e804f51825d9760a37b8d1ca027a61deecc5e33fc172f0c1789814527c37e.jpg
ADDED
data/spotlight_reference_images/ref_0025_04571_GeRaF_Neural_Geometry_Reconstruction_from_Radio_Frequency_Signals__e096c21c76e1eb88c8d865f73e832e8e9246cddec17f62b5d85bc051caa55165.jpg
ADDED
data/spotlight_reference_images/ref_0026_04647_HopaDIFF_Holistic-Partial_Aware_Fourier_Conditioned_Diffusion_for_Referring_Human_Action_Segmentation_in_Multi-Person_Sc__0e15e7e99b52ccbc157e3ee04a6de2238b23ccd60543102fc2a70eddb12e5e41.jpg
ADDED
data/spotlight_reference_images/ref_0027_04717_CSBrain_A_Cross-scale_Spatiotemporal_Brain_Foundation_Model_for_EEG_Decoding__327e1ef5f11138cef78e3dd270f09eea700aedb3c5a64cf848b125d13e4e5f08.jpg
ADDED
data/spotlight_reference_images/ref_0028_05129_Learning_to_Factorize_Spatio-Temporal_Foundation_Models__ef11d1775e9863839bc4dfaf8711bf474ef80365303220ba92781b579879e7a5.jpg
ADDED
data/spotlight_reference_images/ref_0029_05428_EDELINE_Enhancing_Memory_in_Diffusion-based_World_Models_via_Linear-Time_Sequence_Modeling__57df86687acccd0bad49997d26a4a442d6d09e2381e5a570d1d1e7efd02cf303.jpg
ADDED
data/spotlight_reference_images/ref_0030_05467_Vision-centric_Token_Compression_in_Large_Language_Model__056bbf059f83c91ea896c610cef2927606ab780d910996e6cdb293dfaca40ddd.jpg
ADDED
data/spotlight_reference_images/ref_0031_05610_Repo2Run_Automated_Building_Executable_Environment_for_Code_Repository_at_Scale__ea96f359e23ff3f0427d48dd3247314967bb531150d725a7680376b940324680.jpg
ADDED
data/spotlight_reference_images/ref_0032_05774_Shallow_Diffuse_Robust_and_Invisible_Watermarking_through_Low-Dim_Subspaces_in_Diffusion_Models__703f7602f642aa354858ee8cf929888d672a45001a93ac0cb937cd0f4f1b62de.jpg
ADDED
data/spotlight_reference_images/ref_0033_05814_Mozart_Modularized_and_Efficient_MoE_Training_on_35D_Wafer-Scale_Chiplet_Architectures__dc6b73dd98f93241717e3b658c319f70ab6ad6188c24a147cd052fb2153d656d.jpg
ADDED
data/spotlight_reference_images/ref_0034_06044_Theory-Driven_Label-Specific_Representation_for_Incomplete_Multi-View_Multi-Label_Learning__4d4bbb3c5cd4edb56f73502b7eac2b526f30bfc883b337d949cb38ee0747ee22.jpg
ADDED
data/spotlight_reference_images/ref_0035_06067_EAG3R_Event-Augmented_3D_Geometry_Estimation_for_Dynamic_and_Extreme-Lighting_Scenes__8c0d4df1862409d631870861dc2f047a0cd2572e87267d0d3ea58b6c245408fe.jpg
ADDED
data/spotlight_reference_images/ref_0036_06067_EAG3R_Event-Augmented_3D_Geometry_Estimation_for_Dynamic_and_Extreme-Lighting_Scenes__a50d002b445446ba0c687045ba485e4c96aaf49765198b3f53fb954327df6f4f.jpg
ADDED
data/spotlight_reference_images/ref_0037_06507_CausalPFN_Amortized_Causal_Effect_Estimation_via_In-Context_Learning__8c660a9d9ad153e854bc67151e7df9977e2244fa4ce9bd649c2b08b58db2e30c.jpg
ADDED
data/spotlight_reference_images/ref_0038_06527_Robust_Graph_Condensation_via_Classification_Complexity_Mitigation__dedb33c198673910da24a5a2a5794a8228afce42d7b46ff594dab0cac9ee61e0.jpg
ADDED (Git LFS)