Deploy PaperBanana app

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full changeset.
- .gitattributes +37 -0
- Dockerfile +27 -0
- README.md +6 -5
- aesthetic_guidelines.py +64 -0
- agents/__init__.py +17 -0
- agents/critic.py +234 -0
- agents/planner.py +117 -0
- agents/retriever.py +151 -0
- agents/stylist.py +104 -0
- agents/visualizer.py +199 -0
- app.py +283 -0
- config.py +22 -0
- data/spotlight_reference_images/ref_0001_00232_GraphMaster_Automated_Graph_Synthesis_via_LLM_Agents_in_Data-Limited_Environments__fdf13132133da88f7ce9ae4d0a22c29da1f05f75072f95010a29b1392696ea70.jpg +0 -0
- data/spotlight_reference_images/ref_0002_00279_VoxDet_Rethinking_3D_Semantic_Scene_Completion_as_Dense_Object_Detection__7b55d87bf0fcf6d787a440d59bf4617e6d73f10f5b1bcc1b45736ad0a7a57911.jpg +0 -0
- data/spotlight_reference_images/ref_0003_00491_SQS_Enhancing_Sparse_Perception_Models_via_Query-based_Splatting_in_Autonomous_Driving__f7c6f154dc3e45e2f58f5a9111ebf57ead0d19f9e77340cd3838d881af17a916.jpg +0 -0
- data/spotlight_reference_images/ref_0004_00691_ProtInvTree_Deliberate_Protein_Inverse_Folding_with_Reward-guided_Tree_Search__713bbbec11cbef0f1d2c8901e17165fa7db3d1fcf6cfa0d4a8b803d2dccb2ca0.jpg +0 -0
- data/spotlight_reference_images/ref_0005_00738_Mulberry_Empowering_MLLM_with_o1-like_Reasoning_and_Reflection_via_Collective_Monte_Carlo_Tree_Search__7c987fc8cf213eb47a038117cc38e5a170938289a390de4b8d7b6cd88512d505.jpg +0 -0
- data/spotlight_reference_images/ref_0006_00981_MesaTask_Towards_Task-Driven_Tabletop_Scene_Generation_via_3D_Spatial_Reasoning__ddce4bdb68a1689579a491b5a31349db83e5046fbb5424a45dc0883d4115e7d5.jpg +3 -0
- data/spotlight_reference_images/ref_0007_01003_OmniSync_Towards_Universal_Lip_Synchronization_via_Diffusion_Transformers__28d606ccd79ed54496343219767701efb0f445058d39887c1cf629a800942f77.jpg +3 -0
- data/spotlight_reference_images/ref_0008_01041_Enhancing_Time_Series_Forecasting_through_Selective_Representation_Spaces_A_Patch_Perspective__ea910fb78f4d4027ff7fe9a63cbbc68a7038ee2e9f4250c2f773c631f265b079.jpg +0 -0
- data/spotlight_reference_images/ref_0009_01041_Enhancing_Time_Series_Forecasting_through_Selective_Representation_Spaces_A_Patch_Perspective__4cc6ca3e3fa15065b6a1781e9a7f814d67a46649325289b63c9bea4072020f4c.jpg +0 -0
- data/spotlight_reference_images/ref_0010_01112_DiCo_Revitalizing_ConvNets_for_Scalable_and_Efficient_Diffusion_Modeling__589d9f3ec341480c16ec00bf41076999cd0ce6c1526b97e71eb9c8ffe33b1a1b.jpg +0 -0
- data/spotlight_reference_images/ref_0011_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__b501bcad7830654e421726b37ea0d89207d68c72f0f6aa9179fec65e0c71b205.jpg +3 -0
- data/spotlight_reference_images/ref_0012_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__fb4694d88a5097ee72a936634a832ced85d11e90079a4fabc3f1b8c28f24e5d6.jpg +0 -0
- data/spotlight_reference_images/ref_0013_01620_FutureSightDrive_Thinking_Visually_with_Spatio-Temporal_CoT_for_Autonomous_Driving__8a4bffe9a69ef0d2bc0ced518bccdb3146d38727d7fc7402a418b6227a78bcfb.jpg +0 -0
- data/spotlight_reference_images/ref_0014_01659_G-Memory_Tracing_Hierarchical_Memory_for_Multi-Agent_Systems__48772f699bccd9ecf7285d9f2c4af85d34d60b7ee6b2cbd681278611869db12b.jpg +3 -0
- data/spotlight_reference_images/ref_0015_01839_Mesh-RFT_Enhancing_Mesh_Generation_via_Fine-grained_Reinforcement_Fine-Tuning__4d93550a61cdae636652e7cd1ca974c21c7fca1e8eb6eaa6fa2e03100f1d0f68.jpg +0 -0
- data/spotlight_reference_images/ref_0016_01864_Jacobian-Based_Interpretation_of_Nonlinear_Neural_Encoding_Model__c9ac984c977d21aff284dcfbacf7bdfea345cc73096a3f28d624b026654e1740.jpg +0 -0
- data/spotlight_reference_images/ref_0017_02109_OnlineSplatter_Pose-Free_Online_3D_Reconstruction_for_Free-Moving_Objects__abe4e12f0fc9a7487c6c5774f5ead6b391ede9f832960b5ce462ceb786534a0d.jpg +0 -0
- data/spotlight_reference_images/ref_0018_02239_RobustMerge_Parameter-Efficient_Model_Merging_for_MLLMs_with_Direction_Robustness__45b74fc957d305067e6f46f00bc32c9ee0889b7dc7e18b4c46913d263c1d8c16.jpg +0 -0
- data/spotlight_reference_images/ref_0019_02373_MDReID_Modality-Decoupled_Learning_for_Any-to-Any_Multi-Modal_Object_Re-Identification__517a2ed3f7dd16e048d526da7807ac8b1b73cdb062b79021b7047dd0b467ba9a.jpg +3 -0
- data/spotlight_reference_images/ref_0020_03077_Toward_Relative_Positional_Encoding_in_Spiking_Transformers__bdb178031ec8d263c3a5388d900e974b8ce39ae5759a6611f688a57e22173fa5.jpg +3 -0
- data/spotlight_reference_images/ref_0021_03670_Neural_Atlas_Graphs_for_Dynamic_Scene_Decomposition_and_Editing__16f8fc865baed696e798502de56f3b473dbf2b0b6aa3c1286f384de53c524b97.jpg +0 -0
- data/spotlight_reference_images/ref_0022_03671_STITCH-OPE_Trajectory_Stitching_with_Guided_Diffusion_for_Off-Policy_Evaluation__6e9c875dbc74a8b39bc947de3b928f8a29f7c6db9b03d726a62a961ba0c2fdd3.jpg +0 -0
- data/spotlight_reference_images/ref_0023_04013_scMRDR_A_scalable_and_flexible_framework_for_unpaired_single-cell_multi-omics_data_integration__85cde1b5a410b0d3275a5c0fa81dfe69e2c83c485c9917c296502a6485e2f68b.jpg +0 -0
- data/spotlight_reference_images/ref_0024_04165_Transformer_Copilot_Learning_from_The_Mistake_Log_in_LLM_Fine-tuning__8d1e804f51825d9760a37b8d1ca027a61deecc5e33fc172f0c1789814527c37e.jpg +0 -0
- data/spotlight_reference_images/ref_0025_04571_GeRaF_Neural_Geometry_Reconstruction_from_Radio_Frequency_Signals__e096c21c76e1eb88c8d865f73e832e8e9246cddec17f62b5d85bc051caa55165.jpg +0 -0
- data/spotlight_reference_images/ref_0026_04647_HopaDIFF_Holistic-Partial_Aware_Fourier_Conditioned_Diffusion_for_Referring_Human_Action_Segmentation_in_Multi-Person_Sc__0e15e7e99b52ccbc157e3ee04a6de2238b23ccd60543102fc2a70eddb12e5e41.jpg +0 -0
- data/spotlight_reference_images/ref_0027_04717_CSBrain_A_Cross-scale_Spatiotemporal_Brain_Foundation_Model_for_EEG_Decoding__327e1ef5f11138cef78e3dd270f09eea700aedb3c5a64cf848b125d13e4e5f08.jpg +0 -0
- data/spotlight_reference_images/ref_0028_05129_Learning_to_Factorize_Spatio-Temporal_Foundation_Models__ef11d1775e9863839bc4dfaf8711bf474ef80365303220ba92781b579879e7a5.jpg +0 -0
- data/spotlight_reference_images/ref_0029_05428_EDELINE_Enhancing_Memory_in_Diffusion-based_World_Models_via_Linear-Time_Sequence_Modeling__57df86687acccd0bad49997d26a4a442d6d09e2381e5a570d1d1e7efd02cf303.jpg +0 -0
- data/spotlight_reference_images/ref_0030_05467_Vision-centric_Token_Compression_in_Large_Language_Model__056bbf059f83c91ea896c610cef2927606ab780d910996e6cdb293dfaca40ddd.jpg +0 -0
- data/spotlight_reference_images/ref_0031_05610_Repo2Run_Automated_Building_Executable_Environment_for_Code_Repository_at_Scale__ea96f359e23ff3f0427d48dd3247314967bb531150d725a7680376b940324680.jpg +0 -0
- data/spotlight_reference_images/ref_0032_05774_Shallow_Diffuse_Robust_and_Invisible_Watermarking_through_Low-Dim_Subspaces_in_Diffusion_Models__703f7602f642aa354858ee8cf929888d672a45001a93ac0cb937cd0f4f1b62de.jpg +0 -0
- data/spotlight_reference_images/ref_0033_05814_Mozart_Modularized_and_Efficient_MoE_Training_on_35D_Wafer-Scale_Chiplet_Architectures__dc6b73dd98f93241717e3b658c319f70ab6ad6188c24a147cd052fb2153d656d.jpg +0 -0
- data/spotlight_reference_images/ref_0034_06044_Theory-Driven_Label-Specific_Representation_for_Incomplete_Multi-View_Multi-Label_Learning__4d4bbb3c5cd4edb56f73502b7eac2b526f30bfc883b337d949cb38ee0747ee22.jpg +0 -0
- data/spotlight_reference_images/ref_0035_06067_EAG3R_Event-Augmented_3D_Geometry_Estimation_for_Dynamic_and_Extreme-Lighting_Scenes__8c0d4df1862409d631870861dc2f047a0cd2572e87267d0d3ea58b6c245408fe.jpg +0 -0
- data/spotlight_reference_images/ref_0036_06067_EAG3R_Event-Augmented_3D_Geometry_Estimation_for_Dynamic_and_Extreme-Lighting_Scenes__a50d002b445446ba0c687045ba485e4c96aaf49765198b3f53fb954327df6f4f.jpg +0 -0
- data/spotlight_reference_images/ref_0037_06507_CausalPFN_Amortized_Causal_Effect_Estimation_via_In-Context_Learning__8c660a9d9ad153e854bc67151e7df9977e2244fa4ce9bd649c2b08b58db2e30c.jpg +0 -0
- data/spotlight_reference_images/ref_0038_06527_Robust_Graph_Condensation_via_Classification_Complexity_Mitigation__dedb33c198673910da24a5a2a5794a8228afce42d7b46ff594dab0cac9ee61e0.jpg +3 -0
.gitattributes
CHANGED

@@ -33,3 +33,40 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0006_00981_MesaTask_Towards_Task-Driven_Tabletop_Scene_Generation_via_3D_Spatial_Reasoning__ddce4bdb68a1689579a491b5a31349db83e5046fbb5424a45dc0883d4115e7d5.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0007_01003_OmniSync_Towards_Universal_Lip_Synchronization_via_Diffusion_Transformers__28d606ccd79ed54496343219767701efb0f445058d39887c1cf629a800942f77.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0011_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__b501bcad7830654e421726b37ea0d89207d68c72f0f6aa9179fec65e0c71b205.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0014_01659_G-Memory_Tracing_Hierarchical_Memory_for_Multi-Agent_Systems__48772f699bccd9ecf7285d9f2c4af85d34d60b7ee6b2cbd681278611869db12b.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0019_02373_MDReID_Modality-Decoupled_Learning_for_Any-to-Any_Multi-Modal_Object_Re-Identification__517a2ed3f7dd16e048d526da7807ac8b1b73cdb062b79021b7047dd0b467ba9a.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0020_03077_Toward_Relative_Positional_Encoding_in_Spiking_Transformers__bdb178031ec8d263c3a5388d900e974b8ce39ae5759a6611f688a57e22173fa5.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0038_06527_Robust_Graph_Condensation_via_Classification_Complexity_Mitigation__dedb33c198673910da24a5a2a5794a8228afce42d7b46ff594dab0cac9ee61e0.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0040_06606_Disentangled_Concepts_Speak_Louder_Than_Words_Explainable_Video_Action_Recognition__fe96a76b160d3861e188cfe5511fee2d4f07eada1ebf92ade017048a3362d5b8.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0041_07858_HYPERION_Fine-Grained_Hypersphere_Alignment_for_Robust_Federated_Graph_Learning__be29a99497ec2dd4d3a8993ce3edc85c87505a1cabfadc2df234b7e326633ebc.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0043_08772_RepoMaster_Autonomous_Exploration_and_Understanding_of_GitHub_Repositories_for_Complex_Task_Solving__c5102f7309c920d53df2307418ef99304d083aa3f54140e3fa2e55c6b259378b.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0046_09315_Personalized_Decision_Modeling_Utility_Optimization_or_Textualized-Symbolic_Reasoning__56df140d7973f4f1a6286c7cebec84068dc6828711cea2e455416a0d9d381a99.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0048_09629_4DGT_Learning_a_4D_Gaussian_Transformer_Using_Real-World_Monocular_Videos__64516a621163af326843f0152bd0cdb8f798d2df70242271249a95e572c7a300.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0052_10520_TrajMamba_An_Efficient_and_Semantic-rich_Vehicle_Trajectory_Pre-training_Model__02d2a7b7aa1ad60cce35445c46fdcb453afa273d8a170ce0abe33ab2c8c6f245.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0058_13240_A_machine_learning_approach_that_beats_Rubiks_cubes__aec92a7999c868664250d8e9aad60b03dbacabd440355bec73af28c512c9d18a.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0060_14126_Deno-IF_Unsupervised_Noisy_Visible_and_Infrared_Image_Fusion_Method__685d5064d5b82a4e2e38976afb4b02e3359ccff154e8231646e76cb16970b7a0.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0070_15841_Co-Reinforcement_Learning_for_Unified_Multimodal_Understanding_and_Generation__8d1bdeb48a8ecdf31ace6493caea90ec34e8e10428d5b91397cd5531c2b33b09.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0074_16584_IA-GGAD_Zero-shot_Generalist_Graph_Anomaly_Detection_via_Invariant_and_Affinity_Learning__2be0842b017b1925d07b25f0276d02ffabc8a00fe4e73cc930b0ee0096fcfd40.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0079_19455_Wide-Horizon_Thinking_and_Simulation-Based_Evaluation_for_Real-World_LLM_Planning_with_Multifaceted_Constraints__198c85b432b1ddbf1afc76ab8c98e057973c4763acbaa0f16a4da51d97460935.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0087_22755_Breaking_the_Batch_Barrier_B3_of_Contrastive_Learning_via_Smart_Batch_Mining__588e6566b5416ccc18d5f5733612cbc3caa0c11fe2e3bc5c57c32c88a9ec2e41.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0095_26919_FUDOKI_Discrete_Flow-based_Unified_Understanding_and_Generation_via_Kinetic-Optimal_Velocities__9c0a452656594ea3134b9cdcb16988663e9015013c42419254bc35661139b69f.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0096_26975_LogicTree_Improving_Complex_Reasoning_of_LLMs_via_Instantiated_Multi-step_Synthetic_Logical_Data__75918e90d782aa4c0011abbf0fe69a93e2595315a84e5dd3ba23b1cddfb672b5.jpg filter=lfs diff=lfs merge=lfs -text
+data/spotlight_reference_images/ref_0098_27155_DexFlyWheel_A_Scalable_and_Self-improving_Data_Generation_Framework_for_Dexterous_Manipulation__6cb5d9f8f05d11ff6e6bf4f69015de5ad79051d577633793092cf0b753f0d1aa.jpg filter=lfs diff=lfs merge=lfs -text
+examples/basic_example_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/basic_example_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/basic_example_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/neurips_refs_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/neurips_refs_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/neurips_refs_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/ddpm_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/ddpm_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/ddpm_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/resnet_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/resnet_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/resnet_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/transformer_iter1_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/transformer_iter2_0.jpg filter=lfs diff=lfs merge=lfs -text
+examples/readme/transformer_iter3_0.jpg filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED

@@ -0,0 +1,27 @@
+FROM python:3.10-slim
+
+# System deps
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create non-root user (HF Spaces requirement)
+RUN useradd -m -u 1000 user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+WORKDIR /app
+
+# Install Python deps first (cache layer)
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+
+# Copy app code
+COPY --chown=user . .
+
+USER user
+
+EXPOSE 7860
+
+CMD ["python", "app.py"]
README.md
CHANGED

@@ -1,10 +1,11 @@
 ---
 title: PaperBanana
-emoji:
-colorFrom:
+emoji: 🍌
+colorFrom: yellow
 colorTo: yellow
 sdk: docker
-
+app_file: app.py
+pinned: true
+license: mit
+short_description: Methodology text to architecture diagrams
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
aesthetic_guidelines.py
ADDED

@@ -0,0 +1,64 @@
+"""
+Aesthetic Guidelines (G) for academic illustration styling.
+Based on Appendix F of the PaperBanana paper.
+"""
+
+AESTHETIC_GUIDELINE = """
+# Academic Illustration Style Guide (NeurIPS Style)
+
+## Color Palette
+- **Overall Aesthetic:** Soft Tech & Scientific Pastels ("NeurIPS Look")
+- **Background Colors:** Cream (#FFF8E7), Pale Blue (#E3F2FD), Mint (#E8F5E9)
+- **Accent Colors:**
+  - Soft Blue (#64B5F6) for primary processes
+  - Soft Orange (#FFB74D) for secondary/iterative processes
+  - Soft Purple (#9575CD) for highlighting key components
+  - Soft Green (#81C784) for success/outputs
+- **Use color to group logical components**
+
+## Shapes and Components
+- **Process Boxes:** Rounded rectangles with subtle shadows
+- **Data/Tensors:** 3D stacks or layered rectangles
+- **Databases/Storage:** Cylinders or drum shapes
+- **Agents/Models:** Robot or brain icons with labels
+- **Inputs/Outputs:** Parallelograms or cloud shapes
+
+## Lines and Arrows
+- **Network/Architecture Diagrams:** Orthogonal/Elbow connectors
+- **Logic Flow:** Curved arrows for feedback loops
+- **Data Flow:** Straight arrows with clear directionality
+- **Arrow Styles:** Solid for primary flow, dashed for optional/conditional
+
+## Typography
+- **Labels:** Sans-serif fonts (Arial, Roboto, Helvetica)
+- **Mathematical Variables:** Serif Italic (Times New Roman) - use LaTeX notation (e.g., $P$, $P^*$)
+- **Font Sizes:**
+  - Main labels: 12-14pt
+  - Subscript/technical: 10pt
+  - Section headers: 16pt bold
+
+## Layout Principles
+- **Hierarchy:** Left-to-right or top-to-bottom flow
+- **Grouping:** Use containers/boxes with subtle backgrounds to group related components
+- **Spacing:** Generous whitespace, consistent padding
+- **Alignment:** Grid-based layout, aligned elements
+- **Balance:** Visual weight distributed evenly
+
+## Technical Details
+- **Line Weight:** 1.5-2pt for main elements, 1pt for details
+- **Corner Radius:** 8-12px for rounded rectangles
+- **Shadow:** Subtle drop shadow (opacity 10-20%)
+- **Icons:** Simple, consistent style throughout
+
+## Diagram-Specific Guidelines
+### Architecture Diagrams
+- Show clear input → process → output flow
+- Use containers to separate phases/stages
+- Include feedback loops where applicable
+
+### Methodology Diagrams
+- Emphasize the pipeline structure
+- Show agent interactions clearly
+- Use consistent icons for similar components
+- Annotate with mathematical notation where relevant
+"""
agents/__init__.py
ADDED

@@ -0,0 +1,17 @@
+"""
+Agents package for PaperBanana framework.
+"""
+
+from .retriever import RetrieverAgent
+from .planner import PlannerAgent
+from .stylist import StylistAgent
+from .visualizer import VisualizerAgent
+from .critic import CriticAgent
+
+__all__ = [
+    'RetrieverAgent',
+    'PlannerAgent',
+    'StylistAgent',
+    'VisualizerAgent',
+    'CriticAgent'
+]
agents/critic.py
ADDED

@@ -0,0 +1,234 @@
+"""
+Critic Agent for PaperBanana framework.
+
+Forms closed-loop refinement mechanism by identifying factual misalignments
+or visual glitches and providing feedback for iterative improvement.
+"""
+import os
+from typing import Any, Dict, List
+from google import genai
+from google.genai import types
+import config
+
+
+class CriticAgent:
+    """
+    Critic Agent: Provides iterative feedback for refinement.
+
+    Identifies factual misalignments, visual glitches, and areas for improvement
+    in generated illustrations, enabling closed-loop refinement.
+    """
+
+    def __init__(self):
+        """Initialize Critic Agent."""
+        self.client = genai.Client(api_key=config.GEMINI_API_KEY)
+        self.model = config.VLM_MODEL
+
+    def critique(self,
+                 methodology_text: str,
+                 caption: str,
+                 current_description: str,
+                 generated_image_path: str = None,
+                 iteration: int = 1) -> Dict[str, Any]:
+        """
+        Provide critique and feedback on current illustration.
+
+        Args:
+            methodology_text: Original methodology description
+            caption: Target diagram caption
+            current_description: Current textual description
+            generated_image_path: Path to generated image (if available)
+            iteration: Current iteration number
+
+        Returns:
+            Dictionary containing:
+            - 'feedback': Textual feedback
+            - 'issues': List of identified issues
+            - 'suggestions': List of improvement suggestions
+            - 'should_continue': Boolean indicating if refinement should continue
+        """
+        prompt = self._create_critique_prompt(
+            methodology_text,
+            caption,
+            current_description,
+            iteration
+        )
+
+        contents = [
+            types.Content(
+                role="user",
+                parts=[types.Part.from_text(text=prompt)]
+            )
+        ]
+
+        # If we have an image, we could add it to the critique (future enhancement)
+        # For now, we critique based on the description
+
+        generate_config = types.GenerateContentConfig(
+            thinking_config=types.ThinkingConfig(
+                thinking_level=config.THINKING_LEVEL
+            )
+        )
+
+        critique_text = ""
+        for chunk in self.client.models.generate_content_stream(
+            model=self.model,
+            contents=contents,
+            config=generate_config
+        ):
+            critique_text += chunk.text
+
+        # Parse critique into structured feedback
+        result = self._parse_critique(critique_text, iteration)
+
+        return result
+
+    def _create_critique_prompt(self,
+                                methodology_text: str,
+                                caption: str,
+                                current_description: str,
+                                iteration: int) -> str:
+        """Create prompt for critique generation."""
+        prompt = f"""You are an expert reviewer of academic illustrations, specializing in methodology diagrams.
+
+Your task is to critically evaluate a textual description for an academic diagram and provide constructive feedback.
+
+ORIGINAL METHODOLOGY:
+{methodology_text}
+
+TARGET CAPTION:
+{caption}
+
+CURRENT ILLUSTRATION DESCRIPTION (Iteration {iteration}):
+{current_description}
+
+EVALUATION CRITERIA:
+
+1. **Faithfulness**: Does the description accurately represent all key aspects of the methodology?
+   - Are all important components mentioned?
+   - Is the flow/logic correctly represented?
+   - Are there any factual errors or misrepresentations?
+
+2. **Conciseness**: Is the description appropriately detailed without being cluttered?
+   - Is information density appropriate?
+   - Are there redundant elements?
+   - Is anything unnecessarily complex?
+
+3. **Readability**: Will the resulting diagram be easy to understand?
+   - Is the layout logical?
+   - Are labels clear and informative?
+   - Is visual hierarchy appropriate?
+
+4. **Aesthetics**: Does the description specify professional visual design?
+   - Are colors, shapes, and typography well-defined?
+   - Is there visual consistency?
+   - Does it match academic publication standards?
+
+YOUR TASK:
+Provide a structured critique covering:
+
+ISSUES FOUND:
+- List specific problems (e.g., "Missing connection between X and Y")
+- Rate severity: CRITICAL, MAJOR, or MINOR
+
+SUGGESTIONS FOR IMPROVEMENT:
+- Provide concrete, actionable suggestions
+- Prioritize by impact
+
+OVERALL ASSESSMENT:
+- Is this ready for visualization, or does it need refinement?
+- If iteration {iteration} < 3, should we continue refining?
+
+OUTPUT FORMAT:
+Structure your response as:
+
+ISSUES:
+1. [SEVERITY] Issue description
+2. [SEVERITY] Issue description
+...
+
+SUGGESTIONS:
+1. Specific suggestion
+2. Specific suggestion
+...
+
+DECISION: [READY / NEEDS_REFINEMENT]
+REASONING: Brief explanation of the decision
+"""
+        return prompt
+
+    def _parse_critique(self, critique_text: str, iteration: int) -> Dict:
+        """Parse critique text into structured format."""
+        issues = []
+        suggestions = []
+        should_continue = True
+
+        # Simple parsing - look for key sections
+        lines = critique_text.split('\n')
+        current_section = None
+
+        for line in lines:
+            line_upper = line.upper().strip()
+
+            if 'ISSUES:' in line_upper:
+                current_section = 'issues'
+                continue
+            elif 'SUGGESTIONS:' in line_upper or 'SUGGESTION' in line_upper:
+                current_section = 'suggestions'
+                continue
+            elif 'DECISION:' in line_upper:
+                current_section = 'decision'
+                if 'READY' in line_upper and 'NEEDS_REFINEMENT' not in line_upper:
+                    should_continue = False
+                continue
+
+            # Parse content
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+
+            if current_section == 'issues' and (line.startswith('-') or line[0].isdigit()):
+                issues.append(line.lstrip('-').lstrip('0123456789.').strip())
+            elif current_section == 'suggestions' and (line.startswith('-') or line[0].isdigit()):
+                suggestions.append(line.lstrip('-').lstrip('0123456789.').strip())
+
+        # Don't continue past max iterations
+        if iteration >= config.MAX_REFINEMENT_ITERATIONS:
+            should_continue = False
+
+        return {
+            'feedback': critique_text,
+            'issues': issues,
+            'suggestions': suggestions,
+            'should_continue': should_continue
+        }
+
+    def generate_refinement_prompt(self,
+                                   original_description: str,
+                                   critique: Dict) -> str:
+        """
+        Generate prompt for refinement based on critique.
+
+        Args:
+            original_description: Current description
+            critique: Critique dictionary from critique()
+
+        Returns:
+            Prompt for Planner to refine the description
+        """
+        issues_text = "\n".join([f"- {issue}" for issue in critique['issues']])
+        suggestions_text = "\n".join([f"- {sug}" for sug in critique['suggestions']])
+
+        refinement_prompt = f"""CURRENT DESCRIPTION:
+{original_description}
+
+IDENTIFIED ISSUES:
+{issues_text}
+
+SUGGESTIONS FOR IMPROVEMENT:
+{suggestions_text}
+
+Please revise the description to address these issues and incorporate the suggestions.
+Maintain all correct elements while fixing the identified problems.
+"""
+        return refinement_prompt
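To make the closed loop concrete, here is a minimal sketch of how CriticAgent might drive refinement together with PlannerAgent. It assumes a valid GEMINI_API_KEY and MAX_REFINEMENT_ITERATIONS in config; the methodology and caption strings are placeholders, and feeding the refinement prompt back through PlannerAgent.plan() is one plausible wiring, not necessarily what app.py does:

from agents import CriticAgent, PlannerAgent

planner = PlannerAgent()
critic = CriticAgent()

methodology = "Stage one encodes the input; stage two decodes with attention."  # placeholder
caption = "Figure 1: Overview of the proposed two-stage pipeline."              # placeholder

description = planner.plan(methodology, caption)
iteration = 1
while True:
    review = critic.critique(methodology, caption, description, iteration=iteration)
    if not review['should_continue']:  # READY, or max iterations reached
        break
    # Turn the structured feedback into a revision request for the Planner
    description = planner.plan(critic.generate_refinement_prompt(description, review), caption)
    iteration += 1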
agents/planner.py
ADDED

@@ -0,0 +1,117 @@
+"""
+Planner Agent for PaperBanana framework.
+
+Serves as the cognitive core. Translates unstructured methodology data
+into comprehensive textual description of the target illustration.
+"""
+import os
+from typing import List, Dict, Any
+from google import genai
+from google.genai import types
+import config
+
+
+class PlannerAgent:
+    """
+    Planner Agent: Translates methodology into comprehensive illustration description.
+
+    The cognitive core that interprets source context S and communicative intent C,
+    then produces detailed textual description P of the target illustration.
+    """
+
+    def __init__(self):
+        """Initialize Planner Agent."""
+        self.client = genai.Client(api_key=config.GEMINI_API_KEY)
+        self.model = config.VLM_MODEL
+
+    def plan(self,
+             methodology_text: str,
+             caption: str,
+             reference_examples: List[Dict[str, Any]] = None) -> str:
+        """
+        Generate comprehensive textual description of target illustration.
+
+        Args:
+            methodology_text: Source methodology description (S)
+            caption: Diagram caption (part of C)
+            reference_examples: Retrieved reference examples (E)
+
+        Returns:
+            Detailed textual description P of the illustration
+        """
+        prompt = self._create_planning_prompt(methodology_text, caption, reference_examples)
+
+        contents = [
+            types.Content(
+                role="user",
+                parts=[types.Part.from_text(text=prompt)]
+            )
+        ]
+
+        generate_config = types.GenerateContentConfig(
+            thinking_config=types.ThinkingConfig(
+                thinking_level=config.THINKING_LEVEL
+            )
+        )
+
+        description = ""
+        for chunk in self.client.models.generate_content_stream(
+            model=self.model,
+            contents=contents,
+            config=generate_config
+        ):
+            description += chunk.text
+
+        return description.strip()
+
+    def _create_planning_prompt(self,
+                                methodology_text: str,
+                                caption: str,
+                                reference_examples: List[Dict[str, Any]] = None) -> str:
+        """Create prompt for generating illustration description."""
+
+        # Include reference examples if available
+        reference_context = ""
+        if reference_examples:
+            reference_context = "\n\nREFERENCE EXAMPLES (for inspiration):\n"
+            for i, ref in enumerate(reference_examples[:3], 1):  # Use top 3
+                reference_context += f"\nExample {i}:\n"
+                reference_context += f"Domain: {ref.get('domain', 'N/A')}\n"
+                reference_context += f"Type: {ref.get('diagram_type', 'N/A')}\n"
+                reference_context += f"Description: {ref.get('description', 'N/A')}\n"
+
+        prompt = f"""You are an expert at designing academic methodology diagrams for scientific publications.
+
+Your task is to create a COMPREHENSIVE and DETAILED textual description of an illustration that would
+effectively visualize the given methodology. This description will be used to generate the actual diagram.
+
+METHODOLOGY TO VISUALIZE:
+{methodology_text}
+
+TARGET DIAGRAM CAPTION:
+{caption}
+{reference_context}
+
+REQUIREMENTS:
+1. **Layout Structure**: Specify the overall layout (left-to-right, top-to-bottom, circular, etc.)
+2. **Components**: List all visual elements needed (boxes, arrows, icons, labels, etc.)
+3. **Content**: What text/symbols should appear in each component
+4. **Connections**: How components connect (arrows, lines, groupings)
+5. **Hierarchy**: Which elements are primary vs secondary
+6. **Grouping**: How to group related components (containers, background colors)
+7. **Flow**: The logical flow of information through the diagram
+8. **Key Details**: Important technical details, equations, or annotations
+
+IMPORTANT GUIDELINES:
+- Be specific about spatial relationships and positioning
+- Describe the logical flow clearly (input → process → output)
+- Include any mathematical notation or technical terminology
+- Consider the target audience (academic researchers)
+- Focus on clarity and information density
+- Think about how this supports the paper's narrative
+
+OUTPUT FORMAT:
+Provide a detailed paragraph-form description that covers all aspects above.
+Be thorough - this description should be sufficient for someone to create the diagram without seeing the original methodology.
+"""
+        return prompt
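A minimal call sketch, assuming GEMINI_API_KEY and VLM_MODEL are set in config; the input strings are placeholders:

from agents.planner import PlannerAgent

planner = PlannerAgent()
description = planner.plan(
    methodology_text="We pretrain a graph encoder, then fine-tune it with reinforcement learning.",  # placeholder
    caption="Figure 1: Overview of the training framework.",                                          # placeholder
    reference_examples=None,  # optional; normally supplied by RetrieverAgent
)
print(description)  # the textual description P consumed by the Stylist/Visualizer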
agents/retriever.py
ADDED

@@ -0,0 +1,151 @@
+"""
+Retriever Agent for PaperBanana framework.
+
+Identifies the N most relevant examples from a reference set using VLM ranking.
+Matches based on research domain and diagram type.
+"""
+import os
+from typing import List, Dict, Any
+from google import genai
+from google.genai import types
+import config
+
+
+class RetrieverAgent:
+    """
+    Retriever Agent: Identifies relevant reference examples from a fixed reference set.
+
+    Uses generative retrieval approach where VLM ranks candidates by matching
+    research domain and diagram type.
+    """
+
+    def __init__(self, reference_set: List[Dict[str, Any]] = None):
+        """
+        Initialize Retriever Agent.
+
+        Args:
+            reference_set: List of reference examples with metadata.
+                Each example should have: {
+                    'id': str,
+                    'domain': str,
+                    'diagram_type': str,
+                    'description': str,
+                    'image_path': str (optional)
+                }
+        """
+        self.client = genai.Client(api_key=config.GEMINI_API_KEY)
+        self.model = config.VLM_MODEL
+        self.reference_set = reference_set or []
+
+    def retrieve(self,
+                 methodology_text: str,
+                 caption: str,
+                 n: int = config.NUM_REFERENCE_EXAMPLES) -> List[Dict[str, Any]]:
+        """
+        Retrieve the N most relevant reference examples.
+
+        Args:
+            methodology_text: Source methodology description
+            caption: Target diagram caption
+            n: Number of examples to retrieve
+
+        Returns:
+            List of N most relevant reference examples
+        """
+        if not self.reference_set:
+            print("Warning: No reference set provided. Skipping retrieval.")
+            return []
+
+        # Create retrieval prompt
+        prompt = self._create_retrieval_prompt(methodology_text, caption, n)
+
+        # Query VLM for ranking
+        contents = [
+            types.Content(
+                role="user",
+                parts=[types.Part.from_text(text=prompt)]
+            )
+        ]
+
+        generate_config = types.GenerateContentConfig(
+            thinking_config=types.ThinkingConfig(
+                thinking_level=config.THINKING_LEVEL
+            )
+        )
+
+        response_text = ""
+        for chunk in self.client.models.generate_content_stream(
+            model=self.model,
+            contents=contents,
+            config=generate_config
+        ):
+            response_text += chunk.text
+
+        # Parse the response to extract selected example IDs
+        selected_examples = self._parse_retrieval_response(response_text, n)
+
+        return selected_examples
+
+    def _create_retrieval_prompt(self, methodology_text: str, caption: str, n: int) -> str:
+        """Create prompt for retrieving relevant examples."""
+        # Create a summary of available references
+        reference_summary = "\n".join([
+            f"ID: {ref['id']}\nDomain: {ref['domain']}\nType: {ref['diagram_type']}\nDescription: {ref['description']}\n"
+            for ref in self.reference_set
+        ])
+
+        prompt = f"""You are an expert at identifying relevant academic illustration examples.
+
+Given a methodology description and diagram caption, select the {n} most relevant reference examples
+from the provided set. Consider:
+1. Research domain similarity (e.g., NLP, Computer Vision, Reinforcement Learning)
+2. Diagram type similarity (e.g., architecture diagram, flowchart, pipeline)
+3. Conceptual similarity in the methodology
+
+METHODOLOGY:
+{methodology_text}
+
+TARGET CAPTION:
+{caption}
+
+AVAILABLE REFERENCE EXAMPLES:
+{reference_summary}
+
+OUTPUT FORMAT:
+Return only the IDs of the {n} most relevant examples, one per line, ranked from most to least relevant.
+Example output:
+ref_001
+ref_005
+ref_012
+"""
+        return prompt
+
+    def _parse_retrieval_response(self, response_text: str, n: int) -> List[Dict[str, Any]]:
+        """Parse VLM response to extract selected examples."""
+        # Extract IDs from response
+        lines = response_text.strip().split('\n')
+        selected_ids = []
+
+        for line in lines:
+            line = line.strip()
+            # Look for reference IDs
+            for ref in self.reference_set:
+                if ref['id'] in line:
+                    selected_ids.append(ref['id'])
+                    break
+            if len(selected_ids) >= n:
+                break
+
+        # Get full reference objects
+        selected_examples = []
+        for ref_id in selected_ids:
+            for ref in self.reference_set:
+                if ref['id'] == ref_id:
+                    selected_examples.append(ref)
+                    break
+
+        # If we didn't get enough, just take the first n
+        if len(selected_examples) < n:
+            selected_examples = self.reference_set[:n]
+
+        return selected_examples[:n]
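A small sketch of the metadata contract the retriever expects, using a toy two-entry reference set (the real set ships under data/; these entries are made up for illustration):

from agents.retriever import RetrieverAgent

toy_refs = [
    {'id': 'ref_001', 'domain': 'NLP', 'diagram_type': 'pipeline',
     'description': 'LLM agent pipeline with a feedback loop'},
    {'id': 'ref_002', 'domain': 'Computer Vision', 'diagram_type': 'architecture',
     'description': 'Encoder-decoder backbone with skip connections'},
]

retriever = RetrieverAgent(reference_set=toy_refs)
top = retriever.retrieve(
    methodology_text="A multi-agent LLM system with iterative critique.",  # placeholder
    caption="Figure 1: System overview.",                                  # placeholder
    n=1,
)
# Note: _parse_retrieval_response falls back to the first n entries
# if the model's ranking cannot be matched against the set.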
agents/stylist.py
ADDED

@@ -0,0 +1,104 @@
+"""
+Stylist Agent for PaperBanana framework.
+
+Acts as a design consultant. Uses automatically synthesized aesthetic
+guidelines to refine initial description into stylistically optimized version.
+"""
+import os
+from google import genai
+from google.genai import types
+import config
+from aesthetic_guidelines import AESTHETIC_GUIDELINE
+
+
+class StylistAgent:
+    """
+    Stylist Agent: Refines illustration descriptions using aesthetic guidelines.
+
+    Takes initial description P and enhances it with style guidance G
+    to produce stylistically optimized description P*.
+    """
+
+    def __init__(self, custom_guidelines: str = None):
+        """
+        Initialize Stylist Agent.
+
+        Args:
+            custom_guidelines: Optional custom aesthetic guidelines.
+                If None, uses default NeurIPS-style guidelines.
+        """
+        self.client = genai.Client(api_key=config.GEMINI_API_KEY)
+        self.model = config.VLM_MODEL
+        self.guidelines = custom_guidelines or AESTHETIC_GUIDELINE
+
+    def refine(self, initial_description: str) -> str:
+        """
+        Refine initial description with aesthetic styling.
+
+        Args:
+            initial_description: Initial textual description P
+
+        Returns:
+            Stylistically optimized description P*
+        """
+        prompt = self._create_styling_prompt(initial_description)
+
+        contents = [
+            types.Content(
+                role="user",
+                parts=[types.Part.from_text(text=prompt)]
+            )
+        ]
+
+        generate_config = types.GenerateContentConfig(
+            thinking_config=types.ThinkingConfig(
+                thinking_level=config.THINKING_LEVEL
+            )
+        )
+
+        refined_description = ""
+        for chunk in self.client.models.generate_content_stream(
+            model=self.model,
+            contents=contents,
+            config=generate_config
+        ):
+            refined_description += chunk.text
+
+        return refined_description.strip()
+
+    def _create_styling_prompt(self, initial_description: str) -> str:
+        """Create prompt for aesthetic refinement."""
+        prompt = f"""You are an expert design consultant specializing in academic publication illustrations.
+
+Your task is to take an initial diagram description and enhance it with specific aesthetic and design details
+to create a polished, publication-ready illustration that follows academic standards.
+
+INITIAL DESCRIPTION:
+{initial_description}
+
+AESTHETIC GUIDELINES TO FOLLOW:
+{self.guidelines}
+
+YOUR TASK:
+Refine the initial description by adding specific visual design details:
+
+1. **Color Specifications**: Add specific color choices from the palette (e.g., "soft blue #64B5F6 for the main process boxes")
+2. **Shape Details**: Specify exact shapes and their styling (e.g., "rounded rectangles with 10px radius and subtle shadow")
+3. **Typography**: Define font choices for different text elements
+4. **Visual Hierarchy**: Enhance descriptions of size, weight, and emphasis relationships
+5. **Spacing & Layout**: Add details about padding, margins, and alignment
+6. **Professional Polish**: Include finishing touches like shadows, borders, gradients
+
+IMPORTANT:
+- Preserve ALL content and structural information from the initial description
+- Add aesthetic details WITHOUT changing the fundamental design or information flow
+- Be specific with measurements, colors (hex codes), and styling parameters
+- Ensure the result maintains academic professionalism and clarity
+- The output should be suitable for direct input to an image generation model
+
+OUTPUT FORMAT:
+Provide the enhanced description as a detailed, flowing paragraph that seamlessly integrates
+the original content with the aesthetic specifications. Make it vivid and precise enough that
+an image generation model can render it accurately.
+"""
+        return prompt
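Usage sketch: the agent defaults to the bundled NeurIPS-style AESTHETIC_GUIDELINE, and custom_guidelines lets a caller swap in a different style document. The custom guideline string below is a made-up example:

from agents.stylist import StylistAgent

stylist = StylistAgent()  # uses AESTHETIC_GUIDELINE by default
polished = stylist.refine("A left-to-right pipeline: input box, three stages, output box.")  # P -> P*

venue_style = "Use a monochrome palette and serif labels throughout."  # hypothetical custom guide
strict_stylist = StylistAgent(custom_guidelines=venue_style)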
agents/visualizer.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Visualizer Agent for PaperBanana framework.
|
| 3 |
+
|
| 4 |
+
Renders academic illustrations using image generation models.
|
| 5 |
+
Supports both diagram generation and statistical plot generation.
|
| 6 |
+
"""
|
| 7 |
+
import os
|
| 8 |
+
import mimetypes
|
| 9 |
+
from typing import Optional
|
| 10 |
+
from google import genai
|
| 11 |
+
from google.genai import types
|
| 12 |
+
import config
|
| 13 |
+
from utils import save_binary_file
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class VisualizerAgent:
|
| 17 |
+
"""
|
| 18 |
+
Visualizer Agent: Renders illustrations from textual descriptions.
|
| 19 |
+
|
| 20 |
+
Supports two modes:
|
| 21 |
+
1. Diagram mode: Uses image generation model (Nano-Banana-Pro / Gemini Image)
|
| 22 |
+
2. Plot mode: Generates Python Matplotlib code for statistical plots
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
def __init__(self, mode: str = "diagram"):
|
| 26 |
+
"""
|
| 27 |
+
Initialize Visualizer Agent.
|
| 28 |
+
|
| 29 |
+
Args:
|
| 30 |
+
mode: Generation mode - "diagram" or "plot"
|
| 31 |
+
"""
|
| 32 |
+
self.client = genai.Client(api_key=config.GEMINI_API_KEY)
|
| 33 |
+
self.mode = mode
|
| 34 |
+
|
| 35 |
+
if mode == "diagram":
|
| 36 |
+
self.model = config.IMAGE_MODEL
|
| 37 |
+
elif mode == "plot":
|
| 38 |
+
self.model = config.VLM_MODEL # Use VLM for code generation
|
| 39 |
+
else:
|
| 40 |
+
raise ValueError(f"Invalid mode: {mode}. Use 'diagram' or 'plot'")
|
| 41 |
+
|
| 42 |
+
def visualize(self,
|
| 43 |
+
description: str,
|
| 44 |
+
output_path: str = "output",
|
| 45 |
+
data: dict = None) -> str:
|
| 46 |
+
"""
|
| 47 |
+
Generate visualization from description.
|
| 48 |
+
|
| 49 |
+
Args:
|
| 50 |
+
description: Textual description of the illustration
|
| 51 |
+
output_path: Base path for output file (without extension)
|
| 52 |
+
data: Optional data dict for plot mode
|
| 53 |
+
|
| 54 |
+
Returns:
|
| 55 |
+
Path to generated image file or code file
|
| 56 |
+
"""
|
| 57 |
+
if self.mode == "diagram":
|
| 58 |
+
return self._generate_diagram(description, output_path)
|
| 59 |
+
elif self.mode == "plot":
|
| 60 |
+
return self._generate_plot(description, output_path, data)
|
| 61 |
+
|
| 62 |
+
def _generate_diagram(self, description: str, output_path: str) -> str:
|
| 63 |
+
"""
|
| 64 |
+
Generate diagram image using image generation model.
|
| 65 |
+
|
| 66 |
+
Args:
|
| 67 |
+
description: Detailed visual description
|
| 68 |
+
output_path: Base path for output file
|
| 69 |
+
|
| 70 |
+
Returns:
|
| 71 |
+
Path to generated image
|
| 72 |
+
"""
|
| 73 |
+
# Create prompt for image generation
|
| 74 |
+
prompt = f"""Generate a high-quality academic methodology diagram with the following specifications:
|
| 75 |
+
|
| 76 |
+
{description}
|
| 77 |
+
|
| 78 |
+
Requirements:
|
| 79 |
+
- Professional academic publication quality
|
| 80 |
+
- Clear, readable text and labels
|
| 81 |
+
- Consistent styling throughout
|
| 82 |
+
- Appropriate use of colors and shapes
|
| 83 |
+
- Publication-ready resolution
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
+
contents = [
|
| 87 |
+
types.Content(
|
| 88 |
+
role="user",
|
| 89 |
+
parts=[types.Part.from_text(text=prompt)]
|
| 90 |
+
)
|
| 91 |
+
]
|
| 92 |
+
|
| 93 |
+
generate_config = types.GenerateContentConfig(
|
| 94 |
+
response_modalities=["IMAGE", "TEXT"],
|
| 95 |
+
image_config=types.ImageConfig(
|
| 96 |
+
image_size=config.IMAGE_SIZE
|
| 97 |
+
)
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
file_index = 0
|
| 101 |
+
saved_path = None
|
| 102 |
+
|
| 103 |
+
for chunk in self.client.models.generate_content_stream(
|
| 104 |
+
model=self.model,
|
| 105 |
+
contents=contents,
|
| 106 |
+
config=generate_config
|
| 107 |
+
):
|
| 108 |
+
if (chunk.candidates is None or
|
| 109 |
+
chunk.candidates[0].content is None or
|
| 110 |
+
chunk.candidates[0].content.parts is None):
|
| 111 |
+
continue
|
| 112 |
+
|
| 113 |
+
# Check for inline image data
|
| 114 |
+
part = chunk.candidates[0].content.parts[0]
|
| 115 |
+
if part.inline_data and part.inline_data.data:
|
| 116 |
+
inline_data = part.inline_data
|
| 117 |
+
data_buffer = inline_data.data
|
| 118 |
+
file_extension = mimetypes.guess_extension(inline_data.mime_type)
|
| 119 |
+
|
| 120 |
+
if file_extension:
|
| 121 |
+
file_name = f"{output_path}_{file_index}{file_extension}"
|
| 122 |
+
saved_path = save_binary_file(file_name, data_buffer)
|
| 123 |
+
file_index += 1
|
| 124 |
+
else:
|
| 125 |
+
# Print any text output
|
| 126 |
+
if chunk.text:
|
| 127 |
+
print(chunk.text)
|
| 128 |
+
|
| 129 |
+
return saved_path or f"{output_path}_0.png"
|
| 130 |
+
|
    def _generate_plot(self, description: str, output_path: str, data: dict = None) -> str:
        """
        Generate a statistical plot by creating Matplotlib code.

        Args:
            description: Description of the desired plot
            output_path: Base path for the output code file
            data: Optional data dictionary

        Returns:
            Path to the generated Python code file
        """
        data_context = ""
        if data:
            data_context = f"\n\nDATA PROVIDED:\n{str(data)}\n"

        prompt = f"""You are an expert at creating publication-quality statistical plots using Matplotlib.

Generate complete, executable Python code using Matplotlib to create the following plot:

{description}
{data_context}

Requirements:
1. Use professional academic styling (seaborn-paper style or similar)
2. Include clear axis labels with units
3. Add a legend if there are multiple series
4. Use appropriate colors and markers
5. Set the figure size for publication (e.g., 6x4 inches)
6. Save as a high-resolution PNG (300 dpi minimum)
7. Include error bars if applicable
8. Follow best practices for data visualization

OUTPUT FORMAT:
Provide ONLY the complete Python code, ready to execute.
Start with the necessary imports and end with plt.savefig().
Do not include any explanations outside the code comments.
"""

        contents = [
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=prompt)]
            )
        ]

        generate_config = types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(
                thinking_level="MEDIUM"
            )
        )

        code = ""
        for chunk in self.client.models.generate_content_stream(
            model=self.model,
            contents=contents,
            config=generate_config
        ):
            if chunk.text:  # chunk.text can be None for non-text chunks
                code += chunk.text

        # Save code to file
        code_file = f"{output_path}.py"
        with open(code_file, 'w') as f:
            f.write(code.strip())

        print(f"Plot code saved to: {code_file}")
        print("Run the code to generate the plot image.")

        return code_file
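Note that _generate_plot deliberately returns a path to generated Matplotlib code rather than an image, so the caller decides when and where to execute it. A minimal caller-side sketch of running that file in a subprocess — the file name run_plot_code.py and helper run_generated_plot are hypothetical, not part of this repo, and it assumes the generated script writes its own PNG via plt.savefig():

# run_plot_code.py — hypothetical caller-side sketch, not part of the repo.
import subprocess
import sys

def run_generated_plot(code_file: str, timeout: int = 120) -> bool:
    """Execute a generated Matplotlib script in a subprocess.

    Returns True if the script exited cleanly; the script itself is
    expected to save the figure via plt.savefig().
    """
    result = subprocess.run(
        [sys.executable, code_file],
        capture_output=True, text=True, timeout=timeout,
    )
    if result.returncode != 0:
        print(f"Plot script failed:\n{result.stderr}", file=sys.stderr)
        return False
    return True

if __name__ == "__main__":
    # e.g. after: code_file = visualizer._generate_plot(desc, "out/plot")
    run_generated_plot("out/plot.py")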
app.py
ADDED
@@ -0,0 +1,283 @@
"""
PaperBanana — Gradio app for HuggingFace Spaces.

Turns methodology text into publication-ready architecture diagrams
using a 5-agent pipeline (Retriever → Planner → Stylist → Visualizer → Critic).
"""

import os
import json
import shutil
import tempfile
from pathlib import Path
from typing import List, Dict, Any

import gradio as gr
from google import genai
from google.genai import types

from agents import RetrieverAgent, PlannerAgent, StylistAgent, VisualizerAgent, CriticAgent
from aesthetic_guidelines import AESTHETIC_GUIDELINE
import config

# ── Load reference set at startup ───────────────────────────────────────────
REF_SET_PATH = Path("data/spotlight_reference_set.json")
REFERENCE_SET: List[Dict[str, Any]] = []
if REF_SET_PATH.exists():
    with open(REF_SET_PATH) as f:
        REFERENCE_SET = json.load(f)
    print(f"Loaded {len(REFERENCE_SET)} reference examples")

# ── Example gallery images ──────────────────────────────────────────────────
EXAMPLE_IMAGES = {
    "Transformer": "examples/readme/transformer_iter3_0.jpg",
    "ResNet": "examples/readme/resnet_iter3_0.jpg",
    "DDPM": "examples/readme/ddpm_iter3_0.jpg",
}

# ── Preset examples ─────────────────────────────────────────────────────────
PRESET_EXAMPLES = [
    [
        # Transformer
        """The Transformer model follows an encoder-decoder structure using stacked self-attention and fully connected layers.

Encoder: Stack of N=6 identical layers. Each layer has two sub-layers: (1) multi-head self-attention, and (2) position-wise feed-forward network. Residual connections around each sub-layer, followed by layer normalization.

Decoder: Stack of N=6 identical layers. In addition to the two encoder sub-layers, the decoder inserts a third sub-layer for multi-head cross-attention over the encoder output. Masked self-attention prevents attending to subsequent positions.

Multi-Head Attention: Linearly project queries, keys, values h times, perform scaled dot-product attention in parallel, concatenate and project again.

Positional Encoding: Sinusoidal positional encodings added to input embeddings.""",
        "The Transformer — model architecture (Vaswani et al., 2017)",
        2,
    ],
    [
        # ResNet
        """We present a residual learning framework. Instead of learning H(x) directly, layers fit a residual mapping F(x) = H(x) - x. The building block is y = F(x, {W_i}) + x via identity shortcut connections.

Architecture: Input 224×224 → 7×7 conv, 64, stride 2 → BN → ReLU → 3×3 max pool → Stage 1: 3 blocks, 64 filters → Stage 2: 4 blocks, 128 filters → Stage 3: 6 blocks, 256 filters → Stage 4: 3 blocks, 512 filters → Global avg pool → 1000-d FC → softmax.

For deeper networks (50/101/152), bottleneck blocks: 1×1 conv (reduce) → 3×3 conv → 1×1 conv (restore), with shortcut bypassing all three layers.""",
        "Architecture of ResNet with residual learning building blocks (He et al., 2016)",
        2,
    ],
    [
        # DDPM
        """Denoising diffusion probabilistic models (DDPMs): Forward process gradually adds Gaussian noise over T timesteps: q(x_t|x_{t-1}) = N(x_t; √(1-β_t)x_{t-1}, β_tI). After T steps, x_T ≈ N(0,I).

Reverse process learns to denoise: p_θ(x_{t-1}|x_t) = N(x_{t-1}; μ_θ(x_t,t), Σ_θ(x_t,t)). Starting from x_T ~ N(0,I), iteratively produces clean x_0.

Denoising network ε_θ(x_t,t) is a U-Net: downsampling with ResNet blocks + self-attention at 16×16, bottleneck with self-attention, upsampling with skip connections. Timestep conditioning via sinusoidal embeddings. Training minimizes L = E[||ε - ε_θ(x_t,t)||²].""",
        "Overview of the denoising diffusion probabilistic model (Ho et al., 2020)",
        2,
    ],
]


# ── Core generation logic (streaming-friendly) ─────────────────────────────
def generate_diagram(
    methodology_text: str,
    caption: str,
    num_iterations: int,
    api_key: str | None = None,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the full PaperBanana pipeline and yield intermediate results."""

    # Resolve API key: user input > env var
    gemini_key = (api_key or "").strip() or config.GEMINI_API_KEY
    if not gemini_key:
        raise gr.Error(
            "No Gemini API key found. Paste one in the field above, "
            "or set GEMINI_API_KEY as a Space secret."
        )

    # Patch config so all agents pick it up
    config.GEMINI_API_KEY = gemini_key

    num_iterations = int(num_iterations)
    logs: list[str] = []

    def log(msg: str):
        logs.append(msg)
        return "\n".join(logs)

    # ── 1. Retriever ────────────────────────────────────────────────────────
    yield None, log("🔍 [1/5] Retriever: finding relevant references…")
    retriever = RetrieverAgent(REFERENCE_SET)
    reference_examples = []
    if REFERENCE_SET:
        reference_examples = retriever.retrieve(
            methodology_text, caption, n=config.NUM_REFERENCE_EXAMPLES
        )
        yield None, log(f" ✓ Retrieved {len(reference_examples)} references")
    else:
        yield None, log(" ⏭ Skipped (no reference set loaded)")

    # ── 2. Planner ──────────────────────────────────────────────────────────
    yield None, log("📝 [2/5] Planner: creating visual description…")
    planner = PlannerAgent()
    current_description = planner.plan(methodology_text, caption, reference_examples)
    yield None, log(f" ✓ Description ready ({len(current_description)} chars)")

    # ── 3. Stylist ──────────────────────────────────────────────────────────
    yield None, log("🎨 [3/5] Stylist: applying aesthetic guidelines…")
    stylist = StylistAgent()
    current_description = stylist.refine(current_description)
    yield None, log(f" ✓ Styled ({len(current_description)} chars)")

    # ── 4/5. Visualize → Critique loop ──────────────────────────────────────
    latest_image_path = None
    critic = CriticAgent()

    for i in range(1, num_iterations + 1):
        yield latest_image_path, log(
            f"🖼️ [4/5] Visualizer: generating image (iteration {i}/{num_iterations})…"
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            out_base = os.path.join(tmpdir, f"iter{i}")
            visualizer = VisualizerAgent(mode="diagram")
            img_path = visualizer.visualize(current_description, out_base)

            if img_path and os.path.exists(img_path):
                # Copy to a persistent temp file so Gradio can serve it
                ext = Path(img_path).suffix or ".jpg"
                persist = tempfile.NamedTemporaryFile(
                    suffix=ext, delete=False, dir=tempfile.gettempdir()
                )
                shutil.copy2(img_path, persist.name)
                latest_image_path = persist.name

        yield latest_image_path, log(f" ✓ Image generated (iteration {i})")

        # Skip critique on the last iteration
        if i >= num_iterations:
            break

        yield latest_image_path, log(
            f"🔬 [5/5] Critic: evaluating (iteration {i})…"
        )
        critique = critic.critique(
            methodology_text, caption, current_description, latest_image_path, i
        )
        n_issues = len(critique["issues"])
        yield latest_image_path, log(f" ✓ {n_issues} issues found")

        if not critique["should_continue"]:
            yield latest_image_path, log(" ✓ Quality threshold reached — done!")
            break

        # Refine
        yield latest_image_path, log("📝 [2/5] Planner: refining description…")
        refinement_prompt = critic.generate_refinement_prompt(
            current_description, critique
        )
        client = genai.Client(api_key=gemini_key)
        contents = [
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=refinement_prompt)],
            )
        ]
        refined = ""
        for chunk in client.models.generate_content_stream(
            model=config.VLM_MODEL,
            contents=contents,
            config=types.GenerateContentConfig(
                thinking_config=types.ThinkingConfig(thinking_level="HIGH")
            ),
        ):
            if chunk.text:  # chunk.text can be None for non-text chunks
                refined += chunk.text
        current_description = refined.strip()
        yield latest_image_path, log(
            f" ✓ Refined ({len(current_description)} chars)"
        )

        # Re-style
        yield latest_image_path, log("🎨 [3/5] Stylist: re-applying style…")
        current_description = stylist.refine(current_description)
        yield latest_image_path, log(f" ✓ Styled ({len(current_description)} chars)")

    yield latest_image_path, log("\n✅ Generation complete!")


# ── Gradio UI ───────────────────────────────────────────────────────────────
DESCRIPTION_MD = """\
# 🍌 PaperBanana

**Turn methodology text into publication-ready architecture diagrams.**

Paste your paper's methodology section + a caption, and PaperBanana's 5-agent pipeline
(Retriever → Planner → Stylist → Visualizer → Critic) will generate a diagram for you.

> Based on [*PaperBanana: Automating Academic Illustration for AI Scientists*](https://arxiv.org/abs/2505.23894) (Zhu et al., NeurIPS 2025).
"""

with gr.Blocks(
    title="PaperBanana",
    theme=gr.themes.Soft(primary_hue="amber", secondary_hue="blue"),
    css="footer { display: none !important; }",
) as demo:
    gr.Markdown(DESCRIPTION_MD)

    # ── Example gallery ─────────────────────────────────────────────────────
    with gr.Accordion("📸 Example outputs (click to expand)", open=False):
        existing = {k: v for k, v in EXAMPLE_IMAGES.items() if Path(v).exists()}
        if existing:
            with gr.Row():
                for name, path in existing.items():
                    with gr.Column(min_width=200):
                        gr.Image(value=path, label=name)

    # ── Inputs ──────────────────────────────────────────────────────────────
    with gr.Row():
        with gr.Column(scale=1):
            methodology_input = gr.Textbox(
                label="Methodology text",
                placeholder="Paste your methodology / model description here…",
                lines=12,
            )
            caption_input = gr.Textbox(
                label="Diagram caption",
                placeholder='e.g. "Architecture of our proposed method"',
                lines=2,
            )
            iterations_slider = gr.Slider(
                minimum=1,
                maximum=3,
                value=2,
                step=1,
                label="Refinement iterations",
                info="More iterations = better quality, slower",
            )
            api_key_input = gr.Textbox(
                label="Gemini API key (optional if set as Space secret)",
                type="password",
                placeholder="AIza…",
            )
            generate_btn = gr.Button("🍌 Generate diagram", variant="primary", size="lg")

        # ── Outputs ─────────────────────────────────────────────────────
        with gr.Column(scale=1):
            output_image = gr.Image(label="Generated diagram", type="filepath")
            output_log = gr.Textbox(label="Pipeline log", lines=18, interactive=False)

    # ── Examples table ──────────────────────────────────────────────────────
    gr.Examples(
        examples=PRESET_EXAMPLES,
        inputs=[methodology_input, caption_input, iterations_slider],
        label="Try a classic paper",
    )

    # ── Wiring ──────────────────────────────────────────────────────────────
    generate_btn.click(
        fn=generate_diagram,
        inputs=[methodology_input, caption_input, iterations_slider, api_key_input],
        outputs=[output_image, output_log],
    )

if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
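Because generate_diagram is a plain Python generator, it can be smoke-tested without launching the Gradio UI. A minimal sketch of such a driver — the file name smoke_test.py is hypothetical and not part of this repo, and it assumes GEMINI_API_KEY is set in the environment and the agents' dependencies are installed:

# smoke_test.py — hypothetical local check, not part of the repo.
import app

last_image = None
for image_path, log_text in app.generate_diagram(
    methodology_text="A toy encoder-decoder model with attention.",
    caption="Toy architecture",
    num_iterations=1,
    api_key=None,  # falls back to the GEMINI_API_KEY env var
):
    last_image = image_path
    print(log_text.splitlines()[-1])  # print only the newest log line

print("Final image:", last_image)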
config.py
ADDED
@@ -0,0 +1,22 @@
"""
Configuration for PaperBanana framework.
"""
import os
from dotenv import load_dotenv

load_dotenv()

# Gemini API Configuration
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Model Configuration
VLM_MODEL = "gemini-3-pro-preview"  # For Retriever, Planner, Stylist, Critic
IMAGE_MODEL = "gemini-3-pro-image-preview"  # For Visualizer (referred to as Nano-Banana-Pro in paper)

# Generation Configuration
MAX_REFINEMENT_ITERATIONS = 3  # As per ablation study
IMAGE_SIZE = "1K"  # Image resolution
THINKING_LEVEL = "HIGH"  # For complex reasoning tasks

# Number of reference examples to retrieve
NUM_REFERENCE_EXAMPLES = 10
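Since config.py calls load_dotenv(), the API key can also live in a local .env file instead of the shell environment or a Space secret. A minimal example with a placeholder value (never commit a real key):

# .env — placeholder value for local runs
GEMINI_API_KEY=your-key-here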
data/spotlight_reference_images/ref_0001_00232_GraphMaster_Automated_Graph_Synthesis_via_LLM_Agents_in_Data-Limited_Environments__fdf13132133da88f7ce9ae4d0a22c29da1f05f75072f95010a29b1392696ea70.jpg
ADDED
data/spotlight_reference_images/ref_0002_00279_VoxDet_Rethinking_3D_Semantic_Scene_Completion_as_Dense_Object_Detection__7b55d87bf0fcf6d787a440d59bf4617e6d73f10f5b1bcc1b45736ad0a7a57911.jpg
ADDED
data/spotlight_reference_images/ref_0003_00491_SQS_Enhancing_Sparse_Perception_Models_via_Query-based_Splatting_in_Autonomous_Driving__f7c6f154dc3e45e2f58f5a9111ebf57ead0d19f9e77340cd3838d881af17a916.jpg
ADDED
data/spotlight_reference_images/ref_0004_00691_ProtInvTree_Deliberate_Protein_Inverse_Folding_with_Reward-guided_Tree_Search__713bbbec11cbef0f1d2c8901e17165fa7db3d1fcf6cfa0d4a8b803d2dccb2ca0.jpg
ADDED
data/spotlight_reference_images/ref_0005_00738_Mulberry_Empowering_MLLM_with_o1-like_Reasoning_and_Reflection_via_Collective_Monte_Carlo_Tree_Search__7c987fc8cf213eb47a038117cc38e5a170938289a390de4b8d7b6cd88512d505.jpg
ADDED
data/spotlight_reference_images/ref_0006_00981_MesaTask_Towards_Task-Driven_Tabletop_Scene_Generation_via_3D_Spatial_Reasoning__ddce4bdb68a1689579a491b5a31349db83e5046fbb5424a45dc0883d4115e7d5.jpg
ADDED (Git LFS)
data/spotlight_reference_images/ref_0007_01003_OmniSync_Towards_Universal_Lip_Synchronization_via_Diffusion_Transformers__28d606ccd79ed54496343219767701efb0f445058d39887c1cf629a800942f77.jpg
ADDED (Git LFS)
data/spotlight_reference_images/ref_0008_01041_Enhancing_Time_Series_Forecasting_through_Selective_Representation_Spaces_A_Patch_Perspective__ea910fb78f4d4027ff7fe9a63cbbc68a7038ee2e9f4250c2f773c631f265b079.jpg
ADDED
data/spotlight_reference_images/ref_0009_01041_Enhancing_Time_Series_Forecasting_through_Selective_Representation_Spaces_A_Patch_Perspective__4cc6ca3e3fa15065b6a1781e9a7f814d67a46649325289b63c9bea4072020f4c.jpg
ADDED
data/spotlight_reference_images/ref_0010_01112_DiCo_Revitalizing_ConvNets_for_Scalable_and_Efficient_Diffusion_Modeling__589d9f3ec341480c16ec00bf41076999cd0ce6c1526b97e71eb9c8ffe33b1a1b.jpg
ADDED
data/spotlight_reference_images/ref_0011_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__b501bcad7830654e421726b37ea0d89207d68c72f0f6aa9179fec65e0c71b205.jpg
ADDED (Git LFS)
data/spotlight_reference_images/ref_0012_01591_E2Former_An_Efficient_and_Equivariant_Transformer_with_Linear-Scaling_Tensor_Products__fb4694d88a5097ee72a936634a832ced85d11e90079a4fabc3f1b8c28f24e5d6.jpg
ADDED
data/spotlight_reference_images/ref_0013_01620_FutureSightDrive_Thinking_Visually_with_Spatio-Temporal_CoT_for_Autonomous_Driving__8a4bffe9a69ef0d2bc0ced518bccdb3146d38727d7fc7402a418b6227a78bcfb.jpg
ADDED
data/spotlight_reference_images/ref_0014_01659_G-Memory_Tracing_Hierarchical_Memory_for_Multi-Agent_Systems__48772f699bccd9ecf7285d9f2c4af85d34d60b7ee6b2cbd681278611869db12b.jpg
ADDED (Git LFS)
data/spotlight_reference_images/ref_0015_01839_Mesh-RFT_Enhancing_Mesh_Generation_via_Fine-grained_Reinforcement_Fine-Tuning__4d93550a61cdae636652e7cd1ca974c21c7fca1e8eb6eaa6fa2e03100f1d0f68.jpg
ADDED
data/spotlight_reference_images/ref_0016_01864_Jacobian-Based_Interpretation_of_Nonlinear_Neural_Encoding_Model__c9ac984c977d21aff284dcfbacf7bdfea345cc73096a3f28d624b026654e1740.jpg
ADDED
data/spotlight_reference_images/ref_0017_02109_OnlineSplatter_Pose-Free_Online_3D_Reconstruction_for_Free-Moving_Objects__abe4e12f0fc9a7487c6c5774f5ead6b391ede9f832960b5ce462ceb786534a0d.jpg
ADDED
data/spotlight_reference_images/ref_0018_02239_RobustMerge_Parameter-Efficient_Model_Merging_for_MLLMs_with_Direction_Robustness__45b74fc957d305067e6f46f00bc32c9ee0889b7dc7e18b4c46913d263c1d8c16.jpg
ADDED
data/spotlight_reference_images/ref_0019_02373_MDReID_Modality-Decoupled_Learning_for_Any-to-Any_Multi-Modal_Object_Re-Identification__517a2ed3f7dd16e048d526da7807ac8b1b73cdb062b79021b7047dd0b467ba9a.jpg
ADDED (Git LFS)
data/spotlight_reference_images/ref_0020_03077_Toward_Relative_Positional_Encoding_in_Spiking_Transformers__bdb178031ec8d263c3a5388d900e974b8ce39ae5759a6611f688a57e22173fa5.jpg
ADDED (Git LFS)
data/spotlight_reference_images/ref_0021_03670_Neural_Atlas_Graphs_for_Dynamic_Scene_Decomposition_and_Editing__16f8fc865baed696e798502de56f3b473dbf2b0b6aa3c1286f384de53c524b97.jpg
ADDED
data/spotlight_reference_images/ref_0022_03671_STITCH-OPE_Trajectory_Stitching_with_Guided_Diffusion_for_Off-Policy_Evaluation__6e9c875dbc74a8b39bc947de3b928f8a29f7c6db9b03d726a62a961ba0c2fdd3.jpg
ADDED
data/spotlight_reference_images/ref_0023_04013_scMRDR_A_scalable_and_flexible_framework_for_unpaired_single-cell_multi-omics_data_integration__85cde1b5a410b0d3275a5c0fa81dfe69e2c83c485c9917c296502a6485e2f68b.jpg
ADDED
data/spotlight_reference_images/ref_0024_04165_Transformer_Copilot_Learning_from_The_Mistake_Log_in_LLM_Fine-tuning__8d1e804f51825d9760a37b8d1ca027a61deecc5e33fc172f0c1789814527c37e.jpg
ADDED
data/spotlight_reference_images/ref_0025_04571_GeRaF_Neural_Geometry_Reconstruction_from_Radio_Frequency_Signals__e096c21c76e1eb88c8d865f73e832e8e9246cddec17f62b5d85bc051caa55165.jpg
ADDED
data/spotlight_reference_images/ref_0026_04647_HopaDIFF_Holistic-Partial_Aware_Fourier_Conditioned_Diffusion_for_Referring_Human_Action_Segmentation_in_Multi-Person_Sc__0e15e7e99b52ccbc157e3ee04a6de2238b23ccd60543102fc2a70eddb12e5e41.jpg
ADDED
data/spotlight_reference_images/ref_0027_04717_CSBrain_A_Cross-scale_Spatiotemporal_Brain_Foundation_Model_for_EEG_Decoding__327e1ef5f11138cef78e3dd270f09eea700aedb3c5a64cf848b125d13e4e5f08.jpg
ADDED
data/spotlight_reference_images/ref_0028_05129_Learning_to_Factorize_Spatio-Temporal_Foundation_Models__ef11d1775e9863839bc4dfaf8711bf474ef80365303220ba92781b579879e7a5.jpg
ADDED
data/spotlight_reference_images/ref_0029_05428_EDELINE_Enhancing_Memory_in_Diffusion-based_World_Models_via_Linear-Time_Sequence_Modeling__57df86687acccd0bad49997d26a4a442d6d09e2381e5a570d1d1e7efd02cf303.jpg
ADDED
data/spotlight_reference_images/ref_0030_05467_Vision-centric_Token_Compression_in_Large_Language_Model__056bbf059f83c91ea896c610cef2927606ab780d910996e6cdb293dfaca40ddd.jpg
ADDED
data/spotlight_reference_images/ref_0031_05610_Repo2Run_Automated_Building_Executable_Environment_for_Code_Repository_at_Scale__ea96f359e23ff3f0427d48dd3247314967bb531150d725a7680376b940324680.jpg
ADDED
data/spotlight_reference_images/ref_0032_05774_Shallow_Diffuse_Robust_and_Invisible_Watermarking_through_Low-Dim_Subspaces_in_Diffusion_Models__703f7602f642aa354858ee8cf929888d672a45001a93ac0cb937cd0f4f1b62de.jpg
ADDED
data/spotlight_reference_images/ref_0033_05814_Mozart_Modularized_and_Efficient_MoE_Training_on_35D_Wafer-Scale_Chiplet_Architectures__dc6b73dd98f93241717e3b658c319f70ab6ad6188c24a147cd052fb2153d656d.jpg
ADDED
data/spotlight_reference_images/ref_0034_06044_Theory-Driven_Label-Specific_Representation_for_Incomplete_Multi-View_Multi-Label_Learning__4d4bbb3c5cd4edb56f73502b7eac2b526f30bfc883b337d949cb38ee0747ee22.jpg
ADDED
data/spotlight_reference_images/ref_0035_06067_EAG3R_Event-Augmented_3D_Geometry_Estimation_for_Dynamic_and_Extreme-Lighting_Scenes__8c0d4df1862409d631870861dc2f047a0cd2572e87267d0d3ea58b6c245408fe.jpg
ADDED
data/spotlight_reference_images/ref_0036_06067_EAG3R_Event-Augmented_3D_Geometry_Estimation_for_Dynamic_and_Extreme-Lighting_Scenes__a50d002b445446ba0c687045ba485e4c96aaf49765198b3f53fb954327df6f4f.jpg
ADDED
data/spotlight_reference_images/ref_0037_06507_CausalPFN_Amortized_Causal_Effect_Estimation_via_In-Context_Learning__8c660a9d9ad153e854bc67151e7df9977e2244fa4ce9bd649c2b08b58db2e30c.jpg
ADDED
data/spotlight_reference_images/ref_0038_06527_Robust_Graph_Condensation_via_Classification_Complexity_Mitigation__dedb33c198673910da24a5a2a5794a8228afce42d7b46ff594dab0cac9ee61e0.jpg
ADDED (Git LFS)