CYChenv

tangyue0820

fferroni

harrim-nv

liang1225

shilinzhu-nvidia

mli0603

mbalaNV committed on Jun 1

Commit

fdafd05

0 Parent(s):

Super-squash branch 'main' using huggingface_hub

Browse files

Co-authored-by: CYChenv <CYChenv@users.noreply.huggingface.co>
Co-authored-by: tangyue0820 <tangyue0820@users.noreply.huggingface.co>
Co-authored-by: fferroni <fferroni@users.noreply.huggingface.co>
Co-authored-by: harrim-nv <harrim-nv@users.noreply.huggingface.co>
Co-authored-by: liang1225 <liang1225@users.noreply.huggingface.co>
Co-authored-by: shilinzhu-nvidia <shilinzhu-nvidia@users.noreply.huggingface.co>
Co-authored-by: mli0603 <mli0603@users.noreply.huggingface.co>
Co-authored-by: mbalaNV <mbalaNV@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +39 -0
.gitignore +2 -0
AGENTIC_UPSAMPLING.md +174 -0
BIAS.md +11 -0
EXPLAINABILITY.md +16 -0
PRIVACY.md +6 -0
README.md +463 -0
SAFETY.md +11 -0
agentic_upsampling/__init__.py +6 -0
agentic_upsampling/__main__.py +7 -0
agentic_upsampling/clients.py +521 -0
agentic_upsampling/constants.py +35 -0
agentic_upsampling/data.py +167 -0
agentic_upsampling/extract_best.py +155 -0
agentic_upsampling/io_utils.py +46 -0
agentic_upsampling/prompt_upsampler.py +388 -0
agentic_upsampling/rubric.py +220 -0
agentic_upsampling/run.py +187 -0
agentic_upsampling/runner.py +474 -0
assets/benchmark-text2image-leaderboard-all-models.jpg +3 -0
assets/benchmark-text2image-leaderboard.png +3 -0
assets/benchmark-text2image.png +3 -0
assets/example_caption.json +88 -0
assets/example_image.png +3 -0
assets/more_images.jpg +3 -0
assets/original_prompt.txt +1 -0
chat_template.json +3 -0
checkpoint.json +1 -0
config.json +258 -0
generation_config.json +14 -0
merges.txt +0 -0
model.safetensors.index.json +0 -0
model_index.json +28 -0
preprocessor_config.json +21 -0
pytest.ini +4 -0
scheduler/scheduler_config.json +33 -0
sound_tokenizer/config.json +64 -0
sound_tokenizer/diffusion_pytorch_model.safetensors +3 -0
tests/test_agentic_upsampling.py +496 -0
text_tokenizer/added_tokens.json +28 -0
text_tokenizer/chat_template.jinja +120 -0
text_tokenizer/merges.txt +0 -0
text_tokenizer/special_tokens_map.json +31 -0
text_tokenizer/tokenizer.json +3 -0
text_tokenizer/tokenizer_config.json +239 -0
text_tokenizer/vocab.json +0 -0
tokenizer.json +0 -0
tokenizer_config.json +239 -0
transformer/config.json +54 -0
transformer/diffusion_pytorch_model-00001-of-00027.safetensors +3 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,39 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+text_tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ *.pyc
2	+ outputs/

AGENTIC_UPSAMPLING.md ADDED Viewed

	@@ -0,0 +1,174 @@

+# Agentic Prompt Upsampling
+This repository includes a standalone text-to-image agentic prompt upsampler for Cosmos3-Super-Text2Image.
+The loop:
+1. Upsamples the user prompt into a structured Cosmos3 T2I JSON prompt.
+2. Generates an image through a vLLM-Omni `/v1/images/generations` endpoint.
+3. Scores the image with a VLM critic.
+4. Rewrites both the positive JSON prompt and generator-side negative prompt from the critic feedback.
+5. Repeats up to the configured iteration limit and returns the best scored image.
+## Install
+From the repository root:
+```bash
+python -m pip install requests pillow
+```
+Recommended vLLM-Omni serving configuration for `nvidia/Cosmos3-Super-Text2Image` on 4xH200 is:
+```bash
+vllm serve nvidia/Cosmos3-Super-Text2Image \
+  --omni \
+  --cfg-parallel-size 2 \
+  --ulysses-degree 2 \
+  --tensor-parallel-size 1
+```
+With the no-offload configuration above, 1024x1024 image generation with 50 steps is expected to take roughly 5 seconds server-side per request.
+## Default Models
+The default prompt upsampler and rewriter are OpenAI GPT-5.5 through the public OpenAI chat completions API:
+```text
+endpoint: https://api.openai.com/v1
+model: gpt-5.5
+extra body: {"reasoning_effort": "low"}
+env var: OPENAI_API_KEY
+```
+The default critic is Gemini 3.1 Pro Preview through Google's OpenAI-compatible chat completions endpoint:
+```text
+endpoint: https://generativelanguage.googleapis.com/v1beta/openai/
+model: gemini-3.1-pro-preview
+env var: GEMINI_API_KEY
+```
+Set credentials:
+```bash
+export OPENAI_API_KEY=...
+export GEMINI_API_KEY=...
+```
+If your vLLM-Omni generation endpoint requires auth:
+```bash
+export AGENTIC_UPSAMPLING_GENERATION_AUTH_KEY=...
+```
+## Run One Prompt
+```bash
+python -m agentic_upsampling.run \
+  --prompt "a cinematic photo of a glass greenhouse at sunrise" \
+  --output-dir outputs/agentic_greenhouse \
+  --generation-endpoint https://YOUR_VLLM_OMNI_ENDPOINT
+```
+The generation call is a standard vLLM-Omni image request:
+```text
+POST /v1/images/generations
+model: nvidia/Cosmos3-Super-Text2Image
+size: 1024x1024
+response_format: b64_json
+num_inference_steps: 50
+guidance_scale: 4.0
+flow_shift: 3.0
+negative_prompt: ""
+extra_args: {"guardrails": false, "use_resolution_template": false}
+```
+## Run A Batch
+Text file, one prompt per non-empty line:
+```bash
+python -m agentic_upsampling.run \
+  --prompts prompts.txt \
+  --output-dir outputs/agentic_batch \
+  --generation-endpoint https://YOUR_VLLM_OMNI_ENDPOINT
+```
+JSONL rows can be strings or objects with `prompt` and optional `id`:
+```json
+{"id": "greenhouse", "prompt": "a glass greenhouse at sunrise"}
+{"id": "city", "prompt": "a clean futuristic city plaza after rain"}
+```
+CSV files must include a `prompt` or `Prompt` column and may include an `id` column.
+## Useful Options
+```bash
+python -m agentic_upsampling.run \
+  --prompt "a precise product photo of a transparent mechanical keyboard" \
+  --output-dir outputs/keyboard \
+  --generation-endpoint https://YOUR_VLLM_OMNI_ENDPOINT \
+  --max-iterations 2 \
+  --samples-per-iteration 3 \
+  --seed-base 42 \
+  --size 1024x1024 \
+  --guidance 4.0 \
+  --flow-shift 3.0
+```
+- `--max-iterations` controls total prompt stages. The default is `2`, meaning the initial upsample plus up to two rewrites.
+- `--samples-per-iteration` runs a best-of-N seed search for each prompt stage. Generation requests for those seeds are submitted concurrently within the iteration.
+- `--seed-base` makes seeds deterministic. Sample seeds are `seed_base + sample_index`.
+- `--size` is the vLLM-Omni image size in `WIDTHxHEIGHT` format.
+- `--guidance` sets `guidance_scale`; the default is `4.0`.
+- `--flow-shift` sets `flow_shift`; the default is `3.0`.
+- `--generation-extra-args` overrides the default vLLM-Omni generation `extra_args` JSON object.
+- Early stopping is enabled by default when the critic score clears the strict threshold. Use `--disable-early-stop` to always run every iteration.
+- Reruns resume from completed artifacts by default. Use `--overwrite` to regenerate them.
+## Output Layout
+```text
+output_dir/
+  run_config.json
+  summary.json
+  manifest.jsonl
+  failures.jsonl
+  0001/
+    best.json
+    iter_00/
+      prompt.json
+      negative_prompt.json
+      image.jpg
+      generation_meta.json
+      analysis.json
+      samples.json
+      meta.json
+    iter_01/
+      ...
+```
+For `--samples-per-iteration N`, each iteration contains `sample_00/`, `sample_01/`, and so on.
+## Export Best Images
+Copy the selected best image for every completed prompt into one folder:
+```bash
+python -m agentic_upsampling.extract_best \
+  --output-dir outputs/agentic_batch \
+  --export-dir outputs/agentic_batch_best \
+  --overwrite
+```
+The exporter writes:
+```text
+best_generations.jsonl
+best_generations.csv
+images/
+```

BIAS.md ADDED Viewed

	@@ -0,0 +1,11 @@

+## Bias
+| Field | Response |
+| :---- | :---- |
+| Participation considerations from adversely impacted groups [protected classes](https://www.senate.ca.gov/content/protected-classes) in model design and testing | None. |
+| Measures taken to mitigate against unwanted bias | Training, evaluation, and testing data are curated before release to filter restricted content, including content relating to protected classes. Model behavior is evaluated across Physical AI domains — robotics, autonomous vehicles, human-centric scenes, common scenes, industry, miscellaneous, and physics-oriented benchmarks — with attention to coverage across diverse demographic and contextual characteristics that affect protected-class outcomes. |
+| Which characteristic (feature) show(s) the greatest difference in performance?: | Greatest performance differences are observed in tasks requiring long-horizon temporal consistency, fine-grained physical interactions, and embodiment-specific action generation. Performance is generally stronger on common visual reasoning and world-generation tasks than on complex multi-agent, robotics-control, or tightly synchronized multimodal generation scenarios. |
+| Which feature(s) have the worst performance overall? | Performance is generally weakest in tasks requiring long-horizon temporal consistency, precise physical interactions, embodiment-specific action control, and strict audio-visual synchronization. |
+| If using internal data, description of methods implemented in data acquisition or processing, if any, to address the prevalence of identifiable biases in the training, testing, and validation data: | Bias-specific methods applied during data processing include person-presence screening, demographic-taxonomy classification (age, gender, ethnicity), embedding-based diversity analysis, and dataset balancing across sources. Internal analysis surfaced: non-person scenes are more prevalent than person-centric content; demographic-taxonomy outputs on person-present samples are most frequently "uncertain" across age, gender, and ethnicity dimensions; and source-type variation, with people-centric image and video datasets showing higher demographic signal than document-, object-, robotics-, or scene-focused datasets. *(Quantitative details in the row below.)* Downstream deployments should add bias audits, fairness evaluation, red-teaming, demographically balanced fine-tuning, or counterfactual augmentation as mitigations. |
+| Tools used to assess statistical imbalances and highlight patterns that may introduce bias into AI models: | Dataset analytics pipelines, metadata distribution analysis, heuristic quality checks, embedding-based clustering, model-assisted filtering systems, and benchmark evaluation suites are used to assess statistical imbalances and identify patterns that may introduce bias into model behavior. |
+| Tools used to assess statistical imbalances and highlight patterns that may introduce bias into AI models: | These datasets, such as OpenImages-derived detection-to-NLP datasets, visual grounding and VQA datasets, document/image understanding datasets, video/action understanding datasets, and NVIDIA-created or curated visual datasets, do not collectively or exhaustively represent all demographic groups (and proportionally therein). For instance, automated person-presence screening did not identify a person in approximately 58% of visual samples analyzed across approximately 400 datasets, while person-present signals were identified in approximately 42% of analyzed samples. In the subset where person-present signals were identified, these datasets contain uneven representation splits across the measured visual taxonomies: age outputs were most frequently uncertain, followed by child and adult; gender outputs were most frequently uncertain, followed by male and female; and ethnicity outputs were most frequently uncertain, followed by Hispanic and White as the most frequent identified categories. Dataset-level results vary by source type, with people-centric image and video datasets containing higher person-present and demographic-taxonomy signals than document-, object-, robotics-, or scene-focused datasets. To mitigate these imbalances, we recommend considering evaluation techniques such as bias audits, task-specific fairness evaluation, and red-teaming, along with fine-tuning with demographically balanced datasets and counterfactual data augmentation to align with the desired model behavior. This evaluation used a baseline of 200 samples across all datasets, with larger subsets of up to 3,000 samples utilized for certain in-depth analyses, identified as optimal thresholds for maximizing embedder accuracy. |

EXPLAINABILITY.md ADDED Viewed

	@@ -0,0 +1,16 @@

+## Explainability
+| Field | Response |
+| :---- | :---- |
+| Intended Application & Domain | World reasoning and generation for Physical AI. |
+| Model Type | Mixture-of-Transformers architecture with two towers. One is an autoregressive model for Physical AI reasoning; the other is a diffusion model for Physical AI generation. |
+| Intended Users | Physical AI developers, researchers, and practitioners building or evaluating autonomous vehicle, robotics, and world-generation workflows. |
+| Output | Images, videos, audio, and action commands. |
+| Tools used to evaluate datasets to identify synthetic data and ensure data authenticity. | Dataset provenance analysis, metadata validation, watermark and artifact detection, embedding-based clustering, heuristic quality checks, and model-assisted data validation pipelines are used to identify synthetic content patterns, assess dataset authenticity, and improve data quality during dataset curation. |
+| Describe how the model works | Cosmos3 is an Omni world foundation model that generates text, images, videos, audio, and action commands from combinations of text, images, videos, and action trajectory inputs. Input tokens from multiple modalities are packed into a shared sequence and processed by our Mixture-of-Transformers backbone with modality-specific output heads. |
+| Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | None. |
+| Technical Limitations | The model may not follow text, image, video, audio, or action trajectory inputs accurately in challenging cases, especially where the input contains complex scene composition, unusual camera motion, multiple interacting agents, low lighting, high motion blur, or fine-grained physical interactions. Generated outputs may contain temporal inconsistency, object morphing, inaccurate 3D structure, or implausible physical dynamics. Generated audio may not accurately render intelligible speech, or maintain strict temporal and semantic alignment with the visual context. |
+| Verified to have met prescribed NVIDIA quality standards | Yes. |
+| Performance Metrics | Video generation is measured using PAIBench-G, RBench, PhysicsIQ, and the Artificial Analysis Image2Video benchmark. Image generation uses UniGenBench and the Artificial Analysis Text2Image benchmark. For transfer evaluation, we use PAIBench-C and AVBench-C. Audio generation uses internal benchmarks. Action prediction uses metrics such as action MSE, Absolute Translation Error, Relative Translation Error, Relative Rotation Error, PSNR, and robotic task completion success rate. |
+| Potential Known Risks | This model can generate synthetic media and may produce content that is offensive, unsafe, misleading, indecent, or unsuitable for a target deployment. Users should implement robust safety guardrails — including content filtering, abuse monitoring, and access controls — to reduce the risk of harmful outputs. Users are responsible for ensuring that their use of the model complies with all applicable laws and regulations, and for regularly reviewing and updating their guardrails as risks evolve. |
+| Licensing | [OpenMDW1.1](https://openmdw.ai/)  |

PRIVACY.md ADDED Viewed

	@@ -0,0 +1,6 @@

+## Privacy
+| Privacy Information |
+|---|
+| The model was trained on large-scale publicly available data that may contain images, audio-video, and text relating to people. NVIDIA collected and used this data in compliance with applicable data protection and privacy laws. This model was not designed to derive insights or otherwise learn from any personal data contained in the datasets. |
+| NVIDIA uses a combination of filters, data minimization techniques, and other guardrails to help prevent personal data from being recited by our models. We employ automated tools and data processing techniques during pre-training or training to identify and filter certain categories of personal data. For example, for text-bearing source and document components, our automated tools identified potential personal data such as person names, locations, and possible business or public-facing contact information such as email addresses and phone numbers.  We reviewed and removed any verified instances of personal data through a combination of automated filtering and human-in-the-loop validation. |
+| Please review NVIDIA's [Privacy Policy](https://www.nvidia.com/en-us/about-nvidia/privacy-policy/) for more information. |

README.md ADDED Viewed

	@@ -0,0 +1,463 @@

+---
+license: other
+license_name: openmdw1.1-license
+license_link: >-
+  https://openmdw.ai/license/1-1/
+library_name: cosmos
+tags:
+  - nvidia
+  - cosmos
+  - cosmos3
+  - vllm-omni
+  - diffusers
+  - text-to-image
+  - image-generation
+---
+# **Cosmos 3: Omnimodal World Models for Physical AI**
+**[Model Collection](https://huggingface.co/collections/nvidia/cosmos3)** | **[Code](https://github.com/nvidia/cosmos)** | **[White Paper](https://research.nvidia.com/labs/cosmos-lab/cosmos3/technical-report.pdf)** | **[Website](https://research.nvidia.com/labs/cosmos-lab/cosmos3/)**
+[NVIDIA Cosmos™](https://github.com/nvidia/cosmos) is a world foundation model platform designed to accelerate the development of Physical AI by enabling machines to understand, simulate, and interact with the physical world across robotics, autonomous driving, and smart space environments, including industrial and factory-scale applications.
+# Model Overview: Cosmos3-Super-Text2Image
+## Description
+Cosmos3 is a collection of Omnimodal world models capable of generating dynamic, high-quality video, image, audio, and action commands from combinations of text, image, video, and action trajectory inputs. It serves as a foundational building block for a broad range of Physical AI applications and research spanning world understanding, world generation, simulation, and embodied policy learning.
+This model is ready for commercial and non-commercial use.
+**Model Developer:** NVIDIA
+### Model Versions
+- Cosmos3-Nano:
+  - Given multimodal inputs including text, images, video, audio, and action trajectories, generate coherent text, images, video, audio, and action outputs for multimodal understanding, world simulation, future prediction, action reasoning, and Physical AI applications.
+- Cosmos3-Super:
+  - Given multimodal inputs including text, images, video, audio, and action trajectories, generate coherent text, images, video, audio, and action outputs for multimodal understanding, world simulation, future prediction, action reasoning, and Physical AI applications.
+- Cosmos3-Nano-Policy-DROID:
+  - Given language instructions and visual observations from the DROID robot platform, generate robot action trajectories for manipulation and control tasks.
+- Cosmos3-Super-Image2Video:
+  - Given one input image and text instructions, generate temporally coherent video sequences that are consistent with the provided visual content.
+- Cosmos3-Super-Text2Image:
+  - Given text input, generate high-fidelity images that are consistent with the provided description.
+### License
+This model is released under the [OpenMDW1.1](https://openmdw.ai/license/1-1/) license.
+### Deployment Geography
+Global
+### Use Case
+Physical AI: Encompassing robotics, autonomous vehicles (AV), and smart space environments, including industrial and factory-scale applications.
+### Release Date
+Hugging Face 05/31/2026 via [https://huggingface.co/collections/nvidia/cosmos3](https://huggingface.co/collections/nvidia/cosmos3)
+GitHub 05/31/2026 via [https://github.com/nvidia/cosmos](https://github.com/nvidia/cosmos)
+## Model Architecture
+**Architecture Type:** Transformer
+**Network Architecture:** Mixture-of-Transformers (MoT)
+Cosmos3 is an Omni-modal foundation model built on a Mixture-of-Transformers (MoT) architecture consisting of two complementary transformer towers: an autoregressive transformer for discrete token generation and a diffusion transformer for continuous multimodal generation. During inference, text is generated through standard next-token autoregressive decoding, while non-text modalities, such as images, video, audio, and actions, are synthesized through iterative denoising. This unified architecture enables Cosmos3 to model heterogeneous modalities within a single framework while preserving generation mechanisms best suited to each modality.
+**This model was developed based on:**  [Cosmos Framework](https://github.com/nvidia/cosmos-framework)
+**Number of trainable model parameters:**
+- Cosmos3-Nano: 16B
+- Cosmos3-Super: 64B
+- Cosmos3-Nano-Policy-DROID: 16B
+- Cosmos3-Super-Image2Video: 64B
+- Cosmos3-Super-Text2Image: 64B
+## Input/Output Specifications
+- **Generator Input**
+  - **Input Type(s)**: Text, Image, Video (with audio or without audio), Action Trajectory
+  - **Input Format(s)**:
+    - Text: String
+    - Image: jpg, png, jpeg, webp
+    - Video (with or without audio): mp4
+    - Action: json (1D list)
+  - **Input Parameters**:
+    - Text: One-dimensional (1D)
+    - Image: Two-dimensional (2D)
+    - Video: Three-dimensional (3D)
+    - Audio: One-dimensional (1D)
+    - Action trajectory: One-dimensional (1D)
+  - **Other Properties Related to Input**:
+    - For video inputs, we accept various resolutions, including 720p, 480p, and 256p.
+    - When using input video with audio muxed into the video MP4 file, the audio should have 2 channels (stereo) and a 48 kHz sample rate.
+    - Image and video inputs are RGB color (8 bits per channel, sRGB color space); grayscale inputs are not supported.
+    - Action input is a per-frame sequence of robot/agent state or control values (e.g., joint positions, gripper state, camera pose). The full input is a 2D array shaped (T, D), where T is the number of frames and D is the embodiment-specific dimensionality listed below.
+    - Input action is only supported for compatible embodiments, including general camera motion (9D), autonomous vehicle (9D), egocentric motion (57D), single Franka Panda arm with RobotiQ gripper (10D), dual Franka Panda arm with RobotiQ gripper (20D), Agibot (29D), UR (10D), Google robot (10D), WidowX 250 (10D), UMI (9D).
+  - **Input Size and Length limits:**
+    - **Text:** 4096 tokens
+    - **Image:** 256p, 480p, and 720p resolution at one of these aspect ratios (16:9, 4:3, 1:1, 3:4, 9:16)
+    - **Video:** 256p, 480p, and 720p resolution at one of these aspect ratios (16:9, 4:3, 1:1, 3:4, 9:16). Max number of frames = 5.
+    - **Audio:** Max 0.5 second
+    - **Action:** 16 – 400 video frames
+- **Generator Output**
+  - **Output Type(s)**: Image, video, audio, action, text
+  - **Output Format(s)**:
+    - Image: JPG
+    - Video: MP4
+    - Audio: Advanced Audio Coding (AAC) stream (muxed within the MP4)
+    - Action: 1D list (.json)
+    - Text: string
+  - **Output Parameters**:
+    - Image: Two-dimensional (2D)
+    - Video: Three-dimensional (3D)
+    - Audio: One-dimensional (1D)
+    - Action: One-dimensional (1D)
+    - Text: One-dimensional (1D)
+  - **Other Properties Related to Output**:
+    - The generated video is an MP4 file, with the resolution, frame rate, and duration specified in the input. The generated audio is encoded in AAC format, muxed into the video MP4 file with 2 channels (stereo) and a 48 kHz sample rate.
+    - Video generation supports durations from 5 to 400 frames, with 189 frames as the default generation duration.
+    - The generated action is only supported for compatible embodiments, including general camera motion (9D), autonomous vehicle (9D), egocentric motion (57D), single Franka Panda arm with RobotiQ gripper (10D), dual Franka Panda arm with RobotiQ gripper (20D), Agibot (29D), UR (10D), Google robot (10D), WidowX 250 (10D), UMI (9D).
+    - Audio: 48 kHz stereo AAC stream muxed into video mp4
+    - Video: mp4 at the FPS specified in input
+    - Image: JPEG
+- **Reasoner Input**
+  - **Input Type(s)**: Text, Text+Image, Text+Video
+  - **Input Format(s)**:
+    - Text: String
+    - Image: jpg, png, jpeg, webp
+    - Video: mp4
+  - **Input Parameters**:
+    - Text: One-dimensional (1D)
+    - Image: Two-dimensional (2D)
+    - Video: Three-dimensional (3D)
+  - **Other Properties Related to Input**:
+    - Video inputs are recommended at a frame rate of 4 fps.
+    - Long-context inputs supported up to 256K tokens.
+  - **Input Size and Length limits:**
+    - **Text:** Up to 256K tokens (context window).
+    - **Image:** Standard input image formats; passed as file or URL.
+    - **Video:** mp4 at the recommended 4 fps.
+- **Reasoner Output**
+  - **Output Type(s)**: Text
+  - **Output Format(s)**:
+    - Text: string
+  - **Output Parameters**:
+    - Text: One-dimensional (1D)
+  - **Other Properties Related to Output**:
+    - A `max_tokens` value of 4096 or higher is recommended for reasoning outputs; longer outputs may be requested.
+    - Reasoning outputs may include structured chain-of-thought, 2D/3D point localization, and bounding-box coordinates for vision-based tasks.
+The video content visualizes the input text description as a short animated scene, capturing key elements within the specified time constraints.
+Our AI models are designed and/or optimized to run on NVIDIA GPU-accelerated systems. By leveraging NVIDIA's hardware (e.g., GPU cores) and software frameworks (e.g., CUDA libraries), the model achieves faster training and inference times compared to CPU-only solutions.
+## Software Integration
+**Runtime Engine(s):**
+- [PyTorch](https://github.com/nvidia/cosmos3)
+- [vLLM-Omni](https://github.com/vllm-project/vllm-omni)
+- [Hugging Face Diffusers](https://huggingface.co/docs/diffusers/en/index)
+**Supported Hardware Microarchitecture Compatibility:**
+- NVIDIA Ampere
+- NVIDIA Blackwell
+- NVIDIA Hopper
+**Operating System(s):**
+- Linux (We have not tested on other operating systems.)
+**Note:** Only BF16 precision is tested. Other precisions like FP4, FP8, and FP16 are not officially supported.
+The integration of foundation and fine-tuned models into AI systems requires additional testing using use-case-specific data to ensure safe and effective deployment. Following the V-model methodology, iterative testing and validation at both unit and system levels are essential to mitigate risks, meet technical and functional requirements, and ensure compliance with safety and ethical standards before deployment.
+## Training, Testing, and Evaluation Datasets
+### Dataset Overview
+- **Total Size:** 1.3B data points
+- **Total Number of Datasets:** 393 dataset entries
+- **Dataset partition:** Training [100%], Testing [N/A — evaluation benchmarks used separately], Validation [N/A — evaluation benchmarks used separately]
+- **Time period for training data collection:** 2024–2026
+- **Time period for testing data collection:** N/A (standard public benchmarks)
+- **Time period for validation data collection:** N/A (standard public benchmarks)
+Raw data from internal and external sources is transformed into training-ready data through multiple stages of curation, filtering, and quality review. Data acquisition spans diverse multimodal sources — robotics, autonomous driving, industrial environments, indoor and outdoor scenes, varied lighting and weather conditions, camera viewpoints, object categories, and human activities — to broaden coverage across Physical AI operating environments. Automated filtering pipelines remove corrupted, duplicate, low-quality, and restricted content. Metadata analysis, heuristic rules, and model-assisted classifiers are applied during preprocessing to flag anomalous distributions and low-diversity subsets. Human review supplements automated filtering for selected datasets, benchmark construction, and targeted quality analysis. Datasets are balanced across modalities and task categories — visual reasoning, text-to-image, text-to-video, image-to-video, audio generation, video transfer, action-conditioned generation, and action command generation — to reduce overrepresentation of narrow domains. Synthetic and simulation-based augmentation supplements coverage of rare physical interactions and edge-case scenarios. Deduplication and provenance tracking are applied across the corpus. The resulting processed data is converted into model-ready tokenized or encoded representations through modality-specific preprocessors before training begins.
+Training datasets passed through multiple layers of automated and manual safeguards designed to reduce the presence of harmful or policy-violating content across categories including weapons and weapons-related instructional content, criminal planning, child sexual abuse material (CSAM), non-consensual intimate imagery (NCII), sexual content involving minors, harassment, hate speech, profanity, threats and incitement to violence, self-harm or suicide-related content, and graphic violence. Data sources are reviewed for licensing compatibility, provenance, and alignment with internal data governance and safety policies before admission into training corpora. Automated filtering pipelines combine multiple detection strategies: hash-matching against known CSAM and NCII reference databases; classifier-based moderation models trained for explicit sexual content, hate speech, violence, weapons imagery, and other restricted categories; keyword and regex-based screening for criminal-planning, threats, and self-harm phrases in text data; metadata and provenance heuristics for source-level risk signals; and embedding-based anomaly detection to surface samples that fall outside expected distributions. Human review and targeted audits supplement automated filtering for selected datasets, benchmark construction, and safety-sensitive evaluation. For multimodal Physical AI data (robotics, autonomous driving, industrial scenes), additional filtering targets invalid action trajectories, physically implausible interactions, and unsafe control sequences. Synthetic and simulation-generated data are evaluated through internal validation before inclusion. Benchmark evaluations and red-team testing are applied post-training to surface remaining safety gaps across world generation, reasoning, audio, and action tasks. 
No large-scale data-filtering process can guarantee complete removal of all harmful content; residual risks may remain, particularly in rare edge cases or open-world deployment settings. Ongoing monitoring and dataset review continue post-release.
+**Data Modality and Training Data Size**
+| Modality | Reasoning Data Sample Count | Generation Data Sample Count |
+| -------- | ------------------- | -------------------- |
+| Text     | 22M                 | Not Applicable       |
+| Image    | 19M                 | 767M                 |
+| Video    | 1M                  | 348M                 |
+| Audio    | Not Applicable      | 139M                 |
+| Action   | Not Applicable      | 8M                   |
+**Data Collection Method by dataset**
+- Hybrid: Automatic/Sensors, Synthetic, Automated
+**Labeling Method by dataset**
+- Hybrid: Human, Automated
+**Properties:** The training, testing, and evaluation datasets consist of diverse multimodal video, image, audio, action, synthetic, and sensor-conditioned data sourced from NVIDIA-owned data and publicly available, commercially permissive datasets. These datasets are curated to exclude known restricted content and to support building an Omni model that learns to generate and reason about dynamic physical environments across world reasoning and generation tasks.
+### Public Datasets
+| Dataset&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Samples&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; |
+|---|---|
+| OpenImage | 1.2M |
+| Coyo700M | 100M |
+| YouTube Video | 340M |
+| UMI | 4.5M |
+### Private Datasets
+| Dataset&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Samples&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; |
+|---|---|
+| Egocentric | 7M |
+| Nexar | 0.6M |
+| AgiBot | 0.2M |
+| HOI | 0.3M |
+### Synthetic Datasets
+| Dataset | Samples |
+|---|---|
+| synthetic images generated using HiDream-I1 | 15M |
+| synthetic images generated using Qwen-Image-2512 | 14M |
+| synthetic captions generated using Qwen3-VL | 1115M |
+## Evaluation Datasets
+**Data Collection Method by dataset**
+- Hybrid: Automatic/Sensors, Synthetic, Automated
+**Labeling Method by dataset**
+- Hybrid: Human, Automated
+**Properties:** The training, testing, and evaluation datasets consist of diverse multimodal video, image, audio, action, synthetic, and sensor-conditioned data sourced from NVIDIA-owned data and publicly available, commercially permissive datasets. These datasets are curated to exclude known restricted content and to support building an Omni model that learns to generate and reason about dynamic physical environments across world reasoning and generation tasks.
+## Benchmarks
+Please see our [technical paper](https://research.nvidia.com/labs/cosmos-lab/cosmos3/technical-report.pdf) for detailed evaluations of the base model.
+### Text-to-image benchmark results
+![benchmark results](assets/benchmark-text2image.png)
+### Artificial Analysis Leaderboard
+#### Open-Source Models [2026/05/28]
+![Artificial Analysis Text-to-Image leaderboard — open-source models](assets/benchmark-text2image-leaderboard.png)
+#### All Models [2026/05/28] (Including Closed-Source)
+![Artificial Analysis Text-to-Image leaderboard — all models including closed-source](assets/benchmark-text2image-leaderboard-all-models.jpg)
+## Qualitative examples
+![Qualitative examples](assets/more_images.jpg)
+## Usage
+- See [Cosmos](https://github.com/nvidia/cosmos) for details.
+### Prompt upsampling
+For optimal quality, text prompts should be upsampled into a specific JSON structure. Description and code can be found [here](https://github.com/nvidia/cosmos-framework/blob/main/docs/prompt_upsampling.md).
+For example, for text-to-image upsampling using Opus-4.7:
+```bash
+git clone https://github.com/NVIDIA/cosmos-framework.git packages/cosmos-framework
+pip install -e packages/cosmos-framework
+export PROMPT_UPSAMPLER_ENDPOINT_URL="https://api.anthropic.com/v1/"
+export PROMPT_UPSAMPLER_MODEL_NAME="claude-opus-4-7"
+export PROMPT_UPSAMPLER_API_TOKEN="<your_token>"
+python -m cosmos_framework.inference.prompt_upsampling \
+    --input assets/original_prompt.txt \
+    --output /tmp/upsampled_t2i_opus/ \
+    --mode text2image \
+    --endpoint-url "${PROMPT_UPSAMPLER_ENDPOINT_URL}" \
+    --model "${PROMPT_UPSAMPLER_MODEL_NAME}" \
+    --api-token "${PROMPT_UPSAMPLER_API_TOKEN}" \
+    --resolution 768 \
+    --aspect-ratio "1,1"
+```
+The JSON-upsampled version of `assets/original_prompt.txt` is saved in `assets/example_caption.json` for convenience, and is used for the image generation examples below.
+### vLLM-Omni
+#### Container
+```
+docker pull vllm/vllm-omni:cosmos3
+```
+#### General Invocation
+You can use the release-tested `vllm-omni` package for deploying an OpenAI-compatible API inference endpoint.
+The recommended vLLM-Omni serving configuration for `nvidia/Cosmos3-Super-Text2Image` on an 8xH100 node is:
+```bash
+vllm serve nvidia/Cosmos3-Super-Text2Image \
+  --omni \
+  --host 0.0.0.0 \
+  --port 8000 \
+  --cfg-parallel-size 2 \
+  --ulysses-degree 4 \
+  --tensor-parallel-size 1 \
+  --use-hsdp \
+  --hsdp-shard-size 8 \
+  --init-timeout 1800
+```
+Setting `--enable-layerwise-offload` can help with memory usage on GPUs with less available memory; however, please note that for text2image generation, this may incur a significant performance penalty. For 4xH200 or 4xGB200 one can simply use `--cfg-parallel-size 2 --ulysses-degree 2 --tensor-parallel-size 1`.
+#### Examples
+##### Text to image generation
+```python
+import base64
+import json
+import requests
+# 1. Read JSON-upsampled prompt
+json_prompt = json.load(open("assets/example_caption.json"))
+# 2. Build your API payload
+payload = {
+    "prompt": json.dumps(json_prompt),
+    "size": "1024x1024",
+    "n": 1,  # single frame generation
+    "num_inference_steps": 50,
+    "guidance_scale": 4.0,
+    "flow_shift": 3.0,
+    "negative_prompt": "",
+    "seed": 1143,
+    "extra_args": {
+      "use_resolution_template": False,
+      "guardrails": True,
+    },
+}
+# 3. Send the POST request
+url = "http://localhost:8000/v1/images/generations"
+print("Sending request to server...")
+response = requests.post(url, json=payload, headers={"Content-Type": "application/json"})
+response.raise_for_status()
+# 4. Extract the base64 data and decode it into an image
+response_json = response.json()
+b64_data = response_json["data"][0]["b64_json"]
+image_bytes = base64.b64decode(b64_data)
+# 5. Save the final PNG file
+with open("/tmp/cosmos3_t2i.png", "wb") as image_file:
+    image_file.write(image_bytes)
+print("Saved image to /tmp/cosmos3_t2i.png")
+```
+![example_image](assets/example_image.png)
+### Diffusers
+Cosmos3 is fully supported within the popular HuggingFace Diffusers package. This integration makes it a supported inference backend, allowing developers to easily incorporate Cosmos3's capabilities - such as text-to-image generation - into their pipelines using the Cosmos3OmniPipeline class, as demonstrated by the provided code examples (see examples for other modalities on the HuggingFace Cosmos3 page).
+**Note:** This example is tested on GB200. For H100, use the [vLLM-Omni serving recipe](#vllm-omni) above, which supports multi-GPU deployment via HSDP.
+#### Installation
+To install diffusers with Cosmos3OmniPipeline:
+```
+uv venv --python 3.13 --seed --managed-python
+source .venv/bin/activate
+uv pip install \
+  "diffusers @ git+https://github.com/huggingface/diffusers.git" \
+  accelerate \
+  av \
+  cosmos_guardrail \
+  huggingface_hub \
+  imageio \
+  imageio-ffmpeg \
+  torch \
+  torchvision \
+  transformers
+```
+#### Examples
+##### Text to image generation
+```python
+import json
+import torch
+from diffusers import Cosmos3OmniPipeline
+from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
+json_prompt = json.load(open("assets/example_caption.json"))
+pipe = Cosmos3OmniPipeline.from_pretrained(
+    "nvidia/Cosmos3-Super-Text2Image",
+    torch_dtype=torch.bfloat16,
+    device_map="cuda",
+    enable_safety_checker=True,
+)
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=3.0)
+result = pipe(
+    prompt=json.dumps(json_prompt),
+    negative_prompt="",
+    num_frames=1,
+    height=1024,
+    width=1024,
+    num_inference_steps=50,
+    guidance_scale=4.0,
+    generator=torch.Generator(device="cuda").manual_seed(1143),
+)
+result.video[0].save("/tmp/cosmos3_t2i.png")
+print("Saved image to /tmp/cosmos3_t2i.png")
+```
+## Limitations
+Cosmos3 may produce imperfect outputs in challenging scenarios. Generation artifacts include temporal inconsistency, unstable camera or object motion, imprecise physical interactions, inaccurate audio-video synchronization, and action-state drift — especially in long-horizon or high-resolution outputs. Reasoning may also be incorrect: object states, causal relationships, spatial geometry, temporal ordering, agent intent, and future outcomes can be misinferred, and complex or long-context inputs may yield hallucinated entities, inconsistent interpretations, or implausible predictions. Because the model lacks an explicit physics simulator, 3D geometry, 4D space-time evolution, object permanence, contact dynamics, and physical laws are only approximated — producing artifacts such as disappearing or morphing objects, unrealistic collisions, and physically implausible motions. Quality further degrades in out-of-distribution environments, safety-critical edge cases, and domains underrepresented in training.
+Cosmos3 outputs should not be treated as physically accurate simulation, reliable ground-truth reasoning, or safety-certified decision making. Applications involving robotics control, autonomous systems, scientific simulation, or safety-critical planning require additional validation, external constraints, system-level safety analysis, and domain-specific guardrails before deployment.
+## Inference
+**Acceleration Engine:** [PyTorch](https://pytorch.org/), [vLLM](https://github.com/vllm-project/vllm), [vLLM-Omni](https://github.com/vllm-project/vllm-omni), [Hugging Face Diffusers](https://github.com/huggingface/diffusers)
+**Test Hardware:** GB200 and H100
+## Ethical Considerations
+NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications.  Developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
+Please make sure you have proper rights and permissions for all input image and video content; if image or video includes people, personal health information, or intellectual property, the image or video generated will not blur or maintain proportions of image subjects included.
+Users are responsible for model inputs and outputs. Users are responsible for ensuring safe integration of this model, including implementing guardrails as well as other safety mechanisms, prior to deployment.
+For more detailed information on ethical considerations for this model, please see the Model Card++ [Explainability](EXPLAINABILITY.md), [Bias](BIAS.md), [Safety & Security](SAFETY.md), and [Privacy](PRIVACY.md) subcards. Please report model quality, risk, security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/).

SAFETY.md ADDED Viewed

	@@ -0,0 +1,11 @@

+## Safety & Security
+| Field | Response |
+| :---- | :---- |
+| Model Application(s) | World reasoning and generation for Physical AI. |
+| Describe the life critical impact: | This model is not a safety-certified component and must not be used as the sole basis for life-critical decisions or control without additional system-level validation, safety analysis, and safeguards. The model is not designed or tested by NVIDIA for use in any system or application where the use of or failure of such system or application developed with the model could result in injury, death, or catastrophic damage. NVIDIA is not liable to any party, in whole or in part, for any claims or damages arising from those uses. Any system or application developed with the model must include sufficient safety and redundancy features and comply with applicable legal and regulatory standards and requirements. |
+| Description of methods implemented in data acquisition or processing, if any, to address other types of potentially harmful data in the training, testing, and validation data:  | Training, evaluation, and validation datasets pass through multi-stage automated and manual filtering to reduce harmful, unsafe, restricted, or policy-violating content. Pipelines include source-licensing review, deduplication, metadata-based and classifier-based moderation, embedding-based anomaly detection, and human audits on selected datasets. For Physical AI data (robotics, autonomous driving, industrial scenes), filtering also targets invalid action trajectories, physically implausible interactions, and unsafe control sequences. Synthetic and simulation-generated data are evaluated through internal validation before inclusion. Benchmark and red-team testing surface remaining safety gaps across world generation, reasoning, audio, and action tasks. No data-filtering process can guarantee complete removal; developers are responsible for application-specific safeguards and validation before deployment. |
+| Description of any methods implemented in data acquisition or processing, if any, to address illegal or harmful content in the training data, including, but not limited to, child sexual abuse material (CSAM) and non-consensual intimate imagery (NCII)  | In addition to the general unsafe-content filtering described above, training data acquisition and preprocessing apply CSAM- and NCII-specific safeguards: hash-matching systems against known CSAM databases, classifier-based moderation models trained specifically for explicit content and NCII detection, and provenance and licensing review for sources containing human imagery. Identified content is removed at ingest, with human review and targeted audits supplementing automated filtering for selected datasets. Despite these safeguards, no large-scale data-filtering system can guarantee complete detection. Ongoing monitoring and dataset review continue post-release. |
+| Use Case Restrictions | Use is governed by the [OpenMDW1.1](https://openmdw.ai/) license. |
+| Model and dataset restrictions | The principle of least privilege (PoLP) is applied, limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints are adhered to. |
+| Responsible Data Handling | This AI model was developed based on our policies to ensure responsible data handling and risk mitigation. The datasets used for training have been scanned for harmful content and illegal content, consistent with our policies including scanning for Child Sexual Abuse Material (CSAM). Ongoing review and monitoring mechanisms are in place based on our policies and to maintain data integrity. |

agentic_upsampling/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Standalone agentic prompt upsampling for Cosmos3 text-to-image."""
+from agentic_upsampling.data import PromptItem
+from agentic_upsampling.runner import AgenticUpsamplerRunner, RunnerConfig
+__all__ = ["AgenticUpsamplerRunner", "PromptItem", "RunnerConfig"]

agentic_upsampling/__main__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+"""Run the agentic prompt upsampling CLI."""
+from agentic_upsampling.run import main
+if __name__ == "__main__":
+    # Raising SystemExit with main()'s return value makes that value the process exit status.
+    raise SystemExit(main())

agentic_upsampling/clients.py ADDED Viewed

	@@ -0,0 +1,521 @@

+"""Network clients for standalone agentic text-to-image upsampling."""
+from __future__ import annotations
+import base64
+import io
+import json
+import os
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+import requests
+from PIL import Image
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+from agentic_upsampling.constants import (
+    DEFAULT_ASPECT_RATIO,
+    DEFAULT_CRITIC_ENDPOINT_URL,
+    DEFAULT_CRITIC_MODEL,
+    DEFAULT_GENERATION_AUTH_KEY_ENV,
+    DEFAULT_GENERATION_EXTRA_ARGS,
+    DEFAULT_GENERATION_MODEL,
+    DEFAULT_FLOW_SHIFT,
+    DEFAULT_GUIDANCE,
+    DEFAULT_IMAGE_SIZE,
+    DEFAULT_JPEG_QUALITY,
+    DEFAULT_LLM_EXTRA_BODY,
+    DEFAULT_NUM_STEPS,
+    DEFAULT_OPENAI_API_KEY_ENV,
+    DEFAULT_RESOLUTION,
+    DEFAULT_REWRITER_ENDPOINT_URL,
+    DEFAULT_REWRITER_MODEL,
+    DEFAULT_UPSAMPLER_ENDPOINT_URL,
+    DEFAULT_UPSAMPLER_MODEL,
+)
+from agentic_upsampling.data import PromptItem, validate_t2i_json
+from agentic_upsampling.io_utils import compact_json, write_json_atomic
+from agentic_upsampling.prompt_upsampler import (
+    JSON_ENSURE_ASCII,
+    SYSTEM_MESSAGE,
+    ChatClientConfig,
+    OpenAIChatClient,
+    Text2ImagePromptUpsampler,
+    extract_json_object,
+)
+from agentic_upsampling.rubric import (
+    all_category_check_text,
+    analysis_json_text,
+    build_judge_prompt,
+    compact_analysis_for_rewrite,
+    parse_analysis_response,
+)
+# Connect-phase timeout; used as the first element of the (connect, read) timeout tuples in this module.
+CONNECT_TIMEOUT_S = 60
+# NOTE(review): not referenced in the visible part of this module — presumably a read timeout for a submit-style request; confirm.
+SUBMIT_READ_TIMEOUT_S = 240
+# Read-phase timeout paired with CONNECT_TIMEOUT_S for image generation requests.
+IMAGE_GENERATION_READ_TIMEOUT_S = 600
+# Checklist text computed once at import time and embedded in every joint-rewrite user prompt.
+REWRITER_APPLICATION_GUIDANCE = all_category_check_text()
+@dataclass(frozen=True, slots=True)
+class GenerationOutput:
+    """Output from one image generation request.
+    Instances are immutable (``frozen=True``) and slot-based (``slots=True``).
+    """
+    # Path of the image file produced for the request.
+    image_path: Path
+    # Path of the JSON metadata file describing the request.
+    meta_path: Path
+    # The metadata mapping itself.
+    meta: dict[str, Any]
+def read_api_token(api_key_env: str, api_key_file: Path | None = None) -> str:
+    """Resolve an API token from an environment variable or explicit file."""
+    token = os.environ.get(api_key_env, "").strip()
+    if token:
+        return token
+    if api_key_file is not None and api_key_file.exists():
+        token = api_key_file.read_text(encoding="utf-8").strip()
+        if token:
+            return token
+    raise RuntimeError(f"Missing API key. Export {api_key_env} or pass the matching --*-api-key-file flag.")
+def read_optional_generation_auth_key(auth_key: str, api_key_env: str = DEFAULT_GENERATION_AUTH_KEY_ENV) -> str:
+    """Resolve the optional generation endpoint auth key."""
+    return auth_key.strip() or os.environ.get(api_key_env, "").strip()
+def normalize_generation_endpoint(endpoint: str) -> str:
+    """Normalize the vLLM-Omni endpoint root without the /v1 suffix."""
+    normalized = endpoint.strip().rstrip("/")
+    if not normalized:
+        raise ValueError("generation endpoint cannot be empty.")
+    if not normalized.startswith(("http://", "https://")):
+        normalized = f"https://{normalized}"
+    if normalized.endswith("/v1/images/generations"):
+        normalized = normalized[: -len("/v1/images/generations")]
+    elif normalized.endswith("/v1"):
+        normalized = normalized[: -len("/v1")]
+    return normalized.rstrip("/")
+def make_session(pool_size: int = 4) -> requests.Session:
+    """Create a retrying HTTP session."""
+    session = requests.Session()
+    retry = Retry(
+        total=2,
+        connect=2,
+        read=0,
+        status=2,
+        status_forcelist=(429, 500, 502, 503, 504),
+        allowed_methods=frozenset({"GET", "POST"}),
+        backoff_factor=0.5,
+        raise_on_status=False,
+    )
+    adapter = HTTPAdapter(pool_connections=pool_size, pool_maxsize=pool_size, max_retries=retry, pool_block=False)
+    session.mount("https://", adapter)
+    session.mount("http://", adapter)
+    return session
+def image_path_to_data_url(path: Path, *, jpeg_quality: int | None = DEFAULT_JPEG_QUALITY) -> str:
+    """Encode a local image file as a data URL, optionally transcoding to JPEG."""
+    if jpeg_quality is None:
+        encoded = base64.b64encode(path.read_bytes()).decode("ascii")
+        return f"data:image/png;base64,{encoded}"
+    with Image.open(path) as image:
+        if image.mode not in ("RGB", "L"):
+            image = image.convert("RGB")
+        buf = io.BytesIO()
+        image.save(buf, format="JPEG", quality=jpeg_quality, optimize=True)
+    encoded = base64.b64encode(buf.getvalue()).decode("ascii")
+    return f"data:image/jpeg;base64,{encoded}"
+class PromptRewriterClient:
+    """GPT-based T2I JSON prompt upsampler and iterative rewriter."""
+    upsampler: Text2ImagePromptUpsampler
+    rewrite_client: OpenAIChatClient
+    resolution: str
+    aspect_ratio: str
+    def __init__(
+        self,
+        *,
+        api_token: str,
+        upsampler_endpoint_url: str = DEFAULT_UPSAMPLER_ENDPOINT_URL,
+        upsampler_model: str = DEFAULT_UPSAMPLER_MODEL,
+        rewriter_endpoint_url: str = DEFAULT_REWRITER_ENDPOINT_URL,
+        rewriter_model: str = DEFAULT_REWRITER_MODEL,
+        extra_body: dict[str, Any] | None = None,
+        resolution: str = DEFAULT_RESOLUTION,
+        aspect_ratio: str = DEFAULT_ASPECT_RATIO,
+    ) -> None:
+        resolved_extra_body = DEFAULT_LLM_EXTRA_BODY if extra_body is None else extra_body
+        self.upsampler = Text2ImagePromptUpsampler.from_defaults(
+            api_token=api_token,
+            endpoint_url=upsampler_endpoint_url,
+            model=upsampler_model,
+            extra_body=resolved_extra_body,
+        )
+        self.rewrite_client = OpenAIChatClient(
+            ChatClientConfig(
+                endpoint_url=rewriter_endpoint_url,
+                model=rewriter_model,
+                api_token=api_token,
+                extra_body=resolved_extra_body,
+                max_tokens=8192,
+                max_retries=3,
+            )
+        )
+        self.resolution = resolution
+        self.aspect_ratio = aspect_ratio
+    def initial_prompt(self, item: PromptItem) -> dict[str, Any]:
+        """Create the initial dense structured prompt for a user prompt."""
+        return self.upsampler.upsample(
+            item.prompt,
+            prompt_id=item.prompt_id,
+            resolution=self.resolution,
+            aspect_ratio=self.aspect_ratio,
+        )
+    def rewrite_prompt_pair(
+        self,
+        item: PromptItem,
+        previous_prompt: dict[str, Any],
+        previous_negative_prompt: str,
+        previous_analysis: dict[str, Any],
+        history: list[dict[str, Any]],
+    ) -> tuple[dict[str, Any], str]:
+        """Jointly rewrite the positive JSON prompt and generator-side negative prompt."""
+        schema_keys = list(previous_prompt.keys())
+        messages = [
+            {
+                "role": "system",
+                "content": (
+                    "You are a precise text-to-image prompt engineer. Return valid JSON only, no markdown. "
+                    "Jointly coordinate the positive structured prompt and generator-side negative prompt so they do not contradict each other."
+                ),
+            },
+            {
+                "role": "user",
+                "content": self._joint_rewrite_user_prompt(
+                    item=item,
+                    previous_prompt=previous_prompt,
+                    previous_negative_prompt=previous_negative_prompt,
+                    previous_analysis=previous_analysis,
+                    history=history,
+                    schema_keys=schema_keys,
+                ),
+            },
+        ]
+        last_exc: Exception | None = None
+        for attempt in range(1, 4):
+            try:
+                raw = self.rewrite_client.complete(messages, response_format_json=True)
+                return self._parse_joint_rewrite_response(raw, item.prompt_id)
+            except Exception as exc:
+                last_exc = exc
+                if attempt < 3:
+                    time.sleep(min(20.0, 2.0 * attempt))
+        raise RuntimeError(f"Joint prompt rewrite failed after 3 attempts for prompt {item.prompt_id}.") from last_exc
+    @staticmethod
+    def _parse_joint_rewrite_response(raw: str, prompt_id: str) -> tuple[dict[str, Any], str]:
+        data = extract_json_object(raw)
+        positive_prompt = data.get("positive_prompt")
+        if not isinstance(positive_prompt, dict):
+            raise ValueError(f"Joint rewrite returned missing or non-object positive_prompt for prompt {prompt_id}.")
+        validate_t2i_json(positive_prompt, prompt_id)
+        negative_prompt = data.get("negative_prompt", "")
+        if not isinstance(negative_prompt, str):
+            raise ValueError(f"Joint rewrite returned non-string negative_prompt for prompt {prompt_id}.")
+        return positive_prompt, " ".join(negative_prompt.split())
+    @staticmethod
+    def _joint_rewrite_user_prompt(
+        *,
+        item: PromptItem,
+        previous_prompt: dict[str, Any],
+        previous_negative_prompt: str,
+        previous_analysis: dict[str, Any],
+        history: list[dict[str, Any]],
+        schema_keys: list[str],
+    ) -> str:
+        sections = [
+            "Original user prompt:",
+            item.prompt,
+            "",
+            "Application-specific guidance:",
+            "Apply the following sections as one checklist program. Do not first classify the prompt. Apply each section only when relevant to the original user prompt, previous JSON, or VLM failures.",
+            REWRITER_APPLICATION_GUIDANCE,
+            "",
+            "Previous generated image failed or scored according to this VLM analysis:",
+            analysis_json_text(compact_analysis_for_rewrite(previous_analysis)),
+            "",
+            "Iteration history summary:",
+            json.dumps(PromptRewriterClient._history_summary(history), ensure_ascii=JSON_ENSURE_ASCII, indent=2),
+            "",
+            "Previous positive JSON prompt:",
+            json.dumps(previous_prompt, ensure_ascii=JSON_ENSURE_ASCII, indent=2),
+            "",
+            "Previous negative prompt:",
+            previous_negative_prompt or "",
+            "",
+            "Joint rewrite task:",
+            'Return a JSON object with exactly two top-level keys: "positive_prompt" and "negative_prompt".',
+            '"positive_prompt" must be a complete JSON object with exactly these top-level keys, preserving their names and types:',
+            json.dumps(schema_keys, ensure_ascii=JSON_ENSURE_ASCII),
+            "",
+            '"positive_prompt" must keep the previous "resolution" and "aspect_ratio".',
+            '"negative_prompt" must be a concise generator-side negative prompt string.',
+            "Coordinate both fields: strengthen required positive constraints while using the negative prompt only to suppress concrete wrong alternatives or artifacts.",
+            "Do not put positive instructions in negative_prompt. Do not negate content required by the original user prompt.",
+            "For exact counts, grids, text, geometry, or anatomy, explicitly block wrong alternatives when useful.",
+            'The positive "comprehensive_t2i_caption" should be direct generation guidance, not an explanation of this rewrite process.',
+        ]
+        return "\n".join(sections)
+    @staticmethod
+    def _history_summary(history: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        return [
+            {
+                "iteration": item.get("iteration"),
+                "overall_score": item.get("analysis", {}).get("overall_score"),
+                "prompt_adherence_score": item.get("analysis", {}).get("prompt_adherence_score"),
+                "category_score": item.get("analysis", {}).get("category_score"),
+                "threshold_cleared": item.get("analysis", {}).get("threshold_cleared"),
+            }
+            for item in history
+        ]
+class ImageGenerationClient:
+    """Client for a vLLM-Omni /v1/images/generations text-to-image endpoint."""
+    endpoint: str
+    auth_key: str
+    model: str
+    session: requests.Session
+    size: str
+    num_steps: int
+    guidance: float
+    flow_shift: float
+    extra_args: dict[str, Any]
+    def __init__(
+        self,
+        *,
+        endpoint: str,
+        auth_key: str = "",
+        model: str = DEFAULT_GENERATION_MODEL,
+        size: str = DEFAULT_IMAGE_SIZE,
+        num_steps: int = DEFAULT_NUM_STEPS,
+        guidance: float = DEFAULT_GUIDANCE,
+        flow_shift: float = DEFAULT_FLOW_SHIFT,
+        extra_args: dict[str, Any] | None = None,
+        session: requests.Session | None = None,
+    ) -> None:
+        self.endpoint = normalize_generation_endpoint(endpoint)
+        self.auth_key = auth_key
+        self.model = model
+        self.session = session or make_session()
+        self.size = size
+        self.num_steps = num_steps
+        self.guidance = guidance
+        self.flow_shift = flow_shift
+        self.extra_args = dict(DEFAULT_GENERATION_EXTRA_ARGS if extra_args is None else extra_args)
+    def build_payload(
+        self,
+        prompt_json: dict[str, Any],
+        prompt_id: str,
+        seed: int | None = None,
+        negative_prompt: str = "",
+    ) -> dict[str, Any]:
+        """Build the vLLM-Omni image generation request payload."""
+        del prompt_id
+        payload: dict[str, Any] = {
+            "model": self.model,
+            "prompt": compact_json(prompt_json, ensure_ascii=JSON_ENSURE_ASCII),
+            "size": self.size,
+            "n": 1,
+            "response_format": "b64_json",
+            "negative_prompt": negative_prompt.strip(),
+            "num_inference_steps": self.num_steps,
+            "guidance_scale": self.guidance,
+            "flow_shift": self.flow_shift,
+            "extra_args": dict(self.extra_args),
+        }
+        if seed is not None:
+            payload["seed"] = int(seed)
+        return payload
+    def generate(
+        self,
+        *,
+        prompt_json: dict[str, Any],
+        prompt_id: str,
+        output_dir: Path,
+        seed: int | None = None,
+        negative_prompt: str = "",
+        jpeg_quality: int = DEFAULT_JPEG_QUALITY,
+    ) -> GenerationOutput:
+        """Generate and persist one candidate image."""
+        payload = self.build_payload(prompt_json, prompt_id, seed, negative_prompt=negative_prompt)
+        response_json = self._generate_image(payload)
+        image_bytes = self._decode_image_response(response_json)
+        image_path = output_dir / "image.jpg"
+        image_info = self._save_jpeg(image_bytes, image_path, jpeg_quality)
+        meta = {
+            "prompt_id": prompt_id,
+            "status": "completed",
+            "endpoint": self.endpoint,
+            "image_generation_url": self._image_generation_url(),
+            "payload": payload,
+            "response": self._response_without_image_bytes(response_json),
+            "output_image_path": str(image_path),
+            "image_info": image_info,
+        }
+        meta_path = output_dir / "generation_meta.json"
+        write_json_atomic(meta_path, meta, ensure_ascii=JSON_ENSURE_ASCII)
+        return GenerationOutput(image_path=image_path, meta_path=meta_path, meta=meta)
+    def _generate_image(self, payload: dict[str, Any]) -> dict[str, Any]:
+        last_exc: Exception | None = None
+        for attempt in range(1, 4):
+            try:
+                return self._request_json(
+                    "POST",
+                    self._image_generation_url(),
+                    json=payload,
+                    headers=self._auth_headers(),
+                    timeout=(CONNECT_TIMEOUT_S, IMAGE_GENERATION_READ_TIMEOUT_S),
+                )
+            except Exception as exc:
+                last_exc = exc
+                if attempt < 3:
+                    time.sleep(min(20.0, 2.0 * attempt))
+        raise RuntimeError(f"/v1/images/generations failed after retries: {last_exc}") from last_exc
+    def _image_generation_url(self) -> str:
+        return f"{self.endpoint}/v1/images/generations"
+    def _auth_headers(self) -> dict[str, str] | None:
+        token = self.auth_key.strip()
+        if not token:
+            return None
+        if token.lower().startswith("bearer "):
+            return {"Authorization": token}
+        return {"Authorization": f"Bearer {token}"}
+    def _request_json(self, method: str, url: str, **kwargs: Any) -> dict[str, Any]:
+        timeout = kwargs.pop("timeout", (CONNECT_TIMEOUT_S, IMAGE_GENERATION_READ_TIMEOUT_S))
+        response = self.session.request(method, url, timeout=timeout, **kwargs)
+        if not response.ok:
+            raise RuntimeError(f"{method} {url} HTTP {response.status_code}: {response.text[:1000]}")
+        parsed = response.json()
+        if not isinstance(parsed, dict):
+            raise RuntimeError(f"{method} {url} returned non-object JSON: {parsed!r}")
+        return parsed
+    @staticmethod
+    def _decode_image_response(response_json: dict[str, Any]) -> bytes:
+        data = response_json.get("data")
+        if not isinstance(data, list) or not data or not isinstance(data[0], dict):
+            raise RuntimeError(f"Image generation response has no data[0] object: {response_json}")
+        first_image = data[0]
+        b64_image = first_image.get("b64_json")
+        if not isinstance(b64_image, str) or not b64_image.strip():
+            image_url = first_image.get("url")
+            if isinstance(image_url, str) and image_url.startswith("data:image") and "," in image_url:
+                b64_image = image_url.split(",", 1)[1]
+            else:
+                raise RuntimeError(f"Image generation response has no b64_json image: {response_json}")
+        try:
+            return base64.b64decode(b64_image, validate=True)
+        except ValueError:
+            return base64.b64decode(b64_image)
+    @staticmethod
+    def _response_without_image_bytes(response_json: dict[str, Any]) -> dict[str, Any]:
+        redacted = json.loads(json.dumps(response_json))
+        data = redacted.get("data")
+        if isinstance(data, list):
+            for item in data:
+                if isinstance(item, dict) and isinstance(item.get("b64_json"), str):
+                    item["b64_json"] = f"<base64 image omitted: {len(item['b64_json'])} chars>"
+                if isinstance(item, dict) and isinstance(item.get("url"), str) and item["url"].startswith("data:image"):
+                    item["url"] = f"<data image omitted: {len(item['url'])} chars>"
+        return redacted
+    @staticmethod
+    def _save_jpeg(image_bytes: bytes, output_path: Path, quality: int) -> dict[str, Any]:
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        tmp = output_path.with_suffix(output_path.suffix + ".tmp")
+        with Image.open(io.BytesIO(image_bytes)) as image:
+            source_format = image.format
+            rgb = image.convert("RGB")
+            width, height = rgb.size
+            rgb.save(tmp, format="JPEG", quality=quality, optimize=True)
+        tmp.replace(output_path)
+        return {"source_image_format": source_format, "saved_format": "JPEG", "width": width, "height": height}
+class VLMQualityJudge:
+    """Gemini critic for generated images through an OpenAI-compatible endpoint."""
+    # Chat client used to send the image plus rubric prompt to the critic model.
+    chat_client: OpenAIChatClient
+    # JPEG quality forwarded to image_path_to_data_url when the image is embedded; may be None.
+    image_jpeg_quality: int | None
+    def __init__(
+        self,
+        *,
+        api_token: str,
+        endpoint_url: str = DEFAULT_CRITIC_ENDPOINT_URL,
+        model: str = DEFAULT_CRITIC_MODEL,
+        max_tokens: int = 8192,
+        image_jpeg_quality: int | None = DEFAULT_JPEG_QUALITY,
+    ) -> None:
+        """Create the critic's chat client (three attempts per request) and remember the JPEG quality."""
+        self.chat_client = OpenAIChatClient(
+            ChatClientConfig(
+                endpoint_url=endpoint_url,
+                model=model,
+                api_token=api_token,
+                max_tokens=max_tokens,
+                max_retries=3,
+            )
+        )
+        self.image_jpeg_quality = image_jpeg_quality
+    def score_image(
+        self,
+        *,
+        item: PromptItem,
+        image_path: Path,
+    ) -> dict[str, Any]:
+        """Score one image with the non-classifying rubric program."""
+        # One user turn: the image as a data URL followed by the rubric text built for this prompt.
+        messages = [
+            SYSTEM_MESSAGE,
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_path_to_data_url(image_path, jpeg_quality=self.image_jpeg_quality)},
+                    },
+                    {
+                        "type": "text",
+                        "text": build_judge_prompt(item),
+                    },
+                ],
+            },
+        ]
+        raw = self.chat_client.complete(messages, response_format_json=True)
+        analysis = parse_analysis_response(raw)
+        # Keep the unparsed model output next to the parsed analysis.
+        analysis["raw_response"] = raw
+        return analysis

agentic_upsampling/constants.py ADDED Viewed

	@@ -0,0 +1,35 @@

+"""Public defaults for the standalone agentic text-to-image upsampler."""
+from __future__ import annotations
+from typing import Any
+# Chat endpoints and models for the upsampler and rewriter; both default to the OpenAI endpoint.
+DEFAULT_OPENAI_ENDPOINT_URL = "https://api.openai.com/v1"
+DEFAULT_UPSAMPLER_ENDPOINT_URL = DEFAULT_OPENAI_ENDPOINT_URL
+DEFAULT_REWRITER_ENDPOINT_URL = DEFAULT_OPENAI_ENDPOINT_URL
+DEFAULT_UPSAMPLER_MODEL = "gpt-5.5"
+DEFAULT_REWRITER_MODEL = "gpt-5.5"
+# Presumably the name of the environment variable holding the OpenAI API key -- confirm at the call site.
+DEFAULT_OPENAI_API_KEY_ENV = "OPENAI_API_KEY"
+# Extra request-body fields for LLM calls (the prompt upsampler uses this when no extra_body is given).
+DEFAULT_LLM_EXTRA_BODY: dict[str, Any] = {"reasoning_effort": "low"}
+# Critic (image judge) endpoint and model.
+DEFAULT_CRITIC_ENDPOINT_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
+DEFAULT_CRITIC_MODEL = "gemini-3.1-pro-preview"
+DEFAULT_GEMINI_API_KEY_ENV = "GEMINI_API_KEY"
+# Image generation service defaults.
+DEFAULT_GENERATION_AUTH_KEY_ENV = "AGENTIC_UPSAMPLING_GENERATION_AUTH_KEY"
+DEFAULT_GENERATION_MODEL = "nvidia/Cosmos3-Super-Text2Image"
+DEFAULT_IMAGE_SIZE = "1024x1024"
+DEFAULT_GENERATION_EXTRA_ARGS: dict[str, Any] = {"guardrails": False, "use_resolution_template": False}
+# Resolution label and "w,h" aspect ratio; "768" with "1,1" is listed as 1024x1024 in
+# prompt_upsampler.RESOLUTION_RATIO_DICT, which matches DEFAULT_IMAGE_SIZE above.
+DEFAULT_RESOLUTION = "768"
+DEFAULT_ASPECT_RATIO = "1,1"
+# Sampling parameters of the generation request.
+DEFAULT_NUM_STEPS = 50
+DEFAULT_GUIDANCE = 4.0
+DEFAULT_FLOW_SHIFT = 3.0
+# Loop sizing for the agentic run.
+DEFAULT_MAX_ITERATIONS = 2
+DEFAULT_SAMPLES_PER_ITERATION = 3
+DEFAULT_JPEG_QUALITY = 99
+# Score thresholds; NOTE(review): the scale and how they are applied are defined outside this module.
+STRICT_OVERALL_THRESHOLD = 9.0
+STRICT_PROMPT_THRESHOLD = 9.0

agentic_upsampling/data.py ADDED Viewed

	@@ -0,0 +1,167 @@

+"""Generic prompt loading and text-to-image JSON validation."""
+from __future__ import annotations
+import csv
+import json
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+@dataclass(frozen=True, slots=True)
+class PromptItem:
+    """One text-to-image prompt to process."""
+    # Identifier as a string: taken from the input file when present, otherwise a 1-based counter.
+    prompt_id: str
+    # Zero-based index of the source line/row the prompt was read from.
+    row_number: int
+    # Prompt text with surrounding whitespace stripped by the loaders.
+    prompt: str
+    # Remaining fields of the source row (JSONL: every key except the prompt; CSV: the whole row).
+    metadata: dict[str, Any] = field(default_factory=dict)
+# Top-level keys that validate_t2i_json requires in every structured prompt.
+REQUIRED_T2I_KEYS = {
+    "subjects",
+    "subject_details",
+    "background_setting",
+    "lighting",
+    "text_and_signage_elements",
+    "resolution",
+    "aspect_ratio",
+    "comprehensive_t2i_caption",
+}
+# Accepted CSV header names, checked in this order; the first one present in a row wins.
+PROMPT_COLUMNS = ("prompt", "Prompt")
+ID_COLUMNS = ("id", "ID", "prompt_id", "Prompt ID")
+_SAFE_ID_RE = re.compile(r"[^A-Za-z0-9_.-]+")
+def prompt_dir_name(item: PromptItem) -> str:
+    """Return the deterministic output directory name for a prompt."""
+    raw_id = item.prompt_id.strip()
+    if raw_id.isdigit():
+        return f"{int(raw_id):04d}"
+    cleaned = _SAFE_ID_RE.sub("_", raw_id).strip("._-")
+    return cleaned or f"row_{item.row_number + 1:04d}"
+def load_prompt_items(
+    *,
+    prompt: str | None = None,
+    prompts_path: Path | None = None,
+    limit: int | None = None,
+) -> list[PromptItem]:
+    """Load prompts from a literal prompt or a txt/jsonl/csv file."""
+    if bool(prompt) == bool(prompts_path):
+        raise ValueError("Provide exactly one of --prompt or --prompts.")
+    if prompt:
+        items = [PromptItem(prompt_id="1", row_number=0, prompt=prompt.strip())]
+    elif prompts_path is not None:
+        items = _load_prompts_path(prompts_path)
+    else:
+        items = []
+    items = [item for item in items if item.prompt.strip()]
+    if limit is not None and limit >= 0:
+        items = items[:limit]
+    _validate_unique_output_dirs(items)
+    return items
+def _load_prompts_path(path: Path) -> list[PromptItem]:
+    suffix = path.suffix.lower()
+    if suffix == ".txt":
+        return _load_txt_prompts(path)
+    if suffix == ".jsonl":
+        return _load_jsonl_prompts(path)
+    if suffix == ".csv":
+        return _load_csv_prompts(path)
+    raise ValueError(f"Unsupported prompt file extension {suffix!r}. Use .txt, .jsonl, or .csv.")
+def _load_txt_prompts(path: Path) -> list[PromptItem]:
+    items: list[PromptItem] = []
+    for row_number, line in enumerate(path.read_text(encoding="utf-8").splitlines()):
+        prompt = line.strip()
+        if not prompt:
+            continue
+        items.append(PromptItem(prompt_id=str(len(items) + 1), row_number=row_number, prompt=prompt))
+    return items
+def _load_jsonl_prompts(path: Path) -> list[PromptItem]:
+    items: list[PromptItem] = []
+    with path.open(encoding="utf-8") as f:
+        for row_number, line in enumerate(f):
+            raw = line.strip()
+            if not raw:
+                continue
+            parsed = json.loads(raw)
+            if isinstance(parsed, str):
+                prompt = parsed.strip()
+                prompt_id = str(len(items) + 1)
+                metadata: dict[str, Any] = {}
+            elif isinstance(parsed, dict):
+                prompt = str(parsed.get("prompt") or parsed.get("Prompt") or "").strip()
+                prompt_id = str(parsed.get("id") or parsed.get("prompt_id") or len(items) + 1)
+                metadata = {key: value for key, value in parsed.items() if key not in {"prompt", "Prompt"}}
+            else:
+                raise ValueError(f"JSONL row {row_number + 1} must be a string or object.")
+            if prompt:
+                items.append(PromptItem(prompt_id=prompt_id, row_number=row_number, prompt=prompt, metadata=metadata))
+    return items
+def _load_csv_prompts(path: Path) -> list[PromptItem]:
+    items: list[PromptItem] = []
+    with path.open(newline="", encoding="utf-8") as f:
+        reader = csv.DictReader(f)
+        for row_number, row in enumerate(reader):
+            prompt_key = _first_present_key(row, PROMPT_COLUMNS)
+            if prompt_key is None:
+                raise ValueError(f"CSV must include one of these prompt columns: {', '.join(PROMPT_COLUMNS)}.")
+            prompt = str(row.get(prompt_key) or "").strip()
+            if not prompt:
+                continue
+            id_key = _first_present_key(row, ID_COLUMNS)
+            prompt_id = str(row.get(id_key) or len(items) + 1) if id_key is not None else str(len(items) + 1)
+            items.append(PromptItem(prompt_id=prompt_id, row_number=row_number, prompt=prompt, metadata=dict(row)))
+    return items
+def _first_present_key(row: dict[str, Any], keys: tuple[str, ...]) -> str | None:
+    for key in keys:
+        if key in row:
+            return key
+    return None
+def _validate_unique_output_dirs(items: list[PromptItem]) -> None:
+    seen: dict[str, str] = {}
+    for item in items:
+        dirname = prompt_dir_name(item)
+        previous = seen.get(dirname)
+        if previous is not None:
+            raise ValueError(f"Prompt ids {previous!r} and {item.prompt_id!r} map to the same output dir {dirname!r}.")
+        seen[dirname] = item.prompt_id
+def validate_t2i_json(data: dict[str, Any], prompt_id: str) -> None:
+    """Validate the minimum structured T2I JSON shape expected by Cosmos3."""
+    missing = sorted(REQUIRED_T2I_KEYS - set(data))
+    if missing:
+        raise ValueError(f"Prompt JSON for {prompt_id} is missing required keys: {missing}")
+    if not isinstance(data.get("subjects"), list):
+        raise ValueError(f"Prompt JSON for {prompt_id}: subjects must be a list.")
+    if not isinstance(data.get("text_and_signage_elements"), list):
+        raise ValueError(f"Prompt JSON for {prompt_id}: text_and_signage_elements must be a list.")
+    caption = data.get("comprehensive_t2i_caption")
+    if not isinstance(caption, str) or not caption.strip():
+        raise ValueError(f"Prompt JSON for {prompt_id}: comprehensive_t2i_caption is empty.")
+    resolution = data.get("resolution")
+    if not isinstance(resolution, dict) or not {"H", "W"}.issubset(resolution):
+        raise ValueError(f"Prompt JSON for {prompt_id}: resolution must contain H and W.")
+    aspect_ratio = data.get("aspect_ratio")
+    if not isinstance(aspect_ratio, str) or not aspect_ratio.strip():
+        raise ValueError(f"Prompt JSON for {prompt_id}: aspect_ratio must be a non-empty string.")

agentic_upsampling/extract_best.py ADDED Viewed

	@@ -0,0 +1,155 @@

+"""Extract best agentic upsampling images from an output directory."""
+from __future__ import annotations
+import argparse
+import csv
+import json
+import shutil
+from pathlib import Path
+from typing import Any
+from agentic_upsampling.io_utils import append_jsonl
+IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".webp"}
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--output-dir", type=Path, required=True, help="Agentic upsampler run output directory.")
+    parser.add_argument(
+        "--export-dir",
+        type=Path,
+        default=None,
+        help="Directory for copied best images and manifests. Defaults to OUTPUT_DIR/best_generations.",
+    )
+    parser.add_argument("--overwrite", action="store_true", help="Replace existing copied images/manifests.")
+    return parser.parse_args()
+def iter_best_jsons(output_dir: Path) -> list[Path]:
+    """Return per-prompt best.json files in deterministic order."""
+    return sorted(path for path in output_dir.glob("*/best.json") if path.parent.name != "best_generations")
+def resolve_image_path(raw_path: str, *, output_dir: Path, best_json_path: Path) -> Path:
+    """Resolve image paths written by runs launched with relative or absolute output dirs."""
+    image_path = Path(raw_path)
+    candidates = [image_path]
+    if not image_path.is_absolute():
+        candidates.extend(
+            [
+                output_dir / image_path,
+                output_dir.parent / image_path,
+                best_json_path.parent / image_path.name,
+            ]
+        )
+    for candidate in candidates:
+        if candidate.exists():
+            return candidate
+    raise FileNotFoundError(f"Best image does not exist: {raw_path}")
+def copied_image_name(record: dict[str, Any], image_path: Path) -> str:
+    """Build a simple copied image filename."""
+    prompt_id = str(record["prompt_id"])
+    suffix = image_path.suffix.lower()
+    if suffix not in IMAGE_SUFFIXES:
+        suffix = ".jpg"
+    return f"{prompt_id}{suffix}"
+def extract_record(best_json_path: Path, *, output_dir: Path, images_dir: Path, overwrite: bool) -> dict[str, Any]:
+    """Copy one best image and return its export manifest record."""
+    best_data = json.loads(best_json_path.read_text(encoding="utf-8"))
+    if not isinstance(best_data, dict):
+        raise ValueError(f"{best_json_path} must contain a JSON object.")
+    best = best_data.get("best")
+    if not isinstance(best, dict):
+        raise ValueError(f"{best_json_path} is missing best candidate metadata.")
+    raw_image_path = str(best.get("image_path") or "")
+    if not raw_image_path:
+        raise ValueError(f"{best_json_path} best candidate is missing image_path.")
+    image_path = resolve_image_path(raw_image_path, output_dir=output_dir, best_json_path=best_json_path)
+    record = {
+        "prompt_id": str(best_data["prompt_id"]),
+        "prompt": str(best_data.get("prompt") or ""),
+        "best_score": best_data.get("best_score"),
+        "best_iteration": best_data.get("best_iteration"),
+        "selected_sample_index": best.get("selected_sample_index", best.get("sample_index")),
+        "threshold_cleared_any": bool(best_data.get("threshold_cleared_any")),
+        "source_image_path": str(image_path),
+        "best_json_path": str(best_json_path),
+        "analysis_path": str(best.get("analysis_path") or ""),
+    }
+    dest_path = images_dir / copied_image_name(record, image_path)
+    if dest_path.exists() and not overwrite:
+        raise FileExistsError(f"Refusing to overwrite existing image: {dest_path}")
+    images_dir.mkdir(parents=True, exist_ok=True)
+    shutil.copy2(image_path, dest_path)
+    record["copied_image_path"] = str(dest_path)
+    return record
+def write_csv(path: Path, records: list[dict[str, Any]]) -> None:
+    """Write a flat CSV summary for quick spreadsheet inspection."""
+    fieldnames = [
+        "prompt_id",
+        "best_score",
+        "best_iteration",
+        "selected_sample_index",
+        "threshold_cleared_any",
+        "copied_image_path",
+        "source_image_path",
+        "best_json_path",
+        "analysis_path",
+        "prompt",
+    ]
+    with path.open("w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+        for record in records:
+            writer.writerow({key: record.get(key, "") for key in fieldnames})
+def extract_best_images(output_dir: Path, export_dir: Path, *, overwrite: bool = False) -> list[dict[str, Any]]:
+    """Copy best images from a run and write JSONL/CSV manifests."""
+    output_dir = output_dir.expanduser()
+    export_dir = export_dir.expanduser()
+    if not output_dir.exists():
+        raise FileNotFoundError(f"Missing output directory: {output_dir}")
+    best_jsons = iter_best_jsons(output_dir)
+    if not best_jsons:
+        raise RuntimeError(f"No per-prompt best.json files found under {output_dir}")
+    images_dir = export_dir / "images"
+    manifest_path = export_dir / "best_generations.jsonl"
+    csv_path = export_dir / "best_generations.csv"
+    if overwrite:
+        manifest_path.unlink(missing_ok=True)
+        csv_path.unlink(missing_ok=True)
+    elif manifest_path.exists() or csv_path.exists():
+        raise FileExistsError(f"Export manifests already exist in {export_dir}; pass --overwrite to replace them.")
+    records: list[dict[str, Any]] = []
+    for best_json_path in best_jsons:
+        record = extract_record(best_json_path, output_dir=output_dir, images_dir=images_dir, overwrite=overwrite)
+        records.append(record)
+        append_jsonl(manifest_path, record)
+    write_csv(csv_path, records)
+    return records
+def main() -> int:
+    args = parse_args()
+    export_dir = args.export_dir or (args.output_dir / "best_generations")
+    records = extract_best_images(args.output_dir, export_dir, overwrite=args.overwrite)
+    print(f"Exported {len(records)} best images to {export_dir}", flush=True)
+    print(f"Images: {export_dir / 'images'}", flush=True)
+    print(f"JSONL:  {export_dir / 'best_generations.jsonl'}", flush=True)
+    print(f"CSV:    {export_dir / 'best_generations.csv'}", flush=True)
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

agentic_upsampling/io_utils.py ADDED Viewed

	@@ -0,0 +1,46 @@

+"""Small JSON and file helpers for agentic upsampling runs."""
+from __future__ import annotations
+import json
+import os
+import tempfile
+from pathlib import Path
+from typing import Any
+def write_json_atomic(path: Path, data: Any, *, ensure_ascii: bool = True) -> None:
+    """Write JSON through a temporary file and atomically replace the destination."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    fd, tmp_name = tempfile.mkstemp(prefix=f".{path.name}.", suffix=".tmp", dir=path.parent)
+    try:
+        with os.fdopen(fd, "w", encoding="utf-8") as f:
+            json.dump(data, f, ensure_ascii=ensure_ascii, indent=2)
+            f.write("\n")
+        Path(tmp_name).replace(path)
+    except Exception:
+        try:
+            Path(tmp_name).unlink(missing_ok=True)
+        finally:
+            raise
+def append_jsonl(path: Path, data: Any, *, ensure_ascii: bool = True) -> None:
+    """Append one compact JSON record to a JSONL file."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("a", encoding="utf-8") as f:
+        f.write(json.dumps(data, ensure_ascii=ensure_ascii, separators=(",", ":")) + "\n")
+def read_json(path: Path) -> dict[str, Any]:
+    """Read a JSON object from disk."""
+    data = json.loads(path.read_text(encoding="utf-8"))
+    if not isinstance(data, dict):
+        raise ValueError(f"{path} must contain a JSON object.")
+    return data
+def compact_json(data: dict[str, Any], *, ensure_ascii: bool = True) -> str:
+    """Serialize JSON using the compact prompt format expected by the generation endpoint."""
+    return json.dumps(data, ensure_ascii=ensure_ascii, separators=(",", ":"))

agentic_upsampling/prompt_upsampler.py ADDED Viewed

	@@ -0,0 +1,388 @@

+"""OpenAI-compatible text-to-image prompt upsampling client."""
+from __future__ import annotations
+import json
+import logging
+import os
+import re
+import time
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Any
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+from agentic_upsampling.constants import (
+    DEFAULT_LLM_EXTRA_BODY,
+    DEFAULT_UPSAMPLER_ENDPOINT_URL,
+    DEFAULT_UPSAMPLER_MODEL,
+)
+from agentic_upsampling.data import validate_t2i_json
+JSON_ENSURE_ASCII = bool(int(os.environ.get("JSON_ENSURE_ASCII", "1")))
+DEFAULT_USER_AGENT = "Cosmos3-Super-Text2Image-Agentic-Upsampling/1.0"
+SYSTEM_MESSAGE: dict[str, Any] = {
+    "role": "system",
+    "content": [{"type": "text", "text": "You are a helpful assistant."}],
+}
+log = logging.getLogger(__name__)
+# Pixel dimensions per resolution label and aspect ratio.
+# Outer key: resolution label as a string; inner key: aspect ratio written "w,h"; value: width W and
+# height H in pixels. NOTE(review): the lookup that consumes this table is outside this chunk.
+RESOLUTION_RATIO_DICT: dict[str, dict[str, dict[str, int]]] = {
+    "256": {
+        "1,1": {"W": 256, "H": 256},
+        "4,3": {"W": 320, "H": 256},
+        "3,4": {"W": 256, "H": 320},
+        "16,9": {"W": 320, "H": 192},
+        "9,16": {"W": 192, "H": 320},
+    },
+    "480": {
+        "1,1": {"W": 640, "H": 640},
+        "4,3": {"W": 736, "H": 544},
+        "3,4": {"W": 544, "H": 736},
+        "16,9": {"W": 832, "H": 480},
+        "9,16": {"W": 480, "H": 832},
+    },
+    "720": {
+        "1,1": {"W": 960, "H": 960},
+        "4,3": {"W": 1104, "H": 832},
+        "3,4": {"W": 832, "H": 1104},
+        "16,9": {"W": 1280, "H": 720},
+        "9,16": {"W": 720, "H": 1280},
+    },
+    "768": {
+        "1,1": {"W": 1024, "H": 1024},
+        "4,3": {"W": 1184, "H": 880},
+        "3,4": {"W": 880, "H": 1184},
+        "16,9": {"W": 1360, "H": 768},
+        "9,16": {"W": 768, "H": 1360},
+    },
+}
+# Instruction prompt sent to the upsampler LLM. ``{caption_dense}`` is the only placeholder; literal
+# braces are doubled, so this is presumably rendered with str.format -- confirm at the call site,
+# which is outside this chunk.
+T2I_JSON_TEMPLATE = """Given the user's natural-language request below, generate a dense structured JSON that fully describes the image to be produced. The JSON must strictly follow the template provided after the request, including every top-level key and every nested sub-field.
+The output is always dense. Even when the request is brief, infer plausible, scene-consistent details for every field. Do not leave fields empty merely because the request did not mention them. Be creative but stay grounded: additions must be physically plausible and internally consistent with the request.
+Requirements:
+- Extract visual intent from the user request into the visual fields.
+- For every visual field, write rich, specific content inferred from the request's scene, subjects, mood, and context.
+- Empty values ("", 0, [], {{}}) are permitted only for truly inapplicable fields.
+- Do not add keys beyond the template. Do not omit keys required by the template.
+- Return only the JSON object. Do not include markdown fences or prose outside JSON.
+USER VISUAL REQUEST:
+{caption_dense}
+Lists may contain zero or more items of the shape shown. All top-level keys must always be present in the output; fill unused fields with "", 0, {{}}, or [] as appropriate.
+{{
+  "subjects": [
+    {{
+      "description": "full visual description of the subject",
+      "appearance_details": "additional visual details such as accessories, texture, and distinguishing features",
+      "relationship": "how this subject relates to others or to the scene",
+      "location": "where in frame, for example center foreground or top right",
+      "relative_size": "size within frame",
+      "orientation": "direction subject faces relative to camera",
+      "pose": "body position and posture",
+      "clothing": "clothing and accessories; empty string if non-human or not applicable",
+      "expression": "facial expression; empty string if non-human or not applicable",
+      "gender": "Male, Female, Unknown, or N/A",
+      "age": "age category",
+      "skin_tone_and_texture": "skin tone description; empty string if non-human",
+      "facial_features": "notable facial features; empty string if non-human or not visible",
+      "number_of_subjects": "int; total in this subject group, 0 if not applicable",
+      "number_of_arms": "int; 2 for humans, 0 if non-human",
+      "number_of_legs": "int; 2 for humans, 0 if non-human",
+      "number_of_hands": "int; 2 for humans, 0 if non-human",
+      "number_of_fingers": "int; 10 for humans, 0 if non-human"
+    }}
+  ],
+  "subject_details": {{
+    "key_name_1": "free-form image-specific attribute; empty object if not applicable"
+  }},
+  "background_setting": "full prose description of the environment and setting",
+  "lighting": {{
+    "conditions": "type and quality of light",
+    "direction": "where light comes from; None for flat digital images",
+    "shadows": "shadow description; None for flat digital images",
+    "illumination_effect": "overall effect of the lighting"
+  }},
+  "aesthetics": {{
+    "composition": "framing and compositional choices",
+    "color_scheme": "dominant colors and palette",
+    "mood_atmosphere": "emotional atmosphere in short phrases",
+    "patterns": "notable repeating visual patterns; None if none"
+  }},
+  "cinematography": {{
+    "framing": "shot type",
+    "camera_angle": "angle such as Eye-level, Low angle, or High angle",
+    "depth_of_field": "Shallow, Deep, Uniform focus, or N/A",
+    "focus": "what is in sharp focus",
+    "lens_focal_length": "descriptive focal length"
+  }},
+  "style_medium": "visual medium, for example Photography, Digital illustration, or Screenshot",
+  "artistic_style": "genre or approach",
+  "context": "scene context or use case",
+  "text_and_signage_elements": [
+    {{
+      "text": "the visible text content",
+      "category": "physical_in_scene, ui_text, body_text, scene_sign, logo, or label",
+      "appearance": "font, color, size, style",
+      "spatial": "position in image",
+      "context": "purpose or meaning of the text"
+    }}
+  ],
+  "quadrant_scan": {{
+    "top_left": "description of what appears in the top-left region",
+    "top_right": "description of what appears in the top-right region",
+    "bottom_left": "description of what appears in the bottom-left region",
+    "bottom_right": "description of what appears in the bottom-right region",
+    "absolute_center": "description of what appears at the center"
+  }},
+  "comprehensive_t2i_caption": "a comprehensive, full-scene natural-language prose description of the image",
+  "resolution": {{
+    "H": "will be overwritten by the selected resolution and aspect ratio",
+    "W": "will be overwritten by the selected resolution and aspect ratio"
+  }},
+  "aspect_ratio": "will be overwritten by the selected aspect ratio"
+}}"""
+@dataclass(slots=True)
+class ChatClientConfig:
+    """Configuration for an OpenAI-compatible chat-completions endpoint."""
+    # Endpoint URL; OpenAIChatClient passes it through normalize_openai_base_url before use.
+    endpoint_url: str
+    # Model name sent in the request payload.
+    model: str
+    # Sent as a Bearer token when non-empty.
+    api_token: str
+    # Timeout handed to each HTTP request.
+    timeout_s: float = 300.0
+    # Token limit sent as max_tokens or max_completion_tokens, depending on the endpoint.
+    max_tokens: int = 8192
+    # Total number of attempts per completion; must be >= 1.
+    max_retries: int = 3
+    # Delay before the first retry; doubled for each further retry.
+    retry_base_delay_s: float = 1.0
+    # Optional fields merged into the request payload last, so they override the built-in keys.
+    extra_body: dict[str, Any] | None = None
+    # NOTE(review): presumably consumed by _make_session for the HTTP adapter -- defined outside this chunk.
+    connection_max_retries: int = 2
+    connection_pool_size: int = 4
+class OpenAIChatClient:
+    """Small synchronous OpenAI-compatible chat-completions client."""
+    config: ChatClientConfig
+    base_url: str
+    session: requests.Session
+    sleep: Callable[[float], None]
+    def __init__(
+        self,
+        config: ChatClientConfig,
+        *,
+        session: requests.Session | None = None,
+        sleep: Callable[[float], None] = time.sleep,
+    ) -> None:
+        self.config = config
+        self.base_url = normalize_openai_base_url(config.endpoint_url)
+        self.session = _make_session(config) if session is None else session
+        self.sleep = sleep
+    def complete(self, messages: list[dict[str, Any]], *, response_format_json: bool = False) -> str:
+        """Request one chat completion and return assistant text."""
+        def _call() -> str:
+            payload: dict[str, Any] = {
+                "model": self.config.model,
+                "messages": messages,
+                self._max_tokens_key(): self.config.max_tokens,
+            }
+            if response_format_json:
+                payload["response_format"] = {"type": "json_object"}
+            if self.config.extra_body:
+                payload.update(self.config.extra_body)
+            parsed = self._request_json("POST", f"{self.base_url}/chat/completions", payload=payload)
+            choices = parsed.get("choices")
+            if not isinstance(choices, list) or not choices:
+                raise ValueError("Chat completion response missing choices.")
+            first_choice = choices[0]
+            if not isinstance(first_choice, dict):
+                raise ValueError("Chat completion choice must be an object.")
+            message = first_choice.get("message")
+            if not isinstance(message, dict):
+                raise ValueError("Chat completion choice missing message.")
+            return _message_content_to_text(message.get("content"))
+        return self._with_retries("complete chat request", _call)
+    def _request_json(self, method: str, url: str, *, payload: dict[str, Any] | None = None) -> dict[str, Any]:
+        headers = {"Accept": "application/json", "User-Agent": DEFAULT_USER_AGENT}
+        if payload is not None:
+            headers["Content-Type"] = "application/json"
+        if self.config.api_token:
+            headers["Authorization"] = f"Bearer {self.config.api_token}"
+        try:
+            response = self.session.request(method, url, json=payload, headers=headers, timeout=self.config.timeout_s)
+        except requests.RequestException as exc:
+            raise RuntimeError(f"Failed to reach {url}: {exc}") from exc
+        if not response.ok:
+            raise RuntimeError(f"HTTP {response.status_code} from {url}: {response.text[:1000]}")
+        parsed = response.json()
+        if not isinstance(parsed, dict):
+            raise RuntimeError(f"Response from {url} must be a JSON object.")
+        return parsed
+    def _with_retries(self, operation: str, fn: Callable[[], str]) -> str:
+        """Call ``fn`` until it succeeds or the configured attempts are used up.
+
+        Sleeps ``retry_base_delay_s * 2**k`` after the k-th failed attempt (k starting
+        at 0) and does not sleep after the last one. Raises ``ValueError`` when
+        ``max_retries`` is below 1 and ``RuntimeError`` (chained to the last failure)
+        when every attempt fails.
+        """
+        total_attempts = self.config.max_retries
+        if total_attempts < 1:
+            raise ValueError("max_retries must be >= 1.")
+        failure: Exception | None = None
+        for attempt_number in range(1, total_attempts + 1):
+            try:
+                return fn()
+            except Exception as exc:
+                failure = exc
+                # Back off only when another attempt will follow.
+                if attempt_number < total_attempts:
+                    self.sleep(self.config.retry_base_delay_s * (2 ** (attempt_number - 1)))
+        raise RuntimeError(f"Failed to {operation} after {self.config.max_retries} attempts: {failure}") from failure
+    def _max_tokens_key(self) -> str:
+        """Return the token-limit field name to use for the configured base URL."""
+        # URLs containing api.openai.com get "max_completion_tokens"; all others get "max_tokens".
+        targets_openai = "api.openai.com" in self.base_url
+        return "max_completion_tokens" if targets_openai else "max_tokens"
+class Text2ImagePromptUpsampler:
+    """Turn free-form user text into structured Cosmos3 text-to-image JSON prompts."""
+    chat_client: OpenAIChatClient
+    def __init__(self, chat_client: OpenAIChatClient) -> None:
+        """Keep the chat client that serves every upsampling request."""
+        self.chat_client = chat_client
+    @classmethod
+    def from_defaults(
+        cls,
+        *,
+        api_token: str,
+        endpoint_url: str = DEFAULT_UPSAMPLER_ENDPOINT_URL,
+        model: str = DEFAULT_UPSAMPLER_MODEL,
+        extra_body: dict[str, Any] | None = None,
+    ) -> Text2ImagePromptUpsampler:
+        """Build an upsampler from the module defaults, overriding only what is passed."""
+        # None means "use the default extra body"; any other value is passed through as-is.
+        request_extras = extra_body if extra_body is not None else DEFAULT_LLM_EXTRA_BODY
+        client_config = ChatClientConfig(
+            endpoint_url=endpoint_url,
+            model=model,
+            api_token=api_token,
+            extra_body=request_extras,
+        )
+        return cls(OpenAIChatClient(client_config))
+    def upsample(
+        self,
+        prompt: str,
+        *,
+        prompt_id: str,
+        resolution: str,
+        aspect_ratio: str,
+        user_prompt: str | None = None,
+    ) -> dict[str, Any]:
+        """Request a structured prompt, stamp the canvas parameters on it, validate it, and return it."""
+        chat_messages = build_t2i_messages(prompt, user_prompt=user_prompt)
+        raw_reply = self.chat_client.complete(chat_messages, response_format_json=True)
+        structured = extract_json_object(raw_reply)
+        # The selected resolution/aspect ratio overwrite whatever the model produced.
+        structured = apply_t2i_output_parameters(structured, resolution=resolution, aspect_ratio=aspect_ratio)
+        validate_t2i_json(structured, prompt_id)
+        return structured
+def build_t2i_messages(prompt: str, *, user_prompt: str | None = None) -> list[dict[str, Any]]:
+    """Assemble the system and user chat messages for the first upsampling request."""
+    # A non-empty user_prompt replaces the templated text entirely.
+    if user_prompt:
+        user_text = user_prompt
+    else:
+        user_text = T2I_JSON_TEMPLATE.format(caption_dense=prompt.strip())
+    user_message = {"role": "user", "content": [{"type": "text", "text": user_text}]}
+    return [SYSTEM_MESSAGE, user_message]
+def apply_t2i_output_parameters(data: dict[str, Any], *, resolution: str, aspect_ratio: str) -> dict[str, Any]:
+    """Write the selected canvas size and aspect ratio into ``data`` (in place) and return it.
+
+    Raises ``ValueError`` when the resolution, or the aspect ratio for that resolution,
+    is not present in ``RESOLUTION_RATIO_DICT``.
+    """
+    if resolution not in RESOLUTION_RATIO_DICT:
+        raise ValueError(f"Unsupported resolution {resolution!r}.")
+    ratios_for_resolution = RESOLUTION_RATIO_DICT[resolution]
+    if aspect_ratio not in ratios_for_resolution:
+        raise ValueError(f"Unsupported aspect_ratio {aspect_ratio!r} for resolution {resolution!r}.")
+    canvas = ratios_for_resolution[aspect_ratio]
+    canvas_size = {"H": canvas["H"], "W": canvas["W"]}
+    data["resolution"] = canvas_size
+    data["aspect_ratio"] = aspect_ratio
+    return data
+def extract_json_object(text: str) -> dict[str, Any]:
+    """Extract the first JSON object found in raw model text.
+
+    The contents of the first Markdown code fence are tried first, then the whole
+    text. In each candidate, decoding starts at the first ``{`` and stops at the end
+    of that object, so trailing prose (even prose containing ``}``) is ignored.
+
+    Args:
+        text: Raw model output, optionally wrapped in Markdown fences or prose.
+
+    Returns:
+        The decoded JSON object.
+
+    Raises:
+        ValueError: If no candidate contains ``{``, or if decoding fails (the first
+            ``json.JSONDecodeError``, itself a ``ValueError``, is re-raised).
+    """
+    cleaned = text.strip()
+    candidates: list[str] = []
+    fence_match = re.search(r"```(?:json)?\s*(.*?)\s*```", cleaned, flags=re.DOTALL)
+    if fence_match:
+        candidates.append(fence_match.group(1).strip())
+    # Fall back to the full text when the fenced block holds no usable object.
+    candidates.append(cleaned)
+    decoder = json.JSONDecoder()
+    first_error: json.JSONDecodeError | None = None
+    for candidate in candidates:
+        start = candidate.find("{")
+        if start < 0:
+            continue
+        try:
+            # raw_decode parses one document and ignores whatever follows it.
+            parsed, _end = decoder.raw_decode(candidate[start:])
+        except json.JSONDecodeError as exc:
+            if first_error is None:
+                first_error = exc
+            continue
+        if not isinstance(parsed, dict):
+            raise ValueError("Model response JSON must be an object.")
+        return parsed
+    if first_error is not None:
+        raise first_error
+    raise ValueError("Model response did not contain a JSON object.")
+def normalize_openai_base_url(url: str) -> str:
+    """Reduce a user-supplied endpoint to an OpenAI-compatible API root.
+
+    Trims whitespace and trailing slashes, defaults the scheme to ``https://``, drops a
+    trailing ``/chat/completions``, and appends ``/v1`` unless the result already ends
+    in ``/v1`` or ``/openai``. Raises ``ValueError`` for an empty URL.
+    """
+    completions_suffix = "/chat/completions"
+    root = url.strip().rstrip("/")
+    if not root:
+        raise ValueError("endpoint_url cannot be empty.")
+    has_scheme = root.startswith(("http://", "https://"))
+    if not has_scheme:
+        root = "https://" + root
+    if root.endswith(completions_suffix):
+        root = root[: len(root) - len(completions_suffix)]
+    already_rooted = root.endswith(("/v1", "/openai"))
+    return root if already_rooted else root + "/v1"
+def _make_session(config: ChatClientConfig) -> requests.Session:
+    """Create a requests session whose HTTP and HTTPS adapters share one retry and pool setup."""
+    retry_budget = config.connection_max_retries
+    pool_size = config.connection_pool_size
+    retry_policy = Retry(
+        total=retry_budget,
+        connect=retry_budget,
+        read=0,
+        status=2,
+        status_forcelist=(429, 500, 502, 503, 504),
+        allowed_methods=frozenset({"GET", "POST"}),
+        backoff_factor=0.5,
+        raise_on_status=False,
+    )
+    shared_adapter = HTTPAdapter(
+        pool_connections=pool_size,
+        pool_maxsize=pool_size,
+        max_retries=retry_policy,
+    )
+    http_session = requests.Session()
+    # The same adapter instance is mounted for both schemes.
+    for scheme_prefix in ("https://", "http://"):
+        http_session.mount(scheme_prefix, shared_adapter)
+    return http_session
+def _message_content_to_text(content: Any) -> str:
+    """Return the text carried by a chat message ``content`` value.
+
+    A non-blank string is returned unchanged. A list is reduced to the concatenation
+    of its ``{"type": "text", "text": <str>}`` parts, stripped. Anything else, or an
+    empty result, raises ``ValueError``.
+    """
+    if isinstance(content, str):
+        if content.strip():
+            return content
+    elif isinstance(content, list):
+        fragments = [
+            part["text"]
+            for part in content
+            if isinstance(part, dict) and part.get("type") == "text" and isinstance(part.get("text"), str)
+        ]
+        joined = "".join(fragments).strip()
+        if joined:
+            return joined
+    raise ValueError("Chat completion message content is empty or unsupported.")

agentic_upsampling/rubric.py ADDED Viewed

	@@ -0,0 +1,220 @@

+"""VLM critic prompt and score normalization for agentic T2I upsampling."""
+from __future__ import annotations
+import json
+from typing import Any
+from agentic_upsampling.constants import STRICT_OVERALL_THRESHOLD, STRICT_PROMPT_THRESHOLD
+from agentic_upsampling.data import PromptItem
+from agentic_upsampling.prompt_upsampler import extract_json_object
+# Checklist paragraphs keyed by category name. all_category_check_text() joins every
+# value into a bullet list that build_judge_prompt() embeds in the critic prompt.
+CATEGORY_SECTIONS = {
+    "text_commercial_ui": (
+        "Text/commercial/UI/logo checks: readable text for logos, labels, posters, "
+        "billboards, product packaging, or UI. Verify exact quoted strings, spelling, legibility, typography, "
+        "placement, layout, and whether commercial/UI intent is visually clear."
+    ),
+    "people_anatomy": (
+        "People/anatomy checks: if humans, human-like characters, body parts, portraits, or poses are present or "
+        "required by the prompt, inspect faces, eyes, hands, fingers, limbs, pose, proportions, expression, "
+        "clothing coherence, and physically possible interactions."
+    ),
+    "fantasy_cartoon_vector": (
+        "Fantasy/cartoon/vector/pixel-art checks: if a stylized medium is requested, judge whether stylization is "
+        "intentional and clean. Penalize messy geometry, inconsistent line language, broken vector shapes, muddy "
+        "palettes, and unwanted photorealistic texture."
+    ),
+    "photorealistic_physical": (
+        "Photorealistic/physical checks: if realism, physical objects, geometry, camera behavior, reflections, "
+        "transparent materials, shadows, perspective, scale, or contact matter, judge material realism, lighting "
+        "physics, lens plausibility, and whether objects obey real-world physical constraints."
+    ),
+    "general_scene": (
+        "General scene checks: always judge object completeness, layout clarity, subject relationships, background "
+        "coherence, visual appeal, and absence of obvious AI artifacts."
+    ),
+}
+# Score fields that normalize_analysis() always coerces through _score(); a missing or
+# unparseable value therefore ends up as 0.0.
+SCORE_KEYS = (
+    "prompt_adherence_score",
+    "visual_quality_score",
+    "aesthetics_score",
+    "physical_plausibility_score",
+    "category_score",
+    "overall_score",
+)
+# Severity labels accepted by _normalize_issues(); any other label is replaced with "moderate".
+ISSUE_SEVERITIES = {"minor", "moderate", "severe"}
+def all_category_check_text() -> str:
+    """Render every category checklist entry as one ``- ``-prefixed line, in dict order."""
+    bullet_lines = []
+    for checklist_text in CATEGORY_SECTIONS.values():
+        bullet_lines.append(f"- {checklist_text}")
+    return "\n".join(bullet_lines)
+def build_judge_prompt(item: PromptItem) -> str:
+    """Render the VLM critic instructions for one prompt item.
+
+    The text combines a fixed defect checklist, the category checklist, the item's
+    original prompt, and the JSON schema the critic is asked to return.
+    """
+    category_checklist = all_category_check_text()
+    original_prompt = item.prompt
+    return f"""You are an expert image quality analyst specializing in AI-generated image evaluation.
+Your job is to produce an exhaustive defect report. Be meticulous: go beyond obvious problems and look carefully for subtle or background issues too.
+The attached image was generated by an AI image model.
+Analyze this image carefully and list every quality issue you observe.
+For each issue give an approximate location and name the specific object or region involved. Report each distinct occurrence separately.
+Before finalizing, check these areas, but only report issues you actually see:
+- Physics: gravity violations, impossible collisions, implausible trajectories.
+- Object deformation: morphing, melting, stretching of solid objects.
+- Anatomy: distorted hands, faces, fingers, limbs, or wrong body proportions.
+- Lighting and shadows: missing shadows or inconsistent illumination.
+- Depth and scale: wrong spatial relationships, perspective issues, or scale inconsistencies.
+- Text and numbers: garbled, floating, or incorrect text and digits.
+- Visual quality: blur patches, noise, compression blocking, visual artifacts, or low-resolution regions.
+- Color: inconsistent coloration, bleeding, or banding.
+- Action correctness: prompted actions are correctly displayed.
+- Prompt following: missing subjects, wrong objects, wrong setting, or wrong action.
+Depending on the prompt, also apply the relevant checks below:
+{category_checklist}
+The attached image was generated from this prompt:
+{original_prompt}
+Return exactly one JSON object, no markdown fences and no prose outside JSON:
+{{
+  "prompt_adherence_score": <number 0-10>,
+  "visual_quality_score": <number 0-10>,
+  "aesthetics_score": <number 0-10>,
+  "physical_plausibility_score": <number 0-10>,
+  "category_score": <number 0-10>,
+  "text_rendering_score": <number 0-10 or null>,
+  "photorealism_score": <number 0-10 or null>,
+  "overall_score": <number 0-10>,
+  "issues": [
+    {{
+      "category": "<concise label>",
+      "description": "<what failed and where in the image>",
+      "severity": "minor" | "moderate" | "severe"
+    }}
+  ],
+  "prompt_elements": {{
+    "<key noun or action from the prompt>": "present" | "absent" | "partial"
+  }},
+  "category_findings": {{"<check area>": "<concise finding>"}},
+  "improvement_directives": ["<specific prompt rewrite instruction>"],
+  "rationale": "<2-4 concise sentences>"
+}}
+"""
+def parse_analysis_response(text: str) -> dict[str, Any]:
+    """Decode the critic's raw reply into a JSON object and normalize it."""
+    raw_analysis = extract_json_object(text)
+    return normalize_analysis(raw_analysis)
+def normalize_analysis(data: dict[str, Any]) -> dict[str, Any]:
+    """Return a shallow copy of ``data`` coerced into the schema used downstream.
+
+    Required scores are clamped floats, optional scores are clamped only when present,
+    issues and directives are cleaned lists, ``category_findings`` is a dict, and
+    ``threshold_cleared`` records the strict-threshold verdict.
+    """
+    result = dict(data)
+    result.update({score_key: _score(result.get(score_key)) for score_key in SCORE_KEYS})
+    # Optional scores stay None/absent when the critic did not provide them.
+    for score_key in ("text_rendering_score", "photorealism_score"):
+        optional_value = result.get(score_key)
+        if optional_value is not None:
+            result[score_key] = _score(optional_value)
+    result["issues"] = _normalize_issues(result.get("issues"))
+    raw_directives = result.get("improvement_directives")
+    result["improvement_directives"] = (
+        [str(entry) for entry in raw_directives if str(entry).strip()] if isinstance(raw_directives, list) else []
+    )
+    raw_findings = result.get("category_findings")
+    result["category_findings"] = raw_findings if isinstance(raw_findings, dict) else {}
+    # Computed last so it sees the cleaned scores and issues.
+    result["threshold_cleared"] = clears_strict_threshold(result)
+    return result
+def clears_strict_threshold(analysis: dict[str, Any]) -> bool:
+    """Tell whether an analysis passes the strict quality bar.
+
+    Passing needs an overall score and a prompt-adherence score at or above their
+    thresholds, no issue marked ``severe``, and, when a text-rendering score is
+    present, that score at or above the prompt threshold.
+    """
+    if (
+        _score(analysis.get("overall_score")) < STRICT_OVERALL_THRESHOLD
+        or _score(analysis.get("prompt_adherence_score")) < STRICT_PROMPT_THRESHOLD
+        or _has_severe_issue(analysis.get("issues"))
+    ):
+        return False
+    text_score = analysis.get("text_rendering_score")
+    return text_score is None or _score(text_score) >= STRICT_PROMPT_THRESHOLD
+def candidate_sort_key(candidate: dict[str, Any]) -> tuple[float, float, float, float, float, int]:
+    """Build the ranking key for a candidate: five scores in priority order, then ``-iteration``."""
+    analysis = candidate.get("analysis", {})
+    iteration = int(candidate.get("iteration", 0))
+    priority_order = (
+        "overall_score",
+        "prompt_adherence_score",
+        "category_score",
+        "visual_quality_score",
+        "aesthetics_score",
+    )
+    # Negating the iteration makes an earlier iteration compare higher when all scores tie.
+    return (*(_score(analysis.get(score_name)) for score_name in priority_order), -iteration)
+def compact_analysis_for_rewrite(analysis: dict[str, Any]) -> dict[str, Any]:
+    """Keep only the analysis fields listed below, in this order, skipping absent ones."""
+    wanted_fields = (
+        "overall_score",
+        "prompt_adherence_score",
+        "visual_quality_score",
+        "aesthetics_score",
+        "physical_plausibility_score",
+        "category_score",
+        "text_rendering_score",
+        "photorealism_score",
+        "issues",
+        "prompt_elements",
+        "category_findings",
+        "improvement_directives",
+        "rationale",
+    )
+    compact: dict[str, Any] = {}
+    for field_name in wanted_fields:
+        if field_name in analysis:
+            compact[field_name] = analysis.get(field_name)
+    return compact
+def analysis_json_text(data: dict[str, Any]) -> str:
+    """Render ``data`` as two-space-indented, ASCII-escaped JSON for embedding in a prompt."""
+    serialized = json.dumps(data, indent=2, ensure_ascii=True)
+    return serialized
+def _score(value: Any) -> float:
+    """Coerce a raw critic score to a float clamped to the 0-10 range.
+
+    ``None``, values ``float()`` cannot parse, and NaN all map to ``0.0``. NaN needs
+    its own check: ``json.loads`` accepts a bare ``NaN`` literal, and every comparison
+    with NaN is false, so clamping alone would turn it into ``10.0``.
+    """
+    if value is None:
+        return 0.0
+    try:
+        number = float(value)
+    except (TypeError, ValueError):
+        return 0.0
+    if number != number:  # True only for NaN.
+        return 0.0
+    return max(0.0, min(10.0, number))
+def _normalize_issues(value: Any) -> list[dict[str, str]]:
+    """Clean the critic's issue list.
+
+    Non-list input gives ``[]``. Entries that are not dicts or have no description are
+    dropped. A missing category becomes ``"unspecified"``, and a missing or unknown
+    severity becomes ``"moderate"``.
+    """
+    if not isinstance(value, list):
+        return []
+    cleaned: list[dict[str, str]] = []
+    for raw_issue in value:
+        if not isinstance(raw_issue, dict):
+            continue
+        description = str(raw_issue.get("description") or "").strip()
+        if not description:
+            continue
+        category = str(raw_issue.get("category") or "unspecified").strip() or "unspecified"
+        severity = str(raw_issue.get("severity") or "moderate").strip().lower()
+        cleaned.append(
+            {
+                "category": category,
+                "description": description,
+                "severity": severity if severity in ISSUE_SEVERITIES else "moderate",
+            }
+        )
+    return cleaned
+def _has_severe_issue(issues: Any) -> bool:
+    """Report whether any dict entry in ``issues`` has severity exactly ``"severe"``."""
+    # A falsy value (None, empty list) is treated as "no issues".
+    for entry in issues or []:
+        if isinstance(entry, dict) and entry.get("severity") == "severe":
+            return True
+    return False

agentic_upsampling/run.py ADDED Viewed

	@@ -0,0 +1,187 @@

+"""CLI for standalone agentic Cosmos3 text-to-image prompt upsampling."""
+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+from agentic_upsampling.clients import (
+    ImageGenerationClient,
+    PromptRewriterClient,
+    VLMQualityJudge,
+    read_api_token,
+    read_optional_generation_auth_key,
+)
+from agentic_upsampling.constants import (
+    DEFAULT_ASPECT_RATIO,
+    DEFAULT_CRITIC_ENDPOINT_URL,
+    DEFAULT_CRITIC_MODEL,
+    DEFAULT_FLOW_SHIFT,
+    DEFAULT_GENERATION_AUTH_KEY_ENV,
+    DEFAULT_GENERATION_EXTRA_ARGS,
+    DEFAULT_GENERATION_MODEL,
+    DEFAULT_GEMINI_API_KEY_ENV,
+    DEFAULT_GUIDANCE,
+    DEFAULT_IMAGE_SIZE,
+    DEFAULT_LLM_EXTRA_BODY,
+    DEFAULT_MAX_ITERATIONS,
+    DEFAULT_NUM_STEPS,
+    DEFAULT_OPENAI_API_KEY_ENV,
+    DEFAULT_RESOLUTION,
+    DEFAULT_REWRITER_ENDPOINT_URL,
+    DEFAULT_REWRITER_MODEL,
+    DEFAULT_SAMPLES_PER_ITERATION,
+    DEFAULT_UPSAMPLER_ENDPOINT_URL,
+    DEFAULT_UPSAMPLER_MODEL,
+)
+from agentic_upsampling.data import load_prompt_items
+from agentic_upsampling.extract_best import extract_best_images
+from agentic_upsampling.io_utils import write_json_atomic
+from agentic_upsampling.runner import AgenticUpsamplerRunner, RunnerConfig, write_run_manifest
+def parse_args() -> argparse.Namespace:
+    """Define the command-line interface and parse the process arguments."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    # Exactly one prompt source must be given.
+    prompt_source = cli.add_mutually_exclusive_group(required=True)
+    prompt_source.add_argument("--prompt", default=None, help="Single text prompt to run.")
+    prompt_source.add_argument("--prompts", type=Path, default=None, help="Path to .txt, .jsonl, or .csv prompts.")
+    # Run control.
+    cli.add_argument("--limit", type=int, default=None, help="Optional maximum number of prompts to run.")
+    cli.add_argument("--output-dir", type=Path, required=True)
+    cli.add_argument("--overwrite", action="store_true")
+    cli.add_argument("--max-iterations", type=int, default=DEFAULT_MAX_ITERATIONS)
+    cli.add_argument("--samples-per-iteration", type=int, default=DEFAULT_SAMPLES_PER_ITERATION)
+    cli.add_argument("--seed-base", type=int, default=None)
+    cli.add_argument("--disable-early-stop", action="store_true")
+    cli.add_argument("--quiet", action="store_true")
+    cli.add_argument("--extract-best", action="store_true", help="Copy best images after the run finishes.")
+    # Image generation.
+    cli.add_argument("--generation-endpoint", required=True)
+    cli.add_argument("--generation-model", default=DEFAULT_GENERATION_MODEL)
+    cli.add_argument("--size", default=DEFAULT_IMAGE_SIZE, help="vLLM-Omni image size in WIDTHxHEIGHT format.")
+    cli.add_argument("--generation-auth-key", default="")
+    cli.add_argument("--generation-auth-key-env", default=DEFAULT_GENERATION_AUTH_KEY_ENV)
+    cli.add_argument("--resolution", default=DEFAULT_RESOLUTION)
+    cli.add_argument("--aspect-ratio", default=DEFAULT_ASPECT_RATIO)
+    cli.add_argument("--num-steps", type=int, default=DEFAULT_NUM_STEPS)
+    cli.add_argument("--guidance", type=float, default=DEFAULT_GUIDANCE)
+    cli.add_argument("--flow-shift", type=float, default=DEFAULT_FLOW_SHIFT)
+    cli.add_argument("--generation-extra-args", type=json.loads, default=DEFAULT_GENERATION_EXTRA_ARGS)
+    # Prompt upsampler and rewriter.
+    cli.add_argument("--upsampler-endpoint-url", default=DEFAULT_UPSAMPLER_ENDPOINT_URL)
+    cli.add_argument("--upsampler-model", default=DEFAULT_UPSAMPLER_MODEL)
+    cli.add_argument("--rewriter-endpoint-url", default=DEFAULT_REWRITER_ENDPOINT_URL)
+    cli.add_argument("--rewriter-model", default=DEFAULT_REWRITER_MODEL)
+    cli.add_argument("--openai-api-key-env", default=DEFAULT_OPENAI_API_KEY_ENV)
+    cli.add_argument("--openai-api-key-file", type=Path, default=None)
+    cli.add_argument("--llm-extra-body", type=json.loads, default=DEFAULT_LLM_EXTRA_BODY)
+    cli.add_argument("--initial-negative-prompt", default="")
+    # Critic.
+    cli.add_argument("--critic-endpoint-url", default=DEFAULT_CRITIC_ENDPOINT_URL)
+    cli.add_argument("--critic-model", default=DEFAULT_CRITIC_MODEL)
+    cli.add_argument("--gemini-api-key-env", default=DEFAULT_GEMINI_API_KEY_ENV)
+    cli.add_argument("--gemini-api-key-file", type=Path, default=None)
+    parsed_args = cli.parse_args()
+    return parsed_args
+def main() -> int:
+    """Run the agentic upsampling CLI end to end.
+
+    Parses arguments, loads the prompts, writes ``run_config.json``, builds the
+    rewriter/generator/judge clients, runs every prompt through the runner, and
+    writes the run manifest and ``summary.json``.
+
+    Returns:
+        ``0`` when every prompt completed, ``1`` when at least one failed.
+
+    Raises:
+        RuntimeError: If no prompts were selected.
+        ValueError: If ``--samples-per-iteration`` is below 1, or if
+            ``--generation-extra-args`` or ``--llm-extra-body`` is not a JSON object.
+    """
+    args = parse_args()
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+    items = load_prompt_items(prompt=args.prompt, prompts_path=args.prompts, limit=args.limit)
+    if not items:
+        raise RuntimeError("No prompts selected.")
+    if args.samples_per_iteration < 1:
+        raise ValueError("--samples-per-iteration must be >= 1.")
+    if not isinstance(args.generation_extra_args, dict):
+        raise ValueError("--generation-extra-args must decode to a JSON object.")
+    # json.loads accepts any JSON value; reject lists/strings/numbers here rather than
+    # letting them fail later inside the chat client. None (no extra body) stays allowed.
+    if args.llm_extra_body is not None and not isinstance(args.llm_extra_body, dict):
+        raise ValueError("--llm-extra-body must decode to a JSON object.")
+    openai_token = read_api_token(args.openai_api_key_env, args.openai_api_key_file)
+    gemini_token = read_api_token(args.gemini_api_key_env, args.gemini_api_key_file)
+    generation_auth_key = read_optional_generation_auth_key(args.generation_auth_key, args.generation_auth_key_env)
+    # Record the effective settings next to the outputs; tokens and auth keys are not written.
+    write_json_atomic(
+        args.output_dir / "run_config.json",
+        {
+            "selected_prompts": len(items),
+            "max_iterations": args.max_iterations,
+            "samples_per_iteration": args.samples_per_iteration,
+            "early_stop": not args.disable_early_stop,
+            "generation_endpoint": args.generation_endpoint,
+            "generation_model": args.generation_model,
+            "size": args.size,
+            "resolution": args.resolution,
+            "aspect_ratio": args.aspect_ratio,
+            "num_steps": args.num_steps,
+            "guidance": args.guidance,
+            "flow_shift": args.flow_shift,
+            "generation_extra_args": args.generation_extra_args,
+            "upsampler_endpoint_url": args.upsampler_endpoint_url,
+            "upsampler_model": args.upsampler_model,
+            "rewriter_endpoint_url": args.rewriter_endpoint_url,
+            "rewriter_model": args.rewriter_model,
+            "llm_extra_body": args.llm_extra_body,
+            "critic_endpoint_url": args.critic_endpoint_url,
+            "critic_model": args.critic_model,
+            "initial_negative_prompt": args.initial_negative_prompt,
+        },
+    )
+    rewriter = PromptRewriterClient(
+        api_token=openai_token,
+        upsampler_endpoint_url=args.upsampler_endpoint_url,
+        upsampler_model=args.upsampler_model,
+        rewriter_endpoint_url=args.rewriter_endpoint_url,
+        rewriter_model=args.rewriter_model,
+        extra_body=args.llm_extra_body,
+        resolution=args.resolution,
+        aspect_ratio=args.aspect_ratio,
+    )
+    generator = ImageGenerationClient(
+        endpoint=args.generation_endpoint,
+        auth_key=generation_auth_key,
+        model=args.generation_model,
+        size=args.size,
+        num_steps=args.num_steps,
+        guidance=args.guidance,
+        flow_shift=args.flow_shift,
+        extra_args=args.generation_extra_args,
+    )
+    judge = VLMQualityJudge(
+        api_token=gemini_token,
+        endpoint_url=args.critic_endpoint_url,
+        model=args.critic_model,
+    )
+    runner = AgenticUpsamplerRunner(
+        rewriter=rewriter,
+        generator=generator,
+        judge=judge,
+        config=RunnerConfig(
+            output_dir=args.output_dir,
+            max_iterations=args.max_iterations,
+            samples_per_iteration=args.samples_per_iteration,
+            overwrite=args.overwrite,
+            seed_base=args.seed_base,
+            initial_negative_prompt=args.initial_negative_prompt,
+            early_stop=not args.disable_early_stop,
+            verbose=not args.quiet,
+        ),
+    )
+    # run_item_safely turns a per-prompt exception into an error record, so one bad
+    # prompt does not abort the rest of the batch.
+    results = [runner.run_item_safely(item) for item in items]
+    write_run_manifest(args.output_dir, results)
+    failures = sum(1 for item in results if item.get("error"))
+    summary = {"selected_prompts": len(items), "completed": len(items) - failures, "failures": failures}
+    write_json_atomic(args.output_dir / "summary.json", summary)
+    print(json.dumps(summary, indent=2), flush=True)
+    # Best-image export is skipped when any prompt failed.
+    if args.extract_best and not failures:
+        export_dir = args.output_dir / "best_generations"
+        extract_best_images(args.output_dir, export_dir, overwrite=args.overwrite)
+        print(f"Exported best images to {export_dir}", flush=True)
+    return 1 if failures else 0
+if __name__ == "__main__":
+    # Use main()'s return value (0 or 1) as the process exit status.
+    raise SystemExit(main())

agentic_upsampling/runner.py ADDED Viewed

	@@ -0,0 +1,474 @@

+"""Agentic text-to-image prompt upsampling orchestration."""
+from __future__ import annotations
+import json
+import traceback
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Protocol
+from agentic_upsampling.clients import GenerationOutput
+from agentic_upsampling.constants import DEFAULT_JPEG_QUALITY, DEFAULT_MAX_ITERATIONS, DEFAULT_SAMPLES_PER_ITERATION
+from agentic_upsampling.data import PromptItem, prompt_dir_name
+from agentic_upsampling.io_utils import read_json, write_json_atomic
+from agentic_upsampling.rubric import candidate_sort_key
+class RewriterLike(Protocol):
+    """Structural interface the runner expects from its prompt rewriter."""
+    def initial_prompt(self, item: PromptItem) -> dict[str, Any]:
+        """Create the initial prompt dict for ``item``."""
+    def rewrite_prompt_pair(
+        self,
+        item: PromptItem,
+        previous_prompt: dict[str, Any],
+        previous_negative_prompt: str,
+        previous_analysis: dict[str, Any],
+        history: list[dict[str, Any]],
+    ) -> tuple[dict[str, Any], str]:
+        """Jointly rewrite the positive prompt and the negative prompt.
+
+        Returns a ``(prompt dict, negative prompt string)`` pair.
+        """
+class GeneratorLike(Protocol):
+    """Structural interface the runner expects from its image generator."""
+    def generate(
+        self,
+        *,
+        prompt_json: dict[str, Any],
+        prompt_id: str,
+        output_dir: Path,
+        seed: int | None = None,
+        negative_prompt: str = "",
+        jpeg_quality: int = DEFAULT_JPEG_QUALITY,
+    ) -> GenerationOutput:
+        """Generate one image and return its ``GenerationOutput``; all arguments are keyword-only."""
+class JudgeLike(Protocol):
+    """Structural interface the runner expects from its image critic."""
+    def score_image(
+        self,
+        *,
+        item: PromptItem,
+        image_path: Path,
+    ) -> dict[str, Any]:
+        """Score the image at ``image_path`` for ``item`` and return the analysis dict."""
+@dataclass(frozen=True, slots=True)
+class RunnerConfig:
+    """Immutable settings for the agentic loop; both counts must be at least 1."""
+    output_dir: Path
+    max_iterations: int = DEFAULT_MAX_ITERATIONS
+    samples_per_iteration: int = DEFAULT_SAMPLES_PER_ITERATION
+    overwrite: bool = False
+    seed_base: int | None = None
+    jpeg_quality: int = DEFAULT_JPEG_QUALITY
+    initial_negative_prompt: str = ""
+    early_stop: bool = True
+    verbose: bool = True
+    def __post_init__(self) -> None:
+        """Reject non-positive counts, checking ``max_iterations`` first."""
+        for count_field in ("max_iterations", "samples_per_iteration"):
+            if getattr(self, count_field) < 1:
+                raise ValueError(f"{count_field} must be >= 1.")
+@dataclass(frozen=True, slots=True)
+class IterationPrompt:
+    """Positive and negative prompts prepared for one iteration."""
+    # Prompt dict passed to sample generation and judging as ``prompt_json``.
+    prompt_json: dict[str, Any]
+    # Negative prompt text passed alongside it.
+    negative_prompt: str
+class AgenticUpsamplerRunner:
+    """Run the iterative prompt rewrite, generate, and judge loop."""
+    rewriter: RewriterLike
+    generator: GeneratorLike
+    judge: JudgeLike
+    config: RunnerConfig
+    def __init__(
+        self,
+        *,
+        rewriter: RewriterLike,
+        generator: GeneratorLike,
+        judge: JudgeLike,
+        config: RunnerConfig,
+    ) -> None:
+        """Store the three collaborators and the runtime settings; all arguments are keyword-only."""
+        self.rewriter = rewriter
+        self.generator = generator
+        self.judge = judge
+        self.config = config
+    def run_item(self, item: PromptItem) -> dict[str, Any]:
+        """Run all iterations for one prompt item and persist the best candidate.
+
+        Iterations run in order until ``max_iterations`` is reached, an iteration
+        clears the threshold (when early stop is enabled), or an iteration fails after
+        at least one candidate exists. A failure before any candidate exists is
+        re-raised to the caller.
+        """
+        item_dir = self.config.output_dir / prompt_dir_name(item)
+        item_dir.mkdir(parents=True, exist_ok=True)
+        # Remove status markers left by an earlier run of this item.
+        (item_dir / "failure.json").unlink(missing_ok=True)
+        (item_dir / "incomplete.json").unlink(missing_ok=True)
+        self._log(f"[prompt {item.prompt_id}] start")
+        candidates: list[dict[str, Any]] = []
+        # State carried from one iteration into the next rewrite.
+        previous_prompt: dict[str, Any] | None = None
+        previous_analysis: dict[str, Any] | None = None
+        previous_negative_prompt = self.config.initial_negative_prompt.strip()
+        incomplete_error: dict[str, Any] | None = None
+        for iteration in range(self.config.max_iterations):
+            iteration_dir = item_dir / f"iter_{iteration:02d}"
+            # With overwrite off, _load_iteration is consulted first; a None result
+            # means the iteration has to be run.
+            candidate = None if self.config.overwrite else self._load_iteration(iteration_dir, iteration)
+            if candidate is None:
+                try:
+                    candidate = self._run_iteration(
+                        item,
+                        iteration_dir,
+                        iteration,
+                        previous_prompt,
+                        previous_analysis,
+                        previous_negative_prompt,
+                        candidates,
+                    )
+                except Exception as exc:
+                    # Nothing usable yet: let the caller see the failure.
+                    if not candidates:
+                        raise
+                    # Otherwise keep the earlier candidates, record where the run
+                    # stopped, and finalize with what exists.
+                    incomplete_error = {
+                        "iteration": iteration,
+                        "error": repr(exc),
+                        "traceback": traceback.format_exc(),
+                    }
+                    write_json_atomic(item_dir / "incomplete.json", incomplete_error)
+                    self._log(f"[prompt {item.prompt_id}] incomplete at iter={iteration}: {exc!r}")
+                    break
+            candidates.append(candidate)
+            previous_prompt = candidate["prompt_json"]
+            previous_analysis = candidate["analysis"]
+            previous_negative_prompt = str(candidate.get("negative_prompt") or "")
+            if self.config.early_stop and bool(candidate["analysis"].get("threshold_cleared")):
+                self._log(f"[prompt {item.prompt_id}] early stop at iter={iteration}")
+                break
+        return self.finalize_item(item, candidates, incomplete_error=incomplete_error)
+    def run_item_safely(self, item: PromptItem) -> dict[str, Any]:
+        """Run one item; on any exception write ``failure.json`` and return an error record instead."""
+        try:
+            result = self.run_item(item)
+        except Exception as exc:
+            error_text = repr(exc)
+            self._log(f"[prompt {item.prompt_id}] failed: {error_text}")
+            failure_path = self.config.output_dir / prompt_dir_name(item) / "failure.json"
+            write_json_atomic(
+                failure_path,
+                {
+                    "prompt_id": item.prompt_id,
+                    "prompt": item.prompt,
+                    "error": error_text,
+                    "traceback": traceback.format_exc(),
+                },
+            )
+            return {"prompt_id": item.prompt_id, "error": error_text, "failure_path": str(failure_path)}
+        return result
+    def _run_iteration(
+        self,
+        item: PromptItem,
+        iteration_dir: Path,
+        iteration: int,
+        previous_prompt: dict[str, Any] | None,
+        previous_analysis: dict[str, Any] | None,
+        previous_negative_prompt: str,
+        candidates: list[dict[str, Any]],
+    ) -> dict[str, Any]:
+        """Execute one iteration: prepare the prompt pair, run the samples, finalize the result."""
+        iteration_prompt = self.prepare_iteration_prompt(
+            item,
+            iteration_dir,
+            iteration,
+            previous_prompt,
+            previous_analysis,
+            previous_negative_prompt,
+            candidates,
+        )
+        positive_prompt = iteration_prompt.prompt_json
+        negative_prompt = iteration_prompt.negative_prompt
+        judged_samples, failed_samples = self._run_iteration_samples(
+            item,
+            iteration_dir,
+            iteration,
+            positive_prompt,
+            negative_prompt,
+        )
+        return self.finalize_iteration(item, iteration_dir, iteration, judged_samples, failed_samples)
+    def _run_iteration_samples(
+        self,
+        item: PromptItem,
+        iteration_dir: Path,
+        iteration: int,
+        prompt_json: dict[str, Any],
+        negative_prompt: str,
+    ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+        """Generate every sample concurrently, then judge the successful ones in sample order.
+
+        Returns ``(judged candidates, per-sample error records)``. A failure in either
+        phase is recorded for that sample and does not stop the others.
+        """
+        sample_count = self.config.samples_per_iteration
+        outputs_by_index: dict[int, GenerationOutput] = {}
+        failures: list[dict[str, Any]] = []
+        # One worker per sample, so all generations of the iteration can run at once.
+        with ThreadPoolExecutor(max_workers=sample_count) as pool:
+            pending = {}
+            for index in range(sample_count):
+                submitted = pool.submit(
+                    self.run_generation_sample,
+                    item,
+                    iteration_dir,
+                    index,
+                    prompt_json,
+                    negative_prompt,
+                )
+                pending[submitted] = index
+            for finished in as_completed(pending):
+                index = pending[finished]
+                try:
+                    outputs_by_index[index] = finished.result()
+                except Exception as exc:
+                    failures.append(self._record_sample_error(item, iteration_dir, iteration, index, exc))
+        judged: list[dict[str, Any]] = []
+        # Judging is sequential and ordered by sample index, whatever the completion order was.
+        for index in range(sample_count):
+            generation = outputs_by_index.get(index)
+            if generation is None:
+                continue
+            try:
+                verdict = self.judge_iteration_sample(
+                    item,
+                    iteration_dir,
+                    iteration,
+                    index,
+                    prompt_json,
+                    negative_prompt,
+                    generation,
+                )
+                judged.append(verdict)
+            except Exception as exc:
+                failures.append(self._record_sample_error(item, iteration_dir, iteration, index, exc))
+        return judged, failures
+    def _record_sample_error(
+        self,
+        item: PromptItem,
+        iteration_dir: Path,
+        iteration: int,
+        sample_index: int,
+        exc: Exception,
+    ) -> dict[str, Any]:
+        """Persist one per-sample failure record."""
+        error = {"sample_index": sample_index, "error": repr(exc), "traceback": traceback.format_exc()}
+        write_json_atomic(self._sample_dir(iteration_dir, sample_index) / "failure.json", error)
+        self._log(f"[prompt {item.prompt_id}] iter={iteration} sample={sample_index} failed: {exc!r}")
+        return error
+    def prepare_iteration_prompt(
+        self,
+        item: PromptItem,
+        iteration_dir: Path,
+        iteration: int,
+        previous_prompt: dict[str, Any] | None,
+        previous_analysis: dict[str, Any] | None,
+        previous_negative_prompt: str,
+        candidates: list[dict[str, Any]],
+    ) -> IterationPrompt:
+        """Prepare and persist the positive/negative prompt pair for one iteration."""
+        iteration_dir.mkdir(parents=True, exist_ok=True)
+        self._log(f"[prompt {item.prompt_id}] iter={iteration} start")
+        if iteration == 0 or previous_prompt is None or previous_analysis is None:
+            prompt_json = self.rewriter.initial_prompt(item)
+            negative_prompt = self.config.initial_negative_prompt.strip()
+        else:
+            prompt_json, negative_prompt = self.rewriter.rewrite_prompt_pair(
+                item,
+                previous_prompt,
+                previous_negative_prompt,
+                previous_analysis,
+                candidates,
+            )
+            negative_prompt = negative_prompt.strip()
+        write_json_atomic(iteration_dir / "prompt.json", prompt_json)
+        write_json_atomic(iteration_dir / "negative_prompt.json", {"negative_prompt": negative_prompt})
+        return IterationPrompt(prompt_json=prompt_json, negative_prompt=negative_prompt)
+    def _run_iteration_sample(
+        self,
+        item: PromptItem,
+        iteration_dir: Path,
+        iteration: int,
+        sample_index: int,
+        prompt_json: dict[str, Any],
+        negative_prompt: str,
+    ) -> dict[str, Any]:
+        generation = self.run_generation_sample(item, iteration_dir, sample_index, prompt_json, negative_prompt)
+        return self.judge_iteration_sample(
+            item,
+            iteration_dir,
+            iteration,
+            sample_index,
+            prompt_json,
+            negative_prompt,
+            generation,
+        )
+    def run_generation_sample(
+        self,
+        item: PromptItem,
+        iteration_dir: Path,
+        sample_index: int,
+        prompt_json: dict[str, Any],
+        negative_prompt: str,
+    ) -> GenerationOutput:
+        """Generate one sample image for an iteration."""
+        sample_dir = self._sample_dir(iteration_dir, sample_index)
+        sample_dir.mkdir(parents=True, exist_ok=True)
+        self._log(f"[prompt {item.prompt_id}] sample={sample_index} generate")
+        return self.generator.generate(
+            prompt_json=prompt_json,
+            prompt_id=item.prompt_id,
+            output_dir=sample_dir,
+            seed=self._sample_seed(sample_index),
+            negative_prompt=negative_prompt,
+            jpeg_quality=self.config.jpeg_quality,
+        )
+    def judge_iteration_sample(
+        self,
+        item: PromptItem,
+        iteration_dir: Path,
+        iteration: int,
+        sample_index: int,
+        prompt_json: dict[str, Any],
+        negative_prompt: str,
+        generation: GenerationOutput,
+    ) -> dict[str, Any]:
+        """Judge one generated sample and persist its candidate metadata."""
+        sample_dir = self._sample_dir(iteration_dir, sample_index)
+        analysis = self.judge.score_image(item=item, image_path=generation.image_path)
+        self._log(f"[prompt {item.prompt_id}] iter={iteration} sample={sample_index} score={analysis.get('overall_score')}")
+        analysis_path = sample_dir / "analysis.json"
+        write_json_atomic(analysis_path, analysis)
+        candidate = {
+            "prompt_id": item.prompt_id,
+            "iteration": iteration,
+            "sample_index": sample_index,
+            "prompt_path": str(iteration_dir / "prompt.json"),
+            "image_path": str(generation.image_path),
+            "analysis_path": str(analysis_path),
+            "generation_meta_path": str(generation.meta_path),
+            "negative_prompt_path": str(iteration_dir / "negative_prompt.json"),
+            "negative_prompt": negative_prompt,
+            "prompt_json": prompt_json,
+            "analysis": analysis,
+        }
+        write_json_atomic(sample_dir / "meta.json", candidate)
+        return candidate
+    def finalize_iteration(
+        self,
+        item: PromptItem,
+        iteration_dir: Path,
+        iteration: int,
+        sample_candidates: list[dict[str, Any]],
+        sample_errors: list[dict[str, Any]],
+    ) -> dict[str, Any]:
+        """Select and persist the best sample candidate for one iteration."""
+        if not sample_candidates:
+            raise RuntimeError(f"All {self.config.samples_per_iteration} samples failed for iteration {iteration}.")
+        write_json_atomic(iteration_dir / "samples.json", sample_candidates)
+        candidate = dict(max(sample_candidates, key=candidate_sort_key))
+        candidate["samples"] = sample_candidates
+        candidate["sample_count"] = len(sample_candidates)
+        candidate["selected_sample_index"] = candidate["sample_index"]
+        if sample_errors:
+            candidate["sample_errors"] = sample_errors
+            write_json_atomic(iteration_dir / "sample_failures.json", sample_errors)
+        write_json_atomic(iteration_dir / "meta.json", candidate)
+        self._log(
+            f"[prompt {item.prompt_id}] iter={iteration} best_sample={candidate['selected_sample_index']} "
+            f"score={candidate['analysis'].get('overall_score')} samples={len(sample_candidates)}"
+        )
+        return candidate
+    def finalize_item(
+        self,
+        item: PromptItem,
+        candidates: list[dict[str, Any]],
+        *,
+        incomplete_error: dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
+        """Persist and return the best candidate summary for a completed or incomplete item."""
+        if not candidates:
+            raise RuntimeError(f"No candidates produced for prompt {item.prompt_id}.")
+        item_dir = self.config.output_dir / prompt_dir_name(item)
+        best = max(candidates, key=candidate_sort_key)
+        summary = {
+            "prompt_id": item.prompt_id,
+            "prompt": item.prompt,
+            "best_iteration": best["iteration"],
+            "best_score": best["analysis"].get("overall_score"),
+            "threshold_cleared_any": any(bool(candidate["analysis"].get("threshold_cleared")) for candidate in candidates),
+            "best": best,
+            "iterations": candidates,
+        }
+        if incomplete_error is not None:
+            summary["incomplete_error"] = incomplete_error
+        write_json_atomic(item_dir / "best.json", summary)
+        self._log(f"[prompt {item.prompt_id}] done best_iter={summary['best_iteration']} best_score={summary['best_score']}")
+        return summary
+    def _log(self, message: str) -> None:
+        if self.config.verbose:
+            print(message, flush=True)
+    def _sample_seed(self, sample_index: int) -> int | None:
+        if self.config.seed_base is None:
+            return None
+        return self.config.seed_base + sample_index
+    def _sample_dir(self, iteration_dir: Path, sample_index: int) -> Path:
+        if self.config.samples_per_iteration == 1:
+            return iteration_dir
+        return iteration_dir / f"sample_{sample_index:02d}"
+    @staticmethod
+    def _load_iteration(iteration_dir: Path, iteration: int) -> dict[str, Any] | None:
+        meta_path = iteration_dir / "meta.json"
+        prompt_path = iteration_dir / "prompt.json"
+        if not (meta_path.exists() and prompt_path.exists()):
+            return None
+        meta = read_json(meta_path)
+        analysis_path = Path(str(meta.get("analysis_path") or iteration_dir / "analysis.json"))
+        image_path = Path(str(meta.get("image_path") or iteration_dir / "image.jpg"))
+        if not (analysis_path.exists() and image_path.exists()):
+            return None
+        meta["iteration"] = iteration
+        meta["prompt_json"] = read_json(prompt_path)
+        meta["analysis"] = read_json(analysis_path)
+        negative_prompt_path = iteration_dir / "negative_prompt.json"
+        if "negative_prompt" not in meta and negative_prompt_path.exists():
+            negative_prompt_data = read_json(negative_prompt_path)
+            meta["negative_prompt"] = str(negative_prompt_data.get("negative_prompt") or "")
+            meta["negative_prompt_path"] = str(negative_prompt_path)
+        meta.setdefault("negative_prompt", "")
+        samples_path = iteration_dir / "samples.json"
+        if samples_path.exists():
+            samples = json.loads(samples_path.read_text(encoding="utf-8"))
+            if isinstance(samples, list):
+                meta["samples"] = samples
+                meta["sample_count"] = len(samples)
+        return meta
+def write_run_manifest(output_dir: Path, results: list[dict[str, Any]]) -> None:
+    """Write compact run-level manifest files."""
+    manifest_path = output_dir / "manifest.jsonl"
+    failures_path = output_dir / "failures.jsonl"
+    manifest_path.unlink(missing_ok=True)
+    failures_path.unlink(missing_ok=True)
+    for result in results:
+        target = failures_path if result.get("error") else manifest_path
+        with target.open("a", encoding="utf-8") as f:
+            f.write(json.dumps(result, ensure_ascii=True, separators=(",", ":")) + "\n")

assets/benchmark-text2image-leaderboard-all-models.jpg ADDED Viewed

Git LFS Details

SHA256: 380c72e4df9a1b95d7929d7af082ad0af8bd885f160a81ae3b66661040923c9e
Pointer size: 131 Bytes
Size of remote file: 432 kB

assets/benchmark-text2image-leaderboard.png ADDED Viewed

Git LFS Details

SHA256: 10458182a0c5dfe07ae295f20fabf60a4e4d2a59633a27ff15c50ac1f9baae84
Pointer size: 132 Bytes
Size of remote file: 4.12 MB

assets/benchmark-text2image.png ADDED Viewed

Git LFS Details

SHA256: 55bdd6bc617832086be44c3d63f03cb28426dc352a97e8d3c10ae7967b94c4e9
Pointer size: 131 Bytes
Size of remote file: 147 kB

assets/example_caption.json ADDED Viewed

	@@ -0,0 +1,88 @@

+{
+  "subjects": [
+    {
+      "description": "Two damp human hands working a spinning cylinder of wet gray clay on a pottery wheel, fingers gently pinching and pulling the walls upward to form a narrow neck and rounded belly",
+      "appearance_details": "Hands coated with a thin sheen of slip, glistening with water; fingertips leave subtle ridges and fingerprints in the clay; knuckles slightly creased, nails short and rimmed with gray clay; small splatters of clay on the back of the wrists",
+      "relationship": "The primary actor shaping the central spinning clay form on the wheel",
+      "location": "Center foreground, framing the clay column",
+      "relative_size": "Hands and clay occupy roughly the central 60 percent of the frame",
+      "orientation": "Hands angled inward from left and right, palms cupping the clay symmetrically",
+      "pose": "Both hands curled around the clay, thumbs and index fingers pinched at the upper neck while remaining fingers support the rounded belly below",
+      "clothing": "",
+      "expression": "",
+      "gender": "Unknown",
+      "age": "Adult",
+      "skin_tone_and_texture": "Light to medium skin tone, slick with watery slip giving a glossy sheen, fine pores and small wrinkles visible at knuckles",
+      "facial_features": "",
+      "number_of_subjects": 1,
+      "number_of_arms": 2,
+      "number_of_legs": 0,
+      "number_of_hands": 2,
+      "number_of_fingers": 10
+    },
+    {
+      "description": "A spinning cylinder of wet gray stoneware clay being formed into a vase shape with a narrow neck and rounded belly",
+      "appearance_details": "Concentric horizontal rings spiral around the surface from the rotation; glistening film of water; subtle fingerprint impressions; small drips of slip running down the lower belly onto the wheel head",
+      "relationship": "The object being shaped by the hands; central focal point of the scene",
+      "location": "Absolute center of the frame, rising vertically from the wheel head",
+      "relative_size": "Approximately one third of the frame height, dominant central form",
+      "orientation": "Vertical axis, rotating",
+      "pose": "Upright cylindrical-to-vase form mid-throw, neck tapering, belly bulging",
+      "clothing": "",
+      "expression": "",
+      "gender": "N/A",
+      "age": "N/A",
+      "skin_tone_and_texture": "",
+      "facial_features": "",
+      "number_of_subjects": 1,
+      "number_of_arms": 0,
+      "number_of_legs": 0,
+      "number_of_hands": 0,
+      "number_of_fingers": 0
+    }
+  ],
+  "subject_details": {
+    "clay_state": "Wet gray stoneware, plastic and pliable, glistening with slip",
+    "wheel_motion": "Visible motion blur in concentric ring patterns indicating rotation",
+    "forming_stage": "Mid-throw, transitioning from cylinder to vase form with narrow neck and rounded belly",
+    "slip_splatter": "Splattered clay droplets dot the black apron and the wheel tray",
+    "tools_in_background": "Wooden ribs, wire cutter, sponge, and metal trimming tools softly blurred behind"
+  },
+  "background_setting": "A dim pottery studio with a matte black wheel tray and splash pan surrounding the spinning wheel head. Behind the potter, softly blurred, sit a row of throwing tools: wooden ribs, a metal kidney, a wire cutter coiled on a small shelf, and a sponge in a shallow water bowl. The black apron worn by the potter forms part of the lower background, dotted with dried clay flecks. The studio walls are deep charcoal, allowing the warm directional light to sculpt the central action.",
+  "lighting": {
+    "conditions": "Warm, controlled studio lighting with a single key source",
+    "direction": "From the right side of the frame, slightly elevated",
+    "shadows": "Soft, defined shadows falling to the left of the clay form and beneath the hands; subtle contact shadow on the wheel head",
+    "illumination_effect": "Highlights the wet sheen on the clay, accentuates concentric ring textures and fingerprint detail, and creates a moody chiaroscuro that isolates the hands and clay from the darker background"
+  },
+  "aesthetics": {
+    "composition": "Centered symmetrical composition with the clay form on the vertical axis, hands framing it from both sides; rule-of-thirds intersections align with the neck and belly of the vase",
+    "color_scheme": "Muted earth tones dominated by cool grays of the clay, warm amber highlights from the studio light, deep blacks of the apron and wheel tray, and natural skin tones",
+    "mood_atmosphere": "Meditative, tactile, focused craftsmanship",
+    "patterns": "Concentric horizontal rings spiraling around the clay surface from the wheel's rotation"
+  },
+  "cinematography": {
+    "framing": "Medium close-up",
+    "camera_angle": "Slightly above rim height, looking gently down onto the hands and clay",
+    "depth_of_field": "Shallow",
+    "focus": "Crisp focus on the hands and the wet clay surface",
+    "lens_focal_length": "35mm"
+  },
+  "style_medium": "Photography",
+  "artistic_style": "Photorealistic studio product/process photography",
+  "context": "Editorial or artisanal documentation of a hand-thrown ceramic vase being formed on a pottery wheel",
+  "text_and_signage_elements": [],
+  "quadrant_scan": {
+    "top_left": "Softly blurred dark background with faint warm rim light catching the edge of a wooden rib tool on a shelf",
+    "top_right": "Warm directional studio light source area; brightest highlights spill across the upper right, illuminating the narrow neck of the vase",
+    "bottom_left": "Edge of the black apron speckled with dried clay flecks and the curved rim of the splash pan in shadow",
+    "bottom_right": "Wheel tray dotted with splattered clay droplets and trickles of slip; partial view of the rotating wheel head",
+    "absolute_center": "The wet gray clay vase form mid-throw, cradled by two slip-coated hands pinching the neck and supporting the rounded belly, concentric rings glistening under warm light"
+  },
+  "comprehensive_t2i_caption": "A photorealistic studio photograph captured at slightly above rim height with a 35mm lens shows a pottery wheel in motion at the center of the frame. A cylinder of wet gray stoneware clay spins, its surface scored with fine concentric rings from rotation and glistening with a thin film of water and slip. Two damp human hands, coated in pale gray slip, gently pinch and pull the walls upward, the thumbs and index fingers narrowing the neck while the lower fingers cradle and shape a rounded belly, transforming the cylinder into a vase. Warm directional studio light from the right rakes across the scene, accentuating the sheen of water, the subtle ridges of fingerprints, and the soft modeling of the clay's curves, while casting gentle shadows to the left. Splattered droplets of clay dot a black apron worn by the potter and the matte black wheel tray and splash pan beneath. In the softly blurred background, a row of pottery tools — wooden ribs, a metal kidney, a wire cutter, and a damp sponge in a water bowl — sit on a charcoal-toned shelf. The depth of field is shallow, holding crisp focus on the hands and clay while the surroundings dissolve into a moody, warm-toned haze. The atmosphere is meditative and tactile, celebrating the intimate craftsmanship of hand-thrown ceramics.",
+  "resolution": {
+    "H": 1024,
+    "W": 1024
+  },
+  "aspect_ratio": "1,1"
+}

assets/example_image.png ADDED Viewed

Git LFS Details

SHA256: 478903c6adf090f6dbf8c584e073061caf0451053360cae02b9fea3b84c132eb
Pointer size: 132 Bytes
Size of remote file: 1.25 MB

assets/more_images.jpg ADDED Viewed

Git LFS Details

SHA256: 48efdeacbf9824941fff4348c5780a87fa760ebd3fa1b22d87fa9b51fba72120
Pointer size: 132 Bytes
Size of remote file: 4.25 MB

assets/original_prompt.txt ADDED Viewed

	@@ -0,0 +1 @@

+ Photorealistic studio photograph of a pottery wheel in motion, a cylinder of wet gray clay spinning with concentric rings. Two hands, damp and coated with slip, gently pinch and pull the walls upward to form a narrow neck and rounded belly like a vase. Directional warm studio light from the right highlights the sheen of water on clay and the texture of fingerprints. Splattered clay dots the black apron and wheel tray. Camera slightly above rim height, 35mm, crisp focus on hands and clay, background tools softly blurred.

chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+    "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0].role == 'system' %}\n        {%- if messages[0].content is string %}\n            {{- messages[0].content }}\n        {%- else %}\n            {%- for content in messages[0].content %}\n                {%- if 'text' in content %}\n                    {{- content.text }}\n                {%- endif %}\n            {%- endfor %}\n        {%- endif %}\n        {{- '\\n\\n' }}\n    {%- endif %}\n    {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0].role == 'system' %}\n        {{- '<|im_start|>system\\n' }}\n        {%- if messages[0].content is string %}\n            {{- messages[0].content }}\n        {%- else %}\n            {%- for content in messages[0].content %}\n                {%- if 'text' in content %}\n                    {{- content.text }}\n                {%- endif %}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- for message in messages %}\n    {%- if message.role == \"user\" %}\n        {{- '<|im_start|>' + message.role + '\\n' }}\n        {%- if message.content is string %}\n            {{- message.content }}\n        {%- else %}\n            {%- for content in message.content %}\n                {%- if content.type == 'image' or 'image' in content or 'image_url' in 
content %}\n                    {%- set image_count.value = image_count.value + 1 %}\n                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n                    <|vision_start|><|image_pad|><|vision_end|>\n                {%- elif content.type == 'video' or 'video' in content %}\n                    {%- set video_count.value = video_count.value + 1 %}\n                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n                    <|vision_start|><|video_pad|><|vision_end|>\n                {%- elif 'text' in content %}\n                    {{- content.text }}\n                {%- endif %}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role + '\\n' }}\n        {%- if message.content is string %}\n            {{- message.content }}\n        {%- else %}\n            {%- for content_item in message.content %}\n                {%- if 'text' in content_item %}\n                    {{- content_item.text }}\n                {%- endif %}\n            {%- endfor %}\n        {%- endif %}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if (loop.first and message.content) or (not loop.first) %}\n                    {{- '\\n' }}\n                {%- endif %}\n                {%- if tool_call.function %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- '<tool_call>\\n{\"name\": \"' }}\n                {{- tool_call.name }}\n                {{- '\", \"arguments\": ' }}\n                {%- if tool_call.arguments is string %}\n                    {{- tool_call.arguments }}\n                {%- else %}\n                    {{- tool_call.arguments | tojson }}\n                {%- endif %}\n                {{- '}\\n</tool_call>' }}\n            {%- endfor %}\n        {%- 
endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {%- if message.content is string %}\n            {{- message.content }}\n        {%- else %}\n            {%- for content in message.content %}\n                {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n                    {%- set image_count.value = image_count.value + 1 %}\n                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n                    <|vision_start|><|image_pad|><|vision_end|>\n                {%- elif content.type == 'video' or 'video' in content %}\n                    {%- set video_count.value = video_count.value + 1 %}\n                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n                    <|vision_start|><|video_pad|><|vision_end|>\n                {%- elif 'text' in content %}\n                    {{- content.text }}\n                {%- endif %}\n            {%- endfor %}\n        {%- endif %}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n"
+}

checkpoint.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {}

config.json ADDED Viewed

	@@ -0,0 +1,258 @@

+{
+  "allow_patterns_overrides": [
+    "*/*.safetensors"
+  ],
+  "architectures": [
+    "Cosmos3ForConditionalGeneration"
+  ],
+  "image_token_id": 151655,
+  "model": {
+    "_recursive_": false,
+    "_target": "omni_mot_model",
+    "config": {
+      "_type": "omni_mot_model_config",
+      "action_gen": false,
+      "activation_checkpointing": {
+        "_type": "activation_checkpointing_config",
+        "determinism_check": "default",
+        "mode": "full",
+        "preserve_rng_state": true,
+        "save_ops_regex": [
+          "fmha"
+        ]
+      },
+      "causal_training_strategy": "none",
+      "compile": {
+        "_type": "compile_config",
+        "compile_dynamic": true,
+        "compiled_region": "language",
+        "coordinate_descent_tuning": false,
+        "enabled": true,
+        "max_autotune_pointwise": false,
+        "use_cuda_graphs": false
+      },
+      "diffusion_expert_config": {
+        "_type": "diffusion_expert_config",
+        "base_fps": 24,
+        "enable_fps_modulation": true,
+        "load_weights_from_pretrained": false,
+        "max_vae_latent_side_after_patchify": 20,
+        "patch_spatial": 2,
+        "position_embedding_type": "unified_3d_mrope",
+        "rope_h_extrapolation_ratio": 1.0,
+        "rope_t_extrapolation_ratio": 1.0,
+        "rope_w_extrapolation_ratio": 1.0,
+        "timestep_range": 1.0,
+        "unified_3d_mrope_reset_spatial_ids": true,
+        "unified_3d_mrope_temporal_modality_margin": 15000
+      },
+      "ema": {
+        "_type": "ema_config",
+        "enabled": false,
+        "iteration_shift": 0,
+        "rate": 0.1
+      },
+      "fixed_step_sampler_config": null,
+      "input_caption_key": "ai_caption",
+      "input_image_key": "images",
+      "input_video_key": "video",
+      "joint_attn_implementation": "two_way",
+      "latent_downsample_factor": 16,
+      "lbl": {
+        "_type": "lbl_config",
+        "coeff_gen": null,
+        "coeff_und": null,
+        "method": "local"
+      },
+      "log_enc_time_every_n": 100,
+      "lora_alpha": 32,
+      "lora_enabled": false,
+      "lora_rank": 16,
+      "lora_target_modules": "q_proj_moe_gen,k_proj_moe_gen,v_proj_moe_gen,o_proj_moe_gen",
+      "max_action_dim": 32,
+      "max_num_tokens_after_packing": 69632,
+      "natten_parameter_list": null,
+      "net": null,
+      "num_embodiment_domains": 32,
+      "parallelism": {
+        "_type": "parallelism_config",
+        "cfg_parallel_shard_degree": 1,
+        "context_parallel_shard_degree": 1,
+        "data_parallel_replicate_degree": 1,
+        "data_parallel_shard_degree": 16,
+        "enable_inference_mode": false,
+        "fsdp_master_dtype": "float32"
+      },
+      "precision": "bfloat16",
+      "rectified_flow_inference_config": {
+        "_type": "rectified_flow_inference_config",
+        "num_train_timesteps": 1000,
+        "scheduler_type": "unipc",
+        "shift": 3,
+        "use_dynamic_shifting": false
+      },
+      "rectified_flow_training_config": {
+        "_type": "rectified_flow_training_config",
+        "action_loss_weight": 10.0,
+        "high_sigma_ratio": 0.05,
+        "high_sigma_timesteps_max": 1000,
+        "high_sigma_timesteps_min": 995,
+        "image_loss_scale": null,
+        "independent_action_schedule": false,
+        "independent_sound_schedule": false,
+        "loss_scale": 10.0,
+        "normalize_loss_by_active": false,
+        "shift": {
+          "720": 5,
+          "768": 5
+        },
+        "shift_action": null,
+        "shift_sound": null,
+        "sound_loss_scale": 2.0,
+        "train_time_action_distribution": "logitnormal",
+        "train_time_image_distribution": "logitnormal",
+        "train_time_sound_distribution": "logitnormal",
+        "train_time_video_distribution": "waver",
+        "train_time_weight": "uniform",
+        "use_discrete_rf": false,
+        "use_dynamic_shift": false,
+        "use_high_sigma_strategy": false,
+        "use_high_sigma_strategy_action": false,
+        "use_high_sigma_strategy_sound": false
+      },
+      "resolution": "768",
+      "sound_dim": 64,
+      "sound_gen": true,
+      "sound_latent_fps": 25,
+      "sound_tokenizer": {
+        "_target": "avae_interface",
+        "audio_channels": 2,
+        "avae_config_path": "",
+        "avae_path": "pretrained/tokenizers/audio/avae/avae_48k_noncausal_25hz_64ch.ckpt",
+        "bucket_name": "bucket",
+        "hop_size": 1920,
+        "io_channels": 64,
+        "latent_mean": null,
+        "latent_std": null,
+        "normalization_type": "none",
+        "normalize_latents": false,
+        "object_store_credential_path_pretrained": "credentials/gcp_training.secret",
+        "sample_rate": 48000,
+        "tanh_clamp": 0.995,
+        "tanh_input_scale": 1.5,
+        "tanh_output_scale": 3.5
+      },
+      "state_ch": 48,
+      "state_t": 300,
+      "tokenizer": {
+        "_target": "wan2pt2_vae_interface",
+        "bucket_name": "bucket",
+        "causal": true,
+        "chunk_duration": 93,
+        "encode_bucket_multiple": null,
+        "encode_chunk_frames": {
+          "720": 12,
+          "768": 12
+        },
+        "encode_exact_durations": null,
+        "keep_decoder_cache": false,
+        "object_store_credential_path_pretrained": "credentials/gcp_training.secret",
+        "spatial_compression_factor": 16,
+        "temporal_compression_factor": 4,
+        "temporal_window": null,
+        "use_streaming_encode": false,
+        "vae_path": "pretrained/tokenizers/video/wan2pt2/Wan2.2_VAE.pth"
+      },
+      "video_temporal_causal": false,
+      "vision_gen": true,
+      "vlm_config": {
+        "_type": "vlm_config",
+        "layer_module": null,
+        "model_instance": {
+          "_target": "qwen3_vl_text_for_causal_lm",
+          "config": {
+            "_target": "create_vlm_config",
+            "base_config": {
+              "_target": "qwen3_vl_mot_config_from_json_file",
+              "json_file": "cosmos3://vfm/models/vlm/qwen3_vl/configs/Qwen3-VL-32B-Instruct.json"
+            },
+            "qk_norm_for_text": true
+          }
+        },
+        "model_name": "nvidia/Cosmos3-Super-Reasoner",
+        "pretrained_weights": {
+          "_type": "pretrained_weights_config",
+          "backbone_path": "s3://bucket/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Super-Reasoner-b6df0d1/",
+          "checkpoint_format": null,
+          "credentials_path": "credentials/gcp_checkpoint.secret",
+          "enable_gcs_patch_in_boto3": true,
+          "enabled": false
+        },
+        "qk_norm": false,
+        "tie_word_embeddings": false,
+        "tokenizer": {
+          "_target": "create_qwen2_tokenizer_with_download",
+          "config_variant": "gcp",
+          "pretrained_model_name": "Qwen/Qwen3-VL-32B-Instruct"
+        },
+        "use_system_prompt": false
+      }
+    }
+  },
+  "model_type": "cosmos3_omni",
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "dtype": "bfloat16",
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 5120,
+    "initializer_range": 0.02,
+    "intermediate_size": 25600,
+    "max_position_embeddings": 262144,
+    "model_type": "qwen3_vl_text",
+    "num_attention_heads": 64,
+    "num_hidden_layers": 64,
+    "num_key_value_heads": 8,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": {
+      "mrope_interleaved": true,
+      "mrope_section": [
+        24,
+        20,
+        20
+      ],
+      "rope_type": "default"
+    },
+    "rope_theta": 5000000,
+    "use_cache": true,
+    "vocab_size": 151936
+  },
+  "tie_word_embeddings": false,
+  "transformers_version": "4.57.0.dev0",
+  "video_token_id": 151656,
+  "vision_config": {
+    "deepstack_visual_indexes": [
+      8,
+      16,
+      24
+    ],
+    "depth": 27,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "in_channels": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 4304,
+    "model_type": "qwen3_vl",
+    "num_heads": 16,
+    "num_position_embeddings": 2304,
+    "out_hidden_size": 5120,
+    "patch_size": 16,
+    "spatial_merge_size": 2,
+    "temporal_patch_size": 2
+  },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+    "bos_token_id": 151643,
+    "pad_token_id": 151643,
+    "do_sample": true,
+    "eos_token_id": [
+        151645,
+        151643
+    ],
+    "top_p": 0.8,
+    "top_k": 20,
+    "temperature": 0.7,
+    "repetition_penalty": 1.0,
+    "transformers_version": "4.56.0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model_index.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_class_name": "Cosmos3OmniDiffusersPipeline",
+  "_diffusers_version": "0.37.1",
+  "scheduler": [
+    "diffusers",
+    "UniPCMultistepScheduler"
+  ],
+  "text_tokenizer": [
+    "transformers",
+    "Qwen2TokenizerFast"
+  ],
+  "transformer": [
+    "diffusers",
+    "Cosmos3OmniTransformer"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKLWan"
+  ],
+  "vision_encoder": [
+    "transformers",
+    "Qwen3VLVisionModel"
+  ],
+  "sound_tokenizer": [
+    "diffusers",
+    "Cosmos3AVAEAudioTokenizer"
+  ]
+}

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+    "size": {
+        "longest_edge": 16777216,
+        "shortest_edge": 65536
+    },
+    "patch_size": 16,
+    "temporal_patch_size": 2,
+    "merge_size": 2,
+    "image_mean": [
+        0.5,
+        0.5,
+        0.5
+    ],
+    "image_std": [
+        0.5,
+        0.5,
+        0.5
+    ],
+    "processor_class": "Qwen3VLProcessor",
+    "image_processor_type": "Qwen2VLImageProcessorFast"
+}

pytest.ini ADDED Viewed

	@@ -0,0 +1,4 @@

+[pytest]
+testpaths = tests
+pythonpath = .
+addopts = --confcutdir=.

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_class_name": "UniPCMultistepScheduler",
+  "_diffusers_version": "0.37.1",
+  "beta_end": 0.02,
+  "beta_schedule": "linear",
+  "beta_start": 0.0001,
+  "disable_corrector": [],
+  "dynamic_thresholding_ratio": 0.995,
+  "final_sigmas_type": "zero",
+  "flow_shift": 3.0,
+  "lower_order_final": true,
+  "num_train_timesteps": 1000,
+  "predict_x0": true,
+  "prediction_type": "flow_prediction",
+  "rescale_betas_zero_snr": false,
+  "sample_max_value": 1.0,
+  "shift_terminal": null,
+  "sigma_max": 200.0,
+  "sigma_min": 0.147,
+  "solver_order": 2,
+  "solver_p": null,
+  "solver_type": "bh2",
+  "steps_offset": 0,
+  "thresholding": false,
+  "time_shift_type": "exponential",
+  "timestep_spacing": "linspace",
+  "trained_betas": null,
+  "use_beta_sigmas": false,
+  "use_dynamic_shifting": false,
+  "use_exponential_sigmas": false,
+  "use_flow_sigmas": true,
+  "use_karras_sigmas": true
+}

sound_tokenizer/config.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+    "model_type": "autoencoder_v2",
+    "sampling_rate": 48000,
+    "stereo": true,
+    "use_wav_as_input": true,
+    "normalize_volume": true,
+    "hop_size": 1920,
+    "input_channels": 1,
+    "enc_type": "spec_convnext",
+    "enc_dim": 192,
+    "enc_intermediate_dim": 768,
+    "enc_num_layers": 12,
+    "enc_num_blocks": 2,
+    "enc_n_fft": 64,
+    "enc_hop_length": 16,
+    "enc_latent_dim": 128,
+    "enc_c_mults": [
+        1,
+        2,
+        4
+    ],
+    "enc_strides": [
+        4,
+        5,
+        6
+    ],
+    "enc_identity_init": false,
+    "enc_use_snake": true,
+    "dec_type": "oobleck",
+    "dec_dim": 320,
+    "dec_c_mults": [
+        1,
+        2,
+        4,
+        8,
+        16
+    ],
+    "dec_strides": [
+        2,
+        4,
+        5,
+        6,
+        8
+    ],
+    "dec_use_snake": true,
+    "dec_final_tanh": false,
+    "dec_out_channels": 2,
+    "dec_anti_aliasing": false,
+    "dec_use_nearest_upsample": false,
+    "dec_use_tanh_at_final": false,
+    "bottleneck_type": "vae",
+    "bottleneck": {
+        "type": "vae"
+    },
+    "activation": "snakebeta",
+    "snake_logscale": true,
+    "anti_aliasing": false,
+    "use_cuda_kernel": false,
+    "causal": false,
+    "padding_mode": "zeros",
+    "vocoder_input_dim": 64,
+    "latent_mean": null,
+    "latent_std": null
+}

sound_tokenizer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d4c61cde38acfb0cad9048a140c3533750277a8462b19dc08450d9fe1ad9879
+size 1892409600

tests/test_agentic_upsampling.py ADDED Viewed

	@@ -0,0 +1,496 @@

+from __future__ import annotations
+import base64
+import io
+import json
+import threading
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+from PIL import Image
+from agentic_upsampling.clients import ImageGenerationClient, PromptRewriterClient
+from agentic_upsampling.constants import (
+    DEFAULT_CRITIC_ENDPOINT_URL,
+    DEFAULT_CRITIC_MODEL,
+    DEFAULT_FLOW_SHIFT,
+    DEFAULT_GENERATION_EXTRA_ARGS,
+    DEFAULT_GENERATION_MODEL,
+    DEFAULT_LLM_EXTRA_BODY,
+    DEFAULT_REWRITER_MODEL,
+)
+from agentic_upsampling.data import PromptItem, load_prompt_items, prompt_dir_name
+from agentic_upsampling.extract_best import extract_best_images
+from agentic_upsampling.prompt_upsampler import (
+    Text2ImagePromptUpsampler,
+    apply_t2i_output_parameters,
+    normalize_openai_base_url,
+)
+from agentic_upsampling.rubric import parse_analysis_response
+from agentic_upsampling.runner import AgenticUpsamplerRunner, RunnerConfig
+def _item(prompt_id: str = "1", prompt: str = "a red cube") -> PromptItem:
+    """Build a PromptItem for tests with the given id and prompt text and row_number 0."""
+    return PromptItem(prompt_id=prompt_id, row_number=0, prompt=prompt)
+def _valid_t2i_prompt(caption: str) -> dict[str, Any]:
+    """Return a structured text-to-image prompt dict used as a test fixture.
+
+    Only ``comprehensive_t2i_caption`` varies (it is set to ``caption``); every
+    other field is a fixed placeholder. The canvas defaults to 960x960 with
+    aspect ratio "1,1".
+
+    NOTE(review): the name implies this satisfies the schema checked by the
+    upsampler/rewriter; that validation is not visible here -- confirm.
+    """
+    return {
+        "subjects": [],
+        "subject_details": {},
+        "background_setting": "plain studio",
+        "lighting": {"conditions": "soft", "direction": "front", "shadows": "soft", "illumination_effect": "clear"},
+        "aesthetics": {
+            "composition": "centered",
+            "color_scheme": "balanced",
+            "mood_atmosphere": "precise",
+            "patterns": "",
+        },
+        "cinematography": {
+            "framing": "centered",
+            "camera_angle": "eye-level",
+            "depth_of_field": "deep",
+            "focus": "sharp",
+            "lens_focal_length": "standard",
+        },
+        "style_medium": "digital render",
+        "artistic_style": "clean realistic render",
+        "context": "test prompt",
+        "text_and_signage_elements": [],
+        "quadrant_scan": {
+            "top_left": "",
+            "top_right": "",
+            "bottom_left": "",
+            "bottom_right": "",
+            "absolute_center": "",
+        },
+        "comprehensive_t2i_caption": caption,
+        "resolution": {"H": 960, "W": 960},
+        "aspect_ratio": "1,1",
+    }
+class FakeChatClient:
+    """Chat-client stand-in that returns a canned response and records its last call.
+
+    ``complete`` overwrites ``messages`` and ``response_format_json`` on every
+    call, so only the most recent request can be inspected.
+    """
+    # Messages passed to the most recent ``complete`` call.
+    messages: list[dict[str, Any]]
+    # Value of ``response_format_json`` from the most recent ``complete`` call.
+    response_format_json: bool
+    def __init__(self, response: dict[str, Any]) -> None:
+        """Store ``response``, the object every ``complete`` call will serialize and return."""
+        self.response = response
+        self.messages = []
+        self.response_format_json = False
+    def complete(self, messages: list[dict[str, Any]], *, response_format_json: bool = False) -> str:
+        """Record the call arguments and return the canned response as a JSON string."""
+        self.messages = messages
+        self.response_format_json = response_format_json
+        return json.dumps(self.response)
+def test_defaults_are_public_provider_defaults() -> None:
+    """Pin the default rewriter/critic model names, LLM extra body and critic endpoint URL."""
+    assert DEFAULT_REWRITER_MODEL == "gpt-5.5"
+    assert DEFAULT_LLM_EXTRA_BODY == {"reasoning_effort": "low"}
+    assert DEFAULT_CRITIC_MODEL == "gemini-3.1-pro-preview"
+    assert DEFAULT_CRITIC_ENDPOINT_URL == "https://generativelanguage.googleapis.com/v1beta/openai/"
+def test_gemini_openai_compatible_base_url_is_not_modified() -> None:
+    """Check how normalize_openai_base_url treats the Gemini OpenAI-compatible URL.
+
+    NOTE(review): despite the test name, the expected values differ from the
+    inputs: the trailing slash and a trailing ``/chat/completions`` are both
+    expected to be removed, leaving the ``.../v1beta/openai`` base.
+    """
+    assert (
+        normalize_openai_base_url("https://generativelanguage.googleapis.com/v1beta/openai/")
+        == "https://generativelanguage.googleapis.com/v1beta/openai"
+    )
+    assert (
+        normalize_openai_base_url("https://generativelanguage.googleapis.com/v1beta/openai/chat/completions")
+        == "https://generativelanguage.googleapis.com/v1beta/openai"
+    )
+def test_prompt_loaders_support_text_jsonl_and_csv(tmp_path: Path) -> None:
+    """Exercise load_prompt_items on .txt, .jsonl and .csv prompt files written to tmp_path."""
+    # Plain text: the file contains an empty line, which is expected to yield no item.
+    txt_path = tmp_path / "prompts.txt"
+    txt_path.write_text("one\n\ntwo\n", encoding="utf-8")
+    assert [item.prompt for item in load_prompt_items(prompts_path=txt_path)] == ["one", "two"]
+    # JSONL: one row is an object with "id"/"prompt" keys, the other a bare JSON string.
+    jsonl_path = tmp_path / "prompts.jsonl"
+    jsonl_path.write_text('{"id":"custom id","prompt":"three"}\n"four"\n', encoding="utf-8")
+    jsonl_items = load_prompt_items(prompts_path=jsonl_path)
+    assert [item.prompt for item in jsonl_items] == ["three", "four"]
+    # The id "custom id" is expected to map to the directory name "custom_id".
+    assert prompt_dir_name(jsonl_items[0]) == "custom_id"
+    # CSV: header row names the "id" and "prompt" columns.
+    csv_path = tmp_path / "prompts.csv"
+    csv_path.write_text("id,prompt\nfive_id,five\n", encoding="utf-8")
+    csv_items = load_prompt_items(prompts_path=csv_path)
+    assert csv_items[0].prompt_id == "five_id"
+    assert csv_items[0].prompt == "five"
+def test_prompt_upsampler_applies_resolution_and_requests_json() -> None:
+    """Upsampling must apply the requested canvas and ask the chat client for JSON output.
+
+    The fake client returns a 960x960 / "1,1" prompt; the result is expected to
+    carry the requested "720" / "16,9" canvas (720x1280) instead.
+    """
+    prompt_json = _valid_t2i_prompt("initial cube prompt")
+    fake_client = FakeChatClient(prompt_json)
+    upsampler = Text2ImagePromptUpsampler(fake_client)  # type: ignore[arg-type]
+    result = upsampler.upsample("a cube", prompt_id="cube", resolution="720", aspect_ratio="16,9")
+    assert result["resolution"] == {"H": 720, "W": 1280}
+    assert result["aspect_ratio"] == "16,9"
+    assert fake_client.response_format_json is True
+def test_apply_t2i_output_parameters_rejects_bad_canvas() -> None:
+    """Resolution "999" must raise ValueError whose message contains "Unsupported resolution"."""
+    try:
+        apply_t2i_output_parameters(_valid_t2i_prompt("x"), resolution="999", aspect_ratio="1,1")
+    except ValueError as exc:
+        assert "Unsupported resolution" in str(exc)
+    else:
+        # Reached only when no exception was raised, which is a test failure.
+        raise AssertionError("Expected unsupported resolution error.")
+def test_prompt_rewriter_joint_rewrite_uses_vlm_feedback() -> None:
+    """rewrite_prompt_pair must forward critic feedback to the LLM and parse its reply.
+
+    The rewriter's ``rewrite_client`` is replaced with a FakeChatClient that
+    returns a ``positive_prompt`` / ``negative_prompt`` pair. The test checks the
+    parsed pair, that JSON output was requested, and that the second message
+    sent contains the issue description and the improvement directive but not
+    the string "raw_response".
+    """
+    previous_prompt = _valid_t2i_prompt("old cube prompt")
+    rewritten_prompt = _valid_t2i_prompt("new cube prompt with no 4x4 grid")
+    analysis = {
+        "overall_score": 2.0,
+        "prompt_adherence_score": 3.0,
+        "category_score": 3.0,
+        "issues": [
+            {
+                "category": "geometry",
+                "description": "Generated a 4x4 grid instead of a 3x3 cube.",
+                "severity": "severe",
+            }
+        ],
+        "improvement_directives": ["Strictly enforce 3x3x3 geometry."],
+        "raw_response": "large omitted blob",
+    }
+    rewriter = PromptRewriterClient(api_token="unused")
+    fake_client = FakeChatClient({"positive_prompt": rewritten_prompt, "negative_prompt": "4x4 grid"})
+    rewriter.rewrite_client = fake_client  # type: ignore[assignment]
+    # Positional arguments; presumably (item, previous prompt, previous negative
+    # prompt, previous analysis, history) -- confirm against PromptRewriterClient.
+    positive_prompt, negative_prompt = rewriter.rewrite_prompt_pair(
+        _item("39", "A Rubik's cube mid twist with the top layer rotated exactly 45 degrees"),
+        previous_prompt,
+        "",
+        analysis,
+        [{"iteration": 0, "analysis": analysis}],
+    )
+    assert positive_prompt["comprehensive_t2i_caption"] == "new cube prompt with no 4x4 grid"
+    assert negative_prompt == "4x4 grid"
+    assert fake_client.response_format_json is True
+    # messages[1] is treated as the user message by this test.
+    user_message = str(fake_client.messages[1]["content"])
+    assert "Generated a 4x4 grid" in user_message
+    assert "Strictly enforce 3x3x3 geometry" in user_message
+    assert "raw_response" not in user_message
+def test_generation_payload_uses_vllm_omni_images_api() -> None:
+    """Pin the request payload that ImageGenerationClient.build_payload produces.
+
+    Also checks that an endpoint given with a trailing ``/v1`` is stored as the
+    bare host URL, and that ``model_mode`` / ``prompt_upsampling`` keys are absent.
+    """
+    client = ImageGenerationClient(endpoint="https://example.test/v1", model="test/model")
+    payload = client.build_payload({"comprehensive_t2i_caption": "x"}, prompt_id="3", seed=100, negative_prompt="blur")
+    assert client.endpoint == "https://example.test"
+    assert payload["model"] == "test/model"
+    # The prompt dict is expected as compact JSON (no spaces after separators).
+    assert payload["prompt"] == '{"comprehensive_t2i_caption":"x"}'
+    assert payload["size"] == "1024x1024"
+    assert payload["n"] == 1
+    assert payload["response_format"] == "b64_json"
+    assert payload["negative_prompt"] == "blur"
+    assert payload["num_inference_steps"] == 50
+    assert payload["guidance_scale"] == 4.0
+    assert payload["flow_shift"] == DEFAULT_FLOW_SHIFT
+    assert payload["extra_args"] == DEFAULT_GENERATION_EXTRA_ARGS
+    assert payload["seed"] == 100
+    assert "model_mode" not in payload
+    assert "prompt_upsampling" not in payload
+def test_generation_payload_allows_custom_extra_args() -> None:
+    """``extra_args`` passed to the client constructor must appear unchanged in the payload."""
+    client = ImageGenerationClient(endpoint="https://example.test", extra_args={"guardrails": True})
+    payload = client.build_payload({"comprehensive_t2i_caption": "x"}, prompt_id="3")
+    assert payload["extra_args"] == {"guardrails": True}
+class FakeImageResponse:
+    """Stand-in for an HTTP response: always successful, with a caller-supplied JSON body."""
+    # Fixed class-level values describing a successful response.
+    ok: bool = True
+    status_code: int = 200
+    text: str = "ok"
+    def __init__(self, payload: dict[str, Any]) -> None:
+        """Store the object that ``json()`` will return."""
+        self.payload = payload
+    def json(self) -> dict[str, Any]:
+        """Return the stored payload."""
+        return self.payload
+class FakeImageSession:
+    """Stand-in for an HTTP session that records every request and returns a canned response."""
+    # One dict per ``request`` call, with keys "method", "url" and "kwargs".
+    calls: list[dict[str, Any]]
+    def __init__(self, response_payload: dict[str, Any]) -> None:
+        """Store the JSON payload that every returned FakeImageResponse will carry."""
+        self.response_payload = response_payload
+        self.calls = []
+    def request(self, method: str, url: str, **kwargs: Any) -> FakeImageResponse:
+        """Record the call and return a FakeImageResponse wrapping the canned payload."""
+        self.calls.append({"method": method, "url": url, "kwargs": kwargs})
+        return FakeImageResponse(self.response_payload)
+def _tiny_png_b64() -> str:
+    """Return a 4x4 solid-green RGB PNG encoded as an ASCII base64 string."""
+    buf = io.BytesIO()
+    Image.new("RGB", (4, 4), (0, 255, 0)).save(buf, format="PNG")
+    return base64.b64encode(buf.getvalue()).decode("ascii")
+def test_generation_client_decodes_vllm_omni_b64_response(tmp_path: Path) -> None:
+    """generate() must POST to the images endpoint, write the image and record metadata.
+
+    The endpoint is given without a scheme ("example.test"); the requested URL is
+    expected to be "https://example.test/v1/images/generations" with a bearer
+    Authorization header. In the stored metadata the base64 image data is
+    expected to be replaced by a placeholder starting with
+    "<base64 image omitted:".
+    """
+    session = FakeImageSession({"created": 1, "data": [{"b64_json": _tiny_png_b64(), "revised_prompt": None}]})
+    client = ImageGenerationClient(endpoint="example.test", auth_key="secret-token", session=session)  # type: ignore[arg-type]
+    result = client.generate(prompt_json=_valid_t2i_prompt("x"), prompt_id="3", output_dir=tmp_path, seed=5)
+    assert result.image_path.exists()
+    assert session.calls[0]["method"] == "POST"
+    assert session.calls[0]["url"] == "https://example.test/v1/images/generations"
+    assert session.calls[0]["kwargs"]["headers"] == {"Authorization": "Bearer secret-token"}
+    assert session.calls[0]["kwargs"]["json"]["model"] == DEFAULT_GENERATION_MODEL
+    meta = json.loads(result.meta_path.read_text(encoding="utf-8"))
+    assert meta["status"] == "completed"
+    assert meta["response"]["data"][0]["b64_json"].startswith("<base64 image omitted:")
+def test_parse_analysis_response_sets_threshold_flag() -> None:
+    """parse_analysis_response must set ``threshold_cleared`` to True for this high-scoring reply.
+
+    NOTE(review): which score(s) drive the flag is decided in rubric.py and is
+    not visible here; this test only pins the outcome for the scores below.
+    """
+    analysis = parse_analysis_response(
+        """
+        {
+          "prompt_adherence_score": 9,
+          "visual_quality_score": 9,
+          "aesthetics_score": 8.5,
+          "physical_plausibility_score": 8,
+          "category_score": 9,
+          "text_rendering_score": 9,
+          "photorealism_score": null,
+          "overall_score": 9.1,
+          "issues": [],
+          "category_findings": {},
+          "improvement_directives": [],
+          "rationale": "Strong."
+        }
+        """,
+    )
+    assert analysis["threshold_cleared"] is True
+class FakeRewriter:
+    """Rewriter stand-in that counts calls and returns deterministic prompts."""
+    # Number of ``initial_prompt`` calls.
+    initial_calls: int
+    # Number of ``rewrite_prompt_pair`` calls.
+    joint_rewrite_calls: int
+    # ``overall_score`` of the analysis passed to each ``rewrite_prompt_pair`` call, in call order.
+    previous_scores: list[float]
+    def __init__(self) -> None:
+        """Start with zeroed counters and an empty score log."""
+        self.initial_calls = 0
+        self.joint_rewrite_calls = 0
+        self.previous_scores = []
+    def initial_prompt(self, item: PromptItem) -> dict[str, Any]:
+        """Count the call and return a fixture prompt captioned ``initial <prompt_id>``."""
+        self.initial_calls += 1
+        return _valid_t2i_prompt(f"initial {item.prompt_id}")
+    def rewrite_prompt_pair(
+        self,
+        item: PromptItem,
+        previous_prompt: dict[str, Any],
+        previous_negative_prompt: str,
+        previous_analysis: dict[str, Any],
+        history: list[dict[str, Any]],
+    ) -> tuple[dict[str, Any], str]:
+        """Count the call, log the previous overall score, and return a prompt pair.
+
+        Both returned values embed ``len(history)``; ``item``, ``previous_prompt``
+        and ``previous_negative_prompt`` are ignored.
+        """
+        self.joint_rewrite_calls += 1
+        self.previous_scores.append(float(previous_analysis["overall_score"]))
+        return _valid_t2i_prompt(f"rewrite {len(history)}"), f"negative {len(history)}"
+# Immutable result record returned by the fake generators below.
+@dataclass(frozen=True, slots=True)
+class FakeGeneration:
+    # Path of the image file written by the generator.
+    image_path: Path
+    # Path of the JSON metadata file written next to the image.
+    meta_path: Path
+    # Metadata dict returned alongside the files.
+    meta: dict[str, Any]
+class FakeGenerator:
+    """Image-generator stand-in that records its inputs and writes a placeholder image."""
+    # Seed passed to each ``generate`` call, in call order.
+    seeds: list[int | None]
+    # Negative prompt passed to each ``generate`` call, in call order.
+    negative_prompts: list[str]
+    def __init__(self) -> None:
+        """Start with empty call logs."""
+        self.seeds = []
+        self.negative_prompts = []
+    def generate(
+        self,
+        *,
+        prompt_json: dict[str, Any],
+        prompt_id: str,
+        output_dir: Path,
+        seed: int | None = None,
+        negative_prompt: str = "",
+        jpeg_quality: int = 95,
+    ) -> FakeGeneration:
+        """Log ``seed``/``negative_prompt`` and write an image plus metadata into ``output_dir``.
+
+        ``prompt_json``, ``prompt_id`` and ``jpeg_quality`` are accepted but unused.
+        The image is an 8x8 solid-red RGB picture saved as ``image.jpg``; the
+        metadata file ``generation_meta.json`` contains ``{"status":"completed"}``.
+        """
+        self.seeds.append(seed)
+        self.negative_prompts.append(negative_prompt)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        image_path = output_dir / "image.jpg"
+        Image.new("RGB", (8, 8), (255, 0, 0)).save(image_path)
+        meta_path = output_dir / "generation_meta.json"
+        meta_path.write_text('{"status":"completed"}\n', encoding="utf-8")
+        return FakeGeneration(image_path=image_path, meta_path=meta_path, meta={"status": "completed"})
+class BarrierGenerator(FakeGenerator):
+    """FakeGenerator variant whose ``generate`` calls must overlap in time.
+
+    Each call waits on a ``threading.Barrier`` sized to ``parties``, so it can
+    only return once that many calls are in flight together.
+    """
+    # Rendezvous point shared by all concurrent ``generate`` calls.
+    barrier: threading.Barrier
+    # Guards the appends to the inherited ``seeds`` / ``negative_prompts`` lists.
+    lock: threading.Lock
+    def __init__(self, parties: int) -> None:
+        """Create the barrier for ``parties`` concurrent callers and the list lock."""
+        super().__init__()
+        self.barrier = threading.Barrier(parties)
+        self.lock = threading.Lock()
+    def generate(
+        self,
+        *,
+        prompt_json: dict[str, Any],
+        prompt_id: str,
+        output_dir: Path,
+        seed: int | None = None,
+        negative_prompt: str = "",
+        jpeg_quality: int = 95,
+    ) -> FakeGeneration:
+        """Log the call under the lock, wait at the barrier, then write the same files as the parent.
+
+        ``prompt_json``, ``prompt_id`` and ``jpeg_quality`` are accepted but unused.
+        """
+        with self.lock:
+            self.seeds.append(seed)
+            self.negative_prompts.append(negative_prompt)
+        # Waits up to 2 seconds for the other parties; per the threading docs a
+        # timed-out wait breaks the barrier and raises BrokenBarrierError.
+        self.barrier.wait(timeout=2.0)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        image_path = output_dir / "image.jpg"
+        Image.new("RGB", (8, 8), (255, 0, 0)).save(image_path)
+        meta_path = output_dir / "generation_meta.json"
+        meta_path.write_text('{"status":"completed"}\n', encoding="utf-8")
+        return FakeGeneration(image_path=image_path, meta_path=meta_path, meta={"status": "completed"})
+class FakeJudge:
+    """Critic stand-in that hands out pre-set scores in call order."""
+    # Number of ``score_image`` calls made so far; also the index of the next score.
+    calls: int
+    # Scores to return, one per ``score_image`` call.
+    scores: list[float]
+    def __init__(self, scores: list[float]) -> None:
+        """Store the score sequence and reset the call counter."""
+        self.calls = 0
+        self.scores = scores
+    def score_image(
+        self,
+        *,
+        item: PromptItem,
+        image_path: Path,
+    ) -> dict[str, Any]:
+        """Return an analysis dict built from the next pre-set score.
+
+        ``item`` and ``image_path`` are ignored. Every ``*_score`` field is set
+        to the same value, and ``threshold_cleared`` is True when it is >= 9.
+        Indexing raises IndexError once more calls are made than scores supplied.
+        """
+        score = self.scores[self.calls]
+        self.calls += 1
+        return {
+            "overall_score": score,
+            "prompt_adherence_score": score,
+            "visual_quality_score": score,
+            "aesthetics_score": score,
+            "physical_plausibility_score": score,
+            "category_score": score,
+            "issues": [],
+            "improvement_directives": [],
+            "threshold_cleared": score >= 9,
+        }
+def test_runner_early_stops_by_default(tmp_path: Path) -> None:
+    """With default config the runner must stop after a first iteration that clears the threshold.
+
+    The judge's first score is 9.1 (``threshold_cleared`` True in FakeJudge), so
+    although ``max_iterations`` is 3 the test expects one initial prompt, no
+    rewrite, and a single generation whose seed is None (no ``seed_base`` given).
+    """
+    rewriter = FakeRewriter()
+    generator = FakeGenerator()
+    runner = AgenticUpsamplerRunner(
+        rewriter=rewriter,
+        generator=generator,  # type: ignore[arg-type]
+        judge=FakeJudge([9.1, 8.0]),
+        config=RunnerConfig(output_dir=tmp_path, max_iterations=3, samples_per_iteration=1),
+    )
+    result = runner.run_item(_item())
+    assert result["best_iteration"] == 0
+    assert rewriter.initial_calls == 1
+    assert rewriter.joint_rewrite_calls == 0
+    assert generator.seeds == [None]
+def test_runner_can_disable_early_stop_and_select_best_sample(tmp_path: Path) -> None:
+    """With ``early_stop=False`` the runner must run every iteration and keep the best sample.
+
+    Two iterations of three samples each are scored 5/9/7 and 6/10/8. Expected:
+    the same seeds 1000..1002 are used in both iterations, the rewriter is
+    called with the 9.0 analysis from iteration 0, and the overall best is
+    sample 1 of iteration 1 (score 10.0).
+    """
+    rewriter = FakeRewriter()
+    generator = FakeGenerator()
+    runner = AgenticUpsamplerRunner(
+        rewriter=rewriter,
+        generator=generator,  # type: ignore[arg-type]
+        judge=FakeJudge([5.0, 9.0, 7.0, 6.0, 10.0, 8.0]),
+        config=RunnerConfig(
+            output_dir=tmp_path,
+            max_iterations=2,
+            samples_per_iteration=3,
+            seed_base=1000,
+            early_stop=False,
+        ),
+    )
+    result = runner.run_item(_item("8", "exactly 12 balloons with exact color counts"))
+    assert generator.seeds == [1000, 1001, 1002, 1000, 1001, 1002]
+    assert rewriter.previous_scores == [9.0]
+    assert result["best_iteration"] == 1
+    assert result["best"]["selected_sample_index"] == 1
+    assert result["iterations"][0]["selected_sample_index"] == 1
+def test_runner_generates_seed_samples_in_parallel(tmp_path: Path) -> None:
+    """The three samples of one iteration must be generated concurrently.
+
+    BarrierGenerator(parties=3) only lets ``generate`` return once three calls
+    are in flight together, so a sequential runner would hit the barrier timeout.
+    """
+    rewriter = FakeRewriter()
+    generator = BarrierGenerator(parties=3)
+    runner = AgenticUpsamplerRunner(
+        rewriter=rewriter,
+        generator=generator,  # type: ignore[arg-type]
+        judge=FakeJudge([5.0, 6.0, 7.0]),
+        config=RunnerConfig(
+            output_dir=tmp_path,
+            max_iterations=1,
+            samples_per_iteration=3,
+            seed_base=2000,
+            early_stop=False,
+        ),
+    )
+    result = runner.run_item(_item("parallel", "a parallel seed test"))
+    # Arrival order of the concurrent calls is not fixed, hence the sort.
+    assert sorted(generator.seeds) == [2000, 2001, 2002]
+    # Presumably the judge scores samples in index order, making sample 2 (7.0) the best -- confirm in runner.py.
+    assert result["best"]["selected_sample_index"] == 2
+    assert result["iterations"][0]["sample_count"] == 3
+def test_extract_best_images_copies_images_and_writes_manifests(tmp_path: Path) -> None:
+    """extract_best_images must copy each best image and write JSONL and CSV manifests.
+
+    The fixture is a run directory with one prompt folder (``0001``) holding an
+    ``iter_00/image.jpg`` and a ``best.json`` pointing at it. The copied image is
+    expected to be named after the prompt id (``1.jpg``).
+    """
+    output_dir = tmp_path / "run"
+    image_dir = output_dir / "0001" / "iter_00"
+    image_dir.mkdir(parents=True)
+    image_path = image_dir / "image.jpg"
+    Image.new("RGB", (8, 8), (255, 0, 0)).save(image_path)
+    best_json = {
+        "prompt_id": "1",
+        "prompt": "a red square",
+        "best_iteration": 0,
+        "best_score": 9.25,
+        "threshold_cleared_any": True,
+        "best": {
+            "selected_sample_index": 0,
+            "image_path": str(image_path),
+            # This analysis file is referenced but never created by the test.
+            "analysis_path": str(image_dir / "analysis.json"),
+        },
+        "iterations": [],
+    }
+    (output_dir / "0001" / "best.json").write_text(json.dumps(best_json), encoding="utf-8")
+    records = extract_best_images(output_dir, tmp_path / "export")
+    assert len(records) == 1
+    copied_path = Path(records[0]["copied_image_path"])
+    assert copied_path.exists()
+    assert copied_path.name == "1.jpg"
+    assert (tmp_path / "export" / "best_generations.jsonl").exists()
+    assert (tmp_path / "export" / "best_generations.csv").exists()

text_tokenizer/added_tokens.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

text_tokenizer/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,120 @@

+{#- System turn: with tools, emit the first message's text (if it is a system message) followed by the tool signatures and call-format instructions; without tools, emit a system turn only when the first message is a system message. -#}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {%- if messages[0].content is string %}
+            {{- messages[0].content }}
+        {%- else %}
+            {%- for content in messages[0].content %}
+                {%- if 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' }}
+        {%- if messages[0].content is string %}
+            {{- messages[0].content }}
+        {%- else %}
+            {%- for content in messages[0].content %}
+                {%- if 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Running counters used for the optional "Picture N:" / "Video N:" labels; namespaces let the loops below update them. -#}
+{%- set image_count = namespace(value=0) %}
+{%- set video_count = namespace(value=0) %}
+{#- Conversation turns: only user, assistant and tool roles are rendered in this loop; consecutive tool messages are grouped into a single user turn. -#}
+{%- for message in messages %}
+    {%- if message.role == "user" %}
+        {{- '<|im_start|>' + message.role + '\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content in message.content %}
+                {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
+                    <|vision_start|><|image_pad|><|vision_end|>
+                {%- elif content.type == 'video' or 'video' in content %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
+                    <|vision_start|><|video_pad|><|vision_end|>
+                {%- elif 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role + '\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content_item in message.content %}
+                {%- if 'text' in content_item %}
+                    {{- content_item.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and message.content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content in message.content %}
+                {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
+                    <|vision_start|><|image_pad|><|vision_end|>
+                {%- elif content.type == 'video' or 'video' in content %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
+                    <|vision_start|><|video_pad|><|vision_end|>
+                {%- elif 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Open an assistant turn for the model to complete when a generation prompt is requested. -#}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

text_tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

text_tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

text_tokenizer/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+size 11422654

text_tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 262144,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

text_tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0].role == 'system' %}\n        {%- if messages[0].content is string %}\n            {{- messages[0].content }}\n        {%- else %}\n            {%- for content in messages[0].content %}\n                {%- if 'text' in content %}\n                    {{- content.text }}\n                {%- endif %}\n            {%- endfor %}\n        {%- endif %}\n        {{- '\\n\\n' }}\n    {%- endif %}\n    {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0].role == 'system' %}\n        {{- '<|im_start|>system\\n' }}\n        {%- if messages[0].content is string %}\n            {{- messages[0].content }}\n        {%- else %}\n            {%- for content in messages[0].content %}\n                {%- if 'text' in content %}\n                    {{- content.text }}\n                {%- endif %}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- for message in messages %}\n    {%- if message.role == \"user\" %}\n        {{- '<|im_start|>' + message.role + '\\n' }}\n        {%- if message.content is string %}\n            {{- message.content }}\n        {%- else %}\n            {%- for content in message.content %}\n                {%- if content.type == 'image' or 'image' in content or 'image_url' in 
content %}\n                    {%- set image_count.value = image_count.value + 1 %}\n                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n                    <|vision_start|><|image_pad|><|vision_end|>\n                {%- elif content.type == 'video' or 'video' in content %}\n                    {%- set video_count.value = video_count.value + 1 %}\n                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n                    <|vision_start|><|video_pad|><|vision_end|>\n                {%- elif 'text' in content %}\n                    {{- content.text }}\n                {%- endif %}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role + '\\n' }}\n        {%- if message.content is string %}\n            {{- message.content }}\n        {%- else %}\n            {%- for content_item in message.content %}\n                {%- if 'text' in content_item %}\n                    {{- content_item.text }}\n                {%- endif %}\n            {%- endfor %}\n        {%- endif %}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if (loop.first and message.content) or (not loop.first) %}\n                    {{- '\\n' }}\n                {%- endif %}\n                {%- if tool_call.function %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- '<tool_call>\\n{\"name\": \"' }}\n                {{- tool_call.name }}\n                {{- '\", \"arguments\": ' }}\n                {%- if tool_call.arguments is string %}\n                    {{- tool_call.arguments }}\n                {%- else %}\n                    {{- tool_call.arguments | tojson }}\n                {%- endif %}\n                {{- '}\\n</tool_call>' }}\n            {%- endfor %}\n        {%- 
endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {%- if message.content is string %}\n            {{- message.content }}\n        {%- else %}\n            {%- for content in message.content %}\n                {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n                    {%- set image_count.value = image_count.value + 1 %}\n                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n                    <|vision_start|><|image_pad|><|vision_end|>\n                {%- elif content.type == 'video' or 'video' in content %}\n                    {%- set video_count.value = video_count.value + 1 %}\n                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n                    <|vision_start|><|video_pad|><|vision_end|>\n                {%- elif 'text' in content %}\n                    {{- content.text }}\n                {%- endif %}\n            {%- endfor %}\n        {%- endif %}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 262144,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

transformer/config.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "_class_name": "Cosmos3OmniTransformer",
+  "_diffusers_version": "0.37.1",
+  "action_dim": 32,
+  "action_gen": false,
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "base_fps": 24,
+  "dtype": "bfloat16",
+  "enable_fps_modulation": true,
+  "freeze_und": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 25600,
+  "joint_attn_implementation": "two_way",
+  "latent_channel": 48,
+  "latent_patch_size": 2,
+  "max_action_dim": 32,
+  "max_position_embeddings": 262144,
+  "model_type": "qwen3_vl_text",
+  "num_attention_heads": 64,
+  "num_embodiment_domains": 32,
+  "num_hidden_layers": 64,
+  "num_key_value_heads": 8,
+  "patch_latent_dim": 192,
+  "position_embedding_type": "unified_3d_mrope",
+  "qk_norm": false,
+  "qk_norm_for_diffusion": true,
+  "qk_norm_for_text": true,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "mrope_interleaved": true,
+    "mrope_section": [
+      24,
+      20,
+      20
+    ],
+    "rope_type": "default"
+  },
+  "rope_theta": 5000000,
+  "sound_dim": 64,
+  "sound_gen": true,
+  "sound_latent_fps": 25,
+  "temporal_compression_factor_sound": 1,
+  "timestep_scale": 0.001,
+  "unified_3d_mrope_reset_spatial_ids": true,
+  "unified_3d_mrope_temporal_modality_margin": 15000,
+  "use_cache": true,
+  "use_moe": true,
+  "video_temporal_causal": false,
+  "vocab_size": 151936
+}

transformer/diffusion_pytorch_model-00001-of-00027.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:28fe2fcd84de5c3e1a26a5224fda8da81b13ba6e58cf6073460f9b04403a33d6
+size 4932297056