Instructions to use nvidia/Cosmos3-Super-Text2Image with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use nvidia/Cosmos3-Super-Text2Image with Cosmos:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Diffusers
How to use nvidia/Cosmos3-Super-Text2Image with Diffusers:
pip install -U diffusers transformers accelerate
import torch
from diffusers import DiffusionPipeline

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("nvidia/Cosmos3-Super-Text2Image", dtype=torch.bfloat16, device_map="cuda")

prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
image = pipe(prompt).images[0]
- Notebooks
- Google Colab
- Kaggle
File size: 9,040 Bytes
"""VLM critic prompt and score normalization for agentic T2I upsampling."""
from __future__ import annotations
import json
from typing import Any
from agentic_upsampling.constants import STRICT_OVERALL_THRESHOLD, STRICT_PROMPT_THRESHOLD
from agentic_upsampling.data import PromptItem
from agentic_upsampling.prompt_upsampler import extract_json_object
# Checklist paragraph for each evaluation category. Every value is rendered as a
# bullet by all_category_check_text() and embedded in the judge prompt.
CATEGORY_SECTIONS = dict(
    text_commercial_ui=(
        "Text/commercial/UI/logo checks: readable text for logos, labels, posters, "
        "billboards, product packaging, or UI. Verify exact quoted strings, spelling, legibility, typography, "
        "placement, layout, and whether commercial/UI intent is visually clear."
    ),
    people_anatomy=(
        "People/anatomy checks: if humans, human-like characters, body parts, portraits, or poses are present or "
        "required by the prompt, inspect faces, eyes, hands, fingers, limbs, pose, proportions, expression, "
        "clothing coherence, and physically possible interactions."
    ),
    fantasy_cartoon_vector=(
        "Fantasy/cartoon/vector/pixel-art checks: if a stylized medium is requested, judge whether stylization is "
        "intentional and clean. Penalize messy geometry, inconsistent line language, broken vector shapes, muddy "
        "palettes, and unwanted photorealistic texture."
    ),
    photorealistic_physical=(
        "Photorealistic/physical checks: if realism, physical objects, geometry, camera behavior, reflections, "
        "transparent materials, shadows, perspective, scale, or contact matter, judge material realism, lighting "
        "physics, lens plausibility, and whether objects obey real-world physical constraints."
    ),
    general_scene=(
        "General scene checks: always judge object completeness, layout clarity, subject relationships, background "
        "coherence, visual appeal, and absence of obvious AI artifacts."
    ),
)
# Score fields that normalize_analysis() always coerces through _score().
SCORE_KEYS = (
    "prompt_adherence_score", "visual_quality_score", "aesthetics_score",
    "physical_plausibility_score", "category_score", "overall_score",
)

# Severity labels accepted for an issue; _normalize_issues() maps anything else to "moderate".
ISSUE_SEVERITIES = {"severe", "moderate", "minor"}
def all_category_check_text() -> str:
    """Render every category checklist as one newline-separated bullet list.

    No category is selected or filtered: each value of ``CATEGORY_SECTIONS``
    becomes one ``- ``-prefixed line, in the mapping's own order.
    """
    bullet_lines = [f"- {section}" for section in CATEGORY_SECTIONS.values()]
    return "\n".join(bullet_lines)
def build_judge_prompt(item: PromptItem) -> str:
    """Build the VLM critic prompt using the original user prompt as task context.

    Args:
        item: Prompt item whose ``prompt`` attribute is embedded verbatim in the
            instruction text as the prompt the image was generated from.

    Returns:
        The full instruction text for the critic. It lists the defect areas to
        inspect, appends the checklist from ``all_category_check_text()``, and
        asks for exactly one JSON object holding the score fields, ``issues``,
        ``prompt_elements``, ``category_findings``, ``improvement_directives``
        and ``rationale``.
    """
    # Only two values are interpolated: the category checklist and item.prompt.
    # Doubled braces ({{ and }}) are f-string escapes that render as the literal
    # braces of the JSON template shown to the model.
    return f"""You are an expert image quality analyst specializing in AI-generated image evaluation.
Your job is to produce an exhaustive defect report. Be meticulous: go beyond obvious problems and look carefully for subtle or background issues too.
The attached image was generated by an AI image model.
Analyze this image carefully and list every quality issue you observe.
For each issue give an approximate location and name the specific object or region involved. Report each distinct occurrence separately.
Before finalizing, check these areas, but only report issues you actually see:
- Physics: gravity violations, impossible collisions, implausible trajectories.
- Object deformation: morphing, melting, stretching of solid objects.
- Anatomy: distorted hands, faces, fingers, limbs, or wrong body proportions.
- Lighting and shadows: missing shadows or inconsistent illumination.
- Depth and scale: wrong spatial relationships, perspective issues, or scale inconsistencies.
- Text and numbers: garbled, floating, or incorrect text and digits.
- Visual quality: blur patches, noise, compression blocking, visual artifacts, or low-resolution regions.
- Color: inconsistent coloration, bleeding, or banding.
- Action correctness: prompted actions are correctly displayed.
- Prompt following: missing subjects, wrong objects, wrong setting, or wrong action.
Depending on the prompt, also apply the relevant checks below:
{all_category_check_text()}
The attached image was generated from this prompt:
{item.prompt}
Return exactly one JSON object, no markdown fences and no prose outside JSON:
{{
"prompt_adherence_score": <number 0-10>,
"visual_quality_score": <number 0-10>,
"aesthetics_score": <number 0-10>,
"physical_plausibility_score": <number 0-10>,
"category_score": <number 0-10>,
"text_rendering_score": <number 0-10 or null>,
"photorealism_score": <number 0-10 or null>,
"overall_score": <number 0-10>,
"issues": [
{{
"category": "<concise label>",
"description": "<what failed and where in the image>",
"severity": "minor" | "moderate" | "severe"
}}
],
"prompt_elements": {{
"<key noun or action from the prompt>": "present" | "absent" | "partial"
}},
"category_findings": {{"<check area>": "<concise finding>"}},
"improvement_directives": ["<specific prompt rewrite instruction>"],
"rationale": "<2-4 concise sentences>"
}}
"""
def parse_analysis_response(text: str) -> dict[str, Any]:
    """Turn the critic's raw reply text into a normalized analysis dict.

    The reply is handed to ``extract_json_object`` and the extracted payload is
    then passed through ``normalize_analysis``.
    """
    raw_payload = extract_json_object(text)
    return normalize_analysis(raw_payload)
def normalize_analysis(data: dict[str, Any]) -> dict[str, Any]:
    """Normalize VLM analysis into the schema used by selection and reporting.

    Works on a shallow copy, so ``data`` itself is not reassigned. Keys that are
    not mentioned below are carried over unchanged.

    Args:
        data: Parsed critic response.

    Returns:
        A new dict in which:

        * every key in ``SCORE_KEYS`` holds the result of ``_score``;
        * ``text_rendering_score`` / ``photorealism_score`` are passed through
          ``_score`` only when present and not ``None``;
        * ``issues`` is the output of ``_normalize_issues``;
        * ``improvement_directives`` is a list of non-blank strings (``[]`` when
          the field is not a list);
        * ``category_findings`` is a dict (``{}`` when the field is not a dict);
        * ``threshold_cleared`` is the result of ``clears_strict_threshold``.
    """
    normalized = dict(data)

    for key in SCORE_KEYS:
        normalized[key] = _score(normalized.get(key))

    # Optional scores stay None/absent when the critic did not rate them, so
    # "not applicable" remains distinguishable from a score of zero.
    for optional_key in ("text_rendering_score", "photorealism_score"):
        optional_value = normalized.get(optional_key)
        if optional_value is not None:
            normalized[optional_key] = _score(optional_value)

    normalized["issues"] = _normalize_issues(normalized.get("issues"))

    directives = normalized.get("improvement_directives")
    if isinstance(directives, list):
        # Skip None explicitly: str(None) is the non-blank text "None", which
        # would otherwise be kept as if it were a real rewrite directive.
        normalized["improvement_directives"] = [
            str(entry) for entry in directives if entry is not None and str(entry).strip()
        ]
    else:
        normalized["improvement_directives"] = []

    findings = normalized.get("category_findings")
    normalized["category_findings"] = findings if isinstance(findings, dict) else {}

    # Evaluated last so the threshold check sees the normalized scores and issues.
    normalized["threshold_cleared"] = clears_strict_threshold(normalized)
    return normalized
def clears_strict_threshold(analysis: dict[str, Any]) -> bool:
    """Tell whether an analysis passes the strict quality milestone.

    A candidate fails when its overall score is below
    ``STRICT_OVERALL_THRESHOLD``, its prompt-adherence score is below
    ``STRICT_PROMPT_THRESHOLD``, or any issue is marked severe. When a
    ``text_rendering_score`` is present (not ``None``), it must additionally
    reach ``STRICT_PROMPT_THRESHOLD``.
    """
    blocked = (
        _score(analysis.get("overall_score")) < STRICT_OVERALL_THRESHOLD
        or _score(analysis.get("prompt_adherence_score")) < STRICT_PROMPT_THRESHOLD
        or _has_severe_issue(analysis.get("issues"))
    )
    if blocked:
        return False

    text_rendering = analysis.get("text_rendering_score")
    # A missing text score means text rendering was not rated, so it cannot fail the gate.
    return text_rendering is None or _score(text_rendering) >= STRICT_PROMPT_THRESHOLD
def candidate_sort_key(candidate: dict[str, Any]) -> tuple[float, float, float, float, float, int]:
    """Build the comparison tuple used to pick the best candidate.

    The tuple holds, in priority order, the overall, prompt-adherence,
    category, visual-quality and aesthetics scores, followed by the negated
    iteration number. With a "largest wins" comparison, equal scores therefore
    favour the lower iteration.
    """
    analysis = candidate.get("analysis", {})
    iteration = int(candidate.get("iteration", 0))

    ranked_fields = (
        "overall_score",
        "prompt_adherence_score",
        "category_score",
        "visual_quality_score",
        "aesthetics_score",
    )
    ranked_scores = tuple(_score(analysis.get(field)) for field in ranked_fields)
    return (*ranked_scores, -iteration)
def compact_analysis_for_rewrite(analysis: dict[str, Any]) -> dict[str, Any]:
    """Pick out the analysis fields forwarded to the next prompt rewrite.

    Only fields actually present in ``analysis`` are copied; the result follows
    the fixed field order below rather than the input's key order.
    """
    forwarded_fields = (
        "overall_score",
        "prompt_adherence_score",
        "visual_quality_score",
        "aesthetics_score",
        "physical_plausibility_score",
        "category_score",
        "text_rendering_score",
        "photorealism_score",
        "issues",
        "prompt_elements",
        "category_findings",
        "improvement_directives",
        "rationale",
    )
    compact: dict[str, Any] = {}
    for field in forwarded_fields:
        if field in analysis:
            compact[field] = analysis.get(field)
    return compact
def analysis_json_text(data: dict[str, Any]) -> str:
    """Render ``data`` as two-space-indented, ASCII-only JSON for use inside a prompt."""
    dump_options = {"ensure_ascii": True, "indent": 2}
    return json.dumps(data, **dump_options)
def _score(value: Any) -> float:
if value is None:
return 0.0
try:
number = float(value)
except (TypeError, ValueError):
return 0.0
return max(0.0, min(10.0, number))
def _clean_issue(entry: Any) -> dict[str, str] | None:
    """Return one issue entry in canonical form, or ``None`` when it must be dropped."""
    if not isinstance(entry, dict):
        return None
    description = str(entry.get("description") or "").strip()
    if not description:
        # An issue without a description carries nothing to act on.
        return None
    category = str(entry.get("category") or "unspecified").strip() or "unspecified"
    severity = str(entry.get("severity") or "moderate").strip().lower()
    return {
        "category": category,
        "description": description,
        "severity": severity if severity in ISSUE_SEVERITIES else "moderate",
    }


def _normalize_issues(value: Any) -> list[dict[str, str]]:
    """Reduce the critic's ``issues`` field to a list of well-formed issue dicts.

    A non-list ``value`` yields ``[]``. Entries that are not dicts or have a
    blank description are skipped. A blank category becomes ``"unspecified"``,
    and a severity outside ``ISSUE_SEVERITIES`` becomes ``"moderate"``.
    """
    if not isinstance(value, list):
        return []
    cleaned_entries = (_clean_issue(entry) for entry in value)
    return [issue for issue in cleaned_entries if issue is not None]
def _has_severe_issue(issues: Any) -> bool:
return any(isinstance(item, dict) and item.get("severity") == "severe" for item in issues or [])
|