Instructions to use nvidia/Cosmos3-Super-Image2Video with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use nvidia/Cosmos3-Super-Image2Video with Cosmos:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Diffusers
How to use nvidia/Cosmos3-Super-Image2Video with Diffusers:
pip install -U diffusers transformers accelerate
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image, export_to_video

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("nvidia/Cosmos3-Super-Image2Video", dtype=torch.bfloat16, device_map="cuda")
pipe.to("cuda")

prompt = "A man with short gray hair plays a red electric guitar."
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guitar-man.png"
)

output = pipe(image=image, prompt=prompt).frames[0]
export_to_video(output, "output.mp4")
- Notebooks
- Google Colab
- Kaggle
File size: 2,394 Bytes
8889131 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | """Minimal image-to-video generation against a vLLM-Omni endpoint (sync mode).
Run from the Cosmos3-Super-Image2Video repo root:
python scripts/gen_video.py \
--endpoint <endpoint-url> \
--prompt-file assets/example_prompt.json \
--image-path assets/example_first_frame.png \
--output-path scripts/output.mp4
"""
import argparse
import json
from pathlib import Path
import requests
# Generation settings are pinned for this script: a 16:9, 480p clip made of
# 189 frames at 24 frames per second.
ASPECT_RATIO = "16,9"
WIDTH, HEIGHT = 832, 480
NUM_FRAMES, FPS = 189, 24
def _parse_args() -> argparse.Namespace:
    """Parse the command-line options for one generation request."""
    parser = argparse.ArgumentParser(description="Generate one I2V sample (sync mode).")
    parser.add_argument("--endpoint", required=True, help="vLLM-Omni endpoint base URL.")
    parser.add_argument("--prompt-file", type=Path, default=Path("assets/example_prompt.json"))
    parser.add_argument("--image-path", type=Path, default=Path("assets/example_first_frame.png"))
    parser.add_argument("--output-path", type=Path, default=Path("scripts/output.mp4"))
    return parser.parse_args()


def _build_form_fields(spec: dict) -> dict:
    """Build the form fields of the request from the parsed prompt file.

    Args:
        spec: Parsed prompt file. It must contain ``"prompt"`` (a string that
            is itself a JSON-encoded object) and ``"negative_prompt"``.

    Returns:
        The form fields to send alongside the image upload.

    Raises:
        KeyError: If ``spec`` lacks ``"prompt"`` or ``"negative_prompt"``.
        json.JSONDecodeError: If ``spec["prompt"]`` is not valid JSON.
    """
    # Decode the embedded prompt and overwrite its metadata so it always agrees
    # with the fixed generation settings, then re-encode it as JSON.
    prompt = json.loads(spec["prompt"])
    # Whole seconds only: the fractional part of NUM_FRAMES / FPS is dropped.
    prompt["duration"] = f"{int(NUM_FRAMES / FPS)}s"
    prompt["fps"] = float(round(FPS))
    prompt["resolution"] = {"H": HEIGHT, "W": WIDTH}
    prompt["aspect_ratio"] = ASPECT_RATIO

    return {
        "prompt": json.dumps(prompt, ensure_ascii=False),
        "negative_prompt": spec["negative_prompt"],
        "size": f"{WIDTH}x{HEIGHT}",
        "num_frames": NUM_FRAMES,
        "fps": FPS,
        "num_inference_steps": 50,
        "guidance_scale": 6.0,
        "flow_shift": 5.0,
        "extra_params": json.dumps({"use_resolution_template": False, "use_duration_template": False}),
    }


def _image_part(image_path: Path) -> tuple:
    """Return the ``(filename, content, content_type)`` upload tuple for the image.

    The content type is derived from the file name so that a non-PNG input is
    not mislabelled. When no image type can be guessed, the upload is labelled
    ``input.png`` / ``image/png``.

    Raises:
        OSError: If the image file cannot be read.
    """
    import mimetypes  # local import: only this helper needs it

    guessed, _ = mimetypes.guess_type(image_path.name)
    if guessed and guessed.startswith("image/"):
        upload_name, content_type = f"input{image_path.suffix}", guessed
    else:
        upload_name, content_type = "input.png", "image/png"
    return (upload_name, image_path.read_bytes(), content_type)


def main() -> None:
    """Generate one image-to-video sample and save it to disk.

    Reads the prompt file and the first-frame image, posts them to the
    ``/v1/videos/sync`` route of the given endpoint, and writes the response
    body to ``--output-path``, creating parent directories as needed.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        OSError: If an input file cannot be read or the output cannot be written.
    """
    args = _parse_args()
    spec = json.loads(args.prompt_file.read_text(encoding="utf-8"))

    data = _build_form_fields(spec)
    files = {"input_reference": _image_part(args.image_path)}
    # NOTE(review): the User-Agent imitates curl; presumably something in front
    # of the endpoint filters on it -- confirm before changing.
    headers = {"Accept": "video/mp4", "User-Agent": "curl/8.5.0"}

    # Strip trailing slashes so "http://host/" does not become "http://host//v1/...".
    base_url = args.endpoint.rstrip("/")
    # The timeout tuple is (connect, read) in seconds.
    response = requests.post(
        f"{base_url}/v1/videos/sync",
        data=data,
        files=files,
        headers=headers,
        timeout=(10, 600),
    )
    response.raise_for_status()

    args.output_path.parent.mkdir(parents=True, exist_ok=True)
    args.output_path.write_bytes(response.content)
    print(f"Saved video to {args.output_path} ({len(response.content) / (1024 * 1024):.1f} MB)")


if __name__ == "__main__":
    main()
|