# NOTE(review): the three lines below are web-page residue (author avatar
# caption, commit message, commit hash) accidentally captured with the source.
# Kept as a comment so the file parses as valid Python; safe to delete.
# SanskarModi's picture
# updated app
# 9bc957e
"""Text-to-image generation with clean metadata output."""
from __future__ import annotations

import time
from typing import Any, Tuple

import torch
from PIL import Image

from sdgen.sd.models import GenerationMetadata, Txt2ImgConfig
from sdgen.utils.common import validate_resolution
from sdgen.utils.logger import get_logger
logger = get_logger(__name__)
def generate_image(
    pipe: Any,
    cfg: Txt2ImgConfig,
) -> Tuple[Image.Image, GenerationMetadata]:
    """Generate an image from text using a Stable Diffusion pipeline.

    Args:
        pipe: A diffusers StableDiffusionPipeline instance (duck-typed here:
            only invoked as ``pipe(prompt=..., ...)`` and expected to return
            an object with an ``images`` list).
        cfg: Structured configuration for text-to-image generation.

    Returns:
        A tuple of (PIL image, GenerationMetadata).
    """
    width, height = validate_resolution(cfg.width, cfg.height)
    # perf_counter() is monotonic; time.time() can jump if the wall clock
    # is adjusted, which would corrupt elapsed_seconds.
    start = time.perf_counter()

    seed = cfg.seed
    if seed is None:
        # Draw a fresh random seed; mask to 63 bits so manual_seed() accepts
        # it (it requires a non-negative value that fits a signed int64).
        seed = int(torch.seed() & ((1 << 63) - 1))

    device = cfg.device
    # torch.Generator accepts the device string directly ("cpu", "cuda", ...);
    # no special-casing of "cpu" is needed.
    gen = torch.Generator(device).manual_seed(int(seed))

    # Lazy %-style args so formatting is skipped when INFO is disabled.
    logger.info(
        "txt2img: steps=%s cfg=%s res=%sx%s seed=%s",
        cfg.steps,
        cfg.guidance_scale,
        width,
        height,
        seed,
    )

    # Autocast is only meaningful on CUDA here; any other device (cpu, mps,
    # ...) falls back to the cpu autocast context, matching prior behavior.
    autocast_device = device if device == "cuda" else "cpu"
    with torch.autocast(device_type=autocast_device):
        out = pipe(
            prompt=cfg.prompt,
            # Diffusers treats None as "no negative prompt"; normalize "" too.
            negative_prompt=cfg.negative_prompt or None,
            width=width,
            height=height,
            num_inference_steps=int(cfg.steps),
            guidance_scale=float(cfg.guidance_scale),
            generator=gen,
        )
    img = out.images[0]
    elapsed = time.perf_counter() - start

    meta = GenerationMetadata(
        mode="txt2img",
        prompt=cfg.prompt,
        # Metadata stores "" (not None) for an absent negative prompt.
        negative_prompt=cfg.negative_prompt or "",
        steps=int(cfg.steps),
        guidance_scale=float(cfg.guidance_scale),
        width=width,
        height=height,
        seed=int(seed),
        elapsed_seconds=float(elapsed),
    )
    return img, meta