Create app.py
app.py
ADDED
# app.py - Image+Command router: "describe photo" (caption), "write a story" (text), "make it a cartoon" (img2img)
# Deps:
#   pip install -q gradio transformers diffusers accelerate torch safetensors pillow
import os
import re
from typing import Optional

import torch
import gradio as gr
from PIL import Image

from transformers import (
    VisionEncoderDecoderModel,
    AutoImageProcessor,
    AutoTokenizer,
    pipeline as hf_pipeline,
)

from diffusers import StableDiffusionImg2ImgPipeline

# ----------------- Config -----------------
CAPTION_MODEL_ID = os.getenv("CAPTION_MODEL_ID", "nlpconnect/vit-gpt2-image-captioning")
# For longer/better stories set STORY_MODEL_ID=google/flan-t5-xl (needs ~10-12GB VRAM) or google/flan-ul2 (heavy)
STORY_MODEL_ID = os.getenv("STORY_MODEL_ID", "google/flan-t5-large")
IMG2IMG_MODEL_ID = os.getenv("IMG2IMG_MODEL_ID", "stabilityai/stable-diffusion-2-1")

MAX_IMG_SIDE = int(os.getenv("MAX_IMG_SIDE", "768"))
DEFAULT_STEPS = int(os.getenv("STEPS", "30"))
DEFAULT_GUIDANCE = float(os.getenv("GUIDANCE", "7.5"))
DEFAULT_STRENGTH = float(os.getenv("STRENGTH", "0.6"))

# Prefer CUDA, then Apple Silicon (MPS), then CPU
DEVICE = (
    "cuda"
    if torch.cuda.is_available()
    else ("mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu")
)
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") or None
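
# Example override (hypothetical shell invocation; any env var above works the same way):
#   STORY_MODEL_ID=google/flan-t5-xl STRENGTH=0.75 PORT=8080 python app.py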

# ----------------- Caches -----------------
_caption_bundle = {}
_story_pipe = None
_img2img_pipe = None

# ----------------- Utils -----------------
def _resize_max(img: Image.Image, max_side: int = MAX_IMG_SIDE) -> Image.Image:
    w, h = img.size
    if max(w, h) > max_side:
        if w >= h:
            w, h = max_side, int(h * (max_side / w))
        else:
            w, h = int(w * (max_side / h)), max_side
    # Snap to multiples of 8 for SD pipelines, including images that were already small enough
    return img.resize((w // 8 * 8, h // 8 * 8), Image.LANCZOS)
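
# e.g. with MAX_IMG_SIDE=768: a 1000x750 input becomes 768x576, while a 640x480 input stays 640x480
# (already within bounds and a multiple of 8)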

def _seeded_generator(seed) -> Optional[torch.Generator]:
    if seed is None or str(seed).strip() == "":
        return None
    try:
        seed = int(seed)
    except Exception:
        return None
    dev = "cuda" if DEVICE == "cuda" else "cpu"
    return torch.Generator(device=dev).manual_seed(seed)
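
# e.g. _seeded_generator("42") -> a generator seeded with 42 (reproducible output);
#      _seeded_generator("") or _seeded_generator("abc") -> None (pipeline picks a random seed)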

def parse_num_sentences(cmd: str, default: int = 5) -> int:
    m = re.search(r"(\d+)\s*sentences?", (cmd or "").lower())
    if m:
        n = int(m.group(1))
        return max(1, min(n, 20))  # keep sane bounds
    return default
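
# e.g. parse_num_sentences("write a 3 sentence story") -> 3
#      parse_num_sentences("write a story about it")   -> 5 (default)
#      parse_num_sentences("a 99 sentence saga")       -> 20 (clamped)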

# ----------------- Loaders -----------------
def get_caption_bundle():
    global _caption_bundle
    if _caption_bundle:
        return _caption_bundle
    # use_fast=True avoids "slow processor/tokenizer" warnings
    processor = AutoImageProcessor.from_pretrained(CAPTION_MODEL_ID, use_fast=True, token=HF_TOKEN)
    tokenizer = AutoTokenizer.from_pretrained(CAPTION_MODEL_ID, use_fast=True, token=HF_TOKEN)
    model = VisionEncoderDecoderModel.from_pretrained(CAPTION_MODEL_ID, token=HF_TOKEN)

    # GPT-2 decoders have no pad token by default -> set pad=eos; set ids so generate() is happy
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.eos_token_id = tokenizer.eos_token_id
    if getattr(model.config, "decoder_start_token_id", None) is None and tokenizer.bos_token_id is not None:
        model.config.decoder_start_token_id = tokenizer.bos_token_id

    model.to(DEVICE).eval()
    _caption_bundle = {"processor": processor, "tokenizer": tokenizer, "model": model}
    return _caption_bundle

def get_story_pipe():
    global _story_pipe
    if _story_pipe is not None:
        return _story_pipe
    # Load a fast tokenizer explicitly to silence the "slow tokenizer" warning
    story_tok = AutoTokenizer.from_pretrained(STORY_MODEL_ID, use_fast=True, token=HF_TOKEN)
    _story_pipe = hf_pipeline(
        "text2text-generation",
        model=STORY_MODEL_ID,
        tokenizer=story_tok,
        device_map="auto",  # lets HF place layers smartly; still runs on CPU if no GPU
        # No torch_dtype here: it is deprecated in some code paths, so rely on device_map instead
    )
    return _story_pipe

def get_img2img_pipe():
    global _img2img_pipe
    if _img2img_pipe is not None:
        return _img2img_pipe
    pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
        IMG2IMG_MODEL_ID,
        dtype=DTYPE,  # modern arg (replaces the deprecated torch_dtype)
        safety_checker=None,  # pass a checker here if you want NSFW filtering
        requires_safety_checker=False,
        use_safetensors=True,
    )
    pipe = pipe.to(DEVICE)
    try:
        # Optional speed/memory win; silently skipped if xformers is not installed
        pipe.enable_xformers_memory_efficient_attention()
    except Exception:
        pass
    _img2img_pipe = pipe
    return _img2img_pipe

# ----------------- Ops -----------------
@torch.inference_mode()
def op_caption(image: Image.Image, max_new_tokens: int = 32, num_beams: int = 4) -> str:
    bundle = get_caption_bundle()
    proc, tok, mdl = bundle["processor"], bundle["tokenizer"], bundle["model"]
    # The processor handles resizing/normalization, so any input resolution works
    pv = proc(image.convert("RGB"), return_tensors="pt").pixel_values.to(DEVICE)
    out = mdl.generate(
        pixel_values=pv,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
    )
    return tok.decode(out[0], skip_special_tokens=True).strip()
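
# op_caption uses beam search without sampling, so the same image reliably yields the same caption.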

def op_story(
    image: Image.Image,
    num_sentences: int = 5,
    max_new_tokens: int = 220,  # enough headroom
    min_new_tokens: int = 80,   # force >= ~80 tokens to discourage one-line outputs
    temperature: float = 0.9,
    top_p: float = 0.92,
    no_repeat_ngram_size: int = 3,
) -> str:
    # Ground the story in the caption (keeps it on-topic)
    caption = op_caption(image)
    prompt = (
        f"Write exactly {num_sentences} sentences based on this image description. "
        "Use vivid sensory details. No title, no lists, no bullet points, no numbered lines, no dialogue.\n"
        f"Image description: {caption}\n\nStory:"
    )

    story_pipe = get_story_pipe()
    out = story_pipe(
        prompt,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        min_new_tokens=min_new_tokens,  # key to preventing early stops
        max_new_tokens=max_new_tokens,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_return_sequences=1,
    )
    text = out[0]["generated_text"].strip()

    # Final safety belt: clamp to exactly N sentences (punctuation stays with its sentence)
    sents = re.split(r"(?<=[.!?])\s+", text)
    sents = [s.strip() for s in sents if s.strip()]
    if len(sents) >= num_sentences:
        text = " ".join(sents[:num_sentences])
    return text

@torch.inference_mode()
def op_cartoon(image: Image.Image, steps=DEFAULT_STEPS, guidance=DEFAULT_GUIDANCE, strength=DEFAULT_STRENGTH, seed: Optional[int] = None):
    img = _resize_max(image.convert("RGB"))
    gen = _seeded_generator(seed)
    pipe = get_img2img_pipe()
    prompt = "cartoon, cel-shaded, flat colors, bold outlines, clean lineart, anime style, comic book"
    negative = "photorealistic, blurry, noisy, artifacts, distorted, watermark"
    result = pipe(
        prompt=prompt,
        negative_prompt=negative,
        image=img,
        strength=float(strength),
        guidance_scale=float(guidance),
        num_inference_steps=int(steps),
        generator=gen,
    )
    return result.images[0]
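
# Tuning note: img2img runs roughly int(steps * strength) denoising steps (e.g. 30 steps at strength 0.6
# denoises for ~18), so higher strength stylizes more aggressively but preserves less of the input photo.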

# ----------------- Router -----------------
def route_command(command: str) -> str:
    c = (command or "").lower()
    if any(k in c for k in ["cartoon", "sketch", "comic", "anime", "illustration"]):
        return "cartoon"
    if any(k in c for k in ["story", "poem", "narrative", "write"]):
        return "story"
    return "caption"  # default for "describe", "caption", etc.
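
# Routing is keyword-based and checked in order, so "cartoon" wins when both kinds of keyword appear:
#   route_command("make the photo look like a cartoon") -> "cartoon"
#   route_command("write a 5 sentence story")           -> "story"
#   route_command("describe the photo")                 -> "caption"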

def run(image: Image.Image, command: str, steps: int, guidance: float, strength: float, seed: str):
    if image is None:
        raise gr.Error("Upload an image.")
    mode = route_command(command)
    if mode == "cartoon":
        try:
            seed_val = int(seed) if str(seed).strip() else None
        except ValueError:
            seed_val = None  # ignore non-integer seeds instead of crashing
        img = op_cartoon(image, steps=steps, guidance=guidance, strength=strength, seed=seed_val)
        return None, img, f"Mode: cartoon ({steps} steps, guidance {guidance}, strength {strength}, seed {seed_val})"
    elif mode == "story":
        n = parse_num_sentences(command, default=5)
        txt = op_story(image, num_sentences=n)
        return txt, None, f"Mode: story ({n} sentences)"
    else:
        txt = op_caption(image)
        return txt, None, "Mode: caption"

# ----------------- Gradio UI -----------------
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown("# Image Command Router: describe • cartoonize • write a story")
    with gr.Row():
        with gr.Column():
            inp_img = gr.Image(type="pil", label="Image")
            inp_cmd = gr.Textbox(
                label="Command",
                placeholder='e.g., "describe the photo", "make the photo look like a cartoon", "write a 5 sentence story about the image"',
                lines=2,
                value="describe the photo",
            )
            with gr.Accordion("Advanced (cartoon mode)", open=False):
                steps = gr.Slider(1, 75, value=DEFAULT_STEPS, step=1, label="Steps")
                guidance = gr.Slider(0.0, 15.0, value=DEFAULT_GUIDANCE, step=0.1, label="Guidance (CFG)")
                strength = gr.Slider(0.1, 1.0, value=DEFAULT_STRENGTH, step=0.05, label="Strength (how much to change)")
                seed = gr.Textbox(value="", label="Seed (optional int)")
            go = gr.Button("Run", variant="primary")
        with gr.Column():
            out_text = gr.Textbox(label="Text output", lines=10)
            out_image = gr.Image(label="Image output")
            status = gr.Markdown()
    go.click(
        run,
        inputs=[inp_img, inp_cmd, steps, guidance, strength, seed],
        outputs=[out_text, out_image, status],
        scroll_to_output=True,
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)), debug=True)
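
# Local usage (assumed defaults): run `python app.py` and open http://localhost:7860, or set PORT to change the port.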