achase25 committed on
Commit e6ef52a · verified · 1 Parent(s): 9780798

Update app.py

Files changed (1)
  1. app.py +43 -59
app.py CHANGED
@@ -1,14 +1,22 @@
-# app.py — Image+Command router: "describe photo" (caption), "write a story" (text), "make it a cartoon" (img2img)
-# Deps:
+# app.py — Multimodal router: one image input + freeform command -> text OR image output
+# Commands (examples):
+#   "describe the photo"                 -> text caption
+#   "write a story about the image"      -> text story
+#   "make the photo look like a cartoon" -> image stylization
+#
+# Dependencies / requirements.txt:
 # pip install -q gradio transformers diffusers accelerate torch safetensors pillow
+
 import os
 import re
-from typing import Optional
+import random
+from typing import Optional, Tuple

 import torch
 import gradio as gr
 from PIL import Image

+# ---- Transformers: caption + story ----
 from transformers import (
     VisionEncoderDecoderModel,
     AutoImageProcessor,
@@ -16,30 +24,30 @@ from transformers import (
     pipeline as hf_pipeline,
 )

+# ---- Diffusers: image-to-image stylization ----
 from diffusers import StableDiffusionImg2ImgPipeline

-# ----------------- Config -----------------
+# ------------- Config -------------
 CAPTION_MODEL_ID = os.getenv("CAPTION_MODEL_ID", "nlpconnect/vit-gpt2-image-captioning")
-# For longer/better stories you can set: google/flan-t5-xl (needs ~10–12GB VRAM) or google/flan-ul2 (heavy)
-STORY_MODEL_ID = os.getenv("STORY_MODEL_ID", "google/flan-t5-large")
+STORY_MODEL_ID = os.getenv("STORY_MODEL_ID", "google/flan-t5-large")  # light-ish; ok stories
 IMG2IMG_MODEL_ID = os.getenv("IMG2IMG_MODEL_ID", "stabilityai/stable-diffusion-2-1")

-MAX_IMG_SIDE = int(os.getenv("MAX_IMG_SIDE", "768"))
+MAX_IMG_SIDE = int(os.getenv("MAX_IMG_SIDE", "768"))  # clamp big uploads to save VRAM
 DEFAULT_STEPS = int(os.getenv("STEPS", "30"))
 DEFAULT_GUIDANCE = float(os.getenv("GUIDANCE", "7.5"))
-DEFAULT_STRENGTH = float(os.getenv("STRENGTH", "0.6"))
+DEFAULT_STRENGTH = float(os.getenv("STRENGTH", "0.6"))  # 0..1 (higher = more stylized, less like original)

 DEVICE = "cuda" if torch.cuda.is_available() else ("mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu")
-DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
+DTYPE = torch.float16 if (DEVICE == "cuda") else torch.float32

 HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") or None

-# ----------------- Caches -----------------
+# ------------- Caches -------------
 _caption_bundle = {}
 _story_pipe = None
 _img2img_pipe = None

-# ----------------- Utils -----------------
+# ------------- Utils -------------
 def _resize_max(img: Image.Image, max_side: int = MAX_IMG_SIDE) -> Image.Image:
     w, h = img.size
     if max(w, h) <= max_side:
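For reference, here is how the DEVICE/DTYPE selection above resolves when run on its own; a minimal standalone sketch (no app state assumed):

```python
import torch

# Mirrors the app's selection: prefer CUDA, then Apple MPS, else CPU.
# fp16 is only used on CUDA; MPS and CPU stay in fp32.
device = "cuda" if torch.cuda.is_available() else (
    "mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu"
)
dtype = torch.float16 if device == "cuda" else torch.float32
print(device, dtype)  # e.g. "cuda torch.float16" on a GPU box
```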
@@ -50,8 +58,7 @@ def _resize_max(img: Image.Image, max_side: int = MAX_IMG_SIDE) -> Image.Image:
     else:
         new_h = max_side
         new_w = int(w * (max_side / h))
-    # Snap to multiples of 8 for SD pipelines
-    return img.resize((new_w // 8 * 8, new_h // 8 * 8), Image.LANCZOS)
+    return img.resize((new_w // 8 * 8, new_h // 8 * 8), Image.LANCZOS)  # multiples of 8 for SD

 def _seeded_generator(seed: Optional[int]):
     if seed is None or str(seed).strip() == "":
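The resize helper clamps the long side to MAX_IMG_SIDE, then snaps both dimensions down to multiples of 8, which Stable Diffusion's VAE expects. A standalone sketch of the same logic (the early-return branch and the 1000×700 input are assumptions for illustration):

```python
from PIL import Image

def resize_max(img: Image.Image, max_side: int = 768) -> Image.Image:
    w, h = img.size
    if max(w, h) <= max_side:
        return img  # small images pass through untouched (assumed branch)
    if w >= h:
        new_w, new_h = max_side, int(h * (max_side / w))
    else:
        new_h, new_w = max_side, int(w * (max_side / h))
    return img.resize((new_w // 8 * 8, new_h // 8 * 8), Image.LANCZOS)

# 1000x700 -> long side clamped to 768, short side int(700*0.768)=537, snapped to 536
print(resize_max(Image.new("RGB", (1000, 700))).size)  # (768, 536)
```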
@@ -63,35 +70,21 @@ def _seeded_generator(seed: Optional[int]):
     dev = "cuda" if DEVICE == "cuda" else "cpu"
     return torch.Generator(device=dev).manual_seed(seed)

-def parse_num_sentences(cmd: str, default: int = 5) -> int:
-    m = re.search(r"(\d+)\s*(?:sentences?|sentence)", (cmd or "").lower())
-    if m:
-        try:
-            n = int(m.group(1))
-            return max(1, min(n, 20))  # keep sane bounds
-        except Exception:
-            pass
-    return default
-
-# ----------------- Loaders -----------------
+# ------------- Loaders -------------
 def get_caption_bundle():
     global _caption_bundle
     if _caption_bundle:
         return _caption_bundle
-    # use_fast=True avoids “slow processor/tokenizer” warnings
     processor = AutoImageProcessor.from_pretrained(CAPTION_MODEL_ID, token=HF_TOKEN)
     tokenizer = AutoTokenizer.from_pretrained(CAPTION_MODEL_ID, use_fast=True, token=HF_TOKEN)
     model = VisionEncoderDecoderModel.from_pretrained(CAPTION_MODEL_ID, token=HF_TOKEN)
-
-    # GPT-2 decoders have no pad by default -> set pad=eos; set ids so generate() is happy
+    # GPT2 has no pad by default -> set pad=eos to avoid mask issues
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.padding_side = "right"
     model.config.pad_token_id = tokenizer.pad_token_id
     model.config.eos_token_id = tokenizer.eos_token_id
     if getattr(model.config, "decoder_start_token_id", None) is None and tokenizer.bos_token_id is not None:
         model.config.decoder_start_token_id = tokenizer.bos_token_id
-
     model.to(DEVICE).eval()
     _caption_bundle = {"processor": processor, "tokenizer": tokenizer, "model": model}
     return _caption_bundle
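For reference, the parse_num_sentences helper deleted above, reproduced so it runs standalone; with it gone, the story branch of run() (last hunk below) always falls back to op_story's default of five sentences:

```python
import re

def parse_num_sentences(cmd: str, default: int = 5) -> int:
    # Behavior of the helper removed in this commit.
    m = re.search(r"(\d+)\s*(?:sentences?|sentence)", (cmd or "").lower())
    if m:
        try:
            return max(1, min(int(m.group(1)), 20))  # clamp to 1..20
        except Exception:
            pass
    return default

print(parse_num_sentences("write a 3 sentence story"))  # -> 3
print(parse_num_sentences("write a story"))             # -> 5 (default)
```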
@@ -100,15 +93,8 @@ def get_story_pipe():
     global _story_pipe
     if _story_pipe is not None:
         return _story_pipe
-    # Load a fast tokenizer explicitly to kill “slow” warning
-    story_tok = AutoTokenizer.from_pretrained(STORY_MODEL_ID, use_fast=True, token=HF_TOKEN)
-    _story_pipe = hf_pipeline(
-        "text2text-generation",
-        model=STORY_MODEL_ID,
-        tokenizer=story_tok,
-        device_map="auto",  # lets HF place layers smartly; will still run CPU if no GPU
-        # Do NOT pass torch_dtype here (deprecated in some paths). We'll rely on device_map.
-    )
+    # Flan-T5 works with text2text-generation
+    _story_pipe = hf_pipeline("text2text-generation", model=STORY_MODEL_ID, device_map="auto", model_kwargs={"torch_dtype": DTYPE})
     return _story_pipe

 def get_img2img_pipe():
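A minimal smoke test for a story pipeline configured the same way. The prompt is a hypothetical stand-in, and passing torch_dtype through model_kwargs may emit a deprecation warning on recent transformers releases:

```python
import torch
from transformers import pipeline

# flan-t5-large is an encoder-decoder model, hence the text2text-generation task.
pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    device_map="auto",  # needs accelerate; falls back to CPU without a GPU
    model_kwargs={"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32},
)
out = pipe("Write exactly two sentences about a lighthouse.", max_new_tokens=64, do_sample=True)
print(out[0]["generated_text"])
```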
@@ -117,8 +103,8 @@ def get_img2img_pipe():
         return _img2img_pipe
     pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
         IMG2IMG_MODEL_ID,
-        dtype=DTYPE,  # <-- modern arg (fixes torch_dtype deprecation)
-        safety_checker=None,  # flip to enable if you want
+        torch_dtype=DTYPE,
+        safety_checker=None,  # flip to enable safety if you prefer
         requires_safety_checker=False,
         use_safetensors=True,
     )
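Most of op_cartoon's body sits outside the shown hunks; the sketch below shows how a pipeline loaded this way is typically invoked. The prompt, file names, and parameter values are hypothetical, not taken from the file:

```python
import torch
from PIL import Image
from diffusers import StableDiffusionImg2ImgPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    safety_checker=None,
    requires_safety_checker=False,
    use_safetensors=True,
).to(device)

src = Image.open("photo.jpg").convert("RGB")  # hypothetical input file
result = pipe(
    prompt="cartoon style, bold outlines, flat colors",  # hypothetical prompt
    image=src,
    strength=0.6,             # 0..1; higher drifts further from the source
    guidance_scale=7.5,
    num_inference_steps=30,
)
result.images[0].save("cartoon.png")
```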
@@ -130,12 +116,11 @@ def get_img2img_pipe():
     _img2img_pipe = pipe
     return _img2img_pipe

-# ----------------- Ops -----------------
+# ------------- Ops -------------
 @torch.inference_mode()
 def op_caption(image: Image.Image, max_new_tokens: int = 32, num_beams: int = 4) -> str:
     bundle = get_caption_bundle()
     proc, tok, mdl = bundle["processor"], bundle["tokenizer"], bundle["model"]
-    # Let processor handle size; accepts any input resolution
     pv = proc(image.convert("RGB"), return_tensors="pt").pixel_values.to(DEVICE)
     out = mdl.generate(
         pixel_values=pv,
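The caption path, condensed into a self-contained sketch that mirrors get_caption_bundle plus op_caption; the blank test image is a stand-in for a real upload:

```python
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer

model_id = "nlpconnect/vit-gpt2-image-captioning"
proc = AutoImageProcessor.from_pretrained(model_id)
tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
mdl = VisionEncoderDecoderModel.from_pretrained(model_id).eval()
if tok.pad_token is None:  # GPT-2 decoder: reuse EOS as PAD
    tok.pad_token = tok.eos_token
mdl.config.pad_token_id = tok.pad_token_id

img = Image.new("RGB", (384, 384), "white")  # stand-in; use a real photo
pv = proc(img, return_tensors="pt").pixel_values
with torch.inference_mode():
    ids = mdl.generate(pixel_values=pv, max_new_tokens=32, num_beams=4)
print(tok.batch_decode(ids, skip_special_tokens=True)[0].strip())
```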
@@ -149,14 +134,15 @@ def op_caption(image: Image.Image, max_new_tokens: int = 32, num_beams: int = 4) -> str:
 def op_story(
     image: Image.Image,
     num_sentences: int = 5,
-    max_new_tokens: int = 220,  # enough headroom
-    min_new_tokens: int = 80,   # force >= ~80 tokens to discourage 1-line outputs
+    max_new_tokens: int = 220,  # allow enough room
+    min_new_tokens: int = 80,   # force >= ~80 tokens (~5 sentences)
     temperature: float = 0.9,
     top_p: float = 0.92,
     no_repeat_ngram_size: int = 3,
 ) -> str:
-    # Ground with the caption (keeps story on-topic)
+    # Ground the story with a caption of the image
     caption = op_caption(image)
+
     prompt = (
         f"Write exactly {num_sentences} sentences based on this image description. "
         "Use vivid sensory details. No title, no lists, no bullet points, no numbered lines, no dialogue.\n"
@@ -169,20 +155,23 @@ def op_story(
         do_sample=True,
         temperature=temperature,
         top_p=top_p,
-        min_new_tokens=min_new_tokens,  # key to prevent early stop
+        min_new_tokens=min_new_tokens,  # <- prevents early stop
         max_new_tokens=max_new_tokens,
         no_repeat_ngram_size=no_repeat_ngram_size,
         num_return_sequences=1,
     )
     text = out[0]["generated_text"].strip()

-    # Final safety belt: clamp to exactly N sentences
+    # Safety belt: hard-trim to exactly N sentences
+    import re
     sents = re.split(r'(?<=[.!?])\s+', text)
     sents = [s.strip() for s in sents if s.strip()]
     if len(sents) >= num_sentences:
         text = " ".join(sents[:num_sentences])
     return text

+
+
 @torch.inference_mode()
 def op_cartoon(image: Image.Image, steps=DEFAULT_STEPS, guidance=DEFAULT_GUIDANCE, strength=DEFAULT_STRENGTH, seed: Optional[int]=None):
     img = _resize_max(image.convert("RGB"))
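The sentence "safety belt" splits on terminal punctuation and keeps the first num_sentences pieces; a quick standalone check (sample text invented):

```python
import re

text = "The sun rose. Gulls cried! Waves hissed? A dog barked. The tide turned. One too many."
sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
print(" ".join(sents[:5]))  # keeps the first five sentences, drops the sixth
```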
@@ -201,15 +190,17 @@ def op_cartoon(image: Image.Image, steps=DEFAULT_STEPS, guidance=DEFAULT_GUIDANCE, strength=DEFAULT_STRENGTH, seed: Optional[int]=None):
     )
     return result.images[0]

-# ----------------- Router -----------------
+# ------------- Router -------------
 def route_command(command: str) -> str:
     c = (command or "").lower()
     if any(k in c for k in ["cartoon", "sketch", "comic", "anime", "illustration"]):
         return "cartoon"
     if any(k in c for k in ["story", "poem", "narrative", "write"]):
         return "story"
-    return "caption"  # default / "describe", "caption", etc.
+    # default / describe / caption / explain
+    return "caption"

+# ------------- Gradio App -------------
 def run(image: Image.Image, command: str, steps: int, guidance: float, strength: float, seed: str):
     if image is None:
         raise gr.Error("Upload an image.")
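Routing is keyword-based, and the cartoon keywords are checked before the story keywords, so mixed commands lean cartoon. Reproduced standalone to show the precedence:

```python
def route_command(command: str) -> str:
    c = (command or "").lower()
    if any(k in c for k in ["cartoon", "sketch", "comic", "anime", "illustration"]):
        return "cartoon"
    if any(k in c for k in ["story", "poem", "narrative", "write"]):
        return "story"
    return "caption"

print(route_command("make it a cartoon"))    # cartoon
print(route_command("write a comic story"))  # cartoon ("comic" matches first)
print(route_command("describe the photo"))   # caption (default)
```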
@@ -218,25 +209,18 @@ def run(image: Image.Image, command: str, steps: int, guidance: float, strength: float, seed: str):
         img = op_cartoon(image, steps=steps, guidance=guidance, strength=strength, seed=int(seed) if seed else None)
         return None, img, f"Mode: cartoon ({steps} steps, guidance {guidance}, strength {strength}, seed {seed or 'None'})"
     elif mode == "story":
-        n = parse_num_sentences(command, default=5)
-        txt = op_story(image, num_sentences=n)
-        return txt, None, f"Mode: story ({n} sentences)"
+        txt = op_story(image)
+        return txt, None, "Mode: story"
     else:
         txt = op_caption(image)
         return txt, None, "Mode: caption"

-# ----------------- Gradio UI -----------------
 with gr.Blocks(css="footer {visibility:hidden}") as demo:
     gr.Markdown("# Image Command Router — describe • cartoonize • write a story")
     with gr.Row():
         with gr.Column():
             inp_img = gr.Image(type="pil", label="Image")
-            inp_cmd = gr.Textbox(
-                label="Command",
-                placeholder='e.g., "describe the photo", "make the photo look like a cartoon", "write a 5 sentence story about the image"',
-                lines=2,
-                value="describe the photo"
-            )
+            inp_cmd = gr.Textbox(label="Command", placeholder='e.g., "describe the photo", "make the photo look like a cartoon", "write a story about the image"', lines=2, value="describe the photo")
             with gr.Accordion("Advanced (cartoon mode)", open=False):
                 steps = gr.Slider(1, 75, value=DEFAULT_STEPS, step=1, label="Steps")
                 guidance = gr.Slider(0.0, 15.0, value=DEFAULT_GUIDANCE, step=0.1, label="Guidance (CFG)")
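The diff view cuts off inside the UI builder, so the rest of app.py (output widgets, button wiring, launch) is not shown here. For orientation only, a self-contained toy app with the same three-output shape; every widget name and the stub run() below are assumptions, not taken from this commit:

```python
import gradio as gr

def run(image, command, steps, guidance, strength, seed):
    # Stand-in for app.py's run(); returns (text, image, status) like the real one.
    return f"echo: {command}", image, "Mode: demo"

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            inp_img = gr.Image(type="pil", label="Image")
            inp_cmd = gr.Textbox(label="Command", value="describe the photo")
            steps = gr.Slider(1, 75, value=30, step=1, label="Steps")
            guidance = gr.Slider(0.0, 15.0, value=7.5, step=0.1, label="Guidance (CFG)")
            strength = gr.Slider(0.1, 0.95, value=0.6, step=0.05, label="Strength")
            seed = gr.Textbox(label="Seed (optional)")
            btn = gr.Button("Run")
        with gr.Column():
            out_text = gr.Textbox(label="Text output")
            out_img = gr.Image(label="Image output")
            status = gr.Markdown()
    btn.click(run, inputs=[inp_img, inp_cmd, steps, guidance, strength, seed],
              outputs=[out_text, out_img, status])

if __name__ == "__main__":
    demo.launch()
```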
 