Update app.py
app.py
CHANGED
@@ -1,22 +1,22 @@
-#
-#
-#
-# "write a story about the image" -> text story
-# "make the photo look like a cartoon" -> image stylization
+# CPU-only Hugging Face Space: Image -> (Caption OR Story)
+# - Caption: Salesforce BLIP or ViT-GPT2 (set via env or leave defaults)
+# - Story: text2text generation using a lightweight T5-family model
 #
-#
-#
+# Env (optional):
+#   CAPTION_MODEL_ID = "Salesforce/blip-image-captioning-large"  (heavier, better)
+#                      or "nlpconnect/vit-gpt2-image-captioning" (lighter, faster on CPU)
+#   STORY_MODEL_ID = "google/flan-t5-large"  (default, decent on CPU)
+#   HUGGINGFACE_HUB_TOKEN / HF_TOKEN  (if models require auth)
+#
+# Requirements are in requirements.txt.

 import os
-import
-import random
-from typing import Optional, Tuple
+from typing import Optional

 import torch
 import gradio as gr
 from PIL import Image

-# ---- Transformers: caption + story ----
 from transformers import (
     VisionEncoderDecoderModel,
     AutoImageProcessor,
@@ -24,38 +24,22 @@ from transformers import (
     pipeline as hf_pipeline,
 )

-# ----
-
-
-
-CAPTION_MODEL_ID = os.getenv("CAPTION_MODEL_ID", "nlpconnect/vit-gpt2-image-captioning")
-STORY_MODEL_ID = os.getenv("STORY_MODEL_ID", "google/flan-t5-large")  # light-ish; ok stories
-IMG2IMG_MODEL_ID = os.getenv("IMG2IMG_MODEL_ID", "stabilityai/stable-diffusion-2-1")
-
-MAX_IMG_SIDE = int(os.getenv("MAX_IMG_SIDE", "768"))  # clamp big uploads to save VRAM
-DEFAULT_STEPS = int(os.getenv("STEPS", "30"))
-DEFAULT_GUIDANCE = float(os.getenv("GUIDANCE", "7.5"))
-DEFAULT_STRENGTH = float(os.getenv("STRENGTH", "0.6"))  # 0..1 (higher = more stylized, less like original)
+# -------------------- Config --------------------
+CAPTION_MODEL_ID = os.getenv("CAPTION_MODEL_ID", "Salesforce/blip-image-captioning-large")
+STORY_MODEL_ID = os.getenv("STORY_MODEL_ID", "google/flan-t5-large")
+HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") or None

-
-
+# CPU only (works on Spaces without GPU)
+DEVICE = "cpu"
+DTYPE = torch.float32

-
+MAX_IMG_SIDE = int(os.getenv("MAX_IMG_SIDE", "768"))  # clamp inputs to keep it snappy

-# ----
-CARTOON_AVAILABLE = torch.cuda.is_available()  # SD img2img is GPU-only on Spaces (CPU will timeout)
-
-# CPU-friendly fallbacks (keep things snappy on Spaces CPU)
-if not CARTOON_AVAILABLE:
-    DEFAULT_STEPS = min(DEFAULT_STEPS, 20)
-    DEFAULT_GUIDANCE = min(DEFAULT_GUIDANCE, 7.5)
-
-# ------------- Caches -------------
+# -------------------- Caches --------------------
 _caption_bundle = {}
 _story_pipe = None
-_img2img_pipe = None

-# ------------- Utils -------------
+# -------------------- Utils --------------------
 def _resize_max(img: Image.Image, max_side: int = MAX_IMG_SIDE) -> Image.Image:
     w, h = img.size
     if max(w, h) <= max_side:
@@ -66,72 +50,56 @@ def _resize_max(img: Image.Image, max_side: int = MAX_IMG_SIDE) -> Image.Image:
     else:
         new_h = max_side
         new_w = int(w * (max_side / h))
-    return img.resize((new_w
-
-def _seeded_generator(seed: Optional[int]):
-    if seed is None or str(seed).strip() == "":
-        return None
-    try:
-        seed = int(seed)
-    except Exception:
-        return None
-    dev = "cuda" if DEVICE == "cuda" else "cpu"
-    return torch.Generator(device=dev).manual_seed(seed)
+    return img.resize((new_w, new_h), Image.LANCZOS)

-# ------------- Loaders -------------
+# -------------------- Loaders --------------------
 def get_caption_bundle():
+    """Load a vision->text captioning model (BLIP or ViT-GPT2 family) with sane tokenizer settings."""
     global _caption_bundle
     if _caption_bundle:
         return _caption_bundle
+
     processor = AutoImageProcessor.from_pretrained(CAPTION_MODEL_ID, token=HF_TOKEN)
+    # Use fast tokenizer when available to silence 'use_fast' warnings
     tokenizer = AutoTokenizer.from_pretrained(CAPTION_MODEL_ID, use_fast=True, token=HF_TOKEN)
     model = VisionEncoderDecoderModel.from_pretrained(CAPTION_MODEL_ID, token=HF_TOKEN)
-
+
+    # GPT2 lacks pad by default; set to eos and mirror in config to avoid attention_mask warnings
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
     model.config.pad_token_id = tokenizer.pad_token_id
     model.config.eos_token_id = tokenizer.eos_token_id
     if getattr(model.config, "decoder_start_token_id", None) is None and tokenizer.bos_token_id is not None:
         model.config.decoder_start_token_id = tokenizer.bos_token_id
+
     model.to(DEVICE).eval()
+
     _caption_bundle = {"processor": processor, "tokenizer": tokenizer, "model": model}
     return _caption_bundle

 def get_story_pipe():
+    """Lightweight text2text pipeline for story generation."""
     global _story_pipe
     if _story_pipe is not None:
         return _story_pipe
-
-
-
-
-
-    global _img2img_pipe
-    if _img2img_pipe is not None:
-        return _img2img_pipe
-    pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-        IMG2IMG_MODEL_ID,
-        torch_dtype=DTYPE,
-        safety_checker=None,  # flip to enable safety if you prefer
-        requires_safety_checker=False,
-        use_safetensors=True,
+    _story_pipe = hf_pipeline(
+        "text2text-generation",
+        model=STORY_MODEL_ID,
+        device=-1,  # CPU
+        model_kwargs={"torch_dtype": DTYPE},
     )
-
-    try:
-        pipe.enable_xformers_memory_efficient_attention()
-    except Exception:
-        pass
-    _img2img_pipe = pipe
-    return _img2img_pipe
+    return _story_pipe

-# ------------- Ops -------------
+# -------------------- Ops --------------------
 @torch.inference_mode()
 def op_caption(image: Image.Image, max_new_tokens: int = 32, num_beams: int = 4) -> str:
-
-    proc, tok, mdl =
-
+    b = get_caption_bundle()
+    proc, tok, mdl = b["processor"], b["tokenizer"], b["model"]
+    image = _resize_max(image.convert("RGB"))
+    pixel_values = proc(image, return_tensors="pt").pixel_values.to(DEVICE)
+
     out = mdl.generate(
-        pixel_values=
+        pixel_values=pixel_values,
         max_new_tokens=max_new_tokens,
         num_beams=num_beams,
         pad_token_id=tok.pad_token_id,
@@ -142,13 +110,13 @@ def op_caption(image: Image.Image, max_new_tokens: int = 32, num_beams: int = 4)
 def op_story(
     image: Image.Image,
     num_sentences: int = 5,
-    max_new_tokens: int = 220,
-    min_new_tokens: int = 80,
+    max_new_tokens: int = 220,
+    min_new_tokens: int = 80,
     temperature: float = 0.9,
     top_p: float = 0.92,
     no_repeat_ngram_size: int = 3,
 ) -> str:
-    # Ground
+    # Ground with a caption first
     caption = op_caption(image)

     prompt = (
@@ -157,20 +125,20 @@ def op_story(
         f"Image description: {caption}\n\nStory:"
     )

-
-    out =
+    pipe = get_story_pipe()
+    out = pipe(
         prompt,
         do_sample=True,
         temperature=temperature,
         top_p=top_p,
-        min_new_tokens=min_new_tokens,
+        min_new_tokens=min_new_tokens,  # prevents early stop at 1 sentence
         max_new_tokens=max_new_tokens,
         no_repeat_ngram_size=no_repeat_ngram_size,
         num_return_sequences=1,
     )
     text = out[0]["generated_text"].strip()

-    #
+    # Trim to exactly N sentences (safety belt)
     import re
     sents = re.split(r'(?<=[.!?])\s+', text)
     sents = [s.strip() for s in sents if s.strip()]
@@ -178,80 +146,34 @@
         text = " ".join(sents[:num_sentences])
     return text

-
-
-@torch.inference_mode()
-def op_cartoon(image: Image.Image, steps=DEFAULT_STEPS, guidance=DEFAULT_GUIDANCE, strength=DEFAULT_STRENGTH, seed: Optional[int]=None):
-    img = _resize_max(image.convert("RGB"))
-    gen = _seeded_generator(seed)
-    pipe = get_img2img_pipe()
-    prompt = "cartoon, cel-shaded, flat colors, bold outlines, clean lineart, anime style, comic book"
-    negative = "photorealistic, blurry, noisy, artifacts, distorted, watermark"
-    result = pipe(
-        prompt=prompt,
-        negative_prompt=negative,
-        image=img,
-        strength=float(strength),
-        guidance_scale=float(guidance),
-        num_inference_steps=int(steps),
-        generator=gen,
-    )
-    return result.images[0]
-
-# ------------- Router -------------
-def route_command(command: str) -> str:
-    c = (command or "").lower()
-    if any(k in c for k in ["cartoon", "sketch", "comic", "anime", "illustration"]):
-        return "cartoon"
-    if any(k in c for k in ["story", "poem", "narrative", "write"]):
-        return "story"
-    # default / describe / caption / explain
-    return "caption"
-
-# ------------- Gradio App -------------
-def run(image: Image.Image, command: str, steps: int, guidance: float, strength: float, seed: str):
+# -------------------- Gradio UI --------------------
+def run(image: Image.Image, mode: str):
     if image is None:
-        raise gr.Error("Upload an image.")
-    mode =
-
-    if mode == "cartoon":
-        if not CARTOON_AVAILABLE:
-            raise gr.Error("Cartoon mode requires a GPU and is disabled on this Space’s hardware.")
-        img = op_cartoon(
-            image,
-            steps=steps,
-            guidance=guidance,
-            strength=strength,
-            seed=int(seed) if seed else None,
-        )
-        return None, img, f"Mode: cartoon ({steps} steps, guidance {guidance}, strength {strength}, seed {seed or 'None'})"
-    elif mode == "story":
+        raise gr.Error("Upload an image first.")
+    mode = (mode or "Caption").lower()
+    if mode == "story":
         txt = op_story(image)
         return txt, None, "Mode: story"
     else:
         txt = op_caption(image)
         return txt, None, "Mode: caption"

-
 with gr.Blocks(css="footer {visibility:hidden}") as demo:
-    gr.Markdown("# Image
+    gr.Markdown("# Image → Caption or Story (CPU-only)")
     with gr.Row():
         with gr.Column():
             inp_img = gr.Image(type="pil", label="Image")
-
-
-
-
-
-            seed = gr.Textbox(value="", label="Seed (optional int)")
+            mode = gr.Radio(
+                choices=["Caption", "Story"],
+                value="Caption",
+                label="Task",
+            )
             go = gr.Button("Run", variant="primary")
         with gr.Column():
             out_text = gr.Textbox(label="Text output", lines=10)
-            out_image = gr.Image(label="
+            out_image = gr.Image(label="(unused for CPU app)", visible=False)
             status = gr.Markdown()
-    go.click(run, inputs=[inp_img,
+    go.click(run, inputs=[inp_img, mode], outputs=[out_text, out_image, status], scroll_to_output=True)

 if __name__ == "__main__":
-    # queue() helps Spaces handle concurrent requests + long inference safely
     demo.queue(max_size=8).launch()
-
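
Note on dependencies: the new header comment points to requirements.txt, which is not part of this diff. Based only on the imports visible in the new app.py, a minimal CPU-only dependency set would plausibly look like the sketch below; the exact package list and any version pins are assumptions, not taken from the repository.

# requirements.txt (illustrative sketch, not from this commit)
torch
transformers
gradio
Pillow
sentencepiece  # assumption: used by the flan-t5 tokenizer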
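Because op_caption and op_story only need a PIL image, the new module can also be exercised without the Gradio UI. A minimal local smoke test, assuming the file above is saved as app.py and a sample.jpg exists next to it (both assumptions; the first run will download the model weights):

# smoke_test.py (hypothetical helper, not part of the commit)
from PIL import Image

import app  # importing app.py builds the Blocks UI but does not launch it (guarded by __main__)

img = Image.open("sample.jpg")                          # assumed local test image
print("caption:", app.op_caption(img))                  # one beam-searched caption
print("story:", app.op_story(img, num_sentences=3))     # short story grounded on the caption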