Hug0endob committed
Commit 36cfd26 · verified · 1 Parent(s): 4462cb3

Update app.py

Files changed (1): app.py (+53, -20)
app.py CHANGED
@@ -10,23 +10,37 @@ from transformers import (
     T5ForConditionalGeneration,
     T5Tokenizer,
 )
-import re
+import urllib.parse
 
 device = torch.device("cpu")
 
-# Image captioning model (nlpconnect/vit-gpt2-image-captioning)
-processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
+# Models
+PROCESSOR_NAME = "nlpconnect/vit-gpt2-image-captioning"
+processor = ViTImageProcessor.from_pretrained(PROCESSOR_NAME)
+tokenizer = AutoTokenizer.from_pretrained(PROCESSOR_NAME)
+model = VisionEncoderDecoderModel.from_pretrained(PROCESSOR_NAME).to(device)
 model.eval()
 
-# Rewriter (T5)
+# Optional rewriter (T5-small) to make captions more natural / respond to a prompt
 rewriter_tokenizer = T5Tokenizer.from_pretrained("t5-small")
 rewriter = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
 rewriter.eval()
 
 def load_image_from_url(url: str, timeout=10):
     try:
+        # Accept either a data URL or a direct http(s) URL
+        url = url.strip()
+        if url.startswith("data:"):
+            # Decode the base64 payload and let PIL open it from memory
+            header, encoded = url.split(",", 1)
+            import base64
+            data = base64.b64decode(encoded)
+            img = Image.open(BytesIO(data)).convert("RGB")
+            return img, None
+        # Reject strings that are not absolute http/https URLs
+        parsed = urllib.parse.urlsplit(url)
+        if parsed.scheme == "":
+            return None, "Invalid URL (missing scheme: http/https)."
         resp = requests.get(url, timeout=timeout, headers={"User-Agent": "huggingface-space/1.0"})
         resp.raise_for_status()
         img = Image.open(BytesIO(resp.content)).convert("RGB")
@@ -34,49 +48,68 @@ def load_image_from_url(url: str, timeout=10):
     except Exception as e:
         return None, f"Error loading image: {e}"
 
-def generate_caption(img: Image.Image, max_len: int = 30):
+def generate_caption(img: Image.Image, prompt: str = None, max_len: int = 30, num_beams: int = 2):
+    # Prepare encoder inputs
     inputs = processor(images=img, return_tensors="pt")
     pixel_values = inputs.pixel_values.to(device)
-    out = model.generate(pixel_values, max_length=max_len, num_beams=2, early_stopping=True)
+
+    # The prompt is not injected into generation itself; vit-gpt2 has no
+    # native prompt conditioning, so the T5 rewriter applies it afterwards.
+    gen_kwargs = {"max_length": max_len, "num_beams": num_beams, "early_stopping": True}
+    if prompt:
+        # forced_decoder_input_ids or prefix decoding could be tried here; the
+        # simpler approach is to generate normally and let the rewriter apply the prompt.
+        pass
+
+    out = model.generate(pixel_values, **gen_kwargs)
     caption = tokenizer.decode(out[0], skip_special_tokens=True).strip()
     return caption
 
-def rewrite_caption(caption: str, max_len: int = 64):
-    input_text = "paraphrase: " + caption
+def rewrite_caption_with_prompt(caption: str, prompt: str = None, max_len: int = 64):
+    # If a prompt is provided, use it to instruct T5; otherwise just paraphrase
+    if prompt:
+        input_text = f"paraphrase: {caption} prompt: {prompt}"
+    else:
+        input_text = "paraphrase: " + caption
     tok = rewriter_tokenizer(input_text, return_tensors="pt", truncation=True).to(device)
     out = rewriter.generate(**tok, max_length=max_len, num_beams=2, early_stopping=True)
     rewritten = rewriter_tokenizer.decode(out[0], skip_special_tokens=True).strip()
     return rewritten
 
-def describe_image(url: str, max_caption_len: int = 30, expand: bool = True):
+def describe_image(url: str, prompt: str, max_caption_len: int = 30, expand: bool = True, beams: int = 2):
     img, err = load_image_from_url(url)
     if err:
         return None, f"Error: {err}"
-    caption = generate_caption(img, max_len=max_caption_len)
+    caption = generate_caption(img, prompt=prompt, max_len=max_caption_len, num_beams=beams)
     if expand:
         try:
-            caption = rewrite_caption(caption, max_len=64)
+            caption = rewrite_caption_with_prompt(caption, prompt=prompt, max_len=64)
         except Exception:
             pass
     if len(caption.split()) < 6:
         caption = f"{caption}. The scene appears to contain: {caption.lower()}."
     return img, caption
 
-# Gradio UI: image left, caption right
-with gr.Blocks() as demo:
-    gr.Markdown("## Image captioning — image on the left, descriptive caption on the right (CPU-optimized, uncensored)")
+css = """
+footer {display: none !important;}
+"""
+
+with gr.Blocks(css=css, title="Image Describer (vit-gpt2, uncensored, promptable)") as demo:
+    gr.Markdown("## Image Describer — uncensored captions, optional prompt to bias description")
     with gr.Row():
         with gr.Column(scale=1):
-            url_in = gr.Textbox(label="Image URL", placeholder="https://example.com/photo.jpg")
-            max_len = gr.Slider(minimum=10, maximum=60, value=30, label="Max caption length")
-            expand_chk = gr.Checkbox(label="Expand/rewite caption (slower, more natural)", value=True)
+            url_in = gr.Textbox(label="Image URL or data URL", placeholder="https://example.com/photo.jpg")
+            prompt_in = gr.Textbox(label="Optional prompt (e.g. 'Describe people and actions')", placeholder="Focus on people, actions, or colors.")
+            max_len = gr.Slider(minimum=8, maximum=60, value=30, label="Max caption length")
+            beams = gr.Slider(minimum=1, maximum=4, value=2, step=1, label="Num beams (higher = better quality, slower)")
+            expand_chk = gr.Checkbox(label="Rewrite/Paraphrase with prompt (slower)", value=True)
             go = gr.Button("Load & Describe")
         with gr.Column(scale=1):
             img_out = gr.Image(type="pil", label="Image")
         with gr.Column(scale=1):
            caption_out = gr.Textbox(label="Descriptive caption", lines=6)
 
-    go.click(fn=describe_image, inputs=[url_in, max_len, expand_chk], outputs=[img_out, caption_out])
+    go.click(fn=describe_image, inputs=[url_in, prompt_in, max_len, expand_chk, beams], outputs=[img_out, caption_out])
 
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)
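A note on the prompt handling: the new generate_caption deliberately leaves the if prompt: branch as a no-op, so the prompt only takes effect through the T5 rewriter. If prefix decoding were wanted instead, one possible sketch (not what this commit does; generate_caption_prefixed is a hypothetical name) would seed the GPT-2 decoder with the tokenized prompt via decoder_input_ids:

def generate_caption_prefixed(img, prompt, max_len=30, num_beams=2):
    # Hypothetical variant; this commit instead defers the prompt to the T5 rewriter.
    pixel_values = processor(images=img, return_tensors="pt").pixel_values.to(device)
    gen_kwargs = {"max_length": max_len, "num_beams": num_beams, "early_stopping": True}
    if prompt:
        # Seed the decoder with the prompt tokens; GPT-2 continues from them,
        # so the prompt text appears verbatim at the start of the caption.
        prefix_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        out = model.generate(pixel_values, decoder_input_ids=prefix_ids, **gen_kwargs)
    else:
        out = model.generate(pixel_values, **gen_kwargs)
    return tokenizer.decode(out[0], skip_special_tokens=True).strip()

One caveat: max_length counts the prefix tokens, so a long prompt leaves little room for the continuation.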
 
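Since load_image_from_url now accepts data: URLs, the new branch can be smoke-tested locally. This sketch assumes the Space's file is importable as app (an assumption about the module name; importing it loads both models, so the first run is slow):

import base64
from io import BytesIO
from PIL import Image
from app import load_image_from_url  # assumed module name for this Space

# Build a tiny in-memory PNG and wrap it in a base64 data URL
buf = BytesIO()
Image.new("RGB", (8, 8), "red").save(buf, format="PNG")
data_url = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode("ascii")

img, err = load_image_from_url(data_url)
assert err is None and img.size == (8, 8)
print("data-URL branch OK:", img.mode, img.size)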
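The updated pipeline can also be exercised end to end without the Gradio UI. The URL below is a placeholder, and the prompt only has an effect when expand=True, because only the rewriter sees it:

from app import describe_image  # assumed module name for this Space

img, caption = describe_image(
    "https://example.com/photo.jpg",  # placeholder URL
    prompt="Focus on people, actions, or colors.",
    max_caption_len=30,
    expand=True,  # required for the prompt to influence the caption
    beams=2,
)
print(caption)  # T5 receives: "paraphrase: <caption> prompt: <prompt>"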