Spaces:

Hug0endob
/

Image-describer

Runtime error

App Files Community

Hug0endob committed on Dec 14, 2025

Commit

63ffe59

verified ·

1 Parent(s): 049393b

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -26

app.py CHANGED Viewed

@@ -1,18 +1,25 @@
-# app.py – corrected for Gradio 6+
 import gradio as gr
-from PIL import Image
-import requests, urllib.parse, threading, time
 import torch
 from transformers import (
     VisionEncoderDecoderModel,
     ViTImageProcessor,
-    AutoTokenizer,
     T5ForConditionalGeneration,
     T5Tokenizer,
 )
 # -------------------------------------------------
-# Device & models (CPU)
 # -------------------------------------------------
 device = torch.device("cpu")
@@ -26,6 +33,7 @@ vision = VisionEncoderDecoderModel.from_pretrained(IMG_MODEL).to(device).eval()
 rewriter_tok = T5Tokenizer.from_pretrained(TXT_MODEL)
 rewriter = T5ForConditionalGeneration.from_pretrained(TXT_MODEL).to(device).eval()
 # -------------------------------------------------
 # Helpers
 # -------------------------------------------------
@@ -34,7 +42,6 @@ def load_image(url: str):
     try:
         url = url.strip()
         if url.startswith("data:"):
-            import base64
             _, data = url.split(",", 1)
             img = Image.open(BytesIO(base64.b64decode(data))).convert("RGB")
             return img, None
@@ -46,8 +53,9 @@ def load_image(url: str):
     except Exception as e:
         return None, f"Load error: {e}"
 def generate_base(img: Image.Image, max_len=40, beams=2, sample=False):
-    """Return a single “most detailed” base caption."""
     inputs = processor(images=img, return_tensors="pt")
     pix = inputs.pixel_values.to(device)
@@ -71,10 +79,11 @@ def generate_base(img: Image.Image, max_len=40, beams=2, sample=False):
             early_stopping=True,
         )
     caps = [tokenizer.decode(o, skip_special_tokens=True).strip() for o in out]
-    return max(caps, key=lambda s: len(s.split()))  # longest = most detailed
 def expand_caption(base: str, prompt: str = None, max_len=160):
-    """Rich T5 expansion."""
     if prompt and prompt.strip():
         instr = f"Expand using: '{prompt}'. Caption: \"{base}\""
     else:
@@ -97,13 +106,12 @@ def expand_caption(base: str, prompt: str = None, max_len=160):
     )
     return rewriter_tok.decode(out[0], skip_special_tokens=True).strip()
-# -------------------------------------------------
-# Async expansion (background thread)
-# -------------------------------------------------
 def async_expand(base, prompt, max_len, status):
     try:
         status["text"] = "Expanding…"
-        time.sleep(0.1)
         result = expand_caption(base, prompt, max_len)
         status["text"] = "Done"
         return result
@@ -111,6 +119,7 @@ def async_expand(base, prompt, max_len, status):
         status["text"] = f"Error: {e}"
         return base
 # -------------------------------------------------
 # Gradio callbacks
 # -------------------------------------------------
@@ -131,10 +140,12 @@ def fast_describe(url, prompt, detail, beams, sample):
     threading.Thread(target=worker, daemon=True).start()
     return img, base, status["text"]
 def final_caption(url, prompt, detail, beams, sample):
     img, err = load_image(url)
     if err:
         return "", err
     detail_map = {"Low": 80, "Medium": 140, "High": 220}
     max_expand = detail_map.get(detail, 140)
@@ -145,25 +156,20 @@ def final_caption(url, prompt, detail, beams, sample):
     except Exception as e:
         return base, f"Expand error: {e}"
 # -------------------------------------------------
 # UI
 # -------------------------------------------------
 css = "footer {display:none !important;}"
-with gr.Blocks() as demo:                     # no css here
-    gr.Markdown(
-        "## Image Describer"
-    )
     with gr.Row():
         with gr.Column():
             url_in = gr.Textbox(label="Image URL / data‑URL")
             prompt_in = gr.Textbox(label="Optional prompt")
-            detail_in = gr.Radio(
-                ["Low", "Medium", "High"], value="Medium", label="Detail level"
-            )
             beams_in = gr.Slider(1, 4, step=1, value=2, label="Beams")
-            sample_in = gr.Checkbox(
-                label="Enable sampling (more diverse)", value=False
-            )
             go_btn = gr.Button("Load & Describe (fast)")
             final_btn = gr.Button("Get final caption (detailed)")
             status_out = gr.Textbox(label="Status", interactive=False)
@@ -184,14 +190,14 @@ with gr.Blocks() as demo:                     # no css here
     )
 # -------------------------------------------------
-# Launch – css and title are passed here (Gradio 6+)
 # -------------------------------------------------
 if __name__ == "__main__":
-    demo.queue()  # enables background threads without leaking the event loop
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         css=css,
         title="Image Describer (CPU)",
-        prevent_thread_lock=True,   # clean shutdown → no “invalid file descriptor” warnings
     )

+# app.py – Gradio 6+ (CPU)
+import base64
+import threading
+import time
+import urllib.parse
+from io import BytesIO
 import gradio as gr
+import requests
 import torch
+from PIL import Image
 from transformers import (
+    AutoTokenizer,
     VisionEncoderDecoderModel,
     ViTImageProcessor,
     T5ForConditionalGeneration,
     T5Tokenizer,
 )
 # -------------------------------------------------
+# Device & models
 # -------------------------------------------------
 device = torch.device("cpu")
 rewriter_tok = T5Tokenizer.from_pretrained(TXT_MODEL)
 rewriter = T5ForConditionalGeneration.from_pretrained(TXT_MODEL).to(device).eval()
 # -------------------------------------------------
 # Helpers
 # -------------------------------------------------
     try:
         url = url.strip()
         if url.startswith("data:"):
             _, data = url.split(",", 1)
             img = Image.open(BytesIO(base64.b64decode(data))).convert("RGB")
             return img, None
     except Exception as e:
         return None, f"Load error: {e}"
 def generate_base(img: Image.Image, max_len=40, beams=2, sample=False):
+    """Return the longest caption (most detailed) from the vision model."""
     inputs = processor(images=img, return_tensors="pt")
     pix = inputs.pixel_values.to(device)
             early_stopping=True,
         )
     caps = [tokenizer.decode(o, skip_special_tokens=True).strip() for o in out]
+    return max(caps, key=lambda s: len(s.split()))
 def expand_caption(base: str, prompt: str = None, max_len=160):
+    """Use T5 to expand the base caption."""
     if prompt and prompt.strip():
         instr = f"Expand using: '{prompt}'. Caption: \"{base}\""
     else:
     )
     return rewriter_tok.decode(out[0], skip_special_tokens=True).strip()
 def async_expand(base, prompt, max_len, status):
+    """Background expansion; updates status dict."""
     try:
         status["text"] = "Expanding…"
+        time.sleep(0.1)  # tiny yield for UI responsiveness
         result = expand_caption(base, prompt, max_len)
         status["text"] = "Done"
         return result
         status["text"] = f"Error: {e}"
         return base
 # -------------------------------------------------
 # Gradio callbacks
 # -------------------------------------------------
     threading.Thread(target=worker, daemon=True).start()
     return img, base, status["text"]
 def final_caption(url, prompt, detail, beams, sample):
     img, err = load_image(url)
     if err:
         return "", err
     detail_map = {"Low": 80, "Medium": 140, "High": 220}
     max_expand = detail_map.get(detail, 140)
     except Exception as e:
         return base, f"Expand error: {e}"
 # -------------------------------------------------
 # UI
 # -------------------------------------------------
 css = "footer {display:none !important;}"
+with gr.Blocks() as demo:
+    gr.Markdown("## Image Describer")
     with gr.Row():
         with gr.Column():
             url_in = gr.Textbox(label="Image URL / data‑URL")
             prompt_in = gr.Textbox(label="Optional prompt")
+            detail_in = gr.Radio(["Low", "Medium", "High"], value="Medium", label="Detail level")
             beams_in = gr.Slider(1, 4, step=1, value=2, label="Beams")
+            sample_in = gr.Checkbox(label="Enable sampling (more diverse)", value=False)
             go_btn = gr.Button("Load & Describe (fast)")
             final_btn = gr.Button("Get final caption (detailed)")
             status_out = gr.Textbox(label="Status", interactive=False)
     )
 # -------------------------------------------------
+# Launch
 # -------------------------------------------------
 if __name__ == "__main__":
+    demo.queue()
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         css=css,
         title="Image Describer (CPU)",
+        prevent_thread_lock=True,
     )