Update app.py
Browse files
app.py
CHANGED
|
@@ -186,4 +186,122 @@ def op_story_vlm(
|
|
| 186 |
if not bun:
|
| 187 |
return None
|
| 188 |
|
| 189 |
-
image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
if not bun:
|
| 187 |
return None
|
| 188 |
|
| 189 |
+
image = _resize_max(image.convert("RGB"))
|
| 190 |
+
prompt = (
|
| 191 |
+
f"Write exactly {num_sentences} sentences that tell a vivid, sensory story about this image. "
|
| 192 |
+
"Do not include a title or bullet points. No dialogue.\n\nStory:"
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
if bun["type"] == "phi35":
|
| 196 |
+
processor = bun["processor"]
|
| 197 |
+
model = bun["model"]
|
| 198 |
+
|
| 199 |
+
# Phi-3.5-vision expects a chat-style input with images
|
| 200 |
+
messages = [
|
| 201 |
+
{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt}]}
|
| 202 |
+
]
|
| 203 |
+
inputs = processor.apply_chat_template(
|
| 204 |
+
messages, add_generation_prompt=True, return_tensors="pt"
|
| 205 |
+
)
|
| 206 |
+
|
| 207 |
+
# Some processor versions want pixel values separately:
|
| 208 |
+
proc_out = processor(images=image, return_tensors="pt")
|
| 209 |
+
input_ids = inputs.to(DEVICE)
|
| 210 |
+
pixel_values = proc_out.get("pixel_values")
|
| 211 |
+
if pixel_values is not None:
|
| 212 |
+
pixel_values = pixel_values.to(DEVICE)
|
| 213 |
+
|
| 214 |
+
gen = model.generate(
|
| 215 |
+
input_ids=input_ids,
|
| 216 |
+
pixel_values=pixel_values,
|
| 217 |
+
do_sample=True,
|
| 218 |
+
temperature=temperature,
|
| 219 |
+
top_p=top_p,
|
| 220 |
+
min_new_tokens=min_new_tokens,
|
| 221 |
+
max_new_tokens=max_new_tokens,
|
| 222 |
+
no_repeat_ngram_size=no_repeat_ngram_size,
|
| 223 |
+
pad_token_id=model.config.pad_token_id,
|
| 224 |
+
eos_token_id=model.config.eos_token_id,
|
| 225 |
+
)
|
| 226 |
+
text = processor.batch_decode(gen, skip_special_tokens=True)[0].strip()
|
| 227 |
+
|
| 228 |
+
# Post-trim to exactly N sentences
|
| 229 |
+
import re
|
| 230 |
+
sents = re.split(r'(?<=[.!?])\s+', text)
|
| 231 |
+
sents = [s.strip() for s in sents if s.strip()]
|
| 232 |
+
if len(sents) >= num_sentences:
|
| 233 |
+
text = " ".join(sents[:num_sentences])
|
| 234 |
+
return text
|
| 235 |
+
|
| 236 |
+
# Unknown VLM type
|
| 237 |
+
return None
|
| 238 |
+
|
| 239 |
+
def _first_n_sentences(text: str, n: int) -> str:
    """Return the first *n* sentences of *text*.

    Sentences are split on whitespace that follows '.', '!' or '?'.
    If fewer than *n* sentences are found, *text* is returned unchanged
    (deliberate best-effort: an under-producing model is not an error).
    """
    import re
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    if len(sents) >= n:
        return " ".join(sents[:n])
    return text


def op_story_chain(
    image: Image.Image,
    num_sentences: int = 5,
    max_new_tokens: int = 220,
    min_new_tokens: int = 80,
    temperature: float = 0.9,
    top_p: float = 0.92,
    no_repeat_ngram_size: int = 3,
) -> str:
    """Generate a story for *image* via the caption -> text-LLM chain.

    Fallback path used when no direct VLM is available: the image is
    captioned with ``op_caption`` and the caption is expanded into a story
    by the text2text pipeline from ``get_story_pipe_t2t``.

    Args:
        image: Input PIL image (captioning handles any mode conversion).
        num_sentences: Target sentence count for the story.
        max_new_tokens: Upper bound on generated tokens.
        min_new_tokens: Lower bound on generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling cutoff.
        no_repeat_ngram_size: Blocks repeating n-grams of this size.

    Returns:
        The generated story, trimmed to exactly ``num_sentences`` sentences
        when the model produced at least that many.
    """
    caption = op_caption(image)
    prompt = (
        f"Write exactly {num_sentences} sentences based on this image description. "
        "Use vivid sensory details. No title, no lists, no bullet points, no numbered lines, no dialogue.\n"
        f"Image description: {caption}\n\nStory:"
    )

    pipe = get_story_pipe_t2t()
    out = pipe(
        prompt,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        min_new_tokens=min_new_tokens,
        max_new_tokens=max_new_tokens,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_return_sequences=1,
    )
    text = out[0]["generated_text"].strip()

    # Same post-trim rule as the VLM path: cut to exactly N sentences.
    return _first_n_sentences(text, num_sentences)
|
| 276 |
+
|
| 277 |
+
# -------------------- Gradio UI --------------------
|
| 278 |
+
def run(image: Image.Image, mode: str):
    """Gradio click handler: dispatch to captioning or story generation.

    Args:
        image: Uploaded PIL image, or ``None`` when nothing was uploaded.
        mode: Task radio value ("Caption" or "Story", case-insensitive;
            ``None``/empty falls back to caption).

    Returns:
        A ``(text, image, status)`` tuple matching the three output
        components; the image slot is always ``None`` (output image unused).

    Raises:
        gr.Error: If no image was uploaded.
    """
    if image is None:
        raise gr.Error("Upload an image first.")
    mode = (mode or "Caption").lower()

    if mode == "story":
        # Try the direct VLM if configured; fall back to caption -> LLM chain.
        story = op_story_vlm(image)
        used_vlm = story is not None
        if story is None:
            story = op_story_chain(image)
        # BUGFIX: report the path that actually produced the story. The old
        # code keyed the label off STORY_VLM_ID, so a configured VLM that
        # declined (returned None) was still reported as 'VLM' even though
        # the caption->LLM fallback ran.
        return story, None, f"Mode: story ({'VLM' if used_vlm else 'caption→LLM'})"
    else:
        txt = op_caption(image)
        return txt, None, "Mode: caption"
|
| 292 |
+
|
| 293 |
+
# -------- Gradio interface layout --------
# Left column: image upload + task selector + run button.
# Right column: text output, a hidden (unused) image slot, and a status line.
with gr.Blocks(css="footer {visibility:hidden}") as demo:
    gr.Markdown("# Image → Caption or Story (CPU-only) — BLIP-safe, optional CPU VLM")
    with gr.Row():
        with gr.Column():
            inp_img = gr.Image(type="pil", label="Image")
            mode = gr.Radio(choices=["Caption", "Story"], value="Caption", label="Task")
            go = gr.Button("Run", variant="primary")
        with gr.Column():
            out_text = gr.Textbox(label="Text output", lines=10)
            # Hidden placeholder so `run` can keep a 3-tuple return shape.
            out_image = gr.Image(label="(unused)", visible=False)
            status = gr.Markdown()
    # Wire the button to `run`; outputs map 1:1 to run()'s return tuple.
    go.click(run, inputs=[inp_img, mode], outputs=[out_text, out_image, status], scroll_to_output=True)
|
| 305 |
+
|
| 306 |
+
# Script entry point: queue caps concurrent requests (CPU-only inference is
# slow, so bound the backlog at 8 pending jobs) before launching the server.
if __name__ == "__main__":
    demo.queue(max_size=8).launch()
|