Update app.py
Browse files
app.py
CHANGED
|
@@ -186,4 +186,122 @@ def op_story_vlm(
|
|
| 186 |
if not bun:
|
| 187 |
return None
|
| 188 |
|
| 189 |
-
image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
if not bun:
|
| 187 |
return None
|
| 188 |
|
| 189 |
+
image = _resize_max(image.convert("RGB"))
|
| 190 |
+
prompt = (
|
| 191 |
+
f"Write exactly {num_sentences} sentences that tell a vivid, sensory story about this image. "
|
| 192 |
+
"Do not include a title or bullet points. No dialogue.\n\nStory:"
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
if bun["type"] == "phi35":
|
| 196 |
+
processor = bun["processor"]
|
| 197 |
+
model = bun["model"]
|
| 198 |
+
|
| 199 |
+
# Phi-3.5-vision expects a chat-style input with images
|
| 200 |
+
messages = [
|
| 201 |
+
{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt}]}
|
| 202 |
+
]
|
| 203 |
+
inputs = processor.apply_chat_template(
|
| 204 |
+
messages, add_generation_prompt=True, return_tensors="pt"
|
| 205 |
+
)
|
| 206 |
+
|
| 207 |
+
# Some processor versions want pixel values separately:
|
| 208 |
+
proc_out = processor(images=image, return_tensors="pt")
|
| 209 |
+
input_ids = inputs.to(DEVICE)
|
| 210 |
+
pixel_values = proc_out.get("pixel_values")
|
| 211 |
+
if pixel_values is not None:
|
| 212 |
+
pixel_values = pixel_values.to(DEVICE)
|
| 213 |
+
|
| 214 |
+
gen = model.generate(
|
| 215 |
+
input_ids=input_ids,
|
| 216 |
+
pixel_values=pixel_values,
|
| 217 |
+
do_sample=True,
|
| 218 |
+
temperature=temperature,
|
| 219 |
+
top_p=top_p,
|
| 220 |
+
min_new_tokens=min_new_tokens,
|
| 221 |
+
max_new_tokens=max_new_tokens,
|
| 222 |
+
no_repeat_ngram_size=no_repeat_ngram_size,
|
| 223 |
+
pad_token_id=model.config.pad_token_id,
|
| 224 |
+
eos_token_id=model.config.eos_token_id,
|
| 225 |
+
)
|
| 226 |
+
text = processor.batch_decode(gen, skip_special_tokens=True)[0].strip()
|
| 227 |
+
|
| 228 |
+
# Post-trim to exactly N sentences
|
| 229 |
+
import re
|
| 230 |
+
sents = re.split(r'(?<=[.!?])\s+', text)
|
| 231 |
+
sents = [s.strip() for s in sents if s.strip()]
|
| 232 |
+
if len(sents) >= num_sentences:
|
| 233 |
+
text = " ".join(sents[:num_sentences])
|
| 234 |
+
return text
|
| 235 |
+
|
| 236 |
+
# Unknown VLM type
|
| 237 |
+
return None
|
| 238 |
+
|
| 239 |
+
def _first_n_sentences(text: str, n: int) -> str:
    """Return the first *n* sentences of *text*.

    Sentences are split on whitespace that follows '.', '!' or '?'.
    If fewer than *n* sentences are found, *text* is returned unchanged
    (deliberate best-effort: an under-producing model is not an error).
    """
    import re
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    if len(sents) >= n:
        return " ".join(sents[:n])
    return text


def op_story_chain(
    image: Image.Image,
    num_sentences: int = 5,
    max_new_tokens: int = 220,
    min_new_tokens: int = 80,
    temperature: float = 0.9,
    top_p: float = 0.92,
    no_repeat_ngram_size: int = 3,
) -> str:
    """Generate a story for *image* via the caption -> text-LLM chain.

    Fallback path used when no direct VLM is available: the image is
    captioned with ``op_caption`` and the caption is expanded into a story
    by the text2text pipeline from ``get_story_pipe_t2t``.

    Args:
        image: Input PIL image (captioning handles any mode conversion).
        num_sentences: Target sentence count for the story.
        max_new_tokens: Upper bound on generated tokens.
        min_new_tokens: Lower bound on generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling cutoff.
        no_repeat_ngram_size: Blocks repeating n-grams of this size.

    Returns:
        The generated story, trimmed to exactly ``num_sentences`` sentences
        when the model produced at least that many.
    """
    caption = op_caption(image)
    prompt = (
        f"Write exactly {num_sentences} sentences based on this image description. "
        "Use vivid sensory details. No title, no lists, no bullet points, no numbered lines, no dialogue.\n"
        f"Image description: {caption}\n\nStory:"
    )

    pipe = get_story_pipe_t2t()
    out = pipe(
        prompt,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        min_new_tokens=min_new_tokens,
        max_new_tokens=max_new_tokens,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_return_sequences=1,
    )
    text = out[0]["generated_text"].strip()

    # Same post-trim rule as the VLM path: cut to exactly N sentences.
    return _first_n_sentences(text, num_sentences)
|
| 276 |
+
|
| 277 |
+
# -------------------- Gradio UI --------------------
|
| 278 |
+
def run(image: Image.Image, mode: str):
    """Gradio click handler: dispatch to captioning or story generation.

    Args:
        image: Uploaded PIL image, or ``None`` when nothing was uploaded.
        mode: Task radio value ("Caption" or "Story", case-insensitive;
            ``None``/empty falls back to caption).

    Returns:
        A ``(text, image, status)`` tuple matching the three output
        components; the image slot is always ``None`` (output image unused).

    Raises:
        gr.Error: If no image was uploaded.
    """
    if image is None:
        raise gr.Error("Upload an image first.")
    mode = (mode or "Caption").lower()

    if mode == "story":
        # Try the direct VLM if configured; fall back to caption -> LLM chain.
        story = op_story_vlm(image)
        used_vlm = story is not None
        if story is None:
            story = op_story_chain(image)
        # BUGFIX: report the path that actually produced the story. The old
        # code keyed the label off STORY_VLM_ID, so a configured VLM that
        # declined (returned None) was still reported as 'VLM' even though
        # the caption->LLM fallback ran.
        return story, None, f"Mode: story ({'VLM' if used_vlm else 'caption→LLM'})"
    else:
        txt = op_caption(image)
        return txt, None, "Mode: caption"
|
| 292 |
+
|
| 293 |
+
# -------- Gradio interface layout --------
# Left column: image upload + task selector + run button.
# Right column: text output, a hidden (unused) image slot, and a status line.
with gr.Blocks(css="footer {visibility:hidden}") as demo:
    gr.Markdown("# Image → Caption or Story (CPU-only) — BLIP-safe, optional CPU VLM")
    with gr.Row():
        with gr.Column():
            inp_img = gr.Image(type="pil", label="Image")
            mode = gr.Radio(choices=["Caption", "Story"], value="Caption", label="Task")
            go = gr.Button("Run", variant="primary")
        with gr.Column():
            out_text = gr.Textbox(label="Text output", lines=10)
            # Hidden placeholder so `run` can keep a 3-tuple return shape.
            out_image = gr.Image(label="(unused)", visible=False)
            status = gr.Markdown()
    # Wire the button to `run`; outputs map 1:1 to run()'s return tuple.
    go.click(run, inputs=[inp_img, mode], outputs=[out_text, out_image, status], scroll_to_output=True)
|
| 305 |
+
|
| 306 |
+
# Script entry point: queue caps concurrent requests (CPU-only inference is
# slow, so bound the backlog at 8 pending jobs) before launching the server.
if __name__ == "__main__":
    demo.queue(max_size=8).launch()
|