achase25 committed on
Commit
df4efd1
·
verified ·
1 Parent(s): a6262d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -1
app.py CHANGED
@@ -186,4 +186,122 @@ def op_story_vlm(
186
  if not bun:
187
  return None
188
 
189
- image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  if not bun:
187
  return None
188
 
189
+ image = _resize_max(image.convert("RGB"))
190
+ prompt = (
191
+ f"Write exactly {num_sentences} sentences that tell a vivid, sensory story about this image. "
192
+ "Do not include a title or bullet points. No dialogue.\n\nStory:"
193
+ )
194
+
195
+ if bun["type"] == "phi35":
196
+ processor = bun["processor"]
197
+ model = bun["model"]
198
+
199
+ # Phi-3.5-vision expects a chat-style input with images
200
+ messages = [
201
+ {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt}]}
202
+ ]
203
+ inputs = processor.apply_chat_template(
204
+ messages, add_generation_prompt=True, return_tensors="pt"
205
+ )
206
+
207
+ # Some processor versions want pixel values separately:
208
+ proc_out = processor(images=image, return_tensors="pt")
209
+ input_ids = inputs.to(DEVICE)
210
+ pixel_values = proc_out.get("pixel_values")
211
+ if pixel_values is not None:
212
+ pixel_values = pixel_values.to(DEVICE)
213
+
214
+ gen = model.generate(
215
+ input_ids=input_ids,
216
+ pixel_values=pixel_values,
217
+ do_sample=True,
218
+ temperature=temperature,
219
+ top_p=top_p,
220
+ min_new_tokens=min_new_tokens,
221
+ max_new_tokens=max_new_tokens,
222
+ no_repeat_ngram_size=no_repeat_ngram_size,
223
+ pad_token_id=model.config.pad_token_id,
224
+ eos_token_id=model.config.eos_token_id,
225
+ )
226
+ text = processor.batch_decode(gen, skip_special_tokens=True)[0].strip()
227
+
228
+ # Post-trim to exactly N sentences
229
+ import re
230
+ sents = re.split(r'(?<=[.!?])\s+', text)
231
+ sents = [s.strip() for s in sents if s.strip()]
232
+ if len(sents) >= num_sentences:
233
+ text = " ".join(sents[:num_sentences])
234
+ return text
235
+
236
+ # Unknown VLM type
237
+ return None
238
+
239
def op_story_chain(
    image: Image.Image,
    num_sentences: int = 5,
    max_new_tokens: int = 220,
    min_new_tokens: int = 80,
    temperature: float = 0.9,
    top_p: float = 0.92,
    no_repeat_ngram_size: int = 3,
) -> str:
    """Generate a short story for *image* via a caption -> text-LLM chain.

    Fallback path used when no direct vision-language model is configured:
    the image is first captioned (``op_caption``), then the caption is fed
    to a text-to-text generation pipeline (``get_story_pipe_t2t``).

    Args:
        image: Input image (PIL).
        num_sentences: Target number of sentences in the story.
        max_new_tokens / min_new_tokens: Generation length bounds.
        temperature / top_p: Sampling parameters.
        no_repeat_ngram_size: Blocks repeated n-grams during generation.

    Returns:
        The generated story, trimmed to at most ``num_sentences`` sentences
        (fewer if the model produced fewer — best effort).
    """
    caption = op_caption(image)
    prompt = (
        f"Write exactly {num_sentences} sentences based on this image description. "
        "Use vivid sensory details. No title, no lists, no bullet points, no numbered lines, no dialogue.\n"
        f"Image description: {caption}\n\nStory:"
    )

    pipe = get_story_pipe_t2t()
    out = pipe(
        prompt,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        min_new_tokens=min_new_tokens,
        max_new_tokens=max_new_tokens,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_return_sequences=1,
    )
    text = out[0]["generated_text"].strip()
    return _trim_to_sentences(text, num_sentences)


def _trim_to_sentences(text: str, num_sentences: int) -> str:
    """Trim *text* to at most *num_sentences* sentences.

    Sentences are split on whitespace that follows ``.``, ``!`` or ``?``.
    If the text contains fewer sentences than requested, it is returned
    unchanged (best effort rather than raising).
    """
    import re

    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    if len(sents) >= num_sentences:
        return " ".join(sents[:num_sentences])
    return text
276
+
277
+ # -------------------- Gradio UI --------------------
278
def run(image: Image.Image, mode: str):
    """Dispatch the UI request to captioning or story generation.

    Returns a 3-tuple of (text output, unused image slot, status markdown)
    matching the Gradio outputs wiring.
    """
    # Guard: nothing to do without an image.
    if image is None:
        raise gr.Error("Upload an image first.")

    task = (mode or "Caption").lower()

    # Anything other than "story" falls back to plain captioning.
    if task != "story":
        return op_caption(image), None, "Mode: caption"

    # Story: prefer the direct VLM when configured, else caption->LLM chain.
    story = op_story_vlm(image)
    if story is None:
        story = op_story_chain(image)
    backend = "VLM" if STORY_VLM_ID else "caption→LLM"
    return story, None, f"Mode: story ({backend})"
292
+
293
# -------------------- Gradio UI --------------------
# Two-column layout: image + task selector on the left, text output and a
# status line on the right. The hidden image output keeps the `run` return
# arity stable even though it is currently unused.
with gr.Blocks(css="footer {visibility:hidden}") as demo:
    gr.Markdown("# Image → Caption or Story (CPU-only) — BLIP-safe, optional CPU VLM")
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Image")
            task_choice = gr.Radio(choices=["Caption", "Story"], value="Caption", label="Task")
            run_button = gr.Button("Run", variant="primary")
        with gr.Column():
            text_output = gr.Textbox(label="Text output", lines=10)
            image_output = gr.Image(label="(unused)", visible=False)
            status_md = gr.Markdown()
    run_button.click(
        run,
        inputs=[image_input, task_choice],
        outputs=[text_output, image_output, status_md],
        scroll_to_output=True,
    )

if __name__ == "__main__":
    # Bound queue so concurrent requests don't pile up on a CPU-only host.
    demo.queue(max_size=8).launch()