Spaces:

Harshithtd
/

llama_VLM

Runtime error

App Files Files Community

Harshithtd committed on Sep 26, 2024

Commit

6df7939

verified ·

1 Parent(s): 33cf200

Create app.py

Browse files

Files changed (1) hide show

app.py +85 -0

app.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
+from PIL import Image
+import torch
+from threading import Thread
+import gradio as gr
+from gradio import FileData
+import time
+import spaces
+ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+# Load model to CPU by default
+model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16)
+processor = AutoProcessor.from_pretrained(ckpt)
+@spaces.CPU
+def bot_streaming(message, history, max_new_tokens=250):
+    txt = message["text"]
+    ext_buffer = f"{txt}"
+    messages = []
+    images = []
+    for i, msg in enumerate(history):
+        if isinstance(msg[0], tuple):
+            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
+            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
+            images.append(Image.open(msg[0][0]).convert("RGB"))
+        elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
+            pass
+        elif isinstance(history[i-1][0], str) and isinstance(msg[0], str):
+            messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
+            messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
+    if len(message["files"]) == 1:
+        if isinstance(message["files"][0], str):
+            image = Image.open(message["files"][0]).convert("RGB")
+        else:
+            image = Image.open(message["files"][0]["path"]).convert("RGB")
+        images.append(image)
+        messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
+    else:
+        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
+    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
+    if images == []:
+        inputs = processor(text=texts, return_tensors="pt")  # No .to("cuda"), stays on CPU
+    else:
+        inputs = processor(text=texts, images=images, return_tensors="pt")  # No .to("cuda"), stays on CPU
+    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
+    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
+    generated_text = ""
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        generated_text_without_prompt = buffer
+        time.sleep(0.01)
+        yield buffer
+demo = gr.ChatInterface(
+    fn=bot_streaming,
+    title="Multimodal Llama",
+    examples=[],  # No examples provided
+    textbox=gr.MultimodalTextbox(),
+    additional_inputs=[gr.Slider(
+        minimum=10,
+        maximum=500,
+        value=250,
+        step=10,
+        label="Maximum number of new tokens to generate",
+    )],
+    cache_examples=False,
+    description="Try Multimodal Llama by Meta with transformers in this demo. Upload an image, and start chatting about it. To learn more about Llama Vision, visit [our blog post](https://huggingface.co/blog/llama32).",
+    stop_btn="Stop Generation",
+    fill_height=True,
+    multimodal=True
+)
+demo.launch(debug=True)