Spaces:

rogerxi
/

Spatial-LLaVA

Sleeping

App Files Files Community

Nightwalkx committed on Mar 6, 2025

Commit

c545fd8

1 Parent(s): abd25d9

update app

Browse files

Files changed (1) hide show

app.py +75 -92

app.py CHANGED Viewed

@@ -1,105 +1,88 @@
-import time
-from threading import Thread
 import gradio as gr
-import torch
 from PIL import Image
-from transformers import AutoProcessor, LlavaForConditionalGeneration
-from transformers import TextIteratorStreamer
 import spaces
-PLACEHOLDER = """
-<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-   <img src="https://cdn-uploads.huggingface.co/production/uploads/64ccdc322e592905f922a06e/DDIW0kbWmdOQWwy4XMhwX.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55;  ">
-   <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">LLaVA-Llama-3-8B</h1>
-   <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Llava-Llama-3-8b is a LLaVA model fine-tuned from Meta-Llama-3-8B-Instruct and CLIP-ViT-Large-patch14-336 with ShareGPT4V-PT and InternVL-SFT by XTuner</p>
-</div>
-"""
-model_id = "rogerxi/llava-finetune-test"
-processor = AutoProcessor.from_pretrained(model_id)
-model = LlavaForConditionalGeneration.from_pretrained(
-    model_id,
-    torch_dtype=torch.float16,
-    low_cpu_mem_usage=True,
-)
-model.to("cuda:0")
-model.generation_config.eos_token_id = 128009
-@spaces.GPU
-def bot_streaming(message, history):
-    print(message)
-    if message["files"]:
-        # message["files"][-1] is a Dict or just a string
-        if type(message["files"][-1]) == dict:
-            image = message["files"][-1]["path"]
-        else:
-            image = message["files"][-1]
-    else:
-        # if there's no image uploaded for this turn, look for images in the past turns
-        # kept inside tuples, take the last one
-        for hist in history:
-            if type(hist[0]) == tuple:
-                image = hist[0][0]
-    try:
-        if image is None:
-            # Handle the case where image is None
-            gr.Error("You need to upload an image for LLaVA to work.")
-    except NameError:
-        # Handle the case where 'image' is not defined at all
-        gr.Error("You need to upload an image for LLaVA to work.")
-    prompt = f"<|start_header_id|>user<|end_header_id|>\n\n<image>\n{message['text']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
-    # print(f"prompt: {prompt}")
-    image = Image.open(image)
-    inputs = processor(prompt, image, return_tensors='pt').to(0, torch.float16)
-    streamer = TextIteratorStreamer(processor, **{"skip_special_tokens": False, "skip_prompt": True})
-    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024, do_sample=False)
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    text_prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{message['text']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
-    # print(f"text_prompt: {text_prompt}")
-    buffer = ""
-    time.sleep(0.5)
-    for new_text in streamer:
-        # find <|eot_id|> and remove it from the new_text
-        if "<|eot_id|>" in new_text:
-            new_text = new_text.split("<|eot_id|>")[0]
-        buffer += new_text
-        # generated_text_without_prompt = buffer[len(text_prompt):]
-        generated_text_without_prompt = buffer
-        # print(generated_text_without_prompt)
-        time.sleep(0.06)
-        # print(f"new_text: {generated_text_without_prompt}")
-        yield generated_text_without_prompt
-chatbot=gr.Chatbot(placeholder=PLACEHOLDER,scale=1)
-chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload file...", show_label=False)
-with gr.Blocks(fill_height=True, ) as demo:
-    gr.ChatInterface(
-    fn=bot_streaming,
-    title="LLaVA Llama-3-8B",
-    examples=[{"text": "What is on the flower?", "files": ["./bee.jpg"]},
-              {"text": "How to make this pastry?", "files": ["./baklava.png"]}],
-    description="Try [LLaVA Llama-3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
-    stop_btn="Stop Generation",
-    multimodal=True,
-    textbox=chat_input,
-    chatbot=chatbot,
-    )
-# demo.queue(api_open=False)
 demo.launch(debug=True)

 import gradio as gr
+from transformers import LlavaProcessor, LlavaForConditionalGeneration, TextIteratorStreamer
+from threading import Thread
+import re
+import time
 from PIL import Image
+import torch
+import cv2
 import spaces
+# Alternative checkpoint, kept for reference but not loaded:
+# model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
+# Repo id that both the processor and the model weights are loaded from.
+model_id = "rogerxi/llava-finetune-test"
+processor = LlavaProcessor.from_pretrained(model_id)
+# Weights are loaded in half precision (torch.float16).
+model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
+# Moved to the GPU once at import time; bot_streaming sends its inputs to "cuda" as well.
+model.to("cuda")
+def _upload_path(upload):
+    """Return the filesystem path for one uploaded-file entry.
+
+    Entries are accepted as a plain path string, a dict with a "path" key,
+    or an object exposing a ``path`` attribute.
+    """
+    if isinstance(upload, dict):
+        return upload["path"]
+    # Plain strings have no ``path`` attribute, so they are returned as-is.
+    return getattr(upload, "path", upload)
+
+
+@spaces.GPU
+def bot_streaming(message, history):
+    """Stream the model's reply for one multimodal chat turn.
+
+    Args:
+        message: Dict with a "text" entry (the user prompt) and a "files"
+            entry (uploads for this turn; may be empty or missing).
+        history: Earlier turns. When this turn has no upload, turns whose
+            first element is a tuple are searched and the first item of the
+            last such tuple is reused as the image path.
+            NOTE(review): assumes tuple-style history - confirm against the
+            Gradio version in use.
+
+    Yields:
+        The reply text generated so far, growing as tokens arrive.
+
+    Raises:
+        gr.Error: If no image is available from this turn or from history.
+    """
+    print(message)
+    text = message["text"]
+
+    # Normalise every upload to a path, whatever shape the entries arrive in.
+    image_paths = [_upload_path(upload) for upload in (message.get("files") or [])]
+
+    if not image_paths:
+        # No upload this turn: fall back to the most recent image in history.
+        for turn in history:
+            if isinstance(turn, (list, tuple)) and turn and isinstance(turn[0], tuple):
+                image_paths = [turn[0][0]]
+
+    if not image_paths:
+        # gr.Error only reaches the UI when it is raised, not merely constructed.
+        raise gr.Error("You need to upload an image or video for LLaVA to work.")
+
+    images = [Image.open(path).convert("RGB") for path in image_paths]
+
+    # The two prompt templates are kept exactly as they were (note the space
+    # after "user" only in the single-image form) so tokenisation is unchanged.
+    if len(images) == 1:
+        prompt = f"<|im_start|>user <image>\n{text}<|im_end|><|im_start|>assistant"
+        model_images = images[0]
+    else:
+        image_tokens = "<image>" * len(images)
+        prompt = "<|im_start|>user" + image_tokens + f"\n{text}<|im_end|><|im_start|>assistant"
+        model_images = images
+
+    # Keyword arguments avoid depending on the processor's positional order.
+    inputs = processor(images=model_images, text=prompt, return_tensors="pt").to("cuda", torch.float16)
+
+    # skip_prompt drops the echoed prompt, so no manual prefix slicing is needed.
+    # Extra streamer kwargs are decode options; generation limits belong to generate().
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=100)
+
+    # generate() blocks, so it runs in a worker thread while this generator
+    # drains the streamer and forwards partial text to the UI.
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    reply = ""
+    for new_text in streamer:
+        reply += new_text
+        time.sleep(0.01)
+        yield reply
+# Chat UI wired to bot_streaming; the multimodal textbox accepts several files per message.
+demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA ",
+      textbox=gr.MultimodalTextbox(file_count="multiple"),
+      description="Try EgoLlava. If you don't upload an image, you will receive an error. ",
+      stop_btn="Stop Generation", multimodal=True)
 demo.launch(debug=True)