Spaces:
Runtime error
Runtime error
add gradio interface
Browse files
README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
---
|
| 2 |
title: Llava Video
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
|
|
|
| 1 |
---
|
| 2 |
title: Llava Video
|
| 3 |
+
emoji: 🌋📹
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
app.py
CHANGED
|
@@ -25,6 +25,31 @@ import tempfile
|
|
| 25 |
import os
|
| 26 |
import shutil
|
| 27 |
#warnings.filterwarnings("ignore")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
def load_video(video_path, max_frames_num, fps=1, force_sample=False):
|
| 30 |
if max_frames_num == 0:
|
|
@@ -94,14 +119,19 @@ def gradio_interface(video_file, question):
|
|
| 94 |
return response
|
| 95 |
|
| 96 |
with gr.Blocks() as demo:
|
| 97 |
-
gr.Markdown(
|
| 98 |
-
gr.Markdown("Upload a video and ask a question about it.")
|
| 99 |
-
|
| 100 |
with gr.Row():
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
output = gr.Textbox(label="Response")
|
| 106 |
|
| 107 |
submit_button.click(
|
|
|
|
| 25 |
import os
|
| 26 |
import shutil
|
| 27 |
#warnings.filterwarnings("ignore")
|
| 28 |
+
title = "# 🙋🏻‍♂️Welcome to 🌟Tonic's 🌋📹LLaVA-Video!"
|
| 29 |
+
description1 ="""The **🌋📹LLaVA-Video-7B-Qwen2** is a 7B parameter model trained on the 🌋📹LLaVA-Video-178K dataset and the LLaVA-OneVision dataset. It is [based on the **Qwen2 language model**](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f), supporting a context window of up to 32K tokens. The model can process and interact with images, multi-images, and videos, with specific optimizations for video analysis.
|
| 30 |
+
This model leverages the **SO400M vision backbone** for visual input and Qwen2 for language processing, making it highly efficient in multi-modal reasoning, including visual and video-based tasks.
|
| 31 |
+
🌋📹LLaVA-Video has larger variants of [32B](https://huggingface.co/lmms-lab/LLaVA-NeXT-Video-32B-Qwen) and [72B](https://huggingface.co/lmms-lab/LLaVA-Video-72B-Qwen2) and with a [variant](https://huggingface.co/lmms-lab/LLaVA-Video-7B-Qwen2-Video-Only) only trained on the new synthetic data
|
| 32 |
+
For further details, please visit the [Project Page](https://github.com/LLaVA-VL/LLaVA-NeXT) or check out the corresponding [research paper](https://arxiv.org/abs/2410.02713).
|
| 33 |
+
"""
|
| 34 |
+
description2 ="""- **Architecture**: `LlavaQwenForCausalLM`
|
| 35 |
+
- **Attention Heads**: 28
|
| 36 |
+
- **Hidden Layers**: 28
|
| 37 |
+
- **Hidden Size**: 3584
|
| 38 |
+
- **Intermediate Size**: 18944
|
| 39 |
+
- **Max Frames Supported**: 64
|
| 40 |
+
- **Languages Supported**: English, Chinese
|
| 41 |
+
- **Image Aspect Ratio**: `anyres_max_9`
|
| 42 |
+
- **Image Resolution**: Various grid resolutions
|
| 43 |
+
- **Max Position Embeddings**: 32,768
|
| 44 |
+
- **Vocab Size**: 152,064
|
| 45 |
+
- **Model Precision**: bfloat16
|
| 46 |
+
- **Hardware Used for Training**: 256 * Nvidia Tesla A100 GPUs
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
joinus = """
|
| 50 |
+
## Join us :
|
| 51 |
+
🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 💻 [](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
|
| 52 |
+
"""
|
| 53 |
|
| 54 |
def load_video(video_path, max_frames_num, fps=1, force_sample=False):
|
| 55 |
if max_frames_num == 0:
|
|
|
|
| 119 |
return response
|
| 120 |
|
| 121 |
with gr.Blocks() as demo:
|
| 122 |
+
gr.Markdown(title)
|
|
|
|
|
|
|
| 123 |
with gr.Row():
|
| 124 |
+
with gr.Group():
|
| 125 |
+
gr.Markdown(description1)
|
| 126 |
+
with gr.Group():
|
| 127 |
+
gr.Markdown(description2)
|
| 128 |
+
with gr.Accordion("Join Us", open=False):
|
| 129 |
+
gr.Markdown(joinus)
|
| 130 |
+
with gr.Row():
|
| 131 |
+
with gr.Column():
|
| 132 |
+
video_input = gr.Video()
|
| 133 |
+
question_input = gr.Textbox(label="Question", placeholder="Ask a question about the video...")
|
| 134 |
+
submit_button = gr.Button("Submit")
|
| 135 |
output = gr.Textbox(label="Response")
|
| 136 |
|
| 137 |
submit_button.click(
|