Update app.py
app.py CHANGED
@@ -29,6 +29,31 @@ tokenizer, model, image_processor, context_len = load_pretrained_model(model_pat
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model.to(device).eval()
 
+cur_dir = os.path.dirname(os.path.abspath(__file__))
+
+title_markdown = """
+<div style="display: flex; justify-content: left; align-items: center; text-align: left; background: linear-gradient(45deg, rgba(204,255,231, 0.8), rgba(204,255,231, 0.3)); border-radius: 10px; box-shadow: 0 8px 16px 0 rgba(0,0,0,0.1);"> <a href="https://llava-vl.github.io/blog/2024-04-30-llava-next-video/" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
+  <img src="https://oryx-mllm.github.io/static/images/icon.png" alt="Oryx" style="max-width: 80px; height: auto; border-radius: 10px;">
+</a>
+<div>
+  <h2><a href="https://github.com/Oryx-mllm/Oryx">Oryx MLLM: On-Demand Spatial-Temporal Understanding at Arbitrary Resolution</a></h2>
+  <h5 style="margin: 0;"><a href="https://oryx-mllm.github.io/">Project Page</a> | <a href="https://github.com/Oryx-mllm/Oryx">Github</a> | <a href="https://huggingface.co/collections/THUdyh/oryx-66ebe5d0cfb61a2837a103ff">Huggingface</a> | <a href="https://arxiv.org/abs/2409.12961">Paper</a> | <a href="https://x.com/_akhaliq/status/1836963718887866400">Twitter</a></h5>
+</div>
+</div>
+"""
+
+bibtext = """
+### Citation
+```
+@article{liu2024oryx,
+  title={Oryx MLLM: On-Demand Spatial-Temporal Understanding at Arbitrary Resolution},
+  author={Liu, Zuyan and Dong, Yuhao and Liu, Ziwei and Hu, Winston and Lu, Jiwen and Rao, Yongming},
+  journal={arXiv preprint arXiv:2409.12961},
+  year={2024}
+}
+```
+"""
+
 def preprocess_qwen(sources, tokenizer: transformers.PreTrainedTokenizer, has_image: bool = False, max_len=2048, system_message: str = "You are a helpful assistant.") -> Dict:
     roles = {"human": "<|im_start|>user", "gpt": "<|im_start|>assistant"}
 
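Note: this hunk moves `cur_dir` above `preprocess_qwen` and adds two module-level strings that the `gr.Interface(...)` call at the bottom of the file consumes as `description` and `article`, both of which Gradio renders as Markdown/HTML. A minimal sketch of that pattern, with placeholder strings and an illustrative `echo` function standing in for the real inference code:

```python
import gradio as gr

# Illustrative stand-ins for this commit's title_markdown / bibtext.
header_html = '<h2><a href="https://github.com/Oryx-mllm/Oryx">Oryx Demo</a></h2>'
footer_md = "### Citation\n`@article{liu2024oryx, ...}`"

def echo(text: str) -> str:
    # Placeholder for oryx_inference; just returns its input.
    return text

demo = gr.Interface(
    fn=echo,
    inputs="text",
    outputs="text",
    title="Oryx Demo",
    description=header_html,  # rendered above the input components
    article=footer_md,        # rendered below the interface
)

if __name__ == "__main__":
    demo.launch()
```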
@@ -85,6 +110,9 @@ def preprocess_qwen(sources, tokenizer: transformers.PreTrainedTokenizer, has_im
 @spaces.GPU(duration=120)
 def oryx_inference(multimodal):
     visual, text = multimodal["files"][0], multimodal["text"]
+    if visual.endswith("case/image2.png"):
+        modality = "video"
+        visual = f"{cur_dir}/case/case1.mp4"
     if visual.endswith(".mp4"):
         modality = "video"
     else:
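Note: the three `+` lines above pair with the example change in the next hunk. The `gr.MultimodalTextbox` examples strip shows a clickable thumbnail, and the commit uses a still image (`case/image2.png`) as the stand-in for a video example; when that placeholder reaches inference, it is swapped for the actual clip (`case/case1.mp4`). A hedged sketch of the same idea as a reusable lookup (paths and helper names here are illustrative, not from this Space):

```python
import os

cur_dir = os.path.dirname(os.path.abspath(__file__))

# Placeholder thumbnails shown in the Gradio examples strip, mapped to the
# real media files inference should run on. Illustrative paths only.
EXAMPLE_ALIASES = {
    f"{cur_dir}/case/image2.png": f"{cur_dir}/case/case1.mp4",
}

def resolve_visual(path: str) -> tuple[str, str]:
    """Swap example placeholders for real media and infer the modality."""
    path = EXAMPLE_ALIASES.get(path, path)  # fall through for real uploads
    modality = "video" if path.endswith(".mp4") else "image"
    return path, modality
```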
@@ -190,14 +218,13 @@ def oryx_inference(multimodal):
     return outputs
 
 # Define input and output for the Gradio interface
-cur_dir = os.path.dirname(os.path.abspath(__file__))
 demo = gr.Interface(
     fn=oryx_inference,
     inputs=gr.MultimodalTextbox(file_types=[".mp4", "image"],placeholder="Enter message or upload file..."),
     outputs="text",
     examples=[
         {
-            "files":[f"{cur_dir}/case/
+            "files":[f"{cur_dir}/case/image2.png"],
             "text":"Describe what is happening in this video in detail.",
         },
         {
@@ -206,7 +233,8 @@ demo = gr.Interface(
         },
     ],
     title="Oryx Demo",
-    description=
+    description=title_markdown,
+    article=bibtext,
 )
 
 # Launch the Gradio app
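A closing note on the unchanged `@spaces.GPU(duration=120)` context line: on Hugging Face ZeroGPU Spaces, this decorator attaches a GPU only for the duration of each decorated call (here up to 120 seconds), which is why it wraps `oryx_inference` rather than the module-level setup. A minimal sketch of the pattern, with a placeholder body in place of the real model call:

```python
import spaces
import torch

@spaces.GPU(duration=120)  # request a GPU for up to 120 s per invocation
def generate(prompt: str) -> str:
    # Placeholder for real inference; runs while the GPU is attached.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    with torch.inference_mode():
        return f"[{device}] {prompt}"
```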