Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,14 @@
|
|
| 1 |
-
import spaces
|
| 2 |
import gradio as gr
|
| 3 |
import subprocess
|
| 4 |
from deep_translator import GoogleTranslator
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
# Gerekli kütüphanelerin kurulumu
|
| 7 |
subprocess.run(
|
|
@@ -11,64 +18,42 @@ subprocess.run(
|
|
| 11 |
)
|
| 12 |
subprocess.run("pip install deep_translator", shell=True)
|
| 13 |
|
| 14 |
-
import torch
|
| 15 |
-
from llava.model.builder import load_pretrained_model
|
| 16 |
-
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
|
| 17 |
-
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
|
| 18 |
-
from llava.conversation import conv_templates, SeparatorStyle
|
| 19 |
-
import copy
|
| 20 |
-
import warnings
|
| 21 |
-
from decord import VideoReader, cpu
|
| 22 |
-
import numpy as np
|
| 23 |
-
|
| 24 |
# Çevirmen nesnesi oluştur
|
| 25 |
translator = GoogleTranslator(source='tr', target='en')
|
| 26 |
translator_reverse = GoogleTranslator(source='en', target='tr')
|
| 27 |
|
| 28 |
title = "# 🙋🏻♂️🌟Tonic'in 🌋📹LLaVA-Video'suna Hoş Geldiniz!"
|
| 29 |
-
description1 = """**🌋📹LLaVA-Video-7B-Qwen2**,
|
| 30 |
-
Bu model, görsel girdi için **SO400M görüş omurgasını** ve dil işleme için Qwen2'yi kullanır, bu da onu görsel ve video tabanlı görevler de dahil olmak üzere çoklu modal akıl yürütmede oldukça verimli kılar.
|
| 31 |
-
🌋📹LLaVA-Video'nun [32B](https://huggingface.co/lmms-lab/LLaVA-NeXT-Video-32B-Qwen) ve [72B](https://huggingface.co/lmms-lab/LLaVA-Video-72B-Qwen2) daha büyük varyantları ve [sadece yeni sentetik veriler üzerinde eğitilmiş bir varyantı](https://huggingface.co/lmms-lab/LLaVA-Video-7B-Qwen2-Video-Only) bulunmaktadır.
|
| 32 |
-
Daha fazla detay için lütfen [Proje Sayfasını](https://github.com/LLaVA-VL/LLaVA-NeXT) ziyaret edin veya ilgili [araştırma makalesine](https://arxiv.org/abs/2410.02713) göz atın.
|
| 33 |
-
- **Mimari**: `LlavaQwenForCausalLM`
|
| 34 |
-
- **Dikkat Başlıkları**: 28
|
| 35 |
-
- **Gizli Katmanlar**: 28
|
| 36 |
-
- **Gizli Boyut**: 3584
|
| 37 |
"""
|
| 38 |
description2 = """
|
| 39 |
-
|
| 40 |
-
- **Desteklenen Maksimum Kare Sayısı**: 64
|
| 41 |
-
- **Desteklenen Diller**: İngilizce, Çince
|
| 42 |
-
- **Görüntü En-Boy Oranı**: `anyres_max_9`
|
| 43 |
-
- **Görüntü Çözünürlüğü**: Çeşitli ızgara çözünürlükleri
|
| 44 |
-
- **Maksimum Konum Gömmeleri**: 32,768
|
| 45 |
-
- **Kelime Dağarcığı Boyutu**: 152,064
|
| 46 |
-
- **Model Hassasiyeti**: bfloat16
|
| 47 |
-
- **Eğitim İçin Kullanılan Donanım**: 256 * Nvidia Tesla A100 GPU'ları
|
| 48 |
"""
|
| 49 |
|
| 50 |
join_us = """
|
| 51 |
## Bize Katılın:
|
| 52 |
-
|
| 53 |
"""
|
| 54 |
|
| 55 |
def load_video(video_path, max_frames_num, fps=1, force_sample=False):
|
| 56 |
if max_frames_num == 0:
|
| 57 |
return np.zeros((1, 336, 336, 3))
|
|
|
|
| 58 |
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
|
| 59 |
total_frame_num = len(vr)
|
| 60 |
-
video_time = total_frame_num / vr.get_avg_fps()
|
| 61 |
fps = round(vr.get_avg_fps()/fps)
|
| 62 |
frame_idx = [i for i in range(0, len(vr), fps)]
|
| 63 |
-
frame_time = [i/
|
|
|
|
| 64 |
if len(frame_idx) > max_frames_num or force_sample:
|
| 65 |
sample_fps = max_frames_num
|
| 66 |
uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
|
| 67 |
frame_idx = uniform_sampled_frames.tolist()
|
| 68 |
frame_time = [i/vr.get_avg_fps() for i in frame_idx]
|
|
|
|
| 69 |
frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
|
| 70 |
spare_frames = vr.get_batch(frame_idx).asnumpy()
|
| 71 |
-
|
|
|
|
| 72 |
|
| 73 |
# Model yükleme
|
| 74 |
pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
|
|
@@ -81,7 +66,6 @@ tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained
|
|
| 81 |
model.eval()
|
| 82 |
print("Model başarıyla yüklendi!")
|
| 83 |
|
| 84 |
-
@spaces.GPU
|
| 85 |
def process_video(video_path, question):
|
| 86 |
try:
|
| 87 |
max_frames_num = 64
|
|
@@ -101,8 +85,8 @@ def process_video(video_path, question):
|
|
| 101 |
conv.append_message(conv.roles[1], None)
|
| 102 |
prompt_question = conv.get_prompt()
|
| 103 |
|
| 104 |
-
input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").
|
| 105 |
-
|
| 106 |
with torch.no_grad():
|
| 107 |
output = model.generate(
|
| 108 |
input_ids,
|
|
@@ -150,4 +134,4 @@ with gr.Blocks() as demo:
|
|
| 150 |
)
|
| 151 |
|
| 152 |
if __name__ == "__main__":
|
| 153 |
-
demo.launch(show_error=True)
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import subprocess
|
| 3 |
from deep_translator import GoogleTranslator
|
| 4 |
+
import torch
|
| 5 |
+
from llava.model.builder import load_pretrained_model
|
| 6 |
+
from llava.mm_utils import tokenizer_image_token
|
| 7 |
+
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
|
| 8 |
+
from llava.conversation import conv_templates
|
| 9 |
+
from decord import VideoReader, cpu
|
| 10 |
+
import numpy as np
|
| 11 |
+
import copy
|
| 12 |
|
| 13 |
# Gerekli kütüphanelerin kurulumu
|
| 14 |
subprocess.run(
|
|
|
|
| 18 |
)
|
| 19 |
subprocess.run("pip install deep_translator", shell=True)
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
# Çevirmen nesnesi oluştur
|
| 22 |
translator = GoogleTranslator(source='tr', target='en')
|
| 23 |
translator_reverse = GoogleTranslator(source='en', target='tr')
|
| 24 |
|
| 25 |
title = "# 🙋🏻♂️🌟Tonic'in 🌋📹LLaVA-Video'suna Hoş Geldiniz!"
|
| 26 |
+
description1 = """**🌋📹LLaVA-Video-7B-Qwen2**, ...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
"""
|
| 28 |
description2 = """
|
| 29 |
+
...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
"""
|
| 31 |
|
| 32 |
join_us = """
|
| 33 |
## Bize Katılın:
|
| 34 |
+
...
|
| 35 |
"""
|
| 36 |
|
| 37 |
def load_video(video_path, max_frames_num, fps=1, force_sample=False):
    """Decode a video and return a sampled subset of its frames.

    Frames are first picked at a fixed stride derived from the video's
    average frame rate divided by ``fps``. If that selection holds more than
    ``max_frames_num`` frames, or ``force_sample`` is true, it is replaced by
    ``max_frames_num`` indices spread evenly between the first and the last
    frame.

    Args:
        video_path: Location of the video, passed straight to ``VideoReader``.
        max_frames_num: Maximum number of frames to keep. ``0`` skips
            decoding entirely (see Returns).
        fps: Desired sampling rate, in frames per second of footage.
        force_sample: When true, always use the evenly spread selection of
            ``max_frames_num`` frames.

    Returns:
        A tuple ``(frames, frame_time, video_time)``:
        ``frames`` is ``vr.get_batch(...).asnumpy()`` for the chosen indices,
        ``frame_time`` is a comma-joined string of per-frame timestamps
        formatted like ``"0.00s,1.00s"``, and ``video_time`` is the frame
        count divided by the average frame rate.

        When ``max_frames_num == 0`` only a zero array of shape
        ``(1, 336, 336, 3)`` is returned.
    """
    if max_frames_num == 0:
        # NOTE(review): this branch returns a bare array while the normal
        # path returns a 3-tuple; kept as-is so existing callers are not
        # affected -- confirm what callers expect here.
        return np.zeros((1, 336, 336, 3))

    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    total_frame_num = len(vr)
    avg_fps = vr.get_avg_fps()

    # Stride (in frames) between samples. round() yields 0 when the average
    # frame rate is below half the requested rate, and range() rejects a
    # zero step, so clamp to at least one frame.
    step = max(1, round(avg_fps / fps))
    frame_idx = list(range(0, total_frame_num, step))

    if len(frame_idx) > max_frames_num or force_sample:
        # Too many candidates (or sampling forced): spread exactly
        # max_frames_num indices evenly over the whole clip instead.
        frame_idx = np.linspace(
            0, total_frame_num - 1, max_frames_num, dtype=int
        ).tolist()

    # Timestamp of each kept frame, in seconds, as one comma-joined string.
    frame_time = ",".join(f"{i / avg_fps:.2f}s" for i in frame_idx)
    spare_frames = vr.get_batch(frame_idx).asnumpy()

    return spare_frames, frame_time, total_frame_num / avg_fps
|
| 57 |
|
| 58 |
# Model yükleme
|
| 59 |
pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
|
|
|
|
| 66 |
model.eval()
|
| 67 |
print("Model başarıyla yüklendi!")
|
| 68 |
|
|
|
|
| 69 |
def process_video(video_path, question):
|
| 70 |
try:
|
| 71 |
max_frames_num = 64
|
|
|
|
| 85 |
conv.append_message(conv.roles[1], None)
|
| 86 |
prompt_question = conv.get_prompt()
|
| 87 |
|
| 88 |
+
input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").to(device)
|
| 89 |
+
|
| 90 |
with torch.no_grad():
|
| 91 |
output = model.generate(
|
| 92 |
input_ids,
|
|
|
|
| 134 |
)
|
| 135 |
|
| 136 |
if __name__ == "__main__":
|
| 137 |
+
demo.launch(show_error=True)
|