Spaces:

metek7
/

instagram-short-summarizing

Runtime error

App Files Community

metek7 committed on Oct 8, 2024

Commit

40bd94b

verified ·

1 Parent(s): a84dbbf

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -36

app.py CHANGED Viewed

@@ -1,7 +1,14 @@
-import spaces
 import gradio as gr
 import subprocess
 from deep_translator import GoogleTranslator
 # Gerekli kütüphanelerin kurulumu
 subprocess.run(
@@ -11,64 +18,42 @@ subprocess.run(
 )
 subprocess.run("pip install deep_translator", shell=True)
-import torch
-from llava.model.builder import load_pretrained_model
-from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
-from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
-from llava.conversation import conv_templates, SeparatorStyle
-import copy
-import warnings
-from decord import VideoReader, cpu
-import numpy as np
 # Çevirmen nesnesi oluştur
 translator = GoogleTranslator(source='tr', target='en')
 translator_reverse = GoogleTranslator(source='en', target='tr')
 title = "# 🙋🏻‍♂️🌟Tonic'in 🌋📹LLaVA-Video'suna Hoş Geldiniz!"
-description1 = """**🌋📹LLaVA-Video-7B-Qwen2**, 🌋📹LLaVA-Video-178K veri seti ve LLaVA-OneVision veri seti üzerinde eğitilmiş 7B parametreli bir modeldir. [**Qwen2 dil modeline dayanmaktadır**](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f) ve 32K tokene kadar bağlam penceresini destekler. Model, görüntüleri, çoklu görüntüleri ve videoları işleyebilir ve bunlarla etkileşime girebilir, video analizi için özel optimizasyonlara sahiptir.
-Bu model, görsel girdi için **SO400M görüş omurgasını** ve dil işleme için Qwen2'yi kullanır, bu da onu görsel ve video tabanlı görevler de dahil olmak üzere çoklu modal akıl yürütmede oldukça verimli kılar.
-🌋📹LLaVA-Video'nun [32B](https://huggingface.co/lmms-lab/LLaVA-NeXT-Video-32B-Qwen) ve [72B](https://huggingface.co/lmms-lab/LLaVA-Video-72B-Qwen2) daha büyük varyantları ve [sadece yeni sentetik veriler üzerinde eğitilmiş bir varyantı](https://huggingface.co/lmms-lab/LLaVA-Video-7B-Qwen2-Video-Only) bulunmaktadır.
-Daha fazla detay için lütfen [Proje Sayfasını](https://github.com/LLaVA-VL/LLaVA-NeXT) ziyaret edin veya ilgili [araştırma makalesine](https://arxiv.org/abs/2410.02713) göz atın.
-- **Mimari**: `LlavaQwenForCausalLM`
-- **Dikkat Başlıkları**: 28
-- **Gizli Katmanlar**: 28
-- **Gizli Boyut**: 3584
 """
 description2 = """
-- **Ara Boyut**: 18944
-- **Desteklenen Maksimum Kare Sayısı**: 64
-- **Desteklenen Diller**: İngilizce, Çince
-- **Görüntü En-Boy Oranı**: `anyres_max_9`
-- **Görüntü Çözünürlüğü**: Çeşitli ızgara çözünürlükleri
-- **Maksimum Konum Gömmeleri**: 32,768
-- **Kelime Dağarcığı Boyutu**: 152,064
-- **Model Hassasiyeti**: bfloat16
-- **Eğitim İçin Kullanılan Donanım**: 256 * Nvidia Tesla A100 GPU'ları
 """
 join_us = """
 ## Bize Katılın:
-🌟TeamTonic🌟 her zaman harika demolar yapıyor! Aktif geliştirici 🛠️topluluğumuza 👻 katılın [![Discord'da bize katılın](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) 🤗Huggingface'de:[MultiTransformer](https://huggingface.co/MultiTransformer) 🌐Github'da: [Tonic-AI](https://github.com/tonic-ai) & 🌟 [Build Tonic](https://git.tonic-ai.com/contribute)'e katkıda bulunun 🤗 Yuvi Sharma ve Huggingface'deki herkese topluluk hibesi için çok teşekkürler 🤗
 """
 def load_video(video_path, max_frames_num, fps=1, force_sample=False):
     if max_frames_num == 0:
         return np.zeros((1, 336, 336, 3))
     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
     total_frame_num = len(vr)
-    video_time = total_frame_num / vr.get_avg_fps()
     fps = round(vr.get_avg_fps()/fps)
     frame_idx = [i for i in range(0, len(vr), fps)]
-    frame_time = [i/fps for i in frame_idx]
     if len(frame_idx) > max_frames_num or force_sample:
         sample_fps = max_frames_num
         uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
         frame_idx = uniform_sampled_frames.tolist()
         frame_time = [i/vr.get_avg_fps() for i in frame_idx]
     frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
     spare_frames = vr.get_batch(frame_idx).asnumpy()
-    return spare_frames, frame_time, video_time
 # Model yükleme
 pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
@@ -81,7 +66,6 @@ tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained
 model.eval()
 print("Model başarıyla yüklendi!")
-@spaces.GPU
 def process_video(video_path, question):
     try:
         max_frames_num = 64
@@ -101,8 +85,8 @@ def process_video(video_path, question):
         conv.append_message(conv.roles[1], None)
         prompt_question = conv.get_prompt()
-        input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
         with torch.no_grad():
             output = model.generate(
                 input_ids,
@@ -150,4 +134,4 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
-    demo.launch(show_error=True)

 import gradio as gr
 import subprocess
 from deep_translator import GoogleTranslator
+import torch
+from llava.model.builder import load_pretrained_model
+from llava.mm_utils import tokenizer_image_token
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
+from llava.conversation import conv_templates
+from decord import VideoReader, cpu
+import numpy as np
+import copy
 # Gerekli kütüphanelerin kurulumu
 subprocess.run(
 )
 subprocess.run("pip install deep_translator", shell=True)
 # Çevirmen nesnesi oluştur
 translator = GoogleTranslator(source='tr', target='en')
 translator_reverse = GoogleTranslator(source='en', target='tr')
 title = "# 🙋🏻‍♂️🌟Tonic'in 🌋📹LLaVA-Video'suna Hoş Geldiniz!"
+description1 = """**🌋📹LLaVA-Video-7B-Qwen2**, ...
 """
 description2 = """
+...
 """
 join_us = """
 ## Bize Katılın:
+...
 """
 def load_video(video_path, max_frames_num, fps=1, force_sample=False):
     if max_frames_num == 0:
         return np.zeros((1, 336, 336, 3))
     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
     total_frame_num = len(vr)
     fps = round(vr.get_avg_fps()/fps)
     frame_idx = [i for i in range(0, len(vr), fps)]
+    frame_time = [i/vr.get_avg_fps() for i in frame_idx]
     if len(frame_idx) > max_frames_num or force_sample:
         sample_fps = max_frames_num
         uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
         frame_idx = uniform_sampled_frames.tolist()
         frame_time = [i/vr.get_avg_fps() for i in frame_idx]
     frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
     spare_frames = vr.get_batch(frame_idx).asnumpy()
+    return spare_frames, frame_time, total_frame_num / vr.get_avg_fps()
 # Model yükleme
 pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
 model.eval()
 print("Model başarıyla yüklendi!")
 def process_video(video_path, question):
     try:
         max_frames_num = 64
         conv.append_message(conv.roles[1], None)
         prompt_question = conv.get_prompt()
+        input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").to(device)
         with torch.no_grad():
             output = model.generate(
                 input_ids,
     )
 if __name__ == "__main__":
+    demo.launch(show_error=True)