Spaces:

darisdzakwanhoesien
/

n8n-setup-dataset

Sleeping

App Files Files Community

darisdzakwanhoesien committed on Sep 22, 2025

Commit

6b6719d

verified ·

1 Parent(s): d617813

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -15

app.py CHANGED Viewed

@@ -2,9 +2,16 @@ import gradio as gr
 import json
 import torch
 import decord
-from transformers import AutoProcessor, AutoModelForVideoClassification, AutoTokenizer, AutoModel
-# --- 1. VideoMAE ---
 vm_model_id = "MCG-NJU/videomae-base-finetuned-kinetics"
 vm_processor = AutoProcessor.from_pretrained(vm_model_id)
 vm_model = AutoModelForVideoClassification.from_pretrained(vm_model_id)
@@ -25,32 +32,78 @@ def run_videomae(video):
     except Exception as e:
         return {"model": "VideoMAE", "error": str(e)}
-# --- 2. InternVideo2.5-Chat-8B ---
 iv_model_id = "OpenGVLab/InternVideo2_5_Chat_8B"
-iv_tokenizer = AutoTokenizer.from_pretrained(iv_model_id, trust_remote_code=True)
-iv_model = AutoModel.from_pretrained(iv_model_id, trust_remote_code=True).half().cuda().eval()
 def run_internvideo(video, prompt):
     try:
-        # Simplified: they provide a .chat() API in trust_remote_code
-        response, _ = iv_model.chat(iv_tokenizer, video_path=video, user_prompt=prompt, history=None)
-        return {"model": "InternVideo2.5-Chat-8B", "output": response}
     except Exception as e:
         return {"model": "InternVideo2.5-Chat-8B", "error": str(e)}
-# --- 3. LLaVA-Video-Llama-3.1-8B ---
 llava_model_id = "weizhiwang/LLaVA-Video-Llama-3.1-8B"
-llava_tokenizer = AutoTokenizer.from_pretrained(llava_model_id, trust_remote_code=True)
-llava_model = AutoModel.from_pretrained(llava_model_id, trust_remote_code=True).half().cuda().eval()
 def run_llava(video, prompt):
     try:
-        response, _ = llava_model.chat(llava_tokenizer, video_path=video, user_prompt=prompt, history=None)
-        return {"model": "LLaVA-Video-Llama-3.1-8B", "output": response}
     except Exception as e:
         return {"model": "LLaVA-Video-Llama-3.1-8B", "error": str(e)}
-# --- Unified ---
 def analyze_all(video, prompt):
     results = []
     results.append(run_videomae(video))
@@ -58,10 +111,15 @@ def analyze_all(video, prompt):
     results.append(run_llava(video, prompt))
     return json.dumps(results, indent=2)
 demo = gr.Interface(
     fn=analyze_all,
     inputs=[gr.Video(label="Upload Video"), gr.Textbox(label="Prompt")],
-    outputs="json"
 )
 if __name__ == "__main__":

 import json
 import torch
 import decord
+from transformers import (
+    AutoProcessor,
+    AutoModelForVideoClassification,
+    AutoTokenizer,
+    AutoModel
+)
+# ------------------------------------------------------------
+# 1. VideoMAE (simple classification)
+# ------------------------------------------------------------
 vm_model_id = "MCG-NJU/videomae-base-finetuned-kinetics"
 vm_processor = AutoProcessor.from_pretrained(vm_model_id)
 vm_model = AutoModelForVideoClassification.from_pretrained(vm_model_id)
     except Exception as e:
         return {"model": "VideoMAE", "error": str(e)}
# ------------------------------------------------------------
# 2. InternVideo2.5-Chat-8B
# ------------------------------------------------------------
iv_model_id = "OpenGVLab/InternVideo2_5_Chat_8B"

# Loading is best-effort: any failure while fetching or placing the model is
# recorded in ``iv_load_error`` and reported per request by run_internvideo()
# instead of crashing the whole app at import time.
iv_model = None
iv_tokenizer = None
iv_load_error = None
try:
    iv_tokenizer = AutoTokenizer.from_pretrained(iv_model_id, trust_remote_code=True)
    iv_model = AutoModel.from_pretrained(
        iv_model_id,
        trust_remote_code=True,
        # NOTE: "main" is a branch name, so this tracks whatever is latest.
        # Replace it with a commit hash to actually pin the remote code.
        revision="main",
    ).to(torch.bfloat16).cuda().eval()
except Exception as e:
    # Reset both handles so a half-initialised pair is never used.
    iv_model = None
    iv_tokenizer = None
    iv_load_error = str(e)


def run_internvideo(video, prompt):
    """Ask InternVideo2.5-Chat-8B about the video and return a result dict.

    Args:
        video: Video input received from the UI. It is not consumed yet:
            frame extraction is still a TODO, so ``None`` is passed to the
            model in place of pixel values.
        prompt: Question to send to the model. When it is missing or blank,
            the generic question "Describe this video." is used instead.

    Returns:
        ``{"model": ..., "output": ...}`` on success, or
        ``{"model": ..., "error": ...}`` when the model failed to load or the
        chat call raised.
    """
    if iv_model is None:
        return {"model": "InternVideo2.5-Chat-8B", "error": iv_load_error}
    try:
        # TODO: Replace with proper frame extraction & preprocessing from repo
        # Forward the user's prompt; fall back to the default only when the
        # prompt is empty so the UI's "same video + prompt" promise holds.
        if isinstance(prompt, str) and prompt.strip():
            question = prompt
        else:
            question = "Describe this video."
        output, _ = iv_model.chat(
            iv_tokenizer,
            None,   # placeholder: pixel_values
            question,
            generation_config={"max_new_tokens": 256},
            num_patches_list=[1],
            history=None,
            return_history=True,
        )
        return {"model": "InternVideo2.5-Chat-8B", "output": output}
    except Exception as e:
        return {"model": "InternVideo2.5-Chat-8B", "error": str(e)}
# ------------------------------------------------------------
# 3. LLaVA-Video-Llama-3.1-8B
# ------------------------------------------------------------
llava_model_id = "weizhiwang/LLaVA-Video-Llama-3.1-8B"

# Best-effort load: on any failure both handles are cleared and the reason is
# kept in ``lv_load_error`` so run_llava() can report it per request.
try:
    lv_tokenizer = AutoTokenizer.from_pretrained(
        llava_model_id, trust_remote_code=True
    )
    lv_model = (
        AutoModel.from_pretrained(
            llava_model_id,
            trust_remote_code=True,
            revision="main",
        )
        .to(torch.bfloat16)
        .cuda()
        .eval()
    )
except Exception as load_exc:
    lv_model = None
    lv_tokenizer = None
    lv_load_error = str(load_exc)


def run_llava(video, prompt):
    """Send ``prompt`` to LLaVA-Video-Llama-3.1-8B and wrap the reply in a dict.

    ``video`` is accepted but not consumed yet: preprocessing is a TODO, so
    ``None`` is passed where the model expects pixel values.

    Returns ``{"model": ..., "output": ...}`` on success, or
    ``{"model": ..., "error": ...}`` if the model never loaded or the chat
    call raised.
    """
    if lv_model is None:
        return {"model": "LLaVA-Video-Llama-3.1-8B", "error": lv_load_error}

    chat_options = {
        "generation_config": {"max_new_tokens": 256},
        "num_patches_list": [1],
        "history": None,
        "return_history": True,
    }
    try:
        # TODO: Replace with proper preprocessing from repo
        reply, _history = lv_model.chat(
            lv_tokenizer,
            None,   # placeholder: pixel_values
            prompt,
            **chat_options,
        )
    except Exception as chat_exc:
        return {"model": "LLaVA-Video-Llama-3.1-8B", "error": str(chat_exc)}
    return {"model": "LLaVA-Video-Llama-3.1-8B", "output": reply}
# ------------------------------------------------------------
# Unified function
# ------------------------------------------------------------
def analyze_all(video, prompt):
    """Run the video (and prompt) through every model and return JSON text.

    Args:
        video: Video input from the UI, forwarded to each runner.
        prompt: Text prompt, forwarded to the two chat-style runners.

    Returns:
        A JSON string (indented by 2) holding one result dict per model, in
        the order VideoMAE, InternVideo2.5-Chat-8B, LLaVA-Video-Llama-3.1-8B.
    """
    results = [
        run_videomae(video),
        # InternVideo is advertised in the UI description, so it must be
        # part of the combined result alongside the other two models.
        run_internvideo(video, prompt),
        run_llava(video, prompt),
    ]
    return json.dumps(results, indent=2)
# ------------------------------------------------------------
# Gradio UI
# ------------------------------------------------------------
# Single-page interface: one video upload and one text prompt go in, and the
# combined result produced by analyze_all() is shown in a JSON output.
demo = gr.Interface(
    title="Multi-Model Video Analysis",
    description="Runs the same video + prompt through VideoMAE, InternVideo2.5-Chat-8B, and LLaVA-Video-Llama-3.1-8B.",
    fn=analyze_all,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Textbox(label="Prompt"),
    ],
    outputs="json",
)
 if __name__ == "__main__":