Spaces:

GF-John
/

video-caption

Running on Zero

App Files Files Community

John Ho committed on Jul 23

Commit

4fa18d9

1 Parent(s): 8fe5da3

added function to read fps

Browse files

Files changed (2) hide show

app.py +29 -5
pyproject.toml +1 -0

app.py CHANGED Viewed

@@ -1,7 +1,14 @@
-import spaces
 import gradio as gr
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 # --- Installing Flash Attention for ZeroGPU is special --- #
 import subprocess
@@ -16,6 +23,20 @@ subprocess.run(
 # The model is trained on 8.0 FPS which we recommend for optimal inference
 @spaces.GPU(duration=30)
 def load_model(
     model_name: str = "chancharikm/qwen2.5-vl-7b-cam-motion-preview",
@@ -40,10 +61,13 @@ def load_model(
 @spaces.GPU(duration=120)
-def inference(video_path: str):
     # default processor
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
     messages = [
         {
             "role": "user",
@@ -51,9 +75,9 @@ def inference(video_path: str):
                 {
                     "type": "video",
                     "video": video_path,
-                    "fps": 8.0,
                 },
-                {"type": "text", "text": "Describe the camera motion in this video."},
             ],
         }
     ]

+import spaces, ffmpeg, os
 import gradio as gr
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
+from loguru import logger
# `sys` is needed for the stderr sink below and is not among the imports
# visible at the top of this file, so bring it into scope here.
import sys

# Replace loguru's default handler with a single stderr sink that prints a
# dimmed timestamp, the level, and the message (level and message are
# coloured by severity through the <lvl> tag).
logger.remove()
logger.add(
    sys.stderr,
    format="<d>{time:YYYY-MM-DD ddd HH:mm:ss}</d> | <lvl>{level}</lvl> | <lvl>{message}</lvl>",
)
 # --- Installing Flash Attention for ZeroGPU is special --- #
 import subprocess
 # The model is trained on 8.0 FPS which we recommend for optimal inference
def get_fps_ffmpeg(video_path: str) -> float:
    """Return the frame rate of the first video stream in ``video_path``.

    The file is inspected with ``ffmpeg.probe``. The stream's
    ``r_frame_rate`` is used when it parses to a positive number; otherwise
    ``avg_frame_rate`` is tried as a fallback.

    Args:
        video_path: Path of the media file to probe.

    Returns:
        Frames per second as a float (numerator divided by denominator).

    Raises:
        ValueError: If the probe result contains no video stream, or if no
            usable frame rate can be read from that stream.
    """
    probe = ffmpeg.probe(video_path)
    # Find the first video stream; `.get` tolerates entries that lack the
    # expected keys instead of failing with KeyError.
    video_stream = next(
        (
            stream
            for stream in probe.get("streams", [])
            if stream.get("codec_type") == "video"
        ),
        None,
    )
    if video_stream is None:
        raise ValueError("No video stream found")

    # Frame rate is given as a string fraction, e.g., '30000/1001'.
    # A degenerate value such as '0/0' would divide by zero, so such values
    # are skipped and the next candidate field is tried.
    for key in ("r_frame_rate", "avg_frame_rate"):
        rate = video_stream.get(key)
        if not rate:
            continue
        num, _, denom = str(rate).partition("/")
        try:
            # A bare number without '/' is treated as having denominator 1.
            fps = int(num) / int(denom or 1)
        except (ValueError, ZeroDivisionError):
            continue
        if fps > 0:
            return fps

    raise ValueError(f"Could not determine frame rate for {video_path!r}")
 @spaces.GPU(duration=30)
 def load_model(
     model_name: str = "chancharikm/qwen2.5-vl-7b-cam-motion-preview",
 @spaces.GPU(duration=120)
+def inference(
+    video_path: str, prompt: str = "Describe the camera motion in this video."
+):
     # default processor
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+    fps = get_fps_ffmpeg(video_path)
+    logger.info(f"{os.path.basename(video_path)} FPS: {fps}")
     messages = [
         {
             "role": "user",
                 {
                     "type": "video",
                     "video": video_path,
+                    "fps": fps,
                 },
+                {"type": "text", "text": prompt},
             ],
         }
     ]

pyproject.toml CHANGED Viewed

@@ -11,4 +11,5 @@ dependencies = [
     "loguru>=0.7.3",
     "qwen-vl-utils>=0.0.11",
     "torchvision==0.19.0",
 ]

     "loguru>=0.7.3",
     "qwen-vl-utils>=0.0.11",
     "torchvision==0.19.0",
+    "ffmpeg-python>=0.2.0"
 ]