multiinputgenerativevideo

No application file

App Files Files

linoyts HF Staff committed on Mar 17

Commit

fb8d538

verified ·

1 Parent(s): 1ac11cf

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -22

app.py CHANGED Viewed

@@ -1030,6 +1030,9 @@ def generate_video(
 # SmolVLM2 — Auto-describe motion from reference video
 # ─────────────────────────────────────────────────────────────────────────────
 SMOLVLM_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
 _vlm_model = None
 _vlm_processor = None
@@ -1055,37 +1058,19 @@ def _load_vlm():
     if _vlm_model is None:
         from transformers import AutoProcessor, AutoModelForImageTextToText
-        # Diagnostic: surface the real import error
-        try:
-            from transformers import SmolVLMProcessor
-            print(f"[SmolVLM] SmolVLMProcessor import OK: {SmolVLMProcessor}")
-        except ImportError as diag_e:
-            print(f"[SmolVLM] SmolVLMProcessor direct import failed: {diag_e}")
-            # Try to see what's actually missing
-            try:
-                import num2words
-                print(f"[SmolVLM] num2words OK: {num2words.__version__}")
-            except ImportError:
-                print("[SmolVLM] num2words is MISSING — installing now...")
-                subprocess.run([sys.executable, "-m", "pip", "install", "num2words"], check=True)
-            try:
-                import decord
-                print(f"[SmolVLM] decord OK")
-            except ImportError:
-                print("[SmolVLM] decord is MISSING — installing now...")
-                subprocess.run([sys.executable, "-m", "pip", "install", "decord"], check=True)
-        print(f"[SmolVLM] Loading {SMOLVLM_MODEL_ID}...")
-        _vlm_processor = AutoProcessor.from_pretrained(SMOLVLM_MODEL_ID)
         try:
             _vlm_model = AutoModelForImageTextToText.from_pretrained(
                 SMOLVLM_MODEL_ID,
                 torch_dtype=torch.bfloat16,
                 _attn_implementation="flash_attention_2",
             ).to("cuda")
         except Exception:
             _vlm_model = AutoModelForImageTextToText.from_pretrained(
                 SMOLVLM_MODEL_ID,
                 torch_dtype=torch.bfloat16,
             ).to("cuda")
         print("[SmolVLM] Model loaded!")

 # SmolVLM2 — Auto-describe motion from reference video
 # ─────────────────────────────────────────────────────────────────────────────
 SMOLVLM_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
+# Pin to a revision known to work with transformers==4.57.6
+# (the main branch updated processor_config.json to reference a newer processor class)
+SMOLVLM_REVISION = "3444947b810d9efa1173515e44396d7710ba1042"
 _vlm_model = None
 _vlm_processor = None
     if _vlm_model is None:
         from transformers import AutoProcessor, AutoModelForImageTextToText
+        print(f"[SmolVLM] Loading {SMOLVLM_MODEL_ID} (rev {SMOLVLM_REVISION[:8]})...")
+        _vlm_processor = AutoProcessor.from_pretrained(SMOLVLM_MODEL_ID, revision=SMOLVLM_REVISION)
         try:
             _vlm_model = AutoModelForImageTextToText.from_pretrained(
                 SMOLVLM_MODEL_ID,
+                revision=SMOLVLM_REVISION,
                 torch_dtype=torch.bfloat16,
                 _attn_implementation="flash_attention_2",
             ).to("cuda")
         except Exception:
             _vlm_model = AutoModelForImageTextToText.from_pretrained(
                 SMOLVLM_MODEL_ID,
+                revision=SMOLVLM_REVISION,
                 torch_dtype=torch.bfloat16,
             ).to("cuda")
         print("[SmolVLM] Model loaded!")