Spaces:
No application file
No application file
Update app.py
Browse files
app.py
CHANGED
|
@@ -1030,6 +1030,9 @@ def generate_video(
|
|
| 1030 |
# SmolVLM2 — Auto-describe motion from reference video
|
| 1031 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 1032 |
SMOLVLM_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
|
|
|
|
|
|
|
|
|
|
| 1033 |
_vlm_model = None
|
| 1034 |
_vlm_processor = None
|
| 1035 |
|
|
@@ -1055,37 +1058,19 @@ def _load_vlm():
|
|
| 1055 |
if _vlm_model is None:
|
| 1056 |
from transformers import AutoProcessor, AutoModelForImageTextToText
|
| 1057 |
|
| 1058 |
-
|
| 1059 |
-
|
| 1060 |
-
from transformers import SmolVLMProcessor
|
| 1061 |
-
print(f"[SmolVLM] SmolVLMProcessor import OK: {SmolVLMProcessor}")
|
| 1062 |
-
except ImportError as diag_e:
|
| 1063 |
-
print(f"[SmolVLM] SmolVLMProcessor direct import failed: {diag_e}")
|
| 1064 |
-
# Try to see what's actually missing
|
| 1065 |
-
try:
|
| 1066 |
-
import num2words
|
| 1067 |
-
print(f"[SmolVLM] num2words OK: {num2words.__version__}")
|
| 1068 |
-
except ImportError:
|
| 1069 |
-
print("[SmolVLM] num2words is MISSING β installing now...")
|
| 1070 |
-
subprocess.run([sys.executable, "-m", "pip", "install", "num2words"], check=True)
|
| 1071 |
-
try:
|
| 1072 |
-
import decord
|
| 1073 |
-
print(f"[SmolVLM] decord OK")
|
| 1074 |
-
except ImportError:
|
| 1075 |
-
print("[SmolVLM] decord is MISSING β installing now...")
|
| 1076 |
-
subprocess.run([sys.executable, "-m", "pip", "install", "decord"], check=True)
|
| 1077 |
-
|
| 1078 |
-
print(f"[SmolVLM] Loading {SMOLVLM_MODEL_ID}...")
|
| 1079 |
-
_vlm_processor = AutoProcessor.from_pretrained(SMOLVLM_MODEL_ID)
|
| 1080 |
try:
|
| 1081 |
_vlm_model = AutoModelForImageTextToText.from_pretrained(
|
| 1082 |
SMOLVLM_MODEL_ID,
|
|
|
|
| 1083 |
torch_dtype=torch.bfloat16,
|
| 1084 |
_attn_implementation="flash_attention_2",
|
| 1085 |
).to("cuda")
|
| 1086 |
except Exception:
|
| 1087 |
_vlm_model = AutoModelForImageTextToText.from_pretrained(
|
| 1088 |
SMOLVLM_MODEL_ID,
|
|
|
|
| 1089 |
torch_dtype=torch.bfloat16,
|
| 1090 |
).to("cuda")
|
| 1091 |
print("[SmolVLM] Model loaded!")
|
|
|
|
| 1030 |
# SmolVLM2 — Auto-describe motion from reference video
|
| 1031 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 1032 |
SMOLVLM_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
|
| 1033 |
+
# Pin to a revision known to work with transformers==4.57.6
|
| 1034 |
+
# (the main branch updated processor_config.json to reference a newer processor class)
|
| 1035 |
+
SMOLVLM_REVISION = "3444947b810d9efa1173515e44396d7710ba1042"
|
| 1036 |
_vlm_model = None
|
| 1037 |
_vlm_processor = None
|
| 1038 |
|
|
|
|
| 1058 |
if _vlm_model is None:
|
| 1059 |
from transformers import AutoProcessor, AutoModelForImageTextToText
|
| 1060 |
|
| 1061 |
+
print(f"[SmolVLM] Loading {SMOLVLM_MODEL_ID} (rev {SMOLVLM_REVISION[:8]})...")
|
| 1062 |
+
_vlm_processor = AutoProcessor.from_pretrained(SMOLVLM_MODEL_ID, revision=SMOLVLM_REVISION)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1063 |
try:
|
| 1064 |
_vlm_model = AutoModelForImageTextToText.from_pretrained(
|
| 1065 |
SMOLVLM_MODEL_ID,
|
| 1066 |
+
revision=SMOLVLM_REVISION,
|
| 1067 |
torch_dtype=torch.bfloat16,
|
| 1068 |
_attn_implementation="flash_attention_2",
|
| 1069 |
).to("cuda")
|
| 1070 |
except Exception:
|
| 1071 |
_vlm_model = AutoModelForImageTextToText.from_pretrained(
|
| 1072 |
SMOLVLM_MODEL_ID,
|
| 1073 |
+
revision=SMOLVLM_REVISION,
|
| 1074 |
torch_dtype=torch.bfloat16,
|
| 1075 |
).to("cuda")
|
| 1076 |
print("[SmolVLM] Model loaded!")
|