Spaces:
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,7 +17,6 @@ import requests
|
|
| 17 |
from transformers import (
|
| 18 |
Qwen2VLForConditionalGeneration,
|
| 19 |
Qwen2_5_VLForConditionalGeneration,
|
| 20 |
-
AutoModelForImageTextToText,
|
| 21 |
AutoProcessor,
|
| 22 |
TextIteratorStreamer,
|
| 23 |
AutoModel,
|
|
@@ -30,8 +29,20 @@ MAX_MAX_NEW_TOKENS = 4096
|
|
| 30 |
DEFAULT_MAX_NEW_TOKENS = 2048
|
| 31 |
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
|
| 32 |
|
| 33 |
-
|
|
|
|
|
|
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
# --- Model Loading ---
|
| 36 |
|
| 37 |
# To address the warnings, we add `use_fast=False` to ensure we use the
|
|
@@ -81,7 +92,9 @@ model_v4 = AutoModel.from_pretrained(
|
|
| 81 |
MODEL_ID_V4,
|
| 82 |
trust_remote_code=True,
|
| 83 |
torch_dtype=torch.bfloat16,
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
).eval().to(device)
|
| 86 |
tokenizer_v4 = AutoTokenizer.from_pretrained(MODEL_ID_V4, trust_remote_code=True, use_fast=False)
|
| 87 |
|
|
@@ -312,4 +325,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
| 312 |
)
|
| 313 |
|
| 314 |
if __name__ == "__main__":
|
| 315 |
-
demo.queue(max_size=50).launch(share=True, show_error=True)
|
|
|
|
| 17 |
from transformers import (
|
| 18 |
Qwen2VLForConditionalGeneration,
|
| 19 |
Qwen2_5_VLForConditionalGeneration,
|
|
|
|
| 20 |
AutoProcessor,
|
| 21 |
TextIteratorStreamer,
|
| 22 |
AutoModel,
|
|
|
|
| 29 |
DEFAULT_MAX_NEW_TOKENS = 2048
|
| 30 |
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
|
| 31 |
|
| 32 |
+
# Let the environment (e.g., Hugging Face Spaces) determine the device.
|
| 33 |
+
# This avoids conflicts with the CUDA environment setup by the platform.
|
| 34 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 35 |
|
| 36 |
+
print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
|
| 37 |
+
print("torch.__version__ =", torch.__version__)
|
| 38 |
+
print("torch.version.cuda =", torch.version.cuda)
|
| 39 |
+
print("cuda available:", torch.cuda.is_available())
|
| 40 |
+
print("cuda device count:", torch.cuda.device_count())
|
| 41 |
+
if torch.cuda.is_available():
|
| 42 |
+
print("current device:", torch.cuda.current_device())
|
| 43 |
+
print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
|
| 44 |
+
|
| 45 |
+
print("Using device:", device)
|
| 46 |
# --- Model Loading ---
|
| 47 |
|
| 48 |
# To address the warnings, we add `use_fast=False` to ensure we use the
|
|
|
|
| 92 |
MODEL_ID_V4,
|
| 93 |
trust_remote_code=True,
|
| 94 |
torch_dtype=torch.bfloat16,
|
| 95 |
+
# Using 'sdpa' can sometimes cause issues in certain environments,
|
| 96 |
+
# letting transformers choose the default is safer.
|
| 97 |
+
# attn_implementation='sdpa'
|
| 98 |
).eval().to(device)
|
| 99 |
tokenizer_v4 = AutoTokenizer.from_pretrained(MODEL_ID_V4, trust_remote_code=True, use_fast=False)
|
| 100 |
|
|
|
|
| 325 |
)
|
| 326 |
|
| 327 |
if __name__ == "__main__":
|
| 328 |
+
demo.queue(max_size=50).launch(share=True, ssr_mode=False, show_error=True)
|