anaspro committed
Commit 154d3ef · 1 Parent(s): d7e9b4a
upadte
Files changed:
- app.py +11 -2
- test_model.py +12 -3
app.py
CHANGED
@@ -24,12 +24,21 @@ model_path = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"
 hf_token = os.getenv("HF_TOKEN")
 
 # Initialize pipeline for chat
+# For quantized models, use device=0 instead of device_map="auto" to avoid meta tensor issues
 pipeline_model = pipeline(
     "text-generation",
     model=model_path,
-    device_map="auto",
+    device=0,  # Use GPU device directly
+    torch_dtype=torch.bfloat16,
     token=hf_token,
-    trust_remote_code=True
+    trust_remote_code=True,
+    model_kwargs={
+        "torch_dtype": torch.bfloat16,
+        "load_in_4bit": True,
+        "bnb_4bit_compute_dtype": torch.bfloat16,
+        "bnb_4bit_use_double_quant": False,
+        "bnb_4bit_quant_type": "nf4",
+    }
 )
 
 def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7, top_p=0.9, top_k=50, repetition_penalty=1.0):
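The app.py hunk ends at the signature of generate_with_pipeline; its body is unchanged by this commit and not shown. For orientation, here is a minimal sketch of how such a wrapper would typically forward its sampling parameters to the pipeline configured above; the body is an assumption, not the file's actual code.

# Sketch only: the commit shows just the signature; this body is assumed.
def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7,
                           top_p=0.9, top_k=50, repetition_penalty=1.0):
    # text-generation pipelines accept chat-style messages
    # ([{"role": ..., "content": ...}]) and apply the model's chat template.
    outputs = pipeline_model(
        messages,
        max_new_tokens=max_new_tokens,
        do_sample=True,  # sampling must be on for temperature/top_p/top_k to take effect
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )
    # For chat input, generated_text holds the conversation with the new
    # assistant turn appended; return just that reply.
    return outputs[0]["generated_text"][-1]["content"]

reply = generate_with_pipeline([{"role": "user", "content": "Hello!"}])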
test_model.py
CHANGED
@@ -6,7 +6,7 @@ import torch
 import transformers
 from transformers import pipeline
 
-model_path = "unsloth/
+model_path = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"
 
 # If HF_TOKEN is present in the environment
 hf_token = os.getenv("HF_TOKEN")
@@ -14,12 +14,21 @@ hf_token = os.getenv("HF_TOKEN")
 print("Loading model...")
 try:
     # Initialize pipeline for chat
+    # For quantized models, use device=0 instead of device_map="auto" to avoid meta tensor issues
     pipeline_model = pipeline(
         "text-generation",
         model=model_path,
-        device_map="auto",
+        device=0,  # Use GPU device directly
+        torch_dtype=torch.bfloat16,
         token=hf_token,
-        trust_remote_code=True
+        trust_remote_code=True,
+        model_kwargs={
+            "torch_dtype": torch.bfloat16,
+            "load_in_4bit": True,
+            "bnb_4bit_compute_dtype": torch.bfloat16,
+            "bnb_4bit_use_double_quant": False,
+            "bnb_4bit_quant_type": "nf4",
+        }
     )
 
     print("Model loaded successfully!")
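A note on the model_kwargs in both hunks: the flat load_in_4bit / bnb_4bit_* keys are the legacy spelling of bitsandbytes options, and recent transformers releases prefer an explicit BitsAndBytesConfig passed as quantization_config. Below is a sketch of the equivalent explicit form, using the same settings as the commit; the variable names are illustrative.

import os
import torch
from transformers import BitsAndBytesConfig, pipeline

# Same quantization settings as the commit, spelled as an explicit config.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
)

pipeline_model = pipeline(
    "text-generation",
    model="unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    device=0,  # pin to one GPU rather than device_map="auto", as in the commit
    torch_dtype=torch.bfloat16,
    token=os.getenv("HF_TOKEN"),
    trust_remote_code=True,
    model_kwargs={"quantization_config": bnb_config},
)

Since the checkpoint name ends in bnb-4bit, it likely ships pre-quantized, so the explicit config may be redundant; it is shown here only to make the commit's quantization intent legible.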