anaspro committed on
Commit
154d3ef
·
1 Parent(s): d7e9b4a
Files changed (2) hide show
  1. app.py +11 -2
  2. test_model.py +12 -3
app.py CHANGED
@@ -24,12 +24,21 @@ model_path = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"
24
  hf_token = os.getenv("HF_TOKEN")
25
 
26
  # Initialize pipeline for chat
 
27
  pipeline_model = pipeline(
28
  "text-generation",
29
  model=model_path,
30
- device_map="auto",
 
31
  token=hf_token,
32
- trust_remote_code=True
 
 
 
 
 
 
 
33
  )
34
 
35
  def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7, top_p=0.9, top_k=50, repetition_penalty=1.0):
 
24
  hf_token = os.getenv("HF_TOKEN")
25
 
26
  # Initialize pipeline for chat
27
+ # For quantized models, use device=0 instead of device_map="auto" to avoid meta tensor issues
28
  pipeline_model = pipeline(
29
  "text-generation",
30
  model=model_path,
31
+ device=0, # Use GPU device directly
32
+ torch_dtype=torch.bfloat16,
33
  token=hf_token,
34
+ trust_remote_code=True,
35
+ model_kwargs={
36
+ "torch_dtype": torch.bfloat16,
37
+ "load_in_4bit": True,
38
+ "bnb_4bit_compute_dtype": torch.bfloat16,
39
+ "bnb_4bit_use_double_quant": False,
40
+ "bnb_4bit_quant_type": "nf4",
41
+ }
42
  )
43
 
44
  def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7, top_p=0.9, top_k=50, repetition_penalty=1.0):
test_model.py CHANGED
@@ -6,7 +6,7 @@ import torch
6
  import transformers
7
  from transformers import pipeline
8
 
9
- model_path = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
10
 
11
  # إذا كان فيه HF_TOKEN في البيئة
12
  hf_token = os.getenv("HF_TOKEN")
@@ -14,12 +14,21 @@ hf_token = os.getenv("HF_TOKEN")
14
  print("Loading model...")
15
  try:
16
  # Initialize pipeline for chat
 
17
  pipeline_model = pipeline(
18
  "text-generation",
19
  model=model_path,
20
- device_map="auto",
 
21
  token=hf_token,
22
- trust_remote_code=True
 
 
 
 
 
 
 
23
  )
24
 
25
  print("Model loaded successfully!")
 
6
  import transformers
7
  from transformers import pipeline
8
 
9
+ model_path = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"
10
 
11
  # إذا كان فيه HF_TOKEN في البيئة
12
  hf_token = os.getenv("HF_TOKEN")
 
14
  print("Loading model...")
15
  try:
16
  # Initialize pipeline for chat
17
+ # For quantized models, use device=0 instead of device_map="auto" to avoid meta tensor issues
18
  pipeline_model = pipeline(
19
  "text-generation",
20
  model=model_path,
21
+ device=0, # Use GPU device directly
22
+ torch_dtype=torch.bfloat16,
23
  token=hf_token,
24
+ trust_remote_code=True,
25
+ model_kwargs={
26
+ "torch_dtype": torch.bfloat16,
27
+ "load_in_4bit": True,
28
+ "bnb_4bit_compute_dtype": torch.bfloat16,
29
+ "bnb_4bit_use_double_quant": False,
30
+ "bnb_4bit_quant_type": "nf4",
31
+ }
32
  )
33
 
34
  print("Model loaded successfully!")