anaspro committed on
Commit
24d5388
·
1 Parent(s): 79983eb
Files changed (1) hide show
  1. app.py +3 -2
app.py CHANGED
@@ -27,12 +27,12 @@ tokenizer = AutoTokenizer.from_pretrained(
27
 
28
  model = AutoModelForCausalLM.from_pretrained(
29
  model_path,
30
- device_map="auto",
31
  trust_remote_code=True,
32
  token=hf_token,
33
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
34
  low_cpu_mem_usage=True
35
- )
36
  print("تم تحميل المودل بنجاح!")
37
 
38
  if tokenizer.pad_token is None:
@@ -46,6 +46,7 @@ def get_response(text, tokenizer=tokenizer, model=model):
46
  generate_ids = model.generate(
47
  input_ids,
48
  attention_mask=attention_mask,
 
49
  top_p=0.8,
50
  temperature=0.2,
51
  max_length=input_len + 256, # Limit response length to prevent multiple responses
 
27
 
28
  model = AutoModelForCausalLM.from_pretrained(
29
  model_path,
30
+ device_map=None, # إزالة device_map لتجنب مشاكل مع past_key_values
31
  trust_remote_code=True,
32
  token=hf_token,
33
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
34
  low_cpu_mem_usage=True
35
+ ).to(device) # نقل المودل إلى الجهاز المحدد
36
  print("تم تحميل المودل بنجاح!")
37
 
38
  if tokenizer.pad_token is None:
 
46
  generate_ids = model.generate(
47
  input_ids,
48
  attention_mask=attention_mask,
49
+ past_key_values=None, # إضافة past_key_values صراحة لتجنب الأخطاء
50
  top_p=0.8,
51
  temperature=0.2,
52
  max_length=input_len + 256, # Limit response length to prevent multiple responses