nsultan5 committed on
Commit
8844959
·
verified ·
1 Parent(s): 481bbf7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -4
app.py CHANGED
@@ -26,17 +26,29 @@ if hf_token is None:
26
  raise ValueError("HF_TOKEN environment variable not found. Please add it in your secrets or environment.")
27
 
28
  # Load tokenizer
29
- tokenizer = AutoTokenizer.from_pretrained(llama_model_id, use_auth_token=hf_token)
30
 
31
  # Load quantized model (AutoGPTQ)
 
 
 
 
 
 
 
 
 
 
 
32
  model = AutoGPTQForCausalLM.from_quantized(
33
- llama_model_id,
34
  use_safetensors=True,
 
35
  device="cuda" if torch.cuda.is_available() else "cpu",
36
- use_triton=True, # Use Triton for faster inference if available
37
- use_auth_token=hf_token,
38
  )
39
 
 
40
  # Text generation pipeline
41
  text_pipe = pipeline(
42
  "text-generation",
 
26
  raise ValueError("HF_TOKEN environment variable not found. Please add it in your secrets or environment.")
27
 
28
  # Load tokenizer
29
+
30
 
31
  # Load quantized model (AutoGPTQ)
32
+ from auto_gptq import AutoGPTQForCausalLM
33
+ from transformers import AutoTokenizer
34
+
35
+ model_name = "TheBloke/Llama-2-7B-Chat-GPTQ" # change to your GPTQ model ID
36
+
37
+ tokenizer = AutoTokenizer.from_pretrained(
38
+ model_name,
39
+ token=hf_token,
40
+ trust_remote_code=True
41
+ )
42
+
43
  model = AutoGPTQForCausalLM.from_quantized(
44
+ model_name_or_path=model_name,
45
  use_safetensors=True,
46
+ trust_remote_code=True,
47
  device="cuda" if torch.cuda.is_available() else "cpu",
48
+ token=hf_token
 
49
  )
50
 
51
+
52
  # Text generation pipeline
53
  text_pipe = pipeline(
54
  "text-generation",