eduard76 committed on
Commit
3a7ce12
·
verified ·
1 Parent(s): ba8e18b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -18
app.py CHANGED
@@ -4,30 +4,30 @@ import gradio as gr
4
 
5
  model_id = "eduard76/Llama3-8b-good-new"
6
 
 
 
 
 
 
 
 
7
  tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
8
  model = AutoModelForCausalLM.from_pretrained(
9
  model_id,
10
- device_map="auto", # automatically uses GPU if available
11
  torch_dtype=torch.float16,
12
- load_in_4bit=True,
13
  trust_remote_code=True
14
  )
15
- model.eval()
16
 
17
- def chat(user_input, history):
18
- history_text = "\n".join([f"User: {u}\nAI: {a}" for u, a in history])
19
- prompt = f"{history_text}\nUser: {user_input}\nAI:"
 
 
 
20
 
21
- inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
22
- with torch.no_grad():
23
- outputs = model.generate(
24
- **inputs,
25
- max_new_tokens=512,
26
- do_sample=True,
27
- temperature=0.001
28
- )
29
- generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
30
- answer = generated.split("AI:")[-1].strip()
31
- return answer
32
 
33
- gr.ChatInterface(chat, title="💬 Chat with first Eduard LLM").launch()
 
 
4
 
5
  model_id = "eduard76/Llama3-8b-good-new"
6
 
7
# 4-bit NF4 quantization config so the 8B model fits in modest GPU memory;
# double quantization shaves a bit more, and fp16 is used for compute.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
13
+
14
# Load the tokenizer and the 4-bit quantized model. device_map="auto" lets
# accelerate place layers on GPU when one is available, CPU otherwise.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quant_config,
)
 
22
 
23
# Text-generation pipeline wrapping the quantized model and its tokenizer.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
24
+
25
def chat(user_input):
    """Generate a single-turn reply for *user_input*.

    Wraps the input in a ``User: ... / AI:`` prompt, samples up to 200 new
    tokens (temperature 0.7), and returns only the newly generated text.
    """
    prompt = f"User: {user_input}\nAI:"
    # return_full_text=False asks the pipeline for just the completion.
    # This is more robust than slicing with response[len(prompt):]: decoding
    # may normalize whitespace or special tokens, so the echoed prompt is
    # not guaranteed to match the input string byte-for-byte.
    result = pipe(
        prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,
    )
    return result[0]["generated_text"].strip()
29
 
30
# Minimal single-turn web UI: one text box in, one text box out.
iface = gr.Interface(
    fn=chat,
    inputs="text",
    outputs="text",
    title="Llama3 8B Chat",
)
 
 
 
 
 
 
 
 
 
 
31
 
32
# Start the Gradio server only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()