Alvaro8gb committed on
Commit
3acb6f9
·
verified ·
1 Parent(s): 5ae758d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -20
app.py CHANGED
@@ -1,22 +1,40 @@
1
  import os
2
  import gradio as gr
 
 
 
3
 
4
- # Constants for generation parameters
5
  MAX_NEW_TOKENS = 100
6
  TEMPERATURE = 0.5
7
  TOP_P = 0.95
8
  TOP_K = 50
9
  REPETITION_PENALTY = 1.05
10
 
11
-
12
- # Global variables to store model and tokenizer
13
- model = None
14
- tokenizer = None
15
 
16
  def load_model():
17
- global model, tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  return model, tokenizer
19
 
 
 
20
 
21
  def generate_response(input_text, max_tokens, temperature, top_p, repetition_penalty):
22
  global model, tokenizer
@@ -24,23 +42,41 @@ def generate_response(input_text, max_tokens, temperature, top_p, repetition_pen
24
  if model is None or tokenizer is None:
25
  model, tokenizer = load_model()
26
 
 
27
 
28
- return "Adios"
 
 
 
 
 
 
 
 
 
29
 
30
- def chat_interface(message, history, system_message, max_tokens, temperature, top_p, repetition_penalty):
 
 
 
 
 
 
 
31
 
32
- prompt = f"{message}"
33
- if system_message:
34
- prompt = f"{system_message}\n{message}"
35
-
36
- response = generate_response(
37
- prompt,
38
- max_tokens,
39
- temperature,
40
- top_p,
41
- repetition_penalty
42
- )
43
- return response
 
44
 
45
  demo = gr.ChatInterface(
46
  chat_interface,
 
1
  import os
2
  import gradio as gr
3
+ import torch
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
+ from peft import PeftModel, PeftConfig
6
 
 
7
  MAX_NEW_TOKENS = 100
8
  TEMPERATURE = 0.5
9
  TOP_P = 0.95
10
  TOP_K = 50
11
  REPETITION_PENALTY = 1.05
12
 
13
+ HF_TOKEN = os.getenv('HF_TOKEN')
 
 
 
14
 
15
def load_model():
    """Load the base causal LM, attach the LoRA adapter, and build the tokenizer.

    The base checkpoint and its tokenizer are both fetched from
    ``base_model_id`` using ``HF_TOKEN``; the adapter weights come from
    ``peft_model_id``.

    Returns:
        A ``(model, tokenizer)`` tuple: the PEFT-wrapped model and the base
        model's tokenizer, with a pad token guaranteed to be set.
    """
    base_model_id = "meta-llama/Llama-2-7b-hf"
    peft_model_id = "somosnlp-hackathon-2025/Llama-2-7b-hf-lora-refranes"

    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype="auto",
        device_map="auto",
        token=HF_TOKEN,
    )

    # PeftModel.from_pretrained resolves the adapter config on its own, so no
    # separate PeftConfig round-trip to the hub is needed here.
    model = PeftModel.from_pretrained(base_model, peft_model_id)

    # The tokenizer lives in the same repository as the base weights, so it
    # needs the same credential; without it the download fails whenever the
    # repo requires authentication.
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, token=HF_TOKEN)
    if tokenizer.pad_token is None:
        # Fall back to EOS so padding-aware code paths have a valid token.
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer
35
 
36
+ model = None
37
+ tokenizer = None
38
 
39
def generate_response(input_text, max_tokens, temperature, top_p, repetition_penalty):
    """Generate a completion for ``input_text`` with the lazily loaded model.

    The model and tokenizer are loaded on first use and cached in the
    module-level ``model`` / ``tokenizer`` globals.

    Args:
        input_text: Prompt string fed to the tokenizer as-is.
        max_tokens: Maximum number of new tokens to generate; coerced to
            ``int`` because UI controls may deliver a float.
        temperature: Sampling temperature. A missing or non-positive value
            switches to greedy decoding instead of sampling.
        top_p: Nucleus-sampling probability mass (used only when sampling).
        repetition_penalty: Penalty applied to already generated tokens.

    Returns:
        The text after the first ``"->"`` in the decoded output, stripped, or
        the whole decoded output (stripped) when no ``"->"`` is present.
    """
    global model, tokenizer

    if model is None or tokenizer is None:
        model, tokenizer = load_model()

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_new_tokens": int(max_tokens),
        "repetition_penalty": repetition_penalty,
        # Passed explicitly so generate() does not have to guess a pad token.
        "pad_token_id": tokenizer.pad_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=TOP_K,
        )
    else:
        # Sampling requires a strictly positive temperature; decode greedily.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # NOTE(review): the decoded text includes the prompt, so a "->" inside
    # the prompt itself is also a split point -- confirm this is intended.
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    _, separator, answer = full_response.partition("->")
    if separator:
        return answer.strip()

    return full_response.strip()
66
 
67
def chat_interface(message, history, system_message, max_tokens, temperature, top_p, repetition_penalty):
    """Build the prompt for one chat turn and return the generated reply.

    When ``system_message`` is non-empty it is placed on its own line before
    ``message``; otherwise the prompt is ``message`` alone. ``history`` is
    accepted but not used when building the prompt.
    """
    prompt = f"{system_message}\n{message}" if system_message else f"{message}"
    return generate_response(prompt, max_tokens, temperature, top_p, repetition_penalty)
80
 
81
  demo = gr.ChatInterface(
82
  chat_interface,