LiamNguyenNOR committed on
Commit b93f452 · verified · 1 Parent(s): 7e70322

Update app.py

Files changed (1): app.py +58 -37
app.py CHANGED
@@ -1,54 +1,76 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
 
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("NorwAI/NorwAI-Llama2-7B")
 
 
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
 
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
 
-    messages.append({"role": "user", "content": message})
 
-    response = ""
 
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
 
-        response += token
-        yield response
 
 
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
@@ -59,6 +81,5 @@ demo = gr.ChatInterface(
     ],
 )
 
-
 if __name__ == "__main__":
-    demo.launch()
 
 import gradio as gr
+import torch
+from threading import Thread
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
+# --- Configuration ---
+MODEL_NAME = "NorwAI/NorwAI-Llama2-7B"  # "google/gemma-2-9b"
 
+# --- Model Loading (Explicit) ---
+# Use a try-except block to surface loading errors (missing weights, out of memory).
+try:
+    # Load the tokenizer.
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
+    # Load the model with appropriate configurations.
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME,
+        device_map="auto",           # Let Transformers handle device placement.
+        torch_dtype=torch.bfloat16,  # Reduce memory usage where the hardware supports bfloat16.
+    )
 
+except Exception as e:
+    print(f"Error loading model: {e}")
+    raise
 
+# --- Inference Function ---
+def respond(message, history, system_message, max_tokens, temperature, top_p):
+    try:
+        # Build the conversation history. Note: the <start_of_turn>/<end_of_turn>
+        # markers are the Gemma chat format; adjust them if the configured model
+        # expects a different template.
+        formatted_history = ""
+        for user_msg, model_msg in history:
+            formatted_history += f"<start_of_turn>user\n{user_msg}<end_of_turn>\n"
+            if model_msg:  # The newest turn has no model reply yet.
+                formatted_history += f"<start_of_turn>model\n{model_msg}<end_of_turn>\n"
 
+        # Combine system message, history, and the current message.
+        prompt = (
+            f"<start_of_turn>system\n{system_message}<end_of_turn>\n"
+            f"{formatted_history}"
+            f"<start_of_turn>user\n{message}<end_of_turn>\n"
+            f"<start_of_turn>model\n"
+        )
 
+        # Tokenize the input and move it to the model's device.
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
+        # Generate with streaming (important for a chatbot). generate() blocks until
+        # it finishes, so it runs in a background thread while TextIteratorStreamer
+        # hands decoded text back to this generator.
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = dict(
+            **inputs,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=True,  # Enable sampling for more diverse responses.
+            streamer=streamer,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+        Thread(target=model.generate, kwargs=generation_kwargs).start()
 
+        # Accumulate the streamed text and yield the growing response.
+        response = ""
+        for new_text in streamer:
+            response += new_text
+            yield response
 
+    except Exception as e:
+        print(f"Error during inference: {e}")
+        yield "An error occurred during generation."
 
+# --- Gradio Interface ---
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
 
     ],
 )
 
 if __name__ == "__main__":
+    demo.launch()
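
One caveat on the prompt construction: the `<start_of_turn>` markers come from Gemma's chat format (matching the commented-out `google/gemma-2-9b` alternative), while the configured checkpoint is a Llama-2 derivative. Where the tokenizer ships a chat template, `apply_chat_template` is the less brittle route. A minimal sketch, assuming such a template exists for the checkpoint (a base model may not ship one, and some templates reject "system" roles):

```python
# Sketch: build the prompt from the tokenizer's bundled chat template instead of
# hard-coding Gemma-style <start_of_turn> markers. Assumes the checkpoint actually
# ships a chat template; verify before relying on it.
messages = [
    {"role": "system", "content": "You are a friendly Chatbot."},
    {"role": "user", "content": "Hei! Hvordan går det?"},
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # Return the formatted string, not token IDs.
    add_generation_prompt=True,  # Append the cue that starts the model's reply.
)
```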
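Because `respond` is a plain Python generator, the streaming path can be smoke-tested without launching the Gradio UI. A hypothetical check (the prompt and sampling values below are illustrative, not part of the commit; importing `app` triggers the full model load):

```python
# Consume the streaming generator directly, bypassing gr.ChatInterface.
from app import respond

history = []  # gr.ChatInterface passes (user, assistant) pairs here; none yet.
stream = respond(
    "Hva er hovedstaden i Norge?",  # Arbitrary sample prompt.
    history,
    system_message="You are a friendly Chatbot.",
    max_tokens=64,
    temperature=0.7,
    top_p=0.95,
)

# Each yielded value is the accumulated response so far; keep only the last.
final = ""
for partial in stream:
    final = partial
print(final)
```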