prakhardoneria committed on
Commit
282328e
·
verified ·
1 Parent(s): c5a5582

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -13
app.py CHANGED
@@ -1,37 +1,38 @@
1
  import torch
2
- from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
3
  import gradio as gr
4
 
5
- # Load model (automatically downloaded and cached by Hugging Face)
6
- model_id = "mistralai/Mistral-7B-Instruct-v0.1"
7
 
8
  tokenizer = AutoTokenizer.from_pretrained(model_id)
9
  model = AutoModelForCausalLM.from_pretrained(
10
  model_id,
11
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
12
  device_map="auto"
13
  )
14
 
15
  streamer = TextStreamer(tokenizer, skip_prompt=True)
16
 
17
- # Simple chat loop
18
  def chat(message, history):
19
  prompt = ""
20
  for user, bot in history:
21
- prompt += f"[INST] {user.strip()} [/INST] {bot.strip()} "
22
- prompt += f"[INST] {message.strip()} [/INST]"
23
 
24
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
25
- output = model.generate(
26
  **inputs,
27
  max_new_tokens=256,
28
  temperature=0.7,
 
29
  do_sample=True,
30
- top_p=0.95,
31
  pad_token_id=tokenizer.eos_token_id
32
  )
33
- result = tokenizer.decode(output[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
34
- return result
 
35
 
36
- # Launch Gradio app
37
- gr.ChatInterface(fn=chat, title="Mistral Chat (CPU)", description="Ask questions, get answers using a real LLM.").launch()
 
1
  import torch
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
3
  import gradio as gr
4
 
5
# Lightweight, publicly hosted chat model — no gated access needed.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,  # full precision; safe on CPU-only hosts
    device_map="auto",
)

# Streams decoded tokens to stdout as they are generated.
# NOTE(review): `streamer` is never passed to model.generate() below —
# presumably left over from an earlier revision; confirm before removing.
streamer = TextStreamer(tokenizer, skip_prompt=True)
16
 
17
# Chat prompt formatting + generation
def chat(message, history):
    """Generate a reply to *message* given the prior conversation.

    Parameters:
        message: the new user utterance (str).
        history: list of (user, bot) string pairs from earlier turns.

    Returns:
        The model's newly generated reply text, stripped of whitespace.
    """
    # Rebuild the conversation in TinyLlama's chat markup.
    # "".join avoids the quadratic cost of repeated string +=.
    turns = []
    for user, bot in history:
        turns.append(f"<|user|>\n{user.strip()}\n<|assistant|>\n{bot.strip()}\n")
    turns.append(f"<|user|>\n{message.strip()}\n<|assistant|>\n")
    prompt = "".join(turns)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the tokens generated AFTER the prompt. The previous approach
    # — splitting the fully decoded text on "<|assistant|>" — is fragile:
    # skip_special_tokens=True can strip that marker from the decoded string,
    # in which case the split returns the entire conversation instead of the
    # reply. Slicing by prompt length is robust to that.
    prompt_len = inputs["input_ids"].shape[1]
    reply = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return reply.strip()
36
 
37
# Build and start the Gradio chat UI (blocks until the server stops).
gr.ChatInterface(
    fn=chat,
    title="TinyLlama Chat",
    description="Lightweight local LLM (1.1B)",
).launch()