prakhardoneria commited on
Commit
9a71b5a
·
verified ·
1 Parent(s): 989ca7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -28
app.py CHANGED
@@ -1,36 +1,37 @@
1
- import os
 
2
  import gradio as gr
3
- from huggingface_hub import hf_hub_download
4
- from llama_cpp import Llama
5
 
6
- # Model info
7
- REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
8
- FILENAME = "TinyLlama-1.1B-Chat-v1.0.Q4_K_M.gguf"
9
 
10
- # Download model (if not already)
11
- model_path = hf_hub_download(repo_id=REPO, filename=FILENAME, cache_dir="./models")
12
-
13
- # Load model with llama-cpp
14
- llm = Llama(
15
- model_path=model_path,
16
- n_ctx=2048,
17
- n_threads=4,
18
- use_mlock=True
19
  )
20
 
21
- # Chat prompt wrapper
22
- def format_prompt(message, history):
23
- conversation = ""
24
- for user, bot in history:
25
- conversation += f"<|user|>\n{user.strip()}\n<|assistant|>\n{bot.strip()}\n"
26
- conversation += f"<|user|>\n{message.strip()}\n<|assistant|>\n"
27
- return conversation
28
 
 
29
  def chat(message, history):
30
- prompt = format_prompt(message, history)
31
- output = llm(prompt, max_tokens=256, temperature=0.7, top_p=0.9, stop=["<|user|>", "<|assistant|>"])
32
- reply = output["choices"][0]["text"].strip()
33
- return reply
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- # Gradio chat UI
36
- gr.ChatInterface(chat, title="TinyLlama CPU Chat", description="Lightweight local LLM (1.1B) powered by llama.cpp.").launch()
 
1
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import gradio as gr

# Model weights are downloaded and cached by the Hugging Face hub on first use.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # fp16 only when a CUDA device exists; many fp16 ops are slow/unsupported on CPU.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",  # place weights on GPU when available, otherwise CPU
)

# NOTE(review): this streamer is constructed but never passed to
# model.generate(), so responses are NOT actually streamed to the UI —
# either pass `streamer=streamer` into generate() or delete this line.
streamer = TextStreamer(tokenizer, skip_prompt=True)
 
 
 
 
 
 
16
 
17
def chat(message, history):
    """Generate one assistant reply for a Gradio ChatInterface turn.

    Args:
        message: The latest user message (str).
        history: List of (user, assistant) string pairs from prior turns.

    Returns:
        The model's reply as a plain string.
    """
    # Rebuild the Mistral-Instruct prompt: every past turn, then the new message.
    prompt = ""
    for user, bot in history:
        prompt += f"[INST] {user.strip()} [/INST] {bot.strip()} "
    prompt += f"[INST] {message.strip()} [/INST]"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode ONLY the newly generated tokens. The previous approach decoded
    # the full sequence and split on "[/INST]", which returns the wrong text
    # whenever the user's message itself contains that marker.
    prompt_len = inputs["input_ids"].shape[-1]
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True).strip()
35
 
36
# Assemble the chat UI, then start the Gradio server.
demo = gr.ChatInterface(
    fn=chat,
    title="Mistral Chat (CPU)",
    description="Ask questions, get answers using a real LLM.",
)
demo.launch()