prakhardoneria commited on
Commit
9a71b5a
·
verified ·
1 Parent(s): 989ca7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -28
app.py CHANGED
@@ -1,36 +1,37 @@
1
- import os
 
2
  import gradio as gr
3
- from huggingface_hub import hf_hub_download
4
- from llama_cpp import Llama
5
 
6
- # Model info
7
- REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
8
- FILENAME = "TinyLlama-1.1B-Chat-v1.0.Q4_K_M.gguf"
9
 
10
- # Download model (if not already)
11
- model_path = hf_hub_download(repo_id=REPO, filename=FILENAME, cache_dir="./models")
12
-
13
- # Load model with llama-cpp
14
- llm = Llama(
15
- model_path=model_path,
16
- n_ctx=2048,
17
- n_threads=4,
18
- use_mlock=True
19
  )
20
 
21
- # Chat prompt wrapper
22
- def format_prompt(message, history):
23
- conversation = ""
24
- for user, bot in history:
25
- conversation += f"<|user|>\n{user.strip()}\n<|assistant|>\n{bot.strip()}\n"
26
- conversation += f"<|user|>\n{message.strip()}\n<|assistant|>\n"
27
- return conversation
28
 
 
29
  def chat(message, history):
30
- prompt = format_prompt(message, history)
31
- output = llm(prompt, max_tokens=256, temperature=0.7, top_p=0.9, stop=["<|user|>", "<|assistant|>"])
32
- reply = output["choices"][0]["text"].strip()
33
- return reply
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- # Gradio chat UI
36
- gr.ChatInterface(chat, title="TinyLlama CPU Chat", description="Lightweight local LLM (1.1B) powered by llama.cpp.").launch()
 
1
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import gradio as gr

# Model weights are downloaded and cached by the Hugging Face hub on first use.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # fp16 only when a CUDA device exists; many fp16 ops are slow/unsupported on CPU.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",  # place weights on GPU when available, otherwise CPU
)

# NOTE(review): this streamer is constructed but never passed to
# model.generate(), so responses are NOT actually streamed to the UI —
# either pass `streamer=streamer` into generate() or delete this line.
streamer = TextStreamer(tokenizer, skip_prompt=True)
 
 
 
 
 
 
16
 
17
def chat(message, history):
    """Generate one assistant reply for a Gradio ChatInterface turn.

    Args:
        message: The latest user message (str).
        history: List of (user, assistant) string pairs from prior turns.

    Returns:
        The model's reply as a plain string.
    """
    # Rebuild the Mistral-Instruct prompt: every past turn, then the new message.
    prompt = ""
    for user, bot in history:
        prompt += f"[INST] {user.strip()} [/INST] {bot.strip()} "
    prompt += f"[INST] {message.strip()} [/INST]"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode ONLY the newly generated tokens. The previous approach decoded
    # the full sequence and split on "[/INST]", which returns the wrong text
    # whenever the user's message itself contains that marker.
    prompt_len = inputs["input_ids"].shape[-1]
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True).strip()
35
 
36
# Assemble the chat UI, then start the Gradio server.
demo = gr.ChatInterface(
    fn=chat,
    title="Mistral Chat (CPU)",
    description="Ask questions, get answers using a real LLM.",
)
demo.launch()