Spaces:
Runtime error
Runtime error
| import os | |
| import gradio as gr | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| import torch | |
# Hugging Face access token, read from the environment (None when unset).
hf_token = os.getenv("HUGGINGFACE_TOKEN")

# Fine-tuned model repository to load.
model_id = "alphaoumardev/Llama3-8B-noryu-instruct"

# Authenticate with the token when loading tokenizer/model.
# `token=` is the current keyword; `use_auth_token=` is deprecated.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token)
model.eval()  # evaluation mode: this script only runs generation

# Device setup: use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def _build_prompt(history):
    """Render the conversation as "role: content" lines ending with an open assistant turn."""
    rendered_turns = "".join(
        f"{turn['role']}: {turn['content']}\n" for turn in history
    )
    return rendered_turns + "assistant:"


def _to_chatbot_pairs(history):
    """Pair each user turn with the entry that follows it, as (user, reply) tuples."""
    return [
        (turn["content"], following["content"])
        for turn, following in zip(history, history[1:])
        if turn["role"] == "user"
    ]


def chat(user_input, history=None):
    """Generate the assistant's reply to ``user_input`` and update the conversation.

    Args:
        user_input: The text the user just submitted.
        history: List of ``{"role": ..., "content": ...}`` dicts holding the
            conversation so far. When given, it is extended in place with the
            new user and assistant turns. When omitted, a fresh list is
            created for this call.

    Returns:
        A ``(chat_history, history)`` tuple: ``chat_history`` is a list of
        ``(user_message, assistant_reply)`` tuples for the Chatbot display,
        and ``history`` is the updated list of role/content dicts.
    """
    # A `[]` default would be created once and shared by every call that
    # omits `history`, leaking turns between unrelated conversations.
    if history is None:
        history = []

    history.append({"role": "user", "content": user_input})

    # Tokenize and generate
    inputs = tokenizer(_build_prompt(history), return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens generated after the prompt. Splitting the full
    # decoded text on "assistant:" would cut off any reply that happens to
    # contain that marker itself.
    prompt_length = inputs["input_ids"].shape[-1]
    assistant_reply = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()
    history.append({"role": "assistant", "content": assistant_reply})

    # Gradio expects tuple list format for Chatbot display
    return _to_chatbot_pairs(history), history
# Gradio Blocks UI: a chat display, a conversation store, and a text input.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    # Memory of the conversation, starting out empty.
    state = gr.State(value=[])
    txt = gr.Textbox(placeholder="Type your message...", show_label=False)
    # Submitting the textbox passes its text and the stored history to
    # `chat`; the two values it returns update the chat display and the
    # stored history respectively.
    txt.submit(fn=chat, inputs=[txt, state], outputs=[chatbot, state])

demo.launch()