# Hugging Face Spaces chat app: streams responses from a locally loaded
# Qwen2.5-0.5B-Instruct model through a Gradio ChatInterface.
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Load the tokenizer and model once at import time so every chat request
# reuses the same in-memory weights.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
print(f"Loading model {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Some causal-LM tokenizers ship without a pad token; reuse EOS so padding
# during tokenization is well-defined.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # half precision to reduce memory footprint
    device_map="auto",          # place weights on an accelerator when available
    low_cpu_mem_usage=True,
)
print("Model loaded successfully!")
def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """
    Stream a chat completion from the locally loaded Qwen2.5 model.

    Args:
        message: The latest user message text.
        history: Prior turns as ``{"role": ..., "content": ...}`` dicts
            (Gradio ``type="messages"`` format).
        system_message: Optional system prompt prepended to the conversation.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Yields:
        The accumulated response text after each streamed token chunk.
    """
    # Assemble the full conversation: system prompt, prior turns, new message.
    conversation = []
    if system_message:
        conversation.append({"role": "system", "content": system_message})
    conversation.extend(history)
    conversation.append({"role": "user", "content": message})

    # Prefer the tokenizer's built-in chat template; fall back to a plain
    # "Role: content" transcript when the tokenizer has none.
    try:
        formatted_prompt = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )
    except Exception:  # was a bare except: don't swallow KeyboardInterrupt/SystemExit
        parts = []
        for msg in conversation:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            if role == "system":
                parts.append(f"System: {content}\n")
            elif role == "user":
                parts.append(f"User: {content}\n")
            elif role == "assistant":
                parts.append(f"Assistant: {content}\n")
        parts.append("Assistant: ")
        formatted_prompt = "".join(parts)  # join once instead of quadratic +=

    # Tokenize (capped at 2048 tokens) and move tensors to the model's device.
    inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Stream decoded text as it is generated; skip_prompt avoids echoing the input.
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
    generation_kwargs = {
        **inputs,
        "max_new_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "do_sample": True,
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.1,
        "pad_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
    }

    # generate() blocks, so run it on a worker thread while this generator
    # consumes the streamer and yields partial responses to the UI.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    try:
        response = ""
        for new_text in streamer:
            response += new_text
            yield response
    finally:
        # Reap the worker even if the consumer abandons the stream early.
        thread.join()
# For details on customizing the ChatInterface, see the Gradio docs:
# https://www.gradio.app/docs/chatinterface
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(
            value="You are a friendly Chatbot proficient in Indian languages.",
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=512, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    chatbot.launch()