import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# --- Configuration ---
# Define available models: Label -> (Repo ID, GGUF Filename)
MODELS = {
    "Llama-3.2-1B": {
        "repo_id": "Emil-Matteus/llama-32-1b",
        "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
    },
    "Llama-3.2-3B": {
        "repo_id": "Emil-Matteus/llama-3B_model-GGUF",
        "filename": "llama-3B-Q4_K_M.gguf"
    },
    "Qwen2.5-1.5B": {
        "repo_id": "Emil-Matteus/qwen2.5-1.5B_model-GGUF",
        "filename": "qwen2.5-1.5B-Q4_K_M.gguf"
    }
}

# Global state to hold the currently loaded model
current_model_name = None
llm = None
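# Only one model is kept resident at a time: on a CPU-only Space with limited
# RAM, swapping a single global slot is cheaper than caching all three models.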


def load_model(model_name):
    """
    Loads the specified model into memory, unloading the previous one.
    """
    global llm, current_model_name

    # If this model is already loaded, do nothing
    if llm is not None and current_model_name == model_name:
        return llm

    print(f"Loading new model: {model_name}...")
    if model_name not in MODELS:
        raise ValueError(f"Unknown model: {model_name}")

    # Drop the reference to the old model first so it can be garbage-collected
    # before the new one is allocated (otherwise both briefly coexist in RAM).
    llm = None
    current_model_name = None

    repo_id = MODELS[model_name]["repo_id"]
    filename = MODELS[model_name]["filename"]

    try:
        model_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename
        )
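        # hf_hub_download caches the file in the local Hugging Face cache,
        # so repeated loads of the same model skip the network download.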
        # Initialize Llama model (n_gpu_layers=0 for CPU)
        # n_ctx=4096 gives a decent context window
        llm = Llama(
            model_path=model_path,
            n_gpu_layers=0,
            n_ctx=4096,
            verbose=True
        )
        current_model_name = model_name
        print(f"Successfully loaded {model_name}")
        return llm
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        raise
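

# Optional warm-up (hypothetical): pre-loading the default model at startup
# would spare the first visitor the download wait, at the cost of a slower
# Space boot. Left commented out as a sketch.
# load_model("Llama-3.2-1B")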


def respond(
    message,
    history: list[dict[str, str]],
    model_selection,  # First additional input (Dropdown)
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    global llm

    # Ensure the correct model is loaded
    try:
        load_model(model_selection)
    except Exception as e:
        yield (
            f"Error loading model '{model_selection}': {str(e)}. "
            "Please check if the model has been uploaded to Hugging Face."
        )
        return

    messages = [{"role": "system", "content": system_message}]
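    # With ChatInterface(type="messages"), history already arrives as a list of
    # {"role": ..., "content": ...} dicts, so it can be passed straight through.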
    messages.extend(history)
    messages.append({"role": "user", "content": message})

    response = ""

    # Generate response, streaming tokens as they are produced
    completion = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True
    )
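    # Gradio replaces the displayed assistant message with each yielded value,
    # so we yield the accumulated text rather than the individual token.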
    for chunk in completion:
        if "content" in chunk["choices"][0]["delta"]:
            token = chunk["choices"][0]["delta"]["content"]
            response += token
            yield response


# --- UI Setup ---
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        # Model Selector Dropdown
        gr.Dropdown(
            choices=list(MODELS.keys()),
            value="Llama-3.2-1B",
            label="Select Model",
            info="Switching models will take a few seconds to download/load."
        ),
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
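
# On Spaces the default launch() settings are sufficient; for local testing in
# a container you may need chatbot.launch(server_name="0.0.0.0") instead.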
if __name__ == "__main__":
    chatbot.launch()