import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# --- Configuration ---
# Define available models: Label -> (Repo ID, GGUF Filename)
MODELS = {
    "Llama-3.2-1B": {
        "repo_id": "Emil-Matteus/llama-32-1b",
        "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
    },
    "Llama-3.2-3B": {
        "repo_id": "Emil-Matteus/llama-3B_model-GGUF",
        "filename": "llama-3B-Q4_K_M.gguf"
    },
    "Qwen2.5-1.5B": {
        "repo_id": "Emil-Matteus/qwen2.5-1.5B_model-GGUF",
        "filename": "qwen2.5-1.5B-Q4_K_M.gguf"
    }
}

# Global state holding the currently loaded model
current_model_name = None
llm = None


def load_model(model_name):
    """
    Loads the specified model into memory, replacing any previously
    loaded one (rebinding the global releases the old model).
    """
    global llm, current_model_name

    # If this model is already loaded, reuse it
    if llm is not None and current_model_name == model_name:
        return llm

    print(f"Loading new model: {model_name}...")

    if model_name not in MODELS:
        raise ValueError(f"Unknown model: {model_name}")

    repo_id = MODELS[model_name]["repo_id"]
    filename = MODELS[model_name]["filename"]

    try:
        # Download the GGUF file from the Hub (cached after the first call)
        model_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename
        )

        # Initialize the Llama model (n_gpu_layers=0 -> CPU-only inference);
        # n_ctx=4096 gives a decent context window
        llm = Llama(
            model_path=model_path,
            n_gpu_layers=0,
            n_ctx=4096,
            verbose=True
        )
        current_model_name = model_name
        print(f"Successfully loaded {model_name}")
        return llm
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        raise  # re-raise with the original traceback so the caller can handle it


def respond(
    message,
    history: list[dict[str, str]],
    model_selection,  # First additional input (Dropdown)
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    global llm

    # Ensure the correct model is loaded
    try:
        load_model(model_selection)
    except Exception as e:
        yield (
            f"Error loading model '{model_selection}': {e}. "
            "Please check that the model has been uploaded to Hugging Face."
        )
        return

    messages = [{"role": "system", "content": system_message}]
    messages.extend(history)
    messages.append({"role": "user", "content": message})

    response = ""

    # Generate the response as a stream of chunks
    completion = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True
    )

    # Accumulate tokens and yield the growing response so the UI updates live
    for chunk in completion:
        if "content" in chunk["choices"][0]["delta"]:
            token = chunk["choices"][0]["delta"]["content"]
            response += token
            yield response


# --- UI Setup ---
# For information on how to customize the ChatInterface, see the Gradio docs:
# https://www.gradio.app/docs/chatinterface
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        # Model selector dropdown
        gr.Dropdown(
            choices=list(MODELS.keys()),
            value="Llama-3.2-1B",
            label="Select Model",
            info="Switching models will take a few seconds to download/load."
        ),
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)


if __name__ == "__main__":
    chatbot.launch()
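
# --- Usage note (a sketch, not part of the original app) ---
# Assuming this file is saved as app.py, a minimal way to run it locally is:
#
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py
#
# Gradio serves the chat UI at http://127.0.0.1:7860 by default; the first
# request for each model triggers a one-time GGUF download from the Hub,
# which hf_hub_download caches for subsequent runs.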