"""Gradio app that loads a telecom-focused LLM on demand and streams chat replies."""

import gc
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# 1. Define the Available Models Library
# Removed all proprietary/unavailable models. All models here have public weights.
# Maps the label shown in the dropdown to the Hugging Face repository id.
MODELS = {
    # --- 🟢 Free Tier Friendly (Smooth on 16GB CPU) ---
    "🟢 Llama-3.2-1B-Tele-it (1 Billion)": "AliMaatouk/Llama-3.2-1B-Tele-it",
    "🟢 Qwen-2.5-Coder-1.5B (1.5 Billion)": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
    " 🟢 TSLAM-mini-2B": "NetoAISolutions/TSLAM-Mini-2B",
    # --- 🔴 Mid-Size (Slow on Free Tier, requires patience or basic GPU) ---
    "🔴 Llama-3.2-3B-Tele-it (3 Billion)": "AliMaatouk/Llama-3.2-3B-Tele-it",
    "🔴 TSLAM-4B (4 Billion - Gated Model)": "NetoAISolutions/TSLAM-4B",
    # --- 🔴 Massive Models (Will crash Free Tier - Requires A100/H100 GPU clusters) ---
    "🔴 LTM / AdaptKey-Nemotron-30b (30 Billion)": "AdaptKey/AdaptKey-Nemotron-30b",
    "🔴 A.X K1 by SK Telecom (519 Billion MoE)": "skt/A.X-K1",
    "🔴 Kimi K2.6 by Moonshot AI (1 Trillion MoE)": "moonshotai/Kimi-K2.6",
}

# Module-level state shared by load_model() and generate_response().
# All three are None until a model has been loaded successfully.
current_model_id = None
tokenizer = None
model = None


# 2. Memory Management and Model Loading
def load_model(model_selection, hf_token):
    """Load the model chosen in the dropdown, replacing any previously loaded one.

    Args:
        model_selection: A key of ``MODELS`` (the dropdown label).
        hf_token: Hugging Face access token text; empty or ``None`` means
            anonymous access.

    Returns:
        A status string for the "System Status" textbox. Failures are reported
        through this string rather than raised.
    """
    global current_model_id, tokenizer, model

    target_model_id = MODELS.get(model_selection)
    if target_model_id is None:
        return f"❌ Unknown model selection: {model_selection!r}"

    if current_model_id == target_model_id:
        return f"✅ {target_model_id} is already active."

    # 🧹 Release the old model before loading the next one to keep peak memory
    # down. Rebind to None instead of `del`: deleting a global removes the name
    # itself, so a failed load would leave `model`/`tokenizer` undefined and
    # every later access would raise NameError. Clearing current_model_id keeps
    # the "already active" check honest if the load below fails.
    model = None
    tokenizer = None
    current_model_id = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Pass the token only if one was provided (required for gated models like TSLAM-4B).
    token_arg = (hf_token or "").strip() or None

    try:
        new_tokenizer = AutoTokenizer.from_pretrained(target_model_id, token=token_arg)
        # Load the weights in bfloat16 to reduce RAM usage.
        # NOTE(security): trust_remote_code=True executes Python code shipped in
        # the model repository. Only repositories listed in MODELS are loaded.
        new_model = AutoModelForCausalLM.from_pretrained(
            target_model_id,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            token=token_arg,
        )
    except Exception as e:  # UI boundary: report the failure instead of crashing.
        details = str(e)
        if "401 Client Error" in details or "gated repo" in details.lower():
            return (
                "❌ Access Denied: This is a gated model. Please accept the terms on "
                "the model's Hugging Face page and enter your Access Token in the box above."
            )
        return f"❌ Error loading model: Ensure you have enough RAM/vRAM. Error: {details}"

    # Publish the new state only after both pieces loaded successfully.
    tokenizer = new_tokenizer
    model = new_model
    current_model_id = target_model_id
    return f"✅ Successfully loaded: {target_model_id}. You can now chat!"


# 3. The Chat Logic
def generate_response(message, history):
    """Stream the model's reply to ``message`` as a growing string.

    Args:
        message: The user's latest chat message.
        history: Previous turns supplied by ``gr.ChatInterface``; not used,
            each message is answered on its own.

    Yields:
        The response accumulated so far, once per streamed text chunk. If no
        model is loaded, a single warning message is yielded instead.
    """
    # Snapshot the globals so one reply uses a single model/tokenizer pair.
    active_model, active_tokenizer = model, tokenizer
    if active_model is None or active_tokenizer is None:
        yield "⚠️ Please select a model from the dropdown and click 'Load Model' before chatting."
        return

    prompt = f"User: {message}\nAssistant:"
    inputs = active_tokenizer(prompt, return_tensors="pt")
    streamer = TextIteratorStreamer(
        active_tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=256,
        # temperature/top_p only take effect in sampling mode.
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
    )

    failures = []

    def _run_generation():
        """Run generate() and always terminate the stream, even on failure."""
        try:
            active_model.generate(**generation_kwargs)
        except Exception as exc:
            failures.append(exc)
            # Without this the consumer loop below would block forever.
            streamer.end()

    # generate() blocks until done, so it runs in a worker thread while this
    # generator drains the streamer.
    worker = Thread(target=_run_generation)
    worker.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response

    worker.join()
    if failures:
        yield f"{response}\n\n❌ Generation failed: {failures[0]}"


# 4. Build the User Interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # NOTE(review): the HTML tags were missing from the pasted source; <h1>/<p>
    # were restored so each line renders separately - confirm the intended layout.
    gr.HTML(
        """
        <h1>Telecom AI Model Selector</h1>
        <p>Select a model below. Loading may take a few minutes.</p>
        <p>🟢 Green: Runs on Free CPU Tier.</p>
        <p>🔴 Red: Requires heavy GPU / Paid Tier.</p>
        """
    )

    with gr.Row():
        with gr.Column(scale=3):
            model_dropdown = gr.Dropdown(
                choices=list(MODELS.keys()),
                value=list(MODELS.keys())[0],
                label="1. Choose your LLM",
                interactive=True,
            )
        with gr.Column(scale=2):
            hf_token_input = gr.Textbox(
                label="2. Hugging Face Token (Optional)",
                placeholder="Required only for Gated Models...",
                type="password",
            )
        with gr.Column(scale=1):
            # Spacer markdown just to push the button down to align with text boxes.
            # NOTE(review): the literal was broken across lines in the pasted
            # source; "<br>" is assumed to be the lost content - confirm.
            gr.Markdown("<br>")
            load_btn = gr.Button("3. Load Model", variant="primary")

    status_text = gr.Textbox(
        label="System Status",
        value="Waiting for model to load...",
        interactive=False,
    )

    load_btn.click(
        fn=load_model,
        inputs=[model_dropdown, hf_token_input],
        outputs=status_text,
    )

    gr.ChatInterface(
        fn=generate_response,
        examples=[
            "Explain how BGP handles route propagation.",
            "Write an Elasticsearch DSL query to find high latency.",
        ],
    )

demo.launch()