# Captured from a Hugging Face Space (status at capture time: Sleeping).
import gc
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# 1. Define the Available Models Library
# Maps the label shown in the dropdown to the Hugging Face repository id that
# is passed to `from_pretrained`. Proprietary/unavailable models were removed;
# every entry is expected to have public (possibly gated) weights.
# The colour marker at the start of each label mirrors the legend in the UI:
# green = fits the free CPU tier, red = needs a GPU / paid tier.
MODELS = {
    # --- 🟢 Free Tier Friendly (Smooth on 16GB CPU) ---
    "🟢 Llama-3.2-1B-Tele-it (1 Billion)": "AliMaatouk/Llama-3.2-1B-Tele-it",
    "🟢 Qwen-2.5-Coder-1.5B (1.5 Billion)": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
    "🟢 TSLAM-mini-2B": "NetoAISolutions/TSLAM-Mini-2B",
    # --- 🔴 Mid-Size (Slow on Free Tier, requires patience or basic GPU) ---
    "🔴 Llama-3.2-3B-Tele-it (3 Billion)": "AliMaatouk/Llama-3.2-3B-Tele-it",
    "🔴 TSLAM-4B (4 Billion - Gated Model)": "NetoAISolutions/TSLAM-4B",
    # --- 🔴 Massive Models (Will crash Free Tier - Requires A100/H100 GPU clusters) ---
    "🔴 LTM / AdaptKey-Nemotron-30b (30 Billion)": "AdaptKey/AdaptKey-Nemotron-30b",
    "🔴 A.X K1 by SK Telecom (519 Billion MoE)": "skt/A.X-K1",
    "🔴 Kimi K2.6 by Moonshot AI (1 Trillion MoE)": "moonshotai/Kimi-K2.6",
}
# Process-wide state for the single model kept in memory at a time.
# All three stay None until a model has been loaded.
current_model_id = tokenizer = model = None
# 2. Memory Management and Model Loading
def load_model(model_selection, hf_token):
    """Load the model picked in the dropdown, replacing any model already held.

    Args:
        model_selection: Dropdown label; expected to be a key of ``MODELS``.
        hf_token: Hugging Face access token text. ``None`` or a blank string
            means no token is sent (sufficient for non-gated repositories).

    Returns:
        A status line for the "System Status" textbox. Failures are reported
        through this string instead of being raised.
    """
    global current_model_id, tokenizer, model

    target_model_id = MODELS.get(model_selection)
    if target_model_id is None:
        return f"❌ Unknown model selection: {model_selection!r}"

    if current_model_id == target_model_id:
        return f"✅ {target_model_id} is already active."

    # 🧹 Release the old model before loading the next one to keep peak memory
    # down. The globals are rebound to None rather than `del`-ed: deleting a
    # global unbinds the name, so a failed load would leave later readers of
    # `model` / `tokenizer` hitting NameError. `current_model_id` is cleared
    # as well so a failed load cannot leave it pointing at a model that is gone.
    model = None
    tokenizer = None
    current_model_id = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Pass the token only if one was provided (required for gated models).
    token_arg = (hf_token or "").strip() or None

    try:
        new_tokenizer = AutoTokenizer.from_pretrained(target_model_id, token=token_arg)
        # bfloat16 weights take half the memory of float32.
        new_model = AutoModelForCausalLM.from_pretrained(
            target_model_id,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            # NOTE(review): executes Python code shipped in the model repo;
            # kept as in the original, but only safe for trusted repositories.
            trust_remote_code=True,
            token=token_arg,
        )
    except Exception as e:  # UI boundary: surface any failure as a status line
        error_text = str(e)
        if "401 Client Error" in error_text or "gated repo" in error_text.lower():
            return "❌ Access Denied: This is a gated model. Please accept the terms on the model's Hugging Face page and enter your Access Token in the box above."
        return f"❌ Error loading model: Ensure you have enough RAM/vRAM. Error: {error_text}"

    # Commit all three globals together, only after both loads succeeded.
    tokenizer = new_tokenizer
    model = new_model
    current_model_id = target_model_id
    return f"✅ Successfully loaded: {target_model_id}. You can now chat!"
# 3. The Chat Logic
def generate_response(message, history):
    """Stream the reply to ``message`` as a progressively longer string.

    Generator callback for ``gr.ChatInterface``. Generation runs in a worker
    thread that feeds a ``TextIteratorStreamer``; this function drains the
    streamer and yields the accumulated text after every chunk.

    Args:
        message: The latest user message.
        history: Earlier turns supplied by Gradio. Not used: each request is
            sent to the model as a single-turn prompt.

    Yields:
        The response text so far. If no model is loaded, a single warning
        line; if generation fails, the partial text followed by an error line.
    """
    # Snapshot the globals so a concurrent "Load Model" click cannot swap the
    # model or tokenizer in the middle of this request.
    active_model, active_tokenizer = model, tokenizer
    if active_model is None or active_tokenizer is None:
        yield "⚠️ Please select a model from the dropdown and click 'Load Model' before chatting."
        return

    prompt = f"User: {message}\nAssistant:"
    inputs = active_tokenizer(prompt, return_tensors="pt")
    streamer = TextIteratorStreamer(
        active_tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=256,
        # temperature/top_p are only honoured when sampling is enabled;
        # without do_sample=True generation is greedy and both are ignored.
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
    )

    failures = []

    def _run_generation():
        # Runs in the worker thread. If generate() raises, the streamer never
        # receives its stop signal and the consumer loop below would block
        # forever, so record the error and close the stream explicitly.
        try:
            active_model.generate(**generation_kwargs)
        except Exception as exc:
            failures.append(exc)
            streamer.end()

    thread = Thread(target=_run_generation)
    thread.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response

    thread.join()
    if failures:
        yield f"{response}\n\n❌ Generation failed: {failures[0]}"
# 4. Build the User Interface
# Layout: header/legend, one row with (model dropdown | token box | load
# button), a read-only status box fed by load_model, then the chat panel.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML("""
    <div style="text-align: center; margin-bottom: 20px;">
        <h2>Telecom AI Model Selector</h2>
        <p>Select a model below. Loading may take a few minutes.</p>
        <p style="color: green; font-weight: bold;">🟢 Green: Runs on Free CPU Tier.</p>
        <p style="color: red; font-weight: bold;">🔴 Red: Requires heavy GPU / Paid Tier.</p>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=3):
            model_dropdown = gr.Dropdown(
                choices=list(MODELS),
                # Preselect the first entry of MODELS.
                value=next(iter(MODELS)),
                label="1. Choose your LLM",
                interactive=True,
            )
        with gr.Column(scale=2):
            hf_token_input = gr.Textbox(
                label="2. Hugging Face Token (Optional)",
                placeholder="Required only for Gated Models...",
                type="password",
            )
        with gr.Column(scale=1):
            # Empty markdown just to push the button down to align with text boxes
            gr.Markdown("<br>")
            load_btn = gr.Button("3. Load Model", variant="primary")

    status_text = gr.Textbox(
        label="System Status",
        value="Waiting for model to load...",
        interactive=False,
    )
    load_btn.click(
        fn=load_model,
        inputs=[model_dropdown, hf_token_input],
        outputs=status_text,
    )

    gr.ChatInterface(
        fn=generate_response,
        examples=[
            "Explain how BGP handles route propagation.",
            "Write an Elasticsearch DSL query to find high latency.",
        ],
    )

# Start the server only when run as a script, so importing this module
# (e.g. from tooling that looks up `demo`) does not launch it as a side effect.
if __name__ == "__main__":
    demo.launch()