# dudub100's picture
# Update app.py
# b834c74 verified
import gradio as gr
import torch
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
# 1. Define the Available Models Library
# Removed all proprietary/unavailable models. All models here have public weights.
# Maps the label shown in the dropdown to the Hugging Face repository id that
# is passed to `from_pretrained`. The emoji prefix is the colour legend shown
# in the page header (green = free CPU tier, red = needs a heavy GPU).
MODELS = {
    # --- 🟢 Free Tier Friendly (Smooth on 16GB CPU) ---
    "🟢 Llama-3.2-1B-Tele-it (1 Billion)": "AliMaatouk/Llama-3.2-1B-Tele-it",
    "🟢 Qwen-2.5-Coder-1.5B (1.5 Billion)": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
    "🟢 TSLAM-mini-2B": "NetoAISolutions/TSLAM-Mini-2B",
    # --- 🔴 Mid-Size (Slow on Free Tier, requires patience or basic GPU) ---
    "🔴 Llama-3.2-3B-Tele-it (3 Billion)": "AliMaatouk/Llama-3.2-3B-Tele-it",
    "🔴 TSLAM-4B (4 Billion - Gated Model)": "NetoAISolutions/TSLAM-4B",
    # --- 🔴 Massive Models (Will crash Free Tier - Requires A100/H100 GPU clusters) ---
    "🔴 LTM / AdaptKey-Nemotron-30b (30 Billion)": "AdaptKey/AdaptKey-Nemotron-30b",
    "🔴 A.X K1 by SK Telecom (519 Billion MoE)": "skt/A.X-K1",
    "🔴 Kimi K2.6 by Moonshot AI (1 Trillion MoE)": "moonshotai/Kimi-K2.6",
}

# Module-level state shared by the load and chat callbacks.
# All three stay None until a model has been loaded successfully.
current_model_id = None  # repository id of the model currently in memory
tokenizer = None  # tokenizer belonging to `model`
model = None  # the loaded causal language model
# 2. Memory Management and Model Loading
def load_model(model_selection, hf_token):
    """Load the selected model, replacing whatever model is currently in memory.

    Args:
        model_selection: A key of ``MODELS`` (the label picked in the dropdown).
        hf_token: Optional Hugging Face access token. ``None`` or a blank
            string means anonymous access; a token is needed for gated repos.

    Returns:
        A human-readable status string. Failures are reported through this
        string instead of being raised, so the UI callback never errors out.
    """
    global current_model_id, tokenizer, model

    target_model_id = MODELS.get(model_selection)
    if target_model_id is None:
        return f"❌ Unknown model selection: {model_selection!r}. Please choose a model from the dropdown."

    if model is not None and current_model_id == target_model_id:
        return f"✅ {target_model_id} is already active."

    # 🧹 Release the old model before loading the next one to keep peak memory
    # down. Rebind to None rather than `del`: deleting a name declared `global`
    # removes the module attribute, so a failed load would leave `model`
    # undefined and the chat callback would raise NameError. Clearing
    # `current_model_id` too keeps the "already active" check honest.
    had_model = model is not None
    model = None
    tokenizer = None
    current_model_id = None
    if had_model:
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Pass the token if provided (required for gated models like TSLAM-4B).
    # The textbox may deliver None or whitespace; both mean "no token".
    token_arg = (hf_token or "").strip() or None

    try:
        new_tokenizer = AutoTokenizer.from_pretrained(target_model_id, token=token_arg)
        # bfloat16 halves the weight footprint compared with float32.
        # NOTE(review): trust_remote_code=True runs Python shipped in the model
        # repository; presumably acceptable because MODELS is a fixed list.
        new_model = AutoModelForCausalLM.from_pretrained(
            target_model_id,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            token=token_arg,
        )
    except Exception as e:
        # Top-level UI boundary: turn any loading failure into a status message.
        details = str(e)
        if "401 Client Error" in details or "gated repo" in details.lower():
            return "❌ Access Denied: This is a gated model. Please accept the terms on the model's Hugging Face page and enter your Access Token in the box above."
        return f"❌ Error loading model: Ensure you have enough RAM/vRAM. Error: {details}"

    # Publish the new state only once both pieces loaded successfully.
    tokenizer = new_tokenizer
    model = new_model
    current_model_id = target_model_id
    return f"✅ Successfully loaded: {target_model_id}. You can now chat!"
# 3. The Chat Logic
def generate_response(message, history):
    """Stream the model's reply to ``message`` for the chat interface.

    Args:
        message: The user's latest chat message.
        history: Previous turns supplied by the chat interface. Currently
            unused: each prompt is built from ``message`` alone.

    Yields:
        The reply accumulated so far, growing as new text is generated. If no
        model is loaded, a single warning is yielded instead. If generation
        fails, a final message containing the error is yielded.
    """
    # Snapshot the globals so that a concurrent load_model() call cannot swap
    # the model or tokenizer out from under a running generation.
    active_model, active_tokenizer = model, tokenizer
    if active_model is None or active_tokenizer is None:
        yield "⚠️ Please select a model from the dropdown and click 'Load Model' before chatting."
        return

    prompt = f"User: {message}\nAssistant:"
    # Keep the input tensors on the same device as the model weights.
    inputs = active_tokenizer(prompt, return_tensors="pt").to(active_model.device)
    streamer = TextIteratorStreamer(active_tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=256,
        # temperature/top_p only take effect when sampling is enabled;
        # without do_sample=True they are ignored and decoding is greedy.
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
    )

    failures = []

    def _run_generation():
        # generate() signals the end of the stream itself on success. If it
        # raises, nothing would, and the consumer loop below would block
        # forever, so close the stream explicitly.
        try:
            active_model.generate(**generation_kwargs)
        except Exception as exc:
            failures.append(exc)
            streamer.end()

    thread = Thread(target=_run_generation)
    thread.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response

    thread.join()
    if failures:
        yield f"{response}\n\n❌ Generation failed: {failures[0]}".strip()
# 4. Build the User Interface
# Layout: a header with the colour legend, one row holding the model dropdown,
# the optional token box and the load button, a read-only status box, and the
# chat interface underneath.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML("""
    <div style="text-align: center; margin-bottom: 20px;">
    <h2>Telecom AI Model Selector</h2>
    <p>Select a model below. Loading may take a few minutes.</p>
    <p style="color: green; font-weight: bold;">🟢 Green: Runs on Free CPU Tier.</p>
    <p style="color: red; font-weight: bold;">🔴 Red: Requires heavy GPU / Paid Tier.</p>
    </div>
    """)

    model_choices = list(MODELS)

    with gr.Row():
        with gr.Column(scale=3):
            model_dropdown = gr.Dropdown(
                choices=model_choices,
                value=model_choices[0],
                label="1. Choose your LLM",
                interactive=True,
            )
        with gr.Column(scale=2):
            hf_token_input = gr.Textbox(
                label="2. Hugging Face Token (Optional)",
                placeholder="Required only for Gated Models...",
                type="password",
            )
        with gr.Column(scale=1):
            # Empty markdown just to push the button down to align with text boxes
            gr.Markdown("<br>")
            load_btn = gr.Button("3. Load Model", variant="primary")

    status_text = gr.Textbox(
        label="System Status",
        value="Waiting for model to load...",
        interactive=False,
    )

    # The load button reports success or failure through the status box.
    load_btn.click(
        fn=load_model,
        inputs=[model_dropdown, hf_token_input],
        outputs=status_text,
    )

    gr.ChatInterface(
        fn=generate_response,
        examples=[
            "Explain how BGP handles route propagation.",
            "Write an Elasticsearch DSL query to find high latency.",
        ],
    )

# Start the server only when run as a script, so importing this module
# (for example from a test or a reloader) does not launch it.
if __name__ == "__main__":
    demo.launch()