Spaces:

dudub100
/

Telecom-assistance

Sleeping

App Files Files Community

Telecom-assistance / app.py

dudub100

Update app.py

b834c74 verified 5 days ago

raw

history blame contribute delete

5.24 kB

	import gradio as gr
	import torch
	import gc
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
	from threading import Thread

	# 1. Define the Available Models Library
	# Maps the label shown in the dropdown to the Hugging Face Hub repo id that
	# load_model() hands to from_pretrained(). The colour prefix in each label
	# matches the legend rendered at the top of the page. Labels are used verbatim
	# as dropdown choices, so they must not carry stray leading/trailing spaces.
	MODELS = {
	    # --- 🟢 Free Tier Friendly (Smooth on 16GB CPU) ---
	    "🟢 Llama-3.2-1B-Tele-it (1 Billion)": "AliMaatouk/Llama-3.2-1B-Tele-it",
	    "🟢 Qwen-2.5-Coder-1.5B (1.5 Billion)": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
	    "🟢 TSLAM-mini-2B": "NetoAISolutions/TSLAM-Mini-2B",

	    # --- 🔴 Mid-Size (Slow on Free Tier, requires patience or basic GPU) ---
	    "🔴 Llama-3.2-3B-Tele-it (3 Billion)": "AliMaatouk/Llama-3.2-3B-Tele-it",
	    "🔴 TSLAM-4B (4 Billion - Gated Model)": "NetoAISolutions/TSLAM-4B",

	    # --- 🔴 Massive Models (Will crash Free Tier - Requires A100/H100 GPU clusters) ---
	    "🔴 LTM / AdaptKey-Nemotron-30b (30 Billion)": "AdaptKey/AdaptKey-Nemotron-30b",
	    "🔴 A.X K1 by SK Telecom (519 Billion MoE)": "skt/A.X-K1",
	    "🔴 Kimi K2.6 by Moonshot AI (1 Trillion MoE)": "moonshotai/Kimi-K2.6",
	}

	# Process-wide state for the one model held in memory at a time.
	# load_model() rebinds all three names; generate_response() reads
	# `model` and `tokenizer`. Everything starts out unloaded.
	current_model_id = tokenizer = model = None

	# 2. Memory Management and Model Loading
	def load_model(model_selection, hf_token):
	    """Make the model chosen in the dropdown the globally active one.

	    Any previously loaded model/tokenizer is released first so that two sets
	    of weights are never resident at once. The new tokenizer and model are
	    loaded into locals and only published to the module globals once both
	    loads succeed, so a failed load leaves the app cleanly "unloaded"
	    (all three globals are None) rather than half-initialised.

	    Args:
	        model_selection: A key of MODELS (the dropdown label).
	        hf_token: Hugging Face access token; may be empty or None when the
	            repo is not gated. Surrounding whitespace is ignored.

	    Returns:
	        A human-readable status string (prefixed with ✅ or ❌) for the
	        "System Status" textbox. Errors are reported through this string
	        rather than raised.
	    """
	    global current_model_id, tokenizer, model

	    target_model_id = MODELS.get(model_selection)
	    if target_model_id is None:
	        return f"❌ Unknown model selection: {model_selection!r}"

	    if current_model_id == target_model_id and model is not None:
	        return f"✅ {target_model_id} is already active."

	    # 🧹 Drop the old model from memory to prevent server crashes.
	    # Rebind to None instead of `del`: deleting a global removes the name
	    # itself, which would make later `model is None` checks raise NameError
	    # if the load below fails.
	    if model is not None:
	        model = None
	        tokenizer = None
	        current_model_id = None
	        gc.collect()
	        if torch.cuda.is_available():
	            torch.cuda.empty_cache()

	    # Pass the token if provided (required for gated models like TSLAM-4B).
	    token_arg = (hf_token or "").strip() or None

	    try:
	        new_tokenizer = AutoTokenizer.from_pretrained(target_model_id, token=token_arg)

	        # Load the weights in bfloat16 to halve RAM use versus float32.
	        # NOTE(review): trust_remote_code=True lets the repo's own modelling
	        # code run on this host; tolerable only because target_model_id is
	        # restricted to the fixed MODELS allow-list above.
	        new_model = AutoModelForCausalLM.from_pretrained(
	            target_model_id,
	            torch_dtype=torch.bfloat16,
	            low_cpu_mem_usage=True,
	            trust_remote_code=True,
	            token=token_arg,
	        )
	    except Exception as e:
	        # UI boundary: turn any loader failure into a status message.
	        details = str(e)
	        if "401 Client Error" in details or "gated repo" in details.lower():
	            return "❌ Access Denied: This is a gated model. Please accept the terms on the model's Hugging Face page and enter your Access Token in the box above."
	        return f"❌ Error loading model: Ensure you have enough RAM/vRAM. Error: {details}"

	    # Publish only after both loads succeeded.
	    tokenizer = new_tokenizer
	    model = new_model
	    current_model_id = target_model_id
	    return f"✅ Successfully loaded: {target_model_id}. You can now chat!"

	# 3. The Chat Logic
	def generate_response(message, history):
	    """Stream the assistant's reply to `message`, yielding the text so far.

	    Generation runs in a background thread that feeds a
	    TextIteratorStreamer; this generator drains the streamer and yields the
	    accumulated reply after every new chunk so the chat UI updates live.

	    Args:
	        message: The user's latest chat message.
	        history: Prior turns supplied by gr.ChatInterface. It is accepted to
	            satisfy the callback signature but is not folded into the
	            prompt, so every message is answered on its own.

	    Yields:
	        The reply accumulated so far; a single warning string if no model
	        is loaded; a trailing error note if generation fails part-way.
	    """
	    # Snapshot the globals so a load_model() call that happens while this
	    # reply is streaming cannot swap the model/tokenizer underneath us.
	    active_model, active_tokenizer = model, tokenizer
	    if active_model is None or active_tokenizer is None:
	        yield "⚠️ Please select a model from the dropdown and click 'Load Model' before chatting."
	        return

	    prompt = f"User: {message}\nAssistant:"
	    inputs = active_tokenizer(prompt, return_tensors="pt")

	    streamer = TextIteratorStreamer(active_tokenizer, skip_prompt=True, skip_special_tokens=True)

	    generation_kwargs = dict(
	        inputs,
	        streamer=streamer,
	        max_new_tokens=256,
	        # temperature/top_p only take effect when sampling is enabled.
	        do_sample=True,
	        temperature=0.3,
	        top_p=0.9,
	    )

	    failures = []

	    def _run_generation():
	        # If generate() raises, the streamer never receives its stop signal
	        # and the consumer loop below would block forever; end it by hand.
	        try:
	            active_model.generate(**generation_kwargs)
	        except Exception as exc:
	            failures.append(exc)
	            streamer.end()

	    thread = Thread(target=_run_generation)
	    thread.start()

	    response = ""
	    for new_text in streamer:
	        response += new_text
	        yield response
	    thread.join()

	    if failures:
	        yield f"{response}\n\n❌ Generation failed: {failures[0]}".strip()

	# 4. Build the User Interface
	# Banner markup: title plus the colour legend used by the model labels.
	HEADER_HTML = """
	<div style="text-align: center; margin-bottom: 20px;">
	<h2>Telecom AI Model Selector</h2>
	<p>Select a model below. Loading may take a few minutes.</p>
	<p style="color: green; font-weight: bold;">🟢 Green: Runs on Free CPU Tier.</p>
	<p style="color: red; font-weight: bold;">🔴 Red: Requires heavy GPU / Paid Tier.</p>
	</div>
	"""

	# Dropdown entries, in the order they are declared in MODELS.
	MODEL_LABELS = list(MODELS)

	# Sample prompts offered under the chat box.
	CHAT_EXAMPLES = [
	    "Explain how BGP handles route propagation.",
	    "Write an Elasticsearch DSL query to find high latency.",
	]

	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	    gr.HTML(HEADER_HTML)

	    # Controls row: model picker, optional token, load button.
	    with gr.Row():
	        with gr.Column(scale=3):
	            model_dropdown = gr.Dropdown(
	                label="1. Choose your LLM",
	                choices=MODEL_LABELS,
	                value=MODEL_LABELS[0],
	                interactive=True,
	            )
	        with gr.Column(scale=2):
	            hf_token_input = gr.Textbox(
	                type="password",
	                label="2. Hugging Face Token (Optional)",
	                placeholder="Required only for Gated Models...",
	            )
	        with gr.Column(scale=1):
	            # Spacer so the button lines up with the neighbouring inputs.
	            gr.Markdown("<br>")
	            load_btn = gr.Button("3. Load Model", variant="primary")

	    # Read-only box that shows whatever load_model() returns.
	    status_text = gr.Textbox(
	        label="System Status",
	        value="Waiting for model to load...",
	        interactive=False,
	    )

	    load_btn.click(
	        fn=load_model,
	        inputs=[model_dropdown, hf_token_input],
	        outputs=status_text,
	    )

	    gr.ChatInterface(fn=generate_response, examples=CHAT_EXAMPLES)

	demo.launch()