import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

# ----------------------------------------
# Global model cache
# ----------------------------------------
loaded_models = {}  # Cache loaded Llama models
current_model_name = None

MODEL_CONFIGS = {
    "1B Model (Datangtang/GGUF1B)": {
        "repo_id": "Datangtang/GGUF1B",
        "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
    },
    "3B Model (Datangtang/GGUF3B)": {
        "repo_id": "Datangtang/GGUF3B",
        "filename": "llama-3.2-3b-instruct.Q4_K_M.gguf"
    }
}
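
# To add another quantized model, extend MODEL_CONFIGS with any GGUF repo/file
# pair from the Hub. The entry below is only a sketch with a hypothetical repo
# id and filename, not one of the Datangtang repos:
#
#   "8B Model (some-user/some-gguf-repo)": {
#       "repo_id": "some-user/some-gguf-repo",
#       "filename": "some-model.Q4_K_M.gguf"
#   },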

# ----------------------------------------
# Load model function
# ----------------------------------------
def load_model(model_choice):
    global loaded_models, current_model_name

    # Reuse a model that has already been loaded in this process
    if model_choice in loaded_models:
        print(f"Reusing already loaded model: {model_choice}")
        current_model_name = model_choice
        return loaded_models[model_choice]

    print(f"Downloading model: {model_choice}")
    cfg = MODEL_CONFIGS[model_choice]
    model_path = hf_hub_download(
        repo_id=cfg["repo_id"],
        filename=cfg["filename"],
        local_dir="./model",
        token=os.environ.get("HF_TOKEN"),  # None is fine for public repos
    )
    print(f"Model downloaded to: {model_path}")

    print("Loading GGUF model into memory...")
    llm = Llama(
        model_path=model_path,
        n_ctx=1024,        # context window (tokens)
        n_threads=6,       # CPU threads used for inference
        n_batch=512,       # prompt-processing batch size
        n_gpu_layers=0,    # CPU-only
        use_mmap=True,     # memory-map the weights
        use_mlock=True,    # pin weights in RAM
        verbose=False,
    )
    loaded_models[model_choice] = llm
    current_model_name = model_choice
    print("Model loaded successfully!")
    return llm
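
# Quick smoke test outside the Gradio app (e.g. in a local shell or notebook).
# Left commented out so it never runs on Spaces; the prompt text is just an
# example:
#
#   llm = load_model("1B Model (Datangtang/GGUF1B)")
#   out = llm("User: Say hello in five words.\nAssistant:", max_tokens=16)
#   print(out["choices"][0]["text"].strip())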

# ----------------------------------------
# Chat function
# ----------------------------------------
def chat(message, history, model_choice):
    llm = load_model(model_choice)

    # System prompt
    conversation = "System: You are a helpful assistant.\n"

    # Convert ChatInterface history (list of dicts) into a text prompt,
    # keeping only the last three messages
    for msg in history[-3:]:
        # ChatInterface "messages" format: {"role": "...", "content": "..."}
        if isinstance(msg, dict):
            role = msg.get("role")
            content = msg.get("content", "")
            if role == "user":
                conversation += f"User: {content}\n"
            elif role == "assistant":
                conversation += f"Assistant: {content}\n"
        # Safety: old (user, assistant) tuple format
        elif isinstance(msg, (list, tuple)):
            human, assistant = msg
            conversation += f"User: {human}\n"
            if assistant:
                conversation += f"Assistant: {assistant}\n"

    # Add current message
    conversation += f"User: {message}\nAssistant:"

    # Generate model response
    response = llm(
        conversation,
        max_tokens=128,
        temperature=0.7,
        top_p=0.9,
        top_k=40,
        repeat_penalty=1.1,
        stop=["User:", "Assistant:"],
        echo=False,
    )
    return response["choices"][0]["text"].strip()
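
# Note: llama-cpp-python also provides llm.create_chat_completion(), which uses
# the chat template embedded in the GGUF instead of the hand-rolled
# "User:/Assistant:" prompt above. A sketch, assuming these GGUF files ship a
# chat template:
#
#   response = llm.create_chat_completion(
#       messages=[
#           {"role": "system", "content": "You are a helpful assistant."},
#           {"role": "user", "content": message},
#       ],
#       max_tokens=128,
#   )
#   text = response["choices"][0]["message"]["content"]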

# ----------------------------------------
# Gradio UI
# ----------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 Datangtang GGUF Model Demo")
    gr.Markdown("Switch between **1B** and **3B** GGUF models in real-time.")

    model_choice = gr.Dropdown(
        label="Select Model",
        choices=list(MODEL_CONFIGS.keys()),
        value="1B Model (Datangtang/GGUF1B)",
    )

    # Status line that shows which model is currently selected
    status = gr.Markdown()

    chat_iface = gr.ChatInterface(
        fn=chat,
        additional_inputs=[model_choice],  # pass the live dropdown value into chat()
        examples=[
            # each example pairs a message with a model choice
            ["Explain deep learning in one paragraph.", "1B Model (Datangtang/GGUF1B)"],
            ["What is the difference between supervised and unsupervised learning?", "1B Model (Datangtang/GGUF1B)"],
            ["Explain what a transformer model is.", "1B Model (Datangtang/GGUF1B)"],
        ],
        cache_examples=False,
    )

    model_choice.change(
        fn=lambda x: f"🔄 Switched to: {x}",
        inputs=[model_choice],
        outputs=[status],
    )

if __name__ == "__main__":
    demo.launch()