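"""Gradio chat Space for Locutusque's language models.

All models are preloaded to CPU at startup and refreshed by a background
thread every 15 minutes; the model selected in the UI is moved to the GPU
only for the duration of a generation request, then returned to the CPU.
"""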
import spaces
import gradio as gr
from transformers import pipeline, AutoTokenizer, TextIteratorStreamer, AutoModelForCausalLM
import torch
from threading import Thread, Lock, Event
from queue import Empty
import os
from datetime import datetime
import gc
# Global dictionaries holding the preloaded models and tokenizers
LOADED_MODELS = {}
LOADED_TOKENIZERS = {}

# Lock for thread-safe access to the model dictionaries
MODEL_LOCK = Lock()

# Event used to signal the background reload thread to stop
SHUTDOWN_EVENT = Event()
def clear_memory():
    """Free CPU and GPU memory (collect garbage first so freed CUDA blocks can be released)."""
    gc.collect()
    torch.cuda.empty_cache()
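# Models are kept on the CPU between requests. On ZeroGPU Spaces the GPU is
# typically only attached inside @spaces.GPU-decorated calls, so the weights
# live in host memory (in bfloat16) until a request moves them to the device.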
def load_single_model(model_name):
    """Load a single model and tokenizer onto the CPU."""
    try:
        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Loading {model_name}...")
        # Load the model in bfloat16 to roughly halve host-memory use vs. float32
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            token=os.environ.get("token"),
        )
        # Load the matching tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            token=os.environ.get("token"),
        )
        # These models use the ChatML template, which ends each turn with <|im_end|>
        tokenizer.eos_token = "<|im_end|>"
        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Successfully loaded {model_name}")
        return model, tokenizer
    except Exception as e:
        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Failed to load {model_name}: {e}")
        return None, None
def preload_models(model_choices):
    """Preload all models to CPU at startup."""
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Preloading models to CPU...")
    with MODEL_LOCK:
        for model_name in model_choices:
            model, tokenizer = load_single_model(model_name)
            if model is not None and tokenizer is not None:
                LOADED_MODELS[model_name] = model
                LOADED_TOKENIZERS[model_name] = tokenizer
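# Reload strategy: fresh copies are loaded *outside* the lock so requests can
# keep being served during the (slow) downloads, then the old models are
# swapped out under the lock so readers never see a half-updated dictionary.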
def reload_models_task(model_choices):
    """Background task that reloads all models every 15 minutes."""
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Starting model reload task...")
    while not SHUTDOWN_EVENT.is_set():
        # Wait 15 minutes (900 seconds); wait() returns True if the shutdown
        # event was set in the meantime, in which case we exit the loop
        if SHUTDOWN_EVENT.wait(900):
            break
        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Starting periodic model reload...")
        # Load fresh copies into temporary dictionaries
        new_models = {}
        new_tokenizers = {}
        for model_name in model_choices:
            model, tokenizer = load_single_model(model_name)
            if model is not None and tokenizer is not None:
                new_models[model_name] = model
                new_tokenizers[model_name] = tokenizer
        # Swap the old models for the new ones atomically
        with MODEL_LOCK:
            # Drop old references; iterate over a copy of the keys, since
            # deleting entries while iterating a dict raises RuntimeError
            for model_name in list(LOADED_MODELS):
                del LOADED_MODELS[model_name]
                LOADED_TOKENIZERS.pop(model_name, None)
            clear_memory()
            # Install the freshly loaded models
            LOADED_MODELS.update(new_models)
            LOADED_TOKENIZERS.update(new_tokenizers)
        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Model reload completed")
def get_model_pipeline(model_name):
    """Move the selected model to the GPU and wrap it in a text-generation pipeline."""
    with MODEL_LOCK:
        if model_name not in LOADED_MODELS:
            raise ValueError(f"Model {model_name} not found in preloaded models")
        # Direct references to the shared model and tokenizer
        model = LOADED_MODELS[model_name]
        tokenizer = LOADED_TOKENIZERS[model_name]
        # Creating the pipeline with device="cuda" moves the model to the GPU
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            device="cuda",
        )
        return pipe, model
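# The prompt below is assembled by hand in ChatML format
# (<|im_start|>role ... <|im_end|>), matching the eos_token set at load time;
# tokenizer.apply_chat_template would be an equivalent alternative where the
# tokenizer ships a chat template. Note that the min_p sampling parameter
# requires a reasonably recent transformers release.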
@spaces.GPU  # ZeroGPU: GPU work must run inside a @spaces.GPU-decorated call
def generate(
    message,
    history,
    model_name,
    system,
    temperature=0.4,
    top_p=0.95,
    min_p=0.1,
    top_k=50,
    max_new_tokens=256,
):
    outputs = []
    try:
        # Get the pipeline with the model on the GPU
        pipe, gpu_model = get_model_pipeline(model_name)
        # Build the ChatML prompt from the system message and chat history
        prompt = f"<|im_start|>system\n{system}<|im_end|>\n"
        for user_turn, assistant_turn in history:
            prompt += f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
        prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
        streamer = TextIteratorStreamer(
            pipe.tokenizer,
            timeout=240.0,
            skip_prompt=True,
            skip_special_tokens=True,
        )
        generation_kwargs = dict(
            text_inputs=prompt,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=top_p,
            min_p=min_p,
            top_k=top_k,
            temperature=temperature,
            num_beams=1,
            repetition_penalty=1.1,
        )
        # Run generation in a worker thread so we can stream tokens here
        t = Thread(target=pipe, kwargs=generation_kwargs)
        t.start()
        for chunk in streamer:
            outputs.append(chunk)
            yield "".join(outputs)
    except Empty:
        # The streamer timed out waiting for the next token; return what we have
        print("Stream stopped unexpectedly.")
        yield "".join(outputs)
    except Exception as e:
        print(f"An error occurred: {e}")
        yield f"An error occurred during generation: {str(e)}"
    finally:
        # Move the model back to the CPU (even on error) to free GPU memory
        if 'gpu_model' in locals():
            gpu_model.to("cpu")
        torch.cuda.empty_cache()
# Models available in the dropdown
model_choices = [
    "Locutusque/Hercules-6.1-Llama-3.1-8B",
    "Locutusque/liberalis-cogitator-mistral-nemo-2407",
    "Locutusque/lmsys-llama-3.2-3b",
]

# Preload all models to CPU at startup
preload_models(model_choices)

# Start the background reload task
reload_thread = Thread(target=reload_models_task, args=(model_choices,), daemon=True)
reload_thread.start()
# Create the Gradio chat interface; the additional inputs are passed to
# generate() in order, after (message, history)
g = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.components.Dropdown(
            choices=model_choices,
            label="Model",
            value=model_choices[0],
            interactive=True,
        ),
        gr.components.Textbox(
            lines=2,
            label="System Prompt",
            value="You are a sentient, super intelligent AI developed by a man named Locutusque.",
        ),
        gr.components.Slider(minimum=0, maximum=2, value=0.8, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
        gr.components.Slider(minimum=1, maximum=8192, step=1, value=1024, label="Max tokens"),
    ],
    title="Locutusque's Language Models",
    description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
)
if __name__ == "__main__":
    try:
        g.launch()
    finally:
        # Signal the reload thread to stop when the app shuts down
        SHUTDOWN_EVENT.set()
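# Note: private or gated checkpoints are downloaded with the Hugging Face
# access token read from the `token` environment variable (see
# load_single_model), so that variable must be set (e.g. as a Space secret)
# for those models to load.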