# AdamF92's picture
# Update app.py
# 7441c66 verified
# app.py
import os
import gradio as gr
import torch
import spaces
from typing import Literal
from rxlm.rxt.models import RxTBeta
from rxlm.llm.models import DecoderOnlyTransformer
from rxlm.training.tokenizer import load_tokenizer_from_hf_hub
def disable_flash_attention(model: "RxTBeta") -> None:
    """Turn off flash attention on every attention module of a RxTBeta model.

    Walks the decoder, encoder and memory-attention sub-modules and sets
    ``use_flash_attention = False`` on each, so the model falls back to the
    standard attention implementation (needed on hardware/stacks where
    flash attention is unavailable or misbehaves).

    Args:
        model: The RxTBeta model to patch in place.
    """
    # Module-level flags on decoder and encoder.
    model.decoder.model.use_flash_attention = False
    model.encoder.model.use_flash_attention = False

    # Decoder: memory-conditioned layers carry both self- and cross-attention.
    for layer in model.decoder.model.layers:
        layer.attention.use_flash_attention = False
        layer.memory_cross_attention.use_flash_attention = False
    for layer in model.decoder.model.stateless_layers:
        layer.attention.use_flash_attention = False
    for layer in model.decoder.model.final_stateless_layers:
        layer.attention.use_flash_attention = False

    # Encoder layers.
    for layer in model.encoder.model.layers:
        layer.attention.use_flash_attention = False

    # Memory attention layers (the original code iterated attention_layers
    # twice — the duplicate loop was removed).
    for layer in model.memory_attention.model.attention_layers:
        layer.use_flash_attention = False
    for layer in model.memory_attention.model.mean_attention_layers:
        layer.use_flash_attention = False
# ---- One-time startup: load tokenizer + model and prepare inference state ----

# Hugging Face access token for the gated ReactiveAI repos (set in Space secrets).
HF_TOKEN = os.environ.get("HF_TOKEN")
tokenizer = load_tokenizer_from_hf_hub('ReactiveAI/RxT-Beta', token=HF_TOKEN)
model = RxTBeta.from_pretrained('ReactiveAI/RxT-Beta-Supervised', token=HF_TOKEN, tokenizer=tokenizer)
# Flash attention is disabled on all attention modules (see helper above).
disable_flash_attention(model)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# bfloat16 weights; inference below also runs under a bfloat16 autocast context.
model.to(device, dtype=torch.bfloat16)
model.init_model(device=device)
# Single-conversation (non-batched) interaction mode.
model.set_batch_mode(False)
# Snapshot of the freshly-initialized short-term memory (STM); kept on CPU and
# cloned per-session so "Clear & Reset STM" can restore a pristine state.
initial_stm = model.export_stm_state().cpu()
# Maximum token length for both the tokenized query and generation.
seq_len = 8192
@spaces.GPU
def chat(message: str, history: list, stm_state: torch.Tensor, temperature: float, top_p: float, thinking_mode: Literal['auto', 'extended', 'fast']):
    """Stream a model response for one chat turn.

    Generator-style Gradio handler: yields ``(history, stm_state)`` pairs so
    the chatbot updates token-by-token. ``[T]``/``[A]`` special tokens are
    rendered as visible THINKING markers instead of being emitted verbatim.

    Args:
        message: The user's query text.
        history: Chat history in Gradio 'messages' format (list of role dicts).
        stm_state: Short-term memory tensor carried across turns via gr.State.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        thinking_mode: Reasoning mode passed through to ``model.interact``.

    Yields:
        Updated (history, stm_state) after each generated token, and a final
        pair carrying the post-turn STM exported from the model.
    """
    tokenized_query = model.tokenize_query(message, max_seq_len=seq_len, device=device)
    # Restore this session's memory before generating.
    model.load_stm_state(stm_state)
    response = ""
    is_thinking = False
    # Build a new list instead of mutating the component value in place.
    history = history + [
        {
            'role': 'user',
            'content': message
        }
    ]
    with torch.amp.autocast(device_type=device.type, dtype=torch.bfloat16):
        for token_id in model.interact(**tokenized_query, thinking_mode=thinking_mode, max_seq_len=seq_len, temperature=temperature, top_p=top_p):
            next_token = model.stringify_token(token_id, show_memory_update=False, skip_special_tokens=False)
            if next_token == '[T]':
                response += '[START THINKING]\n\n'
                is_thinking = True
            elif next_token == '[A]':
                if is_thinking:
                    is_thinking = False
                    response += '\n\n[END THINKING]\n\n'
            else:
                response += next_token
            # Intermediate yields keep the pre-turn STM; the state only
            # changes once at the end of the turn.
            yield history + [{ 'role': 'assistant', 'content': response }], stm_state
    # BUG FIX: Gradio ignores the `return` value of a generator handler, so the
    # updated STM previously returned here never reached the gr.State component
    # and memory was silently reset every turn. Yield the final pair instead.
    yield history + [{ 'role': 'assistant', 'content': response }], model.export_stm_state().cpu()
# ---- Gradio UI definition and event wiring ----
with gr.Blocks(title="RxT-Beta 3B A190M (Supervised) Demo") as demo:
    gr.Markdown("""
    # RxT-Beta 3B A190M Supervised Demo
    Demo for supervised version of first real-scale Reactive Transformer model with 3B total params and 190M active in decoder.
    Work in progress - fixing generation errors
    ## Limitations
    Supervised version of the model is still in intermediate stage and will be further improved
    in Direct Memory and Preference Optimization (DMPO) stage (demo will be constantly updated).
    """)
    with gr.Row():
        chatbot = gr.Chatbot(height=600, label='RxT-Beta Chat')
    with gr.Row():
        msg = gr.Textbox(placeholder="Ask RxT...", label="Query", scale=4)
        thinking_mode = gr.Radio(choices=['auto', 'extended', 'fast'], label="Thinking Mode", value='auto')
        send_btn = gr.Button("Send", scale=1)
    with gr.Row():
        temp = gr.Slider(0.1, 2.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
        clear = gr.Button("Clear & Reset STM", scale=1)
    # Per-session short-term memory; each browser session starts from a clone
    # of the pristine initial state.
    stm_state = gr.State(initial_stm.clone())
    # Enter key and Send button trigger the same streaming handler; after the
    # stream finishes, the query textbox is cleared.
    msg.submit(chat, [msg, chatbot, stm_state, temp, top_p, thinking_mode], [chatbot, stm_state], queue=True).then(
        lambda: gr.update(value=""), outputs=msg
    )
    send_btn.click(chat, [msg, chatbot, stm_state, temp, top_p, thinking_mode], [chatbot, stm_state], queue=True).then(
        lambda: gr.update(value=""), outputs=msg
    )
    # Reset wipes the visible chat and restores the initial STM snapshot.
    clear.click(lambda: ([], initial_stm.clone()), None, [chatbot, stm_state])
if __name__ == "__main__":
    # Queuing is required for streaming (generator) handlers; queue() returns
    # the Blocks instance, so the server launch can be chained directly.
    demo.queue().launch()