# Lumen-Instruct / app.py
import gradio as gr
import torch
import json
import re
from tokenizers import Tokenizer
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from ModelArchitecture import Transformer, ModelConfig, generate
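# ModelArchitecture is assumed to be a local module shipped alongside app.py in this Space,
# providing the custom Transformer, its ModelConfig, and a sampling `generate` helper.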
# -----------------------------
# Load model and tokenizer
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
REPO_ID = "VirtualInsight/Lumen-Instruct"
# Download model files
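# (hf_hub_download caches the files locally, so repeated launches reuse the cached copies)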
model_path = hf_hub_download(repo_id=REPO_ID, filename="model.safetensors")
tokenizer_path = hf_hub_download(repo_id=REPO_ID, filename="tokenizer.json")
config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")
# Initialize tokenizer and model
tokenizer = Tokenizer.from_file(tokenizer_path)
with open(config_path) as f:
    config = ModelConfig(**json.load(f))
model = Transformer(config).to(device)
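# strict=False tolerates minor key mismatches (e.g. unexpected buffers); it assumes the
# checkpoint otherwise matches the architecture described by config.json.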
model.load_state_dict(load_file(model_path, device=str(device)), strict=False)
model.eval()
# -----------------------------
# Special Tokens
# -----------------------------
EOS_TOKEN = "<|im_end|>"
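# Assumes the tokenizer encodes <|im_end|> as a single special token, so its first id is the EOS id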
EOS_TOKEN_ID = tokenizer.encode(EOS_TOKEN).ids[0]
print(f"EOS token ID: {EOS_TOKEN_ID}")
# -----------------------------
# Generation Function
# -----------------------------
@torch.no_grad()
def generate_response(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
"""
Generates a clean assistant-only response, removing any echoed user text.
"""
# Chat-style prompt
formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
# Tokenize
input_ids = torch.tensor([tokenizer.encode(formatted_prompt).ids], dtype=torch.long, device=device)
    # Generate with top-k / nucleus sampling, stopping at the end-of-turn token
    output = generate(
        model,
        input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_k=50,
        top_p=top_p,
        do_sample=True,
        eos_token_id=EOS_TOKEN_ID,
    )
    # Decode the full generated sequence back to text
    full_text = tokenizer.decode(output[0].tolist())
    # Extract the assistant's section of the transcript
    if "<|im_start|>assistant" in full_text:
        response = full_text.split("<|im_start|>assistant")[-1]
        response = response.split("<|im_end|>")[0] if "<|im_end|>" in response else response
    else:
        response = full_text
    # Remove leftover role tokens and whitespace (strips from a bare "user"/"assistant"
    # word to the end of that line)
    response = re.sub(r"(?i)\buser\b.*", "", response)
    response = re.sub(r"(?i)\bassistant\b.*", "", response)
    response = response.strip()
    # 🧹 Final cleanup: remove a leading echo of the user prompt if present
    lines = [line.strip() for line in response.splitlines() if line.strip()]
    if len(lines) >= 2 and (
        lines[0].lower() == prompt.strip().lower()  # exact echo
        or lines[0].rstrip("!?.,").lower() == prompt.strip().rstrip("!?.,").lower()  # punctuation variation
        or len(lines[0].split()) <= 3  # very short echo like "Hello!"
    ):
        lines = lines[1:]  # drop the first echoed line
    clean_response = "\n".join(lines).strip()
    return clean_response
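# Optional local sanity check (hypothetical example; uncomment to try generation without the UI):
# print(generate_response("Hello, who are you?", max_tokens=50))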
# -----------------------------
# Gradio Interface
# -----------------------------
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(label="User Prompt", placeholder="Ask Lumen anything...", lines=3),
        gr.Slider(10, 500, value=200, label="Max Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, label="Top-p"),
    ],
    outputs=gr.Textbox(label="Lumen’s Response", lines=10),
    title="Lumen Instruct Model",
    description="Lumen Instruct — a fine-tuned, instruction-following language model built on the Lumen Foundational Model.",
)
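# Optional: demo.queue() enables Gradio's request queue, which can help if the Space sees concurrent traffic.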
# -----------------------------
# Launch
# -----------------------------
if __name__ == "__main__":
    demo.launch(share=True)