# Hugging Face Space demo app (page-status header removed from scrape)
| import gradio as gr | |
| import torch | |
| import sys | |
| import os | |
| import pickle | |
| from huggingface_hub import hf_hub_download | |
| sys.path.insert(0, os.path.dirname(__file__)) | |
| from nanochat.gpt import GPT, GPTConfig | |
print("Downloading model files...")
# All artifacts live in one HF Hub repo; hoist the id so it is written once.
_REPO_ID = "TMVishnu/nanochat-distill-d12-int8"
model_path = hf_hub_download(repo_id=_REPO_ID, filename="model.pt")
tokenizer_pkl_path = hf_hub_download(repo_id=_REPO_ID, filename="tokenizer.pkl")
token_bytes_path = hf_hub_download(repo_id=_REPO_ID, filename="token_bytes.pt")

device = "cpu"
print(f"Loading model on {device}")
# NOTE(review): weights_only=False deserializes arbitrary pickled objects;
# acceptable here only because the checkpoint comes from a known repo.
checkpoint = torch.load(model_path, map_location=device, weights_only=False)
quantized_weights = checkpoint["quantized_weights"]
scales = checkpoint["scales"]
config_dict = checkpoint["config"]
bits = checkpoint["bits"]

print(f"Dequantizing INT{bits} weights...")
# Rebuild a float state dict: tensors with a recorded scale are dequantized
# back to float; anything without a scale is passed through untouched.
model_state = {
    key: qweight.float() * scales[key] if key in scales else qweight
    for key, qweight in quantized_weights.items()
}

config = GPTConfig(**config_dict)
model = GPT(config)
# strict=False: checkpoint may omit buffers that the module re-creates itself.
model.load_state_dict(model_state, strict=False)
model.eval()
model.to(device)

with open(tokenizer_pkl_path, "rb") as f:
    tokenizer = pickle.load(f)
print("Model loaded successfully")
def generate_text(prompt, max_tokens=50, temperature=0.8):
    """Autoregressively sample a continuation of `prompt` from the model.

    Args:
        prompt: Input text to condition on.
        max_tokens: Number of new tokens to sample. Gradio sliders may pass
            this as a float, so it is cast to int before use (the original
            `range(max_tokens)` raised TypeError on a float).
        temperature: Softmax temperature; the UI slider keeps it >= 0.1,
            so no divide-by-zero guard is needed here.

    Returns:
        The prompt plus the generated continuation, or an error string
        (prefixed "Error:") with a traceback if anything goes wrong.
    """
    try:
        tokens = tokenizer.encode(prompt)
        # Guard: an empty token sequence would make torch.multinomial fail
        # with a cryptic shape error; report something actionable instead.
        if not tokens:
            return "Error: prompt produced no tokens; please enter some text."
        x = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)
        with torch.no_grad():
            for _ in range(int(max_tokens)):
                logits = model(x)
                # Keep only the last position's logits and apply temperature.
                logits = logits[:, -1, :] / temperature
                probs = torch.nn.functional.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
                x = torch.cat([x, next_token], dim=1)
        output_tokens = x[0].tolist()
        output = tokenizer.decode(output_tokens)
        return output
    except Exception as e:
        # Surface the full traceback in the UI textbox for easy debugging.
        import traceback
        return f"Error: {str(e)}\n\n{traceback.format_exc()}"
# Gradio UI: a single text-generation form — prompt plus sampling controls.
_prompt_box = gr.Textbox(label="Prompt", placeholder="Enter your prompt", lines=3)
_max_tokens_slider = gr.Slider(minimum=10, maximum=200, value=50, step=1, label="Max Tokens")
_temperature_slider = gr.Slider(minimum=0.1, maximum=2.0, value=0.8, step=0.1, label="Temperature")
_output_box = gr.Textbox(label="Generated Text", lines=10)

demo = gr.Interface(
    fn=generate_text,
    inputs=[_prompt_box, _max_tokens_slider, _temperature_slider],
    outputs=_output_box,
    title="NanoChat Distilled Model INT8",
    description="375M parameter student with MQA and INT8 quantization. Warning: Output quality is limited by undertrained teacher.",
    examples=[
        ["What is the capital of France?", 50, 0.7],
        ["Explain machine learning in simple terms", 100, 0.8],
        ["Write a haiku about coding", 50, 1.0],
    ],
)
demo.launch()