import importlib.util
import os

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# ---- Safety: prevent VRAM fragmentation ----
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# The Space targets a single 24 GB GPU, so use the 32B distill of R1;
# the full deepseek-ai/DeepSeek-R1 (671B MoE) cannot fit in 24 GB even at 4-bit.
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
# ---- HARD GPU CHECK ----
if not torch.cuda.is_available():
    raise RuntimeError("❌ GPU not detected. Please enable GPU hardware in HF Space settings.")
print("✅ Using GPU:", torch.cuda.get_device_name(0))
# ---- 4bit quant config (24GB optimized) ----
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
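# Rough budget (an estimate, not a measurement): 4-bit weights are ~0.5 bytes
# per parameter, so a 32B model needs ~16 GB for weights, leaving headroom on a
# 24 GB card for activations and the KV cache.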
# ---- Load tokenizer ----
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)
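# Some checkpoints ship without a pad token; fall back to EOS so that padding
# (e.g. for batched tokenization) does not fail.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token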
# ---- Load model ----
# flash_attention_2 needs the flash-attn package installed in the Space;
# fall back to PyTorch's SDPA attention when it is missing.
attn_impl = "flash_attention_2" if importlib.util.find_spec("flash_attn") else "sdpa"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation=attn_impl,
)
model.eval()
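# HF models expose get_memory_footprint(); print it to confirm the 4-bit load
# actually fits the 24 GB budget.
print(f"Model footprint: {model.get_memory_footprint() / 1024**3:.1f} GB")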
# ---- Chat Function ----
def chat_fn(message, history):
    # ChatInterface passes history as [user, assistant] pairs (tuple format).
    messages = []
    for user, bot in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": bot})
    messages.append({"role": "user", "content": message})

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,  # temperature/top_p are ignored without sampling
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    response = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )
    return response
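# ---- Optional: streaming variant (a sketch, not wired into the UI) ----
# Uses transformers' TextIteratorStreamer: generation runs in a background
# thread while the streamer yields decoded text chunks. Swap this in as the
# ChatInterface fn below to stream tokens instead of returning one blob.
def chat_fn_stream(message, history):
    from threading import Thread
    from transformers import TextIteratorStreamer

    messages = []
    for user, bot in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": bot})
    messages.append({"role": "user", "content": message})
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=True,
        return_tensors="pt", return_dict=True,
    ).to(model.device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate, kwargs=dict(
        **inputs, streamer=streamer, max_new_tokens=512,
        do_sample=True, temperature=0.7, top_p=0.9, repetition_penalty=1.1,
    )).start()

    # Accumulate chunks and yield the growing partial response for the UI.
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial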
# ---- Gradio UI ----
demo = gr.ChatInterface(
    fn=chat_fn,
    title="DeepSeek-R1 32B (4bit) - 24GB GPU",
    chatbot=gr.Chatbot(height=500),
)
demo.queue()
demo.launch()