import os
import torch
import gradio as gr
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
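
# meta-llama/Llama-2-7b-chat-hf is a gated repo, so the Space has to be
# authenticated before the weights can download. HF_TOKEN is an assumed
# secret name -- use whatever secret your Space actually defines.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)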

# Base checkpoint the fine-tune started from, kept for reference; the repo
# below is the merged result (adapter folded into the base weights, per the
# "(Merged)" title), so only it is loaded.
base_model = "meta-llama/Llama-2-7b-chat-hf"
adapter_model = "olacode55/zimble-llama2-finetunedhybride"

# Load the tokenizer from the fine-tuned repo so any tokens added during
# training are picked up.
tokenizer = AutoTokenizer.from_pretrained(adapter_model)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the merged model directly: fp16 on GPU, fp32 on CPU.
# device_map="auto" lets accelerate place the weights automatically.
model = AutoModelForCausalLM.from_pretrained(
    adapter_model,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
    low_cpu_mem_usage=True,
)

def generate(prompt):
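    # Llama-2 chat checkpoints expect the [INST] ... [/INST] template.
    # Whether the fine-tune kept that format is an assumption -- drop this
    # wrapping if it was trained on plain prompts.
    prompt = f"[INST] {prompt} [/INST]"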
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=250,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )
    # Skip the echoed prompt tokens and return only the newly generated text.
    return tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

demo = gr.Interface(
    fn=generate,
    inputs=gr.Textbox(label="Enter your prompt", lines=4, placeholder="Type something..."),
    outputs=gr.Textbox(label="Model output"),
    title="🦙 Zimble LLaMA 2 (Merged)",
    description="Fine-tuned and merged version of LLaMA 2 running in a Hugging Face Space.",
)

demo.launch()