"""
Helion-V1.5-XL Inference Script
Supports multiple inference modes and optimization techniques
"""
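
# Example CLI usage (the script filename is illustrative; the flags are defined in main() below):
#   python inference.py --prompt "Explain attention in one paragraph" --max-tokens 256
#   python inference.py --prompt "Hello!" --chat-mode --load-in-4bit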

import argparse
import time
from typing import Dict, List

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GenerationConfig
)


class HelionInference:
    """Inference wrapper for Helion-V1.5-XL"""

    def __init__(
        self,
        model_name: str = "DeepXR/Helion-V1.5-XL",
        load_in_4bit: bool = False,
        load_in_8bit: bool = False,
        device_map: str = "auto",
        torch_dtype: str = "bfloat16"
    ):
        """
        Initialize the model and tokenizer

        Args:
            model_name: HuggingFace model identifier
            load_in_4bit: Enable 4-bit quantization
            load_in_8bit: Enable 8-bit quantization
            device_map: Device mapping strategy
            torch_dtype: PyTorch dtype for model weights
        """
        self.model_name = model_name
        print(f"Loading model: {model_name}")

        # Map the dtype string to the corresponding torch dtype.
        dtype_map = {
            "bfloat16": torch.bfloat16,
            "float16": torch.float16,
            "float32": torch.float32
        }
        torch_dtype = dtype_map.get(torch_dtype, torch.bfloat16)

        # Optional quantization (requires the bitsandbytes package).
        quantization_config = None
        if load_in_4bit:
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch_dtype,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4"
            )
        elif load_in_8bit:
            quantization_config = BitsAndBytesConfig(load_in_8bit=True)

        # Load the tokenizer.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True
        )
        # Many causal LM tokenizers ship without a pad token; fall back to EOS
        # so that padding and generation work without errors.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load the model, quantized if a quantization config was built above.
        model_kwargs = {
            "device_map": device_map,
            "trust_remote_code": True,
        }

        if quantization_config:
            model_kwargs["quantization_config"] = quantization_config
        else:
            model_kwargs["torch_dtype"] = torch_dtype

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            **model_kwargs
        )

        self.model.eval()
        print("Model loaded successfully!")

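    # Hypothetical instantiation sketch (4-bit loading is optional and requires bitsandbytes):
    #   engine = HelionInference(load_in_4bit=True)
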
    def generate(
        self,
        prompt: str,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9,
        top_k: int = 50,
        repetition_penalty: float = 1.1,
        do_sample: bool = True,
        num_return_sequences: int = 1,
        **kwargs
    ) -> List[str]:
        """
        Generate text from a prompt

        Args:
            prompt: Input text prompt
            max_new_tokens: Maximum number of tokens to generate
            temperature: Sampling temperature (must be > 0 when sampling; higher is more random)
            top_p: Nucleus sampling threshold
            top_k: Top-k sampling threshold
            repetition_penalty: Penalty for repetition
            do_sample: Whether to use sampling
            num_return_sequences: Number of sequences to generate

        Returns:
            List of generated text strings
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        generation_config = GenerationConfig(
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            do_sample=do_sample,
            num_return_sequences=num_return_sequences,
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            **kwargs
        )

        start_time = time.time()

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                generation_config=generation_config
            )

        generation_time = time.time() - start_time

        # Decode only the newly generated tokens, skipping the echoed prompt.
        prompt_length = inputs["input_ids"].shape[-1]
        responses = []
        for output in outputs:
            response = self.tokenizer.decode(
                output[prompt_length:], skip_special_tokens=True
            ).strip()
            responses.append(response)

        # Report throughput over newly generated tokens only.
        new_tokens = sum(len(output) - prompt_length for output in outputs)
        tokens_per_sec = new_tokens / max(generation_time, 1e-6)

        print("\nGeneration Stats:")
        print(f"  Time: {generation_time:.2f}s")
        print(f"  Tokens/sec: {tokens_per_sec:.2f}")

        return responses

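    # Sketch of a direct generate() call (hypothetical `engine` instance):
    #   outputs = engine.generate("Write a haiku about the sea", max_new_tokens=64)
    #   print(outputs[0])
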
    def chat(
        self,
        messages: List[Dict[str, str]],
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        **kwargs
    ) -> str:
        """
        Generate response in chat format

        Args:
            messages: List of message dicts with 'role' and 'content'
            max_new_tokens: Maximum tokens to generate
            temperature: Sampling temperature

        Returns:
            Generated response string
        """
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        responses = self.generate(
            prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            **kwargs
        )

        return responses[0]

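    # Sketch of a chat() call (hypothetical `engine`; accepted roles depend on the model's chat template):
    #   reply = engine.chat(
    #       [{"role": "user", "content": "Summarize RLHF in two sentences."}],
    #       max_new_tokens=128
    #   )
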
    def batch_generate(
        self,
        prompts: List[str],
        max_new_tokens: int = 512,
        **kwargs
    ) -> List[str]:
        """
        Generate responses for multiple prompts in a single batch

        Args:
            prompts: List of input prompts
            max_new_tokens: Maximum tokens per generation

        Returns:
            List of generated responses
        """
        # Left-pad so generated tokens stay contiguous with each prompt
        # (needed for correct batched generation with decoder-only models).
        self.tokenizer.padding_side = "left"
        inputs = self.tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(self.model.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                pad_token_id=self.tokenizer.pad_token_id,
                **kwargs
            )

        # Decode only the tokens generated after the padded prompt block.
        prompt_length = inputs["input_ids"].shape[-1]
        responses = []
        for output in outputs:
            response = self.tokenizer.decode(
                output[prompt_length:], skip_special_tokens=True
            ).strip()
            responses.append(response)

        return responses

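# Sketch of batched programmatic usage (hypothetical prompts; assumes sufficient GPU memory):
#   engine = HelionInference(load_in_4bit=True)
#   print(engine.batch_generate(["Define entropy.", "Define enthalpy."], max_new_tokens=64))
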

def main():
    parser = argparse.ArgumentParser(description="Helion-V1.5-XL Inference")
    parser.add_argument(
        "--model",
        type=str,
        default="DeepXR/Helion-V1.5-XL",
        help="Model name or path"
    )
    parser.add_argument(
        "--prompt",
        type=str,
        required=True,
        help="Input prompt"
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=512,
        help="Maximum tokens to generate"
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.7,
        help="Sampling temperature"
    )
    parser.add_argument(
        "--top-p",
        type=float,
        default=0.9,
        help="Nucleus sampling threshold"
    )
    parser.add_argument(
        "--load-in-4bit",
        action="store_true",
        help="Load model in 4-bit quantization"
    )
    parser.add_argument(
        "--load-in-8bit",
        action="store_true",
        help="Load model in 8-bit quantization"
    )
    parser.add_argument(
        "--chat-mode",
        action="store_true",
        help="Use chat format"
    )

    args = parser.parse_args()

    inference = HelionInference(
        model_name=args.model,
        load_in_4bit=args.load_in_4bit,
        load_in_8bit=args.load_in_8bit
    )

    if args.chat_mode:
        messages = [
            {"role": "user", "content": args.prompt}
        ]
        response = inference.chat(
            messages,
            max_new_tokens=args.max_tokens,
            temperature=args.temperature,
            top_p=args.top_p
        )
    else:
        responses = inference.generate(
            args.prompt,
            max_new_tokens=args.max_tokens,
            temperature=args.temperature,
            top_p=args.top_p
        )
        response = responses[0]

    print("\n" + "=" * 80)
    print("PROMPT:")
    print("=" * 80)
    print(args.prompt)
    print("\n" + "=" * 80)
    print("RESPONSE:")
    print("=" * 80)
    print(response)
    print("=" * 80)


if __name__ == "__main__":
    main()