# Source listing metadata: file size 4,756 bytes, revision 462502d.
# Import required libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Folder holding the local Hugging Face model files.
MODEL_PATH = "VDrontV2-mini-fp16/"

# Sampling temperature (lower = more deterministic).
TEMPERATURE = 0.5

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
def load_model_and_tokenizer(model_path: str):
    """
    Load the tokenizer and the causal language model stored in a local folder.

    Args:
        model_path (str): Path to the local model directory.

    Returns:
        model, tokenizer: The loaded model and tokenizer, in that order.
    """
    print(f"Loading model from {model_path}...")
    print(f"Using device: {DEVICE}")

    # Tokenizer first; remote code execution stays disabled for both loaders.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=False)

    # Half precision when running on the GPU, full precision on the CPU.
    if DEVICE == "cuda":
        weights_dtype = torch.float16
    else:
        weights_dtype = torch.float32

    # device_map="auto" requests automatic device mapping from the loader.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=weights_dtype,
        device_map="auto",
        trust_remote_code=False,
    )

    # Fall back to the EOS token when the tokenizer defines no pad token, so
    # generation always has a pad token available.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Model and tokenizer loaded successfully!\n")
    return model, tokenizer
def generate_response(model, tokenizer, prompt: str, temperature: float = 0.4, max_new_tokens: int = 256):
    """
    Generate a response from the model given a user prompt.

    Args:
        model: The loaded language model; must expose ``device`` and ``generate``.
        tokenizer: The tokenizer used to encode the prompt and decode the output.
        prompt (str): User input string. It is truncated to at most 2048 tokens.
        temperature (float): Sampling temperature (higher = more random).
            A value of 0 or below disables sampling, so decoding is
            deterministic instead of failing on an invalid temperature.
        max_new_tokens (int): Maximum number of new tokens to generate.

    Returns:
        str: The decoded, whitespace-stripped continuation, without the
        input prompt.
    """
    # Encode the prompt and move every tensor to the model's device.
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Arguments shared by both decoding modes.
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": 1.1,  # Slight penalty to avoid loops
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        # Sampling mode: temperature plus nucleus (top-p) filtering.
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.95)
    else:
        # Sampling needs a strictly positive temperature, so a non-positive
        # value selects non-sampling decoding; the sampling-only arguments
        # are left out because they do not apply there.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():  # Inference only, no gradients needed
        output_ids = model.generate(**encoded, **generation_kwargs)

    # The output starts with the prompt tokens; keep only what follows them.
    prompt_length = encoded["input_ids"].shape[1]
    new_tokens = output_ids[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
def interactive_chat(model, tokenizer, temperature: float):
    """
    Run an interactive chat loop until the user leaves.

    The loop ends when the user types 'exit' or 'quit', presses Ctrl+C, or
    when standard input reaches end-of-file (Ctrl+D, or piped input running
    out). Any other error raised while handling a turn is reported and the
    loop continues with the next prompt.

    Args:
        model: Loaded language model.
        tokenizer: Tokenizer for the model.
        temperature (float): Sampling temperature for generation.
    """
    print("=" * 60)
    print(f"🤖 Chat with VDrone V2 Mini2 (Temperature = {temperature})")
    print("Type 'exit', 'quit', or press Ctrl+C to stop the conversation.")
    print("=" * 60)

    while True:
        try:
            user_input = input("\n👤 You: ").strip()

            # Exit conditions
            if user_input.lower() in ("exit", "quit"):
                print("🤖 Assistant: Goodbye! Have a great day!")
                break

            # Skip empty inputs
            if not user_input:
                continue

            # Generate and print model response
            response = generate_response(model, tokenizer, user_input, temperature=temperature)
            print(f"🤖 Assistant: {response}")
        except (KeyboardInterrupt, EOFError):
            # EOFError must end the loop too: once stdin is exhausted every
            # further input() call raises again, so treating it as a
            # recoverable error would spin forever.
            print("\n\n🤖 Assistant: Conversation interrupted. Goodbye!")
            break
        except Exception as e:
            # Boundary of the interactive session: report and keep chatting
            # instead of losing the loaded model over one failed turn.
            print(f"⚠️ Error occurred: {e}")
            print("Continuing chat...")
# Main entry point
if __name__ == "__main__":
    # Load model and tokenizer from the folder configured in MODEL_PATH
    model, tokenizer = load_model_and_tokenizer(MODEL_PATH)
    # Start the interactive chat with the temperature configured in TEMPERATURE
    interactive_chat(model, tokenizer, temperature=TEMPERATURE)