Safetensors
English
lm
12m
VDRONT
MishaGGG's picture
Upload use.py
462502d verified
# Import required libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# ---------------------------------------------------------------------------
# Runtime settings
# ---------------------------------------------------------------------------

# Directory of the local Hugging Face model folder to load.
MODEL_PATH = "VDrontV2-mini-fp16/"

# Sampling temperature handed to the chat loop (lower = more deterministic).
TEMPERATURE = 0.5

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
def load_model_and_tokenizer(model_path: str):
    """
    Read the tokenizer and causal-LM weights stored under ``model_path``.

    Args:
        model_path (str): Path to the local model directory.

    Returns:
        model, tokenizer: The loaded model followed by its tokenizer.
    """
    print(f"Loading model from {model_path}...")
    print(f"Using device: {DEVICE}")

    # Remote code execution stays disabled for both the tokenizer and the model.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=False)

    # Half precision is only requested when running on CUDA.
    if DEVICE == "cuda":
        weights_dtype = torch.float16
    else:
        weights_dtype = torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=weights_dtype,
        device_map="auto",  # let the library decide where the weights go
        trust_remote_code=False,
    )

    # Generation wants a pad token; reuse EOS when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Model and tokenizer loaded successfully!\n")
    return model, tokenizer
def generate_response(model, tokenizer, prompt: str, temperature: float = 0.4, max_new_tokens: int = 256):
    """
    Generate a response from the model given a user prompt.

    Args:
        model: The loaded language model (must provide ``device`` and ``generate``).
        tokenizer: The tokenizer used for encoding the prompt and decoding the output.
        prompt (str): User input string; it is truncated to 2048 tokens when encoded.
        temperature (float): Sampling temperature (higher = more random).
            A value of 0 or below disables sampling and uses greedy decoding
            instead of passing an invalid temperature to ``generate``.
        max_new_tokens (int): Maximum number of new tokens to generate.

    Returns:
        str: Model's generated response (without the input prompt), stripped
        of leading and trailing whitespace.
    """
    # Encode the prompt and move every tensor to the device the model lives on.
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Fall back to EOS for padding so this function also works with a
    # tokenizer whose pad token was never configured.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": 1.1,  # Slight penalty to avoid loops
        "pad_token_id": pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        # Sampling path: temperature and nucleus (top-p) filtering apply.
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.95)
    else:
        # A non-positive temperature is not a valid sampling setting, so
        # request deterministic greedy decoding and omit the sampling knobs.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():  # Disable gradient calculation for inference
        output_ids = model.generate(**inputs, **generation_kwargs)

    # Decode only the newly generated tokens (exclude the input prompt).
    input_length = inputs["input_ids"].shape[1]
    new_tokens = output_ids[0][input_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
def interactive_chat(model, tokenizer, temperature: float):
    """
    Run an interactive chat loop until the user leaves.

    The loop ends when the user types ``exit`` or ``quit`` (case-insensitive),
    presses Ctrl+C, or when standard input reaches end-of-file (Ctrl+D or an
    exhausted pipe). Any other exception raised while handling a turn is
    reported and the loop continues with the next prompt.

    Args:
        model: Loaded language model.
        tokenizer: Tokenizer for the model.
        temperature (float): Sampling temperature for generation.
    """
    print("=" * 60)
    print("🤖 Chat with VDrone V2 Mini2 (Temperature = {})".format(temperature))
    print("Type 'exit', 'quit', or press Ctrl+C to stop the conversation.")
    print("=" * 60)
    while True:
        try:
            # Get user input
            user_input = input("\n👤 You: ").strip()
            # Exit conditions
            if user_input.lower() in ("exit", "quit"):
                print("🤖 Assistant: Goodbye! Have a great day!")
                break
            # Skip empty inputs
            if not user_input:
                continue
            # Generate and print model response
            response = generate_response(model, tokenizer, user_input, temperature=temperature)
            print(f"🤖 Assistant: {response}")
        except (KeyboardInterrupt, EOFError):
            # EOFError must end the loop too: once stdin is exhausted every
            # further input() call raises it again, so "report and continue"
            # would spin forever.
            print("\n\n🤖 Assistant: Conversation interrupted. Goodbye!")
            break
        except Exception as e:
            # Deliberate best-effort: a failed turn should not end the chat.
            print(f"⚠️ Error occurred: {e}")
            print("Continuing chat...")
# Main entry point
def main():
    """Load the model from MODEL_PATH and start the interactive chat."""
    # Load model and tokenizer from the configured folder
    model, tokenizer = load_model_and_tokenizer(MODEL_PATH)
    # Start the interactive chat using the configured TEMPERATURE constant
    interactive_chat(model, tokenizer, temperature=TEMPERATURE)


if __name__ == "__main__":
    main()