# Import required libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Configuration
MODEL_PATH = "VDrontV2-mini-fp16/"  # Path to your local HF model folder
TEMPERATURE = 0.5  # Sampling temperature (lower = more deterministic)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # Use GPU if available


def load_model_and_tokenizer(model_path: str):
    """
    Load the pretrained model and tokenizer from the local folder.

    Args:
        model_path (str): Path to the local model directory.

    Returns:
        model, tokenizer: Loaded model and tokenizer instances.
    """
    print(f"Loading model from {model_path}...")
    print(f"Using device: {DEVICE}")

    # Load tokenizer (trust_remote_code=False so no code shipped with the
    # checkpoint is executed)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=False)

    # Load model with automatic device mapping; half precision only on GPU,
    # full precision on CPU
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
        device_map="auto",
        trust_remote_code=False,
    )

    # Set pad token if not already set (generate() is given pad_token_id below)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Model and tokenizer loaded successfully!\n")
    return model, tokenizer


def generate_response(
    model,
    tokenizer,
    prompt: str,
    temperature: float = 0.4,
    max_new_tokens: int = 256,
) -> str:
    """
    Generate a response from the model given a user prompt.

    Args:
        model: The loaded language model.
        tokenizer: The tokenizer for encoding/decoding text.
        prompt (str): User input string.
        temperature (float): Sampling temperature (higher = more random).
            A value <= 0 switches to greedy (non-sampling) decoding, because
            sampling requires a strictly positive temperature.
        max_new_tokens (int): Maximum number of new tokens to generate.

    Returns:
        str: Model's generated response (without the input prompt),
        stripped of leading/trailing whitespace.
    """
    # Encode the prompt to input IDs (prompts longer than 2048 tokens are truncated)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)

    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": 1.1,  # Slight penalty to avoid loops
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        # Sampling must be enabled for temperature / top_p to take effect
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=0.95,  # Nucleus sampling for diversity
        )
    else:
        # Sampling with a non-positive temperature is rejected by generate();
        # fall back to deterministic greedy decoding instead of failing.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():  # Disable gradient calculation for inference
        output_ids = model.generate(**inputs, **generation_kwargs)

    # Decode only the newly generated tokens (exclude input prompt)
    input_length = inputs["input_ids"].shape[1]
    new_tokens = output_ids[0][input_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response.strip()


def interactive_chat(model, tokenizer, temperature: float) -> None:
    """
    Run an interactive chat loop until the user exits.

    The loop ends when the user types 'exit' or 'quit', presses Ctrl+C, or
    when standard input is closed (EOF). Each prompt is answered
    independently; no conversation history is passed to the model.

    Args:
        model: Loaded language model.
        tokenizer: Tokenizer for the model.
        temperature (float): Sampling temperature for generation.
    """
    print("=" * 60)
    print(f"🤖 Chat with VDrone V2 Mini2 (Temperature = {temperature})")
    print("Type 'exit', 'quit', or press Ctrl+C to stop the conversation.")
    print("=" * 60)

    while True:
        try:
            # Get user input
            user_input = input("\n👤 You: ").strip()

            # Exit conditions
            if user_input.lower() in ("exit", "quit"):
                print("🤖 Assistant: Goodbye! Have a great day!")
                break

            # Skip empty inputs
            if not user_input:
                continue

            # Generate and print model response
            response = generate_response(
                model, tokenizer, user_input, temperature=temperature
            )
            print(f"🤖 Assistant: {response}")
        except (KeyboardInterrupt, EOFError):
            # EOFError must be handled here: input() raises it on every call
            # once stdin is closed, so letting the generic handler below
            # "continue" would spin forever.
            print("\n\n🤖 Assistant: Conversation interrupted. Goodbye!")
            break
        except Exception as e:
            # Best-effort: report the failure and keep the session alive
            print(f"⚠️ Error occurred: {e}")
            print("Continuing chat...")


# Main entry point
if __name__ == "__main__":
    # Load model and tokenizer from the specified folder
    model, tokenizer = load_model_and_tokenizer(MODEL_PATH)

    # Start the interactive chat with the configured TEMPERATURE
    interactive_chat(model, tokenizer, temperature=TEMPERATURE)