|
|
| import torch
|
| from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
|
# Local directory handed to load_model_and_tokenizer() when run as a script.
MODEL_PATH = "VDrontV2-mini-fp16/"

# Sampling temperature used for the interactive chat session.
TEMPERATURE = 0.5

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
|
|
|
|
|
def load_model_and_tokenizer(model_path: str):
    """
    Load a causal language model and its tokenizer from a local directory.

    Progress messages are printed to stdout. If the tokenizer defines no
    padding token, its end-of-sequence token is reused for padding.

    Args:
        model_path (str): Directory containing the saved model and tokenizer.

    Returns:
        tuple: ``(model, tokenizer)`` ready to be used for generation.
    """
    print(f"Loading model from {model_path}...")
    print(f"Using device: {DEVICE}")

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=False)

    # Half precision is requested only when running on CUDA; on CPU the
    # weights are loaded as float32.
    if DEVICE == "cuda":
        weights_dtype = torch.float16
    else:
        weights_dtype = torch.float32

    load_options = {
        "torch_dtype": weights_dtype,
        # Weight placement is delegated to the library.
        "device_map": "auto",
        # Never run custom modelling code shipped alongside the checkpoint.
        "trust_remote_code": False,
    }
    model = AutoModelForCausalLM.from_pretrained(model_path, **load_options)

    # generate_response() passes pad_token_id to generate(), so make sure a
    # padding token exists by falling back to the end-of-sequence token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Model and tokenizer loaded successfully!\n")
    return model, tokenizer
|
|
|
|
|
def generate_response(
    model,
    tokenizer,
    prompt: str,
    temperature: float = 0.4,
    max_new_tokens: int = 256,
    max_input_tokens: int = 2048,
):
    """
    Generate a response from the model given a user prompt.

    The prompt is tokenized as-is (no chat template or history is applied),
    truncated to ``max_input_tokens``, and moved to the model's device.
    Only the newly generated tokens are decoded and returned.

    Args:
        model: The loaded language model (must expose ``device`` and
            ``generate``).
        tokenizer: The tokenizer for encoding/decoding text.
        prompt (str): User input string.
        temperature (float): Sampling temperature (higher = more random).
            ``0`` selects deterministic greedy decoding instead of sampling.
        max_new_tokens (int): Maximum number of new tokens to generate.
        max_input_tokens (int): Maximum number of prompt tokens kept after
            truncation.

    Returns:
        str: Model's generated response (without the input prompt), with
        surrounding whitespace stripped.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature is not None and temperature < 0:
        raise ValueError(
            f"temperature must be >= 0, got {temperature!r}"
        )

    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    # Inputs must live on the same device as the model.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": 1.1,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature == 0:
        # Sampling with a zero temperature is rejected by generate(); fall
        # back to greedy decoding and omit the sampling-only options.
        generation_kwargs["do_sample"] = False
    else:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = 0.95

    # Inference only: gradients are not needed.
    with torch.no_grad():
        output_ids = model.generate(**encoded, **generation_kwargs)

    # generate() returns prompt + continuation; drop the prompt tokens so
    # only the newly generated part is decoded.
    prompt_length = encoded["input_ids"].shape[1]
    new_tokens = output_ids[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return response.strip()
|
|
|
|
|
def interactive_chat(model, tokenizer, temperature: float):
    """
    Run an interactive chat loop on stdin/stdout until the user leaves.

    Each non-empty line is sent to ``generate_response`` on its own; no
    conversation history is passed between turns. The loop ends when the
    user types ``exit`` or ``quit`` (case-insensitive), presses Ctrl+C, or
    when stdin reaches end-of-file (Ctrl+D or an exhausted pipe). Any other
    error is reported and the chat continues with the next prompt.

    Args:
        model: Loaded language model.
        tokenizer: Tokenizer for the model.
        temperature (float): Sampling temperature for generation.
    """
    separator = "=" * 60
    print(separator)
    print(f"🤖 Chat with VDrone V2 Mini2 (Temperature = {temperature})")
    print("Type 'exit', 'quit', or press Ctrl+C to stop the conversation.")
    print(separator)

    while True:
        try:
            user_input = input("\n👤 You: ").strip()

            if user_input.lower() in ("exit", "quit"):
                print("🤖 Assistant: Goodbye! Have a great day!")
                break

            # Ignore blank lines instead of sending an empty prompt.
            if not user_input:
                continue

            response = generate_response(model, tokenizer, user_input, temperature=temperature)
            print(f"🤖 Assistant: {response}")

        except (KeyboardInterrupt, EOFError):
            # EOFError means stdin is closed: every later input() call would
            # fail the same way, so it must end the loop rather than fall
            # into the generic handler below and retry forever.
            print("\n\n🤖 Assistant: Conversation interrupted. Goodbye!")
            break
        except Exception as e:
            # Keep the session alive when a single turn fails.
            print(f"⚠️ Error occurred: {e}")
            print("Continuing chat...")
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: load the model once, then start the chat loop
    # with the module-level sampling temperature.
    chat_model, chat_tokenizer = load_model_and_tokenizer(MODEL_PATH)
    interactive_chat(chat_model, chat_tokenizer, temperature=TEMPERATURE)