# Import required libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Configuration
# Module-level settings read by the loader and by the script entry point.
# NOTE(review): the folder name is spelled "VDront" while the chat banner
# prints "VDrone" -- confirm the path matches the actual directory on disk.
MODEL_PATH = "VDrontV2-mini-fp16/"  # Path to your local HF model folder
TEMPERATURE = 0.5  # Sampling temperature (lower = more deterministic)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # Use GPU if available


def load_model_and_tokenizer(model_path: str):
    """
    Load a causal language model and its tokenizer from a local directory.

    Remote code execution is disabled for both loads. The weights are
    requested in float16 when the module-level DEVICE is "cuda" and in
    float32 otherwise; placement is delegated to ``device_map="auto"``.
    If the tokenizer defines no pad token, its EOS token is reused.

    Args:
        model_path (str): Path to the local model directory.

    Returns:
        model, tokenizer: Loaded model and tokenizer instances.
    """
    print(f"Loading model from {model_path}...")
    print(f"Using device: {DEVICE}")

    # Half precision only makes sense on the GPU; keep full precision on CPU.
    on_gpu = DEVICE == "cuda"
    model_kwargs = {
        "torch_dtype": torch.float16 if on_gpu else torch.float32,
        "device_map": "auto",
        "trust_remote_code": False,
    }

    # Tokenizer first, then the (much heavier) model weights.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=False)
    model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)

    # Generation needs a pad token id; fall back to EOS when none is defined.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Model and tokenizer loaded successfully!\n")
    return model, tokenizer


def generate_response(model, tokenizer, prompt: str, temperature: float = 0.4, max_new_tokens: int = 256):
    """
    Generate a response from the model given a user prompt.

    The prompt is tokenized (truncated to 2048 tokens), moved to the model's
    device and passed to ``model.generate``. Only the tokens produced after
    the prompt are decoded and returned.

    Args:
        model: The loaded language model.
        tokenizer: The tokenizer for encoding/decoding text.
        prompt (str): User input string.
        temperature (float): Sampling temperature (higher = more random).
            A value of exactly 0 selects deterministic greedy decoding
            instead of sampling.
        max_new_tokens (int): Maximum number of new tokens to generate.

    Returns:
        str: Model's generated response (without the input prompt),
        stripped of surrounding whitespace.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    # Encode the prompt and move every tensor to the same device as the model
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": 1.1,  # Slight penalty to avoid loops
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        # Sampling path: temperature and nucleus sampling only apply here.
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.95)
    else:
        # Sampling requires a strictly positive temperature, so treat 0 as
        # "fully deterministic" and fall back to greedy decoding.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():  # Disable gradient calculation for inference
        output_ids = model.generate(**inputs, **generation_kwargs)

    # Decode only the newly generated tokens (exclude input prompt)
    input_length = inputs["input_ids"].shape[1]
    new_tokens = output_ids[0][input_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return response.strip()


def interactive_chat(model, tokenizer, temperature: float):
    """
    Run an interactive chat loop until the user asks to stop.

    Each non-empty line read from standard input is sent to
    ``generate_response`` and the reply is printed. The loop ends when the
    user types 'exit' or 'quit' (case-insensitive), presses Ctrl+C, or when
    standard input reaches end-of-file (Ctrl+D / exhausted piped input).
    Any other exception is reported and the chat continues.

    Args:
        model: Loaded language model.
        tokenizer: Tokenizer for the model.
        temperature (float): Sampling temperature for generation.
    """
    print("=" * 60)
    print("🤖 Chat with VDrone V2 Mini2 (Temperature = {})".format(temperature))
    print("Type 'exit', 'quit', or press Ctrl+C to stop the conversation.")
    print("=" * 60)

    while True:
        try:
            # Get user input
            user_input = input("\n👤 You: ").strip()

            # Exit conditions
            if user_input.lower() in ("exit", "quit"):
                print("🤖 Assistant: Goodbye! Have a great day!")
                break

            # Skip empty inputs
            if not user_input:
                continue

            # Generate and print model response
            response = generate_response(model, tokenizer, user_input, temperature=temperature)
            print(f"🤖 Assistant: {response}")

        except (KeyboardInterrupt, EOFError):
            # EOFError must end the loop too: once stdin is exhausted every
            # further input() call raises again, so "continuing" would spin
            # forever printing errors.
            print("\n\n🤖 Assistant: Conversation interrupted. Goodbye!")
            break
        except Exception as e:
            # Best-effort: a failed generation should not kill the session.
            print(f"⚠️ Error occurred: {e}")
            print("Continuing chat...")


# Main entry point
if __name__ == "__main__":
    # Load the model/tokenizer pair once from the configured local folder
    loaded_model, loaded_tokenizer = load_model_and_tokenizer(MODEL_PATH)

    # Hand both to the chat loop, sampling at the module-level TEMPERATURE
    interactive_chat(loaded_model, loaded_tokenizer, TEMPERATURE)