Safetensors
English
lm
12m
VDRONT
File size: 4,756 Bytes
462502d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Import required libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Script-wide settings
# Directory holding the local Hugging Face model files.
MODEL_PATH = "VDrontV2-mini-fp16/"
# Sampling temperature handed to the chat loop (lower = more deterministic).
TEMPERATURE = 0.5
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"


def load_model_and_tokenizer(model_path: str):
    """Load a causal language model and its tokenizer from a local directory.

    Progress messages are printed to stdout before and after loading.

    Args:
        model_path (str): Path to the local model directory.

    Returns:
        tuple: ``(model, tokenizer)``, in that order.
    """
    print(f"Loading model from {model_path}...")
    print(f"Using device: {DEVICE}")

    # Remote code execution is disabled for both tokenizer and model.
    tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=False)

    # Half precision when DEVICE is "cuda", full precision otherwise.
    if DEVICE == "cuda":
        weights_dtype = torch.float16
    else:
        weights_dtype = torch.float32

    # device_map="auto" leaves weight placement to the library.
    lm = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=weights_dtype,
        device_map="auto",
        trust_remote_code=False,
    )

    # Fall back to the EOS token for padding when the tokenizer defines none.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    print("Model and tokenizer loaded successfully!\n")
    return lm, tok


def generate_response(model, tokenizer, prompt: str, temperature: float = 0.4, max_new_tokens: int = 256):
    """Generate a response from the model given a user prompt.

    Args:
        model: The loaded language model (must expose ``device`` and
            ``generate``).
        tokenizer: The tokenizer used to encode the prompt and decode the
            output.
        prompt (str): User input string. It is truncated to at most 2048
            tokens before generation.
        temperature (float): Sampling temperature (higher = more random).
            ``0`` selects deterministic greedy decoding instead of sampling.
        max_new_tokens (int): Maximum number of new tokens to generate.

    Returns:
        str: The generated continuation only (the prompt tokens are removed),
        with surrounding whitespace stripped.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    # Encode the prompt to input IDs
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)

    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Arguments shared by both decoding strategies
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": 1.1,  # Slight penalty to avoid loops
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        # Sampling: temperature and nucleus (top-p) filtering only apply here
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.95)
    else:
        # Sampling requires a strictly positive temperature, so 0 is mapped to
        # greedy decoding and the sampling-only arguments are left out.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():  # Disable gradient calculation for inference
        output_ids = model.generate(**inputs, **generation_kwargs)

    # Decode only the newly generated tokens (exclude input prompt)
    input_length = inputs["input_ids"].shape[1]
    new_tokens = output_ids[0][input_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return response.strip()


def interactive_chat(model, tokenizer, temperature: float):
    """Run an interactive chat loop on stdin/stdout.

    The loop ends when the user types ``exit`` or ``quit`` (case-insensitive),
    presses Ctrl+C, or when stdin reaches end-of-file (Ctrl+D or exhausted
    piped input). Empty lines are ignored. Any other exception raised while
    handling a turn is reported and the loop continues.

    Args:
        model: Loaded language model.
        tokenizer: Tokenizer for the model.
        temperature (float): Sampling temperature for generation.
    """
    print("=" * 60)
    print("🤖 Chat with VDrone V2 Mini2 (Temperature = {})".format(temperature))
    print("Type 'exit', 'quit', or press Ctrl+C to stop the conversation.")
    print("=" * 60)

    while True:
        try:
            # Get user input
            user_input = input("\n👤 You: ").strip()

            # Exit conditions
            if user_input.lower() in ["exit", "quit"]:
                print("🤖 Assistant: Goodbye! Have a great day!")
                break

            # Skip empty inputs
            if not user_input:
                continue

            # Generate and print model response
            response = generate_response(model, tokenizer, user_input, temperature=temperature)
            print(f"🤖 Assistant: {response}")

        except (KeyboardInterrupt, EOFError):
            # EOFError must end the loop too: once stdin is closed, input()
            # raises it on every call, so the generic handler below would
            # otherwise spin forever.
            print("\n\n🤖 Assistant: Conversation interrupted. Goodbye!")
            break
        except Exception as e:
            # Best-effort: report the failure and keep the session alive.
            print(f"⚠️ Error occurred: {e}")
            print("Continuing chat...")


# Main entry point: runs only when the file is executed as a script.
if __name__ == "__main__":
    # Load model and tokenizer from the folder given by MODEL_PATH
    model, tokenizer = load_model_and_tokenizer(MODEL_PATH)

    # Start the interactive chat using the configured TEMPERATURE constant
    interactive_chat(model, tokenizer, temperature=TEMPERATURE)