import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from peft import PeftModel
import threading
import os

# Check if CUDA is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_model_and_tokenizer(model_path="./final_model"):
    """Load the fine-tuned phi-2 model and tokenizer"""
    print(f"Loading fine-tuned model from {model_path}...")
    print(f"Using device: {device}")
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        "microsoft/phi-2", 
        trust_remote_code=True
    )
    # Use a pad token distinct from eos_token so padded positions get a meaningful
    # attention mask; fall back to eos_token if the tokenizer has no unk_token
    if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
        tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token
    
    # For CPU environments, we can't use 4-bit quantization
    if device.type == "cuda":
        # Use 4-bit quantization on GPU
        from transformers import BitsAndBytesConfig
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True
        )
        
        # Load base model with 4-bit quantization
        base_model = AutoModelForCausalLM.from_pretrained(
            "microsoft/phi-2",
            quantization_config=bnb_config,
            trust_remote_code=True,
            device_map="auto"
        )
    else:
        # bitsandbytes 8-bit loading generally requires a CUDA GPU, so on a
        # CPU-only machine this usually falls back to full precision below
        print("Loading on CPU - trying 8-bit quantization, falling back to full precision if unavailable")
        try:
            # Try 8-bit first (requires bitsandbytes)
            base_model = AutoModelForCausalLM.from_pretrained(
                "microsoft/phi-2",
                load_in_8bit=True,
                trust_remote_code=True,
                device_map="auto"
            )
        except Exception as e:
            # Fall back to full precision if 8-bit loading is unavailable
            print(f"8-bit loading failed ({e}), falling back to full precision (fp32)")
            base_model = AutoModelForCausalLM.from_pretrained(
                "microsoft/phi-2",
                trust_remote_code=True,
                torch_dtype=torch.float32
            )
    
    # Load the fine-tuned LoRA adapter
    try:
        model = PeftModel.from_pretrained(
            base_model,
            model_path,
            device_map="auto" if device.type == "cuda" else None
        )
    except Exception as e:
        print(f"Error loading LoRA adapter: {e}")
        print("Falling back to base model")
        model = base_model
    
    # Move model to CPU if needed
    if device.type == "cpu":
        model = model.to(device)
    
    model.eval()  # Set model to evaluation mode
    print(f"Fine-tuned model loaded successfully!")
    return model, tokenizer

def format_chat_history(messages):
    """Format the chat history into a prompt for the model"""
    formatted_prompt = ""
    
    for message in messages:
        role = message["role"]
        content = message["content"]
        
        if role == "user":
            formatted_prompt += f"Human: {content}\n\n"
        elif role == "assistant":
            formatted_prompt += f"Assistant: {content}\n\n"
    
    # Add the final assistant prompt
    formatted_prompt += "Assistant:"
    
    return formatted_prompt
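
# Example: given one user turn ("Hi"), one assistant turn ("Hello!"), and a new
# user turn ("How are you?"), format_chat_history produces the prompt:
#   "Human: Hi\n\nAssistant: Hello!\n\nHuman: How are you?\n\nAssistant:"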

def generate_response(model, tokenizer, messages):
    """Generate a streaming response from the model based on chat history"""
    # Format the conversation history
    prompt = format_chat_history(messages)
    
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Create a streamer for token-by-token generation
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    
    # Set generation parameters
    generation_kwargs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": 500,
        "temperature": 0.7,
        "top_p": 0.9,
        "do_sample": True,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
    }
    
    # Run generation in a background thread so tokens can be consumed from the
    # streamer as they arrive; daemon=True keeps an early break below from
    # blocking program exit while generation finishes
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs, daemon=True)
    thread.start()
    
    # Stream tokens as they're generated
    generated_text = ""
    for new_text in streamer:
        # Check if the model is trying to start a new turn
        if "Human:" in new_text or "\nHuman:" in generated_text + new_text:
            # Stop generation if model tries to create a new human turn
            break
        if "Assistant:" in new_text and generated_text:
            # Stop if model tries to create a new assistant turn
            break
        
        yield new_text
        generated_text += new_text
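
# Example usage sketch: a minimal interactive chat loop built on the helpers
# above. Assumes the fine-tuned LoRA adapter is available at the default
# "./final_model" path and that this script is run directly.
if __name__ == "__main__":
    model, tokenizer = load_model_and_tokenizer()
    messages = []

    print("Type a message and press Enter (empty input or Ctrl+C to quit).")
    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            break
        if not user_input:
            break

        messages.append({"role": "user", "content": user_input})

        # Stream the assistant's reply chunk by chunk
        print("Assistant: ", end="", flush=True)
        reply = ""
        for chunk in generate_response(model, tokenizer, messages):
            print(chunk, end="", flush=True)
            reply += chunk
        print()

        # Keep the full reply in the history for the next turn
        messages.append({"role": "assistant", "content": reply.strip()})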