import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from peft import PeftModel
import threading
import os
# Check if CUDA is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def load_model_and_tokenizer(model_path="./final_model"):
"""Load the fine-tuned phi-2 model and tokenizer"""
print(f"Loading fine-tuned model from {model_path}...")
print(f"Using device: {device}")
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
"microsoft/phi-2",
trust_remote_code=True
)
# Set pad_token to a different value than eos_token to fix attention mask issue
if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
tokenizer.pad_token = tokenizer.unk_token
# For CPU environments, we can't use 4-bit quantization
if device.type == "cuda":
# Use 4-bit quantization on GPU
from transformers import BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True
)
# Load base model with 4-bit quantization
base_model = AutoModelForCausalLM.from_pretrained(
"microsoft/phi-2",
quantization_config=bnb_config,
trust_remote_code=True,
device_map="auto"
)
else:
# On CPU, load in 8-bit or full precision
print("Loading on CPU - using 8-bit quantization or full precision")
try:
# Try 8-bit first (requires bitsandbytes)
base_model = AutoModelForCausalLM.from_pretrained(
"microsoft/phi-2",
load_in_8bit=True,
trust_remote_code=True,
device_map="auto"
)
except:
# Fall back to full precision if 8-bit fails
print("8-bit loading failed, falling back to full precision (fp32)")
base_model = AutoModelForCausalLM.from_pretrained(
"microsoft/phi-2",
trust_remote_code=True,
torch_dtype=torch.float32
)
# Load the fine-tuned LoRA adapter
try:
model = PeftModel.from_pretrained(
base_model,
model_path,
device_map="auto" if device.type == "cuda" else None
)
except Exception as e:
print(f"Error loading LoRA adapter: {e}")
print("Falling back to base model")
model = base_model
# Move model to CPU if needed
if device.type == "cpu":
model = model.to(device)
model.eval() # Set model to evaluation mode
print(f"Fine-tuned model loaded successfully!")
return model, tokenizer
def format_chat_history(messages):
"""Format the chat history into a prompt for the model"""
formatted_prompt = ""
for message in messages:
role = message["role"]
content = message["content"]
if role == "user":
formatted_prompt += f"Human: {content}\n\n"
elif role == "assistant":
formatted_prompt += f"Assistant: {content}\n\n"
# Add the final assistant prompt
formatted_prompt += "Assistant:"
return formatted_prompt
def generate_response(model, tokenizer, messages):
"""Generate a streaming response from the model based on chat history"""
# Format the conversation history
prompt = format_chat_history(messages)
# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt", padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}
# Create a streamer for token-by-token generation
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# Set generation parameters
generation_kwargs = {
"input_ids": inputs["input_ids"],
"attention_mask": inputs["attention_mask"],
"max_new_tokens": 500,
"temperature": 0.7,
"top_p": 0.9,
"do_sample": True,
"pad_token_id": tokenizer.pad_token_id,
"eos_token_id": tokenizer.eos_token_id,
"streamer": streamer,
}
# Start generation in a separate thread
thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
# Stream tokens as they're generated
generated_text = ""
for new_text in streamer:
# Check if the model is trying to start a new turn
if "Human:" in new_text or "\nHuman:" in generated_text + new_text:
# Stop generation if model tries to create a new human turn
break
if "Assistant:" in new_text and generated_text:
# Stop if model tries to create a new assistant turn
break
yield new_text
generated_text += new_text |
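
# ---------------------------------------------------------------------------
# Example usage (illustrative sketch only, not part of the deployed Space).
# It assumes the LoRA adapter lives at ./final_model and shows one way the
# streaming generator above could be consumed from a simple terminal loop;
# the `history` variable name and the exit commands are hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    model, tokenizer = load_model_and_tokenizer("./final_model")

    history = []
    while True:
        user_input = input("You: ")
        if user_input.strip().lower() in {"exit", "quit"}:
            break
        history.append({"role": "user", "content": user_input})

        # Stream the reply chunk-by-chunk and collect it for the next turn
        print("Assistant: ", end="", flush=True)
        reply = ""
        for chunk in generate_response(model, tokenizer, history):
            print(chunk, end="", flush=True)
            reply += chunk
        print()
        history.append({"role": "assistant", "content": reply.strip()})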