"""Interactive streaming chat for microsoft/phi-2 with in-session memory.

The entire conversation is re-encoded into the prompt on every turn, so the
model "remembers" everything said in this session.  Replies are streamed
token-by-token to stdout via TextIteratorStreamer while generation runs on
a background thread.
"""

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
import threading

model_name = "microsoft/phi-2"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # fp16 halves GPU memory; CPU kernels need fp32.
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    low_cpu_mem_usage=True,
).to(device)

system_prompt = (
    "You are ProTalk, a professional AI assistant. "
    "You remember everything the user said in this session and respond politely, "
    "clearly, and intelligently. Keep a coherent conversation history."
)

# One entry per turn ("User: ..." / "ProTalk: ..."); joined into each prompt.
chat_history = []


def chat_loop():
    """Run the REPL: read user input, stream the model's reply, update history.

    Loops until the user types 'exit' (case-insensitive).  Each turn rebuilds
    the full prompt from ``system_prompt`` + ``chat_history`` and appends both
    the user's message and the model's reply to ``chat_history``.
    """
    print("ProTalk Memory Chat Online — type 'exit' to quit.\n")
    while True:
        # Fix: strip whitespace so "exit " / " exit" still quits.
        user_input = input("User: ").strip()
        if user_input.lower() == "exit":
            break

        chat_history.append(f"User: {user_input}")
        prompt = system_prompt + "\n" + "\n".join(chat_history) + "\nProTalk:"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        streamer = TextIteratorStreamer(
            tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
        )
        # generate() blocks, so it runs on a worker thread while the main
        # thread consumes tokens from the streamer as they are produced.
        thread = threading.Thread(
            target=model.generate,
            kwargs={
                "input_ids": inputs["input_ids"],
                # Fix: pass the attention mask explicitly; omitting it makes
                # generate() infer one and emit a warning every call.
                "attention_mask": inputs["attention_mask"],
                # Fix: phi-2's tokenizer defines no pad token, which also
                # triggers a per-call warning — fall back to EOS.
                "pad_token_id": tokenizer.eos_token_id,
                "max_new_tokens": 300,
                "do_sample": True,
                "temperature": 0.7,
                "top_p": 0.9,
                "repetition_penalty": 1.2,
                "streamer": streamer,
            },
        )
        thread.start()

        output_text = ""
        for token in streamer:
            print(token, end="", flush=True)
            output_text += token
        thread.join()
        print()

        # Fix: the model often keeps generating and fabricates the next
        # "User:" turn; storing that verbatim would corrupt the session
        # memory.  Keep only the assistant's own reply in the history.
        reply = output_text.split("\nUser:", 1)[0].strip()
        chat_history.append(f"ProTalk: {reply}")


if __name__ == "__main__":
    chat_loop()