# inference.py
"""Minimal chat wrapper around a Hugging Face causal language model.

Loads an instruct-tuned checkpoint (TinyLlama by default, despite the
class name) and exposes blocking and streaming generation helpers.
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer


class MistralChat:
    """Thin convenience wrapper: tokenizer + model + [INST] prompt formatting."""

    def __init__(self, model_path="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
        """Load tokenizer and model onto the best available device.

        Args:
            model_path: Hub repo id or local path of the checkpoint.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print("Loading model...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # NOTE(review): trust_remote_code=True executes arbitrary code shipped
        # with the checkpoint — only use with repos you trust.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            # fp16 on GPU for speed/memory; fp32 on CPU (fp16 CPU ops are slow
            # or unsupported).
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            # device_map="auto" lets accelerate place the model on the GPU(s).
            device_map="auto" if self.device == "cuda" else None,
            trust_remote_code=True
        )
        # BUG FIX: the original called `self.model.to(self.device)` on the CUDA
        # path. With device_map="auto" the model is already dispatched by
        # accelerate, and calling .to() on a dispatched model is redundant and
        # raises a RuntimeError in current transformers/accelerate versions.
        # On CPU the model already loads to CPU, so no move is needed either.
        print("Model loaded successfully!")

    def generate(self, prompt, max_length=500, temperature=0.7):
        """Generate a complete response for `prompt`.

        Args:
            prompt: User message (plain text, no template markers).
            max_length: Maximum number of NEW tokens to generate (passed to
                `max_new_tokens`; name kept for backward compatibility).
            temperature: Sampling temperature.

        Returns:
            The assistant's reply with the prompt/template stripped.
        """
        # Format for instruct models ([INST] ... [/INST] wrapper).
        formatted_prompt = f"[INST] {prompt} [/INST]"
        inputs = self.tokenizer(formatted_prompt, return_tensors="pt")
        if self.device == "cuda":
            inputs = inputs.to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_length,
                temperature=temperature,
                do_sample=True,
                top_p=0.95,
                # Explicit pad token silences the "no pad_token_id" warning.
                pad_token_id=self.tokenizer.eos_token_id
            )

        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # The decoded text contains the prompt; keep only the assistant's part.
        if "[/INST]" in response:
            response = response.split("[/INST]")[1].strip()
        return response

    def chat_stream(self, prompt):
        """Stream the response token by token to stdout (no return value)."""
        formatted_prompt = f"[INST] {prompt} [/INST]"
        inputs = self.tokenizer(formatted_prompt, return_tensors="pt")
        # skip_prompt avoids echoing the input back during streaming.
        streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
        if self.device == "cuda":
            inputs = inputs.to(self.device)
        _ = self.model.generate(**inputs, streamer=streamer, max_new_tokens=500)


# Usage
if __name__ == "__main__":
    chat = MistralChat()

    # Single response
    response = chat.generate("Explain quantum computing in simple terms")
    print("Response:", response)

    # Streaming response
    print("\nStreaming response:")
    chat.chat_stream("Write a short poem about AI")