import json
import os

import torch
from safetensors.torch import load_file
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


class AgGPT:
    def __init__(self, model_path="aggpt13/"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Paths to the int8 weights and their per-tensor quantization parameters.
        quant_path = os.path.join(model_path, "model-int8.safetensors")
        quant_params_path = os.path.join(model_path, "model-int8-quant_params.json")

        quant_tensors = load_file(quant_path)
        with open(quant_params_path, "r") as f:
            quant_params = json.load(f)

        # Dequantize: recover float32 weights via (q - zero_point) * scale.
        # Tensors without recorded quant params pass through unchanged.
        state_dict = {}
        for k, q_tensor in quant_tensors.items():
            if k in quant_params:
                scale = quant_params[k]["scale"]
                zero_point = quant_params[k]["zero_point"]
                state_dict[k] = (q_tensor.to(torch.float32) - zero_point) * scale
            else:
                state_dict[k] = q_tensor

        # Build the model architecture from config only (no weights loaded yet).
        config = AutoConfig.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_config(config)

        # Load the dequantized state dict manually; strict=False reports key
        # mismatches instead of raising, so surface them for inspection.
        missing, unexpected = self.model.load_state_dict(state_dict, strict=False)
        print(f"Missing keys: {missing}")
        print(f"Unexpected keys: {unexpected}")

        self.model.to("cuda" if torch.cuda.is_available() else "cpu")
        self.model.eval()

    def ask(self, prompt):
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(**inputs, max_new_tokens=50)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


if __name__ == "__main__":
    agent = AgGPT()
    response = agent.ask("hey, who are you?")
    print(response)
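
# ---------------------------------------------------------------------------
# Producer-side reference (a hedged sketch, not the actual export script).
# The dequantization in AgGPT.__init__ implies the inverse mapping
# q = round(x / scale) + zero_point, clamped to the int8 range, with
# per-tensor scale/zero_point stored in "model-int8-quant_params.json".
# The helper below shows how the two files the loader expects could be
# generated under that assumption; quantize_and_save() and its min/max
# calibration are hypothetical names and choices, not part of the original.
def quantize_and_save(model, out_dir="aggpt13/"):
    from safetensors.torch import save_file

    q_tensors, q_params = {}, {}
    for name, w in model.state_dict().items():
        w = w.detach().cpu()
        if w.dtype.is_floating_point:
            w32 = w.to(torch.float32)
            w_min, w_max = w32.min().item(), w32.max().item()
            # Asymmetric per-tensor quantization: map [w_min, w_max]
            # onto the int8 range [-128, 127].
            scale = max((w_max - w_min) / 255.0, 1e-12)
            zero_point = round(-128 - w_min / scale)
            q = torch.clamp(torch.round(w32 / scale) + zero_point, -128, 127)
            q_tensors[name] = q.to(torch.int8).contiguous()
            q_params[name] = {"scale": scale, "zero_point": zero_point}
        else:
            # Non-float tensors pass through unchanged, matching the
            # else-branch in AgGPT.__init__. clone() avoids shared-storage
            # errors when safetensors serializes tied weights.
            q_tensors[name] = w.clone().contiguous()
    save_file(q_tensors, os.path.join(out_dir, "model-int8.safetensors"))
    with open(os.path.join(out_dir, "model-int8-quant_params.json"), "w") as f:
        json.dump(q_params, f, indent=2)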