import json
import os

import torch
from safetensors.torch import load_file
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


class AgGPT:
    def __init__(self, model_path="aggpt13/"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Paths to the int8 weights and their per-tensor quantization parameters.
        quant_path = os.path.join(model_path, "model-int8.safetensors")
        quant_params_path = os.path.join(model_path, "model-int8-quant_params.json")

        quant_tensors = load_file(quant_path)
        with open(quant_params_path, "r") as f:
            quant_params = json.load(f)

        # Dequantize: recover float32 weights via (q - zero_point) * scale.
        # Tensors without recorded quant params pass through unchanged.
        state_dict = {}
        for k, q_tensor in quant_tensors.items():
            if k in quant_params:
                scale = quant_params[k]["scale"]
                zero_point = quant_params[k]["zero_point"]
                state_dict[k] = (q_tensor.to(torch.float32) - zero_point) * scale
            else:
                state_dict[k] = q_tensor

        # Build the model architecture from config only (no weights loaded yet).
        config = AutoConfig.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_config(config)

        # Load the dequantized state dict manually; strict=False reports key
        # mismatches instead of raising, so surface them for inspection.
        missing, unexpected = self.model.load_state_dict(state_dict, strict=False)
        print(f"Missing keys: {missing}")
        print(f"Unexpected keys: {unexpected}")

        self.model.to("cuda" if torch.cuda.is_available() else "cpu")
        self.model.eval()

    def ask(self, prompt):
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(**inputs, max_new_tokens=50)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


if __name__ == "__main__":
    agent = AgGPT()
    response = agent.ask("hey, who are you?")
    print(response)
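
# ---------------------------------------------------------------------------
# Producer-side reference (a hedged sketch, not the actual export script).
# The dequantization in AgGPT.__init__ implies the inverse mapping
# q = round(x / scale) + zero_point, clamped to the int8 range, with
# per-tensor scale/zero_point stored in "model-int8-quant_params.json".
# The helper below shows how the two files the loader expects could be
# generated under that assumption; quantize_and_save() and its min/max
# calibration are hypothetical names and choices, not part of the original.
def quantize_and_save(model, out_dir="aggpt13/"):
    from safetensors.torch import save_file

    q_tensors, q_params = {}, {}
    for name, w in model.state_dict().items():
        w = w.detach().cpu()
        if w.dtype.is_floating_point:
            w32 = w.to(torch.float32)
            w_min, w_max = w32.min().item(), w32.max().item()
            # Asymmetric per-tensor quantization: map [w_min, w_max]
            # onto the int8 range [-128, 127].
            scale = max((w_max - w_min) / 255.0, 1e-12)
            zero_point = round(-128 - w_min / scale)
            q = torch.clamp(torch.round(w32 / scale) + zero_point, -128, 127)
            q_tensors[name] = q.to(torch.int8).contiguous()
            q_params[name] = {"scale": scale, "zero_point": zero_point}
        else:
            # Non-float tensors pass through unchanged, matching the
            # else-branch in AgGPT.__init__. clone() avoids shared-storage
            # errors when safetensors serializes tied weights.
            q_tensors[name] = w.clone().contiguous()
    save_file(q_tensors, os.path.join(out_dir, "model-int8.safetensors"))
    with open(os.path.join(out_dir, "model-int8-quant_params.json"), "w") as f:
        json.dump(q_params, f, indent=2)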