# inference.py
"""Load a trained GPT model + saved vocabulary and generate replies greedily."""
import json

import torch

from model import GPTModel, ScratchTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize tokenizer. The vocabulary must be rebuilt or loaded from disk;
# for Hugging Face Spaces, load a vocab saved during training (vocab.json).
tokenizer = ScratchTokenizer()
with open("vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)
tokenizer.word2idx = vocab["word2idx"]
# JSON object keys are always strings, so restore the integer keys here.
tokenizer.idx2word = {int(k): v for k, v in vocab["idx2word"].items()}
tokenizer.vocab_size = vocab["vocab_size"]

# Load model weights onto whatever device is available.
model = GPTModel(vocab_size=tokenizer.vocab_size)
model.load_state_dict(torch.load("gpt_model.pth", map_location=device))
model.to(device)
model.eval()


def generate_response(query, max_length=200, sos_id=1, eos_id=2):
    """Greedily decode a response to *query* with the loaded model.

    Args:
        query: Input text; encoded with the module-level tokenizer.
        max_length: Maximum number of tokens to generate.
        sos_id: Start-of-sequence token id used to seed decoding
            (default 1 — presumably matches training; verify against
            the tokenizer's special-token setup).
        eos_id: End-of-sequence token id that stops decoding (default 2).

    Returns:
        The decoded response string, including the seed/eos tokens as
        produced by ``tokenizer.decode``.
    """
    src = torch.tensor(tokenizer.encode(query)).unsqueeze(0).to(device)
    tgt = torch.tensor([[sos_id]]).to(device)  # seed with start-of-sequence token
    # Inference only: disable autograd so the loop doesn't build a graph
    # (saves memory and time at every decoding step).
    with torch.no_grad():
        for _ in range(max_length):
            output = model(src, tgt)
            # Greedy decoding: take the argmax logit at the last position.
            next_word = output.argmax(-1)[:, -1].unsqueeze(1)
            tgt = torch.cat([tgt, next_word], dim=1)
            if next_word.item() == eos_id:  # end-of-sequence reached
                break
    return tokenizer.decode(tgt.squeeze(0).tolist())