"""Train a MiniGPT chatbot from scratch on the filtered chat corpus.

Builds a tokenizer vocabulary from the JSONL corpus, wraps it in a
ChatDataset, initialises a fresh MiniGPT, and runs training.
"""
import json

import torch  # used when resuming from a checkpoint (see commented line below)

from dataset import MiniBPETokenizr, ChatDataset, train, SimpleTokenizr
from model import MiniGPT

# Path constants — hoisted so each path is spelled exactly once.
DATA_PATH = "./customchatbot-v1/data/filtered_data.jsonl"
CHECKPOINT_PATH = "./customchatbot-v1/trained-mini-gpt/checkpoint-mini-gpt.pth"
MERGED_DATA_PATH = "./customchatbot-v1/data/merged_data.jsonl"

# Load the corpus: one JSON object per non-blank line, each with a "text" field.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    texts = [json.loads(line)["text"] for line in f if line.strip()]

# Fit the tokenizer vocabulary on the raw texts before building the dataset,
# since ChatDataset encodes with this tokenizer.
tokenizer = SimpleTokenizr()
tokenizer.train(texts)

dataset = ChatDataset(DATA_PATH, tokenizer)

# Fresh model sized to the learned vocabulary; uncomment the load_state_dict
# line to resume training from the saved checkpoint instead.
model = MiniGPT(vocab_size=len(tokenizer))
model.reset_params()
# model.load_state_dict(torch.load(CHECKPOINT_PATH))

# Train. NOTE(review): the keyword "filepathh" (double h) is reproduced as-is
# to match dataset.train's parameter name — confirm against its signature
# before renaming.
train(model, dataset, tokenizer, epochs=3, filepathh=MERGED_DATA_PATH)