dangermouse77 committed on
Commit
33558f8
·
verified ·
1 Parent(s): 898c807

Upload 3 files

Browse files
Files changed (3) hide show
  1. finetune.py +56 -0
  2. split_train_eval.py +15 -0
  3. test_aqmodel.py +24 -0
finetune.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/home/dm/miniconda3/bin/python3
"""Fine-tune a T5 model to generate questions from answers.

Reads train.json / eval.json (JSON lists of records with "question" and
"answer" fields), fine-tunes t5-small with the HF Trainer, and saves the
resulting model + tokenizer to a local directory.
"""
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset

# Load model and tokenizer
model_name = "t5-small"  # or another transformer-based model
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Load dataset
dataset = load_dataset("json", data_files={"train": "train.json"})
evalset = load_dataset("json", data_files={"eval": "eval.json"})

def preprocess_function(examples):
    """Tokenize a batch of answer/question pairs for seq2seq training.

    Non-string answers/questions are tolerated ("Unknown" / "") so a few
    malformed records do not abort the whole run.
    """
    inputs = ["Generate a question for: " + (ans if isinstance(ans, str) else "Unknown") for ans in examples["answer"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    targets = [q if isinstance(q, str) else "" for q in examples["question"]]
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    # FIX: with padding="max_length" the label sequences are mostly pad
    # tokens; passing those ids straight into `labels` trains the model to
    # emit padding. Replace pad ids with -100 so the cross-entropy loss
    # ignores padded positions (HF convention for seq2seq labels).
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_evalsets = evalset.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # NOTE: renamed to `eval_strategy` in transformers >= 4.46
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_evalsets["eval"],
)

# Train model
trainer.train()

# Save trained model
output_dir = "/home/dm/chat/AQ/aq_model"  # Change the folder name if needed
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")
split_train_eval.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/home/dm/miniconda3/bin/python3
"""Split inverted.json into train.json (80%) and eval.json (20%)."""
import json
from sklearn.model_selection import train_test_split


def _write_json(path, payload):
    # Small helper so both output files are serialized identically.
    with open(path, 'w') as out:
        json.dump(payload, out, indent=4)


with open('inverted.json', 'r') as src:
    records = json.load(src)

# Fixed seed keeps the 80/20 split reproducible across runs.
train_records, eval_records = train_test_split(records, test_size=0.2, random_state=42)

_write_json('train.json', train_records)
_write_json('eval.json', eval_records)
test_aqmodel.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/home/dm/miniconda3/bin/python3
"""Load the fine-tuned T5 model and print a generated question for the
answer given on the command line."""
import sys
from transformers import T5Tokenizer, T5ForConditionalGeneration

if len(sys.argv) < 2:
    # FIX: the usage line previously named an unrelated script
    # ("conversation.py"); report the actual invocation instead.
    print(f"Usage: python3 {sys.argv[0]} '<your answer here>'")
    sys.exit(1)

# Define model path
model_path = "/home/dm/chat/AQ/aq_model_b8"  # Make sure this points to your saved directory

# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

print("Model loaded successfully!")

def generate_question(answer):
    """Return a model-generated question for *answer* (at most 50 tokens).

    Uses the same "Generate a question for: " prompt prefix the model was
    fine-tuned with in finetune.py.
    """
    input_text = "Generate a question for: " + answer
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    output_ids = model.generate(input_ids, max_length=50)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(generate_question(sys.argv[1]))