catninja123 committed
Commit c20de65 · 1 Parent(s): 7a08344

Add training app and requirements

Files changed (2)
  1. app.py +300 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,300 @@
+ """
+ DIPPER Humanizer - LoRA Fine-tuning Space
+ Trains a T5-large model to convert AI-style text back to human-style text.
+ Uses persistent storage at /data for model checkpoints.
+ """
+ import gradio as gr
+ import json, os, random, threading
+ import torch
+ from torch.utils.data import Dataset
+ from transformers import (
+     T5ForConditionalGeneration,
+     T5Tokenizer,
+     TrainingArguments,
+     Trainer,
+     TrainerCallback,
+     DataCollatorForSeq2Seq,
+ )
+ from peft import LoraConfig, get_peft_model, TaskType, PeftModel
+
+ # ============ Config ============
+ MODEL_NAME = "SamSJackson/paraphrase-dipper-no-ctx"
+ DATA_DIR = "/data" if os.path.exists("/data") else "."
+ OUTPUT_DIR = os.path.join(DATA_DIR, "dipper-humanizer-lora")
+ DATA_FILE = os.path.join(DATA_DIR, "training_pairs.jsonl")
+ FINAL_MODEL_DIR = os.path.join(OUTPUT_DIR, "final")
+
+ LORA_R = 16
+ LORA_ALPHA = 32
+ LORA_DROPOUT = 0.05
+ MAX_INPUT_LEN = 512
+ MAX_OUTPUT_LEN = 512
+ SEED = 42
+
+ # Shared state: mutated by the background training thread, polled by the UI.
+ training_status = {"running": False, "log": [], "progress": "Idle"}
+
+ # ============ Dataset ============
+ class ParaphraseDataset(Dataset):
+     def __init__(self, data, tokenizer, max_input_len=512, max_output_len=512):
+         self.data = data
+         self.tokenizer = tokenizer
+         self.max_input_len = max_input_len
+         self.max_output_len = max_output_len
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         item = self.data[idx]
+         # Fixed DIPPER control codes for training (code = 100 - diversity),
+         # matching the inference defaults of lexical 40 / order 20.
+         input_text = f"lexical = 60, order = 80 <sent> {item['ai_text']} </sent>"
+         target_text = item['human_text']
+
+         input_enc = self.tokenizer(
+             input_text, max_length=self.max_input_len,
+             padding="max_length", truncation=True, return_tensors="pt",
+         )
+         target_enc = self.tokenizer(
+             target_text, max_length=self.max_output_len,
+             padding="max_length", truncation=True, return_tensors="pt",
+         )
+
+         # Replace pad token ids in the labels with -100 so they are ignored
+         # by the cross-entropy loss.
+         labels = target_enc["input_ids"].squeeze()
+         labels[labels == self.tokenizer.pad_token_id] = -100
+
+         return {
+             "input_ids": input_enc["input_ids"].squeeze(),
+             "attention_mask": input_enc["attention_mask"].squeeze(),
+             "labels": labels,
+         }
+
+ def load_data(path):
+     data = []
+     with open(path) as f:
+         for line in f:
+             d = json.loads(line)
+             # Keep only pairs where both sides have at least 30 words and
+             # the lengths differ by no more than a factor of two.
+             if d.get('human_words', 0) < 30 or d.get('ai_words', 0) < 30:
+                 continue
+             if d.get('ai_words', 0) < d.get('human_words', 0) * 0.5:
+                 continue
+             if d.get('ai_words', 0) > d.get('human_words', 0) * 2:
+                 continue
+             data.append(d)
+     random.seed(SEED)
+     random.shuffle(data)
+     split = int(len(data) * 0.95)  # 95/5 train/validation split
+     return data[:split], data[split:]
+
+ class LogCallback(TrainerCallback):
+     # Mirrors Trainer log entries into the shared training_status dict.
+     def __init__(self):
+         self.logs = []
+
+     def on_log(self, args, state, control, logs=None, **kwargs):
+         if logs:
+             self.logs.append(str(logs))
+             training_status["log"].append(str(logs))
+
+ def run_training(epochs, batch_size, lr, grad_accum):
+     global training_status
+     training_status = {"running": True, "log": [], "progress": "Loading data..."}
+
+     try:
+         train_data, val_data = load_data(DATA_FILE)
+         training_status["progress"] = f"Data loaded: {len(train_data)} train, {len(val_data)} val"
+         training_status["log"].append(training_status["progress"])
+
+         tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
+         training_status["progress"] = "Loading model..."
+         training_status["log"].append("Loading model...")
+
+         model = T5ForConditionalGeneration.from_pretrained(
+             MODEL_NAME, torch_dtype=torch.float16,
+         )
+
+         lora_config = LoraConfig(
+             task_type=TaskType.SEQ_2_SEQ_LM,
+             r=LORA_R, lora_alpha=LORA_ALPHA, lora_dropout=LORA_DROPOUT,
+             # Attention and feed-forward projection matrices in the T5 blocks.
+             target_modules=["q", "v", "k", "o", "wi", "wo"],
+             bias="none",
+         )
+         model = get_peft_model(model, lora_config)
+
+         # print_trainable_parameters() writes to stdout; capture it for the log.
+         import io, contextlib
+         buf = io.StringIO()
+         with contextlib.redirect_stdout(buf):
+             model.print_trainable_parameters()
+         training_status["log"].append(buf.getvalue())
+
+         train_dataset = ParaphraseDataset(train_data, tokenizer, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
+         val_dataset = ParaphraseDataset(val_data, tokenizer, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
+
+         training_args = TrainingArguments(
+             output_dir=OUTPUT_DIR,
+             num_train_epochs=epochs,
+             per_device_train_batch_size=batch_size,
+             per_device_eval_batch_size=batch_size,
+             gradient_accumulation_steps=grad_accum,
+             learning_rate=lr,
+             warmup_ratio=0.1,
+             weight_decay=0.01,
+             fp16=True,
+             logging_steps=25,
+             eval_strategy="steps",
+             eval_steps=250,
+             save_strategy="steps",
+             save_steps=250,
+             save_total_limit=3,
+             load_best_model_at_end=True,
+             metric_for_best_model="eval_loss",
+             report_to="none",
+             seed=SEED,
+             dataloader_num_workers=2,
+         )
+
+         data_collator = DataCollatorForSeq2Seq(
+             tokenizer=tokenizer, model=model, padding=True,
+         )
+
+         training_status["progress"] = "Training started..."
+         training_status["log"].append("Training started!")
+
+         trainer = Trainer(
+             model=model, args=training_args,
+             train_dataset=train_dataset, eval_dataset=val_dataset,
+             data_collator=data_collator,
+             callbacks=[LogCallback()],
+         )
+
+         trainer.train()
+
+         training_status["progress"] = "Saving model..."
+         training_status["log"].append("Saving final model...")
+         os.makedirs(FINAL_MODEL_DIR, exist_ok=True)
+         model.save_pretrained(FINAL_MODEL_DIR)
+         tokenizer.save_pretrained(FINAL_MODEL_DIR)
+
+         training_status["progress"] = "DONE! Model saved."
+         training_status["log"].append("Training complete! Model saved to " + FINAL_MODEL_DIR)
+
+     except Exception as e:
+         training_status["progress"] = f"ERROR: {str(e)}"
+         training_status["log"].append(f"ERROR: {str(e)}")
+         import traceback
+         training_status["log"].append(traceback.format_exc())
+     finally:
+         training_status["running"] = False
+
+ # ============ Inference ============
+ loaded_model = None
+ loaded_tokenizer = None
+
+ def load_finetuned_model():
+     global loaded_model, loaded_tokenizer
+     if loaded_model is not None:
+         return True
+
+     if not os.path.exists(FINAL_MODEL_DIR):
+         return False
+
+     loaded_tokenizer = T5Tokenizer.from_pretrained(FINAL_MODEL_DIR)
+     # Load the frozen base model, then attach the saved LoRA adapter on top.
+     base_model = T5ForConditionalGeneration.from_pretrained(
+         MODEL_NAME, torch_dtype=torch.float16,
+     )
+     loaded_model = PeftModel.from_pretrained(base_model, FINAL_MODEL_DIR)
+     loaded_model.eval()
+     if torch.cuda.is_available():
+         loaded_model = loaded_model.cuda()
+     return True
+
+ def humanize_text(text, lex_diversity=40, order_diversity=20):
+     if not load_finetuned_model():
+         return "Model not trained yet. Please train first."
+
+     from nltk.tokenize import sent_tokenize
+     import nltk
+     try:
+         nltk.data.find('tokenizers/punkt_tab')
+     except LookupError:
+         nltk.download('punkt_tab', quiet=True)
+
+     # DIPPER convention: control code = 100 - diversity, so higher slider
+     # values mean more aggressive rewriting.
+     lex_code = int(100 - lex_diversity)
+     order_code = int(100 - order_diversity)
+
+     text = " ".join(text.split())
+     sentences = sent_tokenize(text)
+     output_text = ""
+
+     # Paraphrase the text in windows of three sentences at a time.
+     for sent_idx in range(0, len(sentences), 3):
+         curr_sent_window = " ".join(sentences[sent_idx:sent_idx + 3])
+         final_input_text = f"lexical = {lex_code}, order = {order_code} <sent> {curr_sent_window} </sent>"
+
+         final_input = loaded_tokenizer([final_input_text], return_tensors="pt")
+         if torch.cuda.is_available():
+             final_input = {k: v.cuda() for k, v in final_input.items()}
+
+         with torch.inference_mode():
+             outputs = loaded_model.generate(
+                 **final_input,
+                 do_sample=True, top_p=0.75, top_k=None, max_length=512,
+             )
+         decoded = loaded_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+         output_text += " " + decoded[0]
+
+     return output_text.strip()
+
+ # ============ Gradio UI ============
+ def start_training(epochs, batch_size, lr, grad_accum):
+     if training_status["running"]:
+         return "Training already in progress!"
+
+     thread = threading.Thread(
+         target=run_training,
+         args=(int(epochs), int(batch_size), float(lr), int(grad_accum)),
+     )
+     thread.start()
+     return "Training started! Check status below."
+
+ def get_status():
+     logs = "\n".join(training_status["log"][-20:])
+     return f"Status: {training_status['progress']}\n\n{logs}"
+
+ def check_data():
+     if not os.path.exists(DATA_FILE):
+         return f"Data file not found at {DATA_FILE}. Please upload training_pairs.jsonl to /data/"
+
+     count = 0
+     with open(DATA_FILE) as f:
+         for line in f:
+             count += 1
+     return f"Found {count} training pairs in {DATA_FILE}"
+
+ with gr.Blocks(title="DIPPER Humanizer Training") as demo:
+     gr.Markdown("# DIPPER Humanizer - LoRA Fine-tuning")
+     gr.Markdown("Train DIPPER to convert AI-style text back to human-style text")
+
+     with gr.Tab("Training"):
+         data_info = gr.Textbox(label="Data Status", value=check_data())
+
+         with gr.Row():
+             epochs = gr.Number(value=3, label="Epochs")
+             batch_size = gr.Number(value=4, label="Batch Size")
+             lr = gr.Number(value=3e-4, label="Learning Rate")
+             grad_accum = gr.Number(value=4, label="Gradient Accumulation")
+
+         train_btn = gr.Button("Start Training", variant="primary")
+         train_output = gr.Textbox(label="Training Output")
+         train_btn.click(start_training, [epochs, batch_size, lr, grad_accum], train_output)
+
+         status_btn = gr.Button("Refresh Status")
+         status_output = gr.Textbox(label="Training Status", lines=15)
+         status_btn.click(get_status, outputs=status_output)
+
+     with gr.Tab("Inference"):
+         gr.Markdown("## Humanize AI Text")
+         input_text = gr.Textbox(label="AI Text Input", lines=10, placeholder="Paste AI-generated text here...")
+
+         with gr.Row():
+             lex_div = gr.Slider(0, 100, value=40, step=20, label="Lexical Diversity")
+             ord_div = gr.Slider(0, 100, value=20, step=20, label="Order Diversity")
+
+         humanize_btn = gr.Button("Humanize", variant="primary")
+         output_text = gr.Textbox(label="Humanized Output", lines=10)
+         humanize_btn.click(humanize_text, [input_text, lex_div, ord_div], output_text)
+
+ demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch
+ transformers
+ peft
+ accelerate
+ sentencepiece
+ protobuf
+ nltk
+ gradio
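
For reference, a minimal sketch of reusing the trained adapter outside this Space, mirroring load_finetuned_model() in app.py (the path assumes the Space's /data layout above):

    from transformers import T5ForConditionalGeneration, T5Tokenizer
    from peft import PeftModel

    ADAPTER_DIR = "/data/dipper-humanizer-lora/final"  # FINAL_MODEL_DIR in app.py

    # Load the frozen DIPPER base model, then attach the saved LoRA adapter.
    base = T5ForConditionalGeneration.from_pretrained("SamSJackson/paraphrase-dipper-no-ctx")
    model = PeftModel.from_pretrained(base, ADAPTER_DIR)
    tokenizer = T5Tokenizer.from_pretrained(ADAPTER_DIR)
    model.eval()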