FlameF0X committed on
Commit
32c80ed
·
verified ·
1 Parent(s): 35f317c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -168
app.py CHANGED
@@ -1,5 +1,10 @@
1
  import gradio as gr
2
  import torch
 
 
 
 
 
3
  from transformers import (
4
  GPT2Config,
5
  GPT2LMHeadModel,
@@ -10,32 +15,7 @@ from transformers import (
10
  TrainerCallback
11
  )
12
  from datasets import load_dataset
13
- from huggingface_hub import whoami
14
- import os
15
- import threading
16
- import queue
17
- import time
18
- import json
19
-
20
- # --- Custom Code Templates ---
21
-
22
- CONFIGURATION_CODE = """
23
- from transformers import GPT2Config
24
-
25
- class CustomTinyConfig(GPT2Config):
26
- model_type = "custom_tiny"
27
- """
28
-
29
- MODELING_CODE = """
30
- from transformers import GPT2LMHeadModel
31
- from .configuration_custom import CustomTinyConfig
32
-
33
- class CustomTinyModel(GPT2LMHeadModel):
34
- config_class = CustomTinyConfig
35
-
36
- def __init__(self, config):
37
- super().__init__(config)
38
- """
39
 
40
  # --- Helper Classes ---
41
 
@@ -61,46 +41,50 @@ def get_user_info(token):
61
  return None
62
 
63
  def train_thread_target(
 
64
  dataset_id,
65
  model_name,
66
  num_layers,
67
  n_embd,
 
 
68
  epochs,
69
  lr,
 
 
 
 
70
  sample_limit,
71
- token,
72
  log_queue,
73
  result_queue
74
  ):
75
  """
76
- Function to be run in a separate thread.
77
- Handles the heavy lifting of training and pushing.
78
  """
79
  try:
80
  username = get_user_info(token)
81
  if not username:
82
- raise ValueError("Could not authenticate user.")
83
 
84
  full_repo_id = f"{username}/{model_name}"
85
- log_queue.put(f"🚀 Starting process for {full_repo_id}...\n")
86
 
87
  # 1. Load Dataset
88
- log_queue.put(f"📚 Loading dataset: {dataset_id}...\n")
89
  try:
90
  dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
91
  except Exception as e:
92
  raise ValueError(f"Error loading dataset: {e}")
93
 
94
- # Find text column
95
  text_column = "text"
96
  if "text" not in dataset.column_names:
97
- for col, dtype in zip(dataset.column_names, dataset.features.values()):
98
- if hasattr(dtype, 'dtype') and dtype.dtype == 'string':
99
  text_column = col
100
  break
101
 
102
- if text_column not in dataset.column_names:
103
- raise ValueError("Could not find a text column in this dataset.")
104
 
105
  # 2. Tokenize
106
  log_queue.put("✂️ Tokenizing data...\n")
@@ -108,25 +92,25 @@ def train_thread_target(
108
  tokenizer.pad_token = tokenizer.eos_token
109
 
110
  def tokenize_function(examples):
111
- return tokenizer(examples[text_column], padding="max_length", truncation=True, max_length=128)
 
 
 
 
 
112
 
113
- tokenized_datasets = dataset.map(tokenize_function, batched=True)
114
 
115
  # 3. Initialize Model
116
- log_queue.put("🏗️ Initializing Custom Nano Model...\n")
117
-
118
- # We use GPT2Config but will modify it before push to look like "CustomTinyConfig"
119
  config = GPT2Config(
120
  vocab_size=len(tokenizer),
121
- n_positions=128,
122
- n_ctx=128,
123
  n_embd=int(n_embd),
124
  n_layer=int(num_layers),
125
- n_head=4,
126
  )
127
-
128
- # We train using standard GPT2 implementation for stability,
129
- # but will wrap it in custom code files on upload.
130
  model = GPT2LMHeadModel(config)
131
 
132
  # 4. Train
@@ -136,14 +120,17 @@ def train_thread_target(
136
  output_dir="./results",
137
  overwrite_output_dir=True,
138
  num_train_epochs=epochs,
139
- per_device_train_batch_size=8,
140
- save_steps=1000, # Don't save intermediate checkpoints to save time/space
141
- save_total_limit=1,
142
- prediction_loss_only=True,
143
  learning_rate=lr,
144
- logging_steps=5, # Log frequently for the UI
 
 
 
 
145
  report_to="none",
146
  use_cpu=not torch.cuda.is_available(),
 
147
  )
148
 
149
  trainer = Trainer(
@@ -156,166 +143,127 @@ def train_thread_target(
156
 
157
  trainer.train()
158
 
159
- # 5. Prepare Custom Code Files
160
- log_queue.put("📝 Generating Custom Code files (modeling_custom.py)...\n")
161
-
162
- # Write the python files locally
163
- with open("configuration_custom.py", "w") as f:
164
- f.write(CONFIGURATION_CODE)
165
-
166
- with open("modeling_custom.py", "w") as f:
167
- f.write(MODELING_CODE)
168
 
169
- # Update config to point to custom code
170
- # This makes it a "Custom Code" model on the Hub
171
- model.config.auto_map = {
172
- "AutoConfig": "configuration_custom.CustomTinyConfig",
173
- "AutoModelForCausalLM": "modeling_custom.CustomTinyModel"
174
- }
175
- # We also need to change the architecture name in config so it matches the class name
176
- model.config.architectures = ["CustomTinyModel"]
177
-
178
- # 6. Push to Hub
179
- log_queue.put(f"☁️ Pushing to {full_repo_id} (this includes custom python files)...\n")
180
-
181
- # Push model weights and config
182
- model.push_to_hub(full_repo_id, token=token, private=True)
183
- tokenizer.push_to_hub(full_repo_id, token=token, private=True)
184
-
185
- # Upload the custom python files explicitly
186
- api = gr.HuggingFaceHub(token=token) # wrapper or use HfApi
187
- from huggingface_hub import HfApi
188
- hf_api = HfApi(token=token)
189
-
190
- hf_api.upload_file(
191
- path_or_fileobj="configuration_custom.py",
192
- path_in_repo="configuration_custom.py",
193
- repo_id=full_repo_id,
194
- )
195
- hf_api.upload_file(
196
- path_or_fileobj="modeling_custom.py",
197
- path_in_repo="modeling_custom.py",
198
- repo_id=full_repo_id,
199
- )
200
-
201
- result_queue.put(f"🎉 Done! Model available at: https://huggingface.co/{full_repo_id}")
202
 
203
  except Exception as e:
204
  log_queue.put(f"❌ Error: {str(e)}\n")
205
- result_queue.put(None) # Signal failure
206
 
207
  # --- Main Generator Function ---
208
 
209
  def train_and_push_generator(
210
- dataset_id,
211
- model_name,
212
- num_layers,
213
- n_embd,
214
- epochs,
215
- lr,
216
- sample_limit,
217
- oauth_token: gr.OAuthToken
218
  ):
219
- if oauth_token is None or oauth_token.token is None:
220
- yield "You must be logged in to train a model!", ""
221
  return
222
 
223
- token = oauth_token.token
224
-
225
- # queues for communication between threads
226
  log_queue = queue.Queue()
227
  result_queue = queue.Queue()
228
 
229
- # Start training in background thread
230
  t = threading.Thread(target=train_thread_target, args=(
231
- dataset_id, model_name, num_layers, n_embd, epochs, lr, sample_limit, token, log_queue, result_queue
 
 
 
 
232
  ))
233
  t.start()
234
 
235
- # Main loop: yield logs as they come in
236
  logs_history = ""
237
-
238
  while t.is_alive():
239
- # Drain queue
240
  while not log_queue.empty():
241
- new_log = log_queue.get()
242
- logs_history += new_log
243
- yield logs_history, "Training..."
244
  time.sleep(0.5)
245
 
246
- # Drain remaining logs after thread finishes
247
  while not log_queue.empty():
248
- new_log = log_queue.get()
249
- logs_history += new_log
250
 
251
- # Get final result
252
  if not result_queue.empty():
253
  result = result_queue.get()
254
- if result:
255
- yield logs_history, result
256
- else:
257
- yield logs_history, "Failed. Check logs."
258
  else:
259
  yield logs_history, "Process finished unexpectedly."
260
 
261
  # --- UI Layout ---
262
 
263
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
264
- gr.Markdown(
265
- """
266
- # Auto-PreTrain
267
- Login, pick a dataset, and train a **Custom Code** language model.
268
- We will generate `modeling_custom.py` and `configuration_custom.py` and upload them to your repo!
269
- """
270
- )
271
-
272
- with gr.Row():
273
- login_btn = gr.LoginButton(value="Sign in with Hugging Face to Train")
274
 
275
  with gr.Row():
276
- with gr.Column():
277
- gr.Markdown("### 1. Data Configuration")
278
- dataset_input = gr.Textbox(
279
- label="Dataset Name",
280
- value="roneneldan/TinyStories",
281
- placeholder="e.g. wikitext, roneneldan/TinyStories"
282
- )
283
- sample_limit = gr.Slider(
284
- minimum=100, maximum=5000, value=500, step=100,
285
- label="Sample Size"
286
- )
287
 
288
- with gr.Column():
289
- gr.Markdown("### 2. Hyperparameters")
290
- model_name_input = gr.Textbox(
291
- label="Model Name",
292
- value="my-custom-tiny-model",
 
 
 
 
 
 
 
 
 
 
 
 
293
  )
294
-
 
295
  with gr.Row():
296
- layers = gr.Slider(minimum=1, maximum=6, value=2, step=1, label="Layers")
297
- embd = gr.Slider(minimum=32, maximum=256, value=64, step=32, label="Embed Dim")
298
-
 
 
 
 
 
 
 
 
 
 
299
  with gr.Row():
300
- epochs = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Epochs")
301
- lr = gr.Number(label="Learning Rate", value=5e-4)
302
 
303
- train_btn = gr.Button("🚀 Train Custom Model", variant="primary")
304
 
305
  with gr.Row():
306
- log_output = gr.Code(label="Training Logs", language="json", lines=10)
307
- status_output = gr.Textbox(label="Final Status")
308
 
309
  train_btn.click(
310
  fn=train_and_push_generator,
311
  inputs=[
312
- dataset_input,
313
- model_name_input,
314
- layers,
315
- embd,
316
- epochs,
317
- lr,
318
- sample_limit
319
  ],
320
  outputs=[log_output, status_output]
321
  )
 
1
  import gradio as gr
2
  import torch
3
+ import os
4
+ import threading
5
+ import queue
6
+ import time
7
+ import json
8
  from transformers import (
9
  GPT2Config,
10
  GPT2LMHeadModel,
 
15
  TrainerCallback
16
  )
17
  from datasets import load_dataset
18
+ from huggingface_hub import whoami, HfApi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # --- Helper Classes ---
21
 
 
41
  return None
42
 
43
  def train_thread_target(
44
+ token,
45
  dataset_id,
46
  model_name,
47
  num_layers,
48
  n_embd,
49
+ n_head,
50
+ context_length,
51
  epochs,
52
  lr,
53
+ weight_decay,
54
+ warmup_steps,
55
+ batch_size,
56
+ grad_accumulation,
57
  sample_limit,
 
58
  log_queue,
59
  result_queue
60
  ):
61
  """
62
+ Background thread for training.
 
63
  """
64
  try:
65
  username = get_user_info(token)
66
  if not username:
67
+ raise ValueError("Invalid Hugging Face Token. Could not authenticate.")
68
 
69
  full_repo_id = f"{username}/{model_name}"
70
+ log_queue.put(f"🚀 Initializing for {full_repo_id}...\n")
71
 
72
  # 1. Load Dataset
73
+ log_queue.put(f"📚 Loading dataset: {dataset_id} (Limit: {sample_limit})...\n")
74
  try:
75
  dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
76
  except Exception as e:
77
  raise ValueError(f"Error loading dataset: {e}")
78
 
79
+ # Auto-detect text column
80
  text_column = "text"
81
  if "text" not in dataset.column_names:
82
+ for col in dataset.column_names:
83
+ if isinstance(dataset[0][col], str):
84
  text_column = col
85
  break
86
 
87
+ log_queue.put(f"🔍 Using text column: '{text_column}'\n")
 
88
 
89
  # 2. Tokenize
90
  log_queue.put("✂️ Tokenizing data...\n")
 
92
  tokenizer.pad_token = tokenizer.eos_token
93
 
94
  def tokenize_function(examples):
95
+ return tokenizer(
96
+ examples[text_column],
97
+ padding="max_length",
98
+ truncation=True,
99
+ max_length=int(context_length)
100
+ )
101
 
102
+ tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
103
 
104
  # 3. Initialize Model
105
+ log_queue.put("🏗️ Building GPT-2 Architecture...\n")
 
 
106
  config = GPT2Config(
107
  vocab_size=len(tokenizer),
108
+ n_positions=int(context_length),
109
+ n_ctx=int(context_length),
110
  n_embd=int(n_embd),
111
  n_layer=int(num_layers),
112
+ n_head=int(n_head),
113
  )
 
 
 
114
  model = GPT2LMHeadModel(config)
115
 
116
  # 4. Train
 
120
  output_dir="./results",
121
  overwrite_output_dir=True,
122
  num_train_epochs=epochs,
123
+ per_device_train_batch_size=int(batch_size),
124
+ gradient_accumulation_steps=int(grad_accumulation),
 
 
125
  learning_rate=lr,
126
+ weight_decay=weight_decay,
127
+ warmup_steps=int(warmup_steps),
128
+ logging_steps=10,
129
+ save_strategy="no", # Save only at the end
130
+ push_to_hub=False,
131
  report_to="none",
132
  use_cpu=not torch.cuda.is_available(),
133
+ fp16=torch.cuda.is_available(),
134
  )
135
 
136
  trainer = Trainer(
 
143
 
144
  trainer.train()
145
 
146
+ # 5. Push to Hub
147
+ log_queue.put(f"☁️ Pushing weights to https://huggingface.co/{full_repo_id}...\n")
148
+ model.push_to_hub(full_repo_id, token=token)
149
+ tokenizer.push_to_hub(full_repo_id, token=token)
 
 
 
 
 
150
 
151
+ result_queue.put(f"🎉 Success! Model published to: https://huggingface.co/{full_repo_id}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
  except Exception as e:
154
  log_queue.put(f"❌ Error: {str(e)}\n")
155
+ result_queue.put(None)
156
 
157
  # --- Main Generator Function ---
158
 
159
  def train_and_push_generator(
160
+ token, dataset_id, model_name,
161
+ num_layers, n_embd, n_head, context_length,
162
+ epochs, lr, weight_decay, warmup_steps,
163
+ batch_size, grad_accumulation, sample_limit
 
 
 
 
164
  ):
165
+ if not token:
166
+ yield "Error: Hugging Face Token is required.", ""
167
  return
168
 
 
 
 
169
  log_queue = queue.Queue()
170
  result_queue = queue.Queue()
171
 
 
172
  t = threading.Thread(target=train_thread_target, args=(
173
+ token, dataset_id, model_name,
174
+ num_layers, n_embd, n_head, context_length,
175
+ epochs, lr, weight_decay, warmup_steps,
176
+ batch_size, grad_accumulation, sample_limit,
177
+ log_queue, result_queue
178
  ))
179
  t.start()
180
 
 
181
  logs_history = ""
 
182
  while t.is_alive():
 
183
  while not log_queue.empty():
184
+ logs_history += log_queue.get()
185
+ yield logs_history, "Training in progress..."
 
186
  time.sleep(0.5)
187
 
 
188
  while not log_queue.empty():
189
+ logs_history += log_queue.get()
 
190
 
 
191
  if not result_queue.empty():
192
  result = result_queue.get()
193
+ yield logs_history, result or "Failed. Check logs for errors."
 
 
 
194
  else:
195
  yield logs_history, "Process finished unexpectedly."
196
 
197
  # --- UI Layout ---
198
 
199
+ with gr.Blocks(theme=gr.themes.Default(primary_hue="orange", secondary_hue="gray")) as demo:
200
+ gr.Markdown("# 🔥 Advanced Auto-PreTrain")
201
+ gr.Markdown("Configure your transformer architecture and train it directly to your Hugging Face account.")
 
 
 
 
 
 
 
 
202
 
203
  with gr.Row():
204
+ hf_token = gr.Textbox(
205
+ label="Hugging Face Write Token",
206
+ placeholder="hf_...",
207
+ type="password",
208
+ info="Get your token at huggingface.co/settings/tokens (must have 'Write' access)"
209
+ )
210
+ model_name_input = gr.Textbox(
211
+ label="Model Repository Name",
212
+ value="my-tiny-gpt2",
213
+ placeholder="e.g. tiny-coder-v1"
214
+ )
215
 
216
+ with gr.Tabs():
217
+ with gr.TabItem("1. Dataset & Data"):
218
+ with gr.Row():
219
+ dataset_input = gr.Textbox(
220
+ label="Dataset ID",
221
+ value="roneneldan/TinyStories",
222
+ placeholder="e.g. wikitext"
223
+ )
224
+ sample_limit = gr.Number(
225
+ label="Sample Limit",
226
+ value=1000,
227
+ precision=0,
228
+ info="Number of rows to use for training"
229
+ )
230
+ context_length = gr.Slider(
231
+ minimum=64, maximum=1024, value=128, step=64,
232
+ label="Max Context Length (Sequence Length)"
233
  )
234
+
235
+ with gr.TabItem("2. Model Architecture"):
236
  with gr.Row():
237
+ layers = gr.Slider(minimum=1, maximum=24, value=4, step=1, label="Number of Layers")
238
+ embd = gr.Slider(minimum=64, maximum=1024, value=256, step=64, label="Embedding Dimension")
239
+ with gr.Row():
240
+ heads = gr.Slider(minimum=2, maximum=16, value=8, step=2, label="Attention Heads")
241
+ gr.Markdown("Note: Embedding dimension must be divisible by attention heads.")
242
+
243
+ with gr.TabItem("3. Training Hyperparameters"):
244
+ with gr.Row():
245
+ epochs = gr.Slider(minimum=1, maximum=50, value=1, step=1, label="Epochs")
246
+ lr = gr.Number(label="Learning Rate", value=5e-4, format="%.1e")
247
+ with gr.Row():
248
+ batch_size = gr.Slider(minimum=1, maximum=64, value=8, step=1, label="Batch Size (per device)")
249
+ grad_accumulation = gr.Slider(minimum=1, maximum=32, value=1, step=1, label="Gradient Accumulation Steps")
250
  with gr.Row():
251
+ weight_decay = gr.Slider(minimum=0.0, maximum=0.1, value=0.01, step=0.01, label="Weight Decay")
252
+ warmup_steps = gr.Number(label="Warmup Steps", value=100, precision=0)
253
 
254
+ train_btn = gr.Button("🚀 Start Pre-Training", variant="primary")
255
 
256
  with gr.Row():
257
+ log_output = gr.Code(label="Live Training Logs", language="json", lines=15)
258
+ status_output = gr.Textbox(label="Status & Hub Link", interactive=False)
259
 
260
  train_btn.click(
261
  fn=train_and_push_generator,
262
  inputs=[
263
+ hf_token, dataset_input, model_name_input,
264
+ layers, embd, heads, context_length,
265
+ epochs, lr, weight_decay, warmup_steps,
266
+ batch_size, grad_accumulation, sample_limit
 
 
 
267
  ],
268
  outputs=[log_output, status_output]
269
  )