FlameF0X committed on
Commit
0bde4c8
·
verified ·
1 Parent(s): 553a5e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +188 -79
app.py CHANGED
@@ -6,13 +6,49 @@ from transformers import (
6
  GPT2Tokenizer,
7
  Trainer,
8
  TrainingArguments,
9
- DataCollatorForLanguageModeling
 
10
  )
11
  from datasets import load_dataset
12
  from huggingface_hub import whoami
13
  import os
 
 
 
 
14
 
15
- # --- Helper Functions ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def get_user_info(token):
18
  """Retrieves the username from the HF token."""
@@ -24,7 +60,7 @@ def get_user_info(token):
24
  except Exception:
25
  return None
26
 
27
- def train_and_push(
28
  dataset_id,
29
  model_name,
30
  num_layers,
@@ -32,58 +68,42 @@ def train_and_push(
32
  epochs,
33
  lr,
34
  sample_limit,
35
- oauth_token: gr.OAuthToken
 
 
36
  ):
37
  """
38
- Main Logic:
39
- 1. Authenticate
40
- 2. Load & Prepare Data
41
- 3. Initialize Tiny Model
42
- 4. Train
43
- 5. Push to Hub
44
  """
45
-
46
- # 1. Authentication Check
47
- if oauth_token is None or oauth_token.token is None:
48
- raise gr.Error("You must be logged in to train a model!")
49
-
50
- token = oauth_token.token
51
- username = get_user_info(token)
52
-
53
- if not username:
54
- raise gr.Error("Could not retrieve user info. Please try logging in again.")
55
-
56
- full_repo_id = f"{username}/{model_name}"
57
-
58
- progress = gr.Progress()
59
-
60
  try:
61
- # 2. Load Dataset
62
- progress(0.1, desc=f"Loading dataset: {dataset_id}...")
63
-
64
- # We try to load the dataset. We'll default to the 'train' split.
65
- # We only take a small slice to keep it fast for this demo.
 
 
 
 
66
  try:
67
- # Try loading just the first 'sample_limit' rows to save bandwidth/memory
68
  dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
69
  except Exception as e:
70
- raise gr.Error(f"Error loading dataset: {str(e)}. Make sure it exists and has a 'train' split.")
71
 
72
- # Heuristic: Find the text column (first string column)
73
  text_column = "text"
74
  if "text" not in dataset.column_names:
75
- # simple fallback: look for the first string column
76
  for col, dtype in zip(dataset.column_names, dataset.features.values()):
77
  if hasattr(dtype, 'dtype') and dtype.dtype == 'string':
78
  text_column = col
79
  break
80
 
81
  if text_column not in dataset.column_names:
82
- raise gr.Error("Could not find a text column in this dataset. Please use a dataset with a 'text' column.")
83
-
84
- progress(0.2, desc="Tokenizing data...")
85
-
86
- # We use the standard GPT-2 tokenizer for convenience
87
  tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
88
  tokenizer.pad_token = tokenizer.eos_token
89
 
@@ -91,76 +111,164 @@ def train_and_push(
91
  return tokenizer(examples[text_column], padding="max_length", truncation=True, max_length=128)
92
 
93
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
94
-
95
  # 3. Initialize Model
96
- progress(0.3, desc="Initializing Nano Model...")
97
 
98
- # We create a custom configuration based on user inputs (Constrained for speed)
99
  config = GPT2Config(
100
  vocab_size=len(tokenizer),
101
- n_positions=128, # Short context window for speed
102
  n_ctx=128,
103
- n_embd=int(n_embd), # Small embedding size
104
- n_layer=int(num_layers), # Few layers
105
  n_head=4,
106
  )
107
 
 
 
108
  model = GPT2LMHeadModel(config)
109
 
110
- # 4. Training
111
- progress(0.4, desc="Starting Training (this might take a minute)...")
112
 
113
  training_args = TrainingArguments(
114
  output_dir="./results",
115
  overwrite_output_dir=True,
116
  num_train_epochs=epochs,
117
  per_device_train_batch_size=8,
118
- save_steps=500,
119
  save_total_limit=1,
120
  prediction_loss_only=True,
121
  learning_rate=lr,
122
- logging_steps=10,
123
- report_to="none", # Don't log to wandb/tensorboard
124
- use_cpu=not torch.cuda.is_available(), # Force CPU if no GPU available
125
- )
126
-
127
- data_collator = DataCollatorForLanguageModeling(
128
- tokenizer=tokenizer, mlm=False
129
  )
130
 
131
  trainer = Trainer(
132
  model=model,
133
  args=training_args,
134
- data_collator=data_collator,
135
  train_dataset=tokenized_datasets,
 
136
  )
137
 
138
  trainer.train()
 
 
 
139
 
140
- # 5. Push to Hub
141
- progress(0.9, desc=f"Pushing to {full_repo_id}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
- # We push both model and tokenizer
144
- model.push_to_hub(full_repo_id, token=token, private=True) # Default to private for safety
145
  tokenizer.push_to_hub(full_repo_id, token=token, private=True)
146
 
147
- return f"🎉 Success! Model trained and pushed to: https://huggingface.co/{full_repo_id}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
 
 
149
  except Exception as e:
150
- raise gr.Error(f"An error occurred: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  # --- UI Layout ---
153
 
154
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
155
  gr.Markdown(
156
  """
157
- # 🚂 Tiny AutoTrain Space
158
- Login with your Hugging Face account, pick a dataset, and train a tiny language model from scratch!
159
- The model will be automatically uploaded to your profile.
160
  """
161
  )
162
 
163
- # Login Button (Native HF Integration)
164
  with gr.Row():
165
  login_btn = gr.LoginButton(value="Sign in with Hugging Face to Train")
166
 
@@ -168,37 +276,38 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
168
  with gr.Column():
169
  gr.Markdown("### 1. Data Configuration")
170
  dataset_input = gr.Textbox(
171
- label="Dataset Name (from Hub)",
172
  value="roneneldan/TinyStories",
173
  placeholder="e.g. wikitext, roneneldan/TinyStories"
174
  )
175
  sample_limit = gr.Slider(
176
  minimum=100, maximum=5000, value=500, step=100,
177
- label="Training Sample Size (Keep small for speed)"
178
  )
179
 
180
  with gr.Column():
181
- gr.Markdown("### 2. Model Hyperparameters")
182
  model_name_input = gr.Textbox(
183
- label="New Model Name",
184
- value="my-tiny-model",
185
- placeholder="Name of the repo to create"
186
  )
187
 
188
  with gr.Row():
189
- layers = gr.Slider(minimum=1, maximum=6, value=2, step=1, label="Layers (Depth)")
190
- embd = gr.Slider(minimum=32, maximum=256, value=64, step=32, label="Embedding Size (Width)")
191
 
192
  with gr.Row():
193
  epochs = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Epochs")
194
  lr = gr.Number(label="Learning Rate", value=5e-4)
195
 
196
- train_btn = gr.Button("🚀 Train & Publish", variant="primary")
197
- output_text = gr.Textbox(label="Status", interactive=False)
 
 
 
198
 
199
- # Wire up the button
200
  train_btn.click(
201
- fn=train_and_push,
202
  inputs=[
203
  dataset_input,
204
  model_name_input,
@@ -208,7 +317,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
208
  lr,
209
  sample_limit
210
  ],
211
- outputs=output_text
212
  )
213
 
214
  if __name__ == "__main__":
 
6
  GPT2Tokenizer,
7
  Trainer,
8
  TrainingArguments,
9
+ DataCollatorForLanguageModeling,
10
+ TrainerCallback
11
  )
12
  from datasets import load_dataset
13
  from huggingface_hub import whoami
14
  import os
15
+ import threading
16
+ import queue
17
+ import time
18
+ import json
19
 
20
+ # --- Custom Code Templates ---
21
+
22
+ CONFIGURATION_CODE = """
23
+ from transformers import GPT2Config
24
+
25
+ class CustomTinyConfig(GPT2Config):
26
+ model_type = "custom_tiny"
27
+ """
28
+
29
+ MODELING_CODE = """
30
+ from transformers import GPT2LMHeadModel
31
+ from .configuration_custom import CustomTinyConfig
32
+
33
+ class CustomTinyModel(GPT2LMHeadModel):
34
+ config_class = CustomTinyConfig
35
+
36
+ def __init__(self, config):
37
+ super().__init__(config)
38
+ """
39
+
40
+ # --- Helper Classes ---
41
+
42
+ class LogQueueCallback(TrainerCallback):
43
+ """A custom callback that pushes logs to a queue for the UI."""
44
+ def __init__(self, log_queue):
45
+ self.log_queue = log_queue
46
+
47
+ def on_log(self, args, state, control, logs=None, **kwargs):
48
+ if logs:
49
+ # Format log dictionary nicely
50
+ log_str = f"Step {state.global_step}: {json.dumps(logs)}\n"
51
+ self.log_queue.put(log_str)
52
 
53
  def get_user_info(token):
54
  """Retrieves the username from the HF token."""
 
60
  except Exception:
61
  return None
62
 
63
+ def train_thread_target(
64
  dataset_id,
65
  model_name,
66
  num_layers,
 
68
  epochs,
69
  lr,
70
  sample_limit,
71
+ token,
72
+ log_queue,
73
+ result_queue
74
  ):
75
  """
76
+ Function to be run in a separate thread.
77
+ Handles the heavy lifting of training and pushing.
 
 
 
 
78
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  try:
80
+ username = get_user_info(token)
81
+ if not username:
82
+ raise ValueError("Could not authenticate user.")
83
+
84
+ full_repo_id = f"{username}/{model_name}"
85
+ log_queue.put(f"🚀 Starting process for {full_repo_id}...\n")
86
+
87
+ # 1. Load Dataset
88
+ log_queue.put(f"📚 Loading dataset: {dataset_id}...\n")
89
  try:
 
90
  dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
91
  except Exception as e:
92
+ raise ValueError(f"Error loading dataset: {e}")
93
 
94
+ # Find text column
95
  text_column = "text"
96
  if "text" not in dataset.column_names:
 
97
  for col, dtype in zip(dataset.column_names, dataset.features.values()):
98
  if hasattr(dtype, 'dtype') and dtype.dtype == 'string':
99
  text_column = col
100
  break
101
 
102
  if text_column not in dataset.column_names:
103
+ raise ValueError("Could not find a text column in this dataset.")
104
+
105
+ # 2. Tokenize
106
+ log_queue.put("✂️ Tokenizing data...\n")
 
107
  tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
108
  tokenizer.pad_token = tokenizer.eos_token
109
 
 
111
  return tokenizer(examples[text_column], padding="max_length", truncation=True, max_length=128)
112
 
113
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
114
+
115
  # 3. Initialize Model
116
+ log_queue.put("๐Ÿ—๏ธ Initializing Custom Nano Model...\n")
117
 
118
+ # We use GPT2Config but will modify it before push to look like "CustomTinyConfig"
119
  config = GPT2Config(
120
  vocab_size=len(tokenizer),
121
+ n_positions=128,
122
  n_ctx=128,
123
+ n_embd=int(n_embd),
124
+ n_layer=int(num_layers),
125
  n_head=4,
126
  )
127
 
128
+ # We train using standard GPT2 implementation for stability,
129
+ # but will wrap it in custom code files on upload.
130
  model = GPT2LMHeadModel(config)
131
 
132
+ # 4. Train
133
+ log_queue.put("๐Ÿ‹๏ธ Starting Training Loop...\n")
134
 
135
  training_args = TrainingArguments(
136
  output_dir="./results",
137
  overwrite_output_dir=True,
138
  num_train_epochs=epochs,
139
  per_device_train_batch_size=8,
140
+ save_steps=1000, # Don't save intermediate checkpoints to save time/space
141
  save_total_limit=1,
142
  prediction_loss_only=True,
143
  learning_rate=lr,
144
+ logging_steps=5, # Log frequently for the UI
145
+ report_to="none",
146
+ use_cpu=not torch.cuda.is_available(),
 
 
 
 
147
  )
148
 
149
  trainer = Trainer(
150
  model=model,
151
  args=training_args,
152
+ data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
153
  train_dataset=tokenized_datasets,
154
+ callbacks=[LogQueueCallback(log_queue)]
155
  )
156
 
157
  trainer.train()
158
+
159
+ # 5. Prepare Custom Code Files
160
+ log_queue.put("๐Ÿ“ Generating Custom Code files (modeling_custom.py)...\n")
161
 
162
+ # Write the python files locally
163
+ with open("configuration_custom.py", "w") as f:
164
+ f.write(CONFIGURATION_CODE)
165
+
166
+ with open("modeling_custom.py", "w") as f:
167
+ f.write(MODELING_CODE)
168
+
169
+ # Update config to point to custom code
170
+ # This makes it a "Custom Code" model on the Hub
171
+ model.config.auto_map = {
172
+ "AutoConfig": "configuration_custom.CustomTinyConfig",
173
+ "AutoModelForCausalLM": "modeling_custom.CustomTinyModel"
174
+ }
175
+ # We also need to change the architecture name in config so it matches the class name
176
+ model.config.architectures = ["CustomTinyModel"]
177
+
178
+ # 6. Push to Hub
179
+ log_queue.put(f"โ˜๏ธ Pushing to {full_repo_id} (this includes custom python files)...\n")
180
 
181
+ # Push model weights and config
182
+ model.push_to_hub(full_repo_id, token=token, private=True)
183
  tokenizer.push_to_hub(full_repo_id, token=token, private=True)
184
 
185
+ # Upload the custom python files explicitly
186
+ api = gr.HuggingFaceHub(token=token) # wrapper or use HfApi
187
+ from huggingface_hub import HfApi
188
+ hf_api = HfApi(token=token)
189
+
190
+ hf_api.upload_file(
191
+ path_or_fileobj="configuration_custom.py",
192
+ path_in_repo="configuration_custom.py",
193
+ repo_id=full_repo_id,
194
+ )
195
+ hf_api.upload_file(
196
+ path_or_fileobj="modeling_custom.py",
197
+ path_in_repo="modeling_custom.py",
198
+ repo_id=full_repo_id,
199
+ )
200
 
201
+ result_queue.put(f"🎉 Done! Model available at: https://huggingface.co/{full_repo_id}")
202
+
203
  except Exception as e:
204
+ log_queue.put(f"โŒ Error: {str(e)}\n")
205
+ result_queue.put(None) # Signal failure
206
+
207
+ # --- Main Generator Function ---
208
+
209
+ def train_and_push_generator(
210
+ dataset_id,
211
+ model_name,
212
+ num_layers,
213
+ n_embd,
214
+ epochs,
215
+ lr,
216
+ sample_limit,
217
+ oauth_token: gr.OAuthToken
218
+ ):
219
+ if oauth_token is None or oauth_token.token is None:
220
+ yield "You must be logged in to train a model!", ""
221
+ return
222
+
223
+ token = oauth_token.token
224
+
225
+ # queues for communication between threads
226
+ log_queue = queue.Queue()
227
+ result_queue = queue.Queue()
228
+
229
+ # Start training in background thread
230
+ t = threading.Thread(target=train_thread_target, args=(
231
+ dataset_id, model_name, num_layers, n_embd, epochs, lr, sample_limit, token, log_queue, result_queue
232
+ ))
233
+ t.start()
234
+
235
+ # Main loop: yield logs as they come in
236
+ logs_history = ""
237
+
238
+ while t.is_alive():
239
+ # Drain queue
240
+ while not log_queue.empty():
241
+ new_log = log_queue.get()
242
+ logs_history += new_log
243
+ yield logs_history, "Training..."
244
+ time.sleep(0.5)
245
+
246
+ # Drain remaining logs after thread finishes
247
+ while not log_queue.empty():
248
+ new_log = log_queue.get()
249
+ logs_history += new_log
250
+
251
+ # Get final result
252
+ if not result_queue.empty():
253
+ result = result_queue.get()
254
+ if result:
255
+ yield logs_history, result
256
+ else:
257
+ yield logs_history, "Failed. Check logs."
258
+ else:
259
+ yield logs_history, "Process finished unexpectedly."
260
 
261
  # --- UI Layout ---
262
 
263
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
264
  gr.Markdown(
265
  """
266
+ # 🚂 Tiny AutoTrain Space (Custom Code Edition)
267
+ Login, pick a dataset, and train a **Custom Code** language model.
268
+ We will generate `modeling_custom.py` and `configuration_custom.py` and upload them to your repo!
269
  """
270
  )
271
 
 
272
  with gr.Row():
273
  login_btn = gr.LoginButton(value="Sign in with Hugging Face to Train")
274
 
 
276
  with gr.Column():
277
  gr.Markdown("### 1. Data Configuration")
278
  dataset_input = gr.Textbox(
279
+ label="Dataset Name",
280
  value="roneneldan/TinyStories",
281
  placeholder="e.g. wikitext, roneneldan/TinyStories"
282
  )
283
  sample_limit = gr.Slider(
284
  minimum=100, maximum=5000, value=500, step=100,
285
+ label="Sample Size"
286
  )
287
 
288
  with gr.Column():
289
+ gr.Markdown("### 2. Hyperparameters")
290
  model_name_input = gr.Textbox(
291
+ label="Model Name",
292
+ value="my-custom-tiny-model",
 
293
  )
294
 
295
  with gr.Row():
296
+ layers = gr.Slider(minimum=1, maximum=6, value=2, step=1, label="Layers")
297
+ embd = gr.Slider(minimum=32, maximum=256, value=64, step=32, label="Embed Dim")
298
 
299
  with gr.Row():
300
  epochs = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Epochs")
301
  lr = gr.Number(label="Learning Rate", value=5e-4)
302
 
303
+ train_btn = gr.Button("🚀 Train Custom Model", variant="primary")
304
+
305
+ with gr.Row():
306
+ log_output = gr.Code(label="Training Logs", language="json", lines=10)
307
+ status_output = gr.Textbox(label="Final Status")
308
 
 
309
  train_btn.click(
310
+ fn=train_and_push_generator,
311
  inputs=[
312
  dataset_input,
313
  model_name_input,
 
317
  lr,
318
  sample_limit
319
  ],
320
+ outputs=[log_output, status_output]
321
  )
322
 
323
  if __name__ == "__main__":