Percy3822 committed on
Commit
579cbca
·
verified ·
1 Parent(s): 304794d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +345 -80
app.py CHANGED
@@ -1,102 +1,367 @@
1
  import os
2
  import shutil
3
  import subprocess
 
 
 
 
 
 
4
  import gradio as gr
5
  from transformers import pipeline
6
 
7
- uploaded_file_path = "dataset.jsonl"
8
- log_path = "train.log"
9
- model_dir = "trained_model"
10
- zip_file = "trained_model.zip"
 
 
11
 
12
- # Try loading the generator model once at the top (for better performance)
13
- try:
14
- generator = pipeline("text-generation", model=model_dir, tokenizer="distilgpt2")
15
- except:
16
- generator = None
 
 
 
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def upload_file(file):
 
19
  if file is None:
20
- return "No file uploaded."
21
- shutil.copy(file.name, uploaded_file_path)
22
- return "βœ… File uploaded successfully."
23
-
24
- def start_training():
25
- with open(log_path, "w") as log_file:
26
- process = subprocess.Popen(
27
- ["python", "train.py", "--dataset", uploaded_file_path],
28
- stdout=log_file,
29
- stderr=subprocess.STDOUT
30
- )
31
- process.wait()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- if os.path.exists(model_dir):
34
- shutil.make_archive("trained_model", "zip", model_dir)
35
- return "βœ… Training complete!", zip_file
 
36
  else:
37
- return "❌ Training failed.", None
38
 
39
- def read_logs():
40
- if os.path.exists(log_path):
41
- with open(log_path, "r") as f:
42
- return f.read()
43
- return "Waiting for logs..."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
- def generate_response(prompt):
 
 
46
  try:
47
- if generator is None:
48
- return "❌ Model not loaded. Please train or upload a valid model."
49
- result = generator(
50
- prompt,
51
- max_length=256,
52
- do_sample=True,
53
- temperature=0.7,
54
- truncation=True
55
- )[0]["generated_text"]
56
- return result
 
 
 
57
  except Exception as e:
58
  return f"❌ Error: {e}"
59
 
60
- def upload_model(file):
61
- global generator
62
- if file is None:
63
- return "No model uploaded."
64
- shutil.unpack_archive(file.name, model_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  try:
66
- generator = pipeline("text-generation", model=model_dir, tokenizer="distilgpt2")
67
- return "βœ… Model uploaded and loaded successfully!"
 
68
  except Exception as e:
69
- return f"❌ Failed to load model: {e}"
70
-
71
- # === UI ===
72
- with gr.Blocks() as app:
73
- with gr.Tab("🧠 Train Python AI"):
74
- gr.Markdown("## πŸ“₯ Upload your dataset and 🎯 train a *Python AI* model")
75
- file_input = gr.File(label="Upload JSONL Dataset")
76
- upload_btn = gr.Button("Upload Dataset")
77
- status_box = gr.Textbox(label="Upload Status")
78
-
79
- start_btn = gr.Button("πŸš€ Start Training")
80
- log_output = gr.Textbox(label="πŸ“œ Training Logs", lines=15)
81
-
82
- download_btn = gr.File(label="πŸ“₯ Download Trained Model", visible=False)
83
-
84
- upload_btn.click(fn=upload_file, inputs=file_input, outputs=status_box)
85
- start_btn.click(fn=start_training, outputs=[status_box, download_btn])
86
- start_btn.click(fn=read_logs, outputs=log_output)
87
-
88
- with gr.Tab("πŸš€ Test Python AI"):
89
- gr.Markdown("## πŸ’‘ Try your trained Python AI below")
90
- prompt_input = gr.Textbox(label="Enter Prompt")
91
- test_btn = gr.Button("πŸ” Test AI")
92
- response_output = gr.Textbox(label="AI Response", lines=10)
93
- test_btn.click(fn=generate_response, inputs=prompt_input, outputs=response_output)
94
-
95
- with gr.Tab("πŸ“€ Upload Pretrained Model"):
96
- gr.Markdown("## πŸ“¦ Upload a trained Python AI model (.zip) to test")
97
- model_file_input = gr.File(label="Upload Model ZIP")
98
- model_upload_btn = gr.Button("Upload Model")
99
- model_status = gr.Textbox(label="Model Upload Status")
100
- model_upload_btn.click(fn=upload_model, inputs=model_file_input, outputs=model_status)
101
 
102
  app.launch()
 
1
  import os
2
  import shutil
3
  import subprocess
4
+ import threading
5
+ import uuid
6
+ import time
7
+ import zipfile
8
+ import glob
9
+ import gzip
10
  import gradio as gr
11
  from transformers import pipeline
12
 
13
+ # ---- Paths / constants ----
14
+ LOG_FILE = "train.log"
15
+ GEN_LOG_FILE = "dataset_gen.log"
16
+ MODEL_DIR = "trained_model"
17
+ ZIP_FILE = "trained_model.zip"
18
+ ZIP_TEMP = ZIP_FILE + ".part" # atomic write to avoid corrupt downloads
19
 
20
+ # ---- Helpers ----
21
def _human_size(nbytes: int) -> str:
    """Format a byte count as a short human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, and is rendered with one decimal place,
    e.g. ``1536`` -> ``"1.5 KB"``.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    value = float(nbytes)
    # Every unit except the last may still be promoted to the next one.
    for unit in units[:-1]:
        if value < 1024:
            return f"{value:.1f} {unit}"
        value /= 1024.0
    # Anything that is still large is reported in the biggest unit.
    return f"{value:.1f} {units[-1]}"
28
 
29
def _download_info_text() -> str:
    """Describe the trained-model archive (name, size, mtime) for the UI.

    Returns "No trained model yet." when ZIP_FILE cannot be stat'ed.
    """
    # A single stat call instead of exists() + getsize() + getmtime():
    # ensure_clean() may delete the archive between separate calls.
    try:
        info = os.stat(ZIP_FILE)
    except OSError:
        return "No trained model yet."
    size = _human_size(info.st_size)
    mtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(info.st_mtime))
    return f"*Model ready:* {ZIP_FILE} \n*Size:* {size} \n*Last modified:* {mtime}"
35
+
36
def _read_file_safely(path: str, fallback: str) -> str:
    """Return the text content of *path*, or *fallback* if it cannot be read.

    The file is decoded as UTF-8 and undecodable bytes are dropped, so a
    partially written log never raises.
    """
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except (OSError, ValueError):
        # OSError: missing file, directory, permission problem.
        # ValueError: malformed path (e.g. embedded NUL byte).
        return fallback
44
+
45
def ensure_clean():
    """Best-effort removal of a previous model archive and its .part file."""
    for stale in (ZIP_FILE, ZIP_TEMP):
        try:
            os.remove(stale)
        except OSError:
            # Missing file or a transient permission problem: cleanup is
            # deliberately best-effort, so carry on with the next path.
            pass
52
+
53
def _zip_folder_atomic(src_dir: str, zip_path: str, tmp_path: str):
    """Zip *src_dir* into *zip_path* via a temporary file.

    The archive is written to *tmp_path* first and then moved over
    *zip_path*, so a reader never sees a half-written zip. Entry names are
    relative to *src_dir*.

    Raises whatever the zip/write step raises; in that case *tmp_path* is
    removed and an existing *zip_path* is left untouched.
    """
    try:
        # Mode "w" truncates a stale tmp file, so no separate delete is needed.
        with zipfile.ZipFile(tmp_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            for root, _, files in os.walk(src_dir):
                for fn in files:
                    full = os.path.join(root, fn)
                    zf.write(full, arcname=os.path.relpath(full, src_dir))
    except BaseException:
        # Do not leave a corrupt .part file behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    # os.replace overwrites the destination in one step; deleting zip_path
    # first would open a window in which no archive exists at all.
    os.replace(tmp_path, zip_path)
66
+
67
+ # ============================================================
68
+ # DATASET GENERATOR (PYTHON)
69
+ # ============================================================
70
def start_generation(total, shard_size, out_dir, prefix):
    """Kick off Python dataset generation in a background thread.

    Args:
        total: Number of samples to generate; falsy values fall back to 1,000,000.
        shard_size: Rows per shard; falsy values fall back to 10,000.
        out_dir: Output folder; empty/blank falls back to "python_dataset_v1".
        prefix: Shard file prefix; empty/blank falls back to "python".

    Returns:
        A status string for the UI. Progress is written to GEN_LOG_FILE.
    """
    import sys  # local import: only needed to locate the running interpreter

    total = int(total or 1_000_000)
    shard_size = int(shard_size or 10_000)
    # Strip first, then fall back, so whitespace-only input does not become "".
    out_dir = (out_dir or "").strip() or "python_dataset_v1"
    prefix = (prefix or "").strip() or "python"

    # Explicit UTF-8: the log contains emoji and is read back as UTF-8.
    with open(GEN_LOG_FILE, "w", encoding="utf-8") as log:
        log.write(f"🚧 Generating dataset: total={total}, shard_size={shard_size}, out_dir={out_dir}, prefix={prefix}\n")

    def _worker():
        with open(GEN_LOG_FILE, "a", encoding="utf-8") as log:
            if not os.path.exists("make_python_dataset.py"):
                log.write("❌ make_python_dataset.py not found in repo root.\n")
                return
            try:
                proc = subprocess.Popen(
                    [
                        # Same interpreter/venv as the app, not whatever
                        # "python" happens to resolve to on PATH.
                        sys.executable,
                        "make_python_dataset.py",
                        "--total", str(total),
                        "--shard_size", str(shard_size),
                        "--out_dir", out_dir,
                        "--prefix", prefix,
                    ],
                    stdout=log,
                    stderr=subprocess.STDOUT,
                )
                proc.wait()
                log.write(f"\nπŸ”š Generator exited with code {proc.returncode}\n")
                if proc.returncode == 0:
                    files = sorted(glob.glob(os.path.join(out_dir, "*.jsonl.gz")))
                    log.write(f"βœ… Done. Shards: {len(files)} in {out_dir}\n")
                else:
                    log.write("❌ Generation failed.\n")
            except Exception as e:
                # Thread boundary: record the failure instead of dying silently.
                log.write(f"\n❌ Exception: {e}\n")

    threading.Thread(target=_worker, daemon=True).start()
    return f"πŸš€ Dataset generation started. Output folder: {out_dir}"
110
+
111
def read_gen_logs():
    """Return the dataset-generator log text, or a placeholder if unavailable."""
    return _read_file_safely(GEN_LOG_FILE, "Waiting for generator logs...")
113
+
114
def list_shards(folder):
    """Return a short preview of shard files in *folder* (for sanity checks).

    Lists ``*.jsonl`` files first, then ``*.jsonl.gz`` files, each group
    sorted by name, and shows at most the first ten.
    """
    # The value comes from a free-text box; tolerate None and stray spaces.
    folder = (folder or "").strip()
    if not folder or not os.path.isdir(folder):
        return "❌ Provide a valid folder path that contains .jsonl or .jsonl.gz shards."
    shards = sorted(glob.glob(os.path.join(folder, "*.jsonl")))
    shards += sorted(glob.glob(os.path.join(folder, "*.jsonl.gz")))
    total = len(shards)
    if total == 0:
        return "No shards found (*.jsonl or *.jsonl.gz)."
    preview = shards[:10]
    lines = [f"Found {total} shard(s). Showing first {len(preview)}:"]
    lines.extend(f"- {os.path.basename(p)}" for p in preview)
    return "\n".join(lines)
126
+
127
+ # ============================================================
128
+ # TRAINING
129
+ # ============================================================
130
def upload_file(file):
    """Copy an uploaded dataset to a stable path under ``uploads/``.

    Args:
        file: The uploaded file, either an object exposing ``.name`` (the
            temp-file path) or a plain path string; ``None`` if nothing was
            uploaded.

    Returns:
        ``(status_message, saved_path)``; ``saved_path`` is "" on failure.
    """
    if file is None:
        return "❌ No file uploaded.", ""
    # Accept both upload objects and bare path strings.
    src = getattr(file, "name", file)
    os.makedirs("uploads", exist_ok=True)
    # Random name so repeated uploads never overwrite each other.
    dst = os.path.join("uploads", f"dataset_{uuid.uuid4().hex}.jsonl")
    shutil.copy(src, dst)
    return f"βœ… Uploaded: {os.path.basename(src)} β†’ {dst}", dst
138
+
139
def _train_single_file(dataset_path: str, log):
    """Run ``train.py`` once on a single JSON/JSONL file.

    Args:
        dataset_path: Path passed to ``train.py --dataset``.
        log: Open, writable text file; receives the child's stdout/stderr
            and a one-line exit summary.

    Returns:
        True if ``train.py`` exited with code 0.
    """
    import sys  # local import: only needed to locate the running interpreter

    # The child writes straight to the file descriptor; flush our buffered
    # text first so earlier log lines are not emitted after its output.
    log.flush()
    proc = subprocess.Popen(
        [sys.executable, "train.py", "--dataset", dataset_path, "--output", MODEL_DIR],
        stdout=log,
        stderr=subprocess.STDOUT,
    )
    proc.wait()
    log.write(f"\n ↳ train.py exited {proc.returncode} for {os.path.basename(dataset_path)}\n")
    return proc.returncode == 0
149
+
150
def _train_worker(dataset_path: str, shards_folder: str):
    """Background training job: train, then zip MODEL_DIR on success.

    If *shards_folder* is set, every ``*.jsonl`` and ``*.jsonl.gz`` file in
    it is trained on in turn (gz shards are first decompressed to a temp
    file); otherwise *dataset_path* is trained on once. All progress goes to
    LOG_FILE.

    Returns:
        True if every training run succeeded (zipping errors are logged but
        do not change the result).
    """
    # NOTE(review): each shard is a separate train.py run with the same
    # --output; whether runs resume from the previous one depends on
    # train.py, which is not visible here. Confirm.

    # Truncate the previous log. Explicit UTF-8 because the log contains
    # emoji and is read back as UTF-8.
    with open(LOG_FILE, "w", encoding="utf-8") as log:
        log.write("πŸ”₯ Starting training...\n")

    ok = True
    with open(LOG_FILE, "a", encoding="utf-8") as log:
        if shards_folder:
            log.write(f"πŸ“‚ Folder mode: {shards_folder}\n")
            paths = sorted(glob.glob(os.path.join(shards_folder, "*.jsonl"))) + \
                sorted(glob.glob(os.path.join(shards_folder, "*.jsonl.gz")))
            if not paths:
                log.write("❌ No shards found. Aborting.\n")
                ok = False
            else:
                tmp = "tmp_train.jsonl"
                try:
                    for i, p in enumerate(paths, 1):
                        log.write(f"\n[{i}/{len(paths)}] Training on shard: {os.path.basename(p)}\n")
                        if p.endswith(".gz"):
                            # train.py is given plain JSONL, so decompress
                            # the shard to a scratch file first.
                            try:
                                with gzip.open(p, "rt", encoding="utf-8") as rf, \
                                        open(tmp, "w", encoding="utf-8") as wf:
                                    wf.writelines(rf)
                                shard_path = tmp
                            except Exception as e:
                                log.write(f"❌ Failed to read gz shard: {e}\n")
                                ok = False
                                break
                        else:
                            shard_path = p
                        # Keep our lines ahead of the child's direct output.
                        log.flush()
                        if not _train_single_file(shard_path, log):
                            ok = False
                            break
                finally:
                    # Remove the scratch file even if a run raised.
                    try:
                        os.remove(tmp)
                    except OSError:
                        pass
        else:
            if not dataset_path or not os.path.exists(dataset_path):
                log.write("❌ Please upload a valid dataset first.\n")
                ok = False
            else:
                log.flush()
                ok = _train_single_file(dataset_path, log)

        if ok and os.path.isdir(MODEL_DIR):
            try:
                time.sleep(0.5)  # settle delay
                _zip_folder_atomic(MODEL_DIR, ZIP_FILE, ZIP_TEMP)
                sz = _human_size(os.path.getsize(ZIP_FILE))
                log.write(f"\nβœ… Model zipped β†’ {ZIP_FILE} ({sz})\n")
            except Exception as e:
                log.write(f"\n❌ Zipping failed: {e}\n")
        else:
            log.write("\n❌ Training failed; no zip created.\n")

    return ok
205
+
206
def start_training(dataset_path: str, shards_folder: str):
    """Start training in a daemon thread and return a status message.

    Any previous model archive is removed first so the download area cannot
    offer a stale zip while the new run is in progress.
    """
    ensure_clean()
    worker = threading.Thread(
        target=_train_worker,
        args=(dataset_path, shards_folder),
        daemon=True,
    )
    worker.start()
    return "πŸš€ Training started in the background. Use the Refresh buttons to update."
210
+
211
def read_logs_once():
    """Return the current training log text, or a placeholder if unavailable."""
    return _read_file_safely(LOG_FILE, "Waiting for logs...")
213
 
214
def check_download():
    """Return download button state + info text (manual, non-streaming).

    Returns:
        ``(button_update, info_text)``: the button is shown and pointed at
        ZIP_FILE when the archive exists, hidden otherwise.
    """
    if not os.path.exists(ZIP_FILE):
        return gr.update(visible=False, value=None), "No trained model yet."
    return gr.update(visible=True, value=ZIP_FILE), _download_info_text()
220
 
221
+ # ============================================================
222
+ # TEST
223
+ # ============================================================
224
def upload_test_model_zip(zip_file):
    """Extract an uploaded model ZIP to ``models/test_<uuid>/``.

    The ZIP should contain a HF model folder (config.json + tokenizer +
    weights).

    Args:
        zip_file: The uploaded archive, either an object exposing ``.name``
            or a plain path string; ``None`` if nothing was uploaded.

    Returns:
        ``(status_message, extracted_path)``; the path is "" on failure.
    """
    if zip_file is None:
        return "❌ No file uploaded.", ""
    # Accept both upload objects and bare path strings.
    archive_path = getattr(zip_file, "name", zip_file)
    extract_root = os.path.join("models", f"test_{uuid.uuid4().hex}")
    os.makedirs(extract_root, exist_ok=True)
    try:
        with zipfile.ZipFile(archive_path, "r") as zf:
            zf.extractall(extract_root)
    except Exception as e:
        # Do not leave an empty or partial folder behind for a bad upload.
        shutil.rmtree(extract_root, ignore_errors=True)
        return f"❌ Failed to extract: {e}", ""
    return f"βœ… Model ZIP extracted to: {extract_root}", extract_root
239
+
240
def clear_uploaded_model():
    """Reset the uploaded-model selection.

    Returns:
        ``(status_message, "")``; the empty path clears the stored model path.
    """
    return "Model cleared. Will use trained_model/ if available.", ""
242
 
243
def generate_response(prompt, uploaded_model_path):
    """Generate text for *prompt* and report which model was used.

    Model choice, in order: *uploaded_model_path* if it is an existing
    directory, then MODEL_DIR if it exists, otherwise the "distilgpt2" hub
    model. Any error is returned as a message rather than raised.
    """
    if not prompt or not prompt.strip():
        return "Please enter a prompt."
    try:
        if uploaded_model_path and os.path.isdir(uploaded_model_path):
            model_path, src = uploaded_model_path, "(uploaded model)"
        elif os.path.isdir(MODEL_DIR):
            model_path, src = MODEL_DIR, "(trained_model/)"
        else:
            model_path, src = "distilgpt2", "(fallback: distilgpt2)"

        # NOTE(review): the tokenizer is always "distilgpt2", even for an
        # uploaded model that ships its own tokenizer. Confirm this is
        # intended. The pipeline is also rebuilt on every call.
        gen = pipeline("text-generation", model=model_path, tokenizer="distilgpt2")
        out = gen(prompt, max_length=256, do_sample=True, temperature=0.7, truncation=True)[0]["generated_text"]
        return f"{out}\n\nβ€” using {src}"
    except Exception as e:
        # UI boundary: surface the failure in the response box.
        return f"❌ Error: {e}"
262
 
263
+ # ------------- UI -------------
264
+ with gr.Blocks(title="Python AI Trainer (with Dataset Generator)") as app:
265
+ gr.Markdown("## 🐍 Python AI Trainer\nGenerate a large Python dataset, train (single file or folder of shards), download the model, and test any model (uploaded or trained).")
266
+
267
+ dataset_state = gr.State(value="") # path to single dataset file
268
+ shard_folder_state = gr.State(value="") # folder containing shards
269
+ test_model_state = gr.State(value="")
270
+
271
+ # =============== Generate Dataset ===============
272
+ with gr.Tab("πŸ§ͺ Generate Dataset"):
273
+ gr.Markdown("Generate a large Python dataset in shards (no streaming; use Refresh to see logs).")
274
+ with gr.Row():
275
+ total_in = gr.Number(value=1_000_000, label="Total samples")
276
+ shard_in = gr.Number(value=10_000, label="Rows per shard")
277
+ with gr.Row():
278
+ out_dir_in = gr.Textbox(value="python_dataset_v1", label="Output folder")
279
+ prefix_in = gr.Textbox(value="python", label="File prefix")
280
+ with gr.Row():
281
+ gen_btn = gr.Button("πŸš€ Start Generation")
282
+ gen_refresh_btn = gr.Button("πŸ” Refresh Logs")
283
+ gen_status = gr.Textbox(label="Generator Status", interactive=False)
284
+ gen_logs = gr.Textbox(label="Generator Logs", lines=16)
285
+ with gr.Row():
286
+ list_folder = gr.Textbox(value="python_dataset_v1", label="Preview shards in folder")
287
+ list_btn = gr.Button("πŸ‘€ List Shards")
288
+ list_out = gr.Textbox(label="Shard Preview", lines=8)
289
+
290
+ gen_btn.click(
291
+ fn=start_generation,
292
+ inputs=[total_in, shard_in, out_dir_in, prefix_in],
293
+ outputs=gen_status
294
+ ).then(fn=read_gen_logs, outputs=gen_logs)
295
+ gen_refresh_btn.click(fn=read_gen_logs, outputs=gen_logs)
296
+ list_btn.click(fn=list_shards, inputs=list_folder, outputs=list_out)
297
+
298
+ # ==================== Train ====================
299
+ with gr.Tab("🧠 Train"):
300
+ gr.Markdown("Upload a single JSONL *or* provide a folder with shards (.jsonl / .jsonl.gz).")
301
+ with gr.Row():
302
+ file_input = gr.File(label="Upload single JSONL dataset", file_types=[".jsonl"])
303
+ upload_btn = gr.Button("πŸ“€ Upload (single file)")
304
+ with gr.Row():
305
+ shards_folder = gr.Textbox(value="", label="Folder with shards (optional)")
306
+ use_folder_btn = gr.Button("πŸ“‚ Use Folder For Training")
307
+ status_box = gr.Textbox(label="Status", interactive=False)
308
+
309
+ with gr.Row():
310
+ start_btn = gr.Button("πŸš€ Start Training")
311
+ refresh_btn = gr.Button("πŸ” Refresh Logs")
312
+ refresh_dl_btn = gr.Button("πŸ“¦ Refresh Download Area")
313
+
314
+ log_output = gr.Textbox(label="πŸ“œ Training Logs", lines=18)
315
+
316
+ with gr.Group():
317
+ gr.Markdown("### πŸ“¦ Trained Model")
318
+ download_info = gr.Markdown(value="No trained model yet.")
319
+ download_btn = gr.DownloadButton(label="πŸ“₯ Download Trained Model (.zip)", visible=False, value=None)
320
+
321
+ upload_btn.click(fn=upload_file, inputs=file_input, outputs=[status_box, dataset_state])
322
+ use_folder_btn.click(
323
+ fn=lambda p: ("βœ… Using folder for training." if p.strip() else "❌ Provide a valid folder path.", p.strip()),
324
+ inputs=shards_folder,
325
+ outputs=[status_box, shard_folder_state]
326
+ )
327
+ start_btn.click(
328
+ fn=start_training,
329
+ inputs=[dataset_state, shard_folder_state],
330
+ outputs=status_box
331
+ ).then(fn=read_logs_once, outputs=log_output
332
+ ).then(fn=check_download, outputs=[download_btn, download_info])
333
+
334
+ refresh_btn.click(fn=read_logs_once, outputs=log_output)
335
+ refresh_dl_btn.click(fn=check_download, outputs=[download_btn, download_info])
336
+
337
+ # ===================== Test =====================
338
+ with gr.Tab("πŸš€ Test"):
339
+ gr.Markdown("Use an uploaded model ZIP or the just-trained model.")
340
+ with gr.Row():
341
+ test_zip = gr.File(label="Upload Model ZIP", file_types=[".zip"])
342
+ load_test_btn = gr.Button("πŸ“¦ Load Uploaded Model ZIP")
343
+ clear_test_btn = gr.Button("🧹 Clear Uploaded Model")
344
+ test_status = gr.Textbox(label="Test Model Status", interactive=False)
345
+ prompt_input = gr.Textbox(label="Prompt", placeholder="e.g., Write a Python function that parses CSV and computes average")
346
+ test_btn = gr.Button("πŸ” Generate")
347
+ response_output = gr.Textbox(label="AI Response", lines=12)
348
+
349
+ load_test_btn.click(fn=upload_test_model_zip, inputs=test_zip, outputs=[test_status, test_model_state])
350
+ clear_test_btn.click(fn=clear_uploaded_model, outputs=[test_status, test_model_state])
351
+ test_btn.click(fn=generate_response, inputs=[prompt_input, test_model_state], outputs=response_output)
352
+
353
+ # ---- Optional: auto-start on boot via env vars ----
354
+ AUTOSTART = os.getenv("AUTOSTART_TRAIN", "0") == "1"
355
+ AUTOSTART_SINGLE_DATASET = os.getenv("AUTOSTART_DATASET", "").strip()
356
+ AUTOSTART_SHARDS_FOLDER = os.getenv("AUTOSTART_SHARDS", "").strip()
357
+ if AUTOSTART and not os.path.exists(".autostart.started"):
358
+ open(".autostart.started", "w").close()
359
  try:
360
+ _ = start_training(AUTOSTART_SINGLE_DATASET if AUTOSTART_SINGLE_DATASET else "",
361
+ AUTOSTART_SHARDS_FOLDER if AUTOSTART_SHARDS_FOLDER else "")
362
+ _ = read_logs_once()
363
  except Exception as e:
364
+ with open(LOG_FILE, "a") as log:
365
+ log.write(f"\n❌ Autostart failed: {e}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
 
367
  app.launch()