Percy3822 committed on
Commit
eca2f3b
·
verified ·
1 Parent(s): 3fe5c2e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -101
app.py CHANGED
@@ -1,65 +1,41 @@
1
- # app.py
2
  import os, shutil, subprocess, zipfile, time
3
  from pathlib import Path
4
  import gradio as gr
5
 
6
- ROOT = Path(".").resolve()
7
- DATASET_PATH = ROOT / "dataset.jsonl"
8
- LOG_PATH = ROOT / "train.log"
9
- MODEL_DIR = ROOT / "trained_model"
10
- ZIP_PATH = ROOT / "trained_model.zip"
 
 
 
11
 
12
- # ---------- helpers ----------
13
- def list_workspace():
14
  rows = []
15
  for p in sorted(ROOT.iterdir(), key=lambda x: (x.is_file(), x.name.lower())):
16
- try:
17
- size = p.stat().st_size
18
- except Exception:
19
- size = 0
20
- rows.append(f"{'[DIR]' if p.is_dir() else ' '}\t{size:>10}\t{p.name}")
21
  return "\n".join(rows) or "(empty)"
22
 
23
- def list_zips():
24
- return [str(p) for p in ROOT.glob("*.zip")]
25
-
26
- def zip_trained_model():
27
- if not MODEL_DIR.exists():
28
- return False, "trained_model/ not found"
29
- # remove old zip
30
- if ZIP_PATH.exists():
31
- try:
32
- ZIP_PATH.unlink()
33
- except Exception as e:
34
- return False, f"could not remove old zip: {e}"
35
- # create zip
36
- try:
37
- with zipfile.ZipFile(ZIP_PATH, "w", compression=zipfile.ZIP_DEFLATED) as z:
38
- for path in MODEL_DIR.rglob("*"):
39
- z.write(path, arcname=path.relative_to(MODEL_DIR))
40
- except Exception as e:
41
- return False, f"zip error: {e}"
42
- return ZIP_PATH.exists(), f"created {ZIP_PATH.name}"
43
-
44
- # ---------- train ----------
45
- def upload_dataset(file):
46
- if not file:
47
- return "❌ No file selected.", list_workspace()
48
- shutil.copy(file.name, DATASET_PATH)
49
- return f"✅ Uploaded → {DATASET_PATH.name}", list_workspace()
50
-
51
- def start_training():
52
- # clean
53
- if MODEL_DIR.exists():
54
- shutil.rmtree(MODEL_DIR)
55
- if ZIP_PATH.exists():
56
- ZIP_PATH.unlink(missing_ok=True)
57
- LOG_PATH.write_text("🔥 Starting training...\n", encoding="utf-8")
58
 
 
59
  cmd = [
60
  "python", "train.py",
61
- "--dataset", str(DATASET_PATH),
62
- "--output", str(MODEL_DIR),
63
  "--model_name", "Salesforce/codegen-350M-multi",
64
  "--epochs", "1",
65
  "--batch_size", "2",
@@ -67,32 +43,33 @@ def start_training():
67
  "--learning_rate", "5e-5",
68
  "--subset", "0",
69
  ]
70
- with open(LOG_PATH, "a", encoding="utf-8") as lf:
71
- code = subprocess.Popen(cmd, stdout=lf, stderr=subprocess.STDOUT).wait()
72
-
73
- # zip if success
74
- if code == 0:
75
- ok, msg = zip_trained_model()
76
- info = f"Saved to: {MODEL_DIR.name} | {msg}"
77
- files = list_zips() if ok else []
78
- return ("✅ Training complete.", info, gr.Files.update(value=files, visible=ok), list_workspace())
79
- else:
80
- tail = ""
81
- if LOG_PATH.exists():
82
- with open(LOG_PATH, "r", encoding="utf-8") as f:
83
- tail = "".join(f.readlines()[-60:])
84
- return (f"❌ Training failed (exit {code}). See logs below.", tail, gr.Files.update(visible=False), list_workspace())
85
 
86
  def read_logs():
87
- if LOG_PATH.exists():
88
- return LOG_PATH.read_text(encoding="utf-8")[-20000:]
89
- return "⏳ Waiting for logs…"
90
-
91
- def refresh_download():
92
- files = list_zips()
93
- return gr.Files.update(value=files, visible=bool(files)), list_workspace()
94
-
95
- # ---------- test ----------
 
 
 
 
 
 
 
 
 
 
 
 
96
  def list_models():
97
  out = []
98
  for p in ROOT.iterdir():
@@ -100,25 +77,21 @@ def list_models():
100
  (p / "tokenizer.json").exists() or (p / "tokenizer_config.json").exists()
101
  ):
102
  out.append(str(p))
103
- # ensure trained_model on top if present
104
- if MODEL_DIR.exists() and str(MODEL_DIR) not in out:
105
- out.insert(0, str(MODEL_DIR))
106
  return sorted(out)
107
 
108
- def import_zip(zip_file):
109
- if not zip_file:
110
- return "❌ No zip selected.", list_models()
111
  dest = ROOT / f"imported_{int(time.time())}"
112
  dest.mkdir(parents=True, exist_ok=True)
113
- with zipfile.ZipFile(zip_file.name, "r") as z:
114
- z.extractall(dest)
115
  return f"✅ Imported to {dest.name}", list_models()
116
 
117
  def generate(model_path, prompt):
118
- if not model_path:
119
- return "❌ Select a model."
120
- if not prompt or not prompt.strip():
121
- return "❌ Enter a prompt."
122
  try:
123
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
124
  tok = AutoTokenizer.from_pretrained(model_path, use_fast=True)
@@ -126,41 +99,38 @@ def generate(model_path, prompt):
126
  tok.pad_token = tok.eos_token
127
  model = AutoModelForCausalLM.from_pretrained(model_path)
128
  gen = pipeline("text-generation", model=model, tokenizer=tok)
129
- out = gen(
130
- prompt, max_new_tokens=220, do_sample=True, temperature=0.2, top_p=0.9,
131
- repetition_penalty=1.2, no_repeat_ngram_size=4,
132
- eos_token_id=tok.eos_token_id, pad_token_id=tok.pad_token_id, truncation=True
133
- )[0]["generated_text"]
134
  return out
135
  except Exception as e:
136
  return f"❌ Error: {e}"
137
 
138
- # ---------- UI ----------
139
  with gr.Blocks(title="Python AI — Train & Test") as app:
140
- gr.Markdown("## 🧠 Python AI — Train & Test\nTrain Zip Download. Test models separately.\n")
141
 
142
  with gr.Tab("Train"):
143
  with gr.Row():
144
- ds = gr.File(label="📥 Upload JSONL dataset", file_types=[".jsonl", ".jsonl.gz", ".json"])
145
- ws = gr.Textbox(label="Workspace Explorer", lines=16, value=list_workspace())
146
  up_status = gr.Textbox(label="Upload Status", interactive=False)
147
  start = gr.Button("🚀 Start Training", variant="primary")
148
  logs = gr.Textbox(label="📜 Logs (click Refresh)", lines=18)
149
  refresh_logs_btn = gr.Button("Refresh Logs")
150
  status = gr.Textbox(label="Status", interactive=False)
151
- model_info = gr.Textbox(label="Model Output", interactive=False)
152
- downloads = gr.Files(label="📦 Downloads (zips)", value=list_zips(), interactive=False)
153
- refresh_dl_btn = gr.Button("Refresh Download Area")
154
 
155
  ds.change(upload_dataset, inputs=ds, outputs=[up_status, ws])
156
- start.click(start_training, outputs=[status, model_info, downloads, ws])
157
  refresh_logs_btn.click(read_logs, outputs=logs)
158
- refresh_dl_btn.click(refresh_download, outputs=[downloads, ws])
159
 
160
  with gr.Tab("Test"):
161
  refresh_btn = gr.Button("↻ Refresh Model List")
162
  model_list = gr.Dropdown(choices=list_models(), label="Available AIs", interactive=True)
163
- zip_in = gr.File(label="Or upload a model .zip", file_types=[".zip"])
164
  import_status = gr.Textbox(label="Import Status", interactive=False)
165
  prompt = gr.Textbox(label="Prompt", lines=8, placeholder="### Instruction:\nPython: write a function ...\n### Response:\n")
166
  go = gr.Button("Generate")
 
 
1
  import os, shutil, subprocess, zipfile, time
2
  from pathlib import Path
3
  import gradio as gr
4
 
5
# Workspace root: the working directory the app was launched from.
ROOT = Path(".").resolve()

# Training inputs and outputs, all stored directly under ROOT.
DATASET = ROOT.joinpath("dataset.jsonl")   # uploaded dataset is copied here
LOG = ROOT.joinpath("train.log")           # stdout/stderr of the train.py subprocess
OUT_DIR = ROOT.joinpath("trained_model")   # passed to train.py as --output
ZIP = ROOT.joinpath("trained_model.zip")   # archive of OUT_DIR offered for download

# Marker files describing the state of the background training run.
PID = ROOT.joinpath("TRAIN_PID")           # start_training stores the child's pid here
DONE = ROOT.joinpath("TRAIN_DONE")         # presence is read as "training complete"
ERRF = ROOT.joinpath("TRAIN_ERROR")        # contents are shown as the error status
13
 
14
+ def ls_workspace():
 
15
  rows = []
16
  for p in sorted(ROOT.iterdir(), key=lambda x: (x.is_file(), x.name.lower())):
17
+ sz = p.stat().st_size if p.exists() else 0
18
+ rows.append(f"{'[DIR]' if p.is_dir() else ' '}\t{sz:>10}\t{p.name}")
 
 
 
19
  return "\n".join(rows) or "(empty)"
20
 
21
def upload_dataset(f):
    """Copy an uploaded dataset into the workspace as ``DATASET``.

    Args:
        f: Value delivered by the upload widget: either an object exposing
            the temporary file's path as ``.name`` or the path itself.
            Falsy when nothing is selected.

    Returns:
        A ``(status_message, workspace_listing)`` tuple.
    """
    if not f:
        return "❌ No file.", ls_workspace()
    # Accept both a tempfile-like object and a bare path so the callback
    # works whichever form the widget hands over.
    src = getattr(f, "name", f)
    try:
        shutil.copy(src, DATASET)
    except OSError as e:
        # Report copy failures in the status box (same "❌ Error" style that
        # generate() uses) instead of raising out of the UI callback.
        return f"❌ Error: {e}", ls_workspace()
    return f"✅ Uploaded → {DATASET.name}", ls_workspace()
25
+
26
+ def start_training(): # non-blocking
27
+ # clean previous
28
+ for p in [OUT_DIR, ZIP, DONE, ERRF, PID]:
29
+ if isinstance(p, Path) and p.is_dir():
30
+ shutil.rmtree(p, ignore_errors=True)
31
+ elif isinstance(p, Path) and p.exists():
32
+ p.unlink(missing_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
+ LOG.write_text("🔥 Training started in background…\n", encoding="utf-8")
35
  cmd = [
36
  "python", "train.py",
37
+ "--dataset", str(DATASET),
38
+ "--output", str(OUT_DIR),
39
  "--model_name", "Salesforce/codegen-350M-multi",
40
  "--epochs", "1",
41
  "--batch_size", "2",
 
43
  "--learning_rate", "5e-5",
44
  "--subset", "0",
45
  ]
46
+ with open(LOG, "a", encoding="utf-8") as lf:
47
+ proc = subprocess.Popen(cmd, stdout=lf, stderr=subprocess.STDOUT)
48
+ PID.write_text(str(proc.pid))
49
+ return "🚀 Training started. Use “Refresh Logs/Download”.", ls_workspace()
 
 
 
 
 
 
 
 
 
 
 
50
 
51
def read_logs():
    """Return the tail of the training log for display.

    Returns:
        The last 20000 characters of ``LOG``, or a waiting placeholder when
        the log file does not exist yet.
    """
    try:
        # The training subprocess appends to this file while we read it, so
        # the content may end in the middle of a multi-byte character or
        # contain bytes that are not valid UTF-8. Replace them rather than
        # letting a strict decode raise out of the refresh callback.
        text = LOG.read_text(encoding="utf-8", errors="replace")
    except FileNotFoundError:
        return "⏳ Waiting…"
    return text[-20000:]
53
+
54
def _zip_if_ready():
    """Create ``ZIP`` from ``OUT_DIR`` once the ``DONE`` flag exists.

    Nothing is written unless ``DONE`` and ``OUT_DIR`` exist and ``ZIP`` does
    not. The archive is built in a temporary sibling file and renamed into
    place only when complete, so an interrupted or failed write can never
    leave a truncated ``ZIP`` that later calls would mistake for a finished
    one.

    Returns:
        True when ``ZIP`` exists after the call, otherwise False.

    Raises:
        OSError: If writing or renaming the archive fails.
    """
    if DONE.exists() and OUT_DIR.exists() and not ZIP.exists():
        tmp = ZIP.with_name(ZIP.name + ".part")
        try:
            with zipfile.ZipFile(tmp, "w", compression=zipfile.ZIP_DEFLATED) as z:
                for p in OUT_DIR.rglob("*"):
                    # Store paths relative to OUT_DIR so the archive unpacks
                    # without a leading "trained_model/" folder.
                    z.write(p, arcname=p.relative_to(OUT_DIR))
            # Atomic on the same filesystem: readers see no zip or a full one.
            os.replace(tmp, ZIP)
        finally:
            # No-op after a successful rename; removes the partial file on error.
            tmp.unlink(missing_ok=True)
    return ZIP.exists()
61
+
62
def refresh_status_and_download():
    """Report the training status and expose the model archive if present.

    The status is derived from the marker files: ``ERRF`` takes precedence
    and its last 500 characters are shown; otherwise ``DONE`` means complete;
    otherwise training is reported as in progress.

    Returns:
        A ``(status, downloads_update, workspace_listing)`` tuple, where
        ``downloads_update`` sets the download list to ``[ZIP]`` and makes it
        visible only when the archive exists.
    """
    status = "⏳ Training…"
    if ERRF.exists():
        status = f"❌ Error: {ERRF.read_text(encoding='utf-8', errors='replace')[-500:]}"
    elif DONE.exists():
        status = "✅ Training complete."
    try:
        _zip_if_ready()
    except OSError as e:
        # A failed zip (disk full, permissions, ...) must not crash the
        # callback and hide the status; report it to the user instead.
        status = f"❌ Error: {e}"
    files = [str(ZIP)] if ZIP.exists() else []
    # gr.update() is the component-agnostic update helper; the per-component
    # `.update()` classmethods are not available in newer Gradio releases.
    return status, gr.update(value=files, visible=bool(files)), ls_workspace()
71
+
72
+ # ---- Test tab ----
73
  def list_models():
74
  out = []
75
  for p in ROOT.iterdir():
 
77
  (p / "tokenizer.json").exists() or (p / "tokenizer_config.json").exists()
78
  ):
79
  out.append(str(p))
80
+ if OUT_DIR.exists() and str(OUT_DIR) not in out:
81
+ out.insert(0, str(OUT_DIR))
 
82
  return sorted(out)
83
 
84
def import_zip(z):
    """Extract an uploaded model archive into ``ROOT/imported_<timestamp>``.

    Args:
        z: Value delivered by the upload widget: either an object exposing
            the temporary file's path as ``.name`` or the path itself.
            Falsy when nothing is selected.

    Returns:
        A ``(status_message, model_list)`` tuple. On failure the message
        starts with ``❌`` and the destination folder created by this call
        is removed again.
    """
    if not z:
        return "❌ No zip.", list_models()
    src = getattr(z, "name", z)
    dest = ROOT / f"imported_{int(time.time())}"
    # Remember whether this call creates the folder, so cleanup on failure
    # never deletes an import made earlier in the same second.
    fresh = not dest.exists()
    dest.mkdir(parents=True, exist_ok=True)
    try:
        with zipfile.ZipFile(src, "r") as zp:
            base = dest.resolve()
            # The archive is user-supplied: refuse members whose path would
            # resolve outside the destination (absolute names, "..").
            for name in zp.namelist():
                target = (base / name).resolve()
                if target != base and base not in target.parents:
                    raise ValueError(f"unsafe path in archive: {name}")
            zp.extractall(dest)
    except (zipfile.BadZipFile, OSError, ValueError) as e:
        if fresh:
            shutil.rmtree(dest, ignore_errors=True)
        return f"❌ Error: {e}", list_models()
    return f"✅ Imported to {dest.name}", list_models()
91
 
92
  def generate(model_path, prompt):
93
+ if not model_path: return "❌ Select a model."
94
+ if not prompt or not prompt.strip(): return "❌ Enter a prompt."
 
 
95
  try:
96
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
97
  tok = AutoTokenizer.from_pretrained(model_path, use_fast=True)
 
99
  tok.pad_token = tok.eos_token
100
  model = AutoModelForCausalLM.from_pretrained(model_path)
101
  gen = pipeline("text-generation", model=model, tokenizer=tok)
102
+ out = gen(prompt, max_new_tokens=220, do_sample=True, temperature=0.2, top_p=0.9,
103
+ repetition_penalty=1.2, no_repeat_ngram_size=4,
104
+ eos_token_id=tok.eos_token_id, pad_token_id=tok.pad_token_id,
105
+ truncation=True)[0]["generated_text"]
 
106
  return out
107
  except Exception as e:
108
  return f"❌ Error: {e}"
109
 
 
110
  with gr.Blocks(title="Python AI — Train & Test") as app:
111
+ gr.Markdown("## 🧠 Python AI — Train & Test\nBackground training with reliable zipping.\n")
112
 
113
  with gr.Tab("Train"):
114
  with gr.Row():
115
+ ds = gr.File(label="📥 Upload JSONL", file_types=[".jsonl", ".jsonl.gz", ".json"])
116
+ ws = gr.Textbox(label="Workspace", lines=16, value=ls_workspace())
117
  up_status = gr.Textbox(label="Upload Status", interactive=False)
118
  start = gr.Button("🚀 Start Training", variant="primary")
119
  logs = gr.Textbox(label="📜 Logs (click Refresh)", lines=18)
120
  refresh_logs_btn = gr.Button("Refresh Logs")
121
  status = gr.Textbox(label="Status", interactive=False)
122
+ downloads = gr.Files(label="📦 Downloads (zips)", value=[], interactive=False)
123
+ refresh_dl_btn = gr.Button("Refresh Status & Download")
 
124
 
125
  ds.change(upload_dataset, inputs=ds, outputs=[up_status, ws])
126
+ start.click(start_training, outputs=[status, ws])
127
  refresh_logs_btn.click(read_logs, outputs=logs)
128
+ refresh_dl_btn.click(refresh_status_and_download, outputs=[status, downloads, ws])
129
 
130
  with gr.Tab("Test"):
131
  refresh_btn = gr.Button("↻ Refresh Model List")
132
  model_list = gr.Dropdown(choices=list_models(), label="Available AIs", interactive=True)
133
+ zip_in = gr.File(label="Or upload model .zip", file_types=[".zip"])
134
  import_status = gr.Textbox(label="Import Status", interactive=False)
135
  prompt = gr.Textbox(label="Prompt", lines=8, placeholder="### Instruction:\nPython: write a function ...\n### Response:\n")
136
  go = gr.Button("Generate")