smarthillc committed on
Commit
b1b635d
·
1 Parent(s): 61b3c92

Add debug output to capture training errors

Browse files
Files changed (1) hide show
  1. app.py +32 -102
app.py CHANGED
@@ -2,10 +2,9 @@ import gradio as gr
2
  import os
3
  import subprocess
4
  import threading
5
- import time
6
 
7
  # Global variable to track training status
8
- training_status = {"status": "idle", "message": "", "progress": 0}
9
 
10
  def check_data():
11
  """Check if data is available"""
@@ -14,8 +13,6 @@ def check_data():
14
  files.append("✅ Combined dataset: 9,302 examples")
15
  if os.path.exists("combined_balanced_training_data.csv"):
16
  files.append("✅ Balanced dataset: 8,304 examples")
17
- if os.path.exists("data/clean_training_data.csv"):
18
- files.append("✅ Clean manual data: 478 examples")
19
 
20
  if not files:
21
  return "❌ No training data found. Please upload data files."
@@ -46,7 +43,7 @@ def run_training_subprocess(hf_token, model_size, hub_username, num_epochs, use_
46
  "--data_path", data_path,
47
  "--model_size", size,
48
  "--num_epochs", str(num_epochs),
49
- "--use_lora" # Always use LoRA for efficiency
50
  ]
51
 
52
  if hf_token:
@@ -56,40 +53,24 @@ def run_training_subprocess(hf_token, model_size, hub_username, num_epochs, use_
56
 
57
  training_status["status"] = "running"
58
  training_status["message"] = "Starting training..."
59
- training_status["progress"] = 0
60
 
61
- # Run training
62
  process = subprocess.Popen(
63
  cmd,
64
  stdout=subprocess.PIPE,
65
- stderr=subprocess.STDOUT,
66
- text=True,
67
- bufsize=1
68
  )
69
 
70
- # Read output line by line
71
- for line in process.stdout:
72
- if "loss" in line.lower():
73
- training_status["message"] = line.strip()
74
- elif "epoch" in line.lower():
75
- # Try to extract progress
76
- try:
77
- if "/" in line:
78
- parts = line.split("/")
79
- current = float(parts[0].split()[-1])
80
- total = float(parts[1].split()[0])
81
- training_status["progress"] = int((current / total) * 100)
82
- except:
83
- pass
84
- elif "exact_match" in line.lower():
85
- training_status["message"] = f"Evaluation: {line.strip()}"
86
 
87
- process.wait()
88
 
89
  if process.returncode == 0:
90
  training_status["status"] = "completed"
91
- training_status["message"] = "Training completed successfully! Model pushed to HuggingFace Hub."
92
- training_status["progress"] = 100
93
  else:
94
  training_status["status"] = "error"
95
  training_status["message"] = f"Training failed with exit code {process.returncode}"
@@ -97,6 +78,7 @@ def run_training_subprocess(hf_token, model_size, hub_username, num_epochs, use_
97
  except Exception as e:
98
  training_status["status"] = "error"
99
  training_status["message"] = f"Error: {str(e)}"
 
100
 
101
  def train_model(hf_token, model_size, hub_username, num_epochs, use_balanced):
102
  """Start training in background thread"""
@@ -108,6 +90,9 @@ def train_model(hf_token, model_size, hub_username, num_epochs, use_balanced):
108
  if training_status["status"] == "running":
109
  return "⚠️ Training already in progress!"
110
 
 
 
 
111
  # Start training in background thread
112
  thread = threading.Thread(
113
  target=run_training_subprocess,
@@ -115,44 +100,22 @@ def train_model(hf_token, model_size, hub_username, num_epochs, use_balanced):
115
  )
116
  thread.start()
117
 
118
- return "🚀 Training started! Check status below..."
119
 
120
  def get_training_status():
121
  """Get current training status"""
122
  global training_status
123
 
124
- if training_status["status"] == "idle":
125
- return "💀 No training in progress"
126
- elif training_status["status"] == "running":
127
- return f"""πŸƒ Training in progress... ({training_status['progress']}%)
128
-
129
- {training_status['message']}"""
130
- elif training_status["status"] == "completed":
131
- return f"""✅ Training completed!
132
-
133
- {training_status['message']}
134
-
135
- Your model is available at: https://huggingface.co/{training_status.get('hub_username', 'your-username')}/resume-normalizer-flan-t5"""
136
- else:
137
- return f"""❌ Training failed!
138
-
139
- {training_status['message']}"""
140
 
141
  # Create Gradio interface
142
  with gr.Blocks(title="Resume Normalizer Trainer") as app:
143
- gr.Markdown("""
144
- # Resume Normalizer Trainer
145
-
146
- Train a Flan-T5 model to normalize company names, job titles, and skills from resumes.
147
-
148
- **Features:**
149
- - Company name normalization (e.g., "Google Inc" → "Alphabet Inc.")
150
- - Job title standardization (e.g., "SWE" → "Software Engineer")
151
- - Skills normalization (e.g., "JS" → "JavaScript")
152
- - Binary equivalency detection
153
-
154
- **Hardware:** Running on 4xL4 GPUs (96GB VRAM)
155
- """)
156
 
157
  with gr.Tab("📊 Check Data"):
158
  check_btn = gr.Button("Check Available Datasets", variant="primary")
@@ -165,35 +128,30 @@ with gr.Blocks(title="Resume Normalizer Trainer") as app:
165
  hf_token = gr.Textbox(
166
  label="HuggingFace Token",
167
  type="password",
168
- placeholder="hf_...",
169
- info="Required to push model to Hub"
170
  )
171
  hub_username = gr.Textbox(
172
  label="HuggingFace Username",
173
- value="aoisfhdugbos",
174
- info="Your HuggingFace username"
175
  )
176
 
177
  with gr.Column():
178
  model_size = gr.Dropdown(
179
  label="Model Size",
180
  choices=["T5-Base (250M)", "T5-Large (770M)"],
181
- value="T5-Base (250M)",
182
- info="Larger models are more accurate but slower"
183
  )
184
  num_epochs = gr.Slider(
185
  label="Training Epochs",
186
  minimum=1,
187
  maximum=10,
188
  value=5,
189
- step=1,
190
- info="More epochs = better quality but longer training"
191
  )
192
 
193
  use_balanced = gr.Checkbox(
194
  label="Use Balanced Dataset (8,304 examples)",
195
- value=False,
196
- info="Check to use balanced dataset instead of full dataset (9,302 examples)"
197
  )
198
 
199
  train_btn = gr.Button("🚀 Start Training", variant="primary", size="lg")
@@ -205,39 +163,11 @@ with gr.Blocks(title="Resume Normalizer Trainer") as app:
205
  outputs=train_output
206
  )
207
 
208
- with gr.Tab("📈 Training Status"):
209
- gr.Markdown("Click the button below to refresh training status")
210
- status_btn = gr.Button("🔄 Refresh Status", variant="secondary")
211
- status_output = gr.Textbox(label="Current Status", lines=10)
212
-
213
- status_btn.click(get_training_status, outputs=status_output)
214
-
215
- with gr.Tab("ℹ️ About"):
216
- gr.Markdown("""
217
- ## Resume Normalizer Model
218
-
219
- This trainer fine-tunes a Flan-T5 model for resume entity normalization tasks:
220
-
221
- ### Supported Tasks:
222
- 1. **Company Normalization**: Handles mergers, acquisitions, rebranding
223
- 2. **Job Title Standardization**: Recognizes equivalent roles and seniority
224
- 3. **Skills Normalization**: Standardizes technology names and abbreviations
225
- 4. **Equivalency Detection**: Binary classification for entity matching
226
-
227
- ### Model Architecture:
228
- - Base Model: Google Flan-T5 (instruction-tuned)
229
- - Fine-tuning: LoRA (Low-Rank Adaptation) for efficiency
230
- - Multi-task: Uses task prefixes ([COMPANY], [JOB], [SKILLS])
231
-
232
- ### Training Data:
233
- - 478 manually curated high-quality examples
234
- - 8,824 synthetic examples generated with GPT-4
235
- - Total: 9,302 training examples
236
 
237
- ### Expected Performance:
238
- - Inference: <100ms per query
239
- - Accuracy: >90% on test set
240
- - Model size: 250M-770M parameters
241
- """)
242
 
243
  app.launch()
 
2
  import os
3
  import subprocess
4
  import threading
 
5
 
6
  # Global variable to track training status
7
+ training_status = {"status": "idle", "message": "", "full_output": ""}
8
 
9
  def check_data():
10
  """Check if data is available"""
 
13
  files.append("✅ Combined dataset: 9,302 examples")
14
  if os.path.exists("combined_balanced_training_data.csv"):
15
  files.append("✅ Balanced dataset: 8,304 examples")
 
 
16
 
17
  if not files:
18
  return "❌ No training data found. Please upload data files."
 
43
  "--data_path", data_path,
44
  "--model_size", size,
45
  "--num_epochs", str(num_epochs),
46
+ "--use_lora"
47
  ]
48
 
49
  if hf_token:
 
53
 
54
  training_status["status"] = "running"
55
  training_status["message"] = "Starting training..."
56
+ training_status["full_output"] = f"Command: {' '.join(cmd)}\n\n"
57
 
58
+ # Run training and capture ALL output
59
  process = subprocess.Popen(
60
  cmd,
61
  stdout=subprocess.PIPE,
62
+ stderr=subprocess.PIPE,
63
+ text=True
 
64
  )
65
 
66
+ # Capture both stdout and stderr
67
+ stdout, stderr = process.communicate()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
+ training_status["full_output"] += f"=== STDOUT ===\n{stdout}\n\n=== STDERR ===\n{stderr}"
70
 
71
  if process.returncode == 0:
72
  training_status["status"] = "completed"
73
+ training_status["message"] = "Training completed successfully!"
 
74
  else:
75
  training_status["status"] = "error"
76
  training_status["message"] = f"Training failed with exit code {process.returncode}"
 
78
  except Exception as e:
79
  training_status["status"] = "error"
80
  training_status["message"] = f"Error: {str(e)}"
81
+ training_status["full_output"] = str(e)
82
 
83
  def train_model(hf_token, model_size, hub_username, num_epochs, use_balanced):
84
  """Start training in background thread"""
 
90
  if training_status["status"] == "running":
91
  return "⚠️ Training already in progress!"
92
 
93
+ # Reset status
94
+ training_status = {"status": "idle", "message": "", "full_output": ""}
95
+
96
  # Start training in background thread
97
  thread = threading.Thread(
98
  target=run_training_subprocess,
 
100
  )
101
  thread.start()
102
 
103
+ return "🚀 Training started! Check the Debug Output tab for detailed logs..."
104
 
105
  def get_training_status():
106
  """Get current training status"""
107
  global training_status
108
 
109
+ status_msg = f"""
110
+ Status: {training_status['status']}
111
+ Message: {training_status['message']}
112
+ """
113
+
114
+ return status_msg, training_status.get('full_output', '')
 
 
 
 
 
 
 
 
 
 
115
 
116
  # Create Gradio interface
117
  with gr.Blocks(title="Resume Normalizer Trainer") as app:
118
+ gr.Markdown("# Resume Normalizer Trainer - Debug Mode")
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  with gr.Tab("📊 Check Data"):
121
  check_btn = gr.Button("Check Available Datasets", variant="primary")
 
128
  hf_token = gr.Textbox(
129
  label="HuggingFace Token",
130
  type="password",
131
+ placeholder="hf_..."
 
132
  )
133
  hub_username = gr.Textbox(
134
  label="HuggingFace Username",
135
+ value="aoisfhdugbos"
 
136
  )
137
 
138
  with gr.Column():
139
  model_size = gr.Dropdown(
140
  label="Model Size",
141
  choices=["T5-Base (250M)", "T5-Large (770M)"],
142
+ value="T5-Base (250M)"
 
143
  )
144
  num_epochs = gr.Slider(
145
  label="Training Epochs",
146
  minimum=1,
147
  maximum=10,
148
  value=5,
149
+ step=1
 
150
  )
151
 
152
  use_balanced = gr.Checkbox(
153
  label="Use Balanced Dataset (8,304 examples)",
154
+ value=False
 
155
  )
156
 
157
  train_btn = gr.Button("🚀 Start Training", variant="primary", size="lg")
 
163
  outputs=train_output
164
  )
165
 
166
+ with gr.Tab("πŸ› Debug Output"):
167
+ refresh_btn = gr.Button("🔄 Refresh Debug Output", variant="secondary")
168
+ status_output = gr.Textbox(label="Status", lines=5)
169
+ debug_output = gr.Textbox(label="Full Training Output", lines=30)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
+ refresh_btn.click(get_training_status, outputs=[status_output, debug_output])
 
 
 
 
172
 
173
  app.launch()