# Win-Stack / app.py
# smarthillc
# Add debug output to capture training errors
# b1b635d
import gradio as gr
import os
import subprocess
import threading
# Module-level record of the most recent training run, shared between the
# UI callbacks and the background training thread.
# Keys: "status" (starts as "idle"), "message" (short summary) and
# "full_output" (captured log text).
training_status = dict(status="idle", message="", full_output="")
def check_data():
    """Report which training CSV files exist in the working directory.

    Returns:
        One line per dataset file found (joined with newlines), or an
        error message when neither file is present. The example counts in
        the labels are fixed text; the files are not opened or counted.
    """
    known_datasets = (
        ("combined_final_training_data.csv", "βœ… Combined dataset: 9,302 examples"),
        ("combined_balanced_training_data.csv", "βœ… Balanced dataset: 8,304 examples"),
    )
    found = [label for filename, label in known_datasets if os.path.exists(filename)]
    if found:
        return "\n".join(found)
    return "❌ No training data found. Please upload data files."
def run_training_subprocess(hf_token, model_size, hub_username, num_epochs, use_balanced):
    """Run ``train.py`` in a child process and record the result in ``training_status``.

    The call blocks until the child process exits, so it is intended to be
    run on a background thread. Progress and results are reported only
    through the module-level ``training_status`` dict; nothing is returned.

    Args:
        hf_token: HuggingFace token, forwarded as ``--hf_token`` when truthy.
            It is never written to ``training_status`` in clear text.
        model_size: UI label; any label containing ``"Base"`` selects the
            ``base`` size, everything else selects ``large``.
        hub_username: Forwarded as ``--hub_username`` when truthy.
        num_epochs: Number of epochs; coerced to ``int`` before being passed
            on the command line.
        use_balanced: Prefer the balanced dataset file when it exists.
    """
    # Function-scope imports keep this block self-contained.
    import sys
    import traceback

    global training_status

    def _scrub(text):
        # Keep the secret out of anything that is shown in the debug tab.
        return text.replace(hf_token, "***") if hf_token else text

    try:
        # Determine which data file to use
        if use_balanced and os.path.exists("combined_balanced_training_data.csv"):
            data_path = "combined_balanced_training_data.csv"
        elif os.path.exists("combined_final_training_data.csv"):
            data_path = "combined_final_training_data.csv"
        else:
            training_status["status"] = "error"
            training_status["message"] = "No training data found!"
            return

        # Determine model size
        size = "base" if "Base" in model_size else "large"

        # Build command. sys.executable runs train.py with the interpreter
        # (and therefore the installed packages) this app itself uses,
        # instead of whatever "python" resolves to on PATH.
        cmd = [
            sys.executable or "python", "train.py",
            "--data_path", data_path,
            "--model_size", size,
            # int() avoids passing "5.0" if the UI delivers a float.
            "--num_epochs", str(int(num_epochs)),
            "--use_lora",
        ]
        # Parallel copy of the command used for display, with the token masked.
        display_cmd = list(cmd)
        if hf_token:
            cmd.extend(["--hf_token", hf_token])
            display_cmd.extend(["--hf_token", "***"])
        if hub_username:
            cmd.extend(["--hub_username", hub_username])
            display_cmd.extend(["--hub_username", hub_username])

        training_status["status"] = "running"
        training_status["message"] = "Starting training..."
        training_status["full_output"] = f"Command: {' '.join(display_cmd)}\n\n"

        # Run training and capture ALL output. errors="replace" keeps
        # undecodable bytes in the child's output from raising and
        # discarding the whole log.
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            errors="replace",
        )
        # communicate() waits for the child to exit and returns both streams.
        stdout, stderr = process.communicate()
        training_status["full_output"] += _scrub(
            f"=== STDOUT ===\n{stdout}\n\n=== STDERR ===\n{stderr}"
        )

        if process.returncode == 0:
            training_status["status"] = "completed"
            training_status["message"] = "Training completed successfully!"
        else:
            training_status["status"] = "error"
            training_status["message"] = f"Training failed with exit code {process.returncode}"
    except Exception as e:
        # Boundary handler for the worker thread: report the failure through
        # training_status. Append so output captured so far (such as the
        # command line) is kept alongside the traceback.
        training_status["status"] = "error"
        training_status["message"] = _scrub(f"Error: {str(e)}")
        training_status["full_output"] = (
            training_status.get("full_output", "")
            + _scrub(f"=== EXCEPTION ===\n{traceback.format_exc()}")
        )
def train_model(hf_token, model_size, hub_username, num_epochs, use_balanced):
    """Validate the request and start training on a background thread.

    Args:
        hf_token: HuggingFace token; required (empty or whitespace-only
            values are rejected).
        model_size: Model size label chosen in the UI.
        hub_username: HuggingFace username to forward to the training run.
        num_epochs: Number of training epochs.
        use_balanced: Whether to prefer the balanced dataset.

    Returns:
        A short user-facing message describing whether training was started.
    """
    global training_status
    if not hf_token or not str(hf_token).strip():
        return "❌ Please provide HuggingFace token"
    if training_status["status"] == "running":
        return "⚠️ Training already in progress!"

    # Mark the run as in progress BEFORE the worker starts. Resetting to
    # "idle" here would leave a window in which a second click passes the
    # check above and launches a second training run.
    training_status = {
        "status": "running",
        "message": "Starting training...",
        "full_output": "",
    }

    # Start training in background thread
    worker = threading.Thread(
        target=run_training_subprocess,
        args=(hf_token, model_size, hub_username, num_epochs, use_balanced),
    )
    try:
        worker.start()
    except RuntimeError as exc:
        # The worker never ran, so nothing else will clear "running".
        training_status = {
            "status": "error",
            "message": f"Error: {exc}",
            "full_output": str(exc),
        }
        return "❌ Could not start training thread"
    return "πŸš€ Training started! Check the Debug Output tab for detailed logs..."
def get_training_status():
    """Return the current training status for display.

    Returns:
        A 2-tuple ``(summary, full_output)``: a short text block with the
        status and message, and the captured training output (empty string
        when none has been recorded).
    """
    snapshot = training_status
    summary_lines = [
        "",
        f"Status: {snapshot['status']}",
        f"Message: {snapshot['message']}",
        "",
    ]
    return "\n".join(summary_lines), snapshot.get("full_output", "")
# Create Gradio interface
# Three tabs: a dataset check, the training launcher, and a debug view of
# the shared training status. Components are created inside the context
# managers that contain them, so statement order and nesting define the layout.
with gr.Blocks(title="Resume Normalizer Trainer") as app:
    gr.Markdown("# Resume Normalizer Trainer - Debug Mode")

    # Tab 1: show which training CSV files check_data() can find.
    with gr.Tab("πŸ“Š Check Data"):
        check_btn = gr.Button("Check Available Datasets", variant="primary")
        check_output = gr.Textbox(label="Dataset Status", lines=5)
        check_btn.click(check_data, outputs=check_output)

    # Tab 2: collect training options and start a run via train_model().
    with gr.Tab("πŸš€ Train Model"):
        with gr.Row():
            # Left column: credentials.
            with gr.Column():
                hf_token = gr.Textbox(
                    label="HuggingFace Token",
                    type="password",
                    placeholder="hf_..."
                )
                hub_username = gr.Textbox(
                    label="HuggingFace Username",
                    value="aoisfhdugbos"
                )
            # Right column: training options.
            with gr.Column():
                model_size = gr.Dropdown(
                    label="Model Size",
                    choices=["T5-Base (250M)", "T5-Large (770M)"],
                    value="T5-Base (250M)"
                )
                num_epochs = gr.Slider(
                    label="Training Epochs",
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1
                )
                # NOTE(review): placement of this checkbox inside the
                # right-hand column is assumed - confirm the intended layout.
                use_balanced = gr.Checkbox(
                    label="Use Balanced Dataset (8,304 examples)",
                    value=False
                )
        train_btn = gr.Button("πŸš€ Start Training", variant="primary", size="lg")
        train_output = gr.Textbox(label="Training Output", lines=5)
        # The inputs list order must match train_model's parameter order.
        train_btn.click(
            train_model,
            inputs=[hf_token, model_size, hub_username, num_epochs, use_balanced],
            outputs=train_output
        )

    # Tab 3: manual refresh of the status summary and captured output.
    with gr.Tab("πŸ› Debug Output"):
        refresh_btn = gr.Button("πŸ”„ Refresh Debug Output", variant="secondary")
        status_output = gr.Textbox(label="Status", lines=5)
        debug_output = gr.Textbox(label="Full Training Output", lines=30)
        # get_training_status() returns two values, one per output component.
        refresh_btn.click(get_training_status, outputs=[status_output, debug_output])

# Start the web server; this runs whenever the module is executed.
app.launch()