# Source: Hugging Face file view — author aneeb15, commit 9ec2fac
# Commit message: feat: implement dynamic evaluation page (remove static charts)
"""
Auto-FineTune-Ops: Streamlit Dashboard
======================================
Premium interactive dashboard for ML fine-tuning pipeline.
"""
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import sys
import os
import json
import time
from datetime import datetime
# Add project root to path: the directory containing this script goes first on
# sys.path so the function-local ``preprocessing.*`` imports further down
# resolve (presumably a sibling package — confirm against the repo layout).
sys.path.insert(0, str(Path(__file__).parent))

# Page configuration. Kept ahead of every other Streamlit call in this script.
st.set_page_config(
    page_title="Auto-FineTune-Ops",
    page_icon="🤖",  # was mis-decoded UTF-8 ("πŸ€–"); restored to the robot emoji
    layout="wide",
    initial_sidebar_state="expanded",
)
# Premium CSS styling.
# The stylesheet lives in a named constant so the injection call below reads as
# a single line; the CSS text itself is unchanged. It themes metric cards, the
# gradient page headers, info cards, status badges, the sidebar, buttons, the
# progress bar and tabs.
_PREMIUM_CSS = """
<style>
/* Main container */
.main .block-container {
padding-top: 2rem;
padding-bottom: 2rem;
}
/* Cards */
.stMetric {
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
padding: 1rem;
border-radius: 12px;
border: 1px solid rgba(99, 102, 241, 0.2);
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.3);
}
/* Gradient headers */
.gradient-header {
background: linear-gradient(90deg, #6366f1, #8b5cf6, #a855f7);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
font-size: 2.5rem;
font-weight: 700;
margin-bottom: 1rem;
}
/* Info cards */
.info-card {
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
padding: 1.5rem;
border-radius: 16px;
border: 1px solid rgba(99, 102, 241, 0.3);
margin: 1rem 0;
}
/* Success badge */
.success-badge {
background: linear-gradient(90deg, #10b981, #059669);
color: white;
padding: 0.5rem 1rem;
border-radius: 20px;
font-weight: 600;
display: inline-block;
}
/* Warning badge */
.warning-badge {
background: linear-gradient(90deg, #f59e0b, #d97706);
color: white;
padding: 0.5rem 1rem;
border-radius: 20px;
font-weight: 600;
display: inline-block;
}
/* Sidebar styling */
section[data-testid="stSidebar"] {
background: linear-gradient(180deg, #0f0f23 0%, #1a1a2e 100%);
}
/* Button styling */
.stButton > button {
background: linear-gradient(90deg, #6366f1, #8b5cf6);
color: white;
border: none;
border-radius: 8px;
padding: 0.5rem 2rem;
font-weight: 600;
transition: all 0.3s ease;
}
.stButton > button:hover {
transform: translateY(-2px);
box-shadow: 0 4px 20px rgba(99, 102, 241, 0.4);
}
/* Progress bar */
.stProgress > div > div {
background: linear-gradient(90deg, #6366f1, #8b5cf6, #a855f7);
}
/* Tab styling */
.stTabs [data-baseweb="tab-list"] {
gap: 8px;
}
.stTabs [data-baseweb="tab"] {
background: rgba(99, 102, 241, 0.1);
border-radius: 8px;
padding: 0.5rem 1rem;
}
.stTabs [aria-selected="true"] {
background: linear-gradient(90deg, #6366f1, #8b5cf6);
}
</style>
"""

# unsafe_allow_html is required for the raw <style> element to reach the page.
st.markdown(_PREMIUM_CSS, unsafe_allow_html=True)
# Initialize session state.
# Each key is seeded only when absent, so values written by earlier reruns of
# this script (navigation, uploads, pipeline progress) are preserved.
_SESSION_DEFAULTS = {
    'current_page': 'home',
    'uploaded_data': None,
    'processed_data_path': None,
    'model_path': None,
    'training_goal': None,
    # One status entry per pipeline stage; the sidebar and the home page read these.
    'pipeline_status': {
        'data': 'pending',
        'training': 'pending',
        'evaluation': 'pending',
        'deployment': 'pending',
    },
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
# Sidebar navigation.
# NOTE: every emoji in this block was mis-decoded UTF-8 in the previous
# revision (e.g. "πŸ€–"); the intended glyphs are restored here.
with st.sidebar:
    st.markdown(
        '<p class="gradient-header" style="font-size: 1.5rem;">🤖 Auto-FineTune-Ops</p>',
        unsafe_allow_html=True,
    )
    st.markdown("---")

    # Navigation: page key -> (icon, button label). The key is what gets stored
    # in st.session_state.current_page.
    pages = {
        'home': ('🏠', 'Dashboard'),
        'data': ('📊', 'Data Upload'),
        'process': ('🧹', 'Processing'),
        'training': ('🚀', 'Training'),
        'evaluation': ('⚖️', 'Evaluation'),
        'deploy': ('🌐', 'Deploy'),
    }
    for key, (icon, label) in pages.items():
        # Each button needs its own widget key; clicking one records the target page.
        if st.button(f"{icon} {label}", key=f"nav_{key}", use_container_width=True):
            st.session_state.current_page = key
    st.markdown("---")

    # Pipeline status: one line per stage; unknown states fall back to the
    # "pending" hourglass.
    st.markdown("### 📋 Pipeline Status")
    status_icons = {'pending': '⏳', 'running': '🔄', 'complete': '✅', 'error': '❌'}
    for stage, status in st.session_state.pipeline_status.items():
        st.markdown(f"{status_icons.get(status, '⏳')} **{stage.title()}**: {status}")
    st.markdown("---")
    st.markdown("*Built with ❤️ using Streamlit*")
# ============================================================================
# PAGE: HOME DASHBOARD
# ============================================================================
def render_home():
    """Render the landing dashboard.

    Shows one status metric per pipeline stage (read from
    ``st.session_state``), a four-step quick-start guide, and up to five of
    the most recently modified entries from each of ``./output/models``,
    ``./output/reports`` (``*.json``) and ``./output/logs`` (``*.yaml``).

    Changes from the previous revision: mis-decoded emoji are restored, the
    three copy-pasted file listings share one loop, and listings are ordered
    newest-first instead of relying on unordered ``glob`` results.
    """

    def info_card(title, text):
        # Build the HTML for one quick-start card (styled by the .info-card CSS class).
        return (
            '<div class="info-card">\n'
            f'<h4>{title}</h4>\n'
            f'<p>{text}</p>\n'
            '</div>'
        )

    def modified_time(path):
        # Sort key for listings; entries that cannot be stat'ed sort last
        # instead of breaking the page.
        try:
            return path.stat().st_mtime
        except OSError:
            return 0.0

    st.markdown('<p class="gradient-header">🏠 Pipeline Dashboard</p>', unsafe_allow_html=True)
    st.markdown("**One-click autonomous ML fine-tuning pipeline**")

    # Status cards: the first reflects whether a dataset is loaded, the other
    # three mirror entries of st.session_state.pipeline_status.
    dataset_col, *stage_cols = st.columns(4)
    has_data = st.session_state.uploaded_data is not None
    with dataset_col:
        st.metric(
            label="📊 Dataset",
            value="Ready" if has_data else "Not Loaded",
            delta="Uploaded" if has_data else None,
        )
    stage_cards = (
        ("🧹 Processing", 'data'),
        ("🚀 Training", 'training'),
        ("⚖️ Evaluation", 'evaluation'),
    )
    for column, (label, stage) in zip(stage_cols, stage_cards):
        stage_status = st.session_state.pipeline_status[stage]
        with column:
            st.metric(
                label=label,
                value=stage_status.title(),
                delta="Complete" if stage_status == 'complete' else None,
            )
    st.markdown("---")

    # Quick start guide: steps 1-2 in the left column, 3-4 in the right.
    st.markdown("### 🚀 Quick Start Guide")
    left, right = st.columns(2)
    with left:
        st.markdown(
            info_card("📊 Step 1: Upload Data",
                      "Upload your CSV/JSON dataset with instruction-response pairs."),
            unsafe_allow_html=True,
        )
        st.markdown(
            info_card("🧹 Step 2: Process Data",
                      "The DataArchitectAgent will clean and format your data."),
            unsafe_allow_html=True,
        )
    with right:
        st.markdown(
            info_card("🚀 Step 3: Train Model",
                      "Fine-tune with auto-configured hyperparameters."),
            unsafe_allow_html=True,
        )
        st.markdown(
            info_card("⚖️ Step 4: Evaluate",
                      "Run Model Arena with LLM-as-Judge evaluation."),
            unsafe_allow_html=True,
        )

    # Recent output files
    st.markdown("---")
    st.markdown("### 📁 Output Files")
    output_dir = Path("./output")
    if not output_dir.exists():
        st.info("Output directory will be created when you run the pipeline.")
        return

    # (tab title, sub-directory, glob pattern, bullet icon, empty message, missing message)
    sections = (
        ("📂 Models", "models", "*", "🤖",
         "No trained models yet.", "Models directory not found."),
        ("📊 Reports", "reports", "*.json", "📊",
         "No evaluation reports yet.", "Reports directory not found."),
        ("📝 Logs", "logs", "*.yaml", "📝",
         "No log files yet.", "Logs directory not found."),
    )
    tabs = st.tabs([section[0] for section in sections])
    for tab, (_, subdir, pattern, icon, empty_msg, missing_msg) in zip(tabs, sections):
        with tab:
            section_dir = output_dir / subdir
            if not section_dir.exists():
                st.info(missing_msg)
                continue
            entries = sorted(section_dir.glob(pattern), key=modified_time, reverse=True)
            if not entries:
                st.info(empty_msg)
                continue
            for entry in entries[:5]:
                st.markdown(f"- {icon} `{entry.name}`")
# ============================================================================
# PAGE: DATA UPLOAD
# ============================================================================
def render_data_upload():
    """Render the data upload page.

    Lets the user load a CSV / JSON / JSONL file into
    ``st.session_state.uploaded_data`` (replacing the current dataset, or
    appending to it when "Add More Data" was clicked and the columns match
    exactly), then shows statistics, auto-detected instruction/output
    columns, a full preview, download buttons and a per-column summary.

    Fixes over the previous revision:
      * a dataset with zero rows no longer crashes the column summary;
      * non-string column labels no longer crash column detection;
      * column labels are HTML-escaped before going into raw-HTML badges;
      * file extensions are matched case-insensitively;
      * a stored filename of ``None`` falls back to ``'dataset'``;
      * mis-decoded emoji in UI strings are restored.
    """
    import html  # local import: only used to escape column labels for the badges

    def first_matching_column(columns, patterns):
        # First column (in column order) whose lower-cased label contains any
        # of the patterns, or None. Labels go through str() because pandas
        # may produce non-string labels.
        for column in columns:
            label = str(column).lower()
            if any(pattern in label for pattern in patterns):
                return column
        return None

    def sample_text(series):
        # First value of the column as text, cut to 80 characters; empty when
        # the dataset has no rows.
        if series.empty:
            return ''
        text = str(series.iloc[0])
        return text[:80] + '...' if len(text) > 80 else text

    def badge(css_class, text):
        # Raw-HTML badge; the text is escaped because it can come from the uploaded file.
        return f'<span class="{css_class}">{html.escape(text)}</span>'

    st.markdown('<p class="gradient-header">📊 Data Upload & Preview</p>', unsafe_allow_html=True)

    # ── File Management Bar ──
    if st.session_state.uploaded_data is not None:
        fm1, fm2, fm3 = st.columns([3, 1, 1])
        with fm1:
            current_name = st.session_state.get('uploaded_filename') or 'dataset'
            st.info(f"📂 Currently loaded: **{current_name}** ({len(st.session_state.uploaded_data):,} rows)")
        with fm2:
            if st.button("🗑️ Remove Dataset", type="secondary"):
                # Dropping the dataset also invalidates anything derived from it.
                st.session_state.uploaded_data = None
                st.session_state.uploaded_filename = None
                st.session_state.processed_data_path = None
                st.session_state.pipeline_status['data'] = 'pending'
                st.rerun()
        with fm3:
            if st.button("📎 Add More Data"):
                st.session_state['show_add_file'] = True

    # ── File Uploader ──
    merge_requested = st.session_state.get('show_add_file', False)
    if st.session_state.uploaded_data is None or merge_requested:
        if st.session_state.uploaded_data is None:
            upload_label = "Upload your dataset (CSV, JSON, or JSONL)"
        else:
            upload_label = "Upload additional file to merge with current dataset"
        uploaded_file = st.file_uploader(
            upload_label,
            type=['csv', 'json', 'jsonl'],
            help="Your dataset should contain instruction-response pairs.",
            # The key changes after every handled upload, which gives a fresh,
            # empty uploader on the next rerun.
            key=f"uploader_{st.session_state.get('upload_counter', 0)}",
        )
        if uploaded_file:
            try:
                lower_name = uploaded_file.name.lower()
                if lower_name.endswith('.csv'):
                    new_df = pd.read_csv(uploaded_file)
                elif lower_name.endswith('.jsonl'):
                    new_df = pd.read_json(uploaded_file, lines=True)
                else:
                    new_df = pd.read_json(uploaded_file)

                # Merge or replace
                existing_df = st.session_state.uploaded_data
                if existing_df is not None and merge_requested:
                    if list(new_df.columns) == list(existing_df.columns):
                        st.session_state.uploaded_data = pd.concat([existing_df, new_df], ignore_index=True)
                        previous_name = st.session_state.get('uploaded_filename') or 'data'
                        st.session_state.uploaded_filename = f"{previous_name} + {uploaded_file.name}"
                        st.success(
                            f"✅ Merged **{uploaded_file.name}** ({len(new_df):,} rows) "
                            f"→ Total: **{len(st.session_state.uploaded_data):,}** rows"
                        )
                    else:
                        st.error(
                            f"❌ Column mismatch! Existing: {list(existing_df.columns)} "
                            f"vs New: {list(new_df.columns)}"
                        )
                else:
                    st.session_state.uploaded_data = new_df
                    st.session_state.uploaded_filename = uploaded_file.name
                    st.success(f"✅ Successfully loaded **{uploaded_file.name}**")
                st.session_state['show_add_file'] = False
                st.session_state['upload_counter'] = st.session_state.get('upload_counter', 0) + 1
            except Exception as e:
                # UI boundary: report any parse/merge failure to the user instead of crashing the page.
                st.error(f"Error loading file: {str(e)}")

    # ── Data Display ──
    if st.session_state.uploaded_data is None:
        return
    df = st.session_state.uploaded_data

    # Dataset statistics
    st.markdown("### 📈 Dataset Statistics")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Rows", f"{len(df):,}")
    with col2:
        st.metric("Total Columns", len(df.columns))
    with col3:
        total_bytes = df.memory_usage(deep=True).sum()
        st.metric("Memory Size", f"{total_bytes / 1024:.1f} KB")
    with col4:
        # int() turns the numpy integer into a plain Python int for st.metric.
        st.metric("Missing Values", int(df.isnull().sum().sum()))
    st.markdown("---")

    # Column detection
    st.markdown("### 🔍 Auto-Detected Columns")
    instruction_patterns = ['instruction', 'prompt', 'question', 'query', 'user', 'input_text']
    output_patterns = ['output', 'response', 'answer', 'completion', 'assistant', 'target']
    detected_instruction = first_matching_column(df.columns, instruction_patterns)
    detected_output = first_matching_column(df.columns, output_patterns)
    col1, col2 = st.columns(2)
    with col1:
        if detected_instruction is not None:
            st.markdown(badge("success-badge", f"Instruction: {detected_instruction}"), unsafe_allow_html=True)
        else:
            st.markdown(badge("warning-badge", "Instruction: Not detected"), unsafe_allow_html=True)
    with col2:
        if detected_output is not None:
            st.markdown(badge("success-badge", f"Output: {detected_output}"), unsafe_allow_html=True)
        else:
            st.markdown(badge("warning-badge", "Output: Not detected"), unsafe_allow_html=True)
    st.markdown("---")

    # Full data preview (scrollable)
    st.markdown("### 👀 Complete Data Preview")
    st.caption(f"Showing all **{len(df):,}** rows. Scroll to browse the full dataset.")
    st.dataframe(df, use_container_width=True, height=450)

    # Download raw data
    st.markdown("### 📥 Download Dataset")
    # Everything before the last dot of the stored filename; 'dataset' when none is stored.
    file_stem = (st.session_state.get('uploaded_filename') or 'dataset').rsplit('.', 1)[0]
    dl1, dl2 = st.columns(2)
    with dl1:
        csv_data = df.to_csv(index=False).encode('utf-8')
        st.download_button("⬇️ Download as CSV", csv_data,
                           file_name=f"{file_stem}.csv", mime="text/csv")
    with dl2:
        json_data = df.to_json(orient='records', indent=2).encode('utf-8')
        st.download_button("⬇️ Download as JSON", json_data,
                           file_name=f"{file_stem}.json", mime="application/json")

    # Column summary
    st.markdown("### 📋 Column Summary")
    col_info = [
        {
            'Column': col,
            'Type': str(df[col].dtype),
            'Non-Null': df[col].notna().sum(),
            'Unique': df[col].nunique(),
            'Sample': sample_text(df[col]),
        }
        for col in df.columns
    ]
    st.dataframe(pd.DataFrame(col_info), use_container_width=True)
# ============================================================================
# PAGE: DATA PROCESSING
# ============================================================================
def render_processing():
    """Render the data-processing page and, on request, run the pipeline.

    The page collects settings in ten expanders (text cleaning, tokenization,
    system prompt, balancing, quality filters, deduplication, train/val split,
    output format, PII masking, augmentation). The run button then builds a
    ``PreprocessingConfig`` from those values, runs ``PreprocessingPipeline``
    on the uploaded DataFrame, formats the result and writes
    ``./output/processed_data/<goal>_train.jsonl`` (plus ``<goal>_val.jsonl``
    when a validation split is returned). The train path is stored in
    ``st.session_state.processed_data_path`` and
    ``st.session_state.pipeline_status['data']`` tracks running / complete /
    error. Returns early with a warning when no dataset is uploaded.

    Fixes over the previous revision:
      * the split label no longer truncates floats (0.9 showed "90% / 9%");
      * non-string column labels no longer crash column detection;
      * a dataset with zero columns no longer crashes the statistics header;
      * the system-prompt default no longer goes through a no-op ternary;
      * the label-distribution preview is guarded like the other previews;
      * empty entries are dropped from the allowed-languages list;
      * the free-text goal is sanitised before use in output file names;
      * a ``None`` validation frame is treated as empty;
      * the "last processed" preview reads five lines, not the whole file;
      * mis-decoded emoji and arrows in UI strings are restored.
    """
    import re  # local import: only needed to build a safe output file name
    from itertools import islice

    def guess_column(columns, patterns):
        # First column (in column order) whose lower-cased label contains any
        # of the patterns, or None. str() tolerates non-string labels.
        for column in columns:
            label = str(column).lower()
            if any(pattern in label for pattern in patterns):
                return column
        return None

    st.markdown('<p class="gradient-header">🧹 Advanced Data Processing</p>', unsafe_allow_html=True)
    if st.session_state.uploaded_data is None:
        st.warning("⚠️ Please upload a dataset first!")
        if st.button("📊 Go to Data Upload"):
            st.session_state.current_page = 'data'
            st.rerun()
        return
    df = st.session_state.uploaded_data

    # ── Dataset Stats Header ──
    st.markdown("### 📈 Dataset Statistics")
    sc1, sc2, sc3, sc4 = st.columns(4)
    with sc1:
        st.metric("Total Rows", f"{len(df):,}")
    with sc2:
        st.metric("Columns", len(df.columns))
    with sc3:
        # Average is taken over the first column only; needs a row and a column.
        has_cells = len(df) > 0 and len(df.columns) > 0
        avg_len = int(df.iloc[:, 0].astype(str).str.len().mean()) if has_cells else 0
        st.metric("Avg Text Length", f"{avg_len:,} chars")
    with sc4:
        # Rough estimate: four characters per token.
        est_tokens = int(avg_len * len(df) / 4) if avg_len > 0 else 0
        st.metric("Est. Total Tokens", f"{est_tokens:,}")
    st.markdown("---")

    # ── Training Goal ──
    goal = st.text_input(
        "Training Goal",
        value=st.session_state.training_goal or "assistant",
        help="e.g., medical_assistant, customer_support, code_helper",
    )
    st.session_state.training_goal = goal
    # The goal becomes part of output file names: replace anything that is not
    # a letter, digit, dot, dash or underscore so it cannot contain path
    # separators, and fall back to "assistant" when nothing usable remains.
    file_stem = re.sub(r'[^A-Za-z0-9._-]+', '_', goal.strip()).strip('.') or 'assistant'

    # ── Column Mapping ──
    st.markdown("### 🎯 Column Mapping")
    instruction_patterns = ['instruction', 'prompt', 'question', 'query', 'user', 'input_text', 'human']
    output_patterns = ['output', 'response', 'answer', 'completion', 'assistant', 'target']
    input_patterns = ['context', 'input', 'background', 'reference']
    available_columns = list(df.columns)
    detected_instruction = guess_column(available_columns, instruction_patterns)
    detected_output = guess_column(available_columns, output_patterns)
    detected_input = guess_column(available_columns, input_patterns)
    mc1, mc2, mc3 = st.columns(3)
    with mc1:
        instruction_col = st.selectbox(
            "Instruction Column *", options=available_columns,
            index=available_columns.index(detected_instruction) if detected_instruction is not None else 0,
            help="Column containing instructions/prompts/questions")
    with mc2:
        if detected_output is not None:
            output_idx = available_columns.index(detected_output)
        else:
            # Without a match, default to the second column when there is one.
            output_idx = 1 if len(available_columns) > 1 else 0
        output_col = st.selectbox(
            "Output Column *", options=available_columns, index=output_idx,
            help="Column containing responses/answers/outputs")
    with mc3:
        input_col_options = ["None"] + available_columns
        default_input_idx = input_col_options.index(detected_input) if detected_input is not None else 0
        input_col_selection = st.selectbox(
            "Input/Context Column (Optional)", options=input_col_options,
            index=default_input_idx, help="Optional column containing additional context")
        input_col = None if input_col_selection == "None" else input_col_selection
    st.markdown("---")

    # ── Safe Preset Button ──
    if st.button("🛡️ Load Safe Preset", help="Apply recommended defaults for most datasets"):
        st.session_state['safe_preset'] = True
        st.rerun()
    # When set, several widgets below start from more conservative defaults.
    use_safe = st.session_state.get('safe_preset', False)

    # ====================================================================
    # 1️⃣ Text Cleaning Controls
    # ====================================================================
    with st.expander("1️⃣ Text Cleaning Controls", expanded=False):
        tc1, tc2 = st.columns(2)
        with tc1:
            clean_html = st.checkbox("Remove HTML Tags", value=use_safe, help="Strip all HTML/XML tags from text")
            clean_urls = st.checkbox("Remove URLs", value=use_safe, help="Remove http/https/www links")
            clean_emojis = st.checkbox("Remove Emojis", value=False, help="Strip emoji characters")
            clean_whitespace = st.checkbox("Normalize Whitespace", value=True, help="Collapse multiple spaces/tabs into one")
        with tc2:
            clean_lowercase = st.checkbox("Lowercase All Text", value=False, help="Convert text to lowercase (disable to preserve case)")
            clean_special = st.checkbox("Remove Special Characters", value=False, help="Keep only alphanumeric + basic punctuation")
            clean_linebreaks = st.checkbox("Strip Extra Line Breaks", value=True, help="Reduce 3+ newlines to double newlines")

    # ====================================================================
    # 2️⃣ Tokenization Controls
    # ====================================================================
    with st.expander("2️⃣ Tokenization Controls", expanded=False):
        tk1, tk2 = st.columns(2)
        with tk1:
            tokenizer_choice = st.selectbox("Tokenizer", ["tiktoken", "HuggingFace"],
                                            help="tiktoken = OpenAI-compatible, HuggingFace = model-specific tokenizer")
            if tokenizer_choice == "HuggingFace":
                hf_model_name = st.text_input("HF Model Name", value="meta-llama/Llama-3-8b",
                                              help="HuggingFace model name for tokenizer")
            else:
                hf_model_name = ""
            max_total_tokens = st.slider("Max Tokens per Sample", 128, 8192, 2048,
                                         help="Maximum total tokens allowed per sample")
        with tk2:
            truncate_long = st.checkbox("Truncate Long Samples", value=False,
                                        help="Cut text exceeding max tokens")
            split_long = st.checkbox("Split Long Samples into Chunks", value=False,
                                     help="Break long texts into overlapping chunks")
            if split_long:
                split_overlap = st.slider("Chunk Overlap Tokens", 0, 200, 50,
                                          help="Number of overlapping tokens between chunks")
            else:
                split_overlap = 50
        # "tiktoken" is passed through literally; otherwise the HF model name is used.
        tokenizer_name = "tiktoken" if tokenizer_choice == "tiktoken" else hf_model_name

        # Token stats preview (computed on the first 200 rows only)
        if st.button("📊 Show Token Stats Preview", key="token_stats_btn"):
            with st.spinner("Counting tokens..."):
                try:
                    from preprocessing.tokenization import TokenizationConfig, get_tokenizer, compute_token_stats
                    tk_cfg = TokenizationConfig(tokenizer_name=tokenizer_name)
                    tokenizer = get_tokenizer(tk_cfg)
                    is_tiktoken = tokenizer_choice == "tiktoken"
                    stats_cols = [c for c in [instruction_col, output_col] if c in df.columns]
                    stats = compute_token_stats(df.head(200), stats_cols, tokenizer, is_tiktoken)
                    for col_name, s in stats.items():
                        st.markdown(f"**{col_name}**: min={s['min']}, max={s['max']}, mean={s['mean']}, p95={s['p95']}")
                except Exception as e:
                    st.warning(f"Could not compute token stats: {e}")

    # ====================================================================
    # 3️⃣ System Prompt Configuration
    # ====================================================================
    with st.expander("3️⃣ System Prompt Configuration", expanded=False):
        system_prompt_text = st.text_area(
            "Global System Prompt",
            value="You are a helpful AI assistant.",
            height=100, help="System prompt prepended to every sample in chat format")
        prepend_system = st.checkbox("Prepend System Prompt to All Samples", value=True,
                                     help="Include this system prompt in all formatted entries")
        if st.button("👁️ Preview Formatted Chat JSON", key="preview_chat_btn"):
            try:
                from preprocessing.system_prompt import preview_formatted_json
                preview = preview_formatted_json(df, system_prompt_text, instruction_col, output_col, input_col, n=2)
                st.code(preview, language="json")
            except Exception as e:
                st.warning(f"Preview error: {e}")

    # ====================================================================
    # 4️⃣ Dataset Balancing
    # ====================================================================
    with st.expander("4️⃣ Dataset Balancing (Classification)", expanded=False):
        balance_enabled = st.checkbox("Enable Class Balancing", value=False,
                                      help="Balance class distribution for classification tasks")
        if balance_enabled:
            label_col = st.selectbox("Label Column", options=available_columns,
                                     help="Column containing class labels")
            balance_strategy = st.radio("Strategy", ["none", "oversample", "undersample"],
                                        help="Oversample = duplicate minority, Undersample = drop majority")
            # Show distribution chart. Guarded like the other previews so a
            # missing helper module or an odd label column cannot take the page down.
            if label_col in df.columns:
                try:
                    from preprocessing.dataset_balancing import compute_label_distribution
                    dist = compute_label_distribution(df, label_col)
                    if dist:
                        fig = px.bar(x=list(dist.keys()), y=list(dist.values()),
                                     labels={'x': 'Label', 'y': 'Count'}, title="Label Distribution")
                        fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)',
                                          font_color='#e2e8f0')
                        st.plotly_chart(fig, use_container_width=True)
                except Exception as e:
                    st.warning(f"Could not compute label distribution: {e}")
        else:
            label_col = None
            balance_strategy = "none"
    # The label column is only forwarded when balancing is switched on.
    active_label_col = label_col if balance_enabled else None

    # ====================================================================
    # 5️⃣ Quality Filters
    # ====================================================================
    with st.expander("5️⃣ Quality Filters", expanded=False):
        qf1, qf2 = st.columns(2)
        with qf1:
            min_words = st.number_input("Min Word Count", min_value=0, value=3 if use_safe else 0,
                                        help="Minimum words required per sample (0 = no filter)")
            max_words = st.number_input("Max Word Count", min_value=0, value=0,
                                        help="Maximum words allowed per sample (0 = no limit)")
            profanity_filter = st.checkbox("Profanity Filter", value=False,
                                           help="Remove samples containing profane language")
        with qf2:
            language_filter = st.checkbox("Language Detection Filter", value=False,
                                          help="Keep only samples in specified languages")
            if language_filter:
                allowed_langs = st.text_input("Allowed Languages (comma-separated)", value="en",
                                              help="ISO 639-1 codes, e.g. en,fr,de")
            else:
                allowed_langs = "en"
            remove_low_quality = st.checkbox("Remove Low-Quality Responses", value=use_safe,
                                             help="Remove short / generic / placeholder responses")

    # ====================================================================
    # 6️⃣ Deduplication Advanced
    # ====================================================================
    with st.expander("6️⃣ Deduplication", expanded=False):
        dedup_exact = st.checkbox("Remove Exact Duplicates", value=True,
                                  help="Remove rows with identical instruction text")
        dedup_semantic = st.checkbox("Remove Semantic Duplicates", value=False,
                                     help="Use TF-IDF cosine similarity to find near-duplicates")
        if dedup_semantic:
            semantic_threshold = st.slider("Similarity Threshold", 0.5, 1.0, 0.90, 0.01,
                                           help="Cosine similarity above this threshold = duplicate (higher = stricter)")
        else:
            semantic_threshold = 0.90

    # ====================================================================
    # 7️⃣ Train / Validation Split
    # ====================================================================
    with st.expander("7️⃣ Train / Validation Split", expanded=False):
        split_enabled = st.checkbox("Enable Train/Val Split", value=True,
                                    help="Split dataset into training and validation sets")
        if split_enabled:
            train_ratio = st.slider("Train Ratio", 0.5, 0.95, 0.9 if use_safe else 0.8, 0.05,
                                    help="Proportion of data used for training")
            # round(), not int(): (1 - 0.9) * 100 is 9.999... in floating point
            # and used to be displayed as 9%.
            train_pct = round(train_ratio * 100)
            st.markdown(f"**Split**: {train_pct}% Train / {100 - train_pct}% Validation")
            random_seed = st.number_input("Random Seed", min_value=0, value=42,
                                          help="Seed for reproducible splits")
            shuffle_data = st.checkbox("Shuffle Before Split", value=True,
                                       help="Randomly shuffle data before splitting")
        else:
            train_ratio = 0.8
            random_seed = 42
            shuffle_data = True

    # ====================================================================
    # 8️⃣ Output Formatting
    # ====================================================================
    with st.expander("8️⃣ Output Formatting", expanded=False):
        format_type = st.selectbox("Export Format", ["openai_chat", "completion", "classification", "custom"],
                                   help="OpenAI Chat = messages format, Completion = prompt/completion, Classification = text/label")
        custom_schema = {}
        if format_type == "custom":
            st.markdown("**Define Custom Schema** (output_key → source_column)")
            num_fields = st.number_input("Number of Fields", 1, 10, 2)
            for i in range(int(num_fields)):
                fc1, fc2 = st.columns(2)
                with fc1:
                    schema_key = st.text_input(f"Output Key {i+1}", value=f"field_{i+1}", key=f"ckey_{i}")
                with fc2:
                    schema_source = st.selectbox(f"Source Column {i+1}", options=available_columns, key=f"cval_{i}")
                custom_schema[schema_key] = schema_source

    # ====================================================================
    # 9️⃣ Safety & PII Filtering
    # ====================================================================
    with st.expander("9️⃣ Safety & PII Filtering", expanded=False):
        pii1, pii2 = st.columns(2)
        with pii1:
            pii_emails = st.checkbox("Detect & Mask Emails", value=use_safe,
                                     help="Replace email addresses with [REDACTED]")
            pii_phones = st.checkbox("Detect & Mask Phone Numbers", value=use_safe,
                                     help="Replace phone numbers with [REDACTED]")
            pii_ids = st.checkbox("Detect & Mask CNIC/SSN", value=use_safe,
                                  help="Replace national ID / SSN patterns with [REDACTED]")
        with pii2:
            pii_keys = st.checkbox("Detect & Mask API Keys", value=use_safe,
                                   help="Replace long hex/base64 strings that look like secrets")
            pii_addresses = st.checkbox("Detect & Mask Addresses", value=False,
                                        help="Replace street addresses and zip codes")

    # ====================================================================
    # 🔟 Augmentation (Optional)
    # ====================================================================
    with st.expander("🔟 Augmentation (Optional)", expanded=False):
        aug_enabled = st.checkbox("Enable Data Augmentation", value=False,
                                  help="Generate synthetic variations of existing samples")
        if aug_enabled:
            ag1, ag2 = st.columns(2)
            with ag1:
                aug_paraphrase = st.checkbox("Paraphrase Instructions", value=True,
                                             help="Synonym-based paraphrasing of instructions")
                aug_variations = st.checkbox("Generate Variations", value=False,
                                             help="Minor text variations (punctuation, casing)")
            with ag2:
                aug_backtranslate = st.checkbox("Back Translation", value=False,
                                                help="Simulate back-translation for diversity")
                aug_tone = st.checkbox("Tone Rewriting", value=False,
                                       help="Rewrite instructions in different tones")
            aug_factor = st.slider("Augmentation Factor", 1, 5, 1,
                                   help="Number of augmented copies per original sample")
        else:
            aug_paraphrase = aug_variations = aug_backtranslate = aug_tone = False
            aug_factor = 1
    st.markdown("---")

    # ── Run Pipeline Button ──
    if st.button("🚀 Run Advanced Processing Pipeline", type="primary", use_container_width=True):
        st.session_state.pipeline_status['data'] = 'running'
        with st.spinner("Running preprocessing pipeline..."):
            progress_bar = st.progress(0)
            status_text = st.empty()
            try:
                from preprocessing.pipeline import PreprocessingPipeline, PreprocessingConfig
                from preprocessing.text_cleaning import TextCleaningConfig
                from preprocessing.tokenization import TokenizationConfig
                from preprocessing.system_prompt import SystemPromptConfig
                from preprocessing.dataset_balancing import BalancingConfig
                from preprocessing.quality_filters import QualityFilterConfig
                from preprocessing.deduplication import DeduplicationConfig
                from preprocessing.train_val_split import SplitConfig
                from preprocessing.output_formatter import OutputFormatConfig, format_dataset, export_jsonl, generate_preview
                from preprocessing.pii_filter import PIIFilterConfig
                from preprocessing.augmentation import AugmentationConfig

                # Build config from UI values
                config = PreprocessingConfig(
                    instruction_col=instruction_col,
                    output_col=output_col,
                    input_col=input_col,
                    label_col=active_label_col,
                    text_cleaning=TextCleaningConfig(
                        remove_html=clean_html, remove_urls=clean_urls,
                        remove_emojis=clean_emojis, normalize_whitespace=clean_whitespace,
                        lowercase=clean_lowercase, remove_special_chars=clean_special,
                        strip_extra_linebreaks=clean_linebreaks,
                    ),
                    tokenization=TokenizationConfig(
                        tokenizer_name=tokenizer_name,
                        max_total_tokens=max_total_tokens,
                        truncate_long=truncate_long, split_long=split_long,
                        split_overlap=split_overlap,
                    ),
                    system_prompt=SystemPromptConfig(
                        system_prompt=system_prompt_text,
                        prepend_to_all=prepend_system,
                    ),
                    balancing=BalancingConfig(
                        enabled=balance_enabled,
                        label_column=label_col if balance_enabled else "",
                        strategy=balance_strategy if balance_enabled else "none",
                    ),
                    quality_filters=QualityFilterConfig(
                        min_word_count=min_words, max_word_count=max_words,
                        profanity_filter=profanity_filter,
                        language_filter=language_filter,
                        # Blank entries (e.g. from a trailing comma) are dropped.
                        allowed_languages=[code.strip() for code in allowed_langs.split(',') if code.strip()],
                        remove_low_quality=remove_low_quality,
                    ),
                    deduplication=DeduplicationConfig(
                        remove_exact=dedup_exact, remove_semantic=dedup_semantic,
                        semantic_threshold=semantic_threshold,
                    ),
                    split=SplitConfig(
                        enabled=split_enabled, train_ratio=train_ratio,
                        random_seed=int(random_seed), shuffle=shuffle_data,
                    ),
                    output_format=OutputFormatConfig(
                        format_type=format_type, custom_schema=custom_schema,
                    ),
                    pii_filter=PIIFilterConfig(
                        filter_emails=pii_emails, filter_phones=pii_phones,
                        filter_id_numbers=pii_ids, filter_api_keys=pii_keys,
                        filter_addresses=pii_addresses,
                    ),
                    augmentation=AugmentationConfig(
                        enabled=aug_enabled, paraphrase=aug_paraphrase,
                        generate_variations=aug_variations,
                        back_translate=aug_backtranslate,
                        tone_rewrite=aug_tone,
                        augmentation_factor=aug_factor,
                    ),
                )

                def progress_cb(stage_name, pct):
                    # Passed to pipeline.run as progress_callback.
                    # NOTE(review): assumes pct is on a 0-100 scale — confirm
                    # against PreprocessingPipeline.
                    status_text.text(f"⚙️ {stage_name}...")
                    progress_bar.progress(min(pct, 100))

                pipeline = PreprocessingPipeline(config)
                train_df, val_df, logs = pipeline.run(df, progress_callback=progress_cb)
                # NOTE(review): the pipeline presumably returns an empty frame
                # when there is no validation split; None is tolerated as well.
                val_count = 0 if val_df is None else len(val_df)

                # Format output
                sys_prompt = system_prompt_text if prepend_system else ""
                format_kwargs = dict(
                    system_prompt=sys_prompt,
                    instruction_col=instruction_col,
                    output_col=output_col,
                    input_col=input_col,
                    label_col=active_label_col,
                )
                formatted_data = format_dataset(train_df, config.output_format, **format_kwargs)

                # Export
                output_dir = Path("./output/processed_data")
                output_dir.mkdir(parents=True, exist_ok=True)
                train_path = export_jsonl(formatted_data, str(output_dir / f"{file_stem}_train.jsonl"))
                val_path = None
                if val_count > 0:
                    val_formatted = format_dataset(val_df, config.output_format, **format_kwargs)
                    val_path = export_jsonl(val_formatted, str(output_dir / f"{file_stem}_val.jsonl"))
                progress_bar.progress(100)
                status_text.text("✅ Pipeline complete!")
                st.session_state.processed_data_path = train_path
                st.session_state.pipeline_status['data'] = 'complete'

                # ── Results ──
                st.success(f"✅ Training data saved to: `{train_path}`")
                if val_path:
                    st.success(f"✅ Validation data saved to: `{val_path}`")

                # Stats
                rc1, rc2, rc3, rc4 = st.columns(4)
                with rc1:
                    st.metric("Original Rows", f"{len(df):,}")
                with rc2:
                    st.metric("Train Samples", f"{len(train_df):,}")
                with rc3:
                    st.metric("Val Samples", f"{val_count:,}")
                with rc4:
                    # Clamped at zero because the output can have more rows than the input.
                    removed = len(df) - len(train_df) - val_count
                    st.metric("Removed", f"{max(0, removed):,}")

                # ── Pipeline Logs ──
                st.markdown("### 📋 Pipeline Logs")
                log_data = [
                    {
                        'Stage': log.stage,
                        'Description': log.description,
                        'Rows Before': log.rows_before,
                        'Rows After': log.rows_after,
                        'Delta': log.rows_delta,
                        'Time (ms)': log.duration_ms,
                    }
                    for log in logs
                ]
                st.dataframe(pd.DataFrame(log_data), use_container_width=True)

                # ── Preview ──
                st.markdown("### 👁️ Output Preview")
                preview_json = generate_preview(formatted_data, n=3)
                st.code(preview_json, language="json")

                # ── Download ──
                st.markdown("### 📥 Download")
                dl1, dl2 = st.columns(2)
                with dl1:
                    with open(train_path, 'r', encoding='utf-8') as f:
                        st.download_button("⬇️ Download Train JSONL", f.read(),
                                           file_name=f"{file_stem}_train.jsonl", mime="application/jsonl")
                with dl2:
                    if val_path and Path(val_path).exists():
                        with open(val_path, 'r', encoding='utf-8') as f:
                            st.download_button("⬇️ Download Val JSONL", f.read(),
                                               file_name=f"{file_stem}_val.jsonl", mime="application/jsonl")
            except Exception as e:
                # UI boundary: record the failure and show the traceback instead of crashing the page.
                st.session_state.pipeline_status['data'] = 'error'
                st.error(f"❌ Pipeline Error: {str(e)}")
                import traceback
                st.code(traceback.format_exc())

    # Show previously processed data
    if st.session_state.processed_data_path:
        st.markdown("---")
        st.markdown("### 📂 Last Processed Data")
        try:
            processed_path = Path(st.session_state.processed_data_path)
            if processed_path.exists():
                with open(processed_path, encoding='utf-8') as f:
                    # Read only the first five lines; blank lines among them are skipped.
                    samples = [json.loads(line) for line in islice(f, 5) if line.strip()]
                for i, sample in enumerate(samples):
                    with st.expander(f"Sample {i+1}"):
                        st.json(sample)
        except Exception as e:
            st.warning(f"Could not load preview: {e}")
# ============================================================================
# PAGE: TRAINING
# ============================================================================
def render_training():
    """Render the "Model Training" page.

    Page flow, top to bottom:

    1. Require ``st.session_state.processed_data_path``; without it, offer a
       shortcut to the processing page and stop.
    2. Probe for a CUDA GPU through ``torch`` (any failure means "no GPU").
    3. Offer the processed training JSONL (and its ``_val`` sibling, if one
       exists) for download and report the number of non-blank lines.
    4. Show one of three paths: local GPU training, Google Colab
       instructions, or the "trained externally" notice.
    5. Always show the widgets for registering externally produced results.

    Side effects: writes ``current_page``, ``model_path`` and
    ``pipeline_status['training']`` in ``st.session_state`` and may write a
    JSONL file under ``./output/eval_results``.
    """
    st.markdown('<p class="gradient-header">πŸš€ Model Training</p>', unsafe_allow_html=True)

    # Training needs the JSONL produced by the processing page.
    if st.session_state.processed_data_path is None:
        st.warning("⚠️ Please process your data first!")
        if st.button("🧹 Go to Processing"):
            st.session_state.current_page = 'process'
            st.rerun()
        return

    # GPU detection: a missing torch install or a failing CUDA query is
    # treated exactly like "no GPU".
    try:
        import torch
        has_gpu = torch.cuda.is_available()
        if has_gpu:
            gpu_name = torch.cuda.get_device_name(0)
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
            st.success(f"βœ… GPU Available: **{gpu_name}** ({gpu_memory:.1f} GB)")
    except Exception:
        has_gpu = False

    # Download of the preprocessed data (always available).
    st.markdown("### πŸ“₯ Preprocessed Training Data")
    processed_path = Path(st.session_state.processed_data_path)
    # Defined up front so the GPU branch below can still auto-configure
    # (falling back to its default size) when the file has gone missing.
    sample_count = 0
    if processed_path.exists():
        with open(processed_path, 'r', encoding='utf-8') as f:
            processed_content = f.read()
        dl1, dl2 = st.columns(2)
        with dl1:
            st.download_button("⬇️ Download Training JSONL", processed_content,
                               file_name=processed_path.name, mime="application/jsonl")
        with dl2:
            # The validation file is looked up by swapping '_train' for '_val'.
            # If the name has no '_train', the swap is a no-op and would point
            # back at the training file, so that case is skipped.
            val_path = processed_path.parent / processed_path.name.replace('_train', '_val')
            if val_path != processed_path and val_path.exists():
                with open(val_path, 'r', encoding='utf-8') as f:
                    st.download_button("⬇️ Download Validation JSONL", f.read(),
                                       file_name=val_path.name, mime="application/jsonl")
        # One JSONL record per non-blank line.
        sample_count = sum(1 for line in processed_content.split('\n') if line.strip())
        st.info(f"πŸ“Š Dataset: **{sample_count:,}** samples ready for training")
    else:
        st.warning("Processed data file not found.")
    st.markdown("---")

    # Two ways forward: train locally on the GPU, or go through Colab/upload.
    if has_gpu:
        training_mode = "gpu"
    else:
        training_mode = st.radio("πŸ–₯️ Select Training Mode", [
            "☁️ Use Google Colab (Recommended – Free GPU)",
            "πŸ“€ Upload Fine-Tuned Model (Already trained externally)"
        ], help="No GPU detected on this machine. Choose how to proceed.")

    if training_mode == "gpu":
        # PATH A: local GPU training.
        st.markdown("### βš™οΈ Training Configuration")
        col1, col2 = st.columns(2)
        with col1:
            model_source = st.radio("Model Source", ["Preset Models", "Custom HuggingFace Model"])
            if model_source == "Preset Models":
                base_model = st.selectbox("Base Model", [
                    "unsloth/llama-3-8b-bnb-4bit",
                    "unsloth/llama-3-70b-bnb-4bit",
                    "unsloth/mistral-7b-bnb-4bit",
                    "unsloth/gemma-7b-bnb-4bit",
                ])
            else:
                base_model = st.text_input("HuggingFace Model ID",
                                           value="unsloth/llama-3-8b-bnb-4bit",
                                           help="Enter any HuggingFace model ID, e.g. 'meta-llama/Llama-3-8b', 'mistralai/Mistral-7B-v0.1'")
            max_seq_length = st.slider("Max Sequence Length", 512, 4096, 2048)
        with col2:
            # Slider defaults are picked from the dataset size; an unknown
            # size (0) is treated as 1000 samples.
            dataset_size = sample_count if sample_count > 0 else 1000
            if dataset_size < 1000:
                auto_rank, auto_alpha, auto_lr, auto_epochs = 8, 16, 2e-4, 5
                size_category = "Small"
            elif dataset_size < 10000:
                auto_rank, auto_alpha, auto_lr, auto_epochs = 16, 32, 1e-4, 3
                size_category = "Medium"
            else:
                auto_rank, auto_alpha, auto_lr, auto_epochs = 32, 64, 5e-5, 2
                size_category = "Large"
            st.success(f"Auto-configured for **{size_category}** dataset ({dataset_size:,} samples)")
        st.markdown("---")

        # NOTE(review): the values chosen below are collected but never passed
        # to TrainingPilot / pilot.run() in this function, and HyperParams is
        # imported but unused. Confirm against agents.training_pilot whether
        # they should be forwarded.
        with st.expander("πŸ”§ Advanced Hyperparameters"):
            hc1, hc2, hc3 = st.columns(3)
            with hc1:
                lora_rank = st.slider("LoRA Rank", 4, 64, auto_rank)
                lora_alpha = st.slider("LoRA Alpha", 8, 128, auto_alpha)
            with hc2:
                learning_rate = st.select_slider("Learning Rate",
                                                 options=[1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4], value=auto_lr)
                num_epochs = st.slider("Epochs", 1, 10, auto_epochs)
            with hc3:
                batch_size = st.slider("Batch Size", 1, 16, 4)
                gradient_accumulation = st.slider("Gradient Accumulation", 1, 8, 4)
        st.markdown("---")

        col1, col2, col3 = st.columns([1, 2, 1])
        with col2:
            if st.button("πŸš€ Start Training", type="primary", use_container_width=True):
                st.session_state.pipeline_status['training'] = 'running'
                with st.spinner("Training in progress..."):
                    progress_bar = st.progress(0)
                    status_text = st.empty()
                    try:
                        from agents.training_pilot import TrainingPilot, HyperParams
                        status_text.text("πŸ“¦ Loading model...")
                        progress_bar.progress(10)
                        pilot = TrainingPilot(
                            base_model=base_model,
                            max_seq_length=max_seq_length,
                            output_dir="./output/models"
                        )
                        status_text.text("πŸš€ Training...")
                        progress_bar.progress(30)
                        result = pilot.run(
                            data_path=st.session_state.processed_data_path,
                            output_name=st.session_state.training_goal
                        )
                        progress_bar.progress(100)
                        status_text.text("βœ… Training complete!")
                        st.session_state.model_path = result.model_path
                        st.session_state.pipeline_status['training'] = 'complete'
                        st.success(f"βœ… Model saved to: `{result.model_path}`")
                        rc1, rc2, rc3 = st.columns(3)
                        with rc1:
                            st.metric("Final Loss", f"{result.final_loss:.4f}")
                        with rc2:
                            st.metric("Training Time", f"{result.training_time:.1f}s")
                        with rc3:
                            st.metric("Total Steps", result.num_steps)
                    except Exception as e:
                        # Page-level boundary: report the failure in the UI
                        # instead of crashing the whole app.
                        st.session_state.pipeline_status['training'] = 'error'
                        st.error(f"❌ Training failed: {str(e)}")
                        import traceback
                        st.code(traceback.format_exc())

    elif "Colab" in training_mode:
        # PATH B: Google Colab notebook.
        st.markdown("### ☁️ Train on Google Colab (Free GPU)")
        st.markdown("""
Since no GPU was detected on this machine, you can fine-tune your model on Google Colab with a free GPU.
Follow these steps:
""")
        st.markdown("""
**Step 1:** Download your preprocessed training data (above) ⬆️
**Step 2:** Download or copy the Colab notebook below
**Step 3:** Open [Google Colab](https://colab.research.google.com/) β†’ Upload the notebook
**Step 4:** Upload your training JSONL to Colab's file browser
**Step 5:** Run all cells β†’ Download the fine-tuned model
**Step 6:** Come back here β†’ Upload your fine-tuned model results for evaluation
""")
        # The notebook is resolved relative to the current working directory.
        notebook_path = Path("./Auto_FineTune_Ops_Colab.ipynb")
        if notebook_path.exists():
            with open(notebook_path, 'r', encoding='utf-8') as f:
                notebook_content = f.read()
            st.download_button("πŸ““ Download Colab Notebook (.ipynb)", notebook_content,
                               file_name="Auto_FineTune_Ops_Colab.ipynb", mime="application/json",
                               type="primary", use_container_width=True)
            with st.expander("πŸ‘οΈ View Notebook Code", expanded=False):
                try:
                    nb = json.loads(notebook_content)
                    for cell in nb.get('cells', []):
                        if cell.get('cell_type') == 'code':
                            source = ''.join(cell.get('source', []))
                            if source.strip():
                                st.code(source, language='python')
                        elif cell.get('cell_type') == 'markdown':
                            source = ''.join(cell.get('source', []))
                            st.markdown(source)
                except Exception:
                    # Not parseable as a notebook: show the raw head instead.
                    st.code(notebook_content[:5000], language='json')
        else:
            st.warning("⚠️ Colab notebook not found at `Auto_FineTune_Ops_Colab.ipynb`")
        st.markdown("---")
        st.markdown("### πŸ“€ After Training on Colab")
        st.info("Once you've finished training on Colab, download your fine-tuned model outputs and upload them below for evaluation.")

    else:
        # PATH C: the model was trained elsewhere.
        st.markdown("### πŸ“€ Upload Fine-Tuned Model Results")
        st.markdown("Upload outputs from your externally trained model for evaluation.")

    # Upload of fine-tuned results (always shown at the bottom).
    st.markdown("---")
    st.markdown("### πŸ“¦ Upload Fine-Tuned Results for Evaluation")
    st.caption("If you trained on Colab or another machine, upload your model outputs here.")
    upload_tab1, upload_tab2 = st.tabs(["πŸ“Š Upload Evaluation Results (JSONL)", "πŸ“ Upload Model Folder Path"])
    with upload_tab1:
        ft_file = st.file_uploader("Upload fine-tuned model outputs (JSONL with predictions)",
                                   type=['jsonl', 'json'], key="ft_results_upload",
                                   help="JSONL file with model predictions/outputs from your fine-tuned model")
        if ft_file:
            try:
                ft_df = pd.read_json(ft_file, lines=ft_file.name.endswith('.jsonl'))
                st.success(f"βœ… Loaded **{len(ft_df):,}** evaluation samples")
                st.dataframe(ft_df.head(5), use_container_width=True)
                # Persist for the evaluation page. Only the final path
                # component of the client-supplied name is used so the file
                # cannot land outside the output directory. The
                # 'finetuned_outputs' prefix is what the evaluation page
                # looks for in model_path.
                eval_output = Path("./output/eval_results")
                eval_output.mkdir(parents=True, exist_ok=True)
                eval_path = eval_output / f"finetuned_outputs_{Path(ft_file.name).name}"
                ft_df.to_json(eval_path, orient='records', lines=True)
                st.session_state.model_path = str(eval_path)
                st.session_state.pipeline_status['training'] = 'complete'
                st.success(f"βœ… Results saved! You can now proceed to **Evaluation** page.")
            except Exception as e:
                st.error(f"Error loading file: {e}")
            else:
                # Kept outside the try so the rerun signal raised by
                # st.rerun() never passes through the error handler above.
                if st.button("βš–οΈ Go to Evaluation"):
                    st.session_state.current_page = 'evaluation'
                    st.rerun()
    with upload_tab2:
        model_folder = st.text_input("Model Folder Path",
                                     placeholder="e.g., ./output/models/my_finetuned_model or /path/to/model",
                                     help="Local path to the fine-tuned model directory (LoRA adapter or full model)")
        if model_folder and st.button("βœ… Set Model Path"):
            if Path(model_folder).exists():
                st.session_state.model_path = model_folder
                st.session_state.pipeline_status['training'] = 'complete'
                st.success(f"βœ… Model path set to: `{model_folder}`")
            else:
                st.error(f"❌ Path not found: `{model_folder}`")
# ============================================================================
# PAGE: EVALUATION
# ============================================================================
def render_evaluation():
    """Render the "Model Evaluation" page (LLM-as-judge comparison).

    Page flow, top to bottom:

    1. Choose a judge provider and enter its API key / model (the key is also
       exported to the provider's environment variable).
    2. Load evaluation data, either from results registered on the training
       page (``model_path`` containing ``finetuned_outputs``) or from an
       uploaded JSON/JSONL file. Rows need the columns ``instruction``,
       ``base_output`` and ``finetuned_output``.
    3. On "Run", send every row to the judge and collect one verdict per row
       in ``st.session_state.eval_results``.
    4. Whenever verdicts are stored, show metrics, charts, the verdict table
       and a JSON report download.

    Side effects: writes ``eval_data``, ``eval_results`` and
    ``pipeline_status['evaluation']`` in ``st.session_state``, and sets API
    key environment variables.
    """
    st.markdown('<p class="gradient-header">βš–οΈ Model Evaluation</p>', unsafe_allow_html=True)

    # Initialize session state for results if not present.
    if 'eval_results' not in st.session_state:
        st.session_state.eval_results = None

    required_cols = ['instruction', 'base_output', 'finetuned_output']

    # Judge provider selection.
    st.markdown("### πŸ€– Select AI Judge Provider")
    st.caption("Choose which LLM provider to use as the evaluation judge.")
    judge_provider = st.selectbox("AI Provider", [
        "OpenAI (GPT-4o, GPT-4-turbo, etc.)",
        "Anthropic (Claude 3.5, Claude 3 Opus, etc.)",
        "Groq (Llama 3, Mixtral, Gemma, etc.)",
        "Custom OpenAI-Compatible Endpoint"
    ], help="Select the AI provider whose model will act as the judge.")
    st.markdown("---")
    st.markdown("### πŸ”‘ API Configuration")
    api_key = None
    base_url = None
    use_anthropic = judge_provider.startswith("Anthropic")
    col1, col2 = st.columns(2)
    # Match on the option's prefix: the custom option's label also contains
    # the word "OpenAI", so a substring test would never reach its branch.
    if judge_provider.startswith("Custom"):
        with col1:
            base_url = st.text_input("API Base URL", placeholder="https://api.your-provider.com/v1")
            api_key = st.text_input("API Key", type="password", key="custom_key_input")
            if api_key:
                os.environ["OPENAI_API_KEY"] = api_key
        with col2:
            judge_model = st.text_input("Model Name", placeholder="e.g., my-model")
    elif judge_provider.startswith("OpenAI"):
        with col1:
            api_key = st.text_input("OpenAI API Key", type="password", key="openai_key_input")
            if api_key:
                os.environ["OPENAI_API_KEY"] = api_key
        with col2:
            judge_model = st.selectbox("Judge Model", ["gpt-4o", "gpt-4-turbo", "gpt-3.5-turbo"])
    elif use_anthropic:
        with col1:
            api_key = st.text_input("Anthropic API Key", type="password", key="anthropic_key_input")
            if api_key:
                os.environ["ANTHROPIC_API_KEY"] = api_key
        with col2:
            judge_model = st.selectbox("Judge Model", ["claude-3-5-sonnet-20241022", "claude-3-opus-20240229", "claude-3-sonnet-20240229"])
    else:  # Groq, served through its OpenAI-compatible endpoint
        with col1:
            api_key = st.text_input("Groq API Key", type="password", key="groq_key_input")
            if api_key:
                os.environ["GROQ_API_KEY"] = api_key
        with col2:
            judge_model = st.selectbox("Judge Model", ["llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it"])
        base_url = "https://api.groq.com/openai/v1"
    st.markdown("---")

    # Evaluation data.
    st.markdown("### πŸ“Š Evaluation Data")
    # 1. Use data from training (if available).
    if st.session_state.model_path and "finetuned_outputs" in str(st.session_state.model_path):
        st.info(f"Using results from training: `{st.session_state.model_path}`")
        try:
            st.session_state['eval_data'] = pd.read_json(st.session_state.model_path, lines=True)
        except Exception as e:
            # Best effort: the page stays usable, but the user is told why
            # no data appeared.
            st.warning(f"Could not load results from training: {e}")
    # 2. Upload new data (takes precedence over the training results).
    eval_upload = st.file_uploader("Upload JSONL (Must contain: 'instruction', 'base_output', 'finetuned_output')",
                                   type=['jsonl', 'json'], key="eval_uploader")
    if eval_upload:
        try:
            uploaded_df = pd.read_json(eval_upload, lines=eval_upload.name.endswith('.jsonl'))
            if all(col in uploaded_df.columns for col in required_cols):
                st.session_state['eval_data'] = uploaded_df
                st.success(f"βœ… Loaded {len(uploaded_df)} samples")
            else:
                st.error(f"❌ Missing columns! Found: {list(uploaded_df.columns)}. Required: {required_cols}")
        except Exception as e:
            st.error(f"Error loading file: {e}")
    # Show preview.
    if st.session_state.get('eval_data') is not None:
        with st.expander("πŸ‘οΈ View Data Preview"):
            st.dataframe(st.session_state['eval_data'].head(3), use_container_width=True)
    st.markdown("---")

    # Run evaluation.
    if st.button("πŸš€ Run Dynamic Evaluation", type="primary", use_container_width=True):
        if not api_key:
            st.error("❌ Please provide an API Key above!")
            return
        if not judge_model:
            st.error("❌ Please provide a judge model name above!")
            return
        df = st.session_state.get('eval_data')
        if df is None:
            st.error("❌ No evaluation data loaded!")
            return
        # Data loaded from the training page is not validated on load, so the
        # schema is checked here before any API call is made.
        if not all(col in df.columns for col in required_cols):
            st.error(f"❌ Missing columns! Found: {list(df.columns)}. Required: {required_cols}")
            return

        st.session_state.pipeline_status['evaluation'] = 'running'
        progress_bar = st.progress(0)
        status_text = st.empty()
        results = []
        total = len(df)
        JUDGE_PROMPT = """You are an expert evaluator comparing two AI responses.
Query: {prompt}
Response A (Base Model):
{response_a}
Response B (Fine-tuned Model):
{response_b}
Compare them on: Accuracy, Helpfulness, Clarity.
Return a valid JSON object ONLY:
{{
"winner": "A" or "B" or "TIE",
"score_a": <1-10>,
"score_b": <1-10>,
"reasoning": "short explanation",
"accuracy": {{"A": <1-10>, "B": <1-10>}},
"helpfulness": {{"A": <1-10>, "B": <1-10>}},
"clarity": {{"A": <1-10>, "B": <1-10>}}
}}
"""
        try:
            # Initialize the client; the SDKs are imported lazily so only the
            # selected provider's package has to be installed.
            if use_anthropic:
                from anthropic import Anthropic
                client = Anthropic(api_key=api_key)
            else:
                from openai import OpenAI
                # An empty custom URL falls back to the SDK default.
                client = OpenAI(api_key=api_key, base_url=base_url or None)

            # Count rows by position: the DataFrame index labels are not
            # guaranteed to be 0..n-1 integers.
            for position, (_, row) in enumerate(df.iterrows(), start=1):
                status_text.text(f"Evaluating sample {position}/{total}...")
                prompt_text = JUDGE_PROMPT.format(
                    prompt=row['instruction'],
                    response_a=row['base_output'],
                    response_b=row['finetuned_output']
                )
                # Call the judge.
                if use_anthropic:
                    resp = client.messages.create(
                        model=judge_model, max_tokens=1000,
                        messages=[{"role": "user", "content": prompt_text}]
                    ).content[0].text
                else:
                    resp = client.chat.completions.create(
                        model=judge_model, max_tokens=1000,
                        messages=[{"role": "user", "content": prompt_text}],
                        response_format={"type": "json_object"}
                    ).choices[0].message.content
                results.append(_parse_judge_verdict(resp, row['instruction']))
                progress_bar.progress(position / total)

            st.session_state.eval_results = results
            st.session_state.pipeline_status['evaluation'] = 'complete'
            status_text.text("βœ… Evaluation Complete!")
        except Exception as e:
            # Page-level boundary: any client/API failure aborts the run.
            st.error(f"Evaluation Failed: {str(e)}")
            st.session_state.pipeline_status['evaluation'] = 'error'

    # Display results (also on later reruns, from session state).
    if st.session_state.get('eval_results'):
        results = st.session_state.eval_results
        df_res = pd.DataFrame(results)
        # Guarantee the columns used below even if the judge omitted a field.
        for col in ('instruction', 'winner', 'score_a', 'score_b', 'reasoning'):
            if col not in df_res.columns:
                df_res[col] = None
        # Judges sometimes return scores as strings; non-numeric values
        # become NaN and are ignored by mean().
        score_a = pd.to_numeric(df_res['score_a'], errors='coerce')
        score_b = pd.to_numeric(df_res['score_b'], errors='coerce')

        # Metrics.
        wins_b = len(df_res[df_res['winner'] == 'B'])
        wins_a = len(df_res[df_res['winner'] == 'A'])
        ties = len(df_res[df_res['winner'] == 'TIE'])
        win_rate = (wins_b / len(df_res)) * 100
        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Fine-tuned Win Rate", f"{win_rate:.1f}%")
        col2.metric("Fine-Tuned Wins", wins_b)
        col3.metric("Base Model Wins", wins_a)
        col4.metric("Avg Score Improvement", f"{score_b.mean() - score_a.mean():.2f}")

        # Charts.
        c1, c2 = st.columns(2)
        with c1:
            fig = px.pie(values=[wins_b, wins_a, ties], names=['Fine-tuned', 'Base', 'Ties'],
                         title="Win Distribution", color_discrete_sequence=['#6366f1', '#ef4444', '#94a3b8'])
            st.plotly_chart(fig, use_container_width=True)
        with c2:
            avg_scores = pd.DataFrame({
                'Model': ['Base', 'Fine-tuned'],
                'Score': [score_a.mean(), score_b.mean()]
            })
            fig2 = px.bar(avg_scores, x='Model', y='Score', color='Model',
                          title="Average Overall Score", color_discrete_map={'Base': '#ef4444', 'Fine-tuned': '#6366f1'})
            st.plotly_chart(fig2, use_container_width=True)

        # Detailed table.
        st.markdown("### πŸ“ Detailed Verdicts")
        st.dataframe(df_res[['instruction', 'winner', 'score_a', 'score_b', 'reasoning']], use_container_width=True)

        # Download. This uses the module-level ``json`` import: importing
        # json inside this function would make the name local, and it would
        # be unbound on reruns that only display stored results.
        # ``default=str`` keeps the report downloadable if a value is not a
        # plain JSON type.
        st.download_button("⬇️ Download Report (JSON)",
                           data=json.dumps(results, indent=2, default=str),
                           file_name="evaluation_report.json",
                           mime="application/json")


def _parse_judge_verdict(resp, instruction):
    """Convert the judge's raw reply into a verdict dict tagged with *instruction*.

    Markdown code fences around the JSON are stripped first. The ``winner``
    value is upper-cased (defaulting to ``"TIE"`` when absent) so it matches
    the ``"A"`` / ``"B"`` / ``"TIE"`` comparisons used for the metrics. If the
    reply cannot be parsed into a JSON object, a neutral TIE verdict is
    returned instead.
    """
    try:
        text = resp
        # Clean the JSON string if the model wrapped it in code fences.
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0]
        if "```" in text:
            text = text.split("```")[1]
        data = json.loads(text.strip())
        data['instruction'] = instruction
        data['winner'] = str(data.get('winner', 'TIE')).strip().upper()
        return data
    except Exception as e:
        print(f"Parse error: {e}")
        return {"winner": "TIE", "score_a": 5, "score_b": 5,
                "reasoning": "Error parsing judge response",
                "instruction": instruction}
# ============================================================================
# PAGE: DEPLOYMENT
# ============================================================================
def render_deploy():
    """Render the "Model Deployment" page.

    Lists the sub-directories of ``./output/models`` as selectable models,
    then shows two deployment options side by side (a command line for a
    local FastAPI server, and a HuggingFace Hub push form), followed by a
    static description of the served API with an example request.
    """
    st.markdown('<p class="gradient-header">🌐 Model Deployment</p>', unsafe_allow_html=True)

    # Model selection: stays None unless a model directory can be picked.
    st.markdown("### πŸ“¦ Select Model")
    models_root = Path("./output/models")
    selected_model = None
    if not models_root.exists():
        st.warning("Models directory not found.")
    else:
        available = [entry.name for entry in models_root.iterdir() if entry.is_dir()]
        if not available:
            st.warning("No trained models found.")
        else:
            selected_model = st.selectbox("Trained Models", available)
            model_path = models_root / selected_model
            st.info(f"πŸ“‚ Model path: `{model_path}`")
    st.markdown("---")

    # Deployment options.
    st.markdown("### πŸš€ Deployment Options")
    nothing_selected = not selected_model
    server_col, hub_col = st.columns(2)

    with server_col:
        st.markdown("""
<div class="info-card">
<h4>πŸ–₯️ Local FastAPI Server</h4>
<p>Deploy as a REST API on your local machine.</p>
</div>
""", unsafe_allow_html=True)
        port = st.number_input("Port", value=8000, min_value=1000, max_value=65535)
        # The server is not started from here; the user is only shown the
        # command to run.
        if st.button("πŸš€ Start Server", disabled=nothing_selected):
            st.code(f"python scripts/deploy.py --model ./output/models/{selected_model} --port {port}")
            st.info("Run the command above in your terminal to start the server.")

    with hub_col:
        st.markdown("""
<div class="info-card">
<h4>☁️ HuggingFace Hub</h4>
<p>Push your model to HuggingFace for sharing.</p>
</div>
""", unsafe_allow_html=True)
        hf_token = st.text_input("HuggingFace Token", type="password")
        default_repo = f"my-finetuned-{selected_model}" if selected_model else ""
        repo_name = st.text_input("Repository Name", value=default_repo)
        # Needs both a model and a token; the click only shows a message.
        if st.button("☁️ Push to Hub", disabled=not (selected_model and hf_token)):
            st.info("Pushing to HuggingFace Hub...")
    st.markdown("---")

    # API documentation.
    st.markdown("### πŸ“š API Documentation")
    st.markdown("""
Once deployed, your API will have these endpoints:
| Endpoint | Method | Description |
|----------|--------|-------------|
| `/` | GET | API info |
| `/health` | GET | Health check |
| `/generate` | POST | Generate text |
| `/generate/batch` | POST | Batch generation |
""")
    with st.expander("πŸ“ Example Request"):
        st.code("""
import requests
response = requests.post("http://localhost:8000/generate", json={
"prompt": "What are the symptoms of the common cold?",
"max_tokens": 256,
"temperature": 0.7
})
print(response.json()["generated_text"])
""", language="python")
# ============================================================================
# MAIN ROUTER
# ============================================================================
def main():
    """Dispatch to the renderer for ``st.session_state.current_page``.

    Any page key without a registered renderer falls back to the home page.
    """
    renderers = {
        'home': render_home,
        'data': render_data_upload,
        'process': render_processing,
        'training': render_training,
        'evaluation': render_evaluation,
        'deploy': render_deploy,
    }
    render_page = renderers.get(st.session_state.current_page, render_home)
    render_page()


if __name__ == "__main__":
    main()