Spaces:

aneeb15
/

Auto-FineTune-Ops

Configuration error

App Files Files Community

Auto-FineTune-Ops / app.py

aneeb15

feat: implement dynamic evaluation page (remove static charts)

9ec2fac 4 months ago

raw

history blame contribute delete

70.1 kB

	"""
	Auto-FineTune-Ops: Streamlit Dashboard
	======================================
	Premium interactive dashboard for ML fine-tuning pipeline.
	"""

	import streamlit as st
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	from pathlib import Path
	import sys
	import os
	import json
	import time
	from datetime import datetime

	# Add project root to path so sibling packages can be imported.
	# The insert is guarded: whenever this module is executed again (Streamlit
	# re-runs the script on each interaction) an unconditional insert would keep
	# stacking duplicate entries onto sys.path.
	_PROJECT_ROOT = str(Path(__file__).parent)
	if _PROJECT_ROOT not in sys.path:
	    sys.path.insert(0, _PROJECT_ROOT)

	# Page configuration: collect the options in one mapping, then hand them
	# to Streamlit in a single call.
	_PAGE_OPTIONS = {
	    "page_title": "Auto-FineTune-Ops",
	    "page_icon": "🤖",
	    "layout": "wide",
	    "initial_sidebar_state": "expanded",
	}
	st.set_page_config(**_PAGE_OPTIONS)

	# Premium CSS styling.
	# The stylesheet lives in a named constant so the injection call stays short;
	# the markup itself is unchanged.
	_DASHBOARD_CSS = """
	<style>
	/* Main container */
	.main .block-container {
	padding-top: 2rem;
	padding-bottom: 2rem;
	}

	/* Cards */
	.stMetric {
	background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
	padding: 1rem;
	border-radius: 12px;
	border: 1px solid rgba(99, 102, 241, 0.2);
	box-shadow: 0 4px 20px rgba(0, 0, 0, 0.3);
	}

	/* Gradient headers */
	.gradient-header {
	background: linear-gradient(90deg, #6366f1, #8b5cf6, #a855f7);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	font-size: 2.5rem;
	font-weight: 700;
	margin-bottom: 1rem;
	}

	/* Info cards */
	.info-card {
	background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
	padding: 1.5rem;
	border-radius: 16px;
	border: 1px solid rgba(99, 102, 241, 0.3);
	margin: 1rem 0;
	}

	/* Success badge */
	.success-badge {
	background: linear-gradient(90deg, #10b981, #059669);
	color: white;
	padding: 0.5rem 1rem;
	border-radius: 20px;
	font-weight: 600;
	display: inline-block;
	}

	/* Warning badge */
	.warning-badge {
	background: linear-gradient(90deg, #f59e0b, #d97706);
	color: white;
	padding: 0.5rem 1rem;
	border-radius: 20px;
	font-weight: 600;
	display: inline-block;
	}

	/* Sidebar styling */
	section[data-testid="stSidebar"] {
	background: linear-gradient(180deg, #0f0f23 0%, #1a1a2e 100%);
	}

	/* Button styling */
	.stButton > button {
	background: linear-gradient(90deg, #6366f1, #8b5cf6);
	color: white;
	border: none;
	border-radius: 8px;
	padding: 0.5rem 2rem;
	font-weight: 600;
	transition: all 0.3s ease;
	}

	.stButton > button:hover {
	transform: translateY(-2px);
	box-shadow: 0 4px 20px rgba(99, 102, 241, 0.4);
	}

	/* Progress bar */
	.stProgress > div > div {
	background: linear-gradient(90deg, #6366f1, #8b5cf6, #a855f7);
	}

	/* Tab styling */
	.stTabs [data-baseweb="tab-list"] {
	gap: 8px;
	}

	.stTabs [data-baseweb="tab"] {
	background: rgba(99, 102, 241, 0.1);
	border-radius: 8px;
	padding: 0.5rem 1rem;
	}

	.stTabs [aria-selected="true"] {
	background: linear-gradient(90deg, #6366f1, #8b5cf6);
	}
	</style>
	"""
	st.markdown(_DASHBOARD_CSS, unsafe_allow_html=True)

	# Initialize session state: every key listed here is created with its
	# default value only when the session does not already hold it.
	_SESSION_DEFAULTS = {
	    'current_page': 'home',
	    'uploaded_data': None,
	    'processed_data_path': None,
	    'model_path': None,
	    'training_goal': None,
	    'pipeline_status': {
	        'data': 'pending',
	        'training': 'pending',
	        'evaluation': 'pending',
	        'deployment': 'pending'
	    },
	}
	for _state_key, _state_default in _SESSION_DEFAULTS.items():
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _state_default

	# Sidebar navigation: brand header, one button per page, then a live
	# summary of the pipeline stages.
	with st.sidebar:
	    st.markdown('<p class="gradient-header" style="font-size: 1.5rem;">🤖 Auto-FineTune-Ops</p>', unsafe_allow_html=True)
	    st.markdown("---")

	    # Page key -> (icon, label shown on the button)
	    pages = {
	        'home': ('🏠', 'Dashboard'),
	        'data': ('📊', 'Data Upload'),
	        'process': ('🧹', 'Processing'),
	        'training': ('🚀', 'Training'),
	        'evaluation': ('⚖️', 'Evaluation'),
	        'deploy': ('🌐', 'Deploy')
	    }

	    for key, (icon, label) in pages.items():
	        clicked = st.button(f"{icon} {label}", key=f"nav_{key}", use_container_width=True)
	        if clicked:
	            # Remember the chosen page in the session.
	            st.session_state.current_page = key

	    st.markdown("---")

	    # Pipeline status: unknown states fall back to the hourglass icon.
	    st.markdown("### 📋 Pipeline Status")
	    status_icons = {'pending': '⏳', 'running': '🔄', 'complete': '✅', 'error': '❌'}
	    for stage, status in st.session_state.pipeline_status.items():
	        stage_icon = status_icons.get(status, '⏳')
	        st.markdown(f"{stage_icon} {stage.title()}: {status}")

	    st.markdown("---")
	    st.markdown("Built with ❤️ using Streamlit")


	# ============================================================================
	# PAGE: HOME DASHBOARD
	# ============================================================================
	def _mtime_or_zero(path):
	    """Return the modification time of ``path``, or 0.0 if it cannot be read."""
	    try:
	        return path.stat().st_mtime
	    except OSError:
	        # e.g. a broken symlink or a file removed while listing
	        return 0.0


	def _render_recent_files(directory, pattern, icon, empty_message, missing_message, limit=5):
	    """List up to ``limit`` entries of ``directory`` matching ``pattern``, newest first.

	    Shows ``missing_message`` when the directory does not exist and
	    ``empty_message`` when nothing matches the glob pattern.
	    """
	    if not directory.exists():
	        st.info(missing_message)
	        return
	    # glob() yields entries in arbitrary order; sort so the truncated list
	    # really contains the most recently modified entries.
	    entries = sorted(directory.glob(pattern), key=_mtime_or_zero, reverse=True)
	    if not entries:
	        st.info(empty_message)
	        return
	    for entry in entries[:limit]:
	        st.markdown(f"- {icon} `{entry.name}`")


	def render_home():
	    """Render the home dashboard.

	    Shows one status card for the dataset and one per pipeline stage (read
	    from ``st.session_state``), the four-step quick start guide, and the most
	    recent models, reports and logs found under ``./output``.
	    """
	    st.markdown('<p class="gradient-header">🏠 Pipeline Dashboard</p>', unsafe_allow_html=True)
	    st.markdown("One-click autonomous ML fine-tuning pipeline")

	    # Status cards
	    pipeline_status = st.session_state.pipeline_status
	    has_dataset = st.session_state.uploaded_data is not None
	    col1, col2, col3, col4 = st.columns(4)

	    with col1:
	        st.metric(
	            label="📊 Dataset",
	            value="Ready" if has_dataset else "Not Loaded",
	            delta="Uploaded" if has_dataset else None
	        )

	    # The remaining cards all follow the same pattern: stage name as value,
	    # "Complete" as delta once the stage has finished.
	    stage_cards = (
	        (col2, "🧹 Processing", 'data'),
	        (col3, "🚀 Training", 'training'),
	        (col4, "⚖️ Evaluation", 'evaluation'),
	    )
	    for column, card_label, stage in stage_cards:
	        with column:
	            st.metric(
	                label=card_label,
	                value=pipeline_status[stage].title(),
	                delta="Complete" if pipeline_status[stage] == 'complete' else None
	            )

	    st.markdown("---")

	    # Quick start guide
	    st.markdown("### 🚀 Quick Start Guide")

	    col1, col2 = st.columns(2)

	    with col1:
	        st.markdown("""
	<div class="info-card">
	<h4>📊 Step 1: Upload Data</h4>
	<p>Upload your CSV/JSON dataset with instruction-response pairs.</p>
	</div>
	""", unsafe_allow_html=True)

	        st.markdown("""
	<div class="info-card">
	<h4>🧹 Step 2: Process Data</h4>
	<p>The DataArchitectAgent will clean and format your data.</p>
	</div>
	""", unsafe_allow_html=True)

	    with col2:
	        st.markdown("""
	<div class="info-card">
	<h4>🚀 Step 3: Train Model</h4>
	<p>Fine-tune with auto-configured hyperparameters.</p>
	</div>
	""", unsafe_allow_html=True)

	        st.markdown("""
	<div class="info-card">
	<h4>⚖️ Step 4: Evaluate</h4>
	<p>Run Model Arena with LLM-as-Judge evaluation.</p>
	</div>
	""", unsafe_allow_html=True)

	    # Recent output files
	    st.markdown("---")
	    st.markdown("### 📁 Output Files")

	    output_dir = Path("./output")
	    if not output_dir.exists():
	        st.info("Output directory will be created when you run the pipeline.")
	        return

	    tabs = st.tabs(["📂 Models", "📊 Reports", "📝 Logs"])

	    with tabs[0]:
	        _render_recent_files(output_dir / "models", "*", "🤖",
	                             "No trained models yet.", "Models directory not found.")

	    with tabs[1]:
	        _render_recent_files(output_dir / "reports", "*.json", "📊",
	                             "No evaluation reports yet.", "Reports directory not found.")

	    with tabs[2]:
	        _render_recent_files(output_dir / "logs", "*.yaml", "📝",
	                             "No log files yet.", "Logs directory not found.")


	# ============================================================================
	# PAGE: DATA UPLOAD
	# ============================================================================
	def render_data_upload():
	    """Render the "Data Upload" page: load, merge, preview and export a dataset.

	    Reads and writes the ``st.session_state`` keys ``uploaded_data``,
	    ``uploaded_filename``, ``processed_data_path``, ``pipeline_status``,
	    ``show_add_file`` and ``upload_counter``. Returns nothing.
	    """
	    import html  # local import: only used to escape column names in raw HTML

	    st.markdown('<p class="gradient-header">📊 Data Upload & Preview</p>', unsafe_allow_html=True)

	    # ── File Management Bar ──
	    if st.session_state.uploaded_data is not None:
	        fm1, fm2, fm3 = st.columns([3, 1, 1])
	        with fm1:
	            # "or" also covers a key that exists but holds None
	            current_name = st.session_state.get('uploaded_filename') or 'dataset'
	            st.info(f"📂 Currently loaded: {current_name} ({len(st.session_state.uploaded_data):,} rows)")
	        with fm2:
	            if st.button("🗑️ Remove Dataset", type="secondary"):
	                st.session_state.uploaded_data = None
	                st.session_state.uploaded_filename = None
	                st.session_state.processed_data_path = None
	                st.session_state.pipeline_status['data'] = 'pending'
	                st.rerun()
	        with fm3:
	            if st.button("📎 Add More Data"):
	                st.session_state['show_add_file'] = True

	    # ── File Uploader ──
	    show_uploader = (st.session_state.uploaded_data is None) or st.session_state.get('show_add_file', False)

	    if show_uploader:
	        if st.session_state.uploaded_data is None:
	            upload_label = "Upload your dataset (CSV, JSON, or JSONL)"
	        else:
	            upload_label = "Upload additional file to merge with current dataset"
	        uploaded_file = st.file_uploader(
	            upload_label,
	            type=['csv', 'json', 'jsonl'],
	            help="Your dataset should contain instruction-response pairs.",
	            # A new key per accepted upload gives a fresh, empty uploader widget.
	            key=f"uploader_{st.session_state.get('upload_counter', 0)}"
	        )

	        if uploaded_file:
	            try:
	                # Compare the extension case-insensitively so "DATA.CSV"
	                # is not mistaken for JSON.
	                lowered_name = uploaded_file.name.lower()
	                if lowered_name.endswith('.csv'):
	                    new_df = pd.read_csv(uploaded_file)
	                elif lowered_name.endswith('.jsonl'):
	                    new_df = pd.read_json(uploaded_file, lines=True)
	                else:
	                    new_df = pd.read_json(uploaded_file)

	                # Merge or replace
	                if st.session_state.uploaded_data is not None and st.session_state.get('show_add_file', False):
	                    existing_df = st.session_state.uploaded_data
	                    # pd.concat aligns columns by label, so the same columns
	                    # in a different order can be merged as well.
	                    same_columns = (
	                        len(new_df.columns) == len(existing_df.columns)
	                        and set(new_df.columns) == set(existing_df.columns)
	                    )
	                    if same_columns:
	                        st.session_state.uploaded_data = pd.concat([existing_df, new_df], ignore_index=True)
	                        previous_name = st.session_state.get('uploaded_filename') or 'data'
	                        st.session_state.uploaded_filename = f"{previous_name} + {uploaded_file.name}"
	                        st.success(f"✅ Merged {uploaded_file.name} ({len(new_df):,} rows) → Total: {len(st.session_state.uploaded_data):,} rows")
	                    else:
	                        st.error(f"❌ Column mismatch! Existing: {list(existing_df.columns)} vs New: {list(new_df.columns)}")
	                else:
	                    st.session_state.uploaded_data = new_df
	                    st.session_state.uploaded_filename = uploaded_file.name
	                    st.success(f"✅ Successfully loaded {uploaded_file.name}")

	                st.session_state['show_add_file'] = False
	                st.session_state['upload_counter'] = st.session_state.get('upload_counter', 0) + 1

	            except Exception as e:
	                # Page-level boundary: report any parsing problem to the user.
	                st.error(f"Error loading file: {str(e)}")

	    # ── Data Display ──
	    if st.session_state.uploaded_data is None:
	        return

	    df = st.session_state.uploaded_data

	    # Dataset statistics
	    st.markdown("### 📈 Dataset Statistics")
	    col1, col2, col3, col4 = st.columns(4)
	    with col1:
	        st.metric("Total Rows", f"{len(df):,}")
	    with col2:
	        st.metric("Total Columns", len(df.columns))
	    with col3:
	        total_bytes = df.memory_usage(deep=True).sum()
	        st.metric("Memory Size", f"{total_bytes / 1024:.1f} KB")
	    with col4:
	        missing = int(df.isnull().sum().sum())
	        st.metric("Missing Values", missing)

	    st.markdown("---")

	    # Column detection
	    st.markdown("### 🔍 Auto-Detected Columns")
	    instruction_patterns = ['instruction', 'prompt', 'question', 'query', 'user', 'input_text']
	    output_patterns = ['output', 'response', 'answer', 'completion', 'assistant', 'target']

	    def first_matching_column(patterns):
	        """Return the first column whose lowercased name contains a pattern, else None."""
	        for col in df.columns:
	            # str(): column labels read from JSON may be integers
	            col_lower = str(col).lower()
	            if any(pattern in col_lower for pattern in patterns):
	                return col
	        return None

	    detected_instruction = first_matching_column(instruction_patterns)
	    detected_output = first_matching_column(output_patterns)

	    col1, col2 = st.columns(2)
	    with col1:
	        if detected_instruction is not None:
	            # Column names come from the uploaded file; escape them before
	            # embedding in markup rendered with unsafe_allow_html.
	            st.markdown(f'<span class="success-badge">Instruction: {html.escape(str(detected_instruction))}</span>', unsafe_allow_html=True)
	        else:
	            st.markdown('<span class="warning-badge">Instruction: Not detected</span>', unsafe_allow_html=True)
	    with col2:
	        if detected_output is not None:
	            st.markdown(f'<span class="success-badge">Output: {html.escape(str(detected_output))}</span>', unsafe_allow_html=True)
	        else:
	            st.markdown('<span class="warning-badge">Output: Not detected</span>', unsafe_allow_html=True)

	    st.markdown("---")

	    # Full data preview (scrollable)
	    st.markdown("### 👀 Complete Data Preview")
	    st.caption(f"Showing all {len(df):,} rows. Scroll to browse the full dataset.")
	    st.dataframe(df, use_container_width=True, height=450)

	    # Download raw data
	    st.markdown("### 📥 Download Dataset")
	    # Text before the last "." of the stored file name (or "dataset").
	    base_name = (st.session_state.get('uploaded_filename') or 'dataset').rsplit('.', 1)[0]
	    dl1, dl2 = st.columns(2)
	    with dl1:
	        csv_data = df.to_csv(index=False).encode('utf-8')
	        st.download_button("⬇️ Download as CSV", csv_data,
	                           file_name=f"{base_name}.csv",
	                           mime="text/csv")
	    with dl2:
	        json_data = df.to_json(orient='records', indent=2).encode('utf-8')
	        st.download_button("⬇️ Download as JSON", json_data,
	                           file_name=f"{base_name}.json",
	                           mime="application/json")

	    # Column summary
	    st.markdown("### 📋 Column Summary")
	    col_info = []
	    for col in df.columns:
	        series = df[col]
	        # A dataset with a header but no rows has no first value to sample.
	        first_value = str(series.iloc[0]) if len(series) > 0 else ''
	        sample = first_value[:80] + '...' if len(first_value) > 80 else first_value
	        col_info.append({
	            'Column': col,
	            'Type': str(series.dtype),
	            'Non-Null': series.notna().sum(),
	            'Unique': series.nunique(),
	            'Sample': sample
	        })
	    st.dataframe(pd.DataFrame(col_info), use_container_width=True)


	# ============================================================================
	# PAGE: DATA PROCESSING
	# ============================================================================
	def render_processing():
	    """Render the "Processing" page: configure and run the preprocessing pipeline.

	    Collects the settings of the ten option groups, builds a
	    ``PreprocessingConfig`` from them and, when the run button is pressed,
	    runs ``PreprocessingPipeline`` on the uploaded dataset, exports the
	    formatted train/validation JSONL files under ``./output/processed_data``
	    and shows logs, a preview and download buttons.

	    Reads ``uploaded_data`` and writes ``training_goal``, ``safe_preset``,
	    ``processed_data_path``, ``current_page`` and ``pipeline_status['data']``
	    in ``st.session_state``. Returns nothing.
	    """
	    from itertools import islice  # local import: used for the 5-line preview

	    st.markdown('<p class="gradient-header">🧹 Advanced Data Processing</p>', unsafe_allow_html=True)

	    if st.session_state.uploaded_data is None:
	        st.warning("⚠️ Please upload a dataset first!")
	        if st.button("📊 Go to Data Upload"):
	            st.session_state.current_page = 'data'
	            st.rerun()
	        return

	    df = st.session_state.uploaded_data

	    # ── Dataset Stats Header ──
	    st.markdown("### 📈 Dataset Statistics")
	    sc1, sc2, sc3, sc4 = st.columns(4)
	    with sc1:
	        st.metric("Total Rows", f"{len(df):,}")
	    with sc2:
	        st.metric("Columns", len(df.columns))
	    with sc3:
	        # Average length is taken from the first column only; guard against
	        # frames without rows or without columns.
	        has_text = len(df) > 0 and len(df.columns) > 0
	        avg_len = int(df.iloc[:, 0].astype(str).str.len().mean()) if has_text else 0
	        st.metric("Avg Text Length", f"{avg_len:,} chars")
	    with sc4:
	        # Rough estimate: total characters divided by 4.
	        est_tokens = int(avg_len * len(df) / 4) if avg_len > 0 else 0
	        st.metric("Est. Total Tokens", f"{est_tokens:,}")

	    st.markdown("---")

	    # ── Training Goal ──
	    goal = st.text_input(
	        "Training Goal",
	        value=st.session_state.training_goal or "assistant",
	        help="e.g., medical_assistant, customer_support, code_helper"
	    )
	    st.session_state.training_goal = goal
	    # The goal becomes part of the output file names; neutralise path
	    # separators so it cannot point outside the output directory.
	    safe_goal = goal.replace('/', '_').replace('\\', '_')

	    # ── Column Mapping ──
	    st.markdown("### 🎯 Column Mapping")
	    instruction_patterns = ['instruction', 'prompt', 'question', 'query', 'user', 'input_text', 'human']
	    output_patterns = ['output', 'response', 'answer', 'completion', 'assistant', 'target']
	    input_patterns = ['context', 'input', 'background', 'reference']

	    available_columns = list(df.columns)

	    def first_matching_column(patterns):
	        """Return the first column whose lowercased name contains a pattern, else None."""
	        for col in available_columns:
	            # str(): column labels read from JSON may be integers
	            col_lower = str(col).lower()
	            if any(p in col_lower for p in patterns):
	                return col
	        return None

	    detected_instruction = first_matching_column(instruction_patterns)
	    detected_output = first_matching_column(output_patterns)
	    detected_input = first_matching_column(input_patterns)

	    mc1, mc2, mc3 = st.columns(3)
	    with mc1:
	        instruction_col = st.selectbox("Instruction Column *", options=available_columns,
	                                       index=available_columns.index(detected_instruction) if detected_instruction is not None else 0,
	                                       help="Column containing instructions/prompts/questions")
	    with mc2:
	        output_col = st.selectbox("Output Column *", options=available_columns,
	                                  index=available_columns.index(detected_output) if detected_output is not None else (1 if len(available_columns) > 1 else 0),
	                                  help="Column containing responses/answers/outputs")
	    with mc3:
	        input_col_options = ["None"] + available_columns
	        default_input_idx = input_col_options.index(detected_input) if detected_input is not None else 0
	        input_col_selection = st.selectbox("Input/Context Column (Optional)", options=input_col_options,
	                                           index=default_input_idx, help="Optional column containing additional context")
	    input_col = None if input_col_selection == "None" else input_col_selection

	    st.markdown("---")

	    # ── Safe Preset Button ──
	    if st.button("🛡️ Load Safe Preset", help="Apply recommended defaults for most datasets"):
	        st.session_state['safe_preset'] = True
	        st.rerun()

	    # When set, several widgets below start from their "safe" default value.
	    use_safe = st.session_state.get('safe_preset', False)

	    # ====================================================================
	    # 1️⃣ Text Cleaning Controls
	    # ====================================================================
	    with st.expander("1️⃣ Text Cleaning Controls", expanded=False):
	        tc1, tc2 = st.columns(2)
	        with tc1:
	            clean_html = st.checkbox("Remove HTML Tags", value=use_safe, help="Strip all HTML/XML tags from text")
	            clean_urls = st.checkbox("Remove URLs", value=use_safe, help="Remove http/https/www links")
	            clean_emojis = st.checkbox("Remove Emojis", value=False, help="Strip emoji characters")
	            clean_whitespace = st.checkbox("Normalize Whitespace", value=True, help="Collapse multiple spaces/tabs into one")
	        with tc2:
	            clean_lowercase = st.checkbox("Lowercase All Text", value=False, help="Convert text to lowercase (disable to preserve case)")
	            clean_special = st.checkbox("Remove Special Characters", value=False, help="Keep only alphanumeric + basic punctuation")
	            clean_linebreaks = st.checkbox("Strip Extra Line Breaks", value=True, help="Reduce 3+ newlines to double newlines")

	    # ====================================================================
	    # 2️⃣ Tokenization Controls
	    # ====================================================================
	    with st.expander("2️⃣ Tokenization Controls", expanded=False):
	        tk1, tk2 = st.columns(2)
	        with tk1:
	            tokenizer_choice = st.selectbox("Tokenizer", ["tiktoken", "HuggingFace"],
	                                            help="tiktoken = OpenAI-compatible, HuggingFace = model-specific tokenizer")
	            if tokenizer_choice == "HuggingFace":
	                hf_model_name = st.text_input("HF Model Name", value="meta-llama/Llama-3-8b",
	                                              help="HuggingFace model name for tokenizer")
	            else:
	                hf_model_name = ""
	            max_total_tokens = st.slider("Max Tokens per Sample", 128, 8192, 2048,
	                                         help="Maximum total tokens allowed per sample")
	        with tk2:
	            truncate_long = st.checkbox("Truncate Long Samples", value=False,
	                                        help="Cut text exceeding max tokens")
	            split_long = st.checkbox("Split Long Samples into Chunks", value=False,
	                                     help="Break long texts into overlapping chunks")
	            if split_long:
	                split_overlap = st.slider("Chunk Overlap Tokens", 0, 200, 50,
	                                          help="Number of overlapping tokens between chunks")
	            else:
	                split_overlap = 50

	        # Token stats preview (computed on the first 200 rows only)
	        if st.button("📊 Show Token Stats Preview", key="token_stats_btn"):
	            with st.spinner("Counting tokens..."):
	                try:
	                    from preprocessing.tokenization import TokenizationConfig, get_tokenizer, compute_token_stats
	                    tk_cfg = TokenizationConfig(
	                        tokenizer_name="tiktoken" if tokenizer_choice == "tiktoken" else hf_model_name,
	                    )
	                    tokenizer = get_tokenizer(tk_cfg)
	                    is_tiktoken = tokenizer_choice == "tiktoken"
	                    stats_cols = [c for c in [instruction_col, output_col] if c in df.columns]
	                    stats = compute_token_stats(df.head(200), stats_cols, tokenizer, is_tiktoken)
	                    for col_name, s in stats.items():
	                        st.markdown(f"{col_name}: min={s['min']}, max={s['max']}, mean={s['mean']}, p95={s['p95']}")
	                except Exception as e:
	                    st.warning(f"Could not compute token stats: {e}")

	    # ====================================================================
	    # 3️⃣ System Prompt Configuration
	    # ====================================================================
	    with st.expander("3️⃣ System Prompt Configuration", expanded=False):
	        # The default prompt is the same with and without the safe preset.
	        system_prompt_text = st.text_area("Global System Prompt",
	                                          value="You are a helpful AI assistant.",
	                                          height=100, help="System prompt prepended to every sample in chat format")
	        prepend_system = st.checkbox("Prepend System Prompt to All Samples", value=True,
	                                     help="Include this system prompt in all formatted entries")

	        if st.button("👁️ Preview Formatted Chat JSON", key="preview_chat_btn"):
	            try:
	                from preprocessing.system_prompt import preview_formatted_json
	                preview = preview_formatted_json(df, system_prompt_text, instruction_col, output_col, input_col, n=2)
	                st.code(preview, language="json")
	            except Exception as e:
	                st.warning(f"Preview error: {e}")

	    # ====================================================================
	    # 4️⃣ Dataset Balancing
	    # ====================================================================
	    with st.expander("4️⃣ Dataset Balancing (Classification)", expanded=False):
	        balance_enabled = st.checkbox("Enable Class Balancing", value=False,
	                                      help="Balance class distribution for classification tasks")
	        if balance_enabled:
	            label_col_options = available_columns
	            label_col = st.selectbox("Label Column", options=label_col_options,
	                                     help="Column containing class labels")
	            balance_strategy = st.radio("Strategy", ["none", "oversample", "undersample"],
	                                        help="Oversample = duplicate minority, Undersample = drop majority")

	            # Show distribution chart
	            if label_col in df.columns:
	                from preprocessing.dataset_balancing import compute_label_distribution
	                dist = compute_label_distribution(df, label_col)
	                if dist:
	                    fig = px.bar(x=list(dist.keys()), y=list(dist.values()),
	                                 labels={'x': 'Label', 'y': 'Count'}, title="Label Distribution")
	                    fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)',
	                                      font_color='#e2e8f0')
	                    st.plotly_chart(fig, use_container_width=True)
	        else:
	            label_col = None
	            balance_strategy = "none"

	    # ====================================================================
	    # 5️⃣ Quality Filters
	    # ====================================================================
	    with st.expander("5️⃣ Quality Filters", expanded=False):
	        qf1, qf2 = st.columns(2)
	        with qf1:
	            min_words = st.number_input("Min Word Count", min_value=0, value=3 if use_safe else 0,
	                                        help="Minimum words required per sample (0 = no filter)")
	            max_words = st.number_input("Max Word Count", min_value=0, value=0,
	                                        help="Maximum words allowed per sample (0 = no limit)")
	            profanity_filter = st.checkbox("Profanity Filter", value=False,
	                                           help="Remove samples containing profane language")
	        with qf2:
	            language_filter = st.checkbox("Language Detection Filter", value=False,
	                                          help="Keep only samples in specified languages")
	            if language_filter:
	                allowed_langs = st.text_input("Allowed Languages (comma-separated)", value="en",
	                                              help="ISO 639-1 codes, e.g. en,fr,de")
	            else:
	                allowed_langs = "en"
	            remove_low_quality = st.checkbox("Remove Low-Quality Responses", value=use_safe,
	                                             help="Remove short / generic / placeholder responses")

	    # ====================================================================
	    # 6️⃣ Deduplication Advanced
	    # ====================================================================
	    with st.expander("6️⃣ Deduplication", expanded=False):
	        dedup_exact = st.checkbox("Remove Exact Duplicates", value=True,
	                                  help="Remove rows with identical instruction text")
	        dedup_semantic = st.checkbox("Remove Semantic Duplicates", value=False,
	                                     help="Use TF-IDF cosine similarity to find near-duplicates")
	        if dedup_semantic:
	            semantic_threshold = st.slider("Similarity Threshold", 0.5, 1.0, 0.90, 0.01,
	                                           help="Cosine similarity above this threshold = duplicate (higher = stricter)")
	        else:
	            semantic_threshold = 0.90

	    # ====================================================================
	    # 7️⃣ Train / Validation Split
	    # ====================================================================
	    with st.expander("7️⃣ Train / Validation Split", expanded=False):
	        split_enabled = st.checkbox("Enable Train/Val Split", value=True,
	                                    help="Split dataset into training and validation sets")
	        if split_enabled:
	            train_ratio = st.slider("Train Ratio", 0.5, 0.95, 0.9 if use_safe else 0.8, 0.05,
	                                    help="Proportion of data used for training")
	            # round() instead of int(): (1 - 0.8) * 100 is 19.999... in
	            # floating point and would be displayed as 19%.
	            train_pct = round(train_ratio * 100)
	            val_pct = 100 - train_pct
	            st.markdown(f"Split: {train_pct}% Train / {val_pct}% Validation")
	            random_seed = st.number_input("Random Seed", min_value=0, value=42,
	                                          help="Seed for reproducible splits")
	            shuffle_data = st.checkbox("Shuffle Before Split", value=True,
	                                       help="Randomly shuffle data before splitting")
	        else:
	            train_ratio = 0.8
	            random_seed = 42
	            shuffle_data = True

	    # ====================================================================
	    # 8️⃣ Output Formatting
	    # ====================================================================
	    with st.expander("8️⃣ Output Formatting", expanded=False):
	        format_type = st.selectbox("Export Format", ["openai_chat", "completion", "classification", "custom"],
	                                   help="OpenAI Chat = messages format, Completion = prompt/completion, Classification = text/label")

	        custom_schema = {}
	        if format_type == "custom":
	            st.markdown("Define Custom Schema (output_key → source_column)")
	            num_fields = st.number_input("Number of Fields", 1, 10, 2)
	            for i in range(int(num_fields)):
	                fc1, fc2 = st.columns(2)
	                with fc1:
	                    key = st.text_input(f"Output Key {i+1}", value=f"field_{i+1}", key=f"ckey_{i}")
	                with fc2:
	                    val = st.selectbox(f"Source Column {i+1}", options=available_columns, key=f"cval_{i}")
	                custom_schema[key] = val

	    # ====================================================================
	    # 9️⃣ Safety & PII Filtering
	    # ====================================================================
	    with st.expander("9️⃣ Safety & PII Filtering", expanded=False):
	        pii1, pii2 = st.columns(2)
	        with pii1:
	            pii_emails = st.checkbox("Detect & Mask Emails", value=use_safe,
	                                     help="Replace email addresses with [REDACTED]")
	            pii_phones = st.checkbox("Detect & Mask Phone Numbers", value=use_safe,
	                                     help="Replace phone numbers with [REDACTED]")
	            pii_ids = st.checkbox("Detect & Mask CNIC/SSN", value=use_safe,
	                                  help="Replace national ID / SSN patterns with [REDACTED]")
	        with pii2:
	            pii_keys = st.checkbox("Detect & Mask API Keys", value=use_safe,
	                                   help="Replace long hex/base64 strings that look like secrets")
	            pii_addresses = st.checkbox("Detect & Mask Addresses", value=False,
	                                        help="Replace street addresses and zip codes")

	    # ====================================================================
	    # 🔟 Augmentation (Optional)
	    # ====================================================================
	    with st.expander("🔟 Augmentation (Optional)", expanded=False):
	        aug_enabled = st.checkbox("Enable Data Augmentation", value=False,
	                                  help="Generate synthetic variations of existing samples")
	        if aug_enabled:
	            ag1, ag2 = st.columns(2)
	            with ag1:
	                aug_paraphrase = st.checkbox("Paraphrase Instructions", value=True,
	                                             help="Synonym-based paraphrasing of instructions")
	                aug_variations = st.checkbox("Generate Variations", value=False,
	                                             help="Minor text variations (punctuation, casing)")
	            with ag2:
	                aug_backtranslate = st.checkbox("Back Translation", value=False,
	                                                help="Simulate back-translation for diversity")
	                aug_tone = st.checkbox("Tone Rewriting", value=False,
	                                       help="Rewrite instructions in different tones")
	            aug_factor = st.slider("Augmentation Factor", 1, 5, 1,
	                                   help="Number of augmented copies per original sample")
	        else:
	            aug_paraphrase = aug_variations = aug_backtranslate = aug_tone = False
	            aug_factor = 1

	    st.markdown("---")

	    # ── Run Pipeline Button ──
	    if st.button("🚀 Run Advanced Processing Pipeline", type="primary", use_container_width=True):
	        st.session_state.pipeline_status['data'] = 'running'

	        with st.spinner("Running preprocessing pipeline..."):
	            progress_bar = st.progress(0)
	            status_text = st.empty()

	            try:
	                from preprocessing.pipeline import PreprocessingPipeline, PreprocessingConfig
	                from preprocessing.text_cleaning import TextCleaningConfig
	                from preprocessing.tokenization import TokenizationConfig
	                from preprocessing.system_prompt import SystemPromptConfig
	                from preprocessing.dataset_balancing import BalancingConfig
	                from preprocessing.quality_filters import QualityFilterConfig
	                from preprocessing.deduplication import DeduplicationConfig
	                from preprocessing.train_val_split import SplitConfig
	                from preprocessing.output_formatter import OutputFormatConfig, format_dataset, export_jsonl, generate_preview
	                from preprocessing.pii_filter import PIIFilterConfig
	                from preprocessing.augmentation import AugmentationConfig

	                # Drop empty entries such as the one produced by "en, ,fr" or
	                # a trailing comma.
	                allowed_languages = [lang.strip() for lang in allowed_langs.split(',') if lang.strip()]
	                active_label_col = label_col if balance_enabled else None

	                # Build config from UI values
	                config = PreprocessingConfig(
	                    instruction_col=instruction_col,
	                    output_col=output_col,
	                    input_col=input_col,
	                    label_col=active_label_col,
	                    text_cleaning=TextCleaningConfig(
	                        remove_html=clean_html, remove_urls=clean_urls,
	                        remove_emojis=clean_emojis, normalize_whitespace=clean_whitespace,
	                        lowercase=clean_lowercase, remove_special_chars=clean_special,
	                        strip_extra_linebreaks=clean_linebreaks,
	                    ),
	                    tokenization=TokenizationConfig(
	                        tokenizer_name="tiktoken" if tokenizer_choice == "tiktoken" else hf_model_name,
	                        max_total_tokens=max_total_tokens,
	                        truncate_long=truncate_long, split_long=split_long,
	                        split_overlap=split_overlap,
	                    ),
	                    system_prompt=SystemPromptConfig(
	                        system_prompt=system_prompt_text,
	                        prepend_to_all=prepend_system,
	                    ),
	                    balancing=BalancingConfig(
	                        enabled=balance_enabled,
	                        label_column=label_col if balance_enabled else "",
	                        strategy=balance_strategy if balance_enabled else "none",
	                    ),
	                    quality_filters=QualityFilterConfig(
	                        min_word_count=min_words, max_word_count=max_words,
	                        profanity_filter=profanity_filter,
	                        language_filter=language_filter,
	                        allowed_languages=allowed_languages,
	                        remove_low_quality=remove_low_quality,
	                    ),
	                    deduplication=DeduplicationConfig(
	                        remove_exact=dedup_exact, remove_semantic=dedup_semantic,
	                        semantic_threshold=semantic_threshold,
	                    ),
	                    split=SplitConfig(
	                        enabled=split_enabled, train_ratio=train_ratio,
	                        random_seed=int(random_seed), shuffle=shuffle_data,
	                    ),
	                    output_format=OutputFormatConfig(
	                        format_type=format_type, custom_schema=custom_schema,
	                    ),
	                    pii_filter=PIIFilterConfig(
	                        filter_emails=pii_emails, filter_phones=pii_phones,
	                        filter_id_numbers=pii_ids, filter_api_keys=pii_keys,
	                        filter_addresses=pii_addresses,
	                    ),
	                    augmentation=AugmentationConfig(
	                        enabled=aug_enabled, paraphrase=aug_paraphrase,
	                        generate_variations=aug_variations,
	                        back_translate=aug_backtranslate,
	                        tone_rewrite=aug_tone,
	                        augmentation_factor=aug_factor,
	                    ),
	                )

	                def progress_cb(stage_name, pct):
	                    """Mirror pipeline progress into the status line and progress bar."""
	                    status_text.text(f"⚙️ {stage_name}...")
	                    progress_bar.progress(min(pct, 100))

	                pipeline = PreprocessingPipeline(config)
	                train_df, val_df, logs = pipeline.run(df, progress_callback=progress_cb)
	                # Tolerate a pipeline that returns no validation frame at all.
	                val_count = len(val_df) if val_df is not None else 0

	                # Format output
	                sys_prompt = system_prompt_text if prepend_system else ""
	                formatted_data = format_dataset(
	                    train_df, config.output_format,
	                    system_prompt=sys_prompt,
	                    instruction_col=instruction_col,
	                    output_col=output_col,
	                    input_col=input_col,
	                    label_col=active_label_col,
	                )

	                # Export
	                output_dir = Path("./output/processed_data")
	                output_dir.mkdir(parents=True, exist_ok=True)
	                train_path = export_jsonl(formatted_data, str(output_dir / f"{safe_goal}_train.jsonl"))

	                val_path = None
	                if val_count > 0:
	                    val_formatted = format_dataset(
	                        val_df, config.output_format,
	                        system_prompt=sys_prompt,
	                        instruction_col=instruction_col,
	                        output_col=output_col,
	                        input_col=input_col,
	                        label_col=active_label_col,
	                    )
	                    val_path = export_jsonl(val_formatted, str(output_dir / f"{safe_goal}_val.jsonl"))

	                progress_bar.progress(100)
	                status_text.text("✅ Pipeline complete!")

	                st.session_state.processed_data_path = train_path
	                st.session_state.pipeline_status['data'] = 'complete'

	                # ── Results ──
	                st.success(f"✅ Training data saved to: `{train_path}`")
	                if val_path:
	                    st.success(f"✅ Validation data saved to: `{val_path}`")

	                # Stats
	                rc1, rc2, rc3, rc4 = st.columns(4)
	                with rc1:
	                    st.metric("Original Rows", f"{len(df):,}")
	                with rc2:
	                    st.metric("Train Samples", f"{len(train_df):,}")
	                with rc3:
	                    st.metric("Val Samples", f"{val_count:,}")
	                with rc4:
	                    # Clamped at zero: the result can hold more rows than the
	                    # input (e.g. with augmentation enabled).
	                    removed = len(df) - len(train_df) - val_count
	                    st.metric("Removed", f"{max(0, removed):,}")

	                # ── Pipeline Logs ──
	                st.markdown("### 📋 Pipeline Logs")
	                log_data = [
	                    {
	                        'Stage': log.stage,
	                        'Description': log.description,
	                        'Rows Before': log.rows_before,
	                        'Rows After': log.rows_after,
	                        'Delta': log.rows_delta,
	                        'Time (ms)': log.duration_ms,
	                    }
	                    for log in logs
	                ]
	                st.dataframe(pd.DataFrame(log_data), use_container_width=True)

	                # ── Preview ──
	                st.markdown("### 👁️ Output Preview")
	                preview_json = generate_preview(formatted_data, n=3)
	                st.code(preview_json, language="json")

	                # ── Download ──
	                st.markdown("### 📥 Download")
	                dl1, dl2 = st.columns(2)
	                with dl1:
	                    with open(train_path, 'r', encoding='utf-8') as f:
	                        st.download_button("⬇️ Download Train JSONL", f.read(),
	                                           file_name=f"{safe_goal}_train.jsonl", mime="application/jsonl")
	                with dl2:
	                    if val_path and Path(val_path).exists():
	                        with open(val_path, 'r', encoding='utf-8') as f:
	                            st.download_button("⬇️ Download Val JSONL", f.read(),
	                                               file_name=f"{safe_goal}_val.jsonl", mime="application/jsonl")

	            except Exception as e:
	                # Page-level boundary: flag the stage as failed and show the
	                # full traceback to the user.
	                st.session_state.pipeline_status['data'] = 'error'
	                st.error(f"❌ Pipeline Error: {str(e)}")
	                import traceback
	                st.code(traceback.format_exc())

	    # Show previously processed data
	    if st.session_state.processed_data_path:
	        st.markdown("---")
	        st.markdown("### 📂 Last Processed Data")
	        try:
	            processed_path = Path(st.session_state.processed_data_path)
	            if processed_path.exists():
	                with open(processed_path, encoding='utf-8') as f:
	                    # Read only the first five lines instead of the whole file.
	                    samples = [json.loads(line) for line in islice(f, 5) if line.strip()]
	                for i, sample in enumerate(samples):
	                    with st.expander(f"Sample {i+1}"):
	                        st.json(sample)
	        except Exception as e:
	            st.warning(f"Could not load preview: {e}")


	# ============================================================================
	# PAGE: TRAINING
	# ============================================================================
	def render_training():
	    """Render the "Model Training" page.

	    The page requires ``st.session_state.processed_data_path`` to be set;
	    without it only a shortcut back to the processing page is shown. When a
	    CUDA GPU is detected the local training form is displayed, otherwise the
	    user picks between the Colab notebook flow and uploading results that
	    were produced elsewhere. The upload section at the bottom is always
	    rendered, and on success it stores the result location in
	    ``st.session_state.model_path`` and marks the training stage complete.
	    """
	    st.markdown('<p class="gradient-header">🚀 Model Training</p>', unsafe_allow_html=True)

	    # Check prerequisites
	    if st.session_state.processed_data_path is None:
	        st.warning("⚠️ Please process your data first!")
	        if st.button("🧹 Go to Processing"):
	            st.session_state.current_page = 'process'
	            st.rerun()
	        return

	    # ── GPU Detection ──
	    has_gpu = False
	    try:
	        import torch
	        has_gpu = torch.cuda.is_available()
	        if has_gpu:
	            gpu_name = torch.cuda.get_device_name(0)
	            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
	            st.success(f"✅ GPU Available: {gpu_name} ({gpu_memory:.1f} GB)")
	    except Exception:
	        # Best effort: a missing torch install or a failing CUDA query both
	        # mean "treat this machine as CPU-only".
	        has_gpu = False

	    # ── Download Preprocessed Data (always available) ──
	    st.markdown("### 📥 Preprocessed Training Data")
	    processed_path = Path(st.session_state.processed_data_path)
	    # Bound before the existence check so the GPU auto-configuration below
	    # can always read it, even when the processed file has disappeared.
	    sample_count = 0
	    if processed_path.exists():
	        with open(processed_path, 'r', encoding='utf-8') as f:
	            processed_content = f.read()
	        dl1, dl2 = st.columns(2)
	        with dl1:
	            st.download_button("⬇️ Download Training JSONL", processed_content,
	                               file_name=processed_path.name, mime="application/jsonl")
	        with dl2:
	            # Check for validation file
	            val_path = processed_path.parent / processed_path.name.replace('_train', '_val')
	            # When the name has no '_train' the replace is a no-op; do not
	            # offer the training file itself as the validation download.
	            if val_path != processed_path and val_path.exists():
	                with open(val_path, 'r', encoding='utf-8') as f:
	                    st.download_button("⬇️ Download Validation JSONL", f.read(),
	                                       file_name=val_path.name, mime="application/jsonl")
	        # One JSONL record per non-blank line.
	        sample_count = sum(1 for line in processed_content.split('\n') if line.strip())
	        st.info(f"📊 Dataset: {sample_count:,} samples ready for training")
	    else:
	        st.warning("Processed data file not found.")

	    st.markdown("---")

	    # ====================================================================
	    # TWO PATHS: GPU Training OR Colab Notebook
	    # ====================================================================
	    if has_gpu:
	        training_mode = "gpu"
	    else:
	        training_mode = st.radio("🖥️ Select Training Mode", [
	            "☁️ Use Google Colab (Recommended – Free GPU)",
	            "📤 Upload Fine-Tuned Model (Already trained externally)"
	        ], help="No GPU detected on this machine. Choose how to proceed.")

	    # ====================================================================
	    # PATH A: GPU Training (local)
	    # ====================================================================
	    if training_mode == "gpu":
	        st.markdown("### ⚙️ Training Configuration")

	        col1, col2 = st.columns(2)
	        with col1:
	            model_source = st.radio("Model Source", ["Preset Models", "Custom HuggingFace Model"])
	            if model_source == "Preset Models":
	                base_model = st.selectbox("Base Model", [
	                    "unsloth/llama-3-8b-bnb-4bit",
	                    "unsloth/llama-3-70b-bnb-4bit",
	                    "unsloth/mistral-7b-bnb-4bit",
	                    "unsloth/gemma-7b-bnb-4bit",
	                ])
	            else:
	                base_model = st.text_input("HuggingFace Model ID",
	                                           value="unsloth/llama-3-8b-bnb-4bit",
	                                           help="Enter any HuggingFace model ID, e.g. 'meta-llama/Llama-3-8b', 'mistralai/Mistral-7B-v0.1'")
	            max_seq_length = st.slider("Max Sequence Length", 512, 4096, 2048)

	        with col2:
	            # Fall back to a nominal 1000 samples when the count is unknown.
	            dataset_size = sample_count if sample_count > 0 else 1000
	            if dataset_size < 1000:
	                auto_rank, auto_alpha, auto_lr, auto_epochs = 8, 16, 2e-4, 5
	                size_category = "Small"
	            elif dataset_size < 10000:
	                auto_rank, auto_alpha, auto_lr, auto_epochs = 16, 32, 1e-4, 3
	                size_category = "Medium"
	            else:
	                auto_rank, auto_alpha, auto_lr, auto_epochs = 32, 64, 5e-5, 2
	                size_category = "Large"
	            st.success(f"Auto-configured for {size_category} dataset ({dataset_size:,} samples)")

	        st.markdown("---")

	        # NOTE(review): the values collected in this expander are not passed
	        # to TrainingPilot below (and HyperParams is imported but unused).
	        # Wire them through once the pilot's API is confirmed.
	        with st.expander("🔧 Advanced Hyperparameters"):
	            hc1, hc2, hc3 = st.columns(3)
	            with hc1:
	                lora_rank = st.slider("LoRA Rank", 4, 64, auto_rank)
	                lora_alpha = st.slider("LoRA Alpha", 8, 128, auto_alpha)
	            with hc2:
	                learning_rate = st.select_slider("Learning Rate",
	                                                 options=[1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4], value=auto_lr)
	                num_epochs = st.slider("Epochs", 1, 10, auto_epochs)
	            with hc3:
	                batch_size = st.slider("Batch Size", 1, 16, 4)
	                gradient_accumulation = st.slider("Gradient Accumulation", 1, 8, 4)

	        st.markdown("---")

	        col1, col2, col3 = st.columns([1, 2, 1])
	        with col2:
	            if st.button("🚀 Start Training", type="primary", use_container_width=True):
	                st.session_state.pipeline_status['training'] = 'running'
	                with st.spinner("Training in progress..."):
	                    progress_bar = st.progress(0)
	                    status_text = st.empty()
	                    try:
	                        from agents.training_pilot import TrainingPilot, HyperParams
	                        status_text.text("📦 Loading model...")
	                        progress_bar.progress(10)
	                        pilot = TrainingPilot(
	                            base_model=base_model,
	                            max_seq_length=max_seq_length,
	                            output_dir="./output/models"
	                        )
	                        status_text.text("🚀 Training...")
	                        progress_bar.progress(30)
	                        result = pilot.run(
	                            data_path=st.session_state.processed_data_path,
	                            output_name=st.session_state.training_goal
	                        )
	                        progress_bar.progress(100)
	                        status_text.text("✅ Training complete!")
	                        st.session_state.model_path = result.model_path
	                        st.session_state.pipeline_status['training'] = 'complete'
	                        st.success(f"✅ Model saved to: `{result.model_path}`")
	                        rc1, rc2, rc3 = st.columns(3)
	                        with rc1:
	                            st.metric("Final Loss", f"{result.final_loss:.4f}")
	                        with rc2:
	                            st.metric("Training Time", f"{result.training_time:.1f}s")
	                        with rc3:
	                            st.metric("Total Steps", result.num_steps)
	                    except Exception as e:
	                        # Page-level boundary: surface the failure in the UI
	                        # instead of crashing the whole Streamlit script.
	                        st.session_state.pipeline_status['training'] = 'error'
	                        st.error(f"❌ Training failed: {str(e)}")
	                        import traceback
	                        st.code(traceback.format_exc())

	    # ====================================================================
	    # PATH B: Google Colab Notebook
	    # ====================================================================
	    elif "Colab" in training_mode:
	        st.markdown("### ☁️ Train on Google Colab (Free GPU)")
	        st.markdown("""
	Since no GPU was detected on this machine, you can fine-tune your model on Google Colab with a free GPU.
	Follow these steps:
	""")

	        st.markdown("""
	Step 1: Download your preprocessed training data (above) ⬆️

	Step 2: Download or copy the Colab notebook below

	Step 3: Open [Google Colab](https://colab.research.google.com/) → Upload the notebook

	Step 4: Upload your training JSONL to Colab's file browser

	Step 5: Run all cells → Download the fine-tuned model

	Step 6: Come back here → Upload your fine-tuned model results for evaluation
	""")

	        # Show / Download Colab notebook
	        notebook_path = Path("./Auto_FineTune_Ops_Colab.ipynb")
	        if notebook_path.exists():
	            with open(notebook_path, 'r', encoding='utf-8') as f:
	                notebook_content = f.read()

	            st.download_button("📓 Download Colab Notebook (.ipynb)", notebook_content,
	                               file_name="Auto_FineTune_Ops_Colab.ipynb", mime="application/json",
	                               type="primary", use_container_width=True)

	            with st.expander("👁️ View Notebook Code", expanded=False):
	                try:
	                    nb = json.loads(notebook_content)
	                    for cell in nb.get('cells', []):
	                        cell_type = cell.get('cell_type')
	                        source = ''.join(cell.get('source', []))
	                        if cell_type == 'code':
	                            if source.strip():
	                                st.code(source, language='python')
	                        elif cell_type == 'markdown':
	                            st.markdown(source)
	                except Exception:
	                    # Not parseable as a notebook: show the raw head instead.
	                    st.code(notebook_content[:5000], language='json')
	        else:
	            st.warning("⚠️ Colab notebook not found at `Auto_FineTune_Ops_Colab.ipynb`")

	        st.markdown("---")
	        st.markdown("### 📤 After Training on Colab")
	        st.info("Once you've finished training on Colab, download your fine-tuned model outputs and upload them below for evaluation.")

	    # ====================================================================
	    # PATH C: Upload Fine-Tuned Model / Results
	    # ====================================================================
	    else:
	        st.markdown("### 📤 Upload Fine-Tuned Model Results")
	        st.markdown("Upload outputs from your externally trained model for evaluation.")

	    # ── Upload Fine-Tuned Results (always shown at bottom) ──
	    st.markdown("---")
	    st.markdown("### 📦 Upload Fine-Tuned Results for Evaluation")
	    st.caption("If you trained on Colab or another machine, upload your model outputs here.")

	    upload_tab1, upload_tab2 = st.tabs(["📊 Upload Evaluation Results (JSONL)", "📁 Upload Model Folder Path"])

	    with upload_tab1:
	        ft_file = st.file_uploader("Upload fine-tuned model outputs (JSONL with predictions)",
	                                   type=['jsonl', 'json'], key="ft_results_upload",
	                                   help="JSONL file with model predictions/outputs from your fine-tuned model")
	        if ft_file:
	            try:
	                ft_df = pd.read_json(ft_file, lines=ft_file.name.endswith('.jsonl'))
	                st.success(f"✅ Loaded {len(ft_df):,} evaluation samples")
	                st.dataframe(ft_df.head(5), use_container_width=True)

	                # Save for evaluation
	                eval_output = Path("./output/eval_results")
	                eval_output.mkdir(parents=True, exist_ok=True)
	                # Keep only the basename of the client-supplied filename so
	                # the saved file cannot land outside eval_output.
	                safe_name = Path(ft_file.name).name
	                eval_path = eval_output / f"finetuned_outputs_{safe_name}"
	                ft_df.to_json(eval_path, orient='records', lines=True)

	                st.session_state.model_path = str(eval_path)
	                st.session_state.pipeline_status['training'] = 'complete'
	                st.success("✅ Results saved! You can now proceed to Evaluation page.")

	                if st.button("⚖️ Go to Evaluation"):
	                    st.session_state.current_page = 'evaluation'
	                    st.rerun()
	            except Exception as e:
	                st.error(f"Error loading file: {e}")

	    with upload_tab2:
	        model_folder = st.text_input("Model Folder Path",
	                                     placeholder="e.g., ./output/models/my_finetuned_model or /path/to/model",
	                                     help="Local path to the fine-tuned model directory (LoRA adapter or full model)")
	        if model_folder and st.button("✅ Set Model Path"):
	            if Path(model_folder).exists():
	                st.session_state.model_path = model_folder
	                st.session_state.pipeline_status['training'] = 'complete'
	                st.success(f"✅ Model path set to: `{model_folder}`")
	            else:
	                st.error(f"❌ Path not found: `{model_folder}`")


	# ============================================================================
	# PAGE: EVALUATION
	# ============================================================================
	def _parse_judge_verdict(raw, instruction):
	    """Convert a raw judge reply into a verdict dict the results view can use.

	    Args:
	        raw: Text returned by the judge model; may be wrapped in a Markdown
	            code fence.
	        instruction: The evaluated instruction, attached to the verdict.

	    Returns:
	        A dict that always has ``instruction``, ``winner``, ``score_a``,
	        ``score_b`` and ``reasoning`` keys. Unparseable replies become a
	        neutral TIE verdict with scores of 5.
	    """
	    fallback = {
	        "winner": "TIE", "score_a": 5, "score_b": 5,
	        "reasoning": "Error parsing judge response",
	        "instruction": instruction,
	    }
	    text = raw if isinstance(raw, str) else ""
	    # Strip a surrounding code fence if the judge added one.
	    if "```json" in text:
	        text = text.split("```json")[1].split("```")[0]
	    elif "```" in text:
	        text = text.split("```")[1]

	    try:
	        verdict = json.loads(text.strip())
	    except ValueError as err:  # json.JSONDecodeError is a ValueError
	        print(f"Parse error: {err}")
	        return fallback
	    if not isinstance(verdict, dict):
	        print(f"Parse error: expected a JSON object, got {type(verdict).__name__}")
	        return fallback

	    verdict['instruction'] = instruction
	    # Normalise casing/whitespace so "b" or " Tie " still count.
	    verdict['winner'] = str(verdict.get('winner', 'TIE')).strip().upper()
	    for key in ('score_a', 'score_b', 'reasoning'):
	        verdict.setdefault(key, None)
	    return verdict


	def render_evaluation():
	    """Render the "Model Evaluation" page.

	    Lets the user pick a judge provider and model, supply an API key, load
	    evaluation data (from the training page's saved results or a fresh
	    upload), run a pairwise LLM-as-judge comparison over every row, and view
	    or download the verdicts. Results are kept in
	    ``st.session_state.eval_results`` so they survive reruns.
	    """
	    st.markdown('<p class="gradient-header">⚖️ Model Evaluation</p>', unsafe_allow_html=True)

	    required_cols = ['instruction', 'base_output', 'finetuned_output']

	    # Initialize session state for results if not present
	    if 'eval_results' not in st.session_state:
	        st.session_state.eval_results = None

	    # ── Judge Provider Selection ──
	    st.markdown("### 🤖 Select AI Judge Provider")
	    st.caption("Choose which LLM provider to use as the evaluation judge.")

	    judge_provider = st.selectbox("AI Provider", [
	        "OpenAI (GPT-4o, GPT-4-turbo, etc.)",
	        "Anthropic (Claude 3.5, Claude 3 Opus, etc.)",
	        "Groq (Llama 3, Mixtral, Gemma, etc.)",
	        "Custom OpenAI-Compatible Endpoint"
	    ], help="Select the AI provider whose model will act as the judge.")

	    # Match on the leading word: a plain substring test for "OpenAI" would
	    # also match "Custom OpenAI-Compatible Endpoint" and hide that branch.
	    is_custom = judge_provider.startswith("Custom")
	    is_anthropic = judge_provider.startswith("Anthropic")
	    is_groq = judge_provider.startswith("Groq")

	    st.markdown("---")
	    st.markdown("### 🔑 API Configuration")

	    api_key = None
	    base_url = None

	    col1, col2 = st.columns(2)
	    if is_custom:
	        with col1:
	            base_url = st.text_input("API Base URL", placeholder="https://api.your-provider.com/v1")
	            api_key = st.text_input("API Key", type="password", key="custom_key_input")
	            if api_key:
	                os.environ["OPENAI_API_KEY"] = api_key
	        with col2:
	            judge_model = st.text_input("Model Name", placeholder="e.g., my-model")

	    elif is_anthropic:
	        with col1:
	            api_key = st.text_input("Anthropic API Key", type="password", key="anthropic_key_input")
	            if api_key:
	                os.environ["ANTHROPIC_API_KEY"] = api_key
	        with col2:
	            judge_model = st.selectbox("Judge Model", ["claude-3-5-sonnet-20241022", "claude-3-opus-20240229", "claude-3-sonnet-20240229"])

	    elif is_groq:
	        with col1:
	            api_key = st.text_input("Groq API Key", type="password", key="groq_key_input")
	            if api_key:
	                os.environ["GROQ_API_KEY"] = api_key
	        with col2:
	            judge_model = st.selectbox("Judge Model", ["llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it"])
	        base_url = "https://api.groq.com/openai/v1"

	    else:  # OpenAI
	        with col1:
	            api_key = st.text_input("OpenAI API Key", type="password", key="openai_key_input")
	            if api_key:
	                os.environ["OPENAI_API_KEY"] = api_key
	        with col2:
	            judge_model = st.selectbox("Judge Model", ["gpt-4o", "gpt-4-turbo", "gpt-3.5-turbo"])

	    st.markdown("---")

	    # ── Evaluation Data ──
	    st.markdown("### 📊 Evaluation Data")

	    # 1. Use data from training (if available)
	    if st.session_state.model_path and "finetuned_outputs" in str(st.session_state.model_path):
	        st.info(f"Using results from training: `{st.session_state.model_path}`")
	        try:
	            st.session_state['eval_data'] = pd.read_json(st.session_state.model_path, lines=True)
	        except Exception as e:
	            # Keep the page usable, but tell the user why nothing loaded.
	            st.warning(f"Could not load results from training: {e}")

	    # 2. Upload new data
	    eval_upload = st.file_uploader("Upload JSONL (Must contain: 'instruction', 'base_output', 'finetuned_output')",
	                                   type=['jsonl', 'json'], key="eval_uploader")

	    if eval_upload:
	        try:
	            df = pd.read_json(eval_upload, lines=eval_upload.name.endswith('.jsonl'))
	            if all(col in df.columns for col in required_cols):
	                st.session_state['eval_data'] = df
	                st.success(f"✅ Loaded {len(df)} samples")
	            else:
	                st.error(f"❌ Missing columns! Found: {list(df.columns)}. Required: {required_cols}")
	        except Exception as e:
	            st.error(f"Error loading file: {e}")

	    # Show Preview
	    if st.session_state.get('eval_data') is not None:
	        with st.expander("👁️ View Data Preview"):
	            st.dataframe(st.session_state['eval_data'].head(3), use_container_width=True)

	    st.markdown("---")

	    # ── Run Evaluation ──
	    if st.button("🚀 Run Dynamic Evaluation", type="primary", use_container_width=True):
	        if not api_key:
	            st.error("❌ Please provide an API Key above!")
	            return

	        if st.session_state.get('eval_data') is None:
	            st.error("❌ No evaluation data loaded!")
	            return

	        df = st.session_state['eval_data']
	        # Data taken over from the training page was never column-checked.
	        missing = [col for col in required_cols if col not in df.columns]
	        if missing:
	            st.error(f"❌ Missing columns! Found: {list(df.columns)}. Required: {required_cols}")
	            return

	        # Prepare Judge
	        st.session_state.pipeline_status['evaluation'] = 'running'
	        progress_bar = st.progress(0)
	        status_text = st.empty()

	        results = []
	        total = len(df)

	        try:
	            # Initialize Client
	            if is_anthropic:
	                from anthropic import Anthropic
	                client = Anthropic(api_key=api_key)
	            else:
	                from openai import OpenAI
	                # An empty text box must not be sent as the base URL.
	                client = OpenAI(api_key=api_key, base_url=base_url or None)

	            JUDGE_PROMPT = """You are an expert evaluator comparing two AI responses.

	Query: {prompt}

	Response A (Base Model):
	{response_a}

	Response B (Fine-tuned Model):
	{response_b}

	Compare them on: Accuracy, Helpfulness, Clarity.
	Return a valid JSON object ONLY:
	{{
	"winner": "A" or "B" or "TIE",
	"score_a": <1-10>,
	"score_b": <1-10>,
	"reasoning": "short explanation",
	"accuracy": {{"A": <1-10>, "B": <1-10>}},
	"helpfulness": {{"A": <1-10>, "B": <1-10>}},
	"clarity": {{"A": <1-10>, "B": <1-10>}}
	}}
	"""

	            # Count positions ourselves: iterrows() yields index labels,
	            # which need not be 0..n-1 and would break the progress value.
	            for position, (_, row) in enumerate(df.iterrows(), start=1):
	                status_text.text(f"Evaluating sample {position}/{total}...")

	                prompt_text = JUDGE_PROMPT.format(
	                    prompt=row['instruction'],
	                    response_a=row['base_output'],
	                    response_b=row['finetuned_output']
	                )

	                # Call API
	                if is_anthropic:
	                    resp = client.messages.create(
	                        model=judge_model, max_tokens=1000,
	                        messages=[{"role": "user", "content": prompt_text}]
	                    ).content[0].text
	                else:
	                    resp = client.chat.completions.create(
	                        model=judge_model, max_tokens=1000,
	                        messages=[{"role": "user", "content": prompt_text}],
	                        response_format={"type": "json_object"}
	                    ).choices[0].message.content

	                # Parse
	                results.append(_parse_judge_verdict(resp, row['instruction']))

	                progress_bar.progress(position / total)

	            st.session_state.eval_results = results
	            st.session_state.pipeline_status['evaluation'] = 'complete'
	            status_text.text("✅ Evaluation Complete!")

	        except Exception as e:
	            st.error(f"Evaluation Failed: {str(e)}")
	            st.session_state.pipeline_status['evaluation'] = 'error'

	    # ── Display Results ──
	    if st.session_state.get('eval_results'):
	        results = st.session_state.eval_results
	        df_res = pd.DataFrame(results)

	        # Judges occasionally return scores as strings; coerce for the means.
	        score_a = pd.to_numeric(df_res['score_a'], errors='coerce')
	        score_b = pd.to_numeric(df_res['score_b'], errors='coerce')

	        # Metrics
	        wins_b = int((df_res['winner'] == 'B').sum())
	        wins_a = int((df_res['winner'] == 'A').sum())
	        ties = int((df_res['winner'] == 'TIE').sum())
	        win_rate = (wins_b / len(df_res)) * 100

	        col1, col2, col3, col4 = st.columns(4)
	        col1.metric("Fine-tuned Win Rate", f"{win_rate:.1f}%")
	        col2.metric("Fine-Tuned Wins", wins_b)
	        col3.metric("Base Model Wins", wins_a)
	        col4.metric("Avg Score Improvement", f"{score_b.mean() - score_a.mean():.2f}")

	        # Charts
	        c1, c2 = st.columns(2)
	        with c1:
	            fig = px.pie(values=[wins_b, wins_a, ties], names=['Fine-tuned', 'Base', 'Ties'],
	                         title="Win Distribution", color_discrete_sequence=['#6366f1', '#ef4444', '#94a3b8'])
	            st.plotly_chart(fig, use_container_width=True)

	        with c2:
	            avg_scores = pd.DataFrame({
	                'Model': ['Base', 'Fine-tuned'],
	                'Score': [score_a.mean(), score_b.mean()]
	            })
	            fig2 = px.bar(avg_scores, x='Model', y='Score', color='Model',
	                          title="Average Overall Score", color_discrete_map={'Base': '#ef4444', 'Fine-tuned': '#6366f1'})
	            st.plotly_chart(fig2, use_container_width=True)

	        # Detailed Table
	        st.markdown("### 📝 Detailed Verdicts")
	        st.dataframe(df_res[['instruction', 'winner', 'score_a', 'score_b', 'reasoning']], use_container_width=True)

	        # Download
	        # Uses the module-level json import; a function-local `import json`
	        # would leave the name unbound on reruns that skip the evaluation.
	        st.download_button("⬇️ Download Report (JSON)",
	                           data=json.dumps(results, indent=2),
	                           file_name="evaluation_report.json",
	                           mime="application/json")


	# ============================================================================
	# PAGE: DEPLOYMENT
	# ============================================================================
	def render_deploy():
	    """Render the "Model Deployment" page.

	    Lists the sub-directories of ``./output/models`` as selectable models,
	    shows the command for starting the local FastAPI server, offers a
	    HuggingFace Hub push form, and documents the API endpoints.
	    """
	    st.markdown('<p class="gradient-header">🌐 Model Deployment</p>', unsafe_allow_html=True)

	    # Model selection
	    st.markdown("### 📦 Select Model")

	    selected_model = None
	    models_dir = Path("./output/models")
	    if models_dir.exists():
	        # Sorted so the dropdown order is stable between reruns.
	        models = sorted(d.name for d in models_dir.iterdir() if d.is_dir())
	        if models:
	            selected_model = st.selectbox("Trained Models", models)
	            model_path = models_dir / selected_model
	            st.info(f"📂 Model path: `{model_path}`")
	        else:
	            st.warning("No trained models found.")
	    else:
	        st.warning("Models directory not found.")

	    st.markdown("---")

	    # Deployment options
	    st.markdown("### 🚀 Deployment Options")

	    col1, col2 = st.columns(2)

	    with col1:
	        st.markdown("""
	<div class="info-card">
	<h4>🖥️ Local FastAPI Server</h4>
	<p>Deploy as a REST API on your local machine.</p>
	</div>
	""", unsafe_allow_html=True)

	        port = st.number_input("Port", value=8000, min_value=1000, max_value=65535)

	        # The server is not launched from here; the user gets the command.
	        if st.button("🚀 Start Server", disabled=not selected_model):
	            st.code(f"python scripts/deploy.py --model ./output/models/{selected_model} --port {port}")
	            st.info("Run the command above in your terminal to start the server.")

	    with col2:
	        st.markdown("""
	<div class="info-card">
	<h4>☁️ HuggingFace Hub</h4>
	<p>Push your model to HuggingFace for sharing.</p>
	</div>
	""", unsafe_allow_html=True)

	        hf_token = st.text_input("HuggingFace Token", type="password")
	        repo_name = st.text_input("Repository Name", value=f"my-finetuned-{selected_model}" if selected_model else "")

	        # NOTE(review): this button only shows a message; no upload is
	        # performed and repo_name is not used yet.
	        if st.button("☁️ Push to Hub", disabled=not selected_model or not hf_token):
	            st.info("Pushing to HuggingFace Hub...")

	    st.markdown("---")

	    # API documentation
	    st.markdown("### 📚 API Documentation")

	    # Plain pipes: a backslash-escaped pipe is an invalid Python escape and a
	    # literal character in Markdown, so the table would not render.
	    st.markdown("""
	Once deployed, your API will have these endpoints:

	| Endpoint | Method | Description |
	|----------|--------|-------------|
	| `/` | GET | API info |
	| `/health` | GET | Health check |
	| `/generate` | POST | Generate text |
	| `/generate/batch` | POST | Batch generation |
	""")

	    with st.expander("📝 Example Request"):
	        st.code("""
	import requests

	response = requests.post("http://localhost:8000/generate", json={
	"prompt": "What are the symptoms of the common cold?",
	"max_tokens": 256,
	"temperature": 0.7
	})
	print(response.json()["generated_text"])
	""", language="python")


	# ============================================================================
	# MAIN ROUTER
	# ============================================================================
	def main():
	    """Dispatch to the renderer for ``st.session_state.current_page``.

	    Unknown page names fall back to the home page.
	    """
	    routes = {
	        'home': render_home,
	        'data': render_data_upload,
	        'process': render_processing,
	        'training': render_training,
	        'evaluation': render_evaluation,
	        'deploy': render_deploy,
	    }
	    render_page = routes.get(st.session_state.current_page, render_home)
	    render_page()


	# Script entry point: route to the current page when this file is executed
	# directly (the guard is skipped when the module is merely imported).
	if __name__ == "__main__":
	    main()