| import React, { useState } from 'react' |
| import { |
| Database, |
| Cpu, |
| Zap, |
| Filter, |
| ArrowRight, |
| Code, |
| Play, |
| CheckCircle, |
| AlertCircle, |
| BookOpen, |
| Terminal, |
| Layers, |
| GitBranch, |
| Settings, |
| ChevronDown, |
| ChevronUp, |
| Copy, |
| ExternalLink, |
| Sparkles, |
| Rocket, |
| Server, |
| Globe, |
| Shield, |
| Star, |
| Menu, |
| X |
| } from 'lucide-react' |
|
|
/**
 * Single-page documentation UI for the three-phase synthetic data pipeline.
 *
 * Holds four pieces of local UI state and renders the navigation bar, hero,
 * feature grid, overview, pipeline details, API examples, quick start,
 * full script listing, judge criteria, configuration accordion and footer.
 */
| function App() { |
// Currently selected phase. It is set from `phase.id` and also used directly
// as an index into `phases`, so ids must equal array positions.
| const [activePhase, setActivePhase] = useState(0) |
// Accordion state: section key -> expanded flag. A missing key reads as collapsed.
| const [expandedSections, setExpandedSections] = useState({}) |
// Key of the snippet whose copy button currently shows a check mark, or null.
| const [copiedCode, setCopiedCode] = useState(null) |
// Whether the mobile (below `md`) navigation panel is rendered.
| const [mobileMenuOpen, setMobileMenuOpen] = useState(false) |
|
|
// Flips the expanded flag of one accordion section, leaving the others untouched.
// A section that has never been toggled has no entry yet and is treated as collapsed.
const toggleSection = (section) => {
  setExpandedSections((current) => {
    const wasExpanded = current[section]
    return { ...current, [section]: !wasExpanded }
  })
}
|
|
/**
 * Copies `code` to the system clipboard and, once the write succeeds, marks
 * `id` as copied for two seconds so the matching button can show a check mark.
 *
 * @param code Text to place on the clipboard.
 * @param id   Key stored in `copiedCode` to identify which snippet was copied.
 */
const copyToClipboard = (code, id) => {
  // The async Clipboard API is missing in insecure contexts and older browsers;
  // bail out quietly instead of throwing from a click handler.
  const clipboard = typeof navigator !== 'undefined' ? navigator.clipboard : undefined
  if (!clipboard?.writeText) {
    console.warn('Clipboard API is unavailable; nothing was copied')
    return
  }

  clipboard.writeText(code).then(
    () => {
      setCopiedCode(id)
      // Clear the indicator only if it still belongs to this snippet, so a timer
      // from an earlier copy cannot wipe the check mark of a different, later one.
      setTimeout(() => setCopiedCode((current) => (current === id ? null : current)), 2000)
    },
    (error) => {
      // writeText rejects when permission is denied; do not claim success in that case.
      console.error('Failed to copy to clipboard:', error)
    }
  )
}
|
|
// Static description of the three pipeline phases. Each entry supplies the
// display text and icon, Tailwind colour/gradient class fragments, the Space
// id, the HTTP endpoint and method, a parameter table, and example
// request/response bodies that are rendered verbatim further down.
//
// `id` equals the entry's array position: `activePhase` is set from `id` and
// then used as `phases[activePhase]`, so the two must stay in sync.
//
// NOTE(review): `space` is stored as "owner/name" (the form used for the
// huggingface.co/spaces link), whereas `fullPipelineScript` addresses the same
// Spaces as "owner-name.hf.space" — confirm which host form URL builders need.
| const phases = [ |
// Phase 0 — seed generation (POST /generate).
| { |
| id: 0, |
| name: 'Seed Generator', |
| shortName: 'Phase 0', |
| icon: <Zap className="w-5 h-5" />, |
| space: 'mindchain/hf-space-nemo-seed-generator-huggingface-inference', |
| description: 'Generates initial seed Q&A pairs from a topic using LLM inference.', |
| color: 'amber', |
| gradient: 'from-amber-500 to-orange-600', |
| bgGradient: 'from-amber-500/10 to-orange-500/10', |
| endpoint: '/generate', |
| method: 'POST', |
| features: ['Topic-based generation', 'Configurable seed count', 'Quality pre-filtering'], |
// NOTE(review): the `desc` of `model` and `provider` below holds an example value rather than a description.
| requestParams: [ |
| { name: 'topic', type: 'string', desc: 'Topic for Q&A generation' }, |
| { name: 'num_seeds', type: 'int', desc: 'Number of seeds to generate (default: 10)' }, |
| { name: 'model', type: 'string', desc: 'Qwen/Qwen2.5-7B-Instruct' }, |
| { name: 'provider', type: 'string', desc: 'together (recommended)' }, |
| ], |
| requestExample: `{ |
| "topic": "Python programming basics", |
| "num_seeds": 10, |
| "model": "Qwen/Qwen2.5-7B-Instruct", |
| "provider": "together" |
| }`, |
| responseExample: `{ |
| "success": true, |
| "seeds": [ |
| { |
| "instruction": "What is a variable in Python?", |
| "output": "A variable in Python is a named container...", |
| "quality_score": 8 |
| } |
| ], |
| "count": 10 |
| }` |
| }, |
// Phase 1 — synthetic record generation with judge scoring (POST /generate).
| { |
| id: 1, |
| name: 'Distilabel Generator', |
| shortName: 'Phase 1', |
| icon: <Cpu className="w-5 h-5" />, |
| space: 'mindchain/hf-space-distilabel-generator-huggingface-inference', |
| description: 'Generates synthetic data with LLM Judge scoring using UltraFeedback criteria.', |
| color: 'blue', |
| gradient: 'from-blue-500 to-cyan-500', |
| bgGradient: 'from-blue-500/10 to-cyan-500/10', |
| endpoint: '/generate', |
| method: 'POST', |
| features: ['UltraFeedback scoring', 'Multi-model support', 'Configurable thresholds'], |
// NOTE(review): requestExample also sends `provider` and `judge_provider`, which are not listed here.
| requestParams: [ |
| { name: 'seed_dataset', type: 'string', desc: 'HF dataset ID from Phase 0' }, |
| { name: 'num_records', type: 'int', desc: 'Records to generate (default: 100)' }, |
| { name: 'model', type: 'string', desc: 'Generator model' }, |
| { name: 'judge_model', type: 'string', desc: 'Judge model for scoring' }, |
| { name: 'use_judge', type: 'bool', desc: 'Enable LLM Judge (default: true)' }, |
| { name: 'min_score', type: 'int', desc: 'Minimum score filter (1-10)' }, |
| ], |
| requestExample: `{ |
| "seed_dataset": "mindchain/synthetic-seeds", |
| "num_records": 100, |
| "model": "Qwen/Qwen2.5-7B-Instruct", |
| "provider": "together", |
| "judge_model": "Qwen/Qwen2.5-7B-Instruct", |
| "judge_provider": "together", |
| "use_judge": true, |
| "min_score": 5 |
| }`, |
// The `{...}` placeholder makes this response illustrative only; it is not parseable JSON.
| responseExample: `{ |
| "success": true, |
| "records": [ |
| { |
| "instruction": "Explain Python loops...", |
| "output": "Python loops allow you to...", |
| "quality_score": 8, |
| "distilabel_metadata": {...} |
| } |
| ], |
| "stats": { |
| "total": 100, |
| "avg_score": 7.2, |
| "filtered": 15 |
| } |
| }` |
| }, |
// Phase 2 — final curation and filtering (POST /curate).
| { |
| id: 2, |
| name: 'Argilla Curator', |
| shortName: 'Phase 2', |
| icon: <Filter className="w-5 h-5" />, |
| space: 'mindchain/hf-space-argilla-curator-huggingface-inference', |
| description: 'Final curation with quality scoring and filtering for high-quality dataset.', |
| color: 'purple', |
| gradient: 'from-purple-500 to-pink-500', |
| bgGradient: 'from-purple-500/10 to-pink-500/10', |
| endpoint: '/curate', |
| method: 'POST', |
| features: ['Quality filtering', 'Score distribution', 'Hub push'], |
| requestParams: [ |
| { name: 'raw_dataset', type: 'string', desc: 'Raw dataset ID from Phase 1' }, |
| { name: 'model', type: 'string', desc: 'Judge model for re-scoring' }, |
| { name: 'provider', type: 'string', desc: 'Inference provider' }, |
| { name: 'min_score', type: 'int', desc: 'Minimum score (default: 7)' }, |
| { name: 'target_dataset', type: 'string', desc: 'Output dataset ID (optional)' }, |
| ], |
| requestExample: `{ |
| "raw_dataset": "mindchain/synthetic-distilabel-raw", |
| "model": "Qwen/Qwen2.5-7B-Instruct", |
| "provider": "together", |
| "min_score": 7, |
| "target_dataset": "mindchain/synthetic-curated" |
| }`, |
| responseExample: `{ |
| "success": true, |
| "curated_count": 78, |
| "total_count": 100, |
| "filtered_count": 22, |
| "score_distribution": { |
| "7": 25, "8": 30, "9": 15, "10": 8 |
| } |
| }` |
| } |
| ] |
|
|
/**
 * Builds a copy-pasteable cURL command for one pipeline phase.
 *
 * @param phase Entry with `space` ("owner/name"), `endpoint`, `requestExample`
 *              and optionally `method` (defaults to POST).
 * @returns A three-line shell command using backslash line continuations.
 */
const curlExample = (phase) => {
  // `space` is stored as "owner/name", but the app host is "owner-name.hf.space";
  // leaving the slash in would turn the owner into the hostname.
  const host = `${phase.space.replace(/\//g, '-')}.hf.space`
  const method = phase.method ?? 'POST'
  // Collapse the pretty-printed JSON onto one line, then escape single quotes
  // so the payload cannot terminate the shell's single-quoted argument early.
  const payload = phase.requestExample
    .replace(/\s*\n\s*/g, ' ')
    .replace(/'/g, "'\\''")

  return `curl -X ${method} https://${host}${phase.endpoint} \\
  -H "Content-Type: application/json" \\
  -d '${payload}'`
}
|
|
// Smooth-scrolls to the element with the given DOM id (a no-op when no such
// element exists) and always closes the mobile menu afterwards.
const scrollToSection = (id) => {
  const target = document.getElementById(id)
  if (target) {
    target.scrollIntoView({ behavior: 'smooth' })
  }
  setMobileMenuOpen(false)
}
|
|
| return ( |
| <div className="min-h-screen bg-slate-950 text-slate-100"> |
| {/* Navigation */} |
| <nav className="fixed top-0 left-0 right-0 z-50 bg-slate-950/80 backdrop-blur-xl border-b border-slate-800/50"> |
| <div className="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8"> |
| <div className="flex items-center justify-between h-16"> |
| <div className="flex items-center gap-3"> |
| <div className="relative"> |
| <div className="absolute inset-0 bg-gradient-to-r from-indigo-600 to-purple-600 rounded-xl blur-lg opacity-50"></div> |
| <div className="relative p-2 bg-gradient-to-r from-indigo-600 to-purple-600 rounded-xl"> |
| <Database className="w-5 h-5" /> |
| </div> |
| </div> |
| <div> |
| <h1 className="font-bold text-lg leading-tight">Synthetic Pipeline</h1> |
| <p className="text-xs text-slate-500">HF Spaces Documentation</p> |
| </div> |
| </div> |
| |
| {/* Desktop Nav */} |
| <div className="hidden md:flex items-center gap-6"> |
| <button onClick={() => scrollToSection('overview')} className="text-sm text-slate-400 hover:text-white transition-colors">Overview</button> |
| <button onClick={() => scrollToSection('pipeline')} className="text-sm text-slate-400 hover:text-white transition-colors">Pipeline</button> |
| <button onClick={() => scrollToSection('api')} className="text-sm text-slate-400 hover:text-white transition-colors">API</button> |
| <button onClick={() => scrollToSection('quickstart')} className="text-sm text-slate-400 hover:text-white transition-colors">Quick Start</button> |
| <a |
| href="https://huggingface.co/collections/mindchain/synthetic-data-pipeline" |
| target="_blank" |
| rel="noopener noreferrer" |
| className="flex items-center gap-2 px-4 py-2 bg-slate-800 hover:bg-slate-700 rounded-lg text-sm font-medium transition-colors" |
| > |
| <Globe className="w-4 h-4" /> |
| HF Collection |
| </a> |
| </div> |
| |
| {/* Mobile menu button */} |
| <button |
| onClick={() => setMobileMenuOpen(!mobileMenuOpen)} |
| className="md:hidden p-2 text-slate-400 hover:text-white" |
| > |
| {mobileMenuOpen ? <X className="w-5 h-5" /> : <Menu className="w-5 h-5" />} |
| </button> |
| </div> |
| </div> |
| |
| {/* Mobile Nav */} |
| {mobileMenuOpen && ( |
| <div className="md:hidden bg-slate-900 border-b border-slate-800"> |
| <div className="px-4 py-3 space-y-2"> |
| <button onClick={() => scrollToSection('overview')} className="block w-full text-left px-3 py-2 text-slate-400 hover:text-white hover:bg-slate-800 rounded-lg">Overview</button> |
| <button onClick={() => scrollToSection('pipeline')} className="block w-full text-left px-3 py-2 text-slate-400 hover:text-white hover:bg-slate-800 rounded-lg">Pipeline</button> |
| <button onClick={() => scrollToSection('api')} className="block w-full text-left px-3 py-2 text-slate-400 hover:text-white hover:bg-slate-800 rounded-lg">API</button> |
| <button onClick={() => scrollToSection('quickstart')} className="block w-full text-left px-3 py-2 text-slate-400 hover:text-white hover:bg-slate-800 rounded-lg">Quick Start</button> |
| </div> |
| </div> |
| )} |
| </nav> |
| |
| {/* Hero Section: badge, page title, tagline, and the "Get Started" / "View Spaces" calls to action */} |
| <section className="pt-32 pb-20 px-4 relative overflow-hidden"> |
| {/* Background Effects: two blurred colour blobs plus a bottom fade into the page background */} |
| <div className="absolute inset-0 overflow-hidden"> |
| <div className="absolute top-1/4 -left-1/4 w-96 h-96 bg-indigo-600/20 rounded-full blur-3xl"></div> |
| <div className="absolute top-1/3 -right-1/4 w-96 h-96 bg-purple-600/20 rounded-full blur-3xl"></div> |
| <div className="absolute bottom-0 left-1/2 -translate-x-1/2 w-full h-1/2 bg-gradient-to-t from-slate-950 to-transparent"></div> |
| </div> |
| |
| <div className="max-w-4xl mx-auto text-center relative"> |
| <div className="inline-flex items-center gap-2 px-4 py-2 bg-slate-800/50 border border-slate-700 rounded-full text-sm text-slate-300 mb-6"> |
| <Sparkles className="w-4 h-4 text-amber-400" /> |
| <span>HuggingFace Spaces + HF Inference API</span> |
| </div> |
| |
| <h1 className="text-4xl sm:text-5xl lg:text-6xl font-bold mb-6"> |
| <span className="bg-gradient-to-r from-white via-slate-200 to-slate-400 bg-clip-text text-transparent"> |
| Synthetic Data Pipeline |
| </span> |
| </h1> |
| |
| <p className="text-lg sm:text-xl text-slate-400 max-w-2xl mx-auto mb-10"> |
| Generate high-quality synthetic Q&A datasets with a 3-phase pipeline. |
| CPU-only, no GPU required. |
| </p> |
| |
| <div className="flex flex-wrap items-center justify-center gap-4"> |
| <button |
| onClick={() => scrollToSection('quickstart')} |
| className="btn btn-primary flex items-center gap-2" |
| > |
| <Rocket className="w-4 h-4" /> |
| Get Started |
| </button> |
| <a |
| href="https://huggingface.co/collections/mindchain/synthetic-data-pipeline" |
| target="_blank" |
| rel="noopener noreferrer" |
| className="btn btn-secondary flex items-center gap-2" |
| > |
| <ExternalLink className="w-4 h-4" /> |
| View Spaces |
| </a> |
| </div> |
| </div> |
| </section> |
| |
| {/* Features Grid: four static highlight cards (icon, label, short description) rendered from an inline array */} |
| <section className="py-12 px-4"> |
| <div className="max-w-6xl mx-auto"> |
| <div className="grid grid-cols-2 md:grid-cols-4 gap-4"> |
| {[ |
| { icon: <Server className="w-5 h-5" />, label: 'CPU Only', desc: 'No GPU required' }, |
| { icon: <Zap className="w-5 h-5" />, label: 'Always On', desc: 'Docker Spaces' }, |
| { icon: <Shield className="w-5 h-5" />, label: 'Quality Scored', desc: 'LLM Judge' }, |
| { icon: <Globe className="w-5 h-5" />, label: 'HF Hub', desc: 'Dataset push' }, |
| ].map((feature, i) => ( |
| <div key={i} className="card card-hover p-4 text-center"> |
| <div className="inline-flex p-3 bg-slate-800 rounded-xl mb-3 text-indigo-400"> |
| {feature.icon} |
| </div> |
| <h3 className="font-semibold text-sm">{feature.label}</h3> |
| <p className="text-xs text-slate-500 mt-1">{feature.desc}</p> |
| </div> |
| ))} |
| </div> |
| </div> |
| </section> |
| |
| {/* Overview Section */} |
| <section id="overview" className="py-16 px-4"> |
| <div className="max-w-6xl mx-auto"> |
| <div className="flex items-center gap-3 mb-8"> |
| <div className="p-2 bg-indigo-500/20 rounded-xl"> |
| <BookOpen className="w-5 h-5 text-indigo-400" /> |
| </div> |
| <h2 className="text-2xl font-bold">Overview</h2> |
| </div> |
| |
| <div className="card p-6 lg:p-8"> |
| <p className="text-slate-300 text-lg leading-relaxed mb-8"> |
| This pipeline generates high-quality synthetic Q&A datasets using HuggingFace Spaces |
| and the HF Inference API. It follows a three-phase approach with LLM Judge scoring |
| based on UltraFeedback criteria. |
| </p> |
| |
| <div className="grid md:grid-cols-3 gap-6"> |
| {[ |
| { title: 'Seed Generation', desc: 'Create initial Q&A pairs from any topic', icon: <Zap className="w-5 h-5" />, color: 'amber' }, |
| { title: 'Distilabel Augmentation', desc: 'Generate with LLM Judge scoring', icon: <Cpu className="w-5 h-5" />, color: 'blue' }, |
| { title: 'Quality Curation', desc: 'Filter for high-quality records only', icon: <Filter className="w-5 h-5" />, color: 'purple' }, |
| ].map((item, i) => ( |
| <div key={i} className="relative group"> |
| <div className={`absolute inset-0 bg-gradient-to-r from-${item.color}-500/20 to-${item.color}-600/20 rounded-2xl blur-xl opacity-0 group-hover:opacity-100 transition-opacity`}></div> |
| <div className="relative bg-slate-800/50 border border-slate-700 rounded-2xl p-5 group-hover:border-slate-600 transition-colors"> |
| <div className={`inline-flex p-2 rounded-xl bg-${item.color}-500/20 text-${item.color}-400 mb-3`}> |
| {item.icon} |
| </div> |
| <h3 className="font-semibold mb-1">{item.title}</h3> |
| <p className="text-sm text-slate-400">{item.desc}</p> |
| </div> |
| </div> |
| ))} |
| </div> |
| </div> |
| </div> |
| </section> |
| |
| {/* Pipeline Architecture */} |
| <section id="pipeline" className="py-16 px-4 bg-slate-900/50"> |
| <div className="max-w-6xl mx-auto"> |
| <div className="flex items-center gap-3 mb-8"> |
| <div className="p-2 bg-purple-500/20 rounded-xl"> |
| <Layers className="w-5 h-5 text-purple-400" /> |
| </div> |
| <h2 className="text-2xl font-bold">Pipeline Architecture</h2> |
| </div> |
| |
| {/* Phase Selector */} |
| <div className="flex flex-col lg:flex-row gap-8"> |
| <div className="lg:w-1/3 space-y-3"> |
| {phases.map((phase) => ( |
| <button |
| key={phase.id} |
| onClick={() => setActivePhase(phase.id)} |
| className={`w-full text-left p-4 rounded-xl border-2 transition-all ${ |
| activePhase === phase.id |
| ? `border-${phase.color}-500 bg-gradient-to-r ${phase.bgGradient}` |
| : 'border-slate-700 hover:border-slate-600 bg-slate-800/30' |
| }`} |
| > |
| <div className="flex items-center gap-3"> |
| <div className={`p-2 rounded-lg bg-gradient-to-r ${phase.gradient}`}> |
| {phase.icon} |
| </div> |
| <div> |
| <div className="text-xs text-slate-500 mb-0.5">{phase.shortName}</div> |
| <div className="font-medium">{phase.name}</div> |
| </div> |
| </div> |
| </button> |
| ))} |
| </div> |
| |
| {/* Phase Details */} |
| <div className="lg:w-2/3"> |
| <div className="card overflow-hidden"> |
| <div className={`p-6 bg-gradient-to-r ${phases[activePhase].gradient}`}> |
| <div className="flex items-center gap-3 mb-2"> |
| {phases[activePhase].icon} |
| <h3 className="text-xl font-bold">{phases[activePhase].name}</h3> |
| </div> |
| <p className="text-white/80">{phases[activePhase].description}</p> |
| </div> |
| |
| <div className="p-6 space-y-6"> |
| {/* Features */} |
| <div className="flex flex-wrap gap-2"> |
| {phases[activePhase].features.map((f, i) => ( |
| <span key={i} className="px-3 py-1 bg-slate-800 border border-slate-700 rounded-full text-xs"> |
| {f} |
| </span> |
| ))} |
| </div> |
| |
| {/* Space Link */} |
| <div className="flex items-center gap-2"> |
| <ExternalLink className="w-4 h-4 text-slate-500" /> |
| <a |
| href={`https://huggingface.co/spaces/${phases[activePhase].space}`} |
| target="_blank" |
| rel="noopener noreferrer" |
| className="text-sm text-indigo-400 hover:text-indigo-300 font-mono" |
| > |
| {phases[activePhase].space} |
| </a> |
| </div> |
| |
| {/* Endpoint */} |
| <div> |
| <div className="flex items-center gap-2 mb-2"> |
| <Terminal className="w-4 h-4 text-slate-500" /> |
| <span className="text-xs text-slate-500 uppercase tracking-wider">Endpoint</span> |
| <span className="px-2 py-0.5 bg-emerald-500/20 text-emerald-400 text-xs font-medium rounded"> |
| {phases[activePhase].method} |
| </span> |
| </div> |
| <code className="code-block text-slate-300 text-xs sm:text-sm"> |
| https://{phases[activePhase].space}.hf.space{phases[activePhase].endpoint} |
| </code> |
| </div> |
| |
| {/* Parameters */} |
| <div> |
| <h4 className="text-xs text-slate-500 uppercase tracking-wider mb-3">Parameters</h4> |
| <div className="space-y-2"> |
| {phases[activePhase].requestParams.map((param, i) => ( |
| <div key={i} className="flex flex-wrap items-center gap-2 text-sm"> |
| <code className="px-2 py-1 bg-slate-800 rounded text-indigo-300 font-mono">{param.name}</code> |
| <span className="text-slate-500 text-xs">{param.type}</span> |
| <span className="text-slate-400">{param.desc}</span> |
| </div> |
| ))} |
| </div> |
| </div> |
| </div> |
| </div> |
| </div> |
| </div> |
| </div> |
| </section> |
| |
| {/* API Examples Section */} |
| <section id="api" className="py-16 px-4"> |
| <div className="max-w-6xl mx-auto"> |
| <div className="flex items-center gap-3 mb-8"> |
| <div className="p-2 bg-emerald-500/20 rounded-xl"> |
| <Code className="w-5 h-5 text-emerald-400" /> |
| </div> |
| <h2 className="text-2xl font-bold">API Examples</h2> |
| </div> |
| |
| {/* Phase Tabs */} |
| <div className="flex gap-2 mb-6 overflow-x-auto pb-2"> |
| {phases.map((phase) => ( |
| <button |
| key={phase.id} |
| onClick={() => setActivePhase(phase.id)} |
| className={`flex items-center gap-2 px-4 py-2 rounded-lg font-medium text-sm whitespace-nowrap transition-colors ${ |
| activePhase === phase.id |
| ? `bg-gradient-to-r ${phase.gradient} text-white` |
| : 'bg-slate-800 text-slate-400 hover:text-white' |
| }`} |
| > |
| {phase.icon} |
| {phase.shortName} |
| </button> |
| ))} |
| </div> |
| |
| <div className="grid lg:grid-cols-2 gap-6"> |
| {/* Request */} |
| <div className="card overflow-hidden"> |
| <div className="flex items-center justify-between px-4 py-3 bg-slate-800/50 border-b border-slate-700"> |
| <span className="text-sm font-medium text-slate-300">Request Body</span> |
| <button |
| onClick={() => copyToClipboard(phases[activePhase].requestExample, 'req')} |
| className="text-slate-400 hover:text-white transition-colors" |
| > |
| {copiedCode === 'req' ? <CheckCircle className="w-4 h-4 text-emerald-400" /> : <Copy className="w-4 h-4" />} |
| </button> |
| </div> |
| <pre className="p-4 text-sm font-mono text-slate-300 overflow-x-auto scrollbar-thin"> |
| {phases[activePhase].requestExample} |
| </pre> |
| </div> |
| |
| {/* Response */} |
| <div className="card overflow-hidden"> |
| <div className="flex items-center justify-between px-4 py-3 bg-slate-800/50 border-b border-slate-700"> |
| <span className="text-sm font-medium text-slate-300">Response</span> |
| <button |
| onClick={() => copyToClipboard(phases[activePhase].responseExample, 'res')} |
| className="text-slate-400 hover:text-white transition-colors" |
| > |
| {copiedCode === 'res' ? <CheckCircle className="w-4 h-4 text-emerald-400" /> : <Copy className="w-4 h-4" />} |
| </button> |
| </div> |
| <pre className="p-4 text-sm font-mono text-slate-300 overflow-x-auto scrollbar-thin"> |
| {phases[activePhase].responseExample} |
| </pre> |
| </div> |
| </div> |
| |
| {/* cURL */} |
| <div className="card overflow-hidden mt-6"> |
| <div className="flex items-center justify-between px-4 py-3 bg-slate-800/50 border-b border-slate-700"> |
| <div className="flex items-center gap-2"> |
| <Terminal className="w-4 h-4 text-slate-500" /> |
| <span className="text-sm font-medium text-slate-300">cURL Command</span> |
| </div> |
| <button |
| onClick={() => copyToClipboard(curlExample(phases[activePhase]), 'curl')} |
| className="text-slate-400 hover:text-white transition-colors" |
| > |
| {copiedCode === 'curl' ? <CheckCircle className="w-4 h-4 text-emerald-400" /> : <Copy className="w-4 h-4" />} |
| </button> |
| </div> |
| <pre className="p-4 text-sm font-mono text-slate-300 overflow-x-auto scrollbar-thin"> |
| {curlExample(phases[activePhase])} |
| </pre> |
| </div> |
| </div> |
| </section> |
| |
| {/* Quick Start: three numbered step cards, one per phase; badge and label colours are picked from literal class names keyed on item.color */} |
| <section id="quickstart" className="py-16 px-4 bg-slate-900/50"> |
| <div className="max-w-6xl mx-auto"> |
| <div className="flex items-center gap-3 mb-8"> |
| <div className="p-2 bg-amber-500/20 rounded-xl"> |
| <Rocket className="w-5 h-5 text-amber-400" /> |
| </div> |
| <h2 className="text-2xl font-bold">Quick Start</h2> |
| </div> |
| |
| <div className="grid lg:grid-cols-3 gap-6"> |
| {[ |
| { step: 1, title: 'Generate Seeds', desc: 'Create initial Q&A pairs from your topic', color: 'amber', phase: 'Phase 0' }, |
| { step: 2, title: 'Generate Data', desc: 'Augment with LLM Judge quality scoring', color: 'blue', phase: 'Phase 1' }, |
| { step: 3, title: 'Curate Dataset', desc: 'Filter high-quality records (score >= 7)', color: 'purple', phase: 'Phase 2' }, |
| ].map((item) => ( |
| <div key={item.step} className="card card-hover p-6 relative"> |
| <div className={`absolute -top-3 -left-3 w-8 h-8 rounded-full bg-gradient-to-r ${ |
| item.color === 'amber' ? 'from-amber-500 to-orange-500' : |
| item.color === 'blue' ? 'from-blue-500 to-cyan-500' : |
| 'from-purple-500 to-pink-500' |
| } flex items-center justify-center font-bold text-sm`}> |
| {item.step} |
| </div> |
| <div className="pt-2"> |
| <span className={`text-xs font-medium ${ |
| item.color === 'amber' ? 'text-amber-400' : |
| item.color === 'blue' ? 'text-blue-400' : |
| 'text-purple-400' |
| }`}>{item.phase}</span> |
| <h3 className="font-semibold text-lg mt-1">{item.title}</h3> |
| <p className="text-sm text-slate-400 mt-2">{item.desc}</p> |
| </div> |
| </div> |
| ))} |
| </div> |
| </div> |
| </section> |
| |
| {/* Full Script */} |
| <section className="py-16 px-4"> |
| <div className="max-w-6xl mx-auto"> |
| <div className="flex items-center gap-3 mb-8"> |
| <div className="p-2 bg-indigo-500/20 rounded-xl"> |
| <GitBranch className="w-5 h-5 text-indigo-400" /> |
| </div> |
| <h2 className="text-2xl font-bold">Full Pipeline Script</h2> |
| </div> |
| |
| <div className="card overflow-hidden"> |
| <div className="flex items-center justify-between px-4 py-3 bg-slate-800/50 border-b border-slate-700"> |
| <span className="text-sm font-medium text-slate-300">run_pipeline.py</span> |
| <button |
| onClick={() => copyToClipboard(fullPipelineScript, 'script')} |
| className="text-slate-400 hover:text-white transition-colors" |
| > |
| {copiedCode === 'script' ? <CheckCircle className="w-4 h-4 text-emerald-400" /> : <Copy className="w-4 h-4" />} |
| </button> |
| </div> |
| <pre className="p-4 text-sm font-mono text-slate-300 overflow-x-auto scrollbar-thin max-h-96"> |
| {fullPipelineScript} |
| </pre> |
| </div> |
| </div> |
| </section> |
| |
| {/* LLM Judge Criteria */} |
| <section className="py-16 px-4 bg-slate-900/50"> |
| <div className="max-w-6xl mx-auto"> |
| <div className="flex items-center gap-3 mb-8"> |
| <div className="p-2 bg-pink-500/20 rounded-xl"> |
| <Star className="w-5 h-5 text-pink-400" /> |
| </div> |
| <h2 className="text-2xl font-bold">LLM Judge Criteria (UltraFeedback)</h2> |
| </div> |
| |
| <div className="grid sm:grid-cols-2 lg:grid-cols-4 gap-4"> |
| {[ |
| { title: 'Instruction-Following', desc: 'Does the answer directly address the question?', color: 'blue' }, |
| { title: 'Truthfulness', desc: 'Is the information accurate and factually correct?', color: 'green' }, |
| { title: 'Honesty', desc: 'Does it avoid hallucination and acknowledge uncertainty?', color: 'yellow' }, |
| { title: 'Helpfulness', desc: 'Is the answer useful for learning?', color: 'purple' }, |
| ].map((item, i) => ( |
| <div key={i} className="card card-hover p-5"> |
| <div className={`w-10 h-1 rounded-full mb-4 bg-${item.color}-500`}></div> |
| <h3 className="font-semibold mb-2">{item.title}</h3> |
| <p className="text-sm text-slate-400">{item.desc}</p> |
| </div> |
| ))} |
| </div> |
| |
| <div className="card mt-8 p-6 flex flex-col sm:flex-row items-center justify-between gap-4"> |
| <div> |
| <h3 className="font-semibold">Score Range</h3> |
| <p className="text-sm text-slate-400">1-10 scale, recommended minimum: 7 for curated datasets</p> |
| </div> |
| <div className="flex gap-2"> |
| {[1, 2, 3, 4, 5, 6, 7, 8, 9, 10].map((n) => ( |
| <div |
| key={n} |
| className={`w-8 h-8 rounded-lg flex items-center justify-center text-xs font-bold ${ |
| n >= 7 ? 'bg-emerald-500/20 text-emerald-400 border border-emerald-500/30' : 'bg-slate-800 text-slate-500' |
| }`} |
| > |
| {n} |
| </div> |
| ))} |
| </div> |
| </div> |
| </div> |
| </section> |
| |
| {/* Configuration: accordion whose open/closed state lives in expandedSections under the keys 'models', 'providers' and 'env'; all three start collapsed */} |
| <section className="py-16 px-4"> |
| <div className="max-w-6xl mx-auto"> |
| <div className="flex items-center gap-3 mb-8"> |
| <div className="p-2 bg-slate-500/20 rounded-xl"> |
| <Settings className="w-5 h-5 text-slate-400" /> |
| </div> |
| <h2 className="text-2xl font-bold">Configuration</h2> |
| </div> |
| |
| <div className="card divide-y divide-slate-800"> |
| {/* Models: generator model list (with a default marker) and the single judge model */} |
| <div> |
| <button |
| onClick={() => toggleSection('models')} |
| className="w-full flex items-center justify-between p-5 hover:bg-slate-800/30 transition-colors" |
| > |
| <span className="font-medium">Supported Models</span> |
| {expandedSections['models'] ? <ChevronUp className="w-5 h-5 text-slate-400" /> : <ChevronDown className="w-5 h-5 text-slate-400" />} |
| </button> |
| {expandedSections['models'] && ( |
| <div className="px-5 pb-5"> |
| <div className="grid sm:grid-cols-2 gap-6"> |
| <div> |
| <h4 className="text-sm font-medium text-indigo-400 mb-3">Generator Models</h4> |
| <div className="space-y-2"> |
| {[ |
| { name: 'Qwen/Qwen2.5-7B-Instruct', default: true }, |
| { name: 'Qwen/Qwen2.5-72B-Instruct', default: false }, |
| { name: 'openai/gpt-4o-mini', default: false }, |
| ].map((m, i) => ( |
| <div key={i} className="flex items-center gap-2 text-sm"> |
| <CheckCircle className="w-4 h-4 text-emerald-400 flex-shrink-0" /> |
| <code className="font-mono text-slate-300">{m.name}</code> |
| {m.default && <span className="text-xs text-slate-500">(default)</span>} |
| </div> |
| ))} |
| </div> |
| </div> |
| <div> |
| <h4 className="text-sm font-medium text-purple-400 mb-3">Judge Models</h4> |
| <div className="space-y-2"> |
| <div className="flex items-center gap-2 text-sm"> |
| <CheckCircle className="w-4 h-4 text-emerald-400" /> |
| <code className="font-mono text-slate-300">Qwen/Qwen2.5-7B-Instruct</code> |
| <span className="text-xs text-slate-500">(default)</span> |
| </div> |
| </div> |
| </div> |
| </div> |
| </div> |
| )} |
| </div> |
| |
| {/* Providers: inference provider names, each with an optional note */} |
| <div> |
| <button |
| onClick={() => toggleSection('providers')} |
| className="w-full flex items-center justify-between p-5 hover:bg-slate-800/30 transition-colors" |
| > |
| <span className="font-medium">Providers</span> |
| {expandedSections['providers'] ? <ChevronUp className="w-5 h-5 text-slate-400" /> : <ChevronDown className="w-5 h-5 text-slate-400" />} |
| </button> |
| {expandedSections['providers'] && ( |
| <div className="px-5 pb-5"> |
| <div className="space-y-2"> |
| {[ |
| { name: 'together', desc: '(default, recommended)' }, |
| { name: 'cerebras', desc: '' }, |
| { name: 'hf', desc: '(serverless)' }, |
| ].map((p, i) => ( |
| <div key={i} className="flex items-center gap-2 text-sm"> |
| <CheckCircle className="w-4 h-4 text-emerald-400" /> |
| <code className="font-mono text-slate-300">{p.name}</code> |
| {p.desc && <span className="text-xs text-slate-500">{p.desc}</span>} |
| </div> |
| ))} |
| </div> |
| </div> |
| )} |
| </div> |
| |
| {/* Environment: sample environment variable listing shown as preformatted text */} |
| <div> |
| <button |
| onClick={() => toggleSection('env')} |
| className="w-full flex items-center justify-between p-5 hover:bg-slate-800/30 transition-colors" |
| > |
| <span className="font-medium">Environment Variables</span> |
| {expandedSections['env'] ? <ChevronUp className="w-5 h-5 text-slate-400" /> : <ChevronDown className="w-5 h-5 text-slate-400" />} |
| </button> |
| {expandedSections['env'] && ( |
| <div className="px-5 pb-5"> |
| <pre className="code-block text-sm"> |
| {`# Required for all spaces |
| HF_TOKEN=your_huggingface_token_here |
| |
| # Optional |
| MAX_RECORDS=1000 |
| DEFAULT_MODEL=Qwen/Qwen2.5-7B-Instruct |
| DEFAULT_PROVIDER=together`} |
| </pre> |
| </div> |
| )} |
| </div> |
| </div> |
| </div> |
| </section> |
| |
| {/* Footer: brand mark, attribution line, and external links to the HuggingFace profile and the collection (both open in a new tab) */} |
| <footer className="py-12 px-4 border-t border-slate-800"> |
| <div className="max-w-6xl mx-auto text-center"> |
| <div className="flex items-center justify-center gap-2 mb-4"> |
| <div className="p-2 bg-gradient-to-r from-indigo-600 to-purple-600 rounded-xl"> |
| <Database className="w-4 h-4" /> |
| </div> |
| <span className="font-semibold">Synthetic Data Pipeline</span> |
| </div> |
| <p className="text-sm text-slate-500 mb-4"> |
| Powered by HuggingFace Spaces + HF Inference API |
| </p> |
| <div className="flex items-center justify-center gap-4"> |
| <a |
| href="https://huggingface.co/mindchain" |
| target="_blank" |
| rel="noopener noreferrer" |
| className="text-sm text-slate-400 hover:text-white transition-colors" |
| > |
| HuggingFace |
| </a> |
| <span className="text-slate-700">|</span> |
| <a |
| href="https://huggingface.co/collections/mindchain/synthetic-data-pipeline" |
| target="_blank" |
| rel="noopener noreferrer" |
| className="text-sm text-slate-400 hover:text-white transition-colors" |
| > |
| Collection |
| </a> |
| </div> |
| </div> |
| </footer> |
| </div> |
| ) |
| } |
|
|
/**
 * Python source shown (and copied) in the "Full Pipeline Script" section.
 *
 * This is display text only; nothing in the app executes it. Backslashes are
 * doubled so the rendered script contains a literal `\n` inside its f-strings.
 * The script sends every request through one helper that sets a timeout and
 * raises on HTTP errors, so a failing Space reports its status code instead of
 * a JSON decode error.
 */
const fullPipelineScript = `#!/usr/bin/env python3
"""Full Synthetic Data Pipeline - Run all three phases"""

import requests

# HF Spaces URLs
PHASE0_URL = "https://mindchain-hf-space-nemo-seed-generator-huggingface-inference.hf.space"
PHASE1_URL = "https://mindchain-hf-space-distilabel-generator-huggingface-inference.hf.space"
PHASE2_URL = "https://mindchain-hf-space-argilla-curator-huggingface-inference.hf.space"

# Seconds to wait for a Space to answer; raise this for large generation jobs
REQUEST_TIMEOUT = 600


def post_json(url: str, payload: dict) -> dict:
    """POST a JSON payload and return the decoded JSON response."""
    response = requests.post(url, json=payload, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def phase0_generate_seeds(topic: str, num_seeds: int = 10):
    """Phase 0: Generate seed Q&A pairs"""
    print(f"\\n=== Phase 0: Generating {num_seeds} seeds for '{topic}' ===")
    result = post_json(
        f"{PHASE0_URL}/generate",
        {
            "topic": topic,
            "num_seeds": num_seeds,
            "model": "Qwen/Qwen2.5-7B-Instruct",
            "provider": "together"
        }
    )
    print(f"Generated {result.get('count', 0)} seeds")
    return result


def phase1_generate_synthetic(seed_dataset: str, num_records: int = 100):
    """Phase 1: Generate synthetic data with Judge scoring"""
    print(f"\\n=== Phase 1: Generating {num_records} records ===")
    result = post_json(
        f"{PHASE1_URL}/generate",
        {
            "seed_dataset": seed_dataset,
            "num_records": num_records,
            "model": "Qwen/Qwen2.5-7B-Instruct",
            "provider": "together",
            "judge_model": "Qwen/Qwen2.5-7B-Instruct",
            "judge_provider": "together",
            "use_judge": True,
            "min_score": 5
        }
    )
    stats = result.get('stats', {})
    print(f"Generated {stats.get('total', 0)} records, avg score: {stats.get('avg_score', 0):.1f}")
    return result


def phase2_curate(raw_dataset: str, min_score: int = 7):
    """Phase 2: Curate final high-quality dataset"""
    print(f"\\n=== Phase 2: Curating with min_score={min_score} ===")
    result = post_json(
        f"{PHASE2_URL}/curate",
        {
            "raw_dataset": raw_dataset,
            "model": "Qwen/Qwen2.5-7B-Instruct",
            "provider": "together",
            "min_score": min_score
        }
    )
    print(f"Curated {result.get('curated_count', 0)}/{result.get('total_count', 0)} records")
    return result


if __name__ == "__main__":
    # Run complete pipeline
    seeds = phase0_generate_seeds("Python programming basics", num_seeds=10)
    synthetic = phase1_generate_synthetic("mindchain/synthetic-seeds", num_records=100)
    curated = phase2_curate("mindchain/synthetic-distilabel-raw", min_score=7)

    print("\\n=== Pipeline Complete! ===")
    print(f"Final curated dataset: {curated.get('curated_count', 0)} high-quality records")`
|
|
| export default App |
|
|