// Provenance: src/App.jsx, uploaded with huggingface_hub by mindchain (commit d8bc360, verified).
import React, { useState } from 'react'
import {
Database,
Cpu,
Zap,
Filter,
ArrowRight,
Code,
Play,
CheckCircle,
AlertCircle,
BookOpen,
Terminal,
Layers,
GitBranch,
Settings,
ChevronDown,
ChevronUp,
Copy,
ExternalLink,
Sparkles,
Rocket,
Server,
Globe,
Shield,
Star,
Menu,
X
} from 'lucide-react'
function App() {
const [activePhase, setActivePhase] = useState(0)
const [expandedSections, setExpandedSections] = useState({})
const [copiedCode, setCopiedCode] = useState(null)
const [mobileMenuOpen, setMobileMenuOpen] = useState(false)
// Flip the open/closed flag stored under `section` in `expandedSections`.
// Flags for all other sections are copied over unchanged; a section that has
// never been toggled has no entry yet, so its first toggle stores `true`.
const toggleSection = (section) => {
  setExpandedSections((current) => {
    const wasOpen = current[section]
    return { ...current, [section]: !wasOpen }
  })
}
// Copy `code` to the system clipboard and, on success, mark the button
// identified by `id` as "copied" for two seconds.
//
// - `code`: text to place on the clipboard.
// - `id`:   key stored in `copiedCode` so the matching button can show a tick.
//
// The returned promise resolves once the copy attempt has finished; callers
// that use this as a click handler can ignore it.
const copyToClipboard = async (code, id) => {
  try {
    // The async Clipboard API is only exposed in secure contexts, so guard
    // against `navigator.clipboard` being undefined before calling it.
    if (!navigator.clipboard?.writeText) {
      throw new Error('Clipboard API is not available in this context')
    }
    await navigator.clipboard.writeText(code)
  } catch (err) {
    // Do not show the success tick when nothing was actually copied.
    console.error('Failed to copy to clipboard:', err)
    return
  }
  setCopiedCode(id)
  // Only clear the indicator if it still belongs to this button; otherwise a
  // timer from an earlier click would wipe the tick of a more recent copy.
  setTimeout(() => {
    setCopiedCode((current) => (current === id ? null : current))
  }, 2000)
}
// Static description of the three pipeline phases rendered by this page.
// Each entry's `id` equals its index in this array: the component stores the
// selected id in `activePhase` and reads `phases[activePhase]` directly, so the
// two must stay in sync when entries are added or reordered.
const phases = [
{
id: 0,
name: 'Seed Generator',
shortName: 'Phase 0',
icon: <Zap className="w-5 h-5" />,
// "owner/name" Space id; used for the huggingface.co/spaces link and for the
// *.hf.space endpoint text built elsewhere in this component.
space: 'mindchain/hf-space-nemo-seed-generator-huggingface-inference',
description: 'Generates initial seed Q&A pairs from a topic using LLM inference.',
// `color` is interpolated into a Tailwind class name by the phase selector;
// `gradient` / `bgGradient` are appended after `bg-gradient-to-r`.
color: 'amber',
gradient: 'from-amber-500 to-orange-600',
bgGradient: 'from-amber-500/10 to-orange-500/10',
endpoint: '/generate',
method: 'POST',
features: ['Topic-based generation', 'Configurable seed count', 'Quality pre-filtering'],
requestParams: [
{ name: 'topic', type: 'string', desc: 'Topic for Q&A generation' },
{ name: 'num_seeds', type: 'int', desc: 'Number of seeds to generate (default: 10)' },
{ name: 'model', type: 'string', desc: 'Qwen/Qwen2.5-7B-Instruct' },
{ name: 'provider', type: 'string', desc: 'together (recommended)' },
],
// Shown verbatim inside a <pre>; curlExample flattens its newlines to spaces.
// Everything between the backticks is runtime text, including line breaks.
requestExample: `{
"topic": "Python programming basics",
"num_seeds": 10,
"model": "Qwen/Qwen2.5-7B-Instruct",
"provider": "together"
}`,
// Illustrative response only; shown verbatim inside a <pre>.
responseExample: `{
"success": true,
"seeds": [
{
"instruction": "What is a variable in Python?",
"output": "A variable in Python is a named container...",
"quality_score": 8
}
],
"count": 10
}`
},
{
id: 1,
name: 'Distilabel Generator',
shortName: 'Phase 1',
icon: <Cpu className="w-5 h-5" />,
space: 'mindchain/hf-space-distilabel-generator-huggingface-inference',
description: 'Generates synthetic data with LLM Judge scoring using UltraFeedback criteria.',
color: 'blue',
gradient: 'from-blue-500 to-cyan-500',
bgGradient: 'from-blue-500/10 to-cyan-500/10',
endpoint: '/generate',
method: 'POST',
features: ['UltraFeedback scoring', 'Multi-model support', 'Configurable thresholds'],
// NOTE(review): the request example below also sends `provider` and
// `judge_provider`, which are not listed here — confirm against the Space API.
requestParams: [
{ name: 'seed_dataset', type: 'string', desc: 'HF dataset ID from Phase 0' },
{ name: 'num_records', type: 'int', desc: 'Records to generate (default: 100)' },
{ name: 'model', type: 'string', desc: 'Generator model' },
{ name: 'judge_model', type: 'string', desc: 'Judge model for scoring' },
{ name: 'use_judge', type: 'bool', desc: 'Enable LLM Judge (default: true)' },
{ name: 'min_score', type: 'int', desc: 'Minimum score filter (1-10)' },
],
requestExample: `{
"seed_dataset": "mindchain/synthetic-seeds",
"num_records": 100,
"model": "Qwen/Qwen2.5-7B-Instruct",
"provider": "together",
"judge_model": "Qwen/Qwen2.5-7B-Instruct",
"judge_provider": "together",
"use_judge": true,
"min_score": 5
}`,
// The `{...}` placeholder makes this example illustrative rather than strict JSON.
responseExample: `{
"success": true,
"records": [
{
"instruction": "Explain Python loops...",
"output": "Python loops allow you to...",
"quality_score": 8,
"distilabel_metadata": {...}
}
],
"stats": {
"total": 100,
"avg_score": 7.2,
"filtered": 15
}
}`
},
{
id: 2,
name: 'Argilla Curator',
shortName: 'Phase 2',
icon: <Filter className="w-5 h-5" />,
space: 'mindchain/hf-space-argilla-curator-huggingface-inference',
description: 'Final curation with quality scoring and filtering for high-quality dataset.',
color: 'purple',
gradient: 'from-purple-500 to-pink-500',
bgGradient: 'from-purple-500/10 to-pink-500/10',
// The only phase whose endpoint is not '/generate'.
endpoint: '/curate',
method: 'POST',
features: ['Quality filtering', 'Score distribution', 'Hub push'],
requestParams: [
{ name: 'raw_dataset', type: 'string', desc: 'Raw dataset ID from Phase 1' },
{ name: 'model', type: 'string', desc: 'Judge model for re-scoring' },
{ name: 'provider', type: 'string', desc: 'Inference provider' },
{ name: 'min_score', type: 'int', desc: 'Minimum score (default: 7)' },
{ name: 'target_dataset', type: 'string', desc: 'Output dataset ID (optional)' },
],
requestExample: `{
"raw_dataset": "mindchain/synthetic-distilabel-raw",
"model": "Qwen/Qwen2.5-7B-Instruct",
"provider": "together",
"min_score": 7,
"target_dataset": "mindchain/synthetic-curated"
}`,
responseExample: `{
"success": true,
"curated_count": 78,
"total_count": 100,
"filtered_count": 22,
"score_distribution": {
"7": 25, "8": 30, "9": 15, "10": 8
}
}`
}
]
// Build a copy-pasteable, three-line cURL command for one pipeline phase.
//
// - `phase.space`:          Space id in "owner/name" form.
// - `phase.endpoint`:       path appended to the Space host (e.g. "/generate").
// - `phase.method`:         HTTP verb; falls back to POST when absent.
// - `phase.requestExample`: JSON body text, possibly spanning several lines.
//
// Returns the command as a single string with backslash line continuations.
const curlExample = (phase) => {
  // A Space id contains a slash, which is not valid inside a host name. The
  // running app is served from "owner-name.hf.space", so swap "/" for "-".
  const host = `${phase.space.replace(/\//g, '-')}.hf.space`

  // Flatten the body onto one line, then escape single quotes so the payload
  // cannot terminate the surrounding '...' shell quoting early.
  const payload = phase.requestExample
    .replace(/\n/g, ' ')
    .replace(/'/g, `'\\''`)

  return [
    `curl -X ${phase.method ?? 'POST'} https://${host}${phase.endpoint} \\`,
    '-H "Content-Type: application/json" \\',
    `-d '${payload}'`,
  ].join('\n')
}
// Smooth-scroll to the element whose DOM id is `id` (a no-op when no such
// element exists), then close the mobile menu in either case.
const scrollToSection = (id) => {
  const target = document.getElementById(id)
  if (target) {
    target.scrollIntoView({ behavior: 'smooth' })
  }
  setMobileMenuOpen(false)
}
return (
<div className="min-h-screen bg-slate-950 text-slate-100">
{/* Navigation */}
<nav className="fixed top-0 left-0 right-0 z-50 bg-slate-950/80 backdrop-blur-xl border-b border-slate-800/50">
<div className="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
<div className="flex items-center justify-between h-16">
<div className="flex items-center gap-3">
<div className="relative">
<div className="absolute inset-0 bg-gradient-to-r from-indigo-600 to-purple-600 rounded-xl blur-lg opacity-50"></div>
<div className="relative p-2 bg-gradient-to-r from-indigo-600 to-purple-600 rounded-xl">
<Database className="w-5 h-5" />
</div>
</div>
<div>
<h1 className="font-bold text-lg leading-tight">Synthetic Pipeline</h1>
<p className="text-xs text-slate-500">HF Spaces Documentation</p>
</div>
</div>
{/* Desktop Nav */}
<div className="hidden md:flex items-center gap-6">
<button onClick={() => scrollToSection('overview')} className="text-sm text-slate-400 hover:text-white transition-colors">Overview</button>
<button onClick={() => scrollToSection('pipeline')} className="text-sm text-slate-400 hover:text-white transition-colors">Pipeline</button>
<button onClick={() => scrollToSection('api')} className="text-sm text-slate-400 hover:text-white transition-colors">API</button>
<button onClick={() => scrollToSection('quickstart')} className="text-sm text-slate-400 hover:text-white transition-colors">Quick Start</button>
<a
href="https://huggingface.co/collections/mindchain/synthetic-data-pipeline"
target="_blank"
rel="noopener noreferrer"
className="flex items-center gap-2 px-4 py-2 bg-slate-800 hover:bg-slate-700 rounded-lg text-sm font-medium transition-colors"
>
<Globe className="w-4 h-4" />
HF Collection
</a>
</div>
{/* Mobile menu button */}
<button
onClick={() => setMobileMenuOpen(!mobileMenuOpen)}
className="md:hidden p-2 text-slate-400 hover:text-white"
>
{mobileMenuOpen ? <X className="w-5 h-5" /> : <Menu className="w-5 h-5" />}
</button>
</div>
</div>
{/* Mobile Nav */}
{mobileMenuOpen && (
<div className="md:hidden bg-slate-900 border-b border-slate-800">
<div className="px-4 py-3 space-y-2">
<button onClick={() => scrollToSection('overview')} className="block w-full text-left px-3 py-2 text-slate-400 hover:text-white hover:bg-slate-800 rounded-lg">Overview</button>
<button onClick={() => scrollToSection('pipeline')} className="block w-full text-left px-3 py-2 text-slate-400 hover:text-white hover:bg-slate-800 rounded-lg">Pipeline</button>
<button onClick={() => scrollToSection('api')} className="block w-full text-left px-3 py-2 text-slate-400 hover:text-white hover:bg-slate-800 rounded-lg">API</button>
<button onClick={() => scrollToSection('quickstart')} className="block w-full text-left px-3 py-2 text-slate-400 hover:text-white hover:bg-slate-800 rounded-lg">Quick Start</button>
</div>
</div>
)}
</nav>
{/* Hero Section */}
<section className="pt-32 pb-20 px-4 relative overflow-hidden">
{/* Background Effects */}
<div className="absolute inset-0 overflow-hidden">
<div className="absolute top-1/4 -left-1/4 w-96 h-96 bg-indigo-600/20 rounded-full blur-3xl"></div>
<div className="absolute top-1/3 -right-1/4 w-96 h-96 bg-purple-600/20 rounded-full blur-3xl"></div>
<div className="absolute bottom-0 left-1/2 -translate-x-1/2 w-full h-1/2 bg-gradient-to-t from-slate-950 to-transparent"></div>
</div>
<div className="max-w-4xl mx-auto text-center relative">
<div className="inline-flex items-center gap-2 px-4 py-2 bg-slate-800/50 border border-slate-700 rounded-full text-sm text-slate-300 mb-6">
<Sparkles className="w-4 h-4 text-amber-400" />
<span>HuggingFace Spaces + HF Inference API</span>
</div>
<h1 className="text-4xl sm:text-5xl lg:text-6xl font-bold mb-6">
<span className="bg-gradient-to-r from-white via-slate-200 to-slate-400 bg-clip-text text-transparent">
Synthetic Data Pipeline
</span>
</h1>
<p className="text-lg sm:text-xl text-slate-400 max-w-2xl mx-auto mb-10">
Generate high-quality synthetic Q&A datasets with a 3-phase pipeline.
CPU-only, no GPU required.
</p>
<div className="flex flex-wrap items-center justify-center gap-4">
<button
onClick={() => scrollToSection('quickstart')}
className="btn btn-primary flex items-center gap-2"
>
<Rocket className="w-4 h-4" />
Get Started
</button>
<a
href="https://huggingface.co/collections/mindchain/synthetic-data-pipeline"
target="_blank"
rel="noopener noreferrer"
className="btn btn-secondary flex items-center gap-2"
>
<ExternalLink className="w-4 h-4" />
View Spaces
</a>
</div>
</div>
</section>
{/* Features Grid */}
<section className="py-12 px-4">
<div className="max-w-6xl mx-auto">
<div className="grid grid-cols-2 md:grid-cols-4 gap-4">
{[
{ icon: <Server className="w-5 h-5" />, label: 'CPU Only', desc: 'No GPU required' },
{ icon: <Zap className="w-5 h-5" />, label: 'Always On', desc: 'Docker Spaces' },
{ icon: <Shield className="w-5 h-5" />, label: 'Quality Scored', desc: 'LLM Judge' },
{ icon: <Globe className="w-5 h-5" />, label: 'HF Hub', desc: 'Dataset push' },
].map((feature, i) => (
<div key={i} className="card card-hover p-4 text-center">
<div className="inline-flex p-3 bg-slate-800 rounded-xl mb-3 text-indigo-400">
{feature.icon}
</div>
<h3 className="font-semibold text-sm">{feature.label}</h3>
<p className="text-xs text-slate-500 mt-1">{feature.desc}</p>
</div>
))}
</div>
</div>
</section>
{/* Overview Section */}
<section id="overview" className="py-16 px-4">
<div className="max-w-6xl mx-auto">
<div className="flex items-center gap-3 mb-8">
<div className="p-2 bg-indigo-500/20 rounded-xl">
<BookOpen className="w-5 h-5 text-indigo-400" />
</div>
<h2 className="text-2xl font-bold">Overview</h2>
</div>
<div className="card p-6 lg:p-8">
<p className="text-slate-300 text-lg leading-relaxed mb-8">
This pipeline generates high-quality synthetic Q&A datasets using HuggingFace Spaces
and the HF Inference API. It follows a three-phase approach with LLM Judge scoring
based on UltraFeedback criteria.
</p>
<div className="grid md:grid-cols-3 gap-6">
{[
{ title: 'Seed Generation', desc: 'Create initial Q&A pairs from any topic', icon: <Zap className="w-5 h-5" />, color: 'amber' },
{ title: 'Distilabel Augmentation', desc: 'Generate with LLM Judge scoring', icon: <Cpu className="w-5 h-5" />, color: 'blue' },
{ title: 'Quality Curation', desc: 'Filter for high-quality records only', icon: <Filter className="w-5 h-5" />, color: 'purple' },
].map((item, i) => (
<div key={i} className="relative group">
<div className={`absolute inset-0 bg-gradient-to-r from-${item.color}-500/20 to-${item.color}-600/20 rounded-2xl blur-xl opacity-0 group-hover:opacity-100 transition-opacity`}></div>
<div className="relative bg-slate-800/50 border border-slate-700 rounded-2xl p-5 group-hover:border-slate-600 transition-colors">
<div className={`inline-flex p-2 rounded-xl bg-${item.color}-500/20 text-${item.color}-400 mb-3`}>
{item.icon}
</div>
<h3 className="font-semibold mb-1">{item.title}</h3>
<p className="text-sm text-slate-400">{item.desc}</p>
</div>
</div>
))}
</div>
</div>
</div>
</section>
{/* Pipeline Architecture */}
<section id="pipeline" className="py-16 px-4 bg-slate-900/50">
<div className="max-w-6xl mx-auto">
<div className="flex items-center gap-3 mb-8">
<div className="p-2 bg-purple-500/20 rounded-xl">
<Layers className="w-5 h-5 text-purple-400" />
</div>
<h2 className="text-2xl font-bold">Pipeline Architecture</h2>
</div>
{/* Phase Selector */}
<div className="flex flex-col lg:flex-row gap-8">
<div className="lg:w-1/3 space-y-3">
{phases.map((phase) => (
<button
key={phase.id}
onClick={() => setActivePhase(phase.id)}
className={`w-full text-left p-4 rounded-xl border-2 transition-all ${
activePhase === phase.id
? `border-${phase.color}-500 bg-gradient-to-r ${phase.bgGradient}`
: 'border-slate-700 hover:border-slate-600 bg-slate-800/30'
}`}
>
<div className="flex items-center gap-3">
<div className={`p-2 rounded-lg bg-gradient-to-r ${phase.gradient}`}>
{phase.icon}
</div>
<div>
<div className="text-xs text-slate-500 mb-0.5">{phase.shortName}</div>
<div className="font-medium">{phase.name}</div>
</div>
</div>
</button>
))}
</div>
{/* Phase Details */}
<div className="lg:w-2/3">
<div className="card overflow-hidden">
<div className={`p-6 bg-gradient-to-r ${phases[activePhase].gradient}`}>
<div className="flex items-center gap-3 mb-2">
{phases[activePhase].icon}
<h3 className="text-xl font-bold">{phases[activePhase].name}</h3>
</div>
<p className="text-white/80">{phases[activePhase].description}</p>
</div>
<div className="p-6 space-y-6">
{/* Features */}
<div className="flex flex-wrap gap-2">
{phases[activePhase].features.map((f, i) => (
<span key={i} className="px-3 py-1 bg-slate-800 border border-slate-700 rounded-full text-xs">
{f}
</span>
))}
</div>
{/* Space Link */}
<div className="flex items-center gap-2">
<ExternalLink className="w-4 h-4 text-slate-500" />
<a
href={`https://huggingface.co/spaces/${phases[activePhase].space}`}
target="_blank"
rel="noopener noreferrer"
className="text-sm text-indigo-400 hover:text-indigo-300 font-mono"
>
{phases[activePhase].space}
</a>
</div>
{/* Endpoint */}
<div>
<div className="flex items-center gap-2 mb-2">
<Terminal className="w-4 h-4 text-slate-500" />
<span className="text-xs text-slate-500 uppercase tracking-wider">Endpoint</span>
<span className="px-2 py-0.5 bg-emerald-500/20 text-emerald-400 text-xs font-medium rounded">
{phases[activePhase].method}
</span>
</div>
<code className="code-block text-slate-300 text-xs sm:text-sm">
https://{phases[activePhase].space}.hf.space{phases[activePhase].endpoint}
</code>
</div>
{/* Parameters */}
<div>
<h4 className="text-xs text-slate-500 uppercase tracking-wider mb-3">Parameters</h4>
<div className="space-y-2">
{phases[activePhase].requestParams.map((param, i) => (
<div key={i} className="flex flex-wrap items-center gap-2 text-sm">
<code className="px-2 py-1 bg-slate-800 rounded text-indigo-300 font-mono">{param.name}</code>
<span className="text-slate-500 text-xs">{param.type}</span>
<span className="text-slate-400">{param.desc}</span>
</div>
))}
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
{/* API Examples Section */}
<section id="api" className="py-16 px-4">
<div className="max-w-6xl mx-auto">
<div className="flex items-center gap-3 mb-8">
<div className="p-2 bg-emerald-500/20 rounded-xl">
<Code className="w-5 h-5 text-emerald-400" />
</div>
<h2 className="text-2xl font-bold">API Examples</h2>
</div>
{/* Phase Tabs */}
<div className="flex gap-2 mb-6 overflow-x-auto pb-2">
{phases.map((phase) => (
<button
key={phase.id}
onClick={() => setActivePhase(phase.id)}
className={`flex items-center gap-2 px-4 py-2 rounded-lg font-medium text-sm whitespace-nowrap transition-colors ${
activePhase === phase.id
? `bg-gradient-to-r ${phase.gradient} text-white`
: 'bg-slate-800 text-slate-400 hover:text-white'
}`}
>
{phase.icon}
{phase.shortName}
</button>
))}
</div>
<div className="grid lg:grid-cols-2 gap-6">
{/* Request */}
<div className="card overflow-hidden">
<div className="flex items-center justify-between px-4 py-3 bg-slate-800/50 border-b border-slate-700">
<span className="text-sm font-medium text-slate-300">Request Body</span>
<button
onClick={() => copyToClipboard(phases[activePhase].requestExample, 'req')}
className="text-slate-400 hover:text-white transition-colors"
>
{copiedCode === 'req' ? <CheckCircle className="w-4 h-4 text-emerald-400" /> : <Copy className="w-4 h-4" />}
</button>
</div>
<pre className="p-4 text-sm font-mono text-slate-300 overflow-x-auto scrollbar-thin">
{phases[activePhase].requestExample}
</pre>
</div>
{/* Response */}
<div className="card overflow-hidden">
<div className="flex items-center justify-between px-4 py-3 bg-slate-800/50 border-b border-slate-700">
<span className="text-sm font-medium text-slate-300">Response</span>
<button
onClick={() => copyToClipboard(phases[activePhase].responseExample, 'res')}
className="text-slate-400 hover:text-white transition-colors"
>
{copiedCode === 'res' ? <CheckCircle className="w-4 h-4 text-emerald-400" /> : <Copy className="w-4 h-4" />}
</button>
</div>
<pre className="p-4 text-sm font-mono text-slate-300 overflow-x-auto scrollbar-thin">
{phases[activePhase].responseExample}
</pre>
</div>
</div>
{/* cURL */}
<div className="card overflow-hidden mt-6">
<div className="flex items-center justify-between px-4 py-3 bg-slate-800/50 border-b border-slate-700">
<div className="flex items-center gap-2">
<Terminal className="w-4 h-4 text-slate-500" />
<span className="text-sm font-medium text-slate-300">cURL Command</span>
</div>
<button
onClick={() => copyToClipboard(curlExample(phases[activePhase]), 'curl')}
className="text-slate-400 hover:text-white transition-colors"
>
{copiedCode === 'curl' ? <CheckCircle className="w-4 h-4 text-emerald-400" /> : <Copy className="w-4 h-4" />}
</button>
</div>
<pre className="p-4 text-sm font-mono text-slate-300 overflow-x-auto scrollbar-thin">
{curlExample(phases[activePhase])}
</pre>
</div>
</div>
</section>
{/* Quick Start */}
<section id="quickstart" className="py-16 px-4 bg-slate-900/50">
<div className="max-w-6xl mx-auto">
<div className="flex items-center gap-3 mb-8">
<div className="p-2 bg-amber-500/20 rounded-xl">
<Rocket className="w-5 h-5 text-amber-400" />
</div>
<h2 className="text-2xl font-bold">Quick Start</h2>
</div>
<div className="grid lg:grid-cols-3 gap-6">
{[
{ step: 1, title: 'Generate Seeds', desc: 'Create initial Q&A pairs from your topic', color: 'amber', phase: 'Phase 0' },
{ step: 2, title: 'Generate Data', desc: 'Augment with LLM Judge quality scoring', color: 'blue', phase: 'Phase 1' },
{ step: 3, title: 'Curate Dataset', desc: 'Filter high-quality records (score >= 7)', color: 'purple', phase: 'Phase 2' },
].map((item) => (
<div key={item.step} className="card card-hover p-6 relative">
<div className={`absolute -top-3 -left-3 w-8 h-8 rounded-full bg-gradient-to-r ${
item.color === 'amber' ? 'from-amber-500 to-orange-500' :
item.color === 'blue' ? 'from-blue-500 to-cyan-500' :
'from-purple-500 to-pink-500'
} flex items-center justify-center font-bold text-sm`}>
{item.step}
</div>
<div className="pt-2">
<span className={`text-xs font-medium ${
item.color === 'amber' ? 'text-amber-400' :
item.color === 'blue' ? 'text-blue-400' :
'text-purple-400'
}`}>{item.phase}</span>
<h3 className="font-semibold text-lg mt-1">{item.title}</h3>
<p className="text-sm text-slate-400 mt-2">{item.desc}</p>
</div>
</div>
))}
</div>
</div>
</section>
{/* Full Script */}
<section className="py-16 px-4">
<div className="max-w-6xl mx-auto">
<div className="flex items-center gap-3 mb-8">
<div className="p-2 bg-indigo-500/20 rounded-xl">
<GitBranch className="w-5 h-5 text-indigo-400" />
</div>
<h2 className="text-2xl font-bold">Full Pipeline Script</h2>
</div>
<div className="card overflow-hidden">
<div className="flex items-center justify-between px-4 py-3 bg-slate-800/50 border-b border-slate-700">
<span className="text-sm font-medium text-slate-300">run_pipeline.py</span>
<button
onClick={() => copyToClipboard(fullPipelineScript, 'script')}
className="text-slate-400 hover:text-white transition-colors"
>
{copiedCode === 'script' ? <CheckCircle className="w-4 h-4 text-emerald-400" /> : <Copy className="w-4 h-4" />}
</button>
</div>
<pre className="p-4 text-sm font-mono text-slate-300 overflow-x-auto scrollbar-thin max-h-96">
{fullPipelineScript}
</pre>
</div>
</div>
</section>
{/* LLM Judge Criteria */}
<section className="py-16 px-4 bg-slate-900/50">
<div className="max-w-6xl mx-auto">
<div className="flex items-center gap-3 mb-8">
<div className="p-2 bg-pink-500/20 rounded-xl">
<Star className="w-5 h-5 text-pink-400" />
</div>
<h2 className="text-2xl font-bold">LLM Judge Criteria (UltraFeedback)</h2>
</div>
<div className="grid sm:grid-cols-2 lg:grid-cols-4 gap-4">
{[
{ title: 'Instruction-Following', desc: 'Does the answer directly address the question?', color: 'blue' },
{ title: 'Truthfulness', desc: 'Is the information accurate and factually correct?', color: 'green' },
{ title: 'Honesty', desc: 'Does it avoid hallucination and acknowledge uncertainty?', color: 'yellow' },
{ title: 'Helpfulness', desc: 'Is the answer useful for learning?', color: 'purple' },
].map((item, i) => (
<div key={i} className="card card-hover p-5">
<div className={`w-10 h-1 rounded-full mb-4 bg-${item.color}-500`}></div>
<h3 className="font-semibold mb-2">{item.title}</h3>
<p className="text-sm text-slate-400">{item.desc}</p>
</div>
))}
</div>
<div className="card mt-8 p-6 flex flex-col sm:flex-row items-center justify-between gap-4">
<div>
<h3 className="font-semibold">Score Range</h3>
<p className="text-sm text-slate-400">1-10 scale, recommended minimum: 7 for curated datasets</p>
</div>
<div className="flex gap-2">
{[1, 2, 3, 4, 5, 6, 7, 8, 9, 10].map((n) => (
<div
key={n}
className={`w-8 h-8 rounded-lg flex items-center justify-center text-xs font-bold ${
n >= 7 ? 'bg-emerald-500/20 text-emerald-400 border border-emerald-500/30' : 'bg-slate-800 text-slate-500'
}`}
>
{n}
</div>
))}
</div>
</div>
</div>
</section>
{/* Configuration */}
<section className="py-16 px-4">
<div className="max-w-6xl mx-auto">
<div className="flex items-center gap-3 mb-8">
<div className="p-2 bg-slate-500/20 rounded-xl">
<Settings className="w-5 h-5 text-slate-400" />
</div>
<h2 className="text-2xl font-bold">Configuration</h2>
</div>
<div className="card divide-y divide-slate-800">
{/* Models */}
<div>
<button
onClick={() => toggleSection('models')}
className="w-full flex items-center justify-between p-5 hover:bg-slate-800/30 transition-colors"
>
<span className="font-medium">Supported Models</span>
{expandedSections['models'] ? <ChevronUp className="w-5 h-5 text-slate-400" /> : <ChevronDown className="w-5 h-5 text-slate-400" />}
</button>
{expandedSections['models'] && (
<div className="px-5 pb-5">
<div className="grid sm:grid-cols-2 gap-6">
<div>
<h4 className="text-sm font-medium text-indigo-400 mb-3">Generator Models</h4>
<div className="space-y-2">
{[
{ name: 'Qwen/Qwen2.5-7B-Instruct', default: true },
{ name: 'Qwen/Qwen2.5-72B-Instruct', default: false },
{ name: 'openai/gpt-4o-mini', default: false },
].map((m, i) => (
<div key={i} className="flex items-center gap-2 text-sm">
<CheckCircle className="w-4 h-4 text-emerald-400 flex-shrink-0" />
<code className="font-mono text-slate-300">{m.name}</code>
{m.default && <span className="text-xs text-slate-500">(default)</span>}
</div>
))}
</div>
</div>
<div>
<h4 className="text-sm font-medium text-purple-400 mb-3">Judge Models</h4>
<div className="space-y-2">
<div className="flex items-center gap-2 text-sm">
<CheckCircle className="w-4 h-4 text-emerald-400" />
<code className="font-mono text-slate-300">Qwen/Qwen2.5-7B-Instruct</code>
<span className="text-xs text-slate-500">(default)</span>
</div>
</div>
</div>
</div>
</div>
)}
</div>
{/* Providers */}
<div>
<button
onClick={() => toggleSection('providers')}
className="w-full flex items-center justify-between p-5 hover:bg-slate-800/30 transition-colors"
>
<span className="font-medium">Providers</span>
{expandedSections['providers'] ? <ChevronUp className="w-5 h-5 text-slate-400" /> : <ChevronDown className="w-5 h-5 text-slate-400" />}
</button>
{expandedSections['providers'] && (
<div className="px-5 pb-5">
<div className="space-y-2">
{[
{ name: 'together', desc: '(default, recommended)' },
{ name: 'cerebras', desc: '' },
{ name: 'hf', desc: '(serverless)' },
].map((p, i) => (
<div key={i} className="flex items-center gap-2 text-sm">
<CheckCircle className="w-4 h-4 text-emerald-400" />
<code className="font-mono text-slate-300">{p.name}</code>
{p.desc && <span className="text-xs text-slate-500">{p.desc}</span>}
</div>
))}
</div>
</div>
)}
</div>
{/* Environment */}
<div>
<button
onClick={() => toggleSection('env')}
className="w-full flex items-center justify-between p-5 hover:bg-slate-800/30 transition-colors"
>
<span className="font-medium">Environment Variables</span>
{expandedSections['env'] ? <ChevronUp className="w-5 h-5 text-slate-400" /> : <ChevronDown className="w-5 h-5 text-slate-400" />}
</button>
{expandedSections['env'] && (
<div className="px-5 pb-5">
<pre className="code-block text-sm">
{`# Required for all spaces
HF_TOKEN=your_huggingface_token_here
# Optional
MAX_RECORDS=1000
DEFAULT_MODEL=Qwen/Qwen2.5-7B-Instruct
DEFAULT_PROVIDER=together`}
</pre>
</div>
)}
</div>
</div>
</div>
</section>
{/* Footer */}
<footer className="py-12 px-4 border-t border-slate-800">
<div className="max-w-6xl mx-auto text-center">
<div className="flex items-center justify-center gap-2 mb-4">
<div className="p-2 bg-gradient-to-r from-indigo-600 to-purple-600 rounded-xl">
<Database className="w-4 h-4" />
</div>
<span className="font-semibold">Synthetic Data Pipeline</span>
</div>
<p className="text-sm text-slate-500 mb-4">
Powered by HuggingFace Spaces + HF Inference API
</p>
<div className="flex items-center justify-center gap-4">
<a
href="https://huggingface.co/mindchain"
target="_blank"
rel="noopener noreferrer"
className="text-sm text-slate-400 hover:text-white transition-colors"
>
HuggingFace
</a>
<span className="text-slate-700">|</span>
<a
href="https://huggingface.co/collections/mindchain/synthetic-data-pipeline"
target="_blank"
rel="noopener noreferrer"
className="text-sm text-slate-400 hover:text-white transition-colors"
>
Collection
</a>
</div>
</div>
</footer>
</div>
)
}
const fullPipelineScript = `#!/usr/bin/env python3
"""Full Synthetic Data Pipeline - Run all three phases"""
import requests
import json
# HF Spaces URLs
PHASE0_URL = "https://mindchain-hf-space-nemo-seed-generator-huggingface-inference.hf.space"
PHASE1_URL = "https://mindchain-hf-space-distilabel-generator-huggingface-inference.hf.space"
PHASE2_URL = "https://mindchain-hf-space-argilla-curator-huggingface-inference.hf.space"
def phase0_generate_seeds(topic: str, num_seeds: int = 10):
"""Phase 0: Generate seed Q&A pairs"""
print(f"\\n=== Phase 0: Generating {num_seeds} seeds for '{topic}' ===")
response = requests.post(
f"{PHASE0_URL}/generate",
json={
"topic": topic,
"num_seeds": num_seeds,
"model": "Qwen/Qwen2.5-7B-Instruct",
"provider": "together"
}
)
result = response.json()
print(f"Generated {result.get('count', 0)} seeds")
return result
def phase1_generate_synthetic(seed_dataset: str, num_records: int = 100):
"""Phase 1: Generate synthetic data with Judge scoring"""
print(f"\\n=== Phase 1: Generating {num_records} records ===")
response = requests.post(
f"{PHASE1_URL}/generate",
json={
"seed_dataset": seed_dataset,
"num_records": num_records,
"model": "Qwen/Qwen2.5-7B-Instruct",
"provider": "together",
"judge_model": "Qwen/Qwen2.5-7B-Instruct",
"judge_provider": "together",
"use_judge": True,
"min_score": 5
}
)
result = response.json()
stats = result.get('stats', {})
print(f"Generated {stats.get('total', 0)} records, avg score: {stats.get('avg_score', 0):.1f}")
return result
def phase2_curate(raw_dataset: str, min_score: int = 7):
"""Phase 2: Curate final high-quality dataset"""
print(f"\\n=== Phase 2: Curating with min_score={min_score} ===")
response = requests.post(
f"{PHASE2_URL}/curate",
json={
"raw_dataset": raw_dataset,
"model": "Qwen/Qwen2.5-7B-Instruct",
"provider": "together",
"min_score": min_score
}
)
result = response.json()
print(f"Curated {result.get('curated_count', 0)}/{result.get('total_count', 0)} records")
return result
if __name__ == "__main__":
# Run complete pipeline
seeds = phase0_generate_seeds("Python programming basics", num_seeds=10)
synthetic = phase1_generate_synthetic("mindchain/synthetic-seeds", num_records=100)
curated = phase2_curate("mindchain/synthetic-distilabel-raw", min_score=7)
print("\\n=== Pipeline Complete! ===")
print(f"Final curated dataset: {curated.get('curated_count', 0)} high-quality records")`
export default App