import { useState, useEffect, useRef, Fragment } from "react";
// Base URL of the backend that serves the /augment and /filter endpoints.
const API_BASE = "https://jaaccaa-data-augmentation.hf.space";
// Fonts used in the application UI (a CSS @import for the Google Fonts stylesheet).
const FONTS = [
  "@import url('https://fonts.googleapis.com/css2",
  "?family=JetBrains+Mono:wght@300;400;500;700",
  "&family=Syne:wght@400;600;700;800",
  "&display=swap');",
].join("");
// ── Demo Samples ──────────────────────────────────────────────────────────────
// A set of sentences from the PolEmo2.0 corpus reflecting a long-tail distribution
// Each row is [text, label, count]; ids are assigned 1..n in listing order.
const SAMPLE_SENTENCES = [
  ["Produkt jest bardzo dobry i polecam go wszystkim.", "pozytywna", 142],
  ["Obsługa klienta była fatalna i nieprofesjonalna.", "negatywna", 8],
  ["Dostawa przyszła na czas, jestem zadowolony.", "pozytywna", 134],
  ["Jakość wykonania pozostawia wiele do życzenia.", "negatywna", 11],
  ["Nie mam zdania na temat tego produktu.", "neutralna", 6],
  ["Cena jest adekwatna do jakości oferowanego towaru.", "neutralna", 9],
].map(([text, label, count], position) => ({ id: position + 1, text, label, count }));
// ── Augmentation Methods Definitions ──────────────────────────────────────────
// Keyed by the method code sent to the backend; each row is
// [key, label, color, lib, description]. Listing order is the key order.
const AUG_METHODS = Object.fromEntries(
  [
    [
      "EDA",
      "EDA (Lexical Rules)",
      "#4ade80",
      "NLPAug + HerBERT",
      "Token-level perturbations: synonym replacement, random insertion, and deletion. Low computational overhead, high throughput.",
    ],
    [
      "BT",
      "Back-Translation",
      "#60a5fa",
      "deep-translator (Google)",
      "Round-trip translation (PL → [EN, DE, CS] → PL). Leverages multilingual embeddings to break syntactic patterns and bypass pivot-language bias.",
    ],
    [
      "LLM",
      "Generative LLM",
      "#f472b6",
      "Groq Cloud (Llama 3)",
      "Advanced paraphrasing based on prompt instructions for Large Language Models. Highest semantic quality powered by ultra-fast LPU inference.",
    ],
  ].map(([key, label, color, lib, description]) => [key, { label, color, lib, description }])
);
// ── Helper Components ─────────────────────────────────────────────────────────
/**
 * Displays a single labelled metric value.
 *
 * A numeric `value` is shown with one decimal place; any other value is shown
 * as given. `unit` (default "%") is appended directly after the value.
 *
 * NOTE(review): `color` is accepted but not referenced in the visible body, and
 * the returned expression has no element markup around it — the JSX tags appear
 * to be missing from this copy of the file. Confirm against the original.
 */
function MetricBar({ label, value, color, unit = "%" }) {
return (
{label}
{typeof value === "number" ? value.toFixed(1) : value}{unit}
);
}
/**
 * Displays a sentiment class label.
 *
 * NOTE(review): `colors` maps the three Polish class labels to hex colours but
 * is not referenced in the visible body — presumably it styled markup that is
 * missing from this copy of the file. Confirm against the original.
 */
function ClassBadge({ label }) {
const colors = { pozytywna: "#4ade80", negatywna: "#f87171", neutralna: "#fbbf24" };
return (
{label}
);
}
/**
 * Displays a pipeline step marker: a check mark once the step is `done`,
 * otherwise the `step` value itself.
 *
 * NOTE(review): `active` is accepted but not referenced in the visible body —
 * presumably it styled markup that is missing from this copy of the file.
 */
function StepBadge({ step, active, done }) {
return (
{done ? "✓" : step}
);
}
// ── Main Application ──────────────────────────────────────────────────────────
export default function App() {
// Which tab is shown; the render compares it against "pipeline", "arch" and "tech".
const [activeTab, setActiveTab] = useState("pipeline");
// Pipeline progress: 0 = idle, 1-4 = stage in progress, 5 = run finished.
const [pipelineStep, setPipelineStep] = useState(0);
// Sample fed into the pipeline; defaults to the second demo sentence.
const [selectedSentence, setSelectedSentence] = useState(SAMPLE_SENTENCES[1]);
// Key into AUG_METHODS, also sent to the backend as `method`.
const [selectedMethod, setSelectedMethod] = useState("LLM");
// Paraphrase returned by the /augment call (null until one is available).
const [augmented, setAugmented] = useState(null);
// Similarity score returned by the /filter call (null until one is available).
const [similarity, setSimilarity] = useState(null);
// Accept/reject flag returned by the /filter call (null until one is available).
const [filtered, setFiltered] = useState(null);
// Log entries of shape { msg, color, ts }, appended by addLog.
const [logs, setLogs] = useState([]);
// Evaluation figures produced at the end of a run (null until then).
const [metrics, setMetrics] = useState(null);
// True while runPipeline is executing; guards against re-entry.
const [running, setRunning] = useState(false);
// Back-translation pivot result of shape { lang, text } (null otherwise).
const [intermediate, setIntermediate] = useState(null);
// Hyperparameters forwarded to the backend: `pivot_lang`, `eda_p` and `threshold`.
const [selectedPivot, setSelectedPivot] = useState("en");
const [edaIntensity, setEdaIntensity] = useState(0.15);
const [filterThreshold, setFilterThreshold] = useState(0.80);
// Ref to the log container; the effect below scrolls it to the bottom
// whenever the log list changes.
const logRef = useRef(null);
useEffect(() => {
if (logRef.current) logRef.current.scrollTop = logRef.current.scrollHeight;
}, [logs]);
// Appends one entry to the log state. `type` selects the entry colour
// (info | success | warn | error | accent); `ts` is the HH:MM:SS slice of the
// current ISO (UTC) timestamp, taken when the state updater runs.
const addLog = (msg, type = "info") => {
  const palette = {
    info: "#94a3b8",
    success: "#4ade80",
    warn: "#fbbf24",
    error: "#f87171",
    accent: "#f472b6",
  };
  setLogs((previous) => {
    const stamp = new Date().toISOString().substring(11, 19);
    return previous.concat([{ msg, color: palette[type], ts: stamp }]);
  });
};
// Returns a promise that resolves (with no value) after `ms` milliseconds.
const sleep = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
// Frontend simulation with API integration

/**
 * POSTs `payload` as JSON to `${API_BASE}${path}` and returns the decoded
 * JSON body.
 *
 * @param {string} path - Endpoint path starting with "/".
 * @param {object} payload - Value serialised as the request body.
 * @returns {Promise<any>} The parsed response body.
 * @throws {Error} `Status <code>` when the response is not 2xx; network and
 *   JSON-parse failures propagate from fetch / Response.json.
 */
const postJson = async (path, payload) => {
  const response = await fetch(`${API_BASE}${path}`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });
  if (!response.ok) throw new Error(`Status ${response.status}`);
  return response.json();
};

/**
 * Runs the four-stage demo: data loading (simulated), paraphrase generation
 * (POST /augment), semantic filtering (POST /filter) and fine-tuning
 * (simulated, with randomised metrics). Progress is reported through the
 * state setters and addLog. Does nothing if a run is already in progress.
 *
 * A failed or malformed API response aborts the run after logging the error;
 * the `running` flag is always cleared on exit.
 *
 * @returns {Promise<void>}
 */
const runPipeline = async () => {
  if (running) return;
  setRunning(true);
  try {
    setAugmented(null);
    setSimilarity(null);
    setFiltered(null);
    setMetrics(null);
    setIntermediate(null);
    setLogs([]);

    // 1: Data Loading
    setPipelineStep(1);
    addLog("► Initializing data pipeline...", "accent");
    await sleep(600);
    addLog(` Corpus scanned. Detected ${SAMPLE_SENTENCES.length} defined classes.`, "info");
    const minority = SAMPLE_SENTENCES.filter((s) => s.count < 15);
    addLog(` Imbalance flag: ${minority.length} classes identified as long-tail.`, "warn");
    addLog(` Isolating sample from class: [${selectedSentence.label.toUpperCase()}]`, "success");
    await sleep(500);

    // 2: Paraphrase Generation (API Call)
    setPipelineStep(2);
    addLog(`► Executing module: ${AUG_METHODS[selectedMethod].label}`, "accent");
    await sleep(400);
    addLog(` Inference engine: ${AUG_METHODS[selectedMethod].lib}`, "info");
    let aug;
    try {
      const dataAug = await postJson("/augment", {
        text: selectedSentence.text,
        method: selectedMethod,
        pivot_lang: selectedPivot,
        eda_p: edaIntensity,
      });
      // Without a paraphrase there is nothing to filter or display.
      if (typeof dataAug.augmented !== "string" || dataAug.augmented.trim() === "") {
        throw new Error("Response is missing the 'augmented' text");
      }
      aug = dataAug.augmented;
      if (selectedMethod === "BT" && dataAug.intermediate) {
        // Fall back to the requested pivot if the server does not echo it.
        const pivot = String(dataAug.pivot_lang || selectedPivot).toUpperCase();
        setIntermediate({ lang: pivot, text: dataAug.intermediate });
        addLog(` Pivot vector [${pivot}]: Generated successfully.`, "info");
      }
    } catch (error) {
      addLog(` API CHANNEL FAILURE: No connection to base FastAPI server.`, "error");
      addLog(` Details: ${error.message}`, "error");
      return;
    }
    setAugmented(aug);
    addLog(` Sentence synthesis completed.`, "success");
    await sleep(400);

    // 3: S-BERT Filtration (API Call)
    setPipelineStep(3);
    addLog("► Calculating vector distance (Sentence-BERT)...", "accent");
    // Rounded like the threshold label in the UI, so 0.55 prints as 55.
    const requiredPct = Math.round(filterThreshold * 100);
    let sim;
    let pass;
    try {
      const filterData = await postJson("/filter", {
        original: selectedSentence.text,
        augmented: aug,
        threshold: filterThreshold,
      });
      // The render path calls similarity.toFixed(), so only a finite number
      // may ever reach setSimilarity.
      if (typeof filterData.similarity !== "number" || !Number.isFinite(filterData.similarity)) {
        throw new Error("Response is missing a numeric 'similarity'");
      }
      sim = filterData.similarity;
      pass = typeof filterData.passed === "boolean" ? filterData.passed : sim >= filterThreshold;
    } catch (error) {
      addLog(` FILTER FAILURE: No response from microservice.`, "error");
      addLog(` Details: ${error.message}`, "error");
      return;
    }
    setSimilarity(sim);
    setFiltered(pass);
    if (pass) {
      addLog(` Semantic alignment: ${(sim * 100).toFixed(1)}% (Required: ${requiredPct}%) → ACCEPTED ✓`, "success");
    } else {
      addLog(` Semantic alignment: ${(sim * 100).toFixed(1)}% (Required: ${requiredPct}%) → REJECTED ✗`, "error");
      addLog(" Semantic drift detected. Sample flushed from buffer.", "warn");
    }
    await sleep(500);

    // 4: Training Module (simulated: fixed log lines, randomised metrics)
    setPipelineStep(4);
    addLog("► Initializing Fine-Tuning process for base model...", "accent");
    await sleep(400);
    addLog(" Architecture: allegro/herbert-base-cased", "info");
    addLog(" Optimizer: AdamW, Learning Rate (LR): 2e-5", "info");
    await sleep(900);
    addLog(" Epoch 1/3 — Loss: 0.487", "info");
    await sleep(500);
    addLog(" Epoch 2/3 — Loss: 0.312", "info");
    await sleep(500);
    addLog(" Epoch 3/3 — Loss: 0.241", "info");
    await sleep(400);
    const baseF1 = 61.2;
    const augF1 = pass ? 61.2 + 4 + Math.random() * 5 : 61.2 + 1.5 + Math.random() * 2;
    setMetrics({
      baseF1,
      augF1,
      baseAcc: 74.1,
      augAcc: pass ? 74.1 + 3.5 + Math.random() * 3 : 74.1 + 1 + Math.random() * 2,
      sss: sim * 100,
      samplesAdded: pass ? 1 : 0,
    });
    addLog(` Baseline Evaluation (Macro-F1): ${baseF1.toFixed(1)}%`, "info");
    addLog(` Augmented Evaluation (Macro-F1): ${augF1.toFixed(1)}% (+${(augF1 - baseF1).toFixed(1)}pp) ✓`, "success");
    setPipelineStep(5);
    addLog("■ Stream processing completed.", "accent");
  } finally {
    // Always release the run lock, including on early return or an
    // unexpected throw, so the UI cannot get stuck in the running state.
    setRunning(false);
  }
};
// Returns the panel to its idle state: step 0, no outputs, empty log,
// not running, no back-translation intermediate.
const reset = () => {
  setPipelineStep(0);
  for (const clearOutput of [setAugmented, setSimilarity, setFiltered, setMetrics]) {
    clearOutput(null);
  }
  setLogs([]);
  setRunning(false);
  setIntermediate(null);
};
// Stage descriptors for the progress strip, as [label, sublabel, icon, color];
// `n` is the 1-based stage number matching the values given to setPipelineStep.
const steps = [
  ["Loader", "Vector distribution analysis", "⬛", "#60a5fa"],
  ["Augmentor", "Multimodel synthesis", "⟳", "#f472b6"],
  ["Filter", "S-BERT Gate", "⊘", "#fbbf24"],
  ["Trainer", "PyTorch Integration", "◉", "#4ade80"],
].map(([label, sublabel, icon, color], position) => ({
  n: position + 1,
  label,
  sublabel,
  icon,
  color,
}));
// Render the header, the tab bar and the panel for the currently active tab
// ("pipeline", "arch" or "tech"), followed by the footer.
// NOTE(review): most element tags and attributes appear to be missing from this
// copy of the markup (e.g. several `.map(... => ( ))` callbacks have empty
// bodies), so this block is not valid JSX as shown — restore it from the
// original component before editing.
return (
<>
{/* Academic Header */}
◈ Cybersecurity · UKEN · Jacek Dusza · 2026
Multimodel Data Augmentation Engine
Sentiment Analysis (PL)
HerBERT · NLPAug · Groq API · Sentence-BERT · deep-translator
{[
{ label: "Classes", val: "6" }, { label: "Long-tail", val: "4" },
{ label: "Arch.", val: "Hybrid" }, { label: "Methods", val: "3" }
].map(({ label, val }) => (
))}
{/* Navigation Tabs */}
{[
{ id: "pipeline", label: "▶ Control Panel" },
{ id: "arch", label: "◈ Architecture" },
{ id: "tech", label: "⊞ Tech Stack" },
].map(t => (
))}
{/* TAB: CONTROL PANEL */}
{activeTab === "pipeline" && (
{steps.map((s, i) => (
s.n} />
= s.n ? s.color : "#334155", marginTop: 6, textAlign: "center" }}>{s.label}
{s.sublabel}
{i < steps.length - 1 && (
i + 1 ? "done" : pipelineStep === i + 1 ? "active" : ""}`} />
)}
))}
1. Input Vector Initialization
{SAMPLE_SENTENCES.map(s => (
))}
2. Augmentation Algorithm Setup
{Object.entries(AUG_METHODS).map(([key, m]) => (
))}
{/* Global S-BERT Filter */}
Semantic Filter Threshold
{Math.round(filterThreshold * 100)}%
{setFilterThreshold(parseFloat(e.target.value)); reset();}}
style={{ width: "100%", accentColor: "#fbbf24", cursor: "pointer", marginTop: 4 }}
/>
Minimum required Cosine Similarity (S-BERT) to prevent semantic drift and preserve original sentiment.
{/* Result Panels */}
Transmutation Output
Base Corpus
{selectedSentence.text}
{intermediate && selectedMethod === "BT" && (
Translation Vector (From: {intermediate.lang})
{intermediate.text}
)}
{augmented ? (
Resulting Paraphrase ({selectedMethod})
{augmented}
) : (
{running && pipelineStep >= 2 ? Calculating input matrix... : "Awaiting start signal..."}
)}
{similarity !== null && (
Quality Inspection (Sentence-BERT)
{filtered ? "✓ ACCEPTED (No Drift)" : "✗ REJECTED (Semantic Drift)"}
Distance: {similarity.toFixed(3)}
)}
{metrics && (
Model Impact (HerBERT Evaluation)
Δ Model Optimization: +{(metrics.augF1 - metrics.baseF1).toFixed(2)} pp.
)}
⟩ SYSTEM LOG (FastAPI)
{logs.length === 0 ? (
System ready.
) : logs.map((l, i) => (
{l.ts}
{l.msg}
))}
{running &&
█}
)}
{/* TAB: ARCHITECTURE */}
{activeTab === "arch" && (
Hybrid Pipeline Business Logic
Initialization → Augmentation → Semantic Verification → Fine-Tuning
{[
{
n: "01", label: "Distribution Analyzer", color: "#60a5fa",
desc: "Scans the input dataset and flags minority (long-tail) classes requiring data augmentation to prevent classifier generalization errors.",
details: ["pandas / HF Datasets", "Frequency mapping", "Input anomaly isolation"],
code: "dataset = load_dataset('polemo2-official')\nminority_classes = dataset.filter(lambda x: class_count[x['label']] < THRESHOLD)"
},
{
n: "02", label: "Augmentation Engine", color: "#f472b6",
desc: "Multi-path module generating paraphrases depending on the specificity of the analyzed sentence (LLM for complex syntax, EDA for quick noise).",
details: ["NLPAug: lexical operations", "deep-translator: cross-structures", "Groq/Llama 3: contextual inference"],
code: "def augment_pipeline(payload):\n if payload.method == 'EDA': return apply_nlpaug(payload.text)\n if payload.method == 'LLM': return groq_completion(payload.text)"
},
{
n: "03", label: "Semantic Gate (S-BERT)", color: "#fbbf24",
desc: "Defensive module preventing training data poisoning. Rejects paraphrases that have lost their original sentiment or core meaning.",
details: ["paraphrase-multilingual", "Cosine Similarity", "Semantic Drift Prevention"],
code: "embeddings = sbert_model.encode([original, augmented])\nsimilarity = cosine_similarity(embeddings[0], embeddings[1])\nif similarity >= CONFIG.threshold: return ACCEPT"
},
{
n: "04", label: "PyTorch Integration", color: "#4ade80",
desc: "Automated fine-tuning of the base HerBERT classifier on the newly generated, enriched data corpus.",
details: ["allegro/herbert-base-cased", "Tensor management", "Loss Function optimization"],
code: "model = AutoModelForSequenceClassification.from_pretrained('allegro/herbert')\ntrainer = Trainer(model=model, train_dataset=augmented_dataset)\ntrainer.train()"
},
].map((s, i) => (
{s.label}
{s.desc}
{s.details.map(d => (
{d}
))}
))}
)}
{/* TAB: TECH STACK */}
{activeTab === "tech" && (
{[
{
cat: "System Core", color: "#60a5fa",
items: [
{ name: "Python 3.10+", desc: "Logical foundation of the NLP environment" },
{ name: "PyTorch", desc: "Tensor computation management and backpropagation" },
{ name: "HuggingFace Transformers", desc: "Access bridge to leading language architectures" },
]
},
{
cat: "Generative Modules", color: "#f472b6",
items: [
{ name: "NLPAug", desc: "EDA rules implementation (replacement, deletion, noise)" },
{ name: "Groq Cloud (Llama 3)", desc: "Inference based on LPU architecture (Ultra-low latency)" },
{ name: "deep-translator", desc: "Network traffic management for Back-Translation" },
]
},
{
cat: "Classification Architecture", color: "#4ade80",
items: [
{ name: "HerBERT (Allegro)", desc: "Polish reference model with optimized tokenizer" },
{ name: "Sentence-Transformers", desc: "Sentence to 768-dimensional dense vector conversion" },
{ name: "AutoModelForSequenceClassification", desc: "Adapter for sentiment analysis tasks" },
]
},
{
cat: "Compute Infrastructure", color: "#c084fc",
items: [
{ name: "Apple Silicon (MPS)", desc: "PyTorch hardware acceleration on M1 Pro architecture" },
{ name: "FastAPI", desc: "High-performance asynchronous REST server coordinating the pipeline" },
{ name: "React (Vite)", desc: "Frontend module for experiment monitoring and visualization" },
]
},
{
cat: "Metrics Monitoring", color: "#fb923c",
items: [
{ name: "Macro-F1 Score", desc: "Primary metric accounting for minority class difficulties" },
{ name: "Cosine Similarity (SSS)", desc: "Assessing the rigor of semantic vector alignment" },
{ name: "scikit-learn", desc: "Advanced classification reporting and error validation" },
]
},
].map(group => (
{group.items.map(item => (
))}
))}
)}
{/* Footer */}
MULTIMODEL DATA AUGMENTATION PIPELINE · JACEK DUSZA · MASTER'S THESIS 2026
>
);
}