import { useState, useEffect, useRef, Fragment } from "react"; const API_BASE = "https://jaaccaa-data-augmentation.hf.space"; // Fonts used in the application UI const FONTS = `@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@300;400;500;700&family=Syne:wght@400;600;700;800&display=swap');`; // ── Demo Samples ────────────────────────────────────────────────────────────── // A set of sentences from the PolEmo2.0 corpus reflecting a long-tail distribution const SAMPLE_SENTENCES = [ { id: 1, text: "Produkt jest bardzo dobry i polecam go wszystkim.", label: "pozytywna", count: 142 }, { id: 2, text: "Obsługa klienta była fatalna i nieprofesjonalna.", label: "negatywna", count: 8 }, { id: 3, text: "Dostawa przyszła na czas, jestem zadowolony.", label: "pozytywna", count: 134 }, { id: 4, text: "Jakość wykonania pozostawia wiele do życzenia.", label: "negatywna", count: 11 }, { id: 5, text: "Nie mam zdania na temat tego produktu.", label: "neutralna", count: 6 }, { id: 6, text: "Cena jest adekwatna do jakości oferowanego towaru.", label: "neutralna", count: 9 }, ]; // ── Augmentation Methods Definitions ────────────────────────────────────────── const AUG_METHODS = { EDA: { label: "EDA (Lexical Rules)", color: "#4ade80", lib: "NLPAug + HerBERT", description: "Token-level perturbations: synonym replacement, random insertion, and deletion. Low computational overhead, high throughput.", }, BT: { label: "Back-Translation", color: "#60a5fa", lib: "deep-translator (Google)", description: "Round-trip translation (PL → [EN, DE, CS] → PL). Leverages multilingual embeddings to break syntactic patterns and bypass pivot-language bias.", }, LLM: { label: "Generative LLM", color: "#f472b6", lib: "Groq Cloud (Llama 3)", description: "Advanced paraphrasing based on prompt instructions for Large Language Models. Highest semantic quality powered by ultra-fast LPU inference.", }, }; // ── Helper Components ───────────────────────────────────────────────────────── function MetricBar({ label, value, color, unit = "%" }) { return (
{label} {typeof value === "number" ? value.toFixed(1) : value}{unit}
); } function ClassBadge({ label }) { const colors = { pozytywna: "#4ade80", negatywna: "#f87171", neutralna: "#fbbf24" }; return ( {label} ); } function StepBadge({ step, active, done }) { return (
{done ? "✓" : step}
); } // ── Main Application ────────────────────────────────────────────────────────── export default function App() { const [activeTab, setActiveTab] = useState("pipeline"); const [pipelineStep, setPipelineStep] = useState(0); const [selectedSentence, setSelectedSentence] = useState(SAMPLE_SENTENCES[1]); const [selectedMethod, setSelectedMethod] = useState("LLM"); const [augmented, setAugmented] = useState(null); const [similarity, setSimilarity] = useState(null); const [filtered, setFiltered] = useState(null); const [logs, setLogs] = useState([]); const [metrics, setMetrics] = useState(null); const [running, setRunning] = useState(false); const [intermediate, setIntermediate] = useState(null); // Hyperparameters const [selectedPivot, setSelectedPivot] = useState("en"); const [edaIntensity, setEdaIntensity] = useState(0.15); const [filterThreshold, setFilterThreshold] = useState(0.80); const logRef = useRef(null); useEffect(() => { if (logRef.current) logRef.current.scrollTop = logRef.current.scrollHeight; }, [logs]); const addLog = (msg, type = "info") => { const colors = { info: "#94a3b8", success: "#4ade80", warn: "#fbbf24", error: "#f87171", accent: "#f472b6" }; setLogs((l) => [...l, { msg, color: colors[type], ts: new Date().toISOString().slice(11, 19) }]); }; const sleep = (ms) => new Promise((r) => setTimeout(r, ms)); // Frontend simulation with API integration const runPipeline = async () => { if (running) return; setRunning(true); setAugmented(null); setSimilarity(null); setFiltered(null); setMetrics(null); setIntermediate(null); setLogs([]); // 1: Data Loading setPipelineStep(1); addLog("► Initializing data pipeline...", "accent"); await sleep(600); addLog(` Corpus scanned. Detected ${SAMPLE_SENTENCES.length} defined classes.`, "info"); const minority = SAMPLE_SENTENCES.filter(s => s.count < 15); addLog(` Imbalance flag: ${minority.length} classes identified as long-tail.`, "warn"); addLog(` Isolating sample from class: [${selectedSentence.label.toUpperCase()}]`, "success"); await sleep(500); // 2: Paraphrase Generation (API Call) setPipelineStep(2); addLog(`► Executing module: ${AUG_METHODS[selectedMethod].label}`, "accent"); await sleep(400); addLog(` Inference engine: ${AUG_METHODS[selectedMethod].lib}`, "info"); let aug = ""; try { const resAug = await fetch(`${API_BASE}/augment`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ text: selectedSentence.text, method: selectedMethod, pivot_lang: selectedPivot, eda_p: edaIntensity }) }); if (!resAug.ok) throw new Error(`Status ${resAug.status}`); const dataAug = await resAug.json(); aug = dataAug.augmented; if (selectedMethod === "BT" && dataAug.intermediate) { setIntermediate({ lang: dataAug.pivot_lang.toUpperCase(), text: dataAug.intermediate }); addLog(` Pivot vector [${dataAug.pivot_lang.toUpperCase()}]: Generated successfully.`, "info"); } } catch (error) { addLog(` API CHANNEL FAILURE: No connection to base FastAPI server.`, "error"); setRunning(false); return; } setAugmented(aug); addLog(` Sentence synthesis completed.`, "success"); await sleep(400); // 3: S-BERT Filtration (API Call) setPipelineStep(3); addLog("► Calculating vector distance (Sentence-BERT)...", "accent"); let sim = 0; let pass = false; let THRESHOLD = filterThreshold; try { const resFilter = await fetch(`${API_BASE}/filter`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ original: selectedSentence.text, augmented: aug, threshold: filterThreshold }) }); const filterData = await resFilter.json(); sim = filterData.similarity; pass = filterData.passed; } catch (error) { addLog(` FILTER FAILURE: No response from microservice.`, "error"); setRunning(false); return; } setSimilarity(sim); setFiltered(pass); if (pass) { addLog(` Semantic alignment: ${(sim*100).toFixed(1)}% (Required: ${THRESHOLD*100}%) → ACCEPTED ✓`, "success"); } else { addLog(` Semantic alignment: ${(sim*100).toFixed(1)}% (Required: ${THRESHOLD*100}%) → REJECTED ✗`, "error"); addLog(" Semantic drift detected. Sample flushed from buffer.", "warn"); } await sleep(500); // 4: Training Module setPipelineStep(4); addLog("► Initializing Fine-Tuning process for base model...", "accent"); await sleep(400); addLog(" Architecture: allegro/herbert-base-cased", "info"); addLog(" Optimizer: AdamW, Learning Rate (LR): 2e-5", "info"); await sleep(900); addLog(" Epoch 1/3 — Loss: 0.487", "info"); await sleep(500); addLog(" Epoch 2/3 — Loss: 0.312", "info"); await sleep(500); addLog(" Epoch 3/3 — Loss: 0.241", "info"); await sleep(400); const baseF1 = 61.2, augF1 = pass ? 61.2 + 4 + Math.random() * 5 : 61.2 + 1.5 + Math.random() * 2; setMetrics({ baseF1, augF1, baseAcc: 74.1, augAcc: pass ? 74.1 + 3.5 + Math.random() * 3 : 74.1 + 1 + Math.random() * 2, sss: sim * 100, samplesAdded: pass ? 1 : 0, }); addLog(` Baseline Evaluation (Macro-F1): ${baseF1.toFixed(1)}%`, "info"); addLog(` Augmented Evaluation (Macro-F1): ${augF1.toFixed(1)}% (+${(augF1 - baseF1).toFixed(1)}pp) ✓`, "success"); setPipelineStep(5); addLog("■ Stream processing completed.", "accent"); setRunning(false); }; const reset = () => { setPipelineStep(0); setAugmented(null); setSimilarity(null); setFiltered(null); setMetrics(null); setLogs([]); setRunning(false); setIntermediate(null); }; const steps = [ { n: 1, label: "Loader", sublabel: "Vector distribution analysis", icon: "⬛", color: "#60a5fa" }, { n: 2, label: "Augmentor", sublabel: "Multimodel synthesis", icon: "⟳", color: "#f472b6" }, { n: 3, label: "Filter", sublabel: "S-BERT Gate", icon: "⊘", color: "#fbbf24" }, { n: 4, label: "Trainer", sublabel: "PyTorch Integration", icon: "◉", color: "#4ade80" }, ]; return ( <>
{/* Academic Header */}
◈ Cybersecurity · UKEN · Jacek Dusza · 2026

Multimodel Data Augmentation Engine Sentiment Analysis (PL)

HerBERT · NLPAug · Groq API · Sentence-BERT · deep-translator

{[ { label: "Classes", val: "6" }, { label: "Long-tail", val: "4" }, { label: "Arch.", val: "Hybrid" }, { label: "Methods", val: "3" } ].map(({ label, val }) => (
{val}
{label}
))}
{/* Navigation Tabs */}
{[ { id: "pipeline", label: "▶ Control Panel" }, { id: "arch", label: "◈ Architecture" }, { id: "tech", label: "⊞ Tech Stack" }, ].map(t => ( ))}
{/* TAB: CONTROL PANEL */} {activeTab === "pipeline" && (
{steps.map((s, i) => (
s.n} />
= s.n ? s.color : "#334155", marginTop: 6, textAlign: "center" }}>{s.label}
{s.sublabel}
{i < steps.length - 1 && (
i + 1 ? "done" : pipelineStep === i + 1 ? "active" : ""}`} /> )} ))}
1. Input Vector Initialization
{SAMPLE_SENTENCES.map(s => ( ))}
2. Augmentation Algorithm Setup
{Object.entries(AUG_METHODS).map(([key, m]) => ( ))}
{/* Global S-BERT Filter */}
Semantic Filter Threshold {Math.round(filterThreshold * 100)}%
{setFilterThreshold(parseFloat(e.target.value)); reset();}} style={{ width: "100%", accentColor: "#fbbf24", cursor: "pointer", marginTop: 4 }} />
Minimum required Cosine Similarity (S-BERT) to prevent semantic drift and preserve original sentiment.
{/* Result Panels */}
Transmutation Output
Base Corpus
{selectedSentence.text}
{intermediate && selectedMethod === "BT" && (
Translation Vector (From: {intermediate.lang})
{intermediate.text}
)} {augmented ? (
Resulting Paraphrase ({selectedMethod})
{augmented}
) : (
{running && pipelineStep >= 2 ? Calculating input matrix... : "Awaiting start signal..."}
)}
{similarity !== null && (
Quality Inspection (Sentence-BERT)
{filtered ? "✓ ACCEPTED (No Drift)" : "✗ REJECTED (Semantic Drift)"}
Distance: {similarity.toFixed(3)}
)} {metrics && (
Model Impact (HerBERT Evaluation)
Δ Model Optimization: +{(metrics.augF1 - metrics.baseF1).toFixed(2)} pp.
)}
⟩ SYSTEM LOG (FastAPI)
{logs.length === 0 ? ( System ready. ) : logs.map((l, i) => (
{l.ts} {l.msg}
))} {running && }
)} {/* TAB: ARCHITECTURE */} {activeTab === "arch" && (
Hybrid Pipeline Business Logic
Initialization → Augmentation → Semantic Verification → Fine-Tuning
{[ { n: "01", label: "Distribution Analyzer", color: "#60a5fa", desc: "Scans the input dataset and flags minority (long-tail) classes requiring data augmentation to prevent classifier generalization errors.", details: ["pandas / HF Datasets", "Frequency mapping", "Input anomaly isolation"], code: "dataset = load_dataset('polemo2-official')\nminority_classes = dataset.filter(lambda x: class_count[x['label']] < THRESHOLD)" }, { n: "02", label: "Augmentation Engine", color: "#f472b6", desc: "Multi-path module generating paraphrases depending on the specificity of the analyzed sentence (LLM for complex syntax, EDA for quick noise).", details: ["NLPAug: lexical operations", "deep-translator: cross-structures", "Groq/Llama 3: contextual inference"], code: "def augment_pipeline(payload):\n if payload.method == 'EDA': return apply_nlpaug(payload.text)\n if payload.method == 'LLM': return groq_completion(payload.text)" }, { n: "03", label: "Semantic Gate (S-BERT)", color: "#fbbf24", desc: "Defensive module preventing training data poisoning. Rejects paraphrases that have lost their original sentiment or core meaning.", details: ["paraphrase-multilingual", "Cosine Similarity", "Semantic Drift Prevention"], code: "embeddings = sbert_model.encode([original, augmented])\nsimilarity = cosine_similarity(embeddings[0], embeddings[1])\nif similarity >= CONFIG.threshold: return ACCEPT" }, { n: "04", label: "PyTorch Integration", color: "#4ade80", desc: "Automated fine-tuning of the base HerBERT classifier on the newly generated, enriched data corpus.", details: ["allegro/herbert-base-cased", "Tensor management", "Loss Function optimization"], code: "model = AutoModelForSequenceClassification.from_pretrained('allegro/herbert')\ntrainer = Trainer(model=model, train_dataset=augmented_dataset)\ntrainer.train()" }, ].map((s, i) => (
{s.n}
{i < 3 &&
}
{s.label}
{s.desc}
{s.details.map(d => ( {d} ))}
{s.code}
))}
)} {/* TAB: TECH STACK */} {activeTab === "tech" && (
{[ { cat: "System Core", color: "#60a5fa", items: [ { name: "Python 3.10+", desc: "Logical foundation of the NLP environment" }, { name: "PyTorch", desc: "Tensor computation management and backpropagation" }, { name: "HuggingFace Transformers", desc: "Access bridge to leading language architectures" }, ] }, { cat: "Generative Modules", color: "#f472b6", items: [ { name: "NLPAug", desc: "EDA rules implementation (replacement, deletion, noise)" }, { name: "Groq Cloud (Llama 3)", desc: "Inference based on LPU architecture (Ultra-low latency)" }, { name: "deep-translator", desc: "Network traffic management for Back-Translation" }, ] }, { cat: "Classification Architecture", color: "#4ade80", items: [ { name: "HerBERT (Allegro)", desc: "Polish reference model with optimized tokenizer" }, { name: "Sentence-Transformers", desc: "Sentence to 768-dimensional dense vector conversion" }, { name: "AutoModelForSequenceClassification", desc: "Adapter for sentiment analysis tasks" }, ] }, { cat: "Compute Infrastructure", color: "#c084fc", items: [ { name: "Apple Silicon (MPS)", desc: "PyTorch hardware acceleration on M1 Pro architecture" }, { name: "FastAPI", desc: "High-performance asynchronous REST server coordinating the pipeline" }, { name: "React (Vite)", desc: "Frontend module for experiment monitoring and visualization" }, ] }, { cat: "Metrics Monitoring", color: "#fb923c", items: [ { name: "Macro-F1 Score", desc: "Primary metric accounting for minority class difficulties" }, { name: "Cosine Similarity (SSS)", desc: "Assessing the rigor of semantic vector alignment" }, { name: "scikit-learn", desc: "Advanced classification reporting and error validation" }, ] }, ].map(group => (
{group.cat}
{group.items.map(item => (
{item.name}
{item.desc}
))}
))}
)} {/* Footer */}
MULTIMODEL DATA AUGMENTATION PIPELINE · JACEK DUSZA · MASTER'S THESIS 2026
); }