{label} {typeof value === "number" ? value.toFixed(1) : value}{unit}

); }
/**
 * Badge for a sentiment class label.
 *
 * `colors` maps each known label ("pozytywna", "negatywna", "neutralna") to a
 * hex color.
 * NOTE(review): the markup that presumably applies the color is not visible
 * in this view — confirm how labels missing from `colors` are styled.
 *
 * @param {{ label: string }} props - `label` is the class name to display.
 */
function ClassBadge({ label }) { const colors = { pozytywna: "#4ade80", negatywna: "#f87171", neutralna: "#fbbf24" }; return ( {label} ); }
/**
 * Indicator for a single pipeline step: the visible body renders "✓" when
 * `done` is truthy and the `step` value otherwise.
 * NOTE(review): how `active` affects the rendering is not visible in this
 * view — confirm against the full markup.
 *
 * @param {{ step: *, active: *, done: * }} props
 */
function StepBadge({ step, active, done }) { return (

{done ? "✓" : step}

); } // ── Main Application ────────────────────────────────────────────────────────── export default function App() { const [activeTab, setActiveTab] = useState("pipeline"); const [pipelineStep, setPipelineStep] = useState(0); const [selectedSentence, setSelectedSentence] = useState(SAMPLE_SENTENCES[1]); const [selectedMethod, setSelectedMethod] = useState("LLM"); const [augmented, setAugmented] = useState(null); const [similarity, setSimilarity] = useState(null); const [filtered, setFiltered] = useState(null); const [logs, setLogs] = useState([]); const [metrics, setMetrics] = useState(null); const [running, setRunning] = useState(false); const [intermediate, setIntermediate] = useState(null); // Hyperparameters const [selectedPivot, setSelectedPivot] = useState("en"); const [edaIntensity, setEdaIntensity] = useState(0.15); const [filterThreshold, setFilterThreshold] = useState(0.80); const logRef = useRef(null); useEffect(() => { if (logRef.current) logRef.current.scrollTop = logRef.current.scrollHeight; }, [logs]); const addLog = (msg, type = "info") => { const colors = { info: "#94a3b8", success: "#4ade80", warn: "#fbbf24", error: "#f87171", accent: "#f472b6" }; setLogs((l) => [...l, { msg, color: colors[type], ts: new Date().toISOString().slice(11, 19) }]); }; const sleep = (ms) => new Promise((r) => setTimeout(r, ms)); // Frontend simulation with API integration const runPipeline = async () => { if (running) return; setRunning(true); setAugmented(null); setSimilarity(null); setFiltered(null); setMetrics(null); setIntermediate(null); setLogs([]); // 1: Data Loading setPipelineStep(1); addLog("► Initializing data pipeline...", "accent"); await sleep(600); addLog(` Corpus scanned. 
Detected ${SAMPLE_SENTENCES.length} defined classes.`, "info"); const minority = SAMPLE_SENTENCES.filter(s => s.count < 15); addLog(` Imbalance flag: ${minority.length} classes identified as long-tail.`, "warn"); addLog(` Isolating sample from class: [${selectedSentence.label.toUpperCase()}]`, "success"); await sleep(500); // 2: Paraphrase Generation (API Call) setPipelineStep(2); addLog(`► Executing module: ${AUG_METHODS[selectedMethod].label}`, "accent"); await sleep(400); addLog(` Inference engine: ${AUG_METHODS[selectedMethod].lib}`, "info"); let aug = ""; try { const resAug = await fetch(`${API_BASE}/augment`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ text: selectedSentence.text, method: selectedMethod, pivot_lang: selectedPivot, eda_p: edaIntensity }) }); if (!resAug.ok) throw new Error(`Status ${resAug.status}`); const dataAug = await resAug.json(); aug = dataAug.augmented; if (selectedMethod === "BT" && dataAug.intermediate) { setIntermediate({ lang: dataAug.pivot_lang.toUpperCase(), text: dataAug.intermediate }); addLog(` Pivot vector [${dataAug.pivot_lang.toUpperCase()}]: Generated successfully.`, "info"); } } catch (error) { addLog(` API CHANNEL FAILURE: No connection to base FastAPI server.`, "error"); setRunning(false); return; } setAugmented(aug); addLog(` Sentence synthesis completed.`, "success"); await sleep(400); // 3: S-BERT Filtration (API Call) setPipelineStep(3); addLog("► Calculating vector distance (Sentence-BERT)...", "accent"); let sim = 0; let pass = false; let THRESHOLD = filterThreshold; try { const resFilter = await fetch(`${API_BASE}/filter`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ original: selectedSentence.text, augmented: aug, threshold: filterThreshold }) }); const filterData = await resFilter.json(); sim = filterData.similarity; pass = filterData.passed; } catch (error) { addLog(` FILTER FAILURE: No response from microservice.`, 
"error"); setRunning(false); return; } setSimilarity(sim); setFiltered(pass); if (pass) { addLog(` Semantic alignment: ${(sim*100).toFixed(1)}% (Required: ${THRESHOLD*100}%) → ACCEPTED ✓`, "success"); } else { addLog(` Semantic alignment: ${(sim*100).toFixed(1)}% (Required: ${THRESHOLD*100}%) → REJECTED ✗`, "error"); addLog(" Semantic drift detected. Sample flushed from buffer.", "warn"); } await sleep(500); // 4: Training Module setPipelineStep(4); addLog("► Initializing Fine-Tuning process for base model...", "accent"); await sleep(400); addLog(" Architecture: allegro/herbert-base-cased", "info"); addLog(" Optimizer: AdamW, Learning Rate (LR): 2e-5", "info"); await sleep(900); addLog(" Epoch 1/3 — Loss: 0.487", "info"); await sleep(500); addLog(" Epoch 2/3 — Loss: 0.312", "info"); await sleep(500); addLog(" Epoch 3/3 — Loss: 0.241", "info"); await sleep(400); const baseF1 = 61.2, augF1 = pass ? 61.2 + 4 + Math.random() * 5 : 61.2 + 1.5 + Math.random() * 2; setMetrics({ baseF1, augF1, baseAcc: 74.1, augAcc: pass ? 74.1 + 3.5 + Math.random() * 3 : 74.1 + 1 + Math.random() * 2, sss: sim * 100, samplesAdded: pass ? 1 : 0, }); addLog(` Baseline Evaluation (Macro-F1): ${baseF1.toFixed(1)}%`, "info"); addLog(` Augmented Evaluation (Macro-F1): ${augF1.toFixed(1)}% (+${(augF1 - baseF1).toFixed(1)}pp) ✓`, "success"); setPipelineStep(5); addLog("■ Stream processing completed.", "accent"); setRunning(false); }; const reset = () => { setPipelineStep(0); setAugmented(null); setSimilarity(null); setFiltered(null); setMetrics(null); setLogs([]); setRunning(false); setIntermediate(null); }; const steps = [ { n: 1, label: "Loader", sublabel: "Vector distribution analysis", icon: "⬛", color: "#60a5fa" }, { n: 2, label: "Augmentor", sublabel: "Multimodel synthesis", icon: "⟳", color: "#f472b6" }, { n: 3, label: "Filter", sublabel: "S-BERT Gate", icon: "⊘", color: "#fbbf24" }, { n: 4, label: "Trainer", sublabel: "PyTorch Integration", icon: "◉", color: "#4ade80" }, ]; return ( <>

{/* Academic Header */}

◈ Cybersecurity · UKEN · Jacek Dusza · 2026

Multimodel Data Augmentation Engine Sentiment Analysis (PL)

HerBERT · NLPAug · Groq API · Sentence-BERT · deep-translator

{[ { label: "Classes", val: "6" }, { label: "Long-tail", val: "4" }, { label: "Arch.", val: "Hybrid" }, { label: "Methods", val: "3" } ].map(({ label, val }) => (

{val}

{label}

))}

{/* Navigation Tabs */}

{[ { id: "pipeline", label: "▶ Control Panel" }, { id: "arch", label: "◈ Architecture" }, { id: "tech", label: "⊞ Tech Stack" }, ].map(t => ( ))}

{/* TAB: CONTROL PANEL */} {activeTab === "pipeline" && (

{steps.map((s, i) => (

s.n} />

= s.n ? s.color : "#334155", marginTop: 6, textAlign: "center" }}>{s.label}

{s.sublabel}

{i < steps.length - 1 && (

i + 1 ? "done" : pipelineStep === i + 1 ? "active" : ""}`} /> )} ))}

1. Input Vector Initialization

{SAMPLE_SENTENCES.map(s => ( ))}

2. Augmentation Algorithm Setup

{Object.entries(AUG_METHODS).map(([key, m]) => ( ))}

{/* Global S-BERT Filter */}

Semantic Filter Threshold {Math.round(filterThreshold * 100)}%

{setFilterThreshold(parseFloat(e.target.value)); reset();}} style={{ width: "100%", accentColor: "#fbbf24", cursor: "pointer", marginTop: 4 }} />

Minimum required Cosine Similarity (S-BERT) to prevent semantic drift and preserve original sentiment.

{/* Result Panels */}

Transmutation Output

Base Corpus

{selectedSentence.text}

{intermediate && selectedMethod === "BT" && (

Translation Vector (From: {intermediate.lang})

{intermediate.text}

)} {augmented ? (

Resulting Paraphrase ({selectedMethod})

{augmented}

) : (

{running && pipelineStep >= 2 ? Calculating input matrix... : "Awaiting start signal..."}

)}

{similarity !== null && (

Quality Inspection (Sentence-BERT)

{filtered ? "✓ ACCEPTED (No Drift)" : "✗ REJECTED (Semantic Drift)"}

Distance: {similarity.toFixed(3)}

)} {metrics && (

Model Impact (HerBERT Evaluation)

Δ Model Optimization: +{(metrics.augF1 - metrics.baseF1).toFixed(2)} pp.

)}

⟩ SYSTEM LOG (FastAPI)

{logs.length === 0 ? ( System ready. ) : logs.map((l, i) => (

{l.ts} {l.msg}

))} {running && █}

)} {/* TAB: ARCHITECTURE */} {activeTab === "arch" && (

Hybrid Pipeline Business Logic

Initialization → Augmentation → Semantic Verification → Fine-Tuning

{[ { n: "01", label: "Distribution Analyzer", color: "#60a5fa", desc: "Scans the input dataset and flags minority (long-tail) classes requiring data augmentation to prevent classifier generalization errors.", details: ["pandas / HF Datasets", "Frequency mapping", "Input anomaly isolation"], code: "dataset = load_dataset('polemo2-official')\nminority_classes = dataset.filter(lambda x: class_count[x['label']] < THRESHOLD)" }, { n: "02", label: "Augmentation Engine", color: "#f472b6", desc: "Multi-path module generating paraphrases depending on the specificity of the analyzed sentence (LLM for complex syntax, EDA for quick noise).", details: ["NLPAug: lexical operations", "deep-translator: cross-structures", "Groq/Llama 3: contextual inference"], code: "def augment_pipeline(payload):\n if payload.method == 'EDA': return apply_nlpaug(payload.text)\n if payload.method == 'LLM': return groq_completion(payload.text)" }, { n: "03", label: "Semantic Gate (S-BERT)", color: "#fbbf24", desc: "Defensive module preventing training data poisoning. Rejects paraphrases that have lost their original sentiment or core meaning.", details: ["paraphrase-multilingual", "Cosine Similarity", "Semantic Drift Prevention"], code: "embeddings = sbert_model.encode([original, augmented])\nsimilarity = cosine_similarity(embeddings[0], embeddings[1])\nif similarity >= CONFIG.threshold: return ACCEPT" }, { n: "04", label: "PyTorch Integration", color: "#4ade80", desc: "Automated fine-tuning of the base HerBERT classifier on the newly generated, enriched data corpus.", details: ["allegro/herbert-base-cased", "Tensor management", "Loss Function optimization"], code: "model = AutoModelForSequenceClassification.from_pretrained('allegro/herbert')\ntrainer = Trainer(model=model, train_dataset=augmented_dataset)\ntrainer.train()" }, ].map((s, i) => (

{s.n}

{i < 3 &&

}

{s.label}

{s.desc}

{s.details.map(d => ( {d} ))}

{s.code}

))}

)} {/* TAB: TECH STACK */} {activeTab === "tech" && (

{[ { cat: "System Core", color: "#60a5fa", items: [ { name: "Python 3.10+", desc: "Logical foundation of the NLP environment" }, { name: "PyTorch", desc: "Tensor computation management and backpropagation" }, { name: "HuggingFace Transformers", desc: "Access bridge to leading language architectures" }, ] }, { cat: "Generative Modules", color: "#f472b6", items: [ { name: "NLPAug", desc: "EDA rules implementation (replacement, deletion, noise)" }, { name: "Groq Cloud (Llama 3)", desc: "Inference based on LPU architecture (Ultra-low latency)" }, { name: "deep-translator", desc: "Network traffic management for Back-Translation" }, ] }, { cat: "Classification Architecture", color: "#4ade80", items: [ { name: "HerBERT (Allegro)", desc: "Polish reference model with optimized tokenizer" }, { name: "Sentence-Transformers", desc: "Sentence to 768-dimensional dense vector conversion" }, { name: "AutoModelForSequenceClassification", desc: "Adapter for sentiment analysis tasks" }, ] }, { cat: "Compute Infrastructure", color: "#c084fc", items: [ { name: "Apple Silicon (MPS)", desc: "PyTorch hardware acceleration on M1 Pro architecture" }, { name: "FastAPI", desc: "High-performance asynchronous REST server coordinating the pipeline" }, { name: "React (Vite)", desc: "Frontend module for experiment monitoring and visualization" }, ] }, { cat: "Metrics Monitoring", color: "#fb923c", items: [ { name: "Macro-F1 Score", desc: "Primary metric accounting for minority class difficulties" }, { name: "Cosine Similarity (SSS)", desc: "Assessing the rigor of semantic vector alignment" }, { name: "scikit-learn", desc: "Advanced classification reporting and error validation" }, ] }, ].map(group => (

{group.cat}

{group.items.map(item => (

{item.name}

{item.desc}

))}

)} {/* Footer */}

MULTIMODEL DATA AUGMENTATION PIPELINE · JACEK DUSZA · MASTER'S THESIS 2026

); }