| | <!DOCTYPE html> |
| | <html lang="en"> |
| | <head> |
| | <meta charset="UTF-8"> |
| | <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| | <title>Sen-1 References - Vietnamese Text Classification</title> |
| | <style> |
| | :root { |
| | --bg: #0d1117; |
| | --surface: #161b22; |
| | --surface2: #1c2333; |
| | --border: #30363d; |
| | --text: #e6edf3; |
| | --text2: #8b949e; |
| | --accent: #58a6ff; |
| | --accent2: #3fb950; |
| | --accent3: #d2a8ff; |
| | --accent4: #f0883e; |
| | --red: #f85149; |
| | --tag-bg: #1f2937; |
| | } |
| | * { margin: 0; padding: 0; box-sizing: border-box; } |
| | body { |
| | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif; |
| | background: var(--bg); |
| | color: var(--text); |
| | line-height: 1.6; |
| | } |
| | a { color: var(--accent); text-decoration: none; } |
| | a:hover { text-decoration: underline; } |
| | |
| | |
| | .header { |
| | background: var(--surface); |
| | border-bottom: 1px solid var(--border); |
| | padding: 1.5rem 2rem; |
| | position: sticky; |
| | top: 0; |
| | z-index: 100; |
| | } |
| | .header-inner { |
| | max-width: 1200px; |
| | margin: 0 auto; |
| | display: flex; |
| | align-items: center; |
| | justify-content: space-between; |
| | gap: 1rem; |
| | } |
| | .logo { |
| | display: flex; |
| | align-items: center; |
| | gap: 0.75rem; |
| | } |
| | .logo-icon { |
| | width: 36px; |
| | height: 36px; |
| | background: linear-gradient(135deg, var(--accent), var(--accent3)); |
| | border-radius: 8px; |
| | display: flex; |
| | align-items: center; |
| | justify-content: center; |
| | font-size: 18px; |
| | font-weight: 700; |
| | color: #fff; |
| | } |
| | .logo h1 { |
| | font-size: 1.2rem; |
| | font-weight: 600; |
| | } |
| | .logo span { color: var(--text2); font-weight: 400; font-size: 0.9rem; } |
| | .stats { |
| | display: flex; |
| | gap: 1.5rem; |
| | font-size: 0.85rem; |
| | color: var(--text2); |
| | } |
| | .stat-num { color: var(--text); font-weight: 600; font-size: 1.1rem; } |
| | |
| | |
| | .tabs { |
| | max-width: 1200px; |
| | margin: 0 auto; |
| | padding: 0 2rem; |
| | display: flex; |
| | gap: 0; |
| | border-bottom: 1px solid var(--border); |
| | overflow-x: auto; |
| | } |
| | .tab { |
| | padding: 0.75rem 1.25rem; |
| | cursor: pointer; |
| | color: var(--text2); |
| | font-size: 0.9rem; |
| | border-bottom: 2px solid transparent; |
| | white-space: nowrap; |
| | transition: all 0.15s; |
| | } |
| | .tab:hover { color: var(--text); } |
| | .tab.active { |
| | color: var(--text); |
| | border-bottom-color: var(--accent4); |
| | font-weight: 500; |
| | } |
| | |
| | |
| | .content { |
| | max-width: 1200px; |
| | margin: 0 auto; |
| | padding: 2rem; |
| | } |
| | .panel { display: none; } |
| | .panel.active { display: block; } |
| | |
| | |
| | .papers-grid { |
| | display: grid; |
| | gap: 1rem; |
| | } |
| | .paper-card { |
| | background: var(--surface); |
| | border: 1px solid var(--border); |
| | border-radius: 8px; |
| | padding: 1.25rem 1.5rem; |
| | transition: border-color 0.15s; |
| | } |
| | .paper-card:hover { border-color: var(--accent); } |
| | .paper-title { |
| | font-size: 1.05rem; |
| | font-weight: 600; |
| | margin-bottom: 0.5rem; |
| | display: flex; |
| | align-items: flex-start; |
| | gap: 0.5rem; |
| | } |
| | .paper-meta { |
| | display: flex; |
| | flex-wrap: wrap; |
| | gap: 0.75rem; |
| | font-size: 0.85rem; |
| | color: var(--text2); |
| | margin-bottom: 0.5rem; |
| | } |
| | .paper-abstract { |
| | font-size: 0.88rem; |
| | color: var(--text2); |
| | line-height: 1.5; |
| | margin-top: 0.75rem; |
| | display: -webkit-box; |
| | -webkit-line-clamp: 3; |
| | -webkit-box-orient: vertical; |
| | overflow: hidden; |
| | } |
| | .paper-abstract.expanded { |
| | -webkit-line-clamp: unset; |
| | } |
| | .tag { |
| | display: inline-block; |
| | padding: 0.15rem 0.55rem; |
| | border-radius: 12px; |
| | font-size: 0.75rem; |
| | font-weight: 500; |
| | } |
| | .tag-venue { background: #1a3a2a; color: var(--accent2); } |
| | .tag-year { background: #2a1a3a; color: var(--accent3); } |
| | .tag-file { background: var(--tag-bg); color: var(--text2); } |
| | .tag-pdf { background: #3a1a1a; color: var(--red); } |
| | .tag-tex { background: #1a2a3a; color: var(--accent); } |
| | .tag-md { background: #2a2a1a; color: var(--accent4); } |
| | .expand-btn { |
| | background: none; |
| | border: none; |
| | color: var(--accent); |
| | cursor: pointer; |
| | font-size: 0.82rem; |
| | margin-top: 0.3rem; |
| | } |
| | |
| | |
| | .filters { |
| | display: flex; |
| | flex-wrap: wrap; |
| | gap: 0.5rem; |
| | margin-bottom: 1.5rem; |
| | } |
| | .filter-btn { |
| | padding: 0.4rem 0.9rem; |
| | border-radius: 20px; |
| | border: 1px solid var(--border); |
| | background: var(--surface); |
| | color: var(--text2); |
| | cursor: pointer; |
| | font-size: 0.82rem; |
| | transition: all 0.15s; |
| | } |
| | .filter-btn:hover { border-color: var(--accent); color: var(--text); } |
| | .filter-btn.active { background: var(--accent); color: #000; border-color: var(--accent); font-weight: 600; } |
| | |
| | |
| | table { |
| | width: 100%; |
| | border-collapse: collapse; |
| | margin: 1rem 0; |
| | font-size: 0.88rem; |
| | } |
| | th, td { |
| | padding: 0.65rem 0.85rem; |
| | text-align: left; |
| | border-bottom: 1px solid var(--border); |
| | } |
| | th { |
| | background: var(--surface2); |
| | color: var(--text2); |
| | font-weight: 600; |
| | font-size: 0.8rem; |
| | text-transform: uppercase; |
| | letter-spacing: 0.03em; |
| | position: sticky; |
| | top: 0; |
| | } |
| | tr:hover td { background: var(--surface); } |
| | .highlight { color: var(--accent2); font-weight: 600; } |
| | |
| | |
| | .section-title { |
| | font-size: 1.4rem; |
| | font-weight: 700; |
| | margin-bottom: 0.5rem; |
| | } |
| | .section-desc { |
| | color: var(--text2); |
| | margin-bottom: 1.5rem; |
| | font-size: 0.92rem; |
| | } |
| | h3 { |
| | font-size: 1.1rem; |
| | margin: 2rem 0 0.75rem; |
| | padding-bottom: 0.4rem; |
| | border-bottom: 1px solid var(--border); |
| | } |
| | |
| | |
| | .network { |
| | background: var(--surface); |
| | border: 1px solid var(--border); |
| | border-radius: 8px; |
| | padding: 1.5rem; |
| | font-family: 'SF Mono', 'Fira Code', 'Consolas', monospace; |
| | font-size: 0.82rem; |
| | line-height: 1.8; |
| | overflow-x: auto; |
| | white-space: pre; |
| | color: var(--text2); |
| | } |
| | .network b { color: var(--accent); font-weight: 600; } |
| | .network em { color: var(--accent2); font-style: normal; } |
| | .network u { color: var(--accent3); text-decoration: none; } |
| | |
| | |
| | blockquote { |
| | border-left: 3px solid var(--accent3); |
| | padding: 0.75rem 1.25rem; |
| | margin: 1rem 0; |
| | background: var(--surface); |
| | border-radius: 0 6px 6px 0; |
| | font-style: italic; |
| | color: var(--text2); |
| | } |
| | blockquote strong { color: var(--text); } |
| | |
| | |
| | .cards-row { |
| | display: grid; |
| | grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); |
| | gap: 1rem; |
| | margin: 1rem 0; |
| | } |
| | .info-card { |
| | background: var(--surface); |
| | border: 1px solid var(--border); |
| | border-radius: 8px; |
| | padding: 1.25rem; |
| | } |
| | .info-card h4 { |
| | font-size: 0.95rem; |
| | margin-bottom: 0.5rem; |
| | color: var(--accent); |
| | } |
| | .info-card p, .info-card li { |
| | font-size: 0.85rem; |
| | color: var(--text2); |
| | line-height: 1.6; |
| | } |
| | .info-card ul { padding-left: 1.2rem; } |
| | |
| | |
| | .principle { |
| | background: var(--surface); |
| | border: 1px solid var(--border); |
| | border-radius: 8px; |
| | padding: 1.25rem 1.5rem; |
| | margin-bottom: 1rem; |
| | } |
| | .principle h4 { |
| | color: var(--accent4); |
| | margin-bottom: 0.5rem; |
| | } |
| | .principle p, .principle li { |
| | font-size: 0.88rem; |
| | color: var(--text2); |
| | line-height: 1.6; |
| | } |
| | .principle ul { padding-left: 1.2rem; margin-top: 0.3rem; } |
| | |
| | |
| | .lb-hero { |
| | text-align: center; |
| | padding: 2.5rem 1rem 1.5rem; |
| | } |
| | .lb-hero h2 { |
| | font-size: 2rem; |
| | font-weight: 800; |
| | background: linear-gradient(135deg, var(--accent), var(--accent3)); |
| | -webkit-background-clip: text; |
| | -webkit-text-fill-color: transparent; |
| | background-clip: text; |
| | } |
| | .lb-hero p { |
| | color: var(--text2); |
| | font-size: 0.92rem; |
| | margin-top: 0.5rem; |
| | } |
| | .lb-updated { |
| | color: var(--text2); |
| | font-size: 0.75rem; |
| | margin-top: 0.25rem; |
| | opacity: 0.7; |
| | } |
| | .lb-spotlight { |
| | display: grid; |
| | grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); |
| | gap: 1rem; |
| | margin: 1.5rem 0; |
| | } |
| | .lb-spot-card { |
| | background: var(--surface); |
| | border: 1px solid var(--border); |
| | border-radius: 10px; |
| | padding: 1.25rem; |
| | transition: border-color 0.15s; |
| | } |
| | .lb-spot-card:hover { border-color: var(--accent); } |
| | .lb-spot-card h4 { |
| | font-size: 0.78rem; |
| | text-transform: uppercase; |
| | letter-spacing: 0.05em; |
| | color: var(--text2); |
| | margin-bottom: 0.15rem; |
| | } |
| | .lb-spot-card .lb-bench-name { |
| | font-size: 0.72rem; |
| | color: var(--accent); |
| | margin-bottom: 0.75rem; |
| | } |
| | .lb-rank-list { list-style: none; } |
| | .lb-rank-list li { |
| | display: flex; |
| | align-items: center; |
| | justify-content: space-between; |
| | padding: 0.35rem 0; |
| | font-size: 0.85rem; |
| | border-bottom: 1px solid rgba(48,54,61,0.5); |
| | } |
| | .lb-rank-list li:last-child { border-bottom: none; } |
| | .lb-rank-num { |
| | display: inline-flex; |
| | align-items: center; |
| | justify-content: center; |
| | width: 22px; |
| | height: 22px; |
| | border-radius: 50%; |
| | font-size: 0.7rem; |
| | font-weight: 700; |
| | margin-right: 0.6rem; |
| | flex-shrink: 0; |
| | } |
| | .lb-rank-1 { background: #fbbf24; color: #000; } |
| | .lb-rank-2 { background: #94a3b8; color: #000; } |
| | .lb-rank-3 { background: #b45309; color: #fff; } |
| | .lb-rank-other { background: var(--tag-bg); color: var(--text2); } |
| | .lb-model-name { flex: 1; } |
| | .lb-model-name.lb-sen1 { color: var(--accent4); font-weight: 600; } |
| | .lb-score { |
| | font-weight: 600; |
| | font-variant-numeric: tabular-nums; |
| | color: var(--text); |
| | } |
| | .lb-score-top { color: var(--accent2); } |
| | .lb-section-title { |
| | font-size: 1.1rem; |
| | font-weight: 700; |
| | margin: 2.5rem 0 0.5rem; |
| | padding-bottom: 0.4rem; |
| | border-bottom: 1px solid var(--border); |
| | } |
| | .lb-section-desc { |
| | color: var(--text2); |
| | font-size: 0.85rem; |
| | margin-bottom: 1rem; |
| | } |
| | |
| | |
| | .lb-table-wrap { |
| | overflow-x: auto; |
| | border: 1px solid var(--border); |
| | border-radius: 8px; |
| | margin: 1rem 0; |
| | } |
| | .lb-table { |
| | width: 100%; |
| | border-collapse: collapse; |
| | font-size: 0.84rem; |
| | min-width: 900px; |
| | } |
| | .lb-table th { |
| | background: var(--surface2); |
| | color: var(--text2); |
| | font-weight: 600; |
| | font-size: 0.72rem; |
| | text-transform: uppercase; |
| | letter-spacing: 0.04em; |
| | padding: 0.7rem 0.75rem; |
| | border-bottom: 2px solid var(--border); |
| | cursor: pointer; |
| | white-space: nowrap; |
| | user-select: none; |
| | position: sticky; |
| | top: 0; |
| | } |
| | .lb-table th:hover { color: var(--text); } |
| | .lb-table th .sort-arrow { margin-left: 0.3rem; font-size: 0.65rem; color: var(--accent); } |
| | .lb-table td { |
| | padding: 0.6rem 0.75rem; |
| | border-bottom: 1px solid rgba(48,54,61,0.5); |
| | white-space: nowrap; |
| | } |
| | .lb-table tr:hover td { background: rgba(88,166,255,0.04); } |
| | .lb-table .lb-row-sen1 { background: rgba(240,136,62,0.06); } |
| | .lb-table .lb-row-sen1:hover td { background: rgba(240,136,62,0.1); } |
| | .lb-table .model-type { |
| | display: inline-block; |
| | width: 8px; |
| | height: 24px; |
| | border-radius: 2px; |
| | margin-right: 0.5rem; |
| | vertical-align: middle; |
| | } |
| | .type-traditional { background: var(--accent4); } |
| | .type-transformer { background: var(--accent); } |
| | .type-multilingual { background: var(--accent3); } |
| | .lb-table .cell-best { color: var(--accent2); font-weight: 700; } |
| | .lb-table .cell-na { color: var(--text2); opacity: 0.5; } |
| | .lb-table .model-cell { |
| | display: flex; |
| | align-items: center; |
| | gap: 0.5rem; |
| | } |
| | |
| | |
| | @media (max-width: 768px) { |
| | .header-inner { flex-direction: column; align-items: flex-start; } |
| | .content { padding: 1rem; } |
| | .cards-row { grid-template-columns: 1fr; } |
| | .lb-spotlight { grid-template-columns: 1fr; } |
| | .lb-hero h2 { font-size: 1.5rem; } |
| | } |
| | </style> |
| | </head> |
| | <body> |
| |
|
| | <div class="header"> |
| | <div class="header-inner"> |
| | <div class="logo"> |
| | <div class="logo-icon">S1</div> |
| | <div> |
| | <h1>Sen-1 References <span>Vietnamese Text Classification</span></h1> |
| | </div> |
| | </div> |
| | <div class="stats"> |
| | <div><span class="stat-num" id="total-papers">12</span> papers</div> |
| | <div><span class="stat-num">10</span> PDFs</div> |
| | <div><span class="stat-num">7</span> LaTeX</div> |
| | </div> |
| | </div> |
| | </div> |
| |
|
| | <div class="tabs" id="tabs"> |
| | <div class="tab active" data-tab="papers">Papers</div> |
| | <div class="tab" data-tab="comparison">Benchmarks</div> |
| | <div class="tab" data-tab="sota">SOTA</div> |
| | <div class="tab" data-tab="network">Citation Network</div> |
| | <div class="tab" data-tab="leaderboard">Leaderboard</div> |
| | <div class="tab" data-tab="science">How to Do Science</div> |
| | </div> |
| |
|
| | |
| | <div class="content"> |
| | <div class="panel active" id="panel-papers"> |
| | <div class="section-title">Paper Database</div> |
| | <div class="section-desc">Research papers related to Vietnamese text classification, fetched from arXiv and ACL Anthology.</div> |
| |
|
| | <div class="filters"> |
| | <button class="filter-btn active" data-cat="all">All (12)</button> |
| | <button class="filter-btn" data-cat="vn-cls">Vietnamese Classification</button> |
| | <button class="filter-btn" data-cat="vn-pretrained">Vietnamese Pretrained</button> |
| | <button class="filter-btn" data-cat="multilingual">Multilingual Models</button> |
| | <button class="filter-btn" data-cat="datasets">Datasets</button> |
| | </div> |
| |
|
| | <div class="papers-grid" id="papers-grid"> |
| |
|
| | |
| | <div class="paper-card" data-cats="vn-pretrained"> |
| | <div class="paper-title"> |
| | <a href="https://arxiv.org/abs/2003.00744" target="_blank">PhoBERT: Pre-trained language models for Vietnamese</a> |
| | </div> |
| | <div class="paper-meta"> |
| | <span class="tag tag-year">2020</span> |
| | <span class="tag tag-venue">EMNLP Findings</span> |
| | <span class="tag tag-pdf">PDF</span> |
| | <span class="tag tag-tex">TEX</span> |
| | <span class="tag tag-md">MD</span> |
| | <span>Dat Quoc Nguyen, Anh Tuan Nguyen</span> |
| | </div> |
| | <div class="paper-abstract">We present PhoBERT with two versions, PhoBERT-base and PhoBERT-large, the first public large-scale monolingual language models pre-trained for Vietnamese. Experimental results show that PhoBERT consistently outperforms the recent best pre-trained multilingual model XLM-R and improves the state-of-the-art in multiple Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and Natural language inference.</div> |
| | <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"> |
| | <code>2020.arxiv.nguyen/</code> · <code>2020.findings.anh/</code> |
| | </div> |
| | </div> |
| |
|
| | |
| | <div class="paper-card" data-cats="vn-pretrained"> |
| | <div class="paper-title"> |
| | <a href="https://arxiv.org/abs/2310.11166" target="_blank">ViSoBERT: A Pre-Trained Language Model for Vietnamese Social Media Text Processing</a> |
| | </div> |
| | <div class="paper-meta"> |
| | <span class="tag tag-year">2023</span> |
| | <span class="tag tag-venue">EMNLP 2023</span> |
| | <span class="tag tag-pdf">PDF</span> |
| | <span class="tag tag-tex">TEX</span> |
| | <span class="tag tag-md">MD</span> |
| | <span>Quoc-Nam Nguyen, Thang Chau Phan, Duc-Vu Nguyen, Kiet Van Nguyen</span> |
| | </div> |
| | <div class="paper-abstract">We present the first monolingual pre-trained language model for Vietnamese social media texts, ViSoBERT, which is pre-trained on a large-scale corpus of high-quality and diverse Vietnamese social media texts using XLM-R architecture. ViSoBERT surpasses the previous state-of-the-art models on multiple Vietnamese social media tasks with far fewer parameters.</div> |
| | <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"> |
| | <code>2023.arxiv.nguyen/</code> · <code>2023.emnlp.kiet/</code> |
| | </div> |
| | </div> |
| |
|
| | |
| | <div class="paper-card" data-cats="vn-pretrained"> |
| | <div class="paper-title"> |
| | <a href="https://arxiv.org/abs/2006.15994" target="_blank">Improving Sequence Tagging for Vietnamese Text Using Transformer-based Neural Models</a> |
| | </div> |
| | <div class="paper-meta"> |
| | <span class="tag tag-year">2020</span> |
| | <span class="tag tag-venue">arXiv</span> |
| | <span class="tag tag-pdf">PDF</span> |
| | <span class="tag tag-tex">TEX</span> |
| | <span class="tag tag-md">MD</span> |
| | <span>Viet Bui The, Oanh Tran Thi, Phuong Le-Hong</span> |
| | </div> |
| | <div class="paper-abstract">Introduces viBERT (trained on 10GB) and vELECTRA (trained on 60GB) Vietnamese pretrained models. Strong performance on sequence tagging and text classification tasks. vELECTRA achieves 95.26% on ViOCD complaint classification in the SMTCE benchmark.</div> |
| | <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"><code>2020.arxiv.the/</code></div> |
| | </div> |
| |
|
| | |
| | <div class="paper-card" data-cats="vn-cls"> |
| | <div class="paper-title"> |
| | <a href="https://arxiv.org/abs/2209.10482" target="_blank">SMTCE: A Social Media Text Classification Evaluation Benchmark and BERTology Models for Vietnamese</a> |
| | </div> |
| | <div class="paper-meta"> |
| | <span class="tag tag-year">2022</span> |
| | <span class="tag tag-venue">PACLIC 2022</span> |
| | <span class="tag tag-pdf">PDF</span> |
| | <span class="tag tag-tex">TEX</span> |
| | <span class="tag tag-md">MD</span> |
| | <span>Luan Thanh Nguyen, Kiet Van Nguyen, Ngan Luu-Thuy Nguyen</span> |
| | </div> |
| | <div class="paper-abstract">GLUE-inspired benchmark for Vietnamese social media text classification. Compares multilingual (mBERT, XLM-R, DistilmBERT) and monolingual (PhoBERT, viBERT, vELECTRA, viBERT4news) BERT models. Monolingual models consistently outperform multilingual for Vietnamese.</div> |
| | <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"> |
| | <code>2022.arxiv.nguyen/</code> · <code>2022.paclic.ngan/</code> |
| | </div> |
| | </div> |
| |
|
| | |
| | <div class="paper-card" data-cats="vn-cls"> |
| | <div class="paper-title"> |
| | <a href="https://ieeexplore.ieee.org/document/4223084/" target="_blank">A Comparative Study on Vietnamese Text Classification Methods</a> |
| | </div> |
| | <div class="paper-meta"> |
| | <span class="tag tag-year">2007</span> |
| | <span class="tag tag-venue">IEEE RIVF</span> |
| | <span class="tag tag-md">MD</span> |
| | <span>Cong Duy Vu Hoang, Dien Dinh, Le Nguyen Nguyen, Quoc Hung Ngo</span> |
| | </div> |
| | <div class="paper-abstract">Seminal paper introducing VNTC corpus and comparing BOW and N-gram language model approaches for Vietnamese text classification. N-gram LM achieves 97.1% accuracy, SVM Multi achieves 93.4% on 10-topic news classification. The VNTC dataset remains the standard benchmark.</div> |
| | <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"><code>2007.rivf.hoang/</code></div> |
| | </div> |
| |
|
| | |
| | <div class="paper-card" data-cats="multilingual"> |
| | <div class="paper-title"> |
| | <a href="https://arxiv.org/abs/1907.11692" target="_blank">RoBERTa: A Robustly Optimized BERT Pretraining Approach</a> |
| | </div> |
| | <div class="paper-meta"> |
| | <span class="tag tag-year">2019</span> |
| | <span class="tag tag-venue">arXiv</span> |
| | <span class="tag tag-pdf">PDF</span> |
| | <span class="tag tag-tex">TEX</span> |
| | <span class="tag tag-md">MD</span> |
| | <span>Yinhan Liu, Myle Ott, Naman Goyal, ...</span> |
| | </div> |
| | <div class="paper-abstract">PhoBERT is based on the RoBERTa architecture. Key optimizations over BERT: dynamic masking, larger batches, more training data, removal of Next Sentence Prediction (NSP). Foundation for most Vietnamese pretrained models.</div> |
| | <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"><code>2019.arxiv.liu/</code></div> |
| | </div> |
| |
|
| | |
| | <div class="paper-card" data-cats="multilingual"> |
| | <div class="paper-title"> |
| | <a href="https://arxiv.org/abs/1911.02116" target="_blank">Unsupervised Cross-lingual Representation Learning at Scale (XLM-RoBERTa)</a> |
| | </div> |
| | <div class="paper-meta"> |
| | <span class="tag tag-year">2019</span> |
| | <span class="tag tag-venue">ACL 2020</span> |
| | <span class="tag tag-pdf">PDF</span> |
| | <span class="tag tag-tex">TEX</span> |
| | <span class="tag tag-md">MD</span> |
| | <span>Alexis Conneau, Kartikay Khandelwal, Naman Goyal, ...</span> |
| | </div> |
| | <div class="paper-abstract">Multilingual pretrained model trained on 100 languages (2.5TB CC-100). Strong multilingual baseline for Vietnamese, but consistently outperformed by monolingual PhoBERT on Vietnamese-specific tasks.</div> |
| | <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"><code>2019.arxiv.conneau/</code></div> |
| | </div> |
| |
|
| | |
| | <div class="paper-card" data-cats="datasets"> |
| | <div class="paper-title"> |
| | <a href="https://arxiv.org/abs/1911.09339" target="_blank">Emotion Recognition for Vietnamese Social Media Text (UIT-VSMEC)</a> |
| | </div> |
| | <div class="paper-meta"> |
| | <span class="tag tag-year">2019</span> |
| | <span class="tag tag-venue">CSoNet 2020</span> |
| | <span class="tag tag-pdf">PDF</span> |
| | <span class="tag tag-tex">TEX</span> |
| | <span class="tag tag-md">MD</span> |
| | <span>Vong Anh Ho, Duong Huynh-Cong Nguyen, Danh Hoang Nguyen, ...</span> |
| | </div> |
| | <div class="paper-abstract">Introduces UIT-VSMEC corpus: 6,927 emotion-annotated Vietnamese social media sentences with 7 labels (sadness, enjoyment, anger, disgust, fear, surprise, other). CNN baseline achieves 59.74% weighted F1.</div> |
| | <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"><code>2019.arxiv.ho/</code></div> |
| | </div> |
| |
|
| | |
| | <div class="paper-card" data-cats="datasets"> |
| | <div class="paper-title"> |
| | <a href="https://ieeexplore.ieee.org/document/8573337/" target="_blank">UIT-VSFC: Vietnamese Students' Feedback Corpus for Sentiment Analysis</a> |
| | </div> |
| | <div class="paper-meta"> |
| | <span class="tag tag-year">2018</span> |
| | <span class="tag tag-venue">KSE 2018</span> |
| | <span class="tag tag-md">MD</span> |
| | <span>Kiet Van Nguyen, Vu Duc Nguyen, Phu Xuan-Vinh Nguyen, ...</span> |
| | </div> |
| | <div class="paper-abstract">16,175 Vietnamese student feedback sentences annotated for sentiment (3 classes: positive, negative, neutral) and topic classification. Inter-annotator agreement: 91.20% for sentiment. MaxEnt baseline: 88% sentiment F1.</div> |
| | <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"><code>2018.kse.nguyen/</code></div> |
| | </div> |
| |
|
| | </div> |
| | </div> |
| |
|
| | |
| | <div class="panel" id="panel-comparison"> |
| | <div class="section-title">Benchmark Comparison</div> |
| | <div class="section-desc">Vietnamese text classification results across datasets and models.</div> |
| |
|
| | <h3>VNTC Dataset (10-topic News Classification)</h3> |
| | <table> |
| | <tr><th>Model</th><th>Year</th><th>Accuracy</th><th>F1 (weighted)</th><th>Training</th><th>Inference</th><th>Size</th></tr> |
| | <tr><td>N-gram LM (Hoang et al.)</td><td>2007</td><td class="highlight">97.1%</td><td>-</td><td>~79 min</td><td>-</td><td>-</td></tr> |
| | <tr><td>SVM Multi (Hoang et al.)</td><td>2007</td><td>93.4%</td><td>-</td><td>~79 min</td><td>-</td><td>-</td></tr> |
| | <tr><td>sonar_core_1 (SVC)</td><td>-</td><td>92.80%</td><td>92.0%</td><td>~54.6 min</td><td>-</td><td>~75MB</td></tr> |
| | <tr style="background:var(--surface2)"><td><strong>Sen-1 (LinearSVC)</strong></td><td>2026</td><td>92.49%</td><td>92.40%</td><td class="highlight">37.6s</td><td class="highlight">66K/sec</td><td class="highlight">2.4MB</td></tr> |
| | <tr><td>PhoBERT-base*</td><td>2020</td><td>~95-97%</td><td>~95%</td><td>Hours (GPU)</td><td>~20/sec</td><td>~400MB</td></tr> |
| | </table> |
| | <p style="font-size:0.78rem;color:var(--text2)">*PhoBERT not directly evaluated on VNTC; estimates from similar tasks.</p> |
| |
|
| | <h3>UTS2017_Bank Dataset (14-category Banking)</h3> |
| | <table> |
| | <tr><th>Model</th><th>Accuracy</th><th>F1 (weighted)</th><th>F1 (macro)</th><th>Training</th></tr> |
| | <tr style="background:var(--surface2)"><td><strong>Sen-1</strong></td><td class="highlight">75.76%</td><td class="highlight">72.70%</td><td>36.18%</td><td class="highlight">0.13s</td></tr> |
| | <tr><td>sonar_core_1</td><td>72.47%</td><td>66.0%</td><td>-</td><td>~5.3s</td></tr> |
| | </table> |
| |
|
| | <h3>Vietnamese Pretrained Models</h3> |
| | <table> |
| | <tr><th>Model</th><th>Architecture</th><th>Pre-training Data</th><th>Languages</th><th>Vietnamese Tasks</th></tr> |
| | <tr><td><strong>PhoBERT</strong></td><td>RoBERTa</td><td>20GB Vietnamese</td><td>1 (vi)</td><td>SOTA: POS, NER, NLI</td></tr> |
| | <tr><td><strong>ViSoBERT</strong></td><td>XLM-R</td><td>Social media corpus</td><td>1 (vi)</td><td>SOTA: social media tasks</td></tr> |
| | <tr><td><strong>vELECTRA</strong></td><td>ELECTRA</td><td>60GB Vietnamese</td><td>1 (vi)</td><td>Strong on classification</td></tr> |
| | <tr><td>viBERT</td><td>BERT</td><td>10GB Vietnamese</td><td>1 (vi)</td><td>Baseline</td></tr> |
| | <tr><td>XLM-R</td><td>RoBERTa</td><td>CC-100 (2.5TB)</td><td>100</td><td>Strong multilingual</td></tr> |
| | <tr><td>mBERT</td><td>BERT</td><td>Wikipedia</td><td>104</td><td>Weakest on Vietnamese</td></tr> |
| | </table> |
| |
|
| | <h3>SMTCE Benchmark (Best model per task)</h3> |
| | <table> |
| | <tr><th>Task</th><th>Best Model</th><th>Score</th><th>Runner-up</th></tr> |
| | <tr><td>UIT-VSMEC (Emotion)</td><td class="highlight">PhoBERT</td><td>65.44% F1</td><td>viBERT4news</td></tr> |
| | <tr><td>ViOCD (Complaint)</td><td class="highlight">vELECTRA</td><td>95.26% F1</td><td>PhoBERT</td></tr> |
| | <tr><td>ViHSD (Hate Speech)</td><td class="highlight">PhoBERT</td><td>-</td><td>XLM-R</td></tr> |
| | <tr><td>ViCTSD (Constructive)</td><td class="highlight">PhoBERT</td><td>-</td><td>vELECTRA</td></tr> |
| | <tr><td>UIT-VSFC (Sentiment)</td><td class="highlight">PhoBERT</td><td>-</td><td>viBERT</td></tr> |
| | </table> |
| |
|
| | <h3>Model Efficiency</h3> |
| | <table> |
| | <tr><th>Model</th><th>Size</th><th>VNTC Accuracy</th><th>Efficiency (Acc/MB)</th></tr> |
| | <tr style="background:var(--surface2)"><td><strong>Sen-1</strong></td><td class="highlight">2.4 MB</td><td>92.49%</td><td class="highlight">38.5</td></tr> |
| | <tr><td>PhoBERT-base</td><td>~400 MB</td><td>~95%</td><td>0.24</td></tr> |
| | <tr><td>XLM-R-base</td><td>~1.1 GB</td><td>~93%</td><td>0.08</td></tr> |
| | </table> |
| | <p style="margin-top:0.75rem;font-size:0.9rem;color:var(--accent2)"><strong>Sen-1 is ~160x more efficient</strong> in accuracy-per-MB than PhoBERT.</p> |
| | </div> |
| |
|
| | |
| | <div class="panel" id="panel-sota"> |
| | <div class="section-title">State-of-the-Art</div> |
| | <div class="section-desc">Current SOTA for Vietnamese text classification tasks (as of 2026).</div> |
| |
|
| | <table> |
| | <tr><th>Task</th><th>Dataset</th><th>SOTA Model</th><th>Score</th><th>Paper</th></tr> |
| | <tr><td>News Classification</td><td>VNTC</td><td>N-gram LM</td><td class="highlight">97.1% Acc</td><td>Hoang et al. 2007</td></tr> |
| | <tr><td>Emotion Recognition</td><td>UIT-VSMEC</td><td class="highlight">ViSoBERT</td><td>SOTA F1</td><td>Nguyen et al. 2023</td></tr> |
| | <tr><td>Sentiment Analysis</td><td>UIT-VSFC</td><td class="highlight">PhoBERT</td><td>SOTA F1</td><td>SMTCE 2022</td></tr> |
| | <tr><td>Hate Speech</td><td>ViHSD</td><td class="highlight">PhoBERT/ViSoBERT</td><td>SOTA F1</td><td>SMTCE/ViSoBERT</td></tr> |
| | <tr><td>Complaint Detection</td><td>ViOCD</td><td class="highlight">vELECTRA</td><td>95.26% F1</td><td>SMTCE 2022</td></tr> |
| | <tr><td>Spam Reviews</td><td>ViSpamReviews</td><td class="highlight">ViSoBERT</td><td>SOTA F1</td><td>Nguyen et al. 2023</td></tr> |
| | </table> |
| |
|
| | <h3>Key Trends</h3> |
| | <div class="cards-row"> |
| | <div class="info-card"> |
| | <h4>Monolingual > Multilingual</h4> |
| | <p>PhoBERT, ViSoBERT, vELECTRA consistently outperform XLM-R, mBERT on Vietnamese tasks.</p> |
| | </div> |
| | <div class="info-card"> |
| | <h4>Domain-specific Pretraining</h4> |
| | <p>ViSoBERT (social media) outperforms PhoBERT (general) on social media tasks.</p> |
| | </div> |
| | <div class="info-card"> |
| | <h4>Traditional ML Still Competitive</h4> |
| | <p>TF-IDF + SVM achieves 92%+ on news classification with 160x less resources.</p> |
| | </div> |
| | <div class="info-card"> |
| | <h4>Word Segmentation Matters</h4> |
| | <p>~5% accuracy gap between syllable-level (Sen-1) and word-level approaches.</p> |
| | </div> |
| | </div> |
| |
|
| | <h3>Sen-1 Position</h3> |
| | <div class="network"><b> Accuracy</b> |
| | High ^ |
| | | <b>PhoBERT</b> <u>ViSoBERT</u> |
| | | * * |
| | | |
| | | <em>N-gram (2007)</em> |
| | | * |
| | | <b style="color:var(--accent4)">Sen-1</b> |
| | | * |
| | | |
| | Low | |
| | +-------------------------------> |
| | Fast Slow |
| | Inference Speed</div> |
| | <p style="margin-top:1rem;font-size:0.9rem;color:var(--text2)"> |
| | Sen-1 = <strong style="color:var(--accent4)">fast + lightweight</strong> quadrant: edge deployment, real-time batch processing, resource-constrained environments. |
| | </p> |
| |
|
| | <h3>Open Questions</h3> |
| | <div class="cards-row"> |
| | <div class="info-card"> |
| | <h4>RQ1</h4> |
| | <p>Can word segmentation close the gap between Sen-1 and PhoBERT?</p> |
| | </div> |
| | <div class="info-card"> |
| | <h4>RQ2</h4> |
| | <p>How does Sen-1 perform on social media/informal text?</p> |
| | </div> |
| | <div class="info-card"> |
| | <h4>RQ3</h4> |
| | <p>Can ensemble (Sen-1 + lightweight transformer) get speed + accuracy?</p> |
| | </div> |
| | <div class="info-card"> |
| | <h4>RQ4</h4> |
| | <p>Minimum dataset size where PhoBERT outperforms TF-IDF+SVM?</p> |
| | </div> |
| | </div> |
| | </div> |
| |
|
| | |
| | <div class="panel" id="panel-network"> |
| | <div class="section-title">Citation Network</div> |
| | <div class="section-desc">How the papers in this collection relate to each other and to Sen-1.</div> |
| |
|
| | <div class="network"><b>Hoang et al. 2007</b> (VNTC dataset) |
| | | |
| | +---> Vietnamese text classification research |
| | | | |
| | | <b>RoBERTa</b> (2019) ---> <b>PhoBERT</b> (2020) ---> <u>ViSoBERT</u> (2023) |
| | | | | |
| | | <b>XLM-R</b> (2019) ------> <em>vELECTRA</em> (2020) <em>SMTCE benchmark</em> (2022) |
| | | | | |
| | | <b style="color:var(--accent4)">Sen-2</b> (future) UIT-VSMEC, UIT-VSFC |
| | | |
| | +---> <b style="color:var(--accent4)">Sen-1</b> (TF-IDF + SVM baseline) |
| | | |
| | +---> 92.49% VNTC | 75.76% UTS2017_Bank |
| | | |
| | +---> Phase 2: word segmentation, PhoBERT comparison |
| | | |
| | +---> Phase 3: <b style="color:var(--accent4)">Sen-2</b> (PhoBERT-based)</div> |
| |
|
| | <h3>Available Datasets</h3> |
| | <table> |
| | <tr><th>Dataset</th><th>Task</th><th>Samples</th><th>Classes</th><th>Domain</th><th>Source</th></tr> |
| | <tr><td><strong>VNTC</strong></td><td>Topic</td><td>84,132</td><td>10</td><td>News</td><td><a href="https://github.com/duyvuleo/VNTC">GitHub</a></td></tr> |
| | <tr><td><strong>UTS2017_Bank</strong></td><td>Intent</td><td>1,977</td><td>14</td><td>Banking</td><td>HuggingFace</td></tr> |
| | <tr><td><strong>UIT-VSMEC</strong></td><td>Emotion</td><td>6,927</td><td>7</td><td>Social media</td><td>UIT NLP</td></tr> |
| | <tr><td><strong>UIT-VSFC</strong></td><td>Sentiment</td><td>16,175</td><td>3</td><td>Education</td><td><a href="https://huggingface.co/datasets/uitnlp/vietnamese_students_feedback">HuggingFace</a></td></tr> |
| | <tr><td><strong>SMTCE</strong></td><td>Multi-task</td><td>Multiple</td><td>Various</td><td>Social media</td><td><a href="https://arxiv.org/abs/2209.10482">arXiv</a></td></tr> |
| | </table> |
| |
|
| | <h3>Research Gaps</h3> |
| | <div class="cards-row"> |
| | <div class="info-card"> |
| | <h4>Gap 1</h4> |
| | <p>No comprehensive TF-IDF vs PhoBERT comparison on same Vietnamese benchmarks with controlled experiments.</p> |
| | </div> |
| | <div class="info-card"> |
| | <h4>Gap 2</h4> |
| | <p>Limited edge/resource-constrained deployment studies. Most work focuses on accuracy, not efficiency.</p> |
| | </div> |
| | <div class="info-card"> |
| | <h4>Gap 3</h4> |
| | <p>Class imbalance handling for Vietnamese datasets is under-explored.</p> |
| | </div> |
| | <div class="info-card"> |
| | <h4>Gap 4</h4> |
| | <p>Cross-domain evaluation and ablation studies for Vietnamese features are rare.</p> |
| | </div> |
| | </div> |
| | </div> |
| |
|
| | |
| | <div class="panel" id="panel-leaderboard"> |
| |
|
| | <div class="lb-hero"> |
| | <h2>Vietnamese Text Classification Leaderboard</h2> |
| | <p>Comprehensive comparison of models across Vietnamese NLP benchmarks, speed, and efficiency.</p> |
| | <div class="lb-updated">Updated: February 2026 · Inspired by <a href="https://www.vellum.ai/llm-leaderboard" target="_blank">Vellum LLM Leaderboard</a></div> |
| | </div> |
| |
|
| | |
| | <div class="lb-section-title">Quality Benchmarks</div> |
| | <div class="lb-section-desc">Top models per dataset, ranked by primary metric.</div> |
| |
|
| | <div class="lb-spotlight"> |
| |
|
| | |
| | <div class="lb-spot-card"> |
| | <h4>News Classification</h4> |
| | <div class="lb-bench-name">VNTC (10 topics, 84K samples)</div> |
| | <ol class="lb-rank-list"> |
| | <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name">N-gram LM</span><span class="lb-score lb-score-top">97.1%</span></li> |
| | <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">PhoBERT-base*</span><span class="lb-score">~95%</span></li> |
| | <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">SVM Multi</span><span class="lb-score">93.4%</span></li> |
| | <li><span class="lb-rank-num lb-rank-other">4</span><span class="lb-model-name">sonar_core_1</span><span class="lb-score">92.80%</span></li> |
| | <li><span class="lb-rank-num lb-rank-other">5</span><span class="lb-model-name lb-sen1">Sen-1</span><span class="lb-score">92.49%</span></li> |
| | </ol> |
| | </div> |
| |
|
| | |
| | <div class="lb-spot-card"> |
| | <h4>Banking Classification</h4> |
| | <div class="lb-bench-name">UTS2017_Bank (14 categories, 1.9K samples)</div> |
| | <ol class="lb-rank-list"> |
| | <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name lb-sen1">Sen-1</span><span class="lb-score lb-score-top">75.76%</span></li> |
| | <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">sonar_core_1</span><span class="lb-score">72.47%</span></li> |
| | </ol> |
| | </div> |
| |
|
| | |
| | <div class="lb-spot-card"> |
| | <h4>Emotion Recognition</h4> |
| | <div class="lb-bench-name">UIT-VSMEC (7 classes, 6.9K samples)</div> |
| | <ol class="lb-rank-list"> |
| | <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name">ViSoBERT</span><span class="lb-score lb-score-top">SOTA</span></li> |
| | <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">PhoBERT</span><span class="lb-score">65.44%</span></li> |
| | <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">viBERT4news</span><span class="lb-score">-</span></li> |
| | <li><span class="lb-rank-num lb-rank-other">4</span><span class="lb-model-name">CNN baseline</span><span class="lb-score">59.74%</span></li> |
| | </ol> |
| | </div> |
| |
|
| | |
| | <div class="lb-spot-card"> |
| | <h4>Sentiment Analysis</h4> |
| | <div class="lb-bench-name">UIT-VSFC (3 classes, 16K samples)</div> |
| | <ol class="lb-rank-list"> |
| | <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name">PhoBERT</span><span class="lb-score lb-score-top">SOTA</span></li> |
| | <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">viBERT</span><span class="lb-score">-</span></li> |
| | <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">MaxEnt baseline</span><span class="lb-score">88%</span></li> |
| | </ol> |
| | </div> |
| |
|
| | |
| | <div class="lb-spot-card"> |
| | <h4>Complaint Detection</h4> |
| | <div class="lb-bench-name">ViOCD (SMTCE benchmark)</div> |
| | <ol class="lb-rank-list"> |
| | <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name">vELECTRA</span><span class="lb-score lb-score-top">95.26%</span></li> |
| | <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">PhoBERT</span><span class="lb-score">-</span></li> |
| | <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">XLM-R</span><span class="lb-score">-</span></li> |
| | </ol> |
| | </div> |
| |
|
| | |
| | <div class="lb-spot-card"> |
| | <h4>Hate Speech Detection</h4> |
| | <div class="lb-bench-name">ViHSD (SMTCE benchmark)</div> |
| | <ol class="lb-rank-list"> |
| | <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name">PhoBERT</span><span class="lb-score lb-score-top">SOTA</span></li> |
| | <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">ViSoBERT</span><span class="lb-score">-</span></li> |
| | <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">XLM-R</span><span class="lb-score">-</span></li> |
| | </ol> |
| | </div> |
| |
|
| | </div> |
| |
|
| | |
| | <div class="lb-section-title">Performance Metrics</div> |
| | <div class="lb-section-desc">Speed, latency, and efficiency rankings.</div> |
| |
|
| | <div class="lb-spotlight"> |
| |
|
| | |
| | <div class="lb-spot-card"> |
| | <h4>Fastest Inference</h4> |
| | <div class="lb-bench-name">Batch throughput (samples/sec)</div> |
| | <ol class="lb-rank-list"> |
| | <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name lb-sen1">Sen-1</span><span class="lb-score lb-score-top">66,678/s</span></li> |
| | <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">TF-IDF + SVM (sklearn)</span><span class="lb-score">~50K/s</span></li> |
| | <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">PhoBERT (GPU)</span><span class="lb-score">~20/s</span></li> |
| | </ol> |
| | </div> |
| |
|
| | |
| | <div class="lb-spot-card"> |
| | <h4>Smallest Model</h4> |
| | <div class="lb-bench-name">Model file size</div> |
| | <ol class="lb-rank-list"> |
| | <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name lb-sen1">Sen-1</span><span class="lb-score lb-score-top">2.4 MB</span></li> |
| | <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">sonar_core_1</span><span class="lb-score">~75 MB</span></li> |
| | <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">PhoBERT-base</span><span class="lb-score">~400 MB</span></li> |
| | <li><span class="lb-rank-num lb-rank-other">4</span><span class="lb-model-name">XLM-R-base</span><span class="lb-score">~1.1 GB</span></li> |
| | </ol> |
| | </div> |
| |
|
| | |
| | <div class="lb-spot-card"> |
| | <h4>Most Efficient</h4> |
| | <div class="lb-bench-name">Accuracy per MB (VNTC)</div> |
| | <ol class="lb-rank-list"> |
| | <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name lb-sen1">Sen-1</span><span class="lb-score lb-score-top">38.5</span></li> |
| | <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">sonar_core_1</span><span class="lb-score">1.24</span></li> |
| | <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">PhoBERT-base</span><span class="lb-score">0.24</span></li> |
| | <li><span class="lb-rank-num lb-rank-other">4</span><span class="lb-model-name">XLM-R-base</span><span class="lb-score">0.08</span></li> |
| | </ol> |
| | </div> |
| |
|
| | |
| | <div class="lb-spot-card"> |
| | <h4>Fastest Training</h4> |
| | <div class="lb-bench-name">VNTC full training time</div> |
| | <ol class="lb-rank-list"> |
| | <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name lb-sen1">Sen-1 (Rust)</span><span class="lb-score lb-score-top">37.6s</span></li> |
| | <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">TF-IDF+SVM (sklearn)</span><span class="lb-score">~2 min</span></li> |
| | <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">sonar_core_1</span><span class="lb-score">54.6 min</span></li> |
| | <li><span class="lb-rank-num lb-rank-other">4</span><span class="lb-model-name">N-gram LM</span><span class="lb-score">~79 min</span></li> |
| | <li><span class="lb-rank-num lb-rank-other">5</span><span class="lb-model-name">PhoBERT fine-tune</span><span class="lb-score">Hours</span></li> |
| | </ol> |
| | </div> |
| |
|
| | </div> |
| |
|
| | |
| | <div class="lb-section-title">Comprehensive Comparison</div> |
| | <div class="lb-section-desc">All models with operational and benchmark metrics. Click column headers to sort.</div> |
| |
|
| | <div style="display:flex;gap:0.5rem;margin-bottom:1rem;flex-wrap:wrap"> |
| | <span style="display:inline-flex;align-items:center;gap:0.35rem;font-size:0.78rem;color:var(--text2)"><span class="model-type type-traditional" style="width:10px;height:10px;display:inline-block"></span> Traditional ML</span> |
| | <span style="display:inline-flex;align-items:center;gap:0.35rem;font-size:0.78rem;color:var(--text2)"><span class="model-type type-transformer" style="width:10px;height:10px;display:inline-block"></span> Vietnamese Transformer</span> |
| | <span style="display:inline-flex;align-items:center;gap:0.35rem;font-size:0.78rem;color:var(--text2)"><span class="model-type type-multilingual" style="width:10px;height:10px;display:inline-block"></span> Multilingual</span> |
| | </div> |
| |
|
| | <div class="lb-table-wrap"> |
| | <table class="lb-table" id="lb-main-table"> |
| | <thead> |
| | <tr> |
| | <th data-col="0" data-type="num">#</th> |
| | <th data-col="1" data-type="str">Model</th> |
| | <th data-col="2" data-type="str">Type</th> |
| | <th data-col="3" data-type="str">Architecture</th> |
| | <th data-col="4" data-type="num">Size</th> |
| | <th data-col="5" data-type="num">VNTC<br>Acc %</th> |
| | <th data-col="6" data-type="num">UTS2017<br>Acc %</th> |
| | <th data-col="7" data-type="num">UIT-VSMEC<br>F1 %</th> |
| | <th data-col="8" data-type="num">ViOCD<br>F1 %</th> |
| | <th data-col="9" data-type="num">Training</th> |
| | <th data-col="10" data-type="num">Inference<br>/sec</th> |
| | <th data-col="11" data-type="num">Eff.<br>Acc/MB</th> |
| | </tr> |
| | </thead> |
| | <tbody> |
| | <tr><td>1</td><td><div class="model-cell"><span class="model-type type-traditional"></span>N-gram LM</div></td><td>Traditional</td><td>N-gram Language Model</td><td class="cell-na">n/a</td><td class="cell-best">97.1</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>~79 min</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> |
| | <tr><td>2</td><td><div class="model-cell"><span class="model-type type-transformer"></span>PhoBERT-base</div></td><td>Transformer</td><td>RoBERTa (20GB vi)</td><td>~400 MB</td><td>~95</td><td class="cell-na">n/a</td><td>65.44</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td>~20</td><td>0.24</td></tr> |
| | <tr><td>3</td><td><div class="model-cell"><span class="model-type type-transformer"></span>ViSoBERT</div></td><td>Transformer</td><td>XLM-R (social media)</td><td>~400 MB</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-best">SOTA</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> |
| | <tr><td>4</td><td><div class="model-cell"><span class="model-type type-transformer"></span>vELECTRA</div></td><td>Transformer</td><td>ELECTRA (60GB vi)</td><td>~400 MB</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-best">95.26</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> |
| | <tr><td>5</td><td><div class="model-cell"><span class="model-type type-traditional"></span>SVM Multi</div></td><td>Traditional</td><td>SVM + BOW</td><td class="cell-na">n/a</td><td>93.4</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>~79 min</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> |
| | <tr><td>6</td><td><div class="model-cell"><span class="model-type type-traditional"></span>sonar_core_1</div></td><td>Traditional</td><td>TF-IDF + SVC (RBF)</td><td>~75 MB</td><td>92.80</td><td>72.47</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>54.6 min</td><td class="cell-na">n/a</td><td>1.24</td></tr> |
| | <tr class="lb-row-sen1"><td>7</td><td><div class="model-cell"><span class="model-type type-traditional"></span><strong style="color:var(--accent4)">Sen-1</strong></div></td><td>Traditional</td><td>TF-IDF + LinearSVC (Rust)</td><td class="cell-best">2.4 MB</td><td>92.49</td><td class="cell-best">75.76</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-best">37.6s</td><td class="cell-best">66,678</td><td class="cell-best">38.5</td></tr> |
| | <tr><td>8</td><td><div class="model-cell"><span class="model-type type-multilingual"></span>XLM-R-base</div></td><td>Multilingual</td><td>RoBERTa (100 langs)</td><td>~1.1 GB</td><td>~93</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td>0.08</td></tr> |
| | <tr><td>9</td><td><div class="model-cell"><span class="model-type type-multilingual"></span>mBERT</div></td><td>Multilingual</td><td>BERT (104 langs)</td><td>~700 MB</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> |
| | <tr><td>10</td><td><div class="model-cell"><span class="model-type type-transformer"></span>viBERT</div></td><td>Transformer</td><td>BERT (10GB vi)</td><td>~400 MB</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> |
| | <tr><td>11</td><td><div class="model-cell"><span class="model-type type-transformer"></span>viBERT4news</div></td><td>Transformer</td><td>BERT (news domain)</td><td>~400 MB</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> |
| | <tr><td>12</td><td><div class="model-cell"><span class="model-type type-traditional"></span>MaxEnt baseline</div></td><td>Traditional</td><td>Maximum Entropy</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> |
| | <tr><td>13</td><td><div class="model-cell"><span class="model-type type-traditional"></span>CNN baseline</div></td><td>Traditional</td><td>Convolutional NN</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>59.74</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> |
| | <tr><td>14</td><td><div class="model-cell"><span class="model-type type-multilingual"></span>DistilmBERT</div></td><td>Multilingual</td><td>DistilBERT (multilingual)</td><td>~260 MB</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> |
| | </tbody> |
| | </table> |
| | </div> |
| |
|
| | <p style="font-size:0.75rem;color:var(--text2);margin-top:0.5rem"> |
| | * PhoBERT VNTC estimate based on similar Vietnamese classification tasks. Cells marked "n/a" indicate the model was not evaluated on that benchmark. |
| | <br>Efficiency = VNTC Accuracy / Model Size in MB. Higher is better. |
| | </p> |
| |
|
| | </div> |
| |
|
| | |
| | <div class="panel" id="panel-science"> |
| | <div class="section-title">How to Do Science</div> |
| | <div class="section-desc">Research methodology guide compiled from Hamming, Schulman, Marek Rei, and Microsoft Research Asia.</div> |
| |
|
| | <h3>1. Choosing Important Problems</h3> |
| |
|
| | <blockquote>"If you do not work on an important problem, it's unlikely you'll do important work." <strong>-- Richard Hamming</strong></blockquote> |
| |
|
| | <div class="principle"> |
| | <h4>Hamming's Principles</h4> |
| | <ul> |
| | <li>Maintain a list of <strong>10-20 important problems</strong> in your field</li> |
| | <li>A problem becomes important when you have a <strong>reasonable attack</strong></li> |
| | <li>Dedicate deep thinking time (Friday "Great Thoughts Time")</li> |
| | <li>Have <strong>courage</strong> to pursue unconventional ideas</li> |
| | </ul> |
| | </div> |
| |
|
| | <div class="principle"> |
| | <h4>Schulman's Framework (OpenAI)</h4> |
| | <ul> |
| | <li><strong>Work on the right problems</strong></li> |
| | <li><strong>Make continual progress</strong></li> |
| | <li><strong>Achieve continual personal growth</strong></li> |
| | </ul> |
| | <p style="margin-top:0.5rem">Develop "research taste" by reading broadly, collaborating widely, and asking: "If this succeeds, how big is the impact?"</p> |
| | </div> |
| |
|
| | <div class="principle"> |
| | <h4>Microsoft Research Asia (Dr. Ming Zhou)</h4> |
| | <ul> |
| | <li>Read recent <strong>ACL proceedings</strong> to find your field</li> |
| | <li>Target <strong>"blue ocean"</strong> areas - new fields with less competition</li> |
| | <li>Verify 3 prerequisites: math/ML framework, standard datasets, active research teams</li> |
| | <li>Find <strong>gaps</strong>: what can be improved, combined, or inverted</li> |
| | </ul> |
| | </div> |
| |
|
| | <h3>2. Reading Papers</h3> |
| | <div class="principle"> |
| | <h4>Effective Reading Strategy</h4> |
| | <ul> |
| | <li><strong>Read broadly</strong>: Not just NLP - also cognitive science, neuroscience, linguistics, vision</li> |
| | <li><strong>Read deeply</strong>: Become the "world-leading expert" on your narrow question</li> |
| | <li><strong>Read textbooks</strong>: More knowledge-dense than papers</li> |
| | <li><strong>Follow citation chains</strong> via Google Scholar, Semantic Scholar</li> |
| | <li>Use <strong>PRISMA</strong> methodology for systematic reviews</li> |
| | </ul> |
| | </div> |
| |
|
| | <h3>3. Running Experiments</h3> |
| |
|
| | <div class="principle"> |
| | <h4>Step 1: Reproduce baselines first</h4> |
| | <p>"Reimplement existing state-of-the-art work first to validate your setup." -- Marek Rei</p> |
| | <ul> |
| | <li>Choose open source project, compile, run demo, <strong>match results</strong></li> |
| | <li>Understand the algorithm deeply, then reimplement</li> |
| | <li>Test on standard test set until results align</li> |
| | </ul> |
| | </div> |
| |
|
| | <div class="principle"> |
| | <h4>Step 2: Simple baseline (1-2 weeks)</h4> |
| | <p>Implement the simplest approach before building complex architectures. Verify your setup works.</p> |
| | </div> |
| |
|
| | <div class="principle"> |
| | <h4>Step 3: Rigorous experimentation</h4> |
| | <ul> |
| | <li><strong>Debug</strong>: Don't assume bug-free code. Test with toy examples. Add assertions.</li> |
| | <li><strong>Evaluate</strong>: Separate train/dev/test. Run 10+ times. Report mean + std.</li> |
| | <li><strong>Ablate</strong>: Significance tests and ablation studies for every novel component.</li> |
| | <li><strong>Avoid</strong>: Single-run results, weak-only baselines, blind trend-following.</li> |
| | </ul> |
| | </div> |
| |
|
| | <h3>4. Writing Papers</h3> |
| | <div class="principle"> |
| | <h4>ACL Paper Structure (Dr. Ming Zhou)</h4> |
| | <ul> |
| | <li><strong>Title</strong>: Specific, no generic words</li> |
| | <li><strong>Abstract</strong>: Problem + Method + Advantage + Achievement</li> |
| | <li><strong>Introduction</strong>: Background → existing → limitations → contribution (≤3 points)</li> |
| | <li><strong>Related Work</strong>: Organized by <strong>theme</strong>, not chronology</li> |
| | <li><strong>Methodology</strong>: Problem definition → notation → formulas</li> |
| | <li><strong>Experiments</strong>: Purpose → data → parameters → reproducibility</li> |
| | <li><strong>Limitations</strong>: Required by ACL - honest assessment</li> |
| | </ul> |
| | <p style="margin-top:0.5rem"><strong>Revision</strong>: 3 passes - self review → team review → outsider review.</p> |
| | </div> |
| |
|
| | <h3>5. Mindset & Habits</h3> |
| |
|
| | <table> |
| | <tr><th>Principle</th><th>Lesson</th></tr> |
| | <tr><td><strong>Open doors</strong></td><td>Stay connected to the community; know emerging problems</td></tr> |
| | <tr><td><strong>Preparation</strong></td><td>"Luck favors the prepared mind" (Pasteur)</td></tr> |
| | <tr><td><strong>Constraints</strong></td><td>Difficult conditions often lead to breakthroughs</td></tr> |
| | <tr><td><strong>Commitment</strong></td><td>Deep immersion activates subconscious problem-solving</td></tr> |
| | <tr><td><strong>Selling work</strong></td><td>Presentation matters - great work needs effective communication</td></tr> |
| | </table> |
| |
|
| | <h3>Essential Reading</h3> |
| | <div class="papers-grid" style="margin-top:0.75rem"> |
| | <div class="paper-card"> |
| | <div class="paper-title"><a href="https://www.cs.virginia.edu/~robins/YouAndYourResearch.html" target="_blank">You and Your Research</a></div> |
| | <div class="paper-meta"><span>Richard Hamming</span> · <span>Choosing important problems, mindset</span></div> |
| | </div> |
| | <div class="paper-card"> |
| | <div class="paper-title"><a href="http://joschu.net/blog/opinionated-guide-ml-research.html" target="_blank">An Opinionated Guide to ML Research</a></div> |
| | <div class="paper-meta"><span>John Schulman (OpenAI)</span> · <span>Problem selection, progress, growth</span></div> |
| | </div> |
| | <div class="paper-card"> |
| | <div class="paper-title"><a href="https://www.marekrei.com/blog/ml-nlp-research-project-advice/" target="_blank">ML/NLP Research Project Advice</a></div> |
| | <div class="paper-meta"><span>Marek Rei</span> · <span>Practical experiment workflow</span></div> |
| | </div> |
| | <div class="paper-card"> |
| | <div class="paper-title"><a href="https://microsoft.com/en-us/research/lab/microsoft-research-asia/articles/make-first-accomplishment-nlp-field/" target="_blank">How to Make First Accomplishment in NLP</a></div> |
| | <div class="paper-meta"><span>Dr. Ming Zhou (MSRA)</span> · <span>NLP research methodology</span></div> |
| | </div> |
| | </div> |
| | </div> |
| |
|
| | </div> |
| |
|
| | <script> |
| | |
// Tab navigation: activate the clicked tab and reveal its matching panel
// (panel ids follow the "panel-<data-tab>" convention).
const tabButtons = document.querySelectorAll('.tab');
tabButtons.forEach((tab) => {
  tab.addEventListener('click', () => {
    for (const other of tabButtons) other.classList.remove('active');
    for (const panel of document.querySelectorAll('.panel')) panel.classList.remove('active');
    tab.classList.add('active');
    document.getElementById('panel-' + tab.dataset.tab).classList.add('active');
  });
});
| | |
| | |
// Category filter buttons: highlight the chosen button and show only the
// paper cards whose data-cats equals the selected category ('all' shows all).
// NOTE(review): the comparison is an exact string match; if a card ever
// carries multiple space-separated categories in data-cats, this would need
// a split()/includes() check — confirm against the card markup.
document.querySelectorAll('.filter-btn').forEach((btn) => {
  btn.addEventListener('click', () => {
    document.querySelectorAll('.filter-btn').forEach((other) => other.classList.remove('active'));
    btn.classList.add('active');
    const selected = btn.dataset.cat;
    for (const card of document.querySelectorAll('.paper-card[data-cats]')) {
      const visible = selected === 'all' || card.dataset.cats === selected;
      card.style.display = visible ? '' : 'none';
    }
  });
});
| | |
| | |
// Paper abstracts: clicking toggles the expanded/collapsed state; the
// pointer cursor signals that the text is clickable.
for (const abstractEl of document.querySelectorAll('.paper-abstract')) {
  abstractEl.style.cursor = 'pointer';
  abstractEl.addEventListener('click', () => {
    abstractEl.classList.toggle('expanded');
  });
}
| | |
| | |
/**
 * Convert a leaderboard cell's display text into a sortable number.
 * Handles the table's mixed formats: "n/a" / "-" / "SOTA" (sort last),
 * sizes ("~400 MB", "~1.1 GB" — normalized to MB), times ("37.6s",
 * "54.6 min", "Hours (GPU)" — normalized to seconds), throughput
 * ("66,678/s", "~50K/s") and plain numbers ("92.49").
 * Hoisted to script scope so it can be unit-tested headlessly.
 * @param {string} text - raw cell text
 * @returns {number} sortable value; -Infinity for unknown/absent data
 */
function lbParseCellValue(text) {
  const raw = text.trim();
  if (raw === 'n/a' || raw === '-' || raw === 'SOTA') return -Infinity;
  // Strip the approximation marker and thousands separators up front so
  // parseFloat never sees a leading "~" (the previous version returned
  // NaN for values like "~2 min" and "~79 min", breaking that sort).
  const cleaned = raw.replace(/[~,]/g, '');
  if (raw.includes('GB')) return parseFloat(cleaned) * 1000; // GB -> MB
  if (raw.includes('MB')) return parseFloat(cleaned);
  if (raw.includes('min')) return parseFloat(cleaned) * 60;  // min -> seconds
  if (raw.includes('Hours')) return 36000;                   // coarse "hours" sentinel
  if (/^\d+(\.\d+)?K/.test(cleaned)) return parseFloat(cleaned) * 1000; // "50K/s" shorthand
  if (raw.endsWith('s') && !raw.includes('/s')) return parseFloat(cleaned); // "37.6s"
  const num = parseFloat(cleaned.replace(/[^0-9.\-]/g, ''));
  return Number.isNaN(num) ? -Infinity : num;
}

// Sortable "Comprehensive Comparison" table: click a header to sort by that
// column; clicking the same header again flips the direction.
(function () {
  if (typeof document === 'undefined') return; // headless (e.g. tests): nothing to wire up
  const table = document.getElementById('lb-main-table');
  if (!table) return;
  const thead = table.querySelector('thead');
  const tbody = table.querySelector('tbody');
  let sortCol = -1;
  let sortAsc = true;

  thead.querySelectorAll('th').forEach((th, colIdx) => {
    th.addEventListener('click', () => {
      if (sortCol === colIdx) {
        sortAsc = !sortAsc;
      } else {
        sortCol = colIdx;
        sortAsc = false; // first click on a column: best value first
      }
      const isStr = th.dataset.type === 'str';
      const rows = Array.from(tbody.querySelectorAll('tr'));
      rows.sort((a, b) => {
        if (isStr) {
          const sa = a.children[colIdx].textContent.trim().toLowerCase();
          const sb = b.children[colIdx].textContent.trim().toLowerCase();
          return sortAsc ? sa.localeCompare(sb) : sb.localeCompare(sa);
        }
        const na = lbParseCellValue(a.children[colIdx].textContent);
        const nb = lbParseCellValue(b.children[colIdx].textContent);
        return sortAsc ? na - nb : nb - na;
      });
      // Re-attach rows in sorted order (appendChild moves the node) and
      // renumber the rank column to match the new order.
      rows.forEach((row, i) => {
        row.children[0].textContent = i + 1;
        tbody.appendChild(row);
      });
      // Keep a single direction arrow, on the active header only.
      thead.querySelectorAll('.sort-arrow').forEach((el) => el.remove());
      const arrow = document.createElement('span');
      arrow.className = 'sort-arrow';
      arrow.textContent = sortAsc ? '\u25B2' : '\u25BC';
      th.appendChild(arrow);
    });
  });
})();
| | </script> |
| | </body> |
| | </html> |
| |
|