{
  "papers": {
    "2020.arxiv.nguyen": {
      "id": "2020.arxiv.nguyen",
      "title": "PhoBERT: Pre-trained language models for Vietnamese",
      "authors": [
        "Dat Quoc Nguyen",
        "Anh Tuan Nguyen"
      ],
      "year": 2020,
      "venue": "arXiv",
      "url": "https://arxiv.org/abs/2003.00744",
      "arxiv_id": "2003.00744",
      "s2_id": null,
      "doi": null,
      "citation_count": 0,
      "abstract": "We present PhoBERT with two versions, PhoBERT-base and PhoBERT-large, the first public large-scale monolingual language models pre-trained for Vietnamese. Experimental results show that PhoBERT consistently outperforms the recent best pre-trained multilingual model XLM-R (Conneau et al., 2020) and improves the state-of-the-art in multiple Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and Natural language inference. We release PhoBERT to facilitate future research and downstream applications for Vietnamese NLP. Our PhoBERT models are available at https://github.com/VinAIResearch/PhoBERT",
      "tldr": "",
      "keywords": [],
      "references": [],
      "cited_by": [],
      "local_path": "references/2020.arxiv.nguyen",
      "fetched": true
    },
    "2023.arxiv.nguyen": {
      "id": "2023.arxiv.nguyen",
      "title": "ViSoBERT: A Pre-Trained Language Model for Vietnamese Social Media Text Processing",
      "authors": [
        "Quoc-Nam Nguyen",
        "Thang Chau Phan",
        "Duc-Vu Nguyen",
        "Kiet Van Nguyen"
      ],
      "year": 2023,
      "venue": "arXiv",
      "url": "https://arxiv.org/abs/2310.11166",
      "arxiv_id": "2310.11166",
      "s2_id": null,
      "doi": null,
      "citation_count": 0,
      "abstract": "English and Chinese, known as resource-rich languages, have witnessed the strong development of transformer-based language models for natural language processing tasks. Although Vietnam has approximately 100M people speaking Vietnamese, several pre-trained models, e.g., PhoBERT, ViBERT, and vELECTRA, performed well on general Vietnamese NLP tasks, including POS tagging and named entity recognition. These pre-trained language models are still limited to Vietnamese social media tasks. In this paper, we present the first monolingual pre-trained language model for Vietnamese social media texts, ViSoBERT, which is pre-trained on a large-scale corpus of high-quality and diverse Vietnamese social media texts using XLM-R architecture. Moreover, we explored our pre-trained model on five important natural language downstream tasks on Vietnamese social media texts: emotion recognition, hate speech detection, sentiment analysis, spam reviews detection, and hate speech spans detection. Our experiments demonstrate that ViSoBERT, with far fewer parameters, surpasses the previous state-of-the-art models on multiple Vietnamese social media tasks. Our ViSoBERT model is available only for research purposes.",
      "tldr": "",
      "keywords": [],
      "references": [],
      "cited_by": [],
      "local_path": "references/2023.arxiv.nguyen",
      "fetched": true
    },
    "2022.arxiv.nguyen": {
      "id": "2022.arxiv.nguyen",
      "title": "SMTCE: A Social Media Text Classification Evaluation Benchmark and BERTology Models for Vietnamese",
      "authors": [
        "Luan Thanh Nguyen",
        "Kiet Van Nguyen",
        "Ngan Luu-Thuy Nguyen"
      ],
      "year": 2022,
      "venue": "arXiv",
      "url": "https://arxiv.org/abs/2209.10482",
      "arxiv_id": "2209.10482",
      "s2_id": null,
      "doi": null,
      "citation_count": 0,
      "abstract": "Text classification is a typical natural language processing or computational linguistics task with various interesting applications. As the number of users on social media platforms increases, data acceleration promotes emerging studies on Social Media Text Classification (SMTC) or social media text mining on these valuable resources. In contrast to English, Vietnamese, one of the low-resource languages, is still not concentrated on and exploited thoroughly. Inspired by the success of the GLUE, we introduce the Social Media Text Classification Evaluation (SMTCE) benchmark, as a collection of datasets and models across a diverse set of SMTC tasks. With the proposed benchmark, we implement and analyze the effectiveness of a variety of multilingual BERT-based models (mBERT, XLM-R, and DistilmBERT) and monolingual BERT-based models (PhoBERT, viBERT, vELECTRA, and viBERT4news) for tasks in the SMTCE benchmark. Monolingual models outperform multilingual models and achieve state-of-the-art results on all text classification tasks. It provides an objective assessment of multilingual and monolingual BERT-based models on the benchmark, which will benefit future studies about BERTology in the Vietnamese language.",
      "tldr": "",
      "keywords": [],
      "references": [],
      "cited_by": [],
      "local_path": "references/2022.arxiv.nguyen",
      "fetched": true
    },
    "2019.arxiv.ho": {
      "id": "2019.arxiv.ho",
      "title": "Emotion Recognition for Vietnamese Social Media Text",
      "authors": [
        "Vong Anh Ho",
        "Duong Huynh-Cong Nguyen",
        "Danh Hoang Nguyen",
        "Linh Thi-Van Pham",
        "Duc-Vu Nguyen",
        "Kiet Van Nguyen",
        "Ngan Luu-Thuy Nguyen"
      ],
      "year": 2019,
      "venue": "arXiv",
      "url": "https://arxiv.org/abs/1911.09339",
      "arxiv_id": "1911.09339",
      "s2_id": null,
      "doi": null,
      "citation_count": 0,
      "abstract": "Emotion recognition or emotion prediction is a higher approach or a special case of sentiment analysis. In this task, the result is not produced in terms of either polarity: positive or negative or in the form of rating (from 1 to 5) but of a more detailed level of analysis in which the results are depicted in more expressions like sadness, enjoyment, anger, disgust, fear, and surprise. Emotion recognition plays a critical role in measuring the brand value of a product by recognizing specific emotions of customers' comments. In this study, we have achieved two targets. First and foremost, we built a standard Vietnamese Social Media Emotion Corpus (UIT-VSMEC) with exactly 6,927 emotion-annotated sentences, contributing to emotion recognition research in Vietnamese which is a low-resource language in natural language processing (NLP). Secondly, we assessed and measured machine learning and deep neural network models on our UIT-VSMEC corpus. As a result, the CNN model achieved the highest performance with the weighted F1-score of 59.74%. Our corpus is available at our research website.",
      "tldr": "",
      "keywords": [],
      "references": [],
      "cited_by": [],
      "local_path": "references/2019.arxiv.ho",
      "fetched": true
    },
    "2019.arxiv.liu": {
      "id": "2019.arxiv.liu",
      "title": "RoBERTa: A Robustly Optimized BERT Pretraining Approach",
      "authors": [
        "Yinhan Liu",
        "Myle Ott",
        "Naman Goyal",
        "Jingfei Du",
        "Mandar Joshi",
        "Danqi Chen",
        "Omer Levy",
        "Mike Lewis",
        "Luke Zettlemoyer",
        "Veselin Stoyanov"
      ],
      "year": 2019,
      "venue": "arXiv",
      "url": "https://arxiv.org/abs/1907.11692",
      "arxiv_id": "1907.11692",
      "s2_id": null,
      "doi": null,
      "citation_count": 0,
      "abstract": "Language model pretraining has led to significant performance gains but careful comparison between different approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes, and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results highlight the importance of previously overlooked design choices, and raise questions about the source of recently reported improvements. We release our models and code.",
      "tldr": "",
      "keywords": [],
      "references": [],
      "cited_by": [],
      "local_path": "references/2019.arxiv.liu",
      "fetched": true
    },
    "2020.arxiv.the": {
      "id": "2020.arxiv.the",
      "title": "Improving Sequence Tagging for Vietnamese Text Using Transformer-based Neural Models",
      "authors": [
        "Viet Bui The",
        "Oanh Tran Thi",
        "Phuong Le-Hong"
      ],
      "year": 2020,
      "venue": "arXiv",
      "url": "https://arxiv.org/abs/2006.15994",
      "arxiv_id": "2006.15994",
      "s2_id": null,
      "doi": null,
      "citation_count": 0,
      "abstract": "This paper describes our study on using mutilingual BERT embeddings and some new neural models for improving sequence tagging tasks for the Vietnamese language. We propose new model architectures and evaluate them extensively on two named entity recognition datasets of VLSP 2016 and VLSP 2018, and on two part-of-speech tagging datasets of VLSP 2010 and VLSP 2013. Our proposed models outperform existing methods and achieve new state-of-the-art results. In particular, we have pushed the accuracy of part-of-speech tagging to 95.40% on the VLSP 2010 corpus, to 96.77% on the VLSP 2013 corpus; and the F1 score of named entity recognition to 94.07% on the VLSP 2016 corpus, to 90.31% on the VLSP 2018 corpus. Our code and pre-trained models viBERT and vELECTRA are released as open source to facilitate adoption and further research.",
      "tldr": "",
      "keywords": [],
      "references": [],
      "cited_by": [],
      "local_path": "references/2020.arxiv.the",
      "fetched": true
    },
    "2019.arxiv.conneau": {
      "id": "2019.arxiv.conneau",
      "title": "Unsupervised Cross-lingual Representation Learning at Scale",
      "authors": [
        "Alexis Conneau",
        "Kartikay Khandelwal",
        "Naman Goyal",
        "Vishrav Chaudhary",
        "Guillaume Wenzek",
        "Francisco Guzmán",
        "Edouard Grave",
        "Myle Ott",
        "Luke Zettlemoyer",
        "Veselin Stoyanov"
      ],
      "year": 2019,
      "venue": "arXiv",
      "url": "https://arxiv.org/abs/1911.02116",
      "arxiv_id": "1911.02116",
      "s2_id": null,
      "doi": null,
      "citation_count": 0,
      "abstract": "This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +14.6% average accuracy on XNLI, +13% average F1 score on MLQA, and +2.4% F1 score on NER. XLM-R performs particularly well on low-resource languages, improving 15.7% in XNLI accuracy for Swahili and 11.4% for Urdu over previous XLM models. We also present a detailed empirical analysis of the key factors that are required to achieve these gains, including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We will make our code, data and models publicly available.",
      "tldr": "",
      "keywords": [],
      "references": [],
      "cited_by": [],
      "local_path": "references/2019.arxiv.conneau",
      "fetched": true
    },
    "2022.paclic.ngan": {
      "id": "2022.paclic.ngan",
      "title": "SMTCE: A Social Media Text Classification Evaluation Benchmark and BERTology Models for Vietnamese",
      "authors": [
        "Luan Thanh Nguyen",
        "Kiet Van Nguyen",
        "Ngan Luu-Thuy Nguyen"
      ],
      "year": 2022,
      "venue": "PACLIC 2022",
      "url": "https://aclanthology.org/2022.paclic-1.31",
      "arxiv_id": null,
      "s2_id": null,
      "doi": null,
      "citation_count": 0,
      "abstract": "",
      "tldr": "",
      "keywords": [],
      "references": [],
      "cited_by": [],
      "local_path": "references/2022.paclic.ngan",
      "fetched": true
    },
    "2023.emnlp.kiet": {
      "id": "2023.emnlp.kiet",
      "title": "ViSoBERT: A Pre-Trained Language Model for Vietnamese Social Media Text Processing",
      "authors": [
        "Nam Nguyen",
        "Thang Phan",
        "Duc-Vu Nguyen",
        "Kiet Nguyen"
      ],
      "year": 2023,
      "venue": "EMNLP 2023",
      "url": "https://aclanthology.org/2023.emnlp-main.315",
      "arxiv_id": null,
      "s2_id": null,
      "doi": null,
      "citation_count": 0,
      "abstract": "",
      "tldr": "",
      "keywords": [],
      "references": [],
      "cited_by": [],
      "local_path": "references/2023.emnlp.kiet",
      "fetched": true
    },
    "2020.findings.anh": {
      "id": "2020.findings.anh",
      "title": "PhoBERT: Pre-trained language models for Vietnamese",
      "authors": [
        "Dat Quoc Nguyen",
        "Anh Tuan Nguyen"
      ],
      "year": 2020,
      "venue": "EMNLP Findings 2020",
      "url": "https://aclanthology.org/2020.findings-emnlp.92",
      "arxiv_id": null,
      "s2_id": null,
      "doi": null,
      "citation_count": 0,
      "abstract": "",
      "tldr": "",
      "keywords": [],
      "references": [],
      "cited_by": [],
      "local_path": "references/2020.findings.anh",
      "fetched": true
    },
    "2018.kse.nguyen": {
      "id": "2018.kse.nguyen",
      "title": "UIT-VSFC: Vietnamese Students Feedback Corpus for Sentiment Analysis",
      "authors": [
        "Kiet Van Nguyen",
        "Vu Duc Nguyen",
        "Phu Xuan-Vinh Nguyen",
        "Tham Thi-Hong Truong",
        "Ngan Luu-Thuy Nguyen"
      ],
      "year": 2018,
      "venue": "KSE 2018",
      "url": "https://ieeexplore.ieee.org/document/8573337/",
      "arxiv_id": null,
      "s2_id": null,
      "doi": "10.1109/KSE.2018.8573337",
      "citation_count": 0,
      "abstract": "Vietnamese Students Feedback Corpus (UIT-VSFC) consisting of over 16,000 sentences annotated for sentiment and topic classification.",
      "tldr": "",
      "keywords": [
        "sentiment analysis",
        "Vietnamese",
        "dataset"
      ],
      "references": [],
      "cited_by": [],
      "local_path": "references/2018.kse.nguyen",
      "fetched": true
    },
    "2007.rivf.hoang": {
      "id": "2007.rivf.hoang",
      "title": "A Comparative Study on Vietnamese Text Classification Methods",
      "authors": [
        "Cong Duy Vu Hoang",
        "Dien Dinh",
        "Le Nguyen Nguyen",
        "Quoc Hung Ngo"
      ],
      "year": 2007,
      "venue": "IEEE RIVF 2007",
      "url": "https://ieeexplore.ieee.org/document/4223084/",
      "arxiv_id": null,
      "s2_id": null,
      "doi": null,
      "citation_count": 0,
      "abstract": "Comparative study of BOW and N-gram approaches for Vietnamese text classification on VNTC corpus, achieving >95% accuracy.",
      "tldr": "",
      "keywords": [
        "text classification",
        "Vietnamese",
        "SVM",
        "TF-IDF",
        "n-gram",
        "VNTC"
      ],
      "references": [],
      "cited_by": [],
      "local_path": "references/2007.rivf.hoang",
      "fetched": true
    }
  },
  "s2_cache": {}
}