NguyenTuan-UET committed on Jan 30

Commit

3a69ac8

1 Parent(s): 963890d

Initial commit: keyword-extraction-viet (models + vncorenlp via Git LFS)

Files changed (28) hide show

.gitattributes +8 -0
.gitignore +4 -0
.idea/.gitignore +3 -0
.idea/inspectionProfiles/Project_Default.xml +38 -0
.idea/inspectionProfiles/profiles_settings.xml +6 -0
.idea/keyword-extraction-viet.iml +10 -0
.idea/misc.xml +4 -0
.idea/modules.xml +8 -0
.idea/vcs.xml +6 -0
README.md +109 -3
app.py +70 -0
keybertvi_model.py +72 -0
model/keyword_extraction_utils.py +260 -0
model/named_entities.py +43 -0
model/process_text.py +104 -0
pipeline.py +134 -0
pretrained-models/ner-vietnamese-electra-base.pt +3 -0
pretrained-models/phobert.pt +3 -0
pretrained-models/vncorenlp/VnCoreNLP-1.2.jar +3 -0
pretrained-models/vncorenlp/models/dep/vi-dep.xz +3 -0
pretrained-models/vncorenlp/models/ner/vi-500brownclusters.xz +3 -0
pretrained-models/vncorenlp/models/ner/vi-ner.xz +3 -0
pretrained-models/vncorenlp/models/ner/vi-pretrainedembeddings.xz +3 -0
pretrained-models/vncorenlp/models/postagger/vi-tagger +3 -0
pretrained-models/vncorenlp/models/wordsegmenter/vi-vocab +3 -0
pretrained-models/vncorenlp/models/wordsegmenter/wordsegmenter.rdr +3 -0
requirements.txt +3 -0
vietnamese-stopwords-dash.txt +1998 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+vncorenlp filter=lfs diff=lfs merge=lfs -text
+pretrained-models/phobert.pt filter=lfs diff=lfs merge=lfs -text
+pretrained-models/ner-vietnamese-electra-base.pt filter=lfs diff=lfs merge=lfs -text
+pretrained-models/vncorenlp/models/postagger/vi-tagger filter=lfs diff=lfs merge=lfs -text
+pretrained-models/vncorenlp/VnCoreNLP-1.2.jar filter=lfs diff=lfs merge=lfs -text
+pretrained-models/*.pt filter=lfs diff=lfs merge=lfs -text
+pretrained-models/vncorenlp/*.jar filter=lfs diff=lfs merge=lfs -text
+pretrained-models/vncorenlp/models/** filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+venv
+test_file.txt
+scrap.py

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+# Default ignored files
+/shelf/
+/workspace.xml

.idea/inspectionProfiles/Project_Default.xml ADDED Viewed

	@@ -0,0 +1,38 @@

+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyChainedComparisonsInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoreConstantInTheMiddle" value="true" />
+    </inspection_tool>
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="13">
+            <item index="0" class="java.lang.String" itemvalue="scikit-image" />
+            <item index="1" class="java.lang.String" itemvalue="scipy" />
+            <item index="2" class="java.lang.String" itemvalue="scikit-learn" />
+            <item index="3" class="java.lang.String" itemvalue="PyYAML" />
+            <item index="4" class="java.lang.String" itemvalue="python-dateutil" />
+            <item index="5" class="java.lang.String" itemvalue="torch" />
+            <item index="6" class="java.lang.String" itemvalue="numpy" />
+            <item index="7" class="java.lang.String" itemvalue="torchvision" />
+            <item index="8" class="java.lang.String" itemvalue="pandas" />
+            <item index="9" class="java.lang.String" itemvalue="tqdm" />
+            <item index="10" class="java.lang.String" itemvalue="imageio" />
+            <item index="11" class="java.lang.String" itemvalue="matplotlib" />
+            <item index="12" class="java.lang.String" itemvalue="pytz" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N803" />
+          <option value="N806" />
+          <option value="N802" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

.idea/inspectionProfiles/profiles_settings.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

.idea/keyword-extraction-viet.iml ADDED Viewed

	@@ -0,0 +1,10 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>

.idea/misc.xml ADDED Viewed

	@@ -0,0 +1,4 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (keyword-extraction-viet)" project-jdk-type="Python SDK" />
+</project>

.idea/modules.xml ADDED Viewed

	@@ -0,0 +1,8 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/keyword-extraction-viet.iml" filepath="$PROJECT_DIR$/.idea/keyword-extraction-viet.iml" />
+    </modules>
+  </component>
+</project>

.idea/vcs.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>

README.md CHANGED Viewed

@@ -1,3 +1,109 @@
----
-license: mit
----

+---
+tags:
+- keyword-extraction
+language:
+- vi
+---
+# <a name="introduction"></a>  KeyBERTVi - Keyword Extraction for Vietnamese language
+Inspired by [KeyBERT](https://github.com/MaartenGr/KeyBERT), KeyBERTVi implements a similar keyword extraction technique that leverages the embeddings of [PhoBERT](https://huggingface.co/vinai/phobert-base) and minimal linguistic properties to extract the keywords and keyphrases that are most similar to the document.
+<a name="toc"></a>
+## Table of Contents
+<!--ts-->
+   1. [About the Project](#about)
+   2. [Getting Started](#gettingstarted)
+        2.1. [Installation](#installation)
+        2.2. [Basic Usage](#usage)
+        2.3. [Diversify Results](#diversify)
+   3. [Limitations](#limitations)
+<!--te-->
+<a name="about"></a>
+## 1. About the Project
+This implementation takes inspiration from the simple yet intuitive and powerful method of [KeyBERT](https://github.com/MaartenGr/KeyBERT/), applied to the Vietnamese language. PhoBERT is used to generate both document-level embeddings and word-level embeddings for the extracted N-grams. Cosine similarity is then used to determine which N-grams are most similar to the document-level embedding; these N-grams can be regarded as the most representative of the document.
+Preprocessing tailored to the Vietnamese language is applied.
+Test with your own documents at [KeyBERTVi Space](https://huggingface.co/spaces/tpha4308/keybertvi-app).
+<a name="gettingstarted"></a>
+## 2. Getting Started
+<a name="installation"></a>
+###  2.1. Setting up
+```bash
+  git clone https://huggingface.co/tpha4308/keyword-extraction-viet
+```
+You can use the existing pre-trained models in the repo, or download your own and put them in the `pretrained-models` folder.
+```python
+  phobert = AutoModel.from_pretrained("vinai/phobert-base-v2")
+  phobert.eval()
+  torch.save(phobert, f'{dir_path}/pretrained-models/phobert.pt')
+  ner_model = AutoModelForTokenClassification.from_pretrained("NlpHUST/ner-vietnamese-electra-base")
+  ner_model.eval()
+  torch.save(ner_model, f'{dir_path}/pretrained-models/ner-vietnamese-electra-base.pt')
+```
+**Note:** `dir_path` is the absolute path to the repo.
+As [PhoBERT](https://huggingface.co/vinai/phobert-base) requires [VnCoreNLP](https://github.com/vncorenlp/VnCoreNLP) as part of pre-processing, the folder `pretrained-models/vncorenlp` is required. To download your own:
+```bash
+  pip install py_vncorenlp
+```
+```python
+  import py_vncorenlp
+  py_vncorenlp.download_model(save_dir=f'{dir_path}/pretrained-models/vncorenlp')
+```
+<a name="usage"></a>
+###  2.2. Basic Usage
+```python
+  phobert = torch.load(f'{dir_path}/pretrained-models/phobert.pt')
+  phobert.eval()
+  ner_model = torch.load(f'{dir_path}/pretrained-models/ner-vietnamese-electra-base.pt')
+  ner_model.eval()
+  kw_pipeline = KeywordExtractorPipeline(phobert, ner_model)
+```
+```python
+  title = "Truyền thuyết và hiện tại Thành Cổ Loa"
+  text = """
+            Nhắc đến Cổ Loa, người ta nghĩ ngay đến truyền thuyết về An Dương Vương được thần Kim Quy bày cho cách xây thành, về chiếc lẫy nỏ thần làm từ móng chân rùa thần và mối tình bi thương Mỵ Châu – Trọng Thủy. Đằng sau những câu chuyện thiên về tâm linh ấy, thế hệ con cháu còn khám phá được những giá trị khảo cổ to lớn của Cổ Loa.
+            Khu di tích Cổ Loa cách trung – tâm Hà Nội 17km thuộc huyện Đông Anh, Hà Nội, có diện tích bảo tồn gần 500ha được coi là địa chỉ văn hóa đặc biệt của thủ đô và cả nước. Cổ Loa có hàng loạt di chỉ khảo cổ học đã được phát hiện, phản ánh quá trình phát triển liên tục của dân tộc ta từ sơ khai qua các thời kỳ đồ đồng, đồ đá và đồ sắt mà đỉnh cao là văn hóa Đông Sơn, vẫn được coi là nền văn minh sông Hồng thời kỳ tiền sử của dân tộc Việt Nam.
+            Cổ Loa từng là kinh đô của nhà nước Âu Lạc thời kỳ An Dương Vương (thế kỷ III TCN) và của nước Đại Việt thời Ngô Quyền (thế kỷ X) mà thành Cổ Loa là một di tích minh chứng còn lại cho đến ngày nay. Thành Cổ Loa được các nhà khảo cổ học đánh giá là “tòa thành cổ nhất, quy mô lớn vào bậc nhất, cấu trúc cũng thuộc loại độc đáo nhất trong lịch sử xây dựng thành lũy của người Việt cổ”.
+          """
+  inp = {"title": title, "text": text}
+  kws = kw_pipeline(inputs=inp, min_freq=1, ngram_n=(1, 3), top_n=5, diversify_result=False)
+  [('Khu di_tích Cổ_Loa', 0.88987315),
+  ('Âu_Lạc thời_kỳ An_Dương_Vương', 0.8680505),
+  ('thành Cổ_Loa', 0.8661723),
+  ('hàng_loạt di_chỉ khảo_cổ_học', 0.8644231),
+  ('lịch_sử xây_dựng thành_luỹ', 0.8375939)]
+```
+<a name="diversify"></a>
+###  2.3. Diversify Results
+More information needed
+<a name="limitations"></a>
+## 3. Limitations
+More information needed
+## References
+1. https://github.com/MaartenGr/KeyBERT
+2. https://github.com/VinAIResearch/PhoBERT
+3. https://huggingface.co/NlpHUST/ner-vietnamese-electra-base
+4. https://github.com/undertheseanlp/underthesea
+5. https://github.com/vncorenlp/VnCoreNLP

app.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import gradio as gr
+import torch
+import os
+from pipeline import KeywordExtractorPipeline
+DIR_PATH = os.path.dirname(os.path.realpath(__file__))
+def extract_keyword(title, text, top_n, ngram_low_range, ngram_high_range, min_freq, diversify_result):
+    inp = {"text": text, "title": title}
+    keyword_ls = kw_pipeline(inputs=inp, min_freq=min_freq, ngram_n=(ngram_low_range, ngram_high_range),
+                             top_n=top_n, diversify_result=diversify_result)
+    result = ''
+    for kw, score in keyword_ls:
+        result += f'{kw}: {score}\n'
+    return result
+if gr.NO_RELOAD:
+    print("Loading PhoBERT model")
+    phobert = torch.load(f'{DIR_PATH}/pretrained-models/phobert.pt')
+    phobert.eval()
+    print("Loading NER model")
+    ner_model = torch.load(f'{DIR_PATH}/pretrained-models/ner-vietnamese-electra-base.pt')
+    ner_model.eval()
+    kw_pipeline = KeywordExtractorPipeline(phobert, ner_model)
+if __name__ == "__main__":
+    demo = gr.Interface(fn=extract_keyword,
+                        inputs=[
+                            gr.Text(
+                                label="Title",
+                                lines=1,
+                                value="Enter title here",
+                            ),
+                            gr.Textbox(
+                                label="Text",
+                                lines=5,
+                                value="Enter text here",
+                            ),
+                            gr.Number(
+                                label="Top N keywords",
+                                info="Number of keywords retrieved",
+                                value=10
+                            ),
+                            gr.Number(
+                                label="Ngram low range",
+                                value=1
+                            ),
+                            gr.Number(
+                                label="Ngram high range",
+                                value=3
+                            ),
+                            gr.Number(
+                                label="Ngram minimum frequency",
+                                value=1
+                            ),
+                            gr.Checkbox(
+                                label="Diversify result"
+                            )
+                        ],
+                        # inputs=["text", "textbox", "number", "number", "number", "number", "checkbox"],
+                        outputs=gr.Textbox(
+                            label="Keywords Extracted",
+                        )
+                        )
+    demo.launch(share=True)  # Share your demo with just 1 extra parameter 🚀

keybertvi_model.py ADDED Viewed

	@@ -0,0 +1,72 @@

+# import py_vncorenlp
+# from transformers import AutoTokenizer, pipeline
+# import torch
+# import os
+# from model.keyword_extraction_utils import extract_keywords
+#
+#
+# class KeyBERTVi:
+#
+#     def __init__(self, stopwords_file_path=None):
+#         self.annotator = py_vncorenlp.VnCoreNLP(annotators=["wseg", "pos"],
+#                                                 save_dir=f'{dir_path}/pretrained-models/vncorenlp')
+#         # model = py_vncorenlp.VnCoreNLP(save_dir='/absolute/path/to/vncorenlp')
+#         print("Loading PhoBERT model")
+#         self.phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
+#
+#         # use absolute path because torch is cached
+#         self.phobert = torch.load(f'{dir_path}/pretrained-models/phobert.pt')
+#         self.phobert.eval()
+#
+#         print("Loading NER model")
+#         ner_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/ner-vietnamese-electra-base")
+#         ner_model = torch.load(f'{dir_path}/pretrained-models/ner-vietnamese-electra-base.pt')
+#         ner_model.eval()
+#         self.ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)
+#
+#         if stopwords_file_path is None:
+#             stopwords_file_path = f'{dir_path}/vietnamese-stopwords-dash.txt'
+#         with open(stopwords_file_path) as f:
+#             self.stopwords = [w.strip() for w in f.readlines()]
+#
+#     def extract_keywords(self, title, text, ngram_range=(1, 3), top_n=5, use_kmeans=False, use_mmr=False, min_freq=1):
+#         keyword_ls = extract_keywords(text, title,
+#                                       self.ner_pipeline,
+#                                       self.annotator,
+#                                       self.phobert_tokenizer,
+#                                       self.phobert,
+#                                       self.stopwords,
+#                                       ngram_n=ngram_range,
+#                                       top_n=top_n,
+#                                       use_kmeans=use_kmeans,
+#                                       use_mmr=use_mmr,
+#                                       min_freq=min_freq)
+#         return keyword_ls
+#
+#     def highlight(self, text, keywords):
+#         kw_ls = [' '.join(kw.split('_')) for kw, score in keywords]
+#         for key in kw_ls:
+#             text = text.replace(f" {key}", f" <mark>{key}</mark>")
+#         return text
+#
+#
+# dir_path = os.path.dirname(os.path.realpath(__file__))
+# if __name__ == "__main__":
+#     # args
+#     # print(dir_path)
+#
+#     stopwords_file_path = f'{dir_path}/vietnamese-stopwords-dash.txt'
+#
+#     # text_file_path = sys.argv[1]
+#     # with open(f'{dir_path}/{text_file_path}', 'r') as f:
+#     #     text = ' '.join([ln.strip() for ln in f.readlines()])
+#         # print(text)
+#
+#     # kw_model = KeyBERTVi()
+#     # model_name_on_hub = "KeyBERTVi"
+#     # kw_model.save_pretrained(model_name_on_hub)
+#     # kw_model.phobert_tokenizer.save_pretrained(model_name_on_hub)
+#
+#     # title = None
+#     # keyword_ls = kw_model.extract_keywords(title, text, ngram_range=(1, 3), top_n=5)
+#     # print(keyword_ls)

model/keyword_extraction_utils.py ADDED Viewed

	@@ -0,0 +1,260 @@

+from string import punctuation
+import numpy as np
+import torch
+from sklearn.cluster import KMeans
+from model.named_entities import get_named_entities
+punctuation = [c for c in punctuation if c != "_"]
+punctuation += ["“", "–", ",", "…", "”", "–"]
+ethnicity_dict_map = {"H'Mông": "HMông",
+                      "H'mông": "HMông",
+                      "H’mông": "HMông",
+                      "H’Mông": "HMông",
+                      "H’MÔNG": "HMông",
+                      "M'Nông": "MNông",
+                      "M'nông": "MNông",
+                      "M'NÔNG": "MNông",
+                      "M’Nông": "MNông",
+                      "M’NÔNG": "MNông",
+                      "K’Ho": "KHo",
+                      "K’Mẻo": "KMẻo"}
+def sub_sentence(sentence):
+    sent = []
+    start_index = 0
+    while start_index < len(sentence):
+        idx_list = []
+        for p in punctuation:
+            idx = sentence.find(p, start_index)
+            if idx != -1:
+                idx_list.append(idx)
+        if len(idx_list) == 0:
+            sent.append(sentence[start_index:].strip())
+            break
+        end_index = min(idx_list)
+        subsent = sentence[start_index:end_index].strip()
+        if len(subsent) > 0:
+            sent.append(subsent)
+        start_index = end_index + 1
+    return sent
+def check_for_stopwords(ngram, stopwords_ls):
+    for ngram_elem in ngram.split():
+        for w in stopwords_ls:
+            if ngram_elem == w:  # or ngram_elem.lower() == w:
+                return True
+    return False
+def compute_ngram_list(segmentised_doc, ngram_n, stopwords_ls, subsentences=True):
+    if subsentences:
+        output_sub_sentences = []
+        for sentence in segmentised_doc:
+            output_sub_sentences += sub_sentence(sentence)
+    else:
+        output_sub_sentences = segmentised_doc
+    ngram_list = []
+    for sentence in output_sub_sentences:
+        sent = sentence.split()
+        for i in range(len(sent) - ngram_n + 1):
+            ngram = ' '.join(sent[i:i + ngram_n])
+            if ngram not in ngram_list and not check_for_stopwords(ngram, stopwords_ls):
+                ngram_list.append(ngram)
+    final_ngram_list = []
+    for ngram in ngram_list:
+        contains_number = False
+        for char in ngram:
+            if char.isnumeric():
+                contains_number = True
+                break
+        if not contains_number:
+            final_ngram_list.append(ngram)
+    return final_ngram_list
+def cosine_similarity(a, b):
+    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
+def get_doc_embeddings(segmentised_doc, tokenizer, phobert, stopwords):
+    doc_embedding = torch.zeros(size=(len(segmentised_doc), 768))
+    for i, sentence in enumerate(segmentised_doc):
+        sent_removed_stopwords = ' '.join([word for word in sentence.split() if word not in stopwords])
+        sentence_embedding = tokenizer.encode(sent_removed_stopwords)
+        input_ids = torch.tensor([sentence_embedding])
+        with torch.no_grad():
+            features = phobert(input_ids)
+        if i == 0:
+            doc_embedding[i, :] = 2 * features.pooler_output.flatten()
+        else:
+            doc_embedding[i, :] = features.pooler_output.flatten()
+    return torch.mean(doc_embedding, axis=0)
+def get_segmentised_doc(nlp, rdrsegmenter, title, doc):
+    for i, j in ethnicity_dict_map.items():
+        if title is not None:
+            title = title.replace(i, j)
+        doc = doc.replace(i, j)
+    segmentised_doc = rdrsegmenter.word_segment(doc)
+    if title is not None:
+        segmentised_doc = rdrsegmenter.word_segment(title) + rdrsegmenter.word_segment(doc)
+    ne_ls = set(get_named_entities(nlp, doc))
+    segmentised_doc_ne = []
+    for sent in segmentised_doc:
+        for ne in ne_ls:
+            sent = sent.replace(ne, '_'.join(ne.split()))
+        segmentised_doc_ne.append(sent)
+    return ne_ls, segmentised_doc_ne
+def compute_ngram_embeddings(tokenizer, phobert, ngram_list):
+    ngram_embeddings = {}
+    for ngram in ngram_list:
+        ngram_copy = ngram
+        if ngram.isupper():
+            ngram_copy = ngram.lower()
+        word_embedding = tokenizer.encode(ngram_copy)
+        input_ids = torch.tensor([word_embedding])
+        with torch.no_grad():
+            word_features = phobert(input_ids)
+        ngram_embeddings[ngram] = word_features.pooler_output
+    return ngram_embeddings
+def compute_ngram_similarity(ngram_list, ngram_embeddings, doc_embedding):
+    ngram_similarity_dict = {}
+    for ngram in ngram_list:
+        similarity_score = cosine_similarity(ngram_embeddings[ngram], doc_embedding.T).flatten()[0]
+        # similarity_score = normalised_cosine_similarity(ngram_embeddings[ngram], doc_embedding.T).flatten()[0]
+        ngram_similarity_dict[ngram] = similarity_score
+    return ngram_similarity_dict
+def diversify_result_kmeans(ngram_result, ngram_embeddings, top_n=5):
+    best_ngrams = sorted(ngram_result, key=ngram_result.get, reverse=True)[:top_n * 4]
+    best_ngram_embeddings = np.array([ngram_embeddings[ngram] for ngram in best_ngrams]).squeeze()
+    vote = {}
+    for niter in range(100):
+        kmeans = KMeans(n_clusters=top_n, init='k-means++', random_state=niter * 2, n_init="auto").fit(
+            best_ngram_embeddings)
+        kmeans_result = kmeans.labels_
+        res = {}
+        for i in range(len(kmeans_result)):
+            if kmeans_result[i] not in res:
+                res[kmeans_result[i]] = []
+            res[kmeans_result[i]].append((best_ngrams[i], ngram_result[best_ngrams[i]]))
+        final_result = [res[k][0] for k in res]
+        for keyword in final_result:
+            if keyword not in vote:
+                vote[keyword] = 0
+            vote[keyword] += 1
+    diversify_result_ls = sorted(vote, key=vote.get, reverse=True)
+    return diversify_result_ls[:top_n]
+def remove_duplicates(ngram_result):
+    to_remove = set()
+    for ngram in ngram_result:
+        for ngram2 in ngram_result:
+            if ngram not in to_remove and ngram != ngram2 and ngram.lower() == ngram2.lower():
+                new_score = np.mean([ngram_result[ngram], ngram_result[ngram2]])
+                ngram_result[ngram] = new_score
+                to_remove.add(ngram2)
+    for ngram in to_remove:
+        ngram_result.pop(ngram)
+    return ngram_result
+def compute_filtered_text(annotator, title, text):
+    annotated = annotator.annotate_text(text)
+    if title is not None:
+        annotated = annotator.annotate_text(title + '. ' + text)
+    filtered_sentences = []
+    keep_tags = ['N', 'Np', 'V', 'Nc']
+    for key in annotated.keys():
+        # print(key,annotated[key])
+        sent = ' '.join([dict_['wordForm'] for dict_ in annotated[key] if dict_['posTag'] in keep_tags])
+        filtered_sentences.append(sent)
+    return filtered_sentences
+def get_candidate_ngrams(segmentised_doc, filtered_segmentised_doc, ngram_n, stopwords_ls):
+    # get actual ngrams
+    actual_ngram_list = compute_ngram_list(segmentised_doc, ngram_n, stopwords_ls, subsentences=True)
+    # get filtered ngrams
+    filtered_ngram_list = compute_ngram_list(filtered_segmentised_doc, ngram_n, stopwords_ls,
+                                             subsentences=False)
+    # get candiate ngrams
+    candidate_ngram = [ngram for ngram in filtered_ngram_list if ngram in actual_ngram_list]
+    return candidate_ngram
+def limit_minimum_frequency(doc_segmentised, ngram_list, min_freq=1):
+    ngram_dict_freq = {}
+    for ngram in ngram_list:
+        ngram_n = len(ngram.split())
+        count = 0
+        for sentence in doc_segmentised:
+            sent = sentence.split()
+            # print(sent)
+            for i in range(len(sent) - ngram_n + 1):
+                pair = ' '.join(sent[i:i + ngram_n])
+                # print(pair, ngram)
+                if pair == ngram:
+                    count += 1
+            # print(ngram, count)
+        if count >= min_freq:
+            ngram_dict_freq[ngram] = count
+    return ngram_dict_freq
+def remove_overlapping_ngrams(ngram_list):
+    to_remove = set()
+    for ngram1 in ngram_list:
+        for ngram2 in ngram_list:
+            if len(ngram1.split()) > len(ngram2.split()) and (ngram1.startswith(ngram2) or ngram1.endswith(ngram2)):
+                # print(ngram1, ngram2)
+                # print()
+                to_remove.add(ngram2)
+    # print("To removed")
+    # print(to_remove)
+    for kw in to_remove:
+        ngram_list.remove(kw)
+    return ngram_list

model/named_entities.py ADDED Viewed

	@@ -0,0 +1,43 @@

+from underthesea import sent_tokenize
+def substring(w, ls):
+    for w2 in ls:
+        if w != w2 and w in w2:
+            return True
+    return False
+def get_ner_phrases(sent_ner_result):
+    ner_list = []
+    current_ner = [sent_ner_result[0]["word"]]
+    current_idx = sent_ner_result[0]["index"]
+    for i in range(1, len(sent_ner_result)):
+        if sent_ner_result[i]["index"] == current_idx + 1:
+            current_ner.append(sent_ner_result[i]["word"])
+        else:
+            ner_list.append((' '.join(current_ner), sent_ner_result[i - 1]['entity']))
+            current_ner = [sent_ner_result[i]["word"]]
+        current_idx = sent_ner_result[i]["index"]
+    ner_list.append((' '.join(current_ner), sent_ner_result[len(sent_ner_result) - 1]['entity']))
+    return ner_list
+def get_named_entities(nlp, doc):
+    ner_lists = []
+    for sent in sent_tokenize(doc):
+        sent_ner_result = nlp(sent)
+        if len(sent_ner_result) > 0:
+            ner_lists += get_ner_phrases(sent_ner_result)
+    # print(ner_lists)
+    ner_list_non_dup = []
+    for (entity, ner_type) in ner_lists:
+        if entity not in ner_list_non_dup and ner_type.startswith('I'):
+            ner_list_non_dup.append(entity)
+    ner_list_final = [w.replace(" ##", "") for w in ner_list_non_dup if not substring(w, ner_list_non_dup)]
+    return ner_list_final

model/process_text.py ADDED Viewed

	@@ -0,0 +1,104 @@

+from string import punctuation
+def process_text_pipeline(text):
+    full_text_processed = replace_all(text.strip())
+    while '\n\n' in full_text_processed:
+        full_text_processed = full_text_processed.replace('\n\n', '\n')
+    full_text_processed = process_sticking_sentences(full_text_processed)
+    while '  ' in full_text_processed:
+        full_text_processed = full_text_processed.replace('  ', ' ')
+    return full_text_processed
+def replace_all(text):
+    dict_map = {
+        "òa": "oà",
+        "Òa": "Oà",
+        "ÒA": "OÀ",
+        "óa": "oá",
+        "Óa": "Oá",
+        "ÓA": "OÁ",
+        "ỏa": "oả",
+        "Ỏa": "Oả",
+        "ỎA": "OẢ",
+        "õa": "oã",
+        "Õa": "Oã",
+        "ÕA": "OÃ",
+        "ọa": "oạ",
+        "Ọa": "Oạ",
+        "ỌA": "OẠ",
+        "òe": "oè",
+        "Òe": "Oè",
+        "ÒE": "OÈ",
+        "óe": "oé",
+        "Óe": "Oé",
+        "ÓE": "OÉ",
+        "ỏe": "oẻ",
+        "Ỏe": "Oẻ",
+        "ỎE": "OẺ",
+        "õe": "oẽ",
+        "Õe": "Oẽ",
+        "ÕE": "OẼ",
+        "ọe": "oẹ",
+        "Ọe": "Oẹ",
+        "ỌE": "OẸ",
+        "ùy": "uỳ",
+        "Ùy": "Uỳ",
+        "ÙY": "UỲ",
+        "úy": "uý",
+        "Úy": "Uý",
+        "ÚY": "UÝ",
+        "ủy": "uỷ",
+        "Ủy": "Uỷ",
+        "ỦY": "UỶ",
+        "ũy": "uỹ",
+        "Ũy": "Uỹ",
+        "ŨY": "UỸ",
+        "ụy": "uỵ",
+        "Ụy": "Uỵ",
+        "ỤY": "UỴ",
+        "\xa0": " ",
+        "…": "...",
+        "''": '"',
+        "&#34;": '"',
+        "&#39;": "'",
+        "H'Mông": "Hmông",
+        "H'mông": "Hmông",
+        "H’mông": "Hmông",
+        "H’Mông": "Hmông",
+        "H’MÔNG": "Hmông",
+        "M'Nông": "Mnông",
+        "M'nông": "Mnông",
+        "M'NÔNG": "Mnông",
+        "M’Nông": "Mnông",
+        "M’NÔNG": "Mnông",
+        '\u200b\u200b': ""
+    }
+    for i, j in dict_map.items():
+        text = text.replace(i, j)
+    return text
+def process_sticking_sentences(full_text):
+    for i in range(len(full_text) - 1):
+        c1 = full_text[i]
+        c2 = full_text[i + 1]
+        # 'end of sentence.Start'
+        if c1 in punctuation and c2.isalpha() and c2.isupper():
+            before = full_text[:i + 1]
+            after = full_text[i + 1:]
+            full_text = before + " " + after
+        # 'end of sentenceStart'
+        if c1.isalpha() and c1.islower() and c2.isalpha() and c2.isupper():
+            before = full_text[:i + 1]
+            after = full_text[i + 1:]
+            full_text = before + ". " + after
+    return full_text

pipeline.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import py_vncorenlp
+from transformers import AutoTokenizer, Pipeline, pipeline
+import os
+from model.keyword_extraction_utils import *
+from model.process_text import process_text_pipeline
+dir_path = os.path.dirname(os.path.realpath(__file__))
+class KeywordExtractorPipeline(Pipeline):
+    """Keyword-extraction pipeline implemented as a ``Pipeline`` subclass.
+    The stages follow the ``Pipeline`` hooks defined below: ``preprocess`` cleans the
+    raw title/text, ``_forward`` builds candidate n-grams and their embeddings, and
+    ``postprocess`` ranks the candidates against the document embedding.
+    """
+    def __init__(self, model, ner_model, **kwargs):
+        """Create the annotator, tokenizers, NER pipeline and stopword list.
+        Args:
+            model: Passed to ``Pipeline.__init__`` and also stored as ``self.phobert``.
+            ner_model: Model handed to the ``"ner"`` pipeline built here.
+            **kwargs: Forwarded unchanged to ``Pipeline.__init__``.
+        """
+        super().__init__(model, **kwargs)
+        self.annotator = py_vncorenlp.VnCoreNLP(annotators=["wseg", "pos"],
+                                                save_dir=f'{dir_path}/pretrained-models/vncorenlp')
+        print("Loading PhoBERT tokenizer")
+        self.phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
+        self.phobert = model
+        print("Loading NER tokenizer")
+        ner_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/ner-vietnamese-electra-base")
+        self.ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)
+        stopwords_file_path = f'{dir_path}/vietnamese-stopwords-dash.txt'
+        # The stopword list contains Vietnamese diacritics. Decode it as UTF-8 explicitly
+        # instead of relying on the platform default encoding, which is not UTF-8 everywhere.
+        with open(stopwords_file_path, encoding='utf-8') as f:
+            self.stopwords = [w.strip() for w in f]
+    def _sanitize_parameters(self, **kwargs):
+        """Split call-time keyword arguments into one dict per pipeline stage.
+        Returns:
+            A ``(preprocess_kwargs, forward_kwargs, postprocess_kwargs)`` tuple. Each dict
+            holds only the recognised names that were actually supplied.
+        """
+        def pick(names):
+            # Keep only the names the caller actually passed.
+            return {name: kwargs[name] for name in names if name in kwargs}
+        # NOTE(review): "text"/"title" are routed to preprocess(), but preprocess() below
+        # accepts no such keywords -- confirm whether passing them as kwargs is intended.
+        preprocess_kwargs = pick(("text", "title"))
+        forward_kwargs = pick(("ngram_n", "min_freq"))
+        postprocess_kwargs = pick(("top_n", "diversify_result"))
+        return preprocess_kwargs, forward_kwargs, postprocess_kwargs
+    def preprocess(self, inputs):
+        """Run ``process_text_pipeline`` over the text and, when given, the title.
+        Args:
+            inputs: Mapping with a required ``'text'`` entry and an optional ``'title'``
+                entry. A missing, ``None`` or empty title is returned as ``None``.
+        Returns:
+            ``{"text": <processed text>, "title": <processed title or None>}``.
+        """
+        title = None
+        # .get() so that a dict without a 'title' key behaves like title=None.
+        raw_title = inputs.get('title')
+        if raw_title:
+            title = process_text_pipeline(raw_title)
+        text = process_text_pipeline(inputs['text'])
+        return {"text": text, "title": title}
+    def _forward(self, model_inputs, ngram_n, min_freq):
+        """Build the candidate n-grams and the embeddings needed for ranking.
+        Args:
+            model_inputs: Output of ``preprocess`` (``'text'`` and ``'title'`` entries).
+            ngram_n: ``(low, high)`` pair; candidates are generated for every n in that
+                inclusive range (see ``generate_ngram_list``).
+            min_freq: Frequency threshold forwarded to ``generate_ngram_list``.
+        Returns:
+            Dict with ``"ngram_list"``, ``"ngram_embeddings"`` and ``"doc_embedding"``.
+        """
+        text = model_inputs['text']
+        title = model_inputs['title']
+        # Getting segmentised document
+        ne_ls, doc_segmentised = get_segmentised_doc(self.ner_pipeline, self.annotator, title, text)
+        filtered_doc_segmentised = compute_filtered_text(self.annotator, title, text)
+        doc_embedding = get_doc_embeddings(filtered_doc_segmentised, self.phobert_tokenizer, self.phobert,
+                                           self.stopwords)
+        ngram_list = self.generate_ngram_list(doc_segmentised, filtered_doc_segmentised, ne_ls, ngram_n, min_freq)
+        print("Final ngram list")
+        print(sorted(ngram_list))
+        ngram_embeddings = compute_ngram_embeddings(self.phobert_tokenizer, self.phobert, ngram_list)
+        return {"ngram_list": ngram_list, "ngram_embeddings": ngram_embeddings, "doc_embedding": doc_embedding}
+    def postprocess(self, model_outputs, top_n, diversify_result):
+        """Score the candidates and return the best ``top_n``.
+        Args:
+            model_outputs: Output of ``_forward``.
+            top_n: Number of keywords to keep.
+            diversify_result: When truthy, delegate the selection to
+                ``diversify_result_kmeans``; otherwise return the plain score ranking.
+        Returns:
+            With ``diversify_result`` falsy: a list of ``(ngram, score)`` tuples sorted by
+            descending score and cut to ``top_n``. Otherwise whatever
+            ``diversify_result_kmeans`` returns.
+        """
+        ngram_list = model_outputs['ngram_list']
+        ngram_embeddings = model_outputs['ngram_embeddings']
+        doc_embedding = model_outputs['doc_embedding']
+        ngram_result = self.extract_keywords(doc_embedding, ngram_list, ngram_embeddings)
+        if diversify_result:
+            return diversify_result_kmeans(ngram_result, ngram_embeddings, top_n=top_n)
+        # Only build and sort the plain ranking when it is actually the result.
+        scored = [(ngram, ngram_result[ngram]) for ngram in ngram_result]
+        scored.sort(key=lambda x: x[1], reverse=True)
+        return scored[:top_n]
+    def generate_ngram_list(self, doc_segmentised, filtered_doc_segmentised, ne_ls, ngram_n, min_freq):
+        """Collect candidate n-grams plus named entities, then prune them.
+        Args:
+            doc_segmentised: Segmentised document passed to the candidate/frequency helpers.
+            filtered_doc_segmentised: Filtered document passed to ``get_candidate_ngrams``.
+            ne_ls: Named-entity strings; each is word-segmented and added as a candidate.
+            ngram_n: ``(low, high)`` pair of n-gram sizes, both ends inclusive.
+            min_freq: When greater than 1, candidates are additionally restricted through
+                ``limit_minimum_frequency``.
+        Returns:
+            The result of ``remove_overlapping_ngrams`` when ``min_freq <= 1``; otherwise a
+            list of the keys returned by ``limit_minimum_frequency``.
+        """
+        ngram_low, ngram_high = ngram_n
+        # Adding ngram
+        ngram_list = set()
+        for n in range(ngram_low, ngram_high + 1):
+            ngram_list.update(get_candidate_ngrams(doc_segmentised, filtered_doc_segmentised, n, self.stopwords))
+        # Adding named entities ngram list
+        ne_ls_segmented = []
+        for ne in ne_ls:
+            segmented = self.annotator.word_segment(ne)
+            # Skip entities for which the segmenter produced no output instead of
+            # failing with IndexError on the [0] lookup.
+            if segmented:
+                ne_ls_segmented.append(segmented[0])
+        print("Named Entities list")
+        print(ne_ls_segmented)
+        ngram_list.update(ne_ls_segmented)
+        # Removing overlapping ngrams
+        ngram_list = remove_overlapping_ngrams(ngram_list)
+        # Limit ngrams by minimum frequency
+        if min_freq > 1:
+            frequent = limit_minimum_frequency(doc_segmentised, ngram_list, min_freq=min_freq)
+            # Materialise the keys so callers get a concrete list, not a dict view.
+            return list(frequent.keys())
+        return ngram_list
+    def extract_keywords(self, doc_embedding, ngram_list, ngram_embeddings):
+        """Score every candidate against the document and drop duplicates.
+        Returns:
+            The output of ``remove_duplicates`` applied to ``compute_ngram_similarity``;
+            ``postprocess`` reads it as a mapping from n-gram to score.
+        """
+        ngram_result = compute_ngram_similarity(ngram_list, ngram_embeddings, doc_embedding)
+        ngram_result = remove_duplicates(ngram_result)
+        return ngram_result
+if __name__ == "__main__":
+    # Manual smoke test: load both models, run the pipeline on test_file.txt and print the keywords.
+    # The .pt files are whole pickled model objects (``.eval()`` is called on the loaded
+    # result), so they need full unpickling. torch >= 2.6 defaults to ``weights_only=True``,
+    # which rejects such files, hence the explicit ``weights_only=False``.
+    # NOTE: full unpickling can execute arbitrary code -- only load checkpoints you trust.
+    phobert = torch.load(f'{dir_path}/pretrained-models/phobert.pt', weights_only=False)
+    phobert.eval()
+    ner_model = torch.load(f'{dir_path}/pretrained-models/ner-vietnamese-electra-base.pt', weights_only=False)
+    ner_model.eval()
+    kw_pipeline = KeywordExtractorPipeline(phobert, ner_model)
+    text_file_path = f'{dir_path}/test_file.txt'
+    # Vietnamese input: decode as UTF-8 explicitly rather than with the platform default.
+    with open(text_file_path, 'r', encoding='utf-8') as f:
+        text = ' '.join(ln.strip() for ln in f)
+    inp = {"text": text, "title": None}
+    kws = kw_pipeline(inputs=inp, min_freq=1, ngram_n=(1, 3), top_n=5, diversify_result=False)
+    print(kws)

pretrained-models/ner-vietnamese-electra-base.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:af136f7fcb73fba5edee9021032227ede597c2882f39d910b3b830cf49bf5d52
+size 532423140

pretrained-models/phobert.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:984b4f9b49a06331113974f50c8dc96b845bf808034b79993c3ddbf4a946d872
+size 540111904

pretrained-models/vncorenlp/VnCoreNLP-1.2.jar ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e2811cdbc2ddfc71d04be5dc36e185c88dcd1ad4d5d69e4ff2e1369dccf7793
+size 27412703

pretrained-models/vncorenlp/models/dep/vi-dep.xz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:266e4a3a55d5edd1607d5f036c2f95b70c0a6c80f58b57fd9962677a6ef331b7
+size 16048864

pretrained-models/vncorenlp/models/ner/vi-500brownclusters.xz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d30f9cfdf0af193a69e185d1acda0306a9fbe1321f8a700f7c66557a90f92b8c
+size 5599844

pretrained-models/vncorenlp/models/ner/vi-ner.xz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f04c5e752d7f99a6313b758fc2607a2c3906e58b1d60a37eb0192aead73d61f7
+size 9956876

pretrained-models/vncorenlp/models/ner/vi-pretrainedembeddings.xz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:00d3d034f1b23a8bfe5168195741fde845808c212e6dfcd4c94bead1665eb0fc
+size 57313672

pretrained-models/vncorenlp/models/postagger/vi-tagger ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a95608a5982db89c11353b451154ec396eccc0ff1f5b22874935ecdf4e0ace01
+size 29709468

pretrained-models/vncorenlp/models/wordsegmenter/vi-vocab ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a47c5b55bbce163029d37730a67b9479740388695c29c106c112b815613eaa5
+size 526544

pretrained-models/vncorenlp/models/wordsegmenter/wordsegmenter.rdr ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e62f96bd93e37a24f364238e8d8ae986fa5dad6dbc9f4eae622ab3651b7fa06
+size 128508

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+torch
+py_vncorenlp
+transformers

vietnamese-stopwords-dash.txt ADDED Viewed

	@@ -0,0 +1,1998 @@

+a_lô
+a_ha
+ai
+ai_ai
+ai_nấy
+ai_đó
+alô
+amen
+anh
+anh_ấy
+ba
+ba_ba
+ba_bản
+ba_cùng
+ba_họ
+ba_ngày
+ba_ngôi
+ba_tăng
+bao_giờ
+bao_lâu
+bao_nhiêu
+bao_nả
+bay_biến
+biết
+biết_bao
+biết_bao_nhiêu
+biết_chắc
+biết_chừng_nào
+biết_mình
+biết_mấy
+biết_thế
+biết_trước
+biết_việc
+biết_đâu
+biết_đâu_chừng
+biết_đâu_đấy
+biết_được
+buổi
+buổi_làm
+buổi_mới
+buổi_ngày
+buổi_sớm
+bà
+bà_ấy
+bài
+bài_bác
+bài_bỏ
+bài_cái
+bác
+bán
+bán_cấp
+bán_dạ
+bán_thế
+bây_bẩy
+bây_chừ
+bây_giờ
+bây_nhiêu
+bèn
+béng
+bên
+bên_bị
+bên_có
+bên_cạnh
+bông
+bước
+bước_khỏi
+bước_tới
+bước_đi
+bạn
+bản
+bản_bộ
+bản_riêng
+bản_thân
+bản_ý
+bất_chợt
+bất_cứ
+bất_giác
+bất_kì
+bất_kể
+bất_kỳ
+bất_luận
+bất_ngờ
+bất_nhược
+bất_quá
+bất_quá_chỉ
+bất_thình_lình
+bất_tử
+bất_đồ
+bấy
+bấy_chầy
+bấy_chừ
+bấy_giờ
+bấy_lâu
+bấy_lâu_nay
+bấy_nay
+bấy_nhiêu
+bập_bà_bập_bõm
+bập_bõm
+bắt_đầu
+bắt_đầu_từ
+bằng
+bằng_cứ
+bằng_không
+bằng_người
+bằng_nhau
+bằng_như
+bằng_nào
+bằng_nấy
+bằng_vào
+bằng_được
+bằng_ấy
+bển
+bệt
+bị
+bị_chú
+bị_vì
+bỏ
+bỏ_bà
+bỏ_cha
+bỏ_cuộc
+bỏ_không
+bỏ_lại
+bỏ_mình
+bỏ_mất
+bỏ_mẹ
+bỏ_nhỏ
+bỏ_quá
+bỏ_ra
+bỏ_riêng
+bỏ_việc
+bỏ_xa
+bỗng
+bỗng_chốc
+bỗng_dưng
+bỗng_không
+bỗng_nhiên
+bỗng_nhưng
+bỗng_thấy
+bỗng_đâu
+bộ
+bộ_thuộc
+bộ_điều
+bội_phần
+bớ
+bởi
+bởi_ai
+bởi_chưng
+bởi_nhưng
+bởi_sao
+bởi_thế
+bởi_thế_cho_nên
+bởi_tại
+bởi_vì
+bởi_vậy
+bởi_đâu
+bức
+cao
+cao_lâu
+cao_ráo
+cao_răng
+cao_sang
+cao_số
+cao_thấp
+cao_thế
+cao_xa
+cha
+cha_chả
+chao_ôi
+chia_sẻ
+chiếc
+cho
+cho_biết
+cho_chắc
+cho_hay
+cho_nhau
+cho_nên
+cho_rằng
+cho_rồi
+cho_thấy
+cho_tin
+cho_tới
+cho_tới_khi
+cho_về
+cho_ăn
+cho_đang
+cho_được
+cho_đến
+cho_đến_khi
+cho_đến_nỗi
+choa
+chu_cha
+chui_cha
+chung
+chung_cho
+chung_chung
+chung_cuộc
+chung_cục
+chung_nhau
+chung_qui
+chung_quy
+chung_quy_lại
+chung_ái
+chuyển
+chuyển_tự
+chuyển_đạt
+chuyện
+chuẩn_bị
+chành_chạnh
+chí_chết
+chính
+chính_bản
+chính_giữa
+chính_là
+chính_thị
+chính_điểm
+chùn_chùn
+chùn_chũn
+chú
+chú_dẫn
+chú_khách
+chú_mày
+chú_mình
+chúng
+chúng_mình
+chúng_ta
+chúng_tôi
+chúng_ông
+chăn_chắn
+chăng
+chăng_chắc
+chăng_nữa
+chơi
+chơi_họ
+chưa
+chưa_bao_giờ
+chưa_chắc
+chưa_có
+chưa_cần
+chưa_dùng
+chưa_dễ
+chưa_kể
+chưa_tính
+chưa_từng
+chầm_chập
+chậc
+chắc
+chắc_chắn
+chắc_dạ
+chắc_hẳn
+chắc_lòng
+chắc_người
+chắc_vào
+chắc_ăn
+chẳng_lẽ
+chẳng_những
+chẳng_nữa
+chẳng_phải
+chết_nỗi
+chết_thật
+chết_tiệt
+chỉ
+chỉ_chính
+chỉ_có
+chỉ_là
+chỉ_tên
+chỉn
+chị
+chị_bộ
+chị_ấy
+chịu
+chịu_chưa
+chịu_lời
+chịu_tốt
+chịu_ăn
+chọn
+chọn_bên
+chọn_ra
+chốc_chốc
+chớ
+chớ_chi
+chớ_gì
+chớ_không
+chớ_kể
+chớ_như
+chợt
+chợt_nghe
+chợt_nhìn
+chủn
+chứ
+chứ_ai
+chứ_còn
+chứ_gì
+chứ_không
+chứ_không_phải
+chứ_lại
+chứ_lị
+chứ_như
+chứ_sao
+coi_bộ
+coi_mòi
+con
+con_con
+con_dạ
+con_nhà
+con_tính
+cu_cậu
+cuối
+cuối_cùng
+cuối_điểm
+cuốn
+cuộc
+càng
+càng_càng
+càng_hay
+cá_nhân
+các
+các_cậu
+cách
+cách_bức
+cách_không
+cách_nhau
+cách_đều
+cái
+cái_gì
+cái_họ
+cái_đã
+cái_đó
+cái_ấy
+cây_nước
+còn
+còn_như
+còn_nữa
+còn_thời_gian
+còn_về
+có
+có_ai
+có_chuyện
+có_chăng
+có_chăng_là
+có_chứ
+có_cơ
+có_dễ
+có_họ
+có_khi
+có_ngày
+có_người
+có_nhiều
+có_nhà
+có_phải
+có_số
+có_tháng
+có_thế
+có_thể
+có_vẻ
+có_ý
+có_ăn
+có_điều
+có_điều_kiện
+có_đáng
+có_đâu
+có_được
+cóc_khô
+cô
+cô_mình
+cô_quả
+cô_tăng
+cô_ấy
+công_nhiên
+cùng
+cùng_chung
+cùng_cực
+cùng_nhau
+cùng_tuổi
+cùng_tột
+cùng_với
+cùng_ăn
+căn
+căn_cái
+căn_cắt
+căn_tính
+cũng
+cũng_như
+cũng_nên
+cũng_thế
+cũng_vậy
+cũng_vậy_thôi
+cũng_được
+cơ
+cơ_chỉ
+cơ_chừng
+cơ_cùng
+cơ_dẫn
+cơ_hồ
+cơ_hội
+cơ_mà
+cơn
+cả
+cả_nghe
+cả_nghĩ
+cả_ngày
+cả_người
+cả_nhà
+cả_năm
+cả_thảy
+cả_thể
+cả_tin
+cả_ăn
+cả_đến
+cảm_thấy
+cảm_ơn
+cấp
+cấp_số
+cấp_trực_tiếp
+cần
+cần_cấp
+cần_gì
+cần_số
+cật_lực
+cật_sức
+cậu
+cổ_lai
+cụ_thể
+cụ_thể_là
+cụ_thể_như
+của
+của_ngọt
+của_tin
+cứ
+cứ_như
+cứ_việc
+cứ_điểm
+cực_lực
+do
+do_vì
+do_vậy
+do_đó
+duy
+duy_chỉ
+duy_có
+dài
+dài_lời
+dài_ra
+dành
+dành_dành
+dào
+dì
+dù
+dù_cho
+dù_dì
+dù_gì
+dù_rằng
+dù_sao
+dùng
+dùng_cho
+dùng_hết
+dùng_làm
+dùng_đến
+dưới
+dưới_nước
+dạ
+dạ_bán
+dạ_con
+dạ_dài
+dạ_dạ
+dạ_khách
+dần_dà
+dần_dần
+dầu_sao
+dẫn
+dẫu
+dẫu_mà
+dẫu_rằng
+dẫu_sao
+dễ
+dễ_dùng
+dễ_gì
+dễ_khiến
+dễ_nghe
+dễ_ngươi
+dễ_như_chơi
+dễ_sợ
+dễ_sử_dụng
+dễ_thường
+dễ_thấy
+dễ_ăn
+dễ_đâu
+dở_chừng
+dữ
+dữ_cách
+em
+em_em
+giá_trị
+giá_trị_thực_tế
+giảm
+giảm_chính
+giảm_thấp
+giảm_thế
+giống
+giống_người
+giống_nhau
+giống_như
+giờ
+giờ_lâu
+giờ_này
+giờ_đi
+giờ_đây
+giờ_đến
+giữ
+giữ_lấy
+giữ_ý
+giữa
+giữa_lúc
+gây
+gây_cho
+gây_giống
+gây_ra
+gây_thêm
+gì
+gì_gì
+gì_đó
+gần
+gần_bên
+gần_hết
+gần_ngày
+gần_như
+gần_xa
+gần_đây
+gần_đến
+gặp
+gặp_khó_khăn
+gặp_phải
+gồm
+hay
+hay_biết
+hay_hay
+hay_không
+hay_là
+hay_làm
+hay_nhỉ
+hay_nói
+hay_sao
+hay_tin
+hay_đâu
+hiểu
+hiện_nay
+hiện_tại
+hoàn_toàn
+hoặc
+hoặc_là
+hãy
+hãy_còn
+hơn
+hơn_cả
+hơn_hết
+hơn_là
+hơn_nữa
+hơn_trước
+hầu_hết
+hết
+hết_chuyện
+hết_cả
+hết_của
+hết_nói
+hết_ráo
+hết_rồi
+hết_ý
+họ
+họ_gần
+họ_xa
+hỏi
+hỏi_lại
+hỏi_xem
+hỏi_xin
+hỗ_trợ
+khi
+khi_khác
+khi_không
+khi_nào
+khi_nên
+khi_trước
+khiến
+khoảng
+khoảng_cách
+khoảng_không
+khá
+khá_tốt
+khác
+khác_gì
+khác_khác
+khác_nhau
+khác_nào
+khác_thường
+khác_xa
+khách
+khó
+khó_biết
+khó_chơi
+khó_khăn
+khó_làm
+khó_mở
+khó_nghe
+khó_nghĩ
+khó_nói
+khó_thấy
+khó_tránh
+không
+không_ai
+không_bao_giờ
+không_bao_lâu
+không_biết
+không_bán
+không_chỉ
+không_còn
+không_có
+không_có_gì
+không_cùng
+không_cần
+không_cứ
+không_dùng
+không_gì
+không_hay
+không_khỏi
+không_kể
+không_ngoài
+không_nhận
+không_những
+không_phải
+không_phải_không
+không_thể
+không_tính
+không_điều_kiện
+không_được
+không_đầy
+không_để
+khẳng_định
+khỏi
+khỏi_nói
+kể
+kể_cả
+kể_như
+kể_tới
+kể_từ
+liên_quan
+loại
+loại_từ
+luôn
+luôn_cả
+luôn_luôn
+luôn_tay
+là
+là_cùng
+là_là
+là_nhiều
+là_phải
+là_thế_nào
+là_vì
+là_ít
+làm
+làm_bằng
+làm_cho
+làm_dần_dần
+làm_gì
+làm_lòng
+làm_lại
+làm_lấy
+làm_mất
+làm_ngay
+làm_như
+làm_nên
+làm_ra
+làm_riêng
+làm_sao
+làm_theo
+làm_thế_nào
+làm_tin
+làm_tôi
+làm_tăng
+làm_tại
+làm_tắp_lự
+làm_vì
+làm_đúng
+làm_được
+lâu
+lâu_các
+lâu_lâu
+lâu_nay
+lâu_ngày
+lên
+lên_cao
+lên_cơn
+lên_mạnh
+lên_ngôi
+lên_nước
+lên_số
+lên_xuống
+lên_đến
+lòng
+lòng_không
+lúc
+lúc_khác
+lúc_lâu
+lúc_nào
+lúc_này
+lúc_sáng
+lúc_trước
+lúc_đi
+lúc_đó
+lúc_đến
+lúc_ấy
+lý_do
+lượng
+lượng_cả
+lượng_số
+lượng_từ
+lại
+lại_bộ
+lại_cái
+lại_còn
+lại_giống
+lại_làm
+lại_người
+lại_nói
+lại_nữa
+lại_quả
+lại_thôi
+lại_ăn
+lại_đây
+lấy
+lấy_có
+lấy_cả
+lấy_giống
+lấy_làm
+lấy_lý_do
+lấy_lại
+lấy_ra
+lấy_ráo
+lấy_sau
+lấy_số
+lấy_thêm
+lấy_thế
+lấy_vào
+lấy_xuống
+lấy_được
+lấy_để
+lần
+lần_khác
+lần_lần
+lần_nào
+lần_này
+lần_sang
+lần_sau
+lần_theo
+lần_trước
+lần_tìm
+lớn
+lớn_lên
+lớn_nhỏ
+lời
+lời_chú
+lời_nói
+mang
+mang_lại
+mang_mang
+mang_nặng
+mang_về
+muốn
+mà
+mà_cả
+mà_không
+mà_lại
+mà_thôi
+mà_vẫn
+mình
+mạnh
+mất
+mất_còn
+mọi
+mọi_giờ
+mọi_khi
+mọi_lúc
+mọi_người
+mọi_nơi
+mọi_sự
+mọi_thứ
+mọi_việc
+mối
+mỗi
+mỗi_lúc
+mỗi_lần
+mỗi_một
+mỗi_ngày
+mỗi_người
+một
+một_cách
+một_cơn
+một_khi
+một_lúc
+một_số
+một_vài
+một_ít
+mới
+mới_hay
+mới_rồi
+mới_đây
+mở
+mở_mang
+mở_nước
+mở_ra
+mợ
+mức
+nay
+ngay
+ngay_bây_giờ
+ngay_cả
+ngay_khi
+ngay_khi_đến
+ngay_lúc
+ngay_lúc_này
+ngay_lập_tức
+ngay_thật
+ngay_tức_khắc
+ngay_tức_thì
+ngay_từ
+nghe
+nghe_chừng
+nghe_hiểu
+nghe_không
+nghe_lại
+nghe_nhìn
+nghe_như
+nghe_nói
+nghe_ra
+nghe_rõ
+nghe_thấy
+nghe_tin
+nghe_trực_tiếp
+nghe_đâu
+nghe_đâu_như
+nghe_được
+nghen
+nghiễm_nhiên
+nghĩ
+nghĩ_lại
+nghĩ_ra
+nghĩ_tới
+nghĩ_xa
+nghĩ_đến
+nghỉm
+ngoài
+ngoài_này
+ngoài_ra
+ngoài_xa
+ngoải
+nguồn
+ngày
+ngày_càng
+ngày_cấp
+ngày_giờ
+ngày_ngày
+ngày_nào
+ngày_này
+ngày_nọ
+ngày_qua
+ngày_rày
+ngày_tháng
+ngày_xưa
+ngày_xửa
+ngày_đến
+ngày_ấy
+ngôi
+ngôi_thứ
+ngõ_hầu
+ngăn_ngắt
+ngươi
+người_hỏi
+người_khác
+người_khách
+người_mình
+người_nghe
+người_người
+người_nhận
+ngọn
+ngọn_nguồn
+ngọt
+ngồi
+ngồi_bệt
+ngồi_không
+ngồi_sau
+ngồi_trệt
+ngộ_nhỡ
+nhanh
+nhanh_lên
+nhanh_tay
+nhau
+nhiên_hậu
+nhiều
+nhiều_ít
+nhiệt_liệt
+nhung_nhăng
+nhà
+nhà_chung
+nhà_khó
+nhà_làm
+nhà_ngoài
+nhà_ngươi
+nhà_tôi
+nhà_việc
+nhân_dịp
+nhân_tiện
+nhé
+nhìn
+nhìn_chung
+nhìn_lại
+nhìn_nhận
+nhìn_theo
+nhìn_thấy
+nhìn_xuống
+nhóm
+nhón_nhén
+như
+như_ai
+như_chơi
+như_không
+như_là
+như_nhau
+như_quả
+như_sau
+như_thường
+như_thế
+như_thế_nào
+như_thể
+như_trên
+như_trước
+như_tuồng
+như_vậy
+như_ý
+nhưng
+nhưng_mà
+nhược_bằng
+nhất
+nhất_loạt
+nhất_luật
+nhất_là
+nhất_mực
+nhất_nhất
+nhất_quyết
+nhất_sinh
+nhất_thiết
+nhất_thì
+nhất_tâm
+nhất_tề
+nhất_đán
+nhất_định
+nhận
+nhận_biết
+nhận_họ
+nhận_làm
+nhận_nhau
+nhận_ra
+nhận_thấy
+nhận_việc
+nhận_được
+nhằm
+nhằm_khi
+nhằm_lúc
+nhằm_vào
+nhằm_để
+nhỉ
+nhỏ
+nhỏ_người
+nhớ
+nhớ_bập_bõm
+nhớ_lại
+nhớ_lấy
+nhớ_ra
+nhờ
+nhờ_chuyển
+nhờ_có
+nhờ_nhờ
+nhờ_đó
+nhỡ_ra
+những
+những_ai
+những_khi
+những_là
+những_lúc
+những_muốn
+những_như
+nào
+nào_cũng
+nào_hay
+nào_là
+nào_phải
+nào_đâu
+nào_đó
+này
+này_nọ
+nên
+nên_chi
+nên_chăng
+nên_làm
+nên_người
+nên_tránh
+nó
+nóc
+nói
+nói_bông
+nói_chung
+nói_khó
+nói_là
+nói_lên
+nói_lại
+nói_nhỏ
+nói_phải
+nói_qua
+nói_ra
+nói_riêng
+nói_rõ
+nói_thêm
+nói_thật
+nói_toẹt
+nói_trước
+nói_tốt
+nói_với
+nói_xa
+nói_ý
+nói_đến
+nói_đủ
+năm
+năm_tháng
+nơi
+nơi_nơi
+nước_bài
+nước_cùng
+nước_lên
+nước_nặng
+nước_quả
+nước_xuống
+nước_ăn
+nước_đến
+nấy
+nặng
+nặng_căn
+nặng_mình
+nặng_về
+nếu
+nếu_có
+nếu_cần
+nếu_không
+nếu_mà
+nếu_như
+nếu_thế
+nếu_vậy
+nếu_được
+nền
+nọ
+nớ
+nức_nở
+nữa
+nữa_khi
+nữa_là
+nữa_rồi
+oai_oái
+oái
+pho
+phè
+phè_phè
+phía
+phía_bên
+phía_bạn
+phía_dưới
+phía_sau
+phía_trong
+phía_trên
+phía_trước
+phóc
+phót
+phù_hợp
+phăn_phắt
+phương_chi
+phải
+phải_biết
+phải_chi
+phải_chăng
+phải_cách
+phải_cái
+phải_giờ
+phải_khi
+phải_không
+phải_lại
+phải_lời
+phải_người
+phải_như
+phải_rồi
+phải_tay
+phần
+phần_lớn
+phần_nhiều
+phần_nào
+phần_sau
+phần_việc
+phắt
+phỉ_phui
+phỏng
+phỏng_như
+phỏng_nước
+phỏng_theo
+phỏng_tính
+phốc
+phụt
+phứt
+qua
+qua_chuyện
+qua_khỏi
+qua_lại
+qua_lần
+qua_ngày
+qua_tay
+qua_thì
+qua_đi
+quan_trọng
+quan_trọng_vấn_đề
+quan_tâm
+quay
+quay_bước
+quay_lại
+quay_số
+quay_đi
+quá
+quá_bán
+quá_bộ
+quá_giờ
+quá_lời
+quá_mức
+quá_nhiều
+quá_tay
+quá_thì
+quá_tin
+quá_trình
+quá_tuổi
+quá_đáng
+quá_ư
+quả
+quả_là
+quả_thật
+quả_thế
+quả_vậy
+quận
+ra
+ra_bài
+ra_bộ
+ra_chơi
+ra_gì
+ra_lại
+ra_lời
+ra_ngôi
+ra_người
+ra_sao
+ra_tay
+ra_vào
+ra_ý
+ra_điều
+ra_đây
+ren_rén
+riu_ríu
+riêng
+riêng_từng
+riệt
+rày
+ráo
+ráo_cả
+ráo_nước
+ráo_trọi
+rén
+rén_bước
+rích
+rón_rén
+rõ
+rõ_là
+rõ_thật
+rút_cục
+răng
+răng_răng
+rất
+rất_lâu
+rằng
+rằng_là
+rốt_cuộc
+rốt_cục
+rồi
+rồi_nữa
+rồi_ra
+rồi_sao
+rồi_sau
+rồi_tay
+rồi_thì
+rồi_xem
+rồi_đây
+rứa
+sa_sả
+sang
+sang_năm
+sang_sáng
+sang_tay
+sao
+sao_bản
+sao_bằng
+sao_cho
+sao_vậy
+sao_đang
+sau
+sau_chót
+sau_cuối
+sau_cùng
+sau_hết
+sau_này
+sau_nữa
+sau_sau
+sau_đây
+sau_đó
+so
+so_với
+song_le
+suýt
+suýt_nữa
+sáng
+sáng_ngày
+sáng_rõ
+sáng_thế
+sáng_ý
+sì
+sì_sì
+sất
+sắp
+sắp_đặt
+sẽ
+sẽ_biết
+sẽ_hay
+số
+số_cho_biết
+số_cụ_thể
+số_loại
+số_là
+số_phần
+số_thiếu
+sốt_sột
+sớm
+sớm_ngày
+sở_dĩ
+sử_dụng
+sự
+sự_thế
+sự_việc
+tanh
+tanh_tanh
+tay
+tay_quay
+tha_hồ
+tha_hồ_chơi
+tha_hồ_ăn
+than_ôi
+thanh
+thanh_ba
+thanh_chuyển
+thanh_không
+thanh_thanh
+thanh_tính
+thanh_điều_kiện
+thanh_điểm
+thay_đổi
+thay_đổi_tình_trạng
+theo
+theo_bước
+theo_như
+theo_tin
+thi_thoảng
+thiếu
+thiếu_gì
+thiếu_điểm
+thoạt
+thoạt_nghe
+thoạt_nhiên
+thoắt
+thuần
+thuần_ái
+thuộc
+thuộc_bài
+thuộc_cách
+thuộc_lại
+thuộc_từ
+thà
+thà_là
+thà_rằng
+thành_ra
+thành_thử
+thái_quá
+tháng
+tháng_ngày
+tháng_năm
+tháng_tháng
+thêm
+thêm_chuyện
+thêm_giờ
+thêm_vào
+thì
+thì_giờ
+thì_là
+thì_phải
+thì_ra
+thì_thôi
+thình_lình
+thích
+thích_cứ
+thích_thuộc
+thích_tự
+thích_ý
+thím
+thôi
+thôi_việc
+thúng_thắng
+thương_ôi
+thường
+thường_bị
+thường_hay
+thường_khi
+thường_số
+thường_sự
+thường_thôi
+thường_thường
+thường_tính
+thường_tại
+thường_xuất_hiện
+thường_đến
+thảo_hèn
+thảo_nào
+thấp
+thấp_cơ
+thấp_thỏm
+thấp_xuống
+thấy
+thấy_tháng
+thẩy
+thậm
+thậm_chí
+thậm_cấp
+thậm_từ
+thật
+thật_chắc
+thật_là
+thật_lực
+thật_quả
+thật_ra
+thật_sự
+thật_thà
+thật_tốt
+thật_vậy
+thế
+thế_chuẩn_bị
+thế_là
+thế_lại
+thế_mà
+thế_nào
+thế_nên
+thế_ra
+thế_sự
+thế_thì
+thế_thôi
+thế_thường
+thế_thế
+thế_à
+thế_đó
+thếch
+thỉnh_thoảng
+thỏm
+thốc
+thốc_tháo
+thốt
+thốt_nhiên
+thốt_nói
+thốt_thôi
+thộc
+thời_gian
+thời_gian_sử_dụng
+thời_gian_tính
+thời_điểm
+thục_mạng
+thứ
+thứ_bản
+thứ_đến
+thửa
+thực_hiện
+thực_hiện_đúng
+thực_ra
+thực_sự
+thực_tế
+thực_vậy
+tin
+tin_thêm
+tin_vào
+tiếp_theo
+tiếp_tục
+tiếp_đó
+tiện_thể
+toà
+toé_khói
+toẹt
+trong
+trong_khi
+trong_lúc
+trong_mình
+trong_ngoài
+trong_này
+trong_số
+trong_vùng
+trong_đó
+trong_ấy
+tránh
+tránh_khỏi
+tránh_ra
+tránh_tình_trạng
+tránh_xa
+trên
+trên_bộ
+trên_dưới
+trước
+trước_hết
+trước_khi
+trước_kia
+trước_nay
+trước_ngày
+trước_nhất
+trước_sau
+trước_tiên
+trước_tuổi
+trước_đây
+trước_đó
+trả
+trả_của
+trả_lại
+trả_ngay
+trả_trước
+trếu_tráo
+trển
+trệt
+trệu_trạo
+trỏng
+trời_đất_ơi
+trở_thành
+trừ_phi
+trực_tiếp
+trực_tiếp_làm
+tuy
+tuy_có
+tuy_là
+tuy_nhiên
+tuy_rằng
+tuy_thế
+tuy_vậy
+tuy_đã
+tuyệt_nhiên
+tuần_tự
+tuốt_luốt
+tuốt_tuồn_tuột
+tuốt_tuột
+tuổi
+tuổi_cả
+tuổi_tôi
+tà_tà
+tên
+tên_chính
+tên_cái
+tên_họ
+tên_tự
+tênh
+tênh_tênh
+tìm
+tìm_bạn
+tìm_cách
+tìm_hiểu
+tìm_ra
+tình_trạng
+tính
+tính_căn
+tính_phỏng
+tính_từ
+tít_mù
+tò_te
+tôi
+tôi_con
+tông_tốc
+tù_tì
+tăm_tắp
+tăng
+tăng_chúng
+tăng_cấp
+tăng_giảm
+tăng_thêm
+tăng_thế
+tại
+tại_lòng
+tại_nơi
+tại_sao
+tại_tôi
+tại_vì
+tại_đâu
+tại_đây
+tại_đó
+tạo
+tạo_cơ_hội
+tạo_nên
+tạo_ra
+tạo_ý
+tạo_điều_kiện
+tấm
+tấm_bản
+tấm_các
+tấn
+tấn_tới
+tất_cả
+tất_cả_bao_nhiêu
+tất_thảy
+tất_tần_tật
+tất_tật
+tập_trung
+tắp
+tắp_lự
+tắp_tắp
+tọt
+tỏ_ra
+tỏ_vẻ
+tốc_tả
+tối_ư
+tốt
+tốt_bạn
+tốt_bộ
+tốt_hơn
+tốt_mối
+tốt_ngày
+tột
+tột_cùng
+tớ
+tới
+tới_gần
+tới_mức
+tới_nơi
+tới_thì
+tức_thì
+tức_tốc
+từ
+từ_căn
+từ_giờ
+từ_khi
+từ_loại
+từ_nay
+từ_thế
+từ_tính
+từ_tại
+từ_từ
+từ_ái
+từ_điều
+từ_đó
+từ_ấy
+từng
+từng_cái
+từng_giờ
+từng_nhà
+từng_phần
+từng_thời_gian
+từng_đơn_vị
+từng_ấy
+tự
+tự_cao
+tự_khi
+tự_lượng
+tự_tính
+tự_tạo
+tự_vì
+tự_ý
+tự_ăn
+tựu_trung
+veo
+veo_veo
+việc
+việc_gì
+vung_thiên_địa
+vung_tàn_tán
+vung_tán_tàn
+và
+vài
+vài_ba
+vài_người
+vài_nhà
+vài_nơi
+vài_tên
+vài_điều
+vào
+vào_gặp
+vào_khoảng
+vào_lúc
+vào_vùng
+vào_đến
+vâng
+vâng_chịu
+vâng_dạ
+vâng_vâng
+vâng_ý
+vèo
+vèo_vèo
+vì
+vì_chưng
+vì_rằng
+vì_sao
+vì_thế
+vì_vậy
+ví_bằng
+ví_dù
+ví_phỏng
+ví_thử
+vô_hình_trung
+vô_kể
+vô_luận
+vô_vàn
+vùng
+vùng_lên
+vùng_nước
+văng_tê
+vượt
+vượt_khỏi
+vượt_quá
+vạn_nhất
+vả_chăng
+vả_lại
+vấn_đề
+vấn_đề_quan_trọng
+vẫn
+vẫn_thế
+vậy
+vậy_là
+vậy_mà
+vậy_nên
+vậy_ra
+vậy_thì
+vậy_ư
+về
+về_không
+về_phần
+về_sau
+về_tay
+vị_trí
+vị_tất
+vốn_dĩ
+với
+với_lại
+với_nhau
+vở
+vụt
+vừa
+vừa_khi
+vừa_lúc
+vừa_mới
+vừa_qua
+vừa_rồi
+vừa_vừa
+xa
+xa_cách
+xa_gần
+xa_nhà
+xa_tanh
+xa_tắp
+xa_xa
+xa_xả
+xem
+xem_lại
+xem_ra
+xem_số
+xin
+xin_gặp
+xin_vâng
+xiết_bao
+xon_xón
+xoành_xoạch
+xoét
+xoẳn
+xoẹt
+xuất_kì_bất_ý
+xuất_kỳ_bất_ý
+xuể
+xuống
+xăm_xúi
+xăm_xăm
+xăm_xắm
+xảy_ra
+xềnh_xệch
+xệp
+xử_lý
+yêu_cầu
+à
+à_này
+à_ơi
+ào
+ào_vào
+ào_ào
+á
+á_à
+ái
+ái_chà
+ái_dà
+áng
+áng_như
+âu_là
+ít
+ít_biết
+ít_có
+ít_hơn
+ít_khi
+ít_lâu
+ít_nhiều
+ít_nhất
+ít_nữa
+ít_quá
+ít_ra
+ít_thôi
+ít_thấy
+ô_hay
+ô_hô
+ô_kê
+ô_kìa
+ôi_chao
+ôi_thôi
+ông
+ông_nhỏ
+ông_tạo
+ông_từ
+ông_ấy
+ông_ổng
+úi
+úi_chà
+úi_dào
+ý
+ý_chừng
+ý_da
+ý_hoặc
+ăn
+ăn_chung
+ăn_chắc
+ăn_chịu
+ăn_cuộc
+ăn_hết
+ăn_hỏi
+ăn_làm
+ăn_người
+ăn_ngồi
+ăn_quá
+ăn_riêng
+ăn_tay
+ăn_trên
+ăn_về
+đang
+đang_tay
+đang_thì
+điều
+điều_gì
+điều_kiện
+điểm
+điểm_chính
+điểm_gặp
+điểm_đầu_tiên
+đành_đạch
+đáng
+đáng_kể
+đáng_lí
+đáng_lý
+đáng_lẽ
+đáng_số
+đánh_đùng
+đáo_để
+đâu
+đâu_có
+đâu_cũng
+đâu_như
+đâu_nào
+đâu_phải
+đâu_đâu
+đâu_đây
+đâu_đó
+đây
+đây_này
+đây_rồi
+đây_đó
+đã
+đã_hay
+đã_không
+đã_là
+đã_lâu
+đã_thế
+đã_vậy
+đã_đủ
+đó
+đó_đây
+đúng
+đúng_ngày
+đúng_ra
+đúng_tuổi
+đúng_với
+đơn_vị
+đưa
+đưa_cho
+đưa_chuyện
+đưa_em
+đưa_ra
+đưa_tay
+đưa_tin
+đưa_tới
+đưa_vào
+đưa_về
+đưa_xuống
+đưa_đến
+được
+được_cái
+được_lời
+được_nước
+được_tin
+đại_loại
+đại_phàm
+đại_để
+đạt
+đảm_bảo
+đầu_tiên
+đầy
+đầy_năm
+đầy_phè
+đầy_tuổi
+đặc_biệt
+đặt
+đặt_làm
+đặt_mình
+đặt_mức
+đặt_ra
+đặt_trước
+đặt_để
+đến
+đến_bao_giờ
+đến_cùng
+đến_cùng_cực
+đến_cả
+đến_giờ
+đến_gần
+đến_hay
+đến_khi
+đến_lúc
+đến_lời
+đến_nay
+đến_ngày
+đến_nơi
+đến_nỗi
+đến_thì
+đến_thế
+đến_tuổi
+đến_xem
+đến_điều
+đến_đâu
+đều
+đều_bước
+đều_nhau
+đều_đều
+để
+để_cho
+để_giống
+để_không
+để_lòng
+để_lại
+để_mà
+để_phần
+để_được
+để_đến_nỗi
+đối_với
+đồng_thời
+đủ
+đủ_dùng
+đủ_nơi
+đủ_số
+đủ_điều
+đủ_điểm
+ơ
+ơ_hay
+ơ_kìa
+ơi
+ơi_là
+ư
+ạ
+ạ_ơi
+ấy
+ấy_là
+ầu_ơ
+ắt
+ắt_hẳn
+ắt_là
+ắt_phải
+ắt_thật
+ối_dào
+ối_giời
+ối_giời_ơi
+ồ
+ồ_ồ
+ổng
+ớ
+ớ_này
+ờ
+ờ_ờ
+ở
+ở_lại
+ở_như
+ở_nhờ
+ở_năm
+ở_trên
+ở_vào
+ở_đây
+ở_đó
+ở_được
+ủa
+ứ_hự
+ứ_ừ
+ừ
+ừ_nhé
+ừ_thì
+ừ_ào
+ừ_ừ
+ử
+Chỉ
+Các
+Có
+Cần
+Nhưng
+Tuy_nhiên
+Từ
+Cũng
+từ
+đi
+đến
+Cho_dù
+Chủ_yếu
+Còn
+Có_lẽ
+Có_thể
+Dù
+Dĩ_nhiên
+Những
+Và
+Vì
+Tất_nhiên
+cho_dù
+có_lẽ
+lắm
+Hiện_nay
+Tại_đây
+Như_vậy
+Từ
+Vậy
+Ở
+Bởi
+Theo
+Trên
+Việc
+Do_đó
+Hơn_nữa
+Trong
+Bên
+Cùng
+Cả
+ây
+Khi
+Sau
+Mỗi
+Về
+Không_chỉ
+Mặc_dù
+mặc_dù
+Nhiều
+Như
+Do
+Nếu
+Nếu_như
+Hoặc
+Nhờ
+Hiện
+Hiện_tại
+hiện
+cạnh
+Xung_quanh
+tóm_lại
+Cho
+Sau_này
+Vào
+ngày_nay
+chung_quanh
+Qua
+Thông_qua
+bao_gồm