OCRSpellCorrection / dictionary /convert_frequency.py
TangSan003's picture
Clean deploy via GitHub
0c6d0a6
from underthesea import word_tokenize, sent_tokenize, text_normalize, ner
import re
import json
from pprintpp import pprint
output_file = "frequency_vi_test.txt"
json_file = "../craw/job_details.json"
def is_valid_token( token):
if re.match(r'^\d+(\.\d+)?$', token):
return False
if re.match(r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$", token):
return False
elif re.match(r"^(0[3|5|7|8|9])([0-9]{8})$", token):
return False
return all(c.isalpha() or c == '_' for c in token)
seen = set()
with open(json_file, "r", encoding="utf-8") as f:
jobs = json.load(f)
words_file = [detail[text] for job in jobs for detail in job["job_detail"] for text in detail]
with open(output_file, "w", encoding="utf-8") as f:
for i, words in enumerate(words_file):
print(f"{(i+1)/len(words_file)*100:.2f}%: {i+1}/{len(words_file)}")
words_sent_tokenize = sent_tokenize(words.replace("\n", ". "))
for words_sent in words_sent_tokenize:
words_sent = text_normalize(words_sent)
save_word = word_tokenize(words_sent, format="text")
tokens = [token for token in word_tokenize(save_word) if is_valid_token(token)]
for word in tokens:
word = word.lower()
if word not in seen:
save_word = word
if len(word.replace("_", " ").split()) !=1:
for split in word.replace("_", " ").split():
if split not in seen:
count = sum(text.lower().count(split.lower()) for text in words_file)
if count >1000:
f.write(f"{split}${count}\n")
seen.add(split)
count = sum(text.lower().count(save_word.replace("_", " ").lower()) for text in words_file)
if count > 5:
f.write(f"{save_word}${count}\n")
seen.add(word)
print("Done:", output_file)