theformatisvalid committed on
Commit
a55d080
·
verified ·
1 Parent(s): 62f9592

Update src/tokenizers_analysis.py

Browse files
Files changed (1) hide show
  1. src/tokenizers_analysis.py +7 -89
src/tokenizers_analysis.py CHANGED
@@ -1,89 +1,7 @@
1
- import json
2
- from nltk.tokenize import word_tokenize
3
- from razdel import tokenize as razdel_tokenize
4
- from snowballstemmer import RussianStemmer
5
- import pymorphy2
6
- import re
7
- import time
8
- from sentence_transformers import SentenceTransformer
9
- from sklearn.metrics.pairwise import cosine_similarity
10
-
11
-
12
- def tokenize_naive_space(text):
13
- return text.split(' ')
14
-
15
-
16
- def tokenize_regex(text):
17
- return re.findall(r'[а-яА-ЯёЁ]+', text)
18
-
19
-
20
- def tokenize_nltk(text):
21
- return word_tokenize(text, language='russian')
22
-
23
-
24
- def tokenize_razdel(text):
25
- return list(map(lambda x: x.text, razdel_tokenize(text)))
26
-
27
-
28
- def stem_snowball(tokens):
29
- stemmer = RussianStemmer()
30
- return stemmer.stemWords(tokens)
31
-
32
-
33
- def lemmatize_pymorphy(words):
34
- morph = pymorphy2.MorphAnalyzer()
35
- lemmas = []
36
- for word in words:
37
- lemmas.append(morph.parse(word)[0].normal_form)
38
- return lemmas
39
-
40
-
41
- def calculate_oov(text, vocabulary):
42
- words = text.split(' ')
43
- oov_count = 0
44
- for word in words:
45
- if word not in vocabulary:
46
- oov_count += 1
47
- return oov_count / len(words)
48
-
49
-
50
- def calculate_similarity(text1, text2, model):
51
- embedding1 = model.encode(text1, convert_to_tensor=False).reshape(1, -1)
52
- embedding2 = model.encode(text2, convert_to_tensor=False).reshape(1, -1)
53
- return cosine_similarity(embedding1, embedding2)[0][0]
54
-
55
-
56
- if __name__ == '__main__':
57
- texts = []
58
- with open('core/preprocessed_core.jsonl', encoding='utf-8') as file:
59
- for line in file:
60
- row = json.loads(line)
61
- texts.append(row['text'])
62
- n_articles = len(texts)
63
- tokenizers = [tokenize_naive_space, tokenize_regex, tokenize_nltk, tokenize_razdel]
64
-
65
- methods = []
66
- for tokenizer in tokenizers:
67
- methods.extend([[tokenizer], [tokenizer, stem_snowball], [tokenizer, lemmatize_pymorphy]])
68
-
69
- csv_string = 'method;vocabulary volume;OOV percentage;processing speed;semantic consistency\n'
70
- sim_model = SentenceTransformer('all-MiniLM-L6-v2')
71
- for method in methods:
72
- print('running', " + ".join(map(lambda x: x.__name__, method)))
73
- start_time = time.time()
74
- vocabulary = set()
75
- similarities = []
76
- for text in texts:
77
- tokens = text
78
- for func in method:
79
- tokens = func(tokens)
80
- similarities.append(calculate_similarity(text, ' '.join(tokens), sim_model))
81
- with open(f'tokenized_texts/{"_".join(map(lambda x: x.__name__, method))}', 'a', encoding='utf-8') as file:
82
- file.write(' '.join(tokens) + '\n')
83
- vocabulary = vocabulary.union(tokens)
84
- end_time = time.time()
85
- csv_string += f'{" + ".join(map(lambda x: x.__name__, method))};{len(vocabulary)};'
86
- csv_string += f'{calculate_oov(" ".join(texts), vocabulary)};{end_time - start_time};'
87
- csv_string += f'{sum(similarities)/len(similarities)}\n'
88
- with open('reports/tokenization_metrics.csv', 'w') as file:
89
- file.write(csv_string)
 
1
def calculate_oov(text, vocabulary):
    """Return the fraction of space-separated tokens in *text* that are
    out-of-vocabulary, i.e. not contained in *vocabulary*.

    Splitting on a single space means an empty string yields one empty
    token (``''.split(' ') == ['']``), so the division is never by zero.
    Each occurrence of a missing token counts separately.
    """
    tokens = text.split(' ')
    misses = sum(token not in vocabulary for token in tokens)
    return misses / len(tokens)