Shu-vi committed
Commit d4d8ed5 · verified · 1 parent: a80103c

Upload 8 files
src/Clustering.py ADDED
@@ -0,0 +1,28 @@
+ from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, SpectralClustering
+ from sklearn.metrics.pairwise import cosine_similarity
+ import hdbscan
+
+
+ def k_means(docs, k=5):
+     return KMeans(n_clusters=k).fit_predict(docs)
+
+ def mini_batch_means(docs, n_clusters):
+     model = MiniBatchKMeans(n_clusters=n_clusters, batch_size=256, random_state=42)
+     return model.fit_predict(docs)
+
+ def use_hdbscan(docs):
+     # HDBSCAN chooses the number of clusters itself and labels noise points -1
+     model = hdbscan.HDBSCAN(metric="euclidean", min_cluster_size=3)
+     return model.fit_predict(docs)
+
+ def agglomerative_clustering(docs, n_clusters=5):
+     # average-linkage agglomerative clustering on cosine distances
+     model = AgglomerativeClustering(n_clusters=n_clusters, metric="cosine", linkage="average")
+     return model.fit_predict(docs)
+
+ def spectral_clustering(docs, n_clusters=5):
+     # cluster a precomputed cosine-similarity matrix used as the affinity
+     sim = cosine_similarity(docs)
+     model = SpectralClustering(
+         n_clusters=n_clusters,
+         affinity='precomputed',
+         random_state=42
+     )
+     return model.fit_predict(sim)
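All five helpers expose the same fit_predict-style contract: a dense document-embedding matrix in, one integer label per row out. A minimal usage sketch (not part of the commit; the random X below stands in for real embeddings):

import numpy as np
from Clustering import k_means, use_hdbscan

X = np.random.rand(20, 8)   # stand-in for a (documents x features) embedding matrix
print(k_means(X, k=3))      # 20 labels drawn from {0, 1, 2}
print(use_hdbscan(X))       # may include -1 for points HDBSCAN treats as noise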
src/Tokenizer.py ADDED
@@ -0,0 +1,17 @@
+ from tokenizers import Tokenizer
+ import re
+
+ def create_bpe():
+     tokenizer = Tokenizer.from_pretrained("Shu-vi/Russian_BPE_Tokenizer_16k")
+     def _inner(text: str):
+         return tokenizer.encode(text).tokens
+     return _inner
+
+ def tokenize_naive(text: str):
+     # simple whitespace tokenization, stripping stray punctuation from token edges
+     punct = "«»()[]{}.,:;!?\"'“”—–…"
+     parts = text.split()
+     return [p.strip(punct) for p in parts if p.strip(punct)]
+
+ def tokenize_regex(text: str):
+     # runs of Latin/Cyrillic letters, optionally joined by hyphens or apostrophes
+     return re.compile(r"[A-Za-zА-Яа-яЁё]+(?:[-'][A-Za-zА-Яа-яЁё]+)*", flags=re.UNICODE).findall(text)
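All three tokenizers are plain callables from a string to a list of token strings, which is what lets streamlit_app.py swap them freely. A quick comparison (the naive and regex outputs below were traced by hand; create_bpe fetches Shu-vi/Russian_BPE_Tokenizer_16k from the Hugging Face Hub, so its subword output depends on that vocabulary):

from Tokenizer import create_bpe, tokenize_naive, tokenize_regex

text = "Привет, мир! Это — тест."
print(tokenize_naive(text))   # ['Привет', 'мир', 'Это', 'тест']
print(tokenize_regex(text))   # ['Привет', 'мир', 'Это', 'тест']

bpe = create_bpe()            # needs network access on first call
print(bpe(text))              # subword tokens from the 16k BPE vocabulary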
src/Vectorization.py ADDED
@@ -0,0 +1,26 @@
+ from gensim.models import Word2Vec, FastText
+ import joblib
+
+ def create_tfidf():
+     vectorizer = joblib.load("tfidf_vectorizer.pkl")
+     def _inner(docs):
+         return vectorizer.transform(docs).toarray()
+     return _inner, vectorizer  # return both the function and the vectorizer
+
+ def create_w2v():
+     model = Word2Vec.load("./word2vec.model")
+     def _inner(word):
+         # Word2Vec has no vector for out-of-vocabulary words
+         if word in model.wv:
+             return model.wv[word]
+         return None
+     return _inner, model  # return both the function and the model
+
+ def create_fasttext():
+     model = FastText.load("./fasttext.model")
+     def _inner(word):
+         if word in model.wv:
+             return model.wv[word]
+         return None
+     return _inner, model  # return both the function and the model
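Each factory returns both a lookup callable and the underlying object, since the app later needs vectorizer internals (feature names) and model internals (nearest-neighbor search). A hedged sketch, assuming the pickled vectorizer and the two gensim models from this commit sit in the working directory:

from Vectorization import create_tfidf, create_w2v

tfidf, vectorizer = create_tfidf()                 # loads tfidf_vectorizer.pkl
X = tfidf(["первый документ", "второй документ"])
print(X.shape)                                     # (2, vocabulary size)

w2v, model = create_w2v()                          # loads ./word2vec.model
print(w2v("слово"))                                # vector, or None if out of vocabulary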
src/fasttext.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f5758a85c3e61d5a58795a2fe3028b962f84468fb915491bd67b05260ce175b
+ size 3768810
src/processed_corpus.jsonl ADDED
The diff for this file is too large to render. See raw diff
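The corpus diff is not rendered, but load_corpus in src/streamlit_app.py reads the file line by line and keeps only the "text" field, so each line is presumably a standalone JSON object along these lines (hypothetical record):

{"text": "a single preprocessed document goes here"}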
 
src/streamlit_app.py CHANGED
@@ -1,40 +1,262 @@
- import altair as alt
+ import streamlit as st
+ import json
  import numpy as np
  import pandas as pd
- import streamlit as st
+ import plotly.express as px
+ from sklearn.decomposition import PCA
+ from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
 
- """
- # Welcome to Streamlit!
-
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
-
- In the meantime, below is an example of what you can do with just a few lines of code:
- """
-
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
- indices = np.linspace(0, 1, num_points)
- theta = 2 * np.pi * num_turns * indices
- radius = indices
-
- x = radius * np.cos(theta)
- y = radius * np.sin(theta)
-
- df = pd.DataFrame({
-     "x": x,
-     "y": y,
-     "idx": indices,
-     "rand": np.random.randn(num_points),
- })
-
- st.altair_chart(alt.Chart(df, height=700, width=700)
-     .mark_point(filled=True)
-     .encode(
-         x=alt.X("x", axis=None),
-         y=alt.Y("y", axis=None),
-         color=alt.Color("idx", legend=None, scale=alt.Scale()),
-         size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-     ))
+ from Vectorization import create_tfidf, create_w2v, create_fasttext
+ from Tokenizer import create_bpe, tokenize_naive, tokenize_regex
+ from Clustering import k_means, agglomerative_clustering, spectral_clustering, mini_batch_means, use_hdbscan
+
+
+ # Load models once at startup
+ @st.cache_resource
+ def load_models():
+     bpe = create_bpe()
+     tfidf_func, tfidf_vectorizer = create_tfidf()
+     w2v_func, w2v_model = create_w2v()
+     fasttext_func, fasttext_model = create_fasttext()
+     return bpe, tfidf_func, tfidf_vectorizer, w2v_func, w2v_model, fasttext_func, fasttext_model
+
+
+ @st.cache_data
+ def load_corpus():
+     corpus = []
+     with open("processed_corpus.jsonl", "r", encoding="utf-8") as infile:
+         for line in infile:
+             data = json.loads(line)
+             if "text" in data:
+                 corpus.append(data["text"])
+     return corpus
+
+
+ def get_metrics(X_emb, labels):
+     """Compute clustering quality metrics."""
+     metrics = {}
+     try:
+         metrics['silhouette'] = silhouette_score(X_emb, labels)
+     except Exception:
+         metrics['silhouette'] = None
+     try:
+         metrics['calinski_harabasz'] = calinski_harabasz_score(X_emb, labels)
+     except Exception:
+         metrics['calinski_harabasz'] = None
+     try:
+         metrics['davies_bouldin'] = davies_bouldin_score(X_emb, labels)
+     except Exception:
+         metrics['davies_bouldin'] = None
+     return metrics
+
+
+ def vectorize_text(corpus, tokenization_method, vectorization_method, models):
+     bpe, tfidf_func, tfidf_vectorizer, w2v_func, w2v_model, fasttext_func, fasttext_model = models
+
+     # Tokenization
+     if tokenization_method == "Naive (whitespace)":
+         tokens_list = [tokenize_naive(text) for text in corpus]
+     elif tokenization_method == "Regex":
+         tokens_list = [tokenize_regex(text) for text in corpus]
+     else:  # BPE
+         tokens_list = [bpe(text) for text in corpus]
+     sentences = [" ".join(tokens) for tokens in tokens_list]
+
+     # Vectorization
+     if vectorization_method == "TF-IDF":
+         embeddings = tfidf_func(sentences)
+         return embeddings, tokens_list, sentences, tfidf_vectorizer
+
+     # Word2Vec and FastText: average the token vectors of each document
+     if vectorization_method == "Word2Vec":
+         lookup, model = w2v_func, w2v_model
+     else:  # FastText
+         lookup, model = fasttext_func, fasttext_model
+
+     embeddings = []
+     for tokens in tokens_list:
+         doc_embed = []
+         for token in tokens:
+             token_embedding = lookup(token)
+             if token_embedding is not None:
+                 doc_embed.append(token_embedding)
+         if len(doc_embed) > 0:
+             embeddings.append(np.mean(doc_embed, axis=0))
+         else:
+             # documents with no known tokens fall back to a zero vector
+             embeddings.append(np.zeros(model.wv.vector_size))
+     return np.array(embeddings), tokens_list, sentences, model  # also return the model
+
+
+ def get_top_tfidf_words(tfidf_vectorizer, cluster_docs, feature_names, n_words=10):
+     """Top-N words of a cluster by mean TF-IDF weight."""
+     cluster_vectors = tfidf_vectorizer.transform(cluster_docs)
+     cluster_mean = np.mean(cluster_vectors.toarray(), axis=0)
+     top_indices = np.argsort(cluster_mean)[-n_words:][::-1]
+     return [feature_names[i] for i in top_indices]
+
+
+ def main():
+     st.set_page_config(page_title="Text Clustering Analysis", layout="wide")
+     st.title("Text clustering analysis")
+
+     # Load data and models
+     with st.spinner("Loading models and data..."):
+         models = load_models()
+         corpus = load_corpus()
+
+     st.sidebar.header("Clustering settings")
+
+     # How many documents to analyze
+     sample_size = st.sidebar.slider(
+         "Number of documents to analyze",
+         min_value=100,
+         max_value=len(corpus),
+         value=min(1000, len(corpus)),
+         step=100
+     )
+
+     corpus_sample = corpus[:sample_size]
+
+     # Method selection
+     tokenization_method = st.sidebar.selectbox(
+         "Tokenization method",
+         ["Naive (whitespace)", "Regex", "BPE"]
+     )
+
+     vectorization_method = st.sidebar.selectbox(
+         "Vectorization method",
+         ["TF-IDF", "Word2Vec", "FastText"]
+     )
+
+     clustering_method = st.sidebar.selectbox(
+         "Clustering algorithm",
+         ["K-Means", "Mini-Batch K-Means", "Agglomerative", "Spectral", "HDBSCAN"]
+     )
+
+     n_clusters = st.sidebar.slider(
+         "Number of clusters",
+         min_value=2,
+         max_value=10,
+         value=5,
+         step=1
+     )
+
+     # Run button
+     if st.sidebar.button("Run clustering"):
+         with st.spinner("Vectorizing and clustering..."):
+             # Vectorization
+             embeddings, tokens_list, sentences, vectorizer_or_model = vectorize_text(
+                 corpus_sample, tokenization_method, vectorization_method, models
+             )
+
+             # Clustering
+             if clustering_method == "K-Means":
+                 labels = k_means(embeddings, k=n_clusters)
+             elif clustering_method == "Mini-Batch K-Means":
+                 labels = mini_batch_means(embeddings, n_clusters=n_clusters)
+             elif clustering_method == "Agglomerative":
+                 labels = agglomerative_clustering(embeddings, n_clusters=n_clusters)
+             elif clustering_method == "Spectral":
+                 labels = spectral_clustering(embeddings, n_clusters=n_clusters)
+             else:  # HDBSCAN ignores the cluster-count slider
+                 labels = use_hdbscan(embeddings)
+
+             # Metrics
+             metrics = get_metrics(embeddings, labels)
+
+             # Visualization
+             st.header("Clustering results")
+
+             # Metrics (a score of 0.0 is valid, hence the explicit None checks)
+             col1, col2, col3 = st.columns(3)
+             with col1:
+                 st.metric("Silhouette Score",
+                           f"{metrics['silhouette']:.3f}" if metrics['silhouette'] is not None else "N/A")
+             with col2:
+                 st.metric("Calinski-Harabasz",
+                           f"{metrics['calinski_harabasz']:.3f}" if metrics['calinski_harabasz'] is not None else "N/A")
+             with col3:
+                 st.metric("Davies-Bouldin",
+                           f"{metrics['davies_bouldin']:.3f}" if metrics['davies_bouldin'] is not None else "N/A")
+
+             # Cluster visualization
+             st.subheader("Cluster visualization")
+
+             # Reduce dimensionality for plotting
+             pca = PCA(n_components=2)
+             embeddings_2d = pca.fit_transform(embeddings)
+
+             viz_df = pd.DataFrame({
+                 'x': embeddings_2d[:, 0],
+                 'y': embeddings_2d[:, 1],
+                 'cluster': labels.astype(str),  # string labels give a discrete color palette
+                 'text': corpus_sample
+             })
+
+             fig = px.scatter(viz_df, x='x', y='y', color='cluster',
+                              hover_data=['text'], title="PCA projection of the clusters")
+             st.plotly_chart(fig, use_container_width=True)
+
+             # Per-cluster analysis
+             st.subheader("Per-cluster analysis")
+
+             unique_clusters = np.unique(labels)
+
+             for cluster_id in unique_clusters:
+                 if cluster_id == -1:  # skip HDBSCAN noise points
+                     continue
+
+                 cluster_mask = labels == cluster_id
+                 cluster_docs = [corpus_sample[i] for i in range(len(corpus_sample)) if cluster_mask[i]]
+                 cluster_size = len(cluster_docs)
+
+                 with st.expander(f"Cluster {cluster_id} (size: {cluster_size})"):
+                     # Top words for TF-IDF
+                     if vectorization_method == "TF-IDF":
+                         st.write("**Top 10 characteristic words:**")
+                         cluster_sentences = [sentences[i] for i in range(len(sentences)) if cluster_mask[i]]
+                         if len(cluster_sentences) > 0:
+                             feature_names = vectorizer_or_model.get_feature_names_out()
+                             top_words = get_top_tfidf_words(vectorizer_or_model, cluster_sentences, feature_names)
+                             for word in top_words:
+                                 st.write(f"- {word}")
+
+                     # Nearest words for embedding models
+                     elif vectorization_method in ["Word2Vec", "FastText"]:
+                         st.write("**Words closest to the centroid:**")
+                         cluster_embeddings = embeddings[cluster_mask]
+                         centroid = np.mean(cluster_embeddings, axis=0)
+                         try:
+                             similar_words = vectorizer_or_model.wv.most_similar(positive=[centroid], topn=10)
+                             for word, similarity in similar_words:
+                                 st.write(f"- {word} (similarity: {similarity:.3f})")
+                         except Exception as e:
+                             st.error(f"Error while looking up similar words: {e}")
+
+             # Overall statistics
+             st.subheader("Cluster statistics")
+             cluster_stats = pd.DataFrame({
+                 'Cluster': labels,
+                 'Count': 1
+             }).groupby('Cluster').count().reset_index()
+
+             fig_bar = px.bar(cluster_stats, x='Cluster', y='Count',
+                              title="Document distribution across clusters")
+             st.plotly_chart(fig_bar, use_container_width=True)
+
+
+ if __name__ == "__main__":
+     main()
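Stripped of the Streamlit layer, the pipeline this commit wires together is tokenize → vectorize → cluster → score. A headless sketch of one pass through it (assumes the artifact files above are present and the working directory is src/; the four toy documents are made up and may fall outside the fitted vectorizer's vocabulary):

from Tokenizer import tokenize_regex
from Vectorization import create_tfidf
from Clustering import k_means
from sklearn.metrics import silhouette_score

tfidf, _ = create_tfidf()
docs = ["первый текст", "второй текст", "третий текст", "четвёртый текст"]
sentences = [" ".join(tokenize_regex(d)) for d in docs]
X = tfidf(sentences)                  # dense (4, vocabulary size) matrix
labels = k_means(X, k=2)
print(silhouette_score(X, labels))    # higher is better; Davies-Bouldin is lower-is-better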
src/tfidf_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:538c98fc42664c81287834754e6ee18e1d0a54a6f80f47317d5f5de97e908ee8
+ size 597203
src/word2vec.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb7b5683021876c5ccf555eebfa2e742edab15d7aaee34efa8160a5c6f23d98b
+ size 3768676