Upload 8 files
Browse files- src/Clustering.py +28 -0
- src/Tokenizer.py +17 -0
- src/Vectorization.py +26 -0
- src/fasttext.model +3 -0
- src/processed_corpus.jsonl +0 -0
- src/streamlit_app.py +259 -37
- src/tfidf_vectorizer.pkl +3 -0
- src/word2vec.model +3 -0
src/Clustering.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, SpectralClustering
|
| 2 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 3 |
+
import hdbscan
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def k_means(docs, k=5):
    """Cluster document vectors with K-Means.

    Parameters
    ----------
    docs : array-like of shape (n_samples, n_features)
        Document embeddings.
    k : int, default 5
        Number of clusters.

    Returns
    -------
    ndarray of int cluster labels, one per document.
    """
    # Pin random_state for reproducible results, consistent with the other
    # helpers in this module (mini_batch_means, spectral_clustering).
    return KMeans(n_clusters=k, random_state=42).fit_predict(docs)
| 8 |
+
|
| 9 |
+
def mini_batch_means(docs, n_clusters):
    """Cluster document vectors with Mini-Batch K-Means (batch size 256)."""
    clusterer = MiniBatchKMeans(
        n_clusters=n_clusters,
        batch_size=256,
        random_state=42,
    )
    return clusterer.fit_predict(docs)
| 12 |
+
|
| 13 |
+
def use_hdbscan(docs):
    """Cluster document vectors with HDBSCAN.

    Noise points receive the label -1; the number of clusters is chosen
    by the algorithm itself.
    """
    clusterer = hdbscan.HDBSCAN(min_cluster_size=3, metric="euclidean")
    return clusterer.fit_predict(docs)
| 16 |
+
|
| 17 |
+
def agglomerative_clustering(docs, n_clusters=5):
    """Agglomerative clustering with average linkage over cosine distance."""
    clusterer = AgglomerativeClustering(
        n_clusters=n_clusters,
        metric="cosine",
        linkage="average",
    )
    return clusterer.fit_predict(docs)
| 20 |
+
|
| 21 |
+
def spectral_clustering(docs, n_clusters=5):
    """Spectral clustering on a precomputed cosine-similarity affinity matrix."""
    # Build the affinity matrix ourselves so cosine similarity is used
    # instead of SpectralClustering's default RBF kernel.
    affinity_matrix = cosine_similarity(docs)
    clusterer = SpectralClustering(
        n_clusters=n_clusters,
        affinity='precomputed',
        random_state=42,
    )
    return clusterer.fit_predict(affinity_matrix)
src/Tokenizer.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tokenizers import Tokenizer
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
def create_bpe():
    """Return a callable that BPE-tokenizes text.

    Loads a pretrained 16k-vocabulary Russian BPE tokenizer from the
    Hugging Face hub once; the returned closure reuses it on every call.
    """
    tokenizer = Tokenizer.from_pretrained("Shu-vi/Russian_BPE_Tokenizer_16k")

    def encode(text: str):
        return tokenizer.encode(text).tokens

    return encode
| 9 |
+
|
| 10 |
+
def tokenize_naive(text: str):
    """Tokenize by whitespace, stripping punctuation from token edges.

    Inner punctuation (e.g. hyphens in compound words) is preserved; tokens
    consisting solely of punctuation are dropped.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list[str] of tokens.
    """
    # Characters stripped from both ends of each whitespace-separated chunk.
    # Defined once: the original duplicated this literal and ran strip() twice
    # per token (once for the filter, once for the value).
    edge_punct = "«»()[]{}.,:;!?\"'“”—–…"
    tokens = []
    for part in text.split():
        stripped = part.strip(edge_punct)
        if stripped:
            tokens.append(stripped)
    return tokens
| 15 |
+
|
| 16 |
+
def tokenize_regex(text: str):
    """Extract word tokens: runs of Latin/Cyrillic letters, optionally joined
    by internal hyphens or apostrophes (e.g. "co-op", "don't").

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list[str] of matched tokens, in order of appearance.
    """
    # re.findall uses the re module's pattern cache, so the pattern is not
    # recompiled per call (the original built a fresh compile each time).
    # flags=re.UNICODE was dropped: it is the default for str patterns.
    return re.findall(r"[A-Za-zА-Яа-яЁё]+(?:[-'][A-Za-zА-Яа-яЁё]+)*", text)
src/Vectorization.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from gensim.models import Word2Vec, FastText
|
| 2 |
+
import joblib
|
| 3 |
+
|
| 4 |
+
def create_tfidf():
    """Load the fitted TF-IDF vectorizer and return (transform_fn, vectorizer).

    The raw vectorizer is returned alongside the closure because callers
    need it for feature names / per-cluster top words.
    """
    vectorizer = joblib.load("tfidf_vectorizer.pkl")

    def transform(docs):
        # Dense array so downstream clustering code can consume it directly.
        return vectorizer.transform(docs).toarray()

    return transform, vectorizer
| 9 |
+
|
| 10 |
+
def create_w2v():
    """Load the trained Word2Vec model and return (lookup_fn, model).

    The lookup closure maps a token to its embedding vector, or None for
    out-of-vocabulary tokens.
    """
    model = Word2Vec.load("./word2vec.model")

    def lookup(word):
        return model.wv[word] if word in model.wv else None

    return lookup, model
| 18 |
+
|
| 19 |
+
def create_fasttext():
    """Load the trained FastText model and return (lookup_fn, model).

    Unlike Word2Vec, FastText can synthesize vectors for out-of-vocabulary
    words from character n-grams. The previous `word in model.wv` guard
    restricted lookups to the trained vocabulary and threw that capability
    away; we now index unconditionally and only fall back to None when the
    model genuinely cannot produce a vector (e.g. an empty token).
    """
    model = FastText.load("./fasttext.model")

    def lookup(word):
        try:
            return model.wv[word]
        except KeyError:
            # No usable character n-grams for this token — treat as missing,
            # matching the Word2Vec helper's contract.
            return None

    return lookup, model
src/fasttext.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f5758a85c3e61d5a58795a2fe3028b962f84468fb915491bd67b05260ce175b
|
| 3 |
+
size 3768810
|
src/processed_corpus.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,262 @@
|
|
| 1 |
-
import
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import pandas as pd
|
| 4 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
""
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
|
| 10 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
| 11 |
-
forums](https://discuss.streamlit.io).
|
| 12 |
-
|
| 13 |
-
In the meantime, below is an example of what you can do with just a few lines of code:
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
|
| 17 |
-
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
|
| 18 |
-
|
| 19 |
-
indices = np.linspace(0, 1, num_points)
|
| 20 |
-
theta = 2 * np.pi * num_turns * indices
|
| 21 |
-
radius = indices
|
| 22 |
-
|
| 23 |
-
x = radius * np.cos(theta)
|
| 24 |
-
y = radius * np.sin(theta)
|
| 25 |
-
|
| 26 |
-
df = pd.DataFrame({
|
| 27 |
-
"x": x,
|
| 28 |
-
"y": y,
|
| 29 |
-
"idx": indices,
|
| 30 |
-
"rand": np.random.randn(num_points),
|
| 31 |
-
})
|
| 32 |
-
|
| 33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
| 34 |
-
.mark_point(filled=True)
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import json
|
| 3 |
import numpy as np
|
| 4 |
import pandas as pd
|
| 5 |
+
import plotly.express as px
|
| 6 |
+
from sklearn.decomposition import PCA
|
| 7 |
+
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
|
| 8 |
+
|
| 9 |
+
from Vectorization import create_tfidf, create_w2v, create_fasttext
|
| 10 |
+
from Tokenizer import create_bpe, tokenize_naive, tokenize_regex
|
| 11 |
+
from Clustering import k_means, agglomerative_clustering, spectral_clustering, mini_batch_means, use_hdbscan
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Загрузка моделей один раз при запуске
|
| 15 |
+
# Load all models once per process; streamlit shares them across reruns.
@st.cache_resource
def load_models():
    """Load the BPE tokenizer and all vectorization models (cached)."""
    bpe = create_bpe()
    tfidf_func, tfidf_vectorizer = create_tfidf()
    w2v_func, w2v_model = create_w2v()
    fasttext_func, fasttext_model = create_fasttext()
    return (
        bpe,
        tfidf_func,
        tfidf_vectorizer,
        w2v_func,
        w2v_model,
        fasttext_func,
        fasttext_model,
    )
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@st.cache_data
def load_corpus():
    """Read document texts from the processed JSONL corpus (cached).

    Lines without a "text" key are skipped.
    """
    texts = []
    with open("processed_corpus.jsonl", "r", encoding="utf-8") as infile:
        for raw_line in infile:
            record = json.loads(raw_line)
            if "text" in record:
                texts.append(record["text"])
    return texts
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def get_metrics(X_emb, labels):
    """Compute clustering quality metrics.

    Parameters
    ----------
    X_emb : array-like of shape (n_samples, n_features)
        Document embeddings.
    labels : array-like of int
        Cluster label per document.

    Returns
    -------
    dict with keys 'silhouette', 'calinski_harabasz', 'davies_bouldin';
    a value is None when the metric cannot be computed (e.g. fewer than
    two distinct clusters).
    """
    scorers = {
        'silhouette': silhouette_score,
        'calinski_harabasz': calinski_harabasz_score,
        'davies_bouldin': davies_bouldin_score,
    }
    metrics = {}
    for name, scorer in scorers.items():
        try:
            metrics[name] = scorer(X_emb, labels)
        except ValueError:
            # sklearn raises ValueError for degenerate labellings (a single
            # cluster, or all points labelled as noise). The original bare
            # `except:` also swallowed KeyboardInterrupt and genuine bugs.
            metrics[name] = None
    return metrics
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _mean_token_embedding(tokens_list, lookup, dim):
    """Average per-token vectors into one vector per document.

    Documents where no token yields a vector get a zero vector of length
    ``dim`` so the output stays rectangular.
    """
    doc_vectors = []
    for tokens in tokens_list:
        token_vecs = [vec for vec in map(lookup, tokens) if vec is not None]
        if token_vecs:
            doc_vectors.append(np.mean(token_vecs, axis=0))
        else:
            doc_vectors.append(np.zeros(dim))
    return np.array(doc_vectors)


def vectorize_text(corpus, tokenization_method, vectorization_method, models):
    """Tokenize and vectorize a corpus with the selected methods.

    Returns
    -------
    (embeddings, tokens_list, sentences, vectorizer_or_model) where the last
    element is the TF-IDF vectorizer or the gensim model — the UI needs it
    for top-word / nearest-neighbour lookups.
    """
    bpe, tfidf_func, tfidf_vectorizer, w2v_func, w2v_model, fasttext_func, fasttext_model = models

    # --- Tokenization ---
    if tokenization_method == "Naive (whitespace)":
        tokens_list = [tokenize_naive(text) for text in corpus]
    elif tokenization_method == "Regex":
        tokens_list = [tokenize_regex(text) for text in corpus]
    else:  # BPE
        tokens_list = [bpe(text) for text in corpus]
    # TF-IDF expects raw strings, so re-join the tokens with spaces.
    sentences = [" ".join(tokens) for tokens in tokens_list]

    # --- Vectorization ---
    if vectorization_method == "TF-IDF":
        embeddings = tfidf_func(sentences)
        return embeddings, tokens_list, sentences, tfidf_vectorizer

    if vectorization_method == "Word2Vec":
        # Take the dimensionality from the model instead of the previous
        # hard-coded 300, so a differently-sized model cannot silently mix
        # vector lengths for empty documents.
        dim = w2v_model.wv.vector_size
        embeddings = _mean_token_embedding(tokens_list, w2v_func, dim)
        return embeddings, tokens_list, sentences, w2v_model

    # FastText (default branch)
    dim = fasttext_model.wv.vector_size
    embeddings = _mean_token_embedding(tokens_list, fasttext_func, dim)
    return embeddings, tokens_list, sentences, fasttext_model
| 99 |
+
|
| 100 |
+
def get_top_tfidf_words(tfidf_vectorizer, cluster_docs, feature_names, n_words=10):
    """Return the n_words terms with the highest mean TF-IDF weight
    over the given documents, highest first."""
    doc_matrix = tfidf_vectorizer.transform(cluster_docs).toarray()
    mean_weights = np.mean(doc_matrix, axis=0)
    # Indices sorted by descending mean weight, truncated to n_words.
    ranked = np.argsort(mean_weights)[::-1][:n_words]
    return [feature_names[idx] for idx in ranked]
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def main():
    """Streamlit entry point: configure the sidebar, run the selected
    tokenization / vectorization / clustering pipeline on demand, and
    render metrics, a 2-D PCA scatter, per-cluster summaries and a
    cluster-size bar chart."""
    st.set_page_config(page_title="Text Clustering Analysis", layout="wide")
    st.title("Анализ кластеризации текстов")

    # Load data and models (both cached across reruns).
    with st.spinner("Загрузка моделей и данных..."):
        models = load_models()
        corpus = load_corpus()

    st.sidebar.header("Настройки кластеризации")

    # How many documents to analyse.
    sample_size = st.sidebar.slider(
        "Количество документов для анализа",
        min_value=100,
        max_value=len(corpus),
        value=min(1000, len(corpus)),
        step=100
    )

    corpus_sample = corpus[:sample_size]

    # Pipeline choices.
    tokenization_method = st.sidebar.selectbox(
        "Метод токенизации",
        ["Naive (whitespace)", "Regex", "BPE"]
    )

    vectorization_method = st.sidebar.selectbox(
        "Метод векторизации",
        ["TF-IDF", "Word2Vec", "FastText"]
    )

    clustering_method = st.sidebar.selectbox(
        "Алгоритм кластеризации",
        ["K-Means", "Mini-Batch K-Means", "Agglomerative", "Spectral", "HDBSCAN"]
    )

    n_clusters = st.sidebar.slider(
        "Количество кластеров",
        min_value=2,
        max_value=10,
        value=5,
        step=1
    )

    # Run the full pipeline only on explicit request.
    if st.sidebar.button("Запустить кластеризацию"):
        with st.spinner("Выполняется векторизация и кластеризация..."):
            # Vectorization.
            embeddings, tokens_list, sentences, vectorizer_or_model = vectorize_text(
                corpus_sample, tokenization_method, vectorization_method, models
            )

            # Clustering.
            if clustering_method == "K-Means":
                labels = k_means(embeddings, k=n_clusters)
            elif clustering_method == "Mini-Batch K-Means":
                labels = mini_batch_means(embeddings, n_clusters=n_clusters)
            elif clustering_method == "Agglomerative":
                labels = agglomerative_clustering(embeddings, n_clusters=n_clusters)
            elif clustering_method == "Spectral":
                labels = spectral_clustering(embeddings, n_clusters=n_clusters)
            else:  # HDBSCAN chooses its own cluster count; n_clusters unused
                labels = use_hdbscan(embeddings)

            # Quality metrics (None when a metric is undefined).
            metrics = get_metrics(embeddings, labels)

            st.header("Результаты кластеризации")

            # Metric cards. NOTE: compare against None, not truthiness —
            # a legitimate score of 0.0 must not render as "N/A".
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Silhouette Score",
                          f"{metrics['silhouette']:.3f}" if metrics['silhouette'] is not None else "N/A")
            with col2:
                st.metric("Calinski-Harabasz",
                          f"{metrics['calinski_harabasz']:.3f}" if metrics['calinski_harabasz'] is not None else "N/A")
            with col3:
                st.metric("Davies-Bouldin",
                          f"{metrics['davies_bouldin']:.3f}" if metrics['davies_bouldin'] is not None else "N/A")

            # Cluster visualization.
            st.subheader("Визуализация кластеров")

            # Reduce to 2-D for plotting.
            pca = PCA(n_components=2)
            embeddings_2d = pca.fit_transform(embeddings)

            viz_df = pd.DataFrame({
                'x': embeddings_2d[:, 0],
                'y': embeddings_2d[:, 1],
                'cluster': labels,
                'text': corpus_sample
            })

            fig = px.scatter(viz_df, x='x', y='y', color='cluster',
                             hover_data=['text'], title="PCA визуализация кластеров")
            st.plotly_chart(fig, use_container_width=True)

            # Per-cluster analysis.
            st.subheader("Анализ по кластерам")

            unique_clusters = np.unique(labels)

            for cluster_id in unique_clusters:
                # -1 is HDBSCAN's noise label; skip it.
                if cluster_id == -1:
                    continue

                cluster_mask = labels == cluster_id
                cluster_docs = [corpus_sample[i] for i in range(len(corpus_sample)) if cluster_mask[i]]
                cluster_size = len(cluster_docs)

                with st.expander(f"Кластер {cluster_id} (размер: {cluster_size})"):
                    # Top TF-IDF words for this cluster.
                    if vectorization_method == "TF-IDF":
                        st.write("**Топ-10 характерных слов:**")
                        cluster_sentences = [sentences[i] for i in range(len(sentences)) if cluster_mask[i]]
                        if len(cluster_sentences) > 0:
                            cluster_vectors = vectorizer_or_model.transform(cluster_sentences)
                            cluster_mean = np.mean(cluster_vectors.toarray(), axis=0)
                            feature_names = vectorizer_or_model.get_feature_names_out()
                            top_indices = np.argsort(cluster_mean)[-10:][::-1]
                            top_words = [feature_names[i] for i in top_indices]
                            for word in top_words:
                                st.write(f"- {word}")

                    # Nearest words to the cluster centroid for embeddings.
                    elif vectorization_method in ["Word2Vec", "FastText"]:
                        st.write("**Ближайшие слова к центроиду:**")
                        cluster_embeddings = embeddings[cluster_mask]
                        centroid = np.mean(cluster_embeddings, axis=0)
                        try:
                            similar_words = vectorizer_or_model.wv.most_similar(positive=[centroid], topn=10)
                            for word, similarity in similar_words:
                                st.write(f"- {word} (сходство: {similarity:.3f})")
                        except Exception as e:
                            st.error(f"Ошибка при поиске похожих слов: {e}")

            # Overall cluster-size statistics.
            st.subheader("Статистика кластеров")
            cluster_stats = pd.DataFrame({
                'Cluster': labels,
                'Count': 1
            }).groupby('Cluster').count().reset_index()

            fig_bar = px.bar(cluster_stats, x='Cluster', y='Count',
                             title="Распределение документов по кластерам")
            st.plotly_chart(fig_bar, use_container_width=True)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/tfidf_vectorizer.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:538c98fc42664c81287834754e6ee18e1d0a54a6f80f47317d5f5de97e908ee8
|
| 3 |
+
size 597203
|
src/word2vec.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb7b5683021876c5ccf555eebfa2e742edab15d7aaee34efa8160a5c6f23d98b
|
| 3 |
+
size 3768676
|