strongeryongchao committed on
Commit
c45bf71
·
verified ·
1 Parent(s): 1a44012

Upload summary_utils.py

Browse files
Files changed (1) hide show
  1. summary_utils.py +29 -0
summary_utils.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.feature_extraction.text import TfidfVectorizer
2
+ import numpy as np
3
+ import jieba.analyse
4
+
5
def extract_keywords_per_cluster(sentences, labels, top_k=5):
    """Return the highest-scoring TF-IDF terms for each cluster.

    A separate ``TfidfVectorizer`` (capped at 1000 features) is fitted on the
    sentences of each cluster, and terms are ranked by their mean TF-IDF
    weight across that cluster's sentences.

    Args:
        sentences: Iterable of sentence strings, aligned with ``labels``.
        labels: Iterable of hashable cluster labels, one per sentence. The
            label ``-1`` is skipped (presumably the noise label of a
            density-based clusterer -- confirm against the caller).
        top_k: Maximum number of terms to keep per cluster.

    Returns:
        dict mapping each non ``-1`` label to a list of at most ``top_k``
        terms, best first. A cluster from which no vocabulary can be built
        maps to an empty list.

    NOTE(review): the vectorizer uses scikit-learn's default tokenizer, which
    splits on word boundaries only. Unsegmented Chinese sentences would
    therefore be treated as very long single tokens; pre-segment the input
    (e.g. with jieba, space-joined) if word-level keywords are expected.
    """
    clusters = set(labels)

    # Group sentences in a single pass instead of rescanning the full input
    # once per cluster.
    grouped = {}
    for sentence, label in zip(sentences, labels):
        grouped.setdefault(label, []).append(sentence)

    cluster_keywords = {}
    for c in clusters:
        if c == -1:
            continue
        cluster_sents = grouped.get(c, [])
        vectorizer = TfidfVectorizer(max_features=1000)
        try:
            tfidf_matrix = vectorizer.fit_transform(cluster_sents)
        except ValueError:
            # scikit-learn raises ValueError when the documents yield an
            # empty vocabulary (no sentences, or no token survives
            # tokenisation). One such cluster must not abort the others.
            cluster_keywords[c] = []
            continue
        # Mean weight of every term over the cluster's sentences.
        scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
        ranked = np.argsort(scores)[::-1][:top_k]
        features = np.asarray(vectorizer.get_feature_names_out())
        cluster_keywords[c] = features[ranked].tolist()
    return cluster_keywords
18
+
19
def summarize_per_cluster(sentences, labels, top_k=3):
    """Return TextRank keywords describing each cluster.

    The sentences of a cluster are joined with the full stop "。" into one
    text, which is passed to ``jieba.analyse.textrank``. Despite the function
    name, the result per cluster is a list of keywords, not summary
    sentences.

    Args:
        sentences: Iterable of sentence strings, aligned with ``labels``.
        labels: Iterable of hashable cluster labels, one per sentence. The
            label ``-1`` is skipped (presumably the noise label of a
            density-based clusterer -- confirm against the caller).
        top_k: Passed to TextRank as ``topK``, the number of keywords
            requested per cluster.

    Returns:
        dict mapping each non ``-1`` label to the list of keywords returned
        by TextRank for that cluster's joined text.
    """
    clusters = set(labels)

    # Group sentences in a single pass instead of rescanning the full input
    # once per cluster.
    grouped = {}
    for sentence, label in zip(sentences, labels):
        grouped.setdefault(label, []).append(sentence)

    cluster_summaries = {}
    for c in clusters:
        if c == -1:
            continue
        text = "。".join(grouped.get(c, []))
        keywords = jieba.analyse.textrank(text, topK=top_k, withWeight=False)
        cluster_summaries[c] = list(keywords)
    return cluster_summaries