evannh committed on
Commit
08aea01
·
verified ·
1 Parent(s): f520a4e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import nltk
4
+ from nltk.corpus import stopwords
5
+ from umap import UMAP
6
+ from hdbscan import HDBSCAN
7
+ from sentence_transformers import SentenceTransformer
8
+ from sklearn.feature_extraction.text import CountVectorizer
9
+ from bertopic import BERTopic
10
+ from bertopic.representation import MaximalMarginalRelevance
11
+ from bertopic.vectorizers import ClassTfidfTransformer
12
+
13
# Custom stop words appended on top of the standard English list.
custom_stopwords = ["made", "sure"]

# Load the NLTK English stop words; if the corpus is not installed yet,
# download it once and retry.
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = stopwords.words('english')

stop_words += custom_stopwords
23
+
24
# Custom BERTopic pipeline
def generate_topics(file):
    """Fit a BERTopic model on an uploaded CSV and return its document map.

    The CSV must contain a ``text`` column. Documents are embedded once with
    a sentence-transformer model; the same embeddings are reused for fitting
    and for the final visualization.

    Args:
        file: Value supplied by the ``gr.File`` input. Either a path string
            or an object exposing the path as ``.name`` (which form is used
            depends on the Gradio version / ``type`` setting).

    Returns:
        The figure produced by ``BERTopic.visualize_documents``.

    Raises:
        gr.Error: If no file was uploaded, the CSV has no ``text`` column,
            or the column contains no usable rows.
    """
    if file is None:
        raise gr.Error("Veuillez téléverser un fichier CSV.")

    # Accept both a plain path string and a temp-file wrapper with `.name`.
    csv_path = getattr(file, "name", file)
    docs = pd.read_csv(csv_path)
    if "text" not in docs.columns:
        raise gr.Error("Le fichier CSV doit contenir une colonne 'text'.")

    # Drop missing cells and force str so the encoder never receives NaN.
    texts = docs["text"].dropna().astype(str).tolist()
    if not texts:
        raise gr.Error("La colonne 'text' ne contient aucun document.")

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedding_model.encode(texts, show_progress_bar=True)

    umap_model = UMAP(
        n_neighbors=20,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    )

    hdbscan_model = HDBSCAN(
        min_cluster_size=60,
        min_samples=1,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    )

    vectorizer_model = CountVectorizer(
        stop_words=stop_words,
        min_df=1,
        ngram_range=(1, 3),
    )

    ctfidf_model = ClassTfidfTransformer()
    representation_model = MaximalMarginalRelevance(diversity=0.7)

    BERT_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model,
        verbose=True,
    )

    # Reuse the precomputed embeddings instead of encoding the corpus again.
    topics, _ = BERT_model.fit_transform(texts, embeddings=embeddings)

    # reduce_outliers only *returns* the new assignments; the model keeps the
    # old ones until they are pushed back through update_topics. Skip the step
    # when there is nothing to reassign (no outliers) or nothing to reassign
    # to (every document is an outlier).
    if -1 in topics and len(set(topics)) > 1:
        new_topics = BERT_model.reduce_outliers(texts, topics)
        BERT_model.update_topics(
            texts,
            topics=new_topics,
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model,
            representation_model=representation_model,
        )

    fig = BERT_model.visualize_documents(texts, embeddings=embeddings)
    return fig
71
+
72
# Gradio interface: a CSV upload goes in, the BERTopic document map comes out.
demo = gr.Interface(
    title="Topic Modeling avec BERTopic",
    description="Téléversez un fichier CSV avec une colonne 'text' pour générer une visualisation thématique interactive.",
    fn=generate_topics,
    inputs=gr.File(label="Upload bbc-text.csv"),
    outputs=gr.Plot(label="Topic Map"),
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()