degregor69 committed on
Commit
91040aa
·
1 Parent(s): da3cfc3

first commit

Browse files
app.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
+
+ import gradio as gr
+ import joblib
+ import nltk
+ from bs4 import BeautifulSoup
5
+
6
+ classes = ['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
7
+ 'ios', 'iphone', 'java', 'javascript', 'jquery', 'json', 'linux',
8
+ 'mysql', 'node.js', 'objective-c', 'performance', 'php', 'python',
9
+ 'reactjs', 'ruby-on-rails', 'spring', 'sql', 'sql-server', 'swift',
10
+ 'unit-testing', 'windows', 'xcode']
11
+
12
+ pa_classifier = joblib.load("passive_aggressive_classifier.pkl")
13
+ tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")
14
+
15
+ def run(text: str):
16
+ text = clean_input_string(text)
17
+ vectors = get_tf_idf_vectors([text])
18
+ prediction_list = pa_classifier.predict(vectors)[0]
19
+ predicted_classes = [classes[i] for i, pred in enumerate(prediction_list) if pred == 1]
20
+ if predicted_classes:
21
+ return " ".join(predicted_classes)
22
+ return "Pas de thème retrouvé."
23
+
24
+
25
+
26
+ def get_tf_idf_vectors(x_as_string):
27
+ return tfidf_vectorizer.transform(x_as_string)
28
+ def clean_input_string(text: str) -> str:
29
+ text = remove_escape_sequences(text)
30
+ text = remove_html_tags(text)
31
+ text_as_list = remove_punctuation(text)
32
+ text_as_list = remove_stopwords(text_as_list)
33
+ text_as_list = lemmatize(text_as_list)
34
+ return " ".join(text_as_list)
35
+
36
+
37
+ def remove_escape_sequences(text: str):
38
+ return text.encode('unicode-escape').decode('utf-8').replace('\\n', '').replace('\\t', '').replace('\\r', '').replace('\\\\', '').replace('\\\'', '').replace('\"', '')
39
+
40
+
41
+ def remove_html_tags(text: str):
42
+ soup = BeautifulSoup(text, 'html.parser')
43
+ return soup.get_text()
44
+
45
+
46
+ def remove_punctuation(text: str):
47
+ tokenizer = nltk.RegexpTokenizer(r'[a-zA-Z]+')
48
+ text = text.lower()
49
+ return tokenizer.tokenize(text)
50
+
51
+
52
+
53
+ def remove_stopwords(words_list: list):
54
+ sw = set()
55
+ sw.update(tuple(nltk.corpus.stopwords.words('english')))
56
+ return [word for word in words_list if word not in sw]
57
+
58
+
59
+ def lemmatize(words_list: list):
60
+ lemmatizer = nltk.WordNetLemmatizer()
61
+ return [lemmatizer.lemmatize(word) for word in words_list]
62
+
63
+
64
+ def greet(name, intensity):
65
+ return "Hello, " + name + "!" * int(intensity)
66
+
67
+ demo = gr.Interface(
68
+ fn=run,
69
+ inputs=["text"],
70
+ outputs=["text"],
71
+ )
72
+
73
+ demo.launch()
passive_aggressive_classifier.pkl ADDED
Binary file (483 kB). View file
 
tfidf_vectorizer.pkl ADDED
Binary file (56.1 kB). View file