Spaces:
Runtime error
Runtime error
Commit ·
91040aa
1
Parent(s): da3cfc3
first commit
Browse files- app.py +73 -0
- passive_aggressive_classifier.pkl +0 -0
- tfidf_vectorizer.pkl +0 -0
app.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import joblib
|
| 4 |
+
import gradio as gr
|
| 5 |
+
|
| 6 |
+
# Tag labels the classifier can emit. `run` looks labels up by position, so
# the order of this list must stay aligned with the prediction columns.
classes = [
    '.net', 'android', 'asp.net', 'c', 'c#',
    'c++', 'css', 'html', 'ios', 'iphone',
    'java', 'javascript', 'jquery', 'json', 'linux',
    'mysql', 'node.js', 'objective-c', 'performance', 'php',
    'python', 'reactjs', 'ruby-on-rails', 'spring', 'sql',
    'sql-server', 'swift', 'unit-testing', 'windows', 'xcode',
]
|
| 11 |
+
|
| 12 |
+
# Load the two serialized artifacts once, at import time. Both paths are
# relative, so they are resolved against the current working directory.
# NOTE(review): joblib.load unpickles its input; only load trusted files.
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")
pa_classifier = joblib.load("passive_aggressive_classifier.pkl")
|
| 14 |
+
|
| 15 |
+
def run(text: str):
    """Predict the tags for a piece of free text.

    The text is cleaned, vectorized with the loaded TF-IDF vectorizer and
    passed to the classifier. Every position of the first prediction row
    that equals 1 is mapped to the label at the same index in ``classes``.

    Returns the matching labels joined by single spaces, or a fallback
    message (in French) when no label is predicted.
    """
    cleaned = clean_input_string(text)
    features = get_tf_idf_vectors([cleaned])
    # A single document goes in, so only the first prediction row matters.
    first_row = pa_classifier.predict(features)[0]

    matched = []
    for index, flag in enumerate(first_row):
        if flag == 1:
            matched.append(classes[index])

    if not matched:
        return "Pas de thème retrouvé."
    return " ".join(matched)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def get_tf_idf_vectors(x_as_string):
    """Transform the given documents with the module-level ``tfidf_vectorizer``.

    ``run`` calls this with a one-element list holding the cleaned text.
    """
    vectors = tfidf_vectorizer.transform(x_as_string)
    return vectors
|
| 28 |
+
def clean_input_string(text: str) -> str:
    """Run the full cleaning pipeline and return the tokens joined by spaces.

    Steps, in order: strip escape sequences, strip HTML tags, tokenize
    (which also lower-cases), drop stop words, lemmatize.
    """
    stripped = remove_html_tags(remove_escape_sequences(text))
    tokens = remove_punctuation(stripped)
    # The remaining steps each take and return a list of tokens.
    for step in (remove_stopwords, lemmatize):
        tokens = step(tokens)
    return " ".join(tokens)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def remove_escape_sequences(text: str):
    """Strip line-break/tab escapes, backslashes and double quotes from ``text``.

    Real newline, tab and carriage-return characters, as well as their
    literal two-character spellings (a backslash followed by ``n``, ``t`` or
    ``r``), are replaced by a single space so that the words on either side
    remain separate tokens. Any remaining backslashes and all double quotes
    are then removed. Non-ASCII characters pass through unchanged.

    NOTE(review): replacing line breaks with a space (rather than nothing)
    and keeping non-ASCII characters intact changes the cleaned text for
    some inputs; confirm this matches how the vectorizer's training text
    was cleaned.
    """
    # Literal spellings are handled before the bare backslash removal below,
    # otherwise the trailing letter would be left glued to the next word.
    for sequence in ('\\n', '\\t', '\\r', '\n', '\t', '\r'):
        text = text.replace(sequence, ' ')
    return text.replace('\\', '').replace('"', '')
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def remove_html_tags(text: str):
    """Return the text content of ``text`` with HTML markup removed.

    Parsing is done by BeautifulSoup using the ``html.parser`` backend.
    """
    return BeautifulSoup(text, 'html.parser').get_text()
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def remove_punctuation(text: str):
    """Lower-case ``text`` and split it into a list of alphabetic tokens.

    Only runs of ASCII letters are kept, so digits, punctuation and
    whitespace are all discarded (for example ``c++`` yields just ``c``).
    """
    lowered = text.lower()
    return nltk.RegexpTokenizer(r'[a-zA-Z]+').tokenize(lowered)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# Cached English stop-word set; filled on first use by _english_stopwords().
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        try:
            words = nltk.corpus.stopwords.words('english')
        except LookupError:
            # The corpus is not installed here: fetch it, then retry once.
            nltk.download('stopwords', quiet=True)
            words = nltk.corpus.stopwords.words('english')
        _ENGLISH_STOPWORDS = frozenset(words)
    return _ENGLISH_STOPWORDS


def remove_stopwords(words_list: list):
    """Return the tokens of ``words_list`` that are not English stop words.

    The stop-word set is read from the NLTK ``stopwords`` corpus on the
    first non-empty call and reused afterwards. If the corpus is missing it
    is downloaded on demand instead of failing with ``LookupError``.
    """
    if not words_list:
        # Nothing to filter, so there is no need to touch the corpus.
        return []
    stop_words = _english_stopwords()
    return [word for word in words_list if word not in stop_words]
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def lemmatize(words_list: list):
    """Return the WordNet lemma of every token in ``words_list``.

    If the NLTK ``wordnet`` corpus is not installed, it is downloaded on
    demand and the lemmatization is retried once, instead of failing with
    ``LookupError``.
    """
    if not words_list:
        # Nothing to lemmatize, so there is no need to touch the corpus.
        return []
    lemmatizer = nltk.WordNetLemmatizer()
    try:
        return [lemmatizer.lemmatize(word) for word in words_list]
    except LookupError:
        # The corpus is loaded lazily on first use; fetch it and retry.
        nltk.download('wordnet', quiet=True)
        return [lemmatizer.lemmatize(word) for word in words_list]
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def greet(name, intensity):
    """Return ``"Hello, <name>"`` followed by ``int(intensity)`` exclamation marks.

    The interface built in this file uses ``run`` as its callback, not this
    function.
    """
    greeting = "Hello, " + name
    exclamations = "!" * int(intensity)
    return greeting + exclamations
|
| 66 |
+
|
| 67 |
+
# Gradio UI: one text input, one text output, with `run` as the callback.
demo = gr.Interface(fn=run, inputs=["text"], outputs=["text"])

# Start the web app as soon as this file is executed.
demo.launch()
|
passive_aggressive_classifier.pkl
ADDED
|
Binary file (483 kB). View file
|
|
|
tfidf_vectorizer.pkl
ADDED
|
Binary file (56.1 kB). View file
|
|
|