File size: 2,300 Bytes
91040aa
 
 
 
 
71c908a
 
 
91040aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import nltk
from bs4 import BeautifulSoup
import joblib
import gradio as gr

# Fetch the NLTK corpora needed at runtime: stopword filtering and
# WordNet-based lemmatization (no-ops if already downloaded).
nltk.download('stopwords')
nltk.download('wordnet')

# Multilabel output space: one Stack Overflow tag per classifier column,
# in the same column order the model was trained with.
classes = ['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
           'ios', 'iphone', 'java', 'javascript', 'jquery', 'json', 'linux',
           'mysql', 'node.js', 'objective-c', 'performance', 'php', 'python',
           'reactjs', 'ruby-on-rails', 'spring', 'sql', 'sql-server', 'swift',
           'unit-testing', 'windows', 'xcode']

# Pre-trained artifacts serialized with joblib; both files must sit next to
# this script. NOTE(review): joblib.load unpickles — only load trusted files.
pa_classifier = joblib.load("passive_aggressive_classifier.pkl")
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")

def run(text: str):
    """Predict Stack Overflow tags for *text*.

    The raw text is cleaned, vectorized with the loaded TF-IDF model, and
    fed to the multilabel classifier. Returns the predicted tags joined by
    spaces, or a French "no tag found" message when nothing is predicted.
    """
    cleaned = clean_input_string(text)
    features = get_tf_idf_vectors([cleaned])
    # predict() yields one row of 0/1 flags, one per entry in `classes`.
    flags = pa_classifier.predict(features)[0]
    tags = [classes[idx] for idx, flag in enumerate(flags) if flag == 1]
    return " ".join(tags) if tags else "Pas de thème retrouvé."



def get_tf_idf_vectors(x_as_string):
    """Transform an iterable of documents with the pre-fitted TF-IDF vectorizer."""
    vectors = tfidf_vectorizer.transform(x_as_string)
    return vectors
def clean_input_string(text: str) -> str:
    """Normalize raw user input into a space-joined string of lemmas.

    Pipeline: strip escape sequences, strip HTML markup, tokenize to
    lowercase alphabetic words, drop English stopwords, lemmatize.
    """
    stripped = remove_html_tags(remove_escape_sequences(text))
    tokens = remove_punctuation(stripped)
    tokens = lemmatize(remove_stopwords(tokens))
    return " ".join(tokens)


def remove_escape_sequences(text: str):
    """Delete escape sequences and quotes from *text*.

    The text is first rendered in unicode-escape form (newlines become the
    two characters ``\\n`` etc.), then the listed sequences are stripped.
    The replacement order is preserved from the original implementation —
    it matters when backslashes are adjacent to other escapes.
    """
    escaped = text.encode('unicode-escape').decode('utf-8')
    for sequence in ('\\n', '\\t', '\\r', '\\\\', '\\\'', '\"'):
        escaped = escaped.replace(sequence, '')
    return escaped


def remove_html_tags(text: str):
    """Strip HTML markup from *text*, keeping only the visible text content."""
    return BeautifulSoup(text, 'html.parser').get_text()


def remove_punctuation(text: str):
    """Lowercase *text* and split it into purely alphabetic tokens.

    Anything that is not an ASCII letter (digits, punctuation, whitespace)
    acts as a separator and is discarded.
    """
    word_pattern = nltk.RegexpTokenizer(r'[a-zA-Z]+')
    return word_pattern.tokenize(text.lower())



def remove_stopwords(words_list: list) -> list:
    """Filter English stopwords out of *words_list*.

    Parameters:
        words_list: tokens (expected lowercase, as produced by
            remove_punctuation).

    Returns:
        A new list with the non-stopword tokens in their original order.
    """
    # Build the lookup set directly from the corpus; the original
    # set() + update(tuple(...)) dance was redundant.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    return [word for word in words_list if word not in stopwords]


def lemmatize(words_list: list):
    """Map each token in *words_list* to its WordNet lemma."""
    wordnet = nltk.WordNetLemmatizer()
    return [wordnet.lemmatize(token) for token in words_list]


def greet(name, intensity):
    """Return a greeting for *name* followed by int(intensity) exclamation marks.

    NOTE(review): leftover from the Gradio quickstart; not wired into the
    interface below.
    """
    marks = "!" * int(intensity)
    return f"Hello, {name}{marks}"

# Single text box in, single text box out, backed by the tag predictor.
demo = gr.Interface(
    fn=run,
    inputs=["text"],
    outputs=["text"],
)

# Starts the Gradio server (blocking call).
demo.launch()