"""Gradio text-tagging demo.

Cleans the submitted text, vectorises it with a pre-fitted TF-IDF
vectoriser and returns the labels predicted by a pre-trained classifier.
"""
from functools import lru_cache
from typing import List

import gradio as gr
import joblib
import nltk
from bs4 import BeautifulSoup

# Corpora required by remove_stopwords() and lemmatize().
nltk.download('stopwords')
nltk.download('wordnet')

# Label names; position i corresponds to column i of the classifier output.
classes = [
    '.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html', 'ios',
    'iphone', 'java', 'javascript', 'jquery', 'json', 'linux', 'mysql',
    'node.js', 'objective-c', 'performance', 'php', 'python', 'reactjs',
    'ruby-on-rails', 'spring', 'sql', 'sql-server', 'swift', 'unit-testing',
    'windows', 'xcode',
]

# NOTE: joblib artefacts are pickle-based and can execute code when loaded;
# only ever load files that come from a trusted source.
pa_classifier = joblib.load("passive_aggressive_classifier.pkl")
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Stateless helpers shared by every request instead of being rebuilt per call.
_WORD_TOKENIZER = nltk.RegexpTokenizer(r'[a-zA-Z]+')
_LEMMATIZER = nltk.WordNetLemmatizer()


def run(text: str) -> str:
    """Predict the labels for *text*.

    Returns the predicted labels joined by single spaces, or a fixed
    fallback message when no label is predicted.
    """
    cleaned = clean_input_string(text)
    vectors = get_tf_idf_vectors([cleaned])
    # Only one document was submitted, so keep the first (only) row.
    prediction_list = pa_classifier.predict(vectors)[0]
    predicted_classes = [
        classes[i] for i, pred in enumerate(prediction_list) if pred == 1
    ]
    if predicted_classes:
        return " ".join(predicted_classes)
    return "Pas de thème retrouvé."


def get_tf_idf_vectors(x_as_string):
    """Transform an iterable of cleaned documents with the loaded vectoriser.

    Despite its name, ``x_as_string`` is passed a list of strings by ``run``.
    """
    return tfidf_vectorizer.transform(x_as_string)


def clean_input_string(text: str) -> str:
    """Normalise raw text into a space-separated string of lemmatised words.

    Steps, in order: strip escape sequences, strip HTML tags, lowercase and
    tokenise into alphabetic words, drop English stop words, lemmatise.
    """
    text = remove_escape_sequences(text)
    text = remove_html_tags(text)
    words = remove_punctuation(text)
    words = remove_stopwords(words)
    words = lemmatize(words)
    return " ".join(words)


def remove_escape_sequences(text: str) -> str:
    """Strip newlines, tabs, carriage returns, backslashes and double quotes.

    The text is first encoded with ``unicode-escape`` so that control
    characters become their visible two-character escapes, which are then
    deleted one by one.
    """
    # NOTE(review): the sequences are deleted, not replaced by a space, so the
    # words on either side of a line break are joined; non-ASCII characters
    # also stay behind as escape codes (e.g. "\xe9"). Left unchanged on
    # purpose - presumably the vectoriser was fitted on text cleaned the same
    # way; confirm before altering this step.
    escaped = text.encode('unicode-escape').decode('utf-8')
    return (
        escaped
        .replace('\\n', '')
        .replace('\\t', '')
        .replace('\\r', '')
        .replace('\\\\', '')
        .replace('\\\'', '')
        .replace('\"', '')
    )


def remove_html_tags(text: str) -> str:
    """Return the visible text of *text*, parsed as HTML."""
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()


def remove_punctuation(text: str) -> List[str]:
    """Lowercase *text* and split it into runs of ASCII letters.

    Digits, punctuation and whitespace act as separators and are dropped.
    """
    return _WORD_TOKENIZER.tokenize(text.lower())


@lru_cache(maxsize=1)
def _english_stopwords() -> frozenset:
    """Load the NLTK English stop-word list once and keep it as a set."""
    return frozenset(nltk.corpus.stopwords.words('english'))


def remove_stopwords(words_list: list) -> List[str]:
    """Return *words_list* without English stop words, order preserved."""
    # The corpus is read on first use only; the original re-read it and
    # rebuilt the set on every call.
    stopwords = _english_stopwords()
    return [word for word in words_list if word not in stopwords]


def lemmatize(words_list: list) -> List[str]:
    """Return the WordNet lemma of every word in *words_list*."""
    return [_LEMMATIZER.lemmatize(word) for word in words_list]


def greet(name, intensity):
    """Return ``"Hello, <name>"`` followed by ``int(intensity)`` exclamation marks.

    Not used by the interface below, which is wired to ``run``.
    """
    return "Hello, " + name + "!" * int(intensity)


demo = gr.Interface(
    fn=run,
    inputs=["text"],
    outputs=["text"],
)

demo.launch()