# ocr-project-5 / app.py
# degregor69's picture
# added stopwords download to nltk
# 71c908a
import nltk
from bs4 import BeautifulSoup
import joblib
import gradio as gr
# Fetch the NLTK corpora this module reads later: the stop-word list used by
# remove_stopwords() and WordNet used by lemmatize().
nltk.download('stopwords')
nltk.download('wordnet')

# Tag labels. run() indexes this list with the positions of the classifier's
# prediction vector, so the order of the entries matters.
classes = [
    '.net',
    'android',
    'asp.net',
    'c',
    'c#',
    'c++',
    'css',
    'html',
    'ios',
    'iphone',
    'java',
    'javascript',
    'jquery',
    'json',
    'linux',
    'mysql',
    'node.js',
    'objective-c',
    'performance',
    'php',
    'python',
    'reactjs',
    'ruby-on-rails',
    'spring',
    'sql',
    'sql-server',
    'swift',
    'unit-testing',
    'windows',
    'xcode',
]

# Serialized model artifacts, loaded once at import time from paths relative
# to the current working directory.
pa_classifier = joblib.load("passive_aggressive_classifier.pkl")
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")
def run(text: str):
    """Predict the tags for a piece of free text.

    The text is cleaned with ``clean_input_string``, vectorized with
    ``get_tf_idf_vectors`` and passed to ``pa_classifier.predict``. The first
    row of the prediction is read as one flag per entry of ``classes``.

    Args:
        text: Raw input text.

    Returns:
        The labels whose flag equals 1, joined by single spaces, or the
        message "Pas de thème retrouvé." when no flag is set.
    """
    cleaned = clean_input_string(text)
    features = get_tf_idf_vectors([cleaned])
    flags = pa_classifier.predict(features)[0]

    labels = []
    for index, flag in enumerate(flags):
        if flag == 1:
            labels.append(classes[index])

    if not labels:
        return "Pas de thème retrouvé."
    return " ".join(labels)
def get_tf_idf_vectors(x_as_string):
    """Vectorize documents with the module-level ``tfidf_vectorizer``.

    Args:
        x_as_string: The documents to transform; ``run`` passes a one-element
            list holding the cleaned text.

    Returns:
        The result of ``tfidf_vectorizer.transform`` for the given input.
    """
    vectors = tfidf_vectorizer.transform(x_as_string)
    return vectors
def clean_input_string(text: str) -> str:
    """Normalize raw text into a space-separated string of tokens.

    Steps, in order: strip escape sequences, strip HTML tags, tokenize into
    lowercase letter runs, drop English stop words, lemmatize.

    Args:
        text: Raw input text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # String-level cleanup first, then the token-level passes.
    plain_text = remove_html_tags(remove_escape_sequences(text))
    tokens = lemmatize(remove_stopwords(remove_punctuation(plain_text)))
    return " ".join(tokens)
def remove_escape_sequences(text: str):
    r"""Delete escaped control sequences, backslashes and double quotes.

    The text is first rendered through the ``unicode-escape`` codec, so a
    newline becomes the two characters ``\n``, a literal backslash becomes
    two backslashes, and non-ASCII characters become escape text such as
    ``\xe9``. The listed sequences are then deleted one after another.

    Note that sequences are deleted rather than replaced by a space, so words
    separated only by a newline, tab or carriage return end up joined, and
    escape text for non-ASCII characters is left in the result.

    Args:
        text: Raw input text.

    Returns:
        The escaped text with the listed sequences removed.
    """
    escaped = text.encode('unicode-escape').decode('utf-8')
    # Order matters: each deletion runs on the output of the previous one.
    for sequence in ('\\n', '\\t', '\\r', '\\\\', '\\\'', '\"'):
        escaped = escaped.replace(sequence, '')
    return escaped
def remove_html_tags(text: str):
    """Return the text content of *text* as extracted by BeautifulSoup.

    Args:
        text: A string that may contain HTML markup.

    Returns:
        The result of ``get_text()`` on the document parsed with the
        ``html.parser`` backend.
    """
    return BeautifulSoup(text, 'html.parser').get_text()
def remove_punctuation(text: str):
    """Lowercase *text* and tokenize it into runs of ASCII letters.

    Tokens are the matches of the pattern ``[a-zA-Z]+``, so digits and
    symbols are dropped along with punctuation.

    Args:
        text: The text to tokenize.

    Returns:
        The list of tokens produced by the tokenizer.
    """
    lowered = text.lower()
    letters_only = nltk.RegexpTokenizer(r'[a-zA-Z]+')
    return letters_only.tokenize(lowered)
def remove_stopwords(words_list: list):
    """Drop the tokens that appear in NLTK's English stop-word list.

    Args:
        words_list: Tokens to filter.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    # A set gives constant-time membership tests during filtering.
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    kept = []
    for token in words_list:
        if token not in english_stopwords:
            kept.append(token)
    return kept
def lemmatize(words_list: list):
    """Lemmatize every token with NLTK's ``WordNetLemmatizer``.

    Args:
        words_list: Tokens to lemmatize.

    Returns:
        A new list holding the lemmatized form of each token, in order.
    """
    to_lemma = nltk.WordNetLemmatizer().lemmatize
    return list(map(to_lemma, words_list))
def greet(name, intensity):
    """Build a greeting followed by a number of exclamation marks.

    Args:
        name: The name to greet; must be a string.
        intensity: A value accepted by ``int()``; the truncated result is the
            number of exclamation marks (zero or negative yields none).

    Returns:
        ``"Hello, <name>"`` followed by the exclamation marks.
    """
    salutation = "Hello, " + name
    exclamations = "!" * int(intensity)
    return salutation + exclamations
# Gradio UI: one text input wired to run(), one text output for its result.
demo = gr.Interface(fn=run, inputs=["text"], outputs=["text"])

# Launch the app as soon as this module is executed.
demo.launch()