# Toro-Angel's picture
# Update analyzer.py
# 9668e53 verified
from flask import Flask, request, jsonify
import os
import re
import json
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Fetch the NLTK resources into a writable directory and register that
# directory on NLTK's search path so the corpora can be found at runtime.
NLTK_DATA_DIR = '/tmp/nltk_data'
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource, download_dir=NLTK_DATA_DIR)
nltk.data.path.append(NLTK_DATA_DIR)

# Flask application object that the route decorators below attach to.
app = Flask(__name__)
# Function to preprocess text data
def clean_text_with_lemmatization(texts):
    """Normalize raw text strings before they are passed to the model.

    Each text is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, English stopwords are
    removed, and the remaining tokens are lemmatized with WordNet.

    Args:
        texts: Iterable of strings to clean.

    Returns:
        A list with one cleaned, space-joined string per input text, in the
        same order as the input.
    """
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    def _normalize(raw):
        # Punctuation is replaced by a space (not removed) so that tokens
        # joined only by punctuation are split apart.
        tokens = re.sub(r'[^\w\s]', ' ', raw.lower()).split()
        # Stopwords are filtered on the surface form, before lemmatization.
        kept = (token for token in tokens if token not in english_stopwords)
        return ' '.join(wordnet.lemmatize(token) for token in kept)

    return [_normalize(raw) for raw in texts]
# Function to determine sentiment label based on probability
def get_sentiment_label(prob):
    """Map a positive-class probability to a coarse sentiment label.

    The bands are contiguous, so every value has exactly one label:

        prob < 0.30           -> "Negative"
        0.30 <= prob < 0.50   -> "Slightly Negative"
        prob == 0.50          -> "Neutral"
        0.50 < prob < 0.70    -> "Slightly Positive"
        prob >= 0.70          -> "Positive"

    Args:
        prob: Probability of the positive class, expected in [0, 1].

    Returns:
        One of "Negative", "Slightly Negative", "Neutral",
        "Slightly Positive" or "Positive".
    """
    if prob < 0.30:
        return "Negative"
    # The lower bound is implied by the check above. The previous explicit
    # bound of 0.35 left [0.30, 0.35) unmatched, so those values fell through
    # to the final branch and were labelled "Positive".
    if prob < 0.5:
        return "Slightly Negative"
    if prob == 0.5:
        return "Neutral"
    if prob < 0.70:
        return "Slightly Positive"
    return "Positive"
# Function to load or train the model
def load_model():
    """Return the sentiment pipeline, training one if no saved copy exists.

    If 'sentiment_pipeline.pkl' exists in the current working directory it is
    loaded with joblib; otherwise the result of train_model('trainData.json')
    is returned.

    Returns:
        The object loaded from the pickle file, or the pipeline returned by
        train_model.
    """
    saved_pipeline = 'sentiment_pipeline.pkl'
    if not os.path.exists(saved_pipeline):
        return train_model('trainData.json')
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code; the file must come from a trusted source.
    return joblib.load(saved_pipeline)
# Function to train the model
def train_model(json_file_path):
    """Train the sentiment pipeline from a JSON file and save it to disk.

    The file must contain a JSON array of objects, each providing a 'text'
    and a 'label' key. A CountVectorizer -> MultinomialNB pipeline is fitted
    on those pairs and written to 'sentiment_pipeline.pkl' in the current
    working directory.

    Args:
        json_file_path: Path to the training-data JSON file.

    Returns:
        The fitted Pipeline.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an entry lacks 'text' or 'label'.
    """
    # JSON interchange is UTF-8; without an explicit encoding open() falls
    # back to the locale default and non-ASCII text fails to decode (or is
    # decoded wrongly) on platforms whose default is not UTF-8.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # NOTE(review): the texts are fitted as-is, whereas predict_sentiment
    # passes reviews through clean_text_with_lemmatization before scoring.
    # Confirm the training data is already cleaned the same way.
    X = [entry['text'] for entry in data]
    y = [entry['label'] for entry in data]

    pipeline = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('classifier', MultinomialNB()),
    ])
    pipeline.fit(X, y)

    joblib.dump(pipeline, 'sentiment_pipeline.pkl')
    return pipeline
# Endpoint to process new reviews
@app.route('/', methods=['POST'])
def predict_sentiment():
    """Score the reviews in a POSTed JSON body.

    Expected request body::

        {"reviewsModel": [{"CUSTOMERREVIEWS": "<review text>"}, ...]}

    Returns:
        On success, a JSON array with one object per review
        ("review_text", "negative_probability", "positive_probability",
        "sentiment") followed by a final object with the averaged
        "overall_negative_probability", "overall_positive_probability" and
        "overall_sentiment". Probabilities are strings formatted to four
        decimal places.

        If the body is missing, is not valid JSON, has no non-empty
        'reviewsModel' list, or contains an entry without a string
        'CUSTOMERREVIEWS', a JSON object with an "error" message and HTTP
        status 400 is returned instead of an unhandled exception.
    """
    # silent=True yields None instead of raising when the body is missing
    # or cannot be parsed as JSON.
    payload = request.get_json(silent=True)
    reviews = payload.get('reviewsModel') if isinstance(payload, dict) else None
    if not isinstance(reviews, list) or not reviews:
        return jsonify({
            "error": "Request body must be JSON with a non-empty 'reviewsModel' list."
        }), 400

    review_texts = []
    for entry in reviews:
        text = entry.get('CUSTOMERREVIEWS') if isinstance(entry, dict) else None
        if not isinstance(text, str):
            return jsonify({
                "error": "Each item in 'reviewsModel' must have a string 'CUSTOMERREVIEWS'."
            }), 400
        review_texts.append(text)

    # Validate first so a bad request never triggers model loading/training.
    pipeline = load_model()
    probabilities = pipeline.predict_proba(
        clean_text_with_lemmatization(review_texts)
    )

    # NOTE(review): column 0 is treated as the negative class and column 1 as
    # the positive class; this assumes a two-class model whose classes_ are
    # ordered [negative, positive] -- confirm against the training labels.
    results = []
    for original_review, row in zip(review_texts, probabilities):
        negative_probability = row[0]
        positive_probability = row[1]
        results.append({
            "review_text": original_review,
            "negative_probability": f"{negative_probability:.4f}",
            "positive_probability": f"{positive_probability:.4f}",
            "sentiment": get_sentiment_label(positive_probability)
        })

    # Column-wise mean over all reviews, computed once for both classes.
    overall = probabilities.mean(axis=0)
    overall_negative_probability = overall[0]
    overall_positive_probability = overall[1]
    results.append({
        "overall_negative_probability": f"{overall_negative_probability:.4f}",
        "overall_positive_probability": f"{overall_positive_probability:.4f}",
        "overall_sentiment": get_sentiment_label(overall_positive_probability)
    })
    return jsonify(results)
#if __name__ == '__main__':
# app.run(debug=True)