"""Flask service that scores customer reviews with a Naive Bayes model.

A single POST endpoint (``/``) accepts a JSON body of the form
``{"reviewsModel": [{"CUSTOMERREVIEWS": "<text>"}, ...]}`` and returns a JSON
list with one entry per review followed by one aggregate entry.
"""
import json
import os
import re
from functools import lru_cache
from typing import List

import joblib
import nltk
from flask import Flask, jsonify, request
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

NLTK_DATA_DIR = '/tmp/nltk_data'
MODEL_PATH = 'sentiment_pipeline.pkl'
TRAIN_DATA_PATH = 'trainData.json'

# Fetch the NLTK corpora into a writable directory and make NLTK search it.
nltk.download('punkt', download_dir=NLTK_DATA_DIR)
nltk.download('stopwords', download_dir=NLTK_DATA_DIR)
nltk.download('wordnet', download_dir=NLTK_DATA_DIR)
nltk.data.path.append(NLTK_DATA_DIR)

app = Flask(__name__)

# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def clean_text_with_lemmatization(texts: List[str]) -> List[str]:
    """Normalize raw review strings before they are passed to the model.

    Each text is lower-cased, punctuation is replaced by spaces, English
    stopwords are dropped and the remaining tokens are lemmatized with
    WordNet.

    Args:
        texts: Review strings to clean.

    Returns:
        One cleaned, space-joined string per input text, in the same order.
    """
    stopwords_set = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned_texts = []
    for text in texts:
        normalized = _PUNCTUATION_RE.sub(' ', text.lower())
        words = [
            lemmatizer.lemmatize(word)
            for word in normalized.split()
            if word not in stopwords_set
        ]
        cleaned_texts.append(' '.join(words))
    return cleaned_texts


def get_sentiment_label(prob: float) -> str:
    """Map a positive-class probability to a sentiment label.

    Bands (contiguous, so every value in [0, 1] gets the intended label):
        prob < 0.30          -> "Negative"
        0.30 <= prob < 0.50  -> "Slightly Negative"
        prob == 0.50         -> "Neutral"
        0.50 < prob < 0.70   -> "Slightly Positive"
        prob >= 0.70         -> "Positive"

    Args:
        prob: Probability of the positive class.

    Returns:
        The label for the band that contains ``prob``.
    """
    if prob < 0.30:
        return "Negative"
    # The lower bound used to be 0.35, which left [0.30, 0.35) unmatched and
    # made those clearly negative scores fall through to "Positive".
    if prob < 0.5:
        return "Slightly Negative"
    if prob == 0.5:
        return "Neutral"
    if prob < 0.70:
        return "Slightly Positive"
    return "Positive"


@lru_cache(maxsize=1)
def load_model():
    """Return the sentiment pipeline, loading or training it on first use.

    If ``MODEL_PATH`` exists it is loaded with joblib; otherwise a new
    pipeline is trained from ``TRAIN_DATA_PATH``. The result is cached for
    the lifetime of the process so requests do not re-read the file; restart
    the process to pick up a replaced model file.
    """
    if os.path.exists(MODEL_PATH):
        # NOTE: joblib.load unpickles the file and can execute arbitrary
        # code, so the model file must come from a trusted source.
        return joblib.load(MODEL_PATH)
    return train_model(TRAIN_DATA_PATH)


def train_model(json_file_path, model_path=MODEL_PATH):
    """Train a bag-of-words Naive Bayes pipeline and persist it.

    Args:
        json_file_path: Path to a JSON file holding a list of objects, each
            with a ``text`` and a ``label`` entry.
        model_path: Where the fitted pipeline is written with joblib.

    Returns:
        The fitted ``Pipeline``.
    """
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # NOTE(review): training uses the raw texts while predict_sentiment feeds
    # the model text cleaned by clean_text_with_lemmatization. Confirm whether
    # the training data is already cleaned; if not, the vocabularies differ.
    X = [entry['text'] for entry in data]
    y = [entry['label'] for entry in data]

    pipeline = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('classifier', MultinomialNB()),
    ])
    pipeline.fit(X, y)
    joblib.dump(pipeline, model_path)
    return pipeline


def _extract_review_texts(payload):
    """Return the review strings in ``payload``, or None if it is malformed."""
    if not isinstance(payload, dict):
        return None
    reviews = payload.get('reviewsModel')
    if not isinstance(reviews, list) or not reviews:
        return None

    texts = []
    for review in reviews:
        if not isinstance(review, dict):
            return None
        text = review.get('CUSTOMERREVIEWS')
        if not isinstance(text, str):
            return None
        texts.append(text)
    return texts


@app.route('/', methods=['POST'])
def predict_sentiment():
    """Score the posted reviews and return per-review and overall sentiment.

    Expects a JSON body ``{"reviewsModel": [{"CUSTOMERREVIEWS": str}, ...]}``.

    Returns:
        A JSON list with one object per review (``review_text``,
        ``negative_probability``, ``positive_probability``, ``sentiment``)
        followed by one object holding the ``overall_*`` averages. A
        malformed or empty request yields a 400 response with an ``error``
        message instead of an unhandled exception.
    """
    texts = _extract_review_texts(request.get_json(silent=True))
    if texts is None:
        return jsonify({
            "error": "Request body must be JSON with a non-empty "
                     "'reviewsModel' list of objects that each contain a "
                     "string 'CUSTOMERREVIEWS' field."
        }), 400

    pipeline = load_model()
    cleaned_reviews = clean_text_with_lemmatization(texts)
    predicted_probabilities = pipeline.predict_proba(cleaned_reviews)

    # NOTE(review): column 0 is treated as the negative class and column 1 as
    # the positive class; the actual order depends on the training labels —
    # confirm against the fitted classifier's classes_.
    results = []
    for original_review, row in zip(texts, predicted_probabilities):
        negative_probability = row[0]
        positive_probability = row[1]
        results.append({
            "review_text": original_review,
            "negative_probability": f"{negative_probability:.4f}",
            "positive_probability": f"{positive_probability:.4f}",
            "sentiment": get_sentiment_label(positive_probability),
        })

    # Average each class probability over all reviews (computed once).
    overall = predicted_probabilities.mean(axis=0)
    overall_negative_probability = overall[0]
    overall_positive_probability = overall[1]
    results.append({
        "overall_negative_probability": f"{overall_negative_probability:.4f}",
        "overall_positive_probability": f"{overall_positive_probability:.4f}",
        "overall_sentiment": get_sentiment_label(overall_positive_probability),
    })

    return jsonify(results)


# if __name__ == '__main__':
#     app.run(debug=True)