|
|
from flask import Flask, request, send_file, jsonify |
|
|
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer |
|
|
import pandas as pd |
|
|
import torch |
|
|
import tempfile |
|
|
import os |
|
|
import re |
|
|
from collections import Counter |
|
|
from flask_cors import CORS |
|
|
|
|
|
# ---------------------------------------------------------------------------
# Flask application
# ---------------------------------------------------------------------------
# Create the application object and enable CORS on it with flask_cors'
# default settings.
app = Flask(__name__)
CORS(app)

# ---------------------------------------------------------------------------
# Sentiment model
# ---------------------------------------------------------------------------
# Identifier passed to ``from_pretrained`` for both the classifier weights and
# the matching tokenizer.
# NOTE(review): this has three path segments; presumably it is a local
# directory. If it is meant to be a Hugging Face Hub repository, confirm that
# it resolves as written.
model_name = "AbdoIR/x-sentiment-analysis/fine_tuned_model"
model = DistilBertForSequenceClassification.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)

# Move the weights to the selected device, then switch the module to
# evaluation mode.
model.to(device)
model.eval()
|
|
|
|
|
|
|
|
def predict_sentiment(texts, batch_size=32):
    """Classify each text as "Negative", "Neutral" or "Positive".

    The texts are tokenized (truncated to 128 tokens, padded per batch) and
    run through the module-level ``model`` on ``device``. Inputs are processed
    in chunks of ``batch_size`` so that memory use does not grow with the
    total number of texts.

    Args:
        texts: A list of strings, or a single string (treated as a list of
            one).
        batch_size: Maximum number of texts per forward pass. Must be >= 1.

    Returns:
        A list of labels, one per input text and in the same order. An empty
        input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If the model predicts a class index outside 0..2.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string is a single document, not an iterable of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Nothing to classify: skip the tokenizer and the model entirely.
    if not texts:
        return []

    sentiment_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    labels = []

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encodings = tokenizer(
                batch,
                truncation=True,
                padding=True,
                max_length=128,
                return_tensors="pt",
            )
            # Inputs must live on the same device as the model weights.
            encodings = {key: val.to(device) for key, val in encodings.items()}
            logits = model(**encodings).logits
            class_ids = torch.argmax(logits, dim=1).tolist()
            labels.extend(sentiment_map[class_id] for class_id in class_ids)

    return labels
|
|
|
|
|
|
|
|
def get_top_words(texts, n=30):
    """Return the ``n`` most frequent words across ``texts``.

    Each item is converted with ``str()`` and lower-cased; a "word" is any
    run of three or more ``\\w`` characters delimited by word boundaries.

    Args:
        texts: Iterable of values to scan.
        n: Number of top entries to keep.

    Returns:
        A pandas DataFrame with the columns ``word`` and ``count``, ordered
        from most to least frequent.
    """
    word_pattern = re.compile(r'\b\w{3,}\b')
    frequencies = Counter(
        word
        for entry in texts
        for word in word_pattern.findall(str(entry).lower())
    )
    return pd.DataFrame(frequencies.most_common(n), columns=['word', 'count'])
|
|
|
|
|
|
|
|
def _read_uploaded_table(file):
    """Parse an upload as CSV, falling back to Excel; return None on failure."""
    try:
        return pd.read_csv(file)
    except Exception:
        # Not parseable as CSV: deliberately fall through to the Excel attempt.
        pass
    try:
        file.seek(0)  # the CSV attempt may have consumed part of the stream
        return pd.read_excel(file)
    except Exception:
        return None


def _json_safe_records(df):
    """Return ``df`` as a list of row dicts with missing values set to None."""
    # NaN/NaT would be serialized as the bare token ``NaN``, which is not
    # valid JSON; casting to object lets them be replaced by None (-> null).
    return df.astype(object).where(df.notna(), None).to_dict(orient='records')


@app.route('/predict', methods=['POST'])
def predict():
    """Label every row of an uploaded table with a sentiment.

    Expects a multipart upload under the field name ``file`` containing a CSV
    or Excel sheet with a ``content`` or ``tweet`` column (``content`` wins
    when both exist). Adds a ``sentiment`` column, computes the most frequent
    words, writes both results as CSV files into a fresh temporary directory
    and returns JSON with the download links and the data itself.

    Returns:
        A JSON response with the keys ``sentiment_file``, ``top_words_file``,
        ``sentiment_data`` and ``top_words_data``; or a JSON ``error`` object
        with status 400 when the upload is missing, unreadable, lacks a text
        column, or contains no rows.
    """
    # Local import: percent-encoding is only needed for the links built here.
    from urllib.parse import quote

    if 'file' not in request.files:
        return jsonify({'error': 'No file uploaded'}), 400

    df = _read_uploaded_table(request.files['file'])
    if df is None:
        return jsonify({'error': 'Unable to read the file'}), 400

    text_col = next(
        (name for name in ('content', 'tweet') if name in df.columns), None
    )
    if text_col is None:
        return jsonify({'error': 'No "content" or "tweet" column found'}), 400

    # A header-only file has nothing to classify.
    if df.empty:
        return jsonify({'error': 'The uploaded file contains no rows'}), 400

    # Missing cells become empty strings; a plain ``astype(str)`` would turn
    # them into the literal text "nan", which would then be classified and
    # counted as a word.
    texts = ['' if pd.isna(value) else str(value) for value in df[text_col]]
    df['sentiment'] = predict_sentiment(texts)

    top_words_df = get_top_words(texts)

    # NOTE: this directory is never removed here, because the files must
    # outlive the request so that /download can serve them later.
    temp_dir = tempfile.mkdtemp()

    sentiment_path = os.path.join(temp_dir, 'final_data.csv')
    df.to_csv(sentiment_path, index=False)

    words_path = os.path.join(temp_dir, 'word_frequent.csv')
    top_words_df.to_csv(words_path, index=False)

    return jsonify({
        # Paths are percent-encoded so that characters such as spaces, '&'
        # or '#' cannot break the query string.
        'sentiment_file': f'/download?file={quote(sentiment_path)}',
        'top_words_file': f'/download?file={quote(words_path)}',
        'sentiment_data': _json_safe_records(df),
        'top_words_data': top_words_df.to_dict(orient='records'),
    })
|
|
|
|
|
|
|
|
@app.route('/download')
def download():
    """Send one of the result files produced by ``/predict`` as an attachment.

    The path is taken from the ``file`` query parameter. Because that value
    is client-controlled, it is only honoured when it resolves to a regular
    file that

    * is named ``final_data.csv`` or ``word_frequent.csv``, and
    * sits exactly one directory below the system temporary directory, which
      is where ``tempfile.mkdtemp()`` places its directories by default.

    Anything else is answered with the same 404 as a missing file, so the
    endpoint cannot be used to read arbitrary files from the server.

    Returns:
        The file as an attachment, or a JSON ``error`` object with status 404.
    """
    allowed_names = ('final_data.csv', 'word_frequent.csv')
    not_found = (jsonify({'error': 'File not found'}), 404)

    file_path = request.args.get('file')
    # An embedded NUL byte makes the os.path calls below raise ValueError.
    if not file_path or '\x00' in file_path:
        return not_found

    # Resolve symlinks and ".." segments before making any decision.
    resolved = os.path.realpath(file_path)
    temp_root = os.path.realpath(tempfile.gettempdir())
    grandparent = os.path.dirname(os.path.dirname(resolved))

    is_result_file = (
        os.path.basename(resolved) in allowed_names
        # normcase makes the comparison case-insensitive on Windows.
        and os.path.normcase(grandparent) == os.path.normcase(temp_root)
        and os.path.isfile(resolved)
    )
    if not is_result_file:
        return not_found

    return send_file(resolved, as_attachment=True)
|
|
|
|
|
if __name__ == '__main__':
    # The server listens on all interfaces, so the Werkzeug debugger must not
    # be enabled by default: it allows code execution from the browser.
    # Debug mode is opt-in by setting FLASK_DEBUG to 1/true/yes/on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in (
        '1', 'true', 'yes', 'on',
    )
    app.run(host="0.0.0.0", port=5000, debug=debug_enabled)
|
|
|