# Source: app.py — commit 2b3f13e ("Update app.py", verified), uploaded by letijo03.
from flask import Flask, request, render_template_string, jsonify, send_from_directory
import requests
import pandas as pd
import re
import time
from random import randint, choice
import os
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from peft import PeftModel, PeftConfig # Ensure peft library is installed
import torch
from collections import defaultdict
# WSGI application object; the route handlers further down are registered on it.
flask_app = Flask(__name__)
# Pool of account cookies. get_headers() picks one entry at random for each
# request and sends its "cookie" value as the Cookie header.
# NOTE(review): these look like real session cookies committed to source —
# consider loading them from the environment or a secrets store instead.
ACCOUNTS = [
    {
        "cookie": "SPC_F=hmR34sqS9gRUgA35BL857RAqy0Hn0sU8; REC_T_ID=64c7c97b-cc02-11ef-bac2-8a756e8ab50a; _gcl_au=1.1.834093345.1744027851"
    },
    {
        "cookie": "SPC_F=6nZYVWCtsBQBzqW8DPio55dDmfqxKFdM; REC_T_ID=14d45038-03a9-11f0-877a-2a88f69ba114; _gcl_au=1.1.938658524.1742268522"
    }
    # Add more accounts as needed: one dict per account, each with a single
    # "cookie" key holding the raw Cookie header string.
]
def get_headers(shop_id=None, item_id=None):
    """Build the HTTP headers for one Shopee API request.

    The Cookie comes from a randomly chosen entry of ACCOUNTS. The Referer
    points at the product page when both IDs are truthy, otherwise at the
    site root.
    """
    picked_cookie = choice(ACCOUNTS)["cookie"]
    referer = (
        f"https://shopee.ph/product/{shop_id}/{item_id}"
        if shop_id and item_id
        else "https://shopee.ph"
    )
    return {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        "Cookie": picked_cookie,
        "X-Api-Source": "rweb",
        "X-Requested-With": "XMLHttpRequest",
        "Accept": "application/json",
        "Content-Type": "application/json",
        "X-Shopee-Language": "en",
        "Referer": referer,
        # Sent with the literal string "null", as advised.
        "af-ac-enc-dat": "null",
    }
# Model setup runs once at import time; classify_sentiment() uses `tokenizer` and `model`.
# Tokenizer is loaded from the same identifier as the adapter ("letijo03/lora-adapter-32").
tokenizer = XLMRobertaTokenizer.from_pretrained("letijo03/lora-adapter-32",use_fast=True, trust_remote_code=True)
# Base XLM-RoBERTa classifier with a 3-label head; generate_insights() reads the
# labels as 0 = Negative, 1 = Neutral, 2 = Positive.
base_model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=3)
# NOTE(review): `config` is not referenced anywhere else in the visible code.
config = PeftConfig.from_pretrained("letijo03/lora-adapter-32")
# Wrap the base model with the adapter (presumably LoRA weights, going by the name — confirm).
model = PeftModel.from_pretrained(base_model, "letijo03/lora-adapter-32")
# Put the model in evaluation mode; it is only used for inference in this file.
model.eval()
def get_ids_from_url(url):
    """Extract ``(shop_id, item_id)`` from a Shopee product URL.

    Two URL shapes are recognised, tried in this order:

    * ``...i.<shop_id>.<item_id>``
    * ``.../product/<shop_id>/<item_id>``

    Args:
        url: The product URL as a string.

    Returns:
        A ``(shop_id, item_id)`` tuple of ints.

    Raises:
        ValueError: If ``url`` is not a string or matches neither shape.
    """
    error_message = "Invalid Shopee URL format. Please use a valid Shopee product URL."
    if not isinstance(url, str):
        # A missing form field arrives as None; report it as an invalid URL
        # rather than letting re.search raise TypeError.
        raise ValueError(error_message)

    patterns = (
        r"i\.(\d+)\.(\d+)",
        r"/product/(\d+)/(\d+)",
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            shop_id, item_id = match.groups()
            return int(shop_id), int(item_id)
    raise ValueError(error_message)
def fetch_comments(shop_id, item_id, limit=50, offset=0, retries=3, timeout=30):
    """Fetch one page of ratings for a product from the Shopee ratings API.

    Args:
        shop_id: Shop identifier, used in the query string and Referer.
        item_id: Item identifier, used in the query string and Referer.
        limit: Page size requested from the API.
        offset: Index of the first rating to request.
        retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait for the server before an attempt is
            treated as failed; without it a stalled connection would block
            the caller forever.

    Returns:
        The decoded JSON payload, or ``None`` when every attempt failed.
    """
    url = f"https://shopee.ph/api/v2/item/get_ratings?itemid={item_id}&shopid={shop_id}&limit={limit}&offset={offset}"
    for attempt in range(retries):
        is_last_attempt = attempt >= retries - 1
        # get_headers picks a random account each call, so every attempt may
        # go out with a different cookie.
        headers = get_headers(shop_id, item_id)
        try:
            # The request itself is inside the try so that connection errors
            # and timeouts count as a failed attempt instead of propagating.
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 418:
                print("Received status code 418. Rotating account and waiting before retry.")
                # The long back-off is only useful if another attempt follows.
                if not is_last_attempt:
                    time.sleep(randint(15 * 60, 30 * 60))
                continue
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred: {http_err}")
        except (requests.exceptions.RequestException, ValueError) as err:
            # Network failures, timeouts, and bodies that are not valid JSON.
            print(f"An error occurred: {err}")
        if not is_last_attempt:
            time.sleep(2)
    return None
def extract_comments(data):
    """Flatten a ratings API payload into a list of comment records.

    Each rating under ``data['data']['ratings']`` becomes a dict with the keys
    ``'Username'``, ``'Rating'``, ``'Date and Time'`` (formatted
    ``YYYY-MM-DD HH:MM`` from the ``ctime`` epoch seconds) and ``'Comment'``.
    The comment text is the rating's tags rendered as ``"name: value"``, one
    per line, followed by the stripped free-text comment when it is non-empty.

    Missing or ``null`` values for ``data``, ``ratings``, ``tag_info``,
    ``comment`` and ``ctime`` are treated as empty / zero instead of raising.

    Args:
        data: Decoded JSON payload (a dict), or ``None``.

    Returns:
        A list of comment dicts; empty when the payload holds no ratings.
    """
    payload = data.get('data') if isinstance(data, dict) else None
    ratings = payload.get('ratings') if isinstance(payload, dict) else None

    comments = []
    for rating in ratings or []:
        # `or []` / `or ''` / `or 0` cover keys that are present but null.
        comment_parts = [
            f"{tag.get('tag_name', '')}: {tag.get('tag_value', '')}"
            for tag in rating.get('tag_info') or []
        ]
        main_comment = (rating.get('comment') or '').strip()
        if main_comment:
            comment_parts.append(main_comment)
        timestamp = pd.to_datetime(rating.get('ctime') or 0, unit='s')
        comments.append({
            'Username': rating.get('author_username', ''),
            'Rating': rating.get('rating_star', 0),
            'Date and Time': timestamp.strftime('%Y-%m-%d %H:%M'),
            'Comment': "\n".join(comment_parts),
        })
    return comments
def clean_data(df):
    """Return a cleaned copy of ``df`` with unusable comments removed.

    Every ``'Comment'`` value is converted to ``str`` and stripped of all
    characters other than ASCII letters, digits and whitespace. Rows whose
    comment is empty or whitespace-only afterwards are dropped.

    The input frame is left untouched, and the result is an independent copy
    so that callers can add columns to it without a SettingWithCopy warning.

    Args:
        df: DataFrame with a ``'Comment'`` column.

    Returns:
        A new DataFrame containing only rows with a non-blank cleaned comment.
    """
    cleaned = df.copy()
    cleaned['Comment'] = cleaned['Comment'].apply(
        lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', str(x))
    )
    has_text = cleaned['Comment'].str.strip() != ''
    return cleaned[has_text].copy()
def classify_sentiment(text):
    """Classify one comment and return the predicted label index.

    The text is tokenized (truncated to at most 512 tokens) and passed through
    the module-level ``model``; the index of the largest logit is returned.
    generate_insights() interprets the index as 0 = Negative, 1 = Neutral,
    2 = Positive.

    Args:
        text: The comment text to classify.

    Returns:
        The predicted label index as a Python int.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    return torch.argmax(outputs.logits, dim=-1).item()
def generate_insights(df):
    """Build one insight string per sentiment label.

    For each of Positive (2), Neutral (1) and Negative (0), the rows of ``df``
    whose ``'Sentiment'`` equals that value are selected. When there are none,
    a fixed "no significant comments" sentence is used; otherwise the non-null
    comments are handed to generate_comment_insight().

    Returns:
        Dict mapping the label name to its insight string, in the order
        Positive, Neutral, Negative.
    """
    labels_by_value = {2: 'Positive', 1: 'Neutral', 0: 'Negative'}

    def summarize(value, label):
        rows = df[df['Sentiment'] == value]
        if not len(rows):
            return f"There are no significant comments for {label.lower()} sentiment."
        return generate_comment_insight(rows['Comment'].dropna().tolist())

    return {
        label: summarize(value, label)
        for value, label in labels_by_value.items()
    }
def generate_comment_insight(comments, limit=20):
    """Join a sample of comments into a single ``<br>``-separated string.

    Args:
        comments: List of comment strings.
        limit: Maximum number of comments, taken from the front of the list,
            to include in the sample.

    Returns:
        The first ``limit`` comments joined with ``'<br>'``; an empty string
        when ``comments`` is empty.
    """
    return '<br>'.join(comments[:limit])
# Single-page UI served by index(): a URL form, a loading overlay, and
# containers for the result message, CSV download link, pie chart and insights.
# The inline script posts the form to /scrape and renders the JSON response.
# The Google Charts loader is included exactly once, and insights panels from
# an earlier submission are removed before a new request is sent.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<title>Shopee Product Comment Sentiment Analysis</title>
<style>
body {
font-family: Arial, sans-serif;
background-color: #f5f5f5;
margin: 0;
padding: 0;
color: #333;
}
header {
background-color: #FF5722;
color: white;
padding: 20px;
text-align: center;
}
header h1 {
margin: 0;
font-size: 2em;
}
main {
padding: 20px;
max-width: 900px;
margin: 0 auto;
background-color: white;
border-radius: 8px;
box-shadow: 0px 4px 6px rgba(0, 0, 0, 0.1);
}
form {
margin: 20px auto;
max-width: 400px;
display: flex;
flex-direction: column;
gap: 15px;
background-color: #f9f9f9;
padding: 20px;
border-radius: 8px;
box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.1);
}
input, button {
padding: 12px;
font-size: 1.1em;
border: 1px solid #ccc;
border-radius: 6px;
}
input { background-color: #fff; }
button {
background-color: #FF5722;
color: white;
border: none;
cursor: pointer;
transition: background-color 0.3s ease;
}
button:hover { background-color: #E64A19; }
input:focus { border-color: #FF5722; outline: none; }
.error-message { color: red; font-weight: bold; }
.success-message { color: green; font-weight: bold; }
#loadingContainer {
display: none;
flex-direction: column;
justify-content: center;
align-items: center;
font-size: 16px;
color: #FF5722;
height: 100vh;
position: fixed;
top: 0; left: 0; right: 0; bottom: 0;
background-color: rgba(255, 255, 255, 0.8);
z-index: 9999;
text-align: center;
}
.spinner {
border: 4px solid rgba(0, 0, 0, 0.1);
border-left-color: #FF5722;
border-radius: 50%;
width: 50px;
height: 50px;
animation: spin 1s linear infinite;
margin-top: 10px;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
#chartContainer {
display: flex;
justify-content: center;
align-items: center;
width: 100%;
max-width: 800px;
height: 600px;
margin: 20px auto;
}
footer {
background: linear-gradient(90deg, #ff5722, #ff7043);
color: white;
text-align: center;
padding: 1rem;
font-size: 0.9rem;
margin-top: auto;
}
.result-message {
display: flex;
flex-direction: column;
justify-content: center;
align-items: center;
text-align: center;
margin-top: 20px;
}
.download-link {
margin-top: 15px;
padding: 10px 20px;
background-color: #FF5722;
color: white;
text-decoration: none;
border-radius: 5px;
font-size: 16px;
font-weight: bold;
display: inline-block;
}
.download-link:hover { background-color: #e64a19; }
.insights {
margin-top: 2rem;
padding: 2rem;
background: white;
border-radius: 16px;
box-shadow: 0 6px 18px rgba(0, 0, 0, 0.15);
text-align: left;
overflow-y: auto;
max-height: 400px;
font-size: 1rem;
line-height: 1.5;
}
.insights h3 {
margin-bottom: 1rem;
color: #ff5722;
}
.insights ul {
list-style-type: none;
padding: 0;
}
.insights ul li {
background-color: #f1f1f1;
margin: 0.5rem 0;
padding: 1rem;
border-radius: 8px;
font-size: 1rem;
}
.insights .comment-text {
font-style: italic;
font-size: 0.9rem;
color: #555;
}
.google-visualization-title {
text-align: center;
font-size: 20px;
color: orange;
font-weight: bold;
}
</style>
<script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
<script>
google.charts.load('current', { 'packages': ['corechart', 'bar'] });
function drawPieChart(chartData) {
const data = google.visualization.arrayToDataTable(chartData);
const options = {
title: 'Sentiment Analysis Results',
titleTextStyle: {
fontSize: 35, // Font size
bold: true, // Optional: make the title bold
color: '#FF5722', // Optional: change the title color
},
titlePosition: 'center', // Centers the title
pieHole: 0.5,
is3D: true,
width: '100%',
legend: { position: 'bottom' },
backgroundColor: 'transparent',
height: 600,
slices: {
0: { color: '#4caf50' },
1: { color: '#ffc107' },
2: { color: '#f44336' }
},
pieSliceText: 'percentage',
tooltip: { trigger: 'focus' }
};
const chart = new google.visualization.PieChart(document.getElementById('chartContainer'));
chart.draw(data, options);
}
document.addEventListener("DOMContentLoaded", function () {
document.getElementById("scrapeForm").onsubmit = async function(e) {
e.preventDefault();
const url = document.getElementById("url").value;
const resultDiv = document.getElementById("result");
const downloadLinkDiv = document.getElementById("downloadLink");
const chartDiv = document.getElementById("chartContainer");
const loadingContainer = document.getElementById("loadingContainer");
loadingContainer.style.display = "flex";
resultDiv.innerHTML = "";
downloadLinkDiv.innerHTML = "";
chartDiv.innerHTML = "";
// Remove insights panels left over from a previous submission
document.querySelectorAll(".insights").forEach(function (el) { el.remove(); });
try {
const response = await fetch('/scrape', {
method: 'POST',
headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
body: new URLSearchParams({ 'url': url })
});
const data = await response.json();
loadingContainer.style.display = "none";
if (data.error) {
resultDiv.innerHTML = `<div class="result-message"><p class="error-message">${data.error}</p></div>`;
} else {
// 1. Success message
resultDiv.innerHTML = `<div class="result-message"><p class="success-message">${data.message}</p></div>`;
// 2. Download link
if (data.filename) {
downloadLinkDiv.innerHTML = `<div class="result-message"><a href="/download/${data.filename}" download class="download-link">Download CSV</a></div>`;
}
// 3. Chart
let chartData = [["Sentiment", "Count"],
["Positive", data.chart_data.Positive || 0],
["Neutral", data.chart_data.Neutral || 0],
["Negative", data.chart_data.Negative || 0]
];
google.charts.setOnLoadCallback(() => {
drawPieChart(chartData);
// 4. Insights (added after chart)
const insightsDiv = document.createElement('div');
insightsDiv.classList.add('insights');
insightsDiv.innerHTML = `
<h3>Insights</h3>
${Object.entries(data.insights).map(([sentiment, comments]) => `
<div>
<strong>${sentiment} Comments:</strong>
<ul>
${comments.split('<br>').map(comment =>
`<li><span class="comment-text">${comment}</span></li>`
).join('')}
</ul>
</div>
`).join('')}
`;
// Insert insights after chart
chartDiv.insertAdjacentElement('afterend', insightsDiv);
});
}
} catch (error) {
loadingContainer.style.display = "none";
resultDiv.innerHTML = `<p class="error-message">Error sending request: ${error.message}</p>`;
console.error('Fetch error:', error);
}
};
});
</script>
</head>
<body>
<header>
<h1>Shopee Product Comment Sentiment Analysis</h1>
</header>
<main>
<form id="scrapeForm">
<label for="url">Enter Shopee Product URL:</label>
<input type="text" id="url" name="url" placeholder="Enter the URL here" required />
<button type="submit">Generate</button>
</form>
<div id="loadingContainer">
<div class="spinner"></div>
<p>Loading...</p>
</div>
<div id="result"></div>
<div id="downloadLink"></div>
<div id="chartContainer"></div>
</main>
<footer>
<p>&copy; 2025 Shopee Sentiment Analysis. All rights reserved.</p>
</footer>
</body>
</html>
"""
@flask_app.route('/')
def index():
    """Serve the single-page UI rendered from ``html_template``."""
    page = render_template_string(html_template)
    return page
@flask_app.route('/scrape', methods=['POST'])
def scrape():
    """Scrape, classify and summarise the comments of one Shopee product.

    Reads the product URL from the ``url`` form field, pages through the
    ratings API 50 at a time, cleans the comments, classifies each one, and
    writes the result to ``static/shopee_comments_formatted.csv``.

    Returns:
        A JSON response. On success it carries ``message``, ``filename``,
        ``chart_data`` (counts per sentiment label) and ``insights``. On
        failure it carries a single ``error`` string.
    """
    # A missing form field yields None; normalise it to '' so the URL parser
    # reports an invalid URL instead of raising TypeError.
    url = (request.form.get('url') or '').strip()
    try:
        shop_id, item_id = get_ids_from_url(url)
    except ValueError as e:
        return jsonify({'error': str(e)})

    all_comments = []
    offset = 0
    limit = 50
    # NOTE(review): the loop has no page cap, so products with many ratings
    # keep this request open for a long time.
    while True:
        data = fetch_comments(shop_id, item_id, limit=limit, offset=offset)
        if data is None:
            break
        comments = extract_comments(data)
        if not comments:
            break
        all_comments.extend(comments)
        # A short page means the last page has been reached.
        if len(comments) < limit:
            break
        offset += limit
        # Random pause between pages.
        time.sleep(randint(2, 5))

    no_comments_error = {'error': 'No comments found or unable to fetch comments.'}
    if not all_comments:
        return jsonify(no_comments_error)

    # Copy so that adding the 'Sentiment' column never writes into a slice.
    df = clean_data(pd.DataFrame(all_comments)).copy()
    if df.empty:
        # Every scraped rating had no usable text (e.g. star-only reviews).
        return jsonify(no_comments_error)

    df['Sentiment'] = df['Comment'].apply(classify_sentiment)

    sentiment_counts = df['Sentiment'].value_counts()
    # int() converts numpy integers so the counts are JSON serialisable.
    chart_data_counts = {
        "Positive": int(sentiment_counts.get(2, 0)),
        "Neutral": int(sentiment_counts.get(1, 0)),
        "Negative": int(sentiment_counts.get(0, 0)),
    }
    insights = generate_insights(df)

    csv_filename = 'shopee_comments_formatted.csv'
    os.makedirs('static', exist_ok=True)
    csv_filepath = os.path.join('static', csv_filename)
    df.to_csv(csv_filepath, index=False)

    return jsonify({
        'message': 'Successfully scraped and analyzed comments.',
        'filename': csv_filename,
        'chart_data': chart_data_counts,
        'insights': insights
    })
# Route that serves generated files from the "static" folder as downloads.
@flask_app.route('/download/<path:filename>')
def download(filename):
    """Send a file from the ``static`` folder as an attachment.

    Args:
        filename: Path of the requested file, relative to ``static``.

    Returns:
        A file response with ``as_attachment=True``.
    """
    # scrape() writes the CSV to "static" relative to the working directory,
    # while Flask resolves a relative directory against the application root
    # path. Passing an absolute path makes both refer to the same folder.
    static_dir = os.path.abspath('static')
    return send_from_directory(static_dir, filename, as_attachment=True)
# Wrap the Flask (WSGI) application in an ASGI adapter; `app` is the object
# handed to uvicorn below.
from asgiref.wsgi import WsgiToAsgi
app = WsgiToAsgi(flask_app)
if __name__ == '__main__':
    # uvicorn is imported only when this file is run as a script.
    import uvicorn
    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))