Spaces:
Sleeping
Sleeping
| from flask import Flask, request, render_template_string, jsonify, send_from_directory | |
| import requests | |
| import pandas as pd | |
| import re | |
| import time | |
| from random import randint, choice | |
| import os | |
| from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer | |
| from peft import PeftModel, PeftConfig # Ensure peft library is installed | |
| import torch | |
| from collections import defaultdict | |
# Flask (WSGI) application instance for this service.
flask_app = Flask(__name__)
# Pool of account credentials used when building request headers. Each entry
# carries only the raw Cookie header value for one account; add further cookie
# strings to the tuple below as needed.
# NOTE(review): these appear to be session cookies committed to source —
# consider supplying them through the environment instead.
ACCOUNTS = [
    {"cookie": cookie_value}
    for cookie_value in (
        "SPC_F=hmR34sqS9gRUgA35BL857RAqy0Hn0sU8; REC_T_ID=64c7c97b-cc02-11ef-bac2-8a756e8ab50a; _gcl_au=1.1.834093345.1744027851",
        "SPC_F=6nZYVWCtsBQBzqW8DPio55dDmfqxKFdM; REC_T_ID=14d45038-03a9-11f0-877a-2a88f69ba114; _gcl_au=1.1.938658524.1742268522",
    )
]
def get_headers(shop_id=None, item_id=None):
    """Build request headers using the cookie of a randomly chosen account.

    The Referer points at the product page when both ``shop_id`` and
    ``item_id`` are truthy, and at the site root otherwise.
    """
    referer = (
        f"https://shopee.ph/product/{shop_id}/{item_id}"
        if shop_id and item_id
        else "https://shopee.ph"
    )
    return {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        "Cookie": choice(ACCOUNTS)["cookie"],
        "X-Api-Source": "rweb",
        "X-Requested-With": "XMLHttpRequest",
        "Accept": "application/json",
        "Content-Type": "application/json",
        "X-Shopee-Language": "en",
        "Referer": referer,
        # Sent with the literal string "null", as in the original header set.
        "af-ac-enc-dat": "null",
    }
# Hub repository id used for both the tokenizer files and the PEFT adapter.
_ADAPTER_REPO = "letijo03/lora-adapter-32"

# Tokenizer is loaded from the adapter repository.
tokenizer = XLMRobertaTokenizer.from_pretrained(
    _ADAPTER_REPO,
    use_fast=True,
    trust_remote_code=True,
)

# Base XLM-RoBERTa classifier with a 3-label head; the adapter is applied on top.
base_model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=3,
)
config = PeftConfig.from_pretrained(_ADAPTER_REPO)
model = PeftModel.from_pretrained(base_model, _ADAPTER_REPO)

# Switch the combined model to evaluation mode.
model.eval()
def get_ids_from_url(url):
    """Extract ``(shop_id, item_id)`` from a Shopee product URL.

    Two URL shapes are recognised, tried in this order:
    ``...i.<shop_id>.<item_id>`` and ``.../product/<shop_id>/<item_id>``.

    Args:
        url: Product URL as a string.

    Returns:
        A ``(shop_id, item_id)`` tuple of ints.

    Raises:
        ValueError: If ``url`` is not a string or matches neither shape.
    """
    # A missing form field arrives as None; report it as an invalid URL
    # instead of letting re.search raise TypeError.
    if isinstance(url, str):
        for pattern in (r"i\.(\d+)\.(\d+)", r"/product/(\d+)/(\d+)"):
            match = re.search(pattern, url)
            if match:
                return int(match.group(1)), int(match.group(2))
    raise ValueError("Invalid Shopee URL format. Please use a valid Shopee product URL.")
def fetch_comments(shop_id, item_id, limit=50, offset=0, retries=3, timeout=30):
    """Fetch one page of ratings for a product, retrying on failure.

    Args:
        shop_id: Shop identifier.
        item_id: Item identifier.
        limit: Page size requested from the endpoint.
        offset: Index of the first rating to return.
        retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The decoded JSON payload, or ``None`` when every attempt failed.
    """
    url = "https://shopee.ph/api/v2/item/get_ratings"
    params = {"itemid": item_id, "shopid": shop_id, "limit": limit, "offset": offset}

    for attempt in range(retries):
        is_last_attempt = attempt >= retries - 1
        # Headers are rebuilt per attempt so a different account may be picked.
        headers = get_headers(shop_id, item_id)
        try:
            # The request itself is inside the try so connection errors and
            # timeouts are retried instead of escaping to the caller.
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
            if response.status_code == 418:
                print("Received status code 418. Rotating account and waiting before retry.")
                # No point in the long cool-down when no retry will follow.
                if not is_last_attempt:
                    time.sleep(randint(15 * 60, 30 * 60))
                continue
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred: {http_err}")
        except (requests.exceptions.RequestException, ValueError) as err:
            # ValueError covers a body that is not valid JSON.
            print(f"An error occurred: {err}")
        if not is_last_attempt:
            time.sleep(2)
    return None
def extract_comments(data):
    """Flatten a ratings payload into a list of comment records.

    Args:
        data: Decoded response; ratings are read from ``data['data']['ratings']``.

    Returns:
        A list of dicts with the keys ``'Username'``, ``'Rating'``,
        ``'Date and Time'`` (``ctime`` read as epoch seconds and formatted as
        ``YYYY-MM-DD HH:MM``) and ``'Comment'`` (tag lines followed by the
        stripped free-text comment, newline-joined). Returns an empty list when
        the payload is missing, malformed, or contains no ratings.
    """
    payload = data.get('data') if isinstance(data, dict) else None
    ratings = payload.get('ratings') if isinstance(payload, dict) else None

    comments = []
    # ``or`` fallbacks below guard against keys that are present but null.
    for rating in ratings or []:
        comment_parts = [
            f"{tag.get('tag_name', '')}: {tag.get('tag_value', '')}"
            for tag in rating.get('tag_info') or []
        ]
        main_comment = (rating.get('comment') or '').strip()
        if main_comment:
            comment_parts.append(main_comment)

        timestamp = pd.to_datetime(rating.get('ctime') or 0, unit='s')
        comments.append({
            'Username': rating.get('author_username', ''),
            'Rating': rating.get('rating_star', 0),
            'Date and Time': timestamp.strftime('%Y-%m-%d %H:%M'),
            'Comment': "\n".join(comment_parts),
        })
    return comments
def clean_data(df):
    """Return a cleaned copy of ``df`` with unusable comments removed.

    Every character in the ``'Comment'`` column other than ASCII letters,
    digits and whitespace is stripped, then rows whose comment is empty or
    whitespace-only are dropped.

    Args:
        df: DataFrame with a ``'Comment'`` column. It is not modified.

    Returns:
        A new, independent DataFrame, so callers can add columns to it
        without touching ``df`` or hitting chained-assignment warnings.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    cleaned = df.copy()
    cleaned['Comment'] = cleaned['Comment'].map(lambda value: disallowed.sub('', str(value)))
    return cleaned[cleaned['Comment'].str.strip() != ''].copy()
def classify_sentiment(text):
    """Classify one piece of text and return the predicted class index.

    The text is tokenized (truncated to 512 tokens), run through the
    module-level ``model``, and the index of the largest logit is returned as
    a plain int.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: skip autograd bookkeeping so no graph is kept per call.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1)
    return prediction.item()
def generate_insights(df):
    """Build one insight string per sentiment label.

    Rows are grouped by the ``'Sentiment'`` column (2, 1, 0 mapped to
    'Positive', 'Neutral', 'Negative'). A label with no rows gets a fixed
    placeholder sentence; otherwise its non-null comments are passed to
    ``generate_comment_insight``.
    """
    labels_by_class = {2: 'Positive', 1: 'Neutral', 0: 'Negative'}
    summary = {}
    for class_id, label in labels_by_class.items():
        matching = df[df['Sentiment'] == class_id]
        if len(matching) > 0:
            texts = matching['Comment'].dropna().tolist()
            summary[label] = generate_comment_insight(texts)
            continue
        summary[label] = f"There are no significant comments for {label.lower()} sentiment."
    return summary
def generate_comment_insight(comments, max_comments=20):
    """Return a sample of comments joined with ``<br>`` separators.

    Args:
        comments: List of comment strings.
        max_comments: How many leading comments to include (default 20).

    Returns:
        The first ``max_comments`` comments joined by ``'<br>'``; an empty
        string when ``comments`` is empty.
    """
    return '<br>'.join(comments[:max_comments])
# Single-page UI: a URL form, a loading overlay, a Google Charts pie chart and
# an insights panel. The inline script POSTs the URL to /scrape and renders
# the JSON response (message, download link, chart, insights).
# The chart loader script is included once, and insights panels from a
# previous submission are removed before a new request is made.
# NOTE(review): the chart options set both pieHole and is3D; Google Charts
# documents these as not combinable — confirm which look is intended.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
  <title>Shopee Product Comment Sentiment Analysis</title>
  <style>
    body {
      font-family: Arial, sans-serif;
      background-color: #f5f5f5;
      margin: 0;
      padding: 0;
      color: #333;
    }
    header {
      background-color: #FF5722;
      color: white;
      padding: 20px;
      text-align: center;
    }
    header h1 {
      margin: 0;
      font-size: 2em;
    }
    main {
      padding: 20px;
      max-width: 900px;
      margin: 0 auto;
      background-color: white;
      border-radius: 8px;
      box-shadow: 0px 4px 6px rgba(0, 0, 0, 0.1);
    }
    form {
      margin: 20px auto;
      max-width: 400px;
      display: flex;
      flex-direction: column;
      gap: 15px;
      background-color: #f9f9f9;
      padding: 20px;
      border-radius: 8px;
      box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.1);
    }
    input, button {
      padding: 12px;
      font-size: 1.1em;
      border: 1px solid #ccc;
      border-radius: 6px;
    }
    input { background-color: #fff; }
    button {
      background-color: #FF5722;
      color: white;
      border: none;
      cursor: pointer;
      transition: background-color 0.3s ease;
    }
    button:hover { background-color: #E64A19; }
    input:focus { border-color: #FF5722; outline: none; }
    .error-message { color: red; font-weight: bold; }
    .success-message { color: green; font-weight: bold; }
    #loadingContainer {
      display: none;
      flex-direction: column;
      justify-content: center;
      align-items: center;
      font-size: 16px;
      color: #FF5722;
      height: 100vh;
      position: fixed;
      top: 0; left: 0; right: 0; bottom: 0;
      background-color: rgba(255, 255, 255, 0.8);
      z-index: 9999;
      text-align: center;
    }
    .spinner {
      border: 4px solid rgba(0, 0, 0, 0.1);
      border-left-color: #FF5722;
      border-radius: 50%;
      width: 50px;
      height: 50px;
      animation: spin 1s linear infinite;
      margin-top: 10px;
    }
    @keyframes spin {
      to { transform: rotate(360deg); }
    }
    #chartContainer {
      display: flex;
      justify-content: center;
      align-items: center;
      width: 100%;
      max-width: 800px;
      height: 600px;
      margin: 20px auto;
    }
    footer {
      background: linear-gradient(90deg, #ff5722, #ff7043);
      color: white;
      text-align: center;
      padding: 1rem;
      font-size: 0.9rem;
      margin-top: auto;
    }
    .result-message {
      display: flex;
      flex-direction: column;
      justify-content: center;
      align-items: center;
      text-align: center;
      margin-top: 20px;
    }
    .download-link {
      margin-top: 15px;
      padding: 10px 20px;
      background-color: #FF5722;
      color: white;
      text-decoration: none;
      border-radius: 5px;
      font-size: 16px;
      font-weight: bold;
      display: inline-block;
    }
    .download-link:hover { background-color: #e64a19; }
    .insights {
      margin-top: 2rem;
      padding: 2rem;
      background: white;
      border-radius: 16px;
      box-shadow: 0 6px 18px rgba(0, 0, 0, 0.15);
      text-align: left;
      overflow-y: auto;
      max-height: 400px;
      font-size: 1rem;
      line-height: 1.5;
    }
    .insights h3 {
      margin-bottom: 1rem;
      color: #ff5722;
    }
    .insights ul {
      list-style-type: none;
      padding: 0;
    }
    .insights ul li {
      background-color: #f1f1f1;
      margin: 0.5rem 0;
      padding: 1rem;
      border-radius: 8px;
      font-size: 1rem;
    }
    .insights .comment-text {
      font-style: italic;
      font-size: 0.9rem;
      color: #555;
    }
    .google-visualization-title {
      text-align: center;
      font-size: 20px;
      color: orange;
      font-weight: bold;
    }
  </style>
  <script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
  <script>
    google.charts.load('current', { 'packages': ['corechart', 'bar'] });
    function drawPieChart(chartData) {
      const data = google.visualization.arrayToDataTable(chartData);
      const options = {
        title: 'Sentiment Analysis Results',
        titleTextStyle: {
          fontSize: 35, // Font size
          bold: true, // Optional: make the title bold
          color: '#FF5722', // Optional: change the title color
        },
        titlePosition: 'center', // Centers the title
        pieHole: 0.5,
        is3D: true,
        width: '100%',
        legend: { position: 'bottom' },
        backgroundColor: 'transparent',
        height: 600,
        slices: {
          0: { color: '#4caf50' },
          1: { color: '#ffc107' },
          2: { color: '#f44336' }
        },
        pieSliceText: 'percentage',
        tooltip: { trigger: 'focus' }
      };
      const chart = new google.visualization.PieChart(document.getElementById('chartContainer'));
      chart.draw(data, options);
    }
    document.addEventListener("DOMContentLoaded", function () {
      document.getElementById("scrapeForm").onsubmit = async function(e) {
        e.preventDefault();
        const url = document.getElementById("url").value;
        const resultDiv = document.getElementById("result");
        const downloadLinkDiv = document.getElementById("downloadLink");
        const chartDiv = document.getElementById("chartContainer");
        const loadingContainer = document.getElementById("loadingContainer");
        loadingContainer.style.display = "flex";
        resultDiv.innerHTML = "";
        downloadLinkDiv.innerHTML = "";
        chartDiv.innerHTML = "";
        // Remove insights left over from a previous run so they do not pile up.
        document.querySelectorAll('.insights').forEach((el) => el.remove());
        try {
          const response = await fetch('/scrape', {
            method: 'POST',
            headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
            body: new URLSearchParams({ 'url': url })
          });
          const data = await response.json();
          loadingContainer.style.display = "none";
          if (data.error) {
            resultDiv.innerHTML = `<div class="result-message"><p class="error-message">${data.error}</p></div>`;
          } else {
            // 1. Success message
            resultDiv.innerHTML = `<div class="result-message"><p class="success-message">${data.message}</p></div>`;
            // 2. Download link
            if (data.filename) {
              downloadLinkDiv.innerHTML = `<div class="result-message"><a href="/download/${data.filename}" download class="download-link">Download CSV</a></div>`;
            }
            // 3. Chart
            let chartData = [["Sentiment", "Count"],
              ["Positive", data.chart_data.Positive || 0],
              ["Neutral", data.chart_data.Neutral || 0],
              ["Negative", data.chart_data.Negative || 0]
            ];
            google.charts.setOnLoadCallback(() => {
              drawPieChart(chartData);
              // 4. Insights (added after chart)
              const insightsDiv = document.createElement('div');
              insightsDiv.classList.add('insights');
              insightsDiv.innerHTML = `
                <h3>Insights</h3>
                ${Object.entries(data.insights).map(([sentiment, comments]) => `
                  <div>
                    <strong>${sentiment} Comments:</strong>
                    <ul>
                      ${comments.split('<br>').map(comment =>
                        `<li><span class="comment-text">${comment}</span></li>`
                      ).join('')}
                    </ul>
                  </div>
                `).join('')}
              `;
              // Insert insights after chart
              chartDiv.insertAdjacentElement('afterend', insightsDiv);
            });
          }
        } catch (error) {
          loadingContainer.style.display = "none";
          resultDiv.innerHTML = `<p class="error-message">Error sending request: ${error.message}</p>`;
          console.error('Fetch error:', error);
        }
      };
    });
  </script>
</head>
<body>
  <header>
    <h1>Shopee Product Comment Sentiment Analysis</h1>
  </header>
  <main>
    <form id="scrapeForm">
      <label for="url">Enter Shopee Product URL:</label>
      <input type="text" id="url" name="url" placeholder="Enter the URL here" required />
      <button type="submit">Generate</button>
    </form>
    <div id="loadingContainer">
      <div class="spinner"></div>
      <p>Loading...</p>
    </div>
    <div id="result"></div>
    <div id="downloadLink"></div>
    <div id="chartContainer"></div>
  </main>
  <footer>
    <p>© 2025 Shopee Sentiment Analysis. All rights reserved.</p>
  </footer>
</body>
</html>
"""
# Registered explicitly: without a route the page is never reachable.
@flask_app.route('/')
def index():
    """Serve the single-page UI rendered from ``html_template``."""
    return render_template_string(html_template)
def _collect_all_comments(shop_id, item_id, limit=50):
    """Page through the ratings endpoint and return every extracted comment."""
    collected = []
    offset = 0
    while True:
        data = fetch_comments(shop_id, item_id, limit=limit, offset=offset)
        if data is None:
            break
        comments = extract_comments(data)
        if not comments:
            break
        collected.extend(comments)
        # A short page means the last page has been reached.
        if len(comments) < limit:
            break
        offset += limit
        # Random pause between pages to throttle requests.
        time.sleep(randint(2, 5))
    return collected


# Registered at the path the page's form script POSTs to.
@flask_app.route('/scrape', methods=['POST'])
def scrape():
    """Scrape a product's comments, classify them and return a JSON summary.

    Reads the ``url`` form field. On success the JSON response carries
    ``message``, ``filename`` (CSV written to the app's ``static`` folder),
    ``chart_data`` (counts per sentiment label) and ``insights``. On failure
    it carries a single ``error`` string.
    """
    no_comments_error = {'error': 'No comments found or unable to fetch comments.'}

    url = request.form.get('url')
    if not url:
        # Missing/empty field: answer with the same message as an unparsable URL.
        return jsonify({'error': 'Invalid Shopee URL format. Please use a valid Shopee product URL.'})
    try:
        shop_id, item_id = get_ids_from_url(url)
    except ValueError as e:
        return jsonify({'error': str(e)})

    all_comments = _collect_all_comments(shop_id, item_id)
    if not all_comments:
        return jsonify(no_comments_error)

    df = clean_data(pd.DataFrame(all_comments))
    # Cleaning can drop every row (e.g. ratings with no usable text).
    if df.empty:
        return jsonify(no_comments_error)

    # assign() returns a new frame, so no column is set on a filtered slice.
    df = df.assign(Sentiment=df['Comment'].apply(classify_sentiment))

    sentiment_counts = df['Sentiment'].value_counts()
    chart_data_counts = {
        # int() keeps the values JSON-serializable plain ints.
        label: int(sentiment_counts.get(class_id, 0))
        for label, class_id in (("Positive", 2), ("Neutral", 1), ("Negative", 0))
    }
    insights = generate_insights(df)

    # Per-product filename so concurrent requests for different products do
    # not overwrite each other's export.
    csv_filename = f'shopee_comments_{shop_id}_{item_id}.csv'
    # Write under the application's root path, which is where Flask resolves
    # a relative 'static' directory when serving files.
    static_dir = os.path.join(flask_app.root_path, 'static')
    os.makedirs(static_dir, exist_ok=True)
    df.to_csv(os.path.join(static_dir, csv_filename), index=False)

    return jsonify({
        'message': 'Successfully scraped and analyzed comments.',
        'filename': csv_filename,
        'chart_data': chart_data_counts,
        'insights': insights
    })
# Route serving download requests from the "static" folder; the path matches
# the "/download/<filename>" link the page builds for the CSV export.
@flask_app.route('/download/<filename>')
def download(filename):
    """Send ``filename`` from the ``static`` folder as an attachment."""
    return send_from_directory('static', filename, as_attachment=True)
from asgiref.wsgi import WsgiToAsgi

# ASGI-facing wrapper around the Flask (WSGI) application.
app = WsgiToAsgi(flask_app)

if __name__ == '__main__':
    import uvicorn

    # Listen on the port given by the PORT environment variable, else 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)