"""Flask app: scrape Shopee product reviews and classify their sentiment.

Pipeline: parse a Shopee product URL -> page through the public ratings API
(rotating account cookies to dodge throttling) -> clean the comment text ->
classify each comment with an XLM-RoBERTa base model + LoRA adapter
(3 labels: 0=Negative, 1=Neutral, 2=Positive) -> serve a summary page,
a JSON result, and a downloadable CSV.
"""

from flask import Flask, request, render_template_string, jsonify, send_from_directory
import requests
import pandas as pd
import re
import time
from random import randint, choice
import os
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from peft import PeftModel, PeftConfig  # Ensure peft library is installed
import torch
from collections import defaultdict

flask_app = Flask(__name__)

# List of account credentials (only cookies shown here).
# NOTE(review): hard-coded session cookies are credentials checked into source.
# Consider loading them from environment variables or a secrets store, and be
# aware they expire — rotate them when requests start failing with 4xx.
ACCOUNTS = [
    {
        "cookie": "SPC_F=hmR34sqS9gRUgA35BL857RAqy0Hn0sU8; REC_T_ID=64c7c97b-cc02-11ef-bac2-8a756e8ab50a; _gcl_au=1.1.834093345.1744027851"
    },
    {
        "cookie": "SPC_F=6nZYVWCtsBQBzqW8DPio55dDmfqxKFdM; REC_T_ID=14d45038-03a9-11f0-877a-2a88f69ba114; _gcl_au=1.1.938658524.1742268522"
    },
    # Add more accounts as needed.
]


def get_headers(shop_id=None, item_id=None):
    """Randomly pick an account and return a header set with a dynamic Referer.

    Args:
        shop_id: Shopee shop id used to build a product-page Referer, or None.
        item_id: Shopee item id used to build a product-page Referer, or None.

    Returns:
        dict of HTTP headers mimicking the Shopee mobile web client, with the
        chosen account's cookie attached.
    """
    account = choice(ACCOUNTS)
    if shop_id and item_id:
        referer = f"https://shopee.ph/product/{shop_id}/{item_id}"
    else:
        referer = "https://shopee.ph"
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        "Cookie": account["cookie"],
        "X-Api-Source": "rweb",
        "X-Requested-With": "XMLHttpRequest",
        "Accept": "application/json",
        "Content-Type": "application/json",
        "X-Shopee-Language": "en",
        "Referer": referer,
        "af-ac-enc-dat": "null",  # Adding the 'af-ac-enc-dat' header as advised
    }
    return headers


# Load the tokenizer and the base XLM-RoBERTa model with the correct number of
# labels (3 labels for classification), then attach the LoRA adapter weights.
# NOTE: these calls download from the HuggingFace Hub on first run.
tokenizer = XLMRobertaTokenizer.from_pretrained(
    "letijo03/lora-adapter-32", use_fast=True, trust_remote_code=True
)
base_model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base", num_labels=3
)
config = PeftConfig.from_pretrained("letijo03/lora-adapter-32")
model = PeftModel.from_pretrained(base_model, "letijo03/lora-adapter-32")
model.eval()  # inference only — disable dropout etc.


def get_ids_from_url(url):
    """Extract (shop_id, item_id) from a Shopee product URL.

    Supports both the ``...-i.<shop>.<item>`` slug form and the
    ``/product/<shop>/<item>`` path form.

    Raises:
        ValueError: if neither pattern matches.
    """
    patterns = [
        r"i\.(\d+)\.(\d+)",
        r"/product/(\d+)/(\d+)",
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return int(match.group(1)), int(match.group(2))
    raise ValueError("Invalid Shopee URL format. Please use a valid Shopee product URL.")


def fetch_comments(shop_id, item_id, limit=50, offset=0, retries=3):
    """Fetch one page of ratings from Shopee's public API.

    Rotates accounts per attempt; on HTTP 418 (anti-bot) sleeps 15-30 minutes
    before retrying. Returns the parsed JSON payload, or None if every attempt
    failed.
    """
    url = (
        f"https://shopee.ph/api/v2/item/get_ratings"
        f"?itemid={item_id}&shopid={shop_id}&limit={limit}&offset={offset}"
    )
    for attempt in range(retries):
        headers = get_headers(shop_id, item_id)
        # Fix: add a timeout so a stalled connection can't hang the scrape forever.
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code == 418:
            print("Received status code 418. Rotating account and waiting before retry.")
            time.sleep(randint(15 * 60, 30 * 60))
            continue
        try:
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred: {http_err}")
            if attempt < retries - 1:
                time.sleep(2)
        except Exception as err:
            print(f"An error occurred: {err}")
            if attempt < retries - 1:
                time.sleep(2)
    return None


def extract_comments(data):
    """Flatten a ratings API payload into a list of comment dicts.

    Each dict has keys: 'Username', 'Rating', 'Date and Time', 'Comment'.
    Tag_info entries ("tag_name: tag_value") are prepended to the free-text
    comment, joined with newlines. Returns [] for a missing/empty payload.
    """
    comments = []
    if data and 'data' in data and 'ratings' in data['data']:
        for rating in data['data']['ratings']:
            comment_parts = []
            if 'tag_info' in rating:
                for tag in rating['tag_info']:
                    tag_text = f"{tag.get('tag_name', '')}: {tag.get('tag_value', '')}"
                    comment_parts.append(tag_text)
            main_comment = rating.get('comment', '').strip()
            if main_comment:
                comment_parts.append(main_comment)
            full_comment = "\n".join(comment_parts)
            comment = {
                'Username': rating.get('author_username', ''),
                'Rating': rating.get('rating_star', 0),
                'Date and Time': pd.to_datetime(
                    rating.get('ctime', 0), unit='s'
                ).strftime('%Y-%m-%d %H:%M'),
                'Comment': full_comment,
            }
            comments.append(comment)
    return comments


def clean_data(df):
    """Strip non-alphanumeric characters from comments and drop empty rows."""
    df['Comment'] = df['Comment'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', str(x)))
    df = df[df['Comment'].str.strip() != '']
    return df


def classify_sentiment(text):
    """Classify one comment: returns 0 (Negative), 1 (Neutral) or 2 (Positive)."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Fix: inference only — no_grad avoids building an autograd graph per call,
    # which otherwise wastes memory and slows classification down.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1)
    return prediction.item()


def generate_insights(df):
    """Build a {sentiment_label: insight_text} dict from classified comments."""
    insights = {}
    sentiment_mapping = {2: 'Positive', 1: 'Neutral', 0: 'Negative'}
    for sentiment_value, sentiment_label in sentiment_mapping.items():
        subset = df[df['Sentiment'] == sentiment_value]
        count = len(subset)
        if count == 0:
            insights[sentiment_label] = (
                f"There are no significant comments for {sentiment_label.lower()} sentiment."
            )
        else:
            comments = subset['Comment'].dropna().tolist()
            insights[sentiment_label] = generate_comment_insight(comments)
    return insights


def generate_comment_insight(comments):
    """Return a newline-joined sample of up to the first 20 comments."""
    # Adjust the slice size to change how many comments are sampled.
    return '\n'.join(comments[:20])


# NOTE(review): the original HTML markup and client-side JS of this template
# were lost (only the text nodes survived a tag-stripping extraction). The page
# below is a minimal reconstruction consistent with the /scrape JSON contract
# (POST form field 'url'; response keys: message, filename, chart_data,
# insights, or error). Restore the original template if it can be recovered.
html_template = """
<!DOCTYPE html>
<html>
<head>
  <title>Shopee Product Comment Sentiment Analysis</title>
</head>
<body>
  <h1>Shopee Product Comment Sentiment Analysis</h1>
  <form id="scrape-form">
    <input type="text" name="url" size="60" placeholder="Paste a Shopee product URL">
    <button type="submit">Analyze</button>
  </form>
  <div id="status" style="display:none">Loading...</div>
  <pre id="results"></pre>
  <script>
    document.getElementById('scrape-form').addEventListener('submit', async function (e) {
      e.preventDefault();
      var status = document.getElementById('status');
      status.style.display = 'block';
      var resp = await fetch('/scrape', { method: 'POST', body: new FormData(e.target) });
      var data = await resp.json();
      status.style.display = 'none';
      var out = JSON.stringify(data, null, 2);
      if (data.filename) {
        out += '\\n\\nDownload: /download/' + data.filename;
      }
      document.getElementById('results').textContent = out;
    });
  </script>
</body>
</html>
"""


@flask_app.route('/')
def index():
    """Serve the single-page UI."""
    return render_template_string(html_template)


@flask_app.route('/scrape', methods=['POST'])
def scrape():
    """Scrape all comments for the posted URL, classify them, and return JSON.

    Pages through the ratings API (50 per page, polite random sleep between
    pages), cleans and classifies the comments, writes the full result to
    static/shopee_comments_formatted.csv, and returns counts + insights.
    """
    url = request.form.get('url')
    try:
        shop_id, item_id = get_ids_from_url(url)
    except ValueError as e:
        return jsonify({'error': str(e)})

    all_comments = []
    offset = 0
    limit = 50
    while True:
        data = fetch_comments(shop_id, item_id, limit=limit, offset=offset)
        if data is None:
            break
        comments = extract_comments(data)
        if not comments:
            break
        all_comments.extend(comments)
        if len(comments) < limit:
            # Short page => last page; stop without an extra empty request.
            break
        offset += limit
        time.sleep(randint(2, 5))

    if all_comments:
        df = pd.DataFrame(all_comments)
        df = clean_data(df)
        df['Sentiment'] = df['Comment'].apply(classify_sentiment)
        positive_count = len(df[df['Sentiment'] == 2])
        neutral_count = len(df[df['Sentiment'] == 1])
        negative_count = len(df[df['Sentiment'] == 0])
        chart_data_counts = {
            "Positive": positive_count,
            "Neutral": neutral_count,
            "Negative": negative_count,
        }
        insights = generate_insights(df)
        csv_filename = 'shopee_comments_formatted.csv'
        os.makedirs('static', exist_ok=True)
        csv_filepath = os.path.join('static', csv_filename)
        df.to_csv(csv_filepath, index=False)
        return jsonify({
            'message': 'Successfully scraped and analyzed comments.',
            'filename': csv_filename,
            'chart_data': chart_data_counts,
            'insights': insights,
        })
    else:
        return jsonify({'error': 'No comments found or unable to fetch comments.'})


# New route to serve download requests from the "static" folder.
# Fix: the route needs a <filename> URL converter — without it Flask can never
# bind the required `filename` argument and the view raises on every request.
@flask_app.route('/download/<filename>')
def download(filename):
    """Serve a previously generated CSV as an attachment."""
    return send_from_directory('static', filename, as_attachment=True)


from asgiref.wsgi import WsgiToAsgi

# ASGI-compatible wrapper so the app can be served by uvicorn.
app = WsgiToAsgi(flask_app)

if __name__ == '__main__':
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))