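"""Feature-engineering pipeline for short-video account data.

`Preprocessor` wraps a pandas DataFrame of posts (one row per post, keyed by the
author's `sec_id`) and derives per-post and per-user signals: calendar features of
the creation time, posting frequency, topic diversity, post length, VADER sentiment,
lexical diversity and similarity, inter-post timing, and an optional heuristic
`is_fake` label. `run_pipeline()` applies the steps in order and returns the
transformed DataFrame.
"""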
from loguru import logger
import pandas as pd
import json
from datetime import datetime
import ast
import numpy as np
from pymongo import MongoClient
from collections import defaultdict
from tqdm import tqdm
import time
import requests
import os
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel
import torch
from scipy.stats import zscore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
class Preprocessor:
    def __init__(self, df):
        self.df = df

    def create_columns(self):
        # Combined text used later for topic/lexical features.
        self.df["topic_creation_text"] = self.df["desc"] + self.df["title"] + self.df["share_title"]
        # Expand the Unix creation timestamp into calendar features.
        self.df['create_time_dt'] = pd.to_datetime(self.df['create_time'], unit='s')
        self.df['create_year'] = self.df['create_time_dt'].dt.year
        self.df['create_month'] = self.df['create_time_dt'].dt.month
        self.df['create_day'] = self.df['create_time_dt'].dt.day
        self.df['create_hour'] = self.df['create_time_dt'].dt.hour
        self.df['create_day_of_week'] = self.df['create_time_dt'].dt.dayofweek
        self.df['create_is_weekend'] = self.df['create_day_of_week'].isin([5, 6]).astype(int)
        # Days since the post was created, relative to a fixed snapshot date of the dataset.
        self.df['create_days_since_creation'] = (pd.Timestamp('2025-04-27') - self.df['create_time_dt']).dt.days
        self.df.drop(["create_time"], axis=1, inplace=True)
    def average_posting_freq_per_day(self):
        # Posts per day for each row's author, using the author's birthday as the start
        # of the observation window and a fixed snapshot date as its end.
        posts_per_user = []
        for i in range(len(self.df)):
            try:
                days = (pd.to_datetime("2025-05-11") - pd.to_datetime(self.df["birthday"].iloc[i])).days
                user = self.df["sec_id"].iloc[i]
                posts = len(self.df[self.df["sec_id"] == user])
                posts_per_user.append(posts / days if days > 0 else 0)
            except (ValueError, TypeError):
                logger.warning("Could not parse birthday for row {}", i)
                posts_per_user.append(None)
        self.df["posts_per_user"] = posts_per_user
    def diversity_topics_about(self):
        # Number of distinct topics a user posts about, scaled by the assumed
        # total number of topics in the catalogue (150).
        self.df["diversity_of_topics"] = self.df.groupby('sec_id')['topic'].transform('nunique') / 150
    def avg_post_length(self):
        # Mean description length (in characters) per user, mapped back onto every row.
        desc_len = self.df["desc"].fillna("").astype(str).str.len()
        self.df["avg_post_length"] = desc_len.groupby(self.df["sec_id"]).transform("mean")
    def sentiment_profile_of_posts(self):
        nltk.download('vader_lexicon')
        try:
            # Initialize VADER sentiment analyzer
            sia = SentimentIntensityAnalyzer()
            # Calculate the average compound sentiment score per user
            self.df["avg_sentiment_score"] = self.df.groupby('sec_id')['desc'].transform(
                lambda x: x.apply(lambda text: sia.polarity_scores(text)['compound']).mean())
        except Exception as e:
            logger.info(f"ERROR AS {e}")

    def get_words(self, text):
        if pd.isna(text):
            return []
        # Convert to lowercase, remove punctuation, split into words
        text = re.sub(r'[^\w\s]', '', str(text).lower())
        return text.split()
    def lexical_diversity(self):
        # Ratio of unique words to total words across all of a user's descriptions.
        def diversity(texts):
            words = [word for text in texts for word in self.get_words(text)]
            return len(set(words)) / len(words) if words else 0
        self.df['lexical_diversity'] = self.df.groupby('sec_id')['desc'].transform(diversity)
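    # Example: a user whose two descriptions tokenize to ["good", "food"] and
    # ["good", "vibes", "good"] has 3 unique words out of 5 total, so every row
    # for that user gets lexical_diversity = 3 / 5 = 0.6.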
    def popularity(self):
        # Total number of posts per user.
        self.df['popularity'] = self.df.groupby('sec_id')['sec_id'].transform('count')

    def sentiment_distribution(self):
        nltk.download('vader_lexicon')
        sia = SentimentIntensityAnalyzer()
        self.df['avg_sentiment_score'] = self.df.groupby('sec_id')['desc'].transform(
            lambda x: x.apply(lambda text: sia.polarity_scores(str(text))['compound'] if not pd.isna(text) else 0).mean()
        )
        self.df['sentiment_variance'] = self.df.groupby('sec_id')['desc'].transform(
            lambda x: x.apply(lambda text: sia.polarity_scores(str(text))['compound'] if not pd.isna(text) else 0).var(ddof=1) if len(x) > 1 else 0
        )
    def post_length(self):
        # Length of each description in characters.
        self.df["post_length"] = self.df["desc"].fillna("").astype(str).str.len()

    def sentiment_scores_post(self):
        sia = SentimentIntensityAnalyzer()
        self.df['sentiment_score'] = self.df['desc'].apply(
            lambda x: sia.polarity_scores(str(x))['compound'] if not pd.isna(x) else 0)

    def timing_post(self):
        # Seconds elapsed since the author's previous post (0 for the first post).
        self.df = self.df.sort_values(['sec_id', 'create_time_dt'])
        self.df['time_since_prev_post'] = self.df.groupby('sec_id')['create_time_dt'].diff().dt.total_seconds().fillna(0)
    def compute_lexical_similarity(self, group):
        # For each post, the mean TF-IDF cosine similarity to the author's earlier posts.
        group = group.sort_values('create_time_dt')
        texts = group['desc'].fillna('').astype(str).tolist()
        similarity_scores = []
        if len(texts) <= 1:
            return [0] * len(texts)
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(texts)
        for i in range(len(texts)):
            if i == 0:
                similarity_scores.append(0)  # No previous posts for the first post
            else:
                sim = cosine_similarity(tfidf_matrix[i:i+1], tfidf_matrix[:i]).flatten()
                similarity_scores.append(np.mean(sim) if len(sim) > 0 else 0)
        return similarity_scores
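    # Example (hypothetical posts): for a user with descriptions
    # ["great workout", "great workout today", "new recipe"], the first post scores 0,
    # the second scores its TF-IDF cosine similarity to the first, and the third scores
    # the mean of its similarities to both earlier posts.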
    def lexical_similarity(self):
        # The frame is already sorted by sec_id/create_time_dt (see timing_post), so the
        # exploded per-user lists line up with the rows positionally; assign via .to_numpy()
        # to avoid index misalignment, since the index is not reset until later in run_pipeline.
        sims = self.df.groupby('sec_id').apply(self.compute_lexical_similarity).explode().astype(float)
        self.df['lexical_similarity'] = sims.to_numpy()
    def ratio(self):
        # Strip the trailing character (e.g. a '%' sign) and keep the integer value.
        self.df["ratio"] = [int(r[:-1]) for r in self.df["ratio"]]

    def encoding(self):
        # Label-encode every non-numeric column except the user identifier.
        le = LabelEncoder()
        cat_columns = list(self.df.select_dtypes(exclude=np.number).columns)
        cat_columns.remove("sec_id")
        for col in cat_columns:
            self.df[col] = le.fit_transform(self.df[col])
    def label_creation(self):
        # Heuristic weak labels: aggregate per-user statistics, z-score a few signals,
        # combine them into a suspicion score, and mark the most suspicious half as fake.
        user_stats = self.df.groupby('sec_id').agg({
            'aweme_count': 'mean',
            'create_days_since_creation': 'mean',
            'digg_count': 'mean',
            'play_count': 'mean',
            'comment_count': 'mean',
            'follower_count': 'mean',
            'lexical_similarity': 'mean',
            'diversity_of_topics': 'mean',
            'sentiment_score': 'mean',
            'posts_per_user': 'mean'
        }).reset_index()
        user_stats['posting_rate'] = user_stats['posts_per_user'] / (user_stats['create_days_since_creation'] + 1)
        user_stats['engagement_ratio'] = (user_stats['digg_count'] + user_stats['comment_count'] + user_stats['play_count']) / (user_stats['follower_count'] + 1)
        features = ['posting_rate', 'engagement_ratio', 'lexical_similarity', 'diversity_of_topics', 'create_days_since_creation']
        for feature in features:
            user_stats[f'{feature}_z'] = zscore(user_stats[feature].fillna(user_stats[feature].mean()))
        user_stats['fake_score'] = (
            1.0 * user_stats['posting_rate_z'] +                 # High posting rate -> fake
            -1.0 * user_stats['engagement_ratio_z'] +            # Low engagement -> fake
            1.0 * user_stats['lexical_similarity_z'] +           # High similarity -> fake
            -1.0 * user_stats['diversity_of_topics_z'] +         # Low diversity -> fake
            -1.0 * user_stats['create_days_since_creation_z']    # New account -> fake
        )
        user_stats = user_stats.sort_values(by='fake_score', ascending=False)
        n_users = len(user_stats)
        n_fake = int(0.5 * n_users)
        user_stats['is_fake'] = 0  # Initialize all as genuine
        user_stats.iloc[:n_fake, user_stats.columns.get_loc('is_fake')] = 1  # Mark the top 50% as fake
        user_labels = user_stats[['sec_id', 'is_fake']]
        # fake_percentage = user_labels['is_fake'].mean() * 100
        self.df = self.df.merge(user_labels[['sec_id', 'is_fake']], on='sec_id', how='left')
    def run_pipeline(self):
        logger.info("INSIDE TEST PREPROCESS")
        print(self.df)
        # Fill free-text fields before dropping the remaining incomplete rows.
        self.df["signature"] = self.df["signature"].fillna("")
        self.df["desc"] = self.df["desc"].fillna("")
        self.df["title"] = self.df["title"].fillna("")
        self.df.dropna(inplace=True)
        self.df["topic_creation_text"] = self.df["desc"] + self.df["title"] + self.df["share_title"]
        logger.info("CREATING TIME-BASED COLUMNS ...")
        self.create_columns()
        logger.info("AVERAGE POSTING FREQ PER DAY ...")
        self.average_posting_freq_per_day()
        logger.info("DIVERSITY OF TOPICS ...")
        self.diversity_topics_about()
        logger.info("AVG POST LENGTH ...")
        self.avg_post_length()
        logger.info("SENTIMENT PROFILE OF POSTS ...")
        self.sentiment_profile_of_posts()
        logger.info(f"{self.df.isnull().sum()} NAN values after creating dataset")
        logger.info("LEXICAL DIVERSITY ...")
        self.lexical_diversity()
        logger.info("POPULARITY ...")
        self.popularity()
        logger.info("SENTIMENT DISTRIBUTION ...")
        self.sentiment_distribution()
        logger.info("POST LENGTH ...")
        self.post_length()
        logger.info("SENTIMENT SCORES POST ...")
        self.sentiment_scores_post()
        logger.info("TIMING POST ...")
        self.timing_post()
        logger.info("LEXICAL SIMILARITY ...")
        self.lexical_similarity()
        self.df.drop(["desc", "title", "share_title", "birthday", "topic_creation_text", "signature", "create_time_dt"], axis=1, inplace=True)
        self.df.reset_index(drop=True, inplace=True)
        logger.info("ENCODING ...")
        self.encoding()
        # logger.info("LABEL CREATION ...")
        # self.label_creation()
        self.df.dropna(inplace=True)
        print(self.df.info())
        print(self.df.isnull().sum())
        # self.df.to_csv(os.path.join(self.path, self.filename), index=False)
        return self.df
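
# Minimal usage sketch (hypothetical file names; the input CSV is assumed to contain
# the raw post/user fields referenced above, e.g. sec_id, desc, title, share_title,
# signature, birthday, topic, create_time, follower_count, ...).
if __name__ == "__main__":
    raw_df = pd.read_csv("posts.csv")                        # hypothetical input path
    features_df = Preprocessor(raw_df).run_pipeline()        # engineer all features
    features_df.to_csv("processed_posts.csv", index=False)   # hypothetical output path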