"""Data download, preprocessing, feature engineering, and vocabulary utilities for the bot-detection dataset."""

import json
import os
import re
import sys
from collections import Counter

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from src.config import (
    dataset_csv,
    numeric_features,
    data_processed,
    data_raw,
    label_map,
    max_seq_len,
    max_vocab_size,
    glove_dim,
    glove_file,
)


def download_dataset():
    if os.path.exists(dataset_csv):
        return
    print("Dataset not found, downloading from Hugging Face")
    from datasets import load_dataset

    os.makedirs(data_raw, exist_ok=True)
    ds = load_dataset("airt-ml/twitter-human-bots", split="train")
    ds.to_csv(dataset_csv)
    print(f"Downloaded {len(ds):,} rows to {dataset_csv}")


def load_and_preprocess():
    print("Loading dataset")
    download_dataset()
    if not os.path.exists(dataset_csv):
        raise FileNotFoundError("Dataset not found and download failed.")
    df = pd.read_csv(dataset_csv)
    print(f"Rows: {len(df):,}")

    # Map account_type to integer labels and drop rows with unknown labels.
    df["label"] = df["account_type"].map(label_map)
    df = df.dropna(subset=["label"])
    df["label"] = df["label"].astype(int)
    bot_count = df["label"].sum()
    print(f"Bots: {bot_count:,}, humans: {len(df) - bot_count:,}")

    # Fill missing values in the raw profile fields.
    df["followers_count"] = df["followers_count"].fillna(0).astype(float)
    df["friends_count"] = df["friends_count"].fillna(0).astype(float)
    df["statuses_count"] = df["statuses_count"].fillna(0).astype(float)
    df["favourites_count"] = df["favourites_count"].fillna(0).astype(float)
    df["account_age_days"] = df["account_age_days"].fillna(365).astype(float).clip(lower=1)
    df["average_tweets_per_day"] = df["average_tweets_per_day"].fillna(0).astype(float)
    df["verified"] = df["verified"].fillna(False).astype(int)
    df["default_profile"] = df["default_profile"].fillna(False).astype(int)
    df["default_profile_image"] = df["default_profile_image"].fillna(False).astype(int)
    df["description"] = df["description"].fillna("")
    df["screen_name"] = df["screen_name"].fillna("")
    df["location"] = df["location"].fillna("")

    # Ratio features (denominators clipped to avoid division by zero).
    df["followers_to_friends_ratio"] = df["followers_count"] / df["friends_count"].clip(lower=1)
    df["favourites_to_statuses_ratio"] = df["favourites_count"] / df["statuses_count"].clip(lower=1)
    df["friends_to_followers_ratio"] = df["friends_count"] / df["followers_count"].clip(lower=1)
    df["statuses_to_followers_ratio"] = df["statuses_count"] / df["followers_count"].clip(lower=1)

    # Profile-completeness and screen-name features.
    df["has_description"] = (df["description"].str.len() > 0).astype(int)
    df["has_location"] = (df["location"].str.len() > 0).astype(int)
    df["description_length"] = df["description"].str.len()
    df["screen_name_length"] = df["screen_name"].str.len()
    df["profile_completeness"] = (
        df["has_description"]
        + df["has_location"]
        + (1 - df["default_profile"])
        + (1 - df["default_profile_image"])
        + df["verified"]
    )
    df["screen_name_digits"] = df["screen_name"].apply(lambda x: sum(c.isdigit() for c in str(x)))
    df["screen_name_digit_ratio"] = df["screen_name_digits"] / df["screen_name_length"].clip(lower=1)
    df["screen_name_has_underscore"] = df["screen_name"].str.contains("_", na=False).astype(int)

    # Activity features.
    df["tweets_per_follower"] = df["statuses_count"] / df["followers_count"].clip(lower=1)
    df["tweets_per_day_per_follower"] = df["average_tweets_per_day"] / df["followers_count"].clip(lower=1)

    # Bio content features.
    df["bio_url_count"] = df["description"].str.count(r"http|www\.|\.com|\.net")
    df["bio_hashtag_count"] = df["description"].str.count(r"#")
    df["bio_mention_count"] = df["description"].str.count(r"@")
    df["bio_word_count"] = df["description"].str.split().str.len().fillna(0).astype(int)
    # Keyword heuristics for news/organisation accounts.
    news_pattern = r"\b(?:news|breaking|daily|magazine|journal|times|herald|tribune|gazette|broadcast|media|press|reporter|journalist|editor|anchor|correspondent|coverage|headlines|report)\b"
    org_pattern = r"\b(?:official|corp|inc\.?|llc|ltd|company|brand|store|shop|support|customer|service|team|foundation|organisation|organization|ngo|charity)\b"
    df["bio_has_news_keywords"] = (
        df["description"].str.lower().str.contains(news_pattern, regex=True, na=False).astype(int)
    )
    df["bio_has_org_keywords"] = (
        df["description"].str.lower().str.contains(org_pattern, regex=True, na=False).astype(int)
    )
    df["bio_likely_organisation"] = (
        (df["bio_has_news_keywords"] | df["bio_has_org_keywords"])
        & (df["followers_count"] > 1000)
        & (df["account_age_days"] > 365)
    ).astype(int)
    df["is_established_account"] = (
        (df["verified"] == 1)
        & (df["followers_count"] > 10000)
        & (df["account_age_days"] > 365)
    ).astype(int)

    # Log-scaled counts to tame heavy-tailed distributions.
    df["log_followers_count"] = np.log1p(df["followers_count"])
    df["log_friends_count"] = np.log1p(df["friends_count"])
    df["log_statuses_count"] = np.log1p(df["statuses_count"])
    df["log_favourites_count"] = np.log1p(df["favourites_count"])
    df["log_tweets_per_follower"] = np.log1p(df["tweets_per_follower"])
    df["log_followers_to_friends_ratio"] = np.log1p(df["followers_to_friends_ratio"])
    print(f"Engineered {len(numeric_features)} features")

    # Clean bio text: strip URLs and collapse whitespace.
    texts = []
    for _, row in df.iterrows():
        desc = str(row.get("description", "") or "")
        desc = re.sub(r"http\S+", "", desc)
        desc = re.sub(r"\s+", " ", desc).strip()
        texts.append(desc if desc else "")

    # Deterministic 70/15/15 split keyed by positional user ids.
    n = len(df)
    indices = np.random.RandomState(42).permutation(n)
    train_end = int(0.7 * n)
    val_end = int(0.85 * n)
    user_ids = [str(i) for i in range(n)]
    splits = {
        "train": [user_ids[i] for i in indices[:train_end]],
        "val": [user_ids[i] for i in indices[train_end:val_end]],
        "test": [user_ids[i] for i in indices[val_end:]],
    }
    print(f"Split: {len(splits['train']):,} train, {len(splits['val']):,} val, {len(splits['test']):,} test")
    print(f"Preprocessed {n:,} users")
    return df, texts, df["label"].values, splits, user_ids


def save_processed(df, texts, labels, splits, user_ids):
    os.makedirs(data_processed, exist_ok=True)
    numeric_values = df[numeric_features].values.astype(np.float32)

    # Fit normalisation statistics on the training split only to avoid leakage.
    train_indices = [int(uid) for uid in splits["train"]]
    train_numeric = numeric_values[train_indices]
    mean = train_numeric.mean(axis=0)
    std = train_numeric.std(axis=0)
    std[std == 0] = 1.0
    numeric_normalised = (numeric_values - mean) / std
    print(f"Normalised {len(numeric_features)} features (fitted on train split)")

    torch.save(
        {
            "texts": texts,
            "labels": torch.tensor(labels, dtype=torch.long),
            "user_ids": user_ids,
            "splits": splits,
            "numeric_features": torch.tensor(numeric_normalised, dtype=torch.float32),
            "numeric_mean": mean,
            "numeric_std": std,
        },
        os.path.join(data_processed, "processed.pt"),
    )
    print(f"Saved to {data_processed}/")


def load_processed():
    path = os.path.join(data_processed, "processed.pt")
    if not os.path.exists(path):
        raise FileNotFoundError(f"Run: python src/data.py (no data at {path})")
    return torch.load(path, weights_only=False)


class TwiBotDataset(Dataset):
    def __init__(self, texts, numeric_features, labels, user_ids, split_ids=None):
        if split_ids is not None:
            id_set = set(split_ids)
            indices = [i for i, uid in enumerate(user_ids) if uid in id_set]
        else:
            indices = list(range(len(user_ids)))
        self.texts = [texts[i] for i in indices]
        self.numeric = numeric_features[indices]
        self.labels = labels[indices]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {"text": self.texts[idx], "numeric": self.numeric[idx], "label": self.labels[idx]}


def create_datasets(data, splits):
    texts = data["texts"]
    numeric = data["numeric_features"]
    labels = data["labels"]
    user_ids = data["user_ids"]
    train = TwiBotDataset(texts, numeric, labels, user_ids, splits.get("train"))
    val = TwiBotDataset(texts, numeric, labels, user_ids, splits.get("val", splits.get("valid")))
    test = TwiBotDataset(texts, numeric, labels, user_ids, splits.get("test"))
    print(f"Datasets: train={len(train)}, val={len(val)}, test={len(test)}")
    return train, val, test


def _tokenize_text(text):
    text = text.lower()
    # Collapse URLs to a single placeholder token, then strip punctuation.
    text = re.sub(r"http\S+|www\.\S+", " url ", text)
    text = re.sub(r"[^\w\s]", " ", text)
    return text.split()


class GloveVocab:
    # Index 0 is reserved for padding, index 1 for out-of-vocabulary words.
    def __init__(self, word2idx=None):
        self.word2idx = word2idx or {"<pad>": 0, "<unk>": 1}

    @property
    def vocab_size(self):
        return len(self.word2idx)

    @classmethod
    def build_from_corpus(cls, texts, max_vocab=max_vocab_size):
        counter = Counter()
        for text in texts:
            counter.update(_tokenize_text(text))
        word2idx = {"<pad>": 0, "<unk>": 1}
        for word, _ in counter.most_common(max_vocab - 2):
            word2idx[word] = len(word2idx)
        vocab = cls(word2idx)
        print(f"Vocabulary: {vocab.vocab_size:,} words")
        return vocab

    def tokenize_batch(self, texts, max_len=max_seq_len):
        batch = []
        for text in texts:
            tokens = _tokenize_text(text)
            ids = [self.word2idx.get(t, 1) for t in tokens[:max_len]]
            ids += [0] * (max_len - len(ids))
            batch.append(ids)
        return torch.tensor(batch, dtype=torch.long)

    def load_glove_embeddings(self, path=glove_file):
        print(f"Loading GloVe from {os.path.basename(path)}")
        glove = {}
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.rstrip().split(" ")
                if parts[0] in self.word2idx:
                    glove[parts[0]] = np.array(parts[1:], dtype=np.float32)
        matrix = np.random.normal(scale=0.6, size=(self.vocab_size, glove_dim)).astype(np.float32)
        matrix[0] = 0.0  # padding embedding stays zero
        found = sum(1 for w in self.word2idx if w in glove)
        print(f"GloVe coverage: {found:,}/{self.vocab_size:,} ({found / self.vocab_size * 100:.1f}%)")
        for w, i in self.word2idx.items():
            if w in glove:
                matrix[i] = glove[w]
        return matrix

    def random_embeddings(self):
        matrix = np.random.normal(scale=0.6, size=(self.vocab_size, glove_dim)).astype(np.float32)
        matrix[0] = 0.0  # padding embedding stays zero
        print(f"Using random embeddings ({self.vocab_size:,} x {glove_dim})")
        return matrix

    def save(self, path):
        with open(path, "w") as f:
            json.dump(self.word2idx, f)

    @classmethod
    def load(cls, path):
        with open(path) as f:
            return cls(json.load(f))


if __name__ == "__main__":
    df, texts, labels, splits, user_ids = load_and_preprocess()
    save_processed(df, texts, labels, splits, user_ids)
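
# Illustrative usage sketch (not executed; names below assume the artefacts
# produced by running `python src/data.py` and a valid GloVe file at glove_file):
#
#   data = load_processed()
#   train_ds, val_ds, test_ds = create_datasets(data, data["splits"])
#   vocab = GloveVocab.build_from_corpus(data["texts"])
#   embedding_matrix = vocab.load_glove_embeddings()  # or vocab.random_embeddings()
#   token_ids = vocab.tokenize_batch([train_ds[0]["text"]])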