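"""Data pipeline for the Twitter bot-detection project.

Downloads the airt-ml/twitter-human-bots dataset from Hugging Face, engineers
numeric account features, cleans bio text, builds train/val/test splits, and
exposes a PyTorch Dataset and a GloVe-backed vocabulary for downstream models.

Build and cache the processed tensors with:

    python src/data.py
"""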
import json
import os
import re
import sys
from collections import Counter
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.config import (
    dataset_csv, numeric_features, data_processed, data_raw, label_map,
    max_seq_len, max_vocab_size, glove_dim, glove_file,
)

def download_dataset():
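    """Download the airt-ml/twitter-human-bots train split to dataset_csv.

    Does nothing if the CSV already exists on disk.
    """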
    if os.path.exists(dataset_csv):
        return

    print("Dataset not found, downloading from Hugging Face")
    from datasets import load_dataset

    os.makedirs(data_raw, exist_ok=True)
    ds = load_dataset("airt-ml/twitter-human-bots", split="train")
    ds.to_csv(dataset_csv)
    print(f"Downloaded {len(ds):,} rows to {dataset_csv}")


def load_and_preprocess():
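    """Load the raw CSV and build model-ready inputs.

    Maps account_type to binary labels, fills missing values, engineers
    ratio / profile / screen-name / bio-keyword / log-scaled numeric
    features, cleans bio description text, and creates a deterministic
    70/15/15 train/val/test split.

    Returns (df, texts, labels, splits, user_ids), where user_ids are
    positional row indices as strings and splits maps split names to lists
    of those ids.
    """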
    print("Loading dataset")
    download_dataset()

    if not os.path.exists(dataset_csv):
        raise FileNotFoundError("Dataset not found and download failed.")

    df = pd.read_csv(dataset_csv)
    print(f"Rows: {len(df):,}")

    df["label"] = df["account_type"].map(label_map)
    df = df.dropna(subset=["label"])
    df["label"] = df["label"].astype(int)

    bot_count = df["label"].sum()
    print(f"Bots: {bot_count:,}, humans: {len(df) - bot_count:,}")

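    # Fill missing values and coerce dtypes before feature engineering.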
    df["followers_count"] = df["followers_count"].fillna(0).astype(float)
    df["friends_count"] = df["friends_count"].fillna(0).astype(float)
    df["statuses_count"] = df["statuses_count"].fillna(0).astype(float)
    df["favourites_count"] = df["favourites_count"].fillna(0).astype(float)
    df["account_age_days"] = df["account_age_days"].fillna(365).astype(float).clip(lower=1)
    df["average_tweets_per_day"] = df["average_tweets_per_day"].fillna(0).astype(float)
    df["verified"] = df["verified"].fillna(False).astype(int)
    df["default_profile"] = df["default_profile"].fillna(False).astype(int)
    df["default_profile_image"] = df["default_profile_image"].fillna(False).astype(int)
    df["description"] = df["description"].fillna("")
    df["screen_name"] = df["screen_name"].fillna("")
    df["location"] = df["location"].fillna("")

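    # Engagement ratios; denominators are clipped to 1 to avoid division by zero.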
    df["followers_to_friends_ratio"] = df["followers_count"] / df["friends_count"].clip(lower=1)
    df["favourites_to_statuses_ratio"] = df["favourites_count"] / df["statuses_count"].clip(lower=1)
    df["friends_to_followers_ratio"] = df["friends_count"] / df["followers_count"].clip(lower=1)
    df["statuses_to_followers_ratio"] = df["statuses_count"] / df["followers_count"].clip(lower=1)

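    # Profile-completeness signals from bio, location, and default-profile flags.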
    df["has_description"] = (df["description"].str.len() > 0).astype(int)
    df["has_location"] = (df["location"].str.len() > 0).astype(int)
    df["description_length"] = df["description"].str.len()
    df["screen_name_length"] = df["screen_name"].str.len()
    df["profile_completeness"] = (
        df["has_description"] + df["has_location"]
        + (1 - df["default_profile"]) + (1 - df["default_profile_image"])
        + df["verified"]
    )

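    # Screen-name shape features.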
    df["screen_name_digits"] = df["screen_name"].apply(lambda x: sum(c.isdigit() for c in str(x)))
    df["screen_name_digit_ratio"] = df["screen_name_digits"] / df["screen_name_length"].clip(lower=1)
    df["screen_name_has_underscore"] = df["screen_name"].str.contains("_", na=False).astype(int)

    df["tweets_per_follower"] = df["statuses_count"] / df["followers_count"].clip(lower=1)
    df["tweets_per_day_per_follower"] = df["average_tweets_per_day"] / df["followers_count"].clip(lower=1)

    df["bio_url_count"] = df["description"].str.count(r"http|www\.|\.com|\.net")
    df["bio_hashtag_count"] = df["description"].str.count(r"#")
    df["bio_mention_count"] = df["description"].str.count(r"@")
    df["bio_word_count"] = df["description"].str.split().str.len().fillna(0).astype(int)

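    # Keyword heuristics that flag likely organisational or news accounts.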
    news_pattern = r"\b(?:news|breaking|daily|magazine|journal|times|herald|tribune|gazette|broadcast|media|press|reporter|journalist|editor|anchor|correspondent|coverage|headlines|report)\b"
    org_pattern = r"\b(?:official|corp|inc\.?|llc|ltd|company|brand|store|shop|support|customer|service|team|foundation|organisation|organization|ngo|charity)\b"
    df["bio_has_news_keywords"] = df["description"].str.lower().str.contains(news_pattern, regex=True, na=False).astype(int)
    df["bio_has_org_keywords"] = df["description"].str.lower().str.contains(org_pattern, regex=True, na=False).astype(int)
    df["bio_likely_organisation"] = (
        (df["bio_has_news_keywords"] | df["bio_has_org_keywords"])
        & (df["followers_count"] > 1000)
        & (df["account_age_days"] > 365)
    ).astype(int)
    df["is_established_account"] = (
        (df["verified"] == 1)
        & (df["followers_count"] > 10000)
        & (df["account_age_days"] > 365)
    ).astype(int)

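    # Log transforms to tame heavy-tailed count distributions.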
    df["log_followers_count"] = np.log1p(df["followers_count"])
    df["log_friends_count"] = np.log1p(df["friends_count"])
    df["log_statuses_count"] = np.log1p(df["statuses_count"])
    df["log_favourites_count"] = np.log1p(df["favourites_count"])
    df["log_tweets_per_follower"] = np.log1p(df["tweets_per_follower"])
    df["log_followers_to_friends_ratio"] = np.log1p(df["followers_to_friends_ratio"])

    print(f"Engineered {len(numeric_features)} features")

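    # Clean bio descriptions for text input: mask URLs and collapse whitespace.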
    texts = []
    for _, row in df.iterrows():
        desc = str(row.get("description", "") or "")
        desc = re.sub(r"http\S+", "<URL>", desc)
        desc = re.sub(r"\s+", " ", desc).strip()
        texts.append(desc if desc else "<EMPTY>")

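    # Deterministic 70/15/15 split; user ids are positional row indices.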
    n = len(df)
    indices = np.random.RandomState(42).permutation(n)
    train_end = int(0.7 * n)
    val_end = int(0.85 * n)

    user_ids = [str(i) for i in range(n)]
    splits = {
        "train": [user_ids[i] for i in indices[:train_end]],
        "val": [user_ids[i] for i in indices[train_end:val_end]],
        "test": [user_ids[i] for i in indices[val_end:]],
    }

    print(f"Split: {len(splits['train']):,} train, {len(splits['val']):,} val, {len(splits['test']):,} test")
    print(f"Preprocessed {n:,} users")

    return df, texts, df["label"].values, splits, user_ids

def save_processed(df, texts, labels, splits, user_ids):
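    """Normalise numeric features and cache everything to processed.pt.

    Normalisation statistics (mean/std) are fitted on the train split only
    and then applied to all rows.
    """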
    os.makedirs(data_processed, exist_ok=True)

    numeric_values = df[numeric_features].values.astype(np.float32)

    train_indices = [int(uid) for uid in splits["train"]]
    train_numeric = numeric_values[train_indices]
    mean = train_numeric.mean(axis=0)
    std = train_numeric.std(axis=0)
    std[std == 0] = 1.0

    numeric_normalised = (numeric_values - mean) / std
    print(f"Normalised {len(numeric_features)} features (fitted on train split)")

    torch.save({
        "texts": texts,
        "labels": torch.tensor(labels, dtype=torch.long),
        "user_ids": user_ids,
        "splits": splits,
        "numeric_features": torch.tensor(numeric_normalised, dtype=torch.float32),
        "numeric_mean": mean,
        "numeric_std": std,
    }, os.path.join(data_processed, "processed.pt"))

    print(f"Saved to {data_processed}/")

def load_processed():
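    """Load the cached processed.pt produced by save_processed."""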
    path = os.path.join(data_processed, "processed.pt")
    if not os.path.exists(path):
        raise FileNotFoundError(f"Run: python src/data.py  (no data at {path})")
    return torch.load(path, weights_only=False)

class TwiBotDataset(Dataset):
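    """Per-user dataset yielding dicts with bio text, numeric features, and label.

    If split_ids is given, only users whose id appears in it are kept;
    otherwise every user is included.
    """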
    def __init__(self, texts, numeric_features, labels, user_ids, split_ids=None):
        if split_ids is not None:
            id_set = set(split_ids)
            indices = [i for i, uid in enumerate(user_ids) if uid in id_set]
        else:
            indices = list(range(len(user_ids)))

        self.texts = [texts[i] for i in indices]
        self.numeric = numeric_features[indices]
        self.labels = labels[indices]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {"text": self.texts[idx], "numeric": self.numeric[idx], "label": self.labels[idx]}

def create_datasets(data, splits):
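    """Build train/val/test TwiBotDataset objects from the cached data dict.

    Accepts either a "val" or a "valid" key in splits. A minimal usage
    sketch (tokenisation of the raw text strings is assumed to happen later,
    e.g. in a collate function):

        data = load_processed()
        train, val, test = create_datasets(data, data["splits"])
    """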
    texts = data["texts"]
    numeric = data["numeric_features"]
    labels = data["labels"]
    user_ids = data["user_ids"]
    train = TwiBotDataset(texts, numeric, labels, user_ids, splits.get("train"))
    val   = TwiBotDataset(texts, numeric, labels, user_ids, splits.get("val", splits.get("valid")))
    test  = TwiBotDataset(texts, numeric, labels, user_ids, splits.get("test"))
    print(f"Datasets: train={len(train)}, val={len(val)}, test={len(test)}")
    return train, val, test

def _tokenize_text(text):
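    """Lowercase, map the <URL> placeholder to a token, strip punctuation, split on whitespace."""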
    text = text.lower()
    text = re.sub(r"<URL>", " url ", text)
    text = re.sub(r"[^\w\s]", " ", text)
    return text.split()

class GloveVocab:
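    """Word-level vocabulary backed by GloVe (or random) embeddings.

    Index 0 is <PAD> and index 1 is <UNK>. A minimal usage sketch, assuming
    a local GloVe text file at the configured glove_file path:

        vocab = GloveVocab.build_from_corpus(texts)
        token_ids = vocab.tokenize_batch(["hello world"])
        embedding_matrix = vocab.load_glove_embeddings()
    """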
    def __init__(self, word2idx=None):
        self.word2idx = word2idx or {"<PAD>": 0, "<UNK>": 1}

    @property
    def vocab_size(self):
        return len(self.word2idx)

    @classmethod
    def build_from_corpus(cls, texts, max_vocab=max_vocab_size):
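        """Build a frequency-capped vocabulary (at most max_vocab words including <PAD> and <UNK>)."""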
        counter = Counter()
        for text in texts:
            counter.update(_tokenize_text(text))
        word2idx = {"<PAD>": 0, "<UNK>": 1}
        for word, _ in counter.most_common(max_vocab - 2):
            word2idx[word] = len(word2idx)
        vocab = cls(word2idx)
        print(f"Vocabulary: {vocab.vocab_size:,} words")
        return vocab

    def tokenize_batch(self, texts, max_len=max_seq_len):
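        """Map texts to a (batch, max_len) LongTensor, truncating, padding with 0, and sending OOV words to 1."""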
        batch = []
        for text in texts:
            tokens = _tokenize_text(text)
            ids = [self.word2idx.get(t, 1) for t in tokens[:max_len]]
            ids += [0] * (max_len - len(ids))
            batch.append(ids)
        return torch.tensor(batch, dtype=torch.long)

    def load_glove_embeddings(self, path=glove_file):
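        """Build an embedding matrix from a GloVe text file.

        In-vocabulary GloVe words get their pretrained vectors, remaining
        rows stay random-normal, and the <PAD> row is zeroed.
        """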
        print(f"Loading GloVe from {os.path.basename(path)}")
        glove = {}
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.rstrip().split(" ")
                if parts[0] in self.word2idx:
                    glove[parts[0]] = np.array(parts[1:], dtype=np.float32)

        matrix = np.random.normal(scale=0.6, size=(self.vocab_size, glove_dim)).astype(np.float32)
        matrix[0] = 0.0
        found = sum(1 for w in self.word2idx if w in glove)
        print(f"GloVe coverage: {found:,}/{self.vocab_size:,} ({found/self.vocab_size*100:.1f}%)")
        for w, i in self.word2idx.items():
            if w in glove:
                matrix[i] = glove[w]
        return matrix

    def random_embeddings(self):
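        """Random-normal embedding matrix with a zero <PAD> row (fallback when GloVe is unavailable)."""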
        matrix = np.random.normal(scale=0.6, size=(self.vocab_size, glove_dim)).astype(np.float32)
        matrix[0] = 0.0
        print(f"Using random embeddings ({self.vocab_size:,} x {glove_dim})")
        return matrix

    def save(self, path):
        with open(path, "w") as f:
            json.dump(self.word2idx, f)

    @classmethod
    def load(cls, path):
        with open(path) as f:
            return cls(json.load(f))

if __name__ == "__main__":
    df, texts, labels, splits, user_ids = load_and_preprocess()
    save_processed(df, texts, labels, splits, user_ids)