import json
import os
import re
import sys
from collections import Counter

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from src.config import (
    dataset_csv, numeric_features, data_processed, data_raw, label_map,
    max_seq_len, max_vocab_size, glove_dim, glove_file,
)

def download_dataset():
    """Download the twitter-human-bots dataset from Hugging Face if it is not already on disk."""
    if os.path.exists(dataset_csv):
        return
    print("Dataset not found, downloading from Hugging Face")
    from datasets import load_dataset

    os.makedirs(data_raw, exist_ok=True)
    ds = load_dataset("airt-ml/twitter-human-bots", split="train")
    ds.to_csv(dataset_csv)
    print(f"Downloaded {len(ds):,} rows to {dataset_csv}")

def load_and_preprocess():
    """Load the raw CSV, engineer numeric features, clean bio text, and create train/val/test splits."""
    print("Loading dataset")
    download_dataset()
    if not os.path.exists(dataset_csv):
        raise FileNotFoundError("Dataset not found and download failed.")
    df = pd.read_csv(dataset_csv)
    print(f"Rows: {len(df):,}")

    # Map account_type to an integer label and drop rows with unknown types.
    df["label"] = df["account_type"].map(label_map)
    df = df.dropna(subset=["label"])
    df["label"] = df["label"].astype(int)
    bot_count = df["label"].sum()
    print(f"Bots: {bot_count:,}, humans: {len(df) - bot_count:,}")
    # Fill missing values in the raw numeric, boolean, and text columns.
    df["followers_count"] = df["followers_count"].fillna(0).astype(float)
    df["friends_count"] = df["friends_count"].fillna(0).astype(float)
    df["statuses_count"] = df["statuses_count"].fillna(0).astype(float)
    df["favourites_count"] = df["favourites_count"].fillna(0).astype(float)
    df["account_age_days"] = df["account_age_days"].fillna(365).astype(float).clip(lower=1)
    df["average_tweets_per_day"] = df["average_tweets_per_day"].fillna(0).astype(float)
    df["verified"] = df["verified"].fillna(False).astype(int)
    df["default_profile"] = df["default_profile"].fillna(False).astype(int)
    df["default_profile_image"] = df["default_profile_image"].fillna(False).astype(int)
    df["description"] = df["description"].fillna("")
    df["screen_name"] = df["screen_name"].fillna("")
    df["location"] = df["location"].fillna("")
    # Ratio features; denominators are clipped to 1 to avoid division by zero.
    df["followers_to_friends_ratio"] = df["followers_count"] / df["friends_count"].clip(lower=1)
    df["favourites_to_statuses_ratio"] = df["favourites_count"] / df["statuses_count"].clip(lower=1)
    df["friends_to_followers_ratio"] = df["friends_count"] / df["followers_count"].clip(lower=1)
    df["statuses_to_followers_ratio"] = df["statuses_count"] / df["followers_count"].clip(lower=1)

    # Profile completeness and screen-name characteristics.
    df["has_description"] = (df["description"].str.len() > 0).astype(int)
    df["has_location"] = (df["location"].str.len() > 0).astype(int)
    df["description_length"] = df["description"].str.len()
    df["screen_name_length"] = df["screen_name"].str.len()
    df["profile_completeness"] = (
        df["has_description"] + df["has_location"]
        + (1 - df["default_profile"]) + (1 - df["default_profile_image"])
        + df["verified"]
    )
    df["screen_name_digits"] = df["screen_name"].apply(lambda x: sum(c.isdigit() for c in str(x)))
    df["screen_name_digit_ratio"] = df["screen_name_digits"] / df["screen_name_length"].clip(lower=1)
    df["screen_name_has_underscore"] = df["screen_name"].str.contains("_", na=False).astype(int)
    df["tweets_per_follower"] = df["statuses_count"] / df["followers_count"].clip(lower=1)
    df["tweets_per_day_per_follower"] = df["average_tweets_per_day"] / df["followers_count"].clip(lower=1)

    # Simple counts extracted from the bio text.
    df["bio_url_count"] = df["description"].str.count(r"http|www\.|\.com|\.net")
    df["bio_hashtag_count"] = df["description"].str.count(r"#")
    df["bio_mention_count"] = df["description"].str.count(r"@")
    df["bio_word_count"] = df["description"].str.split().str.len().fillna(0).astype(int)
    # Keyword heuristics for news and organisation accounts.
    news_pattern = r"\b(?:news|breaking|daily|magazine|journal|times|herald|tribune|gazette|broadcast|media|press|reporter|journalist|editor|anchor|correspondent|coverage|headlines|report)\b"
    org_pattern = r"\b(?:official|corp|inc\.?|llc|ltd|company|brand|store|shop|support|customer|service|team|foundation|organisation|organization|ngo|charity)\b"
    df["bio_has_news_keywords"] = df["description"].str.lower().str.contains(news_pattern, regex=True, na=False).astype(int)
    df["bio_has_org_keywords"] = df["description"].str.lower().str.contains(org_pattern, regex=True, na=False).astype(int)
    df["bio_likely_organisation"] = (
        (df["bio_has_news_keywords"] | df["bio_has_org_keywords"])
        & (df["followers_count"] > 1000)
        & (df["account_age_days"] > 365)
    ).astype(int)
    df["is_established_account"] = (
        (df["verified"] == 1)
        & (df["followers_count"] > 10000)
        & (df["account_age_days"] > 365)
    ).astype(int)

    # Log transforms for heavy-tailed count features.
    df["log_followers_count"] = np.log1p(df["followers_count"])
    df["log_friends_count"] = np.log1p(df["friends_count"])
    df["log_statuses_count"] = np.log1p(df["statuses_count"])
    df["log_favourites_count"] = np.log1p(df["favourites_count"])
    df["log_tweets_per_follower"] = np.log1p(df["tweets_per_follower"])
    df["log_followers_to_friends_ratio"] = np.log1p(df["followers_to_friends_ratio"])
    print(f"Engineered {len(numeric_features)} features")

    # Clean bio text: replace URLs with a placeholder and collapse whitespace.
    texts = []
    for _, row in df.iterrows():
        desc = str(row.get("description", "") or "")
        desc = re.sub(r"http\S+", "<URL>", desc)
        desc = re.sub(r"\s+", " ", desc).strip()
        texts.append(desc if desc else "<EMPTY>")

    # Reproducible 70/15/15 split over positional user ids.
    n = len(df)
    indices = np.random.RandomState(42).permutation(n)
    train_end = int(0.7 * n)
    val_end = int(0.85 * n)
    user_ids = [str(i) for i in range(n)]
    splits = {
        "train": [user_ids[i] for i in indices[:train_end]],
        "val": [user_ids[i] for i in indices[train_end:val_end]],
        "test": [user_ids[i] for i in indices[val_end:]],
    }
    print(f"Split: {len(splits['train']):,} train, {len(splits['val']):,} val, {len(splits['test']):,} test")
    print(f"Preprocessed {n:,} users")
    return df, texts, df["label"].values, splits, user_ids

def save_processed(df, texts, labels, splits, user_ids):
    """Normalise numeric features with train-split statistics and save everything to a single .pt file."""
    os.makedirs(data_processed, exist_ok=True)
    numeric_values = df[numeric_features].values.astype(np.float32)

    # Fit mean/std on the training split only, so val/test statistics never leak into normalisation.
    train_indices = [int(uid) for uid in splits["train"]]
    train_numeric = numeric_values[train_indices]
    mean = train_numeric.mean(axis=0)
    std = train_numeric.std(axis=0)
    std[std == 0] = 1.0
    numeric_normalised = (numeric_values - mean) / std
    print(f"Normalised {len(numeric_features)} features (fitted on train split)")

    torch.save({
        "texts": texts,
        "labels": torch.tensor(labels, dtype=torch.long),
        "user_ids": user_ids,
        "splits": splits,
        "numeric_features": torch.tensor(numeric_normalised, dtype=torch.float32),
        "numeric_mean": mean,
        "numeric_std": std,
    }, os.path.join(data_processed, "processed.pt"))
    print(f"Saved to {data_processed}/")

def load_processed():
    """Load the preprocessed tensors and metadata produced by save_processed()."""
    path = os.path.join(data_processed, "processed.pt")
    if not os.path.exists(path):
        raise FileNotFoundError(f"Run: python src/data.py (no data at {path})")
    return torch.load(path, weights_only=False)

class TwiBotDataset(Dataset):
    """Dataset of (bio text, normalised numeric features, label), optionally restricted to one split."""

    def __init__(self, texts, numeric_features, labels, user_ids, split_ids=None):
        if split_ids is not None:
            id_set = set(split_ids)
            indices = [i for i, uid in enumerate(user_ids) if uid in id_set]
        else:
            indices = list(range(len(user_ids)))
        self.texts = [texts[i] for i in indices]
        self.numeric = numeric_features[indices]
        self.labels = labels[indices]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {"text": self.texts[idx], "numeric": self.numeric[idx], "label": self.labels[idx]}

def create_datasets(data, splits):
    """Build train/val/test TwiBotDataset instances from the dict returned by load_processed()."""
    texts = data["texts"]
    numeric = data["numeric_features"]
    labels = data["labels"]
    user_ids = data["user_ids"]
    train = TwiBotDataset(texts, numeric, labels, user_ids, splits.get("train"))
    val = TwiBotDataset(texts, numeric, labels, user_ids, splits.get("val", splits.get("valid")))
    test = TwiBotDataset(texts, numeric, labels, user_ids, splits.get("test"))
    print(f"Datasets: train={len(train)}, val={len(val)}, test={len(test)}")
    return train, val, test
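
# DataLoader sketch (not part of this module, shown for context): PyTorch's default
# collate stacks the "numeric" and "label" tensors and gathers the raw "text" strings
# into a list, so tokenisation can happen per batch via GloveVocab.tokenize_batch below.
#
#   from torch.utils.data import DataLoader
#   loader = DataLoader(train, batch_size=64, shuffle=True)
#   batch = next(iter(loader))
#   token_ids = vocab.tokenize_batch(batch["text"])  # assumes a fitted GloveVocab named `vocab`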

def _tokenize_text(text):
    """Lowercase, map the <URL> placeholder to a plain token, strip punctuation, and split on whitespace."""
    text = text.lower()
    # The placeholder was inserted uppercase in load_and_preprocess(), so match its lowercased form here.
    text = re.sub(r"<url>", " url ", text)
    text = re.sub(r"[^\w\s]", " ", text)
    return text.split()

class GloveVocab:
    """Word-level vocabulary with an optional GloVe-initialised embedding matrix."""

    def __init__(self, word2idx=None):
        self.word2idx = word2idx or {"<PAD>": 0, "<UNK>": 1}

    @property
    def vocab_size(self):
        return len(self.word2idx)

    @classmethod
    def build_from_corpus(cls, texts, max_vocab=max_vocab_size):
        counter = Counter()
        for text in texts:
            counter.update(_tokenize_text(text))
        word2idx = {"<PAD>": 0, "<UNK>": 1}
        # Two slots are reserved for the <PAD> and <UNK> special tokens.
        for word, _ in counter.most_common(max_vocab - 2):
            word2idx[word] = len(word2idx)
        vocab = cls(word2idx)
        print(f"Vocabulary: {vocab.vocab_size:,} words")
        return vocab

    def tokenize_batch(self, texts, max_len=max_seq_len):
        """Convert a list of strings into a (batch, max_len) LongTensor of token ids, padded with 0."""
        batch = []
        for text in texts:
            tokens = _tokenize_text(text)
            ids = [self.word2idx.get(t, 1) for t in tokens[:max_len]]  # 1 = <UNK>
            ids += [0] * (max_len - len(ids))  # 0 = <PAD>
            batch.append(ids)
        return torch.tensor(batch, dtype=torch.long)
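
    # Usage sketch with hypothetical ids: vocab.tokenize_batch(["hello world"], max_len=4)
    # returns a tensor like [[7, 42, 0, 0]], where 0 pads and out-of-vocabulary words map to 1 (<UNK>).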

    def load_glove_embeddings(self, path=glove_file):
        """Build an embedding matrix, initialising in-vocabulary rows from a GloVe text file."""
        print(f"Loading GloVe from {os.path.basename(path)}")
        glove = {}
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.rstrip().split(" ")
                if parts[0] in self.word2idx:
                    glove[parts[0]] = np.array(parts[1:], dtype=np.float32)
        # Words without a pretrained vector keep a random initialisation; row 0 (<PAD>) stays zero.
        matrix = np.random.normal(scale=0.6, size=(self.vocab_size, glove_dim)).astype(np.float32)
        matrix[0] = 0.0
        found = sum(1 for w in self.word2idx if w in glove)
        print(f"GloVe coverage: {found:,}/{self.vocab_size:,} ({found/self.vocab_size*100:.1f}%)")
        for w, i in self.word2idx.items():
            if w in glove:
                matrix[i] = glove[w]
        return matrix
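
    # Note: this loader assumes the plain-text GloVe format, one "<token> <v1> ... <vN>" entry per
    # line with N equal to glove_dim from the config (e.g. the Stanford glove.6B release files).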

    def random_embeddings(self):
        """Fallback embedding matrix when no GloVe file is available; row 0 (<PAD>) is zero."""
        matrix = np.random.normal(scale=0.6, size=(self.vocab_size, glove_dim)).astype(np.float32)
        matrix[0] = 0.0
        print(f"Using random embeddings ({self.vocab_size:,} x {glove_dim})")
        return matrix

    def save(self, path):
        with open(path, "w") as f:
            json.dump(self.word2idx, f)

    @classmethod
    def load(cls, path):
        with open(path) as f:
            return cls(json.load(f))

if __name__ == "__main__":
    df, texts, labels, splits, user_ids = load_and_preprocess()
    save_processed(df, texts, labels, splits, user_ids)
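
# Downstream usage sketch (assumes processed.pt exists and the src/config paths are valid):
#
#   data = load_processed()
#   train_ds, val_ds, test_ds = create_datasets(data, data["splits"])
#   vocab = GloveVocab.build_from_corpus([train_ds[i]["text"] for i in range(len(train_ds))])
#   embeddings = vocab.load_glove_embeddings() if os.path.exists(glove_file) else vocab.random_embeddings()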