"""Fine-tune ``microsoft/deberta-v3-small`` as a real/fake news classifier.

Running the module loads a labelled CSV, trains the classifier, and then
starts an interactive prompt that classifies text typed by the user.
"""

import inspect
import logging
import re
import string

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_dataset(path="./combined.csv"):
    """Load the labelled news CSV and split it into train and test sets.

    The file must provide a text column (``text`` or ``news``) and a label
    column (``label`` or ``target``). Labels are matched case-insensitively
    against ``"real"`` (mapped to 0) and ``"fake"`` (mapped to 1); rows whose
    label is anything else, or whose text or label is missing, are dropped.

    Args:
        path: Location of the CSV file.

    Returns:
        The ``train_test_split`` result ``[X_train, X_test, y_train, y_test]``
        using a fixed 80/20 split (``random_state=42``).

    Raises:
        KeyError: If the text or label column cannot be found.
        ValueError: If no row carries a recognised label.
    """
    # Read every candidate column as a string so that the `.str` accessor
    # below works even when the label column holds numbers.
    df = pd.read_csv(
        path,
        dtype={'text': str, 'label': str, 'news': str, 'target': str},
    )

    # Normalise the alternative column names. Renaming only when the
    # canonical name is absent avoids creating duplicate column names.
    for alias, canonical in (("news", "text"), ("target", "label")):
        if alias in df.columns and canonical not in df.columns:
            df = df.rename(columns={alias: canonical})

    missing = [col for col in ("text", "label") if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s) in {path}: {missing}")

    # Only nulls in the columns we actually use should discard a row.
    df = df.dropna(subset=['text', 'label'])

    # Convert labels to integers safely.
    label_map = {"real": 0, "fake": 1}
    labels = df['label'].astype(str).str.strip().str.lower().map(label_map)

    # Drop any rows where the label mapping failed.
    recognised = labels.notna()
    X = df.loc[recognised, 'text'].astype(str).tolist()
    y = labels[recognised].astype(int).tolist()

    if not X:
        raise ValueError(
            f"No rows with a label in {sorted(label_map)} were found in {path}"
        )

    return train_test_split(X, y, test_size=0.2, random_state=42)


class NewsDataset(Dataset):
    """Torch dataset that tokenizes one text sample per item on access."""

    def __init__(self, texts, labels, tokenizer, max_len):
        """Store the samples and the tokenization settings.

        Args:
            texts: Indexable collection of text samples.
            labels: Indexable collection of labels, aligned with ``texts``.
            tokenizer: Callable tokenizer applied to each sample.
            max_len: Length every sample is padded or truncated to.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of text samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` and ``labels``.
        """
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        # The tokenizer output has a leading dimension of size one for the
        # single sample; squeeze it away so each item is a per-sample tensor.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(int(self.labels[idx]), dtype=torch.long)
        }


def train_model(train_texts, train_labels, val_texts, val_labels):
    """Fine-tune DeBERTa-v3-small on the given texts.

    Args:
        train_texts: Training text samples.
        train_labels: Integer labels aligned with ``train_texts``.
        val_texts: Text samples evaluated after every epoch.
        val_labels: Integer labels aligned with ``val_texts``.

    Returns:
        A ``(tokenizer, model)`` tuple holding the tokenizer and the
        fine-tuned model.
    """
    tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')
    model = AutoModelForSequenceClassification.from_pretrained(
        'microsoft/deberta-v3-small', num_labels=2
    )

    train_dataset = NewsDataset(train_texts, train_labels, tokenizer, max_len=128)
    val_dataset = NewsDataset(val_texts, val_labels, tokenizer, max_len=128)

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; pick whichever keyword the installed version accepts.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    if "eval_strategy" in accepted:
        strategy_kwarg = "eval_strategy"
    else:
        strategy_kwarg = "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        save_strategy="epoch",
        **{strategy_kwarg: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
    trainer.train()
    return tokenizer, model


def predict_news(tokenizer, model, news_text):
    """Classify a single piece of text.

    Args:
        tokenizer: Tokenizer matching ``model``.
        model: Sequence-classification model with two output labels.
        news_text: Text to classify; it is converted with ``str()`` first.

    Returns:
        ``"Fake"`` when the highest-scoring class is 1, otherwise ``"Real"``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # Logged rather than printed so it does not clutter the interactive prompt.
    logger.debug("Running inference on %s", device)
    model.eval()

    encoding = tokenizer(
        str(news_text),
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Fake" if prediction == 1 else "Real"


def main():
    """Train the classifier, then classify user input until asked to stop.

    The prompt ends when the user types ``exit`` or when standard input
    reaches end-of-file. Any other exception is logged with its traceback
    and re-raised.
    """
    try:
        X_train, X_test, y_train, y_test = load_dataset()
        tokenizer, model = train_model(X_train, y_train, X_test, y_test)

        while True:
            try:
                user_input = input("\nEnter news text (or 'exit' to quit): ")
            except EOFError:
                # Closed or exhausted stdin is a normal way to end the session.
                break

            cleaned = user_input.strip()
            if cleaned.lower() == 'exit':
                break
            if not cleaned:
                # Nothing to classify; ask again.
                continue

            result = predict_news(tokenizer, model, user_input)
            print(f"The news is: {result}")
    except Exception as e:
        logger.exception("An error occurred: %s", e)
        raise


if __name__ == "__main__":
    main()