Spaces:
Running
Running
| import os | |
| import torch | |
| from torch.utils.data import DataLoader, Dataset | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| from sklearn.metrics import accuracy_score, classification_report, confusion_matrix | |
| from utils.preprocess import preprocess_text | |
# ========= CONFIG =========
FAKE_DIR = "data/fake_news/financeiros"  # directory tree of fake-news .txt files (label 0)
REAL_DIR = "data/real_news/financeiros"  # directory tree of real-news .txt files (label 1)
MODEL_DIR = "app/model"  # fine-tuned HF model + tokenizer directory
MAX_LEN = 256  # token truncation/padding length
BATCH_SIZE = 8  # evaluation batch size
# Run on GPU when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
| # ========= LOAD DATA ========= | |
def load_texts(directory, label):
    """Recursively collect preprocessed ``.txt`` contents under *directory*.

    Every ``.txt`` file found in the tree is read as UTF-8, run through
    ``preprocess_text``, and paired with *label*.

    Returns a list of ``(text, label)`` tuples.
    """
    collected = []
    for dirpath, _subdirs, filenames in os.walk(directory):
        for name in filenames:
            if not name.endswith(".txt"):
                continue
            full_path = os.path.join(dirpath, name)
            with open(full_path, "r", encoding="utf-8") as handle:
                cleaned = preprocess_text(handle.read())
            collected.append((cleaned, label))
    return collected
def load_dataset():
    """Load both corpora as parallel text/label lists.

    Fake news is labelled 0, real news 1.
    """
    combined = load_texts(FAKE_DIR, 0) + load_texts(REAL_DIR, 1)
    texts = [pair[0] for pair in combined]
    labels = [pair[1] for pair in combined]
    return texts, labels
| # ========= DATASET ========= | |
class NewsDataset(Dataset):
    """Torch dataset pairing raw text with an integer class label.

    Each item is the tokenizer's encoding squeezed to 1-D tensors plus a
    ``labels`` entry, ready for a HF sequence-classification model via a
    ``DataLoader`` (which re-adds the batch dimension).
    """

    def __init__(self, texts, labels, tokenizer, max_len=None):
        """
        Args:
            texts: list of input strings.
            labels: list of integer class labels, parallel to ``texts``.
            tokenizer: HF-style callable returning a dict of ``(1, L)`` tensors.
            max_len: truncation/padding length; defaults to the module-level
                ``MAX_LEN`` when not given (backward compatible).

        Raises:
            ValueError: if ``texts`` and ``labels`` differ in length.
        """
        if len(texts) != len(labels):
            raise ValueError("texts and labels must have the same length")
        self.texts = texts
        self.labels = labels
        self.tok = tokenizer
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tok(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # Drop ONLY the batch dim the tokenizer adds. A bare .squeeze()
        # would also collapse the sequence dim whenever it happens to be 1.
        enc = {k: v.squeeze(0) for k, v in enc.items()}
        enc["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return enc
| # ========= EVALUATION ========= | |
def evaluate():
    """Evaluate the saved classifier on the full fake/real news corpus.

    Loads the fine-tuned model and tokenizer from ``MODEL_DIR``, runs batched
    inference over the whole dataset, prints accuracy, a per-class report and
    the confusion matrix, and returns ``(accuracy, preds, true_labels)`` so
    callers can use the metrics programmatically.
    """
    print("Carregando modelo...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)

    print("Carregando dataset...")
    texts, labels = load_dataset()
    dataset = NewsDataset(texts, labels, tokenizer)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

    model.eval()
    preds = []
    true_labels = []

    print("\nAvaliando...\n")
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            # Keep labels out of the forward pass: only logits are needed,
            # and passing labels makes the HF model compute a loss as well.
            targets = batch.pop("labels")
            outputs = model(**batch)
            batch_preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            preds.extend(batch_preds)
            true_labels.extend(targets.cpu().numpy())

    # === METRICS ===
    acc = accuracy_score(true_labels, preds)
    print(f"Accuracy: {acc:.4f}")
    print("\nClassification Report:")
    print(classification_report(true_labels, preds, target_names=["Fake", "Real"]))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, preds))
    return acc, preds, true_labels
# Script entry point: run the full evaluation when executed directly.
if __name__ == "__main__":
    evaluate()