Spaces:
Sleeping
Sleeping
File size: 6,243 Bytes
4d6e8c2 cf376ee 8b76c22 4d6e8c2 1688082 4d6e8c2 261ff27 49eadc2 4d6e8c2 cf376ee 4d6e8c2 49eadc2 1c33274 70f5f26 1c33274 70f5f26 4d6e8c2 70f5f26 261ff27 4d6e8c2 76fccaf c8df9ce 261ff27 4d6e8c2 70f5f26 49eadc2 8b76c22 49eadc2 40aafd5 49eadc2 9fde312 261ff27 8021f3c 0e65d94 8021f3c 1814075 8021f3c 223c003 261ff27 223c003 983ced3 cd856ca 8b76c22 cd856ca 261ff27 1814075 261ff27 8021f3c 261ff27 4d6e8c2 70f5f26 f5ed443 70f5f26 1de5df2 f5ed443 8b76c22 4d6e8c2 261ff27 4d6e8c2 b45028b 4d6e8c2 70f5f26 4d6e8c2 1c33274 4d6e8c2 b45028b 261ff27 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
from fastapi import APIRouter
from datetime import datetime
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, f1_score
import random
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from .utils.evaluation import TextEvaluationRequest
from .utils.emissions import tracker, clean_emissions_data, get_space_info
import torch
import numpy as np
# URL path of the text-task endpoint; also echoed back as "api_route" in the response.
ROUTE = "/text"

# Label for the served model; used as the endpoint description and reported
# as "model_description" in the response.
DESCRIPTION = "BERT V1.1"

# Router on which this module registers its endpoint.
router = APIRouter()
def _predict_labels(model, tokenizer, quotes, device, batch_size=32):
    """Return the arg-max class index for every quote as a 1-D integer array.

    Args:
        model: Sequence-classification model already moved to ``device``.
        tokenizer: Tokenizer matching ``model``.
        quotes: List of input strings.
        device: Torch device the tokenized batches are moved to.
        batch_size: Number of quotes tokenized and scored per forward pass.

    Returns:
        ``numpy.ndarray`` of integer class ids, one per quote (empty when
        ``quotes`` is empty).
    """
    model.eval()
    batch_predictions = []
    with torch.no_grad():
        for start in range(0, len(quotes), batch_size):
            batch_quotes = quotes[start:start + batch_size]
            print(f'Processing batch {start // batch_size + 1}')
            # Pad only to the longest quote of this batch.
            tokenized_inputs = tokenizer(
                batch_quotes, padding=True, truncation=True, return_tensors='pt'
            ).to(device)
            logits = model(**tokenized_inputs).logits
            batch_predictions.append(torch.argmax(logits, dim=1).cpu().numpy())

    # Collect per-batch arrays and join once: avoids the quadratic copying of
    # repeated np.append and keeps the integer dtype of the class ids.
    if not batch_predictions:
        return np.empty(0, dtype=np.int64)
    return np.concatenate(batch_predictions)


@router.post(ROUTE, tags=["Text Task"], description=DESCRIPTION)
async def evaluate_text(request: TextEvaluationRequest):
    """
    Evaluate text classification for climate disinformation detection.

    Current Model: BERT
    - Uses a pre-trained BERT model for sequence classification

    Loads ``request.dataset_name``, maps its string labels to integer ids,
    runs batched inference over the "test" split while the emissions tracker
    is recording, and returns a dictionary with the accuracy, the tracked
    energy/emissions figures and the request's dataset configuration.
    """
    # Get space info
    username, space_url = get_space_info()

    # Define the label mapping
    LABEL_MAPPING = {
        "0_not_relevant": 0,
        "1_not_happening": 1,
        "2_not_human": 2,
        "3_not_bad": 3,
        "4_solutions_harmful_unnecessary": 4,
        "5_science_unreliable": 5,
        "6_proponents_biased": 6,
        "7_fossil_fuels_needed": 7,
    }

    # Load the dataset and convert string labels to integers
    dataset = load_dataset(request.dataset_name)
    dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})

    test_dataset = dataset["test"]
    # Materialize the columns once instead of re-reading them for every batch.
    quotes = list(test_dataset["quote"])
    true_labels = list(test_dataset["label"])
    print('dataset type: ', test_dataset.column_names)  # Debugging step
    print('dataset type: ', quotes[:5])  # Debugging step

    # Start tracking emissions
    tracker.start()
    tracker.start_task("inference")

    # ------------------------------------------------------------------
    # Model inference (tracked)
    # ------------------------------------------------------------------
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Candidate checkpoints; only the first entry is loaded.
    model_name = [
        "cococli/bert-base-uncased-frugalai",
        'cococli/roberta-base-frugalai',
        "cococli/distilbert-base-uncased-frugalai",
        "cococli/albert-base-v2-frugalai",
        "cococli/bert-base-uncased-coco-frugalai",
        "cococli/distilbert-base-uncased-coco-frugalai",
        "cococli/albert-base-v2-coco-frugalai",
        "cococli/electra-small-discriminator-coco-frugalai",
        'cococli/roberta-base-coco-frugalai',
        "cococli/distilbert-base-uncased-climate-frugalai",
        "cococli/albert-base-v2-climate-frugalai",
        "cococli/electra-small-discriminator-frugalai",
        "cococli/bert-base-uncased-climate-frugalai",
        "cococli/roberta-base-climate-frugalai",
    ]
    tokenizer = AutoTokenizer.from_pretrained(model_name[0])
    # The checkpoint id comes from model_name; the original indexed the
    # not-yet-assigned `model` variable here and failed before inference.
    model = AutoModelForSequenceClassification.from_pretrained(model_name[0]).to(device)

    print("Started prediction run")
    predictions = _predict_labels(model, tokenizer, quotes, device, batch_size=32)
    print("Finished prediction run")

    # Stop tracking emissions before computing metrics so that scoring is
    # not counted as part of the inference task.
    emissions_data = tracker.stop_task()

    print(predictions)
    print(true_labels)

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)
    print('Accuracy: ', accuracy)
    # The target has eight classes, so an explicit averaging mode is required;
    # the default average='binary' raises ValueError on multiclass targets.
    print('F1 SCORE: ', f1_score(true_labels, predictions, average="macro"))

    # Prepare results dictionary
    results = {
        "username": username,
        "space_url": space_url,
        "submission_timestamp": datetime.now().isoformat(),
        "model_description": DESCRIPTION,
        "accuracy": float(accuracy),
        # NOTE(review): the * 1000 presumably converts kWh -> Wh and kg -> g,
        # judging by the key names; confirm against the tracker's units.
        "energy_consumed_wh": emissions_data.energy_consumed * 1000,
        "emissions_gco2eq": emissions_data.emissions * 1000,
        "emissions_data": clean_emissions_data(emissions_data),
        "api_route": ROUTE,
        "dataset_config": {
            "dataset_name": request.dataset_name,
            "test_size": request.test_size,
            "test_seed": request.test_seed,
        },
    }
    print('Results: ', results)
    return results