submission-template-tree

Sleeping

App Files Files Community

evgeniiarazum committed on Jan 31

Commit

58e2728

verified ·

1 Parent(s): d753f94

Update tasks/text.py

Browse files

Files changed (1) hide show

tasks/text.py +61 -62

tasks/text.py CHANGED Viewed

@@ -13,20 +13,28 @@ from .utils.emissions import tracker, clean_emissions_data, get_space_info
 router = APIRouter()
-MODEL_TYPE = "baseline"
-DESCRIPTIONS = {
-    "distilbert_frugalai": "distilbert tuned on frugal ai data",
-    "modernbert_frugalai": "distilbert tuned on frugal ai data",
-    "mpnet_frugalai": "mpnet tuned on frugal ai data",
-}
 ROUTE = "/text"
 class TextDataset(Dataset):
-    def __init__(self, texts, tokenizer, max_length=256):
         self.texts = texts
-        self.encodings = tokenizer(
             texts,
             truncation=True,
             padding=True,
@@ -35,43 +43,38 @@ class TextDataset(Dataset):
         )
     def __getitem__(self, idx):
-        item = {key: val[idx] for key, val in self.encodings.items()}
         return item
     def __len__(self) -> int:
         return len(self.texts)
-def baseline_model(dataset_length: int):
-    # Make random predictions (placeholder for actual model inference)
-    # predictions = [random.randint(0, 7) for _ in range(dataset_length)]
-    # My favorite baseline is the most common class.
-    predictions = [0] * dataset_length
-    return predictions
-def bert_model(test_dataset: dict, model_type: str):
     texts = test_dataset["quote"]
-    model_repo = f"evgeniiarazum/{MODEL_TYPE}"
-    print(f"Loading from model_repo: {model_repo}")
-    config = AutoConfig.from_pretrained(model_repo)
-    model = AutoModelForSequenceClassification.from_pretrained(model_repo)
     tokenizer = AutoTokenizer.from_pretrained(model_repo)
-    if torch.cuda.is_available():
-        device = torch.device("cuda")
     else:
-        device = torch.device("cpu")
-    print("Using device:", device)
     model = model.to(device)
     dataset = TextDataset(texts, tokenizer=tokenizer)
-    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)
     model.eval()
     with torch.no_grad():
-        print("Starting model run.")
         predictions = np.array([])
         for batch in dataloader:
             test_input_ids = batch["input_ids"].to(device)
@@ -79,21 +82,18 @@ def bert_model(test_dataset: dict, model_type: str):
             outputs = model(test_input_ids, test_attention_mask)
             p = torch.argmax(outputs.logits, dim=1)
             predictions = np.append(predictions, p.cpu().numpy())
-        print("End of model run.")
     return predictions
 @router.post(ROUTE, tags=["Text Task"])
-async def evaluate_text(
-    request: TextEvaluationRequest,
-    model_type: str = MODEL_TYPE,
-    # This should be an API query parameter, but it looks like the submission repo
-    # https://huggingface.co/spaces/frugal-ai-challenge/submission-portal
-    # is built in a way to not accept any other endpoints or parameters.
-):
     """
     Evaluate text classification for climate disinformation detection.
     Current Model: Random Baseline
     - Makes random predictions from the label space (0-7)
     - Used as a baseline for comparison
@@ -110,7 +110,7 @@ async def evaluate_text(
         "4_solutions_harmful_unnecessary": 4,
         "5_science_unreliable": 5,
         "6_proponents_biased": 6,
-        "7_fossil_fuels_needed": 7,
     }
     # Load and prepare the dataset
@@ -120,44 +120,43 @@ async def evaluate_text(
     dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
     # Split dataset
-    train_test = dataset["train"].train_test_split(
-        test_size=request.test_size, seed=request.test_seed
-    )
     test_dataset = train_test["test"]
     # Start tracking emissions
     tracker.start()
     tracker.start_task("inference")
-    # --------------------------------------------------------------------------------------------
     # YOUR MODEL INFERENCE CODE HERE
     # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
-    # --------------------------------------------------------------------------------------------
     true_labels = test_dataset["label"]
-    if model_type == "baseline":
         predictions = baseline_model(len(true_labels))
-    elif model_type in ["distilbert_frugalai", "modernbert_frugalai", "mpnet_frugalai"]:
-        predictions = bert_model(test_dataset, model_type)
-    else:
-        raise ValueError(model_type)
-    # --------------------------------------------------------------------------------------------
     # YOUR MODEL INFERENCE STOPS HERE
-    # --------------------------------------------------------------------------------------------
     # Stop tracking emissions
     emissions_data = tracker.stop_task()
     # Calculate accuracy
     accuracy = accuracy_score(true_labels, predictions)
     # Prepare results dictionary
     results = {
         "username": username,
         "space_url": space_url,
         "submission_timestamp": datetime.now().isoformat(),
-        "model_description": DESCRIPTIONS[model_type],
         "accuracy": float(accuracy),
         "energy_consumed_wh": emissions_data.energy_consumed * 1000,
         "emissions_gco2eq": emissions_data.emissions * 1000,
@@ -166,8 +165,8 @@ async def evaluate_text(
         "dataset_config": {
             "dataset_name": request.dataset_name,
             "test_size": request.test_size,
-            "test_seed": request.test_seed,
-        },
     }
-    return results

 router = APIRouter()
+DESCRIPTION = "Random Baseline"
 ROUTE = "/text"
+models_descriptions = {  # model name -> text reported as "model_description" in the evaluation results
+    "baseline": "random baseline", # Baseline
+    "tfidf_xgb": "TF-IDF vectorizer and XGBoost classifier", # Submitted
+    "bert_base_pruned": "Pruned BERT base model", # Submitted
+    'climate_bert_pruned': "Fine-tuned and pruned DistilRoBERTa pre-trained on climate texts", # Not working
+    "sbert_distilroberta": "Fine-tuned sentence transformer DistilRoBERTa"
+}  # NOTE(review): no entry for "distilbert_frugalai" (the evaluate_text default) or the other names bert_classifier accepts, so models_descriptions[model] raises KeyError for them
+def baseline_model(dataset_length: int):  # returns a list of `dataset_length` random integer labels
+    # Placeholder for real inference: draw one label per example from 0..7 (random.randint includes both endpoints); no seed is set in this function
+    predictions = [random.randint(0, 7) for _ in range(dataset_length)]
+    return predictions
 class TextDataset(Dataset):
+    def __init__(self, texts, tokenizer, max_length=512):
         self.texts = texts
+        self.tokenized_texts = tokenizer(
             texts,
             truncation=True,
             padding=True,
         )
     def __getitem__(self, idx):
+        item = {key: val[idx] for key, val in self.tokenized_texts.items()}
         return item
     def __len__(self) -> int:
         return len(self.texts)
+def bert_classifier(test_dataset: dict, model: str):
+    print("Starting BERT model run")
     texts = test_dataset["quote"]
+    model_repo = f"evgeniiarazum/{model}"
     tokenizer = AutoTokenizer.from_pretrained(model_repo)
+    if model in ["distilbert_frugalai", "deberta_frugalai", "modernbert_frugalai", "distilroberta_frugalai"]:
+        model = AutoModelForSequenceClassification.from_pretrained(model_repo)
     else:
+        raise(ValueError)
+    # Use CUDA if available
+    device, _, _ = get_backend()
     model = model.to(device)
+    # Prepare dataset
     dataset = TextDataset(texts, tokenizer=tokenizer)
+    dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
     model.eval()
     with torch.no_grad():
         predictions = np.array([])
         for batch in dataloader:
             test_input_ids = batch["input_ids"].to(device)
             outputs = model(test_input_ids, test_attention_mask)
             p = torch.argmax(outputs.logits, dim=1)
             predictions = np.append(predictions, p.cpu().numpy())
+    print("Finished BERT model run")
     return predictions
 @router.post(ROUTE, tags=["Text Task"])
+async def evaluate_text(request: TextEvaluationRequest,
+                        model: str = "distilbert_frugalai"):
     """
     Evaluate text classification for climate disinformation detection.
     Current Model: Random Baseline
     - Makes random predictions from the label space (0-7)
     - Used as a baseline for comparison
         "4_solutions_harmful_unnecessary": 4,
         "5_science_unreliable": 5,
         "6_proponents_biased": 6,
+        "7_fossil_fuels_needed": 7
     }
     # Load and prepare the dataset
     dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
     # Split dataset
+    train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
     test_dataset = train_test["test"]
     # Start tracking emissions
     tracker.start()
     tracker.start_task("inference")
+    #--------------------------------------------------------------------------------------------
     # YOUR MODEL INFERENCE CODE HERE
     # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
+    #--------------------------------------------------------------------------------------------
+    # Make random predictions (placeholder for actual model inference)
     true_labels = test_dataset["label"]
+    if model == "baseline":
         predictions = baseline_model(len(true_labels))
+    elif 'bert' in model:
+        predictions = bert_classifier(test_dataset, model)
+    #--------------------------------------------------------------------------------------------
     # YOUR MODEL INFERENCE STOPS HERE
+    #--------------------------------------------------------------------------------------------
     # Stop tracking emissions
     emissions_data = tracker.stop_task()
     # Calculate accuracy
     accuracy = accuracy_score(true_labels, predictions)
     # Prepare results dictionary
     results = {
         "username": username,
         "space_url": space_url,
         "submission_timestamp": datetime.now().isoformat(),
+        "model_description": models_descriptions[model],
         "accuracy": float(accuracy),
         "energy_consumed_wh": emissions_data.energy_consumed * 1000,
         "emissions_gco2eq": emissions_data.emissions * 1000,
         "dataset_config": {
             "dataset_name": request.dataset_name,
             "test_size": request.test_size,
+            "test_seed": request.test_seed
+        }
     }
+    return results