Spaces:

TraceForce
/

varun-kd-finetune

Sleeping

App Files Files Community

Varun Wadhwa committed on Jan 7

Commit

8e7d1ea

unverified ·

1 Parent(s): 2293530

Logs

Browse files

Files changed (1) hide show

app.py +40 -50

app.py CHANGED Viewed

@@ -78,31 +78,32 @@ print(raw_dataset.column_names)
 # function to align labels with tokens
 # --> special tokens: -100 label id (ignored by cross entropy),
 # --> if tokens are inside a word, replace 'B-' with 'I-'
def align_labels_with_tokens(labels):
    """Convert string tags to label ids, bracketed by -100 placeholders.

    Each tag in ``labels`` that starts with ``B-`` is looked up under its
    ``I-`` form; every other tag is looked up unchanged.  Lookups go through
    the module-level ``label2id`` mapping, so an unknown tag raises KeyError.

    Returns:
        A list of ``len(labels) + 2`` ids: one -100 in front and one at the
        end (the slots for the special tokens; -100 is ignored by cross
        entropy), with the converted tag ids in between.
    """
    def _as_inside(tag):
        # Only tags carrying the "B-" prefix are rewritten.
        return tag.replace("B-", "I-") if tag.startswith("B-") else tag

    return [-100, *(label2id[_as_inside(tag)] for tag in labels), -100]
 # create tokenize function
 def tokenize_function(examples):
-    # tokenize and truncate text. The examples argument would have already stripped
-    # the train or test label.
-    new_labels = []
     inputs = tokenizer(
         examples['mbert_tokens'],
         is_split_into_words=True,
-        padding=True,
         truncation=True,
-        max_length=512)
-    for _, labels in enumerate(examples['mbert_token_classes']):
-        new_labels.append(align_labels_with_tokens(labels))
-    inputs["labels"] = new_labels
     return inputs
 # tokenize training and validation datasets
@@ -111,54 +112,43 @@ tokenized_data = raw_dataset.map(
     batched=True)
 tokenized_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
 # data collator
-data_collator = DataCollatorForTokenClassification(tokenizer)
 st.write(tokenized_data["train"][:2]["labels"])
 # Function to evaluate model performance
 def evaluate_model(model, dataloader, device):
-    model.eval()  # Set model to evaluation mode
-    all_preds = []
-    all_labels = []
-    # Disable gradient calculations
     with torch.no_grad():
         for batch in dataloader:
             input_ids = batch['input_ids'].to(device)
             attention_mask = batch['attention_mask'].to(device)
-            labels = batch['labels'].to(device).cpu().numpy()
-            x = len(labels[0])
-            print(x)
-            print("OTHERS:")
-            for l in labels:
-                if len(l) != x:
-                    print(len(l))
-                    break
-            # Forward pass to get logits
             outputs = model(input_ids, attention_mask=attention_mask)
             logits = outputs.logits
-            # Get predictions
-            preds = torch.argmax(logits, dim=-1).cpu().numpy()
-            all_preds.extend(preds)
-            all_labels.extend(labels)
-    # Calculate evaluation metrics
-    print("evaluate_model sizes")
-    print(len(all_preds[0]))
-    print(len(all_labels[0]))
-    all_preds = np.asarray(all_preds, dtype=np.float32)
-    all_labels = np.asarray(all_labels, dtype=np.float32)
-    print("Flattened sizes")
-    print(all_preds.size)
-    print(all_labels.size)
-    all_preds = all_preds.flatten()
-    all_labels = all_labels.flatten()
     accuracy = accuracy_score(all_labels, all_preds)
-    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='micro')
     return accuracy, precision, recall, f1

 # function to align labels with tokens
 # --> special tokens: -100 label id (ignored by cross entropy),
 # --> if tokens are inside a word, replace 'B-' with 'I-'
+def align_labels_with_tokens(labels, word_ids, max_length):
     aligned_label_ids = []
+    for word_id in word_ids:
+        if word_id is None:
+            aligned_label_ids.append(-100)
+        else:
+            aligned_label_ids.append(label2id[labels[word_id]].replace("B-", "I-"))
+    # Pad to max length
+    aligned_label_ids += [-100] * (max_length - len(aligned_label_ids))
     return aligned_label_ids
 # create tokenize function
 def tokenize_function(examples):
     inputs = tokenizer(
         examples['mbert_tokens'],
         is_split_into_words=True,
         truncation=True,
+        max_length=512,
+        padding="max_length"
+    )
+    word_ids = inputs.word_ids()
+    inputs["labels"] = [
+        align_labels_with_tokens(labels, word_ids, tokenizer.model_max_length)
+        for labels in examples['mbert_token_classes']
+    ]
     return inputs
 # tokenize training and validation datasets
     batched=True)
 tokenized_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
 # data collator
# The collator only pads; it has no `truncation` option (passing one raises
# TypeError).  Truncation is handled where the tokenizer is called.
data_collator = DataCollatorForTokenClassification(
    tokenizer, padding=True, max_length=512
)
 st.write(tokenized_data["train"][:2]["labels"])
 # Function to evaluate model performance
 def evaluate_model(model, dataloader, device):
+    model.eval()
+    all_preds, all_labels = [], []
     with torch.no_grad():
         for batch in dataloader:
             input_ids = batch['input_ids'].to(device)
             attention_mask = batch['attention_mask'].to(device)
+            labels = batch['labels'].to(device)
             outputs = model(input_ids, attention_mask=attention_mask)
             logits = outputs.logits
+            preds = torch.argmax(logits, dim=-1)
+            # Mask out padding tokens (-100 in labels)
+            mask = labels != -100
+            valid_preds = preds[mask]
+            valid_labels = labels[mask]
+            all_preds.extend(valid_preds.cpu().numpy())
+            all_labels.extend(valid_labels.cpu().numpy())
+    # Convert to numpy arrays for metrics calculation
+    all_preds = np.array(all_preds)
+    all_labels = np.array(all_labels)
     accuracy = accuracy_score(all_labels, all_preds)
+    precision, recall, f1, _ = precision_recall_fscore_support(
+        all_labels, all_preds, average='micro'
+    )
     return accuracy, precision, recall, f1