Commit: Update generic_ner.py
File changed: generic_ner.py (+31 lines, −22 lines)
Status: CHANGED
Diff hunk: @@ -238,38 +238,47 @@ class MultitaskTokenClassificationPipeline(Pipeline):
|
|
| 238 |
return outputs, text_chunks, text
|
| 239 |
|
| 240 |
def postprocess(self, outputs, **kwargs):
    """Postprocess model outputs for the NER tasks.

    Flattens the per-chunk logits into per-task prediction and
    confidence lists, realigns them with the words of each text chunk,
    and extracts entities per task.

    Args:
        outputs: Tuple ``(tokens_result, text_chunks, text)`` — per-chunk
            model results carrying a ``.logits`` dict keyed by task, the
            chunked input texts, and the full original text.
        **kwargs: Unused; accepted for pipeline-API compatibility.

    Returns:
        dict: Mapping of task name to the list of entities extracted
        from all chunks for that task.
    """
    tokens_result, text_chunks, text = outputs

    # Collect predictions and confidences across all chunks, per task.
    predictions = {task: [] for task in self.label_map}
    confidence_scores = {task: [] for task in self.label_map}

    for chunk_result in tokens_result:
        for task, logits in chunk_result.logits.items():
            predictions[task].extend(torch.argmax(logits, dim=-1).tolist())
            confidence_scores[task].extend(F.softmax(logits, dim=-1).tolist())

    # Previously a `decoded_predictions` mapping was built here and never
    # used, and `entities` was returned empty; entities are now actually
    # extracted per chunk below.
    entities = {}
    for task, preds in predictions.items():
        for idx, text_chunk in enumerate(text_chunks):
            # NOTE(review): slicing by whitespace word count assumes the
            # flattened predictions are word-aligned per chunk — confirm
            # against realign()'s expectations.
            n_words = len(text_chunk.split())
            lo, hi = idx * n_words, (idx + 1) * n_words
            words_list, preds_list, confidence_list = realign(
                text_chunk,
                preds[lo:hi],
                confidence_scores[task][lo:hi],
                self.tokenizer,
                self.id2label[task],
            )

            chunk_entities = get_entities(words_list, preds_list, confidence_list, text)
            entities.setdefault(task, []).extend(chunk_entities)

    return entities
|
|
|
|
| 238 |
return outputs, text_chunks, text
|
| 239 |
|
| 240 |
def postprocess(self, outputs, **kwargs):
    """Postprocess the outputs of the model for NER tasks.

    Merges the per-chunk logits into flat per-task prediction and
    confidence lists, realigns them with the words of each text chunk,
    and extracts entities per task.

    Args:
        outputs: Tuple ``(tokens_result, text_chunks, text)`` — per-chunk
            model results carrying a ``.logits`` dict keyed by task, the
            chunked input texts, and the full original text.
        **kwargs: Unused; accepted for pipeline-API compatibility.

    Returns:
        dict: Mapping of task name to the list of entities extracted
        from all chunks for that task.
    """
    tokens_result, text_chunks, text = outputs

    # Collect predictions and confidences across all chunks, per task.
    predictions = {task: [] for task in self.label_map}
    confidence_scores = {task: [] for task in self.label_map}

    for chunk_result in tokens_result:
        for task, logits in chunk_result.logits.items():
            predictions[task].extend(torch.argmax(logits, dim=-1).tolist())
            confidence_scores[task].extend(F.softmax(logits, dim=-1).tolist())

    entities = {}
    for task, preds in predictions.items():
        # Process each chunk individually (debug prints removed).
        for idx, text_chunk in enumerate(text_chunks):
            # NOTE(review): slicing by whitespace word count assumes the
            # flattened predictions are word-aligned per chunk — confirm
            # against realign()'s expectations.
            n_words = len(text_chunk.split())
            lo, hi = idx * n_words, (idx + 1) * n_words
            words_list, preds_list, confidence_list = realign(
                text_chunk,
                preds[lo:hi],
                confidence_scores[task][lo:hi],
                self.tokenizer,
                self.id2label[task],
            )

            # Get entities for this chunk and fold them into the
            # task-level result list.
            chunk_entities = get_entities(words_list, preds_list, confidence_list, text)
            entities.setdefault(task, []).extend(chunk_entities)

    return entities
|