OliverPerrin committed on
Commit
69b8f98
·
1 Parent(s): 9becd3c

Improve summarization output quality

Browse files

- Add _format_summary() for proper capitalization and punctuation
- Add repetition_penalty (1.2) to reduce repetitive outputs
- Fix period spacing and sentence capitalization
- Remove leading special characters from generated text

outputs/evaluation_report.json CHANGED
@@ -1,44 +1,80 @@
1
  {
2
  "split": "val",
3
  "summarization": {
4
- "rouge_like": 0.35947467920968945,
5
- "bleu": 0.09027012433010549
6
  },
7
  "emotion": {
8
- "f1_macro": 0.9455000162124634
9
  },
10
  "topic": {
11
- "accuracy": 0.94175,
12
  "classification_report": {
13
- "Business": {
14
- "precision": 0.9319045973038369,
15
- "recall": 0.8986666666666666,
16
- "f1-score": 0.9149838791786866,
17
- "support": 3000
18
- },
19
- "Sci/Tech": {
20
- "precision": 0.9055627425614489,
21
- "recall": 0.9333333333333333,
22
- "f1-score": 0.9192383453709784,
23
- "support": 3000
24
  },
25
- "Sports": {
26
- "precision": 0.9856475300400535,
27
- "recall": 0.9843333333333333,
28
- "f1-score": 0.9849899933288859,
29
- "support": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  },
31
- "World": {
32
- "precision": 0.9446836700894335,
33
- "recall": 0.9506666666666667,
34
- "f1-score": 0.9476657252035222,
35
- "support": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  },
37
  "macro avg": {
38
- "precision": 0.9419496349986932,
39
- "recall": 0.94175,
40
- "f1-score": 0.9417194857705183,
41
- "support": 12000
42
  }
43
  }
44
  }
 
1
  {
2
  "split": "val",
3
  "summarization": {
4
+ "rouge_like": 0.13567121660564777,
5
+ "bleu": 0.014673668103097205
6
  },
7
  "emotion": {
8
+ "f1_macro": 0.1939181685447693
9
  },
10
  "topic": {
11
+ "accuracy": 0.741687849517031,
12
  "classification_report": {
13
+ "Business & Finance": {
14
+ "precision": 0.6439114391143912,
15
+ "recall": 0.527190332326284,
16
+ "f1-score": 0.579734219269103,
17
+ "support": 1986
 
 
 
 
 
 
18
  },
19
+ "Computers & Internet": {
20
+ "precision": 0.8251038301799724,
21
+ "recall": 0.9044006069802731,
22
+ "f1-score": 0.862934362934363,
23
+ "support": 1977
24
+ },
25
+ "Education & Reference": {
26
+ "precision": 0.6439444076770351,
27
+ "recall": 0.49642857142857144,
28
+ "f1-score": 0.560645347162201,
29
+ "support": 1960
30
+ },
31
+ "Entertainment & Music": {
32
+ "precision": 0.7064310260186549,
33
+ "recall": 0.7360613810741689,
34
+ "f1-score": 0.7209418837675351,
35
+ "support": 1955
36
+ },
37
+ "Family & Relationships": {
38
+ "precision": 0.7182971014492754,
39
+ "recall": 0.8071246819338422,
40
+ "f1-score": 0.7601246105919003,
41
+ "support": 1965
42
+ },
43
+ "Health": {
44
+ "precision": 0.7610579115367077,
45
+ "recall": 0.8489318413021363,
46
+ "f1-score": 0.8025967780716519,
47
+ "support": 1966
48
  },
49
+ "Politics & Government": {
50
+ "precision": 0.7711132437619962,
51
+ "recall": 0.8173957273652085,
52
+ "f1-score": 0.7935802469135802,
53
+ "support": 1966
54
+ },
55
+ "Science & Mathematics": {
56
+ "precision": 0.7456647398843931,
57
+ "recall": 0.7885888945491595,
58
+ "f1-score": 0.7665263679128497,
59
+ "support": 1963
60
+ },
61
+ "Society & Culture": {
62
+ "precision": 0.6496559633027523,
63
+ "recall": 0.5783563042368556,
64
+ "f1-score": 0.6119362678908993,
65
+ "support": 1959
66
+ },
67
+ "Sports": {
68
+ "precision": 0.8888339920948617,
69
+ "recall": 0.9118094272681196,
70
+ "f1-score": 0.9001751313485113,
71
+ "support": 1973
72
  },
73
  "macro avg": {
74
+ "precision": 0.735401365502004,
75
+ "recall": 0.7416287768464619,
76
+ "f1-score": 0.7359195215862595,
77
+ "support": 19670
78
  }
79
  }
80
  }
src/inference/pipeline.py CHANGED
@@ -10,6 +10,7 @@ Date: December 2025
10
 
11
  from __future__ import annotations
12
 
 
13
  from dataclasses import dataclass, fields, replace
14
  from typing import Any, Dict, List, Sequence, cast
15
 
@@ -19,6 +20,46 @@ import torch.nn.functional as F
19
  from ..data.preprocessing import Batch, TextPreprocessor
20
  from ..data.tokenization import Tokenizer
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  # --------------- Configuration ---------------
23
 
24
 
@@ -27,6 +68,7 @@ class InferenceConfig:
27
  """Pipeline settings."""
28
 
29
  summary_max_length: int = 128
 
30
  emotion_threshold: float = 0.5
31
  device: str | None = None
32
 
@@ -116,10 +158,13 @@ class InferencePipeline:
116
  min_len=10,
117
  ban_token_ids=[i for i in ban_ids if i is not None],
118
  no_repeat_ngram_size=3,
 
119
  memory_mask=src_mask,
120
  )
121
 
122
- return self.tokenizer.decode_batch(generated.tolist())
 
 
123
 
124
  # --------------- Emotion ---------------
125
 
 
10
 
11
  from __future__ import annotations
12
 
13
+ import re
14
  from dataclasses import dataclass, fields, replace
15
  from typing import Any, Dict, List, Sequence, cast
16
 
 
20
  from ..data.preprocessing import Batch, TextPreprocessor
21
  from ..data.tokenization import Tokenizer
22
 
23
+ # --------------- Text Formatting ---------------
24
+
25
+
26
+ def _format_summary(text: str) -> str:
27
+ """Clean and format generated summary text.
28
+
29
+ - Capitalize first letter
30
+ - Fix period spacing (". " not " .")
31
+ - Remove extra whitespace
32
+ - Ensure proper sentence endings
33
+ """
34
+ if not text:
35
+ return text
36
+
37
+ # Strip and normalize whitespace
38
+ text = " ".join(text.split())
39
+
40
+ # Remove leading punctuation/special chars
41
+ text = re.sub(r"^[^A-Za-z0-9]+", "", text)
42
+
43
+ # Fix spacing around punctuation
44
+ text = re.sub(r"\s+([.!?,;:])", r"\1", text) # Remove space before punctuation
45
+ text = re.sub(
46
+ r"([.!?])([A-Za-z])", r"\1 \2", text
47
+ ) # Add space after sentence-ending punctuation
48
+
49
+ # Capitalize first letter
50
+ if text:
51
+ text = text[0].upper() + text[1:]
52
+
53
+ # Capitalize after sentence-ending punctuation
54
+ text = re.sub(r"([.!?])\s+([a-z])", lambda m: m.group(1) + " " + m.group(2).upper(), text)
55
+
56
+ # Ensure ends with punctuation
57
+ if text and text[-1] not in ".!?":
58
+ text += "."
59
+
60
+ return text
61
+
62
+
63
  # --------------- Configuration ---------------
64
 
65
 
 
68
  """Pipeline settings."""
69
 
70
  summary_max_length: int = 128
71
+ summary_repetition_penalty: float = 1.2 # Penalize repeated tokens
72
  emotion_threshold: float = 0.5
73
  device: str | None = None
74
 
 
158
  min_len=10,
159
  ban_token_ids=[i for i in ban_ids if i is not None],
160
  no_repeat_ngram_size=3,
161
+ repetition_penalty=self.config.summary_repetition_penalty,
162
  memory_mask=src_mask,
163
  )
164
 
165
+ # Decode and format summaries
166
+ raw_summaries = self.tokenizer.decode_batch(generated.tolist())
167
+ return [_format_summary(s) for s in raw_summaries]
168
 
169
  # --------------- Emotion ---------------
170