Spaces:

OliverPerrin
/

LexiMind

Sleeping

App Files Community

OliverPerrin committed on Dec 3, 2025

Commit

c0044cc

1 Parent(s): 2dcb4b5

Fix: Resolve mypy type errors and configuration

Browse files

Files changed (11) hide show

pyproject.toml +40 -1
src/data/dataset.py +3 -3
src/data/preprocessing.py +1 -1
src/data/tokenization.py +11 -8
src/inference/pipeline.py +3 -3
src/models/factory.py +1 -1
src/training/metrics.py +4 -4
src/training/trainer.py +1 -1
src/utils/config.py +1 -1
src/utils/labels.py +1 -1
src/visualization/metrics.py +2 -0

pyproject.toml CHANGED Viewed

@@ -66,4 +66,43 @@ line-ending = "auto"
 [tool.pytest.ini_options]
 testpaths = ["tests"]
-python_files = "test_*.py"

 [tool.pytest.ini_options]
 testpaths = ["tests"]
+python_files = "test_*.py"
+[tool.mypy]
+python_version = "3.9"
+warn_return_any = true
+warn_unused_configs = true
+disallow_untyped_defs = false
+check_untyped_defs = true
+[[tool.mypy.overrides]]
+module = [
+    "torch.*",
+    "transformers.*",
+    "datasets.*",
+    "numpy.*",
+    "pandas.*",
+    "sklearn.*",
+    "matplotlib.*",
+    "seaborn.*",
+    "nltk.*",
+    "tqdm.*",
+    "yaml.*",
+    "omegaconf.*",
+    "gradio.*",
+    "requests.*",
+    "kaggle.*",
+    "streamlit.*",
+    "plotly.*",
+    "faiss.*",
+    "huggingface_hub.*",
+    "hydra.*",
+    "bitsandbytes.*",
+    "accelerate.*",
+    "fastapi.*",
+    "mlflow.*",
+    "pydantic.*",
+    "rouge_score.*"
+]
+ignore_missing_imports = true
+follow_imports = "skip"

src/data/dataset.py CHANGED Viewed

@@ -11,7 +11,7 @@ from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
 from torch.utils.data import Dataset
-@dataclass(slots=True)
 class SummarizationExample:
     """Container for abstractive summarization samples."""
@@ -19,7 +19,7 @@ class SummarizationExample:
     summary: str
-@dataclass(slots=True)
 class EmotionExample:
     """Container for multi-label emotion classification samples."""
@@ -27,7 +27,7 @@ class EmotionExample:
     emotions: Sequence[str]
-@dataclass(slots=True)
 class TopicExample:
     """Container for topic clustering / classification samples."""

 from torch.utils.data import Dataset
+@dataclass
 class SummarizationExample:
     """Container for abstractive summarization samples."""
     summary: str
+@dataclass
 class EmotionExample:
     """Container for multi-label emotion classification samples."""
     emotions: Sequence[str]
+@dataclass
 class TopicExample:
     """Container for topic clustering / classification samples."""

src/data/preprocessing.py CHANGED Viewed

@@ -31,7 +31,7 @@ class BasicTextCleaner(BaseEstimator, TransformerMixin):
         return " ".join(item.split())
-@dataclass(slots=True)
 class Batch:
     """Bundle of tensors returned by the text preprocessor."""

         return " ".join(item.split())
+@dataclass
 class Batch:
     """Bundle of tensors returned by the text preprocessor."""

src/data/tokenization.py CHANGED Viewed

@@ -9,7 +9,7 @@ import torch
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
-@dataclass(slots=True)
 class TokenizerConfig:
     pretrained_model_name: str = "facebook/bart-base"
     max_length: int = 512
@@ -72,11 +72,14 @@ class Tokenizer:
     def encode(self, text: str) -> List[int]:
         content = text.lower() if self.config.lower else text
-        return self._tokenizer.encode(
-            content,
-            max_length=self.config.max_length,
-            truncation=self.config.truncation,
-            padding=self.config.padding,
         )
     def encode_batch(self, texts: Sequence[str]) -> List[List[int]]:
@@ -114,11 +117,11 @@ class Tokenizer:
         }
     def decode(self, token_ids: Iterable[int]) -> str:
-        return self._tokenizer.decode(list(token_ids), skip_special_tokens=True)
     def decode_batch(self, sequences: Sequence[Sequence[int]]) -> List[str]:
         prepared = [list(seq) for seq in sequences]
-        return self._tokenizer.batch_decode(prepared, skip_special_tokens=True)
     def prepare_decoder_inputs(self, labels: torch.Tensor) -> torch.Tensor:
         """Shift decoder labels to create input ids prefixed by BOS."""

 from transformers import AutoTokenizer, PreTrainedTokenizerBase
+@dataclass
 class TokenizerConfig:
     pretrained_model_name: str = "facebook/bart-base"
     max_length: int = 512
     def encode(self, text: str) -> List[int]:
         content = text.lower() if self.config.lower else text
+        return cast(
+            List[int],
+            self._tokenizer.encode(
+                content,
+                max_length=self.config.max_length,
+                truncation=self.config.truncation,
+                padding=self.config.padding,
+            ),
         )
     def encode_batch(self, texts: Sequence[str]) -> List[List[int]]:
         }
     def decode(self, token_ids: Iterable[int]) -> str:
+        return cast(str, self._tokenizer.decode(list(token_ids), skip_special_tokens=True))
     def decode_batch(self, sequences: Sequence[Sequence[int]]) -> List[str]:
         prepared = [list(seq) for seq in sequences]
+        return cast(List[str], self._tokenizer.batch_decode(prepared, skip_special_tokens=True))
     def prepare_decoder_inputs(self, labels: torch.Tensor) -> torch.Tensor:
         """Shift decoder labels to create input ids prefixed by BOS."""

src/inference/pipeline.py CHANGED Viewed

@@ -12,7 +12,7 @@ from ..data.preprocessing import Batch, TextPreprocessor
 from ..data.tokenization import Tokenizer
-@dataclass(slots=True)
 class InferenceConfig:
     """Configuration knobs for the inference pipeline."""
@@ -21,13 +21,13 @@ class InferenceConfig:
     device: str | None = None
-@dataclass(slots=True)
 class EmotionPrediction:
     labels: List[str]
     scores: List[float]
-@dataclass(slots=True)
 class TopicPrediction:
     label: str
     confidence: float

 from ..data.tokenization import Tokenizer
+@dataclass
 class InferenceConfig:
     """Configuration knobs for the inference pipeline."""
     device: str | None = None
+@dataclass
 class EmotionPrediction:
     labels: List[str]
     scores: List[float]
+@dataclass
 class TopicPrediction:
     label: str
     confidence: float

src/models/factory.py CHANGED Viewed

@@ -17,7 +17,7 @@ from .heads import ClassificationHead, LMHead
 from .multitask import MultiTaskModel
-@dataclass(slots=True)
 class ModelConfig:
     """Configuration describing the transformer architecture."""

 from .multitask import MultiTaskModel
+@dataclass
 class ModelConfig:
     """Configuration describing the transformer architecture."""

src/training/metrics.py CHANGED Viewed

@@ -2,7 +2,7 @@
 from __future__ import annotations
-from typing import Any, Dict, List, Sequence
 import numpy as np
 import torch
@@ -11,7 +11,7 @@ from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_f
 def accuracy(predictions: Sequence[int | str], targets: Sequence[int | str]) -> float:
-    return accuracy_score(targets, predictions)
 def multilabel_f1(predictions: torch.Tensor, targets: torch.Tensor) -> float:
@@ -51,7 +51,7 @@ def calculate_bleu(predictions: Sequence[str], references: Sequence[str]) -> flo
         ref_tokens = [ref.split()]  # BLEU expects list of references
         scores.append(sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoother))
-    return sum(scores) / len(scores)
 def classification_report_dict(
@@ -87,4 +87,4 @@ def get_confusion_matrix(
     predictions: Sequence[int | str], targets: Sequence[int | str], labels: List[str] | None = None
 ) -> np.ndarray:
     """Compute confusion matrix."""
-    return confusion_matrix(targets, predictions, labels=labels)

 from __future__ import annotations
+from typing import Any, Dict, List, Sequence, cast
 import numpy as np
 import torch
 def accuracy(predictions: Sequence[int | str], targets: Sequence[int | str]) -> float:
+    return cast(float, accuracy_score(targets, predictions))
 def multilabel_f1(predictions: torch.Tensor, targets: torch.Tensor) -> float:
         ref_tokens = [ref.split()]  # BLEU expects list of references
         scores.append(sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoother))
+    return cast(float, sum(scores) / len(scores))
 def classification_report_dict(
     predictions: Sequence[int | str], targets: Sequence[int | str], labels: List[str] | None = None
 ) -> np.ndarray:
     """Compute confusion matrix."""
+    return cast(np.ndarray, confusion_matrix(targets, predictions, labels=labels))

src/training/trainer.py CHANGED Viewed

@@ -17,7 +17,7 @@ from ..data.tokenization import Tokenizer
 from .metrics import accuracy, multilabel_f1, rouge_like
-@dataclass(slots=True)
 class TrainerConfig:
     max_epochs: int = 1
     gradient_clip_norm: float = 1.0

 from .metrics import accuracy, multilabel_f1, rouge_like
+@dataclass
 class TrainerConfig:
     max_epochs: int = 1
     gradient_clip_norm: float = 1.0

src/utils/config.py CHANGED Viewed

@@ -7,7 +7,7 @@ from typing import Any, Dict
 import yaml
-@dataclass(slots=True)
 class Config:
     data: Dict[str, Any]

 import yaml
+@dataclass
 class Config:
     data: Dict[str, Any]

src/utils/labels.py CHANGED Viewed

@@ -8,7 +8,7 @@ from pathlib import Path
 from typing import List
-@dataclass(slots=True)
 class LabelMetadata:
     """Container for label vocabularies persisted after training."""

 from typing import List
+@dataclass
 class LabelMetadata:
     """Container for label vocabularies persisted after training."""

src/visualization/metrics.py CHANGED Viewed

@@ -1,5 +1,7 @@
 """Metric plotting helpers."""
 import matplotlib.pyplot as plt

 """Metric plotting helpers."""
+from __future__ import annotations
 import matplotlib.pyplot as plt