OliverPerrin committed on
Commit
c0044cc
·
1 Parent(s): 2dcb4b5

Fix: Resolve mypy type errors and configuration

Browse files
pyproject.toml CHANGED
@@ -66,4 +66,43 @@ line-ending = "auto"
66
 
67
  [tool.pytest.ini_options]
68
  testpaths = ["tests"]
69
- python_files = "test_*.py"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  [tool.pytest.ini_options]
68
  testpaths = ["tests"]
69
+ python_files = "test_*.py"
70
+
71
+ [tool.mypy]
72
+ python_version = "3.9"
73
+ warn_return_any = true
74
+ warn_unused_configs = true
75
+ disallow_untyped_defs = false
76
+ check_untyped_defs = true
77
+
78
+ [[tool.mypy.overrides]]
79
+ module = [
80
+ "torch.*",
81
+ "transformers.*",
82
+ "datasets.*",
83
+ "numpy.*",
84
+ "pandas.*",
85
+ "sklearn.*",
86
+ "matplotlib.*",
87
+ "seaborn.*",
88
+ "nltk.*",
89
+ "tqdm.*",
90
+ "yaml.*",
91
+ "omegaconf.*",
92
+ "gradio.*",
93
+ "requests.*",
94
+ "kaggle.*",
95
+ "streamlit.*",
96
+ "plotly.*",
97
+ "faiss.*",
98
+ "huggingface_hub.*",
99
+ "hydra.*",
100
+ "bitsandbytes.*",
101
+ "accelerate.*",
102
+ "fastapi.*",
103
+ "mlflow.*",
104
+ "pydantic.*",
105
+ "rouge_score.*"
106
+ ]
107
+ ignore_missing_imports = true
108
+ follow_imports = "skip"
src/data/dataset.py CHANGED
@@ -11,7 +11,7 @@ from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
11
  from torch.utils.data import Dataset
12
 
13
 
14
- @dataclass(slots=True)
15
  class SummarizationExample:
16
  """Container for abstractive summarization samples."""
17
 
@@ -19,7 +19,7 @@ class SummarizationExample:
19
  summary: str
20
 
21
 
22
- @dataclass(slots=True)
23
  class EmotionExample:
24
  """Container for multi-label emotion classification samples."""
25
 
@@ -27,7 +27,7 @@ class EmotionExample:
27
  emotions: Sequence[str]
28
 
29
 
30
- @dataclass(slots=True)
31
  class TopicExample:
32
  """Container for topic clustering / classification samples."""
33
 
 
11
  from torch.utils.data import Dataset
12
 
13
 
14
+ @dataclass
15
  class SummarizationExample:
16
  """Container for abstractive summarization samples."""
17
 
 
19
  summary: str
20
 
21
 
22
+ @dataclass
23
  class EmotionExample:
24
  """Container for multi-label emotion classification samples."""
25
 
 
27
  emotions: Sequence[str]
28
 
29
 
30
+ @dataclass
31
  class TopicExample:
32
  """Container for topic clustering / classification samples."""
33
 
src/data/preprocessing.py CHANGED
@@ -31,7 +31,7 @@ class BasicTextCleaner(BaseEstimator, TransformerMixin):
31
  return " ".join(item.split())
32
 
33
 
34
- @dataclass(slots=True)
35
  class Batch:
36
  """Bundle of tensors returned by the text preprocessor."""
37
 
 
31
  return " ".join(item.split())
32
 
33
 
34
+ @dataclass
35
  class Batch:
36
  """Bundle of tensors returned by the text preprocessor."""
37
 
src/data/tokenization.py CHANGED
@@ -9,7 +9,7 @@ import torch
9
  from transformers import AutoTokenizer, PreTrainedTokenizerBase
10
 
11
 
12
- @dataclass(slots=True)
13
  class TokenizerConfig:
14
  pretrained_model_name: str = "facebook/bart-base"
15
  max_length: int = 512
@@ -72,11 +72,14 @@ class Tokenizer:
72
 
73
  def encode(self, text: str) -> List[int]:
74
  content = text.lower() if self.config.lower else text
75
- return self._tokenizer.encode(
76
- content,
77
- max_length=self.config.max_length,
78
- truncation=self.config.truncation,
79
- padding=self.config.padding,
 
 
 
80
  )
81
 
82
  def encode_batch(self, texts: Sequence[str]) -> List[List[int]]:
@@ -114,11 +117,11 @@ class Tokenizer:
114
  }
115
 
116
  def decode(self, token_ids: Iterable[int]) -> str:
117
- return self._tokenizer.decode(list(token_ids), skip_special_tokens=True)
118
 
119
  def decode_batch(self, sequences: Sequence[Sequence[int]]) -> List[str]:
120
  prepared = [list(seq) for seq in sequences]
121
- return self._tokenizer.batch_decode(prepared, skip_special_tokens=True)
122
 
123
  def prepare_decoder_inputs(self, labels: torch.Tensor) -> torch.Tensor:
124
  """Shift decoder labels to create input ids prefixed by BOS."""
 
9
  from transformers import AutoTokenizer, PreTrainedTokenizerBase
10
 
11
 
12
+ @dataclass
13
  class TokenizerConfig:
14
  pretrained_model_name: str = "facebook/bart-base"
15
  max_length: int = 512
 
72
 
73
  def encode(self, text: str) -> List[int]:
74
  content = text.lower() if self.config.lower else text
75
+ return cast(
76
+ List[int],
77
+ self._tokenizer.encode(
78
+ content,
79
+ max_length=self.config.max_length,
80
+ truncation=self.config.truncation,
81
+ padding=self.config.padding,
82
+ ),
83
  )
84
 
85
  def encode_batch(self, texts: Sequence[str]) -> List[List[int]]:
 
117
  }
118
 
119
  def decode(self, token_ids: Iterable[int]) -> str:
120
+ return cast(str, self._tokenizer.decode(list(token_ids), skip_special_tokens=True))
121
 
122
  def decode_batch(self, sequences: Sequence[Sequence[int]]) -> List[str]:
123
  prepared = [list(seq) for seq in sequences]
124
+ return cast(List[str], self._tokenizer.batch_decode(prepared, skip_special_tokens=True))
125
 
126
  def prepare_decoder_inputs(self, labels: torch.Tensor) -> torch.Tensor:
127
  """Shift decoder labels to create input ids prefixed by BOS."""
src/inference/pipeline.py CHANGED
@@ -12,7 +12,7 @@ from ..data.preprocessing import Batch, TextPreprocessor
12
  from ..data.tokenization import Tokenizer
13
 
14
 
15
- @dataclass(slots=True)
16
  class InferenceConfig:
17
  """Configuration knobs for the inference pipeline."""
18
 
@@ -21,13 +21,13 @@ class InferenceConfig:
21
  device: str | None = None
22
 
23
 
24
- @dataclass(slots=True)
25
  class EmotionPrediction:
26
  labels: List[str]
27
  scores: List[float]
28
 
29
 
30
- @dataclass(slots=True)
31
  class TopicPrediction:
32
  label: str
33
  confidence: float
 
12
  from ..data.tokenization import Tokenizer
13
 
14
 
15
+ @dataclass
16
  class InferenceConfig:
17
  """Configuration knobs for the inference pipeline."""
18
 
 
21
  device: str | None = None
22
 
23
 
24
+ @dataclass
25
  class EmotionPrediction:
26
  labels: List[str]
27
  scores: List[float]
28
 
29
 
30
+ @dataclass
31
  class TopicPrediction:
32
  label: str
33
  confidence: float
src/models/factory.py CHANGED
@@ -17,7 +17,7 @@ from .heads import ClassificationHead, LMHead
17
  from .multitask import MultiTaskModel
18
 
19
 
20
- @dataclass(slots=True)
21
  class ModelConfig:
22
  """Configuration describing the transformer architecture."""
23
 
 
17
  from .multitask import MultiTaskModel
18
 
19
 
20
+ @dataclass
21
  class ModelConfig:
22
  """Configuration describing the transformer architecture."""
23
 
src/training/metrics.py CHANGED
@@ -2,7 +2,7 @@
2
 
3
  from __future__ import annotations
4
 
5
- from typing import Any, Dict, List, Sequence
6
 
7
  import numpy as np
8
  import torch
@@ -11,7 +11,7 @@ from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_f
11
 
12
 
13
  def accuracy(predictions: Sequence[int | str], targets: Sequence[int | str]) -> float:
14
- return accuracy_score(targets, predictions)
15
 
16
 
17
  def multilabel_f1(predictions: torch.Tensor, targets: torch.Tensor) -> float:
@@ -51,7 +51,7 @@ def calculate_bleu(predictions: Sequence[str], references: Sequence[str]) -> flo
51
  ref_tokens = [ref.split()] # BLEU expects list of references
52
  scores.append(sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoother))
53
 
54
- return sum(scores) / len(scores)
55
 
56
 
57
  def classification_report_dict(
@@ -87,4 +87,4 @@ def get_confusion_matrix(
87
  predictions: Sequence[int | str], targets: Sequence[int | str], labels: List[str] | None = None
88
  ) -> np.ndarray:
89
  """Compute confusion matrix."""
90
- return confusion_matrix(targets, predictions, labels=labels)
 
2
 
3
  from __future__ import annotations
4
 
5
+ from typing import Any, Dict, List, Sequence, cast
6
 
7
  import numpy as np
8
  import torch
 
11
 
12
 
13
  def accuracy(predictions: Sequence[int | str], targets: Sequence[int | str]) -> float:
14
+ return cast(float, accuracy_score(targets, predictions))
15
 
16
 
17
  def multilabel_f1(predictions: torch.Tensor, targets: torch.Tensor) -> float:
 
51
  ref_tokens = [ref.split()] # BLEU expects list of references
52
  scores.append(sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoother))
53
 
54
+ return cast(float, sum(scores) / len(scores))
55
 
56
 
57
  def classification_report_dict(
 
87
  predictions: Sequence[int | str], targets: Sequence[int | str], labels: List[str] | None = None
88
  ) -> np.ndarray:
89
  """Compute confusion matrix."""
90
+ return cast(np.ndarray, confusion_matrix(targets, predictions, labels=labels))
src/training/trainer.py CHANGED
@@ -17,7 +17,7 @@ from ..data.tokenization import Tokenizer
17
  from .metrics import accuracy, multilabel_f1, rouge_like
18
 
19
 
20
- @dataclass(slots=True)
21
  class TrainerConfig:
22
  max_epochs: int = 1
23
  gradient_clip_norm: float = 1.0
 
17
  from .metrics import accuracy, multilabel_f1, rouge_like
18
 
19
 
20
+ @dataclass
21
  class TrainerConfig:
22
  max_epochs: int = 1
23
  gradient_clip_norm: float = 1.0
src/utils/config.py CHANGED
@@ -7,7 +7,7 @@ from typing import Any, Dict
7
  import yaml
8
 
9
 
10
- @dataclass(slots=True)
11
  class Config:
12
  data: Dict[str, Any]
13
 
 
7
  import yaml
8
 
9
 
10
+ @dataclass
11
  class Config:
12
  data: Dict[str, Any]
13
 
src/utils/labels.py CHANGED
@@ -8,7 +8,7 @@ from pathlib import Path
8
  from typing import List
9
 
10
 
11
- @dataclass(slots=True)
12
  class LabelMetadata:
13
  """Container for label vocabularies persisted after training."""
14
 
 
8
  from typing import List
9
 
10
 
11
+ @dataclass
12
  class LabelMetadata:
13
  """Container for label vocabularies persisted after training."""
14
 
src/visualization/metrics.py CHANGED
@@ -1,5 +1,7 @@
1
  """Metric plotting helpers."""
2
 
 
 
3
  import matplotlib.pyplot as plt
4
 
5
 
 
1
  """Metric plotting helpers."""
2
 
3
+ from __future__ import annotations
4
+
5
  import matplotlib.pyplot as plt
6
 
7