Spaces:

OliverPerrin
/

LexiMind

Sleeping

App Files Community

OliverPerrin committed on Dec 3, 2025

Commit

f9d964d

1 Parent(s): f6d689c

Style: Fix linting errors and organize imports (ruff & mypy)

Browse files

Files changed (20) hide show

scripts/download_data.py +49 -22
scripts/eval_rouge.py +164 -150
scripts/preprocess_data.py +23 -10
src/api/dependencies.py +1 -0
src/api/routes.py +4 -1
src/data/dataloader.py +28 -9
src/data/preprocessing.py +0 -2
src/inference/factory.py +10 -6
src/models/__init__.py +4 -4
src/models/heads.py +15 -5
src/models/multitask.py +26 -11
src/models/positional_encoding.py +14 -15
src/training/utils.py +0 -2
src/visualization/embeddings.py +9 -7
tests/test_models/test_decoder_step.py +4 -4
tests/test_models/test_encoder.py +3 -3
tests/test_models/test_encoder_layer.py +3 -2
tests/test_models/test_feedforward.py +8 -8
tests/test_models/test_heads.py +3 -3
tests/test_models/test_multitask.py +49 -13

scripts/download_data.py CHANGED Viewed

@@ -13,14 +13,12 @@ from urllib.request import urlopen
 from datasets import ClassLabel, Dataset, DatasetDict, load_dataset
 PROJECT_ROOT = Path(__file__).resolve().parents[1]
 if str(PROJECT_ROOT) not in sys.path:
     sys.path.insert(0, str(PROJECT_ROOT))
 from src.utils.config import load_yaml
 DOWNLOAD_TIMEOUT = 60
 DEFAULT_SUMMARIZATION_DATASET = "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
 DEFAULT_EMOTION_DATASET = "dair-ai/emotion"
@@ -33,16 +31,19 @@ def kaggle_download(dataset: str, output_dir: str) -> None:
     target = Path(output_dir)
     target.mkdir(parents=True, exist_ok=True)
     try:
-        run([
-            "kaggle",
-            "datasets",
-            "download",
-            "-d",
-            dataset,
-            "-p",
-            str(target),
-            "--unzip",
-        ], check=True)
     except CalledProcessError as error:
         raise RuntimeError(
             "Kaggle download failed. Verify that the Kaggle CLI is authenticated,"
@@ -71,8 +72,14 @@ def parse_args() -> argparse.Namespace:
         default="configs/data/datasets.yaml",
         help="Path to the dataset configuration YAML.",
     )
-    parser.add_argument("--skip-kaggle", action="store_true", help="Skip downloading the Kaggle summarization dataset.")
-    parser.add_argument("--skip-book", action="store_true", help="Skip downloading Gutenberg book texts.")
     return parser.parse_args()
@@ -92,11 +99,14 @@ def _write_jsonl(records: Iterable[dict[str, object]], destination: Path) -> Non
             handle.write(json.dumps(record, ensure_ascii=False) + "\n")
-def _emotion_records(dataset_split: Dataset, label_names: list[str] | None) -> Iterator[dict[str, object]]:
     for item in dataset_split:
         data = dict(item)
         text = data.get("text", "")
         label_value = data.get("label")
         def resolve_label(index: object) -> str:
             if isinstance(index, int) and label_names and 0 <= index < len(label_names):
                 return label_names[index]
@@ -109,11 +119,14 @@ def _emotion_records(dataset_split: Dataset, label_names: list[str] | None) -> I
         yield {"text": text, "emotions": labels}
-def _topic_records(dataset_split: Dataset, label_names: list[str] | None) -> Iterator[dict[str, object]]:
     for item in dataset_split:
         data = dict(item)
         text = data.get("text") or data.get("content") or ""
         label_value = data.get("label")
         def resolve_topic(raw: object) -> str:
             if label_names:
                 idx: int | None = None
@@ -142,12 +155,18 @@ def main() -> None:
     raw_paths = config.get("raw", {}) if isinstance(config, dict) else {}
     downloads_cfg = config.get("downloads", {}) if isinstance(config, dict) else {}
-    summarization_cfg = downloads_cfg.get("summarization", {}) if isinstance(downloads_cfg, dict) else {}
     summarization_dataset = summarization_cfg.get("dataset", DEFAULT_SUMMARIZATION_DATASET)
-    summarization_output = summarization_cfg.get("output", raw_paths.get("summarization", "data/raw/summarization"))
     if not args.skip_kaggle and summarization_dataset:
-        print(f"Downloading summarization dataset '{summarization_dataset}' -> {summarization_output}")
         kaggle_download(summarization_dataset, summarization_output)
     else:
         print("Skipping Kaggle summarization download.")
@@ -174,7 +193,11 @@ def main() -> None:
             name = str(entry.get("name") or "gutenberg_text")
             url = str(entry.get("url") or DEFAULT_BOOK_URL)
             output_value = entry.get("output")
-            destination = Path(output_value) if isinstance(output_value, str) and output_value else books_root / f"{name}.txt"
             destination.parent.mkdir(parents=True, exist_ok=True)
             print(f"Downloading Gutenberg text '{name}' from {url} -> {destination}")
             gutenberg_download(url, str(destination))
@@ -192,7 +215,9 @@ def main() -> None:
         if first_emotion_key is not None
         else None
     )
-    emotion_label_names = emotion_label_feature.names if isinstance(emotion_label_feature, ClassLabel) else None
     for split_name, split in emotion_dataset.items():
         output_path = emotion_dir / f"{str(split_name)}.jsonl"
         _write_jsonl(_emotion_records(split, emotion_label_names), output_path)
@@ -209,7 +234,9 @@ def main() -> None:
         if first_topic_key is not None
         else None
     )
-    topic_label_names = topic_label_feature.names if isinstance(topic_label_feature, ClassLabel) else None
     for split_name, split in topic_dataset.items():
         output_path = topic_dir / f"{str(split_name)}.jsonl"
         _write_jsonl(_topic_records(split, topic_label_names), output_path)

 from datasets import ClassLabel, Dataset, DatasetDict, load_dataset
 PROJECT_ROOT = Path(__file__).resolve().parents[1]
 if str(PROJECT_ROOT) not in sys.path:
     sys.path.insert(0, str(PROJECT_ROOT))
 from src.utils.config import load_yaml
 DOWNLOAD_TIMEOUT = 60
 DEFAULT_SUMMARIZATION_DATASET = "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
 DEFAULT_EMOTION_DATASET = "dair-ai/emotion"
     target = Path(output_dir)
     target.mkdir(parents=True, exist_ok=True)
     try:
+        run(
+            [
+                "kaggle",
+                "datasets",
+                "download",
+                "-d",
+                dataset,
+                "-p",
+                str(target),
+                "--unzip",
+            ],
+            check=True,
+        )
     except CalledProcessError as error:
         raise RuntimeError(
             "Kaggle download failed. Verify that the Kaggle CLI is authenticated,"
         default="configs/data/datasets.yaml",
         help="Path to the dataset configuration YAML.",
     )
+    parser.add_argument(
+        "--skip-kaggle",
+        action="store_true",
+        help="Skip downloading the Kaggle summarization dataset.",
+    )
+    parser.add_argument(
+        "--skip-book", action="store_true", help="Skip downloading Gutenberg book texts."
+    )
     return parser.parse_args()
             handle.write(json.dumps(record, ensure_ascii=False) + "\n")
+def _emotion_records(
+    dataset_split: Dataset, label_names: list[str] | None
+) -> Iterator[dict[str, object]]:
     for item in dataset_split:
         data = dict(item)
         text = data.get("text", "")
         label_value = data.get("label")
         def resolve_label(index: object) -> str:
             if isinstance(index, int) and label_names and 0 <= index < len(label_names):
                 return label_names[index]
         yield {"text": text, "emotions": labels}
+def _topic_records(
+    dataset_split: Dataset, label_names: list[str] | None
+) -> Iterator[dict[str, object]]:
     for item in dataset_split:
         data = dict(item)
         text = data.get("text") or data.get("content") or ""
         label_value = data.get("label")
         def resolve_topic(raw: object) -> str:
             if label_names:
                 idx: int | None = None
     raw_paths = config.get("raw", {}) if isinstance(config, dict) else {}
     downloads_cfg = config.get("downloads", {}) if isinstance(config, dict) else {}
+    summarization_cfg = (
+        downloads_cfg.get("summarization", {}) if isinstance(downloads_cfg, dict) else {}
+    )
     summarization_dataset = summarization_cfg.get("dataset", DEFAULT_SUMMARIZATION_DATASET)
+    summarization_output = summarization_cfg.get(
+        "output", raw_paths.get("summarization", "data/raw/summarization")
+    )
     if not args.skip_kaggle and summarization_dataset:
+        print(
+            f"Downloading summarization dataset '{summarization_dataset}' -> {summarization_output}"
+        )
         kaggle_download(summarization_dataset, summarization_output)
     else:
         print("Skipping Kaggle summarization download.")
             name = str(entry.get("name") or "gutenberg_text")
             url = str(entry.get("url") or DEFAULT_BOOK_URL)
             output_value = entry.get("output")
+            destination = (
+                Path(output_value)
+                if isinstance(output_value, str) and output_value
+                else books_root / f"{name}.txt"
+            )
             destination.parent.mkdir(parents=True, exist_ok=True)
             print(f"Downloading Gutenberg text '{name}' from {url} -> {destination}")
             gutenberg_download(url, str(destination))
         if first_emotion_key is not None
         else None
     )
+    emotion_label_names = (
+        emotion_label_feature.names if isinstance(emotion_label_feature, ClassLabel) else None
+    )
     for split_name, split in emotion_dataset.items():
         output_path = emotion_dir / f"{str(split_name)}.jsonl"
         _write_jsonl(_emotion_records(split, emotion_label_names), output_path)
         if first_topic_key is not None
         else None
     )
+    topic_label_names = (
+        topic_label_feature.names if isinstance(topic_label_feature, ClassLabel) else None
+    )
     for split_name, split in topic_dataset.items():
         output_path = topic_dir / f"{str(split_name)}.jsonl"
         _write_jsonl(_topic_records(split, topic_label_names), output_path)

scripts/eval_rouge.py CHANGED Viewed

@@ -3,181 +3,195 @@ from __future__ import annotations
 import argparse
 import json
 from collections import defaultdict
 from pathlib import Path
 from statistics import fmean
 from typing import Dict, Iterable, List, Sequence, Tuple
-import sys
 from rouge_score import rouge_scorer
 from tqdm import tqdm
 PROJECT_ROOT = Path(__file__).resolve().parent.parent
 if str(PROJECT_ROOT) not in sys.path:
-	sys.path.insert(0, str(PROJECT_ROOT))
 from src.inference.factory import create_inference_pipeline
 def parse_args() -> argparse.Namespace:
-	parser = argparse.ArgumentParser(description="Evaluate LexiMind summaries with ROUGE metrics.")
-	parser.add_argument("data", type=Path, help="Path to JSONL file with source text and gold summaries.")
-	parser.add_argument("checkpoint", type=Path, help="Path to the trained checkpoint (e.g., checkpoints/best.pt).")
-	parser.add_argument("labels", type=Path, help="Path to label metadata (e.g., artifacts/labels.json).")
-	parser.add_argument(
-		"--tokenizer-dir",
-		type=Path,
-		default=Path("artifacts/hf_tokenizer"),
-		help="Directory containing the saved tokenizer artifacts.",
-	)
-	parser.add_argument(
-		"--model-config",
-		type=Path,
-		default=None,
-		help="Optional YAML config describing the model architecture.",
-	)
-	parser.add_argument("--device", type=str, default="cpu", help="Device to run inference on (cpu or cuda).")
-	parser.add_argument("--batch-size", type=int, default=8, help="Number of samples per inference batch.")
-	parser.add_argument(
-		"--max-samples",
-		type=int,
-		default=None,
-		help="If provided, limit evaluation to the first N samples for quick smoke tests.",
-	)
-	parser.add_argument(
-		"--max-length",
-		type=int,
-		default=128,
-		help="Maximum length to pass into the summarization head during generation.",
-	)
-	parser.add_argument(
-		"--metrics",
-		type=str,
-		nargs="+",
-		default=("rouge1", "rouge2", "rougeL"),
-		help="ROUGE metrics to compute.",
-	)
-	parser.add_argument(
-		"--source-field",
-		type=str,
-		default="source",
-		help="Field name containing the input document in the JSONL examples.",
-	)
-	parser.add_argument(
-		"--target-field",
-		type=str,
-		default="summary",
-		help="Field name containing the reference summary in the JSONL examples.",
-	)
-	parser.add_argument(
-		"--no-stemmer",
-		action="store_true",
-		help="Disable Porter stemming inside the ROUGE scorer (defaults to enabled).",
-	)
-	parser.add_argument(
-		"--output",
-		type=Path,
-		default=None,
-		help="Optional path to save a JSON report with aggregate metrics and sample counts.",
-	)
-	return parser.parse_args()
 def load_examples(
-	path: Path,
-	source_field: str,
-	target_field: str,
-	max_samples: int | None,
 ) -> List[Tuple[str, str]]:
-	examples: List[Tuple[str, str]] = []
-	with path.open("r", encoding="utf-8") as handle:
-		for line in handle:
-			line = line.strip()
-			if not line:
-				continue
-			record = json.loads(line)
-			try:
-				source = str(record[source_field])
-				target = str(record[target_field])
-			except KeyError as exc:  # pragma: no cover - invalid data surface at runtime
-				raise KeyError(f"Missing field in record: {exc} (available keys: {list(record)})") from exc
-			examples.append((source, target))
-			if max_samples is not None and len(examples) >= max_samples:
-				break
-	if not examples:
-		raise ValueError(f"No examples loaded from {path}")
-	return examples
-def batched(items: Sequence[Tuple[str, str]], batch_size: int) -> Iterable[Sequence[Tuple[str, str]]]:
-	for start in range(0, len(items), batch_size):
-		yield items[start : start + batch_size]
 def aggregate_scores(raw_scores: Dict[str, Dict[str, List[float]]]) -> Dict[str, Dict[str, float]]:
-	aggregated: Dict[str, Dict[str, float]] = {}
-	for metric, components in raw_scores.items():
-		aggregated[metric] = {
-			component: (fmean(values) if values else 0.0) for component, values in components.items()
-		}
-	return aggregated
 def main() -> None:
-	args = parse_args()
-	pipeline, _ = create_inference_pipeline(
-		checkpoint_path=args.checkpoint,
-		labels_path=args.labels,
-		tokenizer_dir=args.tokenizer_dir,
-		model_config_path=args.model_config,
-		device=args.device,
-		summary_max_length=args.max_length,
-	)
-	examples = load_examples(args.data, args.source_field, args.target_field, args.max_samples)
-	scorer = rouge_scorer.RougeScorer(list(args.metrics), use_stemmer=not args.no_stemmer)
-	score_store: Dict[str, Dict[str, List[float]]] = defaultdict(lambda: defaultdict(list))
-	for batch in tqdm(
-		list(batched(examples, args.batch_size)),
-		desc="Evaluating",
-		total=(len(examples) + args.batch_size - 1) // args.batch_size,
-	):
-		documents = [item[0] for item in batch]
-		references = [item[1] for item in batch]
-		predictions = pipeline.summarize(documents, max_length=args.max_length)
-		for reference, prediction in zip(references, predictions):
-			scores = scorer.score(reference, prediction)
-			for metric_name, score in scores.items():
-				score_store[metric_name]["precision"].append(score.precision)
-				score_store[metric_name]["recall"].append(score.recall)
-				score_store[metric_name]["fmeasure"].append(score.fmeasure)
-	aggregated = aggregate_scores(score_store)
-	report = {
-		"num_examples": len(examples),
-		"metrics": aggregated,
-		"config": {
-			"data": str(args.data),
-			"checkpoint": str(args.checkpoint),
-			"tokenizer_dir": str(args.tokenizer_dir),
-			"metrics": list(args.metrics),
-			"max_length": args.max_length,
-			"batch_size": args.batch_size,
-			"device": args.device,
-		},
-	}
-	print(json.dumps(report, indent=2))
-	if args.output:
-		args.output.parent.mkdir(parents=True, exist_ok=True)
-		with args.output.open("w", encoding="utf-8") as handle:
-			json.dump(report, handle, ensure_ascii=False, indent=2)
 if __name__ == "__main__":
-	main()

 import argparse
 import json
+import sys
 from collections import defaultdict
 from pathlib import Path
 from statistics import fmean
 from typing import Dict, Iterable, List, Sequence, Tuple
 from rouge_score import rouge_scorer
 from tqdm import tqdm
 PROJECT_ROOT = Path(__file__).resolve().parent.parent
 if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
 from src.inference.factory import create_inference_pipeline
 def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Evaluate LexiMind summaries with ROUGE metrics.")
+    parser.add_argument(
+        "data", type=Path, help="Path to JSONL file with source text and gold summaries."
+    )
+    parser.add_argument(
+        "checkpoint", type=Path, help="Path to the trained checkpoint (e.g., checkpoints/best.pt)."
+    )
+    parser.add_argument(
+        "labels", type=Path, help="Path to label metadata (e.g., artifacts/labels.json)."
+    )
+    parser.add_argument(
+        "--tokenizer-dir",
+        type=Path,
+        default=Path("artifacts/hf_tokenizer"),
+        help="Directory containing the saved tokenizer artifacts.",
+    )
+    parser.add_argument(
+        "--model-config",
+        type=Path,
+        default=None,
+        help="Optional YAML config describing the model architecture.",
+    )
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="Device to run inference on (cpu or cuda)."
+    )
+    parser.add_argument(
+        "--batch-size", type=int, default=8, help="Number of samples per inference batch."
+    )
+    parser.add_argument(
+        "--max-samples",
+        type=int,
+        default=None,
+        help="If provided, limit evaluation to the first N samples for quick smoke tests.",
+    )
+    parser.add_argument(
+        "--max-length",
+        type=int,
+        default=128,
+        help="Maximum length to pass into the summarization head during generation.",
+    )
+    parser.add_argument(
+        "--metrics",
+        type=str,
+        nargs="+",
+        default=("rouge1", "rouge2", "rougeL"),
+        help="ROUGE metrics to compute.",
+    )
+    parser.add_argument(
+        "--source-field",
+        type=str,
+        default="source",
+        help="Field name containing the input document in the JSONL examples.",
+    )
+    parser.add_argument(
+        "--target-field",
+        type=str,
+        default="summary",
+        help="Field name containing the reference summary in the JSONL examples.",
+    )
+    parser.add_argument(
+        "--no-stemmer",
+        action="store_true",
+        help="Disable Porter stemming inside the ROUGE scorer (defaults to enabled).",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=None,
+        help="Optional path to save a JSON report with aggregate metrics and sample counts.",
+    )
+    return parser.parse_args()
 def load_examples(
+    path: Path,
+    source_field: str,
+    target_field: str,
+    max_samples: int | None,
 ) -> List[Tuple[str, str]]:
+    examples: List[Tuple[str, str]] = []
+    with path.open("r", encoding="utf-8") as handle:
+        for line in handle:
+            line = line.strip()
+            if not line:
+                continue
+            record = json.loads(line)
+            try:
+                source = str(record[source_field])
+                target = str(record[target_field])
+            except KeyError as exc:  # pragma: no cover - invalid data surface at runtime
+                raise KeyError(
+                    f"Missing field in record: {exc} (available keys: {list(record)})"
+                ) from exc
+            examples.append((source, target))
+            if max_samples is not None and len(examples) >= max_samples:
+                break
+    if not examples:
+        raise ValueError(f"No examples loaded from {path}")
+    return examples
+def batched(
+    items: Sequence[Tuple[str, str]], batch_size: int
+) -> Iterable[Sequence[Tuple[str, str]]]:
+    for start in range(0, len(items), batch_size):
+        yield items[start : start + batch_size]
 def aggregate_scores(raw_scores: Dict[str, Dict[str, List[float]]]) -> Dict[str, Dict[str, float]]:
+    aggregated: Dict[str, Dict[str, float]] = {}
+    for metric, components in raw_scores.items():
+        aggregated[metric] = {
+            component: (fmean(values) if values else 0.0)
+            for component, values in components.items()
+        }
+    return aggregated
 def main() -> None:
+    args = parse_args()
+    pipeline, _ = create_inference_pipeline(
+        checkpoint_path=args.checkpoint,
+        labels_path=args.labels,
+        tokenizer_dir=args.tokenizer_dir,
+        model_config_path=args.model_config,
+        device=args.device,
+        summary_max_length=args.max_length,
+    )
+    examples = load_examples(args.data, args.source_field, args.target_field, args.max_samples)
+    scorer = rouge_scorer.RougeScorer(list(args.metrics), use_stemmer=not args.no_stemmer)
+    score_store: Dict[str, Dict[str, List[float]]] = defaultdict(lambda: defaultdict(list))
+    for batch in tqdm(
+        list(batched(examples, args.batch_size)),
+        desc="Evaluating",
+        total=(len(examples) + args.batch_size - 1) // args.batch_size,
+    ):
+        documents = [item[0] for item in batch]
+        references = [item[1] for item in batch]
+        predictions = pipeline.summarize(documents, max_length=args.max_length)
+        for reference, prediction in zip(references, predictions):
+            scores = scorer.score(reference, prediction)
+            for metric_name, score in scores.items():
+                score_store[metric_name]["precision"].append(score.precision)
+                score_store[metric_name]["recall"].append(score.recall)
+                score_store[metric_name]["fmeasure"].append(score.fmeasure)
+    aggregated = aggregate_scores(score_store)
+    report = {
+        "num_examples": len(examples),
+        "metrics": aggregated,
+        "config": {
+            "data": str(args.data),
+            "checkpoint": str(args.checkpoint),
+            "tokenizer_dir": str(args.tokenizer_dir),
+            "metrics": list(args.metrics),
+            "max_length": args.max_length,
+            "batch_size": args.batch_size,
+            "device": args.device,
+        },
+    }
+    print(json.dumps(report, indent=2))
+    if args.output:
+        args.output.parent.mkdir(parents=True, exist_ok=True)
+        with args.output.open("w", encoding="utf-8") as handle:
+            json.dump(report, handle, ensure_ascii=False, indent=2)
 if __name__ == "__main__":
+    main()

scripts/preprocess_data.py CHANGED Viewed

@@ -25,8 +25,15 @@ def parse_args() -> argparse.Namespace:
         default="configs/data/datasets.yaml",
         help="Path to data configuration YAML.",
     )
-    parser.add_argument("--val-ratio", type=float, default=0.1, help="Validation split size for topic dataset when no validation split is present.")
-    parser.add_argument("--seed", type=int, default=17, help="Random seed for deterministic splitting.")
     return parser.parse_args()
@@ -73,7 +80,9 @@ def preprocess_books(
     for book_path in sorted(raw_dir.glob("*.txt")):
         text = book_path.read_text(encoding="utf-8").lstrip("\ufeff")
         normalized = text.replace("\r\n", "\n")
-        paragraphs = [paragraph.strip() for paragraph in normalized.split("\n\n") if paragraph.strip()]
         records: list[Dict[str, object]] = []
         for paragraph_id, paragraph in enumerate(paragraphs):
@@ -130,7 +139,9 @@ def preprocess_summarization(raw_dir: Path, processed_dir: Path) -> None:
         output_path = processed_dir / f"{split}.jsonl"
         output_path.parent.mkdir(parents=True, exist_ok=True)
         print(f"Writing summarization split '{split}' to {output_path}")
-        with source_path.open("r", encoding="utf-8", newline="") as source_handle, output_path.open("w", encoding="utf-8") as sink:
             reader = csv.DictReader(source_handle)
             for row in reader:
                 article = row.get("article") or row.get("Article") or ""
@@ -167,7 +178,7 @@ def preprocess_emotion(raw_dir: Path, processed_dir: Path, cleaner: BasicTextCle
         assert source_path is not None
         path = source_path
-        def iter_records() -> Iterator[Dict[str, object]]:
             if path.suffix == ".jsonl":
                 for row in _read_jsonl(path):
                     raw_text = str(row.get("text", ""))
@@ -186,12 +197,12 @@ def preprocess_emotion(raw_dir: Path, processed_dir: Path, cleaner: BasicTextCle
                 delimiter = ";" if path.suffix == ".txt" else ","
                 with path.open("r", encoding="utf-8", newline="") as handle:
                     reader = csv.reader(handle, delimiter=delimiter)
-                    for row in reader:
-                        if not row:
                             continue
-                        raw_text = str(row[0])
                         text = cleaner.transform([raw_text])[0]
-                        raw_labels = row[1] if len(row) > 1 else ""
                         labels = [label.strip() for label in raw_labels.split(",") if label.strip()]
                         if not labels:
                             labels = ["neutral"]
@@ -303,7 +314,9 @@ def main() -> None:
     topic_raw = Path(raw_cfg.get("topic", "data/raw/topic"))
     books_processed = Path(processed_cfg.get("books", "data/processed/books"))
-    summarization_processed = Path(processed_cfg.get("summarization", "data/processed/summarization"))
     emotion_processed = Path(processed_cfg.get("emotion", "data/processed/emotion"))
     topic_processed = Path(processed_cfg.get("topic", "data/processed/topic"))

         default="configs/data/datasets.yaml",
         help="Path to data configuration YAML.",
     )
+    parser.add_argument(
+        "--val-ratio",
+        type=float,
+        default=0.1,
+        help="Validation split size for topic dataset when no validation split is present.",
+    )
+    parser.add_argument(
+        "--seed", type=int, default=17, help="Random seed for deterministic splitting."
+    )
     return parser.parse_args()
     for book_path in sorted(raw_dir.glob("*.txt")):
         text = book_path.read_text(encoding="utf-8").lstrip("\ufeff")
         normalized = text.replace("\r\n", "\n")
+        paragraphs = [
+            paragraph.strip() for paragraph in normalized.split("\n\n") if paragraph.strip()
+        ]
         records: list[Dict[str, object]] = []
         for paragraph_id, paragraph in enumerate(paragraphs):
         output_path = processed_dir / f"{split}.jsonl"
         output_path.parent.mkdir(parents=True, exist_ok=True)
         print(f"Writing summarization split '{split}' to {output_path}")
+        with source_path.open("r", encoding="utf-8", newline="") as source_handle, output_path.open(
+            "w", encoding="utf-8"
+        ) as sink:
             reader = csv.DictReader(source_handle)
             for row in reader:
                 article = row.get("article") or row.get("Article") or ""
         assert source_path is not None
         path = source_path
+        def iter_records(path: Path = path) -> Iterator[Dict[str, object]]:
             if path.suffix == ".jsonl":
                 for row in _read_jsonl(path):
                     raw_text = str(row.get("text", ""))
                 delimiter = ";" if path.suffix == ".txt" else ","
                 with path.open("r", encoding="utf-8", newline="") as handle:
                     reader = csv.reader(handle, delimiter=delimiter)
+                    for csv_row in reader:
+                        if not csv_row:
                             continue
+                        raw_text = str(csv_row[0])
                         text = cleaner.transform([raw_text])[0]
+                        raw_labels = csv_row[1] if len(csv_row) > 1 else ""
                         labels = [label.strip() for label in raw_labels.split(",") if label.strip()]
                         if not labels:
                             labels = ["neutral"]
     topic_raw = Path(raw_cfg.get("topic", "data/raw/topic"))
     books_processed = Path(processed_cfg.get("books", "data/processed/books"))
+    summarization_processed = Path(
+        processed_cfg.get("summarization", "data/processed/summarization")
+    )
     emotion_processed = Path(processed_cfg.get("emotion", "data/processed/emotion"))
     topic_processed = Path(processed_cfg.get("topic", "data/processed/topic"))

src/api/dependencies.py CHANGED Viewed

@@ -7,6 +7,7 @@ from pathlib import Path
 from fastapi import HTTPException, status
 from ..utils.logging import get_logger
 logger = get_logger(__name__)
 from ..inference.factory import create_inference_pipeline

 from fastapi import HTTPException, status
 from ..utils.logging import get_logger
 logger = get_logger(__name__)
 from ..inference.factory import create_inference_pipeline

src/api/routes.py CHANGED Viewed

@@ -11,7 +11,10 @@ router = APIRouter()
 @router.post("/summarize", response_model=SummaryResponse)
-def summarize(payload: SummaryRequest, pipeline: InferencePipeline = Depends(get_pipeline)) -> SummaryResponse:
     try:
         outputs = pipeline.batch_predict([payload.text])
     except Exception as exc:  # noqa: BLE001 - surface inference error to client

 @router.post("/summarize", response_model=SummaryResponse)
+def summarize(
+    payload: SummaryRequest,
+    pipeline: InferencePipeline = Depends(get_pipeline),  # noqa: B008
+) -> SummaryResponse:
     try:
         outputs = pipeline.batch_predict([payload.text])
     except Exception as exc:  # noqa: BLE001 - surface inference error to client

src/data/dataloader.py CHANGED Viewed

@@ -1,19 +1,32 @@
 """Task-aware DataLoader builders for the LexiMind multitask suite."""
 from __future__ import annotations
-from typing import Iterable, List
 import torch
 from torch.utils.data import DataLoader
-from .dataset import EmotionDataset, EmotionExample, SummarizationDataset, SummarizationExample, TopicDataset, TopicExample
 from .tokenization import Tokenizer
 class SummarizationCollator:
     """Prepare encoder-decoder batches for abstractive summarization."""
-    def __init__(self, tokenizer: Tokenizer, *, max_source_length: int | None = None, max_target_length: int | None = None) -> None:
         self.tokenizer = tokenizer
         self.max_source_length = max_source_length
         self.max_target_length = max_target_length
@@ -29,17 +42,17 @@ class SummarizationCollator:
         # We want:
         # tgt_ids (decoder input): [BOS, A, B, EOS] (drop last PAD or EOS if full)
         # labels (target): [A, B, EOS, PAD] (drop first BOS)
         ids = target_enc["input_ids"]
         mask = target_enc["attention_mask"]
         # Slice to create shifted inputs/targets
         # tgt_ids: everything except the last token
         tgt_ids = ids[:, :-1]
         # labels: everything except the first token (BOS)
         labels = ids[:, 1:].clone()
         # Adjust mask for labels to ignore padding
         # The mask corresponds to the original ids. We slice it to match labels.
         labels_mask = mask[:, 1:]
@@ -56,7 +69,9 @@ class SummarizationCollator:
 class EmotionCollator:
     """Prepare batches for multi-label emotion classification."""
-    def __init__(self, tokenizer: Tokenizer, dataset: EmotionDataset, *, max_length: int | None = None) -> None:
         self.tokenizer = tokenizer
         self.binarizer = dataset.binarizer
         self.max_length = max_length
@@ -76,7 +91,9 @@ class EmotionCollator:
 class TopicCollator:
     """Prepare batches for topic classification using the projection head."""
-    def __init__(self, tokenizer: Tokenizer, dataset: TopicDataset, *, max_length: int | None = None) -> None:
         self.tokenizer = tokenizer
         self.encoder = dataset.encoder
         self.max_length = max_length
@@ -84,7 +101,9 @@ class TopicCollator:
     def __call__(self, batch: List[TopicExample]) -> dict[str, torch.Tensor]:
         texts = [example.text for example in batch]
         encoded = self.tokenizer.batch_encode(texts, max_length=self.max_length)
-        labels = torch.as_tensor(self.encoder.transform([example.topic for example in batch]), dtype=torch.long)
         return {
             "input_ids": encoded["input_ids"],
             "attention_mask": encoded["attention_mask"],

 """Task-aware DataLoader builders for the LexiMind multitask suite."""
 from __future__ import annotations
+from typing import List
 import torch
 from torch.utils.data import DataLoader
+from .dataset import (
+    EmotionDataset,
+    EmotionExample,
+    SummarizationDataset,
+    SummarizationExample,
+    TopicDataset,
+    TopicExample,
+)
 from .tokenization import Tokenizer
 class SummarizationCollator:
     """Prepare encoder-decoder batches for abstractive summarization."""
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        *,
+        max_source_length: int | None = None,
+        max_target_length: int | None = None,
+    ) -> None:
         self.tokenizer = tokenizer
         self.max_source_length = max_source_length
         self.max_target_length = max_target_length
         # We want:
         # tgt_ids (decoder input): [BOS, A, B, EOS] (drop last PAD or EOS if full)
         # labels (target): [A, B, EOS, PAD] (drop first BOS)
         ids = target_enc["input_ids"]
         mask = target_enc["attention_mask"]
         # Slice to create shifted inputs/targets
         # tgt_ids: everything except the last token
         tgt_ids = ids[:, :-1]
         # labels: everything except the first token (BOS)
         labels = ids[:, 1:].clone()
         # Adjust mask for labels to ignore padding
         # The mask corresponds to the original ids. We slice it to match labels.
         labels_mask = mask[:, 1:]
 class EmotionCollator:
     """Prepare batches for multi-label emotion classification."""
+    def __init__(
+        self, tokenizer: Tokenizer, dataset: EmotionDataset, *, max_length: int | None = None
+    ) -> None:
         self.tokenizer = tokenizer
         self.binarizer = dataset.binarizer
         self.max_length = max_length
 class TopicCollator:
     """Prepare batches for topic classification using the projection head."""
+    def __init__(
+        self, tokenizer: Tokenizer, dataset: TopicDataset, *, max_length: int | None = None
+    ) -> None:
         self.tokenizer = tokenizer
         self.encoder = dataset.encoder
         self.max_length = max_length
     def __call__(self, batch: List[TopicExample]) -> dict[str, torch.Tensor]:
         texts = [example.text for example in batch]
         encoded = self.tokenizer.batch_encode(texts, max_length=self.max_length)
+        labels = torch.as_tensor(
+            self.encoder.transform([example.topic for example in batch]), dtype=torch.long
+        )
         return {
             "input_ids": encoded["input_ids"],
             "attention_mask": encoded["attention_mask"],

src/data/preprocessing.py CHANGED Viewed

@@ -1,13 +1,11 @@
 """Text preprocessing utilities built around Hugging Face tokenizers."""
 from __future__ import annotations
-import re
 from dataclasses import dataclass, replace
 from typing import Iterable, List, Sequence
 import torch
 from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
 from .tokenization import Tokenizer, TokenizerConfig

 """Text preprocessing utilities built around Hugging Face tokenizers."""
 from __future__ import annotations
 from dataclasses import dataclass, replace
 from typing import Iterable, List, Sequence
 import torch
 from sklearn.base import BaseEstimator, TransformerMixin
 from .tokenization import Tokenizer, TokenizerConfig

src/inference/factory.py CHANGED Viewed

@@ -8,7 +8,7 @@ import torch
 from ..data.preprocessing import TextPreprocessor
 from ..data.tokenization import Tokenizer, TokenizerConfig
-from ..models.factory import ModelConfig, build_multitask_model, load_model_config
 from ..utils.io import load_state
 from ..utils.labels import LabelMetadata, load_label_metadata
 from .pipeline import InferenceConfig, InferencePipeline
@@ -38,7 +38,9 @@ def create_inference_pipeline(
         chosen_dir = Path(tokenizer_dir) if tokenizer_dir is not None else default_dir
         local_tokenizer_dir = chosen_dir
         if local_tokenizer_dir.exists():
-            resolved_tokenizer_config = TokenizerConfig(pretrained_model_name=str(local_tokenizer_dir))
         else:
             raise ValueError(
                 "No tokenizer configuration provided and default tokenizer directory "
@@ -46,11 +48,13 @@ def create_inference_pipeline(
             )
     tokenizer = Tokenizer(resolved_tokenizer_config)
     # Default to base config if not specified (checkpoint was trained with base config)
     if model_config_path is None:
-        model_config_path = Path(__file__).resolve().parent.parent.parent / "configs" / "model" / "base.yaml"
     model_config = load_model_config(model_config_path)
     model = build_multitask_model(
         tokenizer,
@@ -59,7 +63,7 @@ def create_inference_pipeline(
         config=model_config,
         load_pretrained=False,
     )
     # Load checkpoint - weights will load separately since factory doesn't tie them
     load_state(model, str(checkpoint))

 from ..data.preprocessing import TextPreprocessor
 from ..data.tokenization import Tokenizer, TokenizerConfig
+from ..models.factory import build_multitask_model, load_model_config
 from ..utils.io import load_state
 from ..utils.labels import LabelMetadata, load_label_metadata
 from .pipeline import InferenceConfig, InferencePipeline
         chosen_dir = Path(tokenizer_dir) if tokenizer_dir is not None else default_dir
         local_tokenizer_dir = chosen_dir
         if local_tokenizer_dir.exists():
+            resolved_tokenizer_config = TokenizerConfig(
+                pretrained_model_name=str(local_tokenizer_dir)
+            )
         else:
             raise ValueError(
                 "No tokenizer configuration provided and default tokenizer directory "
             )
     tokenizer = Tokenizer(resolved_tokenizer_config)
     # Default to base config if not specified (checkpoint was trained with base config)
     if model_config_path is None:
+        model_config_path = (
+            Path(__file__).resolve().parent.parent.parent / "configs" / "model" / "base.yaml"
+        )
     model_config = load_model_config(model_config_path)
     model = build_multitask_model(
         tokenizer,
         config=model_config,
         load_pretrained=False,
     )
     # Load checkpoint - weights will load separately since factory doesn't tie them
     load_state(model, str(checkpoint))

src/models/__init__.py CHANGED Viewed

@@ -8,13 +8,13 @@ This package provides a from-scratch transformer implementation with:
 - MultiTaskModel: composable wrapper for encoder/decoder + task heads
 """
-from .encoder import TransformerEncoder, TransformerEncoderLayer
-from .decoder import TransformerDecoder, TransformerDecoderLayer, create_causal_mask
 from .attention import MultiHeadAttention
 from .feedforward import FeedForward
-from .positional_encoding import PositionalEncoding
-from .heads import ClassificationHead, TokenClassificationHead, LMHead, ProjectionHead
 from .multitask import MultiTaskModel
 __all__ = [
     "TransformerEncoder",

 - MultiTaskModel: composable wrapper for encoder/decoder + task heads
 """
 from .attention import MultiHeadAttention
+from .decoder import TransformerDecoder, TransformerDecoderLayer, create_causal_mask
+from .encoder import TransformerEncoder, TransformerEncoderLayer
 from .feedforward import FeedForward
+from .heads import ClassificationHead, LMHead, ProjectionHead, TokenClassificationHead
 from .multitask import MultiTaskModel
+from .positional_encoding import PositionalEncoding
 __all__ = [
     "TransformerEncoder",

src/models/heads.py CHANGED Viewed

@@ -9,7 +9,7 @@ Includes:
 Keep these heads minimal, well-tested, and easy to compose on top of encoder/decoder outputs.
 """
-from typing import Optional, Literal
 import torch
 import torch.nn as nn
@@ -96,8 +96,12 @@ class LMHead(nn.Module):
         if tie_embedding is not None:
             # Validate sizes
-            assert tie_embedding.num_embeddings == vocab_size, "vocab size mismatch for weight tying"
-            assert tie_embedding.embedding_dim == d_model, "embedding dim must match d_model for weight tying"
             # Tie weights: point the projection weight to the embedding weight Tensor
             # Remove the existing projection parameter in favor of the embedding weight
             # This keeps the same Parameter object, so updates affect both modules.
@@ -122,7 +126,13 @@ class ProjectionHead(nn.Module):
         dropout: dropout probability
     """
-    def __init__(self, d_model: int, proj_dim: int = 128, hidden_dim: Optional[int] = None, dropout: float = 0.1):
         super().__init__()
         if hidden_dim is None:
             hidden_dim = max(d_model, proj_dim)
@@ -148,4 +158,4 @@ class ProjectionHead(nn.Module):
         elif orig_dim == 2:
             return self.net(x)
         else:
-            raise ValueError("Input must be 2D or 3D tensor")

 Keep these heads minimal, well-tested, and easy to compose on top of encoder/decoder outputs.
 """
+from typing import Literal, Optional
 import torch
 import torch.nn as nn
         if tie_embedding is not None:
             # Validate sizes
+            assert (
+                tie_embedding.num_embeddings == vocab_size
+            ), "vocab size mismatch for weight tying"
+            assert (
+                tie_embedding.embedding_dim == d_model
+            ), "embedding dim must match d_model for weight tying"
             # Tie weights: point the projection weight to the embedding weight Tensor
             # Remove the existing projection parameter in favor of the embedding weight
             # This keeps the same Parameter object, so updates affect both modules.
         dropout: dropout probability
     """
+    def __init__(
+        self,
+        d_model: int,
+        proj_dim: int = 128,
+        hidden_dim: Optional[int] = None,
+        dropout: float = 0.1,
+    ):
         super().__init__()
         if hidden_dim is None:
             hidden_dim = max(d_model, proj_dim)
         elif orig_dim == 2:
             return self.net(x)
         else:
+            raise ValueError("Input must be 2D or 3D tensor")

src/models/multitask.py CHANGED Viewed

@@ -14,16 +14,17 @@ Design goals:
   seq2seq tasks (encoder -> decoder -> LMHead)
 - Minimal dependencies on training loop; return logits and (optionally) loss
 """
-from typing import Optional, Dict, Any, Tuple
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 # Import your components
 from .encoder import TransformerEncoder
-from .decoder import TransformerDecoder
-from .heads import ClassificationHead, TokenClassificationHead, LMHead
 class MultiTaskModel(nn.Module):
@@ -112,15 +113,21 @@ class MultiTaskModel(nn.Module):
             if "input_ids" in inputs:
                 encoder_mask = None
                 if "attention_mask" in inputs:
-                    encoder_mask = self._expand_attention_mask(inputs["attention_mask"], inputs["input_ids"].device)
                 enc_out = self.encoder(inputs["input_ids"], mask=encoder_mask)
             elif "embeddings" in inputs:
                 encoder_mask = inputs.get("attention_mask")
                 if encoder_mask is not None:
-                    encoder_mask = self._expand_attention_mask(encoder_mask, inputs["embeddings"].device)
                 enc_out = self.encoder(inputs["embeddings"], mask=encoder_mask)
             else:
-                raise ValueError("inputs must contain 'input_ids' or 'embeddings' for encoder tasks")
             logits = head(enc_out)
             if return_loss:
@@ -152,7 +159,9 @@ class MultiTaskModel(nn.Module):
             elif "src_embeddings" in inputs:
                 memory = self.encoder(inputs["src_embeddings"], mask=encoder_mask)
             else:
-                raise ValueError("inputs must contain 'src_ids' or 'src_embeddings' for seq2seq tasks")
             # If training / teacher forcing: expect tgt_ids (shifted by caller) or embeddings
             if "tgt_ids" in inputs:
@@ -162,7 +171,9 @@ class MultiTaskModel(nn.Module):
             else:
                 # For generation time you may call decoder.greedy_decode separately.
                 # Here we don't attempt to generate when labels not provided.
-                raise ValueError("Seq2seq tasks require 'tgt_ids' or 'tgt_embeddings' for training forward")
             decoder_out = self.decoder(decoder_inputs, memory, memory_mask=src_mask)
@@ -209,13 +220,17 @@ class MultiTaskModel(nn.Module):
         if isinstance(head, TokenClassificationHead):
             # logits: (B, T, C), labels: (B, T)
             B, T, C = logits.shape
-            loss = F.cross_entropy(logits.view(B * T, C), labels.view(B * T).long(), ignore_index=ignore_index)
             return loss
         if isinstance(head, LMHead):
             # logits: (B, T, V), labels: (B, T)
             B, T, V = logits.shape
-            loss = F.cross_entropy(logits.view(B * T, V), labels.view(B * T).long(), ignore_index=ignore_index)
             return loss
         # Generic fall-back: try CrossEntropy on final dim
@@ -234,4 +249,4 @@ class MultiTaskModel(nn.Module):
             return bool_mask.unsqueeze(1) & bool_mask.unsqueeze(2)
         if bool_mask.dim() in (3, 4):
             return bool_mask
-        raise ValueError("Attention mask must be 2D, 3D, or 4D tensor")

   seq2seq tasks (encoder -> decoder -> LMHead)
 - Minimal dependencies on training loop; return logits and (optionally) loss
 """
+from typing import Any, Dict, Optional
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from .decoder import TransformerDecoder
 # Import your components
 from .encoder import TransformerEncoder
+from .heads import ClassificationHead, LMHead, TokenClassificationHead
 class MultiTaskModel(nn.Module):
             if "input_ids" in inputs:
                 encoder_mask = None
                 if "attention_mask" in inputs:
+                    encoder_mask = self._expand_attention_mask(
+                        inputs["attention_mask"], inputs["input_ids"].device
+                    )
                 enc_out = self.encoder(inputs["input_ids"], mask=encoder_mask)
             elif "embeddings" in inputs:
                 encoder_mask = inputs.get("attention_mask")
                 if encoder_mask is not None:
+                    encoder_mask = self._expand_attention_mask(
+                        encoder_mask, inputs["embeddings"].device
+                    )
                 enc_out = self.encoder(inputs["embeddings"], mask=encoder_mask)
             else:
+                raise ValueError(
+                    "inputs must contain 'input_ids' or 'embeddings' for encoder tasks"
+                )
             logits = head(enc_out)
             if return_loss:
             elif "src_embeddings" in inputs:
                 memory = self.encoder(inputs["src_embeddings"], mask=encoder_mask)
             else:
+                raise ValueError(
+                    "inputs must contain 'src_ids' or 'src_embeddings' for seq2seq tasks"
+                )
             # If training / teacher forcing: expect tgt_ids (shifted by caller) or embeddings
             if "tgt_ids" in inputs:
             else:
                 # For generation time you may call decoder.greedy_decode separately.
                 # Here we don't attempt to generate when labels not provided.
+                raise ValueError(
+                    "Seq2seq tasks require 'tgt_ids' or 'tgt_embeddings' for training forward"
+                )
             decoder_out = self.decoder(decoder_inputs, memory, memory_mask=src_mask)
         if isinstance(head, TokenClassificationHead):
             # logits: (B, T, C), labels: (B, T)
             B, T, C = logits.shape
+            loss = F.cross_entropy(
+                logits.view(B * T, C), labels.view(B * T).long(), ignore_index=ignore_index
+            )
             return loss
         if isinstance(head, LMHead):
             # logits: (B, T, V), labels: (B, T)
             B, T, V = logits.shape
+            loss = F.cross_entropy(
+                logits.view(B * T, V), labels.view(B * T).long(), ignore_index=ignore_index
+            )
             return loss
         # Generic fall-back: try CrossEntropy on final dim
             return bool_mask.unsqueeze(1) & bool_mask.unsqueeze(2)
         if bool_mask.dim() in (3, 4):
             return bool_mask
+        raise ValueError("Attention mask must be 2D, 3D, or 4D tensor")

src/models/positional_encoding.py CHANGED Viewed

@@ -7,31 +7,33 @@ Injects information about the position of tokens in a sequence, since
 self-attention has no inherent notion of token order.
 """
 import torch
 import torch.nn as nn
-import math
 class PositionalEncoding(nn.Module):
     """
     Implements the sinusoidal positional encoding from "Attention Is All You Need".
     Formula:
         PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
         PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
     Where:
         pos: position in sequence (0 to max_len-1)
         i: dimension index (0 to d_model/2)
     Args:
         d_model: Dimension of the model embeddings
         max_len: Maximum sequence length to pre-compute
         dropout: Dropout probability to apply after adding positional encoding
     Shape:
         Input: (batch, seq_len, d_model)
         Output: (batch, seq_len, d_model)
     Example:
         >>> pos_enc = PositionalEncoding(d_model=512, max_len=5000)
         >>> x = torch.randn(32, 100, 512)  # (batch, seq, d_model)
@@ -39,7 +41,7 @@ class PositionalEncoding(nn.Module):
         >>> output.shape
         torch.Size([32, 100, 512])
     """
     def __init__(self, d_model: int, max_len: int = 5000, dropout: float = 0.1):
         super().__init__()
         self.dropout = nn.Dropout(p=dropout)
@@ -49,23 +51,20 @@ class PositionalEncoding(nn.Module):
         # Apply sin to even indices, cos to odd indices
         # Register as buffer (not a parameter, but part of state_dict)
         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
-        div_term = torch.exp(
-            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
-        )
         pe = torch.zeros(max_len, d_model)
         pe[:, 0::2] = torch.sin(position * div_term)  # Even indices
         pe[:, 1::2] = torch.cos(position * div_term)  # Odd indices
         pe = pe.unsqueeze(0)
         self.register_buffer("pe", pe)
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Add positional encoding to input embeddings.
         Args:
             x: Input embeddings (batch, seq_len, d_model)
         Returns:
             x with positional encoding added (batch, seq_len, d_model)
         """
@@ -76,4 +75,4 @@ class PositionalEncoding(nn.Module):
         x = x + self.pe[:, : x.size(1)].requires_grad_(False)
         # self.pe contains pre-computed encodings for all positions
         # just need to add the first seq_len positions to x
-        return self.dropout(x)

 self-attention has no inherent notion of token order.
 """
+import math
 import torch
 import torch.nn as nn
 class PositionalEncoding(nn.Module):
     """
     Implements the sinusoidal positional encoding from "Attention Is All You Need".
     Formula:
         PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
         PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
     Where:
         pos: position in sequence (0 to max_len-1)
         i: dimension index (0 to d_model/2)
     Args:
         d_model: Dimension of the model embeddings
         max_len: Maximum sequence length to pre-compute
         dropout: Dropout probability to apply after adding positional encoding
     Shape:
         Input: (batch, seq_len, d_model)
         Output: (batch, seq_len, d_model)
     Example:
         >>> pos_enc = PositionalEncoding(d_model=512, max_len=5000)
         >>> x = torch.randn(32, 100, 512)  # (batch, seq, d_model)
         >>> output.shape
         torch.Size([32, 100, 512])
     """
     def __init__(self, d_model: int, max_len: int = 5000, dropout: float = 0.1):
         super().__init__()
         self.dropout = nn.Dropout(p=dropout)
         # Apply sin to even indices, cos to odd indices
         # Register as buffer (not a parameter, but part of state_dict)
         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
         pe = torch.zeros(max_len, d_model)
         pe[:, 0::2] = torch.sin(position * div_term)  # Even indices
         pe[:, 1::2] = torch.cos(position * div_term)  # Odd indices
         pe = pe.unsqueeze(0)
         self.register_buffer("pe", pe)
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Add positional encoding to input embeddings.
         Args:
             x: Input embeddings (batch, seq_len, d_model)
         Returns:
             x with positional encoding added (batch, seq_len, d_model)
         """
         x = x + self.pe[:, : x.size(1)].requires_grad_(False)
         # self.pe contains pre-computed encodings for all positions
         # just need to add the first seq_len positions to x
+        return self.dropout(x)

src/training/utils.py CHANGED Viewed

@@ -9,7 +9,6 @@ from typing import Optional
 import numpy as np
 import torch
 _seed_sequence: Optional[np.random.SeedSequence] = None
 _seed_lock = threading.Lock()
 _spawn_counter = 0
@@ -33,4 +32,3 @@ def set_seed(seed: int) -> np.random.Generator:
         _spawn_counter = 1
     _thread_local.rng = rng
     return rng

 import numpy as np
 import torch
 _seed_sequence: Optional[np.random.SeedSequence] = None
 _seed_lock = threading.Lock()
 _spawn_counter = 0
         _spawn_counter = 1
     _thread_local.rng = rng
     return rng

src/visualization/embeddings.py CHANGED Viewed

@@ -1,9 +1,9 @@
 """Embedding visualization helpers."""
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
-import numpy as np
 from sklearn.manifold import TSNE
@@ -16,15 +16,17 @@ def plot_tsne(embeddings: np.ndarray, labels: list[str]) -> None:
         raise ValueError("number of samples in embeddings must equal length of labels")
     if embeddings.shape[1] < 2:
         raise ValueError("embeddings must have at least 2 features for t-SNE visualization")
     reducer = TSNE(n_components=2, init="pca", learning_rate="auto")
     projection = reducer.fit_transform(embeddings)
-    df = pd.DataFrame({
-        "x": projection[:, 0],
-        "y": projection[:, 1],
-        "label": labels,
-    })
     plt.figure()
     sns.scatterplot(data=df, x="x", y="y", hue="label", palette="tab10", s=50)
     plt.legend(title="Labels", loc="best")

 """Embedding visualization helpers."""
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
 import seaborn as sns
 from sklearn.manifold import TSNE
         raise ValueError("number of samples in embeddings must equal length of labels")
     if embeddings.shape[1] < 2:
         raise ValueError("embeddings must have at least 2 features for t-SNE visualization")
     reducer = TSNE(n_components=2, init="pca", learning_rate="auto")
     projection = reducer.fit_transform(embeddings)
+    df = pd.DataFrame(
+        {
+            "x": projection[:, 0],
+            "y": projection[:, 1],
+            "label": labels,
+        }
+    )
     plt.figure()
     sns.scatterplot(data=df, x="x", y="y", hue="label", palette="tab10", s=50)
     plt.legend(title="Labels", loc="best")

tests/test_models/test_decoder_step.py CHANGED Viewed

@@ -1,6 +1,7 @@
-import torch
-import pytest
 from typing import Any, Dict, cast
 from src.models.decoder import TransformerDecoder
@@ -93,6 +94,5 @@ def test_step_cache_growth_and_shapes():
     for i in range(num_layers):
         assert f"mem_k_{i}" in cache and f"mem_v_{i}" in cache
         mem_k = cache[f"mem_k_{i}"]
-        mem_v = cache[f"mem_v_{i}"]
         assert mem_k.shape[0] == batch_size
-        assert mem_k.shape[2] == src_len  # seq length of memory

 from typing import Any, Dict, cast
+import torch
 from src.models.decoder import TransformerDecoder
     for i in range(num_layers):
         assert f"mem_k_{i}" in cache and f"mem_v_{i}" in cache
         mem_k = cache[f"mem_k_{i}"]
         assert mem_k.shape[0] == batch_size
+        assert mem_k.shape[2] == src_len  # seq length of memory

tests/test_models/test_encoder.py CHANGED Viewed

@@ -1,6 +1,6 @@
-import math
-import torch
 import pytest
 from src.models.encoder import TransformerEncoder
@@ -173,4 +173,4 @@ def test_train_eval_determinism_and_dropout_effect():
 if __name__ == "__main__":
-    pytest.main([__file__, "-q"])

 import pytest
+import torch
 from src.models.encoder import TransformerEncoder
 if __name__ == "__main__":
+    pytest.main([__file__, "-q"])

tests/test_models/test_encoder_layer.py CHANGED Viewed

@@ -1,5 +1,6 @@
-import torch
 import pytest
 from src.models.encoder import TransformerEncoderLayer
@@ -83,4 +84,4 @@ def test_mask_broadcasting_accepts_3d_and_4d_mask():
 if __name__ == "__main__":
     # Run tests interactively if needed
-    pytest.main([__file__, "-q"])

 import pytest
+import torch
 from src.models.encoder import TransformerEncoderLayer
 if __name__ == "__main__":
     # Run tests interactively if needed
+    pytest.main([__file__, "-q"])

tests/test_models/test_feedforward.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import torch
-import pytest
 from src.models.feedforward import FeedForward
@@ -38,15 +38,15 @@ class TestFeedForward:
         # Parameter existence
         param_names = [name for name, _ in ffn.named_parameters()]
-        assert any('linear1' in name for name in param_names)
-        assert any('linear2' in name for name in param_names)
         # Parameter shapes
         shapes = {name: p.shape for name, p in ffn.named_parameters()}
-        assert shapes.get('linear1.weight') == (d_ff, d_model)
-        assert shapes.get('linear2.weight') == (d_model, d_ff)
-        assert shapes.get('linear1.bias') == (d_ff,)
-        assert shapes.get('linear2.bias') == (d_model,)
         # ensure gradients flow
         x = torch.randn(3, 5, d_model)
@@ -54,4 +54,4 @@ class TestFeedForward:
         loss = out.sum()
         loss.backward()
         for _, p in ffn.named_parameters():
-            assert p.grad is not None

 import torch
 from src.models.feedforward import FeedForward
         # Parameter existence
         param_names = [name for name, _ in ffn.named_parameters()]
+        assert any("linear1" in name for name in param_names)
+        assert any("linear2" in name for name in param_names)
         # Parameter shapes
         shapes = {name: p.shape for name, p in ffn.named_parameters()}
+        assert shapes.get("linear1.weight") == (d_ff, d_model)
+        assert shapes.get("linear2.weight") == (d_model, d_ff)
+        assert shapes.get("linear1.bias") == (d_ff,)
+        assert shapes.get("linear2.bias") == (d_model,)
         # ensure gradients flow
         x = torch.randn(3, 5, d_model)
         loss = out.sum()
         loss.backward()
         for _, p in ffn.named_parameters():
+            assert p.grad is not None

tests/test_models/test_heads.py CHANGED Viewed

@@ -1,11 +1,11 @@
 import torch
-import pytest
 import torch.nn as nn
 from src.models.heads import (
     ClassificationHead,
-    TokenClassificationHead,
     LMHead,
     ProjectionHead,
 )
@@ -101,4 +101,4 @@ def test_projection_head_2d_and_3d_behavior_and_grad():
     loss = out3.sum()
     loss.backward()
     grads = [p.grad for p in head.parameters() if p.requires_grad]
-    assert any(g is not None for g in grads)

 import torch
 import torch.nn as nn
 from src.models.heads import (
     ClassificationHead,
     LMHead,
     ProjectionHead,
+    TokenClassificationHead,
 )
     loss = out3.sum()
     loss.backward()
     grads = [p.grad for p in head.parameters() if p.requires_grad]
+    assert any(g is not None for g in grads)

tests/test_models/test_multitask.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import torch
-import pytest
-from src.models.encoder import TransformerEncoder
 from src.models.decoder import TransformerDecoder
 from src.models.heads import ClassificationHead, LMHead, TokenClassificationHead
 from src.models.multitask import MultiTaskModel
@@ -17,8 +17,16 @@ def test_multitask_encoder_classification_forward_and_loss():
     seq_len = 8
     num_labels = 5
-    enc = TransformerEncoder(vocab_size=vocab_size, d_model=d_model, num_layers=num_layers,
-                             num_heads=num_heads, d_ff=d_ff, dropout=0.0, max_len=seq_len, pad_token_id=0)
     mt = MultiTaskModel(encoder=enc)
     head = ClassificationHead(d_model=d_model, num_labels=num_labels, pooler="mean", dropout=0.0)
@@ -30,7 +38,9 @@ def test_multitask_encoder_classification_forward_and_loss():
     logits = mt.forward("sentiment", {"input_ids": input_ids})
     assert logits.shape == (batch_size, num_labels)
-    loss, logits2 = mt.forward("sentiment", {"input_ids": input_ids, "labels": labels}, return_loss=True)
     assert loss.item() >= 0
     # grads
     loss.backward()
@@ -49,10 +59,26 @@ def test_multitask_seq2seq_lm_forward_and_loss():
     src_len = 7
     tgt_len = 6
-    enc = TransformerEncoder(vocab_size=vocab_size, d_model=d_model, num_layers=num_layers,
-                             num_heads=num_heads, d_ff=d_ff, dropout=0.0, max_len=src_len, pad_token_id=0)
-    dec = TransformerDecoder(vocab_size=vocab_size, d_model=d_model, num_layers=num_layers,
-                             num_heads=num_heads, d_ff=d_ff, dropout=0.0, max_len=tgt_len, pad_token_id=0)
     mt = MultiTaskModel(encoder=enc, decoder=dec)
     lm_head = LMHead(d_model=d_model, vocab_size=vocab_size, tie_embedding=None)
     mt.add_head("summarize", lm_head)
@@ -65,7 +91,9 @@ def test_multitask_seq2seq_lm_forward_and_loss():
     logits = mt.forward("summarize", {"src_ids": src_ids, "tgt_ids": tgt_ids})
     assert logits.shape == (batch_size, tgt_len, vocab_size)
-    loss, logits2 = mt.forward("summarize", {"src_ids": src_ids, "tgt_ids": tgt_ids, "labels": labels}, return_loss=True)
     assert loss.item() >= 0
     loss.backward()
     grads = [p.grad for p in mt.parameters() if p.requires_grad]
@@ -83,8 +111,16 @@ def test_token_classification_forward_and_loss():
     seq_len = 5
     num_labels = 7
-    enc = TransformerEncoder(vocab_size=vocab_size, d_model=d_model, num_layers=num_layers,
-                             num_heads=num_heads, d_ff=d_ff, dropout=0.0, max_len=seq_len, pad_token_id=0)
     mt = MultiTaskModel(encoder=enc)
     head = TokenClassificationHead(d_model=d_model, num_labels=num_labels, dropout=0.0)
     mt.add_head("ner", head)
@@ -99,4 +135,4 @@ def test_token_classification_forward_and_loss():
     assert loss.item() >= 0
     loss.backward()
     grads = [p.grad for p in mt.parameters() if p.requires_grad]
-    assert any(g is not None for g in grads)

 import torch
 from src.models.decoder import TransformerDecoder
+from src.models.encoder import TransformerEncoder
 from src.models.heads import ClassificationHead, LMHead, TokenClassificationHead
 from src.models.multitask import MultiTaskModel
     seq_len = 8
     num_labels = 5
+    enc = TransformerEncoder(
+        vocab_size=vocab_size,
+        d_model=d_model,
+        num_layers=num_layers,
+        num_heads=num_heads,
+        d_ff=d_ff,
+        dropout=0.0,
+        max_len=seq_len,
+        pad_token_id=0,
+    )
     mt = MultiTaskModel(encoder=enc)
     head = ClassificationHead(d_model=d_model, num_labels=num_labels, pooler="mean", dropout=0.0)
     logits = mt.forward("sentiment", {"input_ids": input_ids})
     assert logits.shape == (batch_size, num_labels)
+    loss, logits2 = mt.forward(
+        "sentiment", {"input_ids": input_ids, "labels": labels}, return_loss=True
+    )
     assert loss.item() >= 0
     # grads
     loss.backward()
     src_len = 7
     tgt_len = 6
+    enc = TransformerEncoder(
+        vocab_size=vocab_size,
+        d_model=d_model,
+        num_layers=num_layers,
+        num_heads=num_heads,
+        d_ff=d_ff,
+        dropout=0.0,
+        max_len=src_len,
+        pad_token_id=0,
+    )
+    dec = TransformerDecoder(
+        vocab_size=vocab_size,
+        d_model=d_model,
+        num_layers=num_layers,
+        num_heads=num_heads,
+        d_ff=d_ff,
+        dropout=0.0,
+        max_len=tgt_len,
+        pad_token_id=0,
+    )
     mt = MultiTaskModel(encoder=enc, decoder=dec)
     lm_head = LMHead(d_model=d_model, vocab_size=vocab_size, tie_embedding=None)
     mt.add_head("summarize", lm_head)
     logits = mt.forward("summarize", {"src_ids": src_ids, "tgt_ids": tgt_ids})
     assert logits.shape == (batch_size, tgt_len, vocab_size)
+    loss, logits2 = mt.forward(
+        "summarize", {"src_ids": src_ids, "tgt_ids": tgt_ids, "labels": labels}, return_loss=True
+    )
     assert loss.item() >= 0
     loss.backward()
     grads = [p.grad for p in mt.parameters() if p.requires_grad]
     seq_len = 5
     num_labels = 7
+    enc = TransformerEncoder(
+        vocab_size=vocab_size,
+        d_model=d_model,
+        num_layers=num_layers,
+        num_heads=num_heads,
+        d_ff=d_ff,
+        dropout=0.0,
+        max_len=seq_len,
+        pad_token_id=0,
+    )
     mt = MultiTaskModel(encoder=enc)
     head = TokenClassificationHead(d_model=d_model, num_labels=num_labels, dropout=0.0)
     mt.add_head("ner", head)
     assert loss.item() >= 0
     loss.backward()
     grads = [p.grad for p in mt.parameters() if p.requires_grad]
+    assert any(g is not None for g in grads)