Spaces:

OliverPerrin
/

LexiMind

Running

App Files Community

OliverPerrin committed on Dec 3, 2025

Commit

29f2de2

1 Parent(s): f9d964d

Style: Apply ruff formatting

Browse files

Files changed (37) hide show

scripts/demo_gradio.py +1 -0
scripts/download_data.py +1 -0
scripts/eval_rouge.py +1 -0
scripts/evaluate.py +1 -0
scripts/export_model.py +22 -5
scripts/inference.py +1 -0
scripts/preprocess_data.py +5 -3
scripts/train.py +1 -0
src/api/app.py +1 -0
src/api/dependencies.py +1 -0
src/api/routes.py +1 -0
src/api/schemas.py +1 -0
src/data/dataloader.py +1 -0
src/data/dataset.py +4 -1
src/data/preprocessing.py +1 -0
src/data/tokenization.py +13 -4
src/inference/__init__.py +5 -5
src/inference/factory.py +1 -0
src/inference/pipeline.py +1 -0
src/inference/postprocessing.py +1 -0
src/models/decoder.py +1 -0
src/models/factory.py +1 -0
src/models/heads.py +1 -0
src/models/multitask.py +1 -0
src/training/metrics.py +1 -0
src/training/trainer.py +4 -3
src/utils/config.py +1 -0
src/utils/io.py +2 -1
src/utils/labels.py +1 -0
src/utils/logging.py +1 -0
src/utils/random.py +1 -0
src/visualization/attention.py +1 -0
src/visualization/metrics.py +1 -0
tests/test_api/test_routes.py +2 -1
tests/test_data/test_download_records.py +16 -11
tests/test_inference/test_pipeline.py +1 -0
tests/test_models/test_positional_encoding.py +0 -1

scripts/demo_gradio.py CHANGED Viewed

@@ -2,6 +2,7 @@
 Minimal Gradio demo for the LexiMind multitask model.
 Shows raw model outputs without any post-processing tricks.
 """
 from __future__ import annotations
 import json

 Minimal Gradio demo for the LexiMind multitask model.
 Shows raw model outputs without any post-processing tricks.
 """
 from __future__ import annotations
 import json

scripts/download_data.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Download datasets used by LexiMind."""
 from __future__ import annotations
 import argparse

 """Download datasets used by LexiMind."""
 from __future__ import annotations
 import argparse

scripts/eval_rouge.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Utility script to evaluate LexiMind summaries with ROUGE."""
 from __future__ import annotations
 import argparse

 """Utility script to evaluate LexiMind summaries with ROUGE."""
 from __future__ import annotations
 import argparse

scripts/evaluate.py CHANGED Viewed

@@ -2,6 +2,7 @@
 Evaluate the multitask model on processed validation/test splits.
 This is used for getting definitive scores on my test set after training is complete.
 """
 from __future__ import annotations
 import argparse

 Evaluate the multitask model on processed validation/test splits.
 This is used for getting definitive scores on my test set after training is complete.
 """
 from __future__ import annotations
 import argparse

scripts/export_model.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Rebuild and export the trained multitask model for downstream use."""
 from __future__ import annotations
 import argparse
@@ -14,11 +15,27 @@ from src.utils.labels import load_label_metadata
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Export LexiMind model weights")
-    parser.add_argument("--checkpoint", default="checkpoints/best.pt", help="Path to the trained checkpoint.")
-    parser.add_argument("--output", default="outputs/model.pt", help="Output path for the exported state dict.")
-    parser.add_argument("--labels", default="artifacts/labels.json", help="Label metadata JSON produced after training.")
-    parser.add_argument("--model-config", default="configs/model/base.yaml", help="Model architecture configuration.")
-    parser.add_argument("--data-config", default="configs/data/datasets.yaml", help="Data configuration (for tokenizer settings).")
     return parser.parse_args()

 """Rebuild and export the trained multitask model for downstream use."""
 from __future__ import annotations
 import argparse
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Export LexiMind model weights")
+    parser.add_argument(
+        "--checkpoint", default="checkpoints/best.pt", help="Path to the trained checkpoint."
+    )
+    parser.add_argument(
+        "--output", default="outputs/model.pt", help="Output path for the exported state dict."
+    )
+    parser.add_argument(
+        "--labels",
+        default="artifacts/labels.json",
+        help="Label metadata JSON produced after training.",
+    )
+    parser.add_argument(
+        "--model-config",
+        default="configs/model/base.yaml",
+        help="Model architecture configuration.",
+    )
+    parser.add_argument(
+        "--data-config",
+        default="configs/data/datasets.yaml",
+        help="Data configuration (for tokenizer settings).",
+    )
     return parser.parse_args()

scripts/inference.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Run inference with the multitask model."""
 from __future__ import annotations
 import argparse

 """Run inference with the multitask model."""
 from __future__ import annotations
 import argparse

scripts/preprocess_data.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Preprocess raw datasets into JSONL splits for LexiMind training."""
 from __future__ import annotations
 import argparse
@@ -139,9 +140,10 @@ def preprocess_summarization(raw_dir: Path, processed_dir: Path) -> None:
         output_path = processed_dir / f"{split}.jsonl"
         output_path.parent.mkdir(parents=True, exist_ok=True)
         print(f"Writing summarization split '{split}' to {output_path}")
-        with source_path.open("r", encoding="utf-8", newline="") as source_handle, output_path.open(
-            "w", encoding="utf-8"
-        ) as sink:
             reader = csv.DictReader(source_handle)
             for row in reader:
                 article = row.get("article") or row.get("Article") or ""

 """Preprocess raw datasets into JSONL splits for LexiMind training."""
 from __future__ import annotations
 import argparse
         output_path = processed_dir / f"{split}.jsonl"
         output_path.parent.mkdir(parents=True, exist_ok=True)
         print(f"Writing summarization split '{split}' to {output_path}")
+        with (
+            source_path.open("r", encoding="utf-8", newline="") as source_handle,
+            output_path.open("w", encoding="utf-8") as sink,
+        ):
             reader = csv.DictReader(source_handle)
             for row in reader:
                 article = row.get("article") or row.get("Article") or ""

scripts/train.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """End-to-end training entrypoint for the LexiMind multitask model."""
 from __future__ import annotations
 import json

 """End-to-end training entrypoint for the LexiMind multitask model."""
 from __future__ import annotations
 import json

src/api/app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """FastAPI application entrypoint."""
 from fastapi import FastAPI
 from .routes import router

 """FastAPI application entrypoint."""
 from fastapi import FastAPI
 from .routes import router

src/api/dependencies.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Dependency providers for the FastAPI application."""
 from __future__ import annotations
 from functools import lru_cache

 """Dependency providers for the FastAPI application."""
 from __future__ import annotations
 from functools import lru_cache

src/api/routes.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """API routes."""
 from typing import cast
 from fastapi import APIRouter, Depends, HTTPException, status

 """API routes."""
 from typing import cast
 from fastapi import APIRouter, Depends, HTTPException, status

src/api/schemas.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """API schemas."""
 from pydantic import BaseModel


1	"""API schemas."""
2	+
3	from pydantic import BaseModel
4
5

src/data/dataloader.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Task-aware DataLoader builders for the LexiMind multitask suite."""
 from __future__ import annotations
 from typing import List

 """Task-aware DataLoader builders for the LexiMind multitask suite."""
 from __future__ import annotations
 from typing import List

src/data/dataset.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Dataset definitions for the LexiMind multitask training pipeline."""
 from __future__ import annotations
 import json
@@ -179,7 +180,9 @@ def _load_jsonl_generic(
         if first_non_ws == "[":
             payloads = _safe_json_load(handle, data_path)
             if not isinstance(payloads, list):
-                raise ValueError(f"Expected a JSON array in '{data_path}' but found {type(payloads).__name__}")
             for idx, payload in enumerate(payloads):
                 if not isinstance(payload, dict):
                     raise ValueError(

 """Dataset definitions for the LexiMind multitask training pipeline."""
 from __future__ import annotations
 import json
         if first_non_ws == "[":
             payloads = _safe_json_load(handle, data_path)
             if not isinstance(payloads, list):
+                raise ValueError(
+                    f"Expected a JSON array in '{data_path}' but found {type(payloads).__name__}"
+                )
             for idx, payload in enumerate(payloads):
                 if not isinstance(payload, dict):
                     raise ValueError(

src/data/preprocessing.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Text preprocessing utilities built around Hugging Face tokenizers."""
 from __future__ import annotations
 from dataclasses import dataclass, replace

 """Text preprocessing utilities built around Hugging Face tokenizers."""
 from __future__ import annotations
 from dataclasses import dataclass, replace

src/data/tokenization.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Tokenizer wrapper around HuggingFace models used across LexiMind."""
 from __future__ import annotations
 from dataclasses import dataclass
@@ -23,13 +24,19 @@ class Tokenizer:
     def __init__(self, config: TokenizerConfig | None = None) -> None:
         cfg = config or TokenizerConfig()
         self.config = cfg
-        self._tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(cfg.pretrained_model_name)
         self._pad_token_id = self._resolve_id(self._tokenizer.pad_token_id)
         self._bos_token_id = self._resolve_id(
-            self._tokenizer.bos_token_id if self._tokenizer.bos_token_id is not None else self._tokenizer.cls_token_id
         )
         self._eos_token_id = self._resolve_id(
-            self._tokenizer.eos_token_id if self._tokenizer.eos_token_id is not None else self._tokenizer.sep_token_id
         )
     @property
@@ -84,7 +91,9 @@ class Tokenizer:
         )
         return cast(List[List[int]], encoded["input_ids"])
-    def batch_encode(self, texts: Sequence[str], *, max_length: int | None = None) -> dict[str, torch.Tensor]:
         normalized = [text.lower() if self.config.lower else text for text in texts]
         encoded = self._tokenizer(
             normalized,

 """Tokenizer wrapper around HuggingFace models used across LexiMind."""
 from __future__ import annotations
 from dataclasses import dataclass
     def __init__(self, config: TokenizerConfig | None = None) -> None:
         cfg = config or TokenizerConfig()
         self.config = cfg
+        self._tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(
+            cfg.pretrained_model_name
+        )
         self._pad_token_id = self._resolve_id(self._tokenizer.pad_token_id)
         self._bos_token_id = self._resolve_id(
+            self._tokenizer.bos_token_id
+            if self._tokenizer.bos_token_id is not None
+            else self._tokenizer.cls_token_id
         )
         self._eos_token_id = self._resolve_id(
+            self._tokenizer.eos_token_id
+            if self._tokenizer.eos_token_id is not None
+            else self._tokenizer.sep_token_id
         )
     @property
         )
         return cast(List[List[int]], encoded["input_ids"])
+    def batch_encode(
+        self, texts: Sequence[str], *, max_length: int | None = None
+    ) -> dict[str, torch.Tensor]:
         normalized = [text.lower() if self.config.lower else text for text in texts]
         encoded = self._tokenizer(
             normalized,

src/inference/__init__.py CHANGED Viewed

@@ -4,9 +4,9 @@ from .factory import create_inference_pipeline
 from .pipeline import EmotionPrediction, InferenceConfig, InferencePipeline, TopicPrediction
 __all__ = [
-	"InferencePipeline",
-	"InferenceConfig",
-	"EmotionPrediction",
-	"TopicPrediction",
-	"create_inference_pipeline",
 ]

 from .pipeline import EmotionPrediction, InferenceConfig, InferencePipeline, TopicPrediction
 __all__ = [
+    "InferencePipeline",
+    "InferenceConfig",
+    "EmotionPrediction",
+    "TopicPrediction",
+    "create_inference_pipeline",
 ]

src/inference/factory.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Helpers to assemble an inference pipeline from saved artifacts."""
 from __future__ import annotations
 from pathlib import Path

 """Helpers to assemble an inference pipeline from saved artifacts."""
 from __future__ import annotations
 from pathlib import Path

src/inference/pipeline.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Inference helpers for multitask LexiMind models."""
 from __future__ import annotations
 from dataclasses import dataclass, fields, replace

 """Inference helpers for multitask LexiMind models."""
 from __future__ import annotations
 from dataclasses import dataclass, fields, replace

src/inference/postprocessing.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Output cleaning helpers."""
 from typing import List


1	"""Output cleaning helpers."""
2	+
3	from typing import List
4
5

src/models/decoder.py CHANGED Viewed

@@ -12,6 +12,7 @@ Conventions:
 - This decoder uses Pre-LN (RMSNorm before each sublayer).
 - RMSNorm is just simpler than LayerNorm and more computationally efficient, it's become the modern convention. These reasons are why I used it here.
 """
 import math
 from typing import Dict, List, Optional, Tuple, Union

 - This decoder uses Pre-LN (RMSNorm before each sublayer).
 - RMSNorm is just simpler than LayerNorm and more computationally efficient, it's become the modern convention. These reasons are why I used it here.
 """
 import math
 from typing import Dict, List, Optional, Tuple, Union

src/models/factory.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Factory helpers to assemble multitask models for inference/training."""
 from __future__ import annotations
 from dataclasses import dataclass

 """Factory helpers to assemble multitask models for inference/training."""
 from __future__ import annotations
 from dataclasses import dataclass

src/models/heads.py CHANGED Viewed

@@ -9,6 +9,7 @@ Includes:
 Keep these heads minimal, well-tested, and easy to compose on top of encoder/decoder outputs.
 """
 from typing import Literal, Optional
 import torch

 Keep these heads minimal, well-tested, and easy to compose on top of encoder/decoder outputs.
 """
 from typing import Literal, Optional
 import torch

src/models/multitask.py CHANGED Viewed

@@ -14,6 +14,7 @@ Design goals:
   seq2seq tasks (encoder -> decoder -> LMHead)
 - Minimal dependencies on training loop; return logits and (optionally) loss
 """
 from typing import Any, Dict, Optional
 import torch

   seq2seq tasks (encoder -> decoder -> LMHead)
 - Minimal dependencies on training loop; return logits and (optionally) loss
 """
 from typing import Any, Dict, Optional
 import torch

src/training/metrics.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Metric helpers used during training and evaluation."""
 from __future__ import annotations
 from typing import Any, Dict, List, Sequence

 """Metric helpers used during training and evaluation."""
 from __future__ import annotations
 from typing import Any, Dict, List, Sequence

src/training/trainer.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Multi-task trainer coordinating summarization, emotion, and topic heads."""
 from __future__ import annotations
 import shutil
@@ -330,9 +331,9 @@ class Trainer:
         """Generate and print sample summaries to monitor quality during training."""
         self.model.eval()
         samples_generated = 0
-        print(f"\n{'='*80}")
         print(f"[Validation Generation - Epoch {epoch}]")
-        print(f"{'='*80}")
         with torch.no_grad():
             for batch in val_loader:
@@ -400,7 +401,7 @@ class Trainer:
                 samples_generated += 1
-        print(f"{'='*80}\n")
         self.model.train()
     def _print_epoch_progress(

 """Multi-task trainer coordinating summarization, emotion, and topic heads."""
 from __future__ import annotations
 import shutil
         """Generate and print sample summaries to monitor quality during training."""
         self.model.eval()
         samples_generated = 0
+        print(f"\n{'=' * 80}")
         print(f"[Validation Generation - Epoch {epoch}]")
+        print(f"{'=' * 80}")
         with torch.no_grad():
             for batch in val_loader:
                 samples_generated += 1
+        print(f"{'=' * 80}\n")
         self.model.train()
     def _print_epoch_progress(

src/utils/config.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """YAML config loader."""
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Dict

 """YAML config loader."""
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Dict

src/utils/io.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Checkpoint IO helpers."""
 from pathlib import Path
 import torch
@@ -12,4 +13,4 @@ def save_state(model: torch.nn.Module, path: str) -> None:
 def load_state(model: torch.nn.Module, path: str) -> None:
     state = torch.load(path, map_location="cpu", weights_only=True)
-    model.load_state_dict(state)

 """Checkpoint IO helpers."""
 from pathlib import Path
 import torch
 def load_state(model: torch.nn.Module, path: str) -> None:
     state = torch.load(path, map_location="cpu", weights_only=True)
+    model.load_state_dict(state)

src/utils/labels.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Label metadata helpers for multitask inference."""
 from __future__ import annotations
 import json

 """Label metadata helpers for multitask inference."""
 from __future__ import annotations
 import json

src/utils/logging.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Logging setup."""
 import logging


1	"""Logging setup."""
2	+
3	import logging
4
5

src/utils/random.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Randomness helpers."""
 import random
 import numpy as np

 """Randomness helpers."""
 import random
 import numpy as np

src/visualization/attention.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Attention plotting utilities."""
 from typing import Sequence
 import matplotlib.pyplot as plt

 """Attention plotting utilities."""
 from typing import Sequence
 import matplotlib.pyplot as plt

src/visualization/metrics.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Metric plotting helpers."""
 import matplotlib.pyplot as plt


1	"""Metric plotting helpers."""
2	+
3	import matplotlib.pyplot as plt
4
5

tests/test_api/test_routes.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """API integration tests for the inference endpoint."""
 from __future__ import annotations
 from fastapi.testclient import TestClient
@@ -31,4 +32,4 @@ def test_summarize_route_returns_pipeline_outputs() -> None:
         assert payload["topic"] == "news"
         assert payload["topic_confidence"] == 0.8
     finally:
-        app.dependency_overrides.clear()

 """API integration tests for the inference endpoint."""
 from __future__ import annotations
 from fastapi.testclient import TestClient
         assert payload["topic"] == "news"
         assert payload["topic_confidence"] == 0.8
     finally:
+        app.dependency_overrides.clear()

tests/test_data/test_download_records.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Unit tests for dataset record helpers in scripts.download_data."""
 from __future__ import annotations
 import importlib.util
@@ -26,11 +27,13 @@ class DummyDataset:
 class DownloadDataRecordTests(unittest.TestCase):
     def test_emotion_records_handles_out_of_range_labels(self) -> None:
-        dataset_split = DummyDataset([
-            {"text": "sample", "label": 1},
-            {"text": "multi", "label": [0, 5]},
-            {"text": "string", "label": "2"},
-        ])
         label_names = ["sadness", "joy", "love"]
         records = list(
             download_data._emotion_records(
@@ -45,12 +48,14 @@ class DownloadDataRecordTests(unittest.TestCase):
         self.assertEqual(records[2]["emotions"], ["2"])
     def test_topic_records_handles_varied_label_inputs(self) -> None:
-        dataset_split = DummyDataset([
-            {"text": "news", "label": 3},
-            {"text": "list", "label": [1]},
-            {"text": "unknown", "label": "5"},
-            {"text": "missing", "label": []},
-        ])
         label_names = ["World", "Sports", "Business", "Sci/Tech"]
         records = list(
             download_data._topic_records(

 """Unit tests for dataset record helpers in scripts.download_data."""
 from __future__ import annotations
 import importlib.util
 class DownloadDataRecordTests(unittest.TestCase):
     def test_emotion_records_handles_out_of_range_labels(self) -> None:
+        dataset_split = DummyDataset(
+            [
+                {"text": "sample", "label": 1},
+                {"text": "multi", "label": [0, 5]},
+                {"text": "string", "label": "2"},
+            ]
+        )
         label_names = ["sadness", "joy", "love"]
         records = list(
             download_data._emotion_records(
         self.assertEqual(records[2]["emotions"], ["2"])
     def test_topic_records_handles_varied_label_inputs(self) -> None:
+        dataset_split = DummyDataset(
+            [
+                {"text": "news", "label": 3},
+                {"text": "list", "label": [1]},
+                {"text": "unknown", "label": "5"},
+                {"text": "missing", "label": []},
+            ]
+        )
         label_names = ["World", "Sports", "Business", "Sci/Tech"]
         records = list(
             download_data._topic_records(

tests/test_inference/test_pipeline.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Integration tests for the inference pipeline."""
 from __future__ import annotations
 from pathlib import Path

 """Integration tests for the inference pipeline."""
 from __future__ import annotations
 from pathlib import Path

tests/test_models/test_positional_encoding.py CHANGED Viewed

@@ -4,7 +4,6 @@
 Tests for positional encoding.
 """
 import matplotlib
 import torch

 Tests for positional encoding.
 """
 import matplotlib
 import torch