shinfxh commited on
Commit
a0eaa2d
·
0 Parent(s):

initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ figures/*.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ *.egg
8
+ dist/
9
+ build/
10
+ .eggs/
11
+
12
+ # Environments
13
+ .env
14
+ .venv
15
+ venv/
16
+
17
+ # Data and results (large files)
18
+ data/
19
+ results/
20
+ gift-eval/
21
+
22
+ # Outputs
23
+ *.png
24
+ !figures/*.png
25
+
26
+ # Scripts
27
+ push.sh
28
+
29
+ # Tools
30
+ .mypy_cache/
31
+ .ruff_cache/
32
+ .pytest_cache/
33
+ .ipynb_checkpoints/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Reverso Authors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ library_name: pytorch
4
+ pipeline_tag: time-series-forecasting
5
+ tags:
6
+ - time-series
7
+ - forecasting
8
+ - zero-shot
9
+ - convolution
10
+ - deltanet
11
+ - flash-fft-conv
12
+ - flash-linear-attention
13
+ ---
14
+
15
+ <h1 align="center">Reverso</h1>
16
+
17
+ <h3 align="center">
18
+ Efficient time-series foundation models for zero-shot forecasting.
19
+ </h3>
20
+
21
+ <p align="center">
22
+ <a href="https://arxiv.org/abs/2602.17634">Paper</a> •
23
+ <a href="https://github.com/shinfxh/reverso">GitHub</a> •
24
+ <a href="https://huggingface.co/shinfxh/reverso">Hugging Face</a>
25
+ </p>
26
+
27
+ <p align="center">
28
+ By combining long convolutions with linear RNN layers, Reverso matches the performance of transformer-based models that are over <b>100x larger</b>.
29
+ </p>
30
+
31
+ ## Key Results
32
+
33
+ <p align="center">
34
+ <img src="figures/gift_eval_pareto_overall.png" width="800">
35
+ </p>
36
+
37
+ Evaluated on [Gift-Eval](https://github.com/SalesforceAIResearch/gift-eval), a comprehensive time-series forecasting benchmark spanning 97 tasks within 23 datasets across 7 domains.
38
+
39
+ | Model | Params | Gift-Eval MASE |
40
+ |---|---|---|
41
+ | **Reverso** | 2.6M | **0.711** |
42
+ | Reverso-Small | 550K | 0.726 |
43
+ | Reverso-Nano | 200K | 0.760 |
44
+
45
+ For reference, Xihe-Max (1.5B params) achieves 0.711 and TimesFM-2.5 (200M params) achieves 0.705 on the same benchmark.
46
+
47
+ ## Installation
48
+
49
+ ```bash
50
+ pip install -r requirements.txt
51
+ pip install --no-build-isolation git+https://github.com/HazyResearch/flash-fft-conv.git#subdirectory=csrc/flashfftconv
52
+ pip install --no-build-isolation git+https://github.com/HazyResearch/flash-fft-conv.git
53
+ pip install -e .
54
+ ```
55
+
56
+ ### Requirements
57
+
58
+ - Python >= 3.11
59
+ - PyTorch 2.6.0
60
+ - CUDA-compatible GPU
61
+ - [FlashFFTConv](https://github.com/HazyResearch/flash-fft-conv)
62
+ - [flash-linear-attention](https://github.com/sustcsonglin/flash-linear-attention)
63
+
64
+ ## Model Architecture
65
+
66
+ <p align="center">
67
+ <img src="figures/new_arch.png" width="800">
68
+ </p>
69
+
70
+ Reverso uses a hybrid architecture that interleaves:
71
+ 1. **Long convolution layers** ([FlashFFTConv](https://github.com/HazyResearch/flash-fft-conv)) with gated short convolutions
72
+ 2. **DeltaNet layers** for modeling sequential dependencies
73
+ 3. **MLP layers** for channel mixing
74
+ 4. **Attention-based decoder head** for producing the final forecast
75
+
76
+ Input sequences are normalized to [0, 1] and processed point-wise (no patching). The model predicts 48 time steps at a time and rolls out autoregressively for longer horizons.
77
+
78
+ | Config | Params | Layers | d_model |
79
+ |---|---|---|---|
80
+ | Reverso | 2.6M | 8 | 128 |
81
+ | Reverso-Small | 550K | 4 | 64 |
82
+ | Reverso-Nano | 200K | 2 | 32 |
83
+
84
+ The modeling code is in [`reverso/`](reverso/).
85
+
86
+ ## Quick Start
87
+
88
+ ```python
89
+ import torch
90
+ from reverso import load_model, forecast
91
+
92
+ model, cfg = load_model(
93
+ "checkpoints/reverso_small/checkpoint.pth",
94
+ "checkpoints/reverso_small/args.json",
95
+ device="cuda",
96
+ )
97
+
98
+ context = torch.full((1, 2048, 1), 5.0, device="cuda") # (batch, seq_len, 1)
99
+ predictions = forecast(
100
+ model, context,
101
+ prediction_length=96,
102
+ seq_len=cfg.seq_len,
103
+ output_token_len=cfg.output_token_len,
104
+ )
105
+ print(predictions.shape) # (1, 96, 1)
106
+ ```
107
+
108
+ ## Examples
109
+
110
+ Install the example dependencies first:
111
+
112
+ ```bash
113
+ pip install -r example/requirements.txt
114
+ ```
115
+
116
+ ### Forecast Demo
117
+
118
+ Run Reverso on synthetic signals (constant, linear, sine, sawtooth, square):
119
+
120
+ ```bash
121
+ python example/forecast_demo.py --signal all
122
+ ```
123
+
124
+ Use `--signal sine` to run a single signal, or `--list` to see all options.
125
+
126
+ ### Gift-Eval Benchmark
127
+
128
+ To reproduce the benchmark results, first follow the [Gift-Eval setup instructions](https://github.com/SalesforceAIResearch/gift-eval) to install the package and download the data. Then run:
129
+
130
+ ```bash
131
+ python example/eval_gift.py \
132
+ --checkpoint checkpoints/reverso_small/checkpoint.pth \
133
+ --output-dir results/ \
134
+ --force-flip-invariance
135
+ ```
136
+
137
+ > **Note:** Dependencies within Gift-Eval may conflict with those in Reverso. If you encounter issues, try upgrading `huggingface_hub`:
138
+ > ```bash
139
+ > pip install --upgrade huggingface_hub
140
+ > ```
141
+ > **Note:** While running this benchmark, it is recommended to use flip invariance, but this requires two forward passes of the model. The inference speed is also not fully optimized and could be further sped up.
142
+
143
+ ## Available Checkpoints
144
+
145
+ | Model | Status | Path |
146
+ |---|---|---|
147
+ | Reverso-Small (550K) | Available | `checkpoints/reverso_small/` |
148
+ | Reverso (2.6M) | Coming soon | — |
149
+ | Reverso-Nano (200K) | Coming soon | — |
150
+
151
+ ## Citation
152
+
153
+ ```bibtex
154
+ @misc{fu2026reversoefficienttimeseries,
155
+ title={Reverso: Efficient Time Series Foundation Models for Zero-shot Forecasting},
156
+ author={Xinghong Fu and Yanhong Li and Georgios Papaioannou and Yoon Kim},
157
+ year={2026},
158
+ eprint={2602.17634},
159
+ archivePrefix={arXiv},
160
+ primaryClass={cs.LG},
161
+ url={https://arxiv.org/abs/2602.17634},
162
+ }
163
+ ```
164
+
165
+ ## License
166
+
167
+ This project is licensed under the [MIT License](LICENSE).
checkpoints/reverso/args.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "seq_len": 2048,
3
+ "input_token_len": 2048,
4
+ "output_token_len": 48,
5
+ "e_layers": 8,
6
+ "d_model": 128,
7
+ "d_intermediate": 256,
8
+ "output_bottleneck_dim": 48,
9
+ "expand_v": 1.0,
10
+ "state_weaving": 1,
11
+ "gating_kernel_size": 3,
12
+ "main_module": "conv,attn,conv,attn",
13
+ "use_norm": true,
14
+ "learn_bias": 1
15
+ }
checkpoints/reverso_nano/args.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "seq_len": 2048,
3
+ "input_token_len": 2048,
4
+ "output_token_len": 48,
5
+ "e_layers": 2,
6
+ "d_model": 32,
7
+ "d_intermediate": 256,
8
+ "output_bottleneck_dim": 48,
9
+ "expand_v": 1.0,
10
+ "state_weaving": 1,
11
+ "gating_kernel_size": 3,
12
+ "main_module": "conv,attn,conv,attn",
13
+ "use_norm": true,
14
+ "learn_bias": 1
15
+ }
checkpoints/reverso_small/args.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "seq_len": 2048,
3
+ "input_token_len": 2048,
4
+ "output_token_len": 48,
5
+ "e_layers": 4,
6
+ "d_model": 64,
7
+ "d_intermediate": 256,
8
+ "output_bottleneck_dim": 48,
9
+ "expand_v": 1.0,
10
+ "state_weaving": 1,
11
+ "gating_kernel_size": 3,
12
+ "main_module": "conv,attn,conv,attn",
13
+ "use_norm": true,
14
+ "learn_bias": 1
15
+ }
checkpoints/reverso_small/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a475a728a7d6a625bead27b283abd4e7746d3a7099086e5a8cf6a23bb647502b
3
+ size 2252946
config/dataset_properties.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"m4_yearly": {"domain": "Econ/Fin", "frequency": "A", "num_variates": 1}, "m4_quarterly": {"domain": "Econ/Fin", "frequency": "Q", "num_variates": 1}, "m4_monthly": {"domain": "Econ/Fin", "frequency": "M", "num_variates": 1}, "m4_weekly": {"domain": "Econ/Fin", "frequency": "W", "num_variates": 1}, "m4_daily": {"domain": "Econ/Fin", "frequency": "D", "num_variates": 1}, "m4_hourly": {"domain": "Econ/Fin", "frequency": "H", "num_variates": 1}, "electricity": {"domain": "Energy", "frequency": "W", "num_variates": 1}, "ett1": {"domain": "Energy", "frequency": "W", "num_variates": 7}, "ett2": {"domain": "Energy", "frequency": "W", "num_variates": 7}, "solar": {"domain": "Energy", "frequency": "W", "num_variates": 1}, "hospital": {"domain": "Healthcare", "frequency": "M", "num_variates": 1}, "covid_deaths": {"domain": "Healthcare", "frequency": "D", "num_variates": 1}, "us_births": {"domain": "Healthcare", "frequency": "M", "num_variates": 1}, "saugeen": {"domain": "Nature", "frequency": "M", "num_variates": 1}, "temperature_rain": {"domain": "Nature", "frequency": "D", "num_variates": 1}, "kdd_cup_2018": {"domain": "Nature", "frequency": "D", "num_variates": 1}, "jena_weather": {"domain": "Nature", "frequency": "D", "num_variates": 21}, "car_parts": {"domain": "Sales", "frequency": "M", "num_variates": 1}, "restaurant": {"domain": "Sales", "frequency": "D", "num_variates": 1}, "hierarchical_sales": {"domain": "Sales", "frequency": "W-WED", "num_variates": 1}, "loop_seattle": {"domain": "Transport", "frequency": "D", "num_variates": 1}, "sz_taxi": {"domain": "Transport", "frequency": "H", "num_variates": 1}, "m_dense": {"domain": "Transport", "frequency": "D", "num_variates": 1}, "bitbrains_fast_storage": {"domain": "Web/CloudOps", "frequency": "H", "num_variates": 2}, "bitbrains_rnd": {"domain": "Web/CloudOps", "frequency": "H", "num_variates": 2}, "bizitobs_application": {"domain": "Web/CloudOps", "frequency": "10S", "num_variates": 2}, "bizitobs_service": 
{"domain": "Web/CloudOps", "frequency": "10S", "num_variates": 2}, "bizitobs_l2c": {"domain": "Web/CloudOps", "frequency": "H", "num_variates": 7}}
config/downsample_factors.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "bizitobs_l2c/5t/medium": 7,
3
+ "bizitobs_l2c/5t/long": 7
4
+ }
example/eval_gift.py ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GiftEval evaluation script for Reverso.
3
+ """
4
+ import os
5
+ import json
6
+ import math
7
+ import argparse
8
+ import csv
9
+ from types import SimpleNamespace
10
+ from typing import List, Optional, Tuple
11
+ from datetime import datetime
12
+
13
+ import numpy as np
14
+ import torch
15
+ import pandas as pd
16
+
17
+ from reverso.forecast import load_checkpoint
18
+
19
+ try:
20
+ from torch.cuda.amp import autocast as autocast_fp
21
+ except Exception:
22
+ autocast_fp = None
23
+
24
def numpy_fill(arr: np.ndarray) -> np.ndarray:
    """Forward-fill NaNs along axis 1 of a 2-D array.

    Each NaN is replaced by the nearest non-NaN value to its left in the
    same row. Leading NaNs (no valid value to the left) resolve to column
    0 of that row and therefore stay NaN if column 0 is NaN.
    """
    nan_mask = np.isnan(arr)
    # Column index of the last valid entry at or before each position:
    # valid columns keep their own index, NaN columns contribute 0, and a
    # running maximum propagates the latest valid index rightward.
    col_idx = np.where(nan_mask, 0, np.arange(arr.shape[1]))
    np.maximum.accumulate(col_idx, axis=1, out=col_idx)
    row_idx = np.arange(arr.shape[0])[:, None]
    return arr[row_idx, col_idx]
30
+
31
+
32
class ReversoPredictor:
    """GiftEval predictor for reverso.Model.

    Wraps a Reverso checkpoint behind the GluonTS predictor interface:
    ``predict`` consumes GiftEval test entries and yields ``SampleForecast``
    objects (point forecasts repeated ``num_samples`` times).
    """

    def __init__(
        self,
        prediction_length: int,
        checkpoint_path: Optional[str] = None,
        device: str = "cuda",
        seq_len: int = 2048,
        input_token_len: int = 2048,
        output_token_len: int = 48,
        e_layers: int = 8,
        d_model: int = 128,
        d_intermediate: int = 512,
        output_bottleneck_dim: int = 48,
        expand_v: float = 1.0,
        state_weaving: int = 1,
        gating_kernel_size: int = 3,
        main_module: str = "conv,attn,conv,attn,conv,attn,conv,attn",
        num_samples: int = 100,
        batch_size: int = 256,
        use_amp: int = 1,
        downsample_factor: int = 1,
        force_flip_invariance: bool = False,
    ):
        # Fall back to CPU up front if CUDA is not available at all.
        self.device = torch.device(device if torch.cuda.is_available() else "cpu")
        self.prediction_length = int(prediction_length)
        self.num_samples = int(num_samples)
        self.batch_size = int(batch_size)
        self.seq_len = int(seq_len)
        self.input_token_len = int(input_token_len)
        self.output_token_len = int(output_token_len)
        self.use_amp = int(use_amp)
        self.downsample_factor = int(downsample_factor)
        self.force_flip_invariance = bool(force_flip_invariance)

        # Model hyperparameters packed the way reverso.Model expects.
        # NOTE(review): `e_layers` is accepted above but NOT forwarded here —
        # presumably the depth is derived from `main_module`; confirm against
        # reverso.model.Model before relying on `e_layers`.
        args = SimpleNamespace(
            input_token_len=self.input_token_len,
            output_token_len=self.output_token_len,
            seq_len=self.seq_len,
            d_model=int(d_model),
            d_intermediate=int(d_intermediate),
            use_norm=True,
            learn_bias=1,
            output_bottleneck_dim=int(output_bottleneck_dim),
            expand_v=float(expand_v),
            state_weaving=int(state_weaving),
            gating_kernel_size=int(gating_kernel_size),
            main_module=str(main_module),
        )

        from reverso import model as model_impl
        try:
            self.model = model_impl.Model(args).to(self.device)
        except RuntimeError as e:
            # CUDA can be "available" yet unusable (e.g. driver mismatch);
            # retry on CPU with AMP disabled, re-raise anything else.
            if "CUDA" in str(e):
                print(f"CUDA not usable ({e}); falling back to CPU.")
                self.device = torch.device("cpu")
                self.use_amp = 0
                self.model = model_impl.Model(args).to(self.device)
            else:
                raise
        self.model.eval()

        if checkpoint_path is not None and os.path.isfile(checkpoint_path):
            self._load_checkpoint(checkpoint_path)
        else:
            # Deliberate soft failure: evaluation still runs, but results
            # are meaningless with random weights.
            print("Warning: checkpoint_path not provided or file not found. Using randomly initialized weights.")

    def _load_checkpoint(self, ckpt_path: str):
        """Load model weights from *ckpt_path* onto the active device."""
        load_checkpoint(self.model, ckpt_path, device=str(self.device))

    def _downsample_if_needed(self, series: torch.Tensor) -> Tuple[torch.Tensor, int]:
        """Stride-subsample a 1-D series; returns (series, factor used)."""
        cur = series
        if self.downsample_factor > 1:
            cur = cur[::self.downsample_factor]
        return cur, self.downsample_factor

    def _left_pad_to_len(self, arr: np.ndarray, target_len: int) -> Tuple[np.ndarray, int]:
        """Left-pad (or left-truncate) a 1-D array to exactly *target_len*.

        Padding repeats the first observed value (0.0 for an empty series).
        Returns (padded array, number of padded positions).
        """
        if arr.shape[0] >= target_len:
            return arr[-target_len:], 0
        pad_len = target_len - arr.shape[0]
        fill_value = arr[0] if arr.shape[0] > 0 else 0.0
        padding = np.full((pad_len,), fill_value, dtype=arr.dtype)
        return np.concatenate([padding, arr], axis=0), pad_len

    def _prepare_context_matrix(self, context: List[torch.Tensor]) -> Tuple[torch.Tensor, List[int]]:
        """Build the (batch, seq_len, 1) context tensor for the model.

        Per series: optional downsampling, left-padding to seq_len, then NaN
        imputation (linear interpolation when >= 2 valid points, otherwise
        forward fill, with a backward-fill pass and a final 0.0 fallback).
        Also returns the downsample factor applied to each series.
        """
        xs = []
        downsample_factors = []

        for c in context:
            cur, downsample_factor = self._downsample_if_needed(c)
            downsample_factors.append(downsample_factor)

            cur_np = cur.detach().cpu().float().numpy()
            cur_np, _ = self._left_pad_to_len(cur_np, self.seq_len)

            x2d = cur_np[None, :]
            x_interp = np.copy(x2d)
            series = x2d[0]
            if np.any(np.isnan(series)):
                valid_mask = ~np.isnan(series)
                if np.sum(valid_mask) >= 2:
                    # Enough anchors: interpolate across the gaps.
                    valid_indices = np.where(valid_mask)[0]
                    valid_values = series[valid_mask]
                    x_interp[0] = np.interp(np.arange(len(series)), valid_indices, valid_values)
                else:
                    # 0 or 1 valid points: forward fill is all we can do.
                    x_interp = numpy_fill(x2d)
            # Safety net: forward fill, then backward fill, then zeros.
            ff = numpy_fill(x_interp)
            bf = np.flip(numpy_fill(np.flip(x_interp, axis=1)), axis=1)
            x_imp = np.where(np.isnan(ff), bf, ff)
            x_imp = np.where(np.isnan(x_imp), 0.0, x_imp)
            xs.append(x_imp[0])

        x = torch.tensor(np.stack(xs), device=self.device, dtype=torch.float32).unsqueeze(-1)
        return x, downsample_factors

    def _decode_autoregressive(self, init_ctx: torch.Tensor, use_bf16: bool, downsample_factors: List[int]) -> torch.Tensor:
        """Roll the model forward in output_token_len-sized chunks.

        Enough steps are run to cover the longest (downsample-adjusted)
        prediction length in the batch; each chunk is appended to the
        context for the next step. Returns (B, steps * output_token_len, C).
        """
        B, _, C = init_ctx.shape
        roll_len = int(self.output_token_len)

        # Each series only needs prediction_length // factor model steps.
        target_pred_lens = [int(self.prediction_length) // int(max(1, df)) for df in downsample_factors]
        max_target_pred_len = max(target_pred_lens)
        steps = math.ceil(max_target_pred_len / roll_len)
        preds: List[torch.Tensor] = []
        batch_ctx = init_ctx

        # Time-mark inputs are unused by this model; pass zeros.
        y_mark = torch.zeros(B, self.output_token_len, C, device=self.device, dtype=init_ctx.dtype)

        for _ in range(steps):
            x_in = batch_ctx[:, -self.seq_len:, :]
            x_mark = torch.zeros_like(x_in)

            if autocast_fp is not None and self.use_amp and use_bf16:
                try:
                    # bf16 autocast when supported; fall back to fp32 on
                    # any autocast failure rather than aborting the run.
                    with autocast_fp(dtype=torch.bfloat16):
                        outputs = self.model(x_in, x_mark, y_mark)
                except Exception:
                    outputs = self.model(x_in, x_mark, y_mark)
            else:
                outputs = self.model(x_in, x_mark, y_mark)

            out_chunk = outputs[:, -self.output_token_len:, :]
            take_chunk = out_chunk[:, :roll_len, :]
            preds.append(take_chunk)
            # Feed the prediction back in for the next autoregressive step.
            batch_ctx = torch.cat([batch_ctx, take_chunk], dim=1)

        return torch.cat(preds, dim=1)

    @torch.no_grad()
    def predict(self, test_data_input, use_bf16_if_available: bool = True):
        """Forecast every entry in *test_data_input* (GiftEval test split).

        Returns a list of gluonts ``SampleForecast``s; the point forecast is
        replicated num_samples times since the model is deterministic.
        """
        from gluonts.itertools import batcher
        from gluonts.model.forecast import SampleForecast

        forecasts = []
        use_bf16 = bool(
            use_bf16_if_available
            and self.device.type == "cuda"
            and torch.cuda.is_available()
            and torch.cuda.is_bf16_supported()
        )

        for batch in batcher(test_data_input, batch_size=self.batch_size):
            targets = [torch.tensor(entry["target"], dtype=torch.float32) for entry in batch]
            batch_ctx, downsample_factors = self._prepare_context_matrix(targets)

            pred_pos = self._decode_autoregressive(batch_ctx, use_bf16, downsample_factors)
            if self.force_flip_invariance:
                # Average f(x) with -f(-x): two forward passes, enforcing
                # sign-flip symmetry of the forecast.
                pred_neg = self._decode_autoregressive(-batch_ctx, use_bf16, downsample_factors)
                pred_full = 0.5 * (pred_pos - pred_neg)
            else:
                pred_full = pred_pos

            # Repair any NaNs the rollout produced by forward-filling.
            if torch.isnan(pred_full).any():
                pf_2d = pred_full.squeeze(-1).detach().cpu().numpy()
                pf_2d = numpy_fill(pf_2d)
                pred_full = torch.tensor(pf_2d, device=pred_full.device, dtype=pred_full.dtype).unsqueeze(-1)

            pred_full_np = pred_full.float().squeeze(-1).detach().cpu().numpy()
            pred_list = []
            for i in range(len(downsample_factors)):
                df = downsample_factors[i]
                target_pred_len = int(self.prediction_length) // int(max(1, df))
                seq = pred_full_np[i, :target_pred_len]
                if df > 1:
                    # Upsample back to the original resolution by linear
                    # interpolation so the forecast length matches.
                    old_len = len(seq)
                    new_len = int(self.prediction_length)
                    seq = np.interp(np.linspace(0, 1, new_len), np.linspace(0, 1, old_len), seq)
                pred_list.append(seq)
            pred_full_np = np.array(pred_list)

            for i, ts in enumerate(batch):
                # Forecast starts right after the observed target.
                start_date = ts["start"] + len(ts["target"])
                samples = np.repeat(pred_full_np[i][None, :], self.num_samples, axis=0)
                forecasts.append(SampleForecast(samples=samples, start_date=start_date))

        return forecasts
229
+
230
+
231
# ==========================
# GiftEval evaluation script
# ==========================
from gluonts.ev.metrics import (
    MAE, MAPE, MASE, MSE, MSIS, ND, NRMSE, RMSE, SMAPE,
    MeanWeightedSumQuantileLoss,
)

# Metric suite reported per dataset/term. MSE is evaluated both on the
# mean forecast and on the 0.5 quantile (median) forecast.
METRICS = [
    MSE(forecast_type="mean"),
    MSE(forecast_type=0.5),
    MAE(),
    MASE(),
    MAPE(),
    SMAPE(),
    MSIS(),
    RMSE(),
    NRMSE(),
    ND(),
    MeanWeightedSumQuantileLoss(quantile_levels=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
]

# Maps GiftEval dataset names to the keys used in
# config/dataset_properties.json (lookup falls back to the lowercased name).
PRETTY_NAMES = {
    "saugeenday": "saugeen",
    "temperature_rain_with_missing": "temperature_rain",
    "kdd_cup_2018_with_missing": "kdd_cup_2018",
    "car_parts_with_missing": "car_parts",
}

# Whitespace-separated dataset specs ("name" or "name/frequency") evaluated
# on the short-horizon term.
SHORT_DATASETS = "m4_yearly m4_quarterly m4_monthly m4_weekly m4_daily m4_hourly electricity/15T electricity/H electricity/D electricity/W solar/10T solar/H solar/D solar/W hospital covid_deaths us_births/D us_births/M us_births/W saugeenday/D saugeenday/M saugeenday/W temperature_rain_with_missing kdd_cup_2018_with_missing/H kdd_cup_2018_with_missing/D car_parts_with_missing restaurant hierarchical_sales/D hierarchical_sales/W LOOP_SEATTLE/5T LOOP_SEATTLE/H LOOP_SEATTLE/D SZ_TAXI/15T SZ_TAXI/H M_DENSE/H M_DENSE/D ett1/15T ett1/H ett1/D ett1/W ett2/W ett2/D jena_weather/10T jena_weather/H jena_weather/D bitbrains_fast_storage/5T bitbrains_fast_storage/H bitbrains_rnd/5T bitbrains_rnd/H bizitobs_application bizitobs_service bizitobs_l2c/5T bizitobs_l2c/H"

# Subset of datasets that additionally get medium- and long-horizon terms.
MED_LONG_DATASETS = "electricity/15T electricity/H solar/10T solar/H kdd_cup_2018_with_missing/H LOOP_SEATTLE/5T LOOP_SEATTLE/H SZ_TAXI/15T M_DENSE/H ett1/15T ett1/H ett2/15T ett2/H jena_weather/10T jena_weather/H bitbrains_fast_storage/5T bitbrains_rnd/5T bizitobs_application bizitobs_service bizitobs_l2c/5T bizitobs_l2c/H"
263
+
264
+
265
def main():
    """Run Reverso over the GiftEval benchmark and append results to a CSV.

    For every (dataset, term) combination: load the dataset, build a fresh
    ReversoPredictor with the horizon-specific prediction length, evaluate
    with gluonts, and append one CSV row of metrics.
    """
    parser = argparse.ArgumentParser(description="Run Reverso GiftEval across datasets")
    parser.add_argument("--checkpoint", default='checkpoints/reverso_small/checkpoint.pth', help="Path to model checkpoint")
    parser.add_argument("--json_path", default='checkpoints/reverso_small/args.json', help="Path to JSON file with model config overrides")
    parser.add_argument("--output-dir", dest="output_dir", default='results/reverso_small', help="Output directory for results")
    parser.add_argument("--dataset", default=None, help="Filter to specific dataset (substring match)")
    parser.add_argument("--term", default=None, choices=["short", "medium", "long"], help="Filter to specific term")
    parser.add_argument("--force-flip-invariance", dest="force_flip_invariance", action="store_true",
                        help="Average f(x) with -f(-x) for flip invariance")
    parser.add_argument("--downsample-json", dest="downsample_json",
                        default="config/downsample_factors.json",
                        help="Path to JSON with downsample factors per dataset/term")
    args = parser.parse_args()

    # Load model config from JSON if provided
    json_cfg = {}
    if args.json_path and os.path.isfile(args.json_path):
        with open(args.json_path, "r") as f:
            json_cfg = json.load(f)

    # Model hyperparameters: JSON overrides fall back to the full-size
    # Reverso defaults.
    SEQ_LEN = int(json_cfg.get("seq_len", 2048))
    INPUT_TOKEN_LEN = int(json_cfg.get("input_token_len", 2048))
    OUTPUT_TOKEN_LEN = int(json_cfg.get("output_token_len", 48))
    E_LAYERS = int(json_cfg.get("e_layers", 8))
    D_MODEL = int(json_cfg.get("d_model", 128))
    D_INTERMEDIATE = int(json_cfg.get("d_intermediate", 512))
    OUTPUT_BOTTLENECK_DIM = int(json_cfg.get("output_bottleneck_dim", 48))
    EXPAND_V = float(json_cfg.get("expand_v", 1.0))
    STATE_WEAVING = int(json_cfg.get("state_weaving", 1))
    GATING_KERNEL_SIZE = int(json_cfg.get("gating_kernel_size", 3))
    MAIN_MODULE = str(json_cfg.get("main_module", "conv,attn,conv,attn,conv,attn,conv,attn"))

    # Fixed run settings (not exposed as CLI flags).
    DEVICE = "cuda"
    NUM_SAMPLES = 100
    BATCH_SIZE = 256
    USE_AMP = 1

    # Optional per-(dataset/freq/term) downsample factors; missing file
    # simply means no downsampling anywhere.
    downsample_map = {}
    if os.path.isfile(args.downsample_json):
        with open(args.downsample_json, "r") as f:
            downsample_map = json.load(f)

    # Setup datasets
    all_datasets = sorted(set(SHORT_DATASETS.split() + MED_LONG_DATASETS.split()))
    med_long_set = set(MED_LONG_DATASETS.split())
    all_terms = ["short", "medium", "long"]

    with open("config/dataset_properties.json", "r") as f:
        dataset_properties = json.load(f)

    # Point gift_eval at <repo root>/data unless GIFT_EVAL is already set.
    os.environ.setdefault("GIFT_EVAL", os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data"))

    if args.dataset:
        all_datasets = [ds for ds in all_datasets if args.dataset in ds]
        if not all_datasets:
            print(f"No datasets found matching '{args.dataset}'")
            return

    if args.term:
        all_terms = [args.term]

    # Setup output: one timestamped CSV per run, header written up front.
    output_dir = args.output_dir or os.path.join(os.path.dirname(os.path.abspath(__file__)), "results")
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = os.path.join(output_dir, f"all_results_{timestamp}.csv")

    with open(csv_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            "dataset", "model",
            "eval_metrics/MSE[mean]", "eval_metrics/MSE[0.5]",
            "eval_metrics/MAE[0.5]", "eval_metrics/MASE[0.5]",
            "eval_metrics/MAPE[0.5]", "eval_metrics/sMAPE[0.5]",
            "eval_metrics/MSIS", "eval_metrics/RMSE[mean]",
            "eval_metrics/NRMSE[mean]", "eval_metrics/ND[0.5]",
            "eval_metrics/mean_weighted_sum_quantile_loss",
            "domain", "num_variates",
        ])

    # Imported here so that config/CLI errors surface before the heavier
    # gluonts/gift_eval imports.
    from gluonts.model import evaluate_model
    from gluonts.time_feature import get_seasonality
    from gift_eval.data import Dataset

    print(f"Evaluating {len(all_datasets)} datasets, terms: {all_terms}")
    print(f"Flip invariance: {args.force_flip_invariance}")

    for ds_num, ds_name in enumerate(all_datasets):
        # Dataset spec is either "name" or "name/frequency"; the properties
        # key is the pretty-name mapping of the lowercased base name.
        if "/" in ds_name:
            ds_key = PRETTY_NAMES.get(ds_name.split("/")[0].lower(), ds_name.split("/")[0].lower())
            ds_freq = ds_name.split("/")[1]
        else:
            ds_key = PRETTY_NAMES.get(ds_name.lower(), ds_name.lower())
            ds_freq = dataset_properties[ds_key]["frequency"]

        print(f"[{ds_num + 1}/{len(all_datasets)}] {ds_name}")

        for term in all_terms:
            # Medium/long horizons only exist for the MED_LONG subset.
            if term in ("medium", "long") and ds_name not in med_long_set:
                continue

            ds_config = f"{ds_key}/{ds_freq}/{term}"
            # Probe once to learn the target dimension, then reload with
            # to_univariate set accordingly.
            probe = Dataset(name=ds_name, term=term, to_univariate=False)
            to_univariate = probe.target_dim != 1
            dataset = Dataset(name=ds_name, term=term, to_univariate=to_univariate)
            season_length = get_seasonality(dataset.freq)

            downsample_key = f"{ds_key}/{ds_freq}/{term}".lower()
            downsample_factor = downsample_map.get(downsample_key, 1)

            info = f"  {term}: {len(dataset.test_data)} instances"
            if downsample_factor > 1:
                info += f", downsample={downsample_factor}"
            print(info)

            # A fresh predictor per (dataset, term) because the prediction
            # length and downsample factor differ between configurations.
            predictor = ReversoPredictor(
                prediction_length=dataset.prediction_length,
                checkpoint_path=args.checkpoint,
                device=DEVICE,
                seq_len=SEQ_LEN,
                input_token_len=INPUT_TOKEN_LEN,
                output_token_len=OUTPUT_TOKEN_LEN,
                e_layers=E_LAYERS,
                d_model=D_MODEL,
                d_intermediate=D_INTERMEDIATE,
                output_bottleneck_dim=OUTPUT_BOTTLENECK_DIM,
                expand_v=EXPAND_V,
                state_weaving=STATE_WEAVING,
                gating_kernel_size=GATING_KERNEL_SIZE,
                main_module=MAIN_MODULE,
                num_samples=NUM_SAMPLES,
                batch_size=BATCH_SIZE,
                use_amp=USE_AMP,
                downsample_factor=downsample_factor,
                force_flip_invariance=args.force_flip_invariance,
            )

            res = evaluate_model(
                predictor,
                test_data=dataset.test_data,
                metrics=METRICS,
                batch_size=BATCH_SIZE,
                axis=None,
                mask_invalid_label=True,
                allow_nan_forecast=False,
                seasonality=season_length,
            )

            # Append one row per config so partial runs still leave results.
            with open(csv_path, "a", newline="") as f:
                writer = csv.writer(f)
                writer.writerow([
                    ds_config, "reverso",
                    res["MSE[mean]"][0], res["MSE[0.5]"][0],
                    res["MAE[0.5]"][0], res["MASE[0.5]"][0],
                    res["MAPE[0.5]"][0], res["sMAPE[0.5]"][0],
                    res["MSIS"][0], res["RMSE[mean]"][0],
                    res["NRMSE[mean]"][0], res["ND[0.5]"][0],
                    res["mean_weighted_sum_quantile_loss"][0],
                    dataset_properties[ds_key]["domain"],
                    dataset_properties[ds_key]["num_variates"],
                ])

    print(f"\nResults saved to: {csv_path}")
429
+
430
+
431
# Script entry point.
if __name__ == "__main__":
    main()
example/forecast_demo.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Demo: autoregressive forecasting on simple synthetic signals.
3
+
4
+ Run all signals: python example/forecast_demo.py --signal all
5
+ Run one signal: python example/forecast_demo.py --signal sine
6
+ List available: python example/forecast_demo.py --list
7
+ """
8
+ import argparse
9
+
10
+ import numpy as np
11
+ import torch
12
+ import matplotlib.pyplot as plt
13
+
14
+ from reverso.forecast import load_model, forecast
15
+
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # Signal generators — each returns float32 array of length n
19
+ # ---------------------------------------------------------------------------
20
+
21
def signal_constant(n: int) -> np.ndarray:
    """Flat line at 5.0 — sanity check that the model tracks a constant level."""
    flat = np.empty(n, dtype=np.float32)
    flat.fill(5.0)
    return flat
23
+
24
+
25
def signal_linear(n: int) -> np.ndarray:
    """Straight ramp from 0 to 40 spanning the whole window."""
    ramp = np.linspace(0, 40, n)
    return ramp.astype(np.float32)
27
+
28
+
29
+
30
def signal_sine(n: int) -> np.ndarray:
    """Sine wave with period 200 samples and amplitude 5."""
    steps = np.arange(n, dtype=np.float64)
    wave = 5.0 * np.sin(2 * np.pi * steps / 200)
    return wave.astype(np.float32)
33
+
34
+
35
def signal_sawtooth(n: int) -> np.ndarray:
    """Rising sawtooth with period 200 samples, values in [0, 10)."""
    period = 200
    steps = np.arange(n, dtype=np.float64)
    ramp = 10.0 * (steps % period) / period
    return ramp.astype(np.float32)
39
+
40
+
41
def signal_square(n: int) -> np.ndarray:
    """Square wave with period 200 samples and levels ±5 (0 at sign changes)."""
    steps = np.arange(n, dtype=np.float64)
    levels = np.sign(np.sin(2 * np.pi * steps / 200))
    return (5.0 * levels).astype(np.float32)
44
+
45
+
46
+
47
# Registry mapping CLI signal names to (plot label, generator) pairs.
SIGNALS = dict(
    constant=("Constant", signal_constant),
    linear=("Linear trend", signal_linear),
    sine=("Sine wave", signal_sine),
    sawtooth=("Sawtooth wave", signal_sawtooth),
    square=("Square wave", signal_square),
)
54
+
55
+
56
def run_one(name, label, gen_fn, model, cfg, device, context_length, prediction_length,
            output_dir, flip_invariance=False):
    """Forecast one synthetic signal and save a context/truth/forecast plot.

    Args:
        name: Signal key, used for the output filename.
        label: Human-readable name shown in the plot title.
        gen_fn: Generator returning a float32 array of the requested length.
        model: Loaded Reverso model.
        cfg: Config namespace providing ``seq_len`` and ``output_token_len``.
        device: Torch device string for the context tensor.
        context_length: Steps fed to the model as history.
        prediction_length: Steps to forecast beyond the context.
        output_dir: Directory the PNG is written into.
        flip_invariance: If True, average f(x) with -f(-x).
    """
    total_len = context_length + prediction_length
    series = gen_fn(total_len)
    history = series[:context_length]
    future_truth = series[context_length:]

    # (L,) -> (1, L, 1): add the batch and channel dims forecast() expects.
    history_t = torch.tensor(history, device=device).unsqueeze(0).unsqueeze(-1)
    forward_pred = forecast(
        model, history_t, prediction_length,
        seq_len=cfg.seq_len, output_token_len=cfg.output_token_len,
    )
    if not flip_invariance:
        combined = forward_pred
    else:
        # Enforce odd symmetry by averaging f(x) with -f(-x).
        mirrored_pred = forecast(
            model, -history_t, prediction_length,
            seq_len=cfg.seq_len, output_token_len=cfg.output_token_len,
        )
        combined = 0.5 * (forward_pred - mirrored_pred)
    point_forecast = combined[0, :, 0].float().cpu().numpy()

    hist_axis = np.arange(context_length)
    fut_axis = np.arange(context_length, total_len)

    fig, ax = plt.subplots(figsize=(14, 5))
    ax.plot(hist_axis, history, color="steelblue", label="Context")
    ax.plot(fut_axis, future_truth, color="gray", linestyle="--", label="Ground truth")
    ax.plot(fut_axis, point_forecast, color="tomato", label="Forecast")
    ax.axvline(context_length, color="black", linestyle=":", alpha=0.5)
    ax.set_xlabel("Time step")
    ax.set_ylabel("Value")
    ax.set_title(f"Reverso: {label}")
    ax.legend()
    fig.tight_layout()
    out_path = f"{output_dir}/{name}_forecast.png"
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    print(f" {label:25s} -> {out_path}")
95
+
96
+
97
def main():
    """CLI entry point: parse args, load the model, forecast selected signals."""
    parser = argparse.ArgumentParser(description="Reverso forecast demo on synthetic signals")
    parser.add_argument("--signal", type=str, default="all",
                        help="Signal name, or 'all' to run every signal")
    parser.add_argument("--list", action="store_true", help="List available signals and exit")
    parser.add_argument("--checkpoint", type=str,
                        default="checkpoints/reverso_small/checkpoint.pth")
    parser.add_argument("--args-json", type=str,
                        default="checkpoints/reverso_small/args.json")
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--context-length", type=int, default=2048)
    parser.add_argument("--prediction-length", type=int, default=480)
    parser.add_argument("--output-dir", type=str, default="example")
    parser.add_argument("--flip-invariance", action="store_true",
                        help="Average f(x) with -f(-x) for flip invariance")
    args = parser.parse_args()

    # --list short-circuits before any model loading.
    if args.list:
        for key, (label, _) in SIGNALS.items():
            print(f" {key:15s} {label}")
        return

    model, cfg = load_model(args.checkpoint, args.args_json, args.device)
    print(f"Model loaded: {sum(p.numel() for p in model.parameters()):,} params")

    if args.signal == "all":
        selected = list(SIGNALS.items())
    else:
        if args.signal not in SIGNALS:
            print(f"Unknown signal '{args.signal}'. Use --list to see options.")
            return
        selected = [(args.signal, SIGNALS[args.signal])]

    for key, (label, generator) in selected:
        run_one(key, label, generator, model, cfg, args.device,
                args.context_length, args.prediction_length, args.output_dir,
                args.flip_invariance)
134
+
135
+
136
# Script entry point: python example/forecast_demo.py [--signal NAME]
if __name__ == "__main__":
    main()
example/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ matplotlib
2
+ gluonts~=0.15.1
3
+ python-dotenv==1.0.0
figures/gift_eval_pareto_overall.png ADDED

Git LFS Details

  • SHA256: dd1e9d6b26355dd1e773948df13806d844c599b0f9a454c15b02f091b45ad8a6
  • Pointer size: 132 Bytes
  • Size of remote file: 2.08 MB
figures/new_arch.png ADDED

Git LFS Details

  • SHA256: 71d34bc959984b3d89e1c2a83e6842a5a05606401db0d52867e21c70f1f683eb
  • Pointer size: 131 Bytes
  • Size of remote file: 169 kB
pyproject.toml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=77"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "reverso"
7
+ version = "0.1.0"
8
+ description = "Efficient time-series foundation models for zero-shot forecasting"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.11"
12
+ dependencies = [
13
+ "torch>=2.6.0",
14
+ "numpy",
15
+ "pandas",
16
+ "flash-linear-attention",
17
+ ]
18
+
19
+ [project.optional-dependencies]
20
+ examples = [
21
+ "matplotlib",
22
+ "gluonts~=0.15.1",
23
+ "python-dotenv>=1.0.0",
24
+ ]
25
+
26
+ [project.urls]
27
+ Repository = "https://github.com/shinfxh/reverso"
28
+
29
+ [tool.setuptools.packages.find]
30
+ include = ["reverso*"]
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch==2.6.0
2
+ numpy
3
+ pandas
4
+ flash-linear-attention
reverso/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Reverso: Efficient time-series foundation models for zero-shot forecasting."""
2
+
3
+ from reverso.model import Model
4
+ from reverso.forecast import forecast, load_checkpoint, load_model
5
+
6
+ __all__ = ["Model", "forecast", "load_checkpoint", "load_model"]
reverso/forecast.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Autoregressive forecasting utilities for Reverso."""
2
+ import json
3
+ import math
4
+ from types import SimpleNamespace
5
+
6
+ import torch
7
+
8
+ from reverso.model import Model
9
+
10
+
11
def load_checkpoint(model: "Model", checkpoint_path: str, device: str = "cuda"):
    """Load a checkpoint into an existing Reverso model, in place.

    Handles common checkpoint formats (raw state_dict, or dicts keyed by
    "model_state_dict", "state_dict", "model", "ema", "ema_state_dict")
    and strips the "module." prefix left by DDP.

    Args:
        model: Module the weights are loaded into.
        checkpoint_path: Path to a ``torch.save``-produced file.
        device: ``map_location`` passed to ``torch.load``.
    """
    try:
        # Prefer the safe loader: tensor/state-dict checkpoints do not need
        # arbitrary pickle execution (weights_only=True rejects it).
        raw = torch.load(checkpoint_path, map_location=device, weights_only=True)
    except Exception:
        # Fallback for checkpoints that pickle extra objects (optimizer
        # state, argparse namespaces, ...). Only use with trusted files.
        raw = torch.load(checkpoint_path, map_location=device, weights_only=False)
    state_dict = raw
    if isinstance(raw, dict):
        for key in ("model_state_dict", "state_dict", "model", "ema", "ema_state_dict"):
            if key in raw and isinstance(raw[key], dict):
                state_dict = raw[key]
                break
    # DDP wraps parameter names under "module."; strip it so names line up.
    state_dict = {k.removeprefix("module."): v for k, v in state_dict.items()}
    model.load_state_dict(state_dict, strict=True)
27
+
28
+
29
def load_model(checkpoint_path: str, args_json: str, device: str = "cuda"):
    """Load a Reverso model from a checkpoint and config JSON.

    Args:
        checkpoint_path: Path to the saved model weights.
        args_json: Path to the JSON file of training arguments.
        device: Device the model is moved to.

    Returns:
        (model, cfg) tuple, with the model already in eval mode.
    """
    with open(args_json) as fh:
        config = SimpleNamespace(**json.load(fh))

    net = Model(config).to(device)
    load_checkpoint(net, checkpoint_path, device)
    net.eval()
    return net, config
42
+
43
+
44
@torch.no_grad()
def forecast(
    model: "Model",
    context: torch.Tensor,
    prediction_length: int,
    seq_len: int,
    output_token_len: int,
    use_amp: bool = True,
) -> torch.Tensor:
    """Autoregressive multi-step forecast.

    Repeatedly feeds the last ``seq_len`` steps of the (growing) context to
    the model, appends the newest ``output_token_len`` predicted steps, and
    stops once at least ``prediction_length`` steps have been produced.

    Args:
        model: Reverso Model (already on the target device, in eval mode).
        context: Input context tensor of shape (B, L, 1).
        prediction_length: Number of future steps to predict.
        seq_len: Model's context window length (cfg.seq_len).
        output_token_len: Steps produced per model call (cfg.output_token_len).
        use_amp: Whether to use bfloat16 autocast (CUDA only; ignored on CPU).

    Returns:
        Predictions tensor of shape (B, prediction_length, 1).
    """
    device = context.device
    B, _, C = context.shape
    steps = math.ceil(prediction_length / output_token_len)

    batch_ctx = context
    preds = []

    # Decoder time marks are unused by the model but required positionally.
    y_mark = torch.zeros(B, output_token_len, C, device=device, dtype=context.dtype)

    for _ in range(steps):
        x_in = batch_ctx[:, -seq_len:, :]
        x_mark = torch.zeros_like(x_in)

        # torch.cuda.amp.autocast is deprecated; use the device-agnostic
        # torch.amp.autocast (the project pins torch>=2.6).
        if use_amp and device.type == "cuda":
            with torch.amp.autocast("cuda", dtype=torch.bfloat16):
                outputs = model(x_in, x_mark, y_mark)
        else:
            outputs = model(x_in, x_mark, y_mark)

        # Keep only the newest output token and roll it back into the context.
        out_chunk = outputs[:, -output_token_len:, :]
        preds.append(out_chunk)
        batch_ctx = torch.cat([batch_ctx, out_chunk], dim=1)

    # Trim any overshoot from the final step.
    return torch.cat(preds, dim=1)[:, :prediction_length, :]
reverso/model.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Reverso: conv-attention hybrid for time series forecasting.
3
+ """
4
+ import torch
5
+ from torch import nn
6
+ import torch.nn.functional as F
7
+ from flashfftconv import FlashFFTConv
8
+ from fla.layers import DeltaNet
9
+ from typing import Any
10
+
11
+
12
class Gating(nn.Module):
    """Per-channel sigmoid gate from a small temporal conv stack.

    A depthwise conv mixes each channel over a short temporal window, a SiLU
    adds nonlinearity, and a pointwise conv mixes channels; the sigmoid
    squashes the result into (0, 1).
    """

    def __init__(self, channels, temporal_kernel=3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(channels, channels, kernel_size=temporal_kernel,
                      padding=temporal_kernel // 2, groups=channels),
            nn.SiLU(),
            nn.Conv1d(channels, channels, kernel_size=1),
        )

    def forward(self, x):
        """Map (B, C, L) activations to gate values in (0, 1), same shape."""
        gate_logits = self.net(x)
        return gate_logits.sigmoid()
24
+
25
+
26
class MLPBlock(nn.Module):
    """Residual MLP applied over the channel dimension.

    Accepts (B, C, L) channels-first input (transposed internally so the
    linears act on C) or plain 2-D input. With a positive ``d_intermediate``
    the MLP has two layers with a ReLU between them, otherwise one linear.
    The residual is projected only when the width changes.
    """

    def __init__(self, d_in, d_out, d_intermediate=0):
        super().__init__()
        self.norm = nn.LayerNorm(d_out)
        if d_intermediate and d_intermediate > 0:
            self.linear = nn.Linear(d_in, d_intermediate)
            self.linear_final = nn.Linear(d_intermediate, d_out)
        else:
            self.linear = nn.Linear(d_in, d_out)
            self.linear_final = nn.Identity()
        self.activation = nn.ReLU()
        # Identity skip when shapes already match.
        self.skip_linear = nn.Linear(d_in, d_out) if d_in != d_out else nn.Identity()

    def forward(self, x):
        channels_first = x.ndim == 3
        if channels_first:
            x = x.permute(0, 2, 1)  # (B, C, L) -> (B, L, C)
        shortcut = self.skip_linear(x)
        hidden = self.activation(self.linear(x))
        hidden = self.norm(self.linear_final(hidden))
        out = shortcut + hidden
        if channels_first:
            out = out.permute(0, 2, 1)  # back to (B, C, L)
        return out
51
+
52
+
53
class CNNBlock(nn.Module):
    """Residual long-convolution block built on a shared FlashFFTConv.

    Holds one learned kernel per channel spanning the full sequence; the
    input is sigmoid pre-gated before the FFT convolution. The conv input is
    cast to bfloat16 (FlashFFTConv is constructed with that dtype).
    """

    def __init__(self, channels, seq_len, flashfftconv, gating_kernel_size=3):
        super().__init__()
        self.flashfftconv = flashfftconv
        # Full-length per-channel kernel, kept in float32.
        self.k = nn.Parameter(torch.randn(channels, seq_len, dtype=torch.float32))
        self.pregate = Gating(channels, gating_kernel_size)
        self.activation = nn.ReLU()
        self.norm = nn.LayerNorm(channels)

    def forward(self, x):
        shortcut = x
        conv_in = x.contiguous().to(torch.bfloat16)
        # Gate computed in fp32 for stability, then cast to match the conv.
        gate = self.pregate(conv_in.float()).to(conv_in.dtype)
        # The conv API requires a post-gate; all-ones disables it.
        identity_gate = torch.ones_like(conv_in)
        y = self.flashfftconv(conv_in, self.k, pregate=gate, postgate=identity_gate)
        y = self.activation(y)
        # LayerNorm acts on the channel axis: flip to (B, L, C) and back.
        y = self.norm(y.transpose(1, 2)).transpose(1, 2)
        return y + shortcut
74
+
75
+
76
class AttentionBlock(nn.Module):
    """Residual DeltaNet (linear attention) block.

    Input arrives channels-first (B, C, L) from the surrounding conv blocks;
    it is transposed to (B, L, C) for DeltaNet, normed, residually added,
    and transposed back. When ``state_weaving`` is enabled on an
    intermediate layer, the last time step's features are folded into the
    first time step before attention — presumably to leak end-of-sequence
    state forward; confirm intent against the training code.
    """

    def __init__(self, d_model, expand_v, state_weaving=False, is_intermediate=False):
        super().__init__()
        self.state_weaving = state_weaving
        self.is_intermediate = is_intermediate
        # DeltaNet from flash-linear-attention, in chunked (parallel) mode.
        self.attention = DeltaNet(
            mode='chunk',
            d_model=d_model,
            expand_k=1.0,
            expand_v=expand_v,
            num_heads=4,
            use_beta=True,
            use_gate=False,
            use_short_conv=True,
            conv_size=4,
            allow_neg_eigval=False,
            qk_activation='silu',
            qk_norm='l2',
            layer_idx=0,
        )
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        # (B, C, L) -> (B, L, C): DeltaNet consumes sequence-major input.
        x_t = x.transpose(1, 2)
        residual = x_t
        if self.state_weaving and self.is_intermediate:
            # Clone so the in-place add below cannot mutate the residual.
            x_t = x_t.clone()
            x_t[:, 0:1, :] = x_t[:, 0:1, :] + x_t[:, -1:, :]
        attn_out = self.attention(hidden_states=x_t, attention_mask=None)
        # DeltaNet may return (output, aux, ...); keep only the output.
        if isinstance(attn_out, tuple):
            out = attn_out[0]
        else:
            out = attn_out
        out = self.norm(out)
        out = out + residual
        out = out.transpose(1, 2)
        return out
113
+
114
+
115
class Model(nn.Module):
    """
    Reverso: conv-deltanet hybrid for time series forecasting.

    Pipeline: optional per-series min-max scaling, scalar-to-d_model
    embedding, a stack of conv/attention blocks (each followed by a channel
    MLP), then a small cross-attention decode head projecting back to one
    value per output position.
    """
    def __init__(self, configs):
        super().__init__()
        self.seq_len = configs.seq_len
        self.input_token_len = configs.input_token_len
        self.output_token_len = configs.output_token_len
        self.d_model = configs.d_model
        self.use_norm = configs.use_norm

        # Scalar -> d_model embedding: (B, L, 1) -> (B, L, d_model).
        self.embedding = nn.Linear(1, self.d_model, bias=False)
        # One FFT-conv instance shared by every CNNBlock.
        self.shared_flashfftconv = FlashFFTConv(self.seq_len, dtype=torch.bfloat16)

        d_intermediate = configs.d_intermediate
        expand_v = getattr(configs, 'expand_v', 1.0)
        state_weaving = getattr(configs, 'state_weaving', False)
        gating_kernel_size = getattr(configs, 'gating_kernel_size', 3)
        # main_module is a comma-separated layer spec, e.g. "conv,attn,conv".
        module_list = [m.strip() for m in configs.main_module.split(',')]
        e_layers = len(module_list)

        layers = []
        for i, layer_type in enumerate(module_list):
            if layer_type == 'conv':
                layers.append(CNNBlock(
                    self.d_model, self.seq_len, self.shared_flashfftconv, gating_kernel_size,
                ))
            elif layer_type == 'attn':
                # State weaving applies only to layers strictly inside the stack.
                is_intermediate = (i > 0) and (i < e_layers - 1)
                layers.append(AttentionBlock(
                    self.d_model, expand_v, state_weaving, is_intermediate,
                ))
            else:
                raise ValueError(f'Invalid layer type: {layer_type}')
            # Every conv/attn block is followed by a channel MLP.
            layers.append(MLPBlock(self.d_model, self.d_model, d_intermediate))
        self.layers = nn.Sequential(*layers)

        output_bottleneck_dim = getattr(configs, 'output_bottleneck_dim', self.output_token_len)
        # Head projects the temporal axis to the output query positions;
        # assumes the input length equals input_token_len — confirm upstream.
        self.head = nn.Linear(self.input_token_len, output_bottleneck_dim, bias=configs.learn_bias)
        self.simple_q_proj = nn.Linear(self.d_model, self.d_model)
        self.key_proj = nn.Linear(self.d_model, self.d_model)
        self.value_proj = nn.Linear(self.d_model, self.d_model)
        self.out_proj = nn.Linear(self.d_model, 1)

    def forward(self, x, x_mark=None, y_mark=None, **kwargs: Any):
        """Forecast from context ``x`` of shape (B, L, C).

        ``x_mark``/``y_mark`` are accepted for interface compatibility but
        unused here. Returns a tensor of shape (B, output_bottleneck_dim, 1).
        """
        B, L, C = x.shape

        if self.use_norm:
            # Min-max scaling to [0, 1] per series. NOTE(review): the names
            # `means`/`stdev` are misleading — they hold the min and the
            # range, not mean/std; de-normalization below matches this.
            x_min = x.min(1, keepdim=True)[0].detach()
            x_max = x.max(1, keepdim=True)[0].detach()
            x_range = torch.clamp(x_max - x_min, min=1e-5).detach()
            x = (x - x_min) / x_range
            means = x_min
            stdev = x_range

        # (B, L, 1) -> (B, d_model, L): the block stack is channels-first.
        x = self.embedding(x).transpose(1, 2)

        dec_out = self.layers(x)

        # Build queries by projecting the temporal axis down to the output
        # positions, then attend them over the encoded time steps.
        temp_out = self.head(dec_out).permute(0, 2, 1)
        q = self.simple_q_proj(temp_out)

        dec_out_perm = dec_out.permute(0, 2, 1)
        k = self.key_proj(dec_out_perm)
        v = self.value_proj(dec_out_perm)

        attn = F.scaled_dot_product_attention(q, k, v)
        dec_out = self.out_proj(attn)

        if self.use_norm:
            # Undo the min-max scaling (range * y + min).
            dec_out = dec_out * stdev + means

        return dec_out

    def forecast(self, x, x_mark=None, y_mark=None, **kwargs):
        """Alias for :meth:`forward`, kept for external API compatibility."""
        return self.forward(x, x_mark, y_mark, **kwargs)