Wolfvin committed 17 days ago

Commit

2d7e335

verified ·

1 Parent(s): cc1beb2

AAM Diffusion LLM v1.0 — The Body of Aphantasic Abstraction Model

Browse files

Files changed (37) hide show

.gitignore +14 -0
README.md +253 -0
config.json +105 -0
diffusion_llm/README.md +331 -0
diffusion_llm/__init__.py +60 -0
diffusion_llm/config/__init__.py +5 -0
diffusion_llm/config/model_config.py +620 -0
diffusion_llm/data/__init__.py +6 -0
diffusion_llm/data/data_pipeline.py +179 -0
diffusion_llm/data/synthetic_generator.py +427 -0
diffusion_llm/inference/__init__.py +5 -0
diffusion_llm/inference/generator.py +333 -0
diffusion_llm/model/__init__.py +13 -0
diffusion_llm/model/aam_diffusion_model.py +475 -0
diffusion_llm/model/diffusion_transformer.py +394 -0
diffusion_llm/model/graph_encoder.py +553 -0
diffusion_llm/model/noise_scheduler.py +426 -0
diffusion_llm/requirements.txt +17 -0
diffusion_llm/scripts/evaluate.py +157 -0
diffusion_llm/scripts/export.py +71 -0
diffusion_llm/scripts/train.py +168 -0
diffusion_llm/scripts/train_final.py +686 -0
diffusion_llm/scripts/train_minimal.py +260 -0
diffusion_llm/tests/__init__.py +1 -0
diffusion_llm/tests/test_model.py +239 -0
diffusion_llm/tests/test_scheduler.py +98 -0
diffusion_llm/tokenizer/__init__.py +5 -0
diffusion_llm/tokenizer/aam_tokenizer.py +596 -0
diffusion_llm/training/__init__.py +7 -0
diffusion_llm/training/dataset.py +371 -0
diffusion_llm/training/losses.py +127 -0
diffusion_llm/training/trainer.py +420 -0
inference_example.py +38 -0
pytorch_model.bin +3 -0
requirements.txt +2 -0
tokenizer.json +964 -0
training_config.json +28 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,14 @@

+# AAM Diffusion LLM v1.0 — HuggingFace Repository Files
+__pycache__/
+*.pyc
+*.pyo
+*.egg-info/
+dist/
+build/
+*.so
+.env
+output/
+aam-diffusion-v1/
+*.log
+.DS_Store

README.md ADDED Viewed

	@@ -0,0 +1,253 @@

+---
+language:
+- id
+- en
+license: mit
+library_name: pytorch
+tags:
+- diffusion
+- text-generation
+- aam
+- aphantasic-abstraction-model
+- sentence-arrangement
+- graph-conditioned
+- indonesian
+---
+# AAM Diffusion LLM v1.0
+> **"AAM = 1 Pikiran + 1 Tubuh" (1 Mind + 1 Body)**
+The dedicated "body" of the **Aphantasic Abstraction Model (AAM)** — a small diffusion LLM specifically trained to arrange sentences from structured graph data.
+## What is this?
+This is **NOT** a general-purpose LLM. This is a **SPECIALIZED sentence composer** that:
+- Takes **graph-structured conditioning** as input (evidence nodes, anomalies, reasoning chains, confidence scores)
+- Produces **coherent natural language narratives** through iterative denoising (diffusion process)
+- **Cannot hallucinate** — it can only narrate what the graph knows
+### Why Diffusion (Not Autoregressive)?
+1. **Non-sequential generation** — Can revise earlier parts while generating later parts, mirroring how thoughts form: vague intuition → clearer pattern → explicit narrative
+2. **Graph conditioning** — The entire graph structure is encoded as conditioning, not just a text prefix
+3. **Anti-hallucination by design** — Trained exclusively on Graph→Narrative pairs, the model has no capability to generate information outside the graph conditioning
+## Architecture
+```
+┌──────────────────────────────────────────────────────────┐
+│  AAM = 1 Pikiran + 1 Tubuh                                │
+│                                                           │
+│  Pikiran (Mind) = RSVS Knowledge Graph                    │
+│    - Structural memory — perfect recall                    │
+│    - Relational — understands concept connections           │
+│    - Confidence scores — knows certainty levels             │
+│                                                           │
+│  Tubuh (Body) = AAM Diffusion LLM (This Model)            │
+│    ┌─────────────────────────────────────────────┐        │
+│    │  Graph Conditioning Encoder                   │        │
+│    │  ├─ Evidence Node Encoder                     │        │
+│    │  ├─ Composition Encoder                       │        │
+│    │  ├─ Anomaly Encoder                           │        │
+│    │  ├─ Reasoning Chain Encoder                   │        │
+│    │  ├─ Confidence Embedding                      │        │
+│    │  ├─ Temporal Embedding                        │        │
+│    │  └─ Graph Attention Layers                    │        │
+│    │         ↓ (cross-attention keys/values)       │        │
+│    ├─────────────────────────────────────────────┤        │
+│    │  Diffusion Transformer (Denoiser)             │        │
+│    │  ├─ Token Embedding                           │        │
+│    │  ├─ Timestep Embedding (sinusoidal)           │        │
+│    │  ├─ N × TransformerBlock:                     │        │
+│    │  │   ├─ AdaptiveLayerNorm + Self-Attention    │        │
+│    │  │   ├─ AdaptiveLayerNorm + Cross-Attention   │        │
+│    │  │   └─ AdaptiveLayerNorm + Feed-Forward      │        │
+│    │  └─ Output Projection                         │        │
+│    │         ↓ (predicted noise)                   │        │
+│    ├─────────────────────────────────────────────┤        │
+│    │  Noise Scheduler                              │        │
+│    │  ├─ Forward: x_0 + noise → x_t                │        │
+│    │  └─ Reverse: x_t → denoise → x_{t-1}         │        │
+│    └─────────────────────────────────────────────┘        │
+│                                                           │
+│  Training: Graph→Narrative pairs                          │
+│  Inference: Noise → N denoising steps → Narrative         │
+└──────────────────────────────────────────────────────────┘
+```
+## Model Details (v1.0 — Trained)
+| Parameter | Value |
+|-----------|-------|
+| Architecture | Diffusion Transformer with Graph Conditioning |
+| d_model | 64 |
+| n_layers | 2 |
+| n_heads | 4 |
+| d_ff | 128 |
+| **Total Parameters** | **311,670 (311.7K)** |
+| Vocab size | 500 (BPE + special tokens) |
+| Max sequence length | 32 |
+| Diffusion timesteps (train) | 50 |
+| Diffusion timesteps (inference) | 5 |
+| Noise schedule | Cosine |
+| Prediction type | Epsilon (noise prediction) |
+| Sampling method | DDIM |
+> **Note**: This v1.0 model was trained with minimal parameters (311K) for proof-of-concept on CPU. For production use, scale up to the `base` (170M) or `medium` (300M) configurations provided in the framework.
+## Model Sizes (Framework Supports)
+| Size | d_model | Layers | Heads | Params | Recommended For |
+|------|---------|--------|-------|--------|----------------|
+| tiny | 256 | 4 | 4 | ~25M | Quick testing, debugging |
+| small | 512 | 8 | 8 | ~70M | Development, prototyping |
+| **base** | **768** | **12** | **12** | **~170M** | **Recommended for training** |
+| medium | 1024 | 12 | 16 | ~300M | Final training, best quality |
+## Usage
+### Quick Start
+```python
+from diffusion_llm import AamDiffusionModel, AamTokenizer, AamGenerator, AamDiffusionConfig
+# Load model
+config = AamDiffusionConfig.from_json("config.json")
+model = AamDiffusionModel.load("model.pt", device="cpu")
+tokenizer = AamTokenizer.load("tokenizer.json")
+# Create generator
+generator = AamGenerator(model, tokenizer, config)
+# Generate narrative from graph conditioning
+result = generator.generate(
+    trigger="Siapa yang mencuri Snow Plum Pill?",
+    evidence_nodes=["Hefei", "Diancang Five Swords", "Ju Jangmok"],
+    anomalies=["Tidak ada konsumsi pil baru di pasar gelap"],
+    reasoning_steps=["Cross-reference tanggal kejadian", "Deteksi anomali pola"],
+    source_trust=0.85,
+)
+print(result.narrative)
+print(f"Confidence: {result.confidence:.1%}")
+print(f"Steps: {result.n_diffusion_steps}")
+```
+### Training Your Own Model
+```python
+from diffusion_llm import AamDiffusionConfig, AamDiffusionModel, get_default_config
+from diffusion_llm.training import AamTrainer, GraphNarrativeDataset
+from diffusion_llm.data import DataPipeline
+# Get config for your desired size
+config = get_default_config("base")  # 170M params
+# Prepare data pipeline
+pipeline = DataPipeline(config)
+tokenizer, train_loader, val_loader = pipeline.prepare()
+# Create and train model
+model = AamDiffusionModel(config)
+trainer = AamTrainer(config, model, tokenizer, train_loader.dataset, val_loader.dataset)
+trainer.train()
+```
+### Command Line
+```bash
+# Train with default config
+python diffusion_llm/scripts/train.py --model_size base
+# Generate narratives
+python diffusion_llm/scripts/evaluate.py --checkpoint output/best.pt --generate
+# Export model
+python diffusion_llm/scripts/export.py --checkpoint output/best.pt --output model_export/
+```
+## Philosophy
+**AAM = 1 Pikiran + 1 Tubuh (1 Mind + 1 Body)**
+- **Mind** = RSVS Knowledge Graph (structural memory, perfect recall, relational understanding)
+- **Body** = This Diffusion LLM (sentence arranger, graph-conditioned, anti-hallucination)
+Unlike using a rented LLM (GPT, Claude, etc.) as the "body", this model is **specifically trained for AAM**:
+- It **cannot generate** information not present in the graph conditioning
+- It **arranges sentences** based on structured evidence
+- It uses **diffusion** (non-sequential generation) instead of autoregressive generation
+- It is **small** but **specialized** — like Jin Soun's body in the novel, it may be "third-rate" but it's **his own**
+> Jin Soun bukan orang yang menyewa tubuh orang lain untuk berbicara.
+> Dia punya tubuh sendiri — lemah, third-rate, tapi MILIKNYA.
+> Karena tubuhnya khusus dilatih untuk mengeksekusi perintah dari
+> pikirannya (bukan pikiran orang lain), outputnya lebih terarah
+> daripada orang yang punya tubuh lebih kuat tapi pikiran lebih lemah.
+## Framework Structure
+```
+diffusion_llm/
+├── __init__.py                 # Public API
+├── config/
+│   └── model_config.py         # All configuration dataclasses
+├── tokenizer/
+│   └── aam_tokenizer.py        # Sentence-level + BPE hybrid tokenizer
+├── model/
+│   ├── noise_scheduler.py      # Forward/reverse diffusion process
+│   ├── graph_encoder.py        # Graph conditioning encoder
+│   ├── diffusion_transformer.py # Core denoising transformer
+│   └── aam_diffusion_model.py  # Complete model (combines all)
+├── training/
+│   ├── losses.py               # Loss functions (MSE, MAE, Huber, weighted)
+│   ├── dataset.py              # GraphNarrative dataset
+│   └── trainer.py              # Training loop with AMP, EMA, etc.
+├── inference/
+│   └── generator.py            # Inference pipeline
+├── data/
+│   ├── synthetic_generator.py  # Synthetic training data
+│   └── data_pipeline.py        # Data preparation pipeline
+├── scripts/
+│   ├── train.py                # Training entry point
+│   ├── evaluate.py             # Evaluation & generation
+│   └── export.py               # Model export
+└── tests/
+    ├── test_model.py           # Model component tests
+    └── test_scheduler.py       # Noise scheduler tests
+```
+## Training Data Format
+Training data is stored as JSONL, one example per line (the example below is pretty-printed for readability):
+```json
+{
+  "narrative": "Berdasarkan analisis, Diancang Five Swords mencuri Snow Plum Pill.",
+  "trigger": "Siapa yang mencuri Snow Plum Pill?",
+  "evidence_nodes": ["Hefei", "Diancang Five Swords", "Ju Jangmok"],
+  "compositions": [],
+  "confidence_map": {"Hefei": 0.9, "Diancang Five Swords": 0.85},
+  "anomalies": ["Tidak ada konsumsi pil baru di pasar gelap"],
+  "reasoning_steps": ["Cross-reference tanggal kejadian", "Deteksi anomali pola"],
+  "source_trust": 0.85,
+  "language": "id"
+}
+```
+## License
+MIT
+## Citation
+```bibtex
+@software{aam_diffusion_llm_v1,
+  title = {AAM Diffusion LLM: The Body of Aphantasic Abstraction Model},
+  author = {AAM Team},
+  year = {2026},
+  description = {A specialized diffusion LLM for sentence arrangement from graph-structured data},
+  url = {https://huggingface.co/aam-diffusion-v1}
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,105 @@

+{
+  "model": {
+    "d_model": 64,
+    "n_layers": 2,
+    "n_heads": 4,
+    "d_ff": 128,
+    "dropout": 0.1,
+    "activation": "gelu",
+    "max_seq_len": 32,
+    "vocab_size": 500,
+    "pos_encoding_type": "learned",
+    "use_flash_attention": false,
+    "norm_type": "layernorm",
+    "norm_eps": 1e-06,
+    "init_std": 0.02
+  },
+  "diffusion": {
+    "n_timesteps": 50,
+    "n_inference_steps": 5,
+    "schedule_type": "cosine",
+    "beta_start": 0.0001,
+    "beta_end": 0.02,
+    "prediction_type": "epsilon",
+    "sampling_method": "ddim",
+    "eta_ddim": 0.0,
+    "clip_sample_max": 5.0,
+    "clip_sample_min": -5.0,
+    "loss_type": "mse",
+    "loss_weighting": "none",
+    "p2_gamma": 1.0,
+    "p2_k": 1.0
+  },
+  "graph_encoder": {
+    "d_graph": 32,
+    "n_graph_layers": 1,
+    "n_graph_heads": 2,
+    "max_evidence_nodes": 3,
+    "max_compositions": 2,
+    "max_anomalies": 2,
+    "max_reasoning_steps": 2,
+    "conditioning_method": "cross_attention",
+    "embed_confidence": false,
+    "embed_temporal": false
+  },
+  "tokenizer": {
+    "bpe_vocab_size": 500,
+    "max_sentences": 32,
+    "sentence_boundary_token": "<sent>",
+    "pad_token": "<pad>",
+    "bos_token": "<bos>",
+    "eos_token": "<eos>",
+    "mask_token": "<mask>",
+    "noise_token": "<noise>",
+    "evidence_token": "<evidence>",
+    "anomaly_token": "<anomaly>",
+    "confidence_token": "<confidence>",
+    "reasoning_token": "<reasoning>",
+    "composition_token": "<composition>",
+    "temporal_token": "<temporal>",
+    "min_frequency": 2,
+    "dropout_rate": 0.0
+  },
+  "training": {
+    "learning_rate": 0.001,
+    "weight_decay": 0.01,
+    "adam_beta1": 0.9,
+    "adam_beta2": 0.999,
+    "adam_eps": 1e-08,
+    "lr_schedule": "cosine",
+    "warmup_steps": 5,
+    "batch_size": 2,
+    "gradient_accumulation_steps": 4,
+    "max_steps": 50,
+    "max_epochs": 100,
+    "dropout": 0.1,
+    "grad_clip_norm": 1.0,
+    "use_amp": false,
+    "amp_dtype": "bf16",
+    "save_every_steps": 5000,
+    "eval_every_steps": 1000,
+    "keep_last_n_checkpoints": 3,
+    "use_ema": true,
+    "ema_decay": 0.9999,
+    "train_data_path": "",
+    "val_data_path": "",
+    "num_workers": 0,
+    "log_every_steps": 100,
+    "wandb_project": "aam-diffusion-llm",
+    "wandb_run_name": ""
+  },
+  "inference": {
+    "n_steps": 5,
+    "temperature": 1.0,
+    "top_k": 50,
+    "top_p": 0.95,
+    "repetition_penalty": 1.2,
+    "max_output_sentences": 16,
+    "language": "id"
+  },
+  "model_name": "aam-diffusion-v1.0",
+  "output_dir": "./aam-diffusion-v1",
+  "seed": 42,
+  "aam_mind_source": "rsvs_graph",
+  "aam_body_type": "specialized_diffusion"
+}

diffusion_llm/README.md ADDED Viewed

	@@ -0,0 +1,331 @@

+# AAM Diffusion LLM Framework
+> **"AAM = 1 Pikiran + 1 Tubuh" (1 Mind + 1 Body)**
+Framework khusus untuk melatih Diffusion LLM yang menjadi "tubuh" (body) dari Aphantasic Abstraction Model (AAM). Ini BUKAN LLM umum — ini model yang KHUSUS dilatih untuk menyusun kalimat dari data graph yang terstruktur.
+---
+## Filosofi
+### Kenapa Bukan LLM Umum?
+Konsep sebelumnya: "tubuh Jin Soun = LLM umum (GPT, Claude, dll.)" — ini **salah besar**.
+| Aspek | LLM Umum (Sewaan) | AAM Diffusion LLM (Milik Sendiri) |
+|-------|-------------------|-----------------------------------|
+| Input | Prompt teks | Graph conditioning (evidence, anomaly, dll.) |
+| Output | Teks probabilistik | Narrative yang grounded di graph |
+| Hallucination | BISA mengarang | TIDAK BISA — hanya menarasikan apa yang graph ketahui |
+| Tujuan | General purpose | Khusus menyusun kalimat dari graph |
+| Ukuran | 7B-175B params | 100M-500M params |
+| Metode | Autoregressive | Diffusion (non-sequential) |
+| Identitas | Sewaan | MILIK AAM sendiri |
+### Kenapa Diffusion (Bukan Autoregressive)?
+1. **Non-sequential** — Bisa merevisi bagian awal saat generating bagian akhir. Mirip cara Jin Soun membentuk pikiran: vague → clearer → explicit.
+2. **Graph conditioning** — Seluruh graph bisa di-encode sebagai conditioning, bukan hanya prefix. Autoregressive hanya bisa melihat "apa yang sudah di-generate sebelumnya."
+3. **Coherent long-form** — Diffusion menghasilkan teks yang lebih koheren untuk narasi panjang karena setiap token "mengetahui" tentang token lain.
+4. **Anti-hallucination** — Model dilatih KHUSUS untuk Graph→Narrative, tidak punya kapabilitas mengarang informasi di luar graph.
+---
+## Arsitektur
+```
+┌──────────────────────────────────────────────────────────┐
+│  AAM = 1 Pikiran + 1 Tubuh                                │
+│                                                           │
+│  Pikiran (Mind) = RSVS Knowledge Graph                    │
+│    - Structural memory — mengingat SEMUA                  │
+│    - Relational — memahami koneksi antar konsep           │
+│    - Perfect recall — tidak pernah lupa                   │
+│    - Confidence scores — tahu apa yang pasti vs ragu      │
+│                                                           │
+│  Tubuh (Body) = AAM Diffusion LLM                         │
+│    ┌─────────────────────────────────────────────┐        │
+│    │  Graph Conditioning Encoder                   │        │
+│    │  ├─ Evidence Node Encoder                     │        │
+│    │  ├─ Composition Encoder                       │        │
+│    │  ├─ Anomaly Encoder                           │        │
+│    │  ├─ Reasoning Chain Encoder                   │        │
+│    │  ├─ Confidence Embedding                      │        │
+│    │  ├─ Temporal Embedding                        │        │
+│    │  └─ Graph Attention Layers                    │        │
+│    │         ↓ (cross-attention keys/values)       │        │
+│    ├─────────────────────────────────────────────┤        │
+│    │  Diffusion Transformer (Denoiser)             │        │
+│    │  ├─ Token Embedding                           │        │
+│    │  ├─ Timestep Embedding (sinusoidal)           │        │
+│    │  ├─ N × TransformerBlock:                     │        │
+│    │  │   ├─ AdaptiveLayerNorm + Self-Attention    │        │
+│    │  │   ├─ AdaptiveLayerNorm + Cross-Attention   │        │
+│    │  │   └─ AdaptiveLayerNorm + Feed-Forward      │        │
+│    │  └─ Output Projection                         │        │
+│    │         ↓ (predicted noise)                   │        │
+│    ├─────────────────────────────────────────────┤        │
+│    │  Noise Scheduler                              │        │
+│    │  ├─ Forward: x_0 + noise → x_t                │        │
+│    │  └─ Reverse: x_t → denoise → x_{t-1}         │        │
+│    └─────────────────────────────────────────────┘        │
+│                                                           │
+│  Training: Graph→Narrative pairs                          │
+│  Inference: Noise → N denoising steps → Narrative         │
+└──────────────────────────────────────────────────────────┘
+```
+---
+## Struktur Folder
+```
+diffusion_llm/
+├── __init__.py                 # Package init with public API
+├── config/
+│   ├── __init__.py
+│   └── model_config.py         # All configuration dataclasses
+├── tokenizer/
+│   ├── __init__.py
+│   └── aam_tokenizer.py        # Sentence-level + BPE hybrid tokenizer
+├── model/
+│   ├── __init__.py
+│   ├── noise_scheduler.py      # Forward/reverse diffusion process
+│   ├── graph_encoder.py        # Graph conditioning encoder
+│   ├── diffusion_transformer.py # Core denoising transformer
+│   └── aam_diffusion_model.py  # Complete model (combines all)
+├── training/
+│   ├── __init__.py
+│   ├── losses.py               # Loss functions (MSE, MAE, Huber, weighted)
+│   ├── dataset.py              # GraphNarrative dataset
+│   └── trainer.py              # Training loop with AMP, EMA, etc.
+├── inference/
+│   ├── __init__.py
+│   └── generator.py            # Inference pipeline
+├── data/
+│   ├── __init__.py
+│   ├── synthetic_generator.py  # Synthetic training data
+│   └── data_pipeline.py        # Data preparation pipeline
+├── scripts/
+│   ├── train.py                # Training entry point
+│   ├── evaluate.py             # Evaluation & generation
+│   └── export.py               # Model export
+├── tests/
+│   ├── __init__.py
+│   ├── test_scheduler.py       # Noise scheduler tests
+│   └── test_model.py           # Model component tests
+├── requirements.txt            # Python dependencies
+└── README.md                   # This file
+```
+---
+## Quick Start
+### 1. Install Dependencies
+```bash
+pip install torch numpy pytest
+```
+### 2. Generate Synthetic Data
+```python
+from diffusion_llm.data.synthetic_generator import SyntheticDataGenerator
+generator = SyntheticDataGenerator(seed=42, language="id")
+train_path, val_path = generator.generate_training_split(
+    output_dir="./data",
+    n_train=10000,
+    n_val=500,
+)
+```
+### 3. Train the Model
+```bash
+# Quick test with tiny model
+python diffusion_llm/scripts/train.py --model_size tiny --max_steps 100
+# Full training with base model
+python diffusion_llm/scripts/train.py --model_size base --max_steps 500000
+```
+### 4. Generate Narratives
+```bash
+# Generate samples
+python diffusion_llm/scripts/evaluate.py --checkpoint output/best.pt --generate
+# Interactive mode
+python diffusion_llm/scripts/evaluate.py --checkpoint output/best.pt --interactive
+```
+### 5. Programmatic Usage
+```python
+from diffusion_llm import (
+    AamDiffusionConfig, get_default_config,
+    AamDiffusionModel, AamTokenizer, AamGenerator,
+)
+# Load model and tokenizer
+config = AamDiffusionConfig.from_json("output/config.json")
+model = AamDiffusionModel.load("output/best.pt")
+tokenizer = AamTokenizer.load("output/data/tokenizer.json")
+# Create generator
+generator = AamGenerator(model, tokenizer, config)
+# Generate narrative from graph conditioning
+result = generator.generate(
+    trigger="Siapa yang mencuri Snow Plum Pill?",
+    evidence_nodes=["Hefei", "Diancang Five Swords", "Ju Jangmok"],
+    anomalies=["Tidak ada konsumsi pil baru di pasar gelap"],
+    reasoning_steps=["Cross-reference tanggal kejadian", "Deteksi anomali"],
+    source_trust=0.85,
+)
+print(result.narrative)
+print(f"Confidence: {result.confidence:.1%}")
+print(f"Steps: {result.n_diffusion_steps}")
+```
+---
+## Model Sizes
+| Size | d_model | Layers | Heads | Params | Recommended For |
+|------|---------|--------|-------|--------|----------------|
+| tiny | 256 | 4 | 4 | ~25M | Quick testing, debugging |
+| small | 512 | 8 | 8 | ~70M | Development, prototyping |
+| **base** | **768** | **12** | **12** | **~170M** | **Recommended for training** |
+| medium | 1024 | 12 | 16 | ~300M | Final training, best quality |
+---
+## Konfigurasi
+### Model Config
+```python
+from diffusion_llm.config.model_config import AamDiffusionConfig, ModelConfig, DiffusionConfig
+config = AamDiffusionConfig(
+    model=ModelConfig(
+        d_model=768,        # Hidden dimension
+        n_layers=12,        # Transformer blocks
+        n_heads=12,         # Attention heads
+        d_ff=3072,          # Feed-forward dimension
+        vocab_size=32000,   # Vocabulary size
+        max_seq_len=512,    # Maximum sequence length
+    ),
+    diffusion=DiffusionConfig(
+        n_timesteps=1000,   # Training timesteps
+        n_inference_steps=50,  # Inference steps (fewer = faster)
+        schedule_type="cosine",  # Noise schedule
+        prediction_type="epsilon",  # Predict noise
+        sampling_method="ddim",  # Fast deterministic sampling
+    ),
+)
+```
+### Inference Config
+```python
+from diffusion_llm.config.model_config import InferenceConfig
+inference = InferenceConfig(
+    n_steps=50,           # Denoising steps
+    temperature=1.0,      # Sampling temperature
+    top_k=50,             # Top-k sampling
+    max_output_sentences=16,  # Max sentences
+    language="id",        # Output language
+)
+```
+---
+## Integrasi dengan AAM Pipeline
+Framework ini dirancang untuk menjadi "tubuh" dari AAM. Setelah model dilatih,
+integrasi dengan `pipeline.py` sangat mudah:
+```python
+# Dalam pipeline.py, ganti fallback:
+from diffusion_llm import AamDiffusionConfig, AamDiffusionModel, AamTokenizer, AamGenerator
+class AamPipeline:
+    def __init__(self, ...):
+        # Load trained diffusion model
+        diffusion_config = AamDiffusionConfig.from_json("path/to/config.json")
+        diffusion_model = AamDiffusionModel.load("path/to/best.pt")
+        diffusion_tokenizer = AamTokenizer.load("path/to/tokenizer.json")
+        self.diffusion_llm = AamGenerator(diffusion_model, diffusion_tokenizer, diffusion_config)
+```
+---
+## Training Data Format
+Data training dalam format JSONL, satu contoh per baris:
+```json
+{
+  "narrative": "Berdasarkan analisis, Diancang Five Swords mencuri Snow Plum Pill menggunakan Ju Jangmok sebagai kambing hitam.",
+  "trigger": "Siapa yang mencuri Snow Plum Pill?",
+  "evidence_nodes": ["Hefei", "Diancang Five Swords", "Ju Jangmok", "Gyeryong Merchant Guild"],
+  "compositions": [],
+  "confidence_map": {"Hefei": 0.9, "Diancang Five Swords": 0.85, "Ju Jangmok": 0.7},
+  "anomalies": ["Tidak ada konsumsi pil baru di pasar gelap", "Pencuri menghilang tanpa jejak"],
+  "reasoning_steps": ["Cross-reference tanggal kejadian", "Deteksi ketidaksesuaian pola", "Pattern completion dari bukti terpisah"],
+  "source_trust": 0.85,
+  "temporal_context": [],
+  "language": "id",
+  "source": "synthetic"
+}
+```
+---
+## Running Tests
+```bash
+# Run all tests
+cd diffusion_llm
+python -m pytest tests/ -v
+# Run specific test
+python -m pytest tests/test_model.py -v
+# Run with coverage
+python -m pytest tests/ --cov=diffusion_llm
+```
+---
+## Roadmap
+- [x] **Phase 1: Framework Design** — Arsitektur, config, interface
+- [x] **Phase 2: Core Components** — Noise scheduler, transformer, graph encoder, tokenizer
+- [x] **Phase 3: Training Infrastructure** — Trainer, dataset, loss functions, synthetic data
+- [x] **Phase 4: Inference Pipeline** — Generator, batch generation, interactive mode
+- [ ] **Phase 5: Training Execution** — Train on synthetic data, iterate
+- [ ] **Phase 6: Real Data** — Collect real Graph→Narrative pairs from AAM usage
+- [ ] **Phase 7: Optimization** — Quantization, distillation, flash attention
+- [ ] **Phase 8: Integration** — Plug trained model into AAM pipeline
+---
+## Analogi Novel
+> Jin Soun bukan orang yang menyewa tubuh orang lain untuk berbicara.
+> Dia punya tubuh sendiri — lemah, third-rate, tapi MILIKNYA.
+> Karena tubuhnya khusus dilatih untuk mengeksekusi perintah dari
+> pikirannya (bukan pikiran orang lain), outputnya lebih terarah
+> daripada orang yang punya tubuh lebih kuat tapi pikiran lebih lemah.
+>
+> **AAM = 1 pikiran + 1 tubuh. Bukan 1 pikiran + tubuh sewaan.**

diffusion_llm/__init__.py ADDED Viewed

	@@ -0,0 +1,60 @@

+"""
+AAM Diffusion LLM Framework — The Body of Aphantasic Abstraction Model
+"AAM = 1 Pikiran + 1 Tubuh" (1 Mind + 1 Body)
+Pikiran (Mind) = RSVS Knowledge Graph — structural, relational, perfect memory
+Tubuh (Body)  = This Diffusion LLM — generates natural language FROM the graph
+This is NOT a general-purpose LLM. This is a SPECIALIZED sentence composer
+that takes structured graph data as input and produces coherent, evidence-backed
+narrative output. Think of it as a "vocal cord" for the graph — it can only
+say what the graph knows, but it says it fluently.
+Why Diffusion?
+- Diffusion models start from noise and iteratively denoise
+- This mirrors how Jin Soun's thoughts form: from vague intuition ->
+  clearer pattern -> explicit narrative
+- Unlike autoregressive LLMs (GPT), diffusion models can:
+  - Be conditioned on structured input (graph)
+  - Revise earlier parts during generation (non-sequential)
+  - Produce more coherent long-form text from structure
+Architecture:
+  Input: Graph conditioning (evidence nodes, compositions, confidence, anomalies)
+  Process: Iterative denoising from noise
+  Output: Natural language narrative grounded in graph structure
+Analogi: Jin Soun (graph) + tubuhnya (this model).
+Tubuhnya third-rate, tapi karena KHUSUS dilatih untuk
+mengeksekusi perintah dari graph-nya sendiri, outputnya
+lebih terarah daripada LLM umum yang "tidak kenal" graph.
+"""
+__version__ = "0.1.0"
+__author__ = "AAM Team"
+from diffusion_llm.config.model_config import AamDiffusionConfig, get_default_config
+from diffusion_llm.model.noise_scheduler import NoiseScheduler
+from diffusion_llm.model.graph_encoder import GraphConditioningEncoder
+from diffusion_llm.model.diffusion_transformer import DiffusionTransformer
+from diffusion_llm.model.aam_diffusion_model import AamDiffusionModel
+from diffusion_llm.tokenizer.aam_tokenizer import AamTokenizer
+from diffusion_llm.inference.generator import AamGenerator
+from diffusion_llm.training.trainer import AamTrainer
+from diffusion_llm.training.dataset import GraphNarrativeDataset
+from diffusion_llm.data.synthetic_generator import SyntheticDataGenerator
+__all__ = [
+    "AamDiffusionConfig",
+    "get_default_config",
+    "NoiseScheduler",
+    "GraphConditioningEncoder",
+    "DiffusionTransformer",
+    "AamDiffusionModel",
+    "AamTokenizer",
+    "AamGenerator",
+    "AamTrainer",
+    "GraphNarrativeDataset",
+    "SyntheticDataGenerator",
+]

diffusion_llm/config/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""Configuration module for AAM Diffusion LLM."""
+from diffusion_llm.config.model_config import AamDiffusionConfig, get_default_config
+__all__ = ["AamDiffusionConfig", "get_default_config"]

diffusion_llm/config/model_config.py ADDED Viewed

	@@ -0,0 +1,620 @@

+"""
+AAM Diffusion LLM — Model Configuration
+Defines all hyperparameters for the diffusion model architecture,
+training process, and inference pipeline.
+Design Philosophy:
+    - Small model (100M-500M params) — specialized, not general
+    - Sentence-level + BPE hybrid tokenization — AAM arranges sentences,
+      so sentence boundaries are explicit tokens alongside the subword units
+    - Graph-conditioned — the model MUST receive graph structure as input
+    - Non-sequential generation — diffusion, not autoregressive
+Analogi: Seperti tubuh Jin Soun, model ini kecil tapi KHUSUS
+dilatih untuk satu tugas: menarasikan dari graph. Tidak perlu
+7B params kalau tugasku hanya menyusun kalimat dari data yang
+sudah terstruktur.
+"""
+from __future__ import annotations
+import json
+from dataclasses import dataclass, field, asdict
+from pathlib import Path
+from typing import Optional
+@dataclass
+class ModelConfig:
+    """Architecture hyperparameters for the Diffusion Transformer.
+    Target: 100M-500M parameters total.
+    Rule of thumb:
+        params ≈ 12 * n_layers * d_model^2 for the transformer core
+    Quoted sizes:
+        d_model=512, n_layers=8  → ~50M core params
+        d_model=768, n_layers=12 → ~170M core params
+        d_model=1024, n_layers=12 → ~300M core params
+    NOTE(review): the quoted sizes are roughly twice what the rule of
+    thumb yields (e.g. 12 * 8 * 512^2 ≈ 25M); presumably the difference
+    comes from components the rule ignores — confirm against the real model.
+    """
+    # --- Core Transformer ---
+    d_model: int = 768
+    """Hidden dimension of the transformer."""
+    n_layers: int = 12
+    """Number of transformer blocks."""
+    n_heads: int = 12
+    """Number of attention heads (d_model must be divisible by n_heads)."""
+    d_ff: int = 3072
+    """Feed-forward hidden dimension (typically 4x d_model)."""
+    dropout: float = 0.1
+    """Dropout rate for attention and feed-forward layers."""
+    activation: str = "gelu"
+    """Activation function: 'gelu' or 'relu'."""
+    # --- Sequence ---
+    max_seq_len: int = 512
+    """Maximum sequence length (in sentence-level tokens)."""
+    # --- Vocabulary ---
+    vocab_size: int = 32000
+    """Vocabulary size for the tokenizer.
+    Since we use sentence-level tokens + subword BPE hybrid,
+    this includes special tokens + subword units.
+    """
+    # --- Positional Encoding ---
+    pos_encoding_type: str = "rotary"
+    """Positional encoding type: 'rotary' (RoPE) or 'learned'."""
+    # --- Attention ---
+    use_flash_attention: bool = True
+    """Whether to use Flash Attention 2 if available."""
+    # --- Normalization ---
+    norm_type: str = "rmsnorm"
+    """Normalization type: 'rmsnorm' or 'layernorm'."""
+    norm_eps: float = 1e-6
+    """Epsilon for normalization layers."""
+    # --- Initialization ---
+    init_std: float = 0.02
+    """Standard deviation for weight initialization."""
+    def estimate_params(self) -> str:
+        """Return a rough parameter estimate as a short human-readable string.
+        Only the token embedding table and, per layer, the attention
+        projections and feed-forward matrices are counted; nothing else
+        enters the total.
+        Returns:
+            The total with one decimal place and a 'B', 'M' or 'K' suffix
+            (thresholds at 1e9 and 1e6), e.g. '109.5M' for the default values.
+        """
+        # Token embedding table: vocab_size * d_model weights
+        embed_params = self.vocab_size * self.d_model
+        # Per layer: 4 * d_model^2 (Q, K, V and output projections) + 2 * d_model * d_ff (feed-forward up/down)
+        layer_params = 4 * self.d_model ** 2 + 2 * self.d_model * self.d_ff
+        total = embed_params + self.n_layers * layer_params
+        if total >= 1e9:
+            return f"{total / 1e9:.1f}B"
+        elif total >= 1e6:
+            return f"{total / 1e6:.1f}M"
+        else:
+            return f"{total / 1e3:.1f}K"
@dataclass
class DiffusionConfig:
    """Settings that control the diffusion process.

    Diffusion is applied to the latent representation of text:

    1. Forward pass: Gaussian noise is added to text embeddings over T steps.
    2. Reverse pass: the model learns to remove that noise step by step.
    3. Inference: generation starts from pure noise and denoises to text.

    Differences from image diffusion:

    - The process runs in a learned latent space rather than pixel space.
    - Text has discrete structure (sentences rather than pixels).
    - The noise schedule is text-specific.
    """

    # --- Noise Schedule ---
    n_timesteps: int = 1_000
    """Number of diffusion timesteps used during training."""
    n_inference_steps: int = 50
    """Denoising steps at inference; fewer is faster but lower quality."""
    schedule_type: str = "cosine"
    """Noise schedule: 'linear', 'cosine', or 'sigmoid'."""
    beta_start: float = 0.0001
    """First beta of the linear schedule."""
    beta_end: float = 0.02
    """Last beta of the linear schedule."""

    # --- Noise Prediction ---
    prediction_type: str = "epsilon"
    """Model target: 'epsilon' (noise), 'x0' (clean data), or 'v' (velocity).

    Epsilon prediction is described as the most stable choice for text.
    """

    # --- Sampling ---
    sampling_method: str = "ddim"
    """Sampler: 'ddpm' (slow, stochastic) or 'ddim' (fast, deterministic)."""
    eta_ddim: float = 0.0
    """DDIM stochasticity (0 = deterministic, 1 = fully stochastic)."""

    # --- Clipping ---
    clip_sample_max: float = 5.0
    """Upper bound applied to clipped samples during inference."""
    clip_sample_min: float = -5.0
    """Lower bound applied to clipped samples during inference."""

    # --- Loss ---
    loss_type: str = "mse"
    """Loss function: 'mse' (L2), 'mae' (L1), or 'huber'."""
    loss_weighting: str = "min_snr"
    """Loss weighting strategy: 'none', 'min_snr', or 'p2'."""
    p2_gamma: float = 1.0
    """Gamma for P2 weighting (used only when loss_weighting='p2')."""
    p2_k: float = 1.0
    """k for P2 weighting (used only when loss_weighting='p2')."""
@dataclass
class GraphEncoderConfig:
    """Settings for the Graph Conditioning Encoder.

    The encoder consumes structured graph data (evidence nodes, compositions,
    confidence scores, anomalies, reasoning chains) and emits a conditioning
    vector that steers the diffusion process. Conditioning on graph structure,
    rather than only on a text prompt, is what sets this model apart from a
    general LLM.
    """

    # --- Graph Encoder Architecture ---
    d_graph: int = 512
    """Hidden width used for graph encoding."""
    n_graph_layers: int = 4
    """How many graph attention layers to stack."""
    n_graph_heads: int = 8
    """Attention heads used by the graph encoder."""

    # --- Input Dimensions ---
    max_evidence_nodes: int = 50
    """Upper limit on encoded evidence nodes."""
    max_compositions: int = 20
    """Upper limit on encoded compositions."""
    max_anomalies: int = 10
    """Upper limit on encoded anomalies."""
    max_reasoning_steps: int = 15
    """Upper limit on encoded reasoning steps."""

    # --- Conditioning Injection ---
    conditioning_method: str = "cross_attention"
    """How graph conditioning enters the diffusion model.

    'cross_attention': separate encoder, cross-attention in the transformer.
    'ada_ln': adaptive layer norm; conditioning modulates scale/shift.
    'concat': conditioning is concatenated to the input sequence.
    """

    # --- Confidence Embedding ---
    embed_confidence: bool = True
    """Include confidence scores in the conditioning when True."""

    # --- Temporal Embedding ---
    embed_temporal: bool = True
    """Include temporal context (time-based relationships) when True."""
@dataclass
class TokenizerConfig:
    """Settings for the AAM sentence-level tokenizer.

    Standard BPE tokenizers work purely at subword level. This tokenizer is
    built around sentence arrangement instead:

    - The sentence is the primary unit of generation.
    - Inside a sentence, subword BPE handles the individual words.
    - Dedicated special tokens mark graph structure (evidence, anomaly, ...).
    """

    # --- BPE ---
    bpe_vocab_size: int = 28_000
    """Size of the subword BPE vocabulary (part of the total vocab_size)."""

    # --- Sentence-Level ---
    max_sentences: int = 32
    """Upper limit on sentences in a single generation."""
    sentence_boundary_token: str = "<sent>"
    """Marker placed at sentence boundaries."""

    # --- Special Tokens ---
    pad_token: str = "<pad>"
    bos_token: str = "<bos>"
    eos_token: str = "<eos>"
    mask_token: str = "<mask>"
    noise_token: str = "<noise>"

    # --- Graph-Structure Tokens ---
    evidence_token: str = "<evidence>"
    anomaly_token: str = "<anomaly>"
    confidence_token: str = "<confidence>"
    reasoning_token: str = "<reasoning>"
    composition_token: str = "<composition>"
    temporal_token: str = "<temporal>"

    # --- Training ---
    min_frequency: int = 2
    """Minimum frequency required for a BPE merge."""
    dropout_rate: float = 0.0
    """BPE dropout rate; 0 disables this training-time regularization."""
@dataclass
class TrainingConfig:
    """Hyperparameters and run settings for training."""

    # --- Optimizer ---
    learning_rate: float = 0.0001
    """Peak learning rate."""
    weight_decay: float = 0.01
    """AdamW weight decay."""
    adam_beta1: float = 0.9
    """Adam beta1 coefficient."""
    adam_beta2: float = 0.999
    """Adam beta2 coefficient."""
    adam_eps: float = 1e-08
    """Adam epsilon term."""

    # --- Learning Rate Schedule ---
    lr_schedule: str = "cosine"
    """Learning-rate schedule: 'cosine', 'linear', or 'constant'."""
    warmup_steps: int = 2_000
    """Length of the warmup phase, in steps."""

    # --- Training ---
    batch_size: int = 32
    """Per-GPU training batch size."""
    gradient_accumulation_steps: int = 4
    """Accumulation steps (effective batch = batch_size * this value)."""
    max_steps: int = 500_000
    """Upper limit on training steps."""
    max_epochs: int = 100
    """Upper limit on training epochs."""

    # --- Regularization ---
    dropout: float = 0.1
    """Dropout rate applied during training."""
    grad_clip_norm: float = 1.0
    """Maximum norm for gradient clipping."""

    # --- Mixed Precision ---
    use_amp: bool = True
    """Enable Automatic Mixed Precision (fp16/bf16)."""
    amp_dtype: str = "bf16"
    """AMP dtype: 'fp16' or 'bf16'."""

    # --- Checkpointing ---
    save_every_steps: int = 5_000
    """Checkpoint interval, in steps."""
    eval_every_steps: int = 1_000
    """Evaluation interval, in steps."""
    keep_last_n_checkpoints: int = 3
    """How many of the most recent checkpoints to retain."""

    # --- EMA ---
    use_ema: bool = True
    """Maintain an Exponential Moving Average of weights for inference."""
    ema_decay: float = 0.9999
    """Decay rate of the EMA."""

    # --- Data ---
    train_data_path: str = ""
    """Location of the training data (JSONL)."""
    val_data_path: str = ""
    """Location of the validation data (JSONL)."""
    num_workers: int = 4
    """Worker count for data loading."""

    # --- Logging ---
    log_every_steps: int = 100
    """Interval, in steps, between training-metric log lines."""
    wandb_project: str = "aam-diffusion-llm"
    """Weights & Biases project name."""
    wandb_run_name: str = ""
    """Weights & Biases run name; auto-generated when left empty."""
@dataclass
class InferenceConfig:
    """Settings applied at inference time."""

    n_steps: int = 50
    """Denoising step count; more steps give better quality but run slower."""
    temperature: float = 1.0
    """Sampling temperature (1.0 is standard, below 1 is more deterministic)."""
    top_k: int = 50
    """Top-k cutoff used when decoding tokens."""
    top_p: float = 0.95
    """Nucleus (top-p) sampling threshold."""
    repetition_penalty: float = 1.2
    """Penalty applied to repeated tokens."""
    max_output_sentences: int = 16
    """Upper limit on sentences in the output."""
    language: str = "id"
    """Output language: 'id' (Indonesian) or 'en' (English)."""
@dataclass
class AamDiffusionConfig:
    """Master configuration for the AAM Diffusion LLM.

    Bundles every sub-configuration into one object and is the entry point
    for configuring the whole framework. Supports a JSON/dict round trip via
    ``to_dict``/``from_dict`` and ``to_json``/``from_json``.
    """

    model: ModelConfig = field(default_factory=ModelConfig)
    diffusion: DiffusionConfig = field(default_factory=DiffusionConfig)
    graph_encoder: GraphEncoderConfig = field(default_factory=GraphEncoderConfig)
    tokenizer: TokenizerConfig = field(default_factory=TokenizerConfig)
    training: TrainingConfig = field(default_factory=TrainingConfig)
    inference: InferenceConfig = field(default_factory=InferenceConfig)

    # --- Meta ---
    model_name: str = "aam-diffusion-v0.1"
    """Model name for saving/loading."""
    output_dir: str = "./output"
    """Base output directory."""
    seed: int = 42
    """Random seed for reproducibility."""

    # --- AAM Philosophy ---
    aam_mind_source: str = "rsvs_graph"
    """Source of the 'mind' that conditions this 'body'.

    Always 'rsvs_graph' for AAM — the model CANNOT generate
    information not present in the graph conditioning."""
    aam_body_type: str = "specialized_diffusion"
    """Type of the 'body'. Always 'specialized_diffusion' for AAM.

    This is NOT a general LLM — it only arranges sentences
    based on graph-structured evidence."""

    def to_dict(self) -> dict:
        """Return the config, including nested sections, as plain dicts."""
        return asdict(self)

    def to_json(self, path: str | Path) -> None:
        """Write the config to ``path`` as UTF-8 JSON.

        Missing parent directories are created first.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)

    @classmethod
    def from_dict(cls, data: dict) -> AamDiffusionConfig:
        """Build a config from a dictionary shaped like ``to_dict()`` output.

        Args:
            data: Mapping with optional section keys ('model', 'diffusion',
                'graph_encoder', 'tokenizer', 'training', 'inference') and
                optional meta keys. Anything absent keeps its dataclass
                default.

        Returns:
            A new ``AamDiffusionConfig``.

        Raises:
            TypeError: If a section contains a key that is not a field of the
                corresponding sub-config.
        """
        section_types = {
            "model": ModelConfig,
            "diffusion": DiffusionConfig,
            "graph_encoder": GraphEncoderConfig,
            "tokenizer": TokenizerConfig,
            "training": TrainingConfig,
            "inference": InferenceConfig,
        }
        kwargs = {
            name: section_cls(**data.get(name, {}))
            for name, section_cls in section_types.items()
        }
        # Only forward meta keys that are present, so missing ones fall back
        # to the field defaults instead of a second, hand-copied set of
        # literals.
        meta_keys = (
            "model_name",
            "output_dir",
            "seed",
            "aam_mind_source",
            "aam_body_type",
        )
        kwargs.update({key: data[key] for key in meta_keys if key in data})
        return cls(**kwargs)

    @classmethod
    def from_json(cls, path: str | Path) -> AamDiffusionConfig:
        """Load a config from a JSON file written by ``to_json``."""
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return cls.from_dict(data)

    def summary(self) -> str:
        """Return a multi-line, human-readable summary of the configuration.

        Nothing is printed; the caller decides where the text goes.
        """
        m = self.model
        d = self.diffusion
        g = self.graph_encoder
        t = self.training
        rule = "=" * 60
        lines = [
            rule,
            f"  AAM Diffusion LLM Configuration: {self.model_name}",
            rule,
            "",
            "  Model Architecture:",
            f"    d_model={m.d_model}, n_layers={m.n_layers}, n_heads={m.n_heads}",
            f"    d_ff={m.d_ff}, vocab_size={m.vocab_size}",
            f"    max_seq_len={m.max_seq_len}",
            f"    Estimated params: {m.estimate_params()}",
            "",
            "  Diffusion Process:",
            f"    Timesteps (train)={d.n_timesteps}",
            f"    Timesteps (inference)={d.n_inference_steps}",
            f"    Schedule={d.schedule_type}",
            f"    Prediction={d.prediction_type}",
            f"    Sampling={d.sampling_method}",
            "",
            "  Graph Encoder:",
            f"    d_graph={g.d_graph}",
            f"    n_layers={g.n_graph_layers}",
            f"    Conditioning={g.conditioning_method}",
            f"    Max evidence nodes={g.max_evidence_nodes}",
            "",
            "  Training:",
            f"    LR={t.learning_rate}",
            f"    Batch={t.batch_size} x {t.gradient_accumulation_steps} accum",
            f"    Max steps={t.max_steps}",
            f"    AMP={t.use_amp} ({t.amp_dtype})",
            "",
            "  AAM Philosophy:",
            f"    Mind = {self.aam_mind_source} (RSVS Knowledge Graph)",
            f"    Body = {self.aam_body_type} (This Model)",
            "    Identity = 1 Mind + 1 Body (NOT rented LLM)",
            "",
            rule,
        ]
        return "\n".join(lines)
def get_default_config(
    model_size: str = "base",
) -> AamDiffusionConfig:
    """Return the preset configuration for a named model size.

    Only the requested preset is instantiated, and the name is validated
    before any config object is built.

    Args:
        model_size: One of 'tiny', 'small', 'base', 'medium'. The sizes
            below are the nominal targets stated by the presets' author and
            are not re-derived here.
            - tiny:   ~25M params  (for quick testing)
            - small:  ~70M params  (for development)
            - base:   ~170M params (recommended for training)
            - medium: ~300M params (for final training)

    Returns:
        AamDiffusionConfig with the preset's model, graph encoder, diffusion
        and training sections; all other sections keep their defaults.

    Raises:
        ValueError: If ``model_size`` is not a known preset name.
    """
    # Plain keyword tables: cheap to build, and turned into config objects
    # only for the preset that is actually requested.
    presets = {
        "tiny": {
            "model": dict(
                d_model=256,
                n_layers=4,
                n_heads=4,
                d_ff=1024,
                vocab_size=16000,
                max_seq_len=256,
            ),
            "graph_encoder": dict(d_graph=256, n_graph_layers=2, n_graph_heads=4),
            "diffusion": dict(n_timesteps=500, n_inference_steps=20),
            "training": dict(
                batch_size=16,
                learning_rate=3e-4,
                warmup_steps=500,
                max_steps=100000,
            ),
        },
        "small": {
            "model": dict(
                d_model=512,
                n_layers=8,
                n_heads=8,
                d_ff=2048,
                vocab_size=24000,
                max_seq_len=384,
            ),
            "graph_encoder": dict(d_graph=384, n_graph_layers=4, n_graph_heads=8),
            "diffusion": dict(n_timesteps=1000, n_inference_steps=30),
            "training": dict(
                batch_size=24,
                learning_rate=2e-4,
                warmup_steps=1000,
                max_steps=200000,
            ),
        },
        "base": {
            "model": dict(
                d_model=768,
                n_layers=12,
                n_heads=12,
                d_ff=3072,
                vocab_size=32000,
                max_seq_len=512,
            ),
            "graph_encoder": dict(d_graph=512, n_graph_layers=4, n_graph_heads=8),
            "diffusion": dict(n_timesteps=1000, n_inference_steps=50),
            "training": dict(
                batch_size=32,
                learning_rate=1e-4,
                warmup_steps=2000,
                max_steps=500000,
            ),
        },
        "medium": {
            "model": dict(
                d_model=1024,
                n_layers=12,
                n_heads=16,
                d_ff=4096,
                vocab_size=32000,
                max_seq_len=768,
            ),
            "graph_encoder": dict(d_graph=768, n_graph_layers=6, n_graph_heads=12),
            "diffusion": dict(n_timesteps=1000, n_inference_steps=50),
            "training": dict(
                batch_size=16,
                learning_rate=5e-5,
                warmup_steps=5000,
                max_steps=1000000,
            ),
        },
    }
    if model_size not in presets:
        raise ValueError(
            f"Unknown model_size '{model_size}'. "
            f"Choose from: {list(presets.keys())}"
        )
    preset = presets[model_size]
    return AamDiffusionConfig(
        model=ModelConfig(**preset["model"]),
        graph_encoder=GraphEncoderConfig(**preset["graph_encoder"]),
        diffusion=DiffusionConfig(**preset["diffusion"]),
        training=TrainingConfig(**preset["training"]),
        # Every preset is named after its size key.
        model_name=f"aam-diffusion-{model_size}",
    )

diffusion_llm/data/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Data pipeline module for AAM Diffusion LLM."""
+from diffusion_llm.data.synthetic_generator import SyntheticDataGenerator
+from diffusion_llm.data.data_pipeline import DataPipeline
+__all__ = ["SyntheticDataGenerator", "DataPipeline"]

diffusion_llm/data/data_pipeline.py ADDED Viewed

	@@ -0,0 +1,179 @@

+"""
+AAM Diffusion LLM — Data Pipeline
+Orchestrates data preparation: from raw graph data and narratives
+to tokenized, batched training data.
+The pipeline handles:
+1. Loading raw graph→narrative pairs
+2. Generating synthetic data if real data isn't available
+3. Tokenizing all data
+4. Creating train/val splits
+5. Building DataLoaders
+Analogi: Seperti proses persiapan sebelum Jin Soun berlatih —
+mengumpulkan semua kasus, mengorganisirnya, dan menyiapkan
+data latihan yang terstruktur.
+"""
+from __future__ import annotations
+import logging
+from pathlib import Path
+from typing import Optional
+from torch.utils.data import DataLoader
+from diffusion_llm.config.model_config import AamDiffusionConfig
+from diffusion_llm.tokenizer.aam_tokenizer import AamTokenizer
+from diffusion_llm.training.dataset import GraphNarrativeDataset, collate_fn
+from diffusion_llm.data.synthetic_generator import SyntheticDataGenerator
+logger = logging.getLogger(__name__)
class DataPipeline:
    """Data preparation pipeline for AAM Diffusion LLM training.

    Steps performed by :meth:`prepare`:

    1. Check for existing data.
    2. Generate synthetic data if needed.
    3. Train a tokenizer on the data (unless a trained one is supplied).
    4. Create datasets and dataloaders.

    Usage:
        pipeline = DataPipeline(config)
        tokenizer, train_loader, val_loader = pipeline.prepare()
    """

    def __init__(self, config: AamDiffusionConfig):
        """Store ``config`` and create ``<config.output_dir>/data`` if missing."""
        self.config = config
        self.output_dir = Path(config.output_dir) / "data"
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def prepare(
        self,
        tokenizer: Optional[AamTokenizer] = None,
        force_regenerate: bool = False,
    ) -> tuple[AamTokenizer, DataLoader, Optional[DataLoader]]:
        """Prepare all data for training.

        Args:
            tokenizer: Optional pre-trained tokenizer. A new one is trained
                when this is ``None`` or reports ``is_trained`` as false.
            force_regenerate: Regenerate synthetic data even if the
                configured training file exists.

        Returns:
            Tuple of (tokenizer, train_loader, val_loader). ``val_loader`` is
            ``None`` when no validation data is available.
        """
        training_cfg = self.config.training
        graph_cfg = self.config.graph_encoder
        train_path = (
            Path(training_cfg.train_data_path)
            if training_cfg.train_data_path
            else None
        )
        val_path = (
            Path(training_cfg.val_data_path)
            if training_cfg.val_data_path
            else None
        )

        # Step 1: Generate synthetic data if no real data
        train_missing = train_path is None or not train_path.exists()
        if train_missing or force_regenerate:
            if train_path is not None and train_missing:
                # A configured path that does not exist is most likely a
                # mistake; say so instead of silently training on synthetic
                # data.
                logger.warning(
                    "Configured training data not found at %s; "
                    "falling back to synthetic data.",
                    train_path,
                )
            logger.info("Generating synthetic training data...")
            # Replaces both paths, including any configured validation path.
            train_path, val_path = SyntheticDataGenerator.generate_training_split(
                output_dir=self.output_dir,
                n_train=10000,
                n_val=500,
                language=self.config.inference.language,
                seed=self.config.seed,
            )

        # Step 2: Train tokenizer if not provided
        if tokenizer is None or not tokenizer.is_trained:
            logger.info("Training tokenizer...")
            tokenizer = AamTokenizer()
            texts = self._read_texts(train_path)
            tokenizer.train(texts, vocab_size=self.config.tokenizer.bpe_vocab_size)
            tokenizer.save(self.output_dir / "tokenizer.json")
            logger.info(
                "Tokenizer trained and saved. Vocab size: %d", tokenizer.vocab_size
            )

        # Step 3: Create datasets
        logger.info("Creating datasets...")
        dataset_kwargs = dict(
            tokenizer=tokenizer,
            max_seq_len=self.config.model.max_seq_len,
            max_evidence=graph_cfg.max_evidence_nodes,
            max_anomalies=graph_cfg.max_anomalies,
            max_reasoning=graph_cfg.max_reasoning_steps,
        )
        train_dataset = GraphNarrativeDataset(data_path=train_path, **dataset_kwargs)
        val_dataset = None
        if val_path and val_path.exists():
            val_dataset = GraphNarrativeDataset(
                data_path=val_path,
                augment=False,  # No augmentation for validation
                **dataset_kwargs,
            )

        # Step 4: Create dataloaders
        loader_kwargs = dict(
            batch_size=training_cfg.batch_size,
            num_workers=training_cfg.num_workers,
            collate_fn=collate_fn,
            pin_memory=True,
        )
        train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
        val_loader = None
        # Truthiness test kept from the original: a missing dataset yields no
        # loader. NOTE(review): if the dataset defines __len__, an empty one
        # is skipped too — confirm that is intended.
        if val_dataset:
            val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

        logger.info(
            "Data pipeline ready: %d training examples, %s validation examples",
            len(train_dataset),
            len(val_dataset) if val_dataset else 0,
        )
        return tokenizer, train_loader, val_loader

    def _read_texts(self, path: Path) -> list[str]:
        """Read texts from a JSONL file for tokenizer training.

        For each JSON object line, collects the 'narrative' and 'trigger'
        strings and every string in the 'evidence_nodes', 'anomalies' and
        'reasoning_steps' lists, in that order. Blank lines are ignored.
        Lines that are not valid JSON objects, and values that are not
        strings, are skipped and reported in a single warning.

        Args:
            path: Path to JSONL data file.

        Returns:
            List of collected texts; empty if the file does not exist.
        """
        import json

        texts: list[str] = []
        if not path.exists():
            return texts

        bad_lines = 0
        bad_values = 0
        with open(path, "r", encoding="utf-8") as f:
            for raw in f:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    record = json.loads(raw)
                except json.JSONDecodeError:
                    bad_lines += 1
                    continue
                if not isinstance(record, dict):
                    # Valid JSON but not an object (e.g. a bare list).
                    bad_lines += 1
                    continue

                # Collect narratives, triggers and graph texts for a richer
                # tokenizer vocabulary.
                for key in ("narrative", "trigger"):
                    value = record.get(key)
                    if not value:
                        continue
                    if isinstance(value, str):
                        texts.append(value)
                    else:
                        bad_values += 1
                for key in ("evidence_nodes", "anomalies", "reasoning_steps"):
                    items = record.get(key)
                    if items is None:
                        continue
                    if not isinstance(items, (list, tuple)):
                        bad_values += 1
                        continue
                    for item in items:
                        if isinstance(item, str):
                            texts.append(item)
                        else:
                            bad_values += 1

        if bad_lines or bad_values:
            logger.warning(
                "Skipped %d malformed line(s) and %d non-text value(s) in %s",
                bad_lines,
                bad_values,
                path,
            )
        return texts

diffusion_llm/data/synthetic_generator.py ADDED Viewed

	@@ -0,0 +1,427 @@

+"""
+AAM Diffusion LLM — Synthetic Data Generator
+Generates synthetic Graph→Narrative training pairs for
+pre-training the diffusion model before real data is available.
+The synthetic data follows the AAM pattern:
+- Graph conditioning: evidence, compositions, anomalies, reasoning
+- Target narrative: natural language text that represents the graph data
+This is essential because:
+1. We need training data before the model can be used
+2. The data must follow the Graph→Narrative format specifically
+3. Synthetic data helps bootstrap the model's ability to
+   arrange sentences from structured evidence
+Analogi: Seperti Jin Soun berlatih dengan kasus-kasus fiktif
+sebelum menghadapi kasus nyata — data sintetis memberikan
+"latihan dasar" sebelum data asli tersedia.
+"""
+from __future__ import annotations
+import json
+import logging
+import random
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+logger = logging.getLogger(__name__)
+# --- Templates for synthetic data generation ---
# Indonesian narrative templates, grouped by role. The brace-delimited names
# are placeholders (presumably substituted by the generator — confirm there).
ID_TEMPLATES = dict(
    analysis=[
        "Berdasarkan analisis terhadap {trigger}: {evidence_summary}. {reasoning_summary}. Tingkat keyakinan: {confidence_pct}.",
        "Analisis menunjukkan bahwa {trigger} terkait dengan {evidence_summary}. {anomaly_summary}. Kesimpulan: {reasoning_summary}.",
        "Dari data yang tersedia, {trigger} memiliki koneksi ke {evidence_summary}. {reasoning_summary}. Confidence: {confidence_pct}.",
        "Hasil investigasi: {trigger}. Bukti: {evidence_summary}. {anomaly_summary}. {reasoning_summary}.",
        "Temuan: {trigger} berkorelasi dengan {evidence_summary}. Catatan: {anomaly_summary}. Analisis: {reasoning_summary}.",
    ],
    evidence_summary=[
        "bukti menunjukkan {nodes}",
        "data dari {nodes} mengindikasikan",
        "{nodes} menjadi kunci",
        "informasi dari {nodes} mengarah ke",
        "sumber {nodes} mengkonfirmasi",
    ],
    anomaly_summary=[
        "Anomali terdeteksi: {anomalies}",
        "Perhatian: {anomalies}",
        "Pola tidak lazim: {anomalies}",
        "Ketidaksesuaian ditemukan: {anomalies}",
        "Terdapat kejanggalan: {anomalies}",
    ],
    reasoning_summary=[
        "Langkah penalaran: {steps}",
        "Proses deduksi: {steps}",
        "Analisis bertahap: {steps}",
        "Penelusuran logika: {steps}",
        "Rantai penalaran: {steps}",
    ],
)
# English narrative templates, grouped by role. The brace-delimited names
# are placeholders (presumably substituted by the generator — confirm there).
EN_TEMPLATES = dict(
    analysis=[
        "Based on analysis of {trigger}: {evidence_summary}. {reasoning_summary}. Confidence: {confidence_pct}.",
        "Analysis indicates that {trigger} is related to {evidence_summary}. {anomaly_summary}. Conclusion: {reasoning_summary}.",
        "From available data, {trigger} has connections to {evidence_summary}. {reasoning_summary}. Confidence level: {confidence_pct}.",
        "Investigation results: {trigger}. Evidence: {evidence_summary}. {anomaly_summary}. {reasoning_summary}.",
        "Findings: {trigger} correlates with {evidence_summary}. Note: {anomaly_summary}. Analysis: {reasoning_summary}.",
    ],
    evidence_summary=[
        "evidence shows {nodes}",
        "data from {nodes} indicates",
        "{nodes} are key factors",
        "information from {nodes} points to",
        "sources {nodes} confirm",
    ],
    anomaly_summary=[
        "Anomaly detected: {anomalies}",
        "Note: {anomalies}",
        "Unusual pattern: {anomalies}",
        "Inconsistency found: {anomalies}",
        "Irregularity observed: {anomalies}",
    ],
    reasoning_summary=[
        "Reasoning steps: {steps}",
        "Deductive process: {steps}",
        "Step-by-step analysis: {steps}",
        "Logical trace: {steps}",
        "Reasoning chain: {steps}",
    ],
)
# Sample graph data for synthetic generation.
# Evidence node names are the same for both languages; each language key
# still gets its own independent list object.
_EVIDENCE_NODE_NAMES = (
    "Hefei",
    "Diancang Five Swords",
    "Ju Jangmok",
    "Snow Plum Pill",
    "Gyeryong Merchant Guild",
    "Simhyeon Pavilion",
    "Martial Alliance",
    "Gu Ilmu",
    "Jang Hangi",
    "Blood Serpent Dance Step",
    "taeul_sect",
    "dark_faction",
    "hefei_branch",
)
SAMPLE_EVIDENCE_NODES = {
    "id": list(_EVIDENCE_NODE_NAMES),
    "en": list(_EVIDENCE_NODE_NAMES),
}
# Sample trigger prompts (questions and analysis requests), keyed by language.
SAMPLE_TRIGGERS = dict(
    id=[
        "Siapa yang mencuri Snow Plum Pill?",
        "Analisis pergerakan Diancang Five Swords",
        "Hubungan antara Ju Jangmok dan pencurian",
        "Anomali dalam laporan Hefei",
        "Investigasi inside job di Diancang",
        "Pola konsumsi Snow Plum Pill",
        "Cross-reference kejadian di Hefei",
        "Evaluasi kepercayaan sumber informasi",
        "Prediksi tindakan berikutnya tersangka",
        "Pattern completion dari bukti terpisah",
    ],
    en=[
        "Who stole the Snow Plum Pill?",
        "Analysis of Diancang Five Swords movements",
        "Connection between Ju Jangmok and the theft",
        "Anomalies in the Hefei reports",
        "Investigation of inside job at Diancang",
        "Pattern of Snow Plum Pill consumption",
        "Cross-referencing events in Hefei",
        "Source trustworthiness evaluation",
        "Predicting next suspect actions",
        "Pattern completion from disparate evidence",
    ],
)
# Sample anomaly descriptions, keyed by language.
SAMPLE_ANOMALIES = dict(
    id=[
        "Tidak ada konsumsi pil baru di pasar gelap",
        "Pencuri menghilang tanpa jejak",
        "Success rate pair lebih tinggi dari biasanya",
        "Misi di-assign dari dalam Diancang sendiri",
        "Ju Jangmok menghilang hari yang sama dengan pencurian",
        "Tidak ada pencuri baru setelah Ju Jangmok menghilang",
    ],
    en=[
        "No new pill consumption in black market",
        "Thief disappeared without a trace",
        "Pair success rate unusually high",
        "Mission assigned from within Diancang itself",
        "Ju Jangmok disappeared same day as theft",
        "No new thief appeared after Ju Jangmok vanished",
    ],
)
# Sample reasoning-chain steps, keyed by language. Each entry is written as
# "<Label>: <description>".
SAMPLE_REASONING_STEPS = dict(
    id=[
        "Recall: Ingat semua laporan terkait Hefei",
        "Cross-reference: Bandingkan tanggal kejadian",
        "Filter: Eliminasi yang tidak relevan",
        "Anomaly: Deteksi ketidaksesuaian pola",
        "Pattern: Hubungkan fragmen terpisah",
        "Compose: Susun kesimpulan dari bukti",
        "Predict: Perkirakan tindakan berikutnya",
        "Verify: Cek konsistensi kesimpulan",
    ],
    en=[
        "Recall: Remember all reports related to Hefei",
        "Cross-reference: Compare event dates",
        "Filter: Eliminate irrelevant data",
        "Anomaly: Detect pattern inconsistency",
        "Pattern: Connect disparate fragments",
        "Compose: Assemble conclusion from evidence",
        "Predict: Estimate next actions",
        "Verify: Check conclusion consistency",
    ],
)
+class SyntheticDataGenerator:
+    """Generate synthetic Graph→Narrative training pairs.
+    This generator creates training data that follows the AAM
+    pattern: structured graph conditioning → natural language narrative.
+    The generated data covers:
+    - Various trigger types (questions, analysis requests)
+    - Different numbers of evidence nodes (1-50)
+    - Various anomaly patterns
+    - Different reasoning chain lengths
+    - Confidence distributions
+    - Both Indonesian and English
+    Usage:
+        generator = SyntheticDataGenerator()
+        examples = generator.generate(n=1000, language="id")
+        generator.save(examples, "training_data.jsonl")
+    """
+    def __init__(
+        self,
+        seed: int = 42,
+        language: str = "id",
+    ):
+        """Initialize the synthetic data generator.
+        Args:
+            seed: Random seed for reproducibility.
+            language: Default language for generation.
+        """
+        self.seed = seed
+        self.language = language
+        random.seed(seed)
+    def generate(
+        self,
+        n: int = 1000,
+        language: Optional[str] = None,
+        min_evidence: int = 2,
+        max_evidence: int = 15,
+        anomaly_probability: float = 0.6,
+        reasoning_probability: float = 0.8,
+    ) -> list[dict]:
+        """Generate synthetic training examples.
+        Args:
+            n: Number of examples to generate.
+            language: Language override.
+            min_evidence: Minimum evidence nodes per example.
+            max_evidence: Maximum evidence nodes per example.
+            anomaly_probability: Probability of including anomalies.
+            reasoning_probability: Probability of including reasoning steps.
+        Returns:
+            List of training example dictionaries.
+        """
+        lang = language or self.language
+        templates = ID_TEMPLATES if lang == "id" else EN_TEMPLATES
+        evidence_pool = SAMPLE_EVIDENCE_NODES.get(lang, SAMPLE_EVIDENCE_NODES["en"])
+        trigger_pool = SAMPLE_TRIGGERS.get(lang, SAMPLE_TRIGGERS["en"])
+        anomaly_pool = SAMPLE_ANOMALIES.get(lang, SAMPLE_ANOMALIES["en"])
+        reasoning_pool = SAMPLE_REASONING_STEPS.get(lang, SAMPLE_REASONING_STEPS["en"])
+        examples = []
+        for _ in range(n):
+            # Random trigger
+            trigger = random.choice(trigger_pool)
+            # Random evidence nodes
+            n_evidence = random.randint(min_evidence, max_evidence)
+            evidence = random.sample(evidence_pool, min(n_evidence, len(evidence_pool)))
+            # Random confidence map
+            confidence_map = {
+                node: round(random.uniform(0.3, 1.0), 2)
+                for node in evidence
+            }
+            # Random anomalies
+            anomalies = []
+            if random.random() < anomaly_probability:
+                n_anomalies = random.randint(1, 3)
+                anomalies = random.sample(anomaly_pool, min(n_anomalies, len(anomaly_pool)))
+            # Random reasoning steps
+            reasoning_steps = []
+            if random.random() < reasoning_probability:
+                n_steps = random.randint(2, 6)
+                reasoning_steps = random.sample(reasoning_pool, min(n_steps, len(reasoning_pool)))
+            # Source trust
+            source_trust = round(random.uniform(0.5, 1.0), 2)
+            # Generate narrative from template
+            narrative = self._generate_narrative(
+                trigger=trigger,
+                evidence=evidence,
+                anomalies=anomalies,
+                reasoning_steps=reasoning_steps,
+                confidence_map=confidence_map,
+                templates=templates,
+                lang=lang,
+            )
+            example = {
+                "narrative": narrative,
+                "trigger": trigger,
+                "evidence_nodes": evidence,
+                "compositions": [],
+                "confidence_map": confidence_map,
+                "anomalies": anomalies,
+                "reasoning_steps": reasoning_steps,
+                "source_trust": source_trust,
+                "language": lang,
+                "source": "synthetic",
+            }
+            examples.append(example)
+        logger.info("Generated %d synthetic examples (language=%s)", n, lang)
+        return examples
+    def _generate_narrative(
+        self,
+        trigger: str,
+        evidence: list[str],
+        anomalies: list[str],
+        reasoning_steps: list[str],
+        confidence_map: dict[str, float],
+        templates: dict,
+        lang: str,
+    ) -> str:
+        """Generate a narrative from templates.
+        Args:
+            trigger: Trigger text.
+            evidence: Evidence node labels.
+            anomalies: Anomaly descriptions.
+            reasoning_steps: Reasoning step descriptions.
+            confidence_map: Confidence scores.
+            templates: Template dictionary.
+            lang: Language code.
+        Returns:
+            Generated narrative string.
+        """
+        # Build narrative parts
+        evidence_str = ", ".join(evidence[:5])
+        avg_confidence = sum(confidence_map.values()) / max(len(confidence_map), 1)
+        # Fill templates
+        evidence_summary = random.choice(templates["evidence_summary"]).format(
+            nodes=evidence_str
+        )
+        anomaly_summary = ""
+        if anomalies:
+            anomaly_summary = random.choice(templates["anomaly_summary"]).format(
+                anomalies="; ".join(anomalies[:3])
+            )
+        reasoning_summary = ""
+        if reasoning_steps:
+            reasoning_summary = random.choice(templates["reasoning_summary"]).format(
+                steps="; ".join(reasoning_steps[:4])
+            )
+        # Main narrative
+        narrative = random.choice(templates["analysis"]).format(
+            trigger=trigger,
+            evidence_summary=evidence_summary,
+            anomaly_summary=anomaly_summary,
+            reasoning_summary=reasoning_summary,
+            confidence_pct=f"{avg_confidence:.0%}",
+        )
+        return narrative
+    def save(
+        self,
+        examples: list[dict],
+        path: str | Path,
+    ) -> None:
+        """Save examples to JSONL file.
+        Args:
+            examples: List of example dictionaries.
+            path: Output file path.
+        """
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with open(path, "w", encoding="utf-8") as f:
+            for example in examples:
+                f.write(json.dumps(example, ensure_ascii=False) + "\n")
+        logger.info("Saved %d examples to %s", len(examples), path)
+    @classmethod
+    def generate_training_split(
+        cls,
+        output_dir: str | Path,
+        n_train: int = 10000,
+        n_val: int = 500,
+        language: str = "id",
+        seed: int = 42,
+    ) -> tuple[Path, Path]:
+        """Generate and save train/val splits.
+        Args:
+            output_dir: Output directory.
+            n_train: Number of training examples.
+            n_val: Number of validation examples.
+            language: Language for generation.
+            seed: Random seed.
+        Returns:
+            Tuple of (train_path, val_path).
+        """
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        generator = cls(seed=seed, language=language)
+        # Generate training data
+        train_examples = generator.generate(n=n_train, language=language)
+        train_path = output_dir / "train.jsonl"
+        generator.save(train_examples, train_path)
+        # Generate validation data (different seed)
+        val_generator = cls(seed=seed + 1, language=language)
+        val_examples = val_generator.generate(n=n_val, language=language)
+        val_path = output_dir / "val.jsonl"
+        val_generator.save(val_examples, val_path)
+        logger.info(
+            "Generated training split: %d train, %d val",
+            n_train, n_val,
+        )
+        return train_path, val_path

diffusion_llm/inference/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""Inference module for AAM Diffusion LLM."""
+from diffusion_llm.inference.generator import AamGenerator
+__all__ = ["AamGenerator"]

diffusion_llm/inference/generator.py ADDED Viewed

	@@ -0,0 +1,333 @@

+"""
+AAM Diffusion LLM — Inference Generator
+Generates natural language narratives from graph conditioning
+using the trained diffusion model.
+The generation process:
+1. Encode graph conditioning (evidence, anomalies, reasoning)
+2. Start from pure noise in the latent space
+3. Iteratively denoise for N steps
+4. Convert denoised embeddings to token IDs
+5. Detokenize to natural language text
+Analogy: Like Jin Soun finally "speaking" — from hazy
+thoughts (noise) to clear words (denoised narrative).
+Each denoising step is one step closer to
+coherence.
+"""
+from __future__ import annotations
+import logging
+import time
+from dataclasses import dataclass, field
+from typing import Optional
+import torch
+from diffusion_llm.config.model_config import AamDiffusionConfig, InferenceConfig
+from diffusion_llm.model.aam_diffusion_model import AamDiffusionModel
+from diffusion_llm.tokenizer.aam_tokenizer import AamTokenizer
+logger = logging.getLogger(__name__)
+@dataclass
+class GenerationResult:
+    """Result from a generation call.
+    Contains the generated narrative plus metadata about
+    how it was generated, for traceability.
+    """
+    narrative: str
+    """Generated narrative text."""
+    token_ids: list[int] = field(default_factory=list)
+    """Generated token IDs."""
+    n_diffusion_steps: int = 0
+    """Number of denoising steps used."""
+    generation_time_s: float = 0.0
+    """Wall-clock generation time."""
+    model_name: str = ""
+    """Name of the model used."""
+    evidence_used: list[str] = field(default_factory=list)
+    """Evidence nodes that were provided as conditioning."""
+    confidence: float = 0.0
+    """Overall confidence of the generation."""
+    language: str = "id"
+    """Output language."""
+    def to_dict(self) -> dict:
+        """Serialize to dictionary."""
+        return {
+            "narrative": self.narrative,
+            "n_diffusion_steps": self.n_diffusion_steps,
+            "generation_time_s": round(self.generation_time_s, 3),
+            "model_name": self.model_name,
+            "evidence_used": self.evidence_used,
+            "confidence": round(self.confidence, 3),
+            "language": self.language,
+        }
+class AamGenerator:
+    """Generate narratives from graph conditioning using the trained model.
+    This is the main inference interface. It takes graph-structured
+    data (from the RSVS Knowledge Graph) and produces natural
+    language narratives through the diffusion denoising process.
+    Usage:
+        # Load model and tokenizer
+        config = AamDiffusionConfig.from_json("config.json")
+        model = AamDiffusionModel.load("best.pt")
+        tokenizer = AamTokenizer.load("tokenizer.json")
+        # Create generator
+        generator = AamGenerator(model, tokenizer, config)
+        # Generate narrative
+        result = generator.generate(
+            trigger="Siapa yang mencuri Snow Plum Pill?",
+            evidence_nodes=["hefei", "diancang", "ju_jangmok"],
+            anomalies=["no external pill consumption"],
+            reasoning_steps=["Diancang pair was in Hefei before theft"],
+        )
+        print(result.narrative)
+    Args:
+        model: Trained AamDiffusionModel.
+        tokenizer: Trained AamTokenizer.
+        config: AamDiffusionConfig with inference settings.
+    """
+    def __init__(
+        self,
+        model: AamDiffusionModel,
+        tokenizer: AamTokenizer,
+        config: AamDiffusionConfig,
+    ):
+        self.model = model
+        self.tokenizer = tokenizer
+        self.config = config
+        self.inference_config = config.inference
+        # Device
+        self.device = next(model.parameters()).device
+        # Set model to eval mode
+        self.model.eval()
+    @torch.no_grad()
+    def generate(
+        self,
+        trigger: str = "",
+        evidence_nodes: Optional[list[str]] = None,
+        compositions: Optional[list[str]] = None,
+        confidence_map: Optional[dict[str, float]] = None,
+        anomalies: Optional[list[str]] = None,
+        reasoning_steps: Optional[list[str]] = None,
+        source_trust: float = 1.0,
+        n_steps: Optional[int] = None,
+        temperature: Optional[float] = None,
+        language: Optional[str] = None,
+        max_sentences: Optional[int] = None,
+    ) -> GenerationResult:
+        """Generate a narrative from graph conditioning.
+        This is the main generation method. It:
+        1. Tokenizes the graph conditioning data
+        2. Encodes it through the graph encoder
+        3. Starts from noise and iteratively denoises
+        4. Converts the result to text
+        Args:
+            trigger: The trigger question or topic.
+            evidence_nodes: Evidence node descriptions.
+            compositions: Composition descriptions.
+            confidence_map: Node confidence scores.
+            anomalies: Anomaly descriptions.
+            reasoning_steps: Reasoning step descriptions.
+            source_trust: Source trust score.
+            n_steps: Override number of denoising steps.
+            temperature: Override sampling temperature.
+            language: Override output language.
+            max_sentences: Maximum sentences in output.
+        Returns:
+            GenerationResult with the narrative and metadata.
+        """
+        start_time = time.time()
+        # Use config defaults if not overridden
+        n_steps = n_steps or self.inference_config.n_steps
+        temperature = temperature or self.inference_config.temperature
+        language = language or self.inference_config.language
+        max_sentences = max_sentences or self.inference_config.max_output_sentences
+        # --- Step 1: Tokenize graph conditioning ---
+        evidence_ids_tensor = None
+        evidence_conf_tensor = None
+        anomaly_ids_tensor = None
+        anomaly_conf_tensor = None
+        reasoning_ids_tensor = None
+        reasoning_conf_tensor = None
+        if evidence_nodes:
+            evidence_ids_list = []
+            evidence_conf_list = []
+            for node in evidence_nodes[:self.config.graph_encoder.max_evidence_nodes]:
+                ids = self.tokenizer.encode(node, add_special=False)
+                ids = self.tokenizer.pad_sequence(ids, 32)
+                evidence_ids_list.append(ids)
+                conf = (confidence_map or {}).get(node, 0.7)
+                evidence_conf_list.append(conf)
+            while len(evidence_ids_list) < self.config.graph_encoder.max_evidence_nodes:
+                evidence_ids_list.append([0] * 32)
+                evidence_conf_list.append(0.0)
+            evidence_ids_tensor = torch.tensor(
+                [evidence_ids_list], dtype=torch.long, device=self.device
+            )
+            evidence_conf_tensor = torch.tensor(
+                [evidence_conf_list], dtype=torch.float32, device=self.device
+            )
+        if anomalies:
+            anomaly_ids_list = []
+            for anom in anomalies[:self.config.graph_encoder.max_anomalies]:
+                ids = self.tokenizer.encode(anom, add_special=False)
+                ids = self.tokenizer.pad_sequence(ids, 32)
+                anomaly_ids_list.append(ids)
+            while len(anomaly_ids_list) < self.config.graph_encoder.max_anomalies:
+                anomaly_ids_list.append([0] * 32)
+            anomaly_ids_tensor = torch.tensor(
+                [anomaly_ids_list], dtype=torch.long, device=self.device
+            )
+            anomaly_conf_tensor = torch.full(
+                (1, self.config.graph_encoder.max_anomalies),
+                0.6, dtype=torch.float32, device=self.device,
+            )
+        if reasoning_steps:
+            reasoning_ids_list = []
+            for step in reasoning_steps[:self.config.graph_encoder.max_reasoning_steps]:
+                ids = self.tokenizer.encode(step, add_special=False)
+                ids = self.tokenizer.pad_sequence(ids, 32)
+                reasoning_ids_list.append(ids)
+            while len(reasoning_ids_list) < self.config.graph_encoder.max_reasoning_steps:
+                reasoning_ids_list.append([0] * 32)
+            reasoning_ids_tensor = torch.tensor(
+                [reasoning_ids_list], dtype=torch.long, device=self.device
+            )
+            reasoning_conf_tensor = torch.full(
+                (1, self.config.graph_encoder.max_reasoning_steps),
+                0.7, dtype=torch.float32, device=self.device,
+            )
+        source_trust_tensor = torch.tensor(
+            [source_trust], dtype=torch.float32, device=self.device
+        )
+        # --- Step 2: Encode graph conditioning ---
+        graph_cond = self.model.graph_encoder(
+            evidence_ids=evidence_ids_tensor,
+            evidence_confidence=evidence_conf_tensor,
+            anomaly_ids=anomaly_ids_tensor,
+            anomaly_confidence=anomaly_conf_tensor,
+            reasoning_ids=reasoning_ids_tensor,
+            reasoning_confidence=reasoning_conf_tensor,
+            source_trust=source_trust_tensor,
+        )
+        # --- Step 3: Generate via diffusion denoising ---
+        shape = (
+            1,
+            self.config.model.max_seq_len,
+            self.config.model.d_model,
+        )
+        denoised = self.model.sample(
+            graph_cond=graph_cond,
+            n_steps=n_steps,
+            method=self.config.diffusion.sampling_method,
+            shape=shape,
+            device=self.device,
+        )
+        # --- Step 4: Convert to tokens ---
+        token_ids = self.model.embeddings_to_tokens(
+            denoised, temperature=temperature,
+            top_k=self.inference_config.top_k,
+        )
+        # --- Step 5: Detokenize ---
+        token_list = token_ids[0].cpu().tolist()
+        narrative = self.tokenizer.decode(token_list, skip_special=True)
+        # Truncate to max sentences
+        if max_sentences:
+            sentences = self.tokenizer._split_sentences(narrative)
+            if len(sentences) > max_sentences:
+                narrative = ". ".join(sentences[:max_sentences]) + "."
+        generation_time = time.time() - start_time
+        # Compute average confidence
+        avg_confidence = source_trust
+        if confidence_map:
+            avg_confidence = sum(confidence_map.values()) / len(confidence_map)
+        return GenerationResult(
+            narrative=narrative,
+            token_ids=token_list,
+            n_diffusion_steps=n_steps,
+            generation_time_s=generation_time,
+            model_name=self.config.model_name,
+            evidence_used=evidence_nodes or [],
+            confidence=avg_confidence,
+            language=language,
+        )
+    def generate_batch(
+        self,
+        triggers: list[str],
+        evidence_nodes_list: Optional[list[list[str]]] = None,
+        anomalies_list: Optional[list[list[str]]] = None,
+        **kwargs,
+    ) -> list[GenerationResult]:
+        """Generate narratives for multiple triggers.
+        Args:
+            triggers: List of trigger questions.
+            evidence_nodes_list: List of evidence node lists.
+            anomalies_list: List of anomaly lists.
+            **kwargs: Additional arguments passed to generate().
+        Returns:
+            List of GenerationResult objects.
+        """
+        results = []
+        for i, trigger in enumerate(triggers):
+            evidence = evidence_nodes_list[i] if evidence_nodes_list else None
+            anomalies = anomalies_list[i] if anomalies_list else None
+            result = self.generate(
+                trigger=trigger,
+                evidence_nodes=evidence,
+                anomalies=anomalies,
+                **kwargs,
+            )
+            results.append(result)
+        return results

diffusion_llm/model/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+"""Model components for AAM Diffusion LLM."""
+from diffusion_llm.model.noise_scheduler import NoiseScheduler
+from diffusion_llm.model.graph_encoder import GraphConditioningEncoder
+from diffusion_llm.model.diffusion_transformer import DiffusionTransformer
+from diffusion_llm.model.aam_diffusion_model import AamDiffusionModel
+__all__ = [
+    "NoiseScheduler",
+    "GraphConditioningEncoder",
+    "DiffusionTransformer",
+    "AamDiffusionModel",
+]

diffusion_llm/model/aam_diffusion_model.py ADDED Viewed

	@@ -0,0 +1,475 @@

+"""
+AAM Diffusion LLM — Complete Model
+Combines the Diffusion Transformer, Graph Encoder, and Noise Scheduler
+into a single, unified model for training and inference.
+This is the "body" of AAM — the specialized sentence composer that
+takes graph conditioning as input and produces coherent narratives
+through iterative denoising.
+Architecture:
+    ┌──────────────────────────────────────────────────┐
+    │  AAM Diffusion Model (The Body)                   │
+    │                                                   │
+    │  Input:                                           │
+    │    - Token IDs (text)                             │
+    │    - Graph conditioning (evidence, compositions,  │
+    │      confidence, anomalies, reasoning chains)     │
+    │                                                   │
+    │  Training Process:                                │
+    │    1. Tokenize text → embeddings                  │
+    │    2. Sample random timestep t                    │
+    │    3. Add noise: x_t = schedule.add_noise(x_0, t) │
+    │    4. Encode graph conditioning                   │
+    │    5. Predict noise: eps = transformer(x_t, t, c) │
+    │    6. Compute loss: L = MSE(eps, eps_target)      │
+    │                                                   │
+    │  Inference Process:                               │
+    │    1. Start from pure noise x_T                   │
+    │    2. Encode graph conditioning                   │
+    │    3. For t = T, T-1, ..., 1:                     │
+    │       a. Predict noise: eps = transformer(x_t, t) │
+    │       b. Denoise: x_{t-1} = schedule.step(eps)   │
+    │    4. Decode final x_0 → text tokens              │
+    │    5. Detokenize → natural language narrative     │
+    │                                                   │
+    │  Key Constraint:                                  │
+    │    The model CANNOT generate information not       │
+    │    present in the graph conditioning. It can only  │
+    │    ARRANGE what the graph knows into sentences.    │
+    │                                                   │
+    │  Analogi: Jin Soun (mind/graph) + tubuhnya         │
+    │  (this model). Tubuhnya hanya bisa mengucapkan    │
+    │  apa yang dipikirkannya — tidak bisa mengarang.   │
+    └──────────────────────────────────────────────────┘
+Analogy: This is Jin Soun's entire "body" — not only
+the muscles (transformer), but also the nervous system (graph encoder)
+and the ability to repair itself (diffusion denoising).
+"""
+from __future__ import annotations
+import logging
+from typing import Optional
+import torch
+import torch.nn as nn
+from diffusion_llm.config.model_config import AamDiffusionConfig
+from diffusion_llm.model.noise_scheduler import NoiseScheduler
+from diffusion_llm.model.graph_encoder import GraphConditioningEncoder
+from diffusion_llm.model.diffusion_transformer import DiffusionTransformer
+logger = logging.getLogger(__name__)
+class AamDiffusionModel(nn.Module):
+    """Complete AAM Diffusion LLM model.
+    Combines:
+    - DiffusionTransformer: Core denoising network
+    - GraphConditioningEncoder: Encodes graph structure for conditioning
+    - NoiseScheduler: Manages the diffusion process
+    This model is designed to be trained on Graph→Narrative pairs,
+    where the graph data comes from the RSVS Knowledge Graph and
+    the narrative is the target natural language output.
+    Args:
+        config: AamDiffusionConfig with all hyperparameters.
+    """
+    def __init__(self, config: AamDiffusionConfig):
+        """Build the scheduler, graph encoder, transformer and LM head.
+        Args:
+            config: Full model configuration; the ``diffusion``,
+                ``graph_encoder``, ``model`` and ``training`` sections and
+                ``model_name`` are read here.
+        """
+        super().__init__()
+        self.config = config
+        # Core components
+        # NOTE: sub-modules are created in a fixed order; keep it stable,
+        # since construction order determines parameter registration order.
+        self.noise_scheduler = NoiseScheduler(
+            n_timesteps=config.diffusion.n_timesteps,
+            schedule_type=config.diffusion.schedule_type,
+            beta_start=config.diffusion.beta_start,
+            beta_end=config.diffusion.beta_end,
+            prediction_type=config.diffusion.prediction_type,
+        )
+        self.graph_encoder = GraphConditioningEncoder(
+            config=config.graph_encoder,
+            vocab_size=config.model.vocab_size,
+        )
+        # Align graph encoder output dim with transformer's d_model
+        self.graph_encoder.set_output_dim(config.model.d_model)
+        self.transformer = DiffusionTransformer(config.model)
+        # The transformer's token_embedding serves both directions:
+        # encoding input text into embeddings and, through the tied
+        # LM head below, decoding embeddings back to vocabulary logits.
+        # Output head: project from d_model to vocab_size
+        self.lm_head = nn.Linear(
+            config.model.d_model, config.model.vocab_size, bias=False
+        )
+        # Tie weights between token embedding and LM head: both now share
+        # one Parameter, replacing the weight nn.Linear just allocated.
+        # This is standard practice and reduces parameter count.
+        self.lm_head.weight = self.transformer.token_embedding.weight
+        # EMA model slot; starts empty. Presumably populated by the
+        # training loop — not visible from this method, verify in trainer.
+        self._ema_model: Optional[AamDiffusionModel] = None
+        self._ema_decay = config.training.ema_decay
+        logger.info(
+            "AamDiffusionModel initialized: %s params, %s",
+            self._format_params(self.get_num_params()),
+            config.model_name,
+        )
+    def forward(
+        self,
+        token_ids: torch.Tensor,
+        timestep: torch.Tensor,
+        evidence_ids: Optional[torch.Tensor] = None,
+        evidence_confidence: Optional[torch.Tensor] = None,
+        evidence_timestamps: Optional[torch.Tensor] = None,
+        composition_ids: Optional[torch.Tensor] = None,
+        composition_confidence: Optional[torch.Tensor] = None,
+        anomaly_ids: Optional[torch.Tensor] = None,
+        anomaly_confidence: Optional[torch.Tensor] = None,
+        anomaly_timestamps: Optional[torch.Tensor] = None,
+        reasoning_ids: Optional[torch.Tensor] = None,
+        reasoning_confidence: Optional[torch.Tensor] = None,
+        source_trust: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Forward pass for training.
+        1. Get clean embeddings from token IDs
+        2. Sample Gaussian noise and add it at the given timestep
+        3. Encode graph conditioning
+        4. Run the transformer on the noised embeddings
+        5. Return the transformer output together with the sampled noise
+           (loss is computed externally, e.g. via ``compute_loss``)
+        Args:
+            token_ids: Clean text token IDs, shape (batch, seq_len).
+            timestep: Random timestep indices, shape (batch,).
+            evidence_ids: Evidence node token IDs.
+            evidence_confidence: Evidence confidence scores.
+            evidence_timestamps: Evidence timestamps.
+            composition_ids: Composition token IDs.
+            composition_confidence: Composition confidence.
+            anomaly_ids: Anomaly token IDs.
+            anomaly_confidence: Anomaly confidence.
+            anomaly_timestamps: Anomaly timestamps.
+            reasoning_ids: Reasoning step token IDs.
+            reasoning_confidence: Reasoning confidence.
+            source_trust: Source trust score.
+        Returns:
+            Tuple ``(predicted, noise)``: the transformer output and the
+            noise tensor that was added to the clean embeddings. ``noise``
+            has the same shape as the token embeddings.
+        """
+        # Step 1: Get clean embeddings (x_0)
+        x_0 = self.transformer.token_embedding(token_ids)
+        # Step 2: Add noise
+        noise = torch.randn_like(x_0)
+        x_t = self.noise_scheduler.add_noise(x_0, noise, timestep)
+        # Step 3: Encode graph conditioning
+        # batch_size is passed explicitly because every conditioning
+        # tensor is optional and may be None.
+        batch_size = token_ids.shape[0]
+        graph_cond = self.graph_encoder(
+            evidence_ids=evidence_ids,
+            evidence_confidence=evidence_confidence,
+            evidence_timestamps=evidence_timestamps,
+            composition_ids=composition_ids,
+            composition_confidence=composition_confidence,
+            anomaly_ids=anomaly_ids,
+            anomaly_confidence=anomaly_confidence,
+            anomaly_timestamps=anomaly_timestamps,
+            reasoning_ids=reasoning_ids,
+            reasoning_confidence=reasoning_confidence,
+            source_trust=source_trust,
+            batch_size=batch_size,
+        )
+        # Extract cross-attention keys/values from graph conditioning
+        # (.get yields None when the encoder output lacks the entry)
+        graph_keys = graph_cond.get("keys")
+        graph_values = graph_cond.get("values")
+        # Step 4: Predict noise via transformer
+        predicted = self.transformer(
+            x_t=x_t,
+            t=timestep,
+            graph_keys=graph_keys,
+            graph_values=graph_values,
+        )
+        # NOTE(review): the returned target is always the sampled noise,
+        # regardless of the configured prediction_type — confirm callers
+        # derive the x0/v target themselves when not predicting epsilon.
+        return predicted, noise
+    def compute_loss(
+        self,
+        predicted: torch.Tensor,
+        target: torch.Tensor,
+        timestep: torch.Tensor,
+    ) -> torch.Tensor:
+        """Compute diffusion training loss.
+        Supports different loss types and weighting strategies.
+        Args:
+            predicted: Model output (predicted noise/x0/v).
+            target: Target (actual noise/x0/v).
+            timestep: Timestep indices for loss weighting.
+        Returns:
+            Scalar loss value.
+        """
+        # Base loss
+        if self.config.diffusion.loss_type == "mse":
+            loss = nn.functional.mse_loss(predicted, target, reduction="none")
+        elif self.config.diffusion.loss_type == "mae":
+            loss = nn.functional.l1_loss(predicted, target, reduction="none")
+        elif self.config.diffusion.loss_type == "huber":
+            loss = nn.functional.smooth_l1_loss(predicted, target, reduction="none")
+        else:
+            raise ValueError(f"Unknown loss_type: {self.config.diffusion.loss_type}")
+        # Average over feature dimension
+        loss = loss.mean(dim=-1)  # (batch, seq_len)
+        # Apply loss weighting
+        if self.config.diffusion.loss_weighting == "min_snr":
+            loss = self._apply_min_snr_weighting(loss, timestep)
+        elif self.config.diffusion.loss_weighting == "p2":
+            loss = self._apply_p2_weighting(loss, timestep)
+        # Average over sequence and batch
+        return loss.mean()
+    def _apply_min_snr_weighting(
+        self,
+        loss: torch.Tensor,
+        timestep: torch.Tensor,
+        gamma: float = 5.0,
+    ) -> torch.Tensor:
+        """Apply Min-SNR weighting strategy.
+        Weights the loss by min(SNR, gamma) / SNR, where
+        SNR = alpha_bar / (1 - alpha_bar).
+        This helps balance the loss across timesteps, preventing
+        high-noise steps from dominating.
+        Args:
+            loss: Unweighted loss.
+            timestep: Timestep indices.
+            gamma: SNR clipping value.
+        Returns:
+            Weighted loss.
+        """
+        alpha_bar = self.noise_scheduler.alphas_cumprod.to(loss.device)
+        snr = alpha_bar[timestep] / (1 - alpha_bar[timestep] + 1e-8)
+        weight = torch.clamp(snr, max=gamma) / (snr + 1e-8)
+        # Expand weight to match loss shape
+        weight = weight.unsqueeze(-1).expand_as(loss)
+        return loss * weight
+    def _apply_p2_weighting(
+        self,
+        loss: torch.Tensor,
+        timestep: torch.Tensor,
+    ) -> torch.Tensor:
+        """Apply P2 weighting strategy.
+        weight = 1 / (SNR^gamma + k)
+        Args:
+            loss: Unweighted loss.
+            timestep: Timestep indices.
+        Returns:
+            Weighted loss.
+        """
+        alpha_bar = self.noise_scheduler.alphas_cumprod.to(loss.device)
+        snr = alpha_bar[timestep] / (1 - alpha_bar[timestep] + 1e-8)
+        gamma = self.config.diffusion.p2_gamma
+        k = self.config.diffusion.p2_k
+        weight = 1.0 / (snr ** gamma + k)
+        weight = weight.unsqueeze(-1).expand_as(loss)
+        return loss * weight
+    @torch.no_grad()
+    def sample(
+        self,
+        graph_cond: dict[str, torch.Tensor],
+        n_steps: Optional[int] = None,
+        method: str = "ddim",
+        shape: Optional[tuple[int, ...]] = None,
+        device: Optional[torch.device] = None,
+    ) -> torch.Tensor:
+        """Generate samples via iterative denoising.
+        This is the INFERENCE method — start from pure noise and
+        iteratively denoise to produce coherent text embeddings.
+        Args:
+            graph_cond: Graph conditioning dict from GraphConditioningEncoder.
+            n_steps: Number of denoising steps. Uses config if None.
+            method: Sampling method ('ddpm' or 'ddim').
+            shape: Shape of the output (batch, seq_len, d_model).
+            device: Device to generate on.
+        Returns:
+            Denoised embeddings of shape (batch, seq_len, d_model).
+        """
+        if n_steps is None:
+            n_steps = self.config.diffusion.n_inference_steps
+        if device is None:
+            device = next(self.parameters()).device
+        if shape is None:
+            shape = (1, self.config.model.max_seq_len, self.config.model.d_model)
+        # Start from pure noise
+        x = torch.randn(shape, device=device)
+        # Get graph conditioning
+        graph_keys = graph_cond.get("keys")
+        graph_values = graph_cond.get("values")
+        if method == "ddpm":
+            # Full DDPM sampling
+            for t in reversed(range(self.config.diffusion.n_timesteps)):
+                t_tensor = torch.full((shape[0],), t, device=device, dtype=torch.long)
+                predicted = self.transformer(
+                    x_t=x, t=t_tensor,
+                    graph_keys=graph_keys,
+                    graph_values=graph_values,
+                )
+                x = self.noise_scheduler.step_ddpm(predicted, x, t_tensor)
+        elif method == "ddim":
+            # Fast DDIM sampling
+            timesteps = self.noise_scheduler.get_timestep_schedule(n_steps)
+            for i in range(len(timesteps) - 1):
+                t = timesteps[i]
+                t_prev = timesteps[i + 1] if i + 1 < len(timesteps) else 0
+                t_tensor = torch.full((shape[0],), t, device=device, dtype=torch.long)
+                predicted = self.transformer(
+                    x_t=x, t=t_tensor,
+                    graph_keys=graph_keys,
+                    graph_values=graph_values,
+                )
+                x = self.noise_scheduler.step_ddim(
+                    predicted, x, t, t_prev,
+                    eta=self.config.diffusion.eta_ddim,
+                )
+        return x
+    def embeddings_to_tokens(
+        self,
+        embeddings: torch.Tensor,
+        temperature: float = 1.0,
+        top_k: int = 50,
+    ) -> torch.Tensor:
+        """Convert continuous embeddings to discrete token IDs.
+        This is the final step of generation — project embeddings
+        to vocabulary logits and sample tokens.
+        Args:
+            embeddings: Denoised embeddings of shape (batch, seq_len, d_model).
+            temperature: Sampling temperature.
+            top_k: Top-k sampling cutoff.
+        Returns:
+            Token IDs of shape (batch, seq_len).
+        """
+        logits = self.lm_head(embeddings) / temperature
+        # Top-k sampling
+        if top_k > 0:
+            top_k_values, top_k_indices = torch.topk(logits, top_k, dim=-1)
+            probs = torch.softmax(top_k_values, dim=-1)
+            sampled_indices = torch.multinomial(
+                probs.view(-1, top_k), 1
+            ).view(logits.shape[0], logits.shape[1])
+            token_ids = top_k_indices.gather(
+                -1, sampled_indices.unsqueeze(-1)
+            ).squeeze(-1)
+        else:
+            probs = torch.softmax(logits, dim=-1)
+            token_ids = torch.argmax(logits, dim=-1)
+        return token_ids
+    def get_num_params(self) -> int:
+        """Get total number of parameters."""
+        return sum(p.numel() for p in self.parameters())
+    @staticmethod
+    def _format_params(n: int) -> str:
+        """Format parameter count for display."""
+        if n >= 1e9:
+            return f"{n / 1e9:.1f}B"
+        elif n >= 1e6:
+            return f"{n / 1e6:.1f}M"
+        elif n >= 1e3:
+            return f"{n / 1e3:.1f}K"
+        return str(n)
+    def save(self, path: str) -> None:
+        """Save model checkpoint.
+        Args:
+            path: Output file path.
+        """
+        torch.save({
+            "model_state_dict": self.state_dict(),
+            "config": self.config.to_dict(),
+        }, path)
+        logger.info("Model saved to %s", path)
+    @classmethod
+    def load(cls, path: str, device: str = "cpu") -> AamDiffusionModel:
+        """Load model from checkpoint.
+        Args:
+            path: Checkpoint file path.
+            device: Device to load to.
+        Returns:
+            Loaded AamDiffusionModel.
+        """
+        checkpoint = torch.load(path, map_location=device, weights_only=False)
+        config_dict = checkpoint.get("config", {})
+        if isinstance(config_dict, dict):
+            config = AamDiffusionConfig()
+            # Try to reconstruct config from dict
+            try:
+                from diffusion_llm.config.model_config import (
+                    ModelConfig, DiffusionConfig, GraphEncoderConfig,
+                    TokenizerConfig, TrainingConfig, InferenceConfig,
+                )
+                config = AamDiffusionConfig(
+                    model=ModelConfig(**config_dict.get("model", {})),
+                    diffusion=DiffusionConfig(**config_dict.get("diffusion", {})),
+                    graph_encoder=GraphEncoderConfig(**config_dict.get("graph_encoder", {})),
+                    tokenizer=TokenizerConfig(**config_dict.get("tokenizer", {})),
+                    training=TrainingConfig(**config_dict.get("training", {})),
+                    inference=InferenceConfig(**config_dict.get("inference", {})),
+                    model_name=config_dict.get("model_name", "aam-diffusion-v0.1"),
+                    output_dir=config_dict.get("output_dir", "./output"),
+                    seed=config_dict.get("seed", 42),
+                )
+            except Exception:
+                logger.warning("Could not reconstruct config from checkpoint, using defaults")
+        else:
+            config = config_dict
+        model = cls(config)
+        model.load_state_dict(checkpoint["model_state_dict"])
+        model.to(device)
+        logger.info("Model loaded from %s", path)
+        return model

diffusion_llm/model/diffusion_transformer.py ADDED Viewed

	@@ -0,0 +1,394 @@

+"""
+AAM Diffusion LLM — Diffusion Transformer (Denoiser)
+The core denoising network. Takes noisy text embeddings and graph
+conditioning, and predicts the noise (or clean data) at each
+diffusion timestep.
+Architecture:
+    Input: Noisy embeddings x_t + timestep t + graph conditioning
+    Output: Predicted noise epsilon (or x_0 or v)
+The transformer uses:
+    - Self-attention over the text sequence
+    - Cross-attention to graph conditioning (evidence, anomalies, etc.)
+    - Timestep embedding (sinusoidal) injected via adaptive layer norm
+    - Optional flash attention for efficiency
+This is the "brainstem" of the body — the core computation that
+transforms noisy signals into coherent patterns.
+Analogi: Seperti otot Jin Soun yang merespons sinyal dari otak —
+model ini menerima "sinyal noise" dan "instruksi dari graph",
+lalu mengubahnya menjadi gerakan yang koheren (kalimat).
+"""
+from __future__ import annotations
+import math
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusion_llm.config.model_config import ModelConfig
+class SinusoidalTimestepEmbedding(nn.Module):
+    """Sinusoidal embedding for diffusion timesteps.
+    Maps integer timesteps to d_model-dimensional vectors using
+    sinusoidal position encoding, similar to Transformers.
+    This allows the model to know "how noisy" the current input is,
+    which is essential for the denoising process.
+    """
+    def __init__(self, d_model: int, max_period: int = 10000):
+        super().__init__()
+        self.d_model = d_model
+        self.max_period = max_period
+        # Two-layer MLP to project sinusoidal features
+        self.mlp = nn.Sequential(
+            nn.Linear(d_model, d_model * 4),
+            nn.GELU(),
+            nn.Linear(d_model * 4, d_model),
+        )
+    def forward(self, t: torch.Tensor) -> torch.Tensor:
+        """Embed timesteps.
+        Args:
+            t: Timestep indices of shape (batch,).
+        Returns:
+            Timestep embeddings of shape (batch, d_model).
+        """
+        device = t.device
+        half_dim = self.d_model // 2
+        emb = math.log(self.max_period) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=device, dtype=torch.float32) * -emb)
+        emb = t.float().unsqueeze(-1) * emb.unsqueeze(0)
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+        if emb.shape[-1] < self.d_model:
+            emb = F.pad(emb, (0, self.d_model - emb.shape[-1]))
+        return self.mlp(emb)
+class AdaptiveLayerNorm(nn.Module):
+    """Adaptive Layer Normalization conditioned on timestep.
+    Instead of fixed scale/shift parameters, this layer norm
+    uses the timestep embedding to produce scale and shift:
+        y = (1 + scale(t)) * norm(x) + shift(t)
+    This allows the model to behave differently at different
+    noise levels — more "creative" at high noise, more
+    "precise" at low noise.
+    Analogi: Jin Soun menyesuaikan intensitas pikirannya
+    berdasarkan seberapa kabur situasinya — semakin kabur,
+    semakin "kreatif" pendekatannya.
+    """
+    def __init__(self, d_model: int, eps: float = 1e-6):
+        super().__init__()
+        self.norm = nn.LayerNorm(d_model, elementwise_affine=False, eps=eps)
+        self.scale_proj = nn.Linear(d_model, d_model)
+        self.shift_proj = nn.Linear(d_model, d_model)
+        # Initialize shift to zero, scale to one
+        nn.init.zeros_(self.shift_proj.weight)
+        nn.init.zeros_(self.shift_proj.bias)
+        nn.init.ones_(self.scale_proj.weight)
+        nn.init.zeros_(self.scale_proj.bias)
+    def forward(
+        self,
+        x: torch.Tensor,
+        timestep_emb: torch.Tensor,
+    ) -> torch.Tensor:
+        """Apply adaptive layer norm.
+        Args:
+            x: Input tensor of shape (batch, seq_len, d_model).
+            timestep_emb: Timestep embedding of shape (batch, d_model).
+        Returns:
+            Normalized and modulated tensor.
+        """
+        normalized = self.norm(x)
+        scale = (1 + self.scale_proj(timestep_emb)).unsqueeze(1)
+        shift = self.shift_proj(timestep_emb).unsqueeze(1)
+        return normalized * scale + shift
+class TransformerBlock(nn.Module):
+    """Single transformer block with self-attention, cross-attention, and FFN.
+    The block structure:
+    1. Adaptive Layer Norm + Self-Attention
+    2. Adaptive Layer Norm + Cross-Attention (to graph conditioning)
+    3. Adaptive Layer Norm + Feed-Forward Network
+    Each sub-layer has a residual connection.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_heads: int,
+        d_ff: int,
+        dropout: float = 0.1,
+        norm_eps: float = 1e-6,
+        norm_type: str = "rmsnorm",
+        use_flash_attention: bool = True,
+    ):
+        super().__init__()
+        self.d_model = d_model
+        self.n_heads = n_heads
+        # Norms
+        NormClass = nn.RMSNorm if norm_type == "rmsnorm" else nn.LayerNorm
+        # Self-attention
+        self.self_attn_norm = AdaptiveLayerNorm(d_model, eps=norm_eps)
+        self.self_attn = nn.MultiheadAttention(
+            embed_dim=d_model,
+            num_heads=n_heads,
+            dropout=dropout,
+            batch_first=True,
+        )
+        self.self_attn_dropout = nn.Dropout(dropout)
+        # Cross-attention (to graph conditioning)
+        self.cross_attn_norm = AdaptiveLayerNorm(d_model, eps=norm_eps)
+        self.cross_attn = nn.MultiheadAttention(
+            embed_dim=d_model,
+            num_heads=n_heads,
+            dropout=dropout,
+            batch_first=True,
+            kdim=d_model,
+            vdim=d_model,
+        )
+        self.cross_attn_dropout = nn.Dropout(dropout)
+        # Feed-forward
+        self.ff_norm = AdaptiveLayerNorm(d_model, eps=norm_eps)
+        self.ff = nn.Sequential(
+            nn.Linear(d_model, d_ff),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(d_ff, d_model),
+            nn.Dropout(dropout),
+        )
+        # Layer scales (optional, helps with deep networks)
+        self.self_attn_scale = nn.Parameter(torch.ones(1) * 0.1)
+        self.cross_attn_scale = nn.Parameter(torch.ones(1) * 0.1)
+        self.ff_scale = nn.Parameter(torch.ones(1) * 0.1)
+    def forward(
+        self,
+        x: torch.Tensor,
+        timestep_emb: torch.Tensor,
+        graph_keys: Optional[torch.Tensor] = None,
+        graph_values: Optional[torch.Tensor] = None,
+        causal_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Forward pass.
+        Args:
+            x: Input sequence of shape (batch, seq_len, d_model).
+            timestep_emb: Timestep embedding of shape (batch, d_model).
+            graph_keys: Graph conditioning keys for cross-attention,
+                shape (batch, n_graph_nodes, d_model).
+            graph_values: Graph conditioning values for cross-attention,
+                shape (batch, n_graph_nodes, d_model).
+            causal_mask: Optional causal mask for self-attention.
+        Returns:
+            Output sequence of shape (batch, seq_len, d_model).
+        """
+        # 1. Self-attention with adaptive layer norm
+        normed = self.self_attn_norm(x, timestep_emb)
+        attn_out, _ = self.self_attn(
+            normed, normed, normed,
+            attn_mask=causal_mask,
+            need_weights=False,
+        )
+        x = x + self.self_attn_scale * self.self_attn_dropout(attn_out)
+        # 2. Cross-attention to graph conditioning (if available)
+        if graph_keys is not None and graph_values is not None:
+            normed = self.cross_attn_norm(x, timestep_emb)
+            cross_out, _ = self.cross_attn(
+                normed, graph_keys, graph_values,
+                need_weights=False,
+            )
+            x = x + self.cross_attn_scale * self.cross_attn_dropout(cross_out)
+        # 3. Feed-forward with adaptive layer norm
+        normed = self.ff_norm(x, timestep_emb)
+        ff_out = self.ff(normed)
+        x = x + self.ff_scale * ff_out
+        return x
+class DiffusionTransformer(nn.Module):
+    """Diffusion Transformer — the core denoising network for AAM.
+    This transformer takes:
+    - Noisy text embeddings (x_t)
+    - Diffusion timestep (t)
+    - Graph conditioning (evidence, anomalies, reasoning chains)
+    And predicts the noise that was added (or the clean data,
+    depending on prediction_type).
+    Architecture Overview:
+    ┌────────────────────────────────────────────────┐
+    │  Input Embedding: x_t (noisy) → embedding     │
+    │  + Positional Encoding (RoPE or learned)       │
+    │                                                │
+    │  N x TransformerBlock:                         │
+    │    ├─ AdaLN + Self-Attention                   │
+    │    ├─ AdaLN + Cross-Attention (to graph)       │
+    │    └─ AdaLN + Feed-Forward                     │
+    │                                                │
+    │  Output Projection: → predicted noise          │
+    └────────────────────────────────────────────────┘
+    Key Features:
+    - Adaptive Layer Norm: timestep-conditioned normalization
+    - Cross-Attention: graph conditioning guides generation
+    - Layer Scales: helps training deep networks
+    - RoPE: better length generalization than learned positions
+    Args:
+        config: ModelConfig with architecture hyperparameters.
+    """
+    def __init__(self, config: ModelConfig):
+        super().__init__()
+        self.config = config
+        # Input embedding (from token IDs to d_model)
+        self.token_embedding = nn.Embedding(config.vocab_size, config.d_model)
+        # Timestep embedding
+        self.timestep_embedding = SinusoidalTimestepEmbedding(config.d_model)
+        # Positional encoding
+        if config.pos_encoding_type == "learned":
+            self.position_embedding = nn.Embedding(
+                config.max_seq_len, config.d_model
+            )
+        else:
+            # RoPE is applied inside attention (no separate embedding)
+            self.position_embedding = None
+        # Transformer blocks
+        self.blocks = nn.ModuleList([
+            TransformerBlock(
+                d_model=config.d_model,
+                n_heads=config.n_heads,
+                d_ff=config.d_ff,
+                dropout=config.dropout,
+                norm_eps=config.norm_eps,
+                norm_type=config.norm_type,
+                use_flash_attention=config.use_flash_attention,
+            )
+            for _ in range(config.n_layers)
+        ])
+        # Final norm
+        NormClass = nn.RMSNorm if config.norm_type == "rmsnorm" else nn.LayerNorm
+        self.final_norm = NormClass(config.d_model, eps=config.norm_eps)
+        # Output projection (predict noise/x0/v)
+        self.output_proj = nn.Linear(config.d_model, config.d_model)
+        # Initialize weights
+        self.apply(self._init_weights)
+    def _init_weights(self, module: nn.Module) -> None:
+        """Initialize weights with Xavier/GPT-2 style."""
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std)
+    def forward(
+        self,
+        x_t: torch.Tensor,
+        t: torch.Tensor,
+        token_ids: Optional[torch.Tensor] = None,
+        graph_keys: Optional[torch.Tensor] = None,
+        graph_values: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Forward pass: predict noise given noisy input and timestep.
+        Args:
+            x_t: Noisy text embeddings of shape (batch, seq_len, d_model).
+                If None, token_ids must be provided.
+            t: Timestep indices of shape (batch,).
+            token_ids: Token IDs of shape (batch, seq_len).
+                Used to create embeddings if x_t is not provided directly.
+                In training, x_t comes from the noise scheduler.
+            graph_keys: Graph conditioning keys for cross-attention,
+                shape (batch, n_graph_nodes, d_model).
+            graph_values: Graph conditioning values for cross-attention,
+                shape (batch, n_graph_nodes, d_model).
+        Returns:
+            Predicted noise of shape (batch, seq_len, d_model).
+        """
+        # Get input embeddings
+        if x_t is None and token_ids is not None:
+            # Create embeddings from token IDs (used for initial x_0)
+            h = self.token_embedding(token_ids)
+        elif x_t is not None:
+            h = x_t
+        else:
+            raise ValueError("Either x_t or token_ids must be provided")
+        # Add positional encoding
+        if self.position_embedding is not None:
+            seq_len = h.shape[1]
+            positions = torch.arange(seq_len, device=h.device).unsqueeze(0)
+            h = h + self.position_embedding(positions)
+        # Embed timestep
+        t_emb = self.timestep_embedding(t)
+        # Pass through transformer blocks
+        for block in self.blocks:
+            h = block(
+                h,
+                timestep_emb=t_emb,
+                graph_keys=graph_keys,
+                graph_values=graph_values,
+            )
+        # Final norm and projection
+        h = self.final_norm(h)
+        output = self.output_proj(h)
+        return output
+    def get_num_params(self) -> int:
+        """Get total number of parameters."""
+        return sum(p.numel() for p in self.parameters())
+    def get_num_trainable_params(self) -> int:
+        """Get number of trainable parameters."""
+        return sum(p.numel() for p in self.parameters() if p.requires_grad)

diffusion_llm/model/graph_encoder.py ADDED Viewed

	@@ -0,0 +1,553 @@

+"""
+AAM Diffusion LLM — Graph Conditioning Encoder
+Encodes structured graph data into a conditioning vector that guides
+the diffusion process. This is the KEY differentiator from general LLMs:
+the model is conditioned on GRAPH STRUCTURE, not just text prompts.
+The graph encoder takes:
+    - Evidence nodes (what the graph knows)
+    - Compositions (how concepts compose)
+    - Confidence scores (how sure the graph is)
+    - Anomalies (what doesn't fit)
+    - Reasoning chains (how the graph reached conclusions)
+    - Temporal context (when events happened)
+And produces a conditioning representation that the diffusion model
+uses to guide denoising.
+Analogi: Seperti otak Jin Soun mengirimkan sinyal ke pita suaranya —
+graph memberi "tahu" apa yang harus dikatakan, dan encoder ini
+menerjemahkan "pengetahuan graph" menjadi "instruksi untuk tubuh".
+"""
+from __future__ import annotations
+import math
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusion_llm.config.model_config import GraphEncoderConfig
+class ConfidenceEmbedding(nn.Module):
+    """Embed confidence scores as continuous values.
+    Maps [0, 1] confidence scores to d_graph-dimensional vectors
+    using sinusoidal encoding for smooth interpolation.
+    Analogi: Jin Soun tahu bedanya "aku yakin 100%" vs "mungkin 60%"
+    — encoding ini mengajarkan model membedakan juga.
+    """
+    def __init__(self, d_graph: int):
+        super().__init__()
+        self.d_graph = d_graph
+        # Learnable projection from scalar to d_graph
+        self.projection = nn.Sequential(
+            nn.Linear(1, d_graph // 4),
+            nn.GELU(),
+            nn.Linear(d_graph // 4, d_graph),
+        )
+    def forward(self, confidence: torch.Tensor) -> torch.Tensor:
+        """Embed confidence scores.
+        Args:
+            confidence: Tensor of shape (..., 1) with values in [0, 1].
+        Returns:
+            Tensor of shape (..., d_graph).
+        """
+        if confidence.dim() == 0:
+            confidence = confidence.unsqueeze(0)
+        if confidence.dim() == 1:
+            confidence = confidence.unsqueeze(-1)
+        return self.projection(confidence)
+class TemporalEmbedding(nn.Module):
+    """Embed temporal context as position-aware vectors.
+    Uses sinusoidal positional encoding adapted for timestamps,
+    allowing the model to understand time-based relationships.
+    Analogi: Jin Soun mengingat bahwa "kejadian A terjadi 3 hari
+    sebelum kejadian B" — temporal embedding mengajarkan model
+    memahami hubungan waktu antar kejadian.
+    """
+    def __init__(self, d_graph: int, max_period: int = 10000):
+        super().__init__()
+        self.d_graph = d_graph
+        self.max_period = max_period
+        self.projection = nn.Sequential(
+            nn.Linear(d_graph, d_graph),
+            nn.GELU(),
+            nn.Linear(d_graph, d_graph),
+        )
+    def forward(self, timestamps: torch.Tensor) -> torch.Tensor:
+        """Embed timestamps.
+        Args:
+            timestamps: Tensor of shape (batch, n_events) with normalized
+                timestamps (0 = earliest, 1 = latest).
+        Returns:
+            Tensor of shape (batch, n_events, d_graph).
+        """
+        batch_size, n_events = timestamps.shape
+        device = timestamps.device
+        # Sinusoidal encoding
+        half_dim = self.d_graph // 2
+        emb = math.log(self.max_period) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=device, dtype=torch.float32) * -emb)
+        emb = timestamps.float().unsqueeze(-1) * emb.unsqueeze(0).unsqueeze(0)
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+        if emb.shape[-1] < self.d_graph:
+            # Pad if d_graph is odd
+            emb = F.pad(emb, (0, self.d_graph - emb.shape[-1]))
+        return self.projection(emb)
+class NodeEncoder(nn.Module):
+    """Encode a single evidence node or composition.
+    Each node is represented as:
+    - Text embedding (from the tokenizer's vocabulary)
+    - Confidence score
+    - Optional temporal context
+    - Source trust score
+    These are combined into a single d_graph-dimensional vector.
+    """
+    def __init__(
+        self,
+        d_graph: int,
+        vocab_size: int = 32000,
+        embed_confidence: bool = True,
+        embed_temporal: bool = True,
+    ):
+        super().__init__()
+        self.d_graph = d_graph
+        # Text embedding (will be shared with the main model)
+        self.text_embed = nn.Embedding(vocab_size, d_graph)
+        # Confidence embedding
+        self.use_confidence = embed_confidence
+        if embed_confidence:
+            self.conf_embed = ConfidenceEmbedding(d_graph)
+        # Temporal embedding
+        self.use_temporal = embed_temporal
+        if embed_temporal:
+            self.temporal_embed = TemporalEmbedding(d_graph)
+        # Fusion layer — always build for max possible inputs
+        # At runtime, we may have fewer (e.g., no temporal data provided),
+        # so we use a flexible approach: always concatenate all available
+        # embeddings and project through a layer that handles the max size.
+        self._n_max_inputs = 1 + int(embed_confidence) + int(embed_temporal)
+        self.fusion = nn.Sequential(
+            nn.Linear(d_graph * self._n_max_inputs, d_graph),
+            nn.GELU(),
+            nn.LayerNorm(d_graph),
+        )
+    def forward(
+        self,
+        token_ids: torch.Tensor,
+        confidence: Optional[torch.Tensor] = None,
+        timestamps: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Encode a batch of evidence nodes.
+        Args:
+            token_ids: Token IDs of shape (batch, n_nodes, seq_len).
+            confidence: Confidence scores of shape (batch, n_nodes).
+            timestamps: Timestamps of shape (batch, n_nodes).
+        Returns:
+            Encoded nodes of shape (batch, n_nodes, d_graph).
+        """
+        # Text embedding: mean pool over sequence length
+        text_emb = self.text_embed(token_ids).mean(dim=-2)  # (batch, n_nodes, d_graph)
+        embeddings = [text_emb]
+        if self.use_confidence:
+            if confidence is not None:
+                conf_emb = self.conf_embed(confidence.unsqueeze(-1))  # (batch, n_nodes, d_graph)
+                embeddings.append(conf_emb)
+            else:
+                # Zero-pad to maintain consistent dimension
+                embeddings.append(torch.zeros_like(text_emb))
+        if self.use_temporal:
+            if timestamps is not None:
+                temp_emb = self.temporal_embed(timestamps)  # (batch, n_nodes, d_graph)
+                embeddings.append(temp_emb)
+            else:
+                embeddings.append(torch.zeros_like(text_emb))
+        # Fuse all embeddings
+        combined = torch.cat(embeddings, dim=-1)
+        return self.fusion(combined)
+class GraphAttentionLayer(nn.Module):
+    """Multi-head attention layer for graph-structured data.
+    Unlike standard self-attention, this operates on graph nodes
+    where edges represent structural relationships (compositions,
+    evidence links, temporal connections).
+    For now, we use standard multi-head attention over the node
+    sequence, as the structural information is already encoded
+    in the node features. Future versions can incorporate explicit
+    edge structure via graph attention networks (GAT).
+    """
+    def __init__(self, d_graph: int, n_heads: int, dropout: float = 0.1):
+        super().__init__()
+        self.attention = nn.MultiheadAttention(
+            embed_dim=d_graph,
+            num_heads=n_heads,
+            dropout=dropout,
+            batch_first=True,
+        )
+        self.norm = nn.LayerNorm(d_graph)
+        self.ff = nn.Sequential(
+            nn.Linear(d_graph, d_graph * 4),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(d_graph * 4, d_graph),
+            nn.Dropout(dropout),
+        )
+        self.norm_ff = nn.LayerNorm(d_graph)
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Forward pass.
+        Args:
+            x: Node features of shape (batch, n_nodes, d_graph).
+            mask: Optional attention mask.
+        Returns:
+            Updated node features of same shape.
+        """
+        # Self-attention with residual
+        attn_out, _ = self.attention(x, x, x, attn_mask=mask)
+        x = self.norm(x + attn_out)
+        # Feed-forward with residual
+        ff_out = self.ff(x)
+        x = self.norm_ff(x + ff_out)
+        return x
+class GraphConditioningEncoder(nn.Module):
+    """Encode graph-structured conditioning data for the diffusion model.
+    This encoder takes structured data from the RSVS Knowledge Graph
+    and produces conditioning vectors that guide the diffusion process.
+    The encoding process:
+    1. Encode each evidence node (text + confidence + temporal)
+    2. Encode compositions (how concepts relate)
+    3. Encode anomalies (what doesn't fit)
+    4. Encode reasoning chain (step-by-step logic)
+    5. Aggregate via graph attention layers
+    6. Project to conditioning vector for the diffusion model
+    Output modes (conditioning_method):
+    - 'cross_attention': Returns (K, V) pairs for cross-attention in transformer
+    - 'ada_ln': Returns scale/shift parameters for adaptive layer norm
+    - 'concat': Returns a conditioning prefix to concatenate with input
+    Args:
+        config: GraphEncoderConfig with hyperparameters.
+        vocab_size: Vocabulary size (must match tokenizer).
+    """
+    def __init__(
+        self,
+        config: GraphEncoderConfig,
+        vocab_size: int = 32000,
+    ):
+        super().__init__()
+        self.config = config
+        self.conditioning_method = config.conditioning_method
+        # Node encoders for different graph element types
+        self.evidence_encoder = NodeEncoder(
+            d_graph=config.d_graph,
+            vocab_size=vocab_size,
+            embed_confidence=config.embed_confidence,
+            embed_temporal=config.embed_temporal,
+        )
+        self.composition_encoder = NodeEncoder(
+            d_graph=config.d_graph,
+            vocab_size=vocab_size,
+            embed_confidence=config.embed_confidence,
+            embed_temporal=False,  # Compositions don't have temporal info
+        )
+        self.anomaly_encoder = NodeEncoder(
+            d_graph=config.d_graph,
+            vocab_size=vocab_size,
+            embed_confidence=True,  # Anomalies always have confidence
+            embed_temporal=config.embed_temporal,
+        )
+        self.reasoning_encoder = NodeEncoder(
+            d_graph=config.d_graph,
+            vocab_size=vocab_size,
+            embed_confidence=True,  # Reasoning steps have confidence
+            embed_temporal=False,
+        )
+        # Source trust embedding
+        self.trust_embed = ConfidenceEmbedding(config.d_graph)
+        # Graph attention layers for cross-node interaction
+        self.graph_layers = nn.ModuleList([
+            GraphAttentionLayer(
+                d_graph=config.d_graph,
+                n_heads=config.n_graph_heads,
+                dropout=0.1,
+            )
+            for _ in range(config.n_graph_layers)
+        ])
+        # Conditioning projection depends on method
+        # d_model_out will be set via set_output_dim() or defaults to d_graph
+        self._d_model_out = config.d_graph
+        if self.conditioning_method == "cross_attention":
+            # Project to (K, V) for cross-attention
+            self.key_proj = nn.Linear(config.d_graph, self._d_model_out)
+            self.value_proj = nn.Linear(config.d_graph, self._d_model_out)
+        elif self.conditioning_method == "ada_ln":
+            # Project to scale and shift for adaptive layer norm
+            self.scale_proj = nn.Linear(config.d_graph, self._d_model_out)
+            self.shift_proj = nn.Linear(config.d_graph, self._d_model_out)
+        elif self.conditioning_method == "concat":
+            # Project to a prefix sequence
+            self.concat_proj = nn.Linear(config.d_graph, self._d_model_out)
+        # Global pooling for summary
+        self.global_pool_proj = nn.Sequential(
+            nn.Linear(config.d_graph, config.d_graph),
+            nn.GELU(),
+            nn.Linear(config.d_graph, config.d_graph),
+        )
+        # Type embeddings for different graph element types
+        self.type_embeddings = nn.Embedding(4, config.d_graph)
+        # 0 = evidence, 1 = composition, 2 = anomaly, 3 = reasoning
+    def set_output_dim(self, d_model_out: int) -> None:
+        """Set the output dimension for the projection layers.
+        This must be called after __init__ if d_graph != d_model
+        (which is typically the case when the graph encoder's d_graph
+        differs from the transformer's d_model).
+        Args:
+            d_model_out: Output dimension (typically the transformer's d_model).
+        """
+        if d_model_out == self._d_model_out:
+            return  # No change needed
+        self._d_model_out = d_model_out
+        # Rebuild projection layers with new output dim
+        if self.conditioning_method == "cross_attention":
+            self.key_proj = nn.Linear(self.config.d_graph, d_model_out)
+            self.value_proj = nn.Linear(self.config.d_graph, d_model_out)
+        elif self.conditioning_method == "ada_ln":
+            self.scale_proj = nn.Linear(self.config.d_graph, d_model_out)
+            self.shift_proj = nn.Linear(self.config.d_graph, d_model_out)
+        elif self.conditioning_method == "concat":
+            self.concat_proj = nn.Linear(self.config.d_graph, d_model_out)
+    def forward(
+        self,
+        evidence_ids: Optional[torch.Tensor] = None,
+        evidence_confidence: Optional[torch.Tensor] = None,
+        evidence_timestamps: Optional[torch.Tensor] = None,
+        composition_ids: Optional[torch.Tensor] = None,
+        composition_confidence: Optional[torch.Tensor] = None,
+        anomaly_ids: Optional[torch.Tensor] = None,
+        anomaly_confidence: Optional[torch.Tensor] = None,
+        anomaly_timestamps: Optional[torch.Tensor] = None,
+        reasoning_ids: Optional[torch.Tensor] = None,
+        reasoning_confidence: Optional[torch.Tensor] = None,
+        source_trust: Optional[torch.Tensor] = None,
+        batch_size: Optional[int] = None,
+    ) -> dict[str, torch.Tensor]:
+        """Encode graph conditioning data.
+        All inputs are optional — the encoder handles missing data gracefully.
+        Args:
+            evidence_ids: Evidence node token IDs, shape (batch, n_evidence, seq_len).
+            evidence_confidence: Evidence confidence scores, shape (batch, n_evidence).
+            evidence_timestamps: Evidence timestamps, shape (batch, n_evidence).
+            composition_ids: Composition token IDs, shape (batch, n_compositions, seq_len).
+            composition_confidence: Composition confidence, shape (batch, n_compositions).
+            anomaly_ids: Anomaly token IDs, shape (batch, n_anomalies, seq_len).
+            anomaly_confidence: Anomaly confidence, shape (batch, n_anomalies).
+            anomaly_timestamps: Anomaly timestamps, shape (batch, n_anomalies).
+            reasoning_ids: Reasoning step token IDs, shape (batch, n_steps, seq_len).
+            reasoning_confidence: Reasoning confidence, shape (batch, n_steps).
+            source_trust: Source trust score, shape (batch,).
+        Returns:
+            Dictionary with conditioning tensors depending on conditioning_method:
+            - 'cross_attention': {'keys': ..., 'values': ..., 'global': ...}
+            - 'ada_ln': {'scale': ..., 'shift': ..., 'global': ...}
+            - 'concat': {'prefix': ..., 'global': ...}
+        """
+        batch_size_inferred = self._infer_batch_size(
+            evidence_ids, composition_ids, anomaly_ids, reasoning_ids
+        )
+        device = next(self.parameters()).device
+        # Encode each type of graph element
+        node_embeddings = []
+        type_indices = []
+        # Evidence nodes
+        if evidence_ids is not None:
+            evidence_emb = self.evidence_encoder(
+                evidence_ids, evidence_confidence, evidence_timestamps
+            )
+            # Add type embedding
+            type_emb = self.type_embeddings(
+                torch.zeros(evidence_emb.shape[1], dtype=torch.long, device=device)
+            )
+            evidence_emb = evidence_emb + type_emb.unsqueeze(0)
+            node_embeddings.append(evidence_emb)
+            type_indices.extend([0] * evidence_emb.shape[1])
+        # Compositions
+        if composition_ids is not None:
+            comp_emb = self.composition_encoder(
+                composition_ids, composition_confidence
+            )
+            type_emb = self.type_embeddings(
+                torch.ones(comp_emb.shape[1], dtype=torch.long, device=device)
+            )
+            comp_emb = comp_emb + type_emb.unsqueeze(0)
+            node_embeddings.append(comp_emb)
+            type_indices.extend([1] * comp_emb.shape[1])
+        # Anomalies
+        if anomaly_ids is not None:
+            anom_emb = self.anomaly_encoder(
+                anomaly_ids, anomaly_confidence, anomaly_timestamps
+            )
+            type_emb = self.type_embeddings(
+                torch.full((anom_emb.shape[1],), 2, dtype=torch.long, device=device)
+            )
+            anom_emb = anom_emb + type_emb.unsqueeze(0)
+            node_embeddings.append(anom_emb)
+            type_indices.extend([2] * anom_emb.shape[1])
+        # Reasoning steps
+        if reasoning_ids is not None:
+            reason_emb = self.reasoning_encoder(
+                reasoning_ids, reasoning_confidence
+            )
+            type_emb = self.type_embeddings(
+                torch.full((reason_emb.shape[1],), 3, dtype=torch.long, device=device)
+            )
+            reason_emb = reason_emb + type_emb.unsqueeze(0)
+            node_embeddings.append(reason_emb)
+            type_indices.extend([3] * reason_emb.shape[1])
+        # If no graph data, return zero conditioning
+        if not node_embeddings:
+            bsz = batch_size or batch_size_inferred
+            dummy = torch.zeros(
+                bsz, 1, self.config.d_graph, device=device
+            )
+            return self._project_conditioning(dummy)
+        # Concatenate all node embeddings
+        all_nodes = torch.cat(node_embeddings, dim=1)  # (batch, n_total_nodes, d_graph)
+        # Add source trust as a global bias
+        if source_trust is not None:
+            trust_emb = self.trust_embed(source_trust.unsqueeze(-1))  # (batch, d_graph)
+            # Broadcast trust to all nodes
+            all_nodes = all_nodes + trust_emb.unsqueeze(1) * 0.1  # Small influence
+        # Apply graph attention layers
+        for layer in self.graph_layers:
+            all_nodes = layer(all_nodes)
+        # Compute global conditioning (mean pool)
+        global_cond = all_nodes.mean(dim=1)  # (batch, d_graph)
+        global_cond = self.global_pool_proj(global_cond)
+        # Project based on conditioning method
+        result = self._project_conditioning(all_nodes)
+        result["global"] = global_cond
+        return result
+    def _project_conditioning(
+        self, node_features: torch.Tensor
+    ) -> dict[str, torch.Tensor]:
+        """Project node features to conditioning format.
+        Args:
+            node_features: Shape (batch, n_nodes, d_graph).
+        Returns:
+            Dictionary with conditioning tensors.
+        """
+        result = {}
+        if self.conditioning_method == "cross_attention":
+            result["keys"] = self.key_proj(node_features)
+            result["values"] = self.value_proj(node_features)
+        elif self.conditioning_method == "ada_ln":
+            # Use mean-pooled features for scale/shift
+            pooled = node_features.mean(dim=1)
+            result["scale"] = self.scale_proj(pooled)
+            result["shift"] = self.shift_proj(pooled)
+        elif self.conditioning_method == "concat":
+            result["prefix"] = self.concat_proj(node_features)
+        return result
+    @staticmethod
+    def _infer_batch_size(*tensors) -> int:
+        """Infer batch size from the first non-None tensor."""
+        for t in tensors:
+            if t is not None:
+                return t.shape[0]
+        return 1

diffusion_llm/model/noise_scheduler.py ADDED Viewed

	@@ -0,0 +1,426 @@

+"""
+AAM Diffusion LLM — Noise Scheduler
+Implements the forward (noising) and reverse (denoising) diffusion process.
+Forward Process:
+    q(x_t | x_0) = N(x_t; sqrt(alpha_bar_t) * x_0, (1 - alpha_bar_t) * I)
+Reverse Process:
+    p(x_{t-1} | x_t) = N(x_{t-1}; mu_theta(x_t, t), sigma_t^2 * I)
+This scheduler supports:
+    - Linear noise schedule (Ho et al., 2020)
+    - Cosine noise schedule (Nichol & Dhariwal, 2021) — recommended
+    - Sigmoid noise schedule
+Analogi: Seperti Jin Soun membentuk pikirannya — dari noise
+(kabur, tidak jelas) menjadi sinyal (pola yang jelas).
+Setiap langkah denoising = satu langkah lebih dekat ke
+kesimpulan yang koheren.
+"""
+from __future__ import annotations
+import math
+from typing import Optional
+import torch
+import torch.nn as nn
+class NoiseScheduler(nn.Module):
+    """Noise scheduler for the diffusion process.
+    Manages the noise schedule (beta values, alpha values, etc.)
+    and provides methods for adding noise and computing posterior
+    distributions.
+    Args:
+        n_timesteps: Total number of diffusion timesteps.
+        schedule_type: Type of noise schedule ('linear', 'cosine', 'sigmoid').
+        beta_start: Starting beta for linear schedule.
+        beta_end: Ending beta for linear schedule.
+        prediction_type: What the model predicts ('epsilon', 'x0', or 'v').
+    """
+    def __init__(
+        self,
+        n_timesteps: int = 1000,
+        schedule_type: str = "cosine",
+        beta_start: float = 1e-4,
+        beta_end: float = 0.02,
+        prediction_type: str = "epsilon",
+    ):
+        super().__init__()
+        self.n_timesteps = n_timesteps
+        self.schedule_type = schedule_type
+        self.beta_start = beta_start
+        self.beta_end = beta_end
+        self.prediction_type = prediction_type
+        # Compute and register noise schedule buffers
+        betas = self._compute_betas()
+        alphas = 1.0 - betas
+        alphas_cumprod = torch.cumprod(alphas, dim=0)
+        alphas_cumprod_prev = torch.cat(
+            [torch.ones(1, dtype=betas.dtype), alphas_cumprod[:-1]]
+        )
+        # Register all as buffers (part of model state but not parameters)
+        self.register_buffer("betas", betas)
+        self.register_buffer("alphas", alphas)
+        self.register_buffer("alphas_cumprod", alphas_cumprod)
+        self.register_buffer("alphas_cumprod_prev", alphas_cumprod_prev)
+        # For q(x_t | x_0) computation
+        self.register_buffer("sqrt_alphas_cumprod", torch.sqrt(alphas_cumprod))
+        self.register_buffer(
+            "sqrt_one_minus_alphas_cumprod", torch.sqrt(1.0 - alphas_cumprod)
+        )
+        # For posterior q(x_{t-1} | x_t, x_0)
+        posterior_variance = (
+            betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
+        )
+        self.register_buffer("posterior_variance", posterior_variance)
+        self.register_buffer(
+            "posterior_log_variance_clipped",
+            torch.log(posterior_variance.clamp(min=1e-20)),
+        )
+        self.register_buffer(
+            "posterior_mean_coef1",
+            betas * torch.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod),
+        )
+        self.register_buffer(
+            "posterior_mean_coef2",
+            (1.0 - alphas_cumprod_prev) * torch.sqrt(alphas) / (1.0 - alphas_cumprod),
+        )
+    def _compute_betas(self) -> torch.Tensor:
+        """Compute beta schedule.
+        Returns:
+            Tensor of shape (n_timesteps,) with beta values.
+        """
+        if self.schedule_type == "linear":
+            return torch.linspace(
+                self.beta_start, self.beta_end, self.n_timesteps
+            )
+        elif self.schedule_type == "cosine":
+            return self._cosine_schedule()
+        elif self.schedule_type == "sigmoid":
+            return self._sigmoid_schedule()
+        else:
+            raise ValueError(
+                f"Unknown schedule_type '{self.schedule_type}'. "
+                f"Use 'linear', 'cosine', or 'sigmoid'."
+            )
+    def _cosine_schedule(self, s: float = 0.008) -> torch.Tensor:
+        """Cosine schedule as proposed in Nichol & Dhariwal 2021.
+        alpha_bar(t) = cos^2((t/T + s) / (1 + s) * pi/2)
+        beta(t) = 1 - alpha_bar(t) / alpha_bar(t-1)
+        This schedule avoids too much noise at the end and too
+        little at the beginning, leading to more stable training.
+        Args:
+            s: Offset to prevent singularity at t=0.
+        Returns:
+            Tensor of beta values.
+        """
+        steps = self.n_timesteps + 1
+        t = torch.linspace(0, self.n_timesteps, steps)
+        alphas_cumprod = torch.cos(
+            ((t / self.n_timesteps) + s) / (1 + s) * math.pi * 0.5
+        ) ** 2
+        alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
+        betas = 1.0 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
+        return torch.clamp(betas, 0.0001, 0.9999)
+    def _sigmoid_schedule(self) -> torch.Tensor:
+        """Sigmoid-based noise schedule.
+        beta(t) = sigmoid(-gamma * (t - T/2) + offset) * (beta_end - beta_start) + beta_start
+        Provides a smooth transition between low and high noise.
+        """
+        betas = torch.linspace(-6, 6, self.n_timesteps)
+        betas = torch.sigmoid(betas) * (self.beta_end - self.beta_start) + self.beta_start
+        return torch.clamp(betas, 0.0001, 0.9999)
+    def add_noise(
+        self,
+        x_0: torch.Tensor,
+        noise: torch.Tensor,
+        t: torch.Tensor,
+    ) -> torch.Tensor:
+        """Forward diffusion: add noise to clean data.
+        q(x_t | x_0) = N(x_t; sqrt(alpha_bar_t) * x_0, (1 - alpha_bar_t) * I)
+        x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise
+        Args:
+            x_0: Clean data tensor of shape (batch, seq_len, d_model).
+            noise: Noise tensor of same shape as x_0.
+            t: Timestep indices of shape (batch,).
+        Returns:
+            Noisy data x_t of same shape as x_0.
+        """
+        # Gather schedule values for timesteps
+        sqrt_alpha = self._gather(self.sqrt_alphas_cumprod, t, x_0)
+        sqrt_one_minus_alpha = self._gather(
+            self.sqrt_one_minus_alphas_cumprod, t, x_0
+        )
+        return sqrt_alpha * x_0 + sqrt_one_minus_alpha * noise
+    def compute_loss_target(
+        self,
+        x_0: torch.Tensor,
+        noise: torch.Tensor,
+        t: torch.Tensor,
+    ) -> torch.Tensor:
+        """Compute the target for the diffusion loss.
+        Depending on prediction_type:
+        - 'epsilon': target = noise (predict the noise that was added)
+        - 'x0': target = x_0 (predict the clean data directly)
+        - 'v': target = v (velocity prediction, combines both)
+        Args:
+            x_0: Clean data.
+            noise: Noise that was added.
+            t: Timestep indices.
+        Returns:
+            Target tensor for loss computation.
+        """
+        if self.prediction_type == "epsilon":
+            return noise
+        elif self.prediction_type == "x0":
+            return x_0
+        elif self.prediction_type == "v":
+            # v = sqrt(alpha_bar) * noise - sqrt(1 - alpha_bar) * x_0
+            sqrt_alpha = self._gather(self.sqrt_alphas_cumprod, t, x_0)
+            sqrt_one_minus_alpha = self._gather(
+                self.sqrt_one_minus_alphas_cumprod, t, x_0
+            )
+            return sqrt_alpha * noise - sqrt_one_minus_alpha * x_0
+        else:
+            raise ValueError(f"Unknown prediction_type: {self.prediction_type}")
+    def predict_x0_from_epsilon(
+        self,
+        x_t: torch.Tensor,
+        epsilon: torch.Tensor,
+        t: torch.Tensor,
+    ) -> torch.Tensor:
+        """Predict x_0 from the model's epsilon prediction.
+        x_0 = (x_t - sqrt(1 - alpha_bar_t) * epsilon) / sqrt(alpha_bar_t)
+        Args:
+            x_t: Noisy data.
+            epsilon: Predicted noise.
+            t: Timestep indices.
+        Returns:
+            Predicted clean data x_0.
+        """
+        sqrt_alpha = self._gather(self.sqrt_alphas_cumprod, t, x_t)
+        sqrt_one_minus_alpha = self._gather(
+            self.sqrt_one_minus_alphas_cumprod, t, x_t
+        )
+        return (x_t - sqrt_one_minus_alpha * epsilon) / sqrt_alpha
+    def predict_x0_from_v(
+        self,
+        x_t: torch.Tensor,
+        v: torch.Tensor,
+        t: torch.Tensor,
+    ) -> torch.Tensor:
+        """Predict x_0 from velocity prediction.
+        Args:
+            x_t: Noisy data.
+            v: Predicted velocity.
+            t: Timestep indices.
+        Returns:
+            Predicted clean data x_0.
+        """
+        sqrt_alpha = self._gather(self.sqrt_alphas_cumprod, t, x_t)
+        sqrt_one_minus_alpha = self._gather(
+            self.sqrt_one_minus_alphas_cumprod, t, x_t
+        )
+        return sqrt_alpha * x_t - sqrt_one_minus_alpha * v
+    def posterior_mean(
+        self,
+        x_0: torch.Tensor,
+        x_t: torch.Tensor,
+        t: torch.Tensor,
+    ) -> torch.Tensor:
+        """Compute the posterior mean q(x_{t-1} | x_t, x_0).
+        mu = coef1 * x_0 + coef2 * x_t
+        Args:
+            x_0: Predicted or actual clean data.
+            x_t: Noisy data at timestep t.
+            t: Timestep indices.
+        Returns:
+            Posterior mean tensor.
+        """
+        coef1 = self._gather(self.posterior_mean_coef1, t, x_t)
+        coef2 = self._gather(self.posterior_mean_coef2, t, x_t)
+        return coef1 * x_0 + coef2 * x_t
+    def step_ddpm(
+        self,
+        model_output: torch.Tensor,
+        x_t: torch.Tensor,
+        t: torch.Tensor,
+    ) -> torch.Tensor:
+        """Single DDPM reverse step: x_t -> x_{t-1}.
+        Args:
+            model_output: Model prediction (epsilon, x0, or v).
+            x_t: Noisy data at timestep t.
+            t: Current timestep indices.
+        Returns:
+            Denoised data at timestep t-1.
+        """
+        # Get predicted x_0
+        if self.prediction_type == "epsilon":
+            x_0_pred = self.predict_x0_from_epsilon(x_t, model_output, t)
+        elif self.prediction_type == "x0":
+            x_0_pred = model_output
+        elif self.prediction_type == "v":
+            x_0_pred = self.predict_x0_from_v(x_t, model_output, t)
+        else:
+            raise ValueError(f"Unknown prediction_type: {self.prediction_type}")
+        # Clamp x_0 prediction for stability
+        x_0_pred = x_0_pred.clamp(-5.0, 5.0)
+        # Compute posterior mean
+        mean = self.posterior_mean(x_0_pred, x_t, t)
+        # Add noise (except for t=0)
+        if t.min() > 0:
+            noise = torch.randn_like(x_t)
+            # Get posterior variance
+            log_variance = self._gather(
+                self.posterior_log_variance_clipped, t, x_t
+            )
+            noise_scale = torch.exp(0.5 * log_variance)
+            return mean + noise_scale * noise
+        else:
+            return mean
+    def step_ddim(
+        self,
+        model_output: torch.Tensor,
+        x_t: torch.Tensor,
+        t: int,
+        t_prev: int,
+        eta: float = 0.0,
+    ) -> torch.Tensor:
+        """Single DDIM reverse step: x_t -> x_{t_prev}.
+        DDIM is deterministic when eta=0, allowing fewer steps
+        at inference time while maintaining quality.
+        Args:
+            model_output: Model prediction.
+            x_t: Noisy data at timestep t.
+            t: Current timestep (scalar).
+            t_prev: Previous timestep (scalar, < t).
+            eta: Stochasticity parameter (0 = deterministic).
+        Returns:
+            Denoised data at timestep t_prev.
+        """
+        device = x_t.device
+        t_tensor = torch.tensor([t], device=device).expand(x_t.shape[0])
+        # Get predicted x_0
+        if self.prediction_type == "epsilon":
+            x_0_pred = self.predict_x0_from_epsilon(x_t, model_output, t_tensor)
+        elif self.prediction_type == "x0":
+            x_0_pred = model_output
+        elif self.prediction_type == "v":
+            x_0_pred = self.predict_x0_from_v(x_t, model_output, t_tensor)
+        else:
+            raise ValueError(f"Unknown prediction_type: {self.prediction_type}")
+        x_0_pred = x_0_pred.clamp(-5.0, 5.0)
+        # alpha_bar values
+        alpha_t = self.alphas_cumprod[t]
+        alpha_prev = self.alphas_cumprod[t_prev] if t_prev >= 0 else torch.tensor(1.0, device=device)
+        # Compute sigma
+        sigma = eta * torch.sqrt(
+            (1 - alpha_prev) / (1 - alpha_t) * (1 - alpha_t / alpha_prev)
+        )
+        # Direction pointing to x_t
+        pred_dir = torch.sqrt(1 - alpha_prev - sigma ** 2) * (
+            (x_t - torch.sqrt(alpha_t) * x_0_pred) / torch.sqrt(1 - alpha_t)
+        )
+        # DDIM update
+        x_prev = torch.sqrt(alpha_prev) * x_0_pred + pred_dir
+        if eta > 0 and sigma > 0:
+            noise = torch.randn_like(x_t)
+            x_prev = x_prev + sigma * noise
+        return x_prev
+    @staticmethod
+    def _gather(
+        values: torch.Tensor,
+        t: torch.Tensor,
+        target: torch.Tensor,
+    ) -> torch.Tensor:
+        """Gather schedule values for timesteps and reshape for broadcasting.
+        Args:
+            values: Schedule values of shape (n_timesteps,).
+            t: Timestep indices of shape (batch,).
+            target: Target tensor to match shape.
+        Returns:
+            Gathered values reshaped for broadcasting with target.
+        """
+        gathered = values.gather(0, t)
+        # Reshape to (batch, 1, 1, ...) for broadcasting
+        ndim = target.ndim - 1  # minus batch dim
+        for _ in range(ndim):
+            gathered = gathered.unsqueeze(-1)
+        return gathered.expand_as(target)
+    def get_timestep_schedule(self, n_inference_steps: int) -> list[int]:
+        """Get evenly-spaced timestep schedule for inference.
+        For DDIM: use a subset of the training timesteps.
+        Args:
+            n_inference_steps: Number of inference steps.
+        Returns:
+            List of timestep indices in descending order.
+        """
+        step_size = self.n_timesteps // n_inference_steps
+        return list(range(self.n_timesteps - 1, 0, -step_size))

diffusion_llm/requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+# AAM Diffusion LLM — Dependencies
+# Core
+torch>=2.0.0
+numpy>=1.24.0
+# Training
+tensorboard>=2.13.0
+# Optional (for logging and monitoring)
+# wandb>=0.15.0
+# Testing
+pytest>=7.4.0
+# Note: This framework is designed to be lightweight.
+# No heavy ML framework dependencies beyond PyTorch.

diffusion_llm/scripts/evaluate.py ADDED Viewed

	@@ -0,0 +1,157 @@

+#!/usr/bin/env python3
+"""
+AAM Diffusion LLM — Evaluation Script
+Evaluates a trained AAM Diffusion Model on test data or
+generates sample narratives from graph conditioning.
+Usage:
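+    # Evaluate on test data (not implemented yet: without --generate or --interactive the script only loads the model and tokenizer, then logs a usage hint)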
+    python scripts/evaluate.py --checkpoint output/best.pt
+    # Generate sample narratives
+    python scripts/evaluate.py --checkpoint output/best.pt --generate
+    # Interactive mode
+    python scripts/evaluate.py --checkpoint output/best.pt --interactive
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from diffusion_llm.config.model_config import AamDiffusionConfig
+from diffusion_llm.model.aam_diffusion_model import AamDiffusionModel
+from diffusion_llm.tokenizer.aam_tokenizer import AamTokenizer
+from diffusion_llm.inference.generator import AamGenerator
+# Configure root logging at import time: INFO level, with timestamp,
+# level name and logger name in front of every message.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+)
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Evaluate AAM Diffusion LLM")
+    parser.add_argument("--checkpoint", type=str, required=True, help="Model checkpoint path")
+    parser.add_argument("--tokenizer", type=str, default=None, help="Tokenizer path")
+    parser.add_argument("--generate", action="store_true", help="Generate sample narratives")
+    parser.add_argument("--interactive", action="store_true", help="Interactive mode")
+    parser.add_argument("--test_data", type=str, default=None, help="Test data path (JSONL)")
+    parser.add_argument("--n_steps", type=int, default=50, help="Inference denoising steps")
+    parser.add_argument("--temperature", type=float, default=1.0, help="Sampling temperature")
+    parser.add_argument("--language", type=str, default="id", help="Output language")
+    return parser.parse_args()
+def generate_samples(generator: AamGenerator, language: str) -> None:
+    """Generate sample narratives from predefined graph conditioning."""
+    samples = [
+        {
+            "trigger": "Siapa yang mencuri Snow Plum Pill?",
+            "evidence_nodes": ["Hefei", "Diancang Five Swords", "Ju Jangmok", "Gyeryong Merchant Guild"],
+            "anomalies": ["Tidak ada konsumsi pil baru di pasar gelap", "Pencuri menghilang tanpa jejak"],
+            "reasoning_steps": ["Cross-reference tanggal kejadian", "Deteksi ketidaksesuaian pola"],
+        },
+        {
+            "trigger": "Analisis pergerakan Diancang Five Swords",
+            "evidence_nodes": ["Gu Ilmu", "Jang Hangi", "Diancang Five Swords", "Hefei"],
+            "anomalies": ["Success rate pair lebih tinggi dari biasanya"],
+            "reasoning_steps": ["Recall laporan terkait", "Pattern completion dari bukti"],
+        },
+        {
+            "trigger": "Hubungan antara Ju Jangmok dan pencurian",
+            "evidence_nodes": ["Ju Jangmok", "Snow Plum Pill", "dark_faction"],
+            "anomalies": ["Ju Jangmok menghilang hari yang sama"],
+            "reasoning_steps": ["Eliminasi tersangka obvious", "Verify konsistensi"],
+        },
+    ]
+    print("\n" + "=" * 60)
+    print("  AAM Diffusion LLM — Sample Generation")
+    print("=" * 60)
+    for i, sample in enumerate(samples, 1):
+        result = generator.generate(
+            trigger=sample["trigger"],
+            evidence_nodes=sample["evidence_nodes"],
+            anomalies=sample["anomalies"],
+            reasoning_steps=sample["reasoning_steps"],
+            language=language,
+        )
+        print(f"\n--- Sample {i} ---")
+        print(f"Trigger: {sample['trigger']}")
+        print(f"Evidence: {', '.join(sample['evidence_nodes'])}")
+        print(f"Anomalies: {'; '.join(sample['anomalies'])}")
+        print(f"\nGenerated Narrative:")
+        print(result.narrative)
+        print(f"\n[Steps: {result.n_diffusion_steps}, Time: {result.generation_time_s:.2f}s]")
+def interactive_mode(generator: AamGenerator, language: str) -> None:
+    """Interactive generation mode."""
+    print("\n" + "=" * 60)
+    print("  AAM Diffusion LLM — Interactive Mode")
+    print("  Type 'quit' to exit")
+    print("=" * 60)
+    while True:
+        trigger = input("\nTrigger/Question: ").strip()
+        if trigger.lower() in ("quit", "exit", "q"):
+            break
+        evidence = input("Evidence nodes (comma-separated): ").strip()
+        evidence_nodes = [e.strip() for e in evidence.split(",") if e.strip()] if evidence else None
+        anomalies_input = input("Anomalies (comma-separated): ").strip()
+        anomalies = [a.strip() for a in anomalies_input.split(",") if a.strip()] if anomalies_input else None
+        result = generator.generate(
+            trigger=trigger,
+            evidence_nodes=evidence_nodes,
+            anomalies=anomalies,
+            language=language,
+        )
+        print(f"\nGenerated Narrative:\n{result.narrative}")
+        print(f"\n[Steps: {result.n_diffusion_steps}, Time: {result.generation_time_s:.2f}s, Confidence: {result.confidence:.1%}]")
+def main() -> None:
+    args = parse_args()
+    # Load model
+    logger.info("Loading model from %s", args.checkpoint)
+    model = AamDiffusionModel.load(args.checkpoint)
+    # Load or create tokenizer
+    if args.tokenizer:
+        tokenizer = AamTokenizer.load(args.tokenizer)
+    else:
+        # Try to find tokenizer in same directory as checkpoint
+        tokenizer_path = Path(args.checkpoint).parent / "data" / "tokenizer.json"
+        if tokenizer_path.exists():
+            tokenizer = AamTokenizer.load(tokenizer_path)
+        else:
+            logger.warning("No tokenizer found. Using untrained tokenizer.")
+            tokenizer = AamTokenizer()
+    # Create generator
+    generator = AamGenerator(model, tokenizer, model.config)
+    if args.interactive:
+        interactive_mode(generator, args.language)
+    elif args.generate:
+        generate_samples(generator, args.language)
+    else:
+        logger.info("Use --generate or --interactive flag")
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()

diffusion_llm/scripts/export.py ADDED Viewed

	@@ -0,0 +1,71 @@

+#!/usr/bin/env python3
+"""
+AAM Diffusion LLM — Export Script
+Export a trained model for deployment.
+Usage:
+    python scripts/export.py --checkpoint output/best.pt --output model_export/
+"""
+from __future__ import annotations
+import argparse
+import logging
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from diffusion_llm.config.model_config import AamDiffusionConfig
+from diffusion_llm.model.aam_diffusion_model import AamDiffusionModel
+from diffusion_llm.tokenizer.aam_tokenizer import AamTokenizer
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
+logger = logging.getLogger(__name__)
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Export AAM Diffusion Model")
+    parser.add_argument("--checkpoint", type=str, required=True)
+    parser.add_argument("--output", type=str, default="./model_export")
+    parser.add_argument("--format", type=str, default="pt", choices=["pt", "onnx"])
+    args = parser.parse_args()
+    output_dir = Path(args.output)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # Load model
+    model = AamDiffusionModel.load(args.checkpoint)
+    model.eval()
+    # Save model
+    model_path = output_dir / "model.pt"
+    model.save(str(model_path))
+    logger.info("Model exported to %s", model_path)
+    # Save config
+    config_path = output_dir / "config.json"
+    model.config.to_json(config_path)
+    logger.info("Config saved to %s", config_path)
+    # Try to copy tokenizer
+    checkpoint_dir = Path(args.checkpoint).parent
+    tokenizer_path = checkpoint_dir / "data" / "tokenizer.json"
+    if tokenizer_path.exists():
+        import shutil
+        shutil.copy(tokenizer_path, output_dir / "tokenizer.json")
+        logger.info("Tokenizer copied to %s", output_dir / "tokenizer.json")
+    # Summary
+    print(f"\nExport complete!")
+    print(f"  Model: {model_path}")
+    print(f"  Config: {config_path}")
+    print(f"  Parameters: {model._format_params(model.get_num_params())}")
+    print(f"\n  This is AAM's own body — 1 mind + 1 body.")
+    print(f"  Mind = RSVS Knowledge Graph")
+    print(f"  Body = This Diffusion Model ({model.config.model_name})")
+if __name__ == "__main__":
+    main()

diffusion_llm/scripts/train.py ADDED Viewed

	@@ -0,0 +1,168 @@

+#!/usr/bin/env python3
+"""
+AAM Diffusion LLM — Training Script
+Main entry point for training the AAM Diffusion Model.
+Usage:
+    # Train with default config (base model)
+    python scripts/train.py
+    # Train with specific model size
+    python scripts/train.py --model_size small
+    # Train with custom config
+    python scripts/train.py --config path/to/config.json
+    # Train with specific data
+    python scripts/train.py --train_data path/to/train.jsonl --val_data path/to/val.jsonl
+Analogi: Seperti Jin Soun memulai latihan fisiknya —
+ini adalah titik awal di mana "tubuh" AAM mulai dilatih.
+"""
+from __future__ import annotations
+import argparse
+import logging
+import sys
+from pathlib import Path
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from diffusion_llm.config.model_config import AamDiffusionConfig, get_default_config
+from diffusion_llm.model.aam_diffusion_model import AamDiffusionModel
+from diffusion_llm.tokenizer.aam_tokenizer import AamTokenizer
+from diffusion_llm.training.trainer import AamTrainer
+from diffusion_llm.training.dataset import GraphNarrativeDataset
+from diffusion_llm.data.data_pipeline import DataPipeline
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+)
+logger = logging.getLogger(__name__)
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Train AAM Diffusion LLM",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    # Model configuration
+    parser.add_argument(
+        "--model_size", type=str, default="base",
+        choices=["tiny", "small", "base", "medium"],
+        help="Model size preset",
+    )
+    parser.add_argument(
+        "--config", type=str, default=None,
+        help="Path to custom config JSON (overrides --model_size)",
+    )
+    # Data
+    parser.add_argument(
+        "--train_data", type=str, default=None,
+        help="Path to training data (JSONL)",
+    )
+    parser.add_argument(
+        "--val_data", type=str, default=None,
+        help="Path to validation data (JSONL)",
+    )
+    parser.add_argument(
+        "--output_dir", type=str, default="./output",
+        help="Output directory for checkpoints and logs",
+    )
+    parser.add_argument(
+        "--force_regenerate", action="store_true",
+        help="Force regenerate synthetic data",
+    )
+    # Training overrides
+    parser.add_argument("--batch_size", type=int, default=None)
+    parser.add_argument("--learning_rate", type=float, default=None)
+    parser.add_argument("--max_steps", type=int, default=None)
+    parser.add_argument("--n_timesteps", type=int, default=None)
+    parser.add_argument("--seed", type=int, default=42)
+    return parser.parse_args()
+def main() -> None:
+    """Main training entry point."""
+    args = parse_args()
+    # Load or create config
+    if args.config:
+        config = AamDiffusionConfig.from_json(args.config)
+        logger.info("Loaded config from %s", args.config)
+    else:
+        config = get_default_config(args.model_size)
+        logger.info("Using %s model config", args.model_size)
+    # Apply CLI overrides
+    if args.output_dir:
+        config.output_dir = args.output_dir
+    if args.train_data:
+        config.training.train_data_path = args.train_data
+    if args.val_data:
+        config.training.val_data_path = args.val_data
+    if args.batch_size:
+        config.training.batch_size = args.batch_size
+    if args.learning_rate:
+        config.training.learning_rate = args.learning_rate
+    if args.max_steps:
+        config.training.max_steps = args.max_steps
+    if args.n_timesteps:
+        config.diffusion.n_timesteps = args.n_timesteps
+    config.seed = args.seed
+    # Print config summary
+    print(config.summary())
+    # Save config
+    config_path = Path(config.output_dir) / "config.json"
+    config.to_json(config_path)
+    logger.info("Config saved to %s", config_path)
+    # Step 1: Prepare data
+    pipeline = DataPipeline(config)
+    tokenizer, train_loader, val_loader = pipeline.prepare(
+        force_regenerate=args.force_regenerate,
+    )
+    # Step 2: Create model
+    model = AamDiffusionModel(config)
+    logger.info(
+        "Model created: %s parameters",
+        model._format_params(model.get_num_params()),
+    )
+    # Step 3: Create datasets (using pre-created loaders)
+    train_dataset = train_loader.dataset
+    val_dataset = val_loader.dataset if val_loader else None
+    # Step 4: Create trainer and train
+    trainer = AamTrainer(
+        config=config,
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=train_dataset,
+        val_dataset=val_dataset,
+    )
+    # Override data loaders (already created by pipeline)
+    trainer.train_loader = train_loader
+    trainer.val_loader = val_loader
+    # Start training
+    trainer.train()
+    logger.info("Training complete! Output saved to %s", config.output_dir)
+if __name__ == "__main__":
+    main()

diffusion_llm/scripts/train_final.py ADDED Viewed

	@@ -0,0 +1,686 @@

+#!/usr/bin/env python3
+"""
+AAM Diffusion LLM — Final Training Script
+Trains the complete AAM Diffusion LLM pipeline:
+1. Generate synthetic training data (Graph→Narrative pairs)
+2. Train the AAM Sentence-Level + BPE Tokenizer
+3. Train the Diffusion Transformer model
+4. Save final model, tokenizer, and config for HuggingFace upload
+This is the "birth" of AAM's body — from random weights to
+a model that can arrange sentences from graph conditioning.
+Usage:
+    python scripts/train_final.py --output_dir ./aam-diffusion-v1
+    python scripts/train_final.py --model_size tiny --max_steps 500
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import sys
+import time
+from pathlib import Path
+# Add parent directory to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+import torch
+import numpy as np
+from diffusion_llm.config.model_config import (
+    AamDiffusionConfig, get_default_config, ModelConfig,
+    DiffusionConfig, GraphEncoderConfig, TokenizerConfig,
+    TrainingConfig, InferenceConfig,
+)
+from diffusion_llm.model.aam_diffusion_model import AamDiffusionModel
+from diffusion_llm.tokenizer.aam_tokenizer import AamTokenizer
+from diffusion_llm.training.dataset import GraphNarrativeDataset, collate_fn
+from diffusion_llm.data.synthetic_generator import SyntheticDataGenerator
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+)
+logger = logging.getLogger("train_final")
+def parse_args():
+    parser = argparse.ArgumentParser(description="Train AAM Diffusion LLM (Final)")
+    parser.add_argument("--model_size", type=str, default="tiny",
+                        choices=["tiny", "small", "base", "medium"])
+    parser.add_argument("--output_dir", type=str, default="./aam-diffusion-v1")
+    parser.add_argument("--max_steps", type=int, default=500)
+    parser.add_argument("--batch_size", type=int, default=8)
+    parser.add_argument("--learning_rate", type=float, default=3e-4)
+    parser.add_argument("--n_synthetic_train", type=int, default=500)
+    parser.add_argument("--n_synthetic_val", type=int, default=50)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--log_every", type=int, default=50)
+    parser.add_argument("--save_every", type=int, default=500)
+    parser.add_argument("--eval_every", type=int, default=200)
+    return parser.parse_args()
+def set_seed(seed: int):
+    """Set random seeds for reproducibility."""
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    import random
+    random.seed(seed)
+def generate_data(output_dir: Path, n_train: int, n_val: int, seed: int):
+    """Generate synthetic training data."""
+    logger.info("=" * 60)
+    logger.info("STEP 1: Generating Synthetic Training Data")
+    logger.info("=" * 60)
+    data_dir = output_dir / "data"
+    data_dir.mkdir(parents=True, exist_ok=True)
+    train_path, val_path = SyntheticDataGenerator.generate_training_split(
+        output_dir=data_dir,
+        n_train=n_train,
+        n_val=n_val,
+        language="id",
+        seed=seed,
+    )
+    logger.info(f"  Train data: {train_path} ({n_train} examples)")
+    logger.info(f"  Val data:   {val_path} ({n_val} examples)")
+    return train_path, val_path
+def train_tokenizer(train_path: Path, output_dir: Path, config: AamDiffusionConfig) -> AamTokenizer:
+    """Train the AAM Tokenizer on synthetic data."""
+    logger.info("=" * 60)
+    logger.info("STEP 2: Training AAM Sentence-Level + BPE Tokenizer")
+    logger.info("=" * 60)
+    tokenizer = AamTokenizer(config=config.tokenizer)
+    # Read training texts
+    texts = []
+    with open(train_path, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+                if data.get("narrative"):
+                    texts.append(data["narrative"])
+                if data.get("trigger"):
+                    texts.append(data["trigger"])
+                for ev in data.get("evidence_nodes", []):
+                    texts.append(ev)
+                for anom in data.get("anomalies", []):
+                    texts.append(anom)
+                for step in data.get("reasoning_steps", []):
+                    texts.append(step)
+                for comp in data.get("compositions", []):
+                    texts.append(comp)
+            except json.JSONDecodeError:
+                continue
+    logger.info(f"  Training tokenizer on {len(texts)} texts...")
+    tokenizer.train(texts, vocab_size=config.tokenizer.bpe_vocab_size)
+    # Save tokenizer
+    tokenizer_path = output_dir / "tokenizer.json"
+    tokenizer.save(tokenizer_path)
+    logger.info(f"  Tokenizer saved: {tokenizer_path}")
+    logger.info(f"  Vocab size: {tokenizer.vocab_size}")
+    logger.info(f"  BPE merges: {len(tokenizer.merges)}")
+    return tokenizer
+def create_dataloaders(
+    train_path: Path, val_path: Path,
+    tokenizer: AamTokenizer, config: AamDiffusionConfig
+):
+    """Create training and validation data loaders."""
+    logger.info("=" * 60)
+    logger.info("STEP 3: Creating DataLoaders")
+    logger.info("=" * 60)
+    train_dataset = GraphNarrativeDataset(
+        data_path=train_path,
+        tokenizer=tokenizer,
+        max_seq_len=config.model.max_seq_len,
+        max_evidence=config.graph_encoder.max_evidence_nodes,
+        max_anomalies=config.graph_encoder.max_anomalies,
+        max_reasoning=config.graph_encoder.max_reasoning_steps,
+        augment=True,
+    )
+    val_dataset = GraphNarrativeDataset(
+        data_path=val_path,
+        tokenizer=tokenizer,
+        max_seq_len=config.model.max_seq_len,
+        max_evidence=config.graph_encoder.max_evidence_nodes,
+        max_anomalies=config.graph_encoder.max_anomalies,
+        max_reasoning=config.graph_encoder.max_reasoning_steps,
+        augment=False,
+    )
+    from torch.utils.data import DataLoader
+    train_loader = DataLoader(
+        train_dataset,
+        batch_size=config.training.batch_size,
+        shuffle=True,
+        num_workers=0,  # CPU training: use 0 workers
+        collate_fn=collate_fn,
+        pin_memory=False,  # CPU: no pin_memory
+    )
+    val_loader = DataLoader(
+        val_dataset,
+        batch_size=config.training.batch_size,
+        shuffle=False,
+        num_workers=0,
+        collate_fn=collate_fn,
+        pin_memory=False,
+    )
+    logger.info(f"  Train: {len(train_dataset)} examples, {len(train_loader)} batches")
+    logger.info(f"  Val:   {len(val_dataset)} examples, {len(val_loader)} batches")
+    return train_loader, val_loader
+def train_model(
+    model: AamDiffusionModel,
+    tokenizer: AamTokenizer,
+    train_loader,
+    val_loader,
+    config: AamDiffusionConfig,
+    output_dir: Path,
+    args,
+):
+    """Train the AAM Diffusion Model."""
+    logger.info("=" * 60)
+    logger.info("STEP 4: Training AAM Diffusion LLM")
+    logger.info("=" * 60)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    logger.info(f"  Device: {device}")
+    logger.info(f"  Parameters: {model._format_params(model.get_num_params())}")
+    model.to(device)
+    # Optimizer
+    optimizer = torch.optim.AdamW(
+        model.parameters(),
+        lr=args.learning_rate,
+        weight_decay=config.training.weight_decay,
+        betas=(config.training.adam_beta1, config.training.adam_beta2),
+    )
+    # LR scheduler with warmup
+    warmup_steps = min(200, args.max_steps // 10)
+    def lr_lambda(step):
+        if step < warmup_steps:
+            return step / max(warmup_steps, 1)
+        progress = (step - warmup_steps) / max(args.max_steps - warmup_steps, 1)
+        return 0.5 * (1.0 + torch.cos(torch.tensor(progress * 3.14159)).item())
+    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
+    # Training loop
+    global_step = 0
+    best_val_loss = float("inf")
+    train_losses = []
+    start_time = time.time()
+    logger.info(f"  Max steps: {args.max_steps}")
+    logger.info(f"  Batch size: {args.batch_size}")
+    logger.info(f"  Learning rate: {args.learning_rate}")
+    logger.info(f"  Warmup steps: {warmup_steps}")
+    logger.info("")
+    epoch = 0
+    while global_step < args.max_steps:
+        epoch += 1
+        model.train()
+        epoch_loss = 0.0
+        n_batches = 0
+        for batch_idx, batch in enumerate(train_loader):
+            if global_step >= args.max_steps:
+                break
+            # Move batch to device
+            batch = {
+                k: v.to(device) if isinstance(v, torch.Tensor) else v
+                for k, v in batch.items()
+            }
+            # Sample random timesteps
+            batch_size = batch["token_ids"].shape[0]
+            t = torch.randint(
+                0, config.diffusion.n_timesteps,
+                (batch_size,), device=device,
+            )
+            # Forward pass
+            predicted, target = model(
+                token_ids=batch["token_ids"],
+                timestep=t,
+                evidence_ids=batch.get("evidence_ids"),
+                evidence_confidence=batch.get("evidence_confidence"),
+                anomaly_ids=batch.get("anomaly_ids"),
+                anomaly_confidence=batch.get("anomaly_confidence"),
+                reasoning_ids=batch.get("reasoning_ids"),
+                reasoning_confidence=batch.get("reasoning_confidence"),
+                source_trust=batch.get("source_trust"),
+            )
+            # Compute loss
+            loss = model.compute_loss(predicted, target, t)
+            # Backward pass
+            optimizer.zero_grad()
+            loss.backward()
+            # Gradient clipping
+            torch.nn.utils.clip_grad_norm_(
+                model.parameters(), config.training.grad_clip_norm
+            )
+            optimizer.step()
+            scheduler.step()
+            loss_val = loss.item()
+            train_losses.append(loss_val)
+            epoch_loss += loss_val
+            n_batches += 1
+            global_step += 1
+            # Logging
+            if global_step % args.log_every == 0:
+                lr = optimizer.param_groups[0]["lr"]
+                avg_loss = sum(train_losses[-args.log_every:]) / len(train_losses[-args.log_every:])
+                elapsed = time.time() - start_time
+                steps_per_sec = global_step / max(elapsed, 1)
+                logger.info(
+                    f"  Step {global_step:>6d}/{args.max_steps} | "
+                    f"Loss: {avg_loss:.4f} | "
+                    f"LR: {lr:.2e} | "
+                    f"Speed: {steps_per_sec:.1f} steps/s"
+                )
+            # Evaluation
+            if global_step % args.eval_every == 0 and val_loader is not None:
+                val_loss = evaluate(model, val_loader, config, device)
+                logger.info(f"  >>> Validation loss: {val_loss:.4f}")
+                if val_loss < best_val_loss:
+                    best_val_loss = val_loss
+                    save_model(model, tokenizer, config, output_dir / "best.pt")
+                    logger.info(f"  >>> New best model saved! (val_loss: {val_loss:.4f})")
+            # Checkpoint
+            if global_step % args.save_every == 0:
+                save_model(model, tokenizer, config, output_dir / f"step_{global_step}.pt")
+        avg_epoch_loss = epoch_loss / max(n_batches, 1)
+        logger.info(f"  Epoch {epoch} complete. Avg loss: {avg_epoch_loss:.4f}")
+    # Final save
+    save_model(model, tokenizer, config, output_dir / "final.pt")
+    elapsed = time.time() - start_time
+    logger.info("")
+    logger.info(f"  Training complete! {global_step} steps in {elapsed/60:.1f} minutes")
+    logger.info(f"  Best val loss: {best_val_loss:.4f}")
+    logger.info(f"  Final train loss: {train_losses[-1]:.4f}")
+    return model
+def evaluate(model, val_loader, config, device):
+    """Evaluate on validation set."""
+    model.eval()
+    total_loss = 0.0
+    n_batches = 0
+    with torch.no_grad():
+        for batch in val_loader:
+            batch = {
+                k: v.to(device) if isinstance(v, torch.Tensor) else v
+                for k, v in batch.items()
+            }
+            batch_size = batch["token_ids"].shape[0]
+            t = torch.randint(
+                0, config.diffusion.n_timesteps,
+                (batch_size,), device=device,
+            )
+            predicted, target = model(
+                token_ids=batch["token_ids"],
+                timestep=t,
+                evidence_ids=batch.get("evidence_ids"),
+                evidence_confidence=batch.get("evidence_confidence"),
+                anomaly_ids=batch.get("anomaly_ids"),
+                anomaly_confidence=batch.get("anomaly_confidence"),
+                reasoning_ids=batch.get("reasoning_ids"),
+                reasoning_confidence=batch.get("reasoning_confidence"),
+                source_trust=batch.get("source_trust"),
+            )
+            loss = model.compute_loss(predicted, target, t)
+            total_loss += loss.item()
+            n_batches += 1
+    model.train()
+    return total_loss / max(n_batches, 1)
+def save_model(model, tokenizer, config, path):
+    """Save model checkpoint with tokenizer."""
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    checkpoint = {
+        "model_state_dict": model.state_dict(),
+        "config": config.to_dict(),
+    }
+    torch.save(checkpoint, path)
+def export_for_huggingface(model, tokenizer, config, output_dir: Path):
+    """Export model in HuggingFace-compatible format."""
+    logger.info("=" * 60)
+    logger.info("STEP 5: Exporting for HuggingFace")
+    logger.info("=" * 60)
+    hf_dir = output_dir / "huggingface"
+    hf_dir.mkdir(parents=True, exist_ok=True)
+    # Save model weights
+    model_path = hf_dir / "model.pt"
+    model.save(str(model_path))
+    logger.info(f"  Model saved: {model_path}")
+    # Save tokenizer
+    tokenizer_path = hf_dir / "tokenizer.json"
+    tokenizer.save(tokenizer_path)
+    logger.info(f"  Tokenizer saved: {tokenizer_path}")
+    # Save config
+    config_path = hf_dir / "config.json"
+    config.to_json(config_path)
+    logger.info(f"  Config saved: {config_path}")
+    # Save model card
+    model_card = f"""---
+language:
+- id
+- en
+license: mit
+library_name: pytorch
+tags:
+- diffusion
+- text-generation
+- aam
+- aphantasic-abstraction-model
+- sentence-arrangement
+- graph-conditioned
+---
+# AAM Diffusion LLM v1.0
+> **"AAM = 1 Pikiran + 1 Tubuh" (1 Mind + 1 Body)**
+The dedicated "body" of the Aphantasic Abstraction Model (AAM) — a small diffusion LLM specifically trained to arrange sentences from structured graph data.
+## What is this?
+This is NOT a general-purpose LLM. This is a SPECIALIZED sentence composer that:
+- Takes **graph-structured conditioning** as input (evidence, anomalies, reasoning chains, confidence scores)
+- Produces **coherent natural language narratives** through iterative denoising
+- **Cannot hallucinate** — it can only narrate what the graph knows
+## Architecture
+```
+Graph Conditioning Encoder → Diffusion Transformer → Noise Scheduler
+         (Mind input)           (The Body)          (Iterative refinement)
+```
+### Key Components
+- **Graph Conditioning Encoder**: Encodes evidence nodes, compositions, anomalies, reasoning chains with confidence and temporal embeddings
+- **Diffusion Transformer**: Core denoising network with adaptive layer norm, self-attention, and cross-attention to graph conditioning
+- **Noise Scheduler**: Cosine noise schedule with DDPM/DDIM sampling support
+## Model Details
+| Parameter | Value |
+|-----------|-------|
+| Architecture | Diffusion Transformer |
+| d_model | {config.model.d_model} |
+| n_layers | {config.model.n_layers} |
+| n_heads | {config.model.n_heads} |
+| d_ff | {config.model.d_ff} |
+| Parameters | {model._format_params(model.get_num_params())} |
+| Vocab size | {config.model.vocab_size} |
+| Max sequence length | {config.model.max_seq_len} |
+| Diffusion timesteps (train) | {config.diffusion.n_timesteps} |
+| Diffusion timesteps (inference) | {config.diffusion.n_inference_steps} |
+| Noise schedule | {config.diffusion.schedule_type} |
+| Prediction type | {config.diffusion.prediction_type} |
+| Sampling method | {config.diffusion.sampling_method} |
+## Usage
+```python
+from diffusion_llm import AamDiffusionModel, AamTokenizer, AamGenerator, AamDiffusionConfig
+# Load model
+config = AamDiffusionConfig.from_json("config.json")
+model = AamDiffusionModel.load("model.pt")
+tokenizer = AamTokenizer.load("tokenizer.json")
+# Create generator
+generator = AamGenerator(model, tokenizer, config)
+# Generate narrative from graph conditioning
+result = generator.generate(
+    trigger="Siapa yang mencuri Snow Plum Pill?",
+    evidence_nodes=["Hefei", "Diancang Five Swords", "Ju Jangmok"],
+    anomalies=["Tidak ada konsumsi pil baru di pasar gelap"],
+    reasoning_steps=["Cross-reference tanggal kejadian"],
+    source_trust=0.85,
+)
+print(result.narrative)
+```
+## Philosophy
+**AAM = 1 Mind + 1 Body**
+- **Mind** = RSVS Knowledge Graph (structural memory, perfect recall, relational understanding)
+- **Body** = This Diffusion LLM (sentence arranger, graph-conditioned, anti-hallucination)
+Unlike using a rented LLM (GPT, Claude) as the "body", this model is specifically trained for AAM:
+- It cannot generate information not present in the graph conditioning
+- It arranges sentences based on structured evidence
+- It uses diffusion (non-sequential generation) instead of autoregressive generation
+- It is small ({model._format_params(model.get_num_params())}) but specialized
+## Training
+Trained on synthetic Graph→Narrative pairs with:
+- Indonesian and English narrative templates
+- Evidence nodes, anomalies, reasoning chains
+- Confidence score distributions
+- Source trust scores
+## License
+MIT
+"""
+    model_card_path = hf_dir / "README.md"
+    with open(model_card_path, "w", encoding="utf-8") as f:
+        f.write(model_card)
+    logger.info(f"  Model card saved: {model_card_path}")
+    # Copy full framework code
+    import shutil
+    framework_src = Path(__file__).parent.parent  # diffusion_llm/
+    framework_dst = hf_dir / "diffusion_llm"
+    if framework_dst.exists():
+        shutil.rmtree(framework_dst)
+    shutil.copytree(framework_src, framework_dst,
+                    ignore=shutil.ignore_patterns('__pycache__', '*.pyc', 'output', 'data'))
+    logger.info(f"  Framework code copied to: {framework_dst}")
+    # Save training script
+    train_script_dst = hf_dir / "train.py"
+    shutil.copy2(Path(__file__), train_script_dst)
+    # Save inference example
+    inference_example = hf_dir / "inference_example.py"
+    with open(inference_example, "w", encoding="utf-8") as f:
+        f.write('''#!/usr/bin/env python3
+"""AAM Diffusion LLM — Inference Example"""
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent))
+import torch
+from diffusion_llm import AamDiffusionModel, AamTokenizer, AamGenerator, AamDiffusionConfig
+def main():
+    # Load model and tokenizer
+    config = AamDiffusionConfig.from_json("config.json")
+    model = AamDiffusionModel.load("model.pt", device="cpu")
+    tokenizer = AamTokenizer.load("tokenizer.json")
+    # Create generator
+    generator = AamGenerator(model, tokenizer, config)
+    # Generate narrative
+    result = generator.generate(
+        trigger="Siapa yang mencuri Snow Plum Pill?",
+        evidence_nodes=["Hefei", "Diancang Five Swords", "Ju Jangmok"],
+        anomalies=["Tidak ada konsumsi pil baru di pasar gelap"],
+        reasoning_steps=["Cross-reference tanggal kejadian", "Deteksi anomali pola"],
+        source_trust=0.85,
+    )
+    print("=" * 60)
+    print("  AAM Diffusion LLM — Generated Narrative")
+    print("=" * 60)
+    print(f"  Trigger: {result.evidence_used}")
+    print(f"  Narrative: {result.narrative}")
+    print(f"  Confidence: {result.confidence:.1%}")
+    print(f"  Steps: {result.n_diffusion_steps}")
+    print(f"  Time: {result.generation_time_s:.2f}s")
+if __name__ == "__main__":
+    main()
+''')
+    logger.info(f"  Inference example saved: {inference_example}")
+    logger.info(f"\n  HuggingFace export complete: {hf_dir}")
+    return hf_dir
+def main():
+    args = parse_args()
+    set_seed(args.seed)
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    print("=" * 60)
+    print("  AAM Diffusion LLM — Final Training")
+    print("  \"1 Pikiran + 1 Tubuh\" (1 Mind + 1 Body)")
+    print("=" * 60)
+    print()
+    # Get config
+    config = get_default_config(args.model_size)
+    # CPU-optimized overrides for faster training
+    config.model.max_seq_len = 128
+    config.model.vocab_size = 8000
+    config.graph_encoder.max_evidence_nodes = 10
+    config.graph_encoder.max_anomalies = 5
+    config.graph_encoder.max_reasoning_steps = 5
+    config.graph_encoder.max_compositions = 5
+    config.diffusion.n_timesteps = 200
+    config.diffusion.n_inference_steps = 20
+    config.tokenizer.bpe_vocab_size = 8000 - 13  # minus special tokens
+    # Override settings for CPU training
+    config.training.batch_size = args.batch_size
+    config.training.learning_rate = args.learning_rate
+    config.training.max_steps = args.max_steps
+    config.training.use_amp = False  # No AMP on CPU
+    config.training.num_workers = 0  # No multiprocessing on CPU
+    config.training.warmup_steps = min(100, args.max_steps // 5)
+    config.output_dir = str(output_dir)
+    config.seed = args.seed
+    config.model_name = "aam-diffusion-v1.0"
+    # Print config
+    print(config.summary())
+    # Step 1: Generate synthetic data
+    train_path, val_path = generate_data(
+        output_dir, args.n_synthetic_train, args.n_synthetic_val, args.seed
+    )
+    # Step 2: Train tokenizer
+    tokenizer = train_tokenizer(train_path, output_dir, config)
+    # Update vocab_size to match actual tokenizer
+    actual_vocab = tokenizer.vocab_size
+    if actual_vocab != config.model.vocab_size:
+        logger.info(f"  Updating vocab_size: {config.model.vocab_size} → {actual_vocab}")
+        config.model.vocab_size = actual_vocab
+    # Step 3: Create dataloaders
+    train_loader, val_loader = create_dataloaders(
+        train_path, val_path, tokenizer, config
+    )
+    # Step 4: Create and train model
+    model = AamDiffusionModel(config)
+    logger.info(f"  Model parameters: {model._format_params(model.get_num_params())}")
+    model = train_model(
+        model, tokenizer, train_loader, val_loader,
+        config, output_dir, args
+    )
+    # Step 5: Export for HuggingFace
+    hf_dir = export_for_huggingface(model, tokenizer, config, output_dir)
+    # Final summary
+    print()
+    print("=" * 60)
+    print("  TRAINING COMPLETE!")
+    print("=" * 60)
+    print(f"  Model: {config.model_name}")
+    print(f"  Parameters: {model._format_params(model.get_num_params())}")
+    print(f"  Output: {output_dir}")
+    print(f"  HuggingFace export: {hf_dir}")
+    print()
+    print("  AAM = 1 Pikiran + 1 Tubuh")
+    print("  Pikiran = RSVS Knowledge Graph")
+    print("  Tubuh   = This Diffusion LLM")
+    print("=" * 60)
+if __name__ == "__main__":
+    main()

diffusion_llm/scripts/train_minimal.py ADDED Viewed

	@@ -0,0 +1,260 @@

+#!/usr/bin/env python3
+"""
+AAM Diffusion LLM — Minimal Training Script for CPU
+Trains a very small AAM Diffusion LLM model on CPU.
+"""
+import sys
+import json
+import time
+import logging
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+import torch
+import numpy as np
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logger = logging.getLogger("train")
+def main():
+    from diffusion_llm.config.model_config import (
+        AamDiffusionConfig, ModelConfig, DiffusionConfig,
+        GraphEncoderConfig, TokenizerConfig, TrainingConfig, InferenceConfig,
+    )
+    from diffusion_llm.model.aam_diffusion_model import AamDiffusionModel
+    from diffusion_llm.tokenizer.aam_tokenizer import AamTokenizer
+    from diffusion_llm.training.dataset import GraphNarrativeDataset, collate_fn
+    from diffusion_llm.data.synthetic_generator import SyntheticDataGenerator
+    from torch.utils.data import DataLoader
+    output_dir = Path("./aam-diffusion-v1")
+    output_dir.mkdir(parents=True, exist_ok=True)
+    data_dir = output_dir / "data"
+    data_dir.mkdir(parents=True, exist_ok=True)
+    # ===== STEP 1: Generate Data =====
+    logger.info("STEP 1: Generating synthetic data...")
+    train_path, val_path = SyntheticDataGenerator.generate_training_split(
+        output_dir=data_dir, n_train=200, n_val=20, language="id", seed=42,
+    )
+    # ===== STEP 2: Train Tokenizer =====
+    logger.info("STEP 2: Training tokenizer...")
+    tokenizer = AamTokenizer()
+    texts = []
+    with open(train_path, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+                for key in ["narrative", "trigger"]:
+                    if data.get(key):
+                        texts.append(data[key])
+                for key in ["evidence_nodes", "anomalies", "reasoning_steps"]:
+                    for item in data.get(key, []):
+                        texts.append(item)
+            except json.JSONDecodeError:
+                continue
+    tokenizer.train(texts, vocab_size=2000)
+    tokenizer.save(data_dir / "tokenizer.json")
+    actual_vocab = tokenizer.vocab_size
+    logger.info(f"  Tokenizer: vocab_size={actual_vocab}, merges={len(tokenizer.merges)}")
+    # ===== STEP 3: Config =====
+    config = AamDiffusionConfig(
+        model=ModelConfig(
+            d_model=128,
+            n_layers=2,
+            n_heads=4,
+            d_ff=256,
+            vocab_size=actual_vocab,
+            max_seq_len=64,
+            pos_encoding_type="learned",
+            use_flash_attention=False,
+            norm_type="layernorm",
+            init_std=0.02,
+        ),
+        diffusion=DiffusionConfig(
+            n_timesteps=100,
+            n_inference_steps=10,
+            schedule_type="cosine",
+            prediction_type="epsilon",
+            loss_type="mse",
+            loss_weighting="none",
+        ),
+        graph_encoder=GraphEncoderConfig(
+            d_graph=64,
+            n_graph_layers=1,
+            n_graph_heads=2,
+            max_evidence_nodes=5,
+            max_compositions=3,
+            max_anomalies=3,
+            max_reasoning_steps=3,
+            conditioning_method="cross_attention",
+            embed_confidence=False,
+            embed_temporal=False,
+        ),
+        tokenizer=TokenizerConfig(bpe_vocab_size=2000),
+        training=TrainingConfig(
+            batch_size=4,
+            learning_rate=1e-3,
+            max_steps=100,
+            warmup_steps=10,
+            use_amp=False,
+            num_workers=0,
+            grad_clip_norm=1.0,
+        ),
+        inference=InferenceConfig(n_steps=10),
+        model_name="aam-diffusion-v1.0",
+        output_dir=str(output_dir),
+        seed=42,
+    )
+    # ===== STEP 4: Create Model =====
+    logger.info("STEP 3: Creating model...")
+    model = AamDiffusionModel(config)
+    n_params = model.get_num_params()
+    logger.info(f"  Parameters: {model._format_params(n_params)} ({n_params:,})")
+    # ===== STEP 5: Create DataLoaders =====
+    logger.info("STEP 4: Creating dataloaders...")
+    train_dataset = GraphNarrativeDataset(
+        data_path=train_path, tokenizer=tokenizer,
+        max_seq_len=config.model.max_seq_len,
+        max_evidence=config.graph_encoder.max_evidence_nodes,
+        max_anomalies=config.graph_encoder.max_anomalies,
+        max_reasoning=config.graph_encoder.max_reasoning_steps,
+        augment=True,
+    )
+    val_dataset = GraphNarrativeDataset(
+        data_path=val_path, tokenizer=tokenizer,
+        max_seq_len=config.model.max_seq_len,
+        max_evidence=config.graph_encoder.max_evidence_nodes,
+        max_anomalies=config.graph_encoder.max_anomalies,
+        max_reasoning=config.graph_encoder.max_reasoning_steps,
+        augment=False,
+    )
+    train_loader = DataLoader(
+        train_dataset, batch_size=4, shuffle=True,
+        num_workers=0, collate_fn=collate_fn,
+    )
+    val_loader = DataLoader(
+        val_dataset, batch_size=4, shuffle=False,
+        num_workers=0, collate_fn=collate_fn,
+    )
+    # ===== STEP 6: Train =====
+    logger.info("STEP 5: Training...")
+    device = torch.device("cpu")
+    model.to(device)
+    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
+    max_steps = 100
+    start_time = time.time()
+    global_step = 0
+    train_losses = []
+    for epoch in range(50):  # Max epochs
+        model.train()
+        for batch in train_loader:
+            if global_step >= max_steps:
+                break
+            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v
+                     for k, v in batch.items()}
+            batch_size = batch["token_ids"].shape[0]
+            t = torch.randint(0, config.diffusion.n_timesteps, (batch_size,), device=device)
+            predicted, target = model(
+                token_ids=batch["token_ids"],
+                timestep=t,
+                evidence_ids=batch.get("evidence_ids"),
+                evidence_confidence=batch.get("evidence_confidence"),
+                anomaly_ids=batch.get("anomaly_ids"),
+                anomaly_confidence=batch.get("anomaly_confidence"),
+                reasoning_ids=batch.get("reasoning_ids"),
+                reasoning_confidence=batch.get("reasoning_confidence"),
+                source_trust=batch.get("source_trust"),
+            )
+            loss = model.compute_loss(predicted, target, t)
+            optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            optimizer.step()
+            train_losses.append(loss.item())
+            global_step += 1
+            if global_step % 10 == 0:
+                avg = sum(train_losses[-10:]) / len(train_losses[-10:])
+                elapsed = time.time() - start_time
+                logger.info(f"  Step {global_step}/{max_steps} | Loss: {avg:.4f} | Time: {elapsed:.1f}s")
+        if global_step >= max_steps:
+            break
+    # ===== STEP 7: Evaluate =====
+    logger.info("STEP 6: Evaluating...")
+    model.eval()
+    val_losses = []
+    with torch.no_grad():
+        for batch in val_loader:
+            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v
+                     for k, v in batch.items()}
+            batch_size = batch["token_ids"].shape[0]
+            t = torch.randint(0, config.diffusion.n_timesteps, (batch_size,), device=device)
+            predicted, target = model(
+                token_ids=batch["token_ids"],
+                timestep=t,
+                evidence_ids=batch.get("evidence_ids"),
+                evidence_confidence=batch.get("evidence_confidence"),
+                anomaly_ids=batch.get("anomaly_ids"),
+                anomaly_confidence=batch.get("anomaly_confidence"),
+                reasoning_ids=batch.get("reasoning_ids"),
+                reasoning_confidence=batch.get("reasoning_confidence"),
+                source_trust=batch.get("source_trust"),
+            )
+            loss = model.compute_loss(predicted, target, t)
+            val_losses.append(loss.item())
+    avg_val_loss = sum(val_losses) / len(val_losses) if val_losses else 0
+    logger.info(f"  Val loss: {avg_val_loss:.4f}")
+    # ===== STEP 8: Save =====
+    logger.info("STEP 7: Saving model...")
+    # Save model
+    model_path = output_dir / "model.pt"
+    torch.save({
+        "model_state_dict": model.state_dict(),
+        "config": config.to_dict(),
+    }, model_path)
+    # Save tokenizer (already saved)
+    # Save config
+    config.to_json(output_dir / "config.json")
+    elapsed = time.time() - start_time
+    logger.info(f"\n  DONE! {global_step} steps in {elapsed:.1f}s")
+    logger.info(f"  Final train loss: {train_losses[-1]:.4f}")
+    logger.info(f"  Val loss: {avg_val_loss:.4f}")
+    logger.info(f"  Parameters: {model._format_params(n_params)}")
+    logger.info(f"  Output: {output_dir}")
+    return model, tokenizer, config, output_dir
+if __name__ == "__main__":
+    main()

diffusion_llm/tests/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Tests for AAM Diffusion LLM framework."""

diffusion_llm/tests/test_model.py ADDED Viewed

	@@ -0,0 +1,239 @@

+"""Tests for AAM Diffusion Model components."""
+import torch
+import pytest
+from diffusion_llm.config.model_config import AamDiffusionConfig, get_default_config, ModelConfig
+from diffusion_llm.model.noise_scheduler import NoiseScheduler
+from diffusion_llm.model.graph_encoder import GraphConditioningEncoder, GraphEncoderConfig
+from diffusion_llm.model.diffusion_transformer import DiffusionTransformer
+from diffusion_llm.model.aam_diffusion_model import AamDiffusionModel
+from diffusion_llm.tokenizer.aam_tokenizer import AamTokenizer
+class TestConfig:
+    """Test configuration system."""
+    def test_default_config(self):
+        """Test default configuration creation."""
+        config = get_default_config("base")
+        assert config.model.d_model == 768
+        assert config.model.n_layers == 12
+        assert config.diffusion.n_timesteps == 1000
+    def test_tiny_config(self):
+        """Test tiny model configuration."""
+        config = get_default_config("tiny")
+        assert config.model.d_model == 256
+        assert config.model.n_layers == 4
+    def test_config_serialization(self, tmp_path):
+        """Test config save/load roundtrip."""
+        config = get_default_config("small")
+        path = tmp_path / "config.json"
+        config.to_json(path)
+        loaded = AamDiffusionConfig.from_json(path)
+        assert loaded.model.d_model == config.model.d_model
+        assert loaded.model.n_layers == config.model.n_layers
+    def test_param_estimation(self):
+        """Test parameter count estimation."""
+        config = ModelConfig(d_model=768, n_layers=12, d_ff=3072)
+        params = config.estimate_params()
+        assert "M" in params  # Should be in millions
+class TestTokenizer:
+    """Test AAM Tokenizer."""
+    def test_basic_encoding(self):
+        """Test basic text encoding."""
+        tokenizer = AamTokenizer()
+        # Train on sample text first
+        tokenizer.train(["Hello world this is a test", "Another test sentence"])
+        ids = tokenizer.encode("Hello world")
+        assert isinstance(ids, list)
+        assert len(ids) > 0
+        assert ids[0] == tokenizer.bos_id
+        assert ids[-1] == tokenizer.eos_id
+    def test_decode_roundtrip(self):
+        """Test encode/decode roundtrip."""
+        tokenizer = AamTokenizer()
+        texts = [
+            "Berdasarkan analisis, pencuri adalah Diancang.",
+            "Anomali terdeteksi dalam laporan Hefei.",
+            "Evidence: Ju Jangmok, Snow Plum Pill.",
+        ]
+        tokenizer.train(texts)
+        for text in texts:
+            ids = tokenizer.encode(text)
+            decoded = tokenizer.decode(ids, skip_special=True)
+            # Decoded text should contain key words
+            assert len(decoded) > 0
+    def test_special_tokens(self):
+        """Test special token IDs."""
+        tokenizer = AamTokenizer()
+        assert tokenizer.pad_id == 0
+        assert tokenizer.bos_id == 1
+        assert tokenizer.eos_id == 2
+    def test_sentence_boundaries(self):
+        """Test sentence boundary detection."""
+        tokenizer = AamTokenizer()
+        ids = [1, 10, 20, 5, 30, 40, 5, 50, 2]  # BOS, sent, sent, EOS
+        boundaries = tokenizer.get_sentence_boundaries(ids)
+        assert 3 in boundaries  # Index of <sent> token
+        assert 6 in boundaries
+    def test_save_load(self, tmp_path):
+        """Test tokenizer save/load."""
+        tokenizer = AamTokenizer()
+        tokenizer.train(["Test text for tokenizer", "Another training example"])
+        path = tmp_path / "tokenizer.json"
+        tokenizer.save(path)
+        loaded = AamTokenizer.load(path)
+        assert loaded.vocab_size == tokenizer.vocab_size
+        assert loaded.is_trained
+    def test_structure_encoding(self):
+        """Test encoding with graph structure tokens."""
+        tokenizer = AamTokenizer()
+        tokenizer.train(["Evidence text", "Anomaly description", "Reasoning step"])
+        ids = tokenizer.encode_with_structure(
+            text="Main narrative text",
+            evidence_nodes=["evidence1", "evidence2"],
+            anomalies=["anomaly1"],
+        )
+        assert isinstance(ids, list)
+        assert len(ids) > 0
+    def test_padding(self):
+        """Test sequence padding."""
+        tokenizer = AamTokenizer()
+        ids = [1, 2, 3]
+        padded = tokenizer.pad_sequence(ids, max_len=10)
+        assert len(padded) == 10
+        assert padded[3:] == [0] * 7  # Padded with pad_id
+class TestDiffusionTransformer:
+    """Test Diffusion Transformer model."""
+    def test_forward_pass(self):
+        """Test basic forward pass."""
+        config = ModelConfig(
+            d_model=128, n_layers=2, n_heads=4, d_ff=256,
+            vocab_size=1000, max_seq_len=64,
+        )
+        model = DiffusionTransformer(config)
+        x_t = torch.randn(2, 32, 128)  # batch=2, seq=32, d=128
+        t = torch.tensor([100, 500])
+        output = model(x_t=x_t, t=t)
+        assert output.shape == (2, 32, 128)
+    def test_with_graph_conditioning(self):
+        """Test forward pass with graph conditioning."""
+        config = ModelConfig(
+            d_model=128, n_layers=2, n_heads=4, d_ff=256,
+            vocab_size=1000, max_seq_len=64,
+        )
+        model = DiffusionTransformer(config)
+        x_t = torch.randn(2, 32, 128)
+        t = torch.tensor([100, 500])
+        graph_keys = torch.randn(2, 10, 128)  # 10 graph nodes
+        graph_values = torch.randn(2, 10, 128)
+        output = model(x_t=x_t, t=t, graph_keys=graph_keys, graph_values=graph_values)
+        assert output.shape == (2, 32, 128)
+class TestAamDiffusionModel:
+    """Test complete AAM Diffusion Model."""
+    def test_model_creation_tiny(self):
+        """Test creating a tiny model."""
+        config = get_default_config("tiny")
+        model = AamDiffusionModel(config)
+        n_params = model.get_num_params()
+        assert n_params > 0
+        assert n_params < 100e6  # Tiny should be under 100M
+    def test_forward_training(self):
+        """Test training forward pass."""
+        config = get_default_config("tiny")
+        model = AamDiffusionModel(config)
+        model.eval()
+        token_ids = torch.randint(0, config.model.vocab_size, (2, 32))
+        timestep = torch.randint(0, config.diffusion.n_timesteps, (2,))
+        with torch.no_grad():
+            predicted, noise = model(token_ids=token_ids, timestep=timestep)
+        assert predicted.shape == noise.shape
+    def test_loss_computation(self):
+        """Test loss computation."""
+        config = get_default_config("tiny")
+        model = AamDiffusionModel(config)
+        model.eval()
+        token_ids = torch.randint(0, config.model.vocab_size, (2, 32))
+        timestep = torch.randint(0, config.diffusion.n_timesteps, (2,))
+        with torch.no_grad():
+            predicted, noise = model(token_ids=token_ids, timestep=timestep)
+            loss = model.compute_loss(predicted, noise, timestep)
+        assert loss.item() >= 0
+        assert not torch.isnan(loss)
+    def test_save_load(self, tmp_path):
+        """Test model save/load."""
+        config = get_default_config("tiny")
+        model = AamDiffusionModel(config)
+        path = str(tmp_path / "model.pt")
+        model.save(path)
+        loaded = AamDiffusionModel.load(path)
+        assert loaded.config.model.d_model == config.model.d_model
+class TestGraphEncoder:
+    """Test Graph Conditioning Encoder."""
+    def test_evidence_encoding(self):
+        """Test encoding evidence nodes."""
+        config = GraphEncoderConfig(d_graph=128, n_graph_layers=2, n_graph_heads=4)
+        encoder = GraphConditioningEncoder(config, vocab_size=1000)
+        evidence_ids = torch.randint(0, 1000, (2, 5, 16))  # 2 batch, 5 nodes, 16 tokens each
+        evidence_conf = torch.tensor([[0.8, 0.6, 0.9, 0.7, 0.5],
+                                       [0.7, 0.8, 0.6, 0.9, 0.5]])
+        result = encoder(evidence_ids=evidence_ids, evidence_confidence=evidence_conf)
+        assert "keys" in result
+        assert "values" in result
+    def test_no_input(self):
+        """Test encoder with no graph data (should return zeros)."""
+        config = GraphEncoderConfig(d_graph=128, n_graph_layers=2, n_graph_heads=4)
+        encoder = GraphConditioningEncoder(config, vocab_size=1000)
+        result = encoder()
+        assert "keys" in result
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])

diffusion_llm/tests/test_scheduler.py ADDED Viewed

	@@ -0,0 +1,98 @@

+"""Tests for Noise Scheduler."""
+import torch
+import pytest
+from diffusion_llm.model.noise_scheduler import NoiseScheduler
+class TestNoiseScheduler:
+    """Test suite for the NoiseScheduler."""
+    def test_cosine_schedule(self):
+        """Test cosine noise schedule creation."""
+        scheduler = NoiseScheduler(n_timesteps=1000, schedule_type="cosine")
+        assert scheduler.betas.shape == (1000,)
+        assert (scheduler.betas > 0).all()
+        assert (scheduler.betas < 1).all()
+    def test_linear_schedule(self):
+        """Test linear noise schedule creation."""
+        scheduler = NoiseScheduler(n_timesteps=1000, schedule_type="linear")
+        assert scheduler.betas.shape == (1000,)
+        assert scheduler.betas[0] < scheduler.betas[-1]  # Increasing
+    def test_sigmoid_schedule(self):
+        """Test sigmoid noise schedule creation."""
+        scheduler = NoiseScheduler(n_timesteps=1000, schedule_type="sigmoid")
+        assert scheduler.betas.shape == (1000,)
+        assert (scheduler.betas > 0).all()
+    def test_add_noise(self):
+        """Test forward diffusion (adding noise)."""
+        scheduler = NoiseScheduler(n_timesteps=1000)
+        x_0 = torch.randn(2, 10, 64)  # batch=2, seq=10, d=64
+        noise = torch.randn_like(x_0)
+        t = torch.tensor([0, 500])
+        x_t = scheduler.add_noise(x_0, noise, t)
+        assert x_t.shape == x_0.shape
+        # At t=0, x_t should be close to x_0
+        # At t=500, x_t should be significantly different
+    def test_loss_target_epsilon(self):
+        """Test epsilon prediction target."""
+        scheduler = NoiseScheduler(prediction_type="epsilon")
+        x_0 = torch.randn(2, 10, 64)
+        noise = torch.randn_like(x_0)
+        t = torch.tensor([100, 500])
+        target = scheduler.compute_loss_target(x_0, noise, t)
+        assert torch.allclose(target, noise)
+    def test_loss_target_x0(self):
+        """Test x0 prediction target."""
+        scheduler = NoiseScheduler(prediction_type="x0")
+        x_0 = torch.randn(2, 10, 64)
+        noise = torch.randn_like(x_0)
+        t = torch.tensor([100, 500])
+        target = scheduler.compute_loss_target(x_0, noise, t)
+        assert torch.allclose(target, x_0)
+    def test_predict_x0_from_epsilon(self):
+        """Test x0 prediction from epsilon."""
+        scheduler = NoiseScheduler(prediction_type="epsilon")
+        x_0 = torch.randn(2, 10, 64)
+        noise = torch.randn_like(x_0)
+        t = torch.tensor([100])
+        x_t = scheduler.add_noise(x_0, noise, t)
+        x_0_pred = scheduler.predict_x0_from_epsilon(x_t, noise, t)
+        # Should be close to original x_0
+        assert x_0_pred.shape == x_0.shape
+    def test_ddpm_step(self):
+        """Test single DDPM reverse step."""
+        scheduler = NoiseScheduler(n_timesteps=1000)
+        x_t = torch.randn(2, 10, 64)
+        model_output = torch.randn_like(x_t)
+        t = torch.tensor([500, 500])
+        x_prev = scheduler.step_ddpm(model_output, x_t, t)
+        assert x_prev.shape == x_t.shape
+    def test_ddim_step(self):
+        """Test single DDIM reverse step."""
+        scheduler = NoiseScheduler(n_timesteps=1000)
+        x_t = torch.randn(2, 10, 64)
+        model_output = torch.randn_like(x_t)
+        x_prev = scheduler.step_ddim(model_output, x_t, t=500, t_prev=400)
+        assert x_prev.shape == x_t.shape
+    def test_timestep_schedule(self):
+        """Test inference timestep schedule."""
+        scheduler = NoiseScheduler(n_timesteps=1000)
+        schedule = scheduler.get_timestep_schedule(n_inference_steps=50)
+        assert len(schedule) > 0
+        assert schedule[0] > schedule[-1]  # Descending order

diffusion_llm/tokenizer/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""Tokenizer module for AAM Diffusion LLM."""
+from diffusion_llm.tokenizer.aam_tokenizer import AamTokenizer
+__all__ = ["AamTokenizer"]

diffusion_llm/tokenizer/aam_tokenizer.py ADDED Viewed

	@@ -0,0 +1,596 @@

+"""
+AAM Diffusion LLM — Tokenizer
+Sentence-level + subword BPE hybrid tokenizer designed specifically
+for AAM's sentence arrangement task.
+Unlike standard tokenizers (GPT-2 BPE, SentencePiece) that tokenize
+at the subword level, AAM's tokenizer is designed with SENTENCE
+ARRANGEMENT in mind:
+1. Sentences are the primary unit of generation (not individual tokens)
+2. Within sentences, subword BPE handles individual words
+3. Special tokens for graph structure (evidence, anomaly, confidence)
+4. Sentence boundary markers for the diffusion model
+The tokenizer maintains two levels:
+- Sentence level: Where sentences begin/end, for the diffusion model
+  to arrange and revise non-sequentially
+- Token level: Subword units within sentences, for detailed generation
+Analogi: Jin Soun tidak berpikir dalam kata-per-kata — dia
+berpikir dalam KALIMAT. "Pencuri = Diancang pair. Ju Jangmok = cover."
+Setiap kalimat sudah utuh, yang dia susun adalah URUTAN kalimat.
+"""
+from __future__ import annotations
+import json
+import re
+import unicodedata
+from collections import Counter
+from pathlib import Path
+from typing import Optional
+from diffusion_llm.config.model_config import TokenizerConfig
+# Special tokens, listed in ID order: the position of each entry in this list
+# is the vocabulary ID it is given when the tokenizer registers its special
+# tokens, so they always occupy the first IDs. Reordering or inserting entries
+# changes those IDs.
+SPECIAL_TOKENS: list[str] = [
+    "<pad>",        # 0
+    "<bos>",        # 1
+    "<eos>",        # 2
+    "<mask>",       # 3
+    "<noise>",      # 4
+    "<sent>",       # 5 - sentence boundary
+    "<evidence>",   # 6
+    "<anomaly>",    # 7
+    "<confidence>", # 8
+    "<reasoning>",  # 9
+    "<composition>",# 10
+    "<temporal>",   # 11
+    "<unk>",        # 12
+]
+class AamTokenizer:
+    """AAM Sentence-Level + Subword BPE Hybrid Tokenizer.
+    This tokenizer is specifically designed for the AAM Diffusion LLM:
+    - It understands sentence boundaries (<sent> tokens)
+    - It has special tokens for graph structure
+    - It uses BPE for subword tokenization within sentences
+    - It can encode/decode both plain text and graph-conditioned text
+    Usage:
+        tokenizer = AamTokenizer()
+        tokenizer.train(texts, vocab_size=28000)
+        # Encode text
+        ids = tokenizer.encode("Berdasarkan analisis, pencuri adalah Diancang.")
+        # Decode back
+        text = tokenizer.decode(ids)
+        # With graph structure tokens
+        ids = tokenizer.encode_with_structure(
+            "Pencuri = Diancang pair",
+            evidence_nodes=["hefei", "diancang"],
+            anomalies=["no external pill consumption"],
+        )
+    """
+    def __init__(self, config: Optional[TokenizerConfig] = None):
+        """Initialize the tokenizer.
+        Args:
+            config: Tokenizer configuration. Uses defaults if None.
+        """
+        self.config = config or TokenizerConfig()
+        # Build initial vocabulary with special tokens
+        self.vocab: dict[str, int] = {}
+        self.id_to_token: dict[int, str] = {}
+        self._init_special_tokens()
+        # BPE merges (learned during training)
+        self.merges: dict[tuple[str, str], int] = {}
+        self._bpe_cache: dict[str, str] = {}
+        # Compiled patterns
+        self._sentence_pattern = re.compile(
+            r'(?<=[.!?])\s+|(?<=\n)\s*'
+        )
+        self._word_pattern = re.compile(
+            r'\w+|[^\w\s]'
+        )
+        # Flag
+        self._is_trained = False
+    def _init_special_tokens(self) -> None:
+        """Initialize special tokens in vocabulary."""
+        for i, token in enumerate(SPECIAL_TOKENS):
+            self.vocab[token] = i
+            self.id_to_token[i] = token
+    @property
+    def pad_id(self) -> int:
+        """ID of the padding token named by ``config.pad_token`` (KeyError if absent from the vocab)."""
+        return self.vocab[self.config.pad_token]
+    @property
+    def bos_id(self) -> int:
+        """ID of the beginning-of-sequence token named by ``config.bos_token``."""
+        return self.vocab[self.config.bos_token]
+    @property
+    def eos_id(self) -> int:
+        """ID of the end-of-sequence token named by ``config.eos_token``."""
+        return self.vocab[self.config.eos_token]
+    @property
+    def mask_id(self) -> int:
+        """ID of the mask token named by ``config.mask_token``."""
+        return self.vocab[self.config.mask_token]
+    @property
+    def noise_id(self) -> int:
+        """ID of the noise token named by ``config.noise_token``."""
+        return self.vocab[self.config.noise_token]
+    @property
+    def sent_id(self) -> int:
+        """ID of the sentence-boundary token named by ``config.sentence_boundary_token``."""
+        return self.vocab[self.config.sentence_boundary_token]
+    @property
+    def unk_id(self) -> int:
+        """ID of ``<unk>``; falls back to the last special-token index if it is missing from the vocab."""
+        return self.vocab.get("<unk>", len(SPECIAL_TOKENS) - 1)
+    @property
+    def vocab_size(self) -> int:
+        """Current vocabulary size."""
+        return len(self.vocab)
+    @property
+    def is_trained(self) -> bool:
+        """Whether the tokenizer has been trained."""
+        return self._is_trained
+    def train(
+        self,
+        texts: list[str],
+        vocab_size: Optional[int] = None,
+    ) -> None:
+        """Train the BPE tokenizer on a corpus.
+        Args:
+            texts: List of training texts.
+            vocab_size: Target vocabulary size. Uses config if None.
+        """
+        target_vocab = vocab_size or self.config.bpe_vocab_size
+        # Step 1: Pre-tokenize into words
+        word_freqs: Counter = Counter()
+        for text in texts:
+            words = self._pre_tokenize(text)
+            for word in words:
+                word_freqs[word] += 1
+        # Step 2: Initialize character-level vocabulary
+        char_vocab: set[str] = set()
+        for word in word_freqs:
+            for char in word:
+                char_vocab.add(char)
+        # Add character tokens to vocabulary
+        for char in sorted(char_vocab):
+            if char not in self.vocab:
+                idx = len(self.vocab)
+                self.vocab[char] = idx
+                self.id_to_token[idx] = char
+        # Step 3: Split words into character sequences
+        word_splits: dict[str, list[str]] = {}
+        for word in word_freqs:
+            word_splits[word] = list(word)
+            # Add end-of-word marker
+            if len(word_splits[word]) > 1:
+                word_splits[word][-1] = word_splits[word][-1] + "</w>"
+        # Step 4: Learn BPE merges
+        n_merges = target_vocab - len(self.vocab)
+        for i in range(n_merges):
+            # Count pairs
+            pair_freqs: Counter = Counter()
+            for word, freq in word_freqs.items():
+                symbols = word_splits.get(word, [])
+                for j in range(len(symbols) - 1):
+                    pair = (symbols[j], symbols[j + 1])
+                    pair_freqs[pair] += freq
+            if not pair_freqs:
+                break
+            # Find most frequent pair
+            best_pair = pair_freqs.most_common(1)[0][0]
+            # Record merge
+            self.merges[best_pair] = i
+            # Apply merge
+            new_symbol = best_pair[0] + best_pair[1]
+            for word in word_splits:
+                symbols = word_splits[word]
+                new_symbols = []
+                j = 0
+                while j < len(symbols):
+                    if (
+                        j < len(symbols) - 1
+                        and symbols[j] == best_pair[0]
+                        and symbols[j + 1] == best_pair[1]
+                    ):
+                        new_symbols.append(new_symbol)
+                        j += 2
+                    else:
+                        new_symbols.append(symbols[j])
+                        j += 1
+                word_splits[word] = new_symbols
+            # Add merged token to vocabulary
+            if new_symbol not in self.vocab:
+                idx = len(self.vocab)
+                self.vocab[new_symbol] = idx
+                self.id_to_token[idx] = new_symbol
+        self._is_trained = True
+        self._bpe_cache.clear()
+    def _pre_tokenize(self, text: str) -> list[str]:
+        """Pre-tokenize text into words.
+        Args:
+            text: Input text.
+        Returns:
+            List of words.
+        """
+        # Normalize unicode
+        text = unicodedata.normalize("NFC", text)
+        # Split into words and punctuation
+        words = self._word_pattern.findall(text.lower())
+        return words
+    def _bpe_encode(self, word: str) -> list[str]:
+        """Apply BPE to a single word.
+        Args:
+            word: Input word (lowercase).
+        Returns:
+            List of BPE tokens.
+        """
+        if word in self._bpe_cache:
+            return self._bpe_cache[word].split()
+        # Start with character-level split
+        symbols = list(word)
+        if len(symbols) > 1:
+            symbols[-1] = symbols[-1] + "</w>"
+        # Apply merges in order
+        while len(symbols) > 1:
+            # Find the pair with the lowest merge rank
+            best_pair = None
+            best_rank = float("inf")
+            for i in range(len(symbols) - 1):
+                pair = (symbols[i], symbols[i + 1])
+                rank = self.merges.get(pair, float("inf"))
+                if rank < best_rank:
+                    best_rank = rank
+                    best_pair = pair
+            if best_pair is None or best_rank == float("inf"):
+                break
+            # Apply merge
+            new_symbol = best_pair[0] + best_pair[1]
+            new_symbols = []
+            i = 0
+            while i < len(symbols):
+                if (
+                    i < len(symbols) - 1
+                    and symbols[i] == best_pair[0]
+                    and symbols[i + 1] == best_pair[1]
+                ):
+                    new_symbols.append(new_symbol)
+                    i += 2
+                else:
+                    new_symbols.append(symbols[i])
+                    i += 1
+            symbols = new_symbols
+        # Cache result
+        self._bpe_cache[word] = " ".join(symbols)
+        return symbols
+    def encode(self, text: str, add_special: bool = True) -> list[int]:
+        """Encode text to token IDs.
+        The encoding process:
+        1. Split text into sentences
+        2. Insert sentence boundary tokens between sentences
+        3. BPE-encode each word within sentences
+        4. Add BOS/EOS tokens if requested
+        Args:
+            text: Input text.
+            add_special: Whether to add BOS/EOS tokens.
+        Returns:
+            List of token IDs.
+        """
+        ids = []
+        if add_special:
+            ids.append(self.bos_id)
+        # Split into sentences
+        sentences = self._split_sentences(text)
+        for i, sentence in enumerate(sentences):
+            if i > 0:
+                ids.append(self.sent_id)  # Sentence boundary
+            # Tokenize words in the sentence
+            words = self._pre_tokenize(sentence)
+            for word in words:
+                if self._is_trained:
+                    bpe_tokens = self._bpe_encode(word)
+                    for token in bpe_tokens:
+                        if token in self.vocab:
+                            ids.append(self.vocab[token])
+                        else:
+                            ids.append(self.unk_id)
+                else:
+                    # Fallback: character-level encoding
+                    for char in word:
+                        if char in self.vocab:
+                            ids.append(self.vocab[char])
+                        else:
+                            ids.append(self.unk_id)
+        if add_special:
+            ids.append(self.eos_id)
+        return ids
+    def encode_with_structure(
+        self,
+        text: str,
+        evidence_nodes: Optional[list[str]] = None,
+        compositions: Optional[list[str]] = None,
+        anomalies: Optional[list[str]] = None,
+        reasoning_steps: Optional[list[str]] = None,
+        confidence: Optional[float] = None,
+    ) -> list[int]:
+        """Encode text with graph structure tokens.
+        Adds structural tokens that represent the graph conditioning,
+        so the model knows what kind of evidence/anomalies it's
+        generating from.
+        Args:
+            text: The narrative text.
+            evidence_nodes: List of evidence node labels.
+            compositions: List of composition descriptions.
+            anomalies: List of anomaly descriptions.
+            reasoning_steps: List of reasoning step descriptions.
+            confidence: Overall confidence score.
+        Returns:
+            List of token IDs with structure tokens.
+        """
+        ids = [self.bos_id]
+        # Evidence section
+        if evidence_nodes:
+            ids.append(self.vocab["<evidence>"])
+            for node in evidence_nodes:
+                node_ids = self.encode(node, add_special=False)
+                ids.extend(node_ids)
+            ids.append(self.vocab["<evidence>"])  # Close section
+        # Anomaly section
+        if anomalies:
+            ids.append(self.vocab["<anomaly>"])
+            for anomaly in anomalies:
+                anom_ids = self.encode(anomaly, add_special=False)
+                ids.extend(anom_ids)
+            ids.append(self.vocab["<anomaly>"])
+        # Reasoning section
+        if reasoning_steps:
+            ids.append(self.vocab["<reasoning>"])
+            for step in reasoning_steps:
+                step_ids = self.encode(step, add_special=False)
+                ids.extend(step_ids)
+                ids.append(self.sent_id)
+            ids.append(self.vocab["<reasoning>"])
+        # Confidence
+        if confidence is not None:
+            ids.append(self.vocab["<confidence>"])
+            # Encode confidence as a token (discretized)
+            conf_bucket = min(int(confidence * 10), 9)
+            conf_token = f"<conf_{conf_bucket}>"
+            if conf_token in self.vocab:
+                ids.append(self.vocab[conf_token])
+        # Composition section
+        if compositions:
+            ids.append(self.vocab["<composition>"])
+            for comp in compositions:
+                comp_ids = self.encode(comp, add_special=False)
+                ids.extend(comp_ids)
+                ids.append(self.sent_id)
+            ids.append(self.vocab["<composition>"])
+        # Main narrative
+        narrative_ids = self.encode(text, add_special=False)
+        ids.extend(narrative_ids)
+        ids.append(self.eos_id)
+        return ids
+    def decode(self, ids: list[int], skip_special: bool = False) -> str:
+        """Decode token IDs back to text.
+        Args:
+            ids: List of token IDs.
+            skip_special: Whether to skip special tokens in output.
+        Returns:
+            Decoded text string.
+        """
+        special_ids = set()
+        if skip_special:
+            for token in SPECIAL_TOKENS:
+                if token in self.vocab:
+                    special_ids.add(self.vocab[token])
+        tokens = []
+        for id_ in ids:
+            if skip_special and id_ in special_ids:
+                continue
+            if id_ in self.id_to_token:
+                tokens.append(self.id_to_token[id_])
+            else:
+                tokens.append("<unk>")
+        # Join and clean up BPE tokens
+        text = "".join(tokens)
+        text = text.replace("</w>", " ")
+        # Clean up sentence boundaries
+        text = text.replace("<sent>", ". ")
+        # Clean up multiple spaces
+        text = re.sub(r'\s+', ' ', text).strip()
+        return text
+    def _split_sentences(self, text: str) -> list[str]:
+        """Split text into sentences.
+        Args:
+            text: Input text.
+        Returns:
+            List of sentence strings.
+        """
+        sentences = self._sentence_pattern.split(text)
+        return [s.strip() for s in sentences if s.strip()]
+    def pad_sequence(
+        self,
+        ids: list[int],
+        max_len: int,
+        pad_id: Optional[int] = None,
+    ) -> list[int]:
+        """Pad a sequence to max_len.
+        Args:
+            ids: Token IDs.
+            max_len: Target length.
+            pad_id: Padding token ID. Uses config if None.
+        Returns:
+            Padded sequence.
+        """
+        padding_id = pad_id if pad_id is not None else self.pad_id
+        if len(ids) >= max_len:
+            return ids[:max_len]
+        return ids + [padding_id] * (max_len - len(ids))
+    def get_sentence_boundaries(self, ids: list[int]) -> list[int]:
+        """Find sentence boundary positions in a token sequence.
+        This is used by the diffusion model to identify which tokens
+        belong to which sentence, enabling non-sequential generation
+        and revision at the sentence level.
+        Args:
+            ids: Token IDs.
+        Returns:
+            List of indices where sentence boundaries occur.
+        """
+        boundaries = []
+        for i, id_ in enumerate(ids):
+            if id_ == self.sent_id:
+                boundaries.append(i)
+        return boundaries
+    def save(self, path: str | Path) -> None:
+        """Save tokenizer to file.
+        Args:
+            path: Output file path (JSON).
+        """
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        data = {
+            "config": {
+                "bpe_vocab_size": self.config.bpe_vocab_size,
+                "max_sentences": self.config.max_sentences,
+                "sentence_boundary_token": self.config.sentence_boundary_token,
+                "pad_token": self.config.pad_token,
+                "bos_token": self.config.bos_token,
+                "eos_token": self.config.eos_token,
+                "mask_token": self.config.mask_token,
+                "noise_token": self.config.noise_token,
+                "min_frequency": self.config.min_frequency,
+            },
+            "vocab": self.vocab,
+            "merges": {f"{k[0]}|||{k[1]}": v for k, v in self.merges.items()},
+            "is_trained": self._is_trained,
+        }
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+    @classmethod
+    def load(cls, path: str | Path) -> AamTokenizer:
+        """Load tokenizer from file.
+        Args:
+            path: Input file path (JSON).
+        Returns:
+            Loaded AamTokenizer.
+        """
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        config = TokenizerConfig(**data.get("config", {}))
+        tokenizer = cls(config=config)
+        # Restore vocabulary
+        tokenizer.vocab = data["vocab"]
+        tokenizer.id_to_token = {int(v): k for k, v in data["vocab"].items()}
+        # Restore merges
+        tokenizer.merges = {}
+        for k_str, v in data.get("merges", {}).items():
+            parts = k_str.split("|||")
+            tokenizer.merges[(parts[0], parts[1])] = v
+        tokenizer._is_trained = data.get("is_trained", False)
+        return tokenizer
+    def __len__(self) -> int:
+        return self.vocab_size
+    def __repr__(self) -> str:
+        status = "trained" if self._is_trained else "untrained"
+        return (
+            f"AamTokenizer(vocab_size={self.vocab_size}, "
+            f"merges={len(self.merges)}, status={status})"
+        )

diffusion_llm/training/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+"""Training module for AAM Diffusion LLM."""
+from diffusion_llm.training.trainer import AamTrainer
+from diffusion_llm.training.dataset import GraphNarrativeDataset
+from diffusion_llm.training.losses import DiffusionLoss, compute_loss
+__all__ = ["AamTrainer", "GraphNarrativeDataset", "DiffusionLoss", "compute_loss"]

diffusion_llm/training/dataset.py ADDED Viewed

	@@ -0,0 +1,371 @@

+"""
+AAM Diffusion LLM — Dataset
+Dataset class for Graph→Narrative training pairs.
+Each training example consists of:
+    - Graph conditioning: evidence nodes, compositions, confidence,
+      anomalies, reasoning chains, temporal context
+    - Target narrative: natural language text that represents
+      the graph data in sentence form
+The dataset handles:
+    - Loading from JSONL files
+    - Tokenization of both graph data and narratives
+    - Padding and batching
+    - Data augmentation (sentence shuffling, noise injection)
+Analogi: Seperti Jin Soun berlatih mengungkapkan kesimpulan —
+dia diberi "kasus" (graph data) dan "jawaban yang benar"
+(narrative target), lalu berlatih sampai bisa menyusun
+kalimat yang tepat dari graph.
+"""
+from __future__ import annotations
+import json
+import logging
+import random
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+import torch
+from torch.utils.data import Dataset
+from diffusion_llm.tokenizer.aam_tokenizer import AamTokenizer
+logger = logging.getLogger(__name__)
+@dataclass
+class GraphNarrativeExample:
+    """A single training example: graph conditioning + target narrative.
+    This represents the "input" and "expected output" for one
+    training step of the diffusion model.
+    """
+    # Target narrative (what the model should generate)
+    narrative: str = ""
+    # Graph conditioning inputs
+    trigger: str = ""
+    evidence_nodes: list[str] = field(default_factory=list)
+    compositions: list[str] = field(default_factory=list)
+    confidence_map: dict[str, float] = field(default_factory=dict)
+    anomalies: list[str] = field(default_factory=list)
+    reasoning_steps: list[str] = field(default_factory=list)
+    source_trust: float = 1.0
+    temporal_context: list[str] = field(default_factory=list)
+    # Metadata
+    language: str = "id"
+    source: str = "synthetic"
+    def to_dict(self) -> dict:
+        """Serialize to dictionary."""
+        return {
+            "narrative": self.narrative,
+            "trigger": self.trigger,
+            "evidence_nodes": self.evidence_nodes,
+            "compositions": self.compositions,
+            "confidence_map": self.confidence_map,
+            "anomalies": self.anomalies,
+            "reasoning_steps": self.reasoning_steps,
+            "source_trust": self.source_trust,
+            "temporal_context": self.temporal_context,
+            "language": self.language,
+            "source": self.source,
+        }
+    @classmethod
+    def from_dict(cls, data: dict) -> GraphNarrativeExample:
+        """Deserialize from dictionary."""
+        return cls(
+            narrative=data.get("narrative", ""),
+            trigger=data.get("trigger", ""),
+            evidence_nodes=data.get("evidence_nodes", []),
+            compositions=data.get("compositions", []),
+            confidence_map=data.get("confidence_map", {}),
+            anomalies=data.get("anomalies", []),
+            reasoning_steps=data.get("reasoning_steps", []),
+            source_trust=data.get("source_trust", 1.0),
+            temporal_context=data.get("temporal_context", []),
+            language=data.get("language", "id"),
+            source=data.get("source", "synthetic"),
+        )
+@dataclass
+class BatchOutput:
+    """Output from a single batch.
+    All tensors are already padded to uniform length.
+    """
+    token_ids: torch.Tensor
+    """Target narrative token IDs, shape (batch, seq_len)."""
+    evidence_ids: Optional[torch.Tensor] = None
+    """Evidence node token IDs, shape (batch, n_evidence, ev_seq_len)."""
+    evidence_confidence: Optional[torch.Tensor] = None
+    """Evidence confidence, shape (batch, n_evidence)."""
+    anomaly_ids: Optional[torch.Tensor] = None
+    """Anomaly token IDs, shape (batch, n_anomalies, an_seq_len)."""
+    anomaly_confidence: Optional[torch.Tensor] = None
+    """Anomaly confidence, shape (batch, n_anomalies)."""
+    reasoning_ids: Optional[torch.Tensor] = None
+    """Reasoning step token IDs, shape (batch, n_steps, r_seq_len)."""
+    reasoning_confidence: Optional[torch.Tensor] = None
+    """Reasoning confidence, shape (batch, n_steps)."""
+    source_trust: Optional[torch.Tensor] = None
+    """Source trust scores, shape (batch,)."""
+class GraphNarrativeDataset(Dataset):
+    """Dataset for Graph→Narrative training pairs.
+    Loads training examples from JSONL files and provides
+    tokenized, padded batches for training.
+    Args:
+        data_path: Path to JSONL file with training data.
+        tokenizer: AamTokenizer instance for encoding.
+        max_seq_len: Maximum sequence length for narratives.
+        max_evidence: Maximum number of evidence nodes.
+        max_anomalies: Maximum number of anomalies.
+        max_reasoning: Maximum number of reasoning steps.
+        augment: Whether to apply data augmentation.
+    """
+    def __init__(
+        self,
+        data_path: str | Path,
+        tokenizer: AamTokenizer,
+        max_seq_len: int = 512,
+        max_evidence: int = 50,
+        max_anomalies: int = 10,
+        max_reasoning: int = 15,
+        augment: bool = True,
+    ):
+        self.data_path = Path(data_path)
+        self.tokenizer = tokenizer
+        self.max_seq_len = max_seq_len
+        self.max_evidence = max_evidence
+        self.max_anomalies = max_anomalies
+        self.max_reasoning = max_reasoning
+        self.augment = augment
+        # Load data
+        self.examples: list[GraphNarrativeExample] = []
+        self._load_data()
+        logger.info(
+            "GraphNarrativeDataset: %d examples loaded from %s",
+            len(self.examples),
+            self.data_path,
+        )
+    def _load_data(self) -> None:
+        """Load examples from JSONL file."""
+        if not self.data_path.exists():
+            logger.warning("Data file not found: %s", self.data_path)
+            return
+        with open(self.data_path, "r", encoding="utf-8") as f:
+            for line_num, line in enumerate(f, 1):
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    data = json.loads(line)
+                    example = GraphNarrativeExample.from_dict(data)
+                    if example.narrative:  # Skip empty narratives
+                        self.examples.append(example)
+                except json.JSONDecodeError:
+                    logger.warning("Invalid JSON at line %d", line_num)
+    def __len__(self) -> int:
+        return len(self.examples)
+    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
+        """Get a single training example.
+        Returns:
+            Dictionary with tokenized inputs.
+        """
+        example = self.examples[idx]
+        # Data augmentation
+        if self.augment:
+            example = self._augment(example)
+        # Tokenize narrative (target)
+        narrative_ids = self.tokenizer.encode(example.narrative, add_special=True)
+        narrative_ids = self.tokenizer.pad_sequence(narrative_ids, self.max_seq_len)
+        narrative_tensor = torch.tensor(narrative_ids, dtype=torch.long)
+        # Tokenize evidence nodes
+        evidence_data = self._tokenize_node_list(
+            example.evidence_nodes, max_nodes=self.max_evidence
+        )
+        # Tokenize anomalies
+        anomaly_data = self._tokenize_node_list(
+            example.anomalies, max_nodes=self.max_anomalies
+        )
+        # Tokenize reasoning steps
+        reasoning_data = self._tokenize_node_list(
+            example.reasoning_steps, max_nodes=self.max_reasoning
+        )
+        # Source trust
+        source_trust = torch.tensor(example.source_trust, dtype=torch.float32)
+        # Evidence confidence
+        conf_values = list(example.confidence_map.values())[:self.max_evidence]
+        if conf_values:
+            evidence_conf = torch.tensor(conf_values, dtype=torch.float32)
+            evidence_conf = torch.nn.functional.pad(
+                evidence_conf, (0, self.max_evidence - len(conf_values))
+            )
+        else:
+            evidence_conf = torch.zeros(self.max_evidence, dtype=torch.float32)
+        # Anomaly confidence (default 0.6 for detected anomalies)
+        anomaly_conf = torch.full(
+            (self.max_anomalies,), 0.6, dtype=torch.float32
+        )
+        # Reasoning confidence (default 0.7)
+        reasoning_conf = torch.full(
+            (self.max_reasoning,), 0.7, dtype=torch.float32
+        )
+        return {
+            "token_ids": narrative_tensor,
+            "evidence_ids": evidence_data["ids"],
+            "evidence_confidence": evidence_conf,
+            "anomaly_ids": anomaly_data["ids"],
+            "anomaly_confidence": anomaly_conf,
+            "reasoning_ids": reasoning_data["ids"],
+            "reasoning_confidence": reasoning_conf,
+            "source_trust": source_trust,
+        }
+    def _tokenize_node_list(
+        self,
+        nodes: list[str],
+        max_nodes: int,
+        max_node_len: int = 32,
+    ) -> dict[str, torch.Tensor]:
+        """Tokenize a list of node descriptions.
+        Args:
+            nodes: List of node text descriptions.
+            max_nodes: Maximum number of nodes to encode.
+            max_node_len: Maximum token length per node.
+        Returns:
+            Dictionary with padded token IDs tensor.
+        """
+        if not nodes:
+            return {
+                "ids": torch.zeros(max_nodes, max_node_len, dtype=torch.long),
+            }
+        # Limit to max_nodes
+        nodes = nodes[:max_nodes]
+        all_ids = []
+        for node in nodes:
+            ids = self.tokenizer.encode(node, add_special=False)
+            ids = self.tokenizer.pad_sequence(ids, max_node_len)
+            all_ids.append(ids)
+        # Pad to max_nodes
+        while len(all_ids) < max_nodes:
+            all_ids.append([0] * max_node_len)
+        return {
+            "ids": torch.tensor(all_ids, dtype=torch.long),
+        }
+    def _augment(self, example: GraphNarrativeExample) -> GraphNarrativeExample:
+        """Apply data augmentation.
+        Augmentation strategies:
+        1. Random sentence shuffling within the narrative
+        2. Random evidence node dropping (simulate incomplete data)
+        3. Random confidence perturbation
+        Args:
+            example: Original training example.
+        Returns:
+            Augmented example.
+        """
+        import copy
+        augmented = copy.deepcopy(example)
+        # 1. Sentence shuffling (with 20% probability)
+        if random.random() < 0.2:
+            sentences = self.tokenizer._split_sentences(augmented.narrative)
+            if len(sentences) > 2:
+                # Keep first sentence, shuffle the rest
+                first = sentences[0]
+                rest = sentences[1:]
+                random.shuffle(rest)
+                augmented.narrative = first + " " + " ".join(rest)
+        # 2. Evidence dropping (with 10% probability per node)
+        if augmented.evidence_nodes:
+            augmented.evidence_nodes = [
+                node for node in augmented.evidence_nodes
+                if random.random() > 0.1
+            ]
+        # 3. Confidence perturbation
+        if augmented.confidence_map:
+            perturbed = {}
+            for k, v in augmented.confidence_map.items():
+                noise = random.gauss(0, 0.05)
+                perturbed[k] = max(0.0, min(1.0, v + noise))
+            augmented.confidence_map = perturbed
+        return augmented
+def collate_fn(batch: list[dict[str, torch.Tensor]]) -> dict[str, torch.Tensor]:
+    """Custom collate function for DataLoader.
+    Handles variable-length graph conditioning by padding
+    all tensors in the batch to the same size.
+    Args:
+        batch: List of example dictionaries.
+    Returns:
+        Batched dictionary of tensors.
+    """
+    result = {}
+    # Stack all tensors
+    for key in batch[0]:
+        tensors = [item[key] for item in batch]
+        if tensors[0].dim() == 0:
+            result[key] = torch.stack(tensors)
+        elif tensors[0].dim() == 1:
+            result[key] = torch.stack(tensors)
+        elif tensors[0].dim() == 2:
+            result[key] = torch.stack(tensors)
+        else:
+            result[key] = torch.stack(tensors)
+    return result

diffusion_llm/training/losses.py ADDED Viewed

	@@ -0,0 +1,127 @@

+"""
+AAM Diffusion LLM — Loss Functions
+Implements various loss functions for training the diffusion model,
+including MSE, MAE, Huber, and weighted variants.
+Analogi: Seperti Jin Soun mengukur seberapa jauh prediksinya
+dari kenyataan — semakin besar gap, semakin besar "rasa sakit"
+yang mendorong perbaikan.
+"""
+from __future__ import annotations
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusion_llm.config.model_config import DiffusionConfig
+class DiffusionLoss(nn.Module):
+    """Loss function for diffusion model training.
+    Computes the loss between predicted and target values,
+    with optional weighting strategies to balance training
+    across different noise levels.
+    Args:
+        config: DiffusionConfig with loss hyperparameters.
+    """
+    def __init__(self, config: DiffusionConfig):
+        super().__init__()
+        self.config = config
+    def forward(
+        self,
+        predicted: torch.Tensor,
+        target: torch.Tensor,
+        timestep: torch.Tensor,
+        alphas_cumprod: torch.Tensor,
+    ) -> torch.Tensor:
+        """Compute diffusion loss.
+        Args:
+            predicted: Model output (predicted noise/x0/v).
+            target: Target values.
+            timestep: Timestep indices for weighting.
+            alphas_cumprod: Cumulative product of alphas from scheduler.
+        Returns:
+            Scalar loss value.
+        """
+        # Base loss
+        if self.config.loss_type == "mse":
+            loss = F.mse_loss(predicted, target, reduction="none")
+        elif self.config.loss_type == "mae":
+            loss = F.l1_loss(predicted, target, reduction="none")
+        elif self.config.loss_type == "huber":
+            loss = F.smooth_l1_loss(predicted, target, reduction="none")
+        else:
+            raise ValueError(f"Unknown loss_type: {self.config.loss_type}")
+        # Average over feature dimension
+        loss = loss.mean(dim=-1)  # (batch, seq_len)
+        # Apply weighting
+        if self.config.loss_weighting == "min_snr":
+            loss = self._min_snr_weight(loss, timestep, alphas_cumprod)
+        elif self.config.loss_weighting == "p2":
+            loss = self._p2_weight(loss, timestep, alphas_cumprod)
+        return loss.mean()
+    def _min_snr_weight(
+        self,
+        loss: torch.Tensor,
+        timestep: torch.Tensor,
+        alphas_cumprod: torch.Tensor,
+        gamma: float = 5.0,
+    ) -> torch.Tensor:
+        """Min-SNR-gamma weighting (Hang et al., 2023)."""
+        snr = alphas_cumprod[timestep] / (1 - alphas_cumprod[timestep] + 1e-8)
+        weight = torch.clamp(snr, max=gamma) / (snr + 1e-8)
+        weight = weight.unsqueeze(-1).expand_as(loss)
+        return loss * weight
+    def _p2_weight(
+        self,
+        loss: torch.Tensor,
+        timestep: torch.Tensor,
+        alphas_cumprod: torch.Tensor,
+    ) -> torch.Tensor:
+        """P2 weighting (Choi et al., 2022)."""
+        snr = alphas_cumprod[timestep] / (1 - alphas_cumprod[timestep] + 1e-8)
+        weight = 1.0 / (snr ** self.config.p2_gamma + self.config.p2_k)
+        weight = weight.unsqueeze(-1).expand_as(loss)
+        return loss * weight
+def compute_loss(
+    predicted: torch.Tensor,
+    target: torch.Tensor,
+    timestep: torch.Tensor,
+    alphas_cumprod: torch.Tensor,
+    loss_type: str = "mse",
+    loss_weighting: str = "none",
+) -> torch.Tensor:
+    """Convenience function to compute diffusion loss without creating a module.
+    Args:
+        predicted: Model output.
+        target: Target values.
+        timestep: Timestep indices.
+        alphas_cumprod: Alpha cumulative products.
+        loss_type: Loss function type.
+        loss_weighting: Weighting strategy.
+    Returns:
+        Scalar loss value.
+    """
+    config = DiffusionConfig(
+        loss_type=loss_type,
+        loss_weighting=loss_weighting,
+    )
+    loss_fn = DiffusionLoss(config)
+    return loss_fn(predicted, target, timestep, alphas_cumprod)

diffusion_llm/training/trainer.py ADDED Viewed

	@@ -0,0 +1,420 @@

+"""
+AAM Diffusion LLM — Trainer
+Training loop for the AAM Diffusion Model.
+Handles:
+    - Training loop with gradient accumulation
+    - Learning rate scheduling with warmup
+    - Mixed precision training (AMP)
+    - EMA model updates
+    - Checkpoint saving/loading
+    - Logging to console and Weights & Biases
+    - Evaluation on validation set
+Analogi: Seperti latihan fisik Jin Soun — berulang-ulang,
+bertahap meningkat intensitas, dengan instruktur yang
+mengawasi dan memberi koreksi.
+"""
+from __future__ import annotations
+import json
+import logging
+import math
+import time
+from pathlib import Path
+from typing import Optional
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader
+from diffusion_llm.config.model_config import AamDiffusionConfig
+from diffusion_llm.model.aam_diffusion_model import AamDiffusionModel
+from diffusion_llm.training.dataset import GraphNarrativeDataset, collate_fn
+from diffusion_llm.tokenizer.aam_tokenizer import AamTokenizer
+from diffusion_llm.training.losses import DiffusionLoss
+logger = logging.getLogger(__name__)
+class AamTrainer:
+    """Trainer for the AAM Diffusion Model.
+    Args:
+        config: AamDiffusionConfig with training settings.
+        model: AamDiffusionModel instance.
+        tokenizer: AamTokenizer instance.
+        train_dataset: Training dataset.
+        val_dataset: Optional validation dataset.
+    """
+    def __init__(
+        self,
+        config: AamDiffusionConfig,
+        model: AamDiffusionModel,
+        tokenizer: AamTokenizer,
+        train_dataset: GraphNarrativeDataset,
+        val_dataset: Optional[GraphNarrativeDataset] = None,
+    ):
+        self.config = config
+        self.model = model
+        self.tokenizer = tokenizer
+        self.train_dataset = train_dataset
+        self.val_dataset = val_dataset
+        # Device
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu"
+        )
+        self.model.to(self.device)
+        logger.info("Training on device: %s", self.device)
+        # Optimizer
+        self.optimizer = torch.optim.AdamW(
+            self.model.parameters(),
+            lr=config.training.learning_rate,
+            weight_decay=config.training.weight_decay,
+            betas=(config.training.adam_beta1, config.training.adam_beta2),
+            eps=config.training.adam_eps,
+        )
+        # Loss function
+        self.loss_fn = DiffusionLoss(config.diffusion)
+        # Data loaders
+        self.train_loader = DataLoader(
+            train_dataset,
+            batch_size=config.training.batch_size,
+            shuffle=True,
+            num_workers=config.training.num_workers,
+            collate_fn=collate_fn,
+            pin_memory=True,
+        )
+        if val_dataset:
+            self.val_loader = DataLoader(
+                val_dataset,
+                batch_size=config.training.batch_size,
+                shuffle=False,
+                num_workers=config.training.num_workers,
+                collate_fn=collate_fn,
+                pin_memory=True,
+            )
+        else:
+            self.val_loader = None
+        # LR scheduler
+        self.scheduler = self._create_lr_scheduler()
+        # AMP
+        self.scaler = None
+        if config.training.use_amp:
+            dtype = torch.bfloat16 if config.training.amp_dtype == "bf16" else torch.float16
+            self.scaler = torch.amp.GradScaler("cuda", enabled=(dtype == torch.float16))
+        # EMA
+        self.ema_model = None
+        if config.training.use_ema:
+            self.ema_model = self._create_ema_model()
+        # State tracking
+        self.global_step = 0
+        self.best_val_loss = float("inf")
+        self.train_losses: list[float] = []
+        # Output directory
+        self.output_dir = Path(config.output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        # Seed
+        torch.manual_seed(config.seed)
+    def _create_lr_scheduler(self):
+        """Create learning rate scheduler with warmup."""
+        total_steps = self.config.training.max_steps
+        warmup_steps = self.config.training.warmup_steps
+        def lr_lambda(step: int) -> float:
+            if step < warmup_steps:
+                return step / max(warmup_steps, 1)
+            if self.config.training.lr_schedule == "cosine":
+                progress = (step - warmup_steps) / max(total_steps - warmup_steps, 1)
+                return 0.5 * (1.0 + math.cos(math.pi * progress))
+            elif self.config.training.lr_schedule == "linear":
+                progress = (step - warmup_steps) / max(total_steps - warmup_steps, 1)
+                return 1.0 - progress
+            else:
+                return 1.0
+        return torch.optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda)
+    def _create_ema_model(self) -> AamDiffusionModel:
+        """Create EMA copy of the model."""
+        import copy
+        ema = copy.deepcopy(self.model)
+        for param in ema.parameters():
+            param.requires_grad = False
+        return ema
+    @torch.no_grad()
+    def _update_ema(self) -> None:
+        """Update EMA model weights."""
+        if self.ema_model is None:
+            return
+        decay = self.config.training.ema_decay
+        for ema_param, model_param in zip(
+            self.ema_model.parameters(), self.model.parameters()
+        ):
+            ema_param.data.mul_(decay).add_(model_param.data, alpha=1 - decay)
+    def train(self) -> None:
+        """Main training loop.
+        Runs for max_steps or max_epochs, whichever comes first.
+        Saves checkpoints and runs evaluation periodically.
+        """
+        logger.info("Starting training...")
+        logger.info("  Max steps: %d", self.config.training.max_steps)
+        logger.info("  Batch size: %d", self.config.training.batch_size)
+        logger.info("  Gradient accumulation: %d", self.config.training.gradient_accumulation_steps)
+        logger.info("  Effective batch size: %d",
+                     self.config.training.batch_size * self.config.training.gradient_accumulation_steps)
+        start_time = time.time()
+        epoch = 0
+        while self.global_step < self.config.training.max_steps:
+            epoch += 1
+            if epoch > self.config.training.max_epochs:
+                break
+            logger.info("=== Epoch %d ===", epoch)
+            epoch_loss = 0.0
+            n_batches = 0
+            for batch_idx, batch in enumerate(self.train_loader):
+                loss = self._train_step(batch)
+                epoch_loss += loss
+                n_batches += 1
+                # Logging
+                if self.global_step % self.config.training.log_every_steps == 0:
+                    avg_loss = epoch_loss / max(n_batches, 1)
+                    lr = self.optimizer.param_groups[0]["lr"]
+                    elapsed = time.time() - start_time
+                    steps_per_sec = self.global_step / max(elapsed, 1)
+                    logger.info(
+                        "Step %d | Loss: %.4f | LR: %.2e | Speed: %.1f steps/s",
+                        self.global_step, loss, lr, steps_per_sec,
+                    )
+                # Evaluation
+                if (self.global_step % self.config.training.eval_every_steps == 0
+                        and self.val_loader is not None):
+                    val_loss = self.evaluate()
+                    logger.info("Validation loss: %.4f", val_loss)
+                    if val_loss < self.best_val_loss:
+                        self.best_val_loss = val_loss
+                        self._save_checkpoint("best.pt")
+                # Checkpoint
+                if self.global_step % self.config.training.save_every_steps == 0:
+                    self._save_checkpoint(f"step_{self.global_step}.pt")
+                # Stop condition
+                if self.global_step >= self.config.training.max_steps:
+                    break
+            avg_epoch_loss = epoch_loss / max(n_batches, 1)
+            logger.info("Epoch %d complete. Average loss: %.4f", epoch, avg_epoch_loss)
+        # Final save
+        self._save_checkpoint("final.pt")
+        elapsed = time.time() - start_time
+        logger.info(
+            "Training complete! %d steps in %.1f hours",
+            self.global_step, elapsed / 3600,
+        )
+    def _train_step(self, batch: dict[str, torch.Tensor]) -> float:
+        """Single training step.
+        Args:
+            batch: Batch of training data.
+        Returns:
+            Loss value for this step.
+        """
+        self.model.train()
+        # Move batch to device
+        batch = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v
+                 for k, v in batch.items()}
+        # Sample random timesteps
+        batch_size = batch["token_ids"].shape[0]
+        t = torch.randint(
+            0, self.config.diffusion.n_timesteps,
+            (batch_size,), device=self.device,
+        )
+        # Forward pass
+        if self.scaler is not None:
+            with torch.amp.autocast("cuda", enabled=True):
+                predicted, target = self.model(
+                    token_ids=batch["token_ids"],
+                    timestep=t,
+                    evidence_ids=batch.get("evidence_ids"),
+                    evidence_confidence=batch.get("evidence_confidence"),
+                    anomaly_ids=batch.get("anomaly_ids"),
+                    anomaly_confidence=batch.get("anomaly_confidence"),
+                    reasoning_ids=batch.get("reasoning_ids"),
+                    reasoning_confidence=batch.get("reasoning_confidence"),
+                    source_trust=batch.get("source_trust"),
+                )
+                loss = self.model.compute_loss(predicted, target, t)
+                loss = loss / self.config.training.gradient_accumulation_steps
+        else:
+            predicted, target = self.model(
+                token_ids=batch["token_ids"],
+                timestep=t,
+                evidence_ids=batch.get("evidence_ids"),
+                evidence_confidence=batch.get("evidence_confidence"),
+                anomaly_ids=batch.get("anomaly_ids"),
+                anomaly_confidence=batch.get("anomaly_confidence"),
+                reasoning_ids=batch.get("reasoning_ids"),
+                reasoning_confidence=batch.get("reasoning_confidence"),
+                source_trust=batch.get("source_trust"),
+            )
+            loss = self.model.compute_loss(predicted, target, t)
+            loss = loss / self.config.training.gradient_accumulation_steps
+        # Backward pass
+        if self.scaler is not None:
+            self.scaler.scale(loss).backward()
+        else:
+            loss.backward()
+        # Gradient accumulation
+        if (self.global_step + 1) % self.config.training.gradient_accumulation_steps == 0:
+            # Gradient clipping
+            if self.scaler is not None:
+                self.scaler.unscale_(self.optimizer)
+            torch.nn.utils.clip_grad_norm_(
+                self.model.parameters(),
+                self.config.training.grad_clip_norm,
+            )
+            # Optimizer step
+            if self.scaler is not None:
+                self.scaler.step(self.optimizer)
+                self.scaler.update()
+            else:
+                self.optimizer.step()
+            # LR schedule
+            self.scheduler.step()
+            # Zero gradients
+            self.optimizer.zero_grad()
+            # EMA update
+            self._update_ema()
+        self.global_step += 1
+        self.train_losses.append(loss.item())
+        return loss.item()
+    @torch.no_grad()
+    def evaluate(self) -> float:
+        """Evaluate on validation set.
+        Returns:
+            Average validation loss.
+        """
+        if self.val_loader is None:
+            return float("inf")
+        self.model.eval()
+        total_loss = 0.0
+        n_batches = 0
+        for batch in self.val_loader:
+            batch = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v
+                     for k, v in batch.items()}
+            batch_size = batch["token_ids"].shape[0]
+            t = torch.randint(
+                0, self.config.diffusion.n_timesteps,
+                (batch_size,), device=self.device,
+            )
+            predicted, target = self.model(
+                token_ids=batch["token_ids"],
+                timestep=t,
+                evidence_ids=batch.get("evidence_ids"),
+                evidence_confidence=batch.get("evidence_confidence"),
+                anomaly_ids=batch.get("anomaly_ids"),
+                anomaly_confidence=batch.get("anomaly_confidence"),
+                reasoning_ids=batch.get("reasoning_ids"),
+                reasoning_confidence=batch.get("reasoning_confidence"),
+                source_trust=batch.get("source_trust"),
+            )
+            loss = self.model.compute_loss(predicted, target, t)
+            total_loss += loss.item()
+            n_batches += 1
+        avg_loss = total_loss / max(n_batches, 1)
+        self.model.train()
+        return avg_loss
+    def _save_checkpoint(self, filename: str) -> None:
+        """Save training checkpoint.
+        Args:
+            filename: Checkpoint filename.
+        """
+        path = self.output_dir / filename
+        checkpoint = {
+            "model_state_dict": self.model.state_dict(),
+            "optimizer_state_dict": self.optimizer.state_dict(),
+            "scheduler_state_dict": self.scheduler.state_dict(),
+            "global_step": self.global_step,
+            "best_val_loss": self.best_val_loss,
+            "config": self.config.to_dict(),
+        }
+        if self.ema_model is not None:
+            checkpoint["ema_state_dict"] = self.ema_model.state_dict()
+        torch.save(checkpoint, path)
+        logger.info("Checkpoint saved: %s", path)
+        # Clean up old checkpoints
+        self._cleanup_checkpoints()
+    def _cleanup_checkpoints(self) -> None:
+        """Remove old checkpoints, keeping only the last N."""
+        keep_n = self.config.training.keep_last_n_checkpoints
+        checkpoints = sorted(self.output_dir.glob("step_*.pt"))
+        while len(checkpoints) > keep_n:
+            oldest = checkpoints.pop(0)
+            oldest.unlink()
+            logger.info("Removed old checkpoint: %s", oldest)
+    def load_checkpoint(self, path: str) -> None:
+        """Load from checkpoint.
+        Args:
+            path: Checkpoint file path.
+        """
+        checkpoint = torch.load(path, map_location=self.device)
+        self.model.load_state_dict(checkpoint["model_state_dict"])
+        self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
+        self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
+        self.global_step = checkpoint["global_step"]
+        self.best_val_loss = checkpoint.get("best_val_loss", float("inf"))
+        logger.info("Loaded checkpoint from step %d", self.global_step)

inference_example.py ADDED Viewed

	@@ -0,0 +1,38 @@

+#!/usr/bin/env python3
+"""AAM Diffusion LLM v1.0 — Inference Example"""
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent))
+import torch
+from diffusion_llm import AamDiffusionModel, AamTokenizer, AamGenerator, AamDiffusionConfig
+def main():
+    # Load model and tokenizer
+    config = AamDiffusionConfig.from_json("config.json")
+    model = AamDiffusionModel.load("model.pt", device="cpu")
+    tokenizer = AamTokenizer.load("tokenizer.json")
+    # Create generator
+    generator = AamGenerator(model, tokenizer, config)
+    # Generate narrative from graph conditioning
+    result = generator.generate(
+        trigger="Siapa yang mencuri Snow Plum Pill?",
+        evidence_nodes=["Hefei", "Diancang Five Swords", "Ju Jangmok"],
+        anomalies=["Tidak ada konsumsi pil baru di pasar gelap"],
+        reasoning_steps=["Cross-reference tanggal kejadian", "Deteksi anomali pola"],
+        source_trust=0.85,
+    )
+    print("=" * 60)
+    print("  AAM Diffusion LLM — Generated Narrative")
+    print("=" * 60)
+    print(f"  Narrative: {result.narrative}")
+    print(f"  Confidence: {result.confidence:.1%}")
+    print(f"  Steps: {result.n_diffusion_steps}")
+    print(f"  Time: {result.generation_time_s:.2f}s")
+if __name__ == "__main__":
+    main()

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:651e28c3b1fd60919884cc7e6311cd7a604c368669b9abecf27adb2efbc1eaea
+size 1297247

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ torch>=2.0.0
2	+ numpy>=1.24.0

tokenizer.json ADDED Viewed

	@@ -0,0 +1,964 @@

+{
+  "config": {
+    "bpe_vocab_size": 28000,
+    "max_sentences": 32,
+    "sentence_boundary_token": "<sent>",
+    "pad_token": "<pad>",
+    "bos_token": "<bos>",
+    "eos_token": "<eos>",
+    "mask_token": "<mask>",
+    "noise_token": "<noise>",
+    "min_frequency": 2
+  },
+  "vocab": {
+    "<pad>": 0,
+    "<bos>": 1,
+    "<eos>": 2,
+    "<mask>": 3,
+    "<noise>": 4,
+    "<sent>": 5,
+    "<evidence>": 6,
+    "<anomaly>": 7,
+    "<confidence>": 8,
+    "<reasoning>": 9,
+    "<composition>": 10,
+    "<temporal>": 11,
+    "<unk>": 12,
+    "%": 13,
+    ",": 14,
+    "-": 15,
+    ".": 16,
+    "0": 17,
+    "1": 18,
+    "2": 19,
+    "3": 20,
+    "4": 21,
+    "6": 22,
+    "7": 23,
+    "8": 24,
+    "9": 25,
+    ":": 26,
+    ";": 27,
+    "?": 28,
+    "_": 29,
+    "a": 30,
+    "b": 31,
+    "c": 32,
+    "d": 33,
+    "e": 34,
+    "f": 35,
+    "g": 36,
+    "h": 37,
+    "i": 38,
+    "j": 39,
+    "k": 40,
+    "l": 41,
+    "m": 42,
+    "n": 43,
+    "o": 44,
+    "p": 45,
+    "r": 46,
+    "s": 47,
+    "t": 48,
+    "u": 49,
+    "v": 50,
+    "w": 51,
+    "y": 52,
+    "z": 53,
+    "an": 54,
+    "an</w>": 55,
+    "er": 56,
+    "en": 57,
+    "da": 58,
+    "ti": 59,
+    "il": 60,
+    "si": 61,
+    "di": 62,
+    "ang</w>": 63,
+    "si</w>": 64,
+    "anc": 65,
+    "kan</w>": 66,
+    "al": 67,
+    "su": 68,
+    "ang": 69,
+    "ri</w>": 70,
+    "ke": 71,
+    "ef": 72,
+    "ter": 73,
+    "se": 74,
+    "te": 75,
+    "pa": 76,
+    "ng": 77,
+    "on</w>": 78,
+    "on": 79,
+    "hef": 80,
+    "hefe": 81,
+    "enc": 82,
+    "or": 83,
+    "la": 84,
+    "sim": 85,
+    "ul": 86,
+    "tida": 87,
+    "ar": 88,
+    "eng": 89,
+    "dari</w>": 90,
+    "re": 91,
+    "bu": 92,
+    "ance</w>": 93,
+    "ra": 94,
+    "om": 95,
+    "hefei</w>": 96,
+    "jang": 97,
+    "sa": 98,
+    "ju</w>": 99,
+    "jangm": 100,
+    "jangmo": 101,
+    "jangmok</w>": 102,
+    "al</w>": 103,
+    "os": 104,
+    "dianc": 105,
+    "diancang</w>": 106,
+    "ai": 107,
+    "in": 108,
+    "ja": 109,
+    "kon": 110,
+    "li": 111,
+    "ct</w>": 112,
+    "tidak</w>": 113,
+    "eri": 114,
+    "fi": 115,
+    "meng": 116,
+    "asi</w>": 117,
+    "kesim": 118,
+    "kesimp": 119,
+    "kesimpul": 120,
+    "kesimpulan</w>": 121,
+    "di</w>": 122,
+    "ngkan</w>": 123,
+    "ksi</w>": 124,
+    "pi": 125,
+    "ya</w>": 126,
+    "yang</w>": 127,
+    "encu": 128,
+    "ta": 129,
+    "buk": 130,
+    "bukt": 131,
+    "bukti</w>": 132,
+    "pen": 133,
+    "per": 134,
+    "lu": 135,
+    "le": 136,
+    "fiv": 137,
+    "five</w>": 138,
+    "sw": 139,
+    "swor": 140,
+    "sword": 141,
+    "swords</w>": 142,
+    "pencu": 143,
+    "ence</w>": 144,
+    "ce": 145,
+    "ku": 146,
+    "ili": 147,
+    "sn": 148,
+    "sno": 149,
+    "snow</w>": 150,
+    "plu": 151,
+    "plum</w>": 152,
+    "pil": 153,
+    "pill</w>": 154,
+    "mengh": 155,
+    "menghil": 156,
+    "menghilang</w>": 157,
+    "lo": 158,
+    "bi": 159,
+    "de": 160,
+    "anom": 161,
+    "anomal": 162,
+    "mar": 163,
+    "marti": 164,
+    "martial</w>": 165,
+    "alli": 166,
+    "alliance</w>": 167,
+    "mu": 168,
+    "anal": 169,
+    "anali": 170,
+    "analisi": 171,
+    "analisis</w>": 172,
+    "gy": 173,
+    "gyer": 174,
+    "gyery": 175,
+    "gyeryon": 176,
+    "gyeryong</w>": 177,
+    "mer": 178,
+    "merc": 179,
+    "merch": 180,
+    "merchan": 181,
+    "merchant</w>": 182,
+    "gu": 183,
+    "guil": 184,
+    "guild</w>": 185,
+    "ha": 186,
+    "cr": 187,
+    "cros": 188,
+    "cross</w>": 189,
+    "ref": 190,
+    "refer": 191,
+    "reference</w>": 192,
+    "keja": 193,
+    "kejadi": 194,
+    "kejadian</w>": 195,
+    "simh": 196,
+    "simhy": 197,
+    "simhye": 198,
+    "simhyeon</w>": 199,
+    "pav": 200,
+    "pavili": 201,
+    "pavilion</w>": 202,
+    "me": 203,
+    "tion</w>": 204,
+    "sum": 205,
+    "blo": 206,
+    "bloo": 207,
+    "blood</w>": 208,
+    "ser": 209,
+    "serpen": 210,
+    "serpent</w>": 211,
+    "dance</w>": 212,
+    "ste": 213,
+    "step</w>": 214,
+    "pre": 215,
+    "predi": 216,
+    "tin": 217,
+    "tinda": 218,
+    "tindakan</w>": 219,
+    "beri": 220,
+    "beriku": 221,
+    "berikut": 222,
+    "berikutn": 223,
+    "berikutnya</w>": 224,
+    "tae": 225,
+    "taeul": 226,
+    "taeul_": 227,
+    "taeul_se": 228,
+    "taeul_sect</w>": 229,
+    "po": 230,
+    "pol": 231,
+    "pola</w>": 232,
+    "jang</w>": 233,
+    "hang": 234,
+    "hangi</w>": 235,
+    "ad": 236,
+    "ada</w>": 237,
+    "bar": 238,
+    "baru</w>": 239,
+    "pat": 240,
+    "patter": 241,
+    "pattern</w>": 242,
+    "terpi": 243,
+    "terpisa": 244,
+    "terpisah</w>": 245,
+    "com": 246,
+    "comp": 247,
+    "as": 248,
+    "dete": 249,
+    "deteksi</w>": 250,
+    "gu</w>": 251,
+    "ilm": 252,
+    "ilmu</w>": 253,
+    "ketida": 254,
+    "ketidak": 255,
+    "ketidakse": 256,
+    "ketidaksesu": 257,
+    "ketidaksesuai": 258,
+    "ketidaksesuaian</w>": 259,
+    "terk": 260,
+    "terkai": 261,
+    "terkait</w>": 262,
+    "lap": 263,
+    "lapor": 264,
+    "laporan</w>": 265,
+    "hu": 266,
+    "hubu": 267,
+    "ela": 268,
+    "dar": 269,
+    "dark": 270,
+    "dark_": 271,
+    "dark_f": 272,
+    "dark_fa": 273,
+    "dark_fac": 274,
+    "dark_faction</w>": 275,
+    "at</w>": 276,
+    "anomaly</w>": 277,
+    "ban": 278,
+    "bandi": 279,
+    "bandingkan</w>": 280,
+    "tang": 281,
+    "tangg": 282,
+    "tanggal</w>": 283,
+    "hefei": 284,
+    "hefei_": 285,
+    "hefei_b": 286,
+    "hefei_br": 287,
+    "hefei_branc": 288,
+    "hefei_branch</w>": 289,
+    "deng": 290,
+    "dengan</w>": 291,
+    "hubungkan</w>": 292,
+    "fra": 293,
+    "frag": 294,
+    "fragme": 295,
+    "fragmen</w>": 296,
+    "pencuri</w>": 297,
+    "compos": 298,
+    "compose</w>": 299,
+    "susu": 300,
+    "susun</w>": 301,
+    "rec": 302,
+    "recal": 303,
+    "recall</w>": 304,
+    "ing": 305,
+    "ingat</w>": 306,
+    "semu": 307,
+    "semua</w>": 308,
+    "predict</w>": 309,
+    "perk": 310,
+    "perki": 311,
+    "perkira": 312,
+    "perkirakan</w>": 313,
+    "veri": 314,
+    "verif": 315,
+    "verify</w>": 316,
+    "cek</w>": 317,
+    "konsi": 318,
+    "konsis": 319,
+    "konsist": 320,
+    "konsisten": 321,
+    "konsistensi</w>": 322,
+    "konsum": 323,
+    "konsumsi</w>": 324,
+    "pa</w>": 325,
+    "men": 326,
+    "ting": 327,
+    "fil": 328,
+    "filte": 329,
+    "filter</w>": 330,
+    "eli": 331,
+    "elim": 332,
+    "elimin": 333,
+    "eliminasi</w>": 334,
+    "rele": 335,
+    "relev": 336,
+    "relevan</w>": 337,
+    "pil</w>": 338,
+    "pasa": 339,
+    "pasar</w>": 340,
+    "gela": 341,
+    "gelap</w>": 342,
+    "suc": 343,
+    "succe": 344,
+    "succes": 345,
+    "success</w>": 346,
+    "rat": 347,
+    "rate</w>": 348,
+    "pai": 349,
+    "pair</w>": 350,
+    "lebi": 351,
+    "lebih</w>": 352,
+    "tingg": 353,
+    "tinggi</w>": 354,
+    "bias": 355,
+    "biasan": 356,
+    "biasanya</w>": 357,
+    "dala": 358,
+    "dalam</w>": 359,
+    "ber": 360,
+    "pencur": 361,
+    "pencuri": 362,
+    "pencurian</w>": 363,
+    "ka</w>": 364,
+    "tan": 365,
+    "tanpa</w>": 366,
+    "je": 367,
+    "jeja": 368,
+    "jejak</w>": 369,
+    "perg": 370,
+    "perger": 371,
+    "pergera": 372,
+    "pergerakan</w>": 373,
+    "masi</w>": 374,
+    "inv": 375,
+    "inve": 376,
+    "inves": 377,
+    "investi": 378,
+    "investig": 379,
+    "investigasi</w>": 380,
+    "hari</w>": 381,
+    "sam": 382,
+    "sama</w>": 383,
+    "dat": 384,
+    "data</w>": 385,
+    "menu": 386,
+    "menun": 387,
+    "menunj": 388,
+    "menunju": 389,
+    "menunjuk": 390,
+    "menunjukkan</w>": 391,
+    "ca": 392,
+    "mi": 393,
+    "misi</w>": 394,
+    "assi": 395,
+    "assig": 396,
+    "assign</w>": 397,
+    "sen": 398,
+    "sendi": 399,
+    "sendiri</w>": 400,
+    "ka": 401,
+    "sete": 402,
+    "setela": 403,
+    "setelah</w>": 404,
+    "temu": 405,
+    "ke</w>": 406,
+    "sumb": 407,
+    "sumbe": 408,
+    "sumber</w>": 409,
+    "prediksi</w>": 410,
+    "ters": 411,
+    "tersang": 412,
+    "tersangka</w>": 413,
+    "penal": 414,
+    "penalar": 415,
+    "penalaran</w>": 416,
+    "menja": 417,
+    "menjadi</w>": 418,
+    "kun": 419,
+    "kunc": 420,
+    "kunci</w>": 421,
+    "hasi": 422,
+    "hasil</w>": 423,
+    "inf": 424,
+    "infor": 425,
+    "informasi</w>": 426,
+    "anomali</w>": 427,
+    "ya": 428,
+    "temuan</w>": 429,
+    "berk": 430,
+    "berkor": 431,
+    "berkorela": 432,
+    "berkorelasi</w>": 433,
+    "cata": 434,
+    "catat": 435,
+    "catatan</w>": 436,
+    "sia": 437,
+    "siapa</w>": 438,
+    "mencu": 439,
+    "mencuri</w>": 440,
+    "terse": 441,
+    "tersedi": 442,
+    "tersedia</w>": 443,
+    "mem": 444,
+    "memili": 445,
+    "memilik": 446,
+    "memiliki</w>": 447,
+    "kone": 448,
+    "koneksi</w>": 449,
+    "con": 450,
+    "confi": 451,
+    "confid": 452,
+    "confidence</w>": 453,
+    "mengin": 454,
+    "mengindi": 455,
+    "mengindika": 456,
+    "mengindikasi": 457,
+    "mengindikasikan</w>": 458,
+    "pene": 459,
+    "penelu": 460,
+    "penelusu": 461,
+    "penelusur": 462,
+    "penelusuran</w>": 463,
+    "log": 464,
+    "logi": 465,
+    "logika</w>": 466,
+    "pr": 467,
+    "pro": 468,
+    "prose": 469,
+    "proses</w>": 470,
+    "ded": 471,
+    "dedu": 472,
+    "deduksi</w>": 473,
+    "mengkon": 474,
+    "mengkonfi": 475,
+    "mengkonfir": 476,
+    "mengkonfirmasi</w>": 477,
+    "comple": 478,
+    "completion</w>": 479,
+    "ba": 480,
+    "bah": 481,
+    "bahw": 482,
+    "bahwa</w>": 483,
+    "ev": 484,
+    "eval": 485,
+    "evalu": 486,
+    "evaluasi</w>": 487,
+    "keper": 488,
+    "keperca": 489,
+    "kepercaya": 490,
+    "kepercayaan</w>": 491,
+    "berta": 492,
+    "bertaha": 493,
+    "bertahap</w>": 494,
+    "insi": 495,
+    "insid": 496,
+    "inside</w>": 497,
+    "jo": 498,
+    "job</w>": 499
+  },
+  "merges": {
+    "a|||n": 0,
+    "a|||n</w>": 1,
+    "e|||r": 2,
+    "e|||n": 3,
+    "d|||a": 4,
+    "t|||i": 5,
+    "i|||l": 6,
+    "s|||i": 7,
+    "d|||i": 8,
+    "an|||g</w>": 9,
+    "s|||i</w>": 10,
+    "an|||c": 11,
+    "k|||an</w>": 12,
+    "a|||l": 13,
+    "s|||u": 14,
+    "an|||g": 15,
+    "r|||i</w>": 16,
+    "k|||e": 17,
+    "e|||f": 18,
+    "t|||er": 19,
+    "s|||e": 20,
+    "t|||e": 21,
+    "p|||a": 22,
+    "n|||g": 23,
+    "o|||n</w>": 24,
+    "o|||n": 25,
+    "h|||ef": 26,
+    "hef|||e": 27,
+    "en|||c": 28,
+    "o|||r": 29,
+    "l|||a": 30,
+    "si|||m": 31,
+    "u|||l": 32,
+    "ti|||da": 33,
+    "a|||r": 34,
+    "en|||g": 35,
+    "da|||ri</w>": 36,
+    "r|||e": 37,
+    "b|||u": 38,
+    "anc|||e</w>": 39,
+    "r|||a": 40,
+    "o|||m": 41,
+    "hefe|||i</w>": 42,
+    "j|||ang": 43,
+    "s|||a": 44,
+    "j|||u</w>": 45,
+    "jang|||m": 46,
+    "jangm|||o": 47,
+    "jangmo|||k</w>": 48,
+    "a|||l</w>": 49,
+    "o|||s": 50,
+    "di|||anc": 51,
+    "dianc|||ang</w>": 52,
+    "a|||i": 53,
+    "i|||n": 54,
+    "j|||a": 55,
+    "k|||on": 56,
+    "l|||i": 57,
+    "c|||t</w>": 58,
+    "tida|||k</w>": 59,
+    "er|||i": 60,
+    "f|||i": 61,
+    "m|||eng": 62,
+    "a|||si</w>": 63,
+    "ke|||sim": 64,
+    "kesim|||p": 65,
+    "kesimp|||ul": 66,
+    "kesimpul|||an</w>": 67,
+    "d|||i</w>": 68,
+    "ng|||kan</w>": 69,
+    "k|||si</w>": 70,
+    "p|||i": 71,
+    "y|||a</w>": 72,
+    "y|||ang</w>": 73,
+    "enc|||u": 74,
+    "t|||a": 75,
+    "bu|||k": 76,
+    "buk|||t": 77,
+    "bukt|||i</w>": 78,
+    "p|||en": 79,
+    "p|||er": 80,
+    "l|||u": 81,
+    "l|||e": 82,
+    "fi|||v": 83,
+    "fiv|||e</w>": 84,
+    "s|||w": 85,
+    "sw|||or": 86,
+    "swor|||d": 87,
+    "sword|||s</w>": 88,
+    "p|||encu": 89,
+    "enc|||e</w>": 90,
+    "c|||e": 91,
+    "k|||u": 92,
+    "il|||i": 93,
+    "s|||n": 94,
+    "sn|||o": 95,
+    "sno|||w</w>": 96,
+    "p|||lu": 97,
+    "plu|||m</w>": 98,
+    "p|||il": 99,
+    "pil|||l</w>": 100,
+    "meng|||h": 101,
+    "mengh|||il": 102,
+    "menghil|||ang</w>": 103,
+    "l|||o": 104,
+    "b|||i": 105,
+    "d|||e": 106,
+    "an|||om": 107,
+    "anom|||al": 108,
+    "m|||ar": 109,
+    "mar|||ti": 110,
+    "marti|||al</w>": 111,
+    "al|||li": 112,
+    "alli|||ance</w>": 113,
+    "m|||u": 114,
+    "an|||al": 115,
+    "anal|||i": 116,
+    "anali|||si": 117,
+    "analisi|||s</w>": 118,
+    "g|||y": 119,
+    "gy|||er": 120,
+    "gyer|||y": 121,
+    "gyery|||on": 122,
+    "gyeryon|||g</w>": 123,
+    "m|||er": 124,
+    "mer|||c": 125,
+    "merc|||h": 126,
+    "merch|||an": 127,
+    "merchan|||t</w>": 128,
+    "g|||u": 129,
+    "gu|||il": 130,
+    "guil|||d</w>": 131,
+    "h|||a": 132,
+    "c|||r": 133,
+    "cr|||os": 134,
+    "cros|||s</w>": 135,
+    "r|||ef": 136,
+    "ref|||er": 137,
+    "refer|||ence</w>": 138,
+    "ke|||ja": 139,
+    "keja|||di": 140,
+    "kejadi|||an</w>": 141,
+    "sim|||h": 142,
+    "simh|||y": 143,
+    "simhy|||e": 144,
+    "simhye|||on</w>": 145,
+    "pa|||v": 146,
+    "pav|||ili": 147,
+    "pavili|||on</w>": 148,
+    "m|||e": 149,
+    "ti|||on</w>": 150,
+    "su|||m": 151,
+    "b|||lo": 152,
+    "blo|||o": 153,
+    "bloo|||d</w>": 154,
+    "s|||er": 155,
+    "ser|||pen": 156,
+    "serpen|||t</w>": 157,
+    "d|||ance</w>": 158,
+    "s|||te": 159,
+    "ste|||p</w>": 160,
+    "p|||re": 161,
+    "pre|||di": 162,
+    "ti|||n": 163,
+    "tin|||da": 164,
+    "tinda|||kan</w>": 165,
+    "b|||eri": 166,
+    "beri|||ku": 167,
+    "beriku|||t": 168,
+    "berikut|||n": 169,
+    "berikutn|||ya</w>": 170,
+    "ta|||e": 171,
+    "tae|||ul": 172,
+    "taeul|||_": 173,
+    "taeul_|||se": 174,
+    "taeul_se|||ct</w>": 175,
+    "p|||o": 176,
+    "po|||l": 177,
+    "pol|||a</w>": 178,
+    "j|||ang</w>": 179,
+    "h|||ang": 180,
+    "hang|||i</w>": 181,
+    "a|||d": 182,
+    "ad|||a</w>": 183,
+    "b|||ar": 184,
+    "bar|||u</w>": 185,
+    "pa|||t": 186,
+    "pat|||ter": 187,
+    "patter|||n</w>": 188,
+    "ter|||pi": 189,
+    "terpi|||sa": 190,
+    "terpisa|||h</w>": 191,
+    "c|||om": 192,
+    "com|||p": 193,
+    "a|||s": 194,
+    "de|||te": 195,
+    "dete|||ksi</w>": 196,
+    "g|||u</w>": 197,
+    "il|||m": 198,
+    "ilm|||u</w>": 199,
+    "ke|||tida": 200,
+    "ketida|||k": 201,
+    "ketidak|||se": 202,
+    "ketidakse|||su": 203,
+    "ketidaksesu|||ai": 204,
+    "ketidaksesuai|||an</w>": 205,
+    "ter|||k": 206,
+    "terk|||ai": 207,
+    "terkai|||t</w>": 208,
+    "la|||p": 209,
+    "lap|||or": 210,
+    "lapor|||an</w>": 211,
+    "h|||u": 212,
+    "hu|||bu": 213,
+    "e|||la": 214,
+    "da|||r": 215,
+    "dar|||k": 216,
+    "dark|||_": 217,
+    "dark_|||f": 218,
+    "dark_f|||a": 219,
+    "dark_fa|||c": 220,
+    "dark_fac|||tion</w>": 221,
+    "a|||t</w>": 222,
+    "anomal|||y</w>": 223,
+    "b|||an": 224,
+    "ban|||di": 225,
+    "bandi|||ngkan</w>": 226,
+    "t|||ang": 227,
+    "tang|||g": 228,
+    "tangg|||al</w>": 229,
+    "hefe|||i": 230,
+    "hefei|||_": 231,
+    "hefei_|||b": 232,
+    "hefei_b|||r": 233,
+    "hefei_br|||anc": 234,
+    "hefei_branc|||h</w>": 235,
+    "d|||eng": 236,
+    "deng|||an</w>": 237,
+    "hubu|||ngkan</w>": 238,
+    "f|||ra": 239,
+    "fra|||g": 240,
+    "frag|||me": 241,
+    "fragme|||n</w>": 242,
+    "pencu|||ri</w>": 243,
+    "comp|||os": 244,
+    "compos|||e</w>": 245,
+    "su|||su": 246,
+    "susu|||n</w>": 247,
+    "re|||c": 248,
+    "rec|||al": 249,
+    "recal|||l</w>": 250,
+    "i|||ng": 251,
+    "ing|||at</w>": 252,
+    "se|||mu": 253,
+    "semu|||a</w>": 254,
+    "predi|||ct</w>": 255,
+    "per|||k": 256,
+    "perk|||i": 257,
+    "perki|||ra": 258,
+    "perkira|||kan</w>": 259,
+    "v|||eri": 260,
+    "veri|||f": 261,
+    "verif|||y</w>": 262,
+    "ce|||k</w>": 263,
+    "kon|||si": 264,
+    "konsi|||s": 265,
+    "konsis|||t": 266,
+    "konsist|||en": 267,
+    "konsisten|||si</w>": 268,
+    "kon|||sum": 269,
+    "konsum|||si</w>": 270,
+    "p|||a</w>": 271,
+    "m|||en": 272,
+    "ti|||ng": 273,
+    "f|||il": 274,
+    "fil|||te": 275,
+    "filte|||r</w>": 276,
+    "e|||li": 277,
+    "eli|||m": 278,
+    "elim|||in": 279,
+    "elimin|||asi</w>": 280,
+    "re|||le": 281,
+    "rele|||v": 282,
+    "relev|||an</w>": 283,
+    "pi|||l</w>": 284,
+    "pa|||sa": 285,
+    "pasa|||r</w>": 286,
+    "g|||ela": 287,
+    "gela|||p</w>": 288,
+    "su|||c": 289,
+    "suc|||ce": 290,
+    "succe|||s": 291,
+    "succes|||s</w>": 292,
+    "ra|||t": 293,
+    "rat|||e</w>": 294,
+    "pa|||i": 295,
+    "pai|||r</w>": 296,
+    "le|||bi": 297,
+    "lebi|||h</w>": 298,
+    "ting|||g": 299,
+    "tingg|||i</w>": 300,
+    "bi|||as": 301,
+    "bias|||an": 302,
+    "biasan|||ya</w>": 303,
+    "da|||la": 304,
+    "dala|||m</w>": 305,
+    "b|||er": 306,
+    "pencu|||r": 307,
+    "pencur|||i": 308,
+    "pencuri|||an</w>": 309,
+    "k|||a</w>": 310,
+    "t|||an": 311,
+    "tan|||pa</w>": 312,
+    "j|||e": 313,
+    "je|||ja": 314,
+    "jeja|||k</w>": 315,
+    "per|||g": 316,
+    "perg|||er": 317,
+    "perger|||a": 318,
+    "pergera|||kan</w>": 319,
+    "m|||asi</w>": 320,
+    "in|||v": 321,
+    "inv|||e": 322,
+    "inve|||s": 323,
+    "inves|||ti": 324,
+    "investi|||g": 325,
+    "investig|||asi</w>": 326,
+    "ha|||ri</w>": 327,
+    "sa|||m": 328,
+    "sam|||a</w>": 329,
+    "da|||t": 330,
+    "dat|||a</w>": 331,
+    "men|||u": 332,
+    "menu|||n": 333,
+    "menun|||j": 334,
+    "menunj|||u": 335,
+    "menunju|||k": 336,
+    "menunjuk|||kan</w>": 337,
+    "c|||a": 338,
+    "m|||i": 339,
+    "mi|||si</w>": 340,
+    "as|||si": 341,
+    "assi|||g": 342,
+    "assig|||n</w>": 343,
+    "s|||en": 344,
+    "sen|||di": 345,
+    "sendi|||ri</w>": 346,
+    "k|||a": 347,
+    "se|||te": 348,
+    "sete|||la": 349,
+    "setela|||h</w>": 350,
+    "te|||mu": 351,
+    "k|||e</w>": 352,
+    "sum|||b": 353,
+    "sumb|||e": 354,
+    "sumbe|||r</w>": 355,
+    "predi|||ksi</w>": 356,
+    "ter|||s": 357,
+    "ters|||ang": 358,
+    "tersang|||ka</w>": 359,
+    "pen|||al": 360,
+    "penal|||ar": 361,
+    "penalar|||an</w>": 362,
+    "men|||ja": 363,
+    "menja|||di</w>": 364,
+    "ku|||n": 365,
+    "kun|||c": 366,
+    "kunc|||i</w>": 367,
+    "ha|||si": 368,
+    "hasi|||l</w>": 369,
+    "in|||f": 370,
+    "inf|||or": 371,
+    "infor|||masi</w>": 372,
+    "anomal|||i</w>": 373,
+    "y|||a": 374,
+    "temu|||an</w>": 375,
+    "ber|||k": 376,
+    "berk|||or": 377,
+    "berkor|||ela": 378,
+    "berkorela|||si</w>": 379,
+    "ca|||ta": 380,
+    "cata|||t": 381,
+    "catat|||an</w>": 382,
+    "si|||a": 383,
+    "sia|||pa</w>": 384,
+    "m|||encu": 385,
+    "mencu|||ri</w>": 386,
+    "ter|||se": 387,
+    "terse|||di": 388,
+    "tersedi|||a</w>": 389,
+    "me|||m": 390,
+    "mem|||ili": 391,
+    "memili|||k": 392,
+    "memilik|||i</w>": 393,
+    "kon|||e": 394,
+    "kone|||ksi</w>": 395,
+    "c|||on": 396,
+    "con|||fi": 397,
+    "confi|||d": 398,
+    "confid|||ence</w>": 399,
+    "meng|||in": 400,
+    "mengin|||di": 401,
+    "mengindi|||ka": 402,
+    "mengindika|||si": 403,
+    "mengindikasi|||kan</w>": 404,
+    "pen|||e": 405,
+    "pene|||lu": 406,
+    "penelu|||su": 407,
+    "penelusu|||r": 408,
+    "penelusur|||an</w>": 409,
+    "lo|||g": 410,
+    "log|||i": 411,
+    "logi|||ka</w>": 412,
+    "p|||r": 413,
+    "pr|||o": 414,
+    "pro|||se": 415,
+    "prose|||s</w>": 416,
+    "de|||d": 417,
+    "ded|||u": 418,
+    "dedu|||ksi</w>": 419,
+    "meng|||kon": 420,
+    "mengkon|||fi": 421,
+    "mengkonfi|||r": 422,
+    "mengkonfir|||masi</w>": 423,
+    "comp|||le": 424,
+    "comple|||tion</w>": 425,
+    "b|||a": 426,
+    "ba|||h": 427,
+    "bah|||w": 428,
+    "bahw|||a</w>": 429,
+    "e|||v": 430,
+    "ev|||al": 431,
+    "eval|||u": 432,
+    "evalu|||asi</w>": 433,
+    "ke|||per": 434,
+    "keper|||ca": 435,
+    "keperca|||ya": 436,
+    "kepercaya|||an</w>": 437,
+    "ber|||ta": 438,
+    "berta|||ha": 439,
+    "bertaha|||p</w>": 440,
+    "in|||si": 441,
+    "insi|||d": 442,
+    "insid|||e</w>": 443,
+    "j|||o": 444,
+    "jo|||b</w>": 445
+  },
+  "is_trained": true
+}

training_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "model_name": "aam-diffusion-v1.0",
+  "aam_mind_source": "rsvs_graph",
+  "aam_body_type": "specialized_diffusion",
+  "architecture": {
+    "type": "diffusion_transformer",
+    "d_model": 64,
+    "n_layers": 2,
+    "n_heads": 4,
+    "d_ff": 128,
+    "vocab_size": 500,
+    "max_seq_len": 32,
+    "pos_encoding_type": "learned"
+  },
+  "diffusion": {
+    "n_timesteps": 50,
+    "n_inference_steps": 5,
+    "schedule_type": "cosine",
+    "prediction_type": "epsilon",
+    "sampling_method": "ddim"
+  },
+  "graph_encoder": {
+    "d_graph": 32,
+    "n_graph_layers": 1,
+    "conditioning_method": "cross_attention"
+  },
+  "parameters": 311670
+}