"""
AAM Diffusion LLM — Final Training Script

Trains the complete AAM Diffusion LLM pipeline:
1. Generate synthetic training data (Graph→Narrative pairs)
2. Train the AAM Sentence-Level + BPE Tokenizer
3. Train the Diffusion Transformer model
4. Save the final model, tokenizer, and config for HuggingFace upload

This is the "birth" of AAM's body — from random weights to
a model that can arrange sentences from graph conditioning.

Usage:
    python scripts/train_final.py --output_dir ./aam-diffusion-v1
    python scripts/train_final.py --model_size tiny --max_steps 500
"""
|
|
from __future__ import annotations

import argparse
import json
import logging
import math
import random
import shutil
import sys
import time
from pathlib import Path

# Make the repository root importable when this script is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

import numpy as np
import torch
from torch.utils.data import DataLoader

from diffusion_llm.config.model_config import AamDiffusionConfig, get_default_config
from diffusion_llm.data.synthetic_generator import SyntheticDataGenerator
from diffusion_llm.model.aam_diffusion_model import AamDiffusionModel
from diffusion_llm.tokenizer.aam_tokenizer import AamTokenizer
from diffusion_llm.training.dataset import GraphNarrativeDataset, collate_fn

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger("train_final")


def parse_args():
    parser = argparse.ArgumentParser(description="Train AAM Diffusion LLM (Final)")
    parser.add_argument("--model_size", type=str, default="tiny",
                        choices=["tiny", "small", "base", "medium"])
    parser.add_argument("--output_dir", type=str, default="./aam-diffusion-v1")
    parser.add_argument("--max_steps", type=int, default=500)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--learning_rate", type=float, default=3e-4)
    parser.add_argument("--n_synthetic_train", type=int, default=500)
    parser.add_argument("--n_synthetic_val", type=int, default=50)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--log_every", type=int, default=50)
    parser.add_argument("--save_every", type=int, default=500)
    parser.add_argument("--eval_every", type=int, default=200)
    return parser.parse_args()


def set_seed(seed: int):
    """Set random seeds for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


def generate_data(output_dir: Path, n_train: int, n_val: int, seed: int):
    """Generate synthetic training data."""
    logger.info("=" * 60)
    logger.info("STEP 1: Generating Synthetic Training Data")
    logger.info("=" * 60)

    data_dir = output_dir / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    train_path, val_path = SyntheticDataGenerator.generate_training_split(
        output_dir=data_dir,
        n_train=n_train,
        n_val=n_val,
        language="id",
        seed=seed,
    )

    logger.info(f"  Train data: {train_path} ({n_train} examples)")
    logger.info(f"  Val data:   {val_path} ({n_val} examples)")
    return train_path, val_path


def train_tokenizer(train_path: Path, output_dir: Path, config: AamDiffusionConfig) -> AamTokenizer:
    """Train the AAM Tokenizer on synthetic data."""
    logger.info("=" * 60)
    logger.info("STEP 2: Training AAM Sentence-Level + BPE Tokenizer")
    logger.info("=" * 60)

    tokenizer = AamTokenizer(config=config.tokenizer)

    # Collect every text field from the JSONL training file as the tokenizer corpus.
    texts = []
    with open(train_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            if data.get("narrative"):
                texts.append(data["narrative"])
            if data.get("trigger"):
                texts.append(data["trigger"])
            texts.extend(data.get("evidence_nodes", []))
            texts.extend(data.get("anomalies", []))
            texts.extend(data.get("reasoning_steps", []))
            texts.extend(data.get("compositions", []))

    logger.info(f"  Training tokenizer on {len(texts)} texts...")
    tokenizer.train(texts, vocab_size=config.tokenizer.bpe_vocab_size)

    # Persist the trained tokenizer next to the checkpoints.
    tokenizer_path = output_dir / "tokenizer.json"
    tokenizer.save(tokenizer_path)
    logger.info(f"  Tokenizer saved: {tokenizer_path}")
    logger.info(f"  Vocab size: {tokenizer.vocab_size}")
    logger.info(f"  BPE merges: {len(tokenizer.merges)}")

    return tokenizer


def create_dataloaders(
    train_path: Path,
    val_path: Path,
    tokenizer: AamTokenizer,
    config: AamDiffusionConfig,
):
    """Create training and validation data loaders."""
    logger.info("=" * 60)
    logger.info("STEP 3: Creating DataLoaders")
    logger.info("=" * 60)

    train_dataset = GraphNarrativeDataset(
        data_path=train_path,
        tokenizer=tokenizer,
        max_seq_len=config.model.max_seq_len,
        max_evidence=config.graph_encoder.max_evidence_nodes,
        max_anomalies=config.graph_encoder.max_anomalies,
        max_reasoning=config.graph_encoder.max_reasoning_steps,
        augment=True,
    )
    val_dataset = GraphNarrativeDataset(
        data_path=val_path,
        tokenizer=tokenizer,
        max_seq_len=config.model.max_seq_len,
        max_evidence=config.graph_encoder.max_evidence_nodes,
        max_anomalies=config.graph_encoder.max_anomalies,
        max_reasoning=config.graph_encoder.max_reasoning_steps,
        augment=False,
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=config.training.batch_size,
        shuffle=True,
        num_workers=0,
        collate_fn=collate_fn,
        pin_memory=False,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.training.batch_size,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_fn,
        pin_memory=False,
    )

    logger.info(f"  Train: {len(train_dataset)} examples, {len(train_loader)} batches")
    logger.info(f"  Val:   {len(val_dataset)} examples, {len(val_loader)} batches")
    return train_loader, val_loader


def train_model(
    model: AamDiffusionModel,
    tokenizer: AamTokenizer,
    train_loader,
    val_loader,
    config: AamDiffusionConfig,
    output_dir: Path,
    args,
):
    """Train the AAM Diffusion Model."""
    logger.info("=" * 60)
    logger.info("STEP 4: Training AAM Diffusion LLM")
    logger.info("=" * 60)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"  Device: {device}")
    logger.info(f"  Parameters: {model._format_params(model.get_num_params())}")

    model.to(device)

    # Optimizer
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.learning_rate,
        weight_decay=config.training.weight_decay,
        betas=(config.training.adam_beta1, config.training.adam_beta2),
    )

    # LR schedule: linear warmup, then cosine decay to zero.
    # The warmup length is taken from the config set in main().
    warmup_steps = config.training.warmup_steps

    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(warmup_steps, 1)
        progress = (step - warmup_steps) / max(args.max_steps - warmup_steps, 1)
        return 0.5 * (1.0 + math.cos(math.pi * progress))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    # Training state
    global_step = 0
    best_val_loss = float("inf")
    train_losses = []
    start_time = time.time()

    logger.info(f"  Max steps: {args.max_steps}")
    logger.info(f"  Batch size: {args.batch_size}")
    logger.info(f"  Learning rate: {args.learning_rate}")
    logger.info(f"  Warmup steps: {warmup_steps}")
    logger.info("")

    epoch = 0
    while global_step < args.max_steps:
        epoch += 1
        model.train()
        epoch_loss = 0.0
        n_batches = 0

        for batch in train_loader:
            if global_step >= args.max_steps:
                break

            # Move all tensors in the batch to the training device.
            batch = {
                k: v.to(device) if isinstance(v, torch.Tensor) else v
                for k, v in batch.items()
            }

            # Sample a random diffusion timestep for each example.
            batch_size = batch["token_ids"].shape[0]
            t = torch.randint(
                0, config.diffusion.n_timesteps,
                (batch_size,), device=device,
            )

            # Forward pass: denoise under graph conditioning.
            predicted, target = model(
                token_ids=batch["token_ids"],
                timestep=t,
                evidence_ids=batch.get("evidence_ids"),
                evidence_confidence=batch.get("evidence_confidence"),
                anomaly_ids=batch.get("anomaly_ids"),
                anomaly_confidence=batch.get("anomaly_confidence"),
                reasoning_ids=batch.get("reasoning_ids"),
                reasoning_confidence=batch.get("reasoning_confidence"),
                source_trust=batch.get("source_trust"),
            )
            loss = model.compute_loss(predicted, target, t)

            # Backward pass with gradient clipping.
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                model.parameters(), config.training.grad_clip_norm
            )
            optimizer.step()
            scheduler.step()

            loss_val = loss.item()
            train_losses.append(loss_val)
            epoch_loss += loss_val
            n_batches += 1
            global_step += 1

            # Periodic logging
            if global_step % args.log_every == 0:
                lr = optimizer.param_groups[0]["lr"]
                recent = train_losses[-args.log_every:]
                avg_loss = sum(recent) / len(recent)
                elapsed = time.time() - start_time
                steps_per_sec = global_step / max(elapsed, 1)
                logger.info(
                    f"  Step {global_step:>6d}/{args.max_steps} | "
                    f"Loss: {avg_loss:.4f} | "
                    f"LR: {lr:.2e} | "
                    f"Speed: {steps_per_sec:.1f} steps/s"
                )

            # Periodic validation; keep the best checkpoint.
            if global_step % args.eval_every == 0 and val_loader is not None:
                val_loss = evaluate(model, val_loader, config, device)
                logger.info(f"  >>> Validation loss: {val_loss:.4f}")
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    save_model(model, tokenizer, config, output_dir / "best.pt")
                    logger.info(f"  >>> New best model saved! (val_loss: {val_loss:.4f})")

            # Periodic checkpoint
            if global_step % args.save_every == 0:
                save_model(model, tokenizer, config, output_dir / f"step_{global_step}.pt")

        avg_epoch_loss = epoch_loss / max(n_batches, 1)
        logger.info(f"  Epoch {epoch} complete. Avg loss: {avg_epoch_loss:.4f}")

    # Final checkpoint
    save_model(model, tokenizer, config, output_dir / "final.pt")
    elapsed = time.time() - start_time
    logger.info("")
    logger.info(f"  Training complete! {global_step} steps in {elapsed / 60:.1f} minutes")
    logger.info(f"  Best val loss: {best_val_loss:.4f}")
    logger.info(f"  Final train loss: {train_losses[-1]:.4f}")

    return model


def evaluate(model, val_loader, config, device):
    """Evaluate on validation set."""
    model.eval()
    total_loss = 0.0
    n_batches = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {
                k: v.to(device) if isinstance(v, torch.Tensor) else v
                for k, v in batch.items()
            }

            batch_size = batch["token_ids"].shape[0]
            t = torch.randint(
                0, config.diffusion.n_timesteps,
                (batch_size,), device=device,
            )

            predicted, target = model(
                token_ids=batch["token_ids"],
                timestep=t,
                evidence_ids=batch.get("evidence_ids"),
                evidence_confidence=batch.get("evidence_confidence"),
                anomaly_ids=batch.get("anomaly_ids"),
                anomaly_confidence=batch.get("anomaly_confidence"),
                reasoning_ids=batch.get("reasoning_ids"),
                reasoning_confidence=batch.get("reasoning_confidence"),
                source_trust=batch.get("source_trust"),
            )
            loss = model.compute_loss(predicted, target, t)
            total_loss += loss.item()
            n_batches += 1

    model.train()
    return total_loss / max(n_batches, 1)


def save_model(model, tokenizer, config, path):
    """Save a model checkpoint (weights + config).

    The tokenizer is serialized separately via AamTokenizer.save(); it is
    accepted here only so all checkpoint call sites stay uniform.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    checkpoint = {
        "model_state_dict": model.state_dict(),
        "config": config.to_dict(),
    }
    torch.save(checkpoint, path)
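
    # To restore this checkpoint later (sketch):
    #   ckpt = torch.load(path, map_location="cpu")
    #   model.load_state_dict(ckpt["model_state_dict"])  # "config" holds config.to_dict()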


def export_for_huggingface(model, tokenizer, config, output_dir: Path):
    """Export the model in a HuggingFace-compatible layout."""
    logger.info("=" * 60)
    logger.info("STEP 5: Exporting for HuggingFace")
    logger.info("=" * 60)

    hf_dir = output_dir / "huggingface"
    hf_dir.mkdir(parents=True, exist_ok=True)

    # Model weights
    model_path = hf_dir / "model.pt"
    model.save(str(model_path))
    logger.info(f"  Model saved: {model_path}")

    # Tokenizer
    tokenizer_path = hf_dir / "tokenizer.json"
    tokenizer.save(tokenizer_path)
    logger.info(f"  Tokenizer saved: {tokenizer_path}")

    # Config
    config_path = hf_dir / "config.json"
    config.to_json(config_path)
    logger.info(f"  Config saved: {config_path}")

    # Model card (README.md with HF metadata front matter)
    model_card = f"""---
language:
- id
- en
license: mit
library_name: pytorch
tags:
- diffusion
- text-generation
- aam
- aphantasic-abstraction-model
- sentence-arrangement
- graph-conditioned
---

# AAM Diffusion LLM v1.0

> **"AAM = 1 Pikiran + 1 Tubuh" (1 Mind + 1 Body)**

The dedicated "body" of the Aphantasic Abstraction Model (AAM) — a small diffusion LLM trained specifically to arrange sentences from structured graph data.

## What is this?

This is NOT a general-purpose LLM. This is a SPECIALIZED sentence composer that:
- Takes **graph-structured conditioning** as input (evidence, anomalies, reasoning chains, confidence scores)
- Produces **coherent natural language narratives** through iterative denoising
- **Cannot hallucinate** — it can only narrate what the graph knows

## Architecture

```
Graph Conditioning Encoder  →  Diffusion Transformer  →  Noise Scheduler
       (Mind input)                 (The Body)         (Iterative refinement)
```

### Key Components
- **Graph Conditioning Encoder**: Encodes evidence nodes, compositions, anomalies, and reasoning chains with confidence and temporal embeddings
- **Diffusion Transformer**: Core denoising network with adaptive layer norm, self-attention, and cross-attention to the graph conditioning
- **Noise Scheduler**: Cosine noise schedule with DDPM/DDIM sampling support
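
If the scheduler uses the standard cosine schedule (Nichol & Dhariwal, 2021), an assumption since this card only names the schedule, the cumulative signal level is

```
alpha_bar(t) = cos^2( ((t/T + s) / (1 + s)) * pi/2 )
```

with total timesteps T and a small offset s (0.008 in the original paper).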

## Model Details

| Parameter | Value |
|-----------|-------|
| Architecture | Diffusion Transformer |
| d_model | {config.model.d_model} |
| n_layers | {config.model.n_layers} |
| n_heads | {config.model.n_heads} |
| d_ff | {config.model.d_ff} |
| Parameters | {model._format_params(model.get_num_params())} |
| Vocab size | {config.model.vocab_size} |
| Max sequence length | {config.model.max_seq_len} |
| Diffusion timesteps (train) | {config.diffusion.n_timesteps} |
| Diffusion timesteps (inference) | {config.diffusion.n_inference_steps} |
| Noise schedule | {config.diffusion.schedule_type} |
| Prediction type | {config.diffusion.prediction_type} |
| Sampling method | {config.diffusion.sampling_method} |

## Usage

```python
from diffusion_llm import AamDiffusionModel, AamTokenizer, AamGenerator, AamDiffusionConfig

# Load model
config = AamDiffusionConfig.from_json("config.json")
model = AamDiffusionModel.load("model.pt")
tokenizer = AamTokenizer.load("tokenizer.json")

# Create generator
generator = AamGenerator(model, tokenizer, config)

# Generate a narrative from graph conditioning
result = generator.generate(
    trigger="Siapa yang mencuri Snow Plum Pill?",
    evidence_nodes=["Hefei", "Diancang Five Swords", "Ju Jangmok"],
    anomalies=["Tidak ada konsumsi pil baru di pasar gelap"],
    reasoning_steps=["Cross-reference tanggal kejadian"],
    source_trust=0.85,
)
print(result.narrative)
```

## Philosophy

**AAM = 1 Mind + 1 Body**

- **Mind** = RSVS Knowledge Graph (structural memory, perfect recall, relational understanding)
- **Body** = this Diffusion LLM (sentence arranger, graph-conditioned, anti-hallucination)

Unlike a rented LLM (GPT, Claude) serving as the "body", this model is trained specifically for AAM:
- It cannot generate information that is not present in the graph conditioning
- It arranges sentences based on structured evidence
- It uses diffusion (non-sequential generation) instead of autoregressive generation
- It is small ({model._format_params(model.get_num_params())}) but specialized

## Training

Trained on synthetic Graph→Narrative pairs with:
- Indonesian and English narrative templates
- Evidence nodes, anomalies, reasoning chains
- Confidence score distributions
- Source trust scores
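
Each record is one JSON object per line (JSONL). An illustrative record (field names match what the training script reads; the values here are invented):

```json
{{"trigger": "Siapa yang mencuri Snow Plum Pill?", "narrative": "Bukti mengarah ke Hefei.", "evidence_nodes": ["Hefei", "Ju Jangmok"], "anomalies": ["Tidak ada konsumsi pil baru di pasar gelap"], "reasoning_steps": ["Cross-reference tanggal kejadian"], "compositions": []}}
```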

## License

MIT
"""
    model_card_path = hf_dir / "README.md"
    with open(model_card_path, "w", encoding="utf-8") as f:
        f.write(model_card)
    logger.info(f"  Model card saved: {model_card_path}")

    # Copy the diffusion_llm package so the export is self-contained and
    # importable from the export directory (not the whole repository root).
    framework_src = Path(__file__).parent.parent / "diffusion_llm"
    framework_dst = hf_dir / "diffusion_llm"
    if framework_dst.exists():
        shutil.rmtree(framework_dst)
    shutil.copytree(
        framework_src, framework_dst,
        ignore=shutil.ignore_patterns("__pycache__", "*.pyc", "output", "data"),
    )
    logger.info(f"  Framework code copied to: {framework_dst}")

    # Copy this training script alongside the export for reproducibility.
    train_script_dst = hf_dir / "train.py"
    shutil.copy2(Path(__file__), train_script_dst)

    # Write a minimal, self-contained inference example.
    inference_example = hf_dir / "inference_example.py"
    with open(inference_example, "w", encoding="utf-8") as f:
        f.write('''#!/usr/bin/env python3
"""AAM Diffusion LLM — Inference Example"""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))

from diffusion_llm import AamDiffusionModel, AamTokenizer, AamGenerator, AamDiffusionConfig


def main():
    # Load model and tokenizer
    config = AamDiffusionConfig.from_json("config.json")
    model = AamDiffusionModel.load("model.pt", device="cpu")
    tokenizer = AamTokenizer.load("tokenizer.json")

    # Create generator
    generator = AamGenerator(model, tokenizer, config)

    # Generate a narrative from graph conditioning
    result = generator.generate(
        trigger="Siapa yang mencuri Snow Plum Pill?",
        evidence_nodes=["Hefei", "Diancang Five Swords", "Ju Jangmok"],
        anomalies=["Tidak ada konsumsi pil baru di pasar gelap"],
        reasoning_steps=["Cross-reference tanggal kejadian", "Deteksi anomali pola"],
        source_trust=0.85,
    )

    print("=" * 60)
    print(" AAM Diffusion LLM — Generated Narrative")
    print("=" * 60)
    print(f" Evidence used: {result.evidence_used}")
    print(f" Narrative:  {result.narrative}")
    print(f" Confidence: {result.confidence:.1%}")
    print(f" Steps: {result.n_diffusion_steps}")
    print(f" Time:  {result.generation_time_s:.2f}s")


if __name__ == "__main__":
    main()
''')
    logger.info(f"  Inference example saved: {inference_example}")

    logger.info(f"\n  HuggingFace export complete: {hf_dir}")
    return hf_dir


def main():
    args = parse_args()
    set_seed(args.seed)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 60)
    print(" AAM Diffusion LLM — Final Training")
    print(" \"1 Pikiran + 1 Tubuh\" (1 Mind + 1 Body)")
    print("=" * 60)
    print()

    # Start from the size preset, then apply compact overrides for this run.
    config = get_default_config(args.model_size)

    config.model.max_seq_len = 128
    config.model.vocab_size = 8000
    config.graph_encoder.max_evidence_nodes = 10
    config.graph_encoder.max_anomalies = 5
    config.graph_encoder.max_reasoning_steps = 5
    config.graph_encoder.max_compositions = 5
    config.diffusion.n_timesteps = 200
    config.diffusion.n_inference_steps = 20
    # Leave room for special tokens so the total vocab stays at 8000.
    config.tokenizer.bpe_vocab_size = 8000 - 13

    # Training hyperparameters from the CLI.
    config.training.batch_size = args.batch_size
    config.training.learning_rate = args.learning_rate
    config.training.max_steps = args.max_steps
    config.training.use_amp = False
    config.training.num_workers = 0
    config.training.warmup_steps = min(100, args.max_steps // 5)
    config.output_dir = str(output_dir)
    config.seed = args.seed
    config.model_name = "aam-diffusion-v1"

    print(config.summary())

    # Step 1: synthetic data
    train_path, val_path = generate_data(
        output_dir, args.n_synthetic_train, args.n_synthetic_val, args.seed
    )

    # Step 2: tokenizer
    tokenizer = train_tokenizer(train_path, output_dir, config)

    # Keep the model's vocab size in sync with the trained tokenizer.
    actual_vocab = tokenizer.vocab_size
    if actual_vocab != config.model.vocab_size:
        logger.info(f"  Updating vocab_size: {config.model.vocab_size} → {actual_vocab}")
        config.model.vocab_size = actual_vocab

    # Step 3: data loaders
    train_loader, val_loader = create_dataloaders(
        train_path, val_path, tokenizer, config
    )

    # Step 4: build and train the model
    model = AamDiffusionModel(config)
    logger.info(f"  Model parameters: {model._format_params(model.get_num_params())}")

    model = train_model(
        model, tokenizer, train_loader, val_loader,
        config, output_dir, args
    )

    # Step 5: HuggingFace export
    hf_dir = export_for_huggingface(model, tokenizer, config, output_dir)

    print()
    print("=" * 60)
    print(" TRAINING COMPLETE!")
    print("=" * 60)
    print(f" Model: {config.model_name}")
    print(f" Parameters: {model._format_params(model.get_num_params())}")
    print(f" Output: {output_dir}")
    print(f" HuggingFace export: {hf_dir}")
    print()
    print(" AAM = 1 Pikiran + 1 Tubuh (1 Mind + 1 Body)")
    print("   Pikiran (Mind) = RSVS Knowledge Graph")
    print("   Tubuh (Body)   = This Diffusion LLM")
    print("=" * 60)


if __name__ == "__main__":
    main()
|
|