| """ |
| AAM Diffusion LLM — Synthetic Data Generator |
| |
| Generates synthetic Graph→Narrative training pairs for |
| pre-training the diffusion model before real data is available. |
| |
| The synthetic data follows the AAM pattern: |
| - Graph conditioning: evidence, compositions, anomalies, reasoning |
| - Target narrative: natural language text that represents the graph data |
| |
| This is essential because: |
| 1. We need training data before the model can be used |
| 2. The data must follow the Graph→Narrative format specifically |
| 3. Synthetic data helps bootstrap the model's ability to |
| arrange sentences from structured evidence |
| |
| Analogy: like Jin Soun practicing on fictional cases |
| before facing real ones — synthetic data provides the |
| "basic training" before real data is available. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import logging |
| import random |
| from dataclasses import dataclass, field |
| from pathlib import Path |
| from typing import Optional |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
| |
|
|
| |
# Indonesian narrative templates, filled in with ``str.format``.
#
# "analysis" holds the outer sentence frames; their placeholders are
# {trigger}, {evidence_summary}, {anomaly_summary}, {reasoning_summary}
# and {confidence_pct}.  The other three groups hold the fragments that are
# rendered first and then substituted into an "analysis" frame.
ID_TEMPLATES = {
    # Outer frames for the complete narrative.
    "analysis": [
        "Berdasarkan analisis terhadap {trigger}: {evidence_summary}. {reasoning_summary}. Tingkat keyakinan: {confidence_pct}.",
        "Analisis menunjukkan bahwa {trigger} terkait dengan {evidence_summary}. {anomaly_summary}. Kesimpulan: {reasoning_summary}.",
        "Dari data yang tersedia, {trigger} memiliki koneksi ke {evidence_summary}. {reasoning_summary}. Confidence: {confidence_pct}.",
        "Hasil investigasi: {trigger}. Bukti: {evidence_summary}. {anomaly_summary}. {reasoning_summary}.",
        "Temuan: {trigger} berkorelasi dengan {evidence_summary}. Catatan: {anomaly_summary}. Analisis: {reasoning_summary}.",
    ],
    # Evidence fragment; {nodes} is a comma-separated list of node labels.
    "evidence_summary": [
        "bukti menunjukkan {nodes}",
        "data dari {nodes} mengindikasikan",
        "{nodes} menjadi kunci",
        "informasi dari {nodes} mengarah ke",
        "sumber {nodes} mengkonfirmasi",
    ],
    # Anomaly fragment; {anomalies} is a "; "-separated list.
    "anomaly_summary": [
        "Anomali terdeteksi: {anomalies}",
        "Perhatian: {anomalies}",
        "Pola tidak lazim: {anomalies}",
        "Ketidaksesuaian ditemukan: {anomalies}",
        "Terdapat kejanggalan: {anomalies}",
    ],
    # Reasoning fragment; {steps} is a "; "-separated list.
    "reasoning_summary": [
        "Langkah penalaran: {steps}",
        "Proses deduksi: {steps}",
        "Analisis bertahap: {steps}",
        "Penelusuran logika: {steps}",
        "Rantai penalaran: {steps}",
    ],
}
|
|
| |
# English narrative templates, filled in with ``str.format``.
#
# Same layout as the Indonesian table: "analysis" holds the outer sentence
# frames ({trigger}, {evidence_summary}, {anomaly_summary},
# {reasoning_summary}, {confidence_pct}); the remaining groups hold the
# fragments that get substituted into those frames.
EN_TEMPLATES = {
    # Outer frames for the complete narrative.
    "analysis": [
        "Based on analysis of {trigger}: {evidence_summary}. {reasoning_summary}. Confidence: {confidence_pct}.",
        "Analysis indicates that {trigger} is related to {evidence_summary}. {anomaly_summary}. Conclusion: {reasoning_summary}.",
        "From available data, {trigger} has connections to {evidence_summary}. {reasoning_summary}. Confidence level: {confidence_pct}.",
        "Investigation results: {trigger}. Evidence: {evidence_summary}. {anomaly_summary}. {reasoning_summary}.",
        "Findings: {trigger} correlates with {evidence_summary}. Note: {anomaly_summary}. Analysis: {reasoning_summary}.",
    ],
    # Evidence fragment; {nodes} is a comma-separated list of node labels.
    "evidence_summary": [
        "evidence shows {nodes}",
        "data from {nodes} indicates",
        "{nodes} are key factors",
        "information from {nodes} points to",
        "sources {nodes} confirm",
    ],
    # Anomaly fragment; {anomalies} is a "; "-separated list.
    "anomaly_summary": [
        "Anomaly detected: {anomalies}",
        "Note: {anomalies}",
        "Unusual pattern: {anomalies}",
        "Inconsistency found: {anomalies}",
        "Irregularity observed: {anomalies}",
    ],
    # Reasoning fragment; {steps} is a "; "-separated list.
    "reasoning_summary": [
        "Reasoning steps: {steps}",
        "Deductive process: {steps}",
        "Step-by-step analysis: {steps}",
        "Logical trace: {steps}",
        "Reasoning chain: {steps}",
    ],
}
|
|
| |
# Pool of evidence-node labels to sample from, keyed by language code.
# The "id" and "en" pools currently hold the same labels but are kept as
# two separate lists.
SAMPLE_EVIDENCE_NODES = {
    "id": [
        "Hefei",
        "Diancang Five Swords",
        "Ju Jangmok",
        "Snow Plum Pill",
        "Gyeryong Merchant Guild",
        "Simhyeon Pavilion",
        "Martial Alliance",
        "Gu Ilmu",
        "Jang Hangi",
        "Blood Serpent Dance Step",
        "taeul_sect",
        "dark_faction",
        "hefei_branch",
    ],
    "en": [
        "Hefei",
        "Diancang Five Swords",
        "Ju Jangmok",
        "Snow Plum Pill",
        "Gyeryong Merchant Guild",
        "Simhyeon Pavilion",
        "Martial Alliance",
        "Gu Ilmu",
        "Jang Hangi",
        "Blood Serpent Dance Step",
        "taeul_sect",
        "dark_faction",
        "hefei_branch",
    ],
}
|
|
# Pool of trigger texts (questions / analysis requests), keyed by language
# code.  One trigger is picked per generated example.
SAMPLE_TRIGGERS = {
    # Indonesian triggers.
    "id": [
        "Siapa yang mencuri Snow Plum Pill?",
        "Analisis pergerakan Diancang Five Swords",
        "Hubungan antara Ju Jangmok dan pencurian",
        "Anomali dalam laporan Hefei",
        "Investigasi inside job di Diancang",
        "Pola konsumsi Snow Plum Pill",
        "Cross-reference kejadian di Hefei",
        "Evaluasi kepercayaan sumber informasi",
        "Prediksi tindakan berikutnya tersangka",
        "Pattern completion dari bukti terpisah",
    ],
    # English triggers.
    "en": [
        "Who stole the Snow Plum Pill?",
        "Analysis of Diancang Five Swords movements",
        "Connection between Ju Jangmok and the theft",
        "Anomalies in the Hefei reports",
        "Investigation of inside job at Diancang",
        "Pattern of Snow Plum Pill consumption",
        "Cross-referencing events in Hefei",
        "Source trustworthiness evaluation",
        "Predicting next suspect actions",
        "Pattern completion from disparate evidence",
    ],
}
|
|
# Pool of anomaly descriptions, keyed by language code.  A generated example
# carries either none of these or a small random subset.
SAMPLE_ANOMALIES = {
    # Indonesian anomaly descriptions.
    "id": [
        "Tidak ada konsumsi pil baru di pasar gelap",
        "Pencuri menghilang tanpa jejak",
        "Success rate pair lebih tinggi dari biasanya",
        "Misi di-assign dari dalam Diancang sendiri",
        "Ju Jangmok menghilang hari yang sama dengan pencurian",
        "Tidak ada pencuri baru setelah Ju Jangmok menghilang",
    ],
    # English anomaly descriptions.
    "en": [
        "No new pill consumption in black market",
        "Thief disappeared without a trace",
        "Pair success rate unusually high",
        "Mission assigned from within Diancang itself",
        "Ju Jangmok disappeared same day as theft",
        "No new thief appeared after Ju Jangmok vanished",
    ],
}
|
|
# Pool of reasoning-step descriptions, keyed by language code.  Each entry
# is written as "<step name>: <description>".
SAMPLE_REASONING_STEPS = {
    # Indonesian reasoning steps.
    "id": [
        "Recall: Ingat semua laporan terkait Hefei",
        "Cross-reference: Bandingkan tanggal kejadian",
        "Filter: Eliminasi yang tidak relevan",
        "Anomaly: Deteksi ketidaksesuaian pola",
        "Pattern: Hubungkan fragmen terpisah",
        "Compose: Susun kesimpulan dari bukti",
        "Predict: Perkirakan tindakan berikutnya",
        "Verify: Cek konsistensi kesimpulan",
    ],
    # English reasoning steps.
    "en": [
        "Recall: Remember all reports related to Hefei",
        "Cross-reference: Compare event dates",
        "Filter: Eliminate irrelevant data",
        "Anomaly: Detect pattern inconsistency",
        "Pattern: Connect disparate fragments",
        "Compose: Assemble conclusion from evidence",
        "Predict: Estimate next actions",
        "Verify: Check conclusion consistency",
    ],
}
|
|
|
|
class SyntheticDataGenerator:
    """Generate synthetic Graph→Narrative training pairs.

    Each example pairs structured graph conditioning (trigger, evidence
    nodes, per-node confidence, anomalies, reasoning steps, source trust)
    with a natural-language narrative built from string templates.

    The generated data covers:
    - Various trigger types (questions, analysis requests)
    - Different numbers of evidence nodes (capped at the sample pool size)
    - Various anomaly patterns
    - Different reasoning chain lengths
    - Confidence distributions
    - Both Indonesian and English

    Every instance owns a private ``random.Random`` seeded with ``seed``.
    Creating or using one generator therefore never disturbs another
    generator or the process-wide ``random`` module state.

    Usage:
        generator = SyntheticDataGenerator()
        examples = generator.generate(n=1000, language="id")
        generator.save(examples, "training_data.jsonl")
    """

    # Phrases substituted when an example has no anomalies / reasoning steps.
    # Without them the outer templates render dangling punctuation such as
    # "Note: . Analysis: ." in the training narratives.
    _EMPTY_SUMMARIES = {
        "id": {
            "anomaly_summary": "Tidak ada anomali yang terdeteksi",
            "reasoning_summary": "Tidak ada langkah penalaran yang tercatat",
        },
        "en": {
            "anomaly_summary": "No anomalies detected",
            "reasoning_summary": "No reasoning steps recorded",
        },
    }

    def __init__(
        self,
        seed: int = 42,
        language: str = "id",
    ):
        """Initialize the synthetic data generator.

        Args:
            seed: Random seed for reproducibility.
            language: Default language for generation.
        """
        self.seed = seed
        self.language = language
        # Private RNG instead of random.seed(): reseeding the global module
        # would change the output of every other generator (and any unrelated
        # code using ``random``) as a side effect of construction.
        self._rng = random.Random(seed)

    def generate(
        self,
        n: int = 1000,
        language: Optional[str] = None,
        min_evidence: int = 2,
        max_evidence: int = 15,
        anomaly_probability: float = 0.6,
        reasoning_probability: float = 0.8,
    ) -> list[dict]:
        """Generate synthetic training examples.

        Any language other than ``"id"`` uses the English templates and
        sample pools.

        Args:
            n: Number of examples to generate.
            language: Language override; falls back to the instance default.
            min_evidence: Minimum evidence nodes per example.
            max_evidence: Maximum evidence nodes per example. The drawn
                count is capped at the size of the evidence pool.
            anomaly_probability: Probability of including anomalies.
            reasoning_probability: Probability of including reasoning steps.

        Returns:
            List of training example dictionaries.

        Raises:
            ValueError: Propagated from the RNG when ``min_evidence`` is
                greater than ``max_evidence`` or a negative count is drawn.
        """
        lang = language or self.language
        templates = ID_TEMPLATES if lang == "id" else EN_TEMPLATES
        evidence_pool = SAMPLE_EVIDENCE_NODES.get(lang, SAMPLE_EVIDENCE_NODES["en"])
        trigger_pool = SAMPLE_TRIGGERS.get(lang, SAMPLE_TRIGGERS["en"])
        anomaly_pool = SAMPLE_ANOMALIES.get(lang, SAMPLE_ANOMALIES["en"])
        reasoning_pool = SAMPLE_REASONING_STEPS.get(lang, SAMPLE_REASONING_STEPS["en"])

        rng = self._rng
        examples = []
        # NOTE: the order of RNG draws below is part of the reproducibility
        # contract; reordering them changes the data produced for a seed.
        for _ in range(n):
            trigger = rng.choice(trigger_pool)

            # Distinct evidence nodes, never more than the pool contains.
            n_evidence = rng.randint(min_evidence, max_evidence)
            evidence = rng.sample(evidence_pool, min(n_evidence, len(evidence_pool)))

            # Per-node confidence in [0.3, 1.0], rounded to two decimals.
            confidence_map = {
                node: round(rng.uniform(0.3, 1.0), 2)
                for node in evidence
            }

            # Optional anomalies (1-3 distinct entries).
            anomalies = []
            if rng.random() < anomaly_probability:
                n_anomalies = rng.randint(1, 3)
                anomalies = rng.sample(anomaly_pool, min(n_anomalies, len(anomaly_pool)))

            # Optional reasoning chain (2-6 distinct steps).
            reasoning_steps = []
            if rng.random() < reasoning_probability:
                n_steps = rng.randint(2, 6)
                reasoning_steps = rng.sample(reasoning_pool, min(n_steps, len(reasoning_pool)))

            source_trust = round(rng.uniform(0.5, 1.0), 2)

            narrative = self._generate_narrative(
                trigger=trigger,
                evidence=evidence,
                anomalies=anomalies,
                reasoning_steps=reasoning_steps,
                confidence_map=confidence_map,
                templates=templates,
                lang=lang,
            )

            examples.append(
                {
                    "narrative": narrative,
                    "trigger": trigger,
                    "evidence_nodes": evidence,
                    "compositions": [],
                    "confidence_map": confidence_map,
                    "anomalies": anomalies,
                    "reasoning_steps": reasoning_steps,
                    "source_trust": source_trust,
                    "language": lang,
                    "source": "synthetic",
                }
            )

        # Report what was actually produced (differs from n when n < 0).
        logger.info("Generated %d synthetic examples (language=%s)", len(examples), lang)
        return examples

    def _generate_narrative(
        self,
        trigger: str,
        evidence: list[str],
        anomalies: list[str],
        reasoning_steps: list[str],
        confidence_map: dict[str, float],
        templates: dict,
        lang: str,
    ) -> str:
        """Generate a narrative from templates.

        Args:
            trigger: Trigger text.
            evidence: Evidence node labels; only the first 5 are mentioned.
            anomalies: Anomaly descriptions; only the first 3 are mentioned.
            reasoning_steps: Reasoning step descriptions; only the first 4
                are mentioned.
            confidence_map: Confidence scores; their mean is rendered as a
                whole percentage (0% when the map is empty).
            templates: Template dictionary with the keys ``"analysis"``,
                ``"evidence_summary"``, ``"anomaly_summary"`` and
                ``"reasoning_summary"``.
            lang: Language code; selects the fallback phrase used when
                there are no anomalies or reasoning steps (``"id"`` for
                Indonesian, anything else for English).

        Returns:
            Generated narrative string.
        """
        rng = self._rng
        fallbacks = self._EMPTY_SUMMARIES["id" if lang == "id" else "en"]

        evidence_str = ", ".join(evidence[:5])
        # max(..., 1) guards the division when there is no evidence at all.
        avg_confidence = sum(confidence_map.values()) / max(len(confidence_map), 1)

        evidence_summary = rng.choice(templates["evidence_summary"]).format(
            nodes=evidence_str
        )

        # A template is drawn only when there is content, exactly as before,
        # so the RNG stream is the same whether or not a fallback is used.
        if anomalies:
            anomaly_summary = rng.choice(templates["anomaly_summary"]).format(
                anomalies="; ".join(anomalies[:3])
            )
        else:
            anomaly_summary = fallbacks["anomaly_summary"]

        if reasoning_steps:
            reasoning_summary = rng.choice(templates["reasoning_summary"]).format(
                steps="; ".join(reasoning_steps[:4])
            )
        else:
            reasoning_summary = fallbacks["reasoning_summary"]

        # Frames that lack a placeholder simply ignore the extra keyword.
        return rng.choice(templates["analysis"]).format(
            trigger=trigger,
            evidence_summary=evidence_summary,
            anomaly_summary=anomaly_summary,
            reasoning_summary=reasoning_summary,
            confidence_pct=f"{avg_confidence:.0%}",
        )

    def save(
        self,
        examples: list[dict],
        path: str | Path,
    ) -> None:
        """Save examples to a JSONL file (one JSON object per line).

        Missing parent directories are created and an existing file is
        overwritten. Non-ASCII characters are written as-is in UTF-8.

        Args:
            examples: List of example dictionaries.
            path: Output file path.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        with open(path, "w", encoding="utf-8") as f:
            for example in examples:
                f.write(json.dumps(example, ensure_ascii=False) + "\n")

        logger.info("Saved %d examples to %s", len(examples), path)

    @classmethod
    def generate_training_split(
        cls,
        output_dir: str | Path,
        n_train: int = 10000,
        n_val: int = 500,
        language: str = "id",
        seed: int = 42,
    ) -> tuple[Path, Path]:
        """Generate and save train/val splits.

        Writes ``train.jsonl`` and ``val.jsonl`` into ``output_dir``. The
        validation set comes from a separate generator seeded with
        ``seed + 1`` so it does not replay the training stream.

        Args:
            output_dir: Output directory (created if missing).
            n_train: Number of training examples.
            n_val: Number of validation examples.
            language: Language for generation.
            seed: Random seed.

        Returns:
            Tuple of (train_path, val_path).
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        generator = cls(seed=seed, language=language)
        train_examples = generator.generate(n=n_train, language=language)
        train_path = output_dir / "train.jsonl"
        generator.save(train_examples, train_path)

        val_generator = cls(seed=seed + 1, language=language)
        val_examples = val_generator.generate(n=n_val, language=language)
        val_path = output_dir / "val.jsonl"
        val_generator.save(val_examples, val_path)

        logger.info(
            "Generated training split: %d train, %d val",
            n_train, n_val,
        )

        return train_path, val_path
|
|