File size: 2,898 Bytes
ed6bec6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# generator/batch_generator.py

from pathlib import Path
from .config import DEFAULT_SAMPLE_COUNT
from .dictionary_access import load_all_dictionaries
from .templates_basic import generate_emotional_expression_samples
from .templates_actions import generate_action_object_samples
from .templates_context import generate_context_rich_samples
from .templates_context_advanced import generate_advanced_context_samples
from .training_builder import build_training_sample, save_samples


def generate_single_dataset(path: Path, sample_count: int) -> None:
    """Generate one JSONL training dataset of roughly `sample_count` samples.

    Draws samples from four template families (emotional, action/object,
    context-rich, advanced-context) in equal quarters, wraps each raw sample
    into a full training record via `build_training_sample`, and writes the
    result to `path` with `save_samples`.

    Args:
        path: Destination file for the JSONL dataset.
        sample_count: Total number of raw samples to request across the
            four generators. The count is split evenly; any remainder from
            integer division is given to the first (emotional) generator so
            the total always equals `sample_count`.
    """
    dictionaries = load_all_dictionaries()

    # Split sample_count into four quarters. Using plain `// 4` everywhere
    # would drop up to 3 samples when sample_count is not divisible by 4,
    # so the remainder is folded into the first generator's share.
    per_template, remainder = divmod(sample_count, 4)

    emotional = generate_emotional_expression_samples(
        dictionaries=dictionaries,
        count=per_template + remainder,
        source_language="en",
    )
    actions = generate_action_object_samples(
        dictionaries=dictionaries,
        count=per_template,
        source_language="en",
    )
    context_rich = generate_context_rich_samples(
        dictionaries=dictionaries,
        count=per_template,
        source_language="en",
    )
    advanced = generate_advanced_context_samples(
        dictionaries=dictionaries,
        count=per_template,
        source_language="en",
    )

    raw_samples = emotional + actions + context_rich + advanced

    training_samples = []
    for sample in raw_samples:
        # Raw sample fields; missing keys fall back to neutral defaults.
        user_text = sample.get("input", "")
        glyphic_output = sample.get("glyphic", "")
        realized_output = sample.get("output", "")

        identity = "A helpful, aligned Glyphic agent."
        emotion = sample.get("emotion", "neutral")
        sensory = sample.get("sensory", "none")
        social = sample.get("social", "alone")

        # Fixed intent/behavior blocks shared by every generated sample.
        # NOTE(review): "urgency" is a string here ("1") — confirm whether
        # downstream consumers expect a string or an int.
        intent = {
            "goal": "assist",
            "urgency": "1",
            "focus": "support"
        }

        behavior = {
            "tone": "warm",
            "pacing": "steady",
            "depth": "medium",
            "style": "natural",
            "clarity": "high"
        }

        memory_summary = sample.get("memory", "")
        thought_chain = sample.get("thought_chain", "")

        training_sample = build_training_sample(
            user_text=user_text,
            identity=identity,
            emotion=emotion,
            sensory=sensory,
            social=social,
            intent=intent,
            behavior=behavior,
            memory_summary=memory_summary,
            thought_chain=thought_chain,
            glyphic_output=glyphic_output,
            realized_output=realized_output
        )
        training_samples.append(training_sample)

    save_samples(str(path), training_samples)


def generate_batches(count: int, outdir: Path,
                     sample_count: int = DEFAULT_SAMPLE_COUNT) -> None:
    """Write `count` dataset files into `outdir`.

    Creates `outdir` (including parents) if needed, then generates one
    JSONL dataset per batch, named `glyphic_dataset_<n>.jsonl` with a
    1-based index.

    Args:
        count: Number of dataset files to generate.
        outdir: Directory that receives the dataset files.
        sample_count: Samples per dataset; defaults to the project-wide
            DEFAULT_SAMPLE_COUNT so existing callers are unaffected.
    """
    outdir.mkdir(parents=True, exist_ok=True)
    for i in range(count):
        path = outdir / f"glyphic_dataset_{i+1}.jsonl"
        generate_single_dataset(path, sample_count)
        print(f"[batch] wrote {path}")