File size: 2,898 Bytes
ed6bec6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# generator/batch_generator.py
from pathlib import Path
from .config import DEFAULT_SAMPLE_COUNT
from .dictionary_access import load_all_dictionaries
from .templates_basic import generate_emotional_expression_samples
from .templates_actions import generate_action_object_samples
from .templates_context import generate_context_rich_samples
from .templates_context_advanced import generate_advanced_context_samples
from .training_builder import build_training_sample, save_samples
def generate_single_dataset(path: Path, sample_count: int):
    """Generate one dataset of training samples and save it to *path*.

    Raw samples are requested from the four template generators
    (emotional expression, action/object, context-rich, advanced context),
    always in that order and always with ``source_language="en"``. Each raw
    sample is then converted with ``build_training_sample`` and the resulting
    list is handed to ``save_samples``.

    Args:
        path: Destination of the dataset; passed to ``save_samples`` as a
            string.
        sample_count: Total number of raw samples to request. The total is
            split as evenly as possible across the four generators; when it
            is not a multiple of four, the leading generators each get one
            extra sample so the quotas still add up to ``sample_count``.
    """
    dictionaries = load_all_dictionaries()

    # Order matters: it fixes both the call order of the generators and the
    # order of the samples in the saved dataset.
    generators = (
        generate_emotional_expression_samples,
        generate_action_object_samples,
        generate_context_rich_samples,
        generate_advanced_context_samples,
    )

    # A plain ``sample_count // 4`` per generator would silently drop the
    # remainder (e.g. 10 requested -> 8 requested overall); divmod keeps it.
    base_quota, leftover = divmod(sample_count, len(generators))

    raw_samples = []
    for position, generate in enumerate(generators):
        quota = base_quota + (1 if position < leftover else 0)
        raw_samples.extend(
            generate(
                dictionaries=dictionaries,
                count=quota,
                source_language="en",
            )
        )

    training_samples = [_to_training_sample(raw) for raw in raw_samples]
    save_samples(str(path), training_samples)


def _to_training_sample(sample):
    """Map one raw template sample onto ``build_training_sample`` arguments.

    *sample* is read with ``.get``, so any missing key falls back to the
    default shown below. Identity, intent and behavior are fixed values that
    do not depend on the sample; the two dicts are built fresh on every call
    so no two training samples share the same dict object.
    """
    return build_training_sample(
        user_text=sample.get("input", ""),
        identity="A helpful, aligned Glyphic agent.",
        emotion=sample.get("emotion", "neutral"),
        sensory=sample.get("sensory", "none"),
        social=sample.get("social", "alone"),
        intent={
            "goal": "assist",
            "urgency": "1",
            "focus": "support",
        },
        behavior={
            "tone": "warm",
            "pacing": "steady",
            "depth": "medium",
            "style": "natural",
            "clarity": "high",
        },
        memory_summary=sample.get("memory", ""),
        thought_chain=sample.get("thought_chain", ""),
        glyphic_output=sample.get("glyphic", ""),
        realized_output=sample.get("output", ""),
    )
def generate_batches(count: int, outdir: Path):
    """Write *count* datasets into *outdir*, creating the directory if needed.

    Files are named ``glyphic_dataset_1.jsonl`` through
    ``glyphic_dataset_<count>.jsonl``. Each one is produced by
    ``generate_single_dataset`` with ``DEFAULT_SAMPLE_COUNT`` samples, and a
    progress line is printed after each file.
    """
    # Missing parent directories are created; an existing outdir is fine.
    outdir.mkdir(parents=True, exist_ok=True)

    # File numbering is 1-based.
    for batch_number in range(1, count + 1):
        target = outdir / f"glyphic_dataset_{batch_number}.jsonl"
        generate_single_dataset(target, DEFAULT_SAMPLE_COUNT)
        print(f"[batch] wrote {target}")
|