|
|
|
|
|
|
|
|
from pathlib import Path |
|
|
from .config import DEFAULT_SAMPLE_COUNT |
|
|
from .dictionary_access import load_all_dictionaries |
|
|
from .templates_basic import generate_emotional_expression_samples |
|
|
from .templates_actions import generate_action_object_samples |
|
|
from .templates_context import generate_context_rich_samples |
|
|
from .templates_context_advanced import generate_advanced_context_samples |
|
|
from .training_builder import build_training_sample, save_samples |
|
|
|
|
|
|
|
|
def generate_single_dataset(path: Path, sample_count: int): |
|
|
dictionaries = load_all_dictionaries() |
|
|
|
|
|
emotional = generate_emotional_expression_samples( |
|
|
dictionaries=dictionaries, |
|
|
count=sample_count // 4, |
|
|
source_language="en", |
|
|
) |
|
|
actions = generate_action_object_samples( |
|
|
dictionaries=dictionaries, |
|
|
count=sample_count // 4, |
|
|
source_language="en", |
|
|
) |
|
|
context_rich = generate_context_rich_samples( |
|
|
dictionaries=dictionaries, |
|
|
count=sample_count // 4, |
|
|
source_language="en", |
|
|
) |
|
|
advanced = generate_advanced_context_samples( |
|
|
dictionaries=dictionaries, |
|
|
count=sample_count // 4, |
|
|
source_language="en", |
|
|
) |
|
|
|
|
|
raw_samples = emotional + actions + context_rich + advanced |
|
|
|
|
|
training_samples = [] |
|
|
for sample in raw_samples: |
|
|
user_text = sample.get("input", "") |
|
|
glyphic_output = sample.get("glyphic", "") |
|
|
realized_output = sample.get("output", "") |
|
|
|
|
|
identity = "A helpful, aligned Glyphic agent." |
|
|
emotion = sample.get("emotion", "neutral") |
|
|
sensory = sample.get("sensory", "none") |
|
|
social = sample.get("social", "alone") |
|
|
|
|
|
intent = { |
|
|
"goal": "assist", |
|
|
"urgency": "1", |
|
|
"focus": "support" |
|
|
} |
|
|
|
|
|
behavior = { |
|
|
"tone": "warm", |
|
|
"pacing": "steady", |
|
|
"depth": "medium", |
|
|
"style": "natural", |
|
|
"clarity": "high" |
|
|
} |
|
|
|
|
|
memory_summary = sample.get("memory", "") |
|
|
thought_chain = sample.get("thought_chain", "") |
|
|
|
|
|
training_sample = build_training_sample( |
|
|
user_text=user_text, |
|
|
identity=identity, |
|
|
emotion=emotion, |
|
|
sensory=sensory, |
|
|
social=social, |
|
|
intent=intent, |
|
|
behavior=behavior, |
|
|
memory_summary=memory_summary, |
|
|
thought_chain=thought_chain, |
|
|
glyphic_output=glyphic_output, |
|
|
realized_output=realized_output |
|
|
) |
|
|
training_samples.append(training_sample) |
|
|
|
|
|
save_samples(str(path), training_samples) |
|
|
|
|
|
|
|
|
def generate_batches(count: int, outdir: Path): |
|
|
outdir.mkdir(parents=True, exist_ok=True) |
|
|
for i in range(count): |
|
|
path = outdir / f"glyphic_dataset_{i+1}.jsonl" |
|
|
generate_single_dataset(path, DEFAULT_SAMPLE_COUNT) |
|
|
print(f"[batch] wrote {path}") |
|
|
|
|
|
|