# ObjectverseDiary / scripts /prepare_curated_dataset.py
# qqyule's picture
# Deploy latest Objectverse Diary from fa09aac
# dd6cefc verified
"""Prepare synthetic curated SFT data for Objectverse Diary LoRA tests."""
from __future__ import annotations
import argparse
import json
import sys
from collections.abc import Mapping, Sequence
from pathlib import Path
# PROJECT_ROOT is the parent of the directory that contains this file.
# It is prepended to sys.path (once) so that absolute imports rooted there,
# such as the `src.` import that follows, presumably resolve when this file
# is executed directly as a script.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from src.models.schema import DiaryEntry, ObjectInfo, ObjectUnderstanding, Persona, PersonaEnvelope
# Default JSONL destinations, one per dataset version; used when the caller
# does not pass an explicit output path.
DEFAULT_OUTPUT_PATH = Path("data/train/objectverse_sft_curated.jsonl")
DEFAULT_V2_OUTPUT_PATH = Path("data/train/objectverse_sft_curated_v2.jsonl")
# Default record counts. They equal (number of catalog objects) x (number of
# modes): 10 x 5 for v1 and 40 x 5 for v2, so a default run visits every
# object/mode pair exactly once.
DEFAULT_COUNT = 50
DEFAULT_V2_COUNT = 200
# Value stored in each record's "source" field, per dataset version.
SOURCE_V1 = "objectverse-diary-synthetic-curated-v1"
SOURCE_V2 = "objectverse-diary-synthetic-curated-v2"
# Content of the "system" message placed first in every record's "messages".
SYSTEM_PROMPT = (
    "You are Objectverse Diary, an English-first small-model assistant. "
    "Given structured object understanding and a requested personality mode, "
    "return strict JSON with keys persona and diary. Keep the voice strange, "
    "specific to the object, and suitable for a shareable object archive."
)
# Personality modes cycled through while generating records. Each entry is
# used as a key into MODE_PROFILES, so the two must stay in sync.
MODES = ("Cynical", "Dramatic", "Lonely", "Philosopher", "Romantic")
# v1 object catalog. Every entry provides:
#   name     - object name (also used to derive the character name)
#   features - visible features; the diary text quotes only the first two
#   context  - where the object is found
#   memory   - verb phrase inserted after "I remembered how I ..."
OBJECTS = [
    {
        "name": "coffee mug",
        "features": ["white ceramic", "coffee ring", "tiny handle shadow"],
        "context": "developer desk",
        "memory": "listened to morning promises dissolve into cold coffee",
    },
    {
        "name": "mechanical keyboard",
        "features": ["black keycaps", "dust in the rows", "one glossy spacebar"],
        "context": "office corner",
        "memory": "translated panic into clicking long after midnight",
    },
    {
        "name": "running shoe",
        "features": ["creased mesh", "mud on the sole", "loose lace"],
        "context": "bedroom doorway",
        "memory": "carried brave intentions to the end of the block and back",
    },
    {
        "name": "desk lamp",
        "features": ["brushed metal neck", "warm bulb", "tilted shade"],
        "context": "late-night desk",
        "memory": "held a circle of light over notes nobody finished",
    },
    {
        "name": "water bottle",
        "features": ["clear plastic wall", "scratched cap", "half-full body"],
        "context": "kitchen counter",
        "memory": "survived every resolution to drink more water",
    },
    {
        "name": "notebook",
        "features": ["bent corner", "blue ink ghosts", "elastic strap"],
        "context": "bag pocket",
        "memory": "guarded three plans, two lists, and one sentence crossed out hard",
    },
    {
        "name": "umbrella",
        "features": ["folded black canopy", "wet seam", "curved handle"],
        "context": "entryway hook",
        "memory": "became useful only when the sky was already theatrical",
    },
    {
        "name": "house key",
        "features": ["brass teeth", "scratched bow", "small metal ring"],
        "context": "coat pocket",
        "memory": "opened the same door for every version of its human",
    },
    {
        "name": "charging cable",
        "features": ["frayed sleeve", "white plastic tip", "gentle knot"],
        "context": "bedside floor",
        "memory": "fed glowing rectangles while pretending not to resent them",
    },
    {
        "name": "teaspoon",
        "features": ["silver bowl", "thin handle", "tea stain near the neck"],
        "context": "sink edge",
        "memory": "stirred sweetness into cups and suspicion into silence",
    },
]
# v2 object catalog: the v1 OBJECTS entries, each copied with a generated
# "scene_detail" derived from its context, followed by 30 additional objects
# that carry a hand-written "scene_detail". Every v2 entry therefore has the
# "scene_detail" key in addition to the v1 keys.
OBJECTS_V2 = [
    *(
        dict(
            obj,
            scene_detail=f"resting in the {obj['context']} with a history no one inventoried",
        )
        for obj in OBJECTS
    ),
    {
        "name": "wireless earbud case",
        "features": ["rounded white shell", "tiny hinge", "charging light"],
        "context": "commuter bag",
        "memory": "held two small arguments against silence through a crowded train",
        "scene_detail": "buried beside lint, receipts, and one forgotten mint",
    },
    {
        "name": "transit card",
        "features": ["scuffed plastic", "faded corner", "thin blue stripe"],
        "context": "wallet slot",
        "memory": "opened gates for mornings that were already late",
        "scene_detail": "pressed flat under coins and expired coupons",
    },
    {
        "name": "canvas tote bag",
        "features": ["creased cotton", "ink logo", "soft handles"],
        "context": "entryway floor",
        "memory": "carried groceries, books, and ambitions heavier than both",
        "scene_detail": "slumped open with a receipt still clinging inside",
    },
    {
        "name": "cracked phone case",
        "features": ["clear plastic", "corner crack", "fingerprint haze"],
        "context": "bedside table",
        "memory": "took the impact so the glowing rectangle could remain innocent",
        "scene_detail": "lying face down after another nervous scroll",
    },
    {
        "name": "lip balm tube",
        "features": ["twisted cap", "pocket scratches", "worn label"],
        "context": "coat pocket",
        "memory": "answered every small weather emergency without being thanked",
        "scene_detail": "rolling between keys and a folded train ticket",
    },
    {
        "name": "medicine organizer",
        "features": ["clear lids", "weekday letters", "plastic hinges"],
        "context": "bathroom shelf",
        "memory": "sorted tiny promises into seven obedient compartments",
        "scene_detail": "waiting under fluorescent light with Monday already open",
    },
    {
        "name": "travel toothbrush",
        "features": ["folding handle", "blue bristles", "vented cap"],
        "context": "hotel sink",
        "memory": "kept a mouth honest in rooms that forgot every guest",
        "scene_detail": "balanced near a wrapped soap and a paper cup",
    },
    {
        "name": "passport cover",
        "features": ["navy leather", "creased spine", "stitched edge"],
        "context": "carry-on pocket",
        "memory": "guarded borders, delays, and a face trying to look awake",
        "scene_detail": "wedged beside boarding papers and a silent pen",
    },
    {
        "name": "boarding pass stub",
        "features": ["thermal paper", "torn edge", "gate code"],
        "context": "jacket pocket",
        "memory": "proved a journey happened after the airport swallowed the day",
        "scene_detail": "softened by rain and folded into four tired rectangles",
    },
    {
        "name": "hotel keycard",
        "features": ["matte plastic", "blank stripe", "room-number sleeve"],
        "context": "nightstand",
        "memory": "opened a temporary room for a temporary version of its human",
        "scene_detail": "resting beside a glass of water no one trusted",
    },
    {
        "name": "remote control",
        "features": ["rubber buttons", "battery door scar", "dusty edges"],
        "context": "sofa cushion",
        "memory": "changed channels while nobody changed their mind",
        "scene_detail": "half-sunk between cushions with one crumb for company",
    },
    {
        "name": "reading glasses",
        "features": ["thin frames", "smudged lenses", "bent temple"],
        "context": "book stack",
        "memory": "made small letters confess their meaning at midnight",
        "scene_detail": "left open across a page that was never finished",
    },
    {
        "name": "glasses case",
        "features": ["hard shell", "soft lining", "snap hinge"],
        "context": "desk drawer",
        "memory": "protected fragile clarity from the tyranny of keys",
        "scene_detail": "waiting in darkness with a paperclip pressed to its side",
    },
    {
        "name": "wristwatch",
        "features": ["scratched face", "brown strap", "small crown"],
        "context": "dresser tray",
        "memory": "measured days while humans pretended not to be measured",
        "scene_detail": "stopped beside coins and a single loose button",
    },
    {
        "name": "hair clip",
        "features": ["amber plastic", "tiny teeth", "curved spring"],
        "context": "bathroom counter",
        "memory": "held chaos together for meetings, errands, and almost-crying",
        "scene_detail": "resting near a fogged mirror and stray strands",
    },
    {
        "name": "laundry token",
        "features": ["round brass", "machine number", "dulled rim"],
        "context": "laundry room",
        "memory": "bought one more spin for clothes that knew too much",
        "scene_detail": "cool in a palm smelling faintly of detergent",
    },
    {
        "name": "refrigerator magnet",
        "features": ["painted souvenir", "flat magnet back", "chipped corner"],
        "context": "kitchen door",
        "memory": "held reminders in place while everyone forgot the reason",
        "scene_detail": "pinning a grocery list under a blue-white hum",
    },
    {
        "name": "grocery receipt",
        "features": ["curled paper", "faded ink", "long total"],
        "context": "kitchen counter",
        "memory": "itemized hunger, soap, and one unnecessary chocolate bar",
        "scene_detail": "curling beside fruit that ripened too quickly",
    },
    {
        "name": "spice jar",
        "features": ["glass body", "red powder", "metal lid"],
        "context": "kitchen shelf",
        "memory": "made bland evenings briefly remember a warmer country",
        "scene_detail": "standing in a row of louder labels",
    },
    {
        "name": "cutting board",
        "features": ["wood grain", "knife marks", "rounded corner"],
        "context": "kitchen island",
        "memory": "received every chopped plan without flinching",
        "scene_detail": "drying upright after a meal nobody photographed",
    },
    {
        "name": "ceramic bowl",
        "features": ["blue rim", "tiny chip", "glazed curve"],
        "context": "dish rack",
        "memory": "held soup, cereal, and one quiet apology",
        "scene_detail": "tilted beside plates still warm from rinse water",
    },
    {
        "name": "reusable chopsticks",
        "features": ["dark bamboo", "tapered tips", "cloth sleeve"],
        "context": "lunch bag",
        "memory": "lifted noodles through ordinary hunger and office gossip",
        "scene_detail": "tucked into a sleeve with a soy sauce stain",
    },
    {
        "name": "tea tin",
        "features": ["green metal", "tight lid", "leaf dust"],
        "context": "pantry shelf",
        "memory": "kept rain-colored leaves ready for small recoveries",
        "scene_detail": "quiet behind cereal boxes and a jar of almonds",
    },
    {
        "name": "sticky note stack",
        "features": ["yellow pages", "curled edge", "faint adhesive"],
        "context": "monitor base",
        "memory": "accepted urgent thoughts that became decorative fossils",
        "scene_detail": "leaning under a monitor's cold rectangular sun",
    },
    {
        "name": "binder clip",
        "features": ["black steel", "silver arms", "pinched mouth"],
        "context": "paper tray",
        "memory": "held loose pages together when ideas tried to scatter",
        "scene_detail": "biting a stack marked later in blue ink",
    },
    {
        "name": "fountain pen",
        "features": ["black barrel", "gold nib", "ink stain"],
        "context": "notebook margin",
        "memory": "turned hesitation into lines that looked deliberate",
        "scene_detail": "uncapped beside a sentence crossed out twice",
    },
    {
        "name": "old ticket stub",
        "features": ["creased paper", "seat number", "torn perforation"],
        "context": "memory box",
        "memory": "survived the event after the applause became dust",
        "scene_detail": "pressed under postcards and a dried ribbon",
    },
    {
        "name": "candle jar",
        "features": ["smoked glass", "wax tunnel", "blackened wick"],
        "context": "window ledge",
        "memory": "made one room pretend to be softer than it was",
        "scene_detail": "cooled beside a window with rain on the other side",
    },
    {
        "name": "alarm clock",
        "features": ["round face", "plastic feet", "stubborn button"],
        "context": "bedside shelf",
        "memory": "tore people from dreams and was hated for being correct",
        "scene_detail": "facing a bed that negotiated with every morning",
    },
    {
        "name": "tape measure",
        "features": ["yellow tape", "lock switch", "metal hook"],
        "context": "tool drawer",
        "memory": "proved shelves, windows, and ambitions were smaller than claimed",
        "scene_detail": "coiled beside screws and one pencil shaved short",
    },
]
# Per-mode text fragments, keyed by the names listed in MODES.
#   mood  -> persona mood (also quoted in the diary text)
#   fear  -> persona secret_fear
#   tag   -> persona tags
#   voice -> phrase inserted after "I answered in my own way:" in the diary
MODE_PROFILES = {
    "Cynical": {
        "mood": "tired but sharply observant",
        "fear": "being replaced by something newer and less honest",
        "tag": ["dry witness", "domestic sarcasm", "small rebellion"],
        "voice": "withholding applause",
    },
    "Dramatic": {
        "mood": "grandly wounded",
        "fear": "being forgotten before the curtain falls",
        "tag": ["tragic prop", "household opera", "minor thunder"],
        "voice": "making every scratch sound like fate",
    },
    "Lonely": {
        "mood": "quietly abandoned",
        "fear": "becoming background forever",
        "tag": ["soft echo", "forgotten corner", "patient dust"],
        "voice": "speaking as if the room almost listened",
    },
    "Philosopher": {
        "mood": "curious and needlessly profound",
        "fear": "discovering usefulness is not the same as meaning",
        "tag": ["tiny ontology", "useful doubt", "object soul"],
        "voice": "turning chores into metaphysics",
    },
    "Romantic": {
        "mood": "hopelessly sentimental",
        "fear": "loving a human who mistakes devotion for convenience",
        "tag": ["tender witness", "secret devotion", "warm ache"],
        "voice": "saving every ordinary touch as evidence",
    },
}
def build_curated_records(
    count: int | None = None,
    *,
    version: str = "v1",
) -> list[dict[str, object]]:
    """Build the synthetic SFT records for one dataset version.

    Records walk the version's object catalog in order; each full pass over
    the catalog advances to the next entry in ``MODES`` (wrapping around).

    Args:
        count: Number of records to build. ``None`` selects the default for
            the chosen version.
        version: ``"v1"`` or ``"v2"``.

    Returns:
        A list of JSON-serialisable record dicts, each holding metadata and
        a three-message chat (system, user, assistant).

    Raises:
        ValueError: If ``version`` is not recognised or ``count`` is below 1.
    """
    version = _validate_version(version)
    is_v2 = version == "v2"
    if count is None:
        count = DEFAULT_V2_COUNT if is_v2 else DEFAULT_COUNT
    if count < 1:
        raise ValueError("count must be at least 1")

    catalog = _objects_for_version(version)
    source_label = _source_for_version(version)
    catalog_size = len(catalog)

    built: list[dict[str, object]] = []
    for position in range(count):
        # The quotient counts completed passes over the catalog and picks
        # the mode; the remainder picks the object within the current pass.
        passes_done, slot = divmod(position, catalog_size)
        item = catalog[slot]
        mode = MODES[passes_done % len(MODES)]

        identifier = _record_id(version, position)
        understanding = _build_object_understanding(item)
        envelope = _build_persona(item, mode)
        diary_entry = _build_diary(item, mode, envelope.persona, position)

        understanding_json = understanding.model_dump(mode="json")
        answer = json.dumps(
            {
                "persona": envelope.persona.model_dump(mode="json"),
                "diary": diary_entry.model_dump(mode="json"),
            },
            ensure_ascii=False,
        )
        conversation = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": _user_prompt(understanding_json, mode)},
            {"role": "assistant", "content": answer},
        ]

        row: dict[str, object] = {
            "id": identifier,
            "source": source_label,
            "split": "train",
            "mode": mode,
            "object_description": _object_description(item),
            "object_understanding": understanding_json,
            "curation_notes": _curation_notes(version),
            "messages": conversation,
        }
        if is_v2:
            # Only v2 rows expose the scene detail as a top-level field.
            row["scene_detail"] = str(item["scene_detail"])
        built.append(row)
    return built
def write_jsonl(records: Sequence[Mapping[str, object]], output_path: Path) -> Path:
    """Write ``records`` to ``output_path`` as UTF-8 JSON Lines.

    Each record becomes one line of JSON with sorted keys and non-ASCII
    characters kept as-is. Missing parent directories are created. An empty
    ``records`` sequence produces an empty file rather than a file holding a
    single blank line.

    Args:
        records: JSON-serialisable mappings, one per output line.
        output_path: Destination file; overwritten if it already exists.

    Returns:
        ``output_path``, unchanged.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Stream one line at a time: every record is terminated by exactly one
    # newline, and the whole payload is never held as a single string.
    with output_path.open("w", encoding="utf-8") as handle:
        for record in records:
            handle.write(json.dumps(record, ensure_ascii=False, sort_keys=True))
            handle.write("\n")
    return output_path
def prepare_curated_dataset(
    output_path: Path | None = None,
    count: int | None = None,
    *,
    version: str = "v1",
) -> Path:
    """Generate the curated records for ``version`` and write them as JSONL.

    Args:
        output_path: Destination file. ``None`` selects the default path for
            the chosen version.
        count: Number of records; forwarded to ``build_curated_records``.
        version: ``"v1"`` or ``"v2"``.

    Returns:
        The path that was written.

    Raises:
        ValueError: If ``version`` is not recognised (or ``count`` is
            rejected by ``build_curated_records``).
    """
    version = _validate_version(version)
    destination = output_path
    if destination is None:
        if version == "v2":
            destination = DEFAULT_V2_OUTPUT_PATH
        else:
            destination = DEFAULT_OUTPUT_PATH
    rows = build_curated_records(count, version=version)
    return write_jsonl(rows, destination)
def _validate_version(version: str) -> str:
    """Return ``version`` unchanged when it names a supported dataset version.

    Raises:
        ValueError: If ``version`` is neither ``"v1"`` nor ``"v2"``.
    """
    supported = {"v1", "v2"}
    if version in supported:
        return version
    raise ValueError("version must be 'v1' or 'v2'.")
def _objects_for_version(version: str) -> Sequence[Mapping[str, object]]:
    """Return the object catalog for ``version``; anything but "v2" gets the v1 catalog."""
    if version == "v2":
        return OBJECTS_V2
    return OBJECTS
def _source_for_version(version: str) -> str:
    """Return the "source" label for ``version``; anything but "v2" gets the v1 label."""
    if version == "v2":
        return SOURCE_V2
    return SOURCE_V1
def _record_id(version: str, index: int) -> str:
    """Return the record id for a zero-based ``index``.

    The numeric part is one-based and zero-padded to at least four digits.
    Only "v2" gets a version marker in the prefix.
    """
    prefix = "curated-v2-synthetic" if version == "v2" else "curated-synthetic"
    ordinal = index + 1
    return f"{prefix}-{ordinal:04d}"
def _curation_notes(version: str) -> str:
    """Return the curation note stored on every record of ``version``.

    The v2 note differs from the v1 note by a "v2" marker in the label and
    one extra claim about broader coverage.
    """
    is_v2 = version == "v2"
    label = "Synthetic curated v2 row" if is_v2 else "Synthetic curated row"
    claims = ["no private photo", "no personal identifier"]
    if is_v2:
        claims.append("broader object and scene coverage")
    claims.append("English-first output with Chinese helper text.")
    return f"{label}: " + ", ".join(claims)
def _build_object_understanding(obj: Mapping[str, object]) -> ObjectUnderstanding:
    """Wrap a catalog entry's name, features and context in an ObjectUnderstanding.

    The confidence is fixed at 0.9 for every synthetic object.
    """
    name = str(obj["name"])
    visible_features = list(map(str, obj["features"]))
    likely_context = str(obj["context"])
    info = ObjectInfo(
        name=name,
        visible_features=visible_features,
        likely_context=likely_context,
        confidence=0.9,
    )
    return ObjectUnderstanding(object=info)
def _build_persona(obj: Mapping[str, object], mode: str) -> PersonaEnvelope:
    """Build the persona for a catalog entry in the given personality mode.

    Mood, secret fear and tags come from the mode's profile; the core memory
    comes from the object; the character name is derived from both.

    Args:
        obj: Catalog entry providing at least ``name`` and ``memory``.
        mode: A key of ``MODE_PROFILES``.

    Returns:
        A ``PersonaEnvelope`` wrapping the generated ``Persona``.

    Raises:
        KeyError: If ``mode`` has no profile or ``obj`` lacks a required key.
    """
    profile = MODE_PROFILES[mode]
    object_name = str(obj["name"])
    character_name = _character_name(object_name, mode)
    # Choose "a"/"an" from the first letter so names such as "umbrella" or
    # "alarm clock" do not render as "a umbrella". This is a spelling-based
    # heuristic; it does not model sound-based exceptions (e.g. "hour").
    starts_with_vowel = object_name[:1].lower() in {"a", "e", "i", "o", "u"}
    article = "an" if starts_with_vowel else "a"
    return PersonaEnvelope(
        persona=Persona(
            object_name=object_name,
            character_name=character_name,
            mood=str(profile["mood"]),
            secret_fear=str(profile["fear"]),
            core_memory=str(obj["memory"]),
            complaint=(
                f"I am not merely {article} {object_name}; I am an archive of "
                "what humans do when they think things cannot testify."
            ),
            tags=[str(tag) for tag in profile["tag"]],
        )
    )
def _build_diary(obj: Mapping[str, object], mode: str, persona: Persona, index: int) -> DiaryEntry:
    """Compose the diary entry (title, English text, Chinese text) for one record.

    Args:
        obj: Catalog entry; ``scene_detail`` is optional and falls back to a
            fixed phrase when absent.
        mode: A key of ``MODE_PROFILES``; supplies the "voice" phrase.
        persona: Persona whose ``mood`` is quoted in the text.
        index: Zero-based record index; contributes to the day number.

    Returns:
        A ``DiaryEntry`` titled ``Secret Diary - Day N`` where N is
        ``300 + index + len(object name)``.
    """
    voice = MODE_PROFILES[mode]["voice"]
    name = str(obj["name"])
    # Only the first two features are quoted in the diary text.
    leading_features = ", ".join(map(str, obj["features"][:2]))
    closing_scene = str(obj.get("scene_detail", "collecting proof that ordinary things notice everything"))
    day = 300 + index + len(name)

    context = obj["context"]
    memory = obj["memory"]
    mood = persona.mood

    english_sentences = (
        f"Today I waited in the {context} wearing my {leading_features} like official records.",
        "The humans moved around me with the confidence of temporary weather.",
        f"I remembered how I {memory}, and I answered in my own way: {voice}.",
        f"My mood is {mood}, but I am still here, {closing_scene}.",
    )
    chinese_sentences = (
        f"今天我待在 {context},带着 {leading_features},像一份安静的档案。",
        "人类从我身边经过,好像自己不是短暂天气。",
        f"我记得自己曾经 {memory},于是用自己的方式回应:{voice}。",
        f"我的情绪是 {mood},但我仍在这里,{closing_scene}。",
    )
    return DiaryEntry(
        title=f"Secret Diary - Day {day}",
        # English sentences are separated by single spaces; the Chinese
        # sentences are concatenated directly.
        english=" ".join(english_sentences),
        chinese="".join(chinese_sentences),
    )
def _character_name(object_name: str, mode: str) -> str:
    """Derive a character name from an object name and a personality mode.

    The first two whitespace-separated words of ``object_name`` are
    capitalised and fused, then followed by the mode's epithet.

    Raises:
        KeyError: If ``mode`` is not one of the five known modes.
    """
    leading_words = object_name.split()[:2]
    stem = "".join(map(str.capitalize, leading_words))
    epithets = {
        "Cynical": "Ash",
        "Dramatic": "of the Minor Stage",
        "Lonely": "Afterlight",
        "Philosopher": "the Questioning",
        "Romantic": "de Moon",
    }
    epithet = epithets[mode]
    # strip() removes the leading space left behind when the stem is empty.
    return " ".join((stem, epithet)).strip()
def _object_description(obj: Mapping[str, object]) -> str:
    """Return a one-line description of a catalog entry.

    The result has the form ``"<name> in a|an <context> with <features>"``,
    followed by ``", <scene_detail>"`` when the entry has that key.

    Args:
        obj: Catalog entry providing ``name``, ``context`` and ``features``,
            and optionally ``scene_detail``.
    """
    context = str(obj["context"])
    # Choose "a"/"an" from the first letter so contexts such as
    # "office corner" do not render as "a office corner". This is a
    # spelling-based heuristic; it does not model sound-based exceptions.
    starts_with_vowel = context[:1].lower() in {"a", "e", "i", "o", "u"}
    article = "an" if starts_with_vowel else "a"
    features = ", ".join(str(feature) for feature in obj["features"])
    description = f"{obj['name']} in {article} {context} with {features}"
    if "scene_detail" in obj:
        description = f"{description}, {obj['scene_detail']}"
    return description
def _user_prompt(object_understanding: Mapping[str, object], mode: str) -> str:
    """Render the user message: mode line, understanding JSON line, instruction line.

    The understanding is serialised with sorted keys and without ASCII
    escaping.
    """
    serialized = json.dumps(object_understanding, ensure_ascii=False, sort_keys=True)
    prompt_lines = (
        f"Personality mode: {mode}",
        f"Object understanding JSON: {serialized}",
        "Return JSON with keys persona and diary only.",
    )
    return "\n".join(prompt_lines)
def _parse_args() -> argparse.Namespace:
    """Parse the command line: ``--version`` (v1/v2), ``--count`` and ``--output``.

    ``--count`` and ``--output`` default to ``None`` so that version-specific
    defaults can be applied later.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--version", default="v1", choices=("v1", "v2"))
    for flag, converter in (("--count", int), ("--output", Path)):
        cli.add_argument(flag, type=converter, default=None)
    return cli.parse_args()
def main() -> None:
    """Command-line entry point: generate the dataset and report what was written."""
    args = _parse_args()
    output_path = prepare_curated_dataset(args.output, args.count, version=args.version)
    # Mirror the defaulting rule used when building records: only a missing
    # --count (None) falls back to the version default, not any falsy value.
    if args.count is not None:
        record_count = args.count
    else:
        record_count = DEFAULT_V2_COUNT if args.version == "v2" else DEFAULT_COUNT
    print(f"wrote {record_count} synthetic curated SFT records to {output_path}")


if __name__ == "__main__":
    main()