"""Prepare synthetic curated SFT data for Objectverse Diary LoRA tests."""

from __future__ import annotations

import argparse
import json
import sys
from collections.abc import Mapping, Sequence
from pathlib import Path

# Put the directory two levels above this file (presumably the repository
# root -- confirm against the project layout) on sys.path. This must run
# before the `src.models.schema` import below so that import resolves when the
# file is executed directly as a script.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.models.schema import DiaryEntry, ObjectInfo, ObjectUnderstanding, Persona, PersonaEnvelope

# Default output files and record counts for the "v1" and "v2" dataset versions.
DEFAULT_OUTPUT_PATH = Path("data/train/objectverse_sft_curated.jsonl")
DEFAULT_V2_OUTPUT_PATH = Path("data/train/objectverse_sft_curated_v2.jsonl")
DEFAULT_COUNT = 50
DEFAULT_V2_COUNT = 200

# Value written to each record's "source" field, per dataset version.
SOURCE_V1 = "objectverse-diary-synthetic-curated-v1"
SOURCE_V2 = "objectverse-diary-synthetic-curated-v2"

# System message placed first in every record's "messages" list.
SYSTEM_PROMPT = (
    "You are Objectverse Diary, an English-first small-model assistant. "
    "Given structured object understanding and a requested personality mode, "
    "return strict JSON with keys persona and diary. Keep the voice strange, "
    "specific to the object, and suitable for a shareable object archive."
)

# Personality modes cycled through while generating records. Each value is
# used as a key into MODE_PROFILES, so the two must stay in sync.
MODES = ("Cynical", "Dramatic", "Lonely", "Philosopher", "Romantic")

# Seed objects for the v1 dataset. The builder functions read these keys:
#   name     -- object label
#   features -- visible features (the first two are quoted in the diary text)
#   context  -- where the object is found
#   memory   -- verb phrase spliced after "I remembered how I ..."
OBJECTS = [
    {
        "name": "coffee mug",
        "features": ["white ceramic", "coffee ring", "tiny handle shadow"],
        "context": "developer desk",
        "memory": "listened to morning promises dissolve into cold coffee",
    },
    {
        "name": "mechanical keyboard",
        "features": ["black keycaps", "dust in the rows", "one glossy spacebar"],
        "context": "office corner",
        "memory": "translated panic into clicking long after midnight",
    },
    {
        "name": "running shoe",
        "features": ["creased mesh", "mud on the sole", "loose lace"],
        "context": "bedroom doorway",
        "memory": "carried brave intentions to the end of the block and back",
    },
    {
        "name": "desk lamp",
        "features": ["brushed metal neck", "warm bulb", "tilted shade"],
        "context": "late-night desk",
        "memory": "held a circle of light over notes nobody finished",
    },
    {
        "name": "water bottle",
        "features": ["clear plastic wall", "scratched cap", "half-full body"],
        "context": "kitchen counter",
        "memory": "survived every resolution to drink more water",
    },
    {
        "name": "notebook",
        "features": ["bent corner", "blue ink ghosts", "elastic strap"],
        "context": "bag pocket",
        "memory": "guarded three plans, two lists, and one sentence crossed out hard",
    },
    {
        "name": "umbrella",
        "features": ["folded black canopy", "wet seam", "curved handle"],
        "context": "entryway hook",
        "memory": "became useful only when the sky was already theatrical",
    },
    {
        "name": "house key",
        "features": ["brass teeth", "scratched bow", "small metal ring"],
        "context": "coat pocket",
        "memory": "opened the same door for every version of its human",
    },
    {
        "name": "charging cable",
        "features": ["frayed sleeve", "white plastic tip", "gentle knot"],
        "context": "bedside floor",
        "memory": "fed glowing rectangles while pretending not to resent them",
    },
    {
        "name": "teaspoon",
        "features": ["silver bowl", "thin handle", "tea stain near the neck"],
        "context": "sink edge",
        "memory": "stirred sweetness into cups and suspicion into silence",
    },
]
# Seed objects for the v2 dataset: every v1 object (given a generated
# "scene_detail") followed by additional objects that carry their own.
OBJECTS_V2 = [
    *(
        dict(
            obj,
            scene_detail=f"resting in the {obj['context']} with a history no one inventoried",
        )
        for obj in OBJECTS
    ),
    {
        "name": "wireless earbud case",
        "features": ["rounded white shell", "tiny hinge", "charging light"],
        "context": "commuter bag",
        "memory": "held two small arguments against silence through a crowded train",
        "scene_detail": "buried beside lint, receipts, and one forgotten mint",
    },
    {
        "name": "transit card",
        "features": ["scuffed plastic", "faded corner", "thin blue stripe"],
        "context": "wallet slot",
        "memory": "opened gates for mornings that were already late",
        "scene_detail": "pressed flat under coins and expired coupons",
    },
    {
        "name": "canvas tote bag",
        "features": ["creased cotton", "ink logo", "soft handles"],
        "context": "entryway floor",
        "memory": "carried groceries, books, and ambitions heavier than both",
        "scene_detail": "slumped open with a receipt still clinging inside",
    },
    {
        "name": "cracked phone case",
        "features": ["clear plastic", "corner crack", "fingerprint haze"],
        "context": "bedside table",
        "memory": "took the impact so the glowing rectangle could remain innocent",
        "scene_detail": "lying face down after another nervous scroll",
    },
    {
        "name": "lip balm tube",
        "features": ["twisted cap", "pocket scratches", "worn label"],
        "context": "coat pocket",
        "memory": "answered every small weather emergency without being thanked",
        "scene_detail": "rolling between keys and a folded train ticket",
    },
    {
        "name": "medicine organizer",
        "features": ["clear lids", "weekday letters", "plastic hinges"],
        "context": "bathroom shelf",
        "memory": "sorted tiny promises into seven obedient compartments",
        "scene_detail": "waiting under fluorescent light with Monday already open",
    },
    {
        "name": "travel toothbrush",
        "features": ["folding handle", "blue bristles", "vented cap"],
        "context": "hotel sink",
        "memory": "kept a mouth honest in rooms that forgot every guest",
        "scene_detail": "balanced near a wrapped soap and a paper cup",
    },
    {
        "name": "passport cover",
        "features": ["navy leather", "creased spine", "stitched edge"],
        "context": "carry-on pocket",
        "memory": "guarded borders, delays, and a face trying to look awake",
        "scene_detail": "wedged beside boarding papers and a silent pen",
    },
    {
        "name": "boarding pass stub",
        "features": ["thermal paper", "torn edge", "gate code"],
        "context": "jacket pocket",
        "memory": "proved a journey happened after the airport swallowed the day",
        "scene_detail": "softened by rain and folded into four tired rectangles",
    },
    {
        "name": "hotel keycard",
        "features": ["matte plastic", "blank stripe", "room-number sleeve"],
        "context": "nightstand",
        "memory": "opened a temporary room for a temporary version of its human",
        "scene_detail": "resting beside a glass of water no one trusted",
    },
    {
        "name": "remote control",
        "features": ["rubber buttons", "battery door scar", "dusty edges"],
        "context": "sofa cushion",
        "memory": "changed channels while nobody changed their mind",
        "scene_detail": "half-sunk between cushions with one crumb for company",
    },
    {
        "name": "reading glasses",
        "features": ["thin frames", "smudged lenses", "bent temple"],
        "context": "book stack",
        "memory": "made small letters confess their meaning at midnight",
        "scene_detail": "left open across a page that was never finished",
    },
    {
        "name": "glasses case",
        "features": ["hard shell", "soft lining", "snap hinge"],
        "context": "desk drawer",
        "memory": "protected fragile clarity from the tyranny of keys",
        "scene_detail": "waiting in darkness with a paperclip pressed to its side",
    },
    {
        "name": "wristwatch",
        "features": ["scratched face", "brown strap", "small crown"],
        "context": "dresser tray",
        "memory": "measured days while humans pretended not to be measured",
        "scene_detail": "stopped beside coins and a single loose button",
    },
    {
        "name": "hair clip",
        "features": ["amber plastic", "tiny teeth", "curved spring"],
        "context": "bathroom counter",
        "memory": "held chaos together for meetings, errands, and almost-crying",
        "scene_detail": "resting near a fogged mirror and stray strands",
    },
    {
        "name": "laundry token",
        "features": ["round brass", "machine number", "dulled rim"],
        "context": "laundry room",
        "memory": "bought one more spin for clothes that knew too much",
        "scene_detail": "cool in a palm smelling faintly of detergent",
    },
    {
        "name": "refrigerator magnet",
        "features": ["painted souvenir", "flat magnet back", "chipped corner"],
        "context": "kitchen door",
        "memory": "held reminders in place while everyone forgot the reason",
        "scene_detail": "pinning a grocery list under a blue-white hum",
    },
    {
        "name": "grocery receipt",
        "features": ["curled paper", "faded ink", "long total"],
        "context": "kitchen counter",
        "memory": "itemized hunger, soap, and one unnecessary chocolate bar",
        "scene_detail": "curling beside fruit that ripened too quickly",
    },
    {
        "name": "spice jar",
        "features": ["glass body", "red powder", "metal lid"],
        "context": "kitchen shelf",
        "memory": "made bland evenings briefly remember a warmer country",
        "scene_detail": "standing in a row of louder labels",
    },
    {
        "name": "cutting board",
        "features": ["wood grain", "knife marks", "rounded corner"],
        "context": "kitchen island",
        "memory": "received every chopped plan without flinching",
        "scene_detail": "drying upright after a meal nobody photographed",
    },
    {
        "name": "ceramic bowl",
        "features": ["blue rim", "tiny chip", "glazed curve"],
        "context": "dish rack",
        "memory": "held soup, cereal, and one quiet apology",
        "scene_detail": "tilted beside plates still warm from rinse water",
    },
    {
        "name": "reusable chopsticks",
        "features": ["dark bamboo", "tapered tips", "cloth sleeve"],
        "context": "lunch bag",
        "memory": "lifted noodles through ordinary hunger and office gossip",
        "scene_detail": "tucked into a sleeve with a soy sauce stain",
    },
    {
        "name": "tea tin",
        "features": ["green metal", "tight lid", "leaf dust"],
        "context": "pantry shelf",
        "memory": "kept rain-colored leaves ready for small recoveries",
        "scene_detail": "quiet behind cereal boxes and a jar of almonds",
    },
    {
        "name": "sticky note stack",
        "features": ["yellow pages", "curled edge", "faint adhesive"],
        "context": "monitor base",
        "memory": "accepted urgent thoughts that became decorative fossils",
        "scene_detail": "leaning under a monitor's cold rectangular sun",
    },
    {
        "name": "binder clip",
        "features": ["black steel", "silver arms", "pinched mouth"],
        "context": "paper tray",
        "memory": "held loose pages together when ideas tried to scatter",
        "scene_detail": "biting a stack marked later in blue ink",
    },
    {
        "name": "fountain pen",
        "features": ["black barrel", "gold nib", "ink stain"],
        "context": "notebook margin",
        "memory": "turned hesitation into lines that looked deliberate",
        "scene_detail": "uncapped beside a sentence crossed out twice",
    },
    {
        "name": "old ticket stub",
        "features": ["creased paper", "seat number", "torn perforation"],
        "context": "memory box",
        "memory": "survived the event after the applause became dust",
        "scene_detail": "pressed under postcards and a dried ribbon",
    },
    {
        "name": "candle jar",
        "features": ["smoked glass", "wax tunnel", "blackened wick"],
        "context": "window ledge",
        "memory": "made one room pretend to be softer than it was",
        "scene_detail": "cooled beside a window with rain on the other side",
    },
    {
        "name": "alarm clock",
        "features": ["round face", "plastic feet", "stubborn button"],
        "context": "bedside shelf",
        "memory": "tore people from dreams and was hated for being correct",
        "scene_detail": "facing a bed that negotiated with every morning",
    },
    {
        "name": "tape measure",
        "features": ["yellow tape", "lock switch", "metal hook"],
        "context": "tool drawer",
        "memory": "proved shelves, windows, and ambitions were smaller than claimed",
        "scene_detail": "coiled beside screws and one pencil shaved short",
    },
]

# Per-mode text fragments, keyed by the values in MODES:
#   mood  -> Persona.mood (also quoted in the diary text)
#   fear  -> Persona.secret_fear
#   tag   -> Persona.tags
#   voice -> phrase spliced into the diary text
MODE_PROFILES = {
    "Cynical": {
        "mood": "tired but sharply observant",
        "fear": "being replaced by something newer and less honest",
        "tag": ["dry witness", "domestic sarcasm", "small rebellion"],
        "voice": "withholding applause",
    },
    "Dramatic": {
        "mood": "grandly wounded",
        "fear": "being forgotten before the curtain falls",
        "tag": ["tragic prop", "household opera", "minor thunder"],
        "voice": "making every scratch sound like fate",
    },
    "Lonely": {
        "mood": "quietly abandoned",
        "fear": "becoming background forever",
        "tag": ["soft echo", "forgotten corner", "patient dust"],
        "voice": "speaking as if the room almost listened",
    },
    "Philosopher": {
        "mood": "curious and needlessly profound",
        "fear": "discovering usefulness is not the same as meaning",
        "tag": ["tiny ontology", "useful doubt", "object soul"],
        "voice": "turning chores into metaphysics",
    },
    "Romantic": {
        "mood": "hopelessly sentimental",
        "fear": "loving a human who mistakes devotion for convenience",
        "tag": ["tender witness", "secret devotion", "warm ache"],
        "voice": "saving every ordinary touch as evidence",
    },
}


def build_curated_records(
    count: int | None = None,
    *,
    version: str = "v1",
) -> list[dict[str, object]]:
    """Build ``count`` synthetic SFT records for the given dataset version.

    Objects are cycled fastest; the personality mode advances once per full
    pass over the version's object list and wraps around ``MODES``.

    Args:
        count: Number of records to build. ``None`` selects the version's
            default (``DEFAULT_V2_COUNT`` for "v2", otherwise ``DEFAULT_COUNT``).
        version: Dataset version, "v1" or "v2".

    Returns:
        One dict per record, holding metadata fields plus a ``messages`` list
        of system, user, and assistant turns. "v2" records additionally carry
        a ``scene_detail`` field.

    Raises:
        ValueError: If ``version`` is not "v1"/"v2" or ``count`` is below 1.
    """
    version = _validate_version(version)
    count = _resolve_count(count, version)
    if count < 1:
        raise ValueError("count must be at least 1")

    objects = _objects_for_version(version)
    source = _source_for_version(version)
    curation_notes = _curation_notes(version)  # identical for every row

    records: list[dict[str, object]] = []
    for index in range(count):
        obj = objects[index % len(objects)]
        mode = MODES[(index // len(objects)) % len(MODES)]

        understanding = _build_object_understanding(obj)
        # Serialize once; the same payload feeds both the record field and the
        # user prompt.
        understanding_payload = understanding.model_dump(mode="json")
        persona = _build_persona(obj, mode)
        diary = _build_diary(obj, mode, persona.persona, index)
        assistant_payload = {
            "persona": persona.persona.model_dump(mode="json"),
            "diary": diary.model_dump(mode="json"),
        }

        record: dict[str, object] = {
            "id": _record_id(version, index),
            "source": source,
            "split": "train",
            "mode": mode,
            "object_description": _object_description(obj),
            "object_understanding": understanding_payload,
            "curation_notes": curation_notes,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": _user_prompt(understanding_payload, mode),
                },
                {
                    "role": "assistant",
                    "content": json.dumps(assistant_payload, ensure_ascii=False),
                },
            ],
        }
        if version == "v2":
            record["scene_detail"] = str(obj["scene_detail"])
        records.append(record)
    return records


def write_jsonl(records: Sequence[Mapping[str, object]], output_path: Path) -> Path:
    """Write ``records`` to ``output_path`` as UTF-8 JSON Lines.

    Parent directories are created as needed. Each record becomes one line of
    JSON with sorted keys and non-ASCII text left unescaped. An empty
    ``records`` sequence yields an empty file rather than a file containing a
    single blank line, which line-by-line JSON readers cannot parse.

    Returns:
        ``output_path``, unchanged.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    payload = "".join(
        json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n" for record in records
    )
    output_path.write_text(payload, encoding="utf-8")
    return output_path


def prepare_curated_dataset(
    output_path: Path | None = None,
    count: int | None = None,
    *,
    version: str = "v1",
) -> Path:
    """Build the curated records for ``version`` and write them as JSONL.

    Args:
        output_path: Destination file. ``None`` selects the version's default
            path (``DEFAULT_V2_OUTPUT_PATH`` for "v2", otherwise
            ``DEFAULT_OUTPUT_PATH``).
        count: Number of records; ``None`` selects the version's default.
        version: Dataset version, "v1" or "v2".

    Returns:
        The path that was written.

    Raises:
        ValueError: If ``version`` is invalid or ``count`` is below 1.
    """
    version = _validate_version(version)
    if output_path is None:
        output_path = DEFAULT_V2_OUTPUT_PATH if version == "v2" else DEFAULT_OUTPUT_PATH
    return write_jsonl(build_curated_records(count, version=version), output_path)


def _validate_version(version: str) -> str:
    """Return ``version`` unchanged, raising ValueError unless it is "v1" or "v2"."""
    if version not in {"v1", "v2"}:
        raise ValueError("version must be 'v1' or 'v2'.")
    return version


def _resolve_count(count: int | None, version: str) -> int:
    """Return ``count``, or the default record count for ``version`` when it is None."""
    if count is not None:
        return count
    return DEFAULT_V2_COUNT if version == "v2" else DEFAULT_COUNT


def _objects_for_version(version: str) -> Sequence[Mapping[str, object]]:
    """Return the seed-object list used by ``version``."""
    return OBJECTS_V2 if version == "v2" else OBJECTS


def _source_for_version(version: str) -> str:
    """Return the ``source`` label written into records of ``version``."""
    return SOURCE_V2 if version == "v2" else SOURCE_V1


def _record_id(version: str, index: int) -> str:
    """Return the record id for zero-based ``index``: one-based, zero-padded to 4 digits."""
    if version == "v2":
        return f"curated-v2-synthetic-{index + 1:04d}"
    return f"curated-synthetic-{index + 1:04d}"


def _curation_notes(version: str) -> str:
    """Return the fixed curation note attached to every record of ``version``."""
    if version == "v2":
        return (
            "Synthetic curated v2 row: no private photo, no personal identifier, "
            "broader object and scene coverage, English-first output with Chinese helper text."
        )
    return (
        "Synthetic curated row: no private photo, no personal identifier, "
        "English-first output with Chinese helper text."
    )


def _build_object_understanding(obj: Mapping[str, object]) -> ObjectUnderstanding:
    """Wrap a seed object's name, features, and context in an ObjectUnderstanding.

    The confidence is a fixed 0.9 for every synthetic object.
    """
    return ObjectUnderstanding(
        object=ObjectInfo(
            name=str(obj["name"]),
            visible_features=[str(feature) for feature in obj["features"]],
            likely_context=str(obj["context"]),
            confidence=0.9,
        )
    )


def _build_persona(obj: Mapping[str, object], mode: str) -> PersonaEnvelope:
    """Build the persona for ``obj`` in personality ``mode``.

    Mood, secret fear, and tags come from ``MODE_PROFILES[mode]``; the core
    memory comes from the object; the complaint is a fixed template around the
    object name.

    Raises:
        KeyError: If ``mode`` is not a key of ``MODE_PROFILES``.
    """
    profile = MODE_PROFILES[mode]
    object_name = str(obj["name"])
    character_name = _character_name(object_name, mode)
    return PersonaEnvelope(
        persona=Persona(
            object_name=object_name,
            character_name=character_name,
            mood=str(profile["mood"]),
            secret_fear=str(profile["fear"]),
            core_memory=str(obj["memory"]),
            complaint=(
                f"I am not merely a {object_name}; I am an archive of what humans do "
                "when they think things cannot testify."
            ),
            tags=[str(tag) for tag in profile["tag"]],
        )
    )


def _build_diary(obj: Mapping[str, object], mode: str, persona: Persona, index: int) -> DiaryEntry:
    """Build the diary entry (title, English text, Chinese helper text) for one record.

    Only the first two features are quoted. Objects without a ``scene_detail``
    key (the v1 seed objects) fall back to a fixed closing phrase. The day
    number in the title is ``300 + index + len(object name)``.

    Raises:
        KeyError: If ``mode`` is not a key of ``MODE_PROFILES``.
    """
    profile = MODE_PROFILES[mode]
    object_name = str(obj["name"])
    features = ", ".join(str(feature) for feature in obj["features"][:2])
    scene = str(obj.get("scene_detail", "collecting proof that ordinary things notice everything"))
    day_number = 300 + index + len(object_name)
    english = (
        f"Today I waited in the {obj['context']} wearing my {features} like official records. "
        "The humans moved around me with the confidence of temporary weather. "
        f"I remembered how I {obj['memory']}, and I answered in my own way: {profile['voice']}. "
        f"My mood is {persona.mood}, but I am still here, {scene}."
    )
    # The Chinese helper text deliberately splices in the same English
    # fragments (context, features, memory, voice, mood, scene).
    chinese = (
        f"今天我待在 {obj['context']},带着 {features},像一份安静的档案。"
        "人类从我身边经过,好像自己不是短暂天气。"
        f"我记得自己曾经 {obj['memory']},于是用自己的方式回应:{profile['voice']}。"
        f"我的情绪是 {persona.mood},但我仍在这里,{scene}。"
    )
    return DiaryEntry(
        title=f"Secret Diary - Day {day_number}",
        english=english,
        chinese=chinese,
    )


def _character_name(object_name: str, mode: str) -> str:
    """Return a character name built from the object name and a per-mode suffix.

    The first two whitespace-separated words of ``object_name`` are capitalized
    and fused (e.g. "coffee mug" -> "CoffeeMug"), then followed by the suffix.

    Raises:
        KeyError: If ``mode`` has no suffix defined.
    """
    compact = "".join(part.capitalize() for part in object_name.split()[:2])
    suffix = {
        "Cynical": "Ash",
        "Dramatic": "of the Minor Stage",
        "Lonely": "Afterlight",
        "Philosopher": "the Questioning",
        "Romantic": "de Moon",
    }[mode]
    # strip() drops the leading space left behind when object_name is empty.
    return f"{compact} {suffix}".strip()


def _object_description(obj: Mapping[str, object]) -> str:
    """Return a one-line description: name, context, all features, and any scene detail."""
    features = ", ".join(str(feature) for feature in obj["features"])
    description = f"{obj['name']} in a {obj['context']} with {features}"
    if "scene_detail" in obj:
        description = f"{description}, {obj['scene_detail']}"
    return description


def _user_prompt(object_understanding: Mapping[str, object], mode: str) -> str:
    """Return the user-turn text: the mode, the understanding as sorted-key JSON, and the instruction."""
    payload = json.dumps(object_understanding, ensure_ascii=False, sort_keys=True)
    return (
        f"Personality mode: {mode}\n"
        f"Object understanding JSON: {payload}\n"
        "Return JSON with keys persona and diary only."
    )


def _positive_int(value: str) -> int:
    """argparse ``type`` callable accepting only integers of at least 1.

    Rejecting bad counts here gives a normal usage error instead of the
    ValueError traceback ``build_curated_records`` would otherwise raise.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError("count must be at least 1")
    return number


def _parse_args() -> argparse.Namespace:
    """Parse the ``--version``, ``--count``, and ``--output`` command-line options."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--version", choices=("v1", "v2"), default="v1")
    parser.add_argument("--count", type=_positive_int, default=None)
    parser.add_argument("--output", type=Path, default=None)
    return parser.parse_args()


def main() -> None:
    """CLI entry point: write the dataset and print how many records were written."""
    args = _parse_args()
    output_path = prepare_curated_dataset(args.output, args.count, version=args.version)
    # Resolve the count with the same helper the builder uses, so the message
    # always matches what was written.
    record_count = _resolve_count(args.count, args.version)
    print(f"wrote {record_count} synthetic curated SFT records to {output_path}")


if __name__ == "__main__":
    main()