Spaces:
Running on Zero
Running on Zero
| """Prepare synthetic curated SFT data for Objectverse Diary LoRA tests.""" | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import sys | |
| from collections.abc import Mapping, Sequence | |
| from pathlib import Path | |
| PROJECT_ROOT = Path(__file__).resolve().parents[1] | |
| if str(PROJECT_ROOT) not in sys.path: | |
| sys.path.insert(0, str(PROJECT_ROOT)) | |
| from src.models.schema import DiaryEntry, ObjectInfo, ObjectUnderstanding, Persona, PersonaEnvelope | |
# Default JSONL destinations, one per dataset version. Both are relative
# paths, so they resolve against the current working directory when written.
DEFAULT_OUTPUT_PATH = Path("data/train/objectverse_sft_curated.jsonl")
DEFAULT_V2_OUTPUT_PATH = Path("data/train/objectverse_sft_curated_v2.jsonl")
# Default record counts: 50 = 10 v1 objects x 5 modes, 200 = 40 v2 objects x 5 modes,
# i.e. exactly one record per (object, mode) pair.
DEFAULT_COUNT = 50
DEFAULT_V2_COUNT = 200
# Value stored in each record's "source" field, per dataset version.
SOURCE_V1 = "objectverse-diary-synthetic-curated-v1"
SOURCE_V2 = "objectverse-diary-synthetic-curated-v2"
# System message placed first in every record's "messages" list.
SYSTEM_PROMPT = (
    "You are Objectverse Diary, an English-first small-model assistant. "
    "Given structured object understanding and a requested personality mode, "
    "return strict JSON with keys persona and diary. Keep the voice strange, "
    "specific to the object, and suitable for a shareable object archive."
)
# Personality modes. Record generation moves to the next mode after each full
# pass over the object list and wraps around when the modes are exhausted.
MODES = ("Cynical", "Dramatic", "Lonely", "Philosopher", "Romantic")
# Base (v1) object catalogue. Each entry supplies:
#   name     - object name; also the seed for the generated character name
#   features - visible features; the diary text quotes only the first two
#   context  - where the object is found
#   memory   - verb phrase inserted after "I remembered how I" in the diary
OBJECTS = [
    {
        "name": "coffee mug",
        "features": ["white ceramic", "coffee ring", "tiny handle shadow"],
        "context": "developer desk",
        "memory": "listened to morning promises dissolve into cold coffee",
    },
    {
        "name": "mechanical keyboard",
        "features": ["black keycaps", "dust in the rows", "one glossy spacebar"],
        "context": "office corner",
        "memory": "translated panic into clicking long after midnight",
    },
    {
        "name": "running shoe",
        "features": ["creased mesh", "mud on the sole", "loose lace"],
        "context": "bedroom doorway",
        "memory": "carried brave intentions to the end of the block and back",
    },
    {
        "name": "desk lamp",
        "features": ["brushed metal neck", "warm bulb", "tilted shade"],
        "context": "late-night desk",
        "memory": "held a circle of light over notes nobody finished",
    },
    {
        "name": "water bottle",
        "features": ["clear plastic wall", "scratched cap", "half-full body"],
        "context": "kitchen counter",
        "memory": "survived every resolution to drink more water",
    },
    {
        "name": "notebook",
        "features": ["bent corner", "blue ink ghosts", "elastic strap"],
        "context": "bag pocket",
        "memory": "guarded three plans, two lists, and one sentence crossed out hard",
    },
    {
        "name": "umbrella",
        "features": ["folded black canopy", "wet seam", "curved handle"],
        "context": "entryway hook",
        "memory": "became useful only when the sky was already theatrical",
    },
    {
        "name": "house key",
        "features": ["brass teeth", "scratched bow", "small metal ring"],
        "context": "coat pocket",
        "memory": "opened the same door for every version of its human",
    },
    {
        "name": "charging cable",
        "features": ["frayed sleeve", "white plastic tip", "gentle knot"],
        "context": "bedside floor",
        "memory": "fed glowing rectangles while pretending not to resent them",
    },
    {
        "name": "teaspoon",
        "features": ["silver bowl", "thin handle", "tea stain near the neck"],
        "context": "sink edge",
        "memory": "stirred sweetness into cups and suspicion into silence",
    },
]
# v2 object catalogue: every v1 object (copied, so OBJECTS itself is not
# mutated) with a generated "scene_detail", followed by additional objects
# that each carry an explicit "scene_detail" on top of the v1 keys.
OBJECTS_V2 = [
    *(
        dict(
            obj,
            scene_detail=f"resting in the {obj['context']} with a history no one inventoried",
        )
        for obj in OBJECTS
    ),
    {
        "name": "wireless earbud case",
        "features": ["rounded white shell", "tiny hinge", "charging light"],
        "context": "commuter bag",
        "memory": "held two small arguments against silence through a crowded train",
        "scene_detail": "buried beside lint, receipts, and one forgotten mint",
    },
    {
        "name": "transit card",
        "features": ["scuffed plastic", "faded corner", "thin blue stripe"],
        "context": "wallet slot",
        "memory": "opened gates for mornings that were already late",
        "scene_detail": "pressed flat under coins and expired coupons",
    },
    {
        "name": "canvas tote bag",
        "features": ["creased cotton", "ink logo", "soft handles"],
        "context": "entryway floor",
        "memory": "carried groceries, books, and ambitions heavier than both",
        "scene_detail": "slumped open with a receipt still clinging inside",
    },
    {
        "name": "cracked phone case",
        "features": ["clear plastic", "corner crack", "fingerprint haze"],
        "context": "bedside table",
        "memory": "took the impact so the glowing rectangle could remain innocent",
        "scene_detail": "lying face down after another nervous scroll",
    },
    {
        "name": "lip balm tube",
        "features": ["twisted cap", "pocket scratches", "worn label"],
        "context": "coat pocket",
        "memory": "answered every small weather emergency without being thanked",
        "scene_detail": "rolling between keys and a folded train ticket",
    },
    {
        "name": "medicine organizer",
        "features": ["clear lids", "weekday letters", "plastic hinges"],
        "context": "bathroom shelf",
        "memory": "sorted tiny promises into seven obedient compartments",
        "scene_detail": "waiting under fluorescent light with Monday already open",
    },
    {
        "name": "travel toothbrush",
        "features": ["folding handle", "blue bristles", "vented cap"],
        "context": "hotel sink",
        "memory": "kept a mouth honest in rooms that forgot every guest",
        "scene_detail": "balanced near a wrapped soap and a paper cup",
    },
    {
        "name": "passport cover",
        "features": ["navy leather", "creased spine", "stitched edge"],
        "context": "carry-on pocket",
        "memory": "guarded borders, delays, and a face trying to look awake",
        "scene_detail": "wedged beside boarding papers and a silent pen",
    },
    {
        "name": "boarding pass stub",
        "features": ["thermal paper", "torn edge", "gate code"],
        "context": "jacket pocket",
        "memory": "proved a journey happened after the airport swallowed the day",
        "scene_detail": "softened by rain and folded into four tired rectangles",
    },
    {
        "name": "hotel keycard",
        "features": ["matte plastic", "blank stripe", "room-number sleeve"],
        "context": "nightstand",
        "memory": "opened a temporary room for a temporary version of its human",
        "scene_detail": "resting beside a glass of water no one trusted",
    },
    {
        "name": "remote control",
        "features": ["rubber buttons", "battery door scar", "dusty edges"],
        "context": "sofa cushion",
        "memory": "changed channels while nobody changed their mind",
        "scene_detail": "half-sunk between cushions with one crumb for company",
    },
    {
        "name": "reading glasses",
        "features": ["thin frames", "smudged lenses", "bent temple"],
        "context": "book stack",
        "memory": "made small letters confess their meaning at midnight",
        "scene_detail": "left open across a page that was never finished",
    },
    {
        "name": "glasses case",
        "features": ["hard shell", "soft lining", "snap hinge"],
        "context": "desk drawer",
        "memory": "protected fragile clarity from the tyranny of keys",
        "scene_detail": "waiting in darkness with a paperclip pressed to its side",
    },
    {
        "name": "wristwatch",
        "features": ["scratched face", "brown strap", "small crown"],
        "context": "dresser tray",
        "memory": "measured days while humans pretended not to be measured",
        "scene_detail": "stopped beside coins and a single loose button",
    },
    {
        "name": "hair clip",
        "features": ["amber plastic", "tiny teeth", "curved spring"],
        "context": "bathroom counter",
        "memory": "held chaos together for meetings, errands, and almost-crying",
        "scene_detail": "resting near a fogged mirror and stray strands",
    },
    {
        "name": "laundry token",
        "features": ["round brass", "machine number", "dulled rim"],
        "context": "laundry room",
        "memory": "bought one more spin for clothes that knew too much",
        "scene_detail": "cool in a palm smelling faintly of detergent",
    },
    {
        "name": "refrigerator magnet",
        "features": ["painted souvenir", "flat magnet back", "chipped corner"],
        "context": "kitchen door",
        "memory": "held reminders in place while everyone forgot the reason",
        "scene_detail": "pinning a grocery list under a blue-white hum",
    },
    {
        "name": "grocery receipt",
        "features": ["curled paper", "faded ink", "long total"],
        "context": "kitchen counter",
        "memory": "itemized hunger, soap, and one unnecessary chocolate bar",
        "scene_detail": "curling beside fruit that ripened too quickly",
    },
    {
        "name": "spice jar",
        "features": ["glass body", "red powder", "metal lid"],
        "context": "kitchen shelf",
        "memory": "made bland evenings briefly remember a warmer country",
        "scene_detail": "standing in a row of louder labels",
    },
    {
        "name": "cutting board",
        "features": ["wood grain", "knife marks", "rounded corner"],
        "context": "kitchen island",
        "memory": "received every chopped plan without flinching",
        "scene_detail": "drying upright after a meal nobody photographed",
    },
    {
        "name": "ceramic bowl",
        "features": ["blue rim", "tiny chip", "glazed curve"],
        "context": "dish rack",
        "memory": "held soup, cereal, and one quiet apology",
        "scene_detail": "tilted beside plates still warm from rinse water",
    },
    {
        "name": "reusable chopsticks",
        "features": ["dark bamboo", "tapered tips", "cloth sleeve"],
        "context": "lunch bag",
        "memory": "lifted noodles through ordinary hunger and office gossip",
        "scene_detail": "tucked into a sleeve with a soy sauce stain",
    },
    {
        "name": "tea tin",
        "features": ["green metal", "tight lid", "leaf dust"],
        "context": "pantry shelf",
        "memory": "kept rain-colored leaves ready for small recoveries",
        "scene_detail": "quiet behind cereal boxes and a jar of almonds",
    },
    {
        "name": "sticky note stack",
        "features": ["yellow pages", "curled edge", "faint adhesive"],
        "context": "monitor base",
        "memory": "accepted urgent thoughts that became decorative fossils",
        "scene_detail": "leaning under a monitor's cold rectangular sun",
    },
    {
        "name": "binder clip",
        "features": ["black steel", "silver arms", "pinched mouth"],
        "context": "paper tray",
        "memory": "held loose pages together when ideas tried to scatter",
        "scene_detail": "biting a stack marked later in blue ink",
    },
    {
        "name": "fountain pen",
        "features": ["black barrel", "gold nib", "ink stain"],
        "context": "notebook margin",
        "memory": "turned hesitation into lines that looked deliberate",
        "scene_detail": "uncapped beside a sentence crossed out twice",
    },
    {
        "name": "old ticket stub",
        "features": ["creased paper", "seat number", "torn perforation"],
        "context": "memory box",
        "memory": "survived the event after the applause became dust",
        "scene_detail": "pressed under postcards and a dried ribbon",
    },
    {
        "name": "candle jar",
        "features": ["smoked glass", "wax tunnel", "blackened wick"],
        "context": "window ledge",
        "memory": "made one room pretend to be softer than it was",
        "scene_detail": "cooled beside a window with rain on the other side",
    },
    {
        "name": "alarm clock",
        "features": ["round face", "plastic feet", "stubborn button"],
        "context": "bedside shelf",
        "memory": "tore people from dreams and was hated for being correct",
        "scene_detail": "facing a bed that negotiated with every morning",
    },
    {
        "name": "tape measure",
        "features": ["yellow tape", "lock switch", "metal hook"],
        "context": "tool drawer",
        "memory": "proved shelves, windows, and ambitions were smaller than claimed",
        "scene_detail": "coiled beside screws and one pencil shaved short",
    },
]
# Per-mode persona ingredients, keyed by the names in MODES:
#   mood  -> Persona.mood
#   fear  -> Persona.secret_fear
#   tag   -> Persona.tags (a list, despite the singular key)
#   voice -> phrase interpolated into the generated diary text
MODE_PROFILES = {
    "Cynical": {
        "mood": "tired but sharply observant",
        "fear": "being replaced by something newer and less honest",
        "tag": ["dry witness", "domestic sarcasm", "small rebellion"],
        "voice": "withholding applause",
    },
    "Dramatic": {
        "mood": "grandly wounded",
        "fear": "being forgotten before the curtain falls",
        "tag": ["tragic prop", "household opera", "minor thunder"],
        "voice": "making every scratch sound like fate",
    },
    "Lonely": {
        "mood": "quietly abandoned",
        "fear": "becoming background forever",
        "tag": ["soft echo", "forgotten corner", "patient dust"],
        "voice": "speaking as if the room almost listened",
    },
    "Philosopher": {
        "mood": "curious and needlessly profound",
        "fear": "discovering usefulness is not the same as meaning",
        "tag": ["tiny ontology", "useful doubt", "object soul"],
        "voice": "turning chores into metaphysics",
    },
    "Romantic": {
        "mood": "hopelessly sentimental",
        "fear": "loving a human who mistakes devotion for convenience",
        "tag": ["tender witness", "secret devotion", "warm ache"],
        "voice": "saving every ordinary touch as evidence",
    },
}
def build_curated_records(
    count: int | None = None,
    *,
    version: str = "v1",
) -> list[dict[str, object]]:
    """Build synthetic SFT records for the requested dataset version.

    Generation is deterministic: record ``index`` uses object
    ``index % len(objects)``, and the mode advances once per full pass over
    the object list, wrapping around ``MODES``.

    Args:
        count: Number of records to build. ``None`` selects the version's
            default (``DEFAULT_COUNT`` for v1, ``DEFAULT_V2_COUNT`` for v2).
        version: Dataset version, ``"v1"`` or ``"v2"``.

    Returns:
        One dict per record, holding metadata fields plus a three-message
        chat (system, user, assistant). v2 records also carry
        ``"scene_detail"``.

    Raises:
        ValueError: If ``version`` is unsupported or ``count`` is below 1.
    """
    version = _validate_version(version)
    if count is None:
        count = DEFAULT_V2_COUNT if version == "v2" else DEFAULT_COUNT
    if count < 1:
        raise ValueError("count must be at least 1")

    objects = _objects_for_version(version)
    # Everything below is identical for every record, so compute it once.
    object_total = len(objects)
    mode_total = len(MODES)
    source = _source_for_version(version)
    notes = _curation_notes(version)
    include_scene = version == "v2"

    records: list[dict[str, object]] = []
    for index in range(count):
        obj = objects[index % object_total]
        mode = MODES[(index // object_total) % mode_total]

        understanding = _build_object_understanding(obj)
        # Serialized once; reused for the record field and the user prompt.
        understanding_json = understanding.model_dump(mode="json")
        persona = _build_persona(obj, mode).persona
        diary = _build_diary(obj, mode, persona, index)
        assistant_payload = {
            "persona": persona.model_dump(mode="json"),
            "diary": diary.model_dump(mode="json"),
        }

        record: dict[str, object] = {
            "id": _record_id(version, index),
            "source": source,
            "split": "train",
            "mode": mode,
            "object_description": _object_description(obj),
            "object_understanding": understanding_json,
            "curation_notes": notes,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": _user_prompt(understanding_json, mode)},
                {
                    "role": "assistant",
                    "content": json.dumps(assistant_payload, ensure_ascii=False),
                },
            ],
        }
        if include_scene:
            record["scene_detail"] = str(obj["scene_detail"])
        records.append(record)
    return records
def write_jsonl(records: Sequence[Mapping[str, object]], output_path: Path) -> Path:
    """Write ``records`` to ``output_path`` as UTF-8 JSON Lines.

    Each record becomes one line of JSON with sorted keys and non-ASCII
    characters kept as-is. Missing parent directories are created, and an
    existing file is overwritten.

    Args:
        records: JSON-serializable mappings, one per output line.
        output_path: Destination file.

    Returns:
        ``output_path``, unchanged, for convenient chaining.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Terminate every record individually so that an empty input yields an
    # empty file instead of a single blank (and therefore invalid) JSONL line.
    payload = "".join(
        json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n" for record in records
    )
    output_path.write_text(payload, encoding="utf-8")
    return output_path
def prepare_curated_dataset(
    output_path: Path | None = None,
    count: int | None = None,
    *,
    version: str = "v1",
) -> Path:
    """Build the curated records for ``version`` and write them as JSONL.

    Args:
        output_path: Destination file. ``None`` selects the default path for
            the requested version.
        count: Number of records, forwarded to ``build_curated_records``.
        version: Dataset version, ``"v1"`` or ``"v2"``.

    Returns:
        The path the records were written to.

    Raises:
        ValueError: If ``version`` is unsupported or ``count`` is below 1.
    """
    checked_version = _validate_version(version)
    if output_path is not None:
        destination = output_path
    elif checked_version == "v2":
        destination = DEFAULT_V2_OUTPUT_PATH
    else:
        destination = DEFAULT_OUTPUT_PATH
    records = build_curated_records(count, version=checked_version)
    return write_jsonl(records, destination)
def _validate_version(version: str) -> str:
    """Return ``version`` unchanged when it is a supported dataset version.

    Raises:
        ValueError: If ``version`` is neither ``"v1"`` nor ``"v2"``.
    """
    if version in {"v1", "v2"}:
        return version
    raise ValueError("version must be 'v1' or 'v2'.")
def _objects_for_version(version: str) -> Sequence[Mapping[str, object]]:
    """Return the object catalogue for ``version``; anything but ``"v2"`` gets v1."""
    if version == "v2":
        return OBJECTS_V2
    return OBJECTS
def _source_for_version(version: str) -> str:
    """Return the ``source`` label for ``version``; anything but ``"v2"`` gets v1."""
    if version == "v2":
        return SOURCE_V2
    return SOURCE_V1
def _record_id(version: str, index: int) -> str:
    """Return the record id for a zero-based ``index``.

    The numeric suffix is one-based and zero-padded to four digits; v2 ids
    carry an extra ``v2`` segment in the prefix.
    """
    prefix = "curated-v2-synthetic" if version == "v2" else "curated-synthetic"
    return f"{prefix}-{index + 1:04d}"
def _curation_notes(version: str) -> str:
    """Return the curation note stored on every record of ``version``.

    The v2 note differs from the v1 note by its label and by one extra
    clause about broader object and scene coverage.
    """
    is_v2 = version == "v2"
    label = "Synthetic curated v2 row" if is_v2 else "Synthetic curated row"
    clauses = ["no private photo", "no personal identifier"]
    if is_v2:
        clauses.append("broader object and scene coverage")
    clauses.append("English-first output with Chinese helper text.")
    return f"{label}: " + ", ".join(clauses)
def _build_object_understanding(obj: Mapping[str, object]) -> ObjectUnderstanding:
    """Wrap a catalogue entry in an ``ObjectUnderstanding``.

    Reads the ``name``, ``features`` and ``context`` keys of ``obj``; the
    confidence is fixed at 0.9 for every synthetic object.
    """
    feature_list = [str(item) for item in obj["features"]]
    info = ObjectInfo(
        name=str(obj["name"]),
        visible_features=feature_list,
        likely_context=str(obj["context"]),
        confidence=0.9,
    )
    return ObjectUnderstanding(object=info)
def _build_persona(obj: Mapping[str, object], mode: str) -> PersonaEnvelope:
    """Build the persona for ``obj`` speaking in ``mode``.

    Mood, secret fear and tags come from ``MODE_PROFILES[mode]``; the core
    memory comes from the object's ``memory`` key. An unknown ``mode``
    raises ``KeyError``.
    """
    mode_profile = MODE_PROFILES[mode]
    name = str(obj["name"])
    persona = Persona(
        object_name=name,
        character_name=_character_name(name, mode),
        mood=str(mode_profile["mood"]),
        secret_fear=str(mode_profile["fear"]),
        core_memory=str(obj["memory"]),
        complaint=(
            f"I am not merely a {name}; I am an archive of what humans do "
            "when they think things cannot testify."
        ),
        tags=[str(label) for label in mode_profile["tag"]],
    )
    return PersonaEnvelope(persona=persona)
def _build_diary(obj: Mapping[str, object], mode: str, persona: Persona, index: int) -> DiaryEntry:
    """Compose the diary entry for ``obj`` in ``mode``.

    The English and Chinese texts share the same interpolated fragments: the
    object's context, its first two features, its memory, the mode's voice,
    the persona's mood, and the scene detail (a fixed fallback phrase is used
    when the object has no ``scene_detail`` key). The day number in the title
    is ``300 + index + len(name)``.
    """
    voice = MODE_PROFILES[mode]["voice"]
    name = str(obj["name"])
    context = obj["context"]
    memory = obj["memory"]
    # Only the first two features are quoted in the diary text.
    features = ", ".join(str(item) for item in obj["features"][:2])
    scene = str(obj.get("scene_detail", "collecting proof that ordinary things notice everything"))
    day_number = 300 + index + len(name)

    english_sentences = [
        f"Today I waited in the {context} wearing my {features} like official records. ",
        "The humans moved around me with the confidence of temporary weather. ",
        f"I remembered how I {memory}, and I answered in my own way: {voice}. ",
        f"My mood is {persona.mood}, but I am still here, {scene}.",
    ]
    chinese_sentences = [
        f"今天我待在 {context},带着 {features},像一份安静的档案。",
        "人类从我身边经过,好像自己不是短暂天气。",
        f"我记得自己曾经 {memory},于是用自己的方式回应:{voice}。",
        f"我的情绪是 {persona.mood},但我仍在这里,{scene}。",
    ]
    return DiaryEntry(
        title=f"Secret Diary - Day {day_number}",
        english="".join(english_sentences),
        chinese="".join(chinese_sentences),
    )
def _character_name(object_name: str, mode: str) -> str:
    """Derive a character name from the object's name and the mode.

    The first two whitespace-separated words of ``object_name`` are
    capitalized and fused, then followed by a mode-specific suffix. An
    unknown ``mode`` raises ``KeyError``.
    """
    suffix_by_mode = {
        "Cynical": "Ash",
        "Dramatic": "of the Minor Stage",
        "Lonely": "Afterlight",
        "Philosopher": "the Questioning",
        "Romantic": "de Moon",
    }
    leading_words = object_name.split()[:2]
    fused = "".join(word.capitalize() for word in leading_words)
    full_name = f"{fused} {suffix_by_mode[mode]}"
    # strip() removes the leading space left behind by an empty object name.
    return full_name.strip()
def _object_description(obj: Mapping[str, object]) -> str:
    """Render a one-line description of a catalogue entry.

    The result reads ``"<name> in a <context> with <features>"`` (the article
    is always "a"), with the scene detail appended after a comma when the
    entry has a ``scene_detail`` key.
    """
    feature_text = ", ".join(map(str, obj["features"]))
    segments = [f"{obj['name']} in a {obj['context']} with {feature_text}"]
    if "scene_detail" in obj:
        segments.append(f"{obj['scene_detail']}")
    return ", ".join(segments)
def _user_prompt(object_understanding: Mapping[str, object], mode: str) -> str:
    """Format the user message: mode, object JSON, and the output instruction.

    The object understanding is serialized with sorted keys and non-ASCII
    characters preserved, so the prompt is stable for a given input.
    """
    serialized = json.dumps(object_understanding, ensure_ascii=False, sort_keys=True)
    prompt_lines = (
        f"Personality mode: {mode}",
        f"Object understanding JSON: {serialized}",
        "Return JSON with keys persona and diary only.",
    )
    return "\n".join(prompt_lines)
def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options for the dataset preparation script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which keeps existing callers unchanged.

    Returns:
        Namespace with ``version`` ("v1" or "v2"), ``count`` (int or None)
        and ``output`` (Path or None).

    Raises:
        SystemExit: On invalid arguments, including a ``--count`` below 1.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--version", choices=("v1", "v2"), default="v1")
    parser.add_argument("--count", type=int, default=None)
    parser.add_argument("--output", type=Path, default=None)
    args = parser.parse_args(argv)
    # Reject an unusable count here so the user gets a usage error instead of
    # a ValueError traceback from the record builder.
    if args.count is not None and args.count < 1:
        parser.error("--count must be at least 1")
    return args
def main() -> None:
    """Command-line entry point: generate the dataset and report what was written."""
    args = _parse_args()
    output_path = prepare_curated_dataset(args.output, args.count, version=args.version)
    # Mirror the builder's default selection; test for None explicitly rather
    # than relying on the truthiness of the count.
    if args.count is not None:
        record_count = args.count
    else:
        record_count = DEFAULT_V2_COUNT if args.version == "v2" else DEFAULT_COUNT
    print(f"wrote {record_count} synthetic curated SFT records to {output_path}")


if __name__ == "__main__":
    main()