"""Prepare synthetic curated SFT data for Objectverse Diary LoRA tests."""
from __future__ import annotations
import argparse
import json
import sys
from collections.abc import Mapping, Sequence
from pathlib import Path
# Directory one level above this file's own folder. It is put at the front of
# sys.path (once) so that the `src.` package import that follows resolves when
# this file is executed directly as a script.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from src.models.schema import DiaryEntry, ObjectInfo, ObjectUnderstanding, Persona, PersonaEnvelope
# Default output files for the v1 and v2 datasets (relative paths).
DEFAULT_OUTPUT_PATH = Path("data/train/objectverse_sft_curated.jsonl")
DEFAULT_V2_OUTPUT_PATH = Path("data/train/objectverse_sft_curated_v2.jsonl")
# Default record counts. Each equals (objects in the version's catalogue) x
# (number of MODES): 10 x 5 for v1 and 40 x 5 for v2, i.e. every object is
# paired with every mode exactly once.
DEFAULT_COUNT = 50
DEFAULT_V2_COUNT = 200
# Value stored in each record's "source" field, per version.
SOURCE_V1 = "objectverse-diary-synthetic-curated-v1"
SOURCE_V2 = "objectverse-diary-synthetic-curated-v2"
# Content of the system message that opens every record's "messages" list.
SYSTEM_PROMPT = (
    "You are Objectverse Diary, an English-first small-model assistant. "
    "Given structured object understanding and a requested personality mode, "
    "return strict JSON with keys persona and diary. Keep the voice strange, "
    "specific to the object, and suitable for a shareable object archive."
)
# Personality modes, in the order build_curated_records rotates through them.
# Every entry must also be a key of MODE_PROFILES.
MODES = ("Cynical", "Dramatic", "Lonely", "Philosopher", "Romantic")
# v1 object catalogue. Each entry provides:
#   name     - object name; also used to derive the persona's character name.
#   features - visible features; all of them go into the object understanding,
#              only the first two are quoted in the diary text.
#   context  - where the object is found.
#   memory   - a verb phrase interpolated after "I remembered how I ..." in the
#              diary and stored as the persona's core memory.
OBJECTS = [
    {
        "name": "coffee mug",
        "features": ["white ceramic", "coffee ring", "tiny handle shadow"],
        "context": "developer desk",
        "memory": "listened to morning promises dissolve into cold coffee",
    },
    {
        "name": "mechanical keyboard",
        "features": ["black keycaps", "dust in the rows", "one glossy spacebar"],
        "context": "office corner",
        "memory": "translated panic into clicking long after midnight",
    },
    {
        "name": "running shoe",
        "features": ["creased mesh", "mud on the sole", "loose lace"],
        "context": "bedroom doorway",
        "memory": "carried brave intentions to the end of the block and back",
    },
    {
        "name": "desk lamp",
        "features": ["brushed metal neck", "warm bulb", "tilted shade"],
        "context": "late-night desk",
        "memory": "held a circle of light over notes nobody finished",
    },
    {
        "name": "water bottle",
        "features": ["clear plastic wall", "scratched cap", "half-full body"],
        "context": "kitchen counter",
        "memory": "survived every resolution to drink more water",
    },
    {
        "name": "notebook",
        "features": ["bent corner", "blue ink ghosts", "elastic strap"],
        "context": "bag pocket",
        "memory": "guarded three plans, two lists, and one sentence crossed out hard",
    },
    {
        "name": "umbrella",
        "features": ["folded black canopy", "wet seam", "curved handle"],
        "context": "entryway hook",
        "memory": "became useful only when the sky was already theatrical",
    },
    {
        "name": "house key",
        "features": ["brass teeth", "scratched bow", "small metal ring"],
        "context": "coat pocket",
        "memory": "opened the same door for every version of its human",
    },
    {
        "name": "charging cable",
        "features": ["frayed sleeve", "white plastic tip", "gentle knot"],
        "context": "bedside floor",
        "memory": "fed glowing rectangles while pretending not to resent them",
    },
    {
        "name": "teaspoon",
        "features": ["silver bowl", "thin handle", "tea stain near the neck"],
        "context": "sink edge",
        "memory": "stirred sweetness into cups and suspicion into silence",
    },
]
# v2 object catalogue. It starts with a copy of every v1 object, each given a
# generated "scene_detail" built from its context, followed by 30 additional
# objects whose "scene_detail" is written by hand. Every v2 entry therefore
# carries the same keys as a v1 entry plus "scene_detail", which is used as the
# closing clause of the diary text and copied onto v2 records.
OBJECTS_V2 = [
    *(
        dict(
            obj,
            scene_detail=f"resting in the {obj['context']} with a history no one inventoried",
        )
        for obj in OBJECTS
    ),
    {
        "name": "wireless earbud case",
        "features": ["rounded white shell", "tiny hinge", "charging light"],
        "context": "commuter bag",
        "memory": "held two small arguments against silence through a crowded train",
        "scene_detail": "buried beside lint, receipts, and one forgotten mint",
    },
    {
        "name": "transit card",
        "features": ["scuffed plastic", "faded corner", "thin blue stripe"],
        "context": "wallet slot",
        "memory": "opened gates for mornings that were already late",
        "scene_detail": "pressed flat under coins and expired coupons",
    },
    {
        "name": "canvas tote bag",
        "features": ["creased cotton", "ink logo", "soft handles"],
        "context": "entryway floor",
        "memory": "carried groceries, books, and ambitions heavier than both",
        "scene_detail": "slumped open with a receipt still clinging inside",
    },
    {
        "name": "cracked phone case",
        "features": ["clear plastic", "corner crack", "fingerprint haze"],
        "context": "bedside table",
        "memory": "took the impact so the glowing rectangle could remain innocent",
        "scene_detail": "lying face down after another nervous scroll",
    },
    {
        "name": "lip balm tube",
        "features": ["twisted cap", "pocket scratches", "worn label"],
        "context": "coat pocket",
        "memory": "answered every small weather emergency without being thanked",
        "scene_detail": "rolling between keys and a folded train ticket",
    },
    {
        "name": "medicine organizer",
        "features": ["clear lids", "weekday letters", "plastic hinges"],
        "context": "bathroom shelf",
        "memory": "sorted tiny promises into seven obedient compartments",
        "scene_detail": "waiting under fluorescent light with Monday already open",
    },
    {
        "name": "travel toothbrush",
        "features": ["folding handle", "blue bristles", "vented cap"],
        "context": "hotel sink",
        "memory": "kept a mouth honest in rooms that forgot every guest",
        "scene_detail": "balanced near a wrapped soap and a paper cup",
    },
    {
        "name": "passport cover",
        "features": ["navy leather", "creased spine", "stitched edge"],
        "context": "carry-on pocket",
        "memory": "guarded borders, delays, and a face trying to look awake",
        "scene_detail": "wedged beside boarding papers and a silent pen",
    },
    {
        "name": "boarding pass stub",
        "features": ["thermal paper", "torn edge", "gate code"],
        "context": "jacket pocket",
        "memory": "proved a journey happened after the airport swallowed the day",
        "scene_detail": "softened by rain and folded into four tired rectangles",
    },
    {
        "name": "hotel keycard",
        "features": ["matte plastic", "blank stripe", "room-number sleeve"],
        "context": "nightstand",
        "memory": "opened a temporary room for a temporary version of its human",
        "scene_detail": "resting beside a glass of water no one trusted",
    },
    {
        "name": "remote control",
        "features": ["rubber buttons", "battery door scar", "dusty edges"],
        "context": "sofa cushion",
        "memory": "changed channels while nobody changed their mind",
        "scene_detail": "half-sunk between cushions with one crumb for company",
    },
    {
        "name": "reading glasses",
        "features": ["thin frames", "smudged lenses", "bent temple"],
        "context": "book stack",
        "memory": "made small letters confess their meaning at midnight",
        "scene_detail": "left open across a page that was never finished",
    },
    {
        "name": "glasses case",
        "features": ["hard shell", "soft lining", "snap hinge"],
        "context": "desk drawer",
        "memory": "protected fragile clarity from the tyranny of keys",
        "scene_detail": "waiting in darkness with a paperclip pressed to its side",
    },
    {
        "name": "wristwatch",
        "features": ["scratched face", "brown strap", "small crown"],
        "context": "dresser tray",
        "memory": "measured days while humans pretended not to be measured",
        "scene_detail": "stopped beside coins and a single loose button",
    },
    {
        "name": "hair clip",
        "features": ["amber plastic", "tiny teeth", "curved spring"],
        "context": "bathroom counter",
        "memory": "held chaos together for meetings, errands, and almost-crying",
        "scene_detail": "resting near a fogged mirror and stray strands",
    },
    {
        "name": "laundry token",
        "features": ["round brass", "machine number", "dulled rim"],
        "context": "laundry room",
        "memory": "bought one more spin for clothes that knew too much",
        "scene_detail": "cool in a palm smelling faintly of detergent",
    },
    {
        "name": "refrigerator magnet",
        "features": ["painted souvenir", "flat magnet back", "chipped corner"],
        "context": "kitchen door",
        "memory": "held reminders in place while everyone forgot the reason",
        "scene_detail": "pinning a grocery list under a blue-white hum",
    },
    {
        "name": "grocery receipt",
        "features": ["curled paper", "faded ink", "long total"],
        "context": "kitchen counter",
        "memory": "itemized hunger, soap, and one unnecessary chocolate bar",
        "scene_detail": "curling beside fruit that ripened too quickly",
    },
    {
        "name": "spice jar",
        "features": ["glass body", "red powder", "metal lid"],
        "context": "kitchen shelf",
        "memory": "made bland evenings briefly remember a warmer country",
        "scene_detail": "standing in a row of louder labels",
    },
    {
        "name": "cutting board",
        "features": ["wood grain", "knife marks", "rounded corner"],
        "context": "kitchen island",
        "memory": "received every chopped plan without flinching",
        "scene_detail": "drying upright after a meal nobody photographed",
    },
    {
        "name": "ceramic bowl",
        "features": ["blue rim", "tiny chip", "glazed curve"],
        "context": "dish rack",
        "memory": "held soup, cereal, and one quiet apology",
        "scene_detail": "tilted beside plates still warm from rinse water",
    },
    {
        "name": "reusable chopsticks",
        "features": ["dark bamboo", "tapered tips", "cloth sleeve"],
        "context": "lunch bag",
        "memory": "lifted noodles through ordinary hunger and office gossip",
        "scene_detail": "tucked into a sleeve with a soy sauce stain",
    },
    {
        "name": "tea tin",
        "features": ["green metal", "tight lid", "leaf dust"],
        "context": "pantry shelf",
        "memory": "kept rain-colored leaves ready for small recoveries",
        "scene_detail": "quiet behind cereal boxes and a jar of almonds",
    },
    {
        "name": "sticky note stack",
        "features": ["yellow pages", "curled edge", "faint adhesive"],
        "context": "monitor base",
        "memory": "accepted urgent thoughts that became decorative fossils",
        "scene_detail": "leaning under a monitor's cold rectangular sun",
    },
    {
        "name": "binder clip",
        "features": ["black steel", "silver arms", "pinched mouth"],
        "context": "paper tray",
        "memory": "held loose pages together when ideas tried to scatter",
        "scene_detail": "biting a stack marked later in blue ink",
    },
    {
        "name": "fountain pen",
        "features": ["black barrel", "gold nib", "ink stain"],
        "context": "notebook margin",
        "memory": "turned hesitation into lines that looked deliberate",
        "scene_detail": "uncapped beside a sentence crossed out twice",
    },
    {
        "name": "old ticket stub",
        "features": ["creased paper", "seat number", "torn perforation"],
        "context": "memory box",
        "memory": "survived the event after the applause became dust",
        "scene_detail": "pressed under postcards and a dried ribbon",
    },
    {
        "name": "candle jar",
        "features": ["smoked glass", "wax tunnel", "blackened wick"],
        "context": "window ledge",
        "memory": "made one room pretend to be softer than it was",
        "scene_detail": "cooled beside a window with rain on the other side",
    },
    {
        "name": "alarm clock",
        "features": ["round face", "plastic feet", "stubborn button"],
        "context": "bedside shelf",
        "memory": "tore people from dreams and was hated for being correct",
        "scene_detail": "facing a bed that negotiated with every morning",
    },
    {
        "name": "tape measure",
        "features": ["yellow tape", "lock switch", "metal hook"],
        "context": "tool drawer",
        "memory": "proved shelves, windows, and ambitions were smaller than claimed",
        "scene_detail": "coiled beside screws and one pencil shaved short",
    },
]
# Per-mode persona ingredients, keyed by the names listed in MODES:
#   mood  - becomes the persona's mood and is quoted in the diary text.
#   fear  - becomes the persona's secret_fear.
#   tag   - list of strings that becomes the persona's tags (note the key is
#           singular even though it holds several tags).
#   voice - phrase interpolated into the diary text after "I answered in my
#           own way:".
MODE_PROFILES = {
    "Cynical": {
        "mood": "tired but sharply observant",
        "fear": "being replaced by something newer and less honest",
        "tag": ["dry witness", "domestic sarcasm", "small rebellion"],
        "voice": "withholding applause",
    },
    "Dramatic": {
        "mood": "grandly wounded",
        "fear": "being forgotten before the curtain falls",
        "tag": ["tragic prop", "household opera", "minor thunder"],
        "voice": "making every scratch sound like fate",
    },
    "Lonely": {
        "mood": "quietly abandoned",
        "fear": "becoming background forever",
        "tag": ["soft echo", "forgotten corner", "patient dust"],
        "voice": "speaking as if the room almost listened",
    },
    "Philosopher": {
        "mood": "curious and needlessly profound",
        "fear": "discovering usefulness is not the same as meaning",
        "tag": ["tiny ontology", "useful doubt", "object soul"],
        "voice": "turning chores into metaphysics",
    },
    "Romantic": {
        "mood": "hopelessly sentimental",
        "fear": "loving a human who mistakes devotion for convenience",
        "tag": ["tender witness", "secret devotion", "warm ache"],
        "voice": "saving every ordinary touch as evidence",
    },
}
def build_curated_records(
    count: int | None = None,
    *,
    version: str = "v1",
) -> list[dict[str, object]]:
    """Build synthetic chat-format SFT records for one dataset version.

    Objects are visited in catalogue order. The personality mode advances by
    one each time the whole catalogue has been traversed and wraps around once
    every mode has been used.

    Args:
        count: Number of records to build. ``None`` selects the default count
            for ``version``.
        version: Dataset version, ``"v1"`` or ``"v2"``.

    Returns:
        One dict per record. Each holds metadata fields plus a three-message
        system/user/assistant conversation whose assistant content is a JSON
        string with ``persona`` and ``diary`` keys. v2 records additionally
        carry ``scene_detail``.

    Raises:
        ValueError: If ``version`` is not supported or ``count`` is below 1.
    """
    version = _validate_version(version)
    is_v2 = version == "v2"
    if count is None:
        count = DEFAULT_V2_COUNT if is_v2 else DEFAULT_COUNT
    if count < 1:
        raise ValueError("count must be at least 1")

    catalogue = _objects_for_version(version)
    source_label = _source_for_version(version)
    catalogue_size = len(catalogue)

    built: list[dict[str, object]] = []
    for position in range(count):
        # full_passes counts completed sweeps over the catalogue and selects
        # the mode; slot selects the object within the current sweep.
        full_passes, slot = divmod(position, catalogue_size)
        item = catalogue[slot]
        mode = MODES[full_passes % len(MODES)]

        understanding_json = _build_object_understanding(item).model_dump(mode="json")
        persona = _build_persona(item, mode).persona
        diary = _build_diary(item, mode, persona, position)
        answer = json.dumps(
            {
                "persona": persona.model_dump(mode="json"),
                "diary": diary.model_dump(mode="json"),
            },
            ensure_ascii=False,
        )

        entry: dict[str, object] = {
            "id": _record_id(version, position),
            "source": source_label,
            "split": "train",
            "mode": mode,
            "object_description": _object_description(item),
            "object_understanding": understanding_json,
            "curation_notes": _curation_notes(version),
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": _user_prompt(understanding_json, mode)},
                {"role": "assistant", "content": answer},
            ],
        }
        if is_v2:
            entry["scene_detail"] = str(item["scene_detail"])
        built.append(entry)
    return built
def write_jsonl(records: Sequence[Mapping[str, object]], output_path: Path) -> Path:
    """Write ``records`` to ``output_path`` as UTF-8 JSON Lines.

    Each record becomes one line of JSON with sorted keys and non-ASCII
    characters left unescaped. Every line, including the last, ends with a
    newline. An empty ``records`` sequence produces an empty file rather than
    a file holding a single blank line, so line-by-line readers never see an
    unparsable empty row.

    Args:
        records: Mappings to serialise, one per output line.
        output_path: Destination file. Missing parent directories are created
            and an existing file is overwritten.

    Returns:
        ``output_path``, unchanged.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Serialise everything before touching the file so a record that cannot be
    # encoded does not leave a partially written dataset behind.
    payload = "".join(
        json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n" for record in records
    )
    output_path.write_text(payload, encoding="utf-8")
    return output_path
def prepare_curated_dataset(
    output_path: Path | None = None,
    count: int | None = None,
    *,
    version: str = "v1",
) -> Path:
    """Build the curated records for ``version`` and write them as JSONL.

    Args:
        output_path: Destination file. ``None`` selects the default path for
            ``version``.
        count: Number of records, forwarded to ``build_curated_records``.
        version: Dataset version, ``"v1"`` or ``"v2"``.

    Returns:
        The path the records were written to.

    Raises:
        ValueError: If ``version`` is not supported or ``count`` is below 1.
    """
    version = _validate_version(version)
    destination = output_path
    if destination is None:
        destination = DEFAULT_OUTPUT_PATH if version != "v2" else DEFAULT_V2_OUTPUT_PATH
    dataset = build_curated_records(count, version=version)
    return write_jsonl(dataset, destination)
def _validate_version(version: str) -> str:
    """Return ``version`` unchanged when it is ``"v1"`` or ``"v2"``.

    Raises:
        ValueError: For any other value.
    """
    if version in {"v1", "v2"}:
        return version
    raise ValueError("version must be 'v1' or 'v2'.")
def _objects_for_version(version: str) -> Sequence[Mapping[str, object]]:
    """Return the object catalogue for ``version``; anything but ``"v2"`` gets v1."""
    if version == "v2":
        return OBJECTS_V2
    return OBJECTS
def _source_for_version(version: str) -> str:
    """Return the ``source`` label for ``version``; anything but ``"v2"`` gets v1."""
    if version == "v2":
        return SOURCE_V2
    return SOURCE_V1
def _record_id(version: str, index: int) -> str:
    """Return the record identifier for the zero-based ``index``.

    The numeric suffix is one-based and zero-padded to at least four digits.
    v2 identifiers carry a ``v2`` marker in the prefix; every other version
    uses the unmarked prefix.
    """
    prefix = "curated-v2-synthetic" if version == "v2" else "curated-synthetic"
    ordinal = index + 1
    return f"{prefix}-{ordinal:04d}"
def _curation_notes(version: str) -> str:
    """Return the fixed curation note stored on every record of ``version``.

    The v2 note additionally mentions the broader object and scene coverage;
    every other version gets the original note.
    """
    if version != "v2":
        return (
            "Synthetic curated row: no private photo, no personal identifier, "
            "English-first output with Chinese helper text."
        )
    return (
        "Synthetic curated v2 row: no private photo, no personal identifier, "
        "broader object and scene coverage, English-first output with Chinese helper text."
    )
def _build_object_understanding(obj: Mapping[str, object]) -> ObjectUnderstanding:
    """Wrap a catalogue entry in the ``ObjectUnderstanding`` schema model.

    The entry's name, full feature list and context are copied as strings and
    the confidence is fixed at 0.9.
    """
    info = ObjectInfo(
        name=str(obj["name"]),
        visible_features=list(map(str, obj["features"])),
        likely_context=str(obj["context"]),
        confidence=0.9,
    )
    return ObjectUnderstanding(object=info)
def _build_persona(obj: Mapping[str, object], mode: str) -> PersonaEnvelope:
    """Create the persona for a catalogue entry speaking in ``mode``.

    Mood, secret fear and tags come from the mode's profile; the core memory
    and the object name come from the entry; the character name is derived
    from both. The complaint is a fixed sentence naming the object.

    Raises:
        KeyError: If ``mode`` has no entry in ``MODE_PROFILES``.
    """
    traits = MODE_PROFILES[mode]
    object_name = str(obj["name"])
    persona = Persona(
        object_name=object_name,
        character_name=_character_name(object_name, mode),
        mood=str(traits["mood"]),
        secret_fear=str(traits["fear"]),
        core_memory=str(obj["memory"]),
        complaint=f"I am not merely a {object_name}; I am an archive of what humans do when they think things cannot testify.",
        tags=list(map(str, traits["tag"])),
    )
    return PersonaEnvelope(persona=persona)
def _build_diary(obj: Mapping[str, object], mode: str, persona: Persona, index: int) -> DiaryEntry:
    """Compose the diary entry for a catalogue entry speaking in ``mode``.

    The English and Chinese texts follow the same four-sentence template and
    quote the entry's context, its first two features, its memory, the mode's
    voice and the persona's mood. The closing clause is the entry's
    ``scene_detail`` when present, otherwise a fixed fallback phrase. The day
    number in the title is 300 plus ``index`` plus the length of the object
    name.

    Raises:
        KeyError: If ``mode`` has no entry in ``MODE_PROFILES``.
    """
    profile = MODE_PROFILES[mode]
    name_length = len(str(obj["name"]))
    # Only the first two features are quoted in the diary text.
    features = ", ".join(map(str, obj["features"][:2]))
    scene = str(obj.get("scene_detail", "collecting proof that ordinary things notice everything"))

    english_sentences = (
        f"Today I waited in the {obj['context']} wearing my {features} like official records. ",
        f"The humans moved around me with the confidence of temporary weather. ",
        f"I remembered how I {obj['memory']}, and I answered in my own way: {profile['voice']}. ",
        f"My mood is {persona.mood}, but I am still here, {scene}.",
    )
    chinese_sentences = (
        f"今天我待在 {obj['context']},带着 {features},像一份安静的档案。",
        f"人类从我身边经过,好像自己不是短暂天气。",
        f"我记得自己曾经 {obj['memory']},于是用自己的方式回应:{profile['voice']}。",
        f"我的情绪是 {persona.mood},但我仍在这里,{scene}。",
    )

    day_number = 300 + index + name_length
    return DiaryEntry(
        title=f"Secret Diary - Day {day_number}",
        english="".join(english_sentences),
        chinese="".join(chinese_sentences),
    )
def _character_name(object_name: str, mode: str) -> str:
    """Derive a character name from the object name and the personality mode.

    The first two whitespace-separated words of ``object_name`` are each
    capitalised and fused together, then followed by the mode's suffix. An
    empty object name yields just the suffix.

    Raises:
        KeyError: If ``mode`` is not one of the five known modes.
    """
    leading_words = object_name.split()[:2]
    fused = "".join(map(str.capitalize, leading_words))
    suffix_by_mode = {
        "Cynical": "Ash",
        "Dramatic": "of the Minor Stage",
        "Lonely": "Afterlight",
        "Philosopher": "the Questioning",
        "Romantic": "de Moon",
    }
    # strip() drops the separator when the object name contributed nothing.
    return " ".join((fused, suffix_by_mode[mode])).strip()
def _object_description(obj: Mapping[str, object]) -> str:
    """Return a one-line description of a catalogue entry.

    The description names the object, its context and all of its features,
    and ends with the entry's ``scene_detail`` when that key is present.
    """
    feature_text = ", ".join(map(str, obj["features"]))
    summary = f"{obj['name']} in a {obj['context']} with {feature_text}"
    if "scene_detail" not in obj:
        return summary
    return f"{summary}, {obj['scene_detail']}"
def _user_prompt(object_understanding: Mapping[str, object], mode: str) -> str:
    """Render the user message for one training record.

    The message has three newline-separated lines: the requested personality
    mode, the object understanding serialised as JSON (sorted keys, non-ASCII
    characters left unescaped), and the output instruction.
    """
    payload = json.dumps(object_understanding, ensure_ascii=False, sort_keys=True)
    prompt_lines = (
        f"Personality mode: {mode}",
        f"Object understanding JSON: {payload}",
        "Return JSON with keys persona and diary only.",
    )
    return "\n".join(prompt_lines)
def _parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--version", choices=("v1", "v2"), default="v1")
parser.add_argument("--count", type=int, default=None)
parser.add_argument("--output", type=Path, default=None)
return parser.parse_args()
def main() -> None:
    """Command-line entry point: write the dataset and print a one-line summary."""
    args = _parse_args()
    output_path = prepare_curated_dataset(args.output, args.count, version=args.version)
    # None is the "not given" sentinel for --count, so test for it explicitly
    # instead of relying on the truthiness of an integer.
    if args.count is not None:
        record_count = args.count
    else:
        record_count = DEFAULT_V2_COUNT if args.version == "v2" else DEFAULT_COUNT
    print(f"wrote {record_count} synthetic curated SFT records to {output_path}")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|