Spaces:
Running on Zero
Running on Zero
| from __future__ import annotations | |
| import json | |
| from typing import Any | |
| from hackathon_advisor._text import clean as _clean, list_of_dicts as _list_of_dicts, utc_now | |
# Version number stamped on the manifest and on every example record.
LORA_DATASET_SCHEMA_VERSION = 1

# Base model identifier and adapter task label copied into every record.
BASE_MODEL = "openbmb/MiniCPM5-1B"
ADAPTER_TASK = "hackathon_advisor_tool_call_and_voice"

# System prompt for "tool_call" examples: the assistant turn is the XML call.
TOOL_CALL_SYSTEM_PROMPT = (
    "You are The Unwritten Almanac's originality and build-plan advisor. "
    "Choose exactly one validated tool call for the user's project-advice request. "
    "Return only the XML function call."
)

# System prompt for "advisor_response" examples: the assistant turn is the
# recorded advice text.
RESPONSE_SYSTEM_PROMPT = (
    "You are The Unwritten Almanac's originality and build-plan advisor. "
    "Write concise, evidence-grounded advice from the tool observations, "
    "cited pages, score, and selected goals."
)
def build_lora_dataset_jsonl(session: dict[str, Any], metadata: dict[str, Any]) -> str:
    """Serialize a session trace as a JSON Lines LoRA fine-tuning dataset.

    The first line is a ``lora_sft_manifest`` record describing the export;
    every following line is one example returned by ``_examples``.

    Args:
        session: Session state. Its ``trace``, ``ideas`` and ``goals`` entries
            are read; missing entries are tolerated.
        metadata: Index metadata, summarized under the manifest's ``index`` key.

    Returns:
        The dataset text: one JSON object per line with sorted keys and
        non-ASCII characters left unescaped, each line ending in a newline.
    """
    turns = _list_of_dicts(session.get("trace"))
    idea_records = _list_of_dicts(session.get("ideas"))
    goal_labels = [str(item) for item in session.get("goals") or []]
    sft_examples = _examples(turns, goal_labels)

    # Each kept turn contributes more than one example, so count the distinct
    # turn indices rather than the examples themselves.
    kept_turn_indices = {entry["turn_index"] for entry in sft_examples}

    manifest = {
        "type": "lora_sft_manifest",
        "schema_version": LORA_DATASET_SCHEMA_VERSION,
        "generated_at": utc_now(),
        "app": "hackathon-advisor",
        "base_model": BASE_MODEL,
        "adapter_task": ADAPTER_TASK,
        "format": "chat-jsonl",
        "record_kinds": ["tool_call", "advisor_response"],
        "source": "exact_session_trace",
        "idea_count": len(idea_records),
        "turn_count": len(turns),
        "included_turn_count": len(kept_turn_indices),
        "example_count": len(sft_examples),
        "index": _index_metadata(metadata),
    }

    # The manifest is always present, so terminating every line with "\n" is
    # the same as joining on "\n" and appending one final newline.
    return "".join(
        json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n"
        for row in [manifest, *sft_examples]
    )
def _examples(trace: list[dict[str, Any]], goals: list[str]) -> list[dict[str, Any]]:
    """Turn usable trace events into chat-style training examples.

    An event is used only when its tool resolution is valid, its cleaned
    ``input`` and ``response`` are both non-empty, and the resolved tool call
    has a name. Each used event yields two records, in this order: a
    ``tool_call`` example and an ``advisor_response`` example.

    Args:
        trace: Session trace events, in order.
        goals: Goal labels attached unchanged to every example.

    Returns:
        The example records. ``turn_index`` is the 1-based position of the
        event in ``trace`` (skipped events still consume an index);
        ``example_index`` is the 1-based position of the record in the result.
    """
    collected: list[dict[str, Any]] = []
    for position, turn in enumerate(trace, start=1):
        if not _is_successful_turn(turn):
            continue

        prompt = _clean(turn.get("input"))
        reply = _clean(turn.get("response"))
        if not (prompt and reply):
            continue

        call = _tool_call(turn)
        if not call["name"]:
            continue

        # Fields common to both records produced for this turn.
        common = {
            "type": "lora_sft_example",
            "schema_version": LORA_DATASET_SCHEMA_VERSION,
            "base_model": BASE_MODEL,
            "adapter_task": ADAPTER_TASK,
            "turn_index": position,
            "goals": goals,
            "score": _score(turn),
            "tool_call": call,
            "tool_observations": _tool_observations(turn),
        }

        # (example kind, system prompt, user content, assistant content)
        conversations = (
            ("tool_call", TOOL_CALL_SYSTEM_PROMPT, prompt, _tool_call_xml(call)),
            ("advisor_response", RESPONSE_SYSTEM_PROMPT, _response_context(prompt, turn, call), reply),
        )
        for kind, system_text, user_text, assistant_text in conversations:
            collected.append(
                {
                    **common,
                    # Computed before the append, so numbering starts at 1.
                    "example_index": len(collected) + 1,
                    "example_kind": kind,
                    "messages": [
                        {"role": "system", "content": system_text},
                        {"role": "user", "content": user_text},
                        {"role": "assistant", "content": assistant_text},
                    ],
                }
            )
    return collected
| def _is_successful_turn(event: dict[str, Any]) -> bool: | |
| resolution = event.get("tool_resolution") if isinstance(event.get("tool_resolution"), dict) else {} | |
| return str(resolution.get("status") or "") == "valid" | |
def _tool_call(event: dict[str, Any]) -> dict[str, Any]:
    """Extract the resolved tool call from an event as ``{"name", "arguments"}``.

    Reads ``event["tool_resolution"]["call"]``. Any level that is missing or
    not a dict is treated as empty, so the result always has both keys:
    ``name`` is the call name passed through ``_clean`` and ``arguments`` is
    the arguments dict (``{}`` when absent or not a dict).
    """
    outcome = event.get("tool_resolution")
    call = outcome.get("call") if isinstance(outcome, dict) else None
    if not isinstance(call, dict):
        call = {}

    call_args = call.get("arguments")
    if not isinstance(call_args, dict):
        call_args = {}

    return {"name": _clean(call.get("name")), "arguments": call_args}
| def _tool_call_xml(tool_call: dict[str, Any]) -> str: | |
| arguments = json.dumps(tool_call["arguments"], ensure_ascii=False, sort_keys=True, separators=(",", ":")) | |
| return f'<function name="{tool_call["name"]}">{arguments}</function>' | |
def _response_context(input_text: str, event: dict[str, Any], tool_call: dict[str, Any]) -> str:
    """Build the user message for an ``advisor_response`` example.

    The message is the user's input, a blank line, the rendered tool call,
    one bullet per tool observation (or ``- none``), and finally the verdict,
    overall score and plan-step count taken from ``_score``.

    Args:
        input_text: The cleaned user input for the turn.
        event: The trace event supplying observations and score fields.
        tool_call: The resolved call, as returned by ``_tool_call``.

    Returns:
        The newline-joined context text.
    """
    bullets = [f"- {item['name']}: {item['summary']}" for item in _tool_observations(event)]
    call_line = f"Tool call: {_tool_call_xml(tool_call)}"

    summary = _score(event)
    # An empty verdict or a missing overall value is shown as "n/a".
    verdict_text = summary["verdict"] if summary["verdict"] else "n/a"
    overall_text = "n/a" if summary["overall"] is None else summary["overall"]

    return "\n".join(
        [
            input_text,
            "",
            call_line,
            "Tool observations:",
            *(bullets or ["- none"]),
            f"Verdict: {verdict_text}",
            f"Overall: {overall_text}",
            f"Plan steps: {summary['plan_steps']}",
        ]
    )
def _tool_observations(event: dict[str, Any]) -> list[dict[str, str]]:
    """Collect ``{"name", "summary"}`` pairs from the event's ``tools`` list.

    Both fields are passed through ``_clean``. A tool is dropped only when
    both its cleaned name and its cleaned summary are empty.
    """
    cleaned_pairs = (
        (_clean(entry.get("name")), _clean(entry.get("summary")))
        for entry in _list_of_dicts(event.get("tools"))
    )
    return [
        {"name": tool_name, "summary": tool_summary}
        for tool_name, tool_summary in cleaned_pairs
        if tool_name or tool_summary
    ]
def _score(event: dict[str, Any]) -> dict[str, Any]:
    """Summarize an event's scoring fields.

    Args:
        event: A trace event; ``verdict``, ``overall`` and ``plan_steps`` are read.

    Returns:
        A dict with ``verdict`` (passed through ``_clean``), ``overall`` (the
        raw value, possibly ``None``) and ``plan_steps`` as an ``int``.
        ``plan_steps`` is 0 when the value is missing, falsy, or cannot be
        converted to an integer.
    """
    try:
        plan_steps = int(event.get("plan_steps") or 0)
    except (TypeError, ValueError, OverflowError):
        # Every other field read from the trace tolerates malformed data; a
        # bad step count (e.g. "n/a", a list, or inf) must not abort the whole
        # export, so treat it like a missing value.
        plan_steps = 0
    return {
        "verdict": _clean(event.get("verdict")),
        "overall": event.get("overall"),
        "plan_steps": plan_steps,
    }
def _index_metadata(metadata: dict[str, Any]) -> dict[str, str]:
    """Pick the index-provenance fields recorded in the dataset manifest.

    Each selected metadata value is passed through ``_clean``. Only
    ``index_algorithm`` is renamed (to ``algorithm``); the other keys keep
    their names.
    """
    # (output key, metadata key), in output order.
    field_map = (
        ("algorithm", "index_algorithm"),
        ("snapshot_generated_at", "snapshot_generated_at"),
        ("index_generated_at", "index_generated_at"),
        ("snapshot_digest", "snapshot_digest"),
    )
    return {target: _clean(metadata.get(source)) for target, source in field_map}