JacobLinCool's picture
deploy: sync GitHub main de5dbf9
13fe947 verified
from __future__ import annotations
import json
from typing import Any
from hackathon_advisor._text import clean as _clean, list_of_dicts as _list_of_dicts, utc_now
# Version stamp written into the manifest record and into every example record.
LORA_DATASET_SCHEMA_VERSION = 1
# Base model identifier recorded in the manifest and in every example record.
BASE_MODEL = "openbmb/MiniCPM5-1B"
# Adapter task label recorded next to the base model in each record.
ADAPTER_TASK = "hackathon_advisor_tool_call_and_voice"
# System message used for "tool_call" examples, whose assistant turn is the
# XML function call alone.
TOOL_CALL_SYSTEM_PROMPT = (
    "You are The Unwritten Almanac's originality and build-plan advisor. Choose exactly one validated tool call for "
    "the user's project-advice request. Return only the XML function call."
)
# System message used for "advisor_response" examples, whose assistant turn is
# the recorded response text.
RESPONSE_SYSTEM_PROMPT = (
    "You are The Unwritten Almanac's originality and build-plan advisor. Write concise, evidence-grounded advice from "
    "the tool observations, cited pages, score, and selected goals."
)
def build_lora_dataset_jsonl(session: dict[str, Any], metadata: dict[str, Any]) -> str:
    """Serialize a session as JSON Lines: one manifest record, then the SFT examples.

    Args:
        session: Session mapping; its "trace", "ideas" and "goals" entries are read.
        metadata: Index metadata mapping, summarized under the manifest's "index" key.

    Returns:
        Newline-terminated text with one JSON object per line. Keys are sorted
        and non-ASCII characters are written unescaped. The first line is the
        manifest; the remaining lines are the examples in generation order.
    """
    turns = _list_of_dicts(session.get("trace"))
    idea_rows = _list_of_dicts(session.get("ideas"))
    goal_labels = [str(item) for item in session.get("goals") or []]
    sft_examples = _examples(turns, goal_labels)

    manifest = {
        "type": "lora_sft_manifest",
        "schema_version": LORA_DATASET_SCHEMA_VERSION,
        "generated_at": utc_now(),
        "app": "hackathon-advisor",
        "base_model": BASE_MODEL,
        "adapter_task": ADAPTER_TASK,
        "format": "chat-jsonl",
        "record_kinds": ["tool_call", "advisor_response"],
        "source": "exact_session_trace",
        "idea_count": len(idea_rows),
        "turn_count": len(turns),
        # Several examples can share one turn_index, so count distinct turns.
        "included_turn_count": len({row["turn_index"] for row in sft_examples}),
        "example_count": len(sft_examples),
        "index": _index_metadata(metadata),
    }

    json_lines = [
        json.dumps(row, ensure_ascii=False, sort_keys=True)
        for row in [manifest, *sft_examples]
    ]
    return "\n".join(json_lines) + "\n"
def _examples(trace: list[dict[str, Any]], goals: list[str]) -> list[dict[str, Any]]:
    """Build the SFT example records for every usable turn in *trace*.

    A turn is used only when its tool resolution is valid, its cleaned input
    and response are both non-empty, and its tool call has a name. Each used
    turn contributes two records, in this order: a "tool_call" example and an
    "advisor_response" example.

    Args:
        trace: Turn events; ``turn_index`` is the 1-based position in this list.
        goals: Goal labels attached (as the same list object) to every example.

    Returns:
        Example records whose ``example_index`` runs consecutively from 1.
    """
    collected: list[dict[str, Any]] = []
    for position, turn in enumerate(trace, start=1):
        if not _is_successful_turn(turn):
            continue

        prompt = _clean(turn.get("input"))
        reply = _clean(turn.get("response"))
        if not (prompt and reply):
            continue

        call = _tool_call(turn)
        if not call["name"]:
            continue

        # Fields common to both records produced for this turn.
        common = {
            "type": "lora_sft_example",
            "schema_version": LORA_DATASET_SCHEMA_VERSION,
            "base_model": BASE_MODEL,
            "adapter_task": ADAPTER_TASK,
            "turn_index": position,
            "goals": goals,
            "score": _score(turn),
            "tool_call": call,
            "tool_observations": _tool_observations(turn),
        }

        # (example_kind, system prompt, user content, assistant content)
        conversations = (
            ("tool_call", TOOL_CALL_SYSTEM_PROMPT, prompt, _tool_call_xml(call)),
            (
                "advisor_response",
                RESPONSE_SYSTEM_PROMPT,
                _response_context(prompt, turn, call),
                reply,
            ),
        )
        for kind, system_text, user_text, assistant_text in conversations:
            collected.append(
                {
                    **common,
                    "example_index": len(collected) + 1,
                    "example_kind": kind,
                    "messages": [
                        {"role": "system", "content": system_text},
                        {"role": "user", "content": user_text},
                        {"role": "assistant", "content": assistant_text},
                    ],
                }
            )
    return collected
def _is_successful_turn(event: dict[str, Any]) -> bool:
    """Return True when the event's "tool_resolution" dict has status "valid".

    A missing or non-dict "tool_resolution", or a missing/falsy status, yields
    False.
    """
    resolution = event.get("tool_resolution")
    if not isinstance(resolution, dict):
        return False
    status = resolution.get("status") or ""
    return str(status) == "valid"
def _tool_call(event: dict[str, Any]) -> dict[str, Any]:
    """Extract the resolved tool call from ``event["tool_resolution"]["call"]``.

    Any level that is missing or not a dict is treated as an empty dict.

    Returns:
        ``{"name": <cleaned call name>, "arguments": <arguments dict>}``, where
        arguments is ``{}`` when absent or not a dict.
    """
    resolution = event.get("tool_resolution")
    call = resolution.get("call") if isinstance(resolution, dict) else None
    if not isinstance(call, dict):
        call = {}

    arguments = call.get("arguments")
    if not isinstance(arguments, dict):
        arguments = {}

    return {"name": _clean(call.get("name")), "arguments": arguments}
def _tool_call_xml(tool_call: dict[str, Any]) -> str:
    """Render a tool call as ``<function name="NAME">ARGS</function>``.

    ARGS is the arguments mapping as compact JSON (no spaces after separators)
    with sorted keys and non-ASCII characters left unescaped.
    """
    payload = json.dumps(
        tool_call["arguments"],
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=False,
    )
    return '<function name="{}">{}</function>'.format(tool_call["name"], payload)
def _response_context(input_text: str, event: dict[str, Any], tool_call: dict[str, Any]) -> str:
    """Compose the user message for an "advisor_response" example.

    The text is, line by line: the user's input, a blank line, the rendered
    tool call, a "Tool observations:" header followed by one bullet per
    observation (or "- none"), then the verdict, overall score and plan-step
    count. A missing verdict or overall value is shown as "n/a".
    """
    observations = _tool_observations(event)
    if observations:
        bullet_lines = [f"- {item['name']}: {item['summary']}" for item in observations]
    else:
        bullet_lines = ["- none"]

    call_line = f"Tool call: {_tool_call_xml(tool_call)}"

    score = _score(event)
    verdict = score["verdict"] or "n/a"
    # Only None means "missing"; an overall of 0 must still be printed.
    overall = "n/a" if score["overall"] is None else score["overall"]

    return "\n".join(
        [
            input_text,
            "",
            call_line,
            "Tool observations:",
            *bullet_lines,
            f"Verdict: {verdict}",
            f"Overall: {overall}",
            f"Plan steps: {score['plan_steps']}",
        ]
    )
def _tool_observations(event: dict[str, Any]) -> list[dict[str, str]]:
    """Collect ``{"name", "summary"}`` pairs from the event's "tools" entries.

    Both fields are passed through ``_clean``; an entry is kept when at least
    one of the two cleaned values is truthy.
    """
    cleaned_entries = (
        {"name": _clean(tool.get("name")), "summary": _clean(tool.get("summary"))}
        for tool in _list_of_dicts(event.get("tools"))
    )
    return [entry for entry in cleaned_entries if entry["name"] or entry["summary"]]
def _plan_step_count(value: Any) -> int:
    """Coerce a raw plan-step value to an int, using 0 when missing or malformed."""
    try:
        return int(value or 0)
    except (TypeError, ValueError, OverflowError):
        # A non-numeric string, a container, NaN or infinity must not abort
        # the whole dataset export; treat it like a missing count.
        return 0


def _score(event: dict[str, Any]) -> dict[str, Any]:
    """Summarize the scoring fields of a turn event.

    Args:
        event: Turn event; its "verdict", "overall" and "plan_steps" entries are read.

    Returns:
        A dict with:
        - "verdict": the event's verdict passed through ``_clean``;
        - "overall": the event's overall value, passed through unchanged
          (``None`` when absent);
        - "plan_steps": the plan-step count as an int, 0 when the value is
          missing, falsy, or cannot be converted.
    """
    return {
        "verdict": _clean(event.get("verdict")),
        "overall": event.get("overall"),
        "plan_steps": _plan_step_count(event.get("plan_steps")),
    }
def _index_metadata(metadata: dict[str, Any]) -> dict[str, str]:
    """Pick the index-related fields out of *metadata* for the manifest.

    Each value is read with ``metadata.get`` and passed through ``_clean``.
    The output key "algorithm" is read from the input key "index_algorithm";
    the other three keys keep their names.
    """
    # output key -> key looked up in ``metadata``
    source_keys = {
        "algorithm": "index_algorithm",
        "snapshot_generated_at": "snapshot_generated_at",
        "index_generated_at": "index_generated_at",
        "snapshot_digest": "snapshot_digest",
    }
    return {
        out_key: _clean(metadata.get(in_key))
        for out_key, in_key in source_keys.items()
    }