# Extracted from a hosted file view ("Spaces: Running on Zero"; file size: 6,348 bytes).
# Page chrome, the commit-hash gutter, and the line-number gutter were removed.
from __future__ import annotations
import json
from typing import Any
from hackathon_advisor._text import clean as _clean, list_of_dicts as _list_of_dicts, utc_now
# Version number stamped on the manifest record and on every example record.
LORA_DATASET_SCHEMA_VERSION = 1

# Identifier recorded as "base_model" in the exported records.
BASE_MODEL = "openbmb/MiniCPM5-1B"

# Label recorded as "adapter_task" in the exported records.
ADAPTER_TASK = "hackathon_advisor_tool_call_and_voice"

# Opening sentence shared by both system prompts (note the trailing space).
_ADVISOR_IDENTITY = "You are The Unwritten Almanac's originality and build-plan advisor. "

# System prompt for the "tool_call" examples: the assistant turn is the XML call only.
TOOL_CALL_SYSTEM_PROMPT = (
    _ADVISOR_IDENTITY
    + "Choose exactly one validated tool call for the user's project-advice request. "
    + "Return only the XML function call."
)

# System prompt for the "advisor_response" examples: the assistant turn is the advice text.
RESPONSE_SYSTEM_PROMPT = (
    _ADVISOR_IDENTITY
    + "Write concise, evidence-grounded advice from the tool observations, "
    + "cited pages, score, and selected goals."
)
def build_lora_dataset_jsonl(session: dict[str, Any], metadata: dict[str, Any]) -> str:
    """Serialize an advisor session as JSON Lines text for LoRA fine-tuning.

    The first line is a ``lora_sft_manifest`` record holding counts, the base
    model, the adapter task and the index metadata; every following line is
    one example record produced by ``_examples``.

    Args:
        session: Session mapping; the ``"trace"``, ``"ideas"`` and ``"goals"``
            entries are read (the first two through ``_list_of_dicts``).
        metadata: Index metadata mapping, forwarded to ``_index_metadata``.

    Returns:
        The newline-terminated JSONL document, keys sorted within each record.
    """
    turns = _list_of_dicts(session.get("trace"))
    idea_count = len(_list_of_dicts(session.get("ideas")))
    goal_labels = list(map(str, session.get("goals") or []))
    sft_examples = _examples(turns, goal_labels)

    # A turn contributes more than one example, so count distinct turn indices.
    covered_turns = {item["turn_index"] for item in sft_examples}

    manifest = {
        "type": "lora_sft_manifest",
        "schema_version": LORA_DATASET_SCHEMA_VERSION,
        "generated_at": utc_now(),
        "app": "hackathon-advisor",
        "base_model": BASE_MODEL,
        "adapter_task": ADAPTER_TASK,
        "format": "chat-jsonl",
        "record_kinds": ["tool_call", "advisor_response"],
        "source": "exact_session_trace",
        "idea_count": idea_count,
        "turn_count": len(turns),
        "included_turn_count": len(covered_turns),
        "example_count": len(sft_examples),
        "index": _index_metadata(metadata),
    }

    serialized = [
        json.dumps(record, ensure_ascii=False, sort_keys=True)
        for record in (manifest, *sft_examples)
    ]
    return "\n".join(serialized) + "\n"
def _examples(trace: list[dict[str, Any]], goals: list[str]) -> list[dict[str, Any]]:
    """Build the SFT example records for every usable turn in ``trace``.

    A turn is skipped when ``_is_successful_turn`` rejects it, when its cleaned
    input or response is empty, or when its tool call has no name. Each kept
    turn yields two records sharing the same metadata: a ``tool_call`` example
    followed by an ``advisor_response`` example.

    Args:
        trace: Session trace events; ``turn_index`` is the 1-based position in
            this list, counting skipped turns too.
        goals: Goal labels copied into every record.

    Returns:
        Example records with consecutive 1-based ``example_index`` values.
    """
    collected: list[dict[str, Any]] = []
    for position, turn in enumerate(trace, start=1):
        if not _is_successful_turn(turn):
            continue

        prompt = _clean(turn.get("input"))
        reply = _clean(turn.get("response"))
        if not (prompt and reply):
            continue

        call = _tool_call(turn)
        if not call["name"]:
            continue

        common = {
            "type": "lora_sft_example",
            "schema_version": LORA_DATASET_SCHEMA_VERSION,
            "base_model": BASE_MODEL,
            "adapter_task": ADAPTER_TASK,
            "turn_index": position,
            "goals": goals,
            "score": _score(turn),
            "tool_call": call,
            "tool_observations": _tool_observations(turn),
        }

        # (example kind, system prompt, user content, assistant content)
        conversations = (
            ("tool_call", TOOL_CALL_SYSTEM_PROMPT, prompt, _tool_call_xml(call)),
            (
                "advisor_response",
                RESPONSE_SYSTEM_PROMPT,
                _response_context(prompt, turn, call),
                reply,
            ),
        )
        for kind, system_text, user_text, assistant_text in conversations:
            collected.append(
                {
                    **common,
                    "example_index": len(collected) + 1,
                    "example_kind": kind,
                    "messages": [
                        {"role": "system", "content": system_text},
                        {"role": "user", "content": user_text},
                        {"role": "assistant", "content": assistant_text},
                    ],
                }
            )
    return collected
def _is_successful_turn(event: dict[str, Any]) -> bool:
    """Return True when the event's ``tool_resolution`` has status ``"valid"``.

    A missing or non-dict ``tool_resolution`` counts as unsuccessful.
    """
    resolution = event.get("tool_resolution")
    if not isinstance(resolution, dict):
        return False
    status = resolution.get("status") or ""
    return str(status) == "valid"
def _tool_call(event: dict[str, Any]) -> dict[str, Any]:
    """Extract the resolved tool call from an event as ``{"name", "arguments"}``.

    Reads ``event["tool_resolution"]["call"]``; any level that is missing or
    not a dict is treated as empty. ``name`` is passed through ``_clean`` and
    ``arguments`` falls back to ``{}`` when it is not a dict.
    """
    resolution = event.get("tool_resolution")
    call = resolution.get("call") if isinstance(resolution, dict) else None
    if not isinstance(call, dict):
        call = {}

    arguments = call.get("arguments")
    if not isinstance(arguments, dict):
        arguments = {}

    return {"name": _clean(call.get("name")), "arguments": arguments}
def _tool_call_xml(tool_call: dict[str, Any]) -> str:
    """Render a tool call as a ``<function name="...">{json}</function>`` string.

    The arguments are dumped as compact JSON with sorted keys and non-ASCII
    characters left unescaped. Neither the name nor the JSON is XML-escaped.
    """
    payload = json.dumps(
        tool_call["arguments"],
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
    )
    name = tool_call["name"]
    return f'<function name="{name}">{payload}</function>'
def _response_context(input_text: str, event: dict[str, Any], tool_call: dict[str, Any]) -> str:
    """Compose the user message for an ``advisor_response`` example.

    The text is the user's input, a blank line, the rendered tool call, a
    bulleted list of tool observations (``- none`` when there are none), and
    the verdict, overall score and plan-step count taken from ``_score``.
    A falsy verdict or an ``overall`` of ``None`` is shown as ``n/a``.
    """
    observed = _tool_observations(event)

    parts = [
        input_text,
        "",
        f"Tool call: {_tool_call_xml(tool_call)}",
        "Tool observations:",
    ]
    bullets = [f"- {item['name']}: {item['summary']}" for item in observed]
    parts += bullets if bullets else ["- none"]

    score = _score(event)
    verdict = score["verdict"] or "n/a"
    overall = score["overall"]
    if overall is None:
        overall = "n/a"

    parts.append(f"Verdict: {verdict}")
    parts.append(f"Overall: {overall}")
    parts.append(f"Plan steps: {score['plan_steps']}")
    return "\n".join(parts)
def _tool_observations(event: dict[str, Any]) -> list[dict[str, str]]:
    """List the ``{"name", "summary"}`` pairs for the tools recorded on an event.

    Entries come from ``event["tools"]`` via ``_list_of_dicts``; both fields
    are passed through ``_clean``, and an entry is dropped only when both
    cleaned fields are falsy.
    """
    cleaned_pairs = (
        (_clean(tool.get("name")), _clean(tool.get("summary")))
        for tool in _list_of_dicts(event.get("tools"))
    )
    return [
        {"name": name, "summary": summary}
        for name, summary in cleaned_pairs
        if name or summary
    ]
def _score(event: dict[str, Any]) -> dict[str, Any]:
    """Collect the scoring fields of a trace event.

    Args:
        event: One trace event; ``"verdict"``, ``"overall"`` and
            ``"plan_steps"`` are read.

    Returns:
        A dict with ``verdict`` (passed through ``_clean``), ``overall``
        (returned unchanged, possibly ``None``) and ``plan_steps`` as an int.
        ``plan_steps`` is 0 when the value is missing, falsy, or cannot be
        converted to an integer.
    """
    raw_steps = event.get("plan_steps") or 0
    try:
        plan_steps = int(raw_steps)
    except (TypeError, ValueError, OverflowError):
        # A malformed count (e.g. "three", a list, inf/nan) must not abort the
        # whole export; treat it like a missing value instead.
        plan_steps = 0

    return {
        "verdict": _clean(event.get("verdict")),
        "overall": event.get("overall"),
        "plan_steps": plan_steps,
    }
def _index_metadata(metadata: dict[str, Any]) -> dict[str, str]:
    """Pick the index-related entries from ``metadata`` for the manifest.

    Each selected value is passed through ``_clean``. The source key
    ``index_algorithm`` is exposed as ``algorithm``; the other keys keep
    their names.
    """
    # (output key, key looked up in ``metadata``)
    field_map = (
        ("algorithm", "index_algorithm"),
        ("snapshot_generated_at", "snapshot_generated_at"),
        ("index_generated_at", "index_generated_at"),
        ("snapshot_digest", "snapshot_digest"),
    )
    return {
        output_key: _clean(metadata.get(source_key))
        for output_key, source_key in field_map
    }
|