Spaces:

build-small-hackathon
/

hackathon-advisor

Running on Zero

App Files Files Community

hackathon-advisor / hackathon_advisor /lora_dataset.py

JacobLinCool

deploy: sync GitHub main de5dbf9

13fe947 verified about 18 hours ago

raw

history blame contribute delete

6.35 kB

	from __future__ import annotations

	import json
	from typing import Any

	from hackathon_advisor._text import clean as _clean, list_of_dicts as _list_of_dicts, utc_now


	# Schema version stamped into the manifest and into every example record.
	LORA_DATASET_SCHEMA_VERSION = 1
	# Base model identifier and adapter task label recorded with each record.
	BASE_MODEL = "openbmb/MiniCPM5-1B"
	ADAPTER_TASK = "hackathon_advisor_tool_call_and_voice"

	# System message used for the "tool_call" example kind.
	TOOL_CALL_SYSTEM_PROMPT = (
	    "You are The Unwritten Almanac's originality and build-plan advisor. "
	    "Choose exactly one validated tool call for the user's project-advice request. "
	    "Return only the XML function call."
	)

	# System message used for the "advisor_response" example kind.
	RESPONSE_SYSTEM_PROMPT = (
	    "You are The Unwritten Almanac's originality and build-plan advisor. "
	    "Write concise, evidence-grounded advice from the tool observations, cited pages, score, and selected goals."
	)


	def build_lora_dataset_jsonl(session: dict[str, Any], metadata: dict[str, Any]) -> str:
	    """Serialize a session as JSONL: one manifest record followed by its SFT examples.

	    Args:
	        session: Session mapping; its "trace", "ideas" and "goals" entries are read.
	        metadata: Index metadata mapping, summarized under the manifest's "index" key.

	    Returns:
	        One JSON document per line (keys sorted, non-ASCII text left unescaped),
	        terminated by a trailing newline.
	    """
	    turns = _list_of_dicts(session.get("trace"))
	    idea_records = _list_of_dicts(session.get("ideas"))
	    goal_labels = [str(item) for item in session.get("goals") or []]
	    sft_examples = _examples(turns, goal_labels)

	    manifest: dict[str, Any] = {
	        "type": "lora_sft_manifest",
	        "schema_version": LORA_DATASET_SCHEMA_VERSION,
	        "generated_at": utc_now(),
	        "app": "hackathon-advisor",
	        "base_model": BASE_MODEL,
	        "adapter_task": ADAPTER_TASK,
	        "format": "chat-jsonl",
	        "record_kinds": ["tool_call", "advisor_response"],
	        "source": "exact_session_trace",
	        "idea_count": len(idea_records),
	        "turn_count": len(turns),
	        # Distinct turns that yielded examples; skipped turns are not counted.
	        "included_turn_count": len({item["turn_index"] for item in sft_examples}),
	        "example_count": len(sft_examples),
	        "index": _index_metadata(metadata),
	    }

	    serialized = [
	        json.dumps(record, ensure_ascii=False, sort_keys=True)
	        for record in [manifest, *sft_examples]
	    ]
	    return "\n".join(serialized) + "\n"


	def _examples(trace: list[dict[str, Any]], goals: list[str]) -> list[dict[str, Any]]:
	    """Build the SFT example records for every usable turn of a trace.

	    A turn is skipped when `_is_successful_turn` rejects it, when its cleaned
	    "input" or "response" is empty, or when the resolved tool call has no name.
	    Each remaining turn contributes two records, in this order: a "tool_call"
	    example and an "advisor_response" example.

	    Args:
	        trace: Trace events; turn indices are 1-based positions in this list.
	        goals: Goal labels attached unchanged to every record.

	    Returns:
	        Example records, with "example_index" numbered from 1 across the output.
	    """
	    collected: list[dict[str, Any]] = []
	    for position, turn in enumerate(trace, start=1):
	        if not _is_successful_turn(turn):
	            continue
	        prompt = _clean(turn.get("input"))
	        reply = _clean(turn.get("response"))
	        if not (prompt and reply):
	            continue
	        call = _tool_call(turn)
	        if not call["name"]:
	            continue

	        # Fields common to both records produced from this turn.
	        common = {
	            "type": "lora_sft_example",
	            "schema_version": LORA_DATASET_SCHEMA_VERSION,
	            "base_model": BASE_MODEL,
	            "adapter_task": ADAPTER_TASK,
	            "turn_index": position,
	            "goals": goals,
	            "score": _score(turn),
	            "tool_call": call,
	            "tool_observations": _tool_observations(turn),
	        }
	        # (example_kind, system text, user text, assistant text)
	        variants = (
	            ("tool_call", TOOL_CALL_SYSTEM_PROMPT, prompt, _tool_call_xml(call)),
	            ("advisor_response", RESPONSE_SYSTEM_PROMPT, _response_context(prompt, turn, call), reply),
	        )
	        for kind, system_text, user_text, assistant_text in variants:
	            collected.append(
	                {
	                    **common,
	                    "example_index": len(collected) + 1,
	                    "example_kind": kind,
	                    "messages": [
	                        {"role": "system", "content": system_text},
	                        {"role": "user", "content": user_text},
	                        {"role": "assistant", "content": assistant_text},
	                    ],
	                }
	            )
	    return collected


	def _is_successful_turn(event: dict[str, Any]) -> bool:
	    """Return True when the event's "tool_resolution" mapping has status "valid"."""
	    outcome = event.get("tool_resolution")
	    if not isinstance(outcome, dict):
	        # A missing or non-dict resolution carries no status to inspect.
	        return False
	    status = outcome.get("status")
	    return str(status or "") == "valid"


	def _tool_call(event: dict[str, Any]) -> dict[str, Any]:
	    """Extract the resolved tool call of an event as a {"name", "arguments"} dict.

	    Reads event["tool_resolution"]["call"]; any level that is missing or is not
	    a dict is treated as an empty dict. The name is passed through `_clean`.
	    """

	    def _as_dict(value: Any) -> dict[str, Any]:
	        # Anything that is not a dict is replaced by an empty one.
	        return value if isinstance(value, dict) else {}

	    resolved = _as_dict(event.get("tool_resolution"))
	    call_spec = _as_dict(resolved.get("call"))
	    call_arguments = _as_dict(call_spec.get("arguments"))
	    return {"name": _clean(call_spec.get("name")), "arguments": call_arguments}


	def _tool_call_xml(tool_call: dict[str, Any]) -> str:
	    """Render a tool call as a `<function name="...">` element wrapping its JSON arguments.

	    The arguments are dumped compactly (no spaces after separators), with keys
	    sorted and non-ASCII characters left unescaped.
	    """
	    payload = json.dumps(
	        tool_call["arguments"],
	        ensure_ascii=False,
	        sort_keys=True,
	        separators=(",", ":"),
	    )
	    return '<function name="{}">{}</function>'.format(tool_call["name"], payload)


	def _response_context(input_text: str, event: dict[str, Any], tool_call: dict[str, Any]) -> str:
	    """Compose the user message for an "advisor_response" example.

	    Layout, one item per line: the user's input, a blank line, the rendered
	    tool call, a "Tool observations:" heading with one bullet per observation
	    ("- none" when there are none), then the verdict, overall and plan-step
	    lines taken from `_score`.
	    """
	    observed = _tool_observations(event)
	    header = [
	        input_text,
	        "",
	        f"Tool call: {_tool_call_xml(tool_call)}",
	        "Tool observations:",
	    ]
	    bullets = [f"- {item['name']}: {item['summary']}" for item in observed] or ["- none"]

	    scored = _score(event)
	    # An empty verdict and a None overall are both shown as "n/a".
	    verdict_text = scored["verdict"] if scored["verdict"] else "n/a"
	    overall_text = "n/a" if scored["overall"] is None else scored["overall"]
	    footer = [
	        f"Verdict: {verdict_text}",
	        f"Overall: {overall_text}",
	        f"Plan steps: {scored['plan_steps']}",
	    ]
	    return "\n".join(header + bullets + footer)


	def _tool_observations(event: dict[str, Any]) -> list[dict[str, str]]:
	    """List the {"name", "summary"} pairs of the event's "tools" entries.

	    Both fields are passed through `_clean`; an entry is dropped only when
	    both cleaned fields are empty.
	    """
	    cleaned_pairs = (
	        (_clean(entry.get("name")), _clean(entry.get("summary")))
	        for entry in _list_of_dicts(event.get("tools"))
	    )
	    return [
	        {"name": label, "summary": detail}
	        for label, detail in cleaned_pairs
	        if label or detail
	    ]


	def _score(event: dict[str, Any]) -> dict[str, Any]:
	    """Collect the score fields of a trace event.

	    Args:
	        event: A trace event; its "verdict", "overall" and "plan_steps" entries are read.

	    Returns:
	        A dict with:
	          - "verdict": the event's verdict passed through `_clean`;
	          - "overall": the event's "overall" value, unchanged (None when absent);
	          - "plan_steps": the plan-step count as an int, or 0 when the value is
	            missing, falsy, or cannot be converted to an int.
	    """
	    verdict = _clean(event.get("verdict"))
	    overall = event.get("overall")
	    try:
	        plan_steps = int(event.get("plan_steps") or 0)
	    except (TypeError, ValueError, OverflowError):
	        # A malformed count (e.g. "three", a list, inf/nan) in one event must not
	        # abort the whole export; the other accessors in this module tolerate
	        # malformed trace shapes the same way.
	        plan_steps = 0
	    return {
	        "verdict": verdict,
	        "overall": overall,
	        "plan_steps": plan_steps,
	    }


	def _index_metadata(metadata: dict[str, Any]) -> dict[str, str]:
	    """Pick the index-related fields out of `metadata`, each passed through `_clean`.

	    The output key "algorithm" is read from "index_algorithm"; the other three
	    output keys are read from metadata entries of the same name.
	    """
	    # (output key, metadata key)
	    field_sources = (
	        ("algorithm", "index_algorithm"),
	        ("snapshot_generated_at", "snapshot_generated_at"),
	        ("index_generated_at", "index_generated_at"),
	        ("snapshot_digest", "snapshot_digest"),
	    )
	    return {target: _clean(metadata.get(source)) for target, source in field_sources}