biodsbench-adapter / src /harness /evaluation /sourceTrajectoryWriter.ts
starpacker52's picture
Add files using upload-large-folder tool
2c2dc59 verified
Raw
History Blame Contribute Delete
7.82 kB
import { appendFile, mkdir, writeFile } from 'fs/promises'
import { join } from 'path'
import type {
EvaluationRunMetadata,
JudgeResult,
LoopStatus,
SourceAgentEvent,
SubmissionValidationIssue,
TaskRun,
} from './types.js'
/**
 * One line of the "clean" trajectory log.
 *
 * This is a discriminated union tagged by `kind`. `SourceTrajectoryWriter.appendClean`
 * serializes each record with `JSON.stringify` and appends it, followed by a newline,
 * to `trajectory.clean.jsonl`.
 *
 * Every variant except `run_context` and `run_finished` carries a `round` number.
 */
export type CleanTrajectoryRecord =
// Header record: written by `SourceTrajectoryWriter.start()` right after both log files are reset.
| {
kind: 'run_context'
task_id: string
run_id: string
started_at: string
runtime_python?: string
run_metadata?: EvaluationRunMetadata
}
// Assistant text after `cleanAssistantTextForTrajectory` and `truncateText` have been applied.
| {
kind: 'assistant_text'
round: number
text: string
}
// A tool invocation; `input` is the event input passed through `cleanInput`.
| {
kind: 'tool_call'
round: number
tool: string
tool_use_id?: string
input?: unknown
}
// Outcome of a tool invocation; `text` is the event text passed through `truncateText`.
| {
kind: 'tool_result'
round: number
tool_use_id?: string
ok: boolean
text?: string
}
// Copied from a `policy_deny` agent event (tool name and reason).
| {
kind: 'policy_deny'
round: number
tool: string
reason: string
}
// Written for both `trajectory_warning` and `run_warning` agent events;
// `details` is the event details passed through `cleanInput`.
| {
kind: 'trajectory_warning'
round: number
code: string
message: string
details?: unknown
}
// NOTE(review): not produced by `agentEvent` in this file; presumably appended by a
// caller through `appendClean` — confirm.
| {
kind: 'recovery_started'
round: number
message: string
}
// NOTE(review): not produced by `agentEvent` in this file; presumably appended by a
// caller through `appendClean` — confirm.
| {
kind: 'recovery_finished'
round: number
finalized: boolean
summary?: string
}
// Failed submission validation; `normalized_files` and `issues` are copied from `event.result`.
| {
kind: 'submission_validation_failed'
round: number
ok: false
normalized_files: string[]
issues: SubmissionValidationIssue[]
}
// Successful submission validation; `normalized_files` and `issues` are copied from `event.result`.
| {
kind: 'submission_validation_passed'
round: number
ok: true
normalized_files: string[]
issues: SubmissionValidationIssue[]
}
// Fields copied from an `agent_result` agent event (camelCase event fields become snake_case here).
| {
kind: 'agent_result'
round: number
subtype?: string
stop_reason?: string | null
duration_ms?: number
duration_api_ms?: number
is_error?: boolean
usage?: unknown
errors?: string[]
}
// Written by the final, fall-through branch of `agentEvent` from the event's `summary` and `files`.
| {
kind: 'finalize'
round: number
summary: string
files: string[]
}
// NOTE(review): not produced by `agentEvent` in this file; presumably appended by a
// caller through `appendClean` — confirm.
| {
kind: 'judge_result'
round: number
status: JudgeResult['status']
reward: number
feedback: unknown
}
// NOTE(review): not produced in this file; presumably the closing record of a run,
// appended by a caller through `appendClean` — confirm.
| {
kind: 'run_finished'
status: LoopStatus
reward: number
completed_at: string
final_result?: unknown
}
/**
 * Shortens over-long text by keeping its head and tail and replacing the middle
 * with a `... [N chars truncated] ...` marker on its own line.
 *
 * `undefined`, empty strings and strings no longer than `maxLength` are returned
 * unchanged. Otherwise `floor(maxLength / 2)` characters are kept from each end,
 * so the result is slightly longer than `maxLength` because of the marker.
 *
 * @param value Text to shorten; may be `undefined`.
 * @param maxLength Length above which the text is shortened (default 4000).
 * @returns The original value, or head + marker + tail.
 */
function truncateText(value: string | undefined, maxLength = 4000): string | undefined {
  if (!value || value.length <= maxLength) return value
  // Clamp so tiny or negative limits keep nothing instead of producing odd slices.
  const keep = Math.max(0, Math.floor(maxLength / 2))
  const head = value.slice(0, keep)
  // Compute the tail start explicitly: `slice(-keep)` with keep === 0 is `slice(0)`,
  // which would return the whole string instead of an empty tail.
  const tail = value.slice(value.length - keep)
  // Report what was really dropped; `value.length - maxLength` is off by one when
  // `maxLength` is odd because only `2 * keep` characters are kept.
  const omitted = value.length - head.length - tail.length
  return `${head}\n... [${omitted} chars truncated] ...\n${tail}`
}
/**
 * Tells whether `value`, once trimmed, is one to three characters long and made
 * up solely of printable ASCII punctuation (code points 33-47, 58-64, 91-96 and
 * 123-126).
 *
 * @param value Text to inspect; surrounding whitespace is ignored.
 * @returns `true` only for 1-3 ASCII punctuation characters; `false` for empty,
 *          longer, or any other content.
 */
function isShortAsciiPunctuationOnly(value: string): boolean {
  // The character class spells out the four ASCII punctuation ranges; the
  // {1,3} quantifier with anchors enforces both the non-empty and length rules.
  return /^[\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]{1,3}$/.test(value.trim())
}
/**
 * Prepares assistant text for the clean trajectory log by dropping leading
 * "noise" lines, i.e. lines holding nothing but 1-3 ASCII punctuation characters.
 *
 * - Blank text, or text that is itself only such punctuation, yields `undefined`.
 * - Text whose first line is not noise is returned exactly as given (untrimmed,
 *   line endings untouched).
 * - Otherwise leading noise lines, and the blank lines following each of them,
 *   are removed; the rest is joined with `\n` and trimmed. If nothing is left
 *   the result is `undefined`.
 *
 * @param text Raw assistant text.
 * @returns The text to log, or `undefined` when there is nothing worth logging.
 */
export function cleanAssistantTextForTrajectory(text: string): string | undefined {
  const stripped = text.trim()
  if (stripped === '' || isShortAsciiPunctuationOnly(stripped)) return undefined

  const rows = text.replace(/\r\n/g, '\n').split('\n')
  let cursor = 0
  for (;;) {
    const atNoise = cursor < rows.length && isShortAsciiPunctuationOnly(rows[cursor] ?? '')
    if (!atNoise) break
    cursor += 1
    // Swallow the blank lines that trail a noise line.
    while (cursor < rows.length && (rows[cursor] ?? '').trim() === '') cursor += 1
  }

  // The cursor only advances when a noise line was dropped.
  if (cursor === 0) return text

  const remainder = rows.slice(cursor).join('\n').trim()
  return remainder === '' ? undefined : remainder
}
/**
 * Returns a copy of an arbitrary JSON-like value in which every string, at any
 * depth, has been shortened with `truncateText(…, 2000)`.
 *
 * Arrays are mapped element by element, objects are rebuilt from their own
 * enumerable entries, and all other values (numbers, booleans, `null`,
 * `undefined`, …) are returned as they are.
 *
 * @param input Value to sanitize.
 * @returns The sanitized copy, or `input` itself for non-string primitives.
 */
function cleanInput(input: unknown): unknown {
  if (typeof input === 'string') return truncateText(input, 2000)
  if (Array.isArray(input)) return input.map(cleanInput)
  if (typeof input === 'object' && input !== null) {
    const copy: Record<string, unknown> = {}
    for (const [name, member] of Object.entries(input)) {
      // Recursion covers string members too, via the first branch above.
      copy[name] = cleanInput(member)
    }
    return copy
  }
  return input
}
/**
 * Writes the trajectory of one task run as two JSON-lines files inside the
 * run's logs directory:
 *
 * - `trajectory.raw.jsonl`: every agent event as received, tagged with its round.
 * - `trajectory.clean.jsonl`: a condensed record per event, with long strings
 *   truncated and noise-only assistant text omitted.
 */
export class SourceTrajectoryWriter {
  readonly cleanPath: string
  readonly rawPath: string
  private readonly taskRun: TaskRun

  /**
   * @param taskRun Run whose `logsDir` receives both trajectory files.
   */
  constructor(taskRun: TaskRun) {
    this.taskRun = taskRun
    this.cleanPath = join(taskRun.logsDir, 'trajectory.clean.jsonl')
    this.rawPath = join(taskRun.logsDir, 'trajectory.raw.jsonl')
  }

  /**
   * Creates the logs directory if needed, empties both trajectory files and
   * writes the `run_context` header record to the clean file.
   */
  async start(input: {
    startedAt: string
    runtimePython?: string
    runMetadata?: EvaluationRunMetadata
  }): Promise<void> {
    await mkdir(this.taskRun.logsDir, { recursive: true })
    // Reset the clean file first, then the raw file.
    for (const target of [this.cleanPath, this.rawPath]) {
      await writeFile(target, '', 'utf8')
    }
    const header: CleanTrajectoryRecord = {
      kind: 'run_context',
      task_id: this.taskRun.taskId,
      run_id: this.taskRun.runId,
      started_at: input.startedAt,
      runtime_python: input.runtimePython,
      run_metadata: input.runMetadata,
    }
    await this.appendClean(header)
  }

  /** Appends one record, as a single JSON line, to the clean trajectory file. */
  async appendClean(record: CleanTrajectoryRecord): Promise<void> {
    const line = JSON.stringify(record) + '\n'
    await appendFile(this.cleanPath, line, 'utf8')
  }

  /** Appends one arbitrary value, as a single JSON line, to the raw trajectory file. */
  async appendRaw(record: unknown): Promise<void> {
    const line = JSON.stringify(record) + '\n'
    await appendFile(this.rawPath, line, 'utf8')
  }

  /**
   * Logs one agent event: the event itself (plus `round`) always goes to the raw
   * file first; its condensed form, if any, then goes to the clean file.
   *
   * @param round Round number the event belongs to.
   * @param event Event emitted by the agent.
   */
  async agentEvent(round: number, event: SourceAgentEvent): Promise<void> {
    await this.appendRaw({ round, ...event })
    const record = this.toCleanRecord(round, event)
    if (record !== undefined) {
      await this.appendClean(record)
    }
  }

  /**
   * Maps an agent event to its clean-log record.
   *
   * @returns The record to append, or `undefined` for assistant text that
   *          `cleanAssistantTextForTrajectory` rejects.
   */
  private toCleanRecord(round: number, event: SourceAgentEvent): CleanTrajectoryRecord | undefined {
    switch (event.type) {
      case 'assistant_text': {
        const text = cleanAssistantTextForTrajectory(event.text)
        if (text === undefined) return undefined
        return { kind: 'assistant_text', round, text: truncateText(text) ?? '' }
      }
      case 'tool_call':
        return {
          kind: 'tool_call',
          round,
          tool: event.tool,
          tool_use_id: event.toolUseId,
          input: cleanInput(event.input),
        }
      case 'tool_result':
        return {
          kind: 'tool_result',
          round,
          tool_use_id: event.toolUseId,
          ok: event.ok,
          text: truncateText(event.text),
        }
      case 'policy_deny':
        return { kind: 'policy_deny', round, tool: event.tool, reason: event.reason }
      // Run-level warnings are logged under the same clean kind as trajectory warnings.
      case 'trajectory_warning':
      case 'run_warning':
        return {
          kind: 'trajectory_warning',
          round,
          code: event.code,
          message: event.message,
          details: cleanInput(event.details),
        }
      case 'submission_validation_failed':
        return {
          kind: 'submission_validation_failed',
          round,
          ok: false,
          normalized_files: event.result.normalizedFiles,
          issues: event.result.issues,
        }
      case 'submission_validation_passed':
        return {
          kind: 'submission_validation_passed',
          round,
          ok: true,
          normalized_files: event.result.normalizedFiles,
          issues: event.result.issues,
        }
      case 'agent_result':
        return {
          kind: 'agent_result',
          round,
          subtype: event.subtype,
          stop_reason: event.stopReason,
          duration_ms: event.durationMs,
          duration_api_ms: event.durationApiMs,
          is_error: event.isError,
          usage: event.usage,
          errors: event.errors,
        }
      // Any event type not handled above is logged as a finalize record.
      default:
        return { kind: 'finalize', round, summary: event.summary, files: event.files }
    }
  }
}