Spaces:
Running
Running
timchen0618 committed on
Commit ·
d14bce3
1
Parent(s): 1eb493c
Derive new_status from new_trajectory; fix sidebar check mark; fix question row for incomplete runs
Browse files
- selected_tools backend: compute new_status from the presence of [Final Answer] in new_trajectory,
instead of trusting the eval file's is_completed, which reflects the reference run, not the new run
- SelectedToolsApp: use new_status for incomplete detection and status badge (fixes query 49)
- SelectedToolsApp: question row and incomplete badge now use new_status (correct for all runs)
- patch_sel_tools_test300_questions.py: fills empty question fields from BrowseComp JSONL
(11 selected-tools test300 repos, running on torch)
- backend/api/selected_tools.py +4 -1
- frontend/dist/assets/{ExperimentsApp-D09jcP3c.js → ExperimentsApp-r9xSXake.js} +0 -0
- frontend/dist/assets/{ModelApp-kmFeUyK4.js → ModelApp-Cw242jQN.js} +1 -1
- frontend/dist/assets/{PlanRevisionsApp-BplajecZ.js → PlanRevisionsApp-CUfcVsmt.js} +1 -1
- frontend/dist/assets/{ScoutRunsApp-xRBbKpfW.js → ScoutRunsApp-B_i5X87r.js} +1 -1
- frontend/dist/assets/{SelectedToolsApp-ButjYRD7.js → SelectedToolsApp-BBGQyqN9.js} +2 -2
- frontend/dist/assets/{SftDiffApp-Bh7C7VBz.js → SftDiffApp-D-nUoSxu.js} +1 -1
- frontend/dist/assets/{TrajExtApp-DNAeWbp6.js → TrajExtApp-Y8_orlLr.js} +1 -1
- frontend/dist/assets/{VisualizerApp-D_QCMHfk.js → VisualizerApp-B36fbafZ.js} +2 -2
- frontend/dist/assets/{index-CR1kJ7mL.js → index-BewwkiJG.js} +0 -0
- frontend/dist/index.html +1 -1
- frontend/src/selected_tools/SelectedToolsApp.tsx +6 -5
- patch_sel_tools_test300_questions.py +89 -0
backend/api/selected_tools.py
CHANGED
|
@@ -94,6 +94,8 @@ def _load(variant: str) -> list:
|
|
| 94 |
sel_idx = json.loads(sel_idx)
|
| 95 |
except Exception:
|
| 96 |
sel_idx = []
|
|
|
|
|
|
|
| 97 |
rows.append({
|
| 98 |
"query_id": str(row["query_id"]),
|
| 99 |
"rationale": row.get("rationale") or "",
|
|
@@ -101,11 +103,12 @@ def _load(variant: str) -> list:
|
|
| 101 |
"k_requested": int(row["k_requested"]),
|
| 102 |
"k_effective": int(row["k_effective"]),
|
| 103 |
"excerpt": row["excerpt"],
|
| 104 |
-
"new_trajectory":
|
| 105 |
"direct_answer": bool(row["direct_answer"]),
|
| 106 |
"tool_call_counts": tool_counts,
|
| 107 |
"total_tool_calls": total_tool_calls,
|
| 108 |
"status": row["status"],
|
|
|
|
| 109 |
"question": row.get("question") or "",
|
| 110 |
"correct_answer": row.get("correct_answer") or "",
|
| 111 |
"correct": row.get("correct"), # None if not available
|
|
|
|
| 94 |
sel_idx = json.loads(sel_idx)
|
| 95 |
except Exception:
|
| 96 |
sel_idx = []
|
| 97 |
+
new_traj = row.get("new_trajectory") or ""
|
| 98 |
+
new_status = "completed" if "[Final Answer]" in new_traj else "incomplete"
|
| 99 |
rows.append({
|
| 100 |
"query_id": str(row["query_id"]),
|
| 101 |
"rationale": row.get("rationale") or "",
|
|
|
|
| 103 |
"k_requested": int(row["k_requested"]),
|
| 104 |
"k_effective": int(row["k_effective"]),
|
| 105 |
"excerpt": row["excerpt"],
|
| 106 |
+
"new_trajectory": new_traj,
|
| 107 |
"direct_answer": bool(row["direct_answer"]),
|
| 108 |
"tool_call_counts": tool_counts,
|
| 109 |
"total_tool_calls": total_tool_calls,
|
| 110 |
"status": row["status"],
|
| 111 |
+
"new_status": new_status,
|
| 112 |
"question": row.get("question") or "",
|
| 113 |
"correct_answer": row.get("correct_answer") or "",
|
| 114 |
"correct": row.get("correct"), # None if not available
|
frontend/dist/assets/{ExperimentsApp-D09jcP3c.js → ExperimentsApp-r9xSXake.js}
RENAMED
|
The diff for this file is too large to render.
See raw diff
|
|
|
frontend/dist/assets/{ModelApp-kmFeUyK4.js → ModelApp-Cw242jQN.js}
RENAMED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import{r as i,p as re,a as le,j as e}from"./index-
|
| 2 |
${u.repo}`:u.repo,children:u.presetName||u.name}),e.jsxs("div",{className:"text-[10px] text-gray-500",children:[u.column," | ",u.n_rows," rows | ",u.n_samples," samples"]})]}),e.jsx("button",{onClick:v=>{v.stopPropagation(),G(M===u.id?null:u.id),O("")},className:`transition-colors shrink-0 ${M===u.id?"text-blue-400":"text-gray-600 hover:text-blue-400"}`,title:"Save as preset",children:e.jsx("svg",{className:"w-3.5 h-3.5",fill:"none",viewBox:"0 0 24 24",stroke:"currentColor",children:e.jsx("path",{strokeLinecap:"round",strokeLinejoin:"round",strokeWidth:2,d:"M5 5a2 2 0 012-2h10a2 2 0 012 2v16l-7-3.5L5 21V5z"})})}),e.jsx("button",{onClick:v=>{v.stopPropagation(),b(u.id)},className:"text-gray-600 hover:text-red-400 transition-colors shrink-0",title:"Remove",children:e.jsx("svg",{className:"w-3.5 h-3.5",fill:"none",viewBox:"0 0 24 24",stroke:"currentColor",children:e.jsx("path",{strokeLinecap:"round",strokeLinejoin:"round",strokeWidth:2,d:"M6 18L18 6M6 6l12 12"})})})]}),M===u.id&&e.jsxs("div",{className:"flex gap-1 mt-1 ml-6",children:[e.jsx("input",{type:"text",value:B,onChange:v=>O(v.target.value),onKeyDown:v=>{v.key==="Enter"&&z(u),v.key==="Escape"&&G(null)},placeholder:"Preset name...",className:"flex-1 px-2 py-1 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-blue-500 focus:outline-none",autoFocus:!0}),e.jsx("button",{onClick:()=>z(u),className:"px-2 py-1 text-xs bg-blue-600 hover:bg-blue-500 rounded text-white",children:"Save"})]})]},u.id))})]},r)})})]}),R&&(()=>{const r=t.find(a=>a.id===R);return r!=null&&r.presetId?e.jsxs("div",{className:"p-3 border-t border-gray-700 space-y-2",children:[e.jsx("div",{className:"text-[10px] text-gray-500 uppercase font-semibold tracking-wider",children:"Edit Preset"}),e.jsx("input",{type:"text",value:_,onChange:a=>J(a.target.value),onKeyDown:a=>{a.key==="Enter"&&_.trim()&&(w(r.presetId,r.id,{name:_.trim()}),$(null)),a.key==="Escape"&&$(null)},placeholder:"Preset 
name...",className:"w-full px-2 py-1 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-blue-500 focus:outline-none",autoFocus:!0}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx("button",{onClick:()=>{_.trim()&&(w(r.presetId,r.id,{name:_.trim()}),$(null))},disabled:!_.trim(),className:"flex-1 px-2 py-1 text-xs bg-blue-600 hover:bg-blue-500 disabled:bg-gray-700 disabled:text-gray-500 rounded text-white transition-colors",children:"Save"}),e.jsx("button",{onClick:()=>{j(r.presetId,r.id),$(null)},className:"px-2 py-1 text-xs bg-red-900 hover:bg-red-800 rounded text-red-300 transition-colors",children:"Delete"}),e.jsx("button",{onClick:()=>$(null),className:"px-2 py-1 text-xs bg-gray-700 hover:bg-gray-600 rounded text-gray-300 transition-colors",children:"Cancel"})]})]}):null})(),e.jsx("div",{className:"p-3 border-t border-gray-700",children:k?e.jsxs("div",{className:"space-y-2",children:[e.jsx("input",{type:"text",value:m,onChange:r=>y(r.target.value),onKeyDown:r=>r.key==="Enter"&&K(),placeholder:"org/dataset-name",className:"w-full px-2 py-1.5 text-sm bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-blue-500 focus:outline-none",autoFocus:!0}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx("input",{type:"text",value:P,onChange:r=>N(r.target.value),placeholder:"Response col (auto-detect)",className:"flex-1 px-2 py-1 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-blue-500 focus:outline-none"}),e.jsx("input",{type:"text",value:F,onChange:r=>W(r.target.value),placeholder:"Split",className:"w-16 px-2 py-1 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-blue-500 focus:outline-none"})]}),e.jsx("div",{className:"flex gap-2",children:e.jsx("input",{type:"text",value:H,onChange:r=>Y(r.target.value),placeholder:"Prompt col (auto-detect)",className:"flex-1 px-2 py-1 text-xs 
bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-blue-500 focus:outline-none"})}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx("button",{onClick:K,disabled:!m.trim()||n[m.trim()],className:"flex-1 px-2 py-1.5 text-sm bg-blue-600 hover:bg-blue-500 disabled:bg-gray-700 disabled:text-gray-500 rounded text-white transition-colors",children:n[m.trim()]?"Loading...":"Load"}),e.jsx("button",{onClick:()=>o(!1),className:"px-3 py-1.5 text-sm bg-gray-700 hover:bg-gray-600 rounded text-gray-300 transition-colors",children:"Cancel"})]})]}):e.jsx("button",{onClick:()=>{$(null),o(!0),y(""),N(""),W("train"),Y("")},className:"w-full px-3 py-2 text-sm bg-blue-600 hover:bg-blue-500 rounded text-white font-medium transition-colors",children:"+ Add Repo"})})]})}function de(t){if(!t)return[{text:"(no response)",className:"text-gray-500 italic"}];const l=[],n=t.split(`
|
| 3 |
`);for(let c=0;c<n.length;c++){const p=n[c],s=p.toLowerCase().trim();let x="text-gray-300";s.startsWith("wait")||s.startsWith("hmm")||s.startsWith("but wait")?x="text-yellow-400":s.startsWith("let me try")||s.startsWith("let me reconsider")||s.startsWith("let me think")?x="text-cyan-400":s.startsWith("so the answer")||s.startsWith("so the expression")||s.startsWith("therefore")||s.startsWith("the final")?x="text-green-400 font-bold":s.startsWith("i give up")||s.startsWith("i can't find")||s.startsWith("i'm stuck")||s.startsWith("i'm sorry")?x="text-red-400 font-bold":p.includes("=")&&/[+\-*/]/.test(p)&&(x="text-gray-100"),l.push({text:p,className:x}),c<n.length-1&&l.push({text:`
|
| 4 |
`,className:""})}return l}function ue(t){if(!t||!t.trim())return[];try{const l=JSON.parse(t);if(Array.isArray(l)&&l.length>0&&l[0].role!==void 0)return l.map(n=>({role:String(n.role||"unknown"),content:String(n.content??"")}))}catch{}if(t.includes("<|im_start|>"))return t.split("<|im_start|>").filter(Boolean).map(n=>{const c=n.indexOf(`
|
|
|
|
| 1 |
+
import{r as i,p as re,a as le,j as e}from"./index-BewwkiJG.js";const U="/api/model",X="/api/presets/model";async function L(t,l){const n=await fetch(t,{headers:{"Content-Type":"application/json"},...l});if(!n.ok){const c=await n.json().catch(()=>({error:n.statusText}));throw new Error(c.error||n.statusText)}return n.json()}const A={loadDataset(t,l,n,c){return L(`${U}/datasets/load`,{method:"POST",body:JSON.stringify({repo:t,column:l,split:n,prompt_column:c})})},listDatasets(){return L(`${U}/datasets/`)},getQuestion(t,l){return L(`${U}/datasets/${t}/question/${l}`)},getSummary(t){return L(`${U}/datasets/${t}/summary`)},unloadDataset(t){return L(`${U}/datasets/${t}`,{method:"DELETE"})},listPresets(){return L(`${X}`)},createPreset(t,l,n,c){return L(`${X}`,{method:"POST",body:JSON.stringify({name:t,repo:l,column:n,split:c})})},updatePreset(t,l){return L(`${X}/${t}`,{method:"PUT",body:JSON.stringify(l)})},deletePreset(t){return L(`${X}/${t}`,{method:"DELETE"})}};function ie(){const t=i.useRef(re().params),[l,n]=i.useState([]),[c,p]=i.useState([]),[s,x]=i.useState("all"),[b,g]=i.useState({}),[h,S]=i.useState({}),[I,j]=i.useState(null),[w,k]=i.useState({}),[o,m]=i.useState(null);i.useEffect(()=>{A.listPresets().then(p).catch(()=>{})},[]),i.useEffect(()=>{const a=re().params,d=parseInt(a.get("q")||"0"),f=parseInt(a.get("s")||"0"),u=a.get("filter")||"all";x(u),(!isNaN(d)||!isNaN(f))&&(window.__initialQ=isNaN(d)?0:d,window.__initialS=isNaN(f)?0:f)},[]);const y=i.useMemo(()=>{const r={};for(const a of l){const d=a.questionFingerprint;r[d]||(r[d]=[]),r[d].push(a)}return r},[l]),P=i.useMemo(()=>Object.keys(y).sort(),[y]);i.useEffect(()=>{if(o&&y[o])return;const r=P.find(a=>y[a].some(d=>d.active));r?m(r):P.length>0?m(P[0]):m(null)},[P,y,o]);const N=i.useMemo(()=>l.filter(r=>r.active&&r.questionFingerprint===o),[l,o]),[F,W]=i.useState([]);i.useEffect(()=>{const r=new Set(N.map(a=>a.id));W(a=>{const 
d=a.filter(v=>r.has(v)),f=N.map(v=>v.id).filter(v=>!a.includes(v)),u=[...d,...f];return u.length===a.length&&u.every((v,C)=>v===a[C])?a:u})},[N]);const H=i.useMemo(()=>{const r=new Map(N.map(a=>[a.id,a]));return F.map(a=>r.get(a)).filter(a=>a!==void 0)},[N,F]),Y=i.useCallback((r,a)=>{r!==a&&W(d=>{const f=[...d],u=f.indexOf(r),v=f.indexOf(a);return u===-1||v===-1?d:(f.splice(u,1),f.splice(v,0,r),f)})},[]),D=o?w[o]:void 0,E=(D==null?void 0:D.questionIdx)??0,M=(D==null?void 0:D.sampleIdx)??0,G=i.useCallback(r=>{o&&k(a=>{const d=a[o]??{questionIdx:0,sampleIdx:0},f=typeof r=="function"?r(d.questionIdx):r;return{...a,[o]:{...d,questionIdx:f}}})},[o]),B=i.useCallback(r=>{o&&k(a=>{const d=a[o]??{questionIdx:0,sampleIdx:0},f=typeof r=="function"?r(d.sampleIdx):r;return{...a,[o]:{...d,sampleIdx:f}}})},[o]);i.useEffect(()=>{const r=new URLSearchParams,a=l.filter(d=>d.active);a.length>0&&(r.set("repos",a.map(d=>d.repo).join(",")),r.set("cols",a.map(d=>d.column).join(",")),r.set("pcols",a.map(d=>d.promptColumn||"formatted_prompt").join(","))),r.set("q",String(E)),r.set("s",String(M)),s!=="all"&&r.set("filter",s),o&&r.set("group",o),le({params:r})},[l,E,M,s,o]),i.useEffect(()=>{N.forEach(r=>{const a=`${r.id}:${E}`;b[a]||A.getQuestion(r.id,E).then(d=>{g(f=>({...f,[a]:d}))}).catch(()=>{})})},[E,N]);const O=i.useCallback(async(r,a,d,f,u,v)=>{S(C=>({...C,[r]:!0})),j(null);try{const{question_fingerprint:C,...Q}=await A.loadDataset(r,a,d,f),V=C??"",ee={...Q,questionFingerprint:V,active:!0,presetId:u,presetName:v};n(T=>T.some(q=>q.id===ee.id)?T:[...T,ee]),k(T=>{if(T[V])return T;const q=window,ne=typeof q.__initialQ=="number"?q.__initialQ:0,oe=typeof q.__initialS=="number"?q.__initialS:0,te=Object.keys(T).length===0;return{...T,[V]:{questionIdx:te?ne:0,sampleIdx:te?oe:0}}}),m(V)}catch(C){j(C instanceof Error?C.message:"Failed to load dataset")}finally{S(C=>({...C,[r]:!1}))}},[]);i.useEffect(()=>{var u,v,C;const r=t.current,a=((u=r.get("repos"))==null?void 
0:u.split(",").filter(Boolean))||[],d=((v=r.get("cols"))==null?void 0:v.split(","))||[],f=((C=r.get("pcols"))==null?void 0:C.split(","))||[];for(let Q=0;Q<a.length;Q++)O(a[Q],d[Q]||void 0,void 0,f[Q]||void 0)},[O]);const R=i.useCallback(async r=>{await A.unloadDataset(r).catch(()=>{}),n(a=>a.filter(d=>d.id!==r))},[]),$=i.useCallback(r=>{n(a=>{const d=a.map(u=>u.id===r?{...u,active:!u.active}:u),f=d.find(u=>u.id===r);return f&&f.active&&m(f.questionFingerprint),d})},[]),_=i.useCallback((r,a)=>{n(d=>d.map(f=>f.id===r?{...f,presetName:a}:f))},[]),J=i.useCallback(r=>{n(a=>a.map(d=>d.id===r?{...d,presetId:void 0,presetName:void 0}:d))},[]),K=Math.min(...N.map(r=>r.n_rows),1/0),z=Math.max(...N.map(r=>r.n_samples),0);return{datasets:l,presets:c,setPresets:p,questionIdx:E,setQuestionIdx:G,sampleIdx:M,setSampleIdx:B,filter:s,setFilter:x,loading:h,error:I,setError:j,activeDatasets:N,orderedActiveDatasets:H,maxQuestions:K,maxSamples:z,addDataset:O,removeDataset:R,toggleDataset:$,updateDatasetPresetName:_,clearDatasetPreset:J,getQuestionData:r=>b[`${r}:${E}`],reorderPanels:Y,groups:y,groupIds:P,currentGroupId:o,setCurrentGroupId:m}}const se=[{bg:"bg-blue-500",border:"border-blue-500",text:"text-blue-400",label:"text-blue-300"},{bg:"bg-emerald-500",border:"border-emerald-500",text:"text-emerald-400",label:"text-emerald-300"},{bg:"bg-amber-500",border:"border-amber-500",text:"text-amber-400",label:"text-amber-300"},{bg:"bg-purple-500",border:"border-purple-500",text:"text-purple-400",label:"text-purple-300"},{bg:"bg-rose-500",border:"border-rose-500",text:"text-rose-400",label:"text-rose-300"},{bg:"bg-cyan-500",border:"border-cyan-500",text:"text-cyan-400",label:"text-cyan-300"}];function 
ce({datasets:t,presets:l,loading:n,groups:c,groupIds:p,currentGroupId:s,onAddDataset:x,onRemoveDataset:b,onToggleDataset:g,onSetCurrentGroup:h,onLoadPreset:S,onSavePreset:I,onDeletePreset:j,onUpdatePreset:w}){const[k,o]=i.useState(!1),[m,y]=i.useState(""),[P,N]=i.useState(""),[F,W]=i.useState("train"),[H,Y]=i.useState(""),[D,E]=i.useState(""),[M,G]=i.useState(null),[B,O]=i.useState(""),[R,$]=i.useState(null),[_,J]=i.useState(""),K=()=>{m.trim()&&(x(m.trim(),P.trim()||void 0,F.trim()||void 0,H.trim()||void 0),y(""),o(!1))},z=r=>{B.trim()&&(I(B.trim(),r.repo,r.column,r.split),O(""),G(null))},Z=r=>{const a=p.indexOf(r);return se[a%se.length]};return e.jsxs("div",{className:"w-72 min-w-72 bg-gray-900 border-r border-gray-700 flex flex-col h-full",children:[e.jsxs("div",{className:"p-3 border-b border-gray-700",children:[e.jsx("div",{className:"flex items-center justify-between mb-2",children:e.jsx("h3",{className:"text-xs font-semibold text-gray-400 uppercase tracking-wider",children:"Presets"})}),l.length===0?e.jsx("p",{className:"text-xs text-gray-500 italic",children:"No presets saved"}):e.jsxs(e.Fragment,{children:[l.length>6&&e.jsx("input",{type:"text",value:D,onChange:r=>E(r.target.value),placeholder:"Search presets...",className:"w-full px-2 py-1 mb-2 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-blue-500 focus:outline-none"}),e.jsx("div",{className:"flex flex-wrap gap-1 max-h-32 overflow-y-auto",children:l.filter(r=>!D||r.name.toLowerCase().includes(D.toLowerCase())||r.repo.toLowerCase().includes(D.toLowerCase())).map(r=>e.jsxs("div",{className:"group relative",children:[e.jsx("button",{onClick:()=>S(r),className:"px-2 py-1 text-xs bg-gray-800 hover:bg-gray-700 rounded border border-gray-600 text-gray-300 transition-colors",title:`${r.repo} (${r.column}, ${r.split??"train"})`,children:r.name}),e.jsx("div",{className:"hidden group-hover:flex absolute top-full left-0 mt-1 z-10 
gap-1",children:e.jsx("button",{onClick:()=>j(r.id),className:"px-1.5 py-0.5 text-[10px] bg-red-900 hover:bg-red-800 rounded text-red-300",children:"Delete"})})]},r.id))})]})]}),e.jsxs("div",{className:"flex-1 overflow-y-auto p-3",children:[e.jsx("h3",{className:"text-xs font-semibold text-gray-400 uppercase tracking-wider mb-2",children:"Loaded Repos"}),t.length===0?e.jsx("p",{className:"text-xs text-gray-500 italic",children:"No repos loaded. Add one below."}):e.jsx("div",{className:"space-y-3",children:p.map(r=>{const a=Z(r),d=c[r],f=r===s;return e.jsxs("div",{children:[e.jsxs("button",{onClick:()=>h(r),className:`w-full flex items-center gap-1.5 mb-1 px-1 py-0.5 rounded transition-colors ${f?"bg-gray-800":"hover:bg-gray-800/50"}`,children:[e.jsx("span",{className:`inline-block w-2 h-2 rounded-full ${a.bg} shrink-0`}),e.jsxs("span",{className:`text-[10px] font-semibold uppercase tracking-wider ${f?a.label:"text-gray-500"}`,children:["Group ",p.indexOf(r)+1,e.jsxs("span",{className:"normal-case font-normal ml-1 text-gray-600",children:["(",d.length," repo",d.length!==1?"s":"",")"]})]}),f&&e.jsx("span",{className:"text-[9px] text-gray-600 ml-auto",children:"viewing"})]}),e.jsx("div",{className:`space-y-1 border-l-2 ml-1 pl-2 ${f?a.border:"border-gray-700"}`,children:d.map(u=>e.jsxs("div",{children:[e.jsxs("div",{onClick:()=>{u.presetId&&($(R===u.id?null:u.id),J(u.presetName||""),o(!1))},className:`flex items-center gap-2 px-2 py-1.5 rounded text-sm transition-colors ${u.active?"bg-gray-800":"bg-gray-900 opacity-60"} ${R===u.id?"ring-1 ring-blue-500":""} ${u.presetId?"cursor-pointer":""}`,children:[e.jsx("input",{type:"checkbox",checked:u.active,onChange:()=>g(u.id),onClick:v=>v.stopPropagation(),className:"rounded border-gray-600 bg-gray-800 text-blue-500 focus:ring-blue-500 focus:ring-offset-0"}),e.jsxs("div",{className:"flex-1 min-w-0",children:[e.jsx("div",{className:"text-gray-200 truncate text-xs font-medium",title:u.presetName?`${u.presetName}
|
| 2 |
${u.repo}`:u.repo,children:u.presetName||u.name}),e.jsxs("div",{className:"text-[10px] text-gray-500",children:[u.column," | ",u.n_rows," rows | ",u.n_samples," samples"]})]}),e.jsx("button",{onClick:v=>{v.stopPropagation(),G(M===u.id?null:u.id),O("")},className:`transition-colors shrink-0 ${M===u.id?"text-blue-400":"text-gray-600 hover:text-blue-400"}`,title:"Save as preset",children:e.jsx("svg",{className:"w-3.5 h-3.5",fill:"none",viewBox:"0 0 24 24",stroke:"currentColor",children:e.jsx("path",{strokeLinecap:"round",strokeLinejoin:"round",strokeWidth:2,d:"M5 5a2 2 0 012-2h10a2 2 0 012 2v16l-7-3.5L5 21V5z"})})}),e.jsx("button",{onClick:v=>{v.stopPropagation(),b(u.id)},className:"text-gray-600 hover:text-red-400 transition-colors shrink-0",title:"Remove",children:e.jsx("svg",{className:"w-3.5 h-3.5",fill:"none",viewBox:"0 0 24 24",stroke:"currentColor",children:e.jsx("path",{strokeLinecap:"round",strokeLinejoin:"round",strokeWidth:2,d:"M6 18L18 6M6 6l12 12"})})})]}),M===u.id&&e.jsxs("div",{className:"flex gap-1 mt-1 ml-6",children:[e.jsx("input",{type:"text",value:B,onChange:v=>O(v.target.value),onKeyDown:v=>{v.key==="Enter"&&z(u),v.key==="Escape"&&G(null)},placeholder:"Preset name...",className:"flex-1 px-2 py-1 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-blue-500 focus:outline-none",autoFocus:!0}),e.jsx("button",{onClick:()=>z(u),className:"px-2 py-1 text-xs bg-blue-600 hover:bg-blue-500 rounded text-white",children:"Save"})]})]},u.id))})]},r)})})]}),R&&(()=>{const r=t.find(a=>a.id===R);return r!=null&&r.presetId?e.jsxs("div",{className:"p-3 border-t border-gray-700 space-y-2",children:[e.jsx("div",{className:"text-[10px] text-gray-500 uppercase font-semibold tracking-wider",children:"Edit Preset"}),e.jsx("input",{type:"text",value:_,onChange:a=>J(a.target.value),onKeyDown:a=>{a.key==="Enter"&&_.trim()&&(w(r.presetId,r.id,{name:_.trim()}),$(null)),a.key==="Escape"&&$(null)},placeholder:"Preset 
name...",className:"w-full px-2 py-1 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-blue-500 focus:outline-none",autoFocus:!0}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx("button",{onClick:()=>{_.trim()&&(w(r.presetId,r.id,{name:_.trim()}),$(null))},disabled:!_.trim(),className:"flex-1 px-2 py-1 text-xs bg-blue-600 hover:bg-blue-500 disabled:bg-gray-700 disabled:text-gray-500 rounded text-white transition-colors",children:"Save"}),e.jsx("button",{onClick:()=>{j(r.presetId,r.id),$(null)},className:"px-2 py-1 text-xs bg-red-900 hover:bg-red-800 rounded text-red-300 transition-colors",children:"Delete"}),e.jsx("button",{onClick:()=>$(null),className:"px-2 py-1 text-xs bg-gray-700 hover:bg-gray-600 rounded text-gray-300 transition-colors",children:"Cancel"})]})]}):null})(),e.jsx("div",{className:"p-3 border-t border-gray-700",children:k?e.jsxs("div",{className:"space-y-2",children:[e.jsx("input",{type:"text",value:m,onChange:r=>y(r.target.value),onKeyDown:r=>r.key==="Enter"&&K(),placeholder:"org/dataset-name",className:"w-full px-2 py-1.5 text-sm bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-blue-500 focus:outline-none",autoFocus:!0}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx("input",{type:"text",value:P,onChange:r=>N(r.target.value),placeholder:"Response col (auto-detect)",className:"flex-1 px-2 py-1 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-blue-500 focus:outline-none"}),e.jsx("input",{type:"text",value:F,onChange:r=>W(r.target.value),placeholder:"Split",className:"w-16 px-2 py-1 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-blue-500 focus:outline-none"})]}),e.jsx("div",{className:"flex gap-2",children:e.jsx("input",{type:"text",value:H,onChange:r=>Y(r.target.value),placeholder:"Prompt col (auto-detect)",className:"flex-1 px-2 py-1 text-xs 
bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-blue-500 focus:outline-none"})}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx("button",{onClick:K,disabled:!m.trim()||n[m.trim()],className:"flex-1 px-2 py-1.5 text-sm bg-blue-600 hover:bg-blue-500 disabled:bg-gray-700 disabled:text-gray-500 rounded text-white transition-colors",children:n[m.trim()]?"Loading...":"Load"}),e.jsx("button",{onClick:()=>o(!1),className:"px-3 py-1.5 text-sm bg-gray-700 hover:bg-gray-600 rounded text-gray-300 transition-colors",children:"Cancel"})]})]}):e.jsx("button",{onClick:()=>{$(null),o(!0),y(""),N(""),W("train"),Y("")},className:"w-full px-3 py-2 text-sm bg-blue-600 hover:bg-blue-500 rounded text-white font-medium transition-colors",children:"+ Add Repo"})})]})}function de(t){if(!t)return[{text:"(no response)",className:"text-gray-500 italic"}];const l=[],n=t.split(`
|
| 3 |
`);for(let c=0;c<n.length;c++){const p=n[c],s=p.toLowerCase().trim();let x="text-gray-300";s.startsWith("wait")||s.startsWith("hmm")||s.startsWith("but wait")?x="text-yellow-400":s.startsWith("let me try")||s.startsWith("let me reconsider")||s.startsWith("let me think")?x="text-cyan-400":s.startsWith("so the answer")||s.startsWith("so the expression")||s.startsWith("therefore")||s.startsWith("the final")?x="text-green-400 font-bold":s.startsWith("i give up")||s.startsWith("i can't find")||s.startsWith("i'm stuck")||s.startsWith("i'm sorry")?x="text-red-400 font-bold":p.includes("=")&&/[+\-*/]/.test(p)&&(x="text-gray-100"),l.push({text:p,className:x}),c<n.length-1&&l.push({text:`
|
| 4 |
`,className:""})}return l}function ue(t){if(!t||!t.trim())return[];try{const l=JSON.parse(t);if(Array.isArray(l)&&l.length>0&&l[0].role!==void 0)return l.map(n=>({role:String(n.role||"unknown"),content:String(n.content??"")}))}catch{}if(t.includes("<|im_start|>"))return t.split("<|im_start|>").filter(Boolean).map(n=>{const c=n.indexOf(`
|
frontend/dist/assets/{PlanRevisionsApp-BplajecZ.js → PlanRevisionsApp-CUfcVsmt.js}
RENAMED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import{r as i,j as t}from"./index-
|
| 2 |
`),n=p.split(`
|
| 3 |
`),d=new Set(x),m=new Set(n),s=[];let o=0,a=0;for(;o<x.length||a<n.length;){const u=x[o],l=n[a];o>=x.length?(s.push({text:n[a],type:"add"}),a++):a>=n.length?(s.push({text:x[o],type:"del"}),o++):u===l?(s.push({text:u,type:"same"}),o++,a++):m.has(u)?d.has(l)?(s.push({text:u,type:"del"}),s.push({text:l,type:"add"}),o++,a++):(s.push({text:l,type:"add"}),a++):(s.push({text:u,type:"del"}),o++)}return s}function _({prev:r,curr:p}){const x=i.useMemo(()=>S(r,p),[r,p]);return t.jsx("pre",{className:"text-xs font-mono whitespace-pre-wrap leading-relaxed",children:x.map((n,d)=>n.type==="same"?t.jsx("span",{className:"text-gray-300",children:n.text+`
|
| 4 |
`},d):n.type==="add"?t.jsx("span",{className:"bg-green-900/40 text-green-300",children:"+ "+n.text+`
|
|
|
|
| 1 |
+
import{r as i,j as t}from"./index-BewwkiJG.js";function S(r,p){const x=r.split(`
|
| 2 |
`),n=p.split(`
|
| 3 |
`),d=new Set(x),m=new Set(n),s=[];let o=0,a=0;for(;o<x.length||a<n.length;){const u=x[o],l=n[a];o>=x.length?(s.push({text:n[a],type:"add"}),a++):a>=n.length?(s.push({text:x[o],type:"del"}),o++):u===l?(s.push({text:u,type:"same"}),o++,a++):m.has(u)?d.has(l)?(s.push({text:u,type:"del"}),s.push({text:l,type:"add"}),o++,a++):(s.push({text:l,type:"add"}),a++):(s.push({text:u,type:"del"}),o++)}return s}function _({prev:r,curr:p}){const x=i.useMemo(()=>S(r,p),[r,p]);return t.jsx("pre",{className:"text-xs font-mono whitespace-pre-wrap leading-relaxed",children:x.map((n,d)=>n.type==="same"?t.jsx("span",{className:"text-gray-300",children:n.text+`
|
| 4 |
`},d):n.type==="add"?t.jsx("span",{className:"bg-green-900/40 text-green-300",children:"+ "+n.text+`
|
frontend/dist/assets/{ScoutRunsApp-xRBbKpfW.js → ScoutRunsApp-B_i5X87r.js}
RENAMED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import{r as n,j as e}from"./index-
|
| 2 |
|
| 3 |
---
|
| 4 |
|
|
|
|
| 1 |
+
import{r as n,j as e}from"./index-BewwkiJG.js";const S={reasoning:{border:"border-purple-700",labelColor:"text-purple-400",bg:"bg-purple-950/30"},tool_call:{border:"border-blue-700",labelColor:"text-blue-400",bg:"bg-blue-950/30"},tool_result:{border:"border-gray-600",labelColor:"text-gray-400",bg:"bg-gray-800/30"},final_answer:{border:"border-green-700",labelColor:"text-green-400",bg:"bg-green-950/30"},unknown:{border:"border-gray-700",labelColor:"text-gray-500",bg:""}};function N(a){if(!a)return[];const l=[],p=a.split(`
|
| 2 |
|
| 3 |
---
|
| 4 |
|
frontend/dist/assets/{SelectedToolsApp-ButjYRD7.js → SelectedToolsApp-BBGQyqN9.js}
RENAMED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import{r as n,j as e}from"./index-
|
| 2 |
|
| 3 |
---
|
| 4 |
|
|
@@ -8,4 +8,4 @@ import{r as n,j as e}from"./index-CR1kJ7mL.js";const E={reasoning:{border:"borde
|
|
| 8 |
[Tool Result]
|
| 9 |
`,p=s.indexOf(u),d=s.indexOf(`]
|
| 10 |
`),x=d>=0?s.slice(11,d).trim():"unknown";if(p>=0){const g=s.slice(0,p).replace(/^\[Tool Call:[^\]]*\]\n/,"").trim(),b=s.slice(p+u.length).trim();l.push({type:"tool_call",label:`Tool Call: ${x}`,content:g}),l.push({type:"tool_result",label:"Tool Result",content:b})}else{const g=s.replace(/^\[Tool Call:[^\]]*\]\n/,"").trim();l.push({type:"tool_call",label:`Tool Call: ${x}`,content:g})}}else s.startsWith(`[Final Answer]
|
| 11 |
-
`)?l.push({type:"final_answer",label:"Final Answer",content:s.slice(15).trim()}):s&&l.push({type:"unknown",label:"—",content:s})}return l}function
|
|
|
|
| 1 |
+
import{r as n,j as e}from"./index-BewwkiJG.js";const E={reasoning:{border:"border-purple-700",labelColor:"text-purple-400",bg:"bg-purple-950/30"},tool_call:{border:"border-blue-700",labelColor:"text-blue-400",bg:"bg-blue-950/30"},tool_result:{border:"border-gray-600",labelColor:"text-gray-400",bg:"bg-gray-800/30"},final_answer:{border:"border-green-700",labelColor:"text-green-400",bg:"bg-green-950/30"},unknown:{border:"border-gray-700",labelColor:"text-gray-500",bg:""}};function $(a){if(!a)return[];const l=[],i=a.split(/\n\n(?=\[)/);for(const o of i){const s=o.trim();s.startsWith("[Reasoning]:")?l.push({type:"reasoning",label:"Reasoning",content:s.slice(12).trim()}):s.startsWith("[Tool call]")?l.push({type:"tool_call",label:"Tool Call",content:s.slice(11).trim()}):s.startsWith("[Tool result]:")?l.push({type:"tool_result",label:"Tool Result",content:s.slice(14).trim()}):s.startsWith("[Final answer]:")?l.push({type:"final_answer",label:"Final Answer",content:s.slice(15).trim()}):s&&l.push({type:"unknown",label:"—",content:s})}return l}function F(a){if(!a)return[];const l=[],i=a.split(`
|
| 2 |
|
| 3 |
---
|
| 4 |
|
|
|
|
| 8 |
[Tool Result]
|
| 9 |
`,p=s.indexOf(u),d=s.indexOf(`]
|
| 10 |
`),x=d>=0?s.slice(11,d).trim():"unknown";if(p>=0){const g=s.slice(0,p).replace(/^\[Tool Call:[^\]]*\]\n/,"").trim(),b=s.slice(p+u.length).trim();l.push({type:"tool_call",label:`Tool Call: ${x}`,content:g}),l.push({type:"tool_result",label:"Tool Result",content:b})}else{const g=s.replace(/^\[Tool Call:[^\]]*\]\n/,"").trim();l.push({type:"tool_call",label:`Tool Call: ${x}`,content:g})}}else s.startsWith(`[Final Answer]
|
| 11 |
+
`)?l.push({type:"final_answer",label:"Final Answer",content:s.slice(15).trim()}):s&&l.push({type:"unknown",label:"—",content:s})}return l}function _({blocks:a}){return a.length===0?e.jsx("div",{className:"text-gray-500 text-xs italic",children:"No steps."}):e.jsx("div",{className:"space-y-2",children:a.map((l,i)=>{const o=E[l.type];return e.jsxs("div",{className:`border-l-2 ${o.border} ${o.bg} pl-3 py-1.5 rounded-r`,children:[e.jsx("div",{className:`text-[10px] font-bold uppercase tracking-widest mb-1 ${o.labelColor}`,children:l.label??l.type}),e.jsx("pre",{className:"text-xs text-gray-300 whitespace-pre-wrap font-mono leading-relaxed",children:l.content})]},i)})})}function W(){const[a,l]=n.useState([]),[i,o]=n.useState(!0),[s,u]=n.useState(null),[p,d]=n.useState(0),[x,g]=n.useState(""),[b,f]=n.useState("all"),[h,k]=n.useState("traj_summary_orig_ext"),[j,C]=n.useState({});n.useEffect(()=>{o(!0),l([]),d(0),fetch(`/api/selected-tools/?variant=${encodeURIComponent(h)}`).then(t=>{if(!t.ok)throw new Error(t.statusText);return t.json()}).then(t=>{l(t.rows),t.variants&&C(t.variants),o(!1)}).catch(t=>{u(t.message),o(!1)})},[h]);const y=n.useMemo(()=>{let t=a;if(b==="direct"&&(t=t.filter(c=>c.direct_answer)),b==="searched"&&(t=t.filter(c=>!c.direct_answer)),x.trim()){const c=x.toLowerCase();t=t.filter(q=>q.query_id.includes(c))}return t},[a,x,b]),r=y[p]??null,T=n.useMemo(()=>r?$(r.excerpt):[],[r]),S=n.useMemo(()=>r?F(r.new_trajectory):[],[r]),N=a.filter(t=>t.direct_answer).length,R=a.length?Math.round(100*N/a.length):0,m=a.filter(t=>t.correct!==null&&t.correct!==void 0),v=m.filter(t=>t.correct===!0).length,w=m.length?Math.round(100*v/m.length):null;return i?e.jsx("div",{className:"h-full flex items-center justify-center text-gray-400",children:"Loading from HuggingFace…"}):s?e.jsxs("div",{className:"h-full flex items-center justify-center text-red-400",children:["Error: ",s]}):e.jsxs("div",{className:"h-full flex overflow-hidden bg-gray-950 
text-gray-100",children:[e.jsxs("div",{className:"w-64 shrink-0 flex flex-col border-r border-gray-800 bg-gray-900",children:[Object.keys(j).length>0&&e.jsxs("div",{className:"px-2 py-2 border-b border-gray-800 bg-gray-900/80",children:[e.jsx("div",{className:"text-[10px] text-gray-500 uppercase tracking-widest mb-1",children:"Variant"}),e.jsx("select",{value:h,onChange:t=>{k(t.target.value),f("all"),g("")},className:"w-full bg-gray-800 border border-gray-700 text-gray-200 text-xs rounded px-2 py-1.5",children:Object.entries(j).map(([t,c])=>e.jsx("option",{value:t,children:c.label},t))})]}),e.jsxs("div",{className:"px-3 py-2 border-b border-gray-800 bg-gray-900/80 space-y-1.5",children:[w!==null&&e.jsxs("div",{children:[e.jsx("div",{className:"text-[10px] text-gray-500 uppercase tracking-widest mb-0.5",children:"Accuracy"}),e.jsxs("div",{className:"text-lg font-bold text-sky-400",children:[w,"%"]}),e.jsxs("div",{className:"text-[10px] text-gray-600",children:[v," / ",m.length," correct"]})]}),e.jsxs("div",{children:[e.jsx("div",{className:"text-[10px] text-gray-500 uppercase tracking-widest mb-0.5",children:"Direct-answer rate"}),e.jsxs("div",{className:"text-base font-bold text-emerald-400",children:[R,"%"]}),e.jsxs("div",{className:"text-[10px] text-gray-600",children:[N," / ",a.length," no tool calls"]})]})]}),e.jsx("div",{className:"flex gap-1 px-2 py-2 border-b border-gray-800",children:["all","direct","searched"].map(t=>e.jsx("button",{onClick:()=>{f(t),d(0)},className:`flex-1 text-[10px] py-1 rounded border transition-colors capitalize ${b===t?t==="direct"?"bg-emerald-900/60 border-emerald-600 text-emerald-300":t==="searched"?"bg-blue-900/60 border-blue-600 text-blue-300":"bg-gray-700 border-gray-500 text-gray-200":"bg-gray-800/50 border-gray-700 text-gray-500 hover:border-gray-500"}`,children:t},t))}),e.jsxs("div",{className:"px-2 py-1.5 border-b border-gray-800",children:[e.jsx("input",{type:"text",placeholder:"Search query 
ID…",value:x,onChange:t=>{g(t.target.value),d(0)},className:"w-full bg-gray-800 border border-gray-700 text-gray-200 text-xs rounded px-2 py-1.5 placeholder-gray-600"}),e.jsxs("div",{className:"text-[10px] text-gray-600 mt-1",children:[y.length," / ",a.length]})]}),e.jsx("div",{className:"flex-1 overflow-y-auto",children:y.map((t,c)=>e.jsxs("button",{onClick:()=>d(c),className:`w-full text-left px-3 py-2 border-b border-gray-800/50 text-xs transition-colors ${p===c?"bg-blue-900/40 text-blue-200 border-l-2 border-l-blue-500":"text-gray-400 hover:bg-gray-800"}`,children:[e.jsxs("div",{className:"flex items-center justify-between",children:[e.jsxs("span",{className:"font-medium text-gray-200",children:["#",t.query_id]}),e.jsxs("div",{className:"flex items-center gap-1",children:[t.correct===!0&&e.jsx("span",{className:"text-[9px] px-1.5 py-0.5 rounded-full bg-green-900/60 text-green-400 border border-green-800",children:"✓"}),(t.correct===!1||t.correct==null&&t.new_status==="incomplete")&&e.jsx("span",{className:"text-[9px] px-1.5 py-0.5 rounded-full bg-red-900/60 text-red-400 border border-red-800",children:"✗"}),t.direct_answer?e.jsx("span",{className:"text-[9px] px-1.5 py-0.5 rounded-full bg-emerald-900/60 text-emerald-400 border border-emerald-800",children:"direct"}):e.jsxs("span",{className:"text-[9px] px-1.5 py-0.5 rounded-full bg-blue-900/60 text-blue-400 border border-blue-800",children:[t.total_tool_calls," calls"]})]})]}),e.jsxs("div",{className:"text-[10px] text-gray-600 mt-0.5",children:["k=",t.k_effective," selected steps"]})]},t.query_id))})]}),r?e.jsxs("div",{className:"flex-1 flex flex-col min-w-0 overflow-hidden",children:[e.jsxs("div",{className:"px-4 py-2 bg-gray-900/60 border-b border-gray-800 shrink-0",children:[e.jsxs("div",{className:"flex items-center gap-3 flex-wrap",children:[e.jsxs("span",{className:"text-sm font-medium text-gray-100",children:["Query #",r.query_id]}),r.correct===!0&&e.jsx("span",{className:"text-xs px-2 py-0.5 rounded-full 
bg-green-900/50 text-green-300 border border-green-800 font-semibold",children:"✓ Correct"}),(r.correct===!1||r.correct==null&&r.new_status==="incomplete")&&e.jsx("span",{className:"text-xs px-2 py-0.5 rounded-full bg-red-900/50 text-red-300 border border-red-800 font-semibold",children:"✗ Incorrect"}),r.direct_answer?e.jsx("span",{className:"text-xs px-2 py-0.5 rounded-full bg-emerald-900/50 text-emerald-300 border border-emerald-800",children:"Direct answer"}):e.jsxs("span",{className:"text-xs px-2 py-0.5 rounded-full bg-blue-900/50 text-blue-300 border border-blue-800",children:[r.total_tool_calls," tool calls"]}),e.jsxs("span",{className:"text-xs text-gray-500",children:["k=",r.k_effective," steps selected"]}),e.jsx("span",{className:`text-xs px-2 py-0.5 rounded-full ${r.new_status==="completed"?"bg-gray-800 text-gray-400":"bg-amber-900/50 text-amber-300"}`,children:r.new_status})]}),(r.question||r.new_status==="incomplete")&&e.jsxs("div",{className:"mt-1.5 text-xs text-gray-200 leading-snug bg-gray-800/40 rounded px-2 py-1.5 border border-gray-700",children:[e.jsx("span",{className:"text-[10px] font-bold uppercase tracking-widest text-violet-400 mr-2",children:"Question"}),r.question||e.jsx("span",{className:"text-gray-500 italic",children:"not available (run was incomplete)"})]}),r.correct_answer&&e.jsxs("div",{className:"mt-1 text-xs leading-snug bg-gray-800/40 rounded px-2 py-1.5 border border-gray-700 flex items-start gap-2",children:[e.jsx("span",{className:"text-[10px] font-bold uppercase tracking-widest text-green-400 shrink-0 mt-0.5",children:"Answer"}),e.jsx("span",{className:r.correct===!0?"text-green-300":r.correct===!1?"text-red-300":"text-gray-300",children:r.correct_answer})]}),r.selected_indices.length>0&&e.jsxs("div",{className:"mt-1.5 flex items-center gap-1.5 flex-wrap",children:[e.jsx("span",{className:"text-[10px] font-bold uppercase tracking-widest text-gray-500",children:"Selected 
steps"}),r.selected_indices.map(t=>e.jsxs("span",{className:"text-[10px] px-1.5 py-0.5 rounded bg-gray-800 border border-gray-700 text-gray-300 font-mono",children:["#",t]},t))]}),r.rationale&&e.jsxs("div",{className:"mt-1.5 text-xs text-gray-400 leading-snug bg-gray-800/50 rounded px-2 py-1.5 border border-gray-700",children:[e.jsx("span",{className:"text-[10px] font-bold uppercase tracking-widest text-amber-500 mr-2",children:"Rationale"}),r.rationale]})]}),e.jsxs("div",{className:"flex-1 flex overflow-hidden min-w-0",children:[e.jsxs("div",{className:"flex-1 flex flex-col min-w-0 border-r border-gray-800 overflow-hidden",children:[e.jsxs("div",{className:"px-3 py-1.5 bg-gray-900/40 border-b border-gray-800 shrink-0",children:[e.jsx("span",{className:"text-[11px] font-semibold text-amber-400 uppercase tracking-widest",children:"Selected Tool Calls"}),e.jsxs("span",{className:"text-[10px] text-gray-600 ml-2",children:["reference trajectory · ",r.k_effective," steps"]})]}),e.jsx("div",{className:"flex-1 overflow-y-auto p-3",children:e.jsx(_,{blocks:T})})]}),e.jsxs("div",{className:"flex-1 flex flex-col min-w-0 overflow-hidden",children:[e.jsxs("div",{className:"px-3 py-1.5 bg-gray-900/40 border-b border-gray-800 shrink-0",children:[e.jsx("span",{className:"text-[11px] font-semibold text-sky-400 uppercase tracking-widest",children:"New Trajectory"}),e.jsx("span",{className:"text-[10px] text-gray-600 ml-2",children:"gpt-oss-120b · conditioned on selected steps"})]}),e.jsx("div",{className:"flex-1 overflow-y-auto p-3",children:e.jsx(_,{blocks:S})})]})]})]}):e.jsx("div",{className:"flex-1 flex items-center justify-center text-gray-500",children:"No query selected."})]})}export{W as default};
|
frontend/dist/assets/{SftDiffApp-Bh7C7VBz.js → SftDiffApp-D-nUoSxu.js}
RENAMED
|
@@ -1 +1 @@
|
|
| 1 |
-
import{r as d,j as e}from"./index-
|
|
|
|
| 1 |
+
import{r as d,j as e}from"./index-BewwkiJG.js";const N={user:{bg:"bg-amber-950/40",border:"border-amber-600",label:"text-amber-400",tag:"USER MESSAGE"},reasoning:{bg:"bg-purple-950/40",border:"border-purple-600",label:"text-purple-400",tag:"REASONING"},tool_call:{bg:"bg-blue-950/40",border:"border-blue-600",label:"text-blue-400",tag:"TOOL CALL"},tool_resp:{bg:"bg-gray-800/60",border:"border-gray-600",label:"text-gray-400",tag:"TOOL RESPONSE"}};function g({kind:r,label:t,children:l}){const s=N[r];return e.jsxs("div",{className:`border-l-2 ${s.border} ${s.bg} pl-3 py-2 rounded-r mb-2`,children:[e.jsx("div",{className:`text-[9px] font-bold uppercase tracking-widest mb-1.5 ${s.label}`,children:t??s.tag}),l]})}function A(r){return r.split(/\n\n/).map(t=>t.trim()).filter(Boolean).flatMap(t=>{try{const l=JSON.parse(t);return typeof l=="object"&&l!==null&&"type"in l?[l]:[]}catch{return[]}})}function T({excerpt:r,userContent:t}){const l=d.useMemo(()=>A(r),[r]);return e.jsxs("div",{className:"space-y-0",children:[e.jsx(g,{kind:"user",children:e.jsx("pre",{className:"text-xs text-gray-200 whitespace-pre-wrap font-mono leading-relaxed",children:t})}),l.map((s,a)=>{const n=JSON.stringify(s,null,2);return s.type==="reasoning"?e.jsx(g,{kind:"reasoning",children:e.jsx("pre",{className:"text-xs text-gray-200 whitespace-pre-wrap font-mono leading-relaxed",children:n})},a):s.type==="function_call"?e.jsx(g,{kind:"tool_call",label:`TOOL CALL${s.name?`: ${s.name}`:""}`,children:e.jsx("pre",{className:"text-xs text-gray-200 whitespace-pre-wrap font-mono leading-relaxed",children:n})},a):s.type==="function_call_output"?e.jsx(g,{kind:"tool_resp",children:e.jsx("pre",{className:"text-xs text-gray-200 whitespace-pre-wrap font-mono leading-relaxed",children:n})},a):e.jsx("div",{className:"border-l-2 border-gray-700 pl-3 py-2 mb-2",children:e.jsx("pre",{className:"text-xs text-gray-400 whitespace-pre-wrap font-mono leading-relaxed",children:n})},a)})]})}function I(r){const 
t=[],l=/(<tool_call>[\s\S]*?<\/tool_call>)/g;let s=0,a;for(;(a=l.exec(r))!==null;){const i=r.slice(s,a.index).trim();i&&t.push({kind:"reasoning",text:i}),t.push({kind:"tool_call",text:a[1]}),s=a.index+a[1].length}const n=r.slice(s).trim();return n&&t.push({kind:"reasoning",text:n}),t}function M(r){const t=[],l=/(<think>[\s\S]*?<\/think>|<tool_call>[\s\S]*?<\/tool_call>)/g;let s=0,a;for(;(a=l.exec(r))!==null;){const i=r.slice(s,a.index).trim();i&&t.push({kind:"reasoning",text:i}),a[1].startsWith("<think>")?t.push({kind:"reasoning",text:a[1]}):t.push({kind:"tool_call",text:a[1]}),s=a.index+a[1].length}const n=r.slice(s).trim();return n&&t.push({kind:"reasoning",text:n}),t}function R({content:r,template:t}){const l=t==="qwen"?M(r):I(r),s=l.some(n=>n.kind==="tool_call"),a=N.reasoning;return e.jsxs("div",{className:`border-l-2 ${a.border} ${a.bg} pl-3 py-2 rounded-r mb-2`,children:[e.jsx("div",{className:`text-[9px] font-bold uppercase tracking-widest mb-1.5 ${a.label}`,children:s?"REASONING + TOOL CALL":"REASONING"}),l.map((n,i)=>{if(n.kind==="tool_call"){const p=N.tool_call;return e.jsxs("div",{className:`border-l-2 ${p.border} ${p.bg} pl-2 py-1.5 rounded-r mb-1.5`,children:[e.jsx("div",{className:`text-[9px] font-bold uppercase tracking-widest mb-1 ${p.label}`,children:"TOOL CALL"}),e.jsx("pre",{className:"text-xs text-gray-200 whitespace-pre-wrap font-mono leading-relaxed",children:n.text})]},i)}return e.jsx("pre",{className:"text-xs text-gray-200 whitespace-pre-wrap font-mono leading-relaxed mb-1.5",children:n.text},i)})]})}function G({messages:r,template:t}){return e.jsx("div",{className:"space-y-0",children:r.map((l,s)=>l.role==="user"&&s===0?e.jsx(g,{kind:"user",children:e.jsx("pre",{className:"text-xs text-gray-200 whitespace-pre-wrap font-mono leading-relaxed",children:l.content})},s):l.role==="user"?e.jsx(g,{kind:"tool_resp",children:e.jsx("pre",{className:"text-xs text-gray-200 whitespace-pre-wrap font-mono 
leading-relaxed",children:l.content})},s):e.jsx(R,{content:l.content,template:t},s))})}function D(){return e.jsxs("div",{className:"flex items-center gap-4 px-4 py-1.5 bg-gray-900/80 border-b border-gray-800 shrink-0 flex-wrap",children:[e.jsx("span",{className:"text-[9px] font-bold uppercase tracking-widest text-gray-500 mr-1",children:"Legend"}),Object.entries(N).map(([r,t])=>e.jsxs("span",{className:`flex items-center gap-1.5 text-[10px] ${t.label}`,children:[e.jsx("span",{className:`inline-block w-2.5 h-2.5 rounded-sm border ${t.border} ${t.bg}`}),t.tag]},r))]})}function B({value:r,onChange:t}){return e.jsxs("select",{value:r,onChange:l=>t(l.target.value),className:"text-xs bg-gray-800 border border-gray-600 text-gray-200 rounded px-2 py-1 ml-2 cursor-pointer",children:[e.jsx("option",{value:"gpt-oss",children:"gpt-oss"}),e.jsx("option",{value:"qwen",children:"qwen"})]})}function P(){var v,k,_,S;const[r,t]=d.useState([]),[l,s]=d.useState(!0),[a,n]=d.useState(null),[i,p]=d.useState(0),[f,E]=d.useState(""),[b,q]=d.useState("gpt-oss");d.useEffect(()=>{s(!0),fetch("/api/sft-diff/").then(o=>{if(!o.ok)throw new Error(o.statusText);return o.json()}).then(o=>{t(o.rows),s(!1)}).catch(o=>{n(o.message),s(!1)})},[]);const w=d.useMemo(()=>{if(!f.trim())return r;const o=f.toLowerCase();return r.filter(x=>{var m,u,j,h;return x.query_id.toLowerCase().includes(o)||(((u=(m=x.messages_gpt)==null?void 0:m[0])==null?void 0:u.content)??((h=(j=x.messages_qwen)==null?void 0:j[0])==null?void 0:h.content)??"").toLowerCase().includes(o)})},[r,f]),c=w[i]??null,y=c?b==="qwen"?c.messages_qwen:c.messages_gpt:null,$=((k=(v=c==null?void 0:c.messages_gpt)==null?void 0:v[0])==null?void 0:k.content)??((S=(_=c==null?void 0:c.messages_qwen)==null?void 0:_[0])==null?void 0:S.content)??"";return l?e.jsx("div",{className:"h-full flex items-center justify-center text-gray-400",children:"Loading…"}):a?e.jsxs("div",{className:"h-full flex items-center justify-center text-red-400",children:["Error: 
",a]}):e.jsxs("div",{className:"h-full flex overflow-hidden bg-gray-950 text-gray-100",children:[e.jsxs("div",{className:"w-60 shrink-0 flex flex-col border-r border-gray-800 bg-gray-900",children:[e.jsxs("div",{className:"px-2 py-1.5 border-b border-gray-800",children:[e.jsx("input",{type:"text",placeholder:"Search query ID or question…",value:f,onChange:o=>{E(o.target.value),p(0)},className:"w-full bg-gray-800 border border-gray-700 text-gray-200 text-xs rounded px-2 py-1.5 placeholder-gray-600"}),e.jsxs("div",{className:"text-[10px] text-gray-600 mt-1",children:[w.length," / ",r.length]})]}),e.jsx("div",{className:"flex-1 overflow-y-auto",children:w.map((o,x)=>{var h,L,O,C;const m=((L=(h=o.messages_gpt)==null?void 0:h[0])==null?void 0:L.content)??((C=(O=o.messages_qwen)==null?void 0:O[0])==null?void 0:C.content)??"",u=m.match(/Question:\s*([\s\S]{0,120})/),j=u?u[1].trim().replace(/\n/g," "):m.slice(0,80);return e.jsxs("button",{onClick:()=>p(x),className:`w-full text-left px-3 py-2 border-b border-gray-800/50 text-xs transition-colors ${i===x?"bg-blue-900/40 text-blue-200 border-l-2 border-l-blue-500":"text-gray-400 hover:bg-gray-800"}`,children:[e.jsxs("div",{className:"font-medium text-gray-200 mb-0.5",children:["#",o.query_id]}),e.jsx("div",{className:"text-[10px] text-gray-500 leading-snug line-clamp-2",children:j})]},o.query_id+x)})})]}),c?e.jsxs("div",{className:"flex-1 flex flex-col min-w-0 overflow-hidden",children:[e.jsx(D,{}),e.jsxs("div",{className:"flex-1 flex overflow-hidden min-w-0",children:[e.jsxs("div",{className:"flex-1 flex flex-col min-w-0 border-r border-gray-800 overflow-hidden",children:[e.jsxs("div",{className:"px-3 py-1.5 bg-gray-900/60 border-b border-gray-800 shrink-0",children:[e.jsx("span",{className:"text-[11px] font-semibold text-amber-400 uppercase tracking-widest",children:"Original"}),e.jsx("span",{className:"text-[10px] text-gray-500 ml-2",children:"excerpt field — raw JSON items"})]}),e.jsx("div",{className:"flex-1 
overflow-y-auto p-3",children:e.jsx(T,{excerpt:c.excerpt,userContent:$})})]}),e.jsxs("div",{className:"flex-1 flex flex-col min-w-0 overflow-hidden",children:[e.jsxs("div",{className:"px-3 py-1.5 bg-gray-900/60 border-b border-gray-800 shrink-0 flex items-center",children:[e.jsx("span",{className:"text-[11px] font-semibold text-sky-400 uppercase tracking-widest",children:"Converted"}),e.jsx(B,{value:b,onChange:q}),y&&e.jsxs("span",{className:"text-[10px] text-gray-500 ml-2",children:[y.length," messages"]})]}),e.jsx("div",{className:"flex-1 overflow-y-auto p-3",children:y?e.jsx(G,{messages:y,template:b}):e.jsxs("div",{className:"text-gray-500 text-xs italic mt-4",children:["Not available for ",b," template."]})})]})]})]}):e.jsx("div",{className:"flex-1 flex items-center justify-center text-gray-500",children:"No record selected."})]})}export{P as default};
|
frontend/dist/assets/{TrajExtApp-DNAeWbp6.js → TrajExtApp-Y8_orlLr.js}
RENAMED
|
@@ -1 +1 @@
|
|
| 1 |
-
import{r as l,j as e}from"./index-
|
|
|
|
| 1 |
+
import{r as l,j as e}from"./index-BewwkiJG.js";function S(r){try{return JSON.parse(r).map(o=>({type:["reasoning","tool_call","tool_result","final_answer"].includes(o.type)?o.type:"unknown",content:o.content}))}catch{return[]}}function T(r){if(!r||r==="(no trajectory steps)")return[];const n=[],o=r.split(/\n\n(?=\[(?:Reasoning\]:|Tool call\]|Tool result\]:|Final answer\]:))/);for(const d of o){const s=d.trim();s.startsWith("[Reasoning]:")?n.push({type:"reasoning",content:s.slice(12).trim()}):s.startsWith("[Tool call]")?n.push({type:"tool_call",content:s.slice(11).trim()}):s.startsWith("[Tool result]:")?n.push({type:"tool_result",content:s.slice(14).trim()}):s.startsWith("[Final answer]:")?n.push({type:"final_answer",content:s.slice(15).trim()}):s&&n.push({type:"unknown",content:s})}return n}const k={reasoning:{border:"border-purple-700",label:"Reasoning",labelColor:"text-purple-400",bg:"bg-purple-950/30"},tool_call:{border:"border-blue-700",label:"Tool Call",labelColor:"text-blue-400",bg:"bg-blue-950/30"},tool_result:{border:"border-gray-600",label:"Tool Result",labelColor:"text-gray-400",bg:"bg-gray-800/30"},final_answer:{border:"border-green-700",label:"Final Answer",labelColor:"text-green-400",bg:"bg-green-950/30"},unknown:{border:"border-gray-700",label:"—",labelColor:"text-gray-500",bg:""}};function C({row:r}){const n=l.useMemo(()=>r.trajectory_blocks?S(r.trajectory_blocks):T(r.trajectory_text),[r.trajectory_blocks,r.trajectory_text]);return n.length===0?e.jsx("div",{className:"text-gray-500 text-xs",children:"No trajectory steps."}):e.jsx("div",{className:"space-y-2",children:n.map((o,d)=>{const s=k[o.type];return e.jsxs("div",{className:`border-l-2 ${s.border} ${s.bg} pl-3 py-1 rounded-r`,children:[e.jsx("div",{className:`text-[10px] font-bold uppercase tracking-widest mb-1 ${s.labelColor}`,children:s.label}),e.jsx("pre",{className:"text-xs text-gray-300 whitespace-pre-wrap font-mono leading-relaxed",children:o.content})]},d)})})}const E={traj_ext:"Full 
Trajectory (Tags)",traj_orig_ext:"Full Trajectory (Original Messages)",traj_summary_ext:"Summary (Tags)",traj_summary_orig_ext:"Summary (Original Messages)",traj_summary_ext_selected_tools:"Selected Tools (Tags)",traj_summary_orig_ext_selected_tools:"Selected Tools (Original Messages)"},f=r=>E[r]??r,M=["trajectory","prompt","both"];function R(){const[r,n]=l.useState([]),[o,d]=l.useState(!0),[s,j]=l.useState(null),[b,p]=l.useState(0),[c,_]=l.useState("trajectory"),[u,N]=l.useState(""),[x,v]=l.useState("all"),[m,w]=l.useState({});l.useEffect(()=>{d(!0),fetch("/api/traj-ext/").then(t=>{if(!t.ok)throw new Error(t.statusText);return t.json()}).then(t=>{n(t.rows),d(!1)}).catch(t=>{j(t.message),d(!1)})},[]);const h=l.useMemo(()=>Array.from(new Set(r.map(i=>i.run_name??""))).filter(Boolean).sort(),[r]),y=l.useMemo(()=>{let t=r;if(x!=="all"&&(t=t.filter(g=>g.run_name===x)),!u.trim())return t;const i=u.toLowerCase();return t.filter(g=>g.query_id.includes(i)||g.question.toLowerCase().includes(i))},[r,u,x]),a=y[b]??null;return l.useEffect(()=>{a&&(c!=="prompt"&&c!=="both"||m[a.run_id]||fetch(`/api/traj-ext/${encodeURIComponent(a.run_id)}`).then(t=>t.json()).then(t=>w(i=>({...i,[a.run_id]:t.formatted_prompt??""}))).catch(()=>{}))},[a==null?void 0:a.run_id,c]),o?e.jsx("div",{className:"h-full flex items-center justify-center text-gray-400",children:"Loading traj-ext data from HuggingFace…"}):s?e.jsxs("div",{className:"h-full flex items-center justify-center text-red-400",children:["Error: ",s]}):e.jsxs("div",{className:"h-full flex overflow-hidden bg-gray-950 text-gray-100",children:[e.jsxs("div",{className:"w-64 shrink-0 flex flex-col border-r border-gray-800 bg-gray-900",children:[e.jsxs("div",{className:"p-2 border-b border-gray-800 space-y-1.5",children:[h.length>0&&e.jsxs("select",{value:x,onChange:t=>{v(t.target.value),p(0)},className:"w-full bg-gray-800 border border-gray-700 text-gray-200 text-xs rounded px-2 py-1.5",children:[e.jsxs("option",{value:"all",children:["All 
runs (",r.length,")"]}),h.map(t=>e.jsx("option",{value:t,children:f(t)},t))]}),e.jsx("input",{type:"text",placeholder:"Search queries…",value:u,onChange:t=>{N(t.target.value),p(0)},className:"w-full bg-gray-800 border border-gray-700 text-gray-200 text-xs rounded px-2 py-1.5 placeholder-gray-600"}),e.jsxs("div",{className:"text-[10px] text-gray-600",children:[y.length,x!=="all"||u?` / ${r.length}`:""," trajectories"]})]}),e.jsx("div",{className:"flex-1 overflow-y-auto",children:y.map((t,i)=>e.jsxs("button",{onClick:()=>p(i),className:`w-full text-left px-3 py-2 border-b border-gray-800/50 text-xs transition-colors ${b===i?"bg-blue-900/40 text-blue-200 border-l-2 border-l-blue-500":"text-gray-400 hover:bg-gray-800"}`,children:[e.jsxs("div",{className:"font-medium text-gray-200",children:["#",t.query_id]}),e.jsxs("div",{className:"text-gray-500 truncate mt-0.5",children:[t.question.slice(0,60),"…"]}),e.jsxs("div",{className:"flex gap-2 mt-1 text-[10px] text-gray-600 flex-wrap items-center",children:[e.jsxs("span",{children:[t.n_tool_calls," tools"]}),e.jsxs("span",{children:[t.n_reasoning_steps," reasoning"]}),e.jsx("span",{className:t.status==="completed"?"text-green-600":"text-amber-600",children:t.status}),t.n_tool_calls===0&&e.jsx("span",{className:"px-1 py-0.5 rounded bg-yellow-900/50 text-yellow-300 font-semibold",children:"direct"})]}),t.run_name&&x==="all"&&e.jsx("div",{className:"mt-0.5 text-[9px] text-indigo-400 truncate",children:f(t.run_name)})]},t.run_id))})]}),a?e.jsxs("div",{className:"flex-1 flex flex-col min-w-0 overflow-hidden",children:[e.jsxs("div",{className:"px-4 py-2 bg-gray-900/60 border-b border-gray-800 shrink-0",children:[e.jsxs("div",{className:"flex items-center gap-3 flex-wrap",children:[e.jsxs("span",{className:"text-sm font-medium text-gray-100",children:["Query #",a.query_id]}),e.jsx("span",{className:`text-xs px-2 py-0.5 rounded-full ${a.status==="completed"?"bg-green-900/50 text-green-300":"bg-amber-900/50 
text-amber-300"}`,children:a.status}),e.jsxs("span",{className:"text-xs text-gray-500",children:[a.n_tool_calls," tool calls · ",a.n_reasoning_steps," reasoning"]}),a.n_tool_calls===0&&e.jsx("span",{className:"px-1.5 py-0.5 rounded bg-yellow-900/50 text-yellow-300 text-xs font-semibold",children:"direct"}),e.jsx("div",{className:"ml-auto flex gap-1",children:M.map(t=>e.jsx("button",{onClick:()=>_(t),className:`px-2 py-0.5 text-xs rounded border transition-colors capitalize ${c===t?"bg-blue-700 border-blue-500 text-white":"bg-gray-800 border-gray-700 text-gray-400 hover:border-gray-500"}`,children:t},t))})]}),e.jsx("div",{className:"mt-1 text-xs text-gray-300 leading-snug",children:a.question})]}),e.jsxs("div",{className:"flex-1 overflow-y-auto p-4 space-y-4 min-w-0",children:[(c==="prompt"||c==="both")&&e.jsxs("div",{children:[c==="both"&&e.jsx("div",{className:"text-xs font-semibold text-gray-400 uppercase tracking-wide mb-2",children:"Prompt"}),m[a.run_id]?e.jsx("pre",{className:"text-xs font-mono whitespace-pre-wrap text-gray-200 bg-gray-900 border border-gray-700 rounded p-3 leading-relaxed",children:m[a.run_id]}):e.jsx("div",{className:"text-gray-500 text-xs p-3",children:"Loading prompt…"})]}),(c==="trajectory"||c==="both")&&e.jsxs("div",{children:[c==="both"&&e.jsx("div",{className:"text-xs font-semibold text-gray-400 uppercase tracking-wide mb-2 mt-4",children:"Trajectory"}),e.jsx(C,{row:a})]})]})]}):e.jsx("div",{className:"flex-1 flex items-center justify-center text-gray-500",children:"No trajectory selected."})]})}export{R as default};
|
frontend/dist/assets/{VisualizerApp-D_QCMHfk.js → VisualizerApp-B36fbafZ.js}
RENAMED
|
@@ -1,2 +1,2 @@
|
|
| 1 |
-
const __vite__mapDeps=(i,m=__vite__mapDeps,d=(m.f||(m.f=["assets/ModelApp-
|
| 2 |
-
import{u as o,j as e,n as d,r as l,_ as t}from"./index-
|
|
|
|
| 1 |
+
const __vite__mapDeps=(i,m=__vite__mapDeps,d=(m.f||(m.f=["assets/ModelApp-Cw242jQN.js","assets/index-BewwkiJG.js","assets/index-CxBbYLho.css","assets/PlanRevisionsApp-CUfcVsmt.js","assets/TrajExtApp-Y8_orlLr.js","assets/SelectedToolsApp-BBGQyqN9.js","assets/ScoutRunsApp-B_i5X87r.js"])))=>i.map(i=>d[i]);
|
| 2 |
+
import{u as o,j as e,n as d,r as l,_ as t}from"./index-BewwkiJG.js";const c=l.lazy(()=>t(()=>import("./ModelApp-Cw242jQN.js"),__vite__mapDeps([0,1,2]))),n=l.lazy(()=>t(()=>import("./PlanRevisionsApp-CUfcVsmt.js"),__vite__mapDeps([3,1,2]))),x=l.lazy(()=>t(()=>import("./TrajExtApp-Y8_orlLr.js"),__vite__mapDeps([4,1,2]))),m=l.lazy(()=>t(()=>import("./SelectedToolsApp-BBGQyqN9.js"),__vite__mapDeps([5,1,2]))),u=l.lazy(()=>t(()=>import("./ScoutRunsApp-B_i5X87r.js"),__vite__mapDeps([6,1,2]))),i=[{id:"model",label:"Model Trace",activeClass:"border-blue-500 text-blue-400"},{id:"plan-revisions",label:"Plan Revisions",activeClass:"border-amber-500 text-amber-400"},{id:"traj-ext",label:"Traj Ext",activeClass:"border-emerald-500 text-emerald-400"},{id:"selected-tools",label:"Selected Tools",activeClass:"border-amber-400 text-amber-300"},{id:"scout-runs",label:"Scout Runs",activeClass:"border-cyan-500 text-cyan-400"}],p=new Set(i.map(r=>r.id));function _(){const r=o(),s=p.has(r.tab)?r.tab:"model";return e.jsxs("div",{className:"h-full flex flex-col",children:[e.jsx("div",{className:"flex items-center border-b border-gray-800 bg-gray-900/50 px-2 shrink-0",children:i.map(a=>e.jsx("button",{onClick:()=>d({page:"viz",tab:a.id}),className:`px-5 py-2 text-sm font-medium border-b-2 transition-colors ${s===a.id?a.activeClass:"border-transparent text-gray-500 hover:text-gray-300"}`,children:a.label},a.id))}),e.jsx("div",{className:"flex-1 overflow-hidden",children:e.jsxs(l.Suspense,{fallback:e.jsx("div",{className:"flex items-center justify-center h-full text-gray-500",children:"Loading..."}),children:[s==="model"&&e.jsx("div",{className:"theme-model 
h-full",children:e.jsx(c,{})}),s==="plan-revisions"&&e.jsx("div",{className:"h-full",children:e.jsx(n,{})}),s==="traj-ext"&&e.jsx("div",{className:"h-full",children:e.jsx(x,{})}),s==="selected-tools"&&e.jsx("div",{className:"h-full",children:e.jsx(m,{})}),s==="scout-runs"&&e.jsx("div",{className:"h-full",children:e.jsx(u,{})})]})})]})}export{_ as default};
|
frontend/dist/assets/{index-CR1kJ7mL.js → index-BewwkiJG.js}
RENAMED
|
The diff for this file is too large to render.
See raw diff
|
|
|
frontend/dist/index.html
CHANGED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
<meta charset="UTF-8" />
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 6 |
<title>RACA Dashboard</title>
|
| 7 |
-
<script type="module" crossorigin src="/assets/index-
|
| 8 |
<link rel="stylesheet" crossorigin href="/assets/index-CxBbYLho.css">
|
| 9 |
</head>
|
| 10 |
<body class="bg-gray-950 text-gray-100">
|
|
|
|
| 4 |
<meta charset="UTF-8" />
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 6 |
<title>RACA Dashboard</title>
|
| 7 |
+
<script type="module" crossorigin src="/assets/index-BewwkiJG.js"></script>
|
| 8 |
<link rel="stylesheet" crossorigin href="/assets/index-CxBbYLho.css">
|
| 9 |
</head>
|
| 10 |
<body class="bg-gray-950 text-gray-100">
|
frontend/src/selected_tools/SelectedToolsApp.tsx
CHANGED
|
@@ -14,6 +14,7 @@ interface Row {
|
|
| 14 |
tool_call_counts: Record<string, number>;
|
| 15 |
total_tool_calls: number;
|
| 16 |
status: string;
|
|
|
|
| 17 |
question: string;
|
| 18 |
correct_answer: string;
|
| 19 |
correct: boolean | null;
|
|
@@ -262,7 +263,7 @@ export default function SelectedToolsApp() {
|
|
| 262 |
{row.correct === true && (
|
| 263 |
<span className="text-[9px] px-1.5 py-0.5 rounded-full bg-green-900/60 text-green-400 border border-green-800">✓</span>
|
| 264 |
)}
|
| 265 |
-
{(row.correct === false || (row.correct == null && row.
|
| 266 |
<span className="text-[9px] px-1.5 py-0.5 rounded-full bg-red-900/60 text-red-400 border border-red-800">✗</span>
|
| 267 |
)}
|
| 268 |
{row.direct_answer
|
|
@@ -288,7 +289,7 @@ export default function SelectedToolsApp() {
|
|
| 288 |
{current.correct === true && (
|
| 289 |
<span className="text-xs px-2 py-0.5 rounded-full bg-green-900/50 text-green-300 border border-green-800 font-semibold">✓ Correct</span>
|
| 290 |
)}
|
| 291 |
-
{(current.correct === false || (current.correct == null && current.
|
| 292 |
<span className="text-xs px-2 py-0.5 rounded-full bg-red-900/50 text-red-300 border border-red-800 font-semibold">✗ Incorrect</span>
|
| 293 |
)}
|
| 294 |
{current.direct_answer
|
|
@@ -296,12 +297,12 @@ export default function SelectedToolsApp() {
|
|
| 296 |
: <span className="text-xs px-2 py-0.5 rounded-full bg-blue-900/50 text-blue-300 border border-blue-800">{current.total_tool_calls} tool calls</span>
|
| 297 |
}
|
| 298 |
<span className="text-xs text-gray-500">k={current.k_effective} steps selected</span>
|
| 299 |
-
<span className={`text-xs px-2 py-0.5 rounded-full ${current.
|
| 300 |
-
{current.
|
| 301 |
</span>
|
| 302 |
</div>
|
| 303 |
{/* Question */}
|
| 304 |
-
{(current.question || current.
|
| 305 |
<div className="mt-1.5 text-xs text-gray-200 leading-snug bg-gray-800/40 rounded px-2 py-1.5 border border-gray-700">
|
| 306 |
<span className="text-[10px] font-bold uppercase tracking-widest text-violet-400 mr-2">Question</span>
|
| 307 |
{current.question || <span className="text-gray-500 italic">not available (run was incomplete)</span>}
|
|
|
|
| 14 |
tool_call_counts: Record<string, number>;
|
| 15 |
total_tool_calls: number;
|
| 16 |
status: string;
|
| 17 |
+
new_status: string;
|
| 18 |
question: string;
|
| 19 |
correct_answer: string;
|
| 20 |
correct: boolean | null;
|
|
|
|
| 263 |
{row.correct === true && (
|
| 264 |
<span className="text-[9px] px-1.5 py-0.5 rounded-full bg-green-900/60 text-green-400 border border-green-800">✓</span>
|
| 265 |
)}
|
| 266 |
+
{(row.correct === false || (row.correct == null && row.new_status === "incomplete")) && (
|
| 267 |
<span className="text-[9px] px-1.5 py-0.5 rounded-full bg-red-900/60 text-red-400 border border-red-800">✗</span>
|
| 268 |
)}
|
| 269 |
{row.direct_answer
|
|
|
|
| 289 |
{current.correct === true && (
|
| 290 |
<span className="text-xs px-2 py-0.5 rounded-full bg-green-900/50 text-green-300 border border-green-800 font-semibold">✓ Correct</span>
|
| 291 |
)}
|
| 292 |
+
{(current.correct === false || (current.correct == null && current.new_status === "incomplete")) && (
|
| 293 |
<span className="text-xs px-2 py-0.5 rounded-full bg-red-900/50 text-red-300 border border-red-800 font-semibold">✗ Incorrect</span>
|
| 294 |
)}
|
| 295 |
{current.direct_answer
|
|
|
|
| 297 |
: <span className="text-xs px-2 py-0.5 rounded-full bg-blue-900/50 text-blue-300 border border-blue-800">{current.total_tool_calls} tool calls</span>
|
| 298 |
}
|
| 299 |
<span className="text-xs text-gray-500">k={current.k_effective} steps selected</span>
|
| 300 |
+
<span className={`text-xs px-2 py-0.5 rounded-full ${current.new_status === "completed" ? "bg-gray-800 text-gray-400" : "bg-amber-900/50 text-amber-300"}`}>
|
| 301 |
+
{current.new_status}
|
| 302 |
</span>
|
| 303 |
</div>
|
| 304 |
{/* Question */}
|
| 305 |
+
{(current.question || current.new_status === "incomplete") && (
|
| 306 |
<div className="mt-1.5 text-xs text-gray-200 leading-snug bg-gray-800/40 rounded px-2 py-1.5 border border-gray-700">
|
| 307 |
<span className="text-[10px] font-bold uppercase tracking-widest text-violet-400 mr-2">Question</span>
|
| 308 |
{current.question || <span className="text-gray-500 italic">not available (run was incomplete)</span>}
|
patch_sel_tools_test300_questions.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Patch the selected-tools test300 HF datasets to fill missing question fields
|
| 4 |
+
from the BrowseComp JSONL. The eval files for some queries omit the 'question'
|
| 5 |
+
field; this script fills them using query_id -> query from BrowseComp.
|
| 6 |
+
|
| 7 |
+
Python env: /scratch/hc3337/envs/raca-py312/bin/python
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
import json, sys, os
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
# Give HF_HOME a default value, leaving any value already present in the
# environment untouched.
if "HF_HOME" not in os.environ:
    os.environ["HF_HOME"] = "/scratch/hc3337/.cache/huggingface"

# JSONL file that load_bc_questions() reads to build the query_id -> question map.
BC_JSONL = Path(
    "/scratch/hc3337/projects/BrowseComp-Plus/data/browsecomp_plus_decrypted_test300.jsonl"
)

# Hub dataset repos that main() passes to patch_repo(), one at a time.
# NOTE(review): the random-seed entries go 0, 1, 3, 4, 5, 6, 7 -- seed2 is not
# listed; confirm that is intentional.
REPOS = [
    # model-selected variants
    "timchen0618/browsecomp-plus-sel-tools-test300-gpt-oss-120b-less-chars-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-gpt-oss-120b-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-gemini-2p5-pro-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-gemini-3p1-pro-v1",
    # random-selection variants
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed0-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed1-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed3-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed4-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed5-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed6-v1",
    "timchen0618/browsecomp-plus-sel-tools-test300-random-seed7-v1",
]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def load_bc_questions(path: Path) -> dict[int, str]:
    """Build a ``query_id -> question`` lookup from a BrowseComp JSONL file.

    Each non-blank line is parsed as one JSON object. The question text is
    taken from the ``query`` key, falling back to ``question``. Records with
    no ``query_id`` or no question text are left out of the result.

    Lines that are not valid JSON objects, and records whose ``query_id``
    cannot be converted to ``int``, are skipped and counted rather than
    aborting the whole load; the number skipped is reported on stderr.

    Args:
        path: Location of the JSONL file.

    Returns:
        Mapping from integer query id to question text. If an id occurs more
        than once, the last occurrence wins.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    qmap: dict[int, str] = {}
    skipped = 0
    with path.open("r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                d = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"WARNING {path}:{lineno}: invalid JSON ({e})", file=sys.stderr)
                skipped += 1
                continue
            if not isinstance(d, dict):
                # A bare list/number/string has no keys to look up.
                print(f"WARNING {path}:{lineno}: not a JSON object", file=sys.stderr)
                skipped += 1
                continue
            qid = d.get("query_id")
            q = d.get("query") or d.get("question") or ""
            if qid is None or not q:
                continue
            try:
                key = int(qid)
            except (TypeError, ValueError):
                print(
                    f"WARNING {path}:{lineno}: non-integer query_id {qid!r}",
                    file=sys.stderr,
                )
                skipped += 1
                continue
            qmap[key] = q
    print(f"Loaded {len(qmap)} questions from {path}", file=sys.stderr)
    if skipped:
        print(f"Skipped {skipped} malformed records in {path}", file=sys.stderr)
    return qmap
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def patch_repo(repo: str, bc_questions: dict) -> None:
    """Fill empty ``question`` fields in one hub dataset and push it back.

    The ``train`` split of ``repo`` is loaded and every row is copied into a
    plain dict. For a row whose ``question`` is missing or empty, the text is
    looked up in ``bc_questions`` by ``int(row["query_id"])``; rows with no
    match are kept unchanged. If no row was filled, nothing is pushed.
    Otherwise a new dataset is built from the copied rows and pushed to the
    same repo as its ``train`` split.

    Args:
        repo: Hub dataset repo id to load from and push to.
        bc_questions: Mapping from integer query id to question text.
    """
    from datasets import load_dataset, Dataset

    print(f"\nLoading {repo}...", file=sys.stderr)
    ds = load_dataset(repo, split="train")
    print(f" {len(ds)} rows, columns: {ds.column_names}", file=sys.stderr)

    patched = []
    n_filled = 0
    for record in ds:
        entry = dict(record)
        # The id is converted before the question check, so a non-numeric id
        # raises even for rows that already have a question.
        key = int(entry["query_id"])
        patched.append(entry)
        if entry.get("question"):
            continue
        replacement = bc_questions.get(key, "")
        if replacement:
            entry["question"] = replacement
            n_filled += 1

    print(f" Filled {n_filled} missing questions from BrowseComp JSONL", file=sys.stderr)
    if not n_filled:
        print(" No changes needed — skipping push.", file=sys.stderr)
        return

    rebuilt = Dataset.from_list(patched)
    rebuilt.push_to_hub(
        repo,
        split="train",
        commit_message="Fill missing question fields from BrowseComp JSONL",
    )
    print(f" Pushed {len(patched)} rows to {repo}.", file=sys.stderr)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def main():
    """Patch every repo in ``REPOS`` with questions loaded from ``BC_JSONL``.

    A failure while patching one repo is reported on stderr and does not stop
    the remaining repos from being processed. Once all repos have been
    attempted, any that failed are listed and the process exits with status 1
    so the caller can tell the run was not fully successful.

    Raises:
        SystemExit: With code 1 if patching at least one repo raised.
    """
    bc_questions = load_bc_questions(BC_JSONL)
    failed = []
    for repo in REPOS:
        try:
            patch_repo(repo, bc_questions)
        except Exception as e:
            # Top-level boundary: record the failure and keep going so one
            # bad repo does not block the others.
            print(f"ERROR patching {repo}: {e}", file=sys.stderr)
            failed.append(repo)
    print("\nALL DONE", file=sys.stderr)
    if failed:
        print(
            f"{len(failed)} of {len(REPOS)} repos failed: {', '.join(failed)}",
            file=sys.stderr,
        )
        sys.exit(1)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
if __name__ == "__main__":
|
| 89 |
+
main()
|