holocron-trask-http / packages /trask /src /web-research-subprocess.ts
th3w1zard1's picture
Deploy trask-http web research from community-bots@6f6709a0116dc99200b9a9ba4cf65f3bf5a649c9
ddf7640 verified
import { spawn } from "node:child_process";
import { existsSync } from "node:fs";
import { dirname, join, resolve } from "node:path";
import type { WebResearchRuntimeConfig } from "@openkotor/config";
/**
 * Shape of the JSON document that `runHeadlessWebResearch` parses from the
 * runner's stdout.
 */
export interface HeadlessWebResearchResult {
  /** Report text. `runHeadlessWebResearch` rejects results where this is not a non-blank string. */
  readonly report: string;
  /**
   * Optional URL/domain bookkeeping returned next to the report. Every list
   * is optional and may also be `null`.
   * NOTE(review): the meaning of each list is inferred from its name only —
   * confirm against the Python runner that produces it.
   */
  readonly research_information?: {
    readonly source_urls?: ReadonlyArray<string> | null;
    readonly cited_urls?: ReadonlyArray<string> | null;
    readonly retrieved_urls?: ReadonlyArray<string> | null;
    readonly visited_urls?: ReadonlyArray<string> | null;
    readonly query_domains?: ReadonlyArray<string> | null;
    readonly allowed_url_prefixes?: ReadonlyArray<string> | null;
    readonly rejected_source_urls?: ReadonlyArray<string> | null;
  };
}
/** One selectable model, as returned by `listHeadlessWebResearchModels`. */
export interface HeadlessWebResearchModelOption {
  /** Model identifier (built via `normalizeWebResearchModelId` when produced by this module). */
  readonly id: string;
  /** Human-readable display name derived from the id. */
  readonly label: string;
  /** Human-readable provider name derived from the id. */
  readonly provider: string;
  /** Optional flag; the model-list parser in this module never sets it. */
  readonly recommended?: boolean;
}
/**
 * stdin payload for `scripts/trask_web_research.py`.
 *
 * The object is JSON-serialized and written to the runner's stdin as-is.
 * `query_domains` and `allowed_url_prefixes` are additionally exported to the
 * child as newline-joined environment variables by `spawnHeadless`.
 */
export interface HeadlessWebResearchRequestPayload {
  /** The research question; the only required field. */
  readonly query: string;
  readonly custom_prompt?: string;
  readonly source_urls?: ReadonlyArray<string>;
  readonly query_domains?: ReadonlyArray<string>;
  readonly allowed_url_prefixes?: ReadonlyArray<string>;
  /** Model identifier forwarded to the runner. */
  readonly model?: string;
  readonly report_type?: string;
  readonly report_source?: string;
}
/**
 * Legacy name; a pure type alias of `HeadlessWebResearchResult`.
 * @deprecated Use HeadlessWebResearchResult
 */
export type HeadlessAiResearchWizardResult = HeadlessWebResearchResult;
/**
 * Legacy name; a pure type alias of `HeadlessWebResearchRequestPayload`.
 * @deprecated Use HeadlessWebResearchRequestPayload
 */
export type HeadlessAiResearchWizardRequestPayload = HeadlessWebResearchRequestPayload;
/**
 * Legacy name; a pure type alias of `HeadlessWebResearchModelOption`.
 * @deprecated Use HeadlessWebResearchModelOption
 */
export type HeadlessAiResearchWizardModelOption = HeadlessWebResearchModelOption;
/**
 * Walks upward from `startDir` looking for the first directory that contains
 * `scripts/trask_web_research.py`.
 *
 * @param startDir Directory to start from (resolved to an absolute path first).
 * @param maxHops Maximum number of directories to inspect, including `startDir`.
 * @returns The matching directory, or `process.cwd()` when the filesystem root
 *   is reached or the hop budget runs out without a match.
 */
const findRepoRoot = (startDir: string, maxHops = 24): string => {
  let candidate = resolve(startDir);
  let hopsLeft = maxHops;
  while (hopsLeft > 0) {
    if (existsSync(join(candidate, "scripts", "trask_web_research.py"))) {
      return candidate;
    }
    const oneUp = dirname(candidate);
    // dirname() of the filesystem root is the root itself: nothing left to climb.
    if (oneUp === candidate) {
      break;
    }
    candidate = oneUp;
    hopsLeft -= 1;
  }
  return process.cwd();
};
/** Builds the conventional runner location, `<repoRoot>/scripts/trask_web_research.py`. */
const defaultScriptPath = (repoRoot: string): string => {
  const segments = [repoRoot, "scripts", "trask_web_research.py"];
  return join(...segments);
};
/**
 * Runs `python script` in `cwd`, feeds it `payload` as UTF-8 JSON on stdin and
 * collects everything it writes to stdout/stderr.
 *
 * The child inherits `process.env`, overlaid with:
 * - `TRASK_ALLOWED_QUERY_DOMAINS` / `TRASK_ALLOWED_URL_PREFIXES`: the matching
 *   payload lists joined with newlines (empty string when absent);
 * - `PYTHONIOENCODING=utf-8` and `PYTHONUTF8=1`.
 *
 * @param python Executable to launch.
 * @param script Path passed as the single argument to `python`.
 * @param cwd Working directory of the child.
 * @param payload Request object serialized to the child's stdin.
 * @param timeoutMs Milliseconds to wait before sending SIGTERM and rejecting.
 * @returns Trimmed stdout/stderr and the exit code once the child has closed.
 *   A non-zero exit code is NOT a rejection; callers inspect `code`.
 * @throws (rejects) on spawn failure, on timeout, or when the payload cannot
 *   be written for a reason other than the child having closed its stdin.
 */
const spawnHeadless = (
  python: string,
  script: string,
  cwd: string,
  payload: HeadlessWebResearchRequestPayload,
  timeoutMs: number,
): Promise<{ stdout: string; stderr: string; code: number | null }> => {
  return new Promise((resolvePromise, rejectPromise) => {
    const child = spawn(python, [script], {
      cwd,
      stdio: ["pipe", "pipe", "pipe"],
      env: {
        ...process.env,
        TRASK_ALLOWED_QUERY_DOMAINS: (payload.query_domains ?? []).join("\n"),
        TRASK_ALLOWED_URL_PREFIXES: (payload.allowed_url_prefixes ?? []).join("\n"),
        PYTHONIOENCODING: "utf-8",
        PYTHONUTF8: "1",
      },
    });
    const chunksOut: Buffer[] = [];
    const chunksErr: Buffer[] = [];
    // Guards against settling the promise twice (timeout vs. error vs. close).
    let settled = false;
    child.stdout?.on("data", (chunk: Buffer | string) => {
      chunksOut.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
    });
    child.stderr?.on("data", (chunk: Buffer | string) => {
      chunksErr.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
    });
    const timer = setTimeout(() => {
      if (settled) {
        return;
      }
      settled = true;
      child.kill("SIGTERM");
      rejectPromise(new Error(`Trask web research runner timed out after ${timeoutMs}ms`));
    }, timeoutMs);
    const fail = (error: unknown): void => {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(timer);
      rejectPromise(error);
    };
    child.on("error", fail);
    child.on("close", (exitCode) => {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(timer);
      resolvePromise({
        stdout: Buffer.concat(chunksOut).toString("utf8").trim(),
        stderr: Buffer.concat(chunksErr).toString("utf8").trim(),
        code: exitCode,
      });
    });
    // Write failures on the stdin pipe are reported asynchronously through an
    // 'error' event, which the try/catch below cannot see. Without a listener
    // an early-exiting child (EPIPE) would raise an unhandled 'error' event
    // and take the whole host process down.
    child.stdin?.on("error", (error: Error) => {
      const errorCode = (error as NodeJS.ErrnoException).code;
      if (errorCode === "EPIPE" || errorCode === "EOF" || errorCode === "ERR_STREAM_DESTROYED") {
        // The child closed stdin or exited before reading everything; its exit
        // code and stderr (delivered by 'close') are the useful diagnosis.
        return;
      }
      child.kill("SIGTERM");
      fail(error);
    });
    try {
      child.stdin?.write(Buffer.from(JSON.stringify(payload), "utf8"));
      child.stdin?.end();
    } catch (error) {
      fail(error);
    }
  });
};
/**
 * Runs the headless research script once and returns its parsed result.
 *
 * The repository root comes from `config.repoRoot` or, failing that, from a
 * search upward from the current working directory. The script path comes
 * from `config.headlessScriptPath` or the default location under that root.
 * The interpreter is `config.pythonExecutable` or `"python"`.
 *
 * @param config Runtime configuration (paths, interpreter, timeout).
 * @param payload Request forwarded to the script on stdin.
 * @returns The JSON object printed by the script.
 * @throws Error when the script file is missing, the process exits non-zero,
 *   stdout is not valid JSON, or the parsed value has no non-blank string
 *   `report`. Spawn failures and timeouts propagate from the subprocess helper.
 */
export const runHeadlessWebResearch = async (
  config: WebResearchRuntimeConfig,
  payload: HeadlessWebResearchRequestPayload,
): Promise<HeadlessWebResearchResult> => {
  const repoRoot = config.repoRoot?.trim() || findRepoRoot(process.cwd());
  const script = (config.headlessScriptPath?.trim() || defaultScriptPath(repoRoot)).trim();
  if (!existsSync(script)) {
    throw new Error(
      `Trask web research script not found: ${script}. Run scripts/bootstrap_trask_research.sh or set TRASK_WEB_RESEARCH_SCRIPT.`,
    );
  }
  const python = config.pythonExecutable?.trim() || "python";
  const { stdout, stderr, code } = await spawnHeadless(python, script, repoRoot, payload, config.timeoutMs);
  if (code !== 0) {
    throw new Error(`Trask web research runner exited ${code ?? "unknown"}: ${stderr || stdout || "no output"}`);
  }
  let parsed: unknown;
  try {
    parsed = JSON.parse(stdout);
  } catch (error) {
    if (error instanceof SyntaxError) {
      throw new Error(`Trask web research runner returned invalid JSON: ${stdout.slice(0, 400)}`);
    }
    throw error;
  }
  // Valid JSON is not necessarily an object: a bare `null` would make a direct
  // `.report` access throw a TypeError, so narrow before touching properties.
  const report =
    typeof parsed === "object" && parsed !== null ? (parsed as { report?: unknown }).report : undefined;
  if (typeof report !== "string" || !report.trim()) {
    throw new Error("Web research runner returned empty report.");
  }
  return parsed as HeadlessWebResearchResult;
};
/**
 * Legacy export name bound to the very same function as `runHeadlessWebResearch`.
 * @deprecated Use runHeadlessWebResearch
 */
export const runHeadlessGptResearcher = runHeadlessWebResearch;
/**
 * Derives a display label from a model id: keeps only the text after the last
 * `/`, turns runs of `-`/`_` into single spaces, capitalizes the first
 * character of every word, and upper-cases the whole words `Gpt` and `Ai`.
 */
const labelFromModelId = (modelId: string): string => {
  // lastIndexOf() yields -1 when there is no slash, so slice(0) keeps the whole id.
  const lastSegment = modelId.slice(modelId.lastIndexOf("/") + 1);
  const spaced = lastSegment.replace(/[-_]+/gu, " ");
  const titled = spaced.replace(/\b\w/gu, (char) => char.toUpperCase());
  const withGpt = titled.replace(/\bGpt\b/gu, "GPT");
  return withGpt.replace(/\bAi\b/gu, "AI");
};
/**
 * Derives a display name for the provider encoded in a model id.
 *
 * When the id contains `:`, only the text between the first and the second
 * colon is considered. If that text contains `/`, its first path segment is
 * the provider; otherwise the generic name "Trask web research" is used. The
 * chosen name then goes through the same word capitalization as every other
 * value, with `Ai` upper-cased and the exact name `Openrouter` spelled
 * `OpenRouter`.
 */
const providerFromModelId = (modelId: string): string => {
  let scoped = modelId;
  if (modelId.includes(":")) {
    // split(":", 2) keeps just the first two pieces, so anything after a second colon is dropped.
    const [, afterFirstColon] = modelId.split(":", 2);
    scoped = afterFirstColon ?? modelId;
  }
  let provider = "Trask web research";
  if (scoped.includes("/")) {
    const [head] = scoped.split("/", 1);
    provider = head ?? scoped;
  }
  const spaced = provider.replace(/[-_]+/gu, " ");
  const titled = spaced.replace(/\b\w/gu, (char) => char.toUpperCase());
  return titled.replace(/\bAi\b/gu, "AI").replace(/^Openrouter$/u, "OpenRouter");
};
/**
 * Puts a raw model name into `prefix:model` form.
 *
 * - Blank input yields `""`.
 * - An id that already has a prefix is returned trimmed but otherwise as-is.
 * - Ids starting with `openrouter/` get the `openrouter:` prefix; everything
 *   else gets `litellm:`.
 *
 * A colon only counts as a prefix separator when it appears before the first
 * `/` (or when the id has no `/` at all). A colon after a slash is a tag
 * suffix, as in `openrouter/vendor/model:free`; such ids are still
 * un-prefixed and must be prefixed like any other.
 */
const normalizeWebResearchModelId = (modelId: string): string => {
  const trimmed = modelId.trim();
  if (!trimmed) return "";
  const colonAt = trimmed.indexOf(":");
  const slashAt = trimmed.indexOf("/");
  const hasPrefix = colonAt !== -1 && (slashAt === -1 || colonAt < slashAt);
  if (hasPrefix) return trimmed;
  return trimmed.startsWith("openrouter/") ? `openrouter:${trimmed}` : `litellm:${trimmed}`;
};
/**
 * Converts the model-list helper's stdout (a JSON array of model names) into
 * option objects.
 *
 * Non-string entries and names that normalize to an empty id are skipped, and
 * only the first occurrence of each normalized id is kept, in input order.
 * Valid JSON that is not an array yields an empty list; invalid JSON makes
 * `JSON.parse` throw.
 */
const parseModelList = (stdout: string): HeadlessWebResearchModelOption[] => {
  const decoded: unknown = JSON.parse(stdout);
  if (!Array.isArray(decoded)) {
    return [];
  }
  // A Map iterates in insertion order, which gives first-seen de-duplication.
  const optionsById = new Map<string, HeadlessWebResearchModelOption>();
  for (const entry of decoded) {
    if (typeof entry !== "string") {
      continue;
    }
    const id = normalizeWebResearchModelId(entry);
    if (id && !optionsById.has(id)) {
      optionsById.set(id, { id, label: labelFromModelId(id), provider: providerFromModelId(id) });
    }
  }
  return Array.from(optionsById.values());
};
/**
 * Asks a short inline Python snippet for the available free chat models and
 * returns them as option objects.
 *
 * The snippet puts `<repoRoot>/vendor/llm_fallbacks/src` on `sys.path`, reads
 * `FREE_CHAT_MODELS` from `llm_fallbacks.config` (falling back to
 * `llm_fallbacks.filter_models` if that fails) and prints at most 60 names
 * as a JSON array.
 *
 * @param config Runtime configuration; the wait is capped at 15 seconds or
 *   `config.timeoutMs`, whichever is smaller.
 * @throws Error on spawn failure, on timeout, or when Python exits non-zero.
 */
export const listHeadlessWebResearchModels = async (
  config: WebResearchRuntimeConfig,
): Promise<HeadlessWebResearchModelOption[]> => {
  const repoRoot = config.repoRoot?.trim() || findRepoRoot(process.cwd());
  const python = config.pythonExecutable?.trim() || "python";
  const pythonSource = [
    "import json, sys",
    "from pathlib import Path",
    "root = Path(sys.argv[1]).resolve()",
    "fallbacks = root / 'vendor' / 'llm_fallbacks' / 'src'",
    "sys.path.insert(0, str(fallbacks))",
    "try:",
    " from llm_fallbacks.config import FREE_CHAT_MODELS",
    " models = [name for name, _ in FREE_CHAT_MODELS]",
    "except Exception:",
    " from llm_fallbacks import filter_models",
    " models = list(filter_models(model_type='chat', free_only=True))",
    "print(json.dumps(models[:60]))",
  ].join("\n");
  const outcome = await new Promise<{ stdout: string; stderr: string; code: number | null }>(
    (resolvePromise, rejectPromise) => {
      const child = spawn(python, ["-c", pythonSource, repoRoot], {
        cwd: process.cwd(),
        stdio: ["ignore", "pipe", "pipe"],
        env: {
          ...process.env,
          PYTHONIOENCODING: "utf-8",
          PYTHONUTF8: "1",
        },
      });
      const stdoutParts: Buffer[] = [];
      const stderrParts: Buffer[] = [];
      // Whichever of timeout / error / close happens first settles the promise.
      let finished = false;
      const deadline = setTimeout(() => {
        if (finished) {
          return;
        }
        finished = true;
        child.kill("SIGTERM");
        rejectPromise(new Error("Trask web research model list timed out"));
      }, Math.min(config.timeoutMs, 15_000));
      const collectInto =
        (sink: Buffer[]) =>
        (chunk: Buffer | string): void => {
          sink.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
        };
      child.stdout?.on("data", collectInto(stdoutParts));
      child.stderr?.on("data", collectInto(stderrParts));
      child.on("error", (error) => {
        if (finished) {
          return;
        }
        finished = true;
        clearTimeout(deadline);
        rejectPromise(error);
      });
      child.on("close", (exitCode) => {
        if (finished) {
          return;
        }
        finished = true;
        clearTimeout(deadline);
        resolvePromise({
          stdout: Buffer.concat(stdoutParts).toString("utf8").trim(),
          stderr: Buffer.concat(stderrParts).toString("utf8").trim(),
          code: exitCode,
        });
      });
    },
  );
  if (outcome.code !== 0) {
    throw new Error(
      `Trask web research model list exited ${outcome.code ?? "unknown"}: ${outcome.stderr || outcome.stdout || "no output"}`,
    );
  }
  return parseModelList(outcome.stdout);
};
/**
 * Legacy export name bound to the very same function as `listHeadlessWebResearchModels`.
 * @deprecated Use listHeadlessWebResearchModels
 */
export const listHeadlessGptResearcherModels = listHeadlessWebResearchModels;
/**
 * Checks whether the research script can be launched by running it with
 * `--dry-run`.
 *
 * @param config Runtime configuration used to locate the repo root, the
 *   script and the Python interpreter (defaults to `"python"`).
 * @returns `true` only when the script exists and exits with code 0; `false`
 *   when the script is missing, the process cannot be spawned, or it exits
 *   with any other code.
 * @throws Error ("dry-run probe timed out") when the child has not closed
 *   within 15 seconds; the child is sent SIGTERM first.
 */
export const probeHeadlessWebResearchDryRun = async (config: WebResearchRuntimeConfig): Promise<boolean> => {
  const repoRoot = config.repoRoot?.trim() || findRepoRoot(process.cwd());
  const script = (config.headlessScriptPath?.trim() || defaultScriptPath(repoRoot)).trim();
  if (!existsSync(script)) {
    return false;
  }
  const python = config.pythonExecutable?.trim() || "python";
  const { code } = await new Promise<{ code: number | null }>((resolvePromise, rejectPromise) => {
    const child = spawn(python, [script, "--dry-run"], {
      cwd: repoRoot,
      stdio: ["ignore", "pipe", "pipe"],
      env: { ...process.env, PYTHONIOENCODING: "utf-8", PYTHONUTF8: "1" },
    });
    // The probe only cares about the exit code, but piped output still has to
    // be consumed: once the pipe buffer fills up, an unread child blocks on
    // write and the probe would spuriously run into its timeout.
    child.stdout?.resume();
    child.stderr?.resume();
    let settled = false;
    const timer = setTimeout(() => {
      if (settled) return;
      settled = true;
      child.kill("SIGTERM");
      rejectPromise(new Error("dry-run probe timed out"));
    }, 15_000);
    child.on("error", () => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      // A spawn failure (e.g. interpreter not found) is a negative probe result, not an exception.
      resolvePromise({ code: 1 });
    });
    child.on("close", (exitCode) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolvePromise({ code: exitCode });
    });
  });
  return code === 0;
};