Spaces:

OpenKotOR
/

holocron-trask-http

Configuration error

App Files Files Community

th3w1zard1 committed 12 days ago

Commit

ddf7640

verified ·

1 Parent(s): 017b801

Deploy trask-http web research from community-bots@6f6709a0116dc99200b9a9ba4cf65f3bf5a649c9

Browse files

Files changed (24) hide show

Dockerfile +22 -8
README.md +3 -3
apps/trask-http-server/src/main.ts +56 -4
docs/trask-research-backends.md +80 -0
package.json +4 -3
packages/config/src/index.test.ts +11 -13
packages/config/src/index.ts +70 -56
packages/personas/src/index.ts +1 -1
packages/retrieval/src/discord-permalink.test.ts +52 -0
packages/retrieval/src/discord-permalink.ts +98 -0
packages/retrieval/src/index.ts +31 -5
packages/trask-http/src/router.test.ts +26 -26
packages/trask-http/src/router.ts +29 -23
packages/trask/src/community-knowledge.test.ts +72 -0
packages/trask/src/community-knowledge.ts +61 -0
packages/trask/src/index.ts +3 -1
packages/trask/src/research-wizard.ts +17 -1270
packages/trask/src/web-research-subprocess.ts +337 -0
packages/trask/src/web-research.test.ts +38 -0
packages/trask/src/web-research.ts +1559 -0
pnpm-lock.yaml +2 -0
requirements-trask-research.txt +6 -0
scripts/trask_cache.py +254 -0
scripts/trask_web_research.py +511 -0

Dockerfile CHANGED Viewed

@@ -4,22 +4,33 @@
 FROM node:24-bookworm AS base
 WORKDIR /workspace
 ENV NODE_ENV=production
-RUN apt-get update \
-  && apt-get install -y --no-install-recommends python3 python3-venv python3-pip \
-  && rm -rf /var/lib/apt/lists/*
 RUN corepack enable && corepack prepare pnpm@10.11.0 --activate
 FROM base AS deps
 COPY package.json pnpm-lock.yaml pnpm-workspace.yaml tsconfig.base.json tsconfig.workspace.json ./
 COPY packages ./packages
 COPY apps/trask-http-server ./apps/trask-http-server
-COPY vendor/ai-researchwizard ./vendor/ai-researchwizard
-COPY vendor/llm_fallbacks ./vendor/llm_fallbacks
-COPY scripts/bootstrap_trask_gpt_researcher.sh ./scripts/bootstrap_trask_gpt_researcher.sh
 COPY data/ingest-worker ./data/ingest-worker
 RUN pnpm install --frozen-lockfile
 RUN pnpm --filter @openkotor/trask-http-server build
-RUN bash scripts/bootstrap_trask_gpt_researcher.sh
 FROM base AS runtime
 WORKDIR /workspace
@@ -28,11 +39,14 @@ ENV PORT=${PORT}
 ENV TRASK_HTTP_PORT=${PORT}
 ENV TRASK_WEB_ALLOW_ANONYMOUS=1
 ENV TRASK_WEB_DEFAULT_USER_ID=qa-webui
-ENV TRASK_GPT_RESEARCHER_PYTHON=/workspace/.venv-trask-gptr/bin/python
 ENV TRASK_PUBLIC_WEB_ORIGIN=https://openkotor.github.io
 ENV TRASK_RESEARCHWIZARD_TIMEOUT_MS=900000
 ENV INGEST_STATE_DIR=/workspace/data/ingest-worker
 ENV TRASK_HTTP_DATA_DIR=/workspace/data/trask-http-server
 COPY --from=deps /workspace /workspace
 EXPOSE 7860
 CMD ["node", "apps/trask-http-server/dist/main.js"]

 FROM node:24-bookworm AS base
 WORKDIR /workspace
 ENV NODE_ENV=production
 RUN corepack enable && corepack prepare pnpm@10.11.0 --activate
 FROM base AS deps
 COPY package.json pnpm-lock.yaml pnpm-workspace.yaml tsconfig.base.json tsconfig.workspace.json ./
 COPY packages ./packages
 COPY apps/trask-http-server ./apps/trask-http-server
 COPY data/ingest-worker ./data/ingest-worker
+COPY scripts/trask_web_research.py scripts/trask_web_research.py
+COPY scripts/trask_cache.py scripts/trask_cache.py
+COPY requirements-trask-research.txt requirements-trask-research.txt
 RUN pnpm install --frozen-lockfile
 RUN pnpm --filter @openkotor/trask-http-server build
+FROM base AS python-research
+WORKDIR /workspace
+RUN apt-get update \
+  && apt-get install -y --no-install-recommends \
+    python3 python3-pip python3-venv \
+    libxml2-dev libxslt1-dev gcc \
+  && rm -rf /var/lib/apt/lists/*
+COPY requirements-trask-research.txt /workspace/requirements-trask-research.txt
+COPY scripts/trask_web_research.py /workspace/scripts/trask_web_research.py
+COPY scripts/trask_cache.py /workspace/scripts/trask_cache.py
+RUN python3 -m venv /workspace/.venv-trask-research \
+  && /workspace/.venv-trask-research/bin/pip install --upgrade pip \
+  && /workspace/.venv-trask-research/bin/pip install -r /workspace/requirements-trask-research.txt \
+  && (/workspace/.venv-trask-research/bin/python -m crawl4ai install || true)
 FROM base AS runtime
 WORKDIR /workspace
 ENV TRASK_HTTP_PORT=${PORT}
 ENV TRASK_WEB_ALLOW_ANONYMOUS=1
 ENV TRASK_WEB_DEFAULT_USER_ID=qa-webui
 ENV TRASK_PUBLIC_WEB_ORIGIN=https://openkotor.github.io
 ENV TRASK_RESEARCHWIZARD_TIMEOUT_MS=900000
+ENV TRASK_WEB_RESEARCH_PYTHON=/workspace/.venv-trask-research/bin/python
 ENV INGEST_STATE_DIR=/workspace/data/ingest-worker
 ENV TRASK_HTTP_DATA_DIR=/workspace/data/trask-http-server
 COPY --from=deps /workspace /workspace
+COPY --from=python-research /workspace/.venv-trask-research /workspace/.venv-trask-research
+COPY --from=python-research /workspace/scripts/trask_web_research.py /workspace/scripts/trask_web_research.py
+COPY --from=python-research /workspace/scripts/trask_cache.py /workspace/scripts/trask_cache.py
 EXPOSE 7860
 CMD ["node", "apps/trask-http-server/dist/main.js"]

README.md CHANGED Viewed

@@ -8,9 +8,9 @@ app_port: 7860
 pinned: false
 ---
-# Holocron Trask HTTP (live research)
-Public `trask-http-server` with headless **ai-researchwizard** (GPTR), local ingest chunks, and `/api/trask/*` for Holocron.
 - Source: `apps/trask-http-server` in [OpenKotOR/community-bots](https://github.com/OpenKotOR/community-bots)
 - Deployed by `.github/workflows/trask-http-public.yml`
@@ -27,4 +27,4 @@ CI syncs from GitHub repository secrets when they exist. None are required for d
 | `TAVILY_API_KEY` | Optional web retrieval |
 | `FAST_LLM` / `SMART_LLM` / `STRATEGIC_LLM` | Optional GPTR model overrides |
-Without paid API keys, Trask uses **vendored `llm_fallbacks` free models** and **bundled local knowledge** (`data/ingest-worker`). Holocron still returns grounded answers for the canonical technical topics.

 pinned: false
 ---
+# Holocron Trask HTTP
+Public `trask-http-server` and `/api/trask/*` for Holocron. Docker image includes **Crawl4AI** research venv (`TRASK_WEB_RESEARCH_PYTHON`). Set `OPENAI_API_KEY` or `OPENROUTER_API_KEY` for live synthesis (`docs/trask-research-backends.md`).
 - Source: `apps/trask-http-server` in [OpenKotOR/community-bots](https://github.com/OpenKotOR/community-bots)
 - Deployed by `.github/workflows/trask-http-public.yml`
 | `TAVILY_API_KEY` | Optional web retrieval |
 | `FAST_LLM` / `SMART_LLM` / `STRATEGIC_LLM` | Optional GPTR model overrides |
+Holocron live research **requires a working Crawl4AI Python environment and an LLM API key** (`OPENAI_API_KEY` or `OPENROUTER_API_KEY`). On startup, `trask-http-server` runs a dry-run probe of the research subprocess and exposes `researchAvailable` (plus `researchUnavailableReason` when it is false) on `GET /api/trask/session`. Set `TRASK_STRICT_LLM_PROBE=1` to refuse boot when every provider fails.

apps/trask-http-server/src/main.ts CHANGED Viewed

@@ -11,7 +11,7 @@ import {
   resolveCorsHeaders,
 } from "@openkotor/platform";
 import { createChunkSearchProvider } from "@openkotor/retrieval";
-import { createResearchWizardClient } from "@openkotor/trask";
 import { createTraskHttpRouter, type TraskHttpAuth } from "@openkotor/trask-http";
 import express, { type Request, type Response } from "express";
@@ -61,12 +61,20 @@ const config = loadTraskHttpServerConfig();
 const resolveFromRoot = (p: string) => (path.isAbsolute(p) ? p : path.resolve(repoRoot, p));
 const queryRepository = new JsonTraskQueryRepository(resolveDataFile(resolveFromRoot(config.dataDir), "trask-queries.json"));
-const searchProvider = createChunkSearchProvider(resolveFromRoot(config.chunkDir));
-const researchWizard = createResearchWizardClient(config.researchWizard, config.ai);
 const runtime = {
   searchProvider,
-  researchWizard,
   queryRepository,
 };
@@ -151,11 +159,36 @@ app.use((req, res, next) => {
   next();
 });
 app.use(
   "/api/trask",
   createTraskHttpRouter({
     runtime,
     auth: createWebAuth(config),
   }),
 );
@@ -182,6 +215,25 @@ const { server, listen } = createNodeApiHost({
 listen(config.port, () => {
   logger.info(`Trask HTTP API listening on port ${config.port}`);
 });
 process.on("SIGINT", () => {

   resolveCorsHeaders,
 } from "@openkotor/platform";
 import { createChunkSearchProvider } from "@openkotor/retrieval";
+import { createWebResearchClient, probeHeadlessWebResearchDryRun } from "@openkotor/trask";
 import { createTraskHttpRouter, type TraskHttpAuth } from "@openkotor/trask-http";
 import express, { type Request, type Response } from "express";
 const resolveFromRoot = (p: string) => (path.isAbsolute(p) ? p : path.resolve(repoRoot, p));
 const queryRepository = new JsonTraskQueryRepository(resolveDataFile(resolveFromRoot(config.dataDir), "trask-queries.json"));
+const discordGuildId =
+  process.env.TRASK_DISCORD_GUILD_ID?.trim()
+  || process.env.DISCORD_TARGET_GUILD_ID?.trim()
+  || undefined;
+const searchProvider = createChunkSearchProvider(resolveFromRoot(config.chunkDir), {
+  ...(discordGuildId ? { discordGuildId } : {}),
+});
+const webResearch = createWebResearchClient(config.webResearch, config.ai, {
+  localSearchProvider: searchProvider,
+});
 const runtime = {
   searchProvider,
+  webResearch,
   queryRepository,
 };
   next();
 });
+const hasLlmRewriteKey = Boolean(config.ai.openAiApiKey?.trim());
+const researchUnavailableReason = (): string => {
+  if (!hasLlmRewriteKey) {
+    return "Set OPENAI_API_KEY or OPENROUTER_API_KEY for Holocron answer synthesis.";
+  }
+  return "Run scripts/bootstrap_trask_research.sh (Crawl4AI venv) and set TRASK_WEB_RESEARCH_PYTHON. See docs/trask-research-backends.md.";
+};
+const holocronSessionState: {
+  researchAvailable: boolean;
+  researchUnavailableReason?: string;
+} = {
+  researchAvailable: false,
+  researchUnavailableReason: researchUnavailableReason(),
+};
 app.use(
   "/api/trask",
   createTraskHttpRouter({
     runtime,
     auth: createWebAuth(config),
+    getSession: () => ({
+      loggedIn: false,
+      oauthAvailable: false,
+      researchAvailable: holocronSessionState.researchAvailable,
+      ...(holocronSessionState.researchUnavailableReason
+        ? { researchUnavailableReason: holocronSessionState.researchUnavailableReason }
+        : {}),
+    }),
   }),
 );
 listen(config.port, () => {
   logger.info(`Trask HTTP API listening on port ${config.port}`);
+  void (async () => {
+    try {
+      const dryRunOk = await probeHeadlessWebResearchDryRun(config.webResearch);
+      if (dryRunOk && hasLlmRewriteKey) {
+        holocronSessionState.researchAvailable = true;
+        delete holocronSessionState.researchUnavailableReason;
+        logger.info("Holocron live web research is available (Crawl4AI gather + LLM synthesis).");
+        return;
+      }
+      holocronSessionState.researchAvailable = false;
+      holocronSessionState.researchUnavailableReason = researchUnavailableReason();
+      logger.warn(`Holocron live research unavailable: ${holocronSessionState.researchUnavailableReason}`);
+    } catch (error: unknown) {
+      const detail = error instanceof Error ? error.message : String(error);
+      holocronSessionState.researchAvailable = false;
+      holocronSessionState.researchUnavailableReason = `${researchUnavailableReason()} (${detail})`;
+      logger.warn(`Holocron research probe failed: ${detail}`);
+    }
+  })();
 });
 process.on("SIGINT", () => {

docs/trask-research-backends.md ADDED Viewed

	@@ -0,0 +1,80 @@

+# Trask / Holocron research backends
+Holocron’s UI lives in **`apps/holocron-web`**. It talks to **`apps/trask-http-server`** at `/api/trask/*`.
+## Default stack (implemented)
+| Layer | Implementation |
+|--------|----------------|
+| **Discovery** | DuckDuckGo (`duckduckgo-search`) with `site:` hints from approved domains |
+| **Scrape** | [Crawl4AI](https://github.com/unclecode/crawl4ai) → LLM-friendly markdown (`scripts/trask_web_research.py`) |
+| **Synthesis** | Node `WebResearchClient` OpenAI-compatible rewrite (`packages/trask/src/web-research.ts`) |
+### Bootstrap
+```bash
+bash scripts/bootstrap_trask_research.sh   # creates .venv-trask-research
+export TRASK_WEB_RESEARCH_PYTHON="$(pwd)/.venv-trask-research/bin/python"
+# OPENAI_API_KEY or OPENROUTER_API_KEY required for Holocron synthesis
+```
+Fedora/RHEL hosts need `libxml2-devel` and `libxslt-devel` before the first bootstrap (for `lxml`).
+### Environment
+| Variable | Purpose |
+|----------|---------|
+| `TRASK_WEB_RESEARCH_PYTHON` | Python for `scripts/trask_web_research.py` (when unset, falls back to `.venv-trask-research`, then the legacy `.venv-trask-gptr`, then plain `python3`) |
+| `TRASK_WEB_RESEARCH_SCRIPT` | Optional override script path |
+| `TRASK_GPT_RESEARCHER_PYTHON` | Deprecated alias for `TRASK_WEB_RESEARCH_PYTHON` |
+| `TRASK_WEB_RESEARCH_TIMEOUT_MS` | Subprocess timeout (default **900000**; legacy alias `TRASK_RESEARCHWIZARD_TIMEOUT_MS`) |
+| `OPENAI_API_KEY` / `OPENROUTER_API_KEY` | LLM rewrite for final Holocron answers |
+| `REDIS_URL` / `TRASK_REDIS_URL` | Optional Redis for research cache (`scripts/trask_cache.py`) |
+| `TRASK_CACHE_DISABLED` | Set to `1` to bypass Redis even when `REDIS_URL` is set |
+| `TRASK_CACHE_SEARCH_TTL_SECONDS` | DuckDuckGo URL-list cache TTL (default **21600** = 6h) |
+| `TRASK_CACHE_PAGE_TTL_SECONDS` | Per-page markdown cache TTL (default **604800** = 7d) |
+| `TRASK_CACHE_RESEARCH_TTL_SECONDS` | Full research JSON cache TTL (default **3600** = 1h) |
+### Redis cache (optional, no Pinecone)
+When `REDIS_URL` is set, `scripts/trask_web_research.py` uses `scripts/trask_cache.py` to avoid repeat work:
+| Layer | Key pattern | What it skips |
+|--------|-------------|----------------|
+| Search | `trask:search:{hash}` | DuckDuckGo discovery for the same query + domains |
+| Page | `trask:page:{hash}` | Crawl4AI / trafilatura fetch for the same URL |
+| Research | `trask:research:{hash}` | Entire subprocess result for identical payload |
+Cache stats appear under `research_information.cache` (e.g. `page_hits`, `search_misses`, `research_hits`).
+```bash
+# Local Redis (example)
+podman run -d --name trask-redis -p 6379:6379 redis:7-alpine
+export REDIS_URL=redis://localhost:6379/0
+python scripts/trask_cache.py   # connectivity self-test
+```
+### Verification
+```bash
+python scripts/smoke_trask_web_research.py --dry-run
+node --import tsx/esm scripts/verify_trask_cli_qa.mjs
+pnpm holocron:e2e   # with trask-http-server on :4010
+```
+## Explicitly rejected (do not implement)
+These were considered as follow-ups and are **out of scope**:
+| Approach | Reason |
+|----------|--------|
+| Node-native **llm-scraper** for single-URL extraction | Not part of the product path; Crawl4AI + DDG covers live research. |
+| **browser-use** integration | Not part of the product path. |
+| Trask `/ask` via self-hosted **Firecrawl HTTP API** (reuse ingest key without Python) | Firecrawl remains **ingest-worker only** when `FIRECRAWL_API_KEY` is set—not the Holocron/Discord answer pipeline. |
+| `TRASK_RESEARCH_BACKEND_URL` HTTP sidecar | Reserved env name only; no planned sidecar replacing `trask_web_research.py`. |
+## Other references (not default)
+- [khoj-ai/khoj](https://github.com/khoj-ai/khoj) — full Q&A product (not integrated)
+- [assafelovic/gpt-researcher](https://github.com/assafelovic/gpt-researcher) — upstream of the removed vendored fork
+- [searxng/searxng](https://github.com/searxng/searxng) — metasearch sidecar (not integrated)

package.json CHANGED Viewed

@@ -31,10 +31,11 @@
     "build:pazaak-nakama": "pnpm --filter @openkotor/pazaak-nakama build",
     "dev:ingest": "pnpm --filter @openkotor/ingest-worker dev",
     "discord:install-links": "tsx scripts/discord-install-links.ts",
-    "smoke:trask-gptr-dry": "python scripts/smoke_trask_headless_gptr.py --dry-run",
-    "smoke:trask-gptr": "python scripts/smoke_trask_headless_gptr.py",
     "holocron:e2e": "node scripts/holocron-e2e-live-build.mjs && pnpm exec playwright test --config apps/holocron-web/playwright.config.ts",
-    "discord:smoke-trask-commands": "node scripts/discord_trask_commands_smoke.mjs"
   },
   "devDependencies": {
     "@playwright/test": "^1.58.0",

     "build:pazaak-nakama": "pnpm --filter @openkotor/pazaak-nakama build",
     "dev:ingest": "pnpm --filter @openkotor/ingest-worker dev",
     "discord:install-links": "tsx scripts/discord-install-links.ts",
     "holocron:e2e": "node scripts/holocron-e2e-live-build.mjs && pnpm exec playwright test --config apps/holocron-web/playwright.config.ts",
+    "discord:smoke-trask-commands": "node scripts/discord_trask_commands_smoke.mjs",
+    "trask:env:fetch": "node scripts/discord_fetch_trask_env.mjs",
+    "trask:start": "bash scripts/trask_bot_start.sh",
+    "trask:wait-token": "bash scripts/discord_wait_token_and_start.sh"
   },
   "devDependencies": {
     "@playwright/test": "^1.58.0",

packages/config/src/index.test.ts CHANGED Viewed

@@ -84,9 +84,9 @@ test("loadSharedAiConfig returns undefined headers when no OpenRouter vars are s
 // loadResearchWizardRuntimeConfig — timeout and script path
 // ---------------------------------------------------------------------------
-test("loadResearchWizardRuntimeConfig defaults timeout to 90000 ms when TRASK_RESEARCHWIZARD_TIMEOUT_MS is absent", () => {
   const cfg = loadResearchWizardRuntimeConfig({});
-  assert.equal(cfg.timeoutMs, 90000);
 });
 test("loadResearchWizardRuntimeConfig respects TRASK_RESEARCHWIZARD_TIMEOUT_MS override", () => {
@@ -94,23 +94,21 @@ test("loadResearchWizardRuntimeConfig respects TRASK_RESEARCHWIZARD_TIMEOUT_MS o
   assert.equal(cfg.timeoutMs, 120000);
 });
-test("loadResearchWizardRuntimeConfig sets headlessScriptPath to undefined when TRASK_GPT_RESEARCHER_SCRIPT is absent", () => {
   const cfg = loadResearchWizardRuntimeConfig({});
   assert.equal(cfg.headlessScriptPath, undefined);
 });
-test("loadResearchWizardRuntimeConfig resolves an explicit headless script path", () => {
-  const cfg = loadResearchWizardRuntimeConfig({ TRASK_GPT_RESEARCHER_SCRIPT: "/tmp/my_script.py" });
-  assert.ok(cfg.headlessScriptPath?.endsWith("my_script.py"));
 });
-test("loadResearchWizardRuntimeConfig falls back to 'python' when no venv path is present", () => {
-  // Force no auto-discovery by pointing at a non-existent root.
-  const cfg = loadResearchWizardRuntimeConfig({ TRASK_GPT_RESEARCHER_ROOT: "/nonexistent/path/that/does/not/exist" });
-  // Explicit TRASK_GPT_RESEARCHER_ROOT is set but has no gpt_researcher/ inside, so
-  // root resolves but the venv walk still yields 'python'.
-  assert.ok(typeof cfg.pythonExecutable === "string");
-  assert.ok(cfg.pythonExecutable.length > 0);
 });
 // ---------------------------------------------------------------------------

 // loadResearchWizardRuntimeConfig — timeout and script path
 // ---------------------------------------------------------------------------
+test("loadResearchWizardRuntimeConfig defaults timeout to 900000 ms when TRASK_RESEARCHWIZARD_TIMEOUT_MS is absent", () => {
   const cfg = loadResearchWizardRuntimeConfig({});
+  assert.equal(cfg.timeoutMs, 900000);
 });
 test("loadResearchWizardRuntimeConfig respects TRASK_RESEARCHWIZARD_TIMEOUT_MS override", () => {
   assert.equal(cfg.timeoutMs, 120000);
 });
+test("loadResearchWizardRuntimeConfig resolves repoRoot and pythonExecutable", () => {
   const cfg = loadResearchWizardRuntimeConfig({});
+  assert.ok(cfg.repoRoot.length > 0);
+  assert.ok(cfg.pythonExecutable.length > 0);
   assert.equal(cfg.headlessScriptPath, undefined);
 });
+test("loadResearchWizardRuntimeConfig respects TRASK_WEB_RESEARCH_PYTHON override", () => {
+  const cfg = loadResearchWizardRuntimeConfig({ TRASK_WEB_RESEARCH_PYTHON: "/custom/python" });
+  assert.equal(cfg.pythonExecutable, "/custom/python");
 });
+test("loadResearchWizardRuntimeConfig resolves TRASK_RESEARCH_BACKEND_URL", () => {
+  const cfg = loadResearchWizardRuntimeConfig({ TRASK_RESEARCH_BACKEND_URL: "http://127.0.0.1:3002" });
+  assert.equal(cfg.backendUrl, "http://127.0.0.1:3002");
 });
 // ---------------------------------------------------------------------------

packages/config/src/index.ts CHANGED Viewed

@@ -13,21 +13,26 @@ import { loadPolicyFromFile } from "@openkotor/pazaak-policy/file-loader";
 import { config as loadDotEnv } from "dotenv";
 import { z } from "zod";
-function findDotEnv(): string | undefined {
   let dir = resolve(process.cwd());
   for (;;) {
-    const candidate = join(dir, ".env");
-    if (existsSync(candidate)) return candidate;
     const parent = dirname(dir);
-    if (parent === dir) return undefined;
     dir = parent;
   }
 }
-const dotEnvPath = findDotEnv();
-if (dotEnvPath) {
-  loadDotEnv({ path: dotEnvPath });
 } else {
   loadDotEnv();
 }
@@ -107,29 +112,29 @@ export interface SharedAiConfig {
   databaseUrl: string | undefined;
 }
-export interface ResearchWizardRuntimeConfig {
-  /** Absolute path to vendored `ai-researchwizard` (folder containing `gpt_researcher/`). */
-  gptResearcherRoot: string | undefined;
-  /** Python interpreter for `trask_headless_research.py` (default `python`). */
   pythonExecutable: string;
-  /** Optional absolute path to the headless runner; default `<gptResearcherRoot>/trask_headless_research.py`. */
   headlessScriptPath: string | undefined;
   timeoutMs: number;
 }
-const hasAiResearchWizardPackage = (rootDir: string): boolean =>
-  existsSync(join(rootDir, "gpt_researcher"));
-/**
- * Walks upward from `startDir` looking for `vendor/ai-researchwizard` with a `gpt_researcher/` tree.
- * Covers `pnpm --filter … dev` where cwd is `apps/trask-http-server` instead of the monorepo root.
- */
-const findVendorAiResearchWizard = (startDir: string, maxHops = 24): string | undefined => {
   let dir = resolve(startDir);
   for (let hop = 0; hop < maxHops; hop++) {
-    const candidate = join(dir, "vendor", "ai-researchwizard");
-    if (hasAiResearchWizardPackage(candidate)) {
-      return candidate;
     }
     const parent = dirname(dir);
     if (parent === dir) {
@@ -140,72 +145,81 @@ const findVendorAiResearchWizard = (startDir: string, maxHops = 24): string | un
   return undefined;
 };
-const resolveGptResearcherRoot = (env: NodeJS.ProcessEnv): string | undefined => {
-  const explicit = readOptionalEnv("TRASK_GPT_RESEARCHER_ROOT", env);
   if (explicit) {
     return resolve(explicit.trim());
   }
-  const fromCwd = findVendorAiResearchWizard(process.cwd());
   if (fromCwd) {
     return fromCwd;
   }
   const configModuleDir = dirname(fileURLToPath(import.meta.url));
-  const fromPackage = findVendorAiResearchWizard(configModuleDir);
   if (fromPackage) {
     return fromPackage;
   }
-  return undefined;
 };
 /**
- * Prefer the monorepo bootstrap venv (scripts/bootstrap_trask_gpt_researcher.*) when
- * `TRASK_GPT_RESEARCHER_PYTHON` is unset so Trask HTTP / Discord match `smoke_trask_headless_gptr.py`.
  */
-const resolveTraskHeadlessPythonExecutable = (
-  gptResearcherRoot: string | undefined,
-  env: NodeJS.ProcessEnv,
-): string => {
-  const explicit = readOptionalEnv("TRASK_GPT_RESEARCHER_PYTHON", env)?.trim();
   if (explicit) {
     return explicit;
   }
-  if (!gptResearcherRoot) {
-    return "python";
-  }
-  const vendorDir = dirname(gptResearcherRoot);
-  const repoRoot = dirname(vendorDir);
-  const winPy = join(repoRoot, ".venv-trask-gptr", "Scripts", "python.exe");
-  const unixPy = join(repoRoot, ".venv-trask-gptr", "bin", "python");
   if (process.platform === "win32" && existsSync(winPy)) {
     return winPy;
   }
   if (existsSync(unixPy)) {
     return unixPy;
   }
-  return "python";
 };
-export const loadResearchWizardRuntimeConfig = (env: NodeJS.ProcessEnv = process.env): ResearchWizardRuntimeConfig => {
-  const scriptRaw = readOptionalEnv("TRASK_GPT_RESEARCHER_SCRIPT", env);
-  const gptResearcherRoot = resolveGptResearcherRoot(env);
   return {
-    gptResearcherRoot,
-    pythonExecutable: resolveTraskHeadlessPythonExecutable(gptResearcherRoot, env),
     headlessScriptPath: scriptRaw ? resolve(scriptRaw.trim()) : undefined,
-    timeoutMs: integerish.parse(readOptionalEnv("TRASK_RESEARCHWIZARD_TIMEOUT_MS", env) ?? "900000"),
   };
 };
 export interface TraskProactiveConfig {
   /** When true, reads channel messages (privileged intents) and may reply without `/ask`. */
   enabled: boolean;
@@ -237,7 +251,7 @@ export interface TraskWelcomeConfig {
 export interface TraskBotConfig {
   discord: DiscordRuntimeConfig;
   ai: SharedAiConfig;
-  researchWizard: ResearchWizardRuntimeConfig;
   allowedGuildIds: string[];
   approvedChannelIds: string[];
   /** Guild IDs where slash commands are registered (comma list in `TRASK_SLASH_GUILD_IDS`). */
@@ -380,7 +394,7 @@ export const loadTraskBotConfig = (env: NodeJS.ProcessEnv = process.env): TraskB
   return {
     discord: loadDiscordRuntimeConfig("TRASK", env),
     ai: loadSharedAiConfig(env),
-    researchWizard: loadResearchWizardRuntimeConfig(env),
     allowedGuildIds: readListEnv("TRASK_ALLOWED_GUILD_IDS", env),
     approvedChannelIds,
     slashCommandGuildIds: readListEnv("TRASK_SLASH_GUILD_IDS", env),
@@ -454,7 +468,7 @@ export const loadIngestWorkerConfig = (env: NodeJS.ProcessEnv = process.env): In
 export interface TraskHttpServerConfig {
   port: number;
-  researchWizard: ResearchWizardRuntimeConfig;
   ai: SharedAiConfig;
   dataDir: string;
   /** When set, require `Authorization: Bearer <key>` or `X-Trask-Api-Key`. */
@@ -471,7 +485,7 @@ export interface TraskHttpServerConfig {
 export const loadTraskHttpServerConfig = (env: NodeJS.ProcessEnv = process.env): TraskHttpServerConfig => {
   return {
     port: integerish.parse(readOptionalEnv("TRASK_HTTP_PORT", env) ?? "4010"),
-    researchWizard: loadResearchWizardRuntimeConfig(env),
     ai: loadSharedAiConfig(env),
     dataDir: readOptionalEnv("TRASK_HTTP_DATA_DIR", env) ?? "data/trask-http-server",
     webApiKey: readOptionalEnv("TRASK_WEB_API_KEY", env),

 import { config as loadDotEnv } from "dotenv";
 import { z } from "zod";
+function findDotEnvFiles(): string[] {
+  const found: string[] = [];
   let dir = resolve(process.cwd());
   for (;;) {
+    const local = join(dir, ".env.local");
+    const env = join(dir, ".env");
+    if (existsSync(local)) found.push(local);
+    if (existsSync(env)) found.push(env);
     const parent = dirname(dir);
+    if (parent === dir) break;
     dir = parent;
   }
+  return found;
 }
+const dotEnvPaths = findDotEnvFiles();
+if (dotEnvPaths.length > 0) {
+  for (const path of dotEnvPaths) {
+    loadDotEnv({ path });
+  }
 } else {
   loadDotEnv();
 }
   databaseUrl: string | undefined;
 }
+export interface WebResearchRuntimeConfig {
+  /** Monorepo root (contains `scripts/trask_web_research.py`). */
+  repoRoot: string;
+  /** Python interpreter for `scripts/trask_web_research.py`. */
   pythonExecutable: string;
+  /** Optional absolute path to the headless runner; default `<repoRoot>/scripts/trask_web_research.py`. */
   headlessScriptPath: string | undefined;
+  /** Reserved for a future HTTP research sidecar (`TRASK_RESEARCH_BACKEND_URL`). */
+  backendUrl: string | undefined;
   timeoutMs: number;
 }
+/** @deprecated Use WebResearchRuntimeConfig */
+export type ResearchWizardRuntimeConfig = WebResearchRuntimeConfig;
+const hasTraskWebResearchScript = (rootDir: string): boolean =>
+  existsSync(join(rootDir, "scripts", "trask_web_research.py"));
+const findRepoRootWithWebResearch = (startDir: string, maxHops = 24): string | undefined => {
   let dir = resolve(startDir);
   for (let hop = 0; hop < maxHops; hop++) {
+    if (hasTraskWebResearchScript(dir)) {
+      return dir;
     }
     const parent = dirname(dir);
     if (parent === dir) {
   return undefined;
 };
+const resolveTraskResearchRepoRoot = (env: NodeJS.ProcessEnv): string => {
+  const explicit = readOptionalEnv("TRASK_REPO_ROOT", env);
   if (explicit) {
     return resolve(explicit.trim());
   }
+  const fromCwd = findRepoRootWithWebResearch(process.cwd());
   if (fromCwd) {
     return fromCwd;
   }
   const configModuleDir = dirname(fileURLToPath(import.meta.url));
+  const fromPackage = findRepoRootWithWebResearch(join(configModuleDir, "..", ".."));
   if (fromPackage) {
     return fromPackage;
   }
+  return process.cwd();
 };
 /**
+ * Prefer `.venv-trask-research` when `TRASK_WEB_RESEARCH_PYTHON` is unset.
+ * Falls back to deprecated `TRASK_GPT_RESEARCHER_PYTHON` for migration.
  */
+const resolveTraskWebResearchPythonExecutable = (repoRoot: string, env: NodeJS.ProcessEnv): string => {
+  const explicit =
+    readOptionalEnv("TRASK_WEB_RESEARCH_PYTHON", env)?.trim() ||
+    readOptionalEnv("TRASK_GPT_RESEARCHER_PYTHON", env)?.trim();
   if (explicit) {
     return explicit;
   }
+  const winPy = join(repoRoot, ".venv-trask-research", "Scripts", "python.exe");
+  const unixPy = join(repoRoot, ".venv-trask-research", "bin", "python");
+  const legacyWin = join(repoRoot, ".venv-trask-gptr", "Scripts", "python.exe");
+  const legacyUnix = join(repoRoot, ".venv-trask-gptr", "bin", "python");
   if (process.platform === "win32" && existsSync(winPy)) {
     return winPy;
   }
   if (existsSync(unixPy)) {
     return unixPy;
   }
+  if (process.platform === "win32" && existsSync(legacyWin)) {
+    return legacyWin;
+  }
+  if (existsSync(legacyUnix)) {
+    return legacyUnix;
+  }
+  return "python3";
 };
+export const loadWebResearchRuntimeConfig = (env: NodeJS.ProcessEnv = process.env): WebResearchRuntimeConfig => {
+  const repoRoot = resolveTraskResearchRepoRoot(env);
+  const scriptRaw =
+    readOptionalEnv("TRASK_WEB_RESEARCH_SCRIPT", env) ?? readOptionalEnv("TRASK_GPT_RESEARCHER_SCRIPT", env);
+  const backendUrl = readOptionalEnv("TRASK_RESEARCH_BACKEND_URL", env)?.trim() || undefined;
+  const timeoutRaw =
+    readOptionalEnv("TRASK_WEB_RESEARCH_TIMEOUT_MS", env)
+    ?? readOptionalEnv("TRASK_RESEARCHWIZARD_TIMEOUT_MS", env)
+    ?? "900000";
   return {
+    repoRoot,
+    pythonExecutable: resolveTraskWebResearchPythonExecutable(repoRoot, env),
     headlessScriptPath: scriptRaw ? resolve(scriptRaw.trim()) : undefined,
+    backendUrl,
+    timeoutMs: integerish.parse(timeoutRaw),
   };
 };
+/** @deprecated Use loadWebResearchRuntimeConfig */
+export const loadResearchWizardRuntimeConfig = loadWebResearchRuntimeConfig;
 export interface TraskProactiveConfig {
   /** When true, reads channel messages (privileged intents) and may reply without `/ask`. */
   enabled: boolean;
 export interface TraskBotConfig {
   discord: DiscordRuntimeConfig;
   ai: SharedAiConfig;
+  webResearch: WebResearchRuntimeConfig;
   allowedGuildIds: string[];
   approvedChannelIds: string[];
   /** Guild IDs where slash commands are registered (comma list in `TRASK_SLASH_GUILD_IDS`). */
   return {
     discord: loadDiscordRuntimeConfig("TRASK", env),
     ai: loadSharedAiConfig(env),
+    webResearch: loadWebResearchRuntimeConfig(env),
     allowedGuildIds: readListEnv("TRASK_ALLOWED_GUILD_IDS", env),
     approvedChannelIds,
     slashCommandGuildIds: readListEnv("TRASK_SLASH_GUILD_IDS", env),
 export interface TraskHttpServerConfig {
   port: number;
+  webResearch: WebResearchRuntimeConfig;
   ai: SharedAiConfig;
   dataDir: string;
   /** When set, require `Authorization: Bearer <key>` or `X-Trask-Api-Key`. */
 export const loadTraskHttpServerConfig = (env: NodeJS.ProcessEnv = process.env): TraskHttpServerConfig => {
   return {
     port: integerish.parse(readOptionalEnv("TRASK_HTTP_PORT", env) ?? "4010"),
+    webResearch: loadWebResearchRuntimeConfig(env),
     ai: loadSharedAiConfig(env),
     dataDir: readOptionalEnv("TRASK_HTTP_DATA_DIR", env) ?? "data/trask-http-server",
     webApiKey: readOptionalEnv("TRASK_WEB_API_KEY", env),

packages/personas/src/index.ts CHANGED Viewed

@@ -20,7 +20,7 @@ export * from "./hk-dialog.js";
 export const personaProfiles: Record<PersonaProfile["id"], PersonaProfile> = {
   trask: {
     id: "trask",
-    displayName: "Trask Ulgo",
     summary: "Republic-first guide voice for quick help, troubleshooting, and source-backed answers.",
     speechStyle: [
       "direct and practical",

 export const personaProfiles: Record<PersonaProfile["id"], PersonaProfile> = {
   trask: {
     id: "trask",
+    displayName: "Trask Q&A Assistant",
     summary: "Republic-first guide voice for quick help, troubleshooting, and source-backed answers.",
     speechStyle: [
       "direct and practical",

packages/retrieval/src/discord-permalink.test.ts ADDED Viewed

	@@ -0,0 +1,52 @@

+import assert from "node:assert/strict";
+import { describe, test } from "node:test";
+import {
+  anchorMessageIdFromChunkTags,
+  buildDiscordMessagePermalink,
+  channelIdFromChunkTags,
+  guildIdFromChunkTags,
+  isDiscordCitationUrl,
+  resolveDiscordChunkCitationUrl,
+} from "./discord-permalink.js";
+// Unit tests for the Discord permalink helpers: permalink formatting, chunk tag
+// parsing, and resolution of stored chunk URLs to HTTPS citation links.
+describe("discord permalink helpers", () => {
+  test("buildDiscordMessagePermalink formats discord.com URL", () => {
+    assert.equal(
+      buildDiscordMessagePermalink("111", "222", "333"),
+      "https://discord.com/channels/111/222/333",
+    );
+  });
+  test("buildDiscordMessagePermalink returns empty string when an id is blank", () => {
+    assert.equal(buildDiscordMessagePermalink("111", "  ", "333"), "");
+  });
+  test("resolveDiscordChunkCitationUrl prefers stored HTTPS permalink", () => {
+    const url = resolveDiscordChunkCitationUrl({
+      url: "https://discord.com/channels/g/c/m",
+      tags: [],
+    });
+    assert.equal(url, "https://discord.com/channels/g/c/m");
+  });
+  test("resolveDiscordChunkCitationUrl builds from discord:// and tags", () => {
+    const url = resolveDiscordChunkCitationUrl(
+      {
+        url: "discord://approved-channels/9001/8001-8002",
+        tags: ["guild:1001", "channel:9001", "anchorMessage:8001"],
+      },
+      "fallback-should-not-use",
+    );
+    assert.equal(url, "https://discord.com/channels/1001/9001/8001");
+  });
+  test("resolveDiscordChunkCitationUrl uses fallback guild id when tags omit guild", () => {
+    // Channel and first message id come from the discord:// URL itself.
+    const url = resolveDiscordChunkCitationUrl(
+      { url: "discord://approved-channels/9001/8001-8002", tags: [] },
+      "1001",
+    );
+    assert.equal(url, "https://discord.com/channels/1001/9001/8001");
+  });
+  test("resolveDiscordChunkCitationUrl keeps discord:// URL when guild is unknown", () => {
+    const url = resolveDiscordChunkCitationUrl({
+      url: "discord://approved-channels/9001/8001",
+      tags: [],
+    });
+    assert.equal(url, "discord://approved-channels/9001/8001");
+  });
+  test("tag parsers read guild channel and anchor", () => {
+    const tags = ["guild:g1", "channel:c1", "anchorMessage:m1"];
+    assert.equal(guildIdFromChunkTags(tags), "g1");
+    assert.equal(channelIdFromChunkTags(tags), "c1");
+    assert.equal(anchorMessageIdFromChunkTags(tags), "m1");
+  });
+  test("isDiscordCitationUrl recognizes discord schemes", () => {
+    assert.equal(isDiscordCitationUrl("discord://approved-channels/1/2"), true);
+    assert.equal(isDiscordCitationUrl("https://discord.com/channels/1/2/3"), true);
+    assert.equal(isDiscordCitationUrl("https://example.com"), false);
+  });
+});

packages/retrieval/src/discord-permalink.ts ADDED Viewed

	@@ -0,0 +1,98 @@

+/**
+ * Discord message permalink and chunk URL helpers for Trask community citations.
+ *
+ * DISCORD_CHUNK_URL_PATTERN matches whole strings shaped like
+ * `discord://approved-channels/<channel>/<first>[-<last>]`. Capture groups:
+ * 1 = channel segment (no `/`), 2 = first message segment (no `/` or `-`),
+ * 3 = optional last message segment following the hyphen (no `/`).
+ */
+const DISCORD_CHUNK_URL_PATTERN =
+  /^discord:\/\/approved-channels\/([^/]+)\/([^/-]+)(?:-([^/]+))?$/;
+/**
+ * Build the `https://discord.com/channels/<guild>/<channel>/<message>` link for one message.
+ *
+ * Every id is trimmed first. If any id is empty after trimming, an empty string
+ * is returned instead of a partial URL.
+ */
+export function buildDiscordMessagePermalink(
+  guildId: string,
+  channelId: string,
+  messageId: string,
+): string {
+  const segments = [guildId, channelId, messageId].map((part) => part.trim());
+  // A permalink is only usable when all three ids are present.
+  return segments.every((segment) => segment.length > 0)
+    ? `https://discord.com/channels/${segments.join("/")}`
+    : "";
+}
+/**
+ * Split a `discord://approved-channels/...` chunk URL into its ids.
+ *
+ * Returns `null` when the trimmed URL does not match DISCORD_CHUNK_URL_PATTERN
+ * or when the channel / first message segment is blank after trimming.
+ * `lastMessageId` is only present on the result when the URL carries a
+ * non-blank `-<last>` suffix.
+ */
+export function parseDiscordChunkUrl(
+  url: string,
+): { channelId: string; firstMessageId: string; lastMessageId?: string } | null {
+  const groups = DISCORD_CHUNK_URL_PATTERN.exec(url.trim());
+  if (groups === null) return null;
+  const [, rawChannel, rawFirst, rawLast] = groups;
+  const channelId = rawChannel?.trim();
+  const firstMessageId = rawFirst?.trim();
+  if (!channelId || !firstMessageId) return null;
+  const parsed: { channelId: string; firstMessageId: string; lastMessageId?: string } = {
+    channelId,
+    firstMessageId,
+  };
+  const lastMessageId = rawLast?.trim();
+  // Leave the key off entirely (rather than set it to undefined) for single-message chunks.
+  if (lastMessageId) parsed.lastMessageId = lastMessageId;
+  return parsed;
+}
+/**
+ * Return the first non-blank value among tags shaped like `<prefix><value>`.
+ * Tags whose value is empty after trimming are skipped so a later tag can still match.
+ */
+function firstChunkTagValue(tags: readonly string[], prefix: string): string | undefined {
+  for (const tag of tags) {
+    if (!tag.startsWith(prefix)) continue;
+    const value = tag.slice(prefix.length).trim();
+    if (value) return value;
+  }
+  return undefined;
+}
+/** Guild id from the first usable `guild:<id>` tag, or `undefined` when there is none. */
+export function guildIdFromChunkTags(tags: readonly string[]): string | undefined {
+  return firstChunkTagValue(tags, "guild:");
+}
+/** Channel id from the first usable `channel:<id>` tag, or `undefined` when there is none. */
+export function channelIdFromChunkTags(tags: readonly string[]): string | undefined {
+  return firstChunkTagValue(tags, "channel:");
+}
+/** Message id from the first usable `anchorMessage:<id>` tag, or `undefined` when there is none. */
+export function anchorMessageIdFromChunkTags(tags: readonly string[]): string | undefined {
+  return firstChunkTagValue(tags, "anchorMessage:");
+}
+/**
+ * Turn a chunk's stored URL into an HTTPS Discord permalink where the data allows.
+ *
+ * A URL that already starts with `https://discord.com/channels/` is returned
+ * (trimmed) as-is. Otherwise guild, channel and message ids are collected: tag
+ * values take precedence over ids parsed from a `discord://approved-channels/...`
+ * URL, and `fallbackGuildId` is consulted only when no `guild:` tag is found.
+ * If any of the three ids is still missing, the trimmed original URL is returned.
+ */
+export function resolveDiscordChunkCitationUrl(
+  chunk: { url: string; tags: readonly string[] },
+  fallbackGuildId?: string,
+): string {
+  const storedUrl = chunk.url.trim();
+  if (storedUrl.startsWith("https://discord.com/channels/")) return storedUrl;
+  const fromUrl = parseDiscordChunkUrl(storedUrl);
+  const guildId = guildIdFromChunkTags(chunk.tags) ?? fallbackGuildId?.trim();
+  const channelId = channelIdFromChunkTags(chunk.tags) ?? fromUrl?.channelId;
+  const messageId = anchorMessageIdFromChunkTags(chunk.tags) ?? fromUrl?.firstMessageId;
+  // Without all three ids no permalink can be formed; hand back what was stored.
+  if (!guildId || !channelId || !messageId) return storedUrl;
+  return buildDiscordMessagePermalink(guildId, channelId, messageId);
+}
+/** True when the trimmed URL starts with `discord://` or `https://discord.com/channels/`. */
+export function isDiscordCitationUrl(url: string): boolean {
+  const candidate = url.trim();
+  const recognizedPrefixes = ["discord://", "https://discord.com/channels/"];
+  return recognizedPrefixes.some((prefix) => candidate.startsWith(prefix));
+}

packages/retrieval/src/index.ts CHANGED Viewed

@@ -1,6 +1,16 @@
 import { mkdir, open, readFile, readdir, rename, rm, stat, utimes, writeFile } from "node:fs/promises";
 import path from "node:path";
 export type SourceKind = "website" | "github" | "discord";
 export interface SourceDescriptor {
@@ -709,8 +719,9 @@ export interface SourceIndexRecord {
   tags: readonly string[];
 }
-const isNonWebChunkUrl = (url: string): boolean =>
-  url.startsWith("local://") || url.startsWith("discord://");
 type SerializableValue = object | string | number | boolean | null;
@@ -874,10 +885,16 @@ export class FileChunkStore {
   }
 }
 export class ChunkSearchProvider implements SearchProvider {
   public constructor(
     private readonly chunkStore: FileChunkStore,
     private readonly catalog: StaticCatalogSearchProvider,
   ) {}
   public async listSources(): Promise<readonly SourceDescriptor[]> {
@@ -893,7 +910,7 @@ export class ChunkSearchProvider implements SearchProvider {
       this.catalog.search(query, limit),
       this.chunkStore.loadAllChunks(),
     ]);
-    const searchableChunks = allChunks.filter((chunk) => !isNonWebChunkUrl(chunk.url));
     const chunkHits: SearchHit[] = searchableChunks
       .map((chunk) => {
@@ -908,6 +925,11 @@ export class ChunkSearchProvider implements SearchProvider {
           score += textTokens.filter((t) => t === token).length;
         }
         score += intentScoreDelta(intent, chunk.tags);
         return {
           sourceId: chunk.sourceId,
@@ -915,7 +937,7 @@ export class ChunkSearchProvider implements SearchProvider {
           kind: chunk.kind,
           title: chunk.title,
           snippet: chunk.chunkText.slice(0, 800).trim() + (chunk.chunkText.length > 800 ? "\u2026" : ""),
-          url: chunk.url,
           score,
           tags: chunk.tags,
         } satisfies SearchHit;
@@ -942,9 +964,13 @@ export class ChunkSearchProvider implements SearchProvider {
   }
 }
-export const createChunkSearchProvider = (stateDir: string): ChunkSearchProvider => {
   return new ChunkSearchProvider(
     new FileChunkStore(stateDir),
     new StaticCatalogSearchProvider(defaultSourceCatalog, new FileReindexQueueStore(stateDir)),
   );
 };

 import { mkdir, open, readFile, readdir, rename, rm, stat, utimes, writeFile } from "node:fs/promises";
 import path from "node:path";
+export {
+  anchorMessageIdFromChunkTags,
+  buildDiscordMessagePermalink,
+  channelIdFromChunkTags,
+  guildIdFromChunkTags,
+  isDiscordCitationUrl,
+  parseDiscordChunkUrl,
+  resolveDiscordChunkCitationUrl,
+} from "./discord-permalink.js";
 export type SourceKind = "website" | "github" | "discord";
 export interface SourceDescriptor {
   tags: readonly string[];
 }
+import { isDiscordCitationUrl, resolveDiscordChunkCitationUrl } from "./discord-permalink.js";
+const isExcludedChunkUrl = (url: string): boolean => url.startsWith("local://");
 type SerializableValue = object | string | number | boolean | null;
   }
 }
+export interface ChunkSearchProviderOptions {
+  /** Resolves `discord://` chunk URLs to HTTPS permalinks when tags omit guild id. */
+  discordGuildId?: string;
+}
 export class ChunkSearchProvider implements SearchProvider {
   public constructor(
     private readonly chunkStore: FileChunkStore,
     private readonly catalog: StaticCatalogSearchProvider,
+    private readonly options: ChunkSearchProviderOptions = {},
   ) {}
   public async listSources(): Promise<readonly SourceDescriptor[]> {
       this.catalog.search(query, limit),
       this.chunkStore.loadAllChunks(),
     ]);
+    const searchableChunks = allChunks.filter((chunk) => !isExcludedChunkUrl(chunk.url));
     const chunkHits: SearchHit[] = searchableChunks
       .map((chunk) => {
           score += textTokens.filter((t) => t === token).length;
         }
         score += intentScoreDelta(intent, chunk.tags);
+        if (isDiscordCitationUrl(chunk.url)) {
+          score += 1;
+        }
+        const citationUrl = resolveDiscordChunkCitationUrl(chunk, this.options.discordGuildId);
         return {
           sourceId: chunk.sourceId,
           kind: chunk.kind,
           title: chunk.title,
           snippet: chunk.chunkText.slice(0, 800).trim() + (chunk.chunkText.length > 800 ? "\u2026" : ""),
+          url: citationUrl,
           score,
           tags: chunk.tags,
         } satisfies SearchHit;
   }
 }
+export const createChunkSearchProvider = (
+  stateDir: string,
+  options?: ChunkSearchProviderOptions,
+): ChunkSearchProvider => {
   return new ChunkSearchProvider(
     new FileChunkStore(stateDir),
     new StaticCatalogSearchProvider(defaultSourceCatalog, new FileReindexQueueStore(stateDir)),
+    options ?? {},
   );
 };

packages/trask-http/src/router.test.ts CHANGED Viewed

@@ -7,10 +7,10 @@ import path from "node:path";
 import { JsonTraskQueryRepository } from "@openkotor/persistence";
 import type { SourceDescriptor } from "@openkotor/retrieval";
 import type {
-  ResearchWizardAnswer,
-  ResearchWizardProgressEvent,
-  ResearchWizardQueryHandler,
-  ResearchWizardQueryOptions,
 } from "@openkotor/trask";
 import express from "express";
 import request from "supertest";
@@ -28,11 +28,11 @@ const mockSource: SourceDescriptor = {
   tags: [],
 };
-const mockWizard: ResearchWizardQueryHandler = {
   async answerQuestion(
     _query: string,
-    onProgress?: (event: ResearchWizardProgressEvent) => void,
-  ): Promise<ResearchWizardAnswer> {
     onProgress?.({ phase: "gather", detail: "test" });
     return {
       answer: "Stub answer.\n\nSources\n1. Test Source - https://example.com",
@@ -76,7 +76,7 @@ test("GET /session returns anonymous payload by default", async () => {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
-        researchWizard: mockWizard,
         queryRepository,
       },
       auth: {
@@ -112,7 +112,7 @@ test("GET /session uses getSession override", async () => {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
-        researchWizard: mockWizard,
         queryRepository,
       },
       auth: {
@@ -153,7 +153,7 @@ test("POST /auth/logout returns 204 by default", async () => {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
-        researchWizard: mockWizard,
         queryRepository,
       },
       auth: {
@@ -187,7 +187,7 @@ test("GET /sources returns JSON when authenticated", async () => {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
-        researchWizard: mockWizard,
         queryRepository,
       },
       auth: {
@@ -223,7 +223,7 @@ test("GET /models defaults to Auto only when the wizard has no live model list",
     createTraskHttpRouter({
       runtime: {
         searchProvider,
-        researchWizard: mockWizard,
         queryRepository,
       },
       auth: {
@@ -235,7 +235,7 @@ test("GET /models defaults to Auto only when the wizard has no live model list",
   const res = await request(app).get("/api/trask/models");
   assert.equal(res.status, 200);
   assert.deepEqual(res.body.models, [
-    { id: "auto", label: "Auto", provider: "ResearchWizard fallback", recommended: true },
   ]);
 });
@@ -253,12 +253,12 @@ test("GET /models filters out non-free model ids", async () => {
     },
   };
-  const researchWizard = {
     ...mockWizard,
     async listModels() {
       return [
         { id: "openrouter:openrouter/free", label: "Free", provider: "OpenRouter" },
-        { id: "litellm:foo/bar", label: "Paid-ish", provider: "ResearchWizard" },
         { id: "vendor/model:free", label: "Free tag", provider: "Vendor" },
       ];
     },
@@ -271,7 +271,7 @@ test("GET /models filters out non-free model ids", async () => {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
-        researchWizard,
         queryRepository,
       },
       auth: {
@@ -283,13 +283,13 @@ test("GET /models filters out non-free model ids", async () => {
   const res = await request(app).get("/api/trask/models");
   assert.equal(res.status, 200);
   assert.deepEqual(res.body.models, [
-    { id: "auto", label: "Auto", provider: "ResearchWizard fallback", recommended: true },
     { id: "openrouter:openrouter/free", label: "Free", provider: "OpenRouter" },
     { id: "vendor/model:free", label: "Free tag", provider: "Vendor" },
   ]);
 });
-test("POST /ask rejects model ids outside the current ResearchWizard list", async () => {
   const queryRepository = new JsonTraskQueryRepository(path.join(tmpDir, `qmr-${Math.random()}.json`));
   const searchProvider = {
     async listSources() {
@@ -310,7 +310,7 @@ test("POST /ask rejects model ids outside the current ResearchWizard list", asyn
     createTraskHttpRouter({
       runtime: {
         searchProvider,
-        researchWizard: mockWizard,
         queryRepository,
       },
       auth: {
@@ -348,7 +348,7 @@ test("POST /ask persists, returns 202, completes asynchronously", async () => {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
-        researchWizard: mockWizard,
         queryRepository,
       },
       auth: {
@@ -388,8 +388,8 @@ test("POST /ask forwards source weights to the research wizard", async () => {
       return { queuedSourceIds: [] as string[], mode: "file-queue" as const };
     },
   };
-  let receivedOptions: ResearchWizardQueryOptions | undefined;
-  const weightedWizard: ResearchWizardQueryHandler = {
     async answerQuestion(_query, _onProgress, options) {
       receivedOptions = options;
       return {
@@ -408,7 +408,7 @@ test("POST /ask forwards source weights to the research wizard", async () => {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
-        researchWizard: weightedWizard,
         queryRepository,
       },
       auth: {
@@ -455,7 +455,7 @@ test("GET /thread/:threadId returns persisted rows for the authenticated user",
     createTraskHttpRouter({
       runtime: {
         searchProvider,
-        researchWizard: mockWizard,
         queryRepository,
       },
       auth: {
@@ -510,7 +510,7 @@ test("GET /thread/:threadId requires authentication", async () => {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
-        researchWizard: mockWizard,
         queryRepository,
       },
       auth: {
@@ -554,7 +554,7 @@ test("anonymous persistQueries=false skips disk but still returns threadId", asy
     createTraskHttpRouter({
       runtime: {
         searchProvider,
-        researchWizard: mockWizard,
         queryRepository,
       },
       auth: {

 import { JsonTraskQueryRepository } from "@openkotor/persistence";
 import type { SourceDescriptor } from "@openkotor/retrieval";
 import type {
+  WebResearchAnswer,
+  WebResearchProgressEvent,
+  WebResearchQueryHandler,
+  WebResearchQueryOptions,
 } from "@openkotor/trask";
 import express from "express";
 import request from "supertest";
   tags: [],
 };
+const mockWizard: WebResearchQueryHandler = {
   async answerQuestion(
     _query: string,
+    onProgress?: (event: WebResearchProgressEvent) => void,
+  ): Promise<WebResearchAnswer> {
     onProgress?.({ phase: "gather", detail: "test" });
     return {
       answer: "Stub answer.\n\nSources\n1. Test Source - https://example.com",
     createTraskHttpRouter({
       runtime: {
         searchProvider,
+        webResearch: mockWizard,
         queryRepository,
       },
       auth: {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
+        webResearch: mockWizard,
         queryRepository,
       },
       auth: {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
+        webResearch: mockWizard,
         queryRepository,
       },
       auth: {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
+        webResearch: mockWizard,
         queryRepository,
       },
       auth: {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
+        webResearch: mockWizard,
         queryRepository,
       },
       auth: {
   const res = await request(app).get("/api/trask/models");
   assert.equal(res.status, 200);
   assert.deepEqual(res.body.models, [
+    { id: "auto", label: "Auto", provider: "Trask web research", recommended: true },
   ]);
 });
     },
   };
+  const webResearch = {
     ...mockWizard,
     async listModels() {
       return [
         { id: "openrouter:openrouter/free", label: "Free", provider: "OpenRouter" },
+        { id: "litellm:foo/bar", label: "Paid-ish", provider: "Trask web research" },
         { id: "vendor/model:free", label: "Free tag", provider: "Vendor" },
       ];
     },
     createTraskHttpRouter({
       runtime: {
         searchProvider,
+        webResearch,
         queryRepository,
       },
       auth: {
   const res = await request(app).get("/api/trask/models");
   assert.equal(res.status, 200);
   assert.deepEqual(res.body.models, [
+    { id: "auto", label: "Auto", provider: "Trask web research", recommended: true },
     { id: "openrouter:openrouter/free", label: "Free", provider: "OpenRouter" },
     { id: "vendor/model:free", label: "Free tag", provider: "Vendor" },
   ]);
 });
+test("POST /ask rejects model ids outside the current web research model list", async () => {
   const queryRepository = new JsonTraskQueryRepository(path.join(tmpDir, `qmr-${Math.random()}.json`));
   const searchProvider = {
     async listSources() {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
+        webResearch: mockWizard,
         queryRepository,
       },
       auth: {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
+        webResearch: mockWizard,
         queryRepository,
       },
       auth: {
       return { queuedSourceIds: [] as string[], mode: "file-queue" as const };
     },
   };
+  let receivedOptions: WebResearchQueryOptions | undefined;
+  const weightedWizard: WebResearchQueryHandler = {
     async answerQuestion(_query, _onProgress, options) {
       receivedOptions = options;
       return {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
+        webResearch: weightedWizard,
         queryRepository,
       },
       auth: {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
+        webResearch: mockWizard,
         queryRepository,
       },
       auth: {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
+        webResearch: mockWizard,
         queryRepository,
       },
       auth: {
     createTraskHttpRouter({
       runtime: {
         searchProvider,
+        webResearch: mockWizard,
         queryRepository,
       },
       auth: {

packages/trask-http/src/router.ts CHANGED Viewed

@@ -14,10 +14,10 @@ import { normalizeAuthHandlerError, type AuthHandlerThrown } from "@openkotor/pl
 import type { SearchProvider } from "@openkotor/retrieval";
 import type {
-  ResearchWizardModelOption,
-  ResearchWizardProgressEvent,
-  ResearchWizardQueryHandler,
-  ResearchWizardSourcePreference,
 } from "@openkotor/trask";
 import { Router, type Request, type Response, type RequestHandler } from "express";
@@ -28,7 +28,7 @@ export interface TraskHttpRuntime {
   searchProvider: SearchProvider;
-  researchWizard: ResearchWizardQueryHandler;
   queryRepository: JsonTraskQueryRepository;
@@ -68,6 +68,12 @@ export interface TraskHttpSessionDto {
   oauthAvailable?: boolean;
   discord?: { id: string; username: string; displayName: string };
 }
@@ -184,7 +190,7 @@ const mapTraskQueryRecord = (record: TraskQueryRecord): TraskQueryRecord => ({
 });
 const mapDescriptorsToSourceRecords = (
-  sources: ResearchWizardProgressEvent["sources"],
 ): readonly TraskSourceRecord[] => {
   if (!sources?.length) return [];
   return sources.map((s) => ({
@@ -215,11 +221,11 @@ const appendLiveTrace = async (
 const CANCELED_QUERY_ERROR = "Canceled by newer request.";
-const DEFAULT_TRASK_MODEL_OPTIONS: readonly ResearchWizardModelOption[] = [
-  { id: "auto", label: "Auto", provider: "ResearchWizard fallback", recommended: true },
 ];
-const mapModelOption = (option: ResearchWizardModelOption): ResearchWizardModelOption => ({
   id: option.id,
   label: option.label,
   provider: option.provider,
@@ -232,11 +238,11 @@ const isFreeModelId = (id: string): boolean => {
 };
 const resolveTraskModelOptions = async (
-  researchWizard: ResearchWizardQueryHandler,
-): Promise<readonly ResearchWizardModelOption[]> => {
-  const dynamicModels = researchWizard.listModels ? await researchWizard.listModels() : [];
   const seen = new Set<string>();
-  const models: ResearchWizardModelOption[] = [];
   for (const option of [...DEFAULT_TRASK_MODEL_OPTIONS, ...dynamicModels]) {
     const id = option.id.trim();
     if (!id || seen.has(id) || !isFreeModelId(id)) continue;
@@ -244,7 +250,7 @@ const resolveTraskModelOptions = async (
     models.push(mapModelOption({
       id,
       label: option.label.trim() || id,
-      provider: option.provider.trim() || "ResearchWizard",
       ...(option.recommended ? { recommended: true } : {}),
     }));
   }
@@ -309,14 +315,14 @@ const normalizeTraskModelFromBody = (raw: ScalarOrObject | undefined): string |
   return model;
 };
-const normalizeSourcePreferencesFromBody = (raw: ScalarOrObject | undefined): ResearchWizardSourcePreference[] | undefined => {
   if (raw === undefined || raw === null) return undefined;
   if (!Array.isArray(raw)) {
     throw Object.assign(new Error("sourceWeights must be an array when provided."), { status: 422 });
   }
   return raw
-    .map((entry): ResearchWizardSourcePreference | undefined => {
       if (!entry || typeof entry !== "object" || Array.isArray(entry)) return undefined;
       const value = entry as Record<string, unknown>;
       const url = typeof value.url === "string" ? value.url.trim() : "";
@@ -330,7 +336,7 @@ const normalizeSourcePreferencesFromBody = (raw: ScalarOrObject | undefined): Re
         enabled: value.enabled !== false,
       };
     })
-    .filter((entry): entry is ResearchWizardSourcePreference => entry !== undefined);
 };
@@ -394,7 +400,7 @@ export const createTraskHttpRouter = <TUser extends TraskHttpUser = TraskHttpUse
     options.auth.requireAuth(async (_req, res, _user) => {
       try {
         const trask = requireRuntime();
-        const models = await resolveTraskModelOptions(trask.researchWizard);
         res.json({ models: models.map(mapModelOption) });
       } catch (err) {
         handleTraskError(res, err as AuthHandlerThrown);
@@ -605,7 +611,7 @@ export const createTraskHttpRouter = <TUser extends TraskHttpUser = TraskHttpUse
       let threadId: string;
       let model: string | undefined;
-      let sourcePreferences: ResearchWizardSourcePreference[] | undefined;
       const persist = shouldPersistForUser(user);
@@ -624,9 +630,9 @@ export const createTraskHttpRouter = <TUser extends TraskHttpUser = TraskHttpUse
         sourcePreferences = normalizeSourcePreferencesFromBody(body.sourceWeights);
         if (model) {
-          const allowedModels = await resolveTraskModelOptions(trask.researchWizard);
           if (!allowedModels.some((option) => option.id === model)) {
-            throw Object.assign(new Error("model is not available in the current ResearchWizard fallback list."), { status: 422 });
           }
         }
@@ -652,7 +658,7 @@ export const createTraskHttpRouter = <TUser extends TraskHttpUser = TraskHttpUse
       if (!persist) {
         try {
-          const result = await trask.researchWizard.answerQuestion(query, undefined, {
             ...(model ? { model } : {}),
             ...(sourcePreferences ? { sourcePreferences } : {}),
           });
@@ -735,7 +741,7 @@ export const createTraskHttpRouter = <TUser extends TraskHttpUser = TraskHttpUse
       void (async () => {
         try {
-          const result = await trask.researchWizard.answerQuestion(query, async (ev) => {
             await appendLiveTrace(trask.queryRepository, queryId, {
               phase: ev.phase,
               ...(ev.detail !== undefined ? { detail: ev.detail } : {}),

 import type { SearchProvider } from "@openkotor/retrieval";
 import type {
+  WebResearchModelOption,
+  WebResearchProgressEvent,
+  WebResearchQueryHandler,
+  WebResearchSourcePreference,
 } from "@openkotor/trask";
 import { Router, type Request, type Response, type RequestHandler } from "express";
   searchProvider: SearchProvider;
+  webResearch: WebResearchQueryHandler;
   queryRepository: JsonTraskQueryRepository;
   oauthAvailable?: boolean;
+  /** False when startup LLM probe found no working provider in the fallback chain. */
+  researchAvailable?: boolean;
+  researchUnavailableReason?: string;
   discord?: { id: string; username: string; displayName: string };
 }
 });
 const mapDescriptorsToSourceRecords = (
+  sources: WebResearchProgressEvent["sources"],
 ): readonly TraskSourceRecord[] => {
   if (!sources?.length) return [];
   return sources.map((s) => ({
 const CANCELED_QUERY_ERROR = "Canceled by newer request.";
+const DEFAULT_TRASK_MODEL_OPTIONS: readonly WebResearchModelOption[] = [
+  { id: "auto", label: "Auto", provider: "Trask web research", recommended: true },
 ];
+const mapModelOption = (option: WebResearchModelOption): WebResearchModelOption => ({
   id: option.id,
   label: option.label,
   provider: option.provider,
 };
 const resolveTraskModelOptions = async (
+  webResearch: WebResearchQueryHandler,
+): Promise<readonly WebResearchModelOption[]> => {
+  const dynamicModels = webResearch.listModels ? await webResearch.listModels() : [];
   const seen = new Set<string>();
+  const models: WebResearchModelOption[] = [];
   for (const option of [...DEFAULT_TRASK_MODEL_OPTIONS, ...dynamicModels]) {
     const id = option.id.trim();
     if (!id || seen.has(id) || !isFreeModelId(id)) continue;
     models.push(mapModelOption({
       id,
       label: option.label.trim() || id,
+      provider: option.provider.trim() || "WebResearch",
       ...(option.recommended ? { recommended: true } : {}),
     }));
   }
   return model;
 };
+const normalizeSourcePreferencesFromBody = (raw: ScalarOrObject | undefined): WebResearchSourcePreference[] | undefined => {
   if (raw === undefined || raw === null) return undefined;
   if (!Array.isArray(raw)) {
     throw Object.assign(new Error("sourceWeights must be an array when provided."), { status: 422 });
   }
   return raw
+    .map((entry): WebResearchSourcePreference | undefined => {
       if (!entry || typeof entry !== "object" || Array.isArray(entry)) return undefined;
       const value = entry as Record<string, unknown>;
       const url = typeof value.url === "string" ? value.url.trim() : "";
         enabled: value.enabled !== false,
       };
     })
+    .filter((entry): entry is WebResearchSourcePreference => entry !== undefined);
 };
     options.auth.requireAuth(async (_req, res, _user) => {
       try {
         const trask = requireRuntime();
+        const models = await resolveTraskModelOptions(trask.webResearch);
         res.json({ models: models.map(mapModelOption) });
       } catch (err) {
         handleTraskError(res, err as AuthHandlerThrown);
       let threadId: string;
       let model: string | undefined;
+      let sourcePreferences: WebResearchSourcePreference[] | undefined;
       const persist = shouldPersistForUser(user);
         sourcePreferences = normalizeSourcePreferencesFromBody(body.sourceWeights);
         if (model) {
+          const allowedModels = await resolveTraskModelOptions(trask.webResearch);
           if (!allowedModels.some((option) => option.id === model)) {
+            throw Object.assign(new Error("model is not available in the current web research model list."), { status: 422 });
           }
         }
       if (!persist) {
         try {
+          const result = await trask.webResearch.answerQuestion(query, undefined, {
             ...(model ? { model } : {}),
             ...(sourcePreferences ? { sourcePreferences } : {}),
           });
       void (async () => {
         try {
+          const result = await trask.webResearch.answerQuestion(query, async (ev) => {
             await appendLiveTrace(trask.queryRepository, queryId, {
               phase: ev.phase,
               ...(ev.detail !== undefined ? { detail: ev.detail } : {}),

packages/trask/src/community-knowledge.test.ts ADDED Viewed

	@@ -0,0 +1,72 @@

+import assert from "node:assert/strict";
+import { describe, test } from "node:test";
+import type { SearchHit } from "@openkotor/retrieval";
+import {
+  buildCommunityKnowledgeDigest,
+  filterWebArchiveCitationSources,
+  mergeCommunityAndWebSources,
+  searchHitsToCommunitySources,
+} from "./community-knowledge.js";
+/** Discord-kind search hit fixture; only `url` varies between tests. */
+const sampleHit = (url: string): SearchHit => ({
+  sourceId: "approved-discord-knowledge",
+  sourceName: "Approved Discord Knowledge",
+  kind: "discord",
+  title: "#general",
+  snippet: "Revan was a Jedi.",
+  url,
+  score: 3,
+  tags: ["discord"],
+});
+/** Website-kind source descriptor fixture; only `homeUrl` varies between tests. */
+const sampleWebSource = (homeUrl: string) => ({
+  id: "w1",
+  name: "Web",
+  kind: "website" as const,
+  homeUrl,
+  description: "fixture",
+  freshnessPolicy: "live",
+  approvalScope: "global",
+  tags: ["web"],
+});
+describe("community knowledge helpers", () => {
+  test("searchHitsToCommunitySources maps discord permalinks", () => {
+    const sources = searchHitsToCommunitySources([
+      sampleHit("https://discord.com/channels/1/2/3"),
+    ]);
+    assert.equal(sources.length, 1);
+    assert.equal(sources[0]!.kind, "discord");
+    assert.equal(sources[0]!.homeUrl, "https://discord.com/channels/1/2/3");
+  });
+  test("searchHitsToCommunitySources falls back to a default name for blank titles", () => {
+    const sources = searchHitsToCommunitySources([
+      { ...sampleHit("https://discord.com/channels/1/2/3"), title: "   " },
+    ]);
+    assert.equal(sources[0]!.name, "Discord message");
+  });
+  test("buildCommunityKnowledgeDigest includes permalink lines", () => {
+    const digest = buildCommunityKnowledgeDigest([
+      sampleHit("https://discord.com/channels/1/2/3"),
+    ]);
+    assert.match(digest, /Permalink: https:\/\/discord\.com\/channels\/1\/2\/3/);
+  });
+  test("buildCommunityKnowledgeDigest returns empty string for no hits", () => {
+    assert.equal(buildCommunityKnowledgeDigest([]), "");
+  });
+  test("filterWebArchiveCitationSources excludes discord URLs", () => {
+    const web = sampleWebSource("https://deadlystream.com");
+    const discord = searchHitsToCommunitySources([sampleHit("https://discord.com/channels/1/2/3")])[0]!;
+    const filtered = filterWebArchiveCitationSources([web, discord]);
+    assert.equal(filtered.length, 1);
+    assert.equal(filtered[0]!.homeUrl, web.homeUrl);
+  });
+  test("mergeCommunityAndWebSources dedupes by URL", () => {
+    const web = sampleWebSource("https://example.com/a");
+    const merged = mergeCommunityAndWebSources([web], [web]);
+    assert.equal(merged.length, 1);
+  });
+  test("mergeCommunityAndWebSources keeps web sources ahead of community sources", () => {
+    const web = sampleWebSource("https://example.com/a");
+    const discord = searchHitsToCommunitySources([sampleHit("https://discord.com/channels/1/2/3")])[0]!;
+    const merged = mergeCommunityAndWebSources([web], [discord]);
+    assert.equal(merged.length, 2);
+    assert.equal(merged[0]!.homeUrl, web.homeUrl);
+    assert.equal(merged[1]!.kind, "discord");
+  });
+});

packages/trask/src/community-knowledge.ts ADDED Viewed

	@@ -0,0 +1,61 @@

+import type { SearchHit, SourceDescriptor } from "@openkotor/retrieval";
+import { isDiscordCitationUrl } from "@openkotor/retrieval";
+/** Base identifier used as the prefix of the per-hit descriptor ids built by `searchHitsToCommunitySources`. */
+export const COMMUNITY_SOURCE_ID = "approved-discord-knowledge";
+/**
+ * Convert search hits into Discord-kind source descriptors, one per hit, in input order.
+ *
+ * Ids are `${COMMUNITY_SOURCE_ID}-hit-<n>` with n starting at 1. A title that is
+ * blank after trimming becomes "Discord message". The description is the snippet
+ * cut with `slice(0, 280)`, and a "community" tag is appended to a copy of the
+ * hit's tags.
+ */
+export function searchHitsToCommunitySources(hits: readonly SearchHit[]): SourceDescriptor[] {
+  const descriptors: SourceDescriptor[] = [];
+  for (const [position, hit] of hits.entries()) {
+    const trimmedTitle = hit.title.trim();
+    descriptors.push({
+      id: `${COMMUNITY_SOURCE_ID}-hit-${position + 1}`,
+      name: trimmedTitle || "Discord message",
+      kind: "discord",
+      homeUrl: hit.url,
+      description: hit.snippet.slice(0, 280),
+      freshnessPolicy: "live-and-imported",
+      approvalScope: "approved-channels",
+      tags: [...hit.tags, "community"],
+    });
+  }
+  return descriptors;
+}
+/**
+ * Format hits as a numbered plain-text digest.
+ *
+ * Returns "" for an empty list. Otherwise the output is a fixed header line, a
+ * blank line, then for each hit three lines: `[n] <title>`, the snippet, and
+ * `Permalink: <url>`, all joined with newlines.
+ */
+export function buildCommunityKnowledgeDigest(hits: readonly SearchHit[]): string {
+  if (hits.length === 0) return "";
+  const header =
+    "Community context (lower authority than approved web archives; prefer web sources when they conflict):";
+  const entries = hits.map(
+    (hit, position) => `[${position + 1}] ${hit.title}\n${hit.snippet}\nPermalink: ${hit.url}`,
+  );
+  return [header, "", ...entries].join("\n");
+}
+/** Report whether `url` counts as a community citation, as decided by `isDiscordCitationUrl`. */
+export function isCommunityCitationUrl(url: string): boolean {
+  const pointsAtDiscord = isDiscordCitationUrl(url);
+  return pointsAtDiscord;
+}
+/**
+ * Keep only sources whose trimmed `homeUrl` starts with `http://` or `https://`
+ * and is not a community citation URL according to `isCommunityCitationUrl`.
+ */
+export function filterWebArchiveCitationSources(sources: readonly SourceDescriptor[]): SourceDescriptor[] {
+  const isWebArchiveUrl = (rawUrl: string): boolean => {
+    const url = rawUrl.trim();
+    const isHttp = url.startsWith("http://") || url.startsWith("https://");
+    return isHttp && !isCommunityCitationUrl(url);
+  };
+  return sources.filter((source) => isWebArchiveUrl(source.homeUrl));
+}
+/**
+ * Concatenate web sources followed by community sources, keeping the first
+ * occurrence of each URL.
+ *
+ * The comparison key is `homeUrl` trimmed and lower-cased; entries whose key is
+ * empty are dropped. The returned array holds the original descriptor objects.
+ */
+export function mergeCommunityAndWebSources(
+  webSources: readonly SourceDescriptor[],
+  communitySources: readonly SourceDescriptor[],
+): SourceDescriptor[] {
+  const claimedUrls = new Set<string>();
+  return [...webSources, ...communitySources].filter((candidate) => {
+    const normalized = candidate.homeUrl.trim().toLowerCase();
+    if (normalized === "" || claimedUrls.has(normalized)) return false;
+    claimedUrls.add(normalized);
+    return true;
+  });
+}

packages/trask/src/index.ts CHANGED Viewed

@@ -1,3 +1,5 @@
-export * from "./research-wizard.js";
 export * from "./discord-reply-format.js";
 export * from "./proactive-llm.js";

+export * from "./web-research.js";
+export * from "./web-research-subprocess.js";
 export * from "./discord-reply-format.js";
 export * from "./proactive-llm.js";
+export * from "./community-knowledge.js";

packages/trask/src/research-wizard.ts CHANGED Viewed

@@ -1,1271 +1,18 @@
-import OpenAI from "openai";
-import { loadSharedAiConfig, type ResearchWizardRuntimeConfig, type SharedAiConfig } from "@openkotor/config";
-import {
-  isTraskApprovedBaseUrl,
-  isTraskApprovedResearchUrl,
-  sourceUrlMatchesDescriptor,
-  traskApprovedResearchBaseHosts,
-  traskApprovedResearchSources,
-  type SourceDescriptor,
-} from "@openkotor/retrieval";
-import {
-  listHeadlessGptResearcherModels,
-  runHeadlessGptResearcher,
-  type HeadlessAiResearchWizardModelOption,
-} from "./gpt-researcher-subprocess.js";
-export interface ResearchWizardAnswer {
-  answer: string;
-  /** Sources explicitly cited in the final answer shown to users. */
-  approvedSources: readonly SourceDescriptor[];
-  /** Sources retrieved as candidate evidence for the answer/rewrite stage. */
-  retrievedSources: readonly SourceDescriptor[];
-  /** Allowlisted URLs the headless researcher touched while gathering evidence. */
-  visitedUrls: readonly string[];
-}
-export interface ResearchWizardBriefAnswer extends ResearchWizardAnswer {
-  /** Normalized research report text used for proactive semantic gating. */
-  researchReport: string;
-}
-/** Fine-grained phases for Holocron clients polling thread history. */
-export interface ResearchWizardProgressEvent {
-  phase: "gather" | "report" | "sources" | "compose";
-  detail?: string;
-  sources?: readonly SourceDescriptor[];
-}
-export interface ResearchWizardQueryOptions {
-  /** Preferred ai-researchwizard model id, e.g. `openrouter:openrouter/auto` or `litellm:moonshotai/kimi-k2`. */
-  model?: string;
-  /** Optional per-request source enablement and weight hints from Holocron's Source Prioritization dialog. */
-  sourcePreferences?: readonly ResearchWizardSourcePreference[];
-}
-export interface ResearchWizardSourcePreference {
-  name?: string;
-  url: string;
-  weight: number;
-  enabled: boolean;
-}
-export interface ResearchWizardModelOption extends HeadlessAiResearchWizardModelOption {}
-/** Structural type for adapters that only need full Q&A (e.g. Trask HTTP `/ask`). */
-export interface ResearchWizardQueryHandler {
-  answerQuestion(
-    query: string,
-    onProgress?: (event: ResearchWizardProgressEvent) => void,
-    options?: ResearchWizardQueryOptions,
-  ): Promise<ResearchWizardAnswer>;
-  listModels?(): Promise<readonly ResearchWizardModelOption[]>;
-}
-const DEFAULT_RESEARCH_WIZARD_MODELS: readonly ResearchWizardModelOption[] = [
-  { id: "auto", label: "Auto", provider: "ResearchWizard fallback", recommended: true },
-];
-interface ResearchWizardResponsePayload {
-  report?: string | null;
-  research_information?: {
-    source_urls?: readonly string[] | null;
-    cited_urls?: readonly string[] | null;
-    retrieved_urls?: readonly string[] | null;
-    visited_urls?: readonly string[] | null;
-    query_domains?: readonly string[] | null;
-    allowed_url_prefixes?: readonly string[] | null;
-    rejected_source_urls?: readonly string[] | null;
-  };
-}
-const buildResearchTask = (query: string): string => {
-  return query.trim();
-};
-const buildCustomPrompt = (): string => {
-  return [
-    "Answer the user's question as a Discord-native KOTOR assistant reply using only the provided research context.",
-    "Requirements:",
-    "- Lead with the answer, not an introduction.",
-    "- Sound direct, practical, and helpful.",
-    "- Keep the answer concise: at most 3 short paragraphs or 5 compact bullets total before sources.",
-    "- Do not describe your research process, retrieval steps, indexing, backend systems, or source policy unless the user explicitly asks.",
-    "- Include inline numeric citations like [1] tied to concrete claims.",
-    ' - End with the exact heading "Sources" on its own line.',
-    "- Under Sources, list only the sources you cited, each on its own numbered line in the format: 1. Source Name - URL",
-    "- Do not add markdown headings other than the final Sources heading.",
-  ].join("\n");
-};
-const buildCustomPromptBrief = (): string => {
-  return [
-    "Produce a compact research digest for Star Wars: Knights of the Old Republic (KOTOR 1/2) modding questions.",
-    "Constraints:",
-    "- Stay under ~900 words; bullet key facts when possible.",
-    "- Do not narrate tooling, retrieval steps, or how you searched.",
-    "- Prefer actionable answers over background essays.",
-    "- Include inline numeric citations like [1] tied to concrete claims.",
-    ' - End with the exact heading "Sources" on its own line.',
-    "- Under Sources, list only cited sources as numbered lines: 1. Source Name - URL",
-  ].join("\n");
-};
-const normalizeUrl = (value: string): string => value.replace(/\/+$/, "").trim();
-const extractUrls = (value: string): string[] => {
-  const matches = value.match(/[a-z][a-z0-9+.-]*:\/\/[^\s)>\]]+/giu) ?? [];
-  return [...new Set(matches.map((match) => match.replace(/[.,;:!?]+$/, "")))];
-};
-const extractSourceSectionUrls = (value: string): string[] => {
-  const normalized = value.replace(/\r\n/g, "\n");
-  const sourceHeading = /\n(?:#{1,6}\s*)?(?:Sources|References)\s*\n/i;
-  const match = normalized.match(sourceHeading);
-  if (!match || match.index === undefined) {
-    return extractUrls(normalized);
-  }
-  const sourceSection = normalized.slice(match.index + match[0].length);
-  return extractUrls(sourceSection);
-};
-const hostnameHint = (url: string): string => {
-  try {
-    return new URL(url).hostname.replace(/^www\./, "").toLowerCase();
-  } catch {
-    return url.slice(0, 48);
-  }
-};
-/** Dedupe by normalized URL; preserves first-seen order for stable Holocron pulses. */
-const uniqueUrlsPreserveOrder = (urls: readonly string[]): string[] => {
-  const seen = new Set<string>();
-  const out: string[] = [];
-  for (const raw of urls) {
-    const u = normalizeUrl(raw);
-    if (!u || seen.has(u)) continue;
-    seen.add(u);
-    out.push(u);
-  }
-  return out;
-};
-const payloadUrls = (values: readonly string[] | null | undefined): string[] =>
-  Array.isArray(values) ? values.filter((value): value is string => typeof value === "string") : [];
-const isAllowedSourceUrl = (url: string, sourcePool: readonly SourceDescriptor[]): boolean => {
-  if (!isPublicWebCitationUrl(url)) return false;
-  if (sourcePool.some((source) => sourceUrlMatchesDescriptor(url, source))) return true;
-  if (isTraskApprovedResearchUrl(url, sourcePool)) return true;
-  return isTraskApprovedBaseUrl(url);
-};
-/** Visited URLs from ai-researchwizard payload (Holocron live facet pings). */
-const collectVisitedUrlsFromPayload = (
-  payload: ResearchWizardResponsePayload,
-  approvedSources: readonly SourceDescriptor[],
-): string[] => {
-  const info = payload.research_information;
-  return uniqueUrlsPreserveOrder(payloadUrls(info?.visited_urls)).filter((url) =>
-    isAllowedSourceUrl(url, approvedSources),
-  );
-};
-const collectRejectedUrlsFromPayload = (payload: ResearchWizardResponsePayload): string[] => {
-  const rawRejected = payload.research_information?.rejected_source_urls;
-  return Array.isArray(rawRejected)
-    ? uniqueUrlsPreserveOrder(rawRejected.filter((value): value is string => typeof value === "string"))
-    : [];
-};
-const MAX_ARCHIVE_PROBE_EVENTS = 28;
-const emitArchiveProbeEvents = (
-  payload: ResearchWizardResponsePayload,
-  approvedSources: readonly SourceDescriptor[],
-  onProgress?: (event: ResearchWizardProgressEvent) => void,
-): void => {
-  if (!onProgress) return;
-  const urls = collectVisitedUrlsFromPayload(payload, approvedSources).slice(0, MAX_ARCHIVE_PROBE_EVENTS * 2);
-  let emitted = 0;
-  for (const url of urls) {
-    if (emitted >= MAX_ARCHIVE_PROBE_EVENTS) break;
-    const matched = matchApprovedSource(url, approvedSources);
-    const host = hostnameHint(url);
-    onProgress({
-      phase: "gather",
-      detail: matched ? `Facet · ${matched.name}` : `Touch · ${host}`,
-      ...(matched ? { sources: [matched] } : {}),
-    });
-    emitted++;
-  }
-};
-const matchApprovedSource = (
-  url: string,
-  approvedSources: readonly SourceDescriptor[],
-): SourceDescriptor | undefined => {
-  const candidate = normalizeUrl(url);
-  return approvedSources.find((source) => {
-    const homeUrl = normalizeUrl(source.homeUrl);
-    return candidate === homeUrl || candidate.startsWith(`${homeUrl}/`);
-  });
-};
-const sourceUrlLabel = (source: SourceDescriptor, url: string): string => {
-  try {
-    const exact = new URL(url);
-    const base = new URL(source.homeUrl);
-    const exactPath = decodeURIComponent(exact.pathname.replace(/\/+$/u, ""));
-    const basePath = decodeURIComponent(base.pathname.replace(/\/+$/u, ""));
-    if (exactPath === basePath) return source.name;
-    const relativePath = exactPath.startsWith(`${basePath}/`) ? exactPath.slice(basePath.length + 1) : exactPath;
-    const cleaned = relativePath
-      .replace(/^blob\/[^/]+\//u, "")
-      .replace(/^tree\/[^/]+\//u, "")
-      .replace(/^wiki\//u, "")
-      .split("/")
-      .filter(Boolean)
-      .slice(-2)
-      .join("/")
-      .replace(/[-_]+/gu, " ")
-      .trim();
-    if (!cleaned) return source.name;
-    const lineAnchor = exact.hash && /^#L\d+(?:-L\d+)?$/iu.test(exact.hash) ? exact.hash : "";
-    return `${source.name}: ${cleaned}${lineAnchor}`;
-  } catch {
-    return source.name;
-  }
-};
-const exactSourceFromUrl = (url: string, approvedSources: readonly SourceDescriptor[]): SourceDescriptor | undefined => {
-  const exactUrl = normalizeUrl(url);
-  const catalogMatch = matchApprovedSource(url, approvedSources);
-  if (catalogMatch) {
-    const sourceUrl = normalizeUrl(catalogMatch.homeUrl);
-    return {
-      ...catalogMatch,
-      id: exactUrl === sourceUrl ? catalogMatch.id : `${catalogMatch.id}:${exactUrl}`,
-      name: sourceUrlLabel(catalogMatch, exactUrl),
-      homeUrl: exactUrl,
-    };
-  }
-  if (!isTraskApprovedBaseUrl(url)) return undefined;
-  const host = hostnameHint(url);
-  return {
-    id: `approved-web:${exactUrl}`,
-    name: host,
-    kind: "website",
-    homeUrl: exactUrl,
-    description: `Approved web source (${host})`,
-    freshnessPolicy: "live web research",
-    approvalScope: "approved research host",
-    tags: [host],
-  };
-};
-const isCatalogRootUrl = (url: string, approvedSources: readonly SourceDescriptor[]): boolean => {
-  const normalized = normalizeUrl(url);
-  return approvedSources.some((source) => normalizeUrl(source.homeUrl) === normalized);
-};
-const materializeSourcesFromUrls = (
-  urls: readonly string[],
-  sourcePool: readonly SourceDescriptor[],
-): readonly SourceDescriptor[] => {
-  const candidateUrls = uniqueUrlsPreserveOrder(
-    urls.filter((url) => isAllowedSourceUrl(url, sourcePool)),
-  );
-  const matched: SourceDescriptor[] = [];
-  const hasPreciseUrl = candidateUrls.some((url) => !isCatalogRootUrl(url, sourcePool));
-  for (const url of candidateUrls) {
-    if (hasPreciseUrl && isCatalogRootUrl(url, sourcePool)) continue;
-    const source = exactSourceFromUrl(url, sourcePool);
-    if (source && !matched.some((entry) => normalizeUrl(entry.homeUrl) === normalizeUrl(source.homeUrl))) {
-      matched.push(source);
-    }
-  }
-  return matched.slice(0, 6);
-};
-const collectCitedSources = (
-  report: string,
-  approvedSources: readonly SourceDescriptor[],
-  payload: ResearchWizardResponsePayload,
-): readonly SourceDescriptor[] => {
-  const info = payload.research_information;
-  return materializeSourcesFromUrls([
-    ...extractSourceSectionUrls(report),
-    ...payloadUrls(info?.cited_urls),
-    ...payloadUrls(info?.source_urls),
-  ], approvedSources);
-};
-const collectRetrievedSources = (
-  report: string,
-  approvedSources: readonly SourceDescriptor[],
-  payload: ResearchWizardResponsePayload,
-): readonly SourceDescriptor[] => {
-  const info = payload.research_information;
-  return materializeSourcesFromUrls([
-    ...payloadUrls(info?.retrieved_urls),
-    ...payloadUrls(info?.cited_urls),
-    ...payloadUrls(info?.source_urls),
-    ...extractSourceSectionUrls(report),
-  ], approvedSources);
-};
-const collectCitedSourcesFromText = (
-  text: string,
-  sourcePool: readonly SourceDescriptor[],
-): readonly SourceDescriptor[] => materializeSourcesFromUrls(extractSourceSectionUrls(text), sourcePool);
-const normalizeReport = (value: string): string => {
-  return value
-    .replace(/^#\s+.*$/m, "")
-    .replace(/^##\s+Table of Contents[\s\S]*?(?=^##\s+|^Sources\s*$|^#\s+|$)/im, "")
-    .replace(/\n{3,}/g, "\n\n")
-    .trim();
-};
-const formatSourcesSection = (sources: readonly SourceDescriptor[]): string => {
-  return [
-    "Sources",
-    ...sources.map((source, index) => `${index + 1}. ${source.name} - ${source.homeUrl}`),
-  ].join("\n");
-};
-const countPayloadWebUrls = (payload: ResearchWizardResponsePayload): number => {
-  const info = payload.research_information;
-  const urls = uniqueUrlsPreserveOrder([
-    ...payloadUrls(info?.cited_urls),
-    ...payloadUrls(info?.retrieved_urls),
-    ...payloadUrls(info?.visited_urls),
-    ...payloadUrls(info?.source_urls),
-  ]);
-  return urls.filter((url) => isPublicWebCitationUrl(url)).length;
-};
-const isSynthesisFailureReport = (report: string, payload: ResearchWizardResponsePayload): boolean => {
-  const normalized = report.trim();
-  const webUrlCount = countPayloadWebUrls(payload);
-  if (webUrlCount >= MIN_HOLOCRON_WEB_CITATIONS) {
-    return /^i could not complete live archive synthesis\b/iu.test(normalized);
-  }
-  if (/^i could not complete live archive synthesis\b/iu.test(normalized)) {
-    return true;
-  }
-  if (
-    /^-\s+\S+.*is an approved archive page that may answer questions about/iu.test(normalized)
-  ) {
-    return true;
-  }
-  return false;
-};
-const sourceOnlyFallbackAnswer = (query: string, sources: readonly SourceDescriptor[]): string => {
-  if (sources.length === 0) return "I could not complete live archive synthesis for this question right now.";
-  const topic = query.trim().replace(/\?+$/u, "") || "this question";
-  return [
-    `I found candidate sources for ${topic}, but I could not support a grounded answer from the retrieved evidence.`,
-    "Review the sources below or try a narrower wording.",
-    "",
-    formatSourcesSection(sources),
-  ].join("\n");
-};
-const DEFAULT_REWRITE_TIMEOUT_MS = 15_000;
-const MAX_REWRITE_ATTEMPTS = 2;
-const normalizePreferredRewriteModel = (model: string | undefined): string | undefined => {
-  const trimmed = model?.trim();
-  if (!trimmed) return undefined;
-  if (trimmed.startsWith("litellm:")) return trimmed.slice("litellm:".length).trim() || undefined;
-  if (trimmed.startsWith("openrouter:")) return trimmed.slice("openrouter:".length).trim() || undefined;
-  return trimmed;
-};
-const withTimeout = async <T>(promise: Promise<T>, timeoutMs: number): Promise<T> => {
-  return await new Promise<T>((resolve, reject) => {
-    const timer = setTimeout(() => {
-      reject(new Error(`rewrite timed out after ${timeoutMs}ms`));
-    }, timeoutMs);
-    void promise.then(
-      (value) => {
-        clearTimeout(timer);
-        resolve(value);
-      },
-      (error: unknown) => {
-        clearTimeout(timer);
-        reject(error);
-      },
-    );
-  });
-};
-const fallbackDiscordRewrite = (
-  query: string,
-  report: string,
-  sources: readonly SourceDescriptor[],
-): string => {
-  if (sources.length === 0) {
-    return degradedAnswerFallback(query, sources);
-  }
-  const normalized = normalizeReport(report);
-  if (/^i could not complete live archive synthesis\b/iu.test(normalized)) {
-    return sourceOnlyFallbackAnswer(query, sources);
-  }
-  const sourceIndexByUrl = new Map<string, number>(
-    sources.map((source, index) => [normalizeUrl(source.homeUrl), index + 1]),
-  );
-  const [bodyOnlyCandidate = ""] = normalized.split(/\n(?:#{1,6}\s*)?(?:Sources|References)\s*\n/i, 1);
-  const bodyOnly = bodyOnlyCandidate
-    .replace(/\[([^\]]+)\]\((https?:\/\/[^)]+)\)/g, (_match, text: string, url: string) => {
-      const matchedSource = matchApprovedSource(url, sources);
-      const citationIndex = matchedSource ? sourceIndexByUrl.get(normalizeUrl(matchedSource.homeUrl)) : undefined;
-      return citationIndex ? `${text} [${citationIndex}]` : text;
-    })
-    .replace(/^#{1,6}\s+.*$/gm, "")
-    .replace(/^\|.*\|$/gm, "")
-    .replace(/\*+/g, "")
-    .replace(/\n{3,}/g, "\n\n")
-    .trim();
-  const paragraphs = bodyOnly
-    .split(/\n{2,}/)
-    .map((paragraph) => paragraph.trim())
-    .filter((paragraph) => paragraph.length > 0);
-  const selected: string[] = [];
-  let totalLength = 0;
-  for (const paragraph of paragraphs) {
-    if (selected.length >= 2) break;
-    if (totalLength + paragraph.length > 900 && selected.length > 0) break;
-    selected.push(paragraph);
-    totalLength += paragraph.length;
-  }
-  let summary = selected.join("\n\n").trim();
-  if (!summary) {
-    summary = bodyOnly.slice(0, 900).trim();
-  }
-  if (sources.length > 0 && !/\[\d+\]/.test(summary)) {
-    summary = `${summary} [1]`.trim();
-  }
-  return sources.length > 0 ? `${summary}\n\n${formatSourcesSection(sources)}` : summary;
-};
-const fallbackDiscordBrief = (query: string, report: string, sources: readonly SourceDescriptor[]): string => {
-  if (sources.length === 0) {
-    return degradedAnswerFallback(query, sources);
-  }
-  const normalized = normalizeReport(report);
-  if (/^i could not complete live archive synthesis\b/iu.test(normalized)) {
-    return sourceOnlyFallbackAnswer(query, sources);
-  }
-  const sourceIndexByUrl = new Map<string, number>(
-    sources.map((source, index) => [normalizeUrl(source.homeUrl), index + 1]),
-  );
-  const [bodyOnlyCandidate = ""] = normalized.split(/\n(?:#{1,6}\s*)?(?:Sources|References)\s*\n/i, 1);
-  const bodyOnly = bodyOnlyCandidate
-    .replace(/\[([^\]]+)\]\((https?:\/\/[^)]+)\)/g, (_match, text: string, url: string) => {
-      const matchedSource = matchApprovedSource(url, sources);
-      const citationIndex = matchedSource ? sourceIndexByUrl.get(normalizeUrl(matchedSource.homeUrl)) : undefined;
-      return citationIndex ? `${text} [${citationIndex}]` : text;
-    })
-    .replace(/^#{1,6}\s+.*$/gm, "")
-    .replace(/\*+/g, "")
-    .replace(/\n{3,}/g, "\n\n")
-    .trim();
-  const firstChunk = bodyOnly.split(/\n{2,}/)[0]?.trim() ?? bodyOnly;
-  let summary = firstChunk.slice(0, 420).trim();
-  if (!summary) {
-    summary = bodyOnly.slice(0, 420).trim();
-  }
-  if (sources.length > 0 && !/\[\d+\]/.test(summary)) {
-    summary = `${summary} [1]`.trim();
-  }
-  return sources.length > 0 ? `${summary}\n\n${formatSourcesSection(sources)}` : summary;
-};
-const degradedAnswerFallback = (_query: string, _approvedSources: readonly SourceDescriptor[]): string => {
-  return "I could not complete live archive synthesis for this question right now.";
-};
-const normalizePreferenceUrl = (url: string): URL | undefined => {
-  try {
-    return new URL(url.trim().replace(/\/+$/, ""));
-  } catch {
-    return undefined;
-  }
-};
-const preferenceMatchesSource = (preference: ResearchWizardSourcePreference, source: SourceDescriptor): boolean => {
-  const preferenceUrl = normalizePreferenceUrl(preference.url);
-  const sourceUrl = normalizePreferenceUrl(source.homeUrl);
-  if (preferenceUrl && sourceUrl) {
-    const preferenceHost = preferenceUrl.hostname.replace(/^www\./, "").toLowerCase();
-    const sourceHost = sourceUrl.hostname.replace(/^www\./, "").toLowerCase();
-    const preferencePath = preferenceUrl.pathname.replace(/\/+$/, "");
-    const sourcePath = sourceUrl.pathname.replace(/\/+$/, "");
-    if (preferenceHost === sourceHost && (preferencePath === "" || sourcePath === preferencePath || sourcePath.startsWith(`${preferencePath}/`))) {
-      return true;
-    }
-    if (preferenceHost === sourceHost && preferenceUrl.pathname === "/") {
-      return true;
-    }
-  }
-  const preferenceName = preference.name?.trim().toLowerCase();
-  return Boolean(preferenceName && preferenceName === source.name.trim().toLowerCase());
-};
-const applySourcePreferences = (
-  approvedSources: readonly SourceDescriptor[],
-  preferences?: readonly ResearchWizardSourcePreference[],
-): readonly SourceDescriptor[] => {
-  if (!preferences?.length) return approvedSources;
-  const ranked = approvedSources
-    .map((source, index) => {
-      const preference = preferences.find((entry) => preferenceMatchesSource(entry, source));
-      return {
-        source,
-        index,
-        enabled: preference ? preference.enabled : true,
-        weight: preference && Number.isFinite(preference.weight) ? preference.weight : 1,
-      };
-    })
-    .filter((entry) => entry.enabled)
-    .sort((left, right) => right.weight - left.weight || left.index - right.index)
-    .map((entry) => entry.source);
-  return ranked;
-};
-type ResearchQueryIntent = "tooling" | "technical" | "lore" | "general";
-const TOOLING_QUERY_TERMS = [
-  "mdlops",
-  "mdledit",
-  "kotormax",
-  "kotorblender",
-  "pykotor",
-  "xoreos",
-  "reone",
-  "tslpatcher",
-  "toolchain",
-  "modding",
-  "tool",
-  "script",
-  "gff",
-  "2da",
-  "tlk",
-  "nss",
-  "ncs",
-  "utc",
-  "uti",
-  "mdl",
-  "mdx",
-  "texture",
-  "convert",
-  "blender",
-  "3ds",
-];
-const TECHNICAL_QUERY_TERMS = [
-  "widescreen",
-  "resolution",
-  "hud",
-  "screen",
-  "crash",
-  "compatibility",
-  "steam",
-  "windows",
-  "linux",
-  "mac",
-  "save",
-  "saves",
-  "install",
-  "launcher",
-  "driver",
-  "movies",
-  "cutscene",
-  "graphics",
-  "aspect",
-];
-const LORE_QUERY_TERMS = [
-  "bastila",
-  "revan",
-  "malak",
-  "shan",
-  "jedi",
-  "sith",
-  "rakata",
-  "star forge",
-  "temple summit",
-  "companion",
-  "romance",
-  "story",
-  "lore",
-];
-const LORE_SOURCE_IDS = new Set(["wikipedia-kotor", "strategywiki-kotor"]);
-const queryIncludesAny = (query: string, terms: readonly string[]): boolean => {
-  const lowered = query.toLowerCase();
-  return terms.some((term) => lowered.includes(term));
-};
-const classifyQueryIntent = (query: string): ResearchQueryIntent => {
-  const lowered = query.toLowerCase();
-  if (queryIncludesAny(lowered, TOOLING_QUERY_TERMS)) return "tooling";
-  if (queryIncludesAny(lowered, TECHNICAL_QUERY_TERMS)) return "technical";
-  if (queryIncludesAny(lowered, LORE_QUERY_TERMS)) return "lore";
-  return "general";
-};
-const routeSourcesForQuery = (
-  query: string,
-  approvedSources: readonly SourceDescriptor[],
-): readonly SourceDescriptor[] => {
-  const intent = classifyQueryIntent(query);
-  if (intent === "tooling" || intent === "technical") {
-    const filtered = approvedSources.filter((source) => !LORE_SOURCE_IDS.has(source.id));
-    return filtered.length > 0 ? filtered : approvedSources;
-  }
-  if (intent === "lore") {
-    return [
-      ...approvedSources.filter((source) => LORE_SOURCE_IDS.has(source.id)),
-      ...approvedSources.filter((source) => !LORE_SOURCE_IDS.has(source.id)),
-    ];
-  }
-  return approvedSources;
-};
-const mergeSourcesPreserveOrder = (...groups: readonly (readonly SourceDescriptor[])[]): SourceDescriptor[] => {
-  const merged: SourceDescriptor[] = [];
-  const seen = new Set<string>();
-  for (const group of groups) {
-    for (const source of group) {
-      const key = normalizeUrl(source.homeUrl);
-      if (seen.has(key)) continue;
-      seen.add(key);
-      merged.push(source);
-    }
-  }
-  return merged;
-};
-const normalizeMatchToken = (token: string): string => {
-  const lowered = token.toLowerCase();
-  if (lowered.length <= 6) return lowered;
-  return lowered.slice(0, 6);
-};
-const tokenizeQuery = (query: string): string[] =>
-  [...new Set(
-    query
-      .toLowerCase()
-      .replace(/[^\p{L}\p{N}\s-]/gu, " ")
-      .split(/\s+/)
-      .filter((token) => token.length >= 4)
-      .map(normalizeMatchToken),
-  )];
-/** Citations must be real public web pages on the approved allowlist (live GPTR research only). */
-const isPublicWebCitationUrl = (url: string): boolean => {
-  if (url.startsWith("local://") || url.startsWith("discord://")) return false;
-  try {
-    const parsed = new URL(url);
-    return parsed.protocol === "https:" || parsed.protocol === "http:";
-  } catch {
-    return false;
-  }
-};
-const filterPublicWebCitationSources = (sources: readonly SourceDescriptor[]): SourceDescriptor[] =>
-  sources.filter((source) => isPublicWebCitationUrl(source.homeUrl));
-/** Holocron e2e and product policy: answers must ground on multiple approved web sources. */
-export const MIN_HOLOCRON_WEB_CITATIONS = 2;
-const collectWebEvidenceSources = (
-  query: string,
-  report: string,
-  approvedSources: readonly SourceDescriptor[],
-  payload: ResearchWizardResponsePayload,
-): readonly SourceDescriptor[] => {
-  const pool = mergeSourcesPreserveOrder(
-    collectRetrievedSources(report, approvedSources, payload),
-    collectCitedSources(report, approvedSources, payload),
-    materializeSourcesFromUrls(collectVisitedUrlsFromPayload(payload, approvedSources), approvedSources),
-  );
-  return rerankEvidenceSources(query, filterPublicWebCitationSources(pool));
-};
-const ensureMinimumWebCitations = (
-  query: string,
-  cited: readonly SourceDescriptor[],
-  evidence: readonly SourceDescriptor[],
-  payload?: ResearchWizardResponsePayload,
-  approvedSources: readonly SourceDescriptor[] = [],
-): readonly SourceDescriptor[] => {
-  const info = payload?.research_information;
-  const payloadBacked = payload
-    ? materializeSourcesFromUrls(
-      uniqueUrlsPreserveOrder([
-        ...payloadUrls(info?.cited_urls),
-        ...payloadUrls(info?.retrieved_urls),
-        ...payloadUrls(info?.visited_urls),
-        ...payloadUrls(info?.source_urls),
-      ]),
-      approvedSources,
-    )
-    : [];
-  const merged = rerankEvidenceSources(
-    query,
-    mergeSourcesPreserveOrder(cited, evidence, payloadBacked),
-  );
-  const webOnly = filterPublicWebCitationSources(merged);
-  if (webOnly.length >= MIN_HOLOCRON_WEB_CITATIONS) {
-    return webOnly.slice(0, 8);
-  }
-  const padded = rerankEvidenceSources(
-    query,
-    mergeSourcesPreserveOrder(webOnly, filterPublicWebCitationSources(evidence), payloadBacked),
-  );
-  return padded.length >= MIN_HOLOCRON_WEB_CITATIONS
-    ? padded.slice(0, 8)
-    : filterPublicWebCitationSources(payloadBacked).slice(0, 8);
-};
-const composeAnswerFromWebSources = (query: string, sources: readonly SourceDescriptor[]): string => {
-  const webSources = filterPublicWebCitationSources(sources).slice(0, 5);
-  if (webSources.length === 0) {
-    return sourceOnlyFallbackAnswer(query, sources);
-  }
-  return sourceOnlyFallbackAnswer(query, webSources);
-};
-const sourceMatchesQuery = (source: SourceDescriptor, query: string): boolean => {
-  const tokens = tokenizeQuery(query);
-  if (tokens.length === 0) return false;
-  const haystack = `${source.name} ${source.description ?? ""} ${source.homeUrl}`.toLowerCase();
-  let hits = 0;
-  for (const token of tokens) {
-    if (haystack.includes(token)) hits += 1;
-  }
-  return hits >= Math.min(2, tokens.length);
-};
-const sourceRelevanceScore = (source: SourceDescriptor, query: string): number => {
-  const tokens = tokenizeQuery(query);
-  if (tokens.length === 0) return 1;
-  const haystack = [
-    source.name,
-    source.description,
-    source.homeUrl,
-    ...(source.tags ?? []),
-  ].join(" ").toLowerCase();
-  let hits = 0;
-  for (const token of tokens) {
-    if (haystack.includes(token)) hits += 1;
-  }
-  const titleBonus = tokens.some((token) => source.name.toLowerCase().includes(token)) ? 2 : 0;
-  const urlBonus = tokens.some((token) => source.homeUrl.toLowerCase().includes(token)) ? 1 : 0;
-  return hits * 2 + titleBonus + urlBonus;
-};
-const rerankEvidenceSources = (query: string, sources: readonly SourceDescriptor[]): readonly SourceDescriptor[] => {
-  const tokens = tokenizeQuery(query);
-  const ranked = sources
-    .map((source, index) => ({
-      source,
-      index,
-      score: sourceRelevanceScore(source, query),
-    }))
-    .sort((left, right) => right.score - left.score || left.index - right.index);
-  if (tokens.length === 0) {
-    return ranked.map((entry) => entry.source).slice(0, 4);
-  }
-  const strong = ranked.filter((entry) => entry.score >= 2).map((entry) => entry.source);
-  return strong.slice(0, 8);
-};
-const resolveWebSourcesForFailedSynthesis = (
-  query: string,
-  retrievedSources: readonly SourceDescriptor[],
-): readonly SourceDescriptor[] => {
-  const candidates = filterPublicWebCitationSources(retrievedSources);
-  const matched = candidates.filter((source) => sourceMatchesQuery(source, query));
-  return (matched.length > 0 ? matched : candidates).slice(0, 5);
-};
-const researchDomainsForSources = (sources: readonly SourceDescriptor[]): string[] => {
-  const enabledHosts = new Set<string>();
-  for (const source of sources) {
-    try {
-      const host = new URL(source.homeUrl).hostname.replace(/^www\./, "").toLowerCase();
-      const baseHost = traskApprovedResearchBaseHosts.find((base) => host === base || host.endsWith(`.${base}`));
-      if (baseHost) enabledHosts.add(baseHost);
-    } catch {
-      continue;
-    }
-  }
-  return [...enabledHosts];
-};
-const HEARTBEAT_MS = 8000;
-const withProgressHeartbeat = async <T>(
-  phase: ResearchWizardProgressEvent["phase"],
-  makeDetail: (elapsedMs: number) => string,
-  onProgress: ((event: ResearchWizardProgressEvent) => void) | undefined,
-  work: () => Promise<T>,
-): Promise<T> => {
-  if (!onProgress) {
-    return await work();
-  }
-  const startedAt = Date.now();
-  let lastBucket = -1;
-  const emit = () => {
-    const elapsed = Date.now() - startedAt;
-    const bucket = Math.floor(elapsed / HEARTBEAT_MS);
-    if (bucket === lastBucket) return;
-    lastBucket = bucket;
-    onProgress({ phase, detail: makeDetail(elapsed) });
-  };
-  emit();
-  const timer = setInterval(emit, HEARTBEAT_MS);
-  try {
-    return await work();
-  } finally {
-    clearInterval(timer);
-  }
-};
-export class ResearchWizardClient implements ResearchWizardQueryHandler {
-  private readonly openAiClient: OpenAI | null;
-  public constructor(
-    private readonly config: ResearchWizardRuntimeConfig,
-    private readonly aiConfig: SharedAiConfig,
-    private readonly approvedSources: readonly SourceDescriptor[] = traskApprovedResearchSources,
-  ) {
-    this.openAiClient = aiConfig.openAiApiKey
-      ? new OpenAI({
-          apiKey: aiConfig.openAiApiKey,
-          ...(aiConfig.openAiBaseUrl ? { baseURL: aiConfig.openAiBaseUrl } : {}),
-          ...(aiConfig.openAiDefaultHeaders ? { defaultHeaders: aiConfig.openAiDefaultHeaders } : {}),
-        })
-      : null;
-  }
-  public async listModels(): Promise<readonly ResearchWizardModelOption[]> {
-    try {
-      const dynamicModels = await listHeadlessGptResearcherModels(this.config);
-      const seen = new Set(DEFAULT_RESEARCH_WIZARD_MODELS.map((model) => model.id));
-      return [
-        ...DEFAULT_RESEARCH_WIZARD_MODELS,
-        ...dynamicModels.filter((model) => {
-          if (seen.has(model.id)) return false;
-          seen.add(model.id);
-          return true;
-        }),
-      ];
-    } catch {
-      return DEFAULT_RESEARCH_WIZARD_MODELS;
-    }
-  }
-  private async rewriteForDiscord(
-    query: string,
-    report: string,
-    approvedSources: readonly SourceDescriptor[],
-    preferredModel?: string,
-  ): Promise<string> {
-    if (!this.openAiClient) {
-      return fallbackDiscordRewrite(query, report, approvedSources);
-    }
-    const allowedSources = approvedSources
-      .map((source, index) => `${index + 1}. ${source.name} - ${source.homeUrl}`)
-      .join("\n");
-    const preferredRewriteModel = normalizePreferredRewriteModel(preferredModel);
-    const modelsToTry = [
-      ...new Set([...(preferredRewriteModel ? [preferredRewriteModel] : []), this.aiConfig.chatModel, ...this.aiConfig.chatModelFallbacks]),
-    ].slice(0, MAX_REWRITE_ATTEMPTS);
-    for (const model of modelsToTry) {
-      try {
-        const completion = await withTimeout(
-          this.openAiClient.chat.completions.create({
-            model,
-            temperature: 0.2,
-            messages: [
-              {
-                role: "system",
-                content: [
-                  "Rewrite research reports into concise Discord answers.",
-                  "Do not mention research steps, indexing, tooling, or backend behavior.",
-                  "Use only the numbered sources provided by the user.",
-                  "Return plain Markdown with no headings except the final Sources heading.",
-                ].join(" "),
-              },
-              {
-                role: "user",
-                content: [
-                  `Question: ${query}`,
-                  "Write a concise answer for Discord.",
-                  "Requirements:",
-                  "- Lead with the answer.",
-                  "- Use at most 3 short paragraphs or 5 compact bullets before sources.",
-                  "- Use inline numeric citations like [1], [2].",
-                  ' - End with the exact heading "Sources" on its own line.',
-                  "- Under Sources, include only the cited sources using the exact numbered lines provided below.",
-                  "Allowed Sources:",
-                  allowedSources,
-                  "Research Report:",
-                  report,
-                ].join("\n\n"),
-              },
-            ],
-          }),
-          DEFAULT_REWRITE_TIMEOUT_MS,
-        );
-        const rewritten = completion.choices[0]?.message?.content?.trim();
-        if (rewritten && /\nSources\s*\n/i.test(rewritten)) {
-          return rewritten;
-        }
-      } catch {
-        continue;
-      }
-    }
-    return fallbackDiscordRewrite(query, report, approvedSources);
-  }
-  private async rewriteForDiscordBrief(
-    query: string,
-    report: string,
-    approvedSources: readonly SourceDescriptor[],
-  ): Promise<string> {
-    if (!this.openAiClient) {
-      return fallbackDiscordBrief(query, report, approvedSources);
-    }
-    const allowedSources = approvedSources
-      .map((source, index) => `${index + 1}. ${source.name} - ${source.homeUrl}`)
-      .join("\n");
-    const modelsToTry = [...new Set([this.aiConfig.chatModel, ...this.aiConfig.chatModelFallbacks])].slice(0, MAX_REWRITE_ATTEMPTS);
-    for (const model of modelsToTry) {
-      try {
-        const completion = await withTimeout(
-          this.openAiClient.chat.completions.create({
-            model,
-            temperature: 0.15,
-            max_tokens: 380,
-            messages: [
-              {
-                role: "system",
-                content: [
-                  "Rewrite research into a very short Discord chat reply (like a quick DM).",
-                  "No preamble, no essay tone, no meta commentary about research.",
-                  "Use only the numbered sources provided.",
-                  "Plain sentences; at most 2 short sentences OR up to 3 compact bullets before Sources.",
-                  'End with the exact heading "Sources" on its own line, then cited sources only.',
-                ].join(" "),
-              },
-              {
-                role: "user",
-                content: [
-                  `Question: ${query}`,
-                  "Write the shortest helpful answer.",
-                  "Allowed Sources:",
-                  allowedSources,
-                  "Research Report:",
-                  report,
-                ].join("\n\n"),
-              },
-            ],
-          }),
-          DEFAULT_REWRITE_TIMEOUT_MS,
-        );
-        const rewritten = completion.choices[0]?.message?.content?.trim();
-        if (rewritten && /\nSources\s*\n/i.test(rewritten)) {
-          return rewritten;
-        }
-      } catch {
-        continue;
-      }
-    }
-    return fallbackDiscordBrief(query, report, approvedSources);
-  }
-  private async fetchResearchReport(
-    query: string,
-    customPrompt: string,
-    approvedSources: readonly SourceDescriptor[],
-    options?: ResearchWizardQueryOptions,
-  ): Promise<{ report: string; payload: ResearchWizardResponsePayload }> {
-    if (approvedSources.length === 0) {
-      throw new Error("No approved research sources are enabled.");
-    }
-    const allowedDomains = researchDomainsForSources(approvedSources);
-    const raw = await runHeadlessGptResearcher(this.config, {
-      query: buildResearchTask(query),
-      custom_prompt: customPrompt,
-      query_domains: allowedDomains,
-      allowed_url_prefixes: approvedSources.map((source) => source.homeUrl),
-      ...(options?.model?.trim() ? { model: options.model.trim() } : {}),
-      report_type: "research_report",
-      report_source: "web",
-    });
-    const payload: ResearchWizardResponsePayload = {
-      report: raw.report,
-      ...(raw.research_information !== undefined
-        ? { research_information: { ...raw.research_information } }
-        : {}),
-    };
-    const report = typeof raw.report === "string" ? normalizeReport(raw.report) : "";
-    if (!report) {
-      throw new Error("ai-researchwizard returned an empty report.");
-    }
-    return { report, payload };
-  }
-  public async answerQuestion(
-    query: string,
-    onProgress?: (event: ResearchWizardProgressEvent) => void,
-    options?: ResearchWizardQueryOptions,
-  ): Promise<ResearchWizardAnswer> {
-    const approvedSources = routeSourcesForQuery(
-      query,
-      applySourcePreferences(this.approvedSources, options?.sourcePreferences),
-    );
-    try {
-      const allowedDomains = researchDomainsForSources(approvedSources);
-      onProgress?.({
-        phase: "gather",
-        detail: `Scanning ${approvedSources.length} approved source root${approvedSources.length === 1 ? "" : "s"} across ${allowedDomains.length} host${allowedDomains.length === 1 ? "" : "s"}…`,
-      });
-      const { report, payload } = await withProgressHeartbeat(
-        "gather",
-        (elapsedMs) => {
-          const seconds = Math.max(1, Math.floor(elapsedMs / 1000));
-          return `Researching approved archive sources… (${seconds}s)`;
-        },
-        onProgress,
-        async () => await this.fetchResearchReport(query, buildCustomPrompt(), approvedSources, options),
-      );
-      const rejectedUrls = collectRejectedUrlsFromPayload(payload);
-      if (rejectedUrls.length > 0) {
-        onProgress?.({
-          phase: "gather",
-          detail: `Rejected ${rejectedUrls.length} URL${rejectedUrls.length === 1 ? "" : "s"} outside approved source roots.`,
-        });
-      }
-      emitArchiveProbeEvents(payload, approvedSources, onProgress);
-      onProgress?.({
-        phase: "report",
-        detail: "Ranking passages and citations…",
-      });
-      const webEvidenceSources = collectWebEvidenceSources(query, report, approvedSources, payload);
-      const retrievedSources = webEvidenceSources;
-      const citedSourcesFromReport = rerankEvidenceSources(
-        query,
-        mergeSourcesPreserveOrder(
-          collectCitedSources(report, approvedSources, payload),
-          collectCitedSourcesFromText(report, approvedSources),
-        ),
-      );
-      onProgress?.({
-        phase: "sources",
-        detail: retrievedSources.length ? `${retrievedSources.length} sources retrieved` : "Mapping hosts to archive catalog…",
-        sources: retrievedSources,
-      });
-      onProgress?.({
-        phase: "compose",
-        detail: "Rendering Holocron answer…",
-      });
-      let answer: string;
-      if (retrievedSources.length === 0) {
-        answer = degradedAnswerFallback(query, approvedSources);
-      } else if (isSynthesisFailureReport(report, payload)) {
-        const webSources = resolveWebSourcesForFailedSynthesis(query, retrievedSources);
-        if (webSources.length >= MIN_HOLOCRON_WEB_CITATIONS) {
-          const sourcesForRewrite = filterPublicWebCitationSources(webSources);
-          answer = this.openAiClient
-            ? await this.rewriteForDiscord(query, report, sourcesForRewrite, options?.model)
-            : fallbackDiscordRewrite(query, report, sourcesForRewrite);
-        } else if (webSources.length > 0) {
-          answer = sourceOnlyFallbackAnswer(query, webSources);
-        } else if (retrievedSources.length > 0) {
-          answer = sourceOnlyFallbackAnswer(query, retrievedSources);
-        } else {
-          answer = degradedAnswerFallback(query, approvedSources);
-        }
-      } else if (this.openAiClient) {
-        answer = await this.rewriteForDiscord(
-          query,
-          report,
-          filterPublicWebCitationSources(retrievedSources),
-          options?.model,
-        );
-      } else {
-        answer = fallbackDiscordRewrite(
-          query,
-          report,
-          filterPublicWebCitationSources(retrievedSources),
-        );
-      }
-      const citedSources = ensureMinimumWebCitations(
-        query,
-        filterPublicWebCitationSources(
-          mergeSourcesPreserveOrder(
-            collectCitedSourcesFromText(answer, retrievedSources),
-            citedSourcesFromReport,
-          ),
-        ),
-        webEvidenceSources,
-        payload,
-        approvedSources,
-      );
-      return {
-        answer,
-        approvedSources: citedSources,
-        retrievedSources,
-        visitedUrls: collectVisitedUrlsFromPayload(payload, approvedSources),
-      };
-    } catch (error: unknown) {
-      const detail = error instanceof Error ? error.message : String(error);
-      onProgress?.({
-        phase: "compose",
-        detail: `Live web research failed: ${detail.slice(0, 240)}`,
-      });
-      const topic = query.trim().replace(/\?+$/u, "") || "this question";
-      return {
-        answer: `I could not complete live web research for "${topic}" right now (${detail}). Ensure GPTR Python (TRASK_GPT_RESEARCHER_PYTHON), retriever keys (e.g. TAVILY_API_KEY), and TRASK_RESEARCHWIZARD_TIMEOUT_MS are configured, then retry.`,
-        approvedSources: [],
-        retrievedSources: [],
-        visitedUrls: [],
-      };
-    }
-  }
-  /** Shorter rewrite for proactive/channel replies (still source-backed). */
-  public async answerQuestionBrief(query: string): Promise<ResearchWizardBriefAnswer> {
-    try {
-      const approvedSources = routeSourcesForQuery(query, this.approvedSources);
-      const { report, payload } = await this.fetchResearchReport(query, buildCustomPromptBrief(), approvedSources);
-      const webEvidenceSources = collectWebEvidenceSources(query, report, approvedSources, payload);
-      const retrievedSources = webEvidenceSources;
-      const answer = retrievedSources.length > 0
-        ? await this.rewriteForDiscordBrief(query, report, retrievedSources)
-        : degradedAnswerFallback(query, approvedSources);
-      return {
-        answer,
-        approvedSources: ensureMinimumWebCitations(
-          query,
-          filterPublicWebCitationSources(
-            mergeSourcesPreserveOrder(
-              collectCitedSourcesFromText(answer, retrievedSources),
-              collectCitedSources(report, approvedSources, payload),
-            ),
-          ),
-          webEvidenceSources,
-          payload,
-          approvedSources,
-        ),
-        retrievedSources,
-        visitedUrls: collectVisitedUrlsFromPayload(payload, approvedSources),
-        researchReport: report,
-      };
-    } catch {
-      const topic = query.trim().replace(/\?+$/u, "") || "this question";
-      const answer = `I could not complete live web research for "${topic}" right now.`;
-      return {
-        answer,
-        approvedSources: [],
-        retrievedSources: [],
-        visitedUrls: [],
-        researchReport: answer,
-      };
-    }
-  }
-}
-export const createResearchWizardClient = (
-  config: ResearchWizardRuntimeConfig,
-  aiConfig: SharedAiConfig = loadSharedAiConfig(),
-): ResearchWizardClient => {
-  return new ResearchWizardClient(config, aiConfig, traskApprovedResearchSources);
-};
-// ---------------------------------------------------------------------------
-// Pure helpers exported for unit testing — not part of the public API surface.
-// ---------------------------------------------------------------------------
 export {
-  normalizeUrl as _normalizeUrl,
-  extractUrls as _extractUrls,
-  hostnameHint as _hostnameHint,
-  uniqueUrlsPreserveOrder as _uniqueUrlsPreserveOrder,
-  collectCitedSources as _collectCitedSources,
-  collectRetrievedSources as _collectRetrievedSources,
-  collectVisitedUrlsFromPayload as _collectVisitedUrlsFromPayload,
-  collectCitedSourcesFromText as _collectCitedSourcesFromText,
-  isSynthesisFailureReport as _isSynthesisFailureReport,
-  countPayloadWebUrls as _countPayloadWebUrls,
-  normalizeReport as _normalizeReport,
-  formatSourcesSection as _formatSourcesSection,
-  normalizePreferredRewriteModel as _normalizePreferredRewriteModel,
-  matchApprovedSource as _matchApprovedSource,
-  classifyQueryIntent as _classifyQueryIntent,
-  routeSourcesForQuery as _routeSourcesForQuery,
-};

+/**
+ * @deprecated Import from `./web-research.js` instead. ResearchWizard naming is retired.
+ */
 export {
+  WebResearchClient as ResearchWizardClient,
+  createWebResearchClient as createResearchWizardClient,
+} from "./web-research.js";
+export type {
+  WebResearchAnswer as ResearchWizardAnswer,
+  WebResearchBriefAnswer as ResearchWizardBriefAnswer,
+  WebResearchClientFactoryOptions as ResearchWizardClientFactoryOptions,
+  WebResearchModelOption as ResearchWizardModelOption,
+  WebResearchProgressEvent as ResearchWizardProgressEvent,
+  WebResearchQueryHandler as ResearchWizardQueryHandler,
+  WebResearchQueryOptions as ResearchWizardQueryOptions,
+  WebResearchSourcePreference as ResearchWizardSourcePreference,
+} from "./web-research.js";

packages/trask/src/web-research-subprocess.ts ADDED Viewed

	@@ -0,0 +1,337 @@

+import { spawn } from "node:child_process";
+import { existsSync } from "node:fs";
+import { dirname, join, resolve } from "node:path";
+import type { WebResearchRuntimeConfig } from "@openkotor/config";
+/**
+ * JSON document the research script prints to stdout. `runHeadlessWebResearch`
+ * parses stdout into this shape and rejects results whose `report` is missing
+ * or blank.
+ */
+export interface HeadlessWebResearchResult {
+  readonly report: string;
+  // Optional URL bookkeeping reported by the script; every list may be absent or null.
+  // NOTE(review): the precise meaning of each list is defined by the Python runner — confirm there.
+  readonly research_information?: {
+    readonly source_urls?: readonly string[] | null;
+    readonly cited_urls?: readonly string[] | null;
+    readonly retrieved_urls?: readonly string[] | null;
+    readonly visited_urls?: readonly string[] | null;
+    readonly query_domains?: readonly string[] | null;
+    readonly allowed_url_prefixes?: readonly string[] | null;
+    readonly rejected_source_urls?: readonly string[] | null;
+  };
+}
+/**
+ * One selectable model entry. `parseModelList` fills `id` (normalized),
+ * `label` and `provider` for every model name the Python helper prints; it
+ * never sets `recommended`.
+ */
+export interface HeadlessWebResearchModelOption {
+  readonly id: string;
+  readonly label: string;
+  readonly provider: string;
+  readonly recommended?: boolean;
+}
+/**
+ * stdin payload for `scripts/trask_web_research.py`.
+ *
+ * `spawnHeadless` writes the whole object to the child's stdin as UTF-8 JSON and
+ * additionally mirrors `query_domains` / `allowed_url_prefixes` into the
+ * newline-joined environment variables `TRASK_ALLOWED_QUERY_DOMAINS` and
+ * `TRASK_ALLOWED_URL_PREFIXES`.
+ */
+export interface HeadlessWebResearchRequestPayload {
+  readonly query: string;
+  readonly custom_prompt?: string;
+  readonly source_urls?: readonly string[];
+  readonly query_domains?: readonly string[];
+  readonly allowed_url_prefixes?: readonly string[];
+  readonly model?: string;
+  readonly report_type?: string;
+  readonly report_source?: string;
+}
+/** @deprecated Use HeadlessWebResearchResult */
+export type HeadlessAiResearchWizardResult = HeadlessWebResearchResult;
+/** @deprecated Use HeadlessWebResearchRequestPayload */
+export type HeadlessAiResearchWizardRequestPayload = HeadlessWebResearchRequestPayload;
+/** @deprecated Use HeadlessWebResearchModelOption */
+export type HeadlessAiResearchWizardModelOption = HeadlessWebResearchModelOption;
+/**
+ * Walk upwards from `startDir` looking for a directory that contains
+ * `scripts/trask_web_research.py`.
+ *
+ * @param startDir - Directory to start from (resolved to an absolute path).
+ * @param maxHops - Maximum number of directories to inspect.
+ * @returns The first matching directory, or `process.cwd()` when the filesystem
+ *   root is reached or the hop budget runs out without a match.
+ */
+const findRepoRoot = (startDir: string, maxHops = 24): string => {
+  let current = resolve(startDir);
+  let remaining = maxHops;
+  while (remaining > 0) {
+    remaining -= 1;
+    if (existsSync(join(current, "scripts", "trask_web_research.py"))) {
+      return current;
+    }
+    const above = dirname(current);
+    if (above === current) {
+      // dirname() is a fixed point only at the filesystem root: nothing left to search.
+      break;
+    }
+    current = above;
+  }
+  return process.cwd();
+};
+/** Path of the research script inside a checkout rooted at `repoRoot`. */
+const defaultScriptPath = (repoRoot: string): string => {
+  return join(repoRoot, "scripts", "trask_web_research.py");
+};
+/**
+ * Run the headless research script as a child process and collect its output.
+ *
+ * The request payload is written to the child's stdin as UTF-8 JSON; the two
+ * allow-lists are additionally exported as newline-joined environment variables
+ * (`TRASK_ALLOWED_QUERY_DOMAINS`, `TRASK_ALLOWED_URL_PREFIXES`).
+ *
+ * @param python - Interpreter executable to spawn.
+ * @param script - Script path passed as the only argument.
+ * @param cwd - Working directory for the child.
+ * @param payload - Request serialized onto stdin.
+ * @param timeoutMs - Milliseconds before the child is sent SIGTERM and the promise rejects.
+ * @returns Trimmed stdout/stderr text and the exit code reported by the "close" event.
+ * @throws Rejects when the process cannot be spawned, when writing stdin throws
+ *   synchronously, or when the timeout elapses first.
+ */
+const spawnHeadless = (
+  python: string,
+  script: string,
+  cwd: string,
+  payload: HeadlessWebResearchRequestPayload,
+  timeoutMs: number,
+): Promise<{ stdout: string; stderr: string; code: number | null }> => {
+  return new Promise((resolvePromise, rejectPromise) => {
+    const child = spawn(python, [script], {
+      cwd,
+      stdio: ["pipe", "pipe", "pipe"],
+      env: {
+        ...process.env,
+        TRASK_ALLOWED_QUERY_DOMAINS: (payload.query_domains ?? []).join("\n"),
+        TRASK_ALLOWED_URL_PREFIXES: (payload.allowed_url_prefixes ?? []).join("\n"),
+        PYTHONIOENCODING: "utf-8",
+        PYTHONUTF8: "1",
+      },
+    });
+    const chunksOut: Buffer[] = [];
+    const chunksErr: Buffer[] = [];
+    // Guards against settling the promise twice (timeout vs. error vs. close).
+    let settled = false;
+    child.stdout?.on("data", (chunk: Buffer | string) => {
+      chunksOut.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+    });
+    child.stderr?.on("data", (chunk: Buffer | string) => {
+      chunksErr.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+    });
+    const timer = setTimeout(() => {
+      if (settled) {
+        return;
+      }
+      settled = true;
+      child.kill("SIGTERM");
+      rejectPromise(new Error(`Trask web research runner timed out after ${timeoutMs}ms`));
+    }, timeoutMs);
+    child.on("error", (error) => {
+      if (settled) {
+        return;
+      }
+      settled = true;
+      clearTimeout(timer);
+      rejectPromise(error);
+    });
+    child.on("close", (exitCode) => {
+      if (settled) {
+        return;
+      }
+      settled = true;
+      clearTimeout(timer);
+      resolvePromise({
+        stdout: Buffer.concat(chunksOut).toString("utf8").trim(),
+        stderr: Buffer.concat(chunksErr).toString("utf8").trim(),
+        code: exitCode,
+      });
+    });
+    // Asynchronous stdin failures (for example EPIPE when the script exits before
+    // reading its input) are delivered as an "error" event on the stream. Without a
+    // listener Node rethrows them as an uncaught exception and takes the host process
+    // down. They are deliberately ignored here: the "close" handler still reports the
+    // exit code and stderr, which describe the failure better than EPIPE does.
+    child.stdin?.on("error", () => {});
+    try {
+      child.stdin?.write(Buffer.from(JSON.stringify(payload), "utf8"));
+      child.stdin?.end();
+    } catch (error) {
+      if (!settled) {
+        settled = true;
+        clearTimeout(timer);
+        rejectPromise(error);
+      }
+    }
+  });
+};
+/**
+ * Execute one research request through the Python runner and return its parsed result.
+ *
+ * The repository root comes from `config.repoRoot` or is discovered from the
+ * current working directory; the script path comes from
+ * `config.headlessScriptPath` or defaults to `scripts/trask_web_research.py`
+ * under that root; the interpreter defaults to `python`.
+ *
+ * @param config - Runtime configuration (paths, interpreter, timeout).
+ * @param payload - Request forwarded to the script on stdin.
+ * @returns The parsed stdout JSON, guaranteed to carry a non-blank `report`.
+ * @throws Error when the script is missing, exits non-zero, prints invalid JSON,
+ *   or prints JSON without a usable `report`; also propagates spawn/timeout errors.
+ */
+export const runHeadlessWebResearch = async (
+  config: WebResearchRuntimeConfig,
+  payload: HeadlessWebResearchRequestPayload,
+): Promise<HeadlessWebResearchResult> => {
+  const repoRoot = config.repoRoot?.trim() || findRepoRoot(process.cwd());
+  const script = (config.headlessScriptPath?.trim() || defaultScriptPath(repoRoot)).trim();
+  if (!existsSync(script)) {
+    throw new Error(
+      `Trask web research script not found: ${script}. Run scripts/bootstrap_trask_research.sh or set TRASK_WEB_RESEARCH_SCRIPT.`,
+    );
+  }
+  const python = config.pythonExecutable?.trim() || "python";
+  const { stdout, stderr, code } = await spawnHeadless(python, script, repoRoot, payload, config.timeoutMs);
+  if (code !== 0) {
+    throw new Error(`Trask web research runner exited ${code ?? "unknown"}: ${stderr || stdout || "no output"}`);
+  }
+  try {
+    const parsed = JSON.parse(stdout) as HeadlessWebResearchResult | null;
+    // `JSON.parse("null")` succeeds; reading `.report` on it would throw an opaque
+    // TypeError, so a falsy document is reported as an empty report instead.
+    if (!parsed || typeof parsed.report !== "string" || !parsed.report.trim()) {
+      throw new Error("Web research runner returned empty report.");
+    }
+    return parsed;
+  } catch (error) {
+    if (error instanceof SyntaxError) {
+      throw new Error(`Trask web research runner returned invalid JSON: ${stdout.slice(0, 400)}`);
+    }
+    throw error;
+  }
+};
+/** @deprecated Use runHeadlessWebResearch */
+export const runHeadlessGptResearcher = runHeadlessWebResearch;
+/**
+ * Derive a human-readable label from a model id.
+ *
+ * The `openrouter:` / `litellm:` routing prefix that `normalizeWebResearchModelId`
+ * prepends is dropped first, so a slash-less id such as `litellm:gpt-4o-mini`
+ * renders as "GPT 4o Mini" rather than "Litellm:GPT 4o Mini". Other colons (for
+ * example a `:free` suffix) are left alone. The label is the last `/` segment with
+ * dashes/underscores turned into spaces, each word capitalized, and the words
+ * "Gpt" / "Ai" upper-cased.
+ *
+ * @param modelId - Raw or normalized model id.
+ * @returns Display label for the model.
+ */
+const labelFromModelId = (modelId: string): string => {
+  // Fall back to the untouched id when stripping the prefix would leave nothing.
+  const withoutRoute = modelId.replace(/^(?:openrouter|litellm):/u, "") || modelId;
+  const tail = withoutRoute.split("/").pop() ?? withoutRoute;
+  return tail
+    .replace(/[-_]+/gu, " ")
+    .replace(/\b\w/gu, (char) => char.toUpperCase())
+    .replace(/\bGpt\b/gu, "GPT")
+    .replace(/\bAi\b/gu, "AI");
+};
+/**
+ * Derive a display name for the provider encoded in a model id.
+ *
+ * Anything before the first `:` is discarded; the provider is then the text
+ * before the first `/`, or the generic "Trask web research" when there is no `/`.
+ * The result is title-cased, with "Ai" and a lone "Openrouter" given their
+ * conventional spelling.
+ */
+const providerFromModelId = (modelId: string): string => {
+  let bareId = modelId;
+  if (modelId.includes(":")) {
+    bareId = modelId.split(":", 2)[1] ?? modelId;
+  }
+  let rawProvider = "Trask web research";
+  if (bareId.includes("/")) {
+    rawProvider = bareId.split("/", 1)[0] ?? bareId;
+  }
+  const spaced = rawProvider.replace(/[-_]+/gu, " ");
+  const titled = spaced.replace(/\b\w/gu, (char) => char.toUpperCase());
+  return titled.replace(/\bAi\b/gu, "AI").replace(/^Openrouter$/u, "OpenRouter");
+};
+/**
+ * Put a model name into `route:model` form.
+ *
+ * Blank input yields ""; a name that already contains `:` is returned trimmed;
+ * names starting with `openrouter/` get the `openrouter:` route and everything
+ * else gets `litellm:`.
+ */
+const normalizeWebResearchModelId = (modelId: string): string => {
+  const id = modelId.trim();
+  if (id === "" || id.includes(":")) {
+    return id;
+  }
+  const route = id.startsWith("openrouter/") ? "openrouter" : "litellm";
+  return `${route}:${id}`;
+};
+/**
+ * Turn the JSON array of model names printed by the Python helper into options.
+ *
+ * Non-array JSON yields an empty list. Non-string entries, entries that
+ * normalize to "" and repeated ids are skipped; first-seen order is kept.
+ *
+ * @throws SyntaxError when `stdout` is not valid JSON.
+ */
+const parseModelList = (stdout: string): HeadlessWebResearchModelOption[] => {
+  const decoded: unknown = JSON.parse(stdout);
+  if (!Array.isArray(decoded)) {
+    return [];
+  }
+  // A Map keeps insertion order, so it serves as both the de-dupe set and the result list.
+  const optionsById = new Map<string, HeadlessWebResearchModelOption>();
+  for (const entry of decoded) {
+    if (typeof entry !== "string") {
+      continue;
+    }
+    const id = normalizeWebResearchModelId(entry);
+    if (id === "" || optionsById.has(id)) {
+      continue;
+    }
+    optionsById.set(id, {
+      id,
+      label: labelFromModelId(id),
+      provider: providerFromModelId(id),
+    });
+  }
+  return [...optionsById.values()];
+};
+/**
+ * Ask a short inline Python program for the list of free chat models.
+ *
+ * The program adds `<repoRoot>/vendor/llm_fallbacks/src` to `sys.path`, reads
+ * `FREE_CHAT_MODELS` (falling back to `filter_models`) and prints at most 60
+ * names as JSON. The child is given `min(config.timeoutMs, 15000)` ms.
+ *
+ * @returns Parsed, de-duplicated model options.
+ * @throws Error on timeout, spawn failure or non-zero exit; SyntaxError when
+ *   the output is not valid JSON.
+ */
+export const listHeadlessWebResearchModels = async (
+  config: WebResearchRuntimeConfig,
+): Promise<HeadlessWebResearchModelOption[]> => {
+  const repoRoot = config.repoRoot?.trim() || findRepoRoot(process.cwd());
+  const python = config.pythonExecutable?.trim() || "python";
+  const pythonSource = [
+    "import json, sys",
+    "from pathlib import Path",
+    "root = Path(sys.argv[1]).resolve()",
+    "fallbacks = root / 'vendor' / 'llm_fallbacks' / 'src'",
+    "sys.path.insert(0, str(fallbacks))",
+    "try:",
+    "    from llm_fallbacks.config import FREE_CHAT_MODELS",
+    "    models = [name for name, _ in FREE_CHAT_MODELS]",
+    "except Exception:",
+    "    from llm_fallbacks import filter_models",
+    "    models = list(filter_models(model_type='chat', free_only=True))",
+    "print(json.dumps(models[:60]))",
+  ].join("\n");
+  const budgetMs = Math.min(config.timeoutMs, 15_000);
+  const outcome = await new Promise<{ stdout: string; stderr: string; code: number | null }>(
+    (resolvePromise, rejectPromise) => {
+      const child = spawn(python, ["-c", pythonSource, repoRoot], {
+        cwd: process.cwd(),
+        stdio: ["ignore", "pipe", "pipe"],
+        env: {
+          ...process.env,
+          PYTHONIOENCODING: "utf-8",
+          PYTHONUTF8: "1",
+        },
+      });
+      const outParts: Buffer[] = [];
+      const errParts: Buffer[] = [];
+      const asBuffer = (chunk: Buffer | string): Buffer => (Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+      // Only the first of timeout / error / close may settle the promise.
+      let done = false;
+      const timer = setTimeout(() => {
+        if (done) {
+          return;
+        }
+        done = true;
+        child.kill("SIGTERM");
+        rejectPromise(new Error("Trask web research model list timed out"));
+      }, budgetMs);
+      child.stdout?.on("data", (chunk: Buffer | string) => {
+        outParts.push(asBuffer(chunk));
+      });
+      child.stderr?.on("data", (chunk: Buffer | string) => {
+        errParts.push(asBuffer(chunk));
+      });
+      child.on("error", (error) => {
+        if (done) {
+          return;
+        }
+        done = true;
+        clearTimeout(timer);
+        rejectPromise(error);
+      });
+      child.on("close", (exitCode) => {
+        if (done) {
+          return;
+        }
+        done = true;
+        clearTimeout(timer);
+        resolvePromise({
+          stdout: Buffer.concat(outParts).toString("utf8").trim(),
+          stderr: Buffer.concat(errParts).toString("utf8").trim(),
+          code: exitCode,
+        });
+      });
+    },
+  );
+  if (outcome.code !== 0) {
+    throw new Error(
+      `Trask web research model list exited ${outcome.code ?? "unknown"}: ${outcome.stderr || outcome.stdout || "no output"}`,
+    );
+  }
+  return parseModelList(outcome.stdout);
+};
+/** @deprecated Use listHeadlessWebResearchModels */
+export const listHeadlessGptResearcherModels = listHeadlessWebResearchModels;
+/**
+ * Check whether the research script can start by running it with `--dry-run`.
+ *
+ * This is a pure yes/no probe: a missing script, a spawn failure, a non-zero
+ * exit and a run exceeding 15 seconds all yield `false`. Only an exit code of 0
+ * yields `true`.
+ *
+ * @param config - Runtime configuration (paths and interpreter).
+ * @returns `true` when the dry run exits with code 0, otherwise `false`.
+ */
+export const probeHeadlessWebResearchDryRun = async (config: WebResearchRuntimeConfig): Promise<boolean> => {
+  const repoRoot = config.repoRoot?.trim() || findRepoRoot(process.cwd());
+  const script = (config.headlessScriptPath?.trim() || defaultScriptPath(repoRoot)).trim();
+  if (!existsSync(script)) {
+    return false;
+  }
+  const python = config.pythonExecutable?.trim() || "python";
+  const { code } = await new Promise<{ code: number | null }>((resolvePromise) => {
+    const child = spawn(python, [script, "--dry-run"], {
+      cwd: repoRoot,
+      // The output is never inspected. Discarding it (rather than piping it and never
+      // reading) keeps a chatty child from blocking on a full pipe until the timeout.
+      stdio: "ignore",
+      env: { ...process.env, PYTHONIOENCODING: "utf-8", PYTHONUTF8: "1" },
+    });
+    let settled = false;
+    const timer = setTimeout(() => {
+      if (settled) return;
+      settled = true;
+      child.kill("SIGTERM");
+      // A hung dry run is a failed probe, reported like a spawn error instead of thrown.
+      resolvePromise({ code: null });
+    }, 15_000);
+    child.on("error", () => {
+      if (settled) return;
+      settled = true;
+      clearTimeout(timer);
+      resolvePromise({ code: 1 });
+    });
+    child.on("close", (exitCode) => {
+      if (settled) return;
+      settled = true;
+      clearTimeout(timer);
+      resolvePromise({ code: exitCode });
+    });
+  });
+  return code === 0;
+};

packages/trask/src/web-research.test.ts ADDED Viewed

	@@ -0,0 +1,38 @@

+import assert from "node:assert/strict";
+import { describe, test } from "node:test";
+import { loadWebResearchRuntimeConfig } from "@openkotor/config";
+import { createWebResearchClient } from "./web-research.js";
+// Smoke test: the factory must build a client from a config loaded out of an
+// empty environment object and an AI config with no API key set.
+describe("WebResearchClient", () => {
+  test("createWebResearchClient accepts runtime config", () => {
+    const cfg = loadWebResearchRuntimeConfig({});
+    const client = createWebResearchClient(cfg, {
+      openAiApiKey: undefined,
+      openAiBaseUrl: undefined,
+      openAiDefaultHeaders: undefined,
+      firecrawlApiKey: undefined,
+      chatModel: "gpt-5.4-mini",
+      chatModelFallbacks: [],
+      embeddingModel: "text-embedding-3-large",
+      databaseUrl: undefined,
+    });
+    // Only construction is checked; no research request is issued.
+    assert.ok(client);
+  });
+});
+// Environment-variable handling of the runtime config loader.
+describe("loadWebResearchRuntimeConfig", () => {
+  // With both the new and the legacy timeout variable set, the new one must win.
+  test("TRASK_WEB_RESEARCH_TIMEOUT_MS overrides legacy TRASK_RESEARCHWIZARD_TIMEOUT_MS", () => {
+    const cfg = loadWebResearchRuntimeConfig({
+      TRASK_WEB_RESEARCH_TIMEOUT_MS: "60000",
+      TRASK_RESEARCHWIZARD_TIMEOUT_MS: "900000",
+    });
+    assert.equal(cfg.timeoutMs, 60_000);
+  });
+  // The interpreter path is taken verbatim from TRASK_WEB_RESEARCH_PYTHON.
+  test("TRASK_WEB_RESEARCH_PYTHON is respected", () => {
+    const cfg = loadWebResearchRuntimeConfig({ TRASK_WEB_RESEARCH_PYTHON: "/custom/python" });
+    assert.equal(cfg.pythonExecutable, "/custom/python");
+  });
+});

packages/trask/src/web-research.ts ADDED Viewed

	@@ -0,0 +1,1559 @@

+import OpenAI from "openai";
+import { loadSharedAiConfig, type WebResearchRuntimeConfig, type SharedAiConfig } from "@openkotor/config";
+import {
+  isDiscordCitationUrl,
+  isTraskApprovedBaseUrl,
+  isTraskApprovedResearchUrl,
+  sourceUrlMatchesDescriptor,
+  traskApprovedResearchBaseHosts,
+  traskApprovedResearchSources,
+  type SearchHit,
+  type SearchProvider,
+  type SourceDescriptor,
+} from "@openkotor/retrieval";
+import {
+  buildCommunityKnowledgeDigest,
+  filterWebArchiveCitationSources,
+  mergeCommunityAndWebSources,
+  searchHitsToCommunitySources,
+} from "./community-knowledge.js";
+import {
+  listHeadlessWebResearchModels,
+  runHeadlessWebResearch,
+  type HeadlessWebResearchModelOption,
+} from "./web-research-subprocess.js";
+/** Result resolved by `WebResearchQueryHandler.answerQuestion`. */
+export interface WebResearchAnswer {
+  answer: string;
+  /** Sources explicitly cited in the final answer shown to users. */
+  approvedSources: readonly SourceDescriptor[];
+  /** Sources retrieved as candidate evidence for the answer/rewrite stage. */
+  retrievedSources: readonly SourceDescriptor[];
+  /** Allowlisted URLs the headless researcher touched while gathering evidence. */
+  visitedUrls: readonly string[];
+}
+/** A `WebResearchAnswer` that additionally carries the report text it was derived from. */
+export interface WebResearchBriefAnswer extends WebResearchAnswer {
+  /** Normalized research report text used for proactive semantic gating. */
+  researchReport: string;
+}
+/** Fine-grained phases for Holocron clients polling thread history. */
+export interface WebResearchProgressEvent {
+  phase: "gather" | "report" | "sources" | "compose";
+  detail?: string;
+  sources?: readonly SourceDescriptor[];
+}
+/** Optional per-request settings accepted as the third argument of `answerQuestion`. */
+export interface WebResearchQueryOptions {
+  /** Preferred rewrite model id, e.g. `openrouter:openrouter/auto` or `litellm:moonshotai/kimi-k2`. */
+  model?: string;
+  /** Optional per-request source enablement and weight hints from Holocron's Source Prioritization dialog. */
+  sourcePreferences?: readonly WebResearchSourcePreference[];
+  /** Imported Discord chunks and/or live channel hits merged before web research. */
+  localHits?: readonly SearchHit[];
+}
+export interface WebResearchClientFactoryOptions {
+  /** When set, searches imported chunks when `localHits` are not passed per request. */
+  localSearchProvider?: SearchProvider;
+  /** Resolves discord:// chunk URLs when searching imported history. */
+  discordGuildId?: string;
+}
+/**
+ * One entry of `WebResearchQueryOptions.sourcePreferences`.
+ * NOTE(review): the expected range of `weight` and how `url` is matched against
+ * approved sources are not visible in this declaration — confirm at the use site.
+ */
+export interface WebResearchSourcePreference {
+  name?: string;
+  url: string;
+  weight: number;
+  enabled: boolean;
+}
+/** Public name for the subprocess module's model option; adds no members of its own. */
+export interface WebResearchModelOption extends HeadlessWebResearchModelOption {}
+/** Structural type for adapters that only need full Q&A (e.g. Trask HTTP `/ask`). */
+export interface WebResearchQueryHandler {
+  answerQuestion(
+    query: string,
+    onProgress?: (event: WebResearchProgressEvent) => void,
+    options?: WebResearchQueryOptions,
+  ): Promise<WebResearchAnswer>;
+  // Optional: a handler may omit model listing entirely.
+  listModels?(): Promise<readonly WebResearchModelOption[]>;
+}
+// Built-in model list containing the single recommended "auto" entry.
+// NOTE(review): presumably offered even when the dynamic listing fails — confirm in the client.
+const DEFAULT_WEB_RESEARCH_MODELS: readonly WebResearchModelOption[] = [
+  { id: "auto", label: "Auto", provider: "Trask web research", recommended: true },
+];
+/**
+ * Loosely-typed view of a research result: same `research_information` keys as
+ * the subprocess result type, but `report` may be absent or null.
+ */
+interface WebResearchResponsePayload {
+  report?: string | null;
+  research_information?: {
+    source_urls?: readonly string[] | null;
+    cited_urls?: readonly string[] | null;
+    retrieved_urls?: readonly string[] | null;
+    visited_urls?: readonly string[] | null;
+    query_domains?: readonly string[] | null;
+    allowed_url_prefixes?: readonly string[] | null;
+    rejected_source_urls?: readonly string[] | null;
+  };
+}
+/** The research task is the user's query with surrounding whitespace removed. */
+const buildResearchTask = (query: string): string => query.trim();
+/**
+ * Instructions for the full-length answer: a concise, cited, Discord-style
+ * reply that ends with a "Sources" section. Returned as newline-joined lines.
+ */
+const buildCustomPrompt = (): string => {
+  const instructionLines: readonly string[] = [
+    "Answer the user's question as a Discord-native KOTOR assistant reply using only the provided research context.",
+    "Requirements:",
+    "- Lead with the answer, not an introduction.",
+    "- Sound direct, practical, and helpful.",
+    "- Keep the answer concise: at most 3 short paragraphs or 5 compact bullets total before sources.",
+    "- Do not describe your research process, retrieval steps, indexing, backend systems, or source policy unless the user explicitly asks.",
+    "- Include inline numeric citations like [1] tied to concrete claims.",
+    ' - End with the exact heading "Sources" on its own line.',
+    "- Under Sources, list only the sources you cited, each on its own numbered line in the format: 1. Source Name - URL",
+    "- Do not add markdown headings other than the final Sources heading.",
+  ];
+  return instructionLines.join("\n");
+};
+/**
+ * Builds the instruction block for the compact "research digest" answer.
+ *
+ * Every constraint is a flush-left `- ` bullet so the model sees one uniformly
+ * indented list.
+ *
+ * @returns The prompt lines joined with `\n`.
+ */
+const buildCustomPromptBrief = (): string => {
+  return [
+    "Produce a compact research digest for Star Wars: Knights of the Old Republic (KOTOR 1/2) modding questions.",
+    "Constraints:",
+    "- Stay under ~900 words; bullet key facts when possible.",
+    "- Do not narrate tooling, retrieval steps, or how you searched.",
+    "- Prefer actionable answers over background essays.",
+    "- Include inline numeric citations like [1] tied to concrete claims.",
+    // Single quotes only because the text contains double quotes; no leading space.
+    '- End with the exact heading "Sources" on its own line.',
+    "- Under Sources, list only cited sources as numbered lines: 1. Source Name - URL",
+  ].join("\n");
+};
+/**
+ * Removes from the end of `value` every character that occurs in `chars`.
+ * Written as a plain backwards scan rather than a regex.
+ */
+const stripTrailingChars = (value: string, chars: string): string => {
+  let keep = value.length;
+  for (; keep > 0; keep -= 1) {
+    if (!chars.includes(value[keep - 1]!)) break;
+  }
+  return value.slice(0, keep);
+};
+/** Drops every trailing `/` from `value`; the input is not trimmed first. */
+const stripTrailingSlashes = (value: string): string => stripTrailingChars(value, "/");
+/**
+ * Trims `value`, then drops every trailing `?`. Whitespace that sat in front of the
+ * question marks (e.g. "why ?") is left in place because no second trim is applied.
+ */
+const stripTrailingQuestionMarks = (value: string): string => stripTrailingChars(value.trim(), "?");
+/**
+ * Squeezes every run of blank (whitespace-only) lines down to a single empty line and
+ * trims the overall result.
+ */
+const collapseExcessiveNewlines = (value: string): string => {
+  const kept: string[] = [];
+  let previousWasBlank = false;
+  for (const line of value.split("\n")) {
+    const isBlank = line.trim() === "";
+    // Only the first line of a blank run survives, normalised to "".
+    if (isBlank && previousWasBlank) continue;
+    kept.push(isBlank ? "" : line);
+    previousWasBlank = isBlank;
+  }
+  return kept.join("\n").trim();
+};
+/**
+ * True when the line, after trimming and after removing any leading `#` markers, is
+ * exactly "sources" or "references" (case-insensitive).
+ */
+const isSourcesHeadingLine = (line: string): boolean => {
+  const trimmed = line.trim();
+  let hashCount = 0;
+  while (trimmed[hashCount] === "#") hashCount += 1;
+  const title = hashCount > 0 ? trimmed.slice(hashCount).trimStart() : trimmed;
+  return /^sources$/iu.test(title) || /^references$/iu.test(title);
+};
+/**
+ * Returns the text that precedes the first Sources/References heading line, or the
+ * whole text (with CRLF normalised to LF) when there is no such heading.
+ */
+const splitAtSourcesHeading = (value: string): string => {
+  const normalized = value.replace(/\r\n/g, "\n");
+  const lines = normalized.split("\n");
+  const headingAt = lines.findIndex((line) => isSourcesHeadingLine(line));
+  return headingAt === -1 ? normalized : lines.slice(0, headingAt).join("\n");
+};
+/**
+ * Extracts URLs from the lines after the first Sources/References heading; when no
+ * heading exists, URLs are extracted from the entire text instead.
+ */
+const extractSourceSectionUrls = (value: string): string[] => {
+  const normalized = value.replace(/\r\n/g, "\n");
+  const lines = normalized.split("\n");
+  const headingAt = lines.findIndex((line) => isSourcesHeadingLine(line));
+  const searchable = headingAt === -1 ? normalized : lines.slice(headingAt + 1).join("\n");
+  return extractUrls(searchable);
+};
+/** A URL scan stops at whitespace or at a closing `)`, `>` or `]`. */
+const isUrlTerminator = (ch: string): boolean => {
+  if (ch === ")" || ch === ">" || ch === "]") return true;
+  return /\s/u.test(ch);
+};
+/**
+ * Collects the distinct `http://` / `https://` URLs in `value`, in first-seen order.
+ *
+ * A URL runs from its scheme to the next terminator (see `isUrlTerminator`); trailing
+ * sentence punctuation (`.,;:!?`) is then removed.
+ *
+ * The scheme is located with a case-insensitive pattern applied to `value` itself.
+ * Searching a lower-cased copy is not safe because lower-casing can change the string
+ * length (for example U+0130 becomes two code units), after which offsets found in the
+ * copy no longer line up with `value`.
+ *
+ * @param value Text to scan.
+ * @returns De-duplicated URLs.
+ */
+const extractUrls = (value: string): string[] => {
+  // No `u` flag: Unicode case folding would also accept look-alikes such as "httpſ".
+  const schemePattern = /https?:\/\//gi;
+  const urls: string[] = [];
+  let cursor = 0;
+  while (cursor < value.length) {
+    schemePattern.lastIndex = cursor;
+    const match = schemePattern.exec(value);
+    if (!match) break;
+    const start = match.index;
+    let end = start;
+    while (end < value.length && !isUrlTerminator(value[end]!)) end += 1;
+    urls.push(stripTrailingChars(value.slice(start, end), ".,;:!?"));
+    // `end` is always past `start` because the scheme itself contains no terminator.
+    cursor = end;
+  }
+  return [...new Set(urls)];
+};
+/**
+ * Replaces every `[label](url)` whose url starts with `http://` or `https://` by the
+ * string returned from `onLink(label, url)`. Links with any other target are copied
+ * through untouched, as is a `[` that does not open a complete `[...](...)` pair.
+ *
+ * The label ends at the first `]` after the opening bracket and the url at the first
+ * `)` after the opening parenthesis; nesting is not tracked.
+ */
+const rewriteMarkdownLinks = (
+  text: string,
+  onLink: (label: string, url: string) => string,
+): string => {
+  const pieces: string[] = [];
+  let cursor = 0;
+  while (cursor < text.length) {
+    const ch = text[cursor]!;
+    const labelEnd = ch === "[" ? text.indexOf("]", cursor + 1) : -1;
+    const urlEnd = labelEnd !== -1 && text[labelEnd + 1] === "("
+      ? text.indexOf(")", labelEnd + 2)
+      : -1;
+    if (urlEnd === -1) {
+      // Not the start of a complete link: copy one character and move on.
+      pieces.push(ch);
+      cursor += 1;
+      continue;
+    }
+    const label = text.slice(cursor + 1, labelEnd);
+    const url = text.slice(labelEnd + 2, urlEnd);
+    const isWebLink = url.startsWith("http://") || url.startsWith("https://");
+    pieces.push(isWebLink ? onLink(label, url) : text.slice(cursor, urlEnd + 1));
+    cursor = urlEnd + 1;
+  }
+  return pieces.join("");
+};
+/**
+ * ATX heading test: the line must begin with one to six `#` characters and the very
+ * next character must be Unicode whitespace. A bare run of hashes, a seventh `#`, or
+ * any leading indentation makes the line a non-heading.
+ */
+const isAtxMarkdownHeadingLine = (line: string): boolean => {
+  let level = 0;
+  while (level < 6 && line[level] === "#") level += 1;
+  if (level === 0 || level >= line.length) return false;
+  return /\s/u.test(line[level]!);
+};
+/** Removes every line that is an ATX heading; all other lines are kept unchanged. */
+const stripMarkdownHeaders = (text: string): string => {
+  const bodyLines = text.split("\n").filter((line) => !isAtxMarkdownHeadingLine(line));
+  return bodyLines.join("\n");
+};
+/** Table-row heuristic: after trimming, the line is at least two characters long and both starts and ends with `|`. */
+const looksLikeMarkdownTableRow = (line: string): boolean => {
+  const row = line.trim();
+  if (row.length < 2) return false;
+  return row.startsWith("|") && row.endsWith("|");
+};
+/** Removes every line that looks like a markdown table row. */
+const stripMarkdownTableRows = (text: string): string => {
+  const nonTableLines = text.split("\n").filter((line) => !looksLikeMarkdownTableRow(line));
+  return nonTableLines.join("\n");
+};
+/** Deletes every `*` character (and therefore every run of them) from the text. */
+const stripAsteriskRuns = (text: string): string => text.split("*").join("");
+/**
+ * Splits text into paragraphs separated by one or more blank lines. Each paragraph is
+ * trimmed and empty results are dropped.
+ */
+const splitParagraphs = (text: string): string[] => {
+  const paragraphs: string[] = [];
+  let pending: string[] = [];
+  const flush = (): void => {
+    if (pending.length === 0) return;
+    paragraphs.push(pending.join("\n").trim());
+    pending = [];
+  };
+  for (const line of text.split("\n")) {
+    if (line.trim() === "") {
+      flush();
+    } else {
+      pending.push(line);
+    }
+  }
+  flush();
+  return paragraphs.filter((paragraph) => paragraph.length > 0);
+};
+/**
+ * Canonical form used when comparing and de-duplicating URLs: surrounding whitespace
+ * removed, then every trailing `/` removed.
+ *
+ * Trimming happens first so that input such as "https://example.com/ " still loses its
+ * trailing slash.
+ */
+const normalizeUrl = (value: string): string => stripTrailingSlashes(value.trim());
+/**
+ * Short host label for display: the URL's hostname without a leading `www.`,
+ * lower-cased. Unparseable input falls back to its first 48 characters.
+ */
+const hostnameHint = (url: string): string => {
+  let hostname: string;
+  try {
+    hostname = new URL(url).hostname;
+  } catch {
+    return url.slice(0, 48);
+  }
+  return hostname.replace(/^www\./, "").toLowerCase();
+};
+/**
+ * Normalises each URL, drops empties and repeats, and keeps the first-seen order so
+ * downstream Holocron pulses stay stable.
+ */
+const uniqueUrlsPreserveOrder = (urls: readonly string[]): string[] => {
+  const seen = new Set<string>();
+  return urls
+    .map((raw) => normalizeUrl(raw))
+    .filter((url) => {
+      if (!url || seen.has(url)) return false;
+      seen.add(url);
+      return true;
+    });
+};
+/** Defensive reader for payload URL lists: non-arrays become `[]` and non-string members are dropped. */
+const payloadUrls = (values: readonly string[] | null | undefined): string[] => {
+  if (!Array.isArray(values)) return [];
+  return values.filter((value): value is string => typeof value === "string");
+};
+/**
+ * A URL is allowed when it is a public web citation URL and additionally matches a
+ * descriptor in `sourcePool`, or is accepted by `isTraskApprovedResearchUrl`, or is
+ * accepted by `isTraskApprovedBaseUrl` (checked in that order).
+ */
+const isAllowedSourceUrl = (url: string, sourcePool: readonly SourceDescriptor[]): boolean =>
+  isPublicWebCitationUrl(url)
+  && (
+    sourcePool.some((source) => sourceUrlMatchesDescriptor(url, source))
+    || isTraskApprovedResearchUrl(url, sourcePool)
+    || isTraskApprovedBaseUrl(url)
+  );
+/**
+ * Visited URLs reported by the payload (Holocron live facet pings): normalised,
+ * de-duplicated in order, then restricted to URLs allowed for `approvedSources`.
+ */
+const collectVisitedUrlsFromPayload = (
+  payload: WebResearchResponsePayload,
+  approvedSources: readonly SourceDescriptor[],
+): string[] => {
+  const visited = uniqueUrlsPreserveOrder(payloadUrls(payload.research_information?.visited_urls));
+  return visited.filter((url) => isAllowedSourceUrl(url, approvedSources));
+};
+/** Rejected source URLs reported by the payload: string entries only, normalised and de-duplicated in order. */
+const collectRejectedUrlsFromPayload = (payload: WebResearchResponsePayload): string[] =>
+  uniqueUrlsPreserveOrder(payloadUrls(payload.research_information?.rejected_source_urls));
+/** Upper bound on the number of per-URL "gather" events emitted for one payload. */
+const MAX_ARCHIVE_PROBE_EVENTS = 28;
+/**
+ * Emits one "gather" progress event per allowed visited URL, up to
+ * `MAX_ARCHIVE_PROBE_EVENTS`. URLs that match a catalog source are reported as
+ * `Facet · <name>` together with that source; the rest as `Touch · <host>`.
+ * Does nothing when no `onProgress` callback is supplied.
+ */
+const emitArchiveProbeEvents = (
+  payload: WebResearchResponsePayload,
+  approvedSources: readonly SourceDescriptor[],
+  onProgress?: (event: WebResearchProgressEvent) => void,
+): void => {
+  if (!onProgress) return;
+  const visited = collectVisitedUrlsFromPayload(payload, approvedSources);
+  for (const url of visited.slice(0, MAX_ARCHIVE_PROBE_EVENTS)) {
+    const matched = matchApprovedSource(url, approvedSources);
+    if (matched) {
+      onProgress({ phase: "gather", detail: `Facet · ${matched.name}`, sources: [matched] });
+    } else {
+      onProgress({ phase: "gather", detail: `Touch · ${hostnameHint(url)}` });
+    }
+  }
+};
+/**
+ * Finds the first approved source whose normalised home URL equals the normalised
+ * `url` or is a path prefix of it (home URL followed by `/`).
+ */
+const matchApprovedSource = (
+  url: string,
+  approvedSources: readonly SourceDescriptor[],
+): SourceDescriptor | undefined => {
+  const candidate = normalizeUrl(url);
+  for (const source of approvedSources) {
+    const homeUrl = normalizeUrl(source.homeUrl);
+    if (candidate === homeUrl || candidate.startsWith(`${homeUrl}/`)) return source;
+  }
+  return undefined;
+};
+/**
+ * Builds a display label for `url` relative to the catalog entry `source`.
+ *
+ * Returns `source.name` unchanged when the URL path equals the source's home path,
+ * when nothing is left after cleaning the path, or when URL parsing / percent-decoding
+ * throws. Otherwise returns `"<source name>: <cleaned path><optional line anchor>"`.
+ */
+const sourceUrlLabel = (source: SourceDescriptor, url: string): string => {
+  try {
+    const exact = new URL(url);
+    const base = new URL(source.homeUrl);
+    const exactPath = decodeURIComponent(stripTrailingSlashes(exact.pathname));
+    const basePath = decodeURIComponent(stripTrailingSlashes(base.pathname));
+    if (exactPath === basePath) return source.name;
+    // Express the path relative to the source's home path when it lives underneath it.
+    const relativePath = exactPath.startsWith(`${basePath}/`) ? exactPath.slice(basePath.length + 1) : exactPath;
+    // Drop a leading blob/<ref>/, tree/<ref>/ or wiki/ prefix, keep the last two path
+    // segments, and turn runs of `-` / `_` into single spaces.
+    const cleaned = relativePath
+      .replace(/^blob\/[^/]+\//u, "")
+      .replace(/^tree\/[^/]+\//u, "")
+      .replace(/^wiki\//u, "")
+      .split("/")
+      .filter(Boolean)
+      .slice(-2)
+      .join("/")
+      .replace(/[-_]+/gu, " ")
+      .trim();
+    if (!cleaned) return source.name;
+    // Only hashes shaped like #L12 or #L12-L34 are carried into the label.
+    const lineAnchor = exact.hash && /^#L\d+(?:-L\d+)?$/iu.test(exact.hash) ? exact.hash : "";
+    return `${source.name}: ${cleaned}${lineAnchor}`;
+  } catch {
+    return source.name;
+  }
+};
+/**
+ * Turns a concrete URL into a `SourceDescriptor`.
+ *
+ * - URL under a catalog source: a copy of that source whose `homeUrl` is the exact URL,
+ *   whose `name` comes from `sourceUrlLabel`, and whose `id` gains a `:<url>` suffix
+ *   unless the URL is the source's own home URL.
+ * - Otherwise, URL accepted by `isTraskApprovedBaseUrl`: a synthetic "website" descriptor.
+ * - Anything else: `undefined`.
+ */
+const exactSourceFromUrl = (url: string, approvedSources: readonly SourceDescriptor[]): SourceDescriptor | undefined => {
+  const exactUrl = normalizeUrl(url);
+  const catalogMatch = matchApprovedSource(url, approvedSources);
+  if (!catalogMatch) {
+    if (!isTraskApprovedBaseUrl(url)) return undefined;
+    const host = hostnameHint(url);
+    return {
+      id: `approved-web:${exactUrl}`,
+      name: host,
+      kind: "website",
+      homeUrl: exactUrl,
+      description: `Approved web source (${host})`,
+      freshnessPolicy: "live web research",
+      approvalScope: "approved research host",
+      tags: [host],
+    };
+  }
+  const isCatalogRoot = exactUrl === normalizeUrl(catalogMatch.homeUrl);
+  return {
+    ...catalogMatch,
+    id: isCatalogRoot ? catalogMatch.id : `${catalogMatch.id}:${exactUrl}`,
+    name: sourceUrlLabel(catalogMatch, exactUrl),
+    homeUrl: exactUrl,
+  };
+};
+/** True when `url` normalises to exactly the home URL of one of the approved sources. */
+const isCatalogRootUrl = (url: string, approvedSources: readonly SourceDescriptor[]): boolean => {
+  const target = normalizeUrl(url);
+  for (const source of approvedSources) {
+    if (normalizeUrl(source.homeUrl) === target) return true;
+  }
+  return false;
+};
+/**
+ * Converts raw URLs into at most six distinct source descriptors.
+ *
+ * Disallowed URLs are discarded first. If at least one remaining URL points deeper
+ * than a catalog root, bare catalog-root URLs are skipped in favour of the precise
+ * ones. Descriptors are de-duplicated by normalised `homeUrl`.
+ */
+const materializeSourcesFromUrls = (
+  urls: readonly string[],
+  sourcePool: readonly SourceDescriptor[],
+): readonly SourceDescriptor[] => {
+  const allowed = urls.filter((url) => isAllowedSourceUrl(url, sourcePool));
+  const candidateUrls = uniqueUrlsPreserveOrder(allowed);
+  const hasPreciseUrl = candidateUrls.some((url) => !isCatalogRootUrl(url, sourcePool));
+  const seenHomeUrls = new Set<string>();
+  const matched: SourceDescriptor[] = [];
+  for (const url of candidateUrls) {
+    if (hasPreciseUrl && isCatalogRootUrl(url, sourcePool)) continue;
+    const source = exactSourceFromUrl(url, sourcePool);
+    if (!source) continue;
+    const key = normalizeUrl(source.homeUrl);
+    if (seenHomeUrls.has(key)) continue;
+    seenHomeUrls.add(key);
+    matched.push(source);
+  }
+  return matched.slice(0, 6);
+};
+/**
+ * Sources the report cites: URLs from the report's Sources section first, then the
+ * payload's `cited_urls`, then its `source_urls`.
+ */
+const collectCitedSources = (
+  report: string,
+  approvedSources: readonly SourceDescriptor[],
+  payload: WebResearchResponsePayload,
+): readonly SourceDescriptor[] => {
+  const info = payload.research_information;
+  const urls = extractSourceSectionUrls(report).concat(
+    payloadUrls(info?.cited_urls),
+    payloadUrls(info?.source_urls),
+  );
+  return materializeSourcesFromUrls(urls, approvedSources);
+};
+/**
+ * Sources the run retrieved: the payload's `retrieved_urls`, `cited_urls` and
+ * `source_urls` (in that order), followed by URLs from the report's Sources section.
+ */
+const collectRetrievedSources = (
+  report: string,
+  approvedSources: readonly SourceDescriptor[],
+  payload: WebResearchResponsePayload,
+): readonly SourceDescriptor[] => {
+  const info = payload.research_information;
+  const urls = payloadUrls(info?.retrieved_urls).concat(
+    payloadUrls(info?.cited_urls),
+    payloadUrls(info?.source_urls),
+    extractSourceSectionUrls(report),
+  );
+  return materializeSourcesFromUrls(urls, approvedSources);
+};
+/** Sources cited by free text: the URLs of its Sources section, materialised against `sourcePool`. */
+const collectCitedSourcesFromText = (
+  text: string,
+  sourcePool: readonly SourceDescriptor[],
+): readonly SourceDescriptor[] => {
+  const sectionUrls = extractSourceSectionUrls(text);
+  return materializeSourcesFromUrls(sectionUrls, sourcePool);
+};
+/** True for a line beginning `##`, optional whitespace, then "table of contents" (case-insensitive). */
+const startsWithTableOfContentsHeading = (trimmed: string): boolean => {
+  const lower = trimmed.toLowerCase();
+  if (!lower.startsWith("##")) return false;
+  return lower.slice(2).trimStart().startsWith("table of contents");
+};
+/** `##` at the start of the line with Unicode whitespace as the third character (the former `^##\\s+` check). */
+const startsWithH2WithSpace = (trimmed: string): boolean => {
+  if (!trimmed.startsWith("##")) return false;
+  // charAt yields "" past the end, which never matches \s.
+  return /\s/u.test(trimmed.charAt(2));
+};
+/** H1 title line: exactly one leading `#` (not `##…`) followed by Unicode whitespace. */
+const isH1AtxHeadingLine = (trimmed: string): boolean =>
+  trimmed.startsWith("#")
+  && !trimmed.startsWith("##")
+  && trimmed.length > 1
+  && /\s/u.test(trimmed[1]!);
+/**
+ * Cleans a raw report before it is rewritten for Discord:
+ * - CRLF is normalised to LF;
+ * - a `## Table of Contents` heading and the lines that follow it are dropped, up to
+ *   (not including) the next `## ` heading, Sources/References heading, or H1 line;
+ * - every H1 (`# …`) line is dropped;
+ * - runs of blank lines are collapsed and the result is trimmed.
+ */
+const normalizeReport = (value: string): string => {
+  const lines = value.replace(/\r\n/g, "\n").split("\n");
+  const out: string[] = [];
+  // True while we are inside a table-of-contents section that is being discarded.
+  let skippingToc = false;
+  for (const line of lines) {
+    const trimmed = line.trim();
+    if (startsWithTableOfContentsHeading(trimmed)) {
+      skippingToc = true;
+      continue;
+    }
+    if (skippingToc) {
+      if (
+        startsWithH2WithSpace(trimmed)
+        || isSourcesHeadingLine(line)
+        || isH1AtxHeadingLine(trimmed)
+      ) {
+        // The line that ends the TOC falls through to the normal handling below.
+        skippingToc = false;
+      } else {
+        continue;
+      }
+    }
+    if (isH1AtxHeadingLine(trimmed)) continue;
+    out.push(line);
+  }
+  return collapseExcessiveNewlines(out.join("\n"));
+};
+/** Renders the trailing block: a "Sources" line followed by `N. <name> - <homeUrl>` lines numbered from 1. */
+const formatSourcesSection = (sources: readonly SourceDescriptor[]): string => {
+  const lines: string[] = ["Sources"];
+  sources.forEach((source, index) => {
+    lines.push(`${index + 1}. ${source.name} - ${source.homeUrl}`);
+  });
+  return lines.join("\n");
+};
+/**
+ * Number of distinct public web URLs the payload mentions across its cited,
+ * retrieved, visited and source URL lists.
+ */
+const countPayloadWebUrls = (payload: WebResearchResponsePayload): number => {
+  const info = payload.research_information;
+  const distinct = uniqueUrlsPreserveOrder([
+    ...payloadUrls(info?.cited_urls),
+    ...payloadUrls(info?.retrieved_urls),
+    ...payloadUrls(info?.visited_urls),
+    ...payloadUrls(info?.source_urls),
+  ]);
+  let count = 0;
+  for (const url of distinct) {
+    if (isPublicWebCitationUrl(url)) count += 1;
+  }
+  return count;
+};
+const LEGACY_APPROVED_ARCHIVE_BULLET_MARKER =
+  "is an approved archive page that may answer questions about";
+/**
+ * Detects legacy failure copy: text that starts with `-`, then optional whitespace,
+ * then one whitespace-free word, with the fixed marker phrase occurring somewhere after
+ * that word (case-insensitive).
+ * Hand-written scanning instead of a `.*`-style regex keeps the check free of
+ * polynomial backtracking on adversarial input.
+ */
+const hasLegacyApprovedArchiveFailureBullet = (normalized: string): boolean => {
+  const lower = normalized.toLowerCase();
+  if (!lower.startsWith("-")) return false;
+  const marker = LEGACY_APPROVED_ARCHIVE_BULLET_MARKER.toLowerCase();
+  const isSpaceAt = (index: number): boolean => /\s/u.test(lower[index]!);
+  let cursor = 1;
+  while (cursor < lower.length && isSpaceAt(cursor)) cursor += 1;
+  // Nothing but whitespace after the dash: no bullet text to inspect.
+  if (cursor >= lower.length) return false;
+  while (cursor < lower.length && !isSpaceAt(cursor)) cursor += 1;
+  return lower.includes(marker, cursor);
+};
+/**
+ * Decides whether a report is a synthesis-failure notice.
+ *
+ * The explicit "I could not complete live archive synthesis" opener always counts.
+ * The legacy approved-archive bullet counts only when the payload carries fewer than
+ * `MIN_HOLOCRON_WEB_CITATIONS` public web URLs.
+ */
+const isSynthesisFailureReport = (report: string, payload: WebResearchResponsePayload): boolean => {
+  const normalized = report.trim();
+  const webUrlCount = countPayloadWebUrls(payload);
+  if (/^i could not complete live archive synthesis\b/iu.test(normalized)) return true;
+  if (webUrlCount >= MIN_HOLOCRON_WEB_CITATIONS) return false;
+  return hasLegacyApprovedArchiveFailureBullet(normalized);
+};
+/**
+ * Answer used when sources were found but no grounded text could be produced: a short
+ * explanation naming the query topic, followed by the Sources section. With no sources
+ * at all it returns the plain synthesis-failure sentence.
+ */
+const sourceOnlyFallbackAnswer = (query: string, sources: readonly SourceDescriptor[]): string => {
+  if (sources.length === 0) return "I could not complete live archive synthesis for this question right now.";
+  const topic = stripTrailingQuestionMarks(query) || "this question";
+  const lines: string[] = [];
+  lines.push(`I found candidate sources for ${topic}, but I could not support a grounded answer from the retrieved evidence.`);
+  lines.push("Review the sources below or try a narrower wording.");
+  lines.push("");
+  lines.push(formatSourcesSection(sources));
+  return lines.join("\n");
+};
+// Rewrite tuning knobs (15 seconds / 2 attempts).
+// NOTE(review): their consumers are outside the visible part of this file — presumably the LLM rewrite step; confirm.
+const DEFAULT_REWRITE_TIMEOUT_MS = 15_000;
+const MAX_REWRITE_ATTEMPTS = 2;
+/**
+ * Normalises a preferred model id: trims it and removes one leading `litellm:` or
+ * `openrouter:` prefix. Returns `undefined` for missing/blank input or when nothing
+ * remains after the prefix.
+ */
+const normalizePreferredRewriteModel = (model: string | undefined): string | undefined => {
+  const trimmed = model?.trim();
+  if (!trimmed) return undefined;
+  for (const prefix of ["litellm:", "openrouter:"]) {
+    if (trimmed.startsWith(prefix)) {
+      return trimmed.slice(prefix.length).trim() || undefined;
+    }
+  }
+  return trimmed;
+};
+/**
+ * Settles with `promise`, or rejects with a "rewrite timed out" error if `promise` has
+ * not settled within `timeoutMs`. The timer is always cleared once a result is known.
+ */
+const withTimeout = async <T>(promise: Promise<T>, timeoutMs: number): Promise<T> => {
+  let timer: ReturnType<typeof setTimeout> | undefined;
+  const deadline = new Promise<never>((_resolve, reject) => {
+    timer = setTimeout(() => {
+      reject(new Error(`rewrite timed out after ${timeoutMs}ms`));
+    }, timeoutMs);
+  });
+  try {
+    return await Promise.race([promise, deadline]);
+  } finally {
+    clearTimeout(timer);
+  }
+};
+/**
+ * Deterministic (no-LLM) rewrite of a raw research report into a short Discord answer.
+ *
+ * Steps: normalise the report, cut everything from the Sources heading onward, turn
+ * markdown links into `label [n]` citations, strip ATX headings, table rows and
+ * asterisks, keep at most two leading paragraphs (the first always; the second only if
+ * the running total stays within 900 characters), then append a regenerated Sources
+ * section.
+ *
+ * @param query   Original user question; used only by the fallback messages.
+ * @param report  Raw report text.
+ * @param sources Sources to cite; a source's position + 1 is its citation number.
+ * @returns The answer text, or a fallback message when there are no sources, when the
+ *          report is the synthesis-failure notice, or when no body text survives cleanup.
+ */
+const fallbackDiscordRewrite = (
+  query: string,
+  report: string,
+  sources: readonly SourceDescriptor[],
+): string => {
+  if (sources.length === 0) {
+    return degradedAnswerFallback(query, sources);
+  }
+  const normalized = normalizeReport(report);
+  if (/^i could not complete live archive synthesis\b/iu.test(normalized)) {
+    return sourceOnlyFallbackAnswer(query, sources);
+  }
+  const sourceIndexByUrl = new Map<string, number>(
+    sources.map((source, index) => [normalizeUrl(source.homeUrl), index + 1]),
+  );
+  const bodyOnly = collapseExcessiveNewlines(
+    stripAsteriskRuns(
+      stripMarkdownTableRows(
+        stripMarkdownHeaders(
+          rewriteMarkdownLinks(splitAtSourcesHeading(normalized), (text, url) => {
+            const matchedSource = matchApprovedSource(url, sources);
+            const citationIndex = matchedSource ? sourceIndexByUrl.get(normalizeUrl(matchedSource.homeUrl)) : undefined;
+            return citationIndex ? `${text} [${citationIndex}]` : text;
+          }),
+        ),
+      ),
+    ),
+  );
+  // Cleanup can remove everything (e.g. a report made only of headings and tables).
+  // A lone "[1]" is not an answer, so hand the user the sources instead.
+  if (!bodyOnly) {
+    return sourceOnlyFallbackAnswer(query, sources);
+  }
+  const selected: string[] = [];
+  let totalLength = 0;
+  for (const paragraph of splitParagraphs(bodyOnly)) {
+    if (selected.length >= 2) break;
+    if (totalLength + paragraph.length > 900 && selected.length > 0) break;
+    selected.push(paragraph);
+    totalLength += paragraph.length;
+  }
+  let summary = selected.join("\n\n").trim();
+  // Guarantee at least one inline citation so the text is tied to the Sources list.
+  if (!/\[\d+\]/.test(summary)) {
+    summary = `${summary} [1]`;
+  }
+  return `${summary}\n\n${formatSourcesSection(sources)}`;
+};
+/**
+ * Deterministic (no-LLM) brief: the first paragraph of the cleaned report, capped at
+ * 420 UTF-16 code units, followed by a regenerated Sources section.
+ *
+ * Cleanup matches the full rewrite except that table rows are kept: the report is
+ * normalised, cut at the Sources heading, markdown links become `label [n]` citations,
+ * and ATX headings and asterisks are stripped.
+ *
+ * @param query   Original user question; used only by the fallback messages.
+ * @param report  Raw report text.
+ * @param sources Sources to cite; a source's position + 1 is its citation number.
+ * @returns The brief, or a fallback message when there are no sources, when the report
+ *          is the synthesis-failure notice, or when no body text survives cleanup.
+ */
+const fallbackDiscordBrief = (query: string, report: string, sources: readonly SourceDescriptor[]): string => {
+  if (sources.length === 0) {
+    return degradedAnswerFallback(query, sources);
+  }
+  const normalized = normalizeReport(report);
+  if (/^i could not complete live archive synthesis\b/iu.test(normalized)) {
+    return sourceOnlyFallbackAnswer(query, sources);
+  }
+  const sourceIndexByUrl = new Map<string, number>(
+    sources.map((source, index) => [normalizeUrl(source.homeUrl), index + 1]),
+  );
+  const bodyOnly = collapseExcessiveNewlines(
+    stripAsteriskRuns(
+      stripMarkdownHeaders(
+        rewriteMarkdownLinks(splitAtSourcesHeading(normalized), (text, url) => {
+          const matchedSource = matchApprovedSource(url, sources);
+          const citationIndex = matchedSource ? sourceIndexByUrl.get(normalizeUrl(matchedSource.homeUrl)) : undefined;
+          return citationIndex ? `${text} [${citationIndex}]` : text;
+        }),
+      ),
+    ),
+  );
+  // Cleanup can remove everything; a lone "[1]" is not an answer, so hand the user the
+  // sources instead.
+  if (!bodyOnly) {
+    return sourceOnlyFallbackAnswer(query, sources);
+  }
+  const firstChunk = splitParagraphs(bodyOnly)[0] ?? bodyOnly;
+  let summary = firstChunk.slice(0, 420);
+  // slice() counts UTF-16 code units: drop a high surrogate left dangling by the cut.
+  if (/[\uD800-\uDBFF]$/.test(summary)) {
+    summary = summary.slice(0, -1);
+  }
+  summary = summary.trim();
+  // Guarantee at least one inline citation so the text is tied to the Sources list.
+  if (!/\[\d+\]/.test(summary)) {
+    summary = `${summary} [1]`;
+  }
+  return `${summary}\n\n${formatSourcesSection(sources)}`;
+};
+/** Fixed failure sentence; both parameters are accepted for call-site symmetry and ignored. */
+const degradedAnswerFallback = (_query: string, _approvedSources: readonly SourceDescriptor[]): string =>
+  "I could not complete live archive synthesis for this question right now.";
+/** Parses a trimmed URL with trailing slashes removed; returns `undefined` when it does not parse. */
+const normalizePreferenceUrl = (url: string): URL | undefined => {
+  const cleaned = stripTrailingSlashes(url.trim());
+  try {
+    return new URL(cleaned);
+  } catch {
+    return undefined;
+  }
+};
+/**
+ * Whether a user preference targets a given source.
+ *
+ * URL rule (both URLs must parse): same host ignoring a leading `www.` and case, and
+ * the preference path is empty (host root), equal to the source path, or a parent
+ * directory of it. Otherwise the preference matches when its trimmed, lower-cased
+ * `name` equals the source name.
+ */
+const preferenceMatchesSource = (preference: WebResearchSourcePreference, source: SourceDescriptor): boolean => {
+  const bareHost = (parsed: URL): string => parsed.hostname.replace(/^www\./, "").toLowerCase();
+  const preferenceUrl = normalizePreferenceUrl(preference.url);
+  const sourceUrl = normalizePreferenceUrl(source.homeUrl);
+  if (preferenceUrl && sourceUrl && bareHost(preferenceUrl) === bareHost(sourceUrl)) {
+    const preferencePath = stripTrailingSlashes(preferenceUrl.pathname);
+    const sourcePath = stripTrailingSlashes(sourceUrl.pathname);
+    // A root pathname ("/") strips down to "", so the host-root case is covered here.
+    const pathMatches = preferencePath === ""
+      || sourcePath === preferencePath
+      || sourcePath.startsWith(`${preferencePath}/`);
+    if (pathMatches) return true;
+  }
+  const preferenceName = preference.name?.trim().toLowerCase();
+  return Boolean(preferenceName && preferenceName === source.name.trim().toLowerCase());
+};
+/**
+ * Applies user source preferences to the approved list.
+ *
+ * Each source takes the first preference that matches it. Sources whose preference is
+ * not enabled are removed; the rest are ordered by weight descending (default weight 1,
+ * also used for non-finite weights) with the original order as tie-breaker. With no
+ * preferences the input list is returned as-is.
+ */
+const applySourcePreferences = (
+  approvedSources: readonly SourceDescriptor[],
+  preferences?: readonly WebResearchSourcePreference[],
+): readonly SourceDescriptor[] => {
+  if (!preferences?.length) return approvedSources;
+  const weighted: { source: SourceDescriptor; index: number; weight: number }[] = [];
+  approvedSources.forEach((source, index) => {
+    const preference = preferences.find((entry) => preferenceMatchesSource(entry, source));
+    if (preference && !preference.enabled) return;
+    const weight = preference && Number.isFinite(preference.weight) ? preference.weight : 1;
+    weighted.push({ source, index, weight });
+  });
+  weighted.sort((left, right) => right.weight - left.weight || left.index - right.index);
+  return weighted.map((entry) => entry.source);
+};
+/** Coarse query category used to route a question to source groups. */
+type ResearchQueryIntent = "tooling" | "technical" | "lore" | "general";
+// Lower-case keyword tables consumed by classifyQueryIntent. The tables are tested in
+// the order tooling -> technical -> lore, so a query that hits several is classified
+// by the first table that matches.
+const TOOLING_QUERY_TERMS = [
+  "mdlops",
+  "mdledit",
+  "kotormax",
+  "kotorblender",
+  "pykotor",
+  "xoreos",
+  "reone",
+  "tslpatcher",
+  "toolchain",
+  "modding",
+  "tool",
+  "script",
+  "gff",
+  "2da",
+  "tlk",
+  "nss",
+  "ncs",
+  "utc",
+  "uti",
+  "mdl",
+  "mdx",
+  "texture",
+  "convert",
+  "blender",
+  "3ds",
+];
+const TECHNICAL_QUERY_TERMS = [
+  "widescreen",
+  "resolution",
+  "hud",
+  "screen",
+  "crash",
+  "compatibility",
+  "steam",
+  "windows",
+  "linux",
+  "mac",
+  "save",
+  "saves",
+  "install",
+  "launcher",
+  "driver",
+  "movies",
+  "cutscene",
+  "graphics",
+  "aspect",
+];
+const LORE_QUERY_TERMS = [
+  "bastila",
+  "revan",
+  "malak",
+  "shan",
+  "jedi",
+  "sith",
+  "rakata",
+  "star forge",
+  "temple summit",
+  "companion",
+  "romance",
+  "story",
+  "lore",
+];
+// Source ids treated as lore references: routeSourcesForQuery removes them for
+// tooling/technical questions and moves them to the front for lore questions.
+const LORE_SOURCE_IDS = new Set(["wikipedia-kotor", "strategywiki-kotor"]);
+/**
+ * Case-insensitive keyword test: true when `query` contains at least one of `terms`.
+ *
+ * Terms longer than three characters match anywhere in the query. Shorter terms (file
+ * extensions and abbreviations such as "uti", "utc", "mdl") must begin at a word
+ * boundary - the start of the query or right after a character that is neither a letter
+ * nor a digit - so that they do not fire inside unrelated words ("uti" in
+ * "resolution"). They may still be followed by more characters ("3ds" matches "3dsmax").
+ *
+ * @param query Text to inspect.
+ * @param terms Lower-case keywords.
+ */
+const queryIncludesAny = (query: string, terms: readonly string[]): boolean => {
+  const shortTermMaxLength = 3;
+  const lowered = query.toLowerCase();
+  const startsAtWordBoundary = (index: number): boolean =>
+    index === 0 || !/[\p{L}\p{N}]/u.test(lowered[index - 1]!);
+  return terms.some((term) => {
+    if (term.length > shortTermMaxLength) return lowered.includes(term);
+    // Check every occurrence: an early mid-word hit must not hide a later real one.
+    for (let at = lowered.indexOf(term); at !== -1; at = lowered.indexOf(term, at + 1)) {
+      if (startsAtWordBoundary(at)) return true;
+    }
+    return false;
+  });
+};
+/**
+ * Classifies a query by keyword tables, tested in priority order tooling ->
+ * technical -> lore; "general" when none of the tables match.
+ */
+const classifyQueryIntent = (query: string): ResearchQueryIntent => {
+  const lowered = query.toLowerCase();
+  const rules: readonly (readonly [ResearchQueryIntent, readonly string[]])[] = [
+    ["tooling", TOOLING_QUERY_TERMS],
+    ["technical", TECHNICAL_QUERY_TERMS],
+    ["lore", LORE_QUERY_TERMS],
+  ];
+  for (const [intent, terms] of rules) {
+    if (queryIncludesAny(lowered, terms)) return intent;
+  }
+  return "general";
+};
+/**
+ * Orders / filters the approved sources for a query's intent:
+ * - tooling or technical: lore sources are removed (unless that would leave nothing);
+ * - lore: lore sources are moved to the front, the rest keep their order;
+ * - general: the list is returned unchanged.
+ */
+const routeSourcesForQuery = (
+  query: string,
+  approvedSources: readonly SourceDescriptor[],
+): readonly SourceDescriptor[] => {
+  const isLoreSource = (source: SourceDescriptor): boolean => LORE_SOURCE_IDS.has(source.id);
+  switch (classifyQueryIntent(query)) {
+    case "tooling":
+    case "technical": {
+      const nonLore = approvedSources.filter((source) => !isLoreSource(source));
+      return nonLore.length > 0 ? nonLore : approvedSources;
+    }
+    case "lore":
+      return [
+        ...approvedSources.filter((source) => isLoreSource(source)),
+        ...approvedSources.filter((source) => !isLoreSource(source)),
+      ];
+    default:
+      return approvedSources;
+  }
+};
+/**
+ * Concatenates the groups in order, keeping only the first source seen for each
+ * normalised `homeUrl`.
+ */
+const mergeSourcesPreserveOrder = (...groups: readonly (readonly SourceDescriptor[])[]): SourceDescriptor[] => {
+  // A Map iterates in insertion order, which is exactly the first-seen order we need.
+  const firstByUrl = new Map<string, SourceDescriptor>();
+  for (const group of groups) {
+    for (const source of group) {
+      const key = normalizeUrl(source.homeUrl);
+      if (!firstByUrl.has(key)) firstByUrl.set(key, source);
+    }
+  }
+  return [...firstByUrl.values()];
+};
+/** Crude stemmer: the lower-cased token cut to its first six characters. */
+const normalizeMatchToken = (token: string): string => token.toLowerCase().slice(0, 6);
+/**
+ * Distinct match tokens of a query: lower-case it, replace everything except
+ * letters, digits, whitespace and `-` with spaces, split on whitespace, keep words of
+ * at least four characters, and reduce each through `normalizeMatchToken`.
+ */
+const tokenizeQuery = (query: string): string[] => {
+  const words = query
+    .toLowerCase()
+    .replace(/[^\p{L}\p{N}\s-]/gu, " ")
+    .split(/\s+/);
+  const stems = words
+    .filter((word) => word.length >= 4)
+    .map((word) => normalizeMatchToken(word));
+  return [...new Set(stems)];
+};
+/**
+ * Citations must be real public web pages (live web research only): rejects
+ * `local://` and `discord://` URLs, anything `isDiscordCitationUrl` recognises, and
+ * anything that does not parse as an `http:` or `https:` URL.
+ */
+const isPublicWebCitationUrl = (url: string): boolean => {
+  const isInternalScheme = url.startsWith("local://") || url.startsWith("discord://");
+  if (isInternalScheme || isDiscordCitationUrl(url)) return false;
+  let protocol: string;
+  try {
+    protocol = new URL(url).protocol;
+  } catch {
+    return false;
+  }
+  return protocol === "https:" || protocol === "http:";
+};
+/** Keeps only the sources whose `homeUrl` is a public web citation URL. */
+const filterPublicWebCitationSources = (sources: readonly SourceDescriptor[]): SourceDescriptor[] => {
+  const webSources: SourceDescriptor[] = [];
+  for (const source of sources) {
+    if (isPublicWebCitationUrl(source.homeUrl)) webSources.push(source);
+  }
+  return webSources;
+};
+/**
+ * Holocron e2e and product policy: answers must ground on multiple approved web sources.
+ * In this file the value is the threshold used by `isSynthesisFailureReport` and
+ * `ensureMinimumWebCitations`.
+ */
+export const MIN_HOLOCRON_WEB_CITATIONS = 2;
+/**
+ * Gathers every public web source the run produced evidence for - retrieved, cited
+ * and visited, merged in that order - and reranks them by relevance to the query.
+ */
+const collectWebEvidenceSources = (
+  query: string,
+  report: string,
+  approvedSources: readonly SourceDescriptor[],
+  payload: WebResearchResponsePayload,
+): readonly SourceDescriptor[] => {
+  const retrieved = collectRetrievedSources(report, approvedSources, payload);
+  const cited = collectCitedSources(report, approvedSources, payload);
+  const visited = materializeSourcesFromUrls(
+    collectVisitedUrlsFromPayload(payload, approvedSources),
+    approvedSources,
+  );
+  const pool = mergeSourcesPreserveOrder(retrieved, cited, visited);
+  return rerankEvidenceSources(query, filterPublicWebCitationSources(pool));
+};
+/**
+ * Tries to return at least `MIN_HOLOCRON_WEB_CITATIONS` public web sources (never more
+ * than eight) for an answer.
+ *
+ * 1. Merge cited, evidence and payload-derived sources, rerank them for the query and
+ *    keep the public web ones; return them if there are enough.
+ * 2. Otherwise pad with the public web evidence and the payload-derived sources, rerank
+ *    again, and return that if there are enough.
+ * 3. Otherwise return the public web payload-derived sources, which may be fewer than
+ *    the minimum (possibly none).
+ *
+ * @param payload         Optional; when absent no payload-derived sources are added.
+ * @param approvedSources Catalog used to materialise payload URLs (defaults to empty).
+ */
+const ensureMinimumWebCitations = (
+  query: string,
+  cited: readonly SourceDescriptor[],
+  evidence: readonly SourceDescriptor[],
+  payload?: WebResearchResponsePayload,
+  approvedSources: readonly SourceDescriptor[] = [],
+): readonly SourceDescriptor[] => {
+  const info = payload?.research_information;
+  // Sources backed by any URL list in the payload (cited, retrieved, visited, source).
+  const payloadBacked = payload
+    ? materializeSourcesFromUrls(
+      uniqueUrlsPreserveOrder([
+        ...payloadUrls(info?.cited_urls),
+        ...payloadUrls(info?.retrieved_urls),
+        ...payloadUrls(info?.visited_urls),
+        ...payloadUrls(info?.source_urls),
+      ]),
+      approvedSources,
+    )
+    : [];
+  const merged = rerankEvidenceSources(
+    query,
+    mergeSourcesPreserveOrder(cited, evidence, payloadBacked),
+  );
+  const webOnly = filterPublicWebCitationSources(merged);
+  if (webOnly.length >= MIN_HOLOCRON_WEB_CITATIONS) {
+    return webOnly.slice(0, 8);
+  }
+  // Too few survived: pad and rerank once more.
+  const padded = rerankEvidenceSources(
+    query,
+    mergeSourcesPreserveOrder(webOnly, filterPublicWebCitationSources(evidence), payloadBacked),
+  );
+  // Last resort skips reranking so low-scoring payload sources are not dropped.
+  return padded.length >= MIN_HOLOCRON_WEB_CITATIONS
+    ? padded.slice(0, 8)
+    : filterPublicWebCitationSources(payloadBacked).slice(0, 8);
+};
+/**
+ * Source-only answer built from up to five public web sources; when none of the given
+ * sources is a public web source, the unfiltered list is used instead.
+ */
+const composeAnswerFromWebSources = (query: string, sources: readonly SourceDescriptor[]): string => {
+  const webSources = filterPublicWebCitationSources(sources).slice(0, 5);
+  return sourceOnlyFallbackAnswer(query, webSources.length === 0 ? sources : webSources);
+};
+/**
+ * True when enough query tokens occur in the source's name, description or home URL:
+ * two hits are required, or one when the query yields a single token. A query without
+ * tokens never matches.
+ */
+const sourceMatchesQuery = (source: SourceDescriptor, query: string): boolean => {
+  const tokens = tokenizeQuery(query);
+  if (tokens.length === 0) return false;
+  const haystack = `${source.name} ${source.description ?? ""} ${source.homeUrl}`.toLowerCase();
+  const required = Math.min(2, tokens.length);
+  return tokens.filter((token) => haystack.includes(token)).length >= required;
+};
+/**
+ * Relevance score of a source for a query: 2 points per query token found in the
+ * source's name/description/URL/tags, +2 if any token occurs in the name, +1 if any
+ * token occurs in the URL. A query without tokens scores every source 1.
+ */
+const sourceRelevanceScore = (source: SourceDescriptor, query: string): number => {
+  const tokens = tokenizeQuery(query);
+  if (tokens.length === 0) return 1;
+  const haystack = [
+    source.name,
+    source.description,
+    source.homeUrl,
+    ...(source.tags ?? []),
+  ].join(" ").toLowerCase();
+  const name = source.name.toLowerCase();
+  const homeUrl = source.homeUrl.toLowerCase();
+  const hits = tokens.filter((token) => haystack.includes(token)).length;
+  const titleBonus = tokens.some((token) => name.includes(token)) ? 2 : 0;
+  const urlBonus = tokens.some((token) => homeUrl.includes(token)) ? 1 : 0;
+  return hits * 2 + titleBonus + urlBonus;
+};
+/**
+ * Orders sources by relevance score (descending, original order as tie-breaker).
+ * With a tokenless query the top four are returned; otherwise only sources scoring at
+ * least 2 are kept, capped at eight.
+ */
+const rerankEvidenceSources = (query: string, sources: readonly SourceDescriptor[]): readonly SourceDescriptor[] => {
+  const scored = sources.map((source, index) => ({
+    source,
+    index,
+    score: sourceRelevanceScore(source, query),
+  }));
+  scored.sort((left, right) => right.score - left.score || left.index - right.index);
+  if (tokenizeQuery(query).length === 0) {
+    return scored.slice(0, 4).map((entry) => entry.source);
+  }
+  return scored
+    .filter((entry) => entry.score >= 2)
+    .slice(0, 8)
+    .map((entry) => entry.source);
+};
+/**
+ * Picks up to five public web sources to show after a failed synthesis, preferring
+ * those that match the query and falling back to all public web candidates.
+ */
+const resolveWebSourcesForFailedSynthesis = (
+  query: string,
+  retrievedSources: readonly SourceDescriptor[],
+): readonly SourceDescriptor[] => {
+  const candidates = filterPublicWebCitationSources(retrievedSources);
+  const matched = candidates.filter((source) => sourceMatchesQuery(source, query));
+  const chosen = matched.length > 0 ? matched : candidates;
+  return chosen.slice(0, 5);
+};
+/**
+ * Distinct approved base hosts covered by the given sources. A source counts toward a
+ * base host when its hostname (without `www.`, lower-cased) equals that host or is a
+ * subdomain of it. Sources whose home URL does not parse are ignored.
+ */
+const researchDomainsForSources = (sources: readonly SourceDescriptor[]): string[] => {
+  const baseHostFor = (homeUrl: string): string | undefined => {
+    try {
+      const host = new URL(homeUrl).hostname.replace(/^www\./, "").toLowerCase();
+      return traskApprovedResearchBaseHosts.find((base) => host === base || host.endsWith(`.${base}`));
+    } catch {
+      return undefined;
+    }
+  };
+  const enabledHosts = new Set<string>();
+  for (const source of sources) {
+    const baseHost = baseHostFor(source.homeUrl);
+    if (baseHost) enabledHosts.add(baseHost);
+  }
+  return [...enabledHosts];
+};
+/** Interval, in milliseconds, between heartbeat progress events. */
+const HEARTBEAT_MS = 8000;
+/**
+ * Runs `work()` while periodically reporting progress.
+ *
+ * With an `onProgress` callback, one event is emitted immediately and further events
+ * are attempted every `HEARTBEAT_MS` until `work()` settles; the interval is cleared in
+ * a `finally`, so it stops on success and on failure alike. Without a callback,
+ * `work()` is simply awaited.
+ *
+ * @param phase      Phase label copied into every emitted event.
+ * @param makeDetail Builds the event detail from the elapsed milliseconds.
+ * @param onProgress Optional progress sink.
+ * @param work       The asynchronous operation to run.
+ * @returns Whatever `work()` resolves to; its rejection propagates unchanged.
+ */
+const withProgressHeartbeat = async <T>(
+  phase: WebResearchProgressEvent["phase"],
+  makeDetail: (elapsedMs: number) => string,
+  onProgress: ((event: WebResearchProgressEvent) => void) | undefined,
+  work: () => Promise<T>,
+): Promise<T> => {
+  if (!onProgress) {
+    return await work();
+  }
+  const startedAt = Date.now();
+  // Index of the HEARTBEAT_MS-wide window that last produced an event; -1 = none yet.
+  let lastBucket = -1;
+  const emit = () => {
+    const elapsed = Date.now() - startedAt;
+    const bucket = Math.floor(elapsed / HEARTBEAT_MS);
+    // At most one event per window, even if the timer fires twice inside the same one.
+    if (bucket === lastBucket) return;
+    lastBucket = bucket;
+    onProgress({ phase, detail: makeDetail(elapsed) });
+  };
+  emit();
+  const timer = setInterval(emit, HEARTBEAT_MS);
+  try {
+    return await work();
+  } finally {
+    clearInterval(timer);
+  }
+};
+/**
+ * Answers questions by running headless web research over approved sources and
+ * rewriting the resulting report into a Discord-sized reply.
+ * An OpenAI client is created only when an API key is configured; without one,
+ * every rewrite goes through the non-LLM fallback formatters.
+ */
+export class WebResearchClient implements WebResearchQueryHandler {
+  // null when aiConfig.openAiApiKey is not set.
+  private readonly openAiClient: OpenAI | null;
+  /**
+   * @param config runtime configuration passed to the headless research runner
+   * @param aiConfig shared AI settings (API key, base URL, headers, chat models)
+   * @param approvedSources catalog of sources research may use; defaults to traskApprovedResearchSources
+   * @param factoryOptions optional collaborators such as a local search provider
+   */
+  public constructor(
+    private readonly config: WebResearchRuntimeConfig,
+    private readonly aiConfig: SharedAiConfig,
+    private readonly approvedSources: readonly SourceDescriptor[] = traskApprovedResearchSources,
+    private readonly factoryOptions: WebResearchClientFactoryOptions = {},
+  ) {
+    this.openAiClient = aiConfig.openAiApiKey
+      ? new OpenAI({
+          apiKey: aiConfig.openAiApiKey,
+          ...(aiConfig.openAiBaseUrl ? { baseURL: aiConfig.openAiBaseUrl } : {}),
+          ...(aiConfig.openAiDefaultHeaders ? { defaultHeaders: aiConfig.openAiDefaultHeaders } : {}),
+        })
+      : null;
+  }
+  /**
+   * List selectable research models: the built-in defaults first, then any
+   * dynamically discovered models whose id is not already listed.
+   * Falls back to the defaults alone if discovery throws.
+   */
+  public async listModels(): Promise<readonly WebResearchModelOption[]> {
+    try {
+      const dynamicModels = await listHeadlessWebResearchModels(this.config);
+      const seen = new Set(DEFAULT_WEB_RESEARCH_MODELS.map((model) => model.id));
+      return [
+        ...DEFAULT_WEB_RESEARCH_MODELS,
+        ...dynamicModels.filter((model) => {
+          if (seen.has(model.id)) return false;
+          seen.add(model.id);
+          return true;
+        }),
+      ];
+    } catch {
+      return DEFAULT_WEB_RESEARCH_MODELS;
+    }
+  }
+  /**
+   * Rewrite a research report into a concise, cited Discord answer.
+   * Tries up to MAX_REWRITE_ATTEMPTS distinct models (preferred model first, then
+   * the configured chat model and its fallbacks). The first completion that
+   * contains a "Sources" heading line wins; a failed or timed-out attempt moves
+   * on to the next model. Uses fallbackDiscordRewrite when no OpenAI client is
+   * configured or no attempt produces an acceptable answer.
+   */
+  private async rewriteForDiscord(
+    query: string,
+    report: string,
+    approvedSources: readonly SourceDescriptor[],
+    preferredModel?: string,
+    communityDigest = "",
+  ): Promise<string> {
+    if (!this.openAiClient) {
+      return fallbackDiscordRewrite(query, report, approvedSources);
+    }
+    // Numbered list the model must cite from verbatim.
+    const allowedSources = approvedSources
+      .map((source, index) => `${index + 1}. ${source.name} - ${source.homeUrl}`)
+      .join("\n");
+    const preferredRewriteModel = normalizePreferredRewriteModel(preferredModel);
+    // The Set removes duplicate model ids while keeping priority order.
+    const modelsToTry = [
+      ...new Set([...(preferredRewriteModel ? [preferredRewriteModel] : []), this.aiConfig.chatModel, ...this.aiConfig.chatModelFallbacks]),
+    ].slice(0, MAX_REWRITE_ATTEMPTS);
+    for (const model of modelsToTry) {
+      try {
+        const completion = await withTimeout(
+          this.openAiClient.chat.completions.create({
+            model,
+            temperature: 0.2,
+            messages: [
+              {
+                role: "system",
+                content: [
+                  "Rewrite research reports into concise Discord answers.",
+                  "Do not mention research steps, indexing, tooling, or backend behavior.",
+                  "Use only the numbered sources provided by the user.",
+                  "Return plain Markdown with no headings except the final Sources heading.",
+                ].join(" "),
+              },
+              {
+                role: "user",
+                content: [
+                  `Question: ${query}`,
+                  "Write a concise answer for Discord.",
+                  "Requirements:",
+                  "- Lead with the answer.",
+                  "- Use at most 3 short paragraphs or 5 compact bullets before sources.",
+                  "- Use inline numeric citations like [1], [2].",
+                  ' - End with the exact heading "Sources" on its own line.',
+                  "- Under Sources, include only the cited sources using the exact numbered lines provided below.",
+                  "Allowed Sources:",
+                  allowedSources,
+                  ...(communityDigest ? ["Community context (lower authority than web archives):", communityDigest] : []),
+                  "Research Report:",
+                  report,
+                ].join("\n\n"),
+              },
+            ],
+          }),
+          DEFAULT_REWRITE_TIMEOUT_MS,
+        );
+        const rewritten = completion.choices[0]?.message?.content?.trim();
+        // Accept only answers that carry the required "Sources" heading on its own line.
+        if (rewritten && /\nSources\s*\n/i.test(rewritten)) {
+          return rewritten;
+        }
+      } catch {
+        continue;
+      }
+    }
+    return fallbackDiscordRewrite(query, report, approvedSources);
+  }
+  /**
+   * Resolve community (server history) hits for the query.
+   * Caller-supplied hits with a non-blank URL take precedence; otherwise the
+   * configured local search provider is asked for up to 6 hits. Returns an empty
+   * list when there is no provider or the search throws.
+   */
+  private async resolveLocalHits(
+    query: string,
+    options: WebResearchQueryOptions | undefined,
+    onProgress?: (event: WebResearchProgressEvent) => void,
+  ): Promise<readonly SearchHit[]> {
+    const prefetched = options?.localHits?.filter((hit) => hit.url.trim()) ?? [];
+    if (prefetched.length > 0) {
+      return prefetched;
+    }
+    const provider = this.factoryOptions.localSearchProvider;
+    if (!provider) {
+      return [];
+    }
+    onProgress?.({
+      phase: "gather",
+      detail: "Searching imported server history…",
+    });
+    try {
+      return await provider.search(query, 6);
+    } catch {
+      return [];
+    }
+  }
+  /**
+   * Shorter variant of rewriteForDiscord for quick chat replies.
+   * Tries the configured chat model and its fallbacks (no caller-preferred
+   * model), with a lower temperature and a 380-token completion cap. Uses
+   * fallbackDiscordBrief when no client is configured or no attempt yields an
+   * answer containing a "Sources" heading line.
+   */
+  private async rewriteForDiscordBrief(
+    query: string,
+    report: string,
+    approvedSources: readonly SourceDescriptor[],
+  ): Promise<string> {
+    if (!this.openAiClient) {
+      return fallbackDiscordBrief(query, report, approvedSources);
+    }
+    const allowedSources = approvedSources
+      .map((source, index) => `${index + 1}. ${source.name} - ${source.homeUrl}`)
+      .join("\n");
+    const modelsToTry = [...new Set([this.aiConfig.chatModel, ...this.aiConfig.chatModelFallbacks])].slice(0, MAX_REWRITE_ATTEMPTS);
+    for (const model of modelsToTry) {
+      try {
+        const completion = await withTimeout(
+          this.openAiClient.chat.completions.create({
+            model,
+            temperature: 0.15,
+            max_tokens: 380,
+            messages: [
+              {
+                role: "system",
+                content: [
+                  "Rewrite research into a very short Discord chat reply (like a quick DM).",
+                  "No preamble, no essay tone, no meta commentary about research.",
+                  "Use only the numbered sources provided.",
+                  "Plain sentences; at most 2 short sentences OR up to 3 compact bullets before Sources.",
+                  'End with the exact heading "Sources" on its own line, then cited sources only.',
+                ].join(" "),
+              },
+              {
+                role: "user",
+                content: [
+                  `Question: ${query}`,
+                  "Write the shortest helpful answer.",
+                  "Allowed Sources:",
+                  allowedSources,
+                  "Research Report:",
+                  report,
+                ].join("\n\n"),
+              },
+            ],
+          }),
+          DEFAULT_REWRITE_TIMEOUT_MS,
+        );
+        const rewritten = completion.choices[0]?.message?.content?.trim();
+        if (rewritten && /\nSources\s*\n/i.test(rewritten)) {
+          return rewritten;
+        }
+      } catch {
+        continue;
+      }
+    }
+    return fallbackDiscordBrief(query, report, approvedSources);
+  }
+  /**
+   * Run the headless research runner restricted to the approved sources.
+   * Returns the normalized report together with a payload that copies the raw
+   * report and, when present, a shallow copy of research_information.
+   * @throws Error when no approved sources are enabled or the normalized report is empty
+   */
+  private async fetchResearchReport(
+    query: string,
+    customPrompt: string,
+    approvedSources: readonly SourceDescriptor[],
+    options?: WebResearchQueryOptions,
+  ): Promise<{ report: string; payload: WebResearchResponsePayload }> {
+    if (approvedSources.length === 0) {
+      throw new Error("No approved research sources are enabled.");
+    }
+    const allowedDomains = researchDomainsForSources(approvedSources);
+    const raw = await runHeadlessWebResearch(this.config, {
+      query: buildResearchTask(query),
+      custom_prompt: customPrompt,
+      source_urls: approvedSources.map((source) => source.homeUrl),
+      query_domains: allowedDomains,
+      allowed_url_prefixes: approvedSources.map((source) => source.homeUrl),
+      ...(options?.model?.trim() ? { model: options.model.trim() } : {}),
+      report_type: "research_report",
+      report_source: "web",
+    });
+    const payload: WebResearchResponsePayload = {
+      report: raw.report,
+      ...(raw.research_information !== undefined
+        ? { research_information: { ...raw.research_information } }
+        : {}),
+    };
+    const report = typeof raw.report === "string" ? normalizeReport(raw.report) : "";
+    if (!report) {
+      throw new Error("Trask web research returned an empty report.");
+    }
+    return { report, payload };
+  }
+  /**
+   * Answer a question with live web research plus optional community history.
+   * Routes the approved sources for the query (after applying the caller's
+   * source preferences), gathers local hits, runs research with heartbeat
+   * progress, then composes the answer through the rewrite or fallback path
+   * that fits the evidence found. Progress events are emitted along the way
+   * when `onProgress` is provided.
+   * Never rejects: any failure is reported through a final progress event and
+   * an answer that explains the failure, with empty source lists.
+   */
+  public async answerQuestion(
+    query: string,
+    onProgress?: (event: WebResearchProgressEvent) => void,
+    options?: WebResearchQueryOptions,
+  ): Promise<WebResearchAnswer> {
+    const approvedSources = routeSourcesForQuery(
+      query,
+      applySourcePreferences(this.approvedSources, options?.sourcePreferences),
+    );
+    try {
+      const localHits = await this.resolveLocalHits(query, options, onProgress);
+      const communitySources = searchHitsToCommunitySources(localHits);
+      const communityDigest = buildCommunityKnowledgeDigest(localHits);
+      if (localHits.length > 0) {
+        onProgress?.({
+          phase: "gather",
+          detail: `Found ${localHits.length} relevant message${localHits.length === 1 ? "" : "s"} in server history…`,
+        });
+      }
+      // Computed here only for the progress message; fetchResearchReport derives its own copy.
+      const allowedDomains = researchDomainsForSources(approvedSources);
+      onProgress?.({
+        phase: "gather",
+        detail: `Scanning ${approvedSources.length} approved source root${approvedSources.length === 1 ? "" : "s"} across ${allowedDomains.length} host${allowedDomains.length === 1 ? "" : "s"}…`,
+      });
+      const { report, payload } = await withProgressHeartbeat(
+        "gather",
+        (elapsedMs) => {
+          const seconds = Math.max(1, Math.floor(elapsedMs / 1000));
+          return `Researching approved archive sources… (${seconds}s)`;
+        },
+        onProgress,
+        async () => await this.fetchResearchReport(query, buildCustomPrompt(), approvedSources, options),
+      );
+      const rejectedUrls = collectRejectedUrlsFromPayload(payload);
+      if (rejectedUrls.length > 0) {
+        onProgress?.({
+          phase: "gather",
+          detail: `Rejected ${rejectedUrls.length} URL${rejectedUrls.length === 1 ? "" : "s"} outside approved source roots.`,
+        });
+      }
+      emitArchiveProbeEvents(payload, approvedSources, onProgress);
+      onProgress?.({
+        phase: "report",
+        detail: "Ranking passages and citations…",
+      });
+      const webEvidenceSources = collectWebEvidenceSources(query, report, approvedSources, payload);
+      const retrievedSources = mergeCommunityAndWebSources(webEvidenceSources, communitySources);
+      const citedSourcesFromReport = rerankEvidenceSources(
+        query,
+        mergeSourcesPreserveOrder(
+          collectCitedSources(report, approvedSources, payload),
+          collectCitedSourcesFromText(report, approvedSources),
+        ),
+      );
+      onProgress?.({
+        phase: "sources",
+        detail: retrievedSources.length ? `${retrievedSources.length} sources retrieved` : "Mapping hosts to archive catalog…",
+        sources: retrievedSources,
+      });
+      onProgress?.({
+        phase: "compose",
+        detail: "Rendering Holocron answer…",
+      });
+      const sourcesForRewrite = mergeCommunityAndWebSources(
+        filterWebArchiveCitationSources(retrievedSources),
+        communitySources,
+      );
+      const webSourcesForRewrite = filterPublicWebCitationSources(sourcesForRewrite);
+      let answer: string;
+      // Branch order: no evidence at all -> degraded answer; synthesis failure -> rewrite
+      // only with enough web citations, else a source-only or degraded answer; otherwise a
+      // normal rewrite (LLM when a client is configured, deterministic fallback when not).
+      if (webSourcesForRewrite.length === 0 && communitySources.length === 0) {
+        answer = degradedAnswerFallback(query, approvedSources);
+      } else if (isSynthesisFailureReport(report, payload)) {
+        const webSources = resolveWebSourcesForFailedSynthesis(query, webEvidenceSources);
+        if (webSources.length >= MIN_HOLOCRON_WEB_CITATIONS) {
+          const rewritePool = mergeCommunityAndWebSources(
+            filterPublicWebCitationSources(webSources),
+            communitySources,
+          );
+          answer = this.openAiClient
+            ? await this.rewriteForDiscord(query, report, rewritePool, options?.model, communityDigest)
+            : fallbackDiscordRewrite(query, report, rewritePool);
+        } else if (webSources.length > 0 || communitySources.length > 0) {
+          answer = sourceOnlyFallbackAnswer(query, sourcesForRewrite);
+        } else {
+          answer = degradedAnswerFallback(query, approvedSources);
+        }
+      } else if (this.openAiClient) {
+        answer = await this.rewriteForDiscord(
+          query,
+          report,
+          sourcesForRewrite,
+          options?.model,
+          communityDigest,
+        );
+      } else {
+        answer = fallbackDiscordRewrite(
+          query,
+          report,
+          sourcesForRewrite,
+        );
+      }
+      // Citations found in the final answer come first, then those found in the report.
+      const webCitedSources = ensureMinimumWebCitations(
+        query,
+        filterPublicWebCitationSources(
+          mergeSourcesPreserveOrder(
+            collectCitedSourcesFromText(answer, retrievedSources),
+            citedSourcesFromReport,
+          ),
+        ),
+        webEvidenceSources,
+        payload,
+        approvedSources,
+      );
+      const communityCited = collectCitedSourcesFromText(answer, communitySources).filter(
+        (source) => isDiscordCitationUrl(source.homeUrl),
+      );
+      const citedSources = mergeCommunityAndWebSources(webCitedSources, communityCited);
+      return {
+        answer,
+        approvedSources: citedSources,
+        retrievedSources,
+        visitedUrls: collectVisitedUrlsFromPayload(payload, approvedSources),
+      };
+    } catch (error: unknown) {
+      const detail = error instanceof Error ? error.message : String(error);
+      // The progress detail is capped at 240 characters; the answer below embeds the full message.
+      onProgress?.({
+        phase: "compose",
+        detail: `Live web research failed: ${detail.slice(0, 240)}`,
+      });
+      const topic = stripTrailingQuestionMarks(query) || "this question";
+      return {
+        answer: `I could not complete live web research for "${topic}" right now (${detail}). Run scripts/bootstrap_trask_research.sh, set TRASK_WEB_RESEARCH_PYTHON, OPENAI_API_KEY or OPENROUTER_API_KEY, and TRASK_WEB_RESEARCH_TIMEOUT_MS, then retry.`,
+        approvedSources: [],
+        retrievedSources: [],
+        visitedUrls: [],
+      };
+    }
+  }
+  /**
+   * Shorter rewrite for proactive/channel replies (still source-backed).
+   * Uses the brief research prompt and web evidence only: no community hits,
+   * source preferences, model override, or progress events. Also returns the
+   * research report. Never rejects: on failure the short failure message is
+   * returned as both the answer and the report, with empty source lists.
+   */
+  public async answerQuestionBrief(query: string): Promise<WebResearchBriefAnswer> {
+    try {
+      const approvedSources = routeSourcesForQuery(query, this.approvedSources);
+      const { report, payload } = await this.fetchResearchReport(query, buildCustomPromptBrief(), approvedSources);
+      const webEvidenceSources = collectWebEvidenceSources(query, report, approvedSources, payload);
+      const retrievedSources = webEvidenceSources;
+      const answer = retrievedSources.length > 0
+        ? await this.rewriteForDiscordBrief(query, report, retrievedSources)
+        : degradedAnswerFallback(query, approvedSources);
+      return {
+        answer,
+        approvedSources: ensureMinimumWebCitations(
+          query,
+          filterPublicWebCitationSources(
+            mergeSourcesPreserveOrder(
+              collectCitedSourcesFromText(answer, retrievedSources),
+              collectCitedSources(report, approvedSources, payload),
+            ),
+          ),
+          webEvidenceSources,
+          payload,
+          approvedSources,
+        ),
+        retrievedSources,
+        visitedUrls: collectVisitedUrlsFromPayload(payload, approvedSources),
+        researchReport: report,
+      };
+    } catch {
+      const topic = stripTrailingQuestionMarks(query) || "this question";
+      const answer = `I could not complete live web research for "${topic}" right now.`;
+      return {
+        answer,
+        approvedSources: [],
+        retrievedSources: [],
+        visitedUrls: [],
+        researchReport: answer,
+      };
+    }
+  }
+}
+/**
+ * Build a WebResearchClient over the default approved research sources.
+ * `aiConfig` defaults to the shared AI configuration loaded at call time.
+ */
+export const createWebResearchClient = (
+  config: WebResearchRuntimeConfig,
+  aiConfig: SharedAiConfig = loadSharedAiConfig(),
+  factoryOptions: WebResearchClientFactoryOptions = {},
+): WebResearchClient =>
+  new WebResearchClient(config, aiConfig, traskApprovedResearchSources, factoryOptions);
+// ---------------------------------------------------------------------------
+// Pure helpers exported for unit testing — not part of the public API surface.
+// ---------------------------------------------------------------------------
+export {
+  normalizeUrl as _normalizeUrl,
+  extractUrls as _extractUrls,
+  hostnameHint as _hostnameHint,
+  uniqueUrlsPreserveOrder as _uniqueUrlsPreserveOrder,
+  collectCitedSources as _collectCitedSources,
+  collectRetrievedSources as _collectRetrievedSources,
+  collectVisitedUrlsFromPayload as _collectVisitedUrlsFromPayload,
+  collectCitedSourcesFromText as _collectCitedSourcesFromText,
+  isSynthesisFailureReport as _isSynthesisFailureReport,
+  countPayloadWebUrls as _countPayloadWebUrls,
+  normalizeReport as _normalizeReport,
+  formatSourcesSection as _formatSourcesSection,
+  normalizePreferredRewriteModel as _normalizePreferredRewriteModel,
+  matchApprovedSource as _matchApprovedSource,
+  classifyQueryIntent as _classifyQueryIntent,
+  routeSourcesForQuery as _routeSourcesForQuery,
+};

pnpm-lock.yaml CHANGED Viewed

@@ -637,6 +637,8 @@ importers:
         specifier: ^4.21.0
         version: 4.21.0
   infra/matchmaking-inducer:
     dependencies:
       http-proxy:

         specifier: ^4.21.0
         version: 4.21.0
+  infra/holocron-trask-api: {}
   infra/matchmaking-inducer:
     dependencies:
       http-proxy:

requirements-trask-research.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+# Holocron / Trask web research runner (scripts/trask_web_research.py)
+crawl4ai>=0.8.6,<0.9
+duckduckgo-search>=7.0.0,<8
+trafilatura>=2.0.0,<3
+lxml_html_clean>=0.4.0
+redis>=5.0.0,<6

scripts/trask_cache.py ADDED Viewed

	@@ -0,0 +1,254 @@

+"""
+Redis cache for Trask web research (DuckDuckGo discovery + page scrape).
+Optional: set REDIS_URL or TRASK_REDIS_URL. Disable with TRASK_CACHE_DISABLED=1.
+Key layout (redis-development plugin conventions):
+  trask:search:{hash}     — discovered URL list (JSON)
+  trask:page:{hash}       — scraped markdown per normalized URL
+  trask:research:{hash}   — full run_payload JSON result
+  trask:kb:{hash}         — content hash per KB source id (ingest dedup)
+All keys use SETEX with configurable TTLs.
+"""
+from __future__ import annotations
+import hashlib
+import json
+import os
+from typing import Any, TYPE_CHECKING
+if TYPE_CHECKING:
+    from redis import Redis
+KEY_PREFIX = "trask"
+DEFAULT_SEARCH_TTL = 6 * 60 * 60  # 6h — DDG results drift slowly
+DEFAULT_PAGE_TTL = 7 * 24 * 60 * 60  # 7d — archive pages are fairly stable
+DEFAULT_RESEARCH_TTL = 60 * 60  # 1h — full answer bundle; shorter for freshness
+def cache_enabled() -> bool:
+    if os.environ.get("TRASK_CACHE_DISABLED", "").strip().lower() in ("1", "true", "yes"):
+        return False
+    return bool(_redis_url())
+def _redis_url() -> str | None:
+    return os.environ.get("TRASK_REDIS_URL") or os.environ.get("REDIS_URL")
+def _ttl(env_name: str, default: int) -> int:
+    raw = os.environ.get(env_name, "").strip()
+    if not raw:
+        return default
+    try:
+        return max(60, int(raw))
+    except ValueError:
+        return default
+def search_ttl() -> int:
+    return _ttl("TRASK_CACHE_SEARCH_TTL_SECONDS", DEFAULT_SEARCH_TTL)
+def page_ttl() -> int:
+    return _ttl("TRASK_CACHE_PAGE_TTL_SECONDS", DEFAULT_PAGE_TTL)
+def research_ttl() -> int:
+    return _ttl("TRASK_CACHE_RESEARCH_TTL_SECONDS", DEFAULT_RESEARCH_TTL)
+def get_client() -> Redis | None:
+    if not cache_enabled():
+        return None
+    url = _redis_url()
+    if not url:
+        return None
+    try:
+        import redis
+    except ImportError:
+        return None
+    return redis.from_url(url, decode_responses=True)
+def ping(client: Redis) -> bool:
+    try:
+        return bool(client.ping())
+    except Exception:
+        return False
+def _sha(parts: list[str]) -> str:
+    payload = "\x1f".join(parts).encode("utf-8")
+    return hashlib.sha256(payload).hexdigest()
+def _key(kind: str, digest: str) -> str:
+    return f"{KEY_PREFIX}:{kind}:{digest}"
+def _normalize_url(url: str) -> str:
+    return url.strip().rstrip("/").lower()
+def search_cache_key(query: str, query_domains: list[str]) -> str:
+    domains = "|".join(sorted(d.strip().lower() for d in query_domains if d.strip()))
+    return _key("search", _sha([query.strip().lower(), domains]))
+def page_cache_key(url: str) -> str:
+    return _key("page", _sha([_normalize_url(url)]))
+def research_cache_key(
+    query: str,
+    query_domains: list[str],
+    allowed_prefixes: list[str],
+    source_urls: list[str],
+) -> str:
+    domains = "|".join(sorted(d.strip().lower() for d in query_domains if d.strip()))
+    prefixes = "|".join(sorted(p.strip().rstrip("/").lower() for p in allowed_prefixes if p.strip()))
+    sources = "|".join(sorted(_normalize_url(u) for u in source_urls if u.strip()))
+    return _key("research", _sha([query.strip().lower(), domains, prefixes, sources]))
+def get_json(client: Redis, key: str) -> Any | None:
+    raw = client.get(key)
+    if not raw:
+        return None
+    try:
+        return json.loads(raw)
+    except json.JSONDecodeError:
+        return None
+def set_json(client: Redis, key: str, value: Any, ttl_seconds: int) -> None:
+    client.setex(key, ttl_seconds, json.dumps(value, ensure_ascii=False))
+def get_search(client: Redis, query: str, query_domains: list[str]) -> list[str] | None:
+    data = get_json(client, search_cache_key(query, query_domains))
+    if isinstance(data, list):
+        return [str(u) for u in data]
+    return None
+def set_search(client: Redis, query: str, query_domains: list[str], urls: list[str]) -> None:
+    set_json(client, search_cache_key(query, query_domains), urls, search_ttl())
+def get_pages_bulk(client: Redis, urls: list[str]) -> dict[str, str]:
+    """Return url -> markdown for cache hits (pipelined GET)."""
+    if not urls:
+        return {}
+    pipe = client.pipeline()
+    keys = [page_cache_key(u) for u in urls]
+    for key in keys:
+        pipe.get(key)
+    values = pipe.execute()
+    hits: dict[str, str] = {}
+    for url, body in zip(urls, values, strict=True):
+        if body and isinstance(body, str) and len(body) >= 1:
+            hits[url] = body
+    return hits
+def set_page(client: Redis, url: str, markdown: str) -> None:
+    if not markdown.strip():
+        return
+    client.setex(page_cache_key(url), page_ttl(), markdown)
+def set_pages_bulk(client: Redis, pages: dict[str, str]) -> None:
+    if not pages:
+        return
+    pipe = client.pipeline()
+    ttl = page_ttl()
+    for url, markdown in pages.items():
+        if markdown.strip():
+            pipe.setex(page_cache_key(url), ttl, markdown)
+    pipe.execute()
+def get_research(client: Redis, key: str) -> dict[str, Any] | None:
+    data = get_json(client, key)
+    return data if isinstance(data, dict) else None
+def set_research(client: Redis, key: str, result: dict[str, Any]) -> None:
+    set_json(client, key, result, research_ttl())
+def kb_doc_cache_key(source_id: str) -> str:
+    """Stable key for KB ingest dedup (markdown file, URL, discord export id, …)."""
+    return _key("kb", _sha([source_id.strip().lower()]))
+def get_kb_content_hash(client: Redis, source_id: str) -> str | None:
+    value = client.get(kb_doc_cache_key(source_id))
+    return value if isinstance(value, str) else None
+def set_kb_content_hash(client: Redis, source_id: str, content_hash: str) -> None:
+    ttl = _ttl("TRASK_CACHE_KB_TTL_SECONDS", 30 * 24 * 60 * 60)
+    client.setex(kb_doc_cache_key(source_id), ttl, content_hash)
+def kb_needs_reindex(client: Redis, source_id: str, content_hash: str) -> bool:
+    """True when document is new or content changed (for ingest pipelines)."""
+    previous = get_kb_content_hash(client, source_id)
+    if previous == content_hash:
+        return False
+    set_kb_content_hash(client, source_id, content_hash)
+    return True
+def research_key_for_payload(payload: dict[str, Any]) -> str:
+    query = str(payload.get("query") or "")
+    query_domains = [str(x) for x in (payload.get("query_domains") or []) if str(x).strip()]
+    allowed_prefixes = [str(x) for x in (payload.get("allowed_url_prefixes") or []) if str(x).strip()]
+    source_urls = [str(x) for x in (payload.get("source_urls") or []) if str(x).strip()]
+    return research_cache_key(query, query_domains, allowed_prefixes, source_urls)
+def annotate_cache_meta(result: dict[str, Any], stats: dict[str, int]) -> dict[str, Any]:
+    """Attach cache stats under research_information for operators."""
+    info = dict(result.get("research_information") or {})
+    info["cache"] = stats
+    out = dict(result)
+    out["research_information"] = info
+    return out
+def _self_test() -> int:
+    """In-memory-free checks using a real Redis if REDIS_URL is set."""
+    client = get_client()
+    if not client or not ping(client):
+        print("SKIP: Redis not configured or unreachable (set REDIS_URL to test)")
+        return 0
+    q = "__trask_cache_selftest__"
+    domains = ["example.com"]
+    urls = ["https://example.com/page-a", "https://example.com/page-b"]
+    set_search(client, q, domains, urls)
+    assert get_search(client, q, domains) == urls
+    body = "# hello from self-test"
+    set_page(client, urls[0], body)
+    hits = get_pages_bulk(client, urls)
+    assert hits.get(urls[0]) == body
+    research = {"report": "ok", "research_information": {}}
+    rkey = research_cache_key(q, domains, ["https://example.com"], [])
+    set_research(client, rkey, research)
+    assert get_research(client, rkey) == research
+    client.delete(search_cache_key(q, domains), page_cache_key(urls[0]), rkey)
+    print("OK: trask_cache self-test passed")
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(_self_test())

scripts/trask_web_research.py ADDED Viewed

	@@ -0,0 +1,511 @@

+#!/usr/bin/env python3
+"""
+Headless web research for Trask / Holocron.
+stdin: JSON payload (query, allowed_url_prefixes, query_domains, source_urls, …)
+stdout: JSON { report, research_information }
+Discovery via DuckDuckGo; scrape via Crawl4AI (markdown); trafilatura fallback.
+"""
+from __future__ import annotations
+import argparse
+import asyncio
+import contextlib
+import json
+import os
+import re
+import sys
+from dataclasses import dataclass, field
+from typing import Any
+from urllib.parse import urlparse
+MAX_CANDIDATE_URLS = 12
+MAX_SCRAPE_URLS = 8
+MAX_MARKDOWN_CHARS_PER_PAGE = 12_000
+MIN_USABLE_BODY_CHARS = 280
+SEARCH_RESULTS_PER_DOMAIN = 4
+FORUM_CHROME_PATTERNS = [
+    re.compile(r"\bsign up\b", re.I),
+    re.compile(r"\ball activity\b", re.I),
+    re.compile(r"\bmark site read\b", re.I),
+    re.compile(r"\bactivity feed\b", re.I),
+    re.compile(r"\bexisting user\? sign in\b", re.I),
+    re.compile(r"\byour content feed\b", re.I),
+]
+def _normalize_prefix(value: str) -> str:
+    return value.strip().rstrip("/")
+def _url_allowed(url: str, prefixes: list[str]) -> bool:
+    candidate = _normalize_prefix(url)
+    for raw in prefixes:
+        prefix = _normalize_prefix(raw)
+        if not prefix:
+            continue
+        if candidate == prefix or candidate.startswith(prefix + "/"):
+            return True
+    return False
+def _unique_urls(urls: list[str]) -> list[str]:
+    seen: set[str] = set()
+    out: list[str] = []
+    for url in urls:
+        u = url.strip()
+        if not u or not u.startswith(("http://", "https://")):
+            continue
+        key = u.rstrip("/").lower()
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append(u)
+    return out
+def _host_from_url(url: str) -> str:
+    try:
+        return urlparse(url).netloc.lower().replace("www.", "")
+    except Exception:
+        return ""
+def _looks_like_forum_chrome(text: str) -> bool:
+    if len(text) < 120:
+        return True
+    hits = sum(1 for pat in FORUM_CHROME_PATTERNS if pat.search(text))
+    return hits >= 2
+def _extract_follow_up_links(markdown: str, query: str, allowed_prefixes: list[str]) -> list[str]:
+    tokens = _query_tokens(query)
+    if not tokens:
+        return []
+    found: list[str] = []
+    for _label, href in re.findall(r"\[([^\]]*)\]\((https?://[^)]+)\)", markdown):
+        lower = f"{_label} {href}".lower()
+        if not any(token in lower for token in tokens):
+            continue
+        if _url_allowed(href, allowed_prefixes):
+            found.append(href)
+    return _unique_urls(found)
+def _truncate(text: str, limit: int) -> str:
+    if len(text) <= limit:
+        return text
+    return text[: limit - 3].rstrip() + "..."
+@dataclass
+class PageEvidence:
+    """One scraped page: its URL paired with the markdown text extracted from it."""
+    url: str
+    markdown: str
+@dataclass
+class GatherResult:
+    """Accumulated output of a gather run; every field defaults to an empty container."""
+    # NOTE(review): the per-field meanings below are inferred from the names; the
+    # code that fills them is outside this view - confirm against the gather routine.
+    pages: list[PageEvidence] = field(default_factory=list)  # presumably pages with usable content
+    visited_urls: list[str] = field(default_factory=list)
+    retrieved_urls: list[str] = field(default_factory=list)
+    rejected_urls: list[str] = field(default_factory=list)
+    candidate_urls: list[str] = field(default_factory=list)
+    cache_stats: dict[str, int] = field(default_factory=dict)  # counter name -> count
+def _query_tokens(query: str) -> set[str]:
+    return {t for t in re.findall(r"[a-z0-9]{3,}", query.lower()) if t not in {"what", "where", "when", "does", "the", "for", "and", "how"}}
+def _rank_source_urls(query: str, source_urls: list[str]) -> list[str]:
+    tokens = _query_tokens(query)
+    scored: list[tuple[int, str]] = []
+    for url in source_urls:
+        lower = url.lower()
+        score = sum(2 for token in tokens if token in lower)
+        if "technical" in lower or "reference" in lower or "neocities" in lower:
+            score += 1
+        scored.append((score, url))
+    scored.sort(key=lambda pair: pair[0], reverse=True)
+    return [url for _, url in scored]
+def discover_urls(
+    query: str,
+    query_domains: list[str],
+    source_urls: list[str],
+    allowed_prefixes: list[str],
+    cache_client: Any | None = None,
+    cache_stats: dict[str, int] | None = None,
+) -> list[str]:
+    if cache_client is not None:
+        try:
+            from trask_cache import get_search, set_search
+            cached = get_search(cache_client, query, query_domains)
+            if cached is not None:
+                if cache_stats is not None:
+                    cache_stats["search_hits"] = cache_stats.get("search_hits", 0) + 1
+                filtered = [u for u in cached if _url_allowed(u, allowed_prefixes)]
+                return filtered[:MAX_CANDIDATE_URLS]
+            if cache_stats is not None:
+                cache_stats["search_misses"] = cache_stats.get("search_misses", 0) + 1
+        except Exception:
+            pass
+    allowed_sources = [u for u in _unique_urls(source_urls) if _url_allowed(u, allowed_prefixes)]
+    candidates: list[str] = _rank_source_urls(query, allowed_sources)
+    domains = [d.strip() for d in query_domains if d.strip()]
+    if not domains:
+        domains = list({_host_from_url(p) for p in allowed_prefixes if _host_from_url(p)})
+    try:
+        from duckduckgo_search import DDGS
+        with DDGS() as ddgs:
+            for domain in domains[:6]:
+                site_query = f"{query} site:{domain}"
+                try:
+                    for item in ddgs.text(site_query, max_results=SEARCH_RESULTS_PER_DOMAIN, backend="bing"):
+                        href = (item.get("href") or item.get("url") or "").strip()
+                        if href:
+                            candidates.append(href)
+                except Exception:
+                    continue
+            if len(candidates) < 3:
+                try:
+                    for item in ddgs.text(query, max_results=10, backend="bing"):
+                        href = (item.get("href") or item.get("url") or "").strip()
+                        if href:
+                            candidates.append(href)
+                except Exception:
+                    pass
+    except Exception:
+        pass
+    # DuckDuckGo may rate-limit; always keep ranked catalog homes as crawl seeds.
+    for url in allowed_sources:
+        if url not in candidates:
+            candidates.append(url)
+    filtered = [u for u in _unique_urls(candidates) if _url_allowed(u, allowed_prefixes)]
+    result = filtered[:MAX_CANDIDATE_URLS]
+    if cache_client is not None and result:
+        try:
+            from trask_cache import set_search
+            set_search(cache_client, query, query_domains, result)
+        except Exception:
+            pass
+    return result
+def _trafilatura_fetch(url: str) -> str:
+    try:
+        import trafilatura
+        downloaded = trafilatura.fetch_url(url)
+        if not downloaded:
+            return ""
+        text = trafilatura.extract(downloaded, include_comments=False, include_tables=True)
+        return (text or "").strip()
+    except Exception:
+        return ""
+async def _crawl_with_shared_crawler(crawler: Any, url: str) -> str:
+    try:
+        from crawl4ai import CrawlerRunConfig, CacheMode
+        run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, word_count_threshold=10)
+        result = await crawler.arun(url=url, config=run_config)
+        if result.success and result.markdown:
+            return result.markdown.strip()
+    except Exception:
+        pass
+    return _trafilatura_fetch(url)
+@contextlib.contextmanager
+def _redirect_stdout_to_stderr():
+    previous = sys.stdout
+    sys.stdout = sys.stderr
+    try:
+        yield
+    finally:
+        sys.stdout = previous
+def _page_from_cache(url: str, cached_pages: dict[str, str]) -> str | None:
+    body = cached_pages.get(url)
+    if not body or len(body) < MIN_USABLE_BODY_CHARS or _looks_like_forum_chrome(body):
+        return None
+    return _truncate(body, MAX_MARKDOWN_CHARS_PER_PAGE)
+async def _resolve_page_body(
+    url: str,
+    crawler: Any | None,
+    cache_client: Any | None,
+    cached_pages: dict[str, str],
+    cache_stats: dict[str, int],
+) -> str:
+    cached = _page_from_cache(url, cached_pages)
+    if cached is not None:
+        cache_stats["page_hits"] = cache_stats.get("page_hits", 0) + 1
+        return cached
+    cache_stats["page_misses"] = cache_stats.get("page_misses", 0) + 1
+    if crawler is not None:
+        body = await _crawl_with_shared_crawler(crawler, url)
+    else:
+        body = _trafilatura_fetch(url)
+    if body and len(body) >= MIN_USABLE_BODY_CHARS and not _looks_like_forum_chrome(body):
+        trimmed = _truncate(body, MAX_MARKDOWN_CHARS_PER_PAGE)
+        if cache_client is not None:
+            try:
+                from trask_cache import set_page
+                set_page(cache_client, url, trimmed)
+            except Exception:
+                pass
+        return trimmed
+    return ""
+async def gather_evidence(
+    query: str,
+    query_domains: list[str],
+    source_urls: list[str],
+    allowed_prefixes: list[str],
+) -> GatherResult:
+    result = GatherResult()
+    cache_client = None
+    try:
+        from trask_cache import get_client, ping
+        candidate = get_client()
+        if candidate is not None and ping(candidate):
+            cache_client = candidate
+    except Exception:
+        cache_client = None
+    stats = result.cache_stats
+    result.candidate_urls = discover_urls(
+        query,
+        query_domains,
+        source_urls,
+        allowed_prefixes,
+        cache_client=cache_client,
+        cache_stats=stats,
+    )
+    scrape_targets = list(result.candidate_urls[:MAX_SCRAPE_URLS])
+    seen_targets = set(scrape_targets)
+    cached_pages: dict[str, str] = {}
+    if cache_client is not None and scrape_targets:
+        try:
+            from trask_cache import get_pages_bulk
+            cached_pages = get_pages_bulk(cache_client, scrape_targets)
+        except Exception:
+            cached_pages = {}
+    async def accept_url(url: str, crawler: Any | None) -> str | None:
+        result.visited_urls.append(url)
+        body = await _resolve_page_body(url, crawler, cache_client, cached_pages, stats)
+        if not body:
+            result.rejected_urls.append(url)
+            return None
+        result.pages.append(PageEvidence(url=url, markdown=body))
+        result.retrieved_urls.append(url)
+        return body
+    try:
+        from crawl4ai import AsyncWebCrawler, BrowserConfig
+        browser_config = BrowserConfig(headless=True, verbose=False)
+        with _redirect_stdout_to_stderr():
+            async with AsyncWebCrawler(config=browser_config) as crawler:
+                for url in list(scrape_targets):
+                    body = await accept_url(url, crawler)
+                    if not body:
+                        continue
+                    for follow_up in _extract_follow_up_links(body, query, allowed_prefixes):
+                        if follow_up in seen_targets or len(scrape_targets) >= MAX_SCRAPE_URLS:
+                            continue
+                        seen_targets.add(follow_up)
+                        scrape_targets.append(follow_up)
+                        if cache_client is not None:
+                            try:
+                                from trask_cache import get_pages_bulk
+                                cached_pages.update(get_pages_bulk(cache_client, [follow_up]))
+                            except Exception:
+                                pass
+                        await accept_url(follow_up, crawler)
+    except Exception:
+        for url in scrape_targets:
+            if url in result.retrieved_urls:
+                continue
+            await accept_url(url, None)
+    return result
+def build_report(query: str, gather: GatherResult) -> str:
+    if not gather.pages:
+        return "I could not complete live archive synthesis for this question right now."
+    sections: list[str] = [
+        f"# Research evidence for: {query.strip()}",
+        "",
+        "The following excerpts were retrieved from approved archive sources.",
+        "",
+    ]
+    for page in gather.pages:
+        sections.append(f"## Evidence from {page.url}")
+        sections.append("")
+        sections.append(page.markdown)
+        sections.append("")
+    return "\n".join(sections).strip()
+def run_payload(payload: dict[str, Any]) -> dict[str, Any]:
+    query = str(payload.get("query") or "").strip()
+    if not query:
+        raise ValueError("query is required")
+    query_domains = [str(x) for x in (payload.get("query_domains") or []) if str(x).strip()]
+    allowed_prefixes = [str(x) for x in (payload.get("allowed_url_prefixes") or []) if str(x).strip()]
+    source_urls = [str(x) for x in (payload.get("source_urls") or []) if str(x).strip()]
+    env_prefixes = os.environ.get("TRASK_ALLOWED_URL_PREFIXES", "")
+    if env_prefixes and not allowed_prefixes:
+        allowed_prefixes = [line.strip() for line in env_prefixes.splitlines() if line.strip()]
+    env_domains = os.environ.get("TRASK_ALLOWED_QUERY_DOMAINS", "")
+    if env_domains and not query_domains:
+        query_domains = [line.strip() for line in env_domains.splitlines() if line.strip()]
+    try:
+        from trask_cache import (
+            annotate_cache_meta,
+            get_client,
+            get_research,
+            ping,
+            research_key_for_payload,
+            set_research,
+        )
+        cache_client = get_client()
+        if cache_client is not None and ping(cache_client):
+            rkey = research_key_for_payload(
+                {
+                    "query": query,
+                    "query_domains": query_domains,
+                    "allowed_url_prefixes": allowed_prefixes,
+                    "source_urls": source_urls,
+                },
+            )
+            cached_result = get_research(cache_client, rkey)
+            if cached_result is not None:
+                stats = dict((cached_result.get("research_information") or {}).get("cache") or {})
+                stats["research_hits"] = stats.get("research_hits", 0) + 1
+                return annotate_cache_meta(cached_result, stats)
+    except Exception:
+        pass
+    gather = asyncio.run(
+        gather_evidence(query, query_domains, source_urls, allowed_prefixes),
+    )
+    report = build_report(query, gather)
+    result = {
+        "report": report,
+        "research_information": {
+            "source_urls": gather.retrieved_urls,
+            "cited_urls": gather.retrieved_urls,
+            "retrieved_urls": gather.retrieved_urls,
+            "visited_urls": gather.visited_urls,
+            "query_domains": query_domains,
+            "allowed_url_prefixes": allowed_prefixes,
+            "rejected_source_urls": gather.rejected_urls,
+        },
+    }
+    try:
+        from trask_cache import annotate_cache_meta, get_client, ping, research_key_for_payload, set_research
+        cache_client = get_client()
+        if cache_client is not None and ping(cache_client) and gather.pages:
+            rkey = research_key_for_payload(
+                {
+                    "query": query,
+                    "query_domains": query_domains,
+                    "allowed_url_prefixes": allowed_prefixes,
+                    "source_urls": source_urls,
+                },
+            )
+            stats = dict(gather.cache_stats)
+            stats["research_misses"] = stats.get("research_misses", 0) + 1
+            to_store = annotate_cache_meta(result, stats)
+            set_research(cache_client, rkey, to_store)
+            return to_store
+    except Exception:
+        pass
+    if gather.cache_stats:
+        try:
+            from trask_cache import annotate_cache_meta
+            return annotate_cache_meta(result, gather.cache_stats)
+        except Exception:
+            pass
+    return result
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Trask headless web research (Crawl4AI + DDG)")
+    parser.add_argument("--dry-run", action="store_true", help="Import dependencies and exit 0")
+    args = parser.parse_args()
+    if args.dry_run:
+        import crawl4ai  # noqa: F401
+        import duckduckgo_search  # noqa: F401
+        import trafilatura  # noqa: F401
+        print(json.dumps({"ok": True, "backend": "crawl4ai"}))
+        return 0
+    raw = sys.stdin.read()
+    if not raw.strip():
+        print(json.dumps({"error": "empty stdin"}), file=sys.stderr)
+        return 1
+    try:
+        payload = json.loads(raw)
+    except json.JSONDecodeError as exc:
+        print(json.dumps({"error": f"invalid json: {exc}"}), file=sys.stderr)
+        return 1
+    try:
+        result = run_payload(payload)
+    except Exception as exc:
+        print(json.dumps({"error": str(exc)}), file=sys.stderr)
+        return 1
+    sys.stdout.write(json.dumps(result, ensure_ascii=False))
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())