Spaces:
Configuration error
Configuration error
Deploy trask-http web research from community-bots@6f6709a0116dc99200b9a9ba4cf65f3bf5a649c9
Browse files- Dockerfile +22 -8
- README.md +3 -3
- apps/trask-http-server/src/main.ts +56 -4
- docs/trask-research-backends.md +80 -0
- package.json +4 -3
- packages/config/src/index.test.ts +11 -13
- packages/config/src/index.ts +70 -56
- packages/personas/src/index.ts +1 -1
- packages/retrieval/src/discord-permalink.test.ts +52 -0
- packages/retrieval/src/discord-permalink.ts +98 -0
- packages/retrieval/src/index.ts +31 -5
- packages/trask-http/src/router.test.ts +26 -26
- packages/trask-http/src/router.ts +29 -23
- packages/trask/src/community-knowledge.test.ts +72 -0
- packages/trask/src/community-knowledge.ts +61 -0
- packages/trask/src/index.ts +3 -1
- packages/trask/src/research-wizard.ts +17 -1270
- packages/trask/src/web-research-subprocess.ts +337 -0
- packages/trask/src/web-research.test.ts +38 -0
- packages/trask/src/web-research.ts +1559 -0
- pnpm-lock.yaml +2 -0
- requirements-trask-research.txt +6 -0
- scripts/trask_cache.py +254 -0
- scripts/trask_web_research.py +511 -0
Dockerfile
CHANGED
|
@@ -4,22 +4,33 @@
|
|
| 4 |
FROM node:24-bookworm AS base
|
| 5 |
WORKDIR /workspace
|
| 6 |
ENV NODE_ENV=production
|
| 7 |
-
RUN apt-get update \
|
| 8 |
-
&& apt-get install -y --no-install-recommends python3 python3-venv python3-pip \
|
| 9 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
RUN corepack enable && corepack prepare pnpm@10.11.0 --activate
|
| 11 |
|
| 12 |
FROM base AS deps
|
| 13 |
COPY package.json pnpm-lock.yaml pnpm-workspace.yaml tsconfig.base.json tsconfig.workspace.json ./
|
| 14 |
COPY packages ./packages
|
| 15 |
COPY apps/trask-http-server ./apps/trask-http-server
|
| 16 |
-
COPY vendor/ai-researchwizard ./vendor/ai-researchwizard
|
| 17 |
-
COPY vendor/llm_fallbacks ./vendor/llm_fallbacks
|
| 18 |
-
COPY scripts/bootstrap_trask_gpt_researcher.sh ./scripts/bootstrap_trask_gpt_researcher.sh
|
| 19 |
COPY data/ingest-worker ./data/ingest-worker
|
|
|
|
|
|
|
|
|
|
| 20 |
RUN pnpm install --frozen-lockfile
|
| 21 |
RUN pnpm --filter @openkotor/trask-http-server build
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
FROM base AS runtime
|
| 25 |
WORKDIR /workspace
|
|
@@ -28,11 +39,14 @@ ENV PORT=${PORT}
|
|
| 28 |
ENV TRASK_HTTP_PORT=${PORT}
|
| 29 |
ENV TRASK_WEB_ALLOW_ANONYMOUS=1
|
| 30 |
ENV TRASK_WEB_DEFAULT_USER_ID=qa-webui
|
| 31 |
-
ENV TRASK_GPT_RESEARCHER_PYTHON=/workspace/.venv-trask-gptr/bin/python
|
| 32 |
ENV TRASK_PUBLIC_WEB_ORIGIN=https://openkotor.github.io
|
| 33 |
ENV TRASK_RESEARCHWIZARD_TIMEOUT_MS=900000
|
|
|
|
| 34 |
ENV INGEST_STATE_DIR=/workspace/data/ingest-worker
|
| 35 |
ENV TRASK_HTTP_DATA_DIR=/workspace/data/trask-http-server
|
| 36 |
COPY --from=deps /workspace /workspace
|
|
|
|
|
|
|
|
|
|
| 37 |
EXPOSE 7860
|
| 38 |
CMD ["node", "apps/trask-http-server/dist/main.js"]
|
|
|
|
| 4 |
FROM node:24-bookworm AS base
|
| 5 |
WORKDIR /workspace
|
| 6 |
ENV NODE_ENV=production
|
|
|
|
|
|
|
|
|
|
| 7 |
RUN corepack enable && corepack prepare pnpm@10.11.0 --activate
|
| 8 |
|
| 9 |
FROM base AS deps
|
| 10 |
COPY package.json pnpm-lock.yaml pnpm-workspace.yaml tsconfig.base.json tsconfig.workspace.json ./
|
| 11 |
COPY packages ./packages
|
| 12 |
COPY apps/trask-http-server ./apps/trask-http-server
|
|
|
|
|
|
|
|
|
|
| 13 |
COPY data/ingest-worker ./data/ingest-worker
|
| 14 |
+
COPY scripts/trask_web_research.py scripts/trask_web_research.py
|
| 15 |
+
COPY scripts/trask_cache.py scripts/trask_cache.py
|
| 16 |
+
COPY requirements-trask-research.txt requirements-trask-research.txt
|
| 17 |
RUN pnpm install --frozen-lockfile
|
| 18 |
RUN pnpm --filter @openkotor/trask-http-server build
|
| 19 |
+
|
| 20 |
+
FROM base AS python-research
|
| 21 |
+
WORKDIR /workspace
|
| 22 |
+
RUN apt-get update \
|
| 23 |
+
&& apt-get install -y --no-install-recommends \
|
| 24 |
+
python3 python3-pip python3-venv \
|
| 25 |
+
libxml2-dev libxslt1-dev gcc \
|
| 26 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 27 |
+
COPY requirements-trask-research.txt /workspace/requirements-trask-research.txt
|
| 28 |
+
COPY scripts/trask_web_research.py /workspace/scripts/trask_web_research.py
|
| 29 |
+
COPY scripts/trask_cache.py /workspace/scripts/trask_cache.py
|
| 30 |
+
RUN python3 -m venv /workspace/.venv-trask-research \
|
| 31 |
+
&& /workspace/.venv-trask-research/bin/pip install --upgrade pip \
|
| 32 |
+
&& /workspace/.venv-trask-research/bin/pip install -r /workspace/requirements-trask-research.txt \
|
| 33 |
+
&& (/workspace/.venv-trask-research/bin/python -m crawl4ai install || true)
|
| 34 |
|
| 35 |
FROM base AS runtime
|
| 36 |
WORKDIR /workspace
|
|
|
|
| 39 |
ENV TRASK_HTTP_PORT=${PORT}
|
| 40 |
ENV TRASK_WEB_ALLOW_ANONYMOUS=1
|
| 41 |
ENV TRASK_WEB_DEFAULT_USER_ID=qa-webui
|
|
|
|
| 42 |
ENV TRASK_PUBLIC_WEB_ORIGIN=https://openkotor.github.io
|
| 43 |
ENV TRASK_RESEARCHWIZARD_TIMEOUT_MS=900000
|
| 44 |
+
ENV TRASK_WEB_RESEARCH_PYTHON=/workspace/.venv-trask-research/bin/python
|
| 45 |
ENV INGEST_STATE_DIR=/workspace/data/ingest-worker
|
| 46 |
ENV TRASK_HTTP_DATA_DIR=/workspace/data/trask-http-server
|
| 47 |
COPY --from=deps /workspace /workspace
|
| 48 |
+
COPY --from=python-research /workspace/.venv-trask-research /workspace/.venv-trask-research
|
| 49 |
+
COPY --from=python-research /workspace/scripts/trask_web_research.py /workspace/scripts/trask_web_research.py
|
| 50 |
+
COPY --from=python-research /workspace/scripts/trask_cache.py /workspace/scripts/trask_cache.py
|
| 51 |
EXPOSE 7860
|
| 52 |
CMD ["node", "apps/trask-http-server/dist/main.js"]
|
README.md
CHANGED
|
@@ -8,9 +8,9 @@ app_port: 7860
|
|
| 8 |
pinned: false
|
| 9 |
---
|
| 10 |
|
| 11 |
-
# Holocron Trask HTTP
|
| 12 |
|
| 13 |
-
Public `trask-http-server`
|
| 14 |
|
| 15 |
- Source: `apps/trask-http-server` in [OpenKotOR/community-bots](https://github.com/OpenKotOR/community-bots)
|
| 16 |
- Deployed by `.github/workflows/trask-http-public.yml`
|
|
@@ -27,4 +27,4 @@ CI syncs from GitHub repository secrets when they exist. None are required for d
|
|
| 27 |
| `TAVILY_API_KEY` | Optional web retrieval |
|
| 28 |
| `FAST_LLM` / `SMART_LLM` / `STRATEGIC_LLM` | Optional GPTR model overrides |
|
| 29 |
|
| 30 |
-
|
|
|
|
| 8 |
pinned: false
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# Holocron Trask HTTP
|
| 12 |
|
| 13 |
+
Public `trask-http-server` and `/api/trask/*` for Holocron. Docker image includes **Crawl4AI** research venv (`TRASK_WEB_RESEARCH_PYTHON`). Set `OPENAI_API_KEY` or `OPENROUTER_API_KEY` for live synthesis (`docs/trask-research-backends.md`).
|
| 14 |
|
| 15 |
- Source: `apps/trask-http-server` in [OpenKotOR/community-bots](https://github.com/OpenKotOR/community-bots)
|
| 16 |
- Deployed by `.github/workflows/trask-http-public.yml`
|
|
|
|
| 27 |
| `TAVILY_API_KEY` | Optional web retrieval |
|
| 28 |
| `FAST_LLM` / `SMART_LLM` / `STRATEGIC_LLM` | Optional GPTR model overrides |
|
| 29 |
|
| 30 |
+
Holocron research **requires at least one working LLM** in the provider fallback chain (`llm_fallbacks` free models when no paid keys are set). On startup, `trask-http-server` probes the chain and exposes `researchAvailable` on `GET /api/trask/session`. Set `TRASK_STRICT_LLM_PROBE=1` to refuse boot when every provider fails.
|
apps/trask-http-server/src/main.ts
CHANGED
|
@@ -11,7 +11,7 @@ import {
|
|
| 11 |
resolveCorsHeaders,
|
| 12 |
} from "@openkotor/platform";
|
| 13 |
import { createChunkSearchProvider } from "@openkotor/retrieval";
|
| 14 |
-
import {
|
| 15 |
import { createTraskHttpRouter, type TraskHttpAuth } from "@openkotor/trask-http";
|
| 16 |
import express, { type Request, type Response } from "express";
|
| 17 |
|
|
@@ -61,12 +61,20 @@ const config = loadTraskHttpServerConfig();
|
|
| 61 |
const resolveFromRoot = (p: string) => (path.isAbsolute(p) ? p : path.resolve(repoRoot, p));
|
| 62 |
|
| 63 |
const queryRepository = new JsonTraskQueryRepository(resolveDataFile(resolveFromRoot(config.dataDir), "trask-queries.json"));
|
| 64 |
-
const
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
const runtime = {
|
| 68 |
searchProvider,
|
| 69 |
-
|
| 70 |
queryRepository,
|
| 71 |
};
|
| 72 |
|
|
@@ -151,11 +159,36 @@ app.use((req, res, next) => {
|
|
| 151 |
next();
|
| 152 |
});
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
app.use(
|
| 155 |
"/api/trask",
|
| 156 |
createTraskHttpRouter({
|
| 157 |
runtime,
|
| 158 |
auth: createWebAuth(config),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
}),
|
| 160 |
);
|
| 161 |
|
|
@@ -182,6 +215,25 @@ const { server, listen } = createNodeApiHost({
|
|
| 182 |
|
| 183 |
listen(config.port, () => {
|
| 184 |
logger.info(`Trask HTTP API listening on port ${config.port}`);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
});
|
| 186 |
|
| 187 |
process.on("SIGINT", () => {
|
|
|
|
| 11 |
resolveCorsHeaders,
|
| 12 |
} from "@openkotor/platform";
|
| 13 |
import { createChunkSearchProvider } from "@openkotor/retrieval";
|
| 14 |
+
import { createWebResearchClient, probeHeadlessWebResearchDryRun } from "@openkotor/trask";
|
| 15 |
import { createTraskHttpRouter, type TraskHttpAuth } from "@openkotor/trask-http";
|
| 16 |
import express, { type Request, type Response } from "express";
|
| 17 |
|
|
|
|
| 61 |
const resolveFromRoot = (p: string) => (path.isAbsolute(p) ? p : path.resolve(repoRoot, p));
|
| 62 |
|
| 63 |
const queryRepository = new JsonTraskQueryRepository(resolveDataFile(resolveFromRoot(config.dataDir), "trask-queries.json"));
|
| 64 |
+
const discordGuildId =
|
| 65 |
+
process.env.TRASK_DISCORD_GUILD_ID?.trim()
|
| 66 |
+
|| process.env.DISCORD_TARGET_GUILD_ID?.trim()
|
| 67 |
+
|| undefined;
|
| 68 |
+
const searchProvider = createChunkSearchProvider(resolveFromRoot(config.chunkDir), {
|
| 69 |
+
...(discordGuildId ? { discordGuildId } : {}),
|
| 70 |
+
});
|
| 71 |
+
const webResearch = createWebResearchClient(config.webResearch, config.ai, {
|
| 72 |
+
localSearchProvider: searchProvider,
|
| 73 |
+
});
|
| 74 |
|
| 75 |
const runtime = {
|
| 76 |
searchProvider,
|
| 77 |
+
webResearch,
|
| 78 |
queryRepository,
|
| 79 |
};
|
| 80 |
|
|
|
|
| 159 |
next();
|
| 160 |
});
|
| 161 |
|
| 162 |
+
const hasLlmRewriteKey = Boolean(config.ai.openAiApiKey?.trim());
|
| 163 |
+
|
| 164 |
+
/**
 * Build the operator-facing hint explaining why live research is unavailable.
 *
 * A missing LLM key is reported first; otherwise the hint points at the
 * Python research venv setup.
 *
 * @returns The message stored in `holocronSessionState.researchUnavailableReason`.
 */
const researchUnavailableReason = (): string =>
  hasLlmRewriteKey
    ? "Run scripts/bootstrap_trask_research.sh (Crawl4AI venv) and set TRASK_WEB_RESEARCH_PYTHON. See docs/trask-research-backends.md."
    : "Set OPENAI_API_KEY or OPENROUTER_API_KEY for Holocron answer synthesis.";
|
| 170 |
+
|
| 171 |
+
const holocronSessionState: {
|
| 172 |
+
researchAvailable: boolean;
|
| 173 |
+
researchUnavailableReason?: string;
|
| 174 |
+
} = {
|
| 175 |
+
researchAvailable: false,
|
| 176 |
+
researchUnavailableReason: researchUnavailableReason(),
|
| 177 |
+
};
|
| 178 |
+
|
| 179 |
app.use(
|
| 180 |
"/api/trask",
|
| 181 |
createTraskHttpRouter({
|
| 182 |
runtime,
|
| 183 |
auth: createWebAuth(config),
|
| 184 |
+
getSession: () => ({
|
| 185 |
+
loggedIn: false,
|
| 186 |
+
oauthAvailable: false,
|
| 187 |
+
researchAvailable: holocronSessionState.researchAvailable,
|
| 188 |
+
...(holocronSessionState.researchUnavailableReason
|
| 189 |
+
? { researchUnavailableReason: holocronSessionState.researchUnavailableReason }
|
| 190 |
+
: {}),
|
| 191 |
+
}),
|
| 192 |
}),
|
| 193 |
);
|
| 194 |
|
|
|
|
| 215 |
|
| 216 |
// Start listening, then run `probeHeadlessWebResearchDryRun` in the background and
// record the outcome on `holocronSessionState` so session responses reflect it.
listen(config.port, () => {
  logger.info(`Trask HTTP API listening on port ${config.port}`);

  /** Update `holocronSessionState` from the dry-run probe; never rejects on probe failure. */
  const refreshResearchAvailability = async (): Promise<void> => {
    try {
      const probePassed = await probeHeadlessWebResearchDryRun(config.webResearch);

      // Research needs both a passing dry run and an LLM key for synthesis.
      if (!(probePassed && hasLlmRewriteKey)) {
        holocronSessionState.researchAvailable = false;
        holocronSessionState.researchUnavailableReason = researchUnavailableReason();
        logger.warn(`Holocron live research unavailable: ${holocronSessionState.researchUnavailableReason}`);
        return;
      }

      holocronSessionState.researchAvailable = true;
      delete holocronSessionState.researchUnavailableReason;
      logger.info("Holocron live web research is available (Crawl4AI gather + LLM synthesis).");
    } catch (probeError: unknown) {
      const detail = probeError instanceof Error ? probeError.message : String(probeError);
      holocronSessionState.researchAvailable = false;
      holocronSessionState.researchUnavailableReason = `${researchUnavailableReason()} (${detail})`;
      logger.warn(`Holocron research probe failed: ${detail}`);
    }
  };

  // Fire-and-forget: the server keeps serving while the probe runs.
  void refreshResearchAvailability();
});
|
| 238 |
|
| 239 |
process.on("SIGINT", () => {
|
docs/trask-research-backends.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Trask / Holocron research backends
|
| 2 |
+
|
| 3 |
+
Holocron’s UI lives in **`apps/holocron-web`**. It talks to **`apps/trask-http-server`** at `/api/trask/*`.
|
| 4 |
+
|
| 5 |
+
## Default stack (implemented)
|
| 6 |
+
|
| 7 |
+
| Layer | Implementation |
|
| 8 |
+
|--------|----------------|
|
| 9 |
+
| **Discovery** | DuckDuckGo (`duckduckgo-search`) with `site:` hints from approved domains |
|
| 10 |
+
| **Scrape** | [Crawl4AI](https://github.com/unclecode/crawl4ai) → LLM-friendly markdown (`scripts/trask_web_research.py`) |
|
| 11 |
+
| **Synthesis** | Node `WebResearchClient` OpenAI-compatible rewrite (`packages/trask/src/web-research.ts`) |
|
| 12 |
+
|
| 13 |
+
### Bootstrap
|
| 14 |
+
|
| 15 |
+
```bash
|
| 16 |
+
bash scripts/bootstrap_trask_research.sh # creates .venv-trask-research
|
| 17 |
+
export TRASK_WEB_RESEARCH_PYTHON="$(pwd)/.venv-trask-research/bin/python"
|
| 18 |
+
# OPENAI_API_KEY or OPENROUTER_API_KEY required for Holocron synthesis
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
Fedora/RHEL hosts need `libxml2-devel` and `libxslt-devel` before the first bootstrap (for `lxml`).
|
| 22 |
+
|
| 23 |
+
### Environment
|
| 24 |
+
|
| 25 |
+
| Variable | Purpose |
|
| 26 |
+
|----------|---------|
|
| 27 |
+
| `TRASK_WEB_RESEARCH_PYTHON` | Python for `scripts/trask_web_research.py` (when unset: the repo's `.venv-trask-research`, then the legacy `.venv-trask-gptr`, then a system Python on `PATH`) |
|
| 28 |
+
| `TRASK_WEB_RESEARCH_SCRIPT` | Optional override path for the research script (legacy alias `TRASK_GPT_RESEARCHER_SCRIPT`) |
|
| 29 |
+
| `TRASK_GPT_RESEARCHER_PYTHON` | Deprecated alias for `TRASK_WEB_RESEARCH_PYTHON` |
|
| 30 |
+
| `TRASK_WEB_RESEARCH_TIMEOUT_MS` | Subprocess timeout (default **900000**; legacy alias `TRASK_RESEARCHWIZARD_TIMEOUT_MS`) |
|
| 31 |
+
| `OPENAI_API_KEY` / `OPENROUTER_API_KEY` | LLM rewrite for final Holocron answers |
|
| 32 |
+
| `REDIS_URL` / `TRASK_REDIS_URL` | Optional Redis for research cache (`scripts/trask_cache.py`) |
|
| 33 |
+
| `TRASK_CACHE_DISABLED` | Set to `1` to bypass Redis even when `REDIS_URL` is set |
|
| 34 |
+
| `TRASK_CACHE_SEARCH_TTL_SECONDS` | DuckDuckGo URL-list cache TTL (default **21600** = 6h) |
|
| 35 |
+
| `TRASK_CACHE_PAGE_TTL_SECONDS` | Per-page markdown cache TTL (default **604800** = 7d) |
|
| 36 |
+
| `TRASK_CACHE_RESEARCH_TTL_SECONDS` | Full research JSON cache TTL (default **3600** = 1h) |
|
| 37 |
+
|
| 38 |
+
### Redis cache (optional, no Pinecone)
|
| 39 |
+
|
| 40 |
+
When `REDIS_URL` is set, `scripts/trask_web_research.py` uses `scripts/trask_cache.py` to avoid repeat work:
|
| 41 |
+
|
| 42 |
+
| Layer | Key pattern | What it skips |
|
| 43 |
+
|--------|-------------|----------------|
|
| 44 |
+
| Search | `trask:search:{hash}` | DuckDuckGo discovery for the same query + domains |
|
| 45 |
+
| Page | `trask:page:{hash}` | Crawl4AI / trafilatura fetch for the same URL |
|
| 46 |
+
| Research | `trask:research:{hash}` | Entire subprocess result for identical payload |
|
| 47 |
+
|
| 48 |
+
Cache stats appear under `research_information.cache` (e.g. `page_hits`, `search_misses`, `research_hits`).
|
| 49 |
+
|
| 50 |
+
```bash
|
| 51 |
+
# Local Redis (example)
|
| 52 |
+
podman run -d --name trask-redis -p 6379:6379 redis:7-alpine
|
| 53 |
+
export REDIS_URL=redis://localhost:6379/0
|
| 54 |
+
python scripts/trask_cache.py # connectivity self-test
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
### Verification
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
python scripts/smoke_trask_web_research.py --dry-run
|
| 61 |
+
node --import tsx/esm scripts/verify_trask_cli_qa.mjs
|
| 62 |
+
pnpm holocron:e2e # with trask-http-server on :4010
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
## Explicitly rejected (do not implement)
|
| 66 |
+
|
| 67 |
+
These were considered as follow-ups and are **out of scope**:
|
| 68 |
+
|
| 69 |
+
| Approach | Reason |
|
| 70 |
+
|----------|--------|
|
| 71 |
+
| Node-native **llm-scraper** for single-URL extraction | Not part of the product path; Crawl4AI + DDG covers live research. |
|
| 72 |
+
| **browser-use** integration | Not part of the product path. |
|
| 73 |
+
| Trask `/ask` via self-hosted **Firecrawl HTTP API** (reuse ingest key without Python) | Firecrawl remains **ingest-worker only** when `FIRECRAWL_API_KEY` is set—not the Holocron/Discord answer pipeline. |
|
| 74 |
+
| `TRASK_RESEARCH_BACKEND_URL` HTTP sidecar | Reserved env name only; no planned sidecar replacing `trask_web_research.py`. |
|
| 75 |
+
|
| 76 |
+
## Other references (not default)
|
| 77 |
+
|
| 78 |
+
- [khoj-ai/khoj](https://github.com/khoj-ai/khoj) — full Q&A product (not integrated)
|
| 79 |
+
- [assafelovic/gpt-researcher](https://github.com/assafelovic/gpt-researcher) — upstream of the removed vendored fork
|
| 80 |
+
- [searxng/searxng](https://github.com/searxng/searxng) — metasearch sidecar (not integrated)
|
package.json
CHANGED
|
@@ -31,10 +31,11 @@
|
|
| 31 |
"build:pazaak-nakama": "pnpm --filter @openkotor/pazaak-nakama build",
|
| 32 |
"dev:ingest": "pnpm --filter @openkotor/ingest-worker dev",
|
| 33 |
"discord:install-links": "tsx scripts/discord-install-links.ts",
|
| 34 |
-
"smoke:trask-gptr-dry": "python scripts/smoke_trask_headless_gptr.py --dry-run",
|
| 35 |
-
"smoke:trask-gptr": "python scripts/smoke_trask_headless_gptr.py",
|
| 36 |
"holocron:e2e": "node scripts/holocron-e2e-live-build.mjs && pnpm exec playwright test --config apps/holocron-web/playwright.config.ts",
|
| 37 |
-
"discord:smoke-trask-commands": "node scripts/discord_trask_commands_smoke.mjs"
|
|
|
|
|
|
|
|
|
|
| 38 |
},
|
| 39 |
"devDependencies": {
|
| 40 |
"@playwright/test": "^1.58.0",
|
|
|
|
| 31 |
"build:pazaak-nakama": "pnpm --filter @openkotor/pazaak-nakama build",
|
| 32 |
"dev:ingest": "pnpm --filter @openkotor/ingest-worker dev",
|
| 33 |
"discord:install-links": "tsx scripts/discord-install-links.ts",
|
|
|
|
|
|
|
| 34 |
"holocron:e2e": "node scripts/holocron-e2e-live-build.mjs && pnpm exec playwright test --config apps/holocron-web/playwright.config.ts",
|
| 35 |
+
"discord:smoke-trask-commands": "node scripts/discord_trask_commands_smoke.mjs",
|
| 36 |
+
"trask:env:fetch": "node scripts/discord_fetch_trask_env.mjs",
|
| 37 |
+
"trask:start": "bash scripts/trask_bot_start.sh",
|
| 38 |
+
"trask:wait-token": "bash scripts/discord_wait_token_and_start.sh"
|
| 39 |
},
|
| 40 |
"devDependencies": {
|
| 41 |
"@playwright/test": "^1.58.0",
|
packages/config/src/index.test.ts
CHANGED
|
@@ -84,9 +84,9 @@ test("loadSharedAiConfig returns undefined headers when no OpenRouter vars are s
|
|
| 84 |
// loadResearchWizardRuntimeConfig — timeout and script path
|
| 85 |
// ---------------------------------------------------------------------------
|
| 86 |
|
| 87 |
-
test("loadResearchWizardRuntimeConfig defaults timeout to
|
| 88 |
const cfg = loadResearchWizardRuntimeConfig({});
|
| 89 |
-
assert.equal(cfg.timeoutMs,
|
| 90 |
});
|
| 91 |
|
| 92 |
test("loadResearchWizardRuntimeConfig respects TRASK_RESEARCHWIZARD_TIMEOUT_MS override", () => {
|
|
@@ -94,23 +94,21 @@ test("loadResearchWizardRuntimeConfig respects TRASK_RESEARCHWIZARD_TIMEOUT_MS o
|
|
| 94 |
assert.equal(cfg.timeoutMs, 120000);
|
| 95 |
});
|
| 96 |
|
| 97 |
-
test("loadResearchWizardRuntimeConfig
|
| 98 |
const cfg = loadResearchWizardRuntimeConfig({});
|
|
|
|
|
|
|
| 99 |
assert.equal(cfg.headlessScriptPath, undefined);
|
| 100 |
});
|
| 101 |
|
| 102 |
-
test("loadResearchWizardRuntimeConfig
|
| 103 |
-
const cfg = loadResearchWizardRuntimeConfig({
|
| 104 |
-
assert.
|
| 105 |
});
|
| 106 |
|
| 107 |
-
test("loadResearchWizardRuntimeConfig
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
// Explicit TRASK_GPT_RESEARCHER_ROOT is set but has no gpt_researcher/ inside, so
|
| 111 |
-
// root resolves but the venv walk still yields 'python'.
|
| 112 |
-
assert.ok(typeof cfg.pythonExecutable === "string");
|
| 113 |
-
assert.ok(cfg.pythonExecutable.length > 0);
|
| 114 |
});
|
| 115 |
|
| 116 |
// ---------------------------------------------------------------------------
|
|
|
|
| 84 |
// loadResearchWizardRuntimeConfig — timeout and script path
|
| 85 |
// ---------------------------------------------------------------------------
|
| 86 |
|
| 87 |
+
// With an empty env the loader must fall back to the built-in 15-minute timeout.
test("loadResearchWizardRuntimeConfig defaults timeout to 900000 ms when TRASK_RESEARCHWIZARD_TIMEOUT_MS is absent", () => {
  const { timeoutMs } = loadResearchWizardRuntimeConfig({});
  assert.equal(timeoutMs, 900000);
});
|
| 91 |
|
| 92 |
test("loadResearchWizardRuntimeConfig respects TRASK_RESEARCHWIZARD_TIMEOUT_MS override", () => {
|
|
|
|
| 94 |
assert.equal(cfg.timeoutMs, 120000);
|
| 95 |
});
|
| 96 |
|
| 97 |
+
// An empty env still yields a usable repo root and interpreter, and no script override.
test("loadResearchWizardRuntimeConfig resolves repoRoot and pythonExecutable", () => {
  const { repoRoot, pythonExecutable, headlessScriptPath } = loadResearchWizardRuntimeConfig({});
  assert.ok(repoRoot.length > 0);
  assert.ok(pythonExecutable.length > 0);
  assert.equal(headlessScriptPath, undefined);
});
|
| 103 |
|
| 104 |
+
// An explicit interpreter path is returned verbatim.
test("loadResearchWizardRuntimeConfig respects TRASK_WEB_RESEARCH_PYTHON override", () => {
  const overrideEnv = { TRASK_WEB_RESEARCH_PYTHON: "/custom/python" };
  const { pythonExecutable } = loadResearchWizardRuntimeConfig(overrideEnv);
  assert.equal(pythonExecutable, "/custom/python");
});
|
| 108 |
|
| 109 |
+
// The backend URL env var is passed through to `backendUrl`.
test("loadResearchWizardRuntimeConfig resolves TRASK_RESEARCH_BACKEND_URL", () => {
  const backendEnv = { TRASK_RESEARCH_BACKEND_URL: "http://127.0.0.1:3002" };
  const { backendUrl } = loadResearchWizardRuntimeConfig(backendEnv);
  assert.equal(backendUrl, "http://127.0.0.1:3002");
});
|
| 113 |
|
| 114 |
// ---------------------------------------------------------------------------
|
packages/config/src/index.ts
CHANGED
|
@@ -13,21 +13,26 @@ import { loadPolicyFromFile } from "@openkotor/pazaak-policy/file-loader";
|
|
| 13 |
import { config as loadDotEnv } from "dotenv";
|
| 14 |
import { z } from "zod";
|
| 15 |
|
| 16 |
-
function
|
|
|
|
| 17 |
let dir = resolve(process.cwd());
|
| 18 |
for (;;) {
|
| 19 |
-
const
|
| 20 |
-
|
|
|
|
|
|
|
| 21 |
const parent = dirname(dir);
|
| 22 |
-
if (parent === dir)
|
| 23 |
dir = parent;
|
| 24 |
}
|
|
|
|
| 25 |
}
|
| 26 |
|
| 27 |
-
const
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
| 31 |
} else {
|
| 32 |
loadDotEnv();
|
| 33 |
}
|
|
@@ -107,29 +112,29 @@ export interface SharedAiConfig {
|
|
| 107 |
databaseUrl: string | undefined;
|
| 108 |
}
|
| 109 |
|
| 110 |
-
export interface
|
| 111 |
-
/**
|
| 112 |
-
|
| 113 |
-
/** Python interpreter for `
|
| 114 |
pythonExecutable: string;
|
| 115 |
-
/** Optional absolute path to the headless runner; default `<
|
| 116 |
headlessScriptPath: string | undefined;
|
|
|
|
|
|
|
| 117 |
timeoutMs: number;
|
| 118 |
}
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
const findVendorAiResearchWizard = (startDir: string, maxHops = 24): string | undefined => {
|
| 128 |
let dir = resolve(startDir);
|
| 129 |
for (let hop = 0; hop < maxHops; hop++) {
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
return candidate;
|
| 133 |
}
|
| 134 |
const parent = dirname(dir);
|
| 135 |
if (parent === dir) {
|
|
@@ -140,72 +145,81 @@ const findVendorAiResearchWizard = (startDir: string, maxHops = 24): string | un
|
|
| 140 |
return undefined;
|
| 141 |
};
|
| 142 |
|
| 143 |
-
const
|
| 144 |
-
const explicit = readOptionalEnv("
|
| 145 |
-
|
| 146 |
if (explicit) {
|
| 147 |
return resolve(explicit.trim());
|
| 148 |
}
|
| 149 |
|
| 150 |
-
const fromCwd =
|
| 151 |
if (fromCwd) {
|
| 152 |
return fromCwd;
|
| 153 |
}
|
| 154 |
|
| 155 |
const configModuleDir = dirname(fileURLToPath(import.meta.url));
|
| 156 |
-
const fromPackage =
|
| 157 |
if (fromPackage) {
|
| 158 |
return fromPackage;
|
| 159 |
}
|
| 160 |
|
| 161 |
-
return
|
| 162 |
};
|
| 163 |
|
| 164 |
/**
|
| 165 |
-
* Prefer
|
| 166 |
-
*
|
| 167 |
*/
|
| 168 |
-
const
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
const explicit = readOptionalEnv("TRASK_GPT_RESEARCHER_PYTHON", env)?.trim();
|
| 173 |
if (explicit) {
|
| 174 |
return explicit;
|
| 175 |
}
|
| 176 |
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
const vendorDir = dirname(gptResearcherRoot);
|
| 182 |
-
const repoRoot = dirname(vendorDir);
|
| 183 |
-
const winPy = join(repoRoot, ".venv-trask-gptr", "Scripts", "python.exe");
|
| 184 |
-
const unixPy = join(repoRoot, ".venv-trask-gptr", "bin", "python");
|
| 185 |
|
| 186 |
if (process.platform === "win32" && existsSync(winPy)) {
|
| 187 |
return winPy;
|
| 188 |
}
|
| 189 |
-
|
| 190 |
if (existsSync(unixPy)) {
|
| 191 |
return unixPy;
|
| 192 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
-
return "
|
| 195 |
};
|
| 196 |
|
| 197 |
-
export const
|
| 198 |
-
const
|
| 199 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
return {
|
| 202 |
-
|
| 203 |
-
pythonExecutable:
|
| 204 |
headlessScriptPath: scriptRaw ? resolve(scriptRaw.trim()) : undefined,
|
| 205 |
-
|
|
|
|
| 206 |
};
|
| 207 |
};
|
| 208 |
|
|
|
|
|
|
|
|
|
|
| 209 |
export interface TraskProactiveConfig {
|
| 210 |
/** When true, reads channel messages (privileged intents) and may reply without `/ask`. */
|
| 211 |
enabled: boolean;
|
|
@@ -237,7 +251,7 @@ export interface TraskWelcomeConfig {
|
|
| 237 |
export interface TraskBotConfig {
|
| 238 |
discord: DiscordRuntimeConfig;
|
| 239 |
ai: SharedAiConfig;
|
| 240 |
-
|
| 241 |
allowedGuildIds: string[];
|
| 242 |
approvedChannelIds: string[];
|
| 243 |
/** Guild IDs where slash commands are registered (comma list in `TRASK_SLASH_GUILD_IDS`). */
|
|
@@ -380,7 +394,7 @@ export const loadTraskBotConfig = (env: NodeJS.ProcessEnv = process.env): TraskB
|
|
| 380 |
return {
|
| 381 |
discord: loadDiscordRuntimeConfig("TRASK", env),
|
| 382 |
ai: loadSharedAiConfig(env),
|
| 383 |
-
|
| 384 |
allowedGuildIds: readListEnv("TRASK_ALLOWED_GUILD_IDS", env),
|
| 385 |
approvedChannelIds,
|
| 386 |
slashCommandGuildIds: readListEnv("TRASK_SLASH_GUILD_IDS", env),
|
|
@@ -454,7 +468,7 @@ export const loadIngestWorkerConfig = (env: NodeJS.ProcessEnv = process.env): In
|
|
| 454 |
|
| 455 |
export interface TraskHttpServerConfig {
|
| 456 |
port: number;
|
| 457 |
-
|
| 458 |
ai: SharedAiConfig;
|
| 459 |
dataDir: string;
|
| 460 |
/** When set, require `Authorization: Bearer <key>` or `X-Trask-Api-Key`. */
|
|
@@ -471,7 +485,7 @@ export interface TraskHttpServerConfig {
|
|
| 471 |
export const loadTraskHttpServerConfig = (env: NodeJS.ProcessEnv = process.env): TraskHttpServerConfig => {
|
| 472 |
return {
|
| 473 |
port: integerish.parse(readOptionalEnv("TRASK_HTTP_PORT", env) ?? "4010"),
|
| 474 |
-
|
| 475 |
ai: loadSharedAiConfig(env),
|
| 476 |
dataDir: readOptionalEnv("TRASK_HTTP_DATA_DIR", env) ?? "data/trask-http-server",
|
| 477 |
webApiKey: readOptionalEnv("TRASK_WEB_API_KEY", env),
|
|
|
|
| 13 |
import { config as loadDotEnv } from "dotenv";
|
| 14 |
import { z } from "zod";
|
| 15 |
|
| 16 |
+
/**
 * Collect every `.env.local` / `.env` file from the current working directory
 * up to the filesystem root.
 *
 * @returns Existing file paths, nearest directory first; within a directory
 *          `.env.local` comes before `.env`.
 */
function findDotEnvFiles(): string[] {
  const fileNames = [".env.local", ".env"];
  const matches: string[] = [];

  let current = resolve(process.cwd());
  let previous: string | undefined;

  // `dirname` of the root is the root itself, which is what ends the walk.
  while (current !== previous) {
    for (const fileName of fileNames) {
      const candidate = join(current, fileName);
      if (existsSync(candidate)) {
        matches.push(candidate);
      }
    }
    previous = current;
    current = dirname(current);
  }

  return matches;
}
|
| 30 |
|
| 31 |
+
// Load every discovered env file in discovery order (nearest directory first).
// NOTE(review): assuming dotenv's default non-override behaviour, the first file
// to define a variable wins — confirm against the dotenv version in use.
const dotEnvPaths = findDotEnvFiles();
if (dotEnvPaths.length === 0) {
  // Nothing found on the way up: let dotenv apply its own default lookup.
  loadDotEnv();
} else {
  dotEnvPaths.forEach((envFile) => {
    loadDotEnv({ path: envFile });
  });
}
|
|
|
|
| 112 |
databaseUrl: string | undefined;
|
| 113 |
}
|
| 114 |
|
| 115 |
+
/** Runtime settings for launching the Python web-research runner. */
export interface WebResearchRuntimeConfig {
  /** Repository root; expected to contain `scripts/trask_web_research.py`. */
  repoRoot: string;
  /** Interpreter used to run `scripts/trask_web_research.py`. */
  pythonExecutable: string;
  /** Absolute path overriding the runner script; `undefined` means `<repoRoot>/scripts/trask_web_research.py`. */
  headlessScriptPath: string | undefined;
  /** Value of `TRASK_RESEARCH_BACKEND_URL`, reserved for a future HTTP research sidecar. */
  backendUrl: string | undefined;
  /** Research timeout in milliseconds. */
  timeoutMs: number;
}

/** @deprecated Use WebResearchRuntimeConfig */
export type ResearchWizardRuntimeConfig = WebResearchRuntimeConfig;
|
| 129 |
|
| 130 |
+
/** Report whether `rootDir` contains `scripts/trask_web_research.py`. */
const hasTraskWebResearchScript = (rootDir: string): boolean => {
  const scriptPath = join(rootDir, "scripts", "trask_web_research.py");
  return existsSync(scriptPath);
};
|
| 132 |
+
|
| 133 |
+
const findRepoRootWithWebResearch = (startDir: string, maxHops = 24): string | undefined => {
|
|
|
|
| 134 |
let dir = resolve(startDir);
|
| 135 |
for (let hop = 0; hop < maxHops; hop++) {
|
| 136 |
+
if (hasTraskWebResearchScript(dir)) {
|
| 137 |
+
return dir;
|
|
|
|
| 138 |
}
|
| 139 |
const parent = dirname(dir);
|
| 140 |
if (parent === dir) {
|
|
|
|
| 145 |
return undefined;
|
| 146 |
};
|
| 147 |
|
| 148 |
+
/**
 * Locate the repository root used for web research.
 *
 * Resolution order: explicit `TRASK_REPO_ROOT`, then a search starting at the
 * current working directory, then a search starting two directories above this
 * module, and finally the current working directory itself.
 *
 * @param env Environment to read `TRASK_REPO_ROOT` from.
 * @returns The resolved root directory.
 */
const resolveTraskResearchRepoRoot = (env: NodeJS.ProcessEnv): string => {
  const override = readOptionalEnv("TRASK_REPO_ROOT", env);
  if (override) {
    return resolve(override.trim());
  }

  // Start points are thunks so the module-relative path is only computed when
  // the cwd-based search comes up empty.
  const searchStarts: Array<() => string> = [
    () => process.cwd(),
    () => join(dirname(fileURLToPath(import.meta.url)), "..", ".."),
  ];

  for (const startFrom of searchStarts) {
    const located = findRepoRootWithWebResearch(startFrom());
    if (located) {
      return located;
    }
  }

  return process.cwd();
};
|
| 167 |
|
| 168 |
/**
 * Pick the Python interpreter for `scripts/trask_web_research.py`.
 *
 * Resolution order:
 * 1. `TRASK_WEB_RESEARCH_PYTHON`, then the deprecated `TRASK_GPT_RESEARCHER_PYTHON`.
 * 2. `<repoRoot>/.venv-trask-research`, then the legacy `<repoRoot>/.venv-trask-gptr`
 *    (on Windows the `Scripts/python.exe` layout is tried before `bin/python`).
 * 3. A bare command resolved via `PATH`: `python` on Windows, `python3` elsewhere.
 *
 * @param repoRoot Repository root that may contain the research venv.
 * @param env Environment to read the override variables from.
 * @returns An interpreter path or command name; existence of the bare fallback is not checked.
 */
const resolveTraskWebResearchPythonExecutable = (repoRoot: string, env: NodeJS.ProcessEnv): string => {
  const explicit =
    readOptionalEnv("TRASK_WEB_RESEARCH_PYTHON", env)?.trim() ||
    readOptionalEnv("TRASK_GPT_RESEARCHER_PYTHON", env)?.trim();
  if (explicit) {
    return explicit;
  }

  const isWindows = process.platform === "win32";

  // Current venv first, legacy venv second; same probe order inside each.
  for (const venvName of [".venv-trask-research", ".venv-trask-gptr"]) {
    const windowsPython = join(repoRoot, venvName, "Scripts", "python.exe");
    if (isWindows && existsSync(windowsPython)) {
      return windowsPython;
    }

    const posixPython = join(repoRoot, venvName, "bin", "python");
    if (existsSync(posixPython)) {
      return posixPython;
    }
  }

  // Standard Windows installs expose `python`, not `python3`; Debian-style
  // hosts (including the Docker image) expose `python3`.
  return isWindows ? "python" : "python3";
};
|
| 200 |
|
| 201 |
+
/**
 * Build the web research runtime configuration from environment variables.
 *
 * Each setting reads its current variable first and falls back to the older name:
 * - script path: `TRASK_WEB_RESEARCH_SCRIPT`, then `TRASK_GPT_RESEARCHER_SCRIPT`
 * - timeout: `TRASK_WEB_RESEARCH_TIMEOUT_MS`, then `TRASK_RESEARCHWIZARD_TIMEOUT_MS`, then `"900000"`
 * - backend URL: `TRASK_RESEARCH_BACKEND_URL` (blank after trimming counts as unset)
 *
 * @param env Environment to read; defaults to `process.env`.
 * @returns The resolved configuration; the timeout is parsed with `integerish`.
 */
export const loadWebResearchRuntimeConfig = (env: NodeJS.ProcessEnv = process.env): WebResearchRuntimeConfig => {
  const repoRoot = resolveTraskResearchRepoRoot(env);

  const scriptOverride =
    readOptionalEnv("TRASK_WEB_RESEARCH_SCRIPT", env) ?? readOptionalEnv("TRASK_GPT_RESEARCHER_SCRIPT", env);

  const trimmedBackendUrl = readOptionalEnv("TRASK_RESEARCH_BACKEND_URL", env)?.trim();
  const backendUrl = trimmedBackendUrl || undefined;

  const timeoutText =
    readOptionalEnv("TRASK_WEB_RESEARCH_TIMEOUT_MS", env)
    ?? readOptionalEnv("TRASK_RESEARCHWIZARD_TIMEOUT_MS", env)
    ?? "900000";

  const pythonExecutable = resolveTraskWebResearchPythonExecutable(repoRoot, env);
  const headlessScriptPath = scriptOverride ? resolve(scriptOverride.trim()) : undefined;

  return {
    repoRoot,
    pythonExecutable,
    headlessScriptPath,
    backendUrl,
    timeoutMs: integerish.parse(timeoutText),
  };
};

/** @deprecated Use loadWebResearchRuntimeConfig */
export const loadResearchWizardRuntimeConfig = loadWebResearchRuntimeConfig;
|
| 222 |
+
|
| 223 |
export interface TraskProactiveConfig {
|
| 224 |
/** When true, reads channel messages (privileged intents) and may reply without `/ask`. */
|
| 225 |
enabled: boolean;
|
|
|
|
| 251 |
export interface TraskBotConfig {
|
| 252 |
discord: DiscordRuntimeConfig;
|
| 253 |
ai: SharedAiConfig;
|
| 254 |
+
webResearch: WebResearchRuntimeConfig;
|
| 255 |
allowedGuildIds: string[];
|
| 256 |
approvedChannelIds: string[];
|
| 257 |
/** Guild IDs where slash commands are registered (comma list in `TRASK_SLASH_GUILD_IDS`). */
|
|
|
|
| 394 |
return {
|
| 395 |
discord: loadDiscordRuntimeConfig("TRASK", env),
|
| 396 |
ai: loadSharedAiConfig(env),
|
| 397 |
+
webResearch: loadWebResearchRuntimeConfig(env),
|
| 398 |
allowedGuildIds: readListEnv("TRASK_ALLOWED_GUILD_IDS", env),
|
| 399 |
approvedChannelIds,
|
| 400 |
slashCommandGuildIds: readListEnv("TRASK_SLASH_GUILD_IDS", env),
|
|
|
|
| 468 |
|
| 469 |
export interface TraskHttpServerConfig {
|
| 470 |
port: number;
|
| 471 |
+
webResearch: WebResearchRuntimeConfig;
|
| 472 |
ai: SharedAiConfig;
|
| 473 |
dataDir: string;
|
| 474 |
/** When set, require `Authorization: Bearer <key>` or `X-Trask-Api-Key`. */
|
|
|
|
| 485 |
export const loadTraskHttpServerConfig = (env: NodeJS.ProcessEnv = process.env): TraskHttpServerConfig => {
|
| 486 |
return {
|
| 487 |
port: integerish.parse(readOptionalEnv("TRASK_HTTP_PORT", env) ?? "4010"),
|
| 488 |
+
webResearch: loadWebResearchRuntimeConfig(env),
|
| 489 |
ai: loadSharedAiConfig(env),
|
| 490 |
dataDir: readOptionalEnv("TRASK_HTTP_DATA_DIR", env) ?? "data/trask-http-server",
|
| 491 |
webApiKey: readOptionalEnv("TRASK_WEB_API_KEY", env),
|
packages/personas/src/index.ts
CHANGED
|
@@ -20,7 +20,7 @@ export * from "./hk-dialog.js";
|
|
| 20 |
export const personaProfiles: Record<PersonaProfile["id"], PersonaProfile> = {
|
| 21 |
trask: {
|
| 22 |
id: "trask",
|
| 23 |
-
displayName: "Trask
|
| 24 |
summary: "Republic-first guide voice for quick help, troubleshooting, and source-backed answers.",
|
| 25 |
speechStyle: [
|
| 26 |
"direct and practical",
|
|
|
|
| 20 |
export const personaProfiles: Record<PersonaProfile["id"], PersonaProfile> = {
|
| 21 |
trask: {
|
| 22 |
id: "trask",
|
| 23 |
+
displayName: "Trask Q&A Assistant",
|
| 24 |
summary: "Republic-first guide voice for quick help, troubleshooting, and source-backed answers.",
|
| 25 |
speechStyle: [
|
| 26 |
"direct and practical",
|
packages/retrieval/src/discord-permalink.test.ts
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import assert from "node:assert/strict";
|
| 2 |
+
import { describe, test } from "node:test";
|
| 3 |
+
|
| 4 |
+
import {
|
| 5 |
+
anchorMessageIdFromChunkTags,
|
| 6 |
+
buildDiscordMessagePermalink,
|
| 7 |
+
channelIdFromChunkTags,
|
| 8 |
+
guildIdFromChunkTags,
|
| 9 |
+
isDiscordCitationUrl,
|
| 10 |
+
resolveDiscordChunkCitationUrl,
|
| 11 |
+
} from "./discord-permalink.js";
|
| 12 |
+
|
| 13 |
+
describe("discord permalink helpers", () => {
  test("buildDiscordMessagePermalink formats discord.com URL", () => {
    const permalink = buildDiscordMessagePermalink("111", "222", "333");
    assert.equal(permalink, "https://discord.com/channels/111/222/333");
  });

  test("resolveDiscordChunkCitationUrl prefers stored HTTPS permalink", () => {
    // A chunk that already stores an HTTPS permalink must come back unchanged.
    const resolved = resolveDiscordChunkCitationUrl({
      url: "https://discord.com/channels/g/c/m",
      tags: [],
    });
    assert.equal(resolved, "https://discord.com/channels/g/c/m");
  });

  test("resolveDiscordChunkCitationUrl builds from discord:// and tags", () => {
    // The guild tag is present, so the fallback guild id argument must be ignored.
    const resolved = resolveDiscordChunkCitationUrl(
      {
        url: "discord://approved-channels/9001/8001-8002",
        tags: ["guild:1001", "channel:9001", "anchorMessage:8001"],
      },
      "fallback-should-not-use",
    );
    assert.equal(resolved, "https://discord.com/channels/1001/9001/8001");
  });

  test("tag parsers read guild channel and anchor", () => {
    const chunkTags = ["guild:g1", "channel:c1", "anchorMessage:m1"];
    assert.equal(guildIdFromChunkTags(chunkTags), "g1");
    assert.equal(channelIdFromChunkTags(chunkTags), "c1");
    assert.equal(anchorMessageIdFromChunkTags(chunkTags), "m1");
  });

  test("isDiscordCitationUrl recognizes discord schemes", () => {
    const expectations: ReadonlyArray<readonly [string, boolean]> = [
      ["discord://approved-channels/1/2", true],
      ["https://discord.com/channels/1/2/3", true],
      ["https://example.com", false],
    ];
    for (const [candidate, expected] of expectations) {
      assert.equal(isDiscordCitationUrl(candidate), expected);
    }
  });
});
|
packages/retrieval/src/discord-permalink.ts
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/** Discord message permalink and chunk URL helpers for Trask community citations. */
|
| 2 |
+
|
| 3 |
+
/**
 * Matches `discord://approved-channels/<channel>/<first>[-<last>]`.
 * Capture groups: 1 = channel segment, 2 = first message segment (no `/` or `-`),
 * 3 = optional segment after the hyphen.
 */
const DISCORD_CHUNK_URL_PATTERN = /^discord:\/\/approved-channels\/([^/]+)\/([^/-]+)(?:-([^/]+))?$/;
|
| 5 |
+
|
| 6 |
+
/**
 * Format a `https://discord.com/channels/<guild>/<channel>/<message>` permalink.
 *
 * Each id is trimmed first; if any of them is empty after trimming the result is `""`.
 */
export function buildDiscordMessagePermalink(
  guildId: string,
  channelId: string,
  messageId: string,
): string {
  const segments = [guildId, channelId, messageId].map((part) => part.trim());
  if (segments.some((part) => part.length === 0)) {
    return "";
  }
  return `https://discord.com/channels/${segments.join("/")}`;
}
|
| 19 |
+
|
| 20 |
+
/**
 * Split a `discord://approved-channels/...` chunk URL into its ids.
 *
 * @returns `null` when the trimmed URL does not match `DISCORD_CHUNK_URL_PATTERN` or when the
 * channel / first message segment is blank; otherwise the ids, with `lastMessageId` present
 * only when the URL carried a non-blank segment after the hyphen.
 */
export function parseDiscordChunkUrl(
  url: string,
): { channelId: string; firstMessageId: string; lastMessageId?: string } | null {
  const groups = DISCORD_CHUNK_URL_PATTERN.exec(url.trim());
  if (groups === null) {
    return null;
  }

  const [, rawChannel, rawFirst, rawLast] = groups;
  const channelId = rawChannel?.trim();
  const firstMessageId = rawFirst?.trim();
  const lastMessageId = rawLast?.trim();
  if (!channelId || !firstMessageId) {
    return null;
  }

  const parsed: { channelId: string; firstMessageId: string; lastMessageId?: string } = {
    channelId,
    firstMessageId,
  };
  if (lastMessageId) {
    parsed.lastMessageId = lastMessageId;
  }
  return parsed;
}
|
| 33 |
+
|
| 34 |
+
/**
 * Return the trimmed value of the first tag that starts with `prefix` and has a
 * non-blank remainder, or `undefined` when no tag qualifies.
 */
function firstTagValueWithPrefix(tags: readonly string[], prefix: string): string | undefined {
  for (const tag of tags) {
    if (!tag.startsWith(prefix)) continue;
    const value = tag.slice(prefix.length).trim();
    // A blank value does not end the scan; a later tag with the same prefix may still match.
    if (value) return value;
  }
  return undefined;
}

/** Read the guild id from the first usable `guild:<id>` tag, if any. */
export function guildIdFromChunkTags(tags: readonly string[]): string | undefined {
  return firstTagValueWithPrefix(tags, "guild:");
}

/** Read the channel id from the first usable `channel:<id>` tag, if any. */
export function channelIdFromChunkTags(tags: readonly string[]): string | undefined {
  return firstTagValueWithPrefix(tags, "channel:");
}

/** Read the anchor message id from the first usable `anchorMessage:<id>` tag, if any. */
export function anchorMessageIdFromChunkTags(tags: readonly string[]): string | undefined {
  return firstTagValueWithPrefix(tags, "anchorMessage:");
}
|
| 63 |
+
|
| 64 |
+
/**
 * Turn a chunk's stored URL into an HTTPS Discord permalink when enough ids are known.
 *
 * A stored URL that already starts with `https://discord.com/channels/` is returned as is
 * (trimmed). Otherwise the guild id comes from the `guild:` tag or `fallbackGuildId`, and the
 * channel / message ids come from the `channel:` / `anchorMessage:` tags or, failing that,
 * from parsing the stored `discord://` URL. If any of the three ids is missing, the trimmed
 * stored URL is returned unchanged.
 */
export function resolveDiscordChunkCitationUrl(
  chunk: { url: string; tags: readonly string[] },
  fallbackGuildId?: string,
): string {
  const storedUrl = chunk.url.trim();
  if (storedUrl.startsWith("https://discord.com/channels/")) {
    return storedUrl;
  }

  const { tags } = chunk;
  const guildId = guildIdFromChunkTags(tags) ?? fallbackGuildId?.trim();

  // Tags take precedence; the parsed discord:// URL only fills the gaps.
  const fromUrl = parseDiscordChunkUrl(storedUrl);
  const channelId = channelIdFromChunkTags(tags) ?? fromUrl?.channelId;
  const messageId = anchorMessageIdFromChunkTags(tags) ?? fromUrl?.firstMessageId;

  if (!guildId || !channelId || !messageId) {
    return storedUrl;
  }
  return buildDiscordMessagePermalink(guildId, channelId, messageId);
}
|
| 91 |
+
|
| 92 |
+
/**
 * Tell whether a URL (after trimming) is a Discord citation: either the internal
 * `discord://` scheme or an `https://discord.com/channels/` permalink.
 */
export function isDiscordCitationUrl(url: string): boolean {
  const candidate = url.trim();
  const discordPrefixes = ["discord://", "https://discord.com/channels/"];
  return discordPrefixes.some((prefix) => candidate.startsWith(prefix));
}
|
packages/retrieval/src/index.ts
CHANGED
|
@@ -1,6 +1,16 @@
|
|
| 1 |
import { mkdir, open, readFile, readdir, rename, rm, stat, utimes, writeFile } from "node:fs/promises";
|
| 2 |
import path from "node:path";
|
| 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
export type SourceKind = "website" | "github" | "discord";
|
| 5 |
|
| 6 |
export interface SourceDescriptor {
|
|
@@ -709,8 +719,9 @@ export interface SourceIndexRecord {
|
|
| 709 |
tags: readonly string[];
|
| 710 |
}
|
| 711 |
|
| 712 |
-
|
| 713 |
-
|
|
|
|
| 714 |
|
| 715 |
type SerializableValue = object | string | number | boolean | null;
|
| 716 |
|
|
@@ -874,10 +885,16 @@ export class FileChunkStore {
|
|
| 874 |
}
|
| 875 |
}
|
| 876 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 877 |
export class ChunkSearchProvider implements SearchProvider {
|
| 878 |
public constructor(
|
| 879 |
private readonly chunkStore: FileChunkStore,
|
| 880 |
private readonly catalog: StaticCatalogSearchProvider,
|
|
|
|
| 881 |
) {}
|
| 882 |
|
| 883 |
public async listSources(): Promise<readonly SourceDescriptor[]> {
|
|
@@ -893,7 +910,7 @@ export class ChunkSearchProvider implements SearchProvider {
|
|
| 893 |
this.catalog.search(query, limit),
|
| 894 |
this.chunkStore.loadAllChunks(),
|
| 895 |
]);
|
| 896 |
-
const searchableChunks = allChunks.filter((chunk) => !
|
| 897 |
|
| 898 |
const chunkHits: SearchHit[] = searchableChunks
|
| 899 |
.map((chunk) => {
|
|
@@ -908,6 +925,11 @@ export class ChunkSearchProvider implements SearchProvider {
|
|
| 908 |
score += textTokens.filter((t) => t === token).length;
|
| 909 |
}
|
| 910 |
score += intentScoreDelta(intent, chunk.tags);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 911 |
|
| 912 |
return {
|
| 913 |
sourceId: chunk.sourceId,
|
|
@@ -915,7 +937,7 @@ export class ChunkSearchProvider implements SearchProvider {
|
|
| 915 |
kind: chunk.kind,
|
| 916 |
title: chunk.title,
|
| 917 |
snippet: chunk.chunkText.slice(0, 800).trim() + (chunk.chunkText.length > 800 ? "\u2026" : ""),
|
| 918 |
-
url:
|
| 919 |
score,
|
| 920 |
tags: chunk.tags,
|
| 921 |
} satisfies SearchHit;
|
|
@@ -942,9 +964,13 @@ export class ChunkSearchProvider implements SearchProvider {
|
|
| 942 |
}
|
| 943 |
}
|
| 944 |
|
| 945 |
-
export const createChunkSearchProvider = (
|
|
|
|
|
|
|
|
|
|
| 946 |
return new ChunkSearchProvider(
|
| 947 |
new FileChunkStore(stateDir),
|
| 948 |
new StaticCatalogSearchProvider(defaultSourceCatalog, new FileReindexQueueStore(stateDir)),
|
|
|
|
| 949 |
);
|
| 950 |
};
|
|
|
|
| 1 |
import { mkdir, open, readFile, readdir, rename, rm, stat, utimes, writeFile } from "node:fs/promises";
|
| 2 |
import path from "node:path";
|
| 3 |
|
| 4 |
+
export {
|
| 5 |
+
anchorMessageIdFromChunkTags,
|
| 6 |
+
buildDiscordMessagePermalink,
|
| 7 |
+
channelIdFromChunkTags,
|
| 8 |
+
guildIdFromChunkTags,
|
| 9 |
+
isDiscordCitationUrl,
|
| 10 |
+
parseDiscordChunkUrl,
|
| 11 |
+
resolveDiscordChunkCitationUrl,
|
| 12 |
+
} from "./discord-permalink.js";
|
| 13 |
+
|
| 14 |
export type SourceKind = "website" | "github" | "discord";
|
| 15 |
|
| 16 |
export interface SourceDescriptor {
|
|
|
|
| 719 |
tags: readonly string[];
|
| 720 |
}
|
| 721 |
|
| 722 |
+
import { isDiscordCitationUrl, resolveDiscordChunkCitationUrl } from "./discord-permalink.js";
|
| 723 |
+
|
| 724 |
+
/** True for chunk URLs using the `local://` scheme, which are left out of search results. */
const isExcludedChunkUrl = (url: string): boolean => {
  return url.startsWith("local://");
};
|
| 725 |
|
| 726 |
type SerializableValue = object | string | number | boolean | null;
|
| 727 |
|
|
|
|
| 885 |
}
|
| 886 |
}
|
| 887 |
|
| 888 |
+
/** Optional settings accepted by `ChunkSearchProvider`. */
export interface ChunkSearchProviderOptions {
  /**
   * Guild id used as a fallback when turning `discord://` chunk URLs into HTTPS
   * permalinks for chunks whose tags carry no guild id.
   */
  discordGuildId?: string;
}
|
| 892 |
+
|
| 893 |
export class ChunkSearchProvider implements SearchProvider {
|
| 894 |
public constructor(
|
| 895 |
private readonly chunkStore: FileChunkStore,
|
| 896 |
private readonly catalog: StaticCatalogSearchProvider,
|
| 897 |
+
private readonly options: ChunkSearchProviderOptions = {},
|
| 898 |
) {}
|
| 899 |
|
| 900 |
public async listSources(): Promise<readonly SourceDescriptor[]> {
|
|
|
|
| 910 |
this.catalog.search(query, limit),
|
| 911 |
this.chunkStore.loadAllChunks(),
|
| 912 |
]);
|
| 913 |
+
const searchableChunks = allChunks.filter((chunk) => !isExcludedChunkUrl(chunk.url));
|
| 914 |
|
| 915 |
const chunkHits: SearchHit[] = searchableChunks
|
| 916 |
.map((chunk) => {
|
|
|
|
| 925 |
score += textTokens.filter((t) => t === token).length;
|
| 926 |
}
|
| 927 |
score += intentScoreDelta(intent, chunk.tags);
|
| 928 |
+
if (isDiscordCitationUrl(chunk.url)) {
|
| 929 |
+
score += 1;
|
| 930 |
+
}
|
| 931 |
+
|
| 932 |
+
const citationUrl = resolveDiscordChunkCitationUrl(chunk, this.options.discordGuildId);
|
| 933 |
|
| 934 |
return {
|
| 935 |
sourceId: chunk.sourceId,
|
|
|
|
| 937 |
kind: chunk.kind,
|
| 938 |
title: chunk.title,
|
| 939 |
snippet: chunk.chunkText.slice(0, 800).trim() + (chunk.chunkText.length > 800 ? "\u2026" : ""),
|
| 940 |
+
url: citationUrl,
|
| 941 |
score,
|
| 942 |
tags: chunk.tags,
|
| 943 |
} satisfies SearchHit;
|
|
|
|
| 964 |
}
|
| 965 |
}
|
| 966 |
|
| 967 |
+
/**
 * Assemble a `ChunkSearchProvider` backed by files under `stateDir`.
 *
 * The chunk store and the reindex queue both use `stateDir`; the catalog is built from
 * `defaultSourceCatalog`. Omitted `options` are passed on as an empty object.
 */
export const createChunkSearchProvider = (
  stateDir: string,
  options?: ChunkSearchProviderOptions,
): ChunkSearchProvider => {
  const chunkStore = new FileChunkStore(stateDir);
  const catalog = new StaticCatalogSearchProvider(defaultSourceCatalog, new FileReindexQueueStore(stateDir));
  const effectiveOptions = options ?? {};
  return new ChunkSearchProvider(chunkStore, catalog, effectiveOptions);
};
|
packages/trask-http/src/router.test.ts
CHANGED
|
@@ -7,10 +7,10 @@ import path from "node:path";
|
|
| 7 |
import { JsonTraskQueryRepository } from "@openkotor/persistence";
|
| 8 |
import type { SourceDescriptor } from "@openkotor/retrieval";
|
| 9 |
import type {
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
} from "@openkotor/trask";
|
| 15 |
import express from "express";
|
| 16 |
import request from "supertest";
|
|
@@ -28,11 +28,11 @@ const mockSource: SourceDescriptor = {
|
|
| 28 |
tags: [],
|
| 29 |
};
|
| 30 |
|
| 31 |
-
const mockWizard:
|
| 32 |
async answerQuestion(
|
| 33 |
_query: string,
|
| 34 |
-
onProgress?: (event:
|
| 35 |
-
): Promise<
|
| 36 |
onProgress?.({ phase: "gather", detail: "test" });
|
| 37 |
return {
|
| 38 |
answer: "Stub answer.\n\nSources\n1. Test Source - https://example.com",
|
|
@@ -76,7 +76,7 @@ test("GET /session returns anonymous payload by default", async () => {
|
|
| 76 |
createTraskHttpRouter({
|
| 77 |
runtime: {
|
| 78 |
searchProvider,
|
| 79 |
-
|
| 80 |
queryRepository,
|
| 81 |
},
|
| 82 |
auth: {
|
|
@@ -112,7 +112,7 @@ test("GET /session uses getSession override", async () => {
|
|
| 112 |
createTraskHttpRouter({
|
| 113 |
runtime: {
|
| 114 |
searchProvider,
|
| 115 |
-
|
| 116 |
queryRepository,
|
| 117 |
},
|
| 118 |
auth: {
|
|
@@ -153,7 +153,7 @@ test("POST /auth/logout returns 204 by default", async () => {
|
|
| 153 |
createTraskHttpRouter({
|
| 154 |
runtime: {
|
| 155 |
searchProvider,
|
| 156 |
-
|
| 157 |
queryRepository,
|
| 158 |
},
|
| 159 |
auth: {
|
|
@@ -187,7 +187,7 @@ test("GET /sources returns JSON when authenticated", async () => {
|
|
| 187 |
createTraskHttpRouter({
|
| 188 |
runtime: {
|
| 189 |
searchProvider,
|
| 190 |
-
|
| 191 |
queryRepository,
|
| 192 |
},
|
| 193 |
auth: {
|
|
@@ -223,7 +223,7 @@ test("GET /models defaults to Auto only when the wizard has no live model list",
|
|
| 223 |
createTraskHttpRouter({
|
| 224 |
runtime: {
|
| 225 |
searchProvider,
|
| 226 |
-
|
| 227 |
queryRepository,
|
| 228 |
},
|
| 229 |
auth: {
|
|
@@ -235,7 +235,7 @@ test("GET /models defaults to Auto only when the wizard has no live model list",
|
|
| 235 |
const res = await request(app).get("/api/trask/models");
|
| 236 |
assert.equal(res.status, 200);
|
| 237 |
assert.deepEqual(res.body.models, [
|
| 238 |
-
{ id: "auto", label: "Auto", provider: "
|
| 239 |
]);
|
| 240 |
});
|
| 241 |
|
|
@@ -253,12 +253,12 @@ test("GET /models filters out non-free model ids", async () => {
|
|
| 253 |
},
|
| 254 |
};
|
| 255 |
|
| 256 |
-
const
|
| 257 |
...mockWizard,
|
| 258 |
async listModels() {
|
| 259 |
return [
|
| 260 |
{ id: "openrouter:openrouter/free", label: "Free", provider: "OpenRouter" },
|
| 261 |
-
{ id: "litellm:foo/bar", label: "Paid-ish", provider: "
|
| 262 |
{ id: "vendor/model:free", label: "Free tag", provider: "Vendor" },
|
| 263 |
];
|
| 264 |
},
|
|
@@ -271,7 +271,7 @@ test("GET /models filters out non-free model ids", async () => {
|
|
| 271 |
createTraskHttpRouter({
|
| 272 |
runtime: {
|
| 273 |
searchProvider,
|
| 274 |
-
|
| 275 |
queryRepository,
|
| 276 |
},
|
| 277 |
auth: {
|
|
@@ -283,13 +283,13 @@ test("GET /models filters out non-free model ids", async () => {
|
|
| 283 |
const res = await request(app).get("/api/trask/models");
|
| 284 |
assert.equal(res.status, 200);
|
| 285 |
assert.deepEqual(res.body.models, [
|
| 286 |
-
{ id: "auto", label: "Auto", provider: "
|
| 287 |
{ id: "openrouter:openrouter/free", label: "Free", provider: "OpenRouter" },
|
| 288 |
{ id: "vendor/model:free", label: "Free tag", provider: "Vendor" },
|
| 289 |
]);
|
| 290 |
});
|
| 291 |
|
| 292 |
-
test("POST /ask rejects model ids outside the current
|
| 293 |
const queryRepository = new JsonTraskQueryRepository(path.join(tmpDir, `qmr-${Math.random()}.json`));
|
| 294 |
const searchProvider = {
|
| 295 |
async listSources() {
|
|
@@ -310,7 +310,7 @@ test("POST /ask rejects model ids outside the current ResearchWizard list", asyn
|
|
| 310 |
createTraskHttpRouter({
|
| 311 |
runtime: {
|
| 312 |
searchProvider,
|
| 313 |
-
|
| 314 |
queryRepository,
|
| 315 |
},
|
| 316 |
auth: {
|
|
@@ -348,7 +348,7 @@ test("POST /ask persists, returns 202, completes asynchronously", async () => {
|
|
| 348 |
createTraskHttpRouter({
|
| 349 |
runtime: {
|
| 350 |
searchProvider,
|
| 351 |
-
|
| 352 |
queryRepository,
|
| 353 |
},
|
| 354 |
auth: {
|
|
@@ -388,8 +388,8 @@ test("POST /ask forwards source weights to the research wizard", async () => {
|
|
| 388 |
return { queuedSourceIds: [] as string[], mode: "file-queue" as const };
|
| 389 |
},
|
| 390 |
};
|
| 391 |
-
let receivedOptions:
|
| 392 |
-
const weightedWizard:
|
| 393 |
async answerQuestion(_query, _onProgress, options) {
|
| 394 |
receivedOptions = options;
|
| 395 |
return {
|
|
@@ -408,7 +408,7 @@ test("POST /ask forwards source weights to the research wizard", async () => {
|
|
| 408 |
createTraskHttpRouter({
|
| 409 |
runtime: {
|
| 410 |
searchProvider,
|
| 411 |
-
|
| 412 |
queryRepository,
|
| 413 |
},
|
| 414 |
auth: {
|
|
@@ -455,7 +455,7 @@ test("GET /thread/:threadId returns persisted rows for the authenticated user",
|
|
| 455 |
createTraskHttpRouter({
|
| 456 |
runtime: {
|
| 457 |
searchProvider,
|
| 458 |
-
|
| 459 |
queryRepository,
|
| 460 |
},
|
| 461 |
auth: {
|
|
@@ -510,7 +510,7 @@ test("GET /thread/:threadId requires authentication", async () => {
|
|
| 510 |
createTraskHttpRouter({
|
| 511 |
runtime: {
|
| 512 |
searchProvider,
|
| 513 |
-
|
| 514 |
queryRepository,
|
| 515 |
},
|
| 516 |
auth: {
|
|
@@ -554,7 +554,7 @@ test("anonymous persistQueries=false skips disk but still returns threadId", asy
|
|
| 554 |
createTraskHttpRouter({
|
| 555 |
runtime: {
|
| 556 |
searchProvider,
|
| 557 |
-
|
| 558 |
queryRepository,
|
| 559 |
},
|
| 560 |
auth: {
|
|
|
|
| 7 |
import { JsonTraskQueryRepository } from "@openkotor/persistence";
|
| 8 |
import type { SourceDescriptor } from "@openkotor/retrieval";
|
| 9 |
import type {
|
| 10 |
+
WebResearchAnswer,
|
| 11 |
+
WebResearchProgressEvent,
|
| 12 |
+
WebResearchQueryHandler,
|
| 13 |
+
WebResearchQueryOptions,
|
| 14 |
} from "@openkotor/trask";
|
| 15 |
import express from "express";
|
| 16 |
import request from "supertest";
|
|
|
|
| 28 |
tags: [],
|
| 29 |
};
|
| 30 |
|
| 31 |
+
const mockWizard: WebResearchQueryHandler = {
|
| 32 |
async answerQuestion(
|
| 33 |
_query: string,
|
| 34 |
+
onProgress?: (event: WebResearchProgressEvent) => void,
|
| 35 |
+
): Promise<WebResearchAnswer> {
|
| 36 |
onProgress?.({ phase: "gather", detail: "test" });
|
| 37 |
return {
|
| 38 |
answer: "Stub answer.\n\nSources\n1. Test Source - https://example.com",
|
|
|
|
| 76 |
createTraskHttpRouter({
|
| 77 |
runtime: {
|
| 78 |
searchProvider,
|
| 79 |
+
webResearch: mockWizard,
|
| 80 |
queryRepository,
|
| 81 |
},
|
| 82 |
auth: {
|
|
|
|
| 112 |
createTraskHttpRouter({
|
| 113 |
runtime: {
|
| 114 |
searchProvider,
|
| 115 |
+
webResearch: mockWizard,
|
| 116 |
queryRepository,
|
| 117 |
},
|
| 118 |
auth: {
|
|
|
|
| 153 |
createTraskHttpRouter({
|
| 154 |
runtime: {
|
| 155 |
searchProvider,
|
| 156 |
+
webResearch: mockWizard,
|
| 157 |
queryRepository,
|
| 158 |
},
|
| 159 |
auth: {
|
|
|
|
| 187 |
createTraskHttpRouter({
|
| 188 |
runtime: {
|
| 189 |
searchProvider,
|
| 190 |
+
webResearch: mockWizard,
|
| 191 |
queryRepository,
|
| 192 |
},
|
| 193 |
auth: {
|
|
|
|
| 223 |
createTraskHttpRouter({
|
| 224 |
runtime: {
|
| 225 |
searchProvider,
|
| 226 |
+
webResearch: mockWizard,
|
| 227 |
queryRepository,
|
| 228 |
},
|
| 229 |
auth: {
|
|
|
|
| 235 |
const res = await request(app).get("/api/trask/models");
|
| 236 |
assert.equal(res.status, 200);
|
| 237 |
assert.deepEqual(res.body.models, [
|
| 238 |
+
{ id: "auto", label: "Auto", provider: "Trask web research", recommended: true },
|
| 239 |
]);
|
| 240 |
});
|
| 241 |
|
|
|
|
| 253 |
},
|
| 254 |
};
|
| 255 |
|
| 256 |
+
const webResearch = {
|
| 257 |
...mockWizard,
|
| 258 |
async listModels() {
|
| 259 |
return [
|
| 260 |
{ id: "openrouter:openrouter/free", label: "Free", provider: "OpenRouter" },
|
| 261 |
+
{ id: "litellm:foo/bar", label: "Paid-ish", provider: "Trask web research" },
|
| 262 |
{ id: "vendor/model:free", label: "Free tag", provider: "Vendor" },
|
| 263 |
];
|
| 264 |
},
|
|
|
|
| 271 |
createTraskHttpRouter({
|
| 272 |
runtime: {
|
| 273 |
searchProvider,
|
| 274 |
+
webResearch,
|
| 275 |
queryRepository,
|
| 276 |
},
|
| 277 |
auth: {
|
|
|
|
| 283 |
const res = await request(app).get("/api/trask/models");
|
| 284 |
assert.equal(res.status, 200);
|
| 285 |
assert.deepEqual(res.body.models, [
|
| 286 |
+
{ id: "auto", label: "Auto", provider: "Trask web research", recommended: true },
|
| 287 |
{ id: "openrouter:openrouter/free", label: "Free", provider: "OpenRouter" },
|
| 288 |
{ id: "vendor/model:free", label: "Free tag", provider: "Vendor" },
|
| 289 |
]);
|
| 290 |
});
|
| 291 |
|
| 292 |
+
test("POST /ask rejects model ids outside the current web research model list", async () => {
|
| 293 |
const queryRepository = new JsonTraskQueryRepository(path.join(tmpDir, `qmr-${Math.random()}.json`));
|
| 294 |
const searchProvider = {
|
| 295 |
async listSources() {
|
|
|
|
| 310 |
createTraskHttpRouter({
|
| 311 |
runtime: {
|
| 312 |
searchProvider,
|
| 313 |
+
webResearch: mockWizard,
|
| 314 |
queryRepository,
|
| 315 |
},
|
| 316 |
auth: {
|
|
|
|
| 348 |
createTraskHttpRouter({
|
| 349 |
runtime: {
|
| 350 |
searchProvider,
|
| 351 |
+
webResearch: mockWizard,
|
| 352 |
queryRepository,
|
| 353 |
},
|
| 354 |
auth: {
|
|
|
|
| 388 |
return { queuedSourceIds: [] as string[], mode: "file-queue" as const };
|
| 389 |
},
|
| 390 |
};
|
| 391 |
+
let receivedOptions: WebResearchQueryOptions | undefined;
|
| 392 |
+
const weightedWizard: WebResearchQueryHandler = {
|
| 393 |
async answerQuestion(_query, _onProgress, options) {
|
| 394 |
receivedOptions = options;
|
| 395 |
return {
|
|
|
|
| 408 |
createTraskHttpRouter({
|
| 409 |
runtime: {
|
| 410 |
searchProvider,
|
| 411 |
+
webResearch: weightedWizard,
|
| 412 |
queryRepository,
|
| 413 |
},
|
| 414 |
auth: {
|
|
|
|
| 455 |
createTraskHttpRouter({
|
| 456 |
runtime: {
|
| 457 |
searchProvider,
|
| 458 |
+
webResearch: mockWizard,
|
| 459 |
queryRepository,
|
| 460 |
},
|
| 461 |
auth: {
|
|
|
|
| 510 |
createTraskHttpRouter({
|
| 511 |
runtime: {
|
| 512 |
searchProvider,
|
| 513 |
+
webResearch: mockWizard,
|
| 514 |
queryRepository,
|
| 515 |
},
|
| 516 |
auth: {
|
|
|
|
| 554 |
createTraskHttpRouter({
|
| 555 |
runtime: {
|
| 556 |
searchProvider,
|
| 557 |
+
webResearch: mockWizard,
|
| 558 |
queryRepository,
|
| 559 |
},
|
| 560 |
auth: {
|
packages/trask-http/src/router.ts
CHANGED
|
@@ -14,10 +14,10 @@ import { normalizeAuthHandlerError, type AuthHandlerThrown } from "@openkotor/pl
|
|
| 14 |
import type { SearchProvider } from "@openkotor/retrieval";
|
| 15 |
|
| 16 |
import type {
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
} from "@openkotor/trask";
|
| 22 |
|
| 23 |
import { Router, type Request, type Response, type RequestHandler } from "express";
|
|
@@ -28,7 +28,7 @@ export interface TraskHttpRuntime {
|
|
| 28 |
|
| 29 |
searchProvider: SearchProvider;
|
| 30 |
|
| 31 |
-
|
| 32 |
|
| 33 |
queryRepository: JsonTraskQueryRepository;
|
| 34 |
|
|
@@ -68,6 +68,12 @@ export interface TraskHttpSessionDto {
|
|
| 68 |
|
| 69 |
oauthAvailable?: boolean;
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
discord?: { id: string; username: string; displayName: string };
|
| 72 |
|
| 73 |
}
|
|
@@ -184,7 +190,7 @@ const mapTraskQueryRecord = (record: TraskQueryRecord): TraskQueryRecord => ({
|
|
| 184 |
});
|
| 185 |
|
| 186 |
const mapDescriptorsToSourceRecords = (
|
| 187 |
-
sources:
|
| 188 |
): readonly TraskSourceRecord[] => {
|
| 189 |
if (!sources?.length) return [];
|
| 190 |
return sources.map((s) => ({
|
|
@@ -215,11 +221,11 @@ const appendLiveTrace = async (
|
|
| 215 |
|
| 216 |
const CANCELED_QUERY_ERROR = "Canceled by newer request.";
|
| 217 |
|
| 218 |
-
const DEFAULT_TRASK_MODEL_OPTIONS: readonly
|
| 219 |
-
{ id: "auto", label: "Auto", provider: "
|
| 220 |
];
|
| 221 |
|
| 222 |
-
const mapModelOption = (option:
|
| 223 |
id: option.id,
|
| 224 |
label: option.label,
|
| 225 |
provider: option.provider,
|
|
@@ -232,11 +238,11 @@ const isFreeModelId = (id: string): boolean => {
|
|
| 232 |
};
|
| 233 |
|
| 234 |
const resolveTraskModelOptions = async (
|
| 235 |
-
|
| 236 |
-
): Promise<readonly
|
| 237 |
-
const dynamicModels =
|
| 238 |
const seen = new Set<string>();
|
| 239 |
-
const models:
|
| 240 |
for (const option of [...DEFAULT_TRASK_MODEL_OPTIONS, ...dynamicModels]) {
|
| 241 |
const id = option.id.trim();
|
| 242 |
if (!id || seen.has(id) || !isFreeModelId(id)) continue;
|
|
@@ -244,7 +250,7 @@ const resolveTraskModelOptions = async (
|
|
| 244 |
models.push(mapModelOption({
|
| 245 |
id,
|
| 246 |
label: option.label.trim() || id,
|
| 247 |
-
provider: option.provider.trim() || "
|
| 248 |
...(option.recommended ? { recommended: true } : {}),
|
| 249 |
}));
|
| 250 |
}
|
|
@@ -309,14 +315,14 @@ const normalizeTraskModelFromBody = (raw: ScalarOrObject | undefined): string |
|
|
| 309 |
return model;
|
| 310 |
};
|
| 311 |
|
| 312 |
-
const normalizeSourcePreferencesFromBody = (raw: ScalarOrObject | undefined):
|
| 313 |
if (raw === undefined || raw === null) return undefined;
|
| 314 |
if (!Array.isArray(raw)) {
|
| 315 |
throw Object.assign(new Error("sourceWeights must be an array when provided."), { status: 422 });
|
| 316 |
}
|
| 317 |
|
| 318 |
return raw
|
| 319 |
-
.map((entry):
|
| 320 |
if (!entry || typeof entry !== "object" || Array.isArray(entry)) return undefined;
|
| 321 |
const value = entry as Record<string, unknown>;
|
| 322 |
const url = typeof value.url === "string" ? value.url.trim() : "";
|
|
@@ -330,7 +336,7 @@ const normalizeSourcePreferencesFromBody = (raw: ScalarOrObject | undefined): Re
|
|
| 330 |
enabled: value.enabled !== false,
|
| 331 |
};
|
| 332 |
})
|
| 333 |
-
.filter((entry): entry is
|
| 334 |
};
|
| 335 |
|
| 336 |
|
|
@@ -394,7 +400,7 @@ export const createTraskHttpRouter = <TUser extends TraskHttpUser = TraskHttpUse
|
|
| 394 |
options.auth.requireAuth(async (_req, res, _user) => {
|
| 395 |
try {
|
| 396 |
const trask = requireRuntime();
|
| 397 |
-
const models = await resolveTraskModelOptions(trask.
|
| 398 |
res.json({ models: models.map(mapModelOption) });
|
| 399 |
} catch (err) {
|
| 400 |
handleTraskError(res, err as AuthHandlerThrown);
|
|
@@ -605,7 +611,7 @@ export const createTraskHttpRouter = <TUser extends TraskHttpUser = TraskHttpUse
|
|
| 605 |
let threadId: string;
|
| 606 |
|
| 607 |
let model: string | undefined;
|
| 608 |
-
let sourcePreferences:
|
| 609 |
|
| 610 |
const persist = shouldPersistForUser(user);
|
| 611 |
|
|
@@ -624,9 +630,9 @@ export const createTraskHttpRouter = <TUser extends TraskHttpUser = TraskHttpUse
|
|
| 624 |
sourcePreferences = normalizeSourcePreferencesFromBody(body.sourceWeights);
|
| 625 |
|
| 626 |
if (model) {
|
| 627 |
-
const allowedModels = await resolveTraskModelOptions(trask.
|
| 628 |
if (!allowedModels.some((option) => option.id === model)) {
|
| 629 |
-
throw Object.assign(new Error("model is not available in the current
|
| 630 |
}
|
| 631 |
}
|
| 632 |
|
|
@@ -652,7 +658,7 @@ export const createTraskHttpRouter = <TUser extends TraskHttpUser = TraskHttpUse
|
|
| 652 |
|
| 653 |
if (!persist) {
|
| 654 |
try {
|
| 655 |
-
const result = await trask.
|
| 656 |
...(model ? { model } : {}),
|
| 657 |
...(sourcePreferences ? { sourcePreferences } : {}),
|
| 658 |
});
|
|
@@ -735,7 +741,7 @@ export const createTraskHttpRouter = <TUser extends TraskHttpUser = TraskHttpUse
|
|
| 735 |
|
| 736 |
void (async () => {
|
| 737 |
try {
|
| 738 |
-
const result = await trask.
|
| 739 |
await appendLiveTrace(trask.queryRepository, queryId, {
|
| 740 |
phase: ev.phase,
|
| 741 |
...(ev.detail !== undefined ? { detail: ev.detail } : {}),
|
|
|
|
| 14 |
import type { SearchProvider } from "@openkotor/retrieval";
|
| 15 |
|
| 16 |
import type {
|
| 17 |
+
WebResearchModelOption,
|
| 18 |
+
WebResearchProgressEvent,
|
| 19 |
+
WebResearchQueryHandler,
|
| 20 |
+
WebResearchSourcePreference,
|
| 21 |
} from "@openkotor/trask";
|
| 22 |
|
| 23 |
import { Router, type Request, type Response, type RequestHandler } from "express";
|
|
|
|
| 28 |
|
| 29 |
searchProvider: SearchProvider;
|
| 30 |
|
| 31 |
+
webResearch: WebResearchQueryHandler;
|
| 32 |
|
| 33 |
queryRepository: JsonTraskQueryRepository;
|
| 34 |
|
|
|
|
| 68 |
|
| 69 |
oauthAvailable?: boolean;
|
| 70 |
|
| 71 |
+
/** False when startup LLM probe found no working provider in the fallback chain. */
|
| 72 |
+
|
| 73 |
+
researchAvailable?: boolean;
|
| 74 |
+
|
| 75 |
+
researchUnavailableReason?: string;
|
| 76 |
+
|
| 77 |
discord?: { id: string; username: string; displayName: string };
|
| 78 |
|
| 79 |
}
|
|
|
|
| 190 |
});
|
| 191 |
|
| 192 |
const mapDescriptorsToSourceRecords = (
|
| 193 |
+
sources: WebResearchProgressEvent["sources"],
|
| 194 |
): readonly TraskSourceRecord[] => {
|
| 195 |
if (!sources?.length) return [];
|
| 196 |
return sources.map((s) => ({
|
|
|
|
| 221 |
|
| 222 |
const CANCELED_QUERY_ERROR = "Canceled by newer request.";
|
| 223 |
|
| 224 |
+
/** Built-in model choices; merged ahead of any dynamically listed models. */
const DEFAULT_TRASK_MODEL_OPTIONS: readonly WebResearchModelOption[] = [
  {
    id: "auto",
    label: "Auto",
    provider: "Trask web research",
    recommended: true,
  },
];
|
| 227 |
|
| 228 |
+
const mapModelOption = (option: WebResearchModelOption): WebResearchModelOption => ({
|
| 229 |
id: option.id,
|
| 230 |
label: option.label,
|
| 231 |
provider: option.provider,
|
|
|
|
| 238 |
};
|
| 239 |
|
| 240 |
const resolveTraskModelOptions = async (
|
| 241 |
+
webResearch: WebResearchQueryHandler,
|
| 242 |
+
): Promise<readonly WebResearchModelOption[]> => {
|
| 243 |
+
const dynamicModels = webResearch.listModels ? await webResearch.listModels() : [];
|
| 244 |
const seen = new Set<string>();
|
| 245 |
+
const models: WebResearchModelOption[] = [];
|
| 246 |
for (const option of [...DEFAULT_TRASK_MODEL_OPTIONS, ...dynamicModels]) {
|
| 247 |
const id = option.id.trim();
|
| 248 |
if (!id || seen.has(id) || !isFreeModelId(id)) continue;
|
|
|
|
| 250 |
models.push(mapModelOption({
|
| 251 |
id,
|
| 252 |
label: option.label.trim() || id,
|
| 253 |
+
provider: option.provider.trim() || "WebResearch",
|
| 254 |
...(option.recommended ? { recommended: true } : {}),
|
| 255 |
}));
|
| 256 |
}
|
|
|
|
| 315 |
return model;
|
| 316 |
};
|
| 317 |
|
| 318 |
+
const normalizeSourcePreferencesFromBody = (raw: ScalarOrObject | undefined): WebResearchSourcePreference[] | undefined => {
|
| 319 |
if (raw === undefined || raw === null) return undefined;
|
| 320 |
if (!Array.isArray(raw)) {
|
| 321 |
throw Object.assign(new Error("sourceWeights must be an array when provided."), { status: 422 });
|
| 322 |
}
|
| 323 |
|
| 324 |
return raw
|
| 325 |
+
.map((entry): WebResearchSourcePreference | undefined => {
|
| 326 |
if (!entry || typeof entry !== "object" || Array.isArray(entry)) return undefined;
|
| 327 |
const value = entry as Record<string, unknown>;
|
| 328 |
const url = typeof value.url === "string" ? value.url.trim() : "";
|
|
|
|
| 336 |
enabled: value.enabled !== false,
|
| 337 |
};
|
| 338 |
})
|
| 339 |
+
.filter((entry): entry is WebResearchSourcePreference => entry !== undefined);
|
| 340 |
};
|
| 341 |
|
| 342 |
|
|
|
|
| 400 |
options.auth.requireAuth(async (_req, res, _user) => {
|
| 401 |
try {
|
| 402 |
const trask = requireRuntime();
|
| 403 |
+
const models = await resolveTraskModelOptions(trask.webResearch);
|
| 404 |
res.json({ models: models.map(mapModelOption) });
|
| 405 |
} catch (err) {
|
| 406 |
handleTraskError(res, err as AuthHandlerThrown);
|
|
|
|
| 611 |
let threadId: string;
|
| 612 |
|
| 613 |
let model: string | undefined;
|
| 614 |
+
let sourcePreferences: WebResearchSourcePreference[] | undefined;
|
| 615 |
|
| 616 |
const persist = shouldPersistForUser(user);
|
| 617 |
|
|
|
|
| 630 |
sourcePreferences = normalizeSourcePreferencesFromBody(body.sourceWeights);
|
| 631 |
|
| 632 |
if (model) {
|
| 633 |
+
const allowedModels = await resolveTraskModelOptions(trask.webResearch);
|
| 634 |
if (!allowedModels.some((option) => option.id === model)) {
|
| 635 |
+
throw Object.assign(new Error("model is not available in the current web research model list."), { status: 422 });
|
| 636 |
}
|
| 637 |
}
|
| 638 |
|
|
|
|
| 658 |
|
| 659 |
if (!persist) {
|
| 660 |
try {
|
| 661 |
+
const result = await trask.webResearch.answerQuestion(query, undefined, {
|
| 662 |
...(model ? { model } : {}),
|
| 663 |
...(sourcePreferences ? { sourcePreferences } : {}),
|
| 664 |
});
|
|
|
|
| 741 |
|
| 742 |
void (async () => {
|
| 743 |
try {
|
| 744 |
+
const result = await trask.webResearch.answerQuestion(query, async (ev) => {
|
| 745 |
await appendLiveTrace(trask.queryRepository, queryId, {
|
| 746 |
phase: ev.phase,
|
| 747 |
...(ev.detail !== undefined ? { detail: ev.detail } : {}),
|
packages/trask/src/community-knowledge.test.ts
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import assert from "node:assert/strict";
|
| 2 |
+
import { describe, test } from "node:test";
|
| 3 |
+
|
| 4 |
+
import type { SearchHit } from "@openkotor/retrieval";
|
| 5 |
+
|
| 6 |
+
import {
|
| 7 |
+
buildCommunityKnowledgeDigest,
|
| 8 |
+
filterWebArchiveCitationSources,
|
| 9 |
+
mergeCommunityAndWebSources,
|
| 10 |
+
searchHitsToCommunitySources,
|
| 11 |
+
} from "./community-knowledge.js";
|
| 12 |
+
|
| 13 |
+
/** Build a Discord-flavoured SearchHit fixture whose only varying field is `url`. */
const sampleHit = (url: string): SearchHit => {
  const fixture: SearchHit = {
    sourceId: "approved-discord-knowledge",
    sourceName: "Approved Discord Knowledge",
    kind: "discord",
    title: "#general",
    snippet: "Revan was a Jedi.",
    url,
    score: 3,
    tags: ["discord"],
  };
  return fixture;
};
|
| 23 |
+
|
| 24 |
+
describe("community knowledge helpers", () => {
  // Fixtures shared by the cases below.
  const discordPermalink = "https://discord.com/channels/1/2/3";

  /** Website-kind source descriptor fixture; only `homeUrl` varies between cases. */
  const webSource = (homeUrl: string) => ({
    id: "w1",
    name: "Web",
    kind: "website" as const,
    homeUrl,
    description: "fixture",
    freshnessPolicy: "live",
    approvalScope: "global",
    tags: ["web"],
  });

  test("searchHitsToCommunitySources maps discord permalinks", () => {
    const mapped = searchHitsToCommunitySources([sampleHit(discordPermalink)]);
    assert.equal(mapped.length, 1);
    const only = mapped[0]!;
    assert.equal(only.kind, "discord");
    assert.equal(only.homeUrl, "https://discord.com/channels/1/2/3");
  });

  test("buildCommunityKnowledgeDigest includes permalink lines", () => {
    const digest = buildCommunityKnowledgeDigest([sampleHit(discordPermalink)]);
    assert.match(digest, /Permalink: https:\/\/discord\.com\/channels\/1\/2\/3/);
  });

  test("filterWebArchiveCitationSources excludes discord URLs", () => {
    const web = webSource("https://deadlystream.com");
    const discord = searchHitsToCommunitySources([sampleHit(discordPermalink)])[0]!;
    const kept = filterWebArchiveCitationSources([web, discord]);
    assert.equal(kept.length, 1);
    assert.equal(kept[0]!.homeUrl, web.homeUrl);
  });

  test("mergeCommunityAndWebSources dedupes by URL", () => {
    const web = webSource("https://example.com/a");
    const merged = mergeCommunityAndWebSources([web], [web]);
    assert.equal(merged.length, 1);
  });
});
|
packages/trask/src/community-knowledge.ts
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { SearchHit, SourceDescriptor } from "@openkotor/retrieval";
|
| 2 |
+
import { isDiscordCitationUrl } from "@openkotor/retrieval";
|
| 3 |
+
|
| 4 |
+
/** Identifier prefix for community sources; `searchHitsToCommunitySources` appends `-hit-<n>` to it. */
export const COMMUNITY_SOURCE_ID = "approved-discord-knowledge";
|
| 5 |
+
|
| 6 |
+
/**
 * Convert search hits into Discord-kind source descriptors.
 *
 * Each descriptor gets a 1-based id derived from COMMUNITY_SOURCE_ID, the hit
 * title as its name (or "Discord message" when the title is blank), the hit
 * URL as `homeUrl`, the first 280 characters of the snippet as description,
 * and the hit tags with "community" appended.
 */
export function searchHitsToCommunitySources(hits: readonly SearchHit[]): SourceDescriptor[] {
  const descriptors: SourceDescriptor[] = [];
  let ordinal = 0;
  for (const hit of hits) {
    ordinal += 1;
    const trimmedTitle = hit.title.trim();
    descriptors.push({
      id: `${COMMUNITY_SOURCE_ID}-hit-${ordinal}`,
      name: trimmedTitle || "Discord message",
      kind: "discord",
      homeUrl: hit.url,
      description: hit.snippet.slice(0, 280),
      freshnessPolicy: "live-and-imported",
      approvalScope: "approved-channels",
      tags: [...hit.tags, "community"],
    });
  }
  return descriptors;
}
|
| 18 |
+
|
| 19 |
+
/**
 * Render search hits as a plain-text digest.
 *
 * Returns "" for no hits. Otherwise emits a fixed header line, a blank line,
 * then one three-line entry per hit (numbered title, snippet, permalink),
 * with all lines separated by a single newline.
 */
export function buildCommunityKnowledgeDigest(hits: readonly SearchHit[]): string {
  if (hits.length === 0) {
    return "";
  }

  const header =
    "Community context (lower authority than approved web archives; prefer web sources when they conflict):";
  const entries: string[] = [];
  hits.forEach((hit, position) => {
    entries.push(`[${position + 1}] ${hit.title}\n${hit.snippet}\nPermalink: ${hit.url}`);
  });

  return [header, "", ...entries].join("\n");
}
|
| 35 |
+
|
| 36 |
+
/** Report whether `url` is a community citation; delegates entirely to `isDiscordCitationUrl`. */
export function isCommunityCitationUrl(url: string): boolean {
  const isDiscord = isDiscordCitationUrl(url);
  return isDiscord;
}
|
| 39 |
+
|
| 40 |
+
/**
 * Keep only sources whose trimmed `homeUrl` starts with http:// or https://
 * and is not a community citation URL. Input order is preserved.
 */
export function filterWebArchiveCitationSources(sources: readonly SourceDescriptor[]): SourceDescriptor[] {
  const kept: SourceDescriptor[] = [];
  for (const candidate of sources) {
    const homeUrl = candidate.homeUrl.trim();
    const isHttp = homeUrl.startsWith("http://") || homeUrl.startsWith("https://");
    if (isHttp && !isCommunityCitationUrl(homeUrl)) {
      kept.push(candidate);
    }
  }
  return kept;
}
|
| 47 |
+
|
| 48 |
+
/**
 * Concatenate web sources then community sources, dropping entries with a
 * blank `homeUrl` and any later entry whose trimmed, lower-cased `homeUrl`
 * was already seen. The first occurrence wins and order is preserved.
 */
export function mergeCommunityAndWebSources(
  webSources: readonly SourceDescriptor[],
  communitySources: readonly SourceDescriptor[],
): SourceDescriptor[] {
  // Map iteration follows insertion order, so first-seen entries keep their position.
  const firstByUrl = new Map<string, SourceDescriptor>();
  for (const group of [webSources, communitySources]) {
    for (const candidate of group) {
      const dedupeKey = candidate.homeUrl.trim().toLowerCase();
      if (dedupeKey && !firstByUrl.has(dedupeKey)) {
        firstByUrl.set(dedupeKey, candidate);
      }
    }
  }
  return [...firstByUrl.values()];
}
|
packages/trask/src/index.ts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
| 1 |
-
export * from "./
|
|
|
|
| 2 |
export * from "./discord-reply-format.js";
|
| 3 |
export * from "./proactive-llm.js";
|
|
|
|
|
|
| 1 |
+
export * from "./web-research.js";
|
| 2 |
+
export * from "./web-research-subprocess.js";
|
| 3 |
export * from "./discord-reply-format.js";
|
| 4 |
export * from "./proactive-llm.js";
|
| 5 |
+
export * from "./community-knowledge.js";
|
packages/trask/src/research-wizard.ts
CHANGED
|
@@ -1,1271 +1,18 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
import {
|
| 5 |
-
isTraskApprovedBaseUrl,
|
| 6 |
-
isTraskApprovedResearchUrl,
|
| 7 |
-
sourceUrlMatchesDescriptor,
|
| 8 |
-
traskApprovedResearchBaseHosts,
|
| 9 |
-
traskApprovedResearchSources,
|
| 10 |
-
type SourceDescriptor,
|
| 11 |
-
} from "@openkotor/retrieval";
|
| 12 |
-
|
| 13 |
-
import {
|
| 14 |
-
listHeadlessGptResearcherModels,
|
| 15 |
-
runHeadlessGptResearcher,
|
| 16 |
-
type HeadlessAiResearchWizardModelOption,
|
| 17 |
-
} from "./gpt-researcher-subprocess.js";
|
| 18 |
-
|
| 19 |
-
/** Result of a research query: the answer text plus the source lists gathered for it. */
export interface ResearchWizardAnswer {
  answer: string;
  /** Sources that the final, user-facing answer explicitly cites. */
  approvedSources: ReadonlyArray<SourceDescriptor>;
  /** Candidate-evidence sources retrieved for the answer/rewrite stage. */
  retrievedSources: ReadonlyArray<SourceDescriptor>;
  /** Allowlisted URLs touched by the headless researcher while gathering evidence. */
  visitedUrls: ReadonlyArray<string>;
}
|
| 28 |
-
|
| 29 |
-
/** A ResearchWizardAnswer that additionally carries the normalized report text. */
export interface ResearchWizardBriefAnswer extends ResearchWizardAnswer {
  /** Normalized research report text, used for proactive semantic gating. */
  researchReport: string;
}
|
| 33 |
-
|
| 34 |
-
/**
 * Progress notification with fine-grained phases, intended for Holocron
 * clients that poll thread history.
 */
export interface ResearchWizardProgressEvent {
  /** Which stage of the pipeline produced this event. */
  phase: "gather" | "report" | "sources" | "compose";
  /** Optional human-readable detail for the phase. */
  detail?: string;
  /** Optional sources associated with this event. */
  sources?: ReadonlyArray<SourceDescriptor>;
}
|
| 40 |
-
|
| 41 |
-
/** Optional per-query settings accepted by `answerQuestion`. */
export interface ResearchWizardQueryOptions {
  /**
   * Preferred ai-researchwizard model id,
   * e.g. `openrouter:openrouter/auto` or `litellm:moonshotai/kimi-k2`.
   */
  model?: string;
  /**
   * Optional per-request source enablement and weight hints coming from
   * Holocron's Source Prioritization dialog.
   */
  sourcePreferences?: ReadonlyArray<ResearchWizardSourcePreference>;
}
|
| 47 |
-
|
| 48 |
-
/** One per-request source preference entry: a URL with a weight and an on/off flag. */
export interface ResearchWizardSourcePreference {
  /** Optional display name for the source. */
  name?: string;
  /** URL identifying the source. */
  url: string;
  /** Weight hint for the source (scale not defined in this file section). */
  weight: number;
  /** Whether the source is enabled for this request. */
  enabled: boolean;
}
|
| 54 |
-
|
| 55 |
-
/** Model option exposed by this module; adds no members to HeadlessAiResearchWizardModelOption. */
export interface ResearchWizardModelOption extends HeadlessAiResearchWizardModelOption {}
|
| 56 |
-
|
| 57 |
-
/**
 * Structural type for adapters that only need full Q&A
 * (e.g. Trask HTTP `/ask`).
 */
export interface ResearchWizardQueryHandler {
  /** Answer `query`, optionally reporting progress and honouring per-query options. */
  answerQuestion(
    query: string,
    onProgress?: (event: ResearchWizardProgressEvent) => void,
    options?: ResearchWizardQueryOptions,
  ): Promise<ResearchWizardAnswer>;

  /** Optional: list the model options this handler can offer. */
  listModels?(): Promise<ReadonlyArray<ResearchWizardModelOption>>;
}
|
| 66 |
-
|
| 67 |
-
/** Built-in model list containing the single recommended "auto" option. */
const DEFAULT_RESEARCH_WIZARD_MODELS: readonly ResearchWizardModelOption[] = [
  {
    id: "auto",
    label: "Auto",
    provider: "ResearchWizard fallback",
    recommended: true,
  },
];
|
| 70 |
-
|
| 71 |
-
/**
 * Shape of the researcher response consumed in this file: an optional report
 * string and an optional bag of URL lists, each of which may be absent or null.
 */
interface ResearchWizardResponsePayload {
  report?: string | null;
  research_information?: {
    source_urls?: ReadonlyArray<string> | null;
    cited_urls?: ReadonlyArray<string> | null;
    retrieved_urls?: ReadonlyArray<string> | null;
    visited_urls?: ReadonlyArray<string> | null;
    query_domains?: ReadonlyArray<string> | null;
    allowed_url_prefixes?: ReadonlyArray<string> | null;
    rejected_source_urls?: ReadonlyArray<string> | null;
  };
}
|
| 83 |
-
|
| 84 |
-
/** The research task is simply the user's query with surrounding whitespace removed. */
const buildResearchTask = (query: string): string => query.trim();
|
| 87 |
-
|
| 88 |
-
/**
 * Build the newline-joined instruction prompt for a full, Discord-style answer
 * (lead-in line, "Requirements:" header, then the bullet rules).
 */
const buildCustomPrompt = (): string => {
  const promptLines: readonly string[] = [
    "Answer the user's question as a Discord-native KOTOR assistant reply using only the provided research context.",
    "Requirements:",
    "- Lead with the answer, not an introduction.",
    "- Sound direct, practical, and helpful.",
    "- Keep the answer concise: at most 3 short paragraphs or 5 compact bullets total before sources.",
    "- Do not describe your research process, retrieval steps, indexing, backend systems, or source policy unless the user explicitly asks.",
    "- Include inline numeric citations like [1] tied to concrete claims.",
    ' - End with the exact heading "Sources" on its own line.',
    "- Under Sources, list only the sources you cited, each on its own numbered line in the format: 1. Source Name - URL",
    "- Do not add markdown headings other than the final Sources heading.",
  ];
  return promptLines.join("\n");
};
|
| 102 |
-
|
| 103 |
-
/**
 * Build the newline-joined instruction prompt for the compact research digest
 * variant (lead-in line, "Constraints:" header, then the bullet rules).
 */
const buildCustomPromptBrief = (): string => {
  const promptLines: readonly string[] = [
    "Produce a compact research digest for Star Wars: Knights of the Old Republic (KOTOR 1/2) modding questions.",
    "Constraints:",
    "- Stay under ~900 words; bullet key facts when possible.",
    "- Do not narrate tooling, retrieval steps, or how you searched.",
    "- Prefer actionable answers over background essays.",
    "- Include inline numeric citations like [1] tied to concrete claims.",
    ' - End with the exact heading "Sources" on its own line.',
    "- Under Sources, list only cited sources as numbered lines: 1. Source Name - URL",
  ];
  return promptLines.join("\n");
};
|
| 115 |
-
|
| 116 |
-
/**
 * Canonicalize a URL string for comparison: strip surrounding whitespace, then
 * drop any trailing slashes.
 *
 * Trimming must happen first; otherwise trailing whitespace hides the final
 * slash from the `/+$` pattern and "https://a.com/ " would keep its slash.
 */
const normalizeUrl = (value: string): string => value.trim().replace(/\/+$/, "");
|
| 117 |
-
|
| 118 |
-
/**
 * Find every `scheme://...` token in `value`, strip trailing sentence
 * punctuation from each, and return the distinct results in first-seen order.
 */
const extractUrls = (value: string): string[] => {
  const distinct = new Set<string>();
  const found = value.match(/[a-z][a-z0-9+.-]*:\/\/[^\s)>\]]+/giu);
  for (const token of found ?? []) {
    distinct.add(token.replace(/[.,;:!?]+$/, ""));
  }
  return Array.from(distinct);
};
|
| 122 |
-
|
| 123 |
-
/**
 * Extract URLs from the text following the first "Sources"/"References"
 * heading line (optionally prefixed by markdown `#`s). When no such heading
 * exists, URLs are extracted from the whole text. CRLF is normalized to LF first.
 */
const extractSourceSectionUrls = (value: string): string[] => {
  const text = value.replace(/\r\n/g, "\n");
  const heading = text.match(/\n(?:#{1,6}\s*)?(?:Sources|References)\s*\n/i);

  if (heading && heading.index !== undefined) {
    const sectionStart = heading.index + heading[0].length;
    return extractUrls(text.slice(sectionStart));
  }
  return extractUrls(text);
};
|
| 133 |
-
|
| 134 |
-
/**
 * Short host label for a URL: the lower-cased hostname without a leading
 * "www.". If the string does not parse as a URL, its first 48 characters
 * are returned instead.
 */
const hostnameHint = (url: string): string => {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return url.slice(0, 48);
  }
  const withoutWww = parsed.hostname.replace(/^www\./, "");
  return withoutWww.toLowerCase();
};
|
| 141 |
-
|
| 142 |
-
/**
 * Normalize each URL, discard empties, and de-duplicate while keeping
 * first-seen order (stable ordering for Holocron pulses).
 */
const uniqueUrlsPreserveOrder = (urls: readonly string[]): string[] => {
  // A Set iterates in insertion order and ignores repeat additions.
  const ordered = new Set<string>();
  for (const candidate of urls) {
    const normalized = normalizeUrl(candidate);
    if (normalized) {
      ordered.add(normalized);
    }
  }
  return [...ordered];
};
|
| 154 |
-
|
| 155 |
-
/** Return the string entries of `values`, or an empty list when `values` is not an array. */
const payloadUrls = (values: readonly string[] | null | undefined): string[] => {
  if (!Array.isArray(values)) {
    return [];
  }
  return values.filter((entry): entry is string => typeof entry === "string");
};
|
| 157 |
-
|
| 158 |
-
/**
 * A URL is allowed when it is a public web citation URL and at least one of:
 * it matches a descriptor in `sourcePool`, it is an approved research URL for
 * that pool, or it is an approved base URL. Checks run in that order.
 */
const isAllowedSourceUrl = (url: string, sourcePool: readonly SourceDescriptor[]): boolean =>
  isPublicWebCitationUrl(url) &&
  (sourcePool.some((descriptor) => sourceUrlMatchesDescriptor(url, descriptor)) ||
    isTraskApprovedResearchUrl(url, sourcePool) ||
    isTraskApprovedBaseUrl(url));
|
| 164 |
-
|
| 165 |
-
/**
 * Visited URLs reported in the ai-researchwizard payload (used for Holocron
 * live facet pings): normalized, de-duplicated, then restricted to allowed URLs.
 */
const collectVisitedUrlsFromPayload = (
  payload: ResearchWizardResponsePayload,
  approvedSources: readonly SourceDescriptor[],
): string[] => {
  const visited = payloadUrls(payload.research_information?.visited_urls);
  const distinctVisited = uniqueUrlsPreserveOrder(visited);
  return distinctVisited.filter((candidate) => isAllowedSourceUrl(candidate, approvedSources));
};
|
| 175 |
-
|
| 176 |
-
/**
 * Rejected source URLs reported in the payload, normalized and de-duplicated
 * in first-seen order. Returns an empty list when the field is absent, null,
 * or not an array.
 *
 * Uses the shared `payloadUrls` helper for the array/string filtering so this
 * collector handles payload lists the same way as the other collectors.
 */
const collectRejectedUrlsFromPayload = (payload: ResearchWizardResponsePayload): string[] =>
  uniqueUrlsPreserveOrder(payloadUrls(payload.research_information?.rejected_source_urls));
|
| 182 |
-
|
| 183 |
-
/** Upper bound on the number of "gather" progress events emitted by emitArchiveProbeEvents. */
const MAX_ARCHIVE_PROBE_EVENTS = 28;
|
| 184 |
-
|
| 185 |
-
/**
 * Emit one "gather" progress event per visited, allowed URL in the payload,
 * capped at MAX_ARCHIVE_PROBE_EVENTS.
 *
 * When the URL falls under an approved source, the event is labelled
 * `Facet · <source name>` and carries that source; otherwise it is labelled
 * `Touch · <host>` with no sources. Does nothing when `onProgress` is absent.
 *
 * Every URL produces exactly one event, so capping the URL list directly is
 * equivalent to over-fetching and counting emissions.
 */
const emitArchiveProbeEvents = (
  payload: ResearchWizardResponsePayload,
  approvedSources: readonly SourceDescriptor[],
  onProgress?: (event: ResearchWizardProgressEvent) => void,
): void => {
  if (!onProgress) return;

  const urls = collectVisitedUrlsFromPayload(payload, approvedSources).slice(0, MAX_ARCHIVE_PROBE_EVENTS);

  for (const url of urls) {
    const matched = matchApprovedSource(url, approvedSources);
    if (matched) {
      onProgress({ phase: "gather", detail: `Facet · ${matched.name}`, sources: [matched] });
    } else {
      onProgress({ phase: "gather", detail: `Touch · ${hostnameHint(url)}` });
    }
  }
};
|
| 207 |
-
|
| 208 |
-
/**
 * Return the first approved source whose normalized `homeUrl` equals the
 * normalized `url` or is a path prefix of it (`homeUrl + "/"`); undefined
 * when none matches.
 */
const matchApprovedSource = (
  url: string,
  approvedSources: readonly SourceDescriptor[],
): SourceDescriptor | undefined => {
  const target = normalizeUrl(url);

  for (const descriptor of approvedSources) {
    const root = normalizeUrl(descriptor.homeUrl);
    if (target === root || target.startsWith(`${root}/`)) {
      return descriptor;
    }
  }
  return undefined;
};
|
| 219 |
-
|
| 220 |
-
/**
 * Human-readable label for `url` relative to `source`.
 *
 * Returns the bare source name when the URL path equals the source's home
 * path, when no path label can be derived, or when either URL fails to parse
 * or decode. Otherwise returns `<name>: <label>` where the label is the last
 * two path segments (after dropping a leading `blob/<ref>/`, `tree/<ref>/`,
 * or `wiki/`), with runs of `-`/`_` turned into spaces. A `#L<n>` or
 * `#L<n>-L<m>` fragment is appended verbatim.
 */
const sourceUrlLabel = (source: SourceDescriptor, url: string): string => {
  try {
    const target = new URL(url);
    const home = new URL(source.homeUrl);
    const targetPath = decodeURIComponent(target.pathname.replace(/\/+$/u, ""));
    const homePath = decodeURIComponent(home.pathname.replace(/\/+$/u, ""));
    if (targetPath === homePath) {
      return source.name;
    }

    let remainder = targetPath;
    if (targetPath.startsWith(`${homePath}/`)) {
      remainder = targetPath.slice(homePath.length + 1);
    }

    const tailSegments = remainder
      .replace(/^blob\/[^/]+\//u, "")
      .replace(/^tree\/[^/]+\//u, "")
      .replace(/^wiki\//u, "")
      .split("/")
      .filter(Boolean)
      .slice(-2);
    const label = tailSegments.join("/").replace(/[-_]+/gu, " ").trim();
    if (!label) {
      return source.name;
    }

    const hasLineAnchor = Boolean(target.hash) && /^#L\d+(?:-L\d+)?$/iu.test(target.hash);
    const anchor = hasLineAnchor ? target.hash : "";
    return `${source.name}: ${label}${anchor}`;
  } catch {
    return source.name;
  }
};
|
| 245 |
-
|
| 246 |
-
/**
 * Build a descriptor that points at the exact `url`.
 *
 * If the URL falls under an approved catalog source, that source is cloned
 * with `homeUrl` set to the normalized URL, a label from `sourceUrlLabel`,
 * and an id suffixed with the URL unless it is the catalog root itself.
 * Otherwise, if the URL is on an approved base host, a generic "website"
 * descriptor named after the host is synthesized. Returns undefined when
 * neither applies.
 */
const exactSourceFromUrl = (url: string, approvedSources: readonly SourceDescriptor[]): SourceDescriptor | undefined => {
  const normalizedUrl = normalizeUrl(url);
  const catalogEntry = matchApprovedSource(url, approvedSources);

  if (!catalogEntry) {
    if (!isTraskApprovedBaseUrl(url)) return undefined;
    const host = hostnameHint(url);
    return {
      id: `approved-web:${normalizedUrl}`,
      name: host,
      kind: "website",
      homeUrl: normalizedUrl,
      description: `Approved web source (${host})`,
      freshnessPolicy: "live web research",
      approvalScope: "approved research host",
      tags: [host],
    };
  }

  const pointsAtCatalogRoot = normalizedUrl === normalizeUrl(catalogEntry.homeUrl);
  return {
    ...catalogEntry,
    id: pointsAtCatalogRoot ? catalogEntry.id : `${catalogEntry.id}:${normalizedUrl}`,
    name: sourceUrlLabel(catalogEntry, normalizedUrl),
    homeUrl: normalizedUrl,
  };
};
|
| 271 |
-
|
| 272 |
-
/** True when the normalized `url` equals the normalized `homeUrl` of any approved source. */
const isCatalogRootUrl = (url: string, approvedSources: readonly SourceDescriptor[]): boolean => {
  const target = normalizeUrl(url);
  for (const descriptor of approvedSources) {
    if (normalizeUrl(descriptor.homeUrl) === target) {
      return true;
    }
  }
  return false;
};
|
| 276 |
-
|
| 277 |
-
/**
 * Turn raw URLs into at most six distinct source descriptors.
 *
 * URLs are filtered to allowed ones, normalized and de-duplicated in
 * first-seen order. If any candidate is more specific than a catalog root,
 * bare catalog-root URLs are skipped in favour of the precise ones. Each
 * remaining URL is resolved via `exactSourceFromUrl`; unresolved URLs and
 * descriptors with an already-seen `homeUrl` are dropped.
 *
 * Catalog roots and seen URLs are tracked in Sets so each lookup is constant
 * time, and the loop stops as soon as the cap is reached.
 */
const materializeSourcesFromUrls = (
  urls: readonly string[],
  sourcePool: readonly SourceDescriptor[],
): readonly SourceDescriptor[] => {
  const maxSources = 6;

  const candidateUrls = uniqueUrlsPreserveOrder(
    urls.filter((url) => isAllowedSourceUrl(url, sourcePool)),
  );

  // Same test as isCatalogRootUrl, with the pool normalized once up front.
  const catalogRoots = new Set(sourcePool.map((source) => normalizeUrl(source.homeUrl)));
  const isRoot = (url: string): boolean => catalogRoots.has(normalizeUrl(url));
  const hasPreciseUrl = candidateUrls.some((url) => !isRoot(url));

  const seenHomeUrls = new Set<string>();
  const matched: SourceDescriptor[] = [];

  for (const url of candidateUrls) {
    if (matched.length >= maxSources) break;
    if (hasPreciseUrl && isRoot(url)) continue;

    const source = exactSourceFromUrl(url, sourcePool);
    if (!source) continue;

    const key = normalizeUrl(source.homeUrl);
    if (seenHomeUrls.has(key)) continue;
    seenHomeUrls.add(key);
    matched.push(source);
  }

  return matched;
};
|
| 299 |
-
|
| 300 |
-
/**
 * Sources cited by a report: URLs from the report's sources section first,
 * then the payload's `cited_urls`, then its `source_urls`, materialized
 * against the approved sources.
 */
const collectCitedSources = (
  report: string,
  approvedSources: readonly SourceDescriptor[],
  payload: ResearchWizardResponsePayload,
): readonly SourceDescriptor[] => {
  const details = payload.research_information;
  const orderedUrls: string[] = extractSourceSectionUrls(report).concat(
    payloadUrls(details?.cited_urls),
    payloadUrls(details?.source_urls),
  );
  return materializeSourcesFromUrls(orderedUrls, approvedSources);
};
|
| 312 |
-
|
| 313 |
-
/**
 * Sources retrieved as evidence: the payload's `retrieved_urls`, `cited_urls`
 * and `source_urls` (in that order) followed by URLs from the report's
 * sources section, materialized against the approved sources.
 */
const collectRetrievedSources = (
  report: string,
  approvedSources: readonly SourceDescriptor[],
  payload: ResearchWizardResponsePayload,
): readonly SourceDescriptor[] => {
  const details = payload.research_information;
  const orderedUrls: string[] = payloadUrls(details?.retrieved_urls).concat(
    payloadUrls(details?.cited_urls),
    payloadUrls(details?.source_urls),
    extractSourceSectionUrls(report),
  );
  return materializeSourcesFromUrls(orderedUrls, approvedSources);
};
|
| 326 |
-
|
| 327 |
-
/** Materialize the URLs found in the sources section of `text` against `sourcePool`. */
const collectCitedSourcesFromText = (
  text: string,
  sourcePool: readonly SourceDescriptor[],
): readonly SourceDescriptor[] => {
  const sectionUrls = extractSourceSectionUrls(text);
  return materializeSourcesFromUrls(sectionUrls, sourcePool);
};
|
| 331 |
-
|
| 332 |
-
/**
 * Clean a raw research report:
 *  1. drop the first top-level `# ...` title line,
 *  2. drop a "## Table of Contents" section up to (not including) the next
 *     `## ` heading, a `Sources` line, a `# ` heading, or the end of the text,
 *  3. collapse runs of three or more newlines to a blank line,
 *  4. trim surrounding whitespace.
 */
const normalizeReport = (value: string): string => {
  return value
    .replace(/^#\s+.*$/m, "")
    // The end-of-input alternative is `(?![\s\S])`, not `$`: under the `m` flag
    // `$` also matches at the end of the heading line itself, which made the lazy
    // match stop immediately and left the table-of-contents entries in place.
    .replace(/^##\s+Table of Contents[\s\S]*?(?=^##\s+|^Sources\s*$|^#\s+|(?![\s\S]))/im, "")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
};
|
| 339 |
-
|
| 340 |
-
/**
 * Render the trailing sources block: a "Sources" heading line followed by one
 * `N. Name - URL` line per source, numbered from 1.
 */
const formatSourcesSection = (sources: readonly SourceDescriptor[]): string => {
  const rendered: string[] = ["Sources"];
  sources.forEach((entry, position) => {
    rendered.push(`${position + 1}. ${entry.name} - ${entry.homeUrl}`);
  });
  return rendered.join("\n");
};
|
| 346 |
-
|
| 347 |
-
/**
 * Count the distinct public web citation URLs across the payload's cited,
 * retrieved, visited and source URL lists.
 */
const countPayloadWebUrls = (payload: ResearchWizardResponsePayload): number => {
  const details = payload.research_information;
  const combined = [
    ...payloadUrls(details?.cited_urls),
    ...payloadUrls(details?.retrieved_urls),
    ...payloadUrls(details?.visited_urls),
    ...payloadUrls(details?.source_urls),
  ];

  let publicCount = 0;
  for (const candidate of uniqueUrlsPreserveOrder(combined)) {
    if (isPublicWebCitationUrl(candidate)) {
      publicCount += 1;
    }
  }
  return publicCount;
};
|
| 357 |
-
|
| 358 |
-
/**
 * Decide whether a report is a synthesis failure rather than a real answer.
 *
 * A report that opens with the "I could not complete live archive synthesis"
 * sentence is always a failure. Otherwise, when the payload holds at least
 * MIN_HOLOCRON_WEB_CITATIONS public web URLs the report is accepted; with
 * fewer, a report opening with the boilerplate "- <x> is an approved archive
 * page that may answer questions about" bullet is also treated as a failure.
 */
const isSynthesisFailureReport = (report: string, payload: ResearchWizardResponsePayload): boolean => {
  const trimmed = report.trim();
  const hasEnoughWebUrls = countPayloadWebUrls(payload) >= MIN_HOLOCRON_WEB_CITATIONS;

  const admitsFailure = /^i could not complete live archive synthesis\b/iu.test(trimmed);
  if (admitsFailure) {
    return true;
  }
  if (hasEnoughWebUrls) {
    return false;
  }
  return /^-\s+\S+.*is an approved archive page that may answer questions about/iu.test(trimmed);
};
|
| 374 |
-
|
| 375 |
-
/**
 * Fallback reply used when no grounded answer could be written.
 *
 * With no sources, returns a fixed "could not complete" sentence. Otherwise
 * returns a two-line notice naming the query topic (trailing question marks
 * removed; "this question" when the query is blank), a blank line, and the
 * formatted sources section.
 */
const sourceOnlyFallbackAnswer = (query: string, sources: readonly SourceDescriptor[]): string => {
  if (sources.length === 0) {
    return "I could not complete live archive synthesis for this question right now.";
  }

  const strippedQuery = query.trim().replace(/\?+$/u, "");
  const topic = strippedQuery || "this question";
  const notice = `I found candidate sources for ${topic}, but I could not support a grounded answer from the retrieved evidence.`;
  const advice = "Review the sources below or try a narrower wording.";

  return `${notice}\n${advice}\n\n${formatSourcesSection(sources)}`;
};
|
| 385 |
-
|
| 386 |
-
/** 15 seconds in milliseconds; presumably the default rewrite timeout — usage is not visible in this section, confirm at the call site. */
const DEFAULT_REWRITE_TIMEOUT_MS = 15_000;
|
| 387 |
-
/** Presumably the cap on rewrite attempts — usage is not visible in this section, confirm at the call site. */
const MAX_REWRITE_ATTEMPTS = 2;
|
| 388 |
-
|
| 389 |
-
/**
 * Reduce a preferred model id to the bare model name.
 *
 * Blank or missing input yields undefined. A leading `litellm:` or
 * `openrouter:` prefix is removed (only one, checked in that order) and the
 * remainder trimmed; an empty remainder yields undefined. Any other id is
 * returned trimmed.
 */
const normalizePreferredRewriteModel = (model: string | undefined): string | undefined => {
  const requested = model?.trim();
  if (!requested) return undefined;

  for (const prefix of ["litellm:", "openrouter:"]) {
    if (requested.startsWith(prefix)) {
      const bare = requested.slice(prefix.length).trim();
      return bare || undefined;
    }
  }
  return requested;
};
|
| 396 |
-
|
| 397 |
-
/**
 * Settle with `promise`'s outcome, or reject with
 * `rewrite timed out after <timeoutMs>ms` if it has not settled within
 * `timeoutMs` milliseconds. The timer is cleared once either side settles.
 */
const withTimeout = async <T>(promise: Promise<T>, timeoutMs: number): Promise<T> => {
  let timer: ReturnType<typeof setTimeout> | undefined;

  const deadline = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`rewrite timed out after ${timeoutMs}ms`));
    }, timeoutMs);
  });

  try {
    return await Promise.race([promise, deadline]);
  } finally {
    clearTimeout(timer);
  }
};
|
| 415 |
-
|
| 416 |
-
/**
 * Condenses a raw research report into a Discord-sized answer without calling a model.
 *
 * Everything from the first "Sources"/"References" heading onward is dropped,
 * Markdown links to approved sources become numeric citations, headings, table
 * rows and emphasis asterisks are stripped, and at most two leading paragraphs
 * (about 900 characters) are kept. The formatted source list is appended.
 *
 * @param query - The user's question; only used for the fallback messages.
 * @param report - Raw research report text.
 * @param sources - Sources that may be cited; list position defines the citation number.
 * @returns The condensed answer plus sources section. When `sources` is empty the
 *   degraded message is returned; when the report is the synthesis-failure message
 *   or has no usable body, the source-only fallback is returned.
 */
const fallbackDiscordRewrite = (
  query: string,
  report: string,
  sources: readonly SourceDescriptor[],
): string => {
  if (sources.length === 0) {
    return degradedAnswerFallback(query, sources);
  }
  const normalized = normalizeReport(report);
  if (/^i could not complete live archive synthesis\b/iu.test(normalized)) {
    return sourceOnlyFallbackAnswer(query, sources);
  }

  // Citation numbers are 1-based positions in `sources`, keyed by normalized home URL.
  const sourceIndexByUrl = new Map<string, number>(
    sources.map((source, index) => [normalizeUrl(source.homeUrl), index + 1]),
  );

  const [bodyOnlyCandidate = ""] = normalized.split(/\n(?:#{1,6}\s*)?(?:Sources|References)\s*\n/i, 1);
  const bodyOnly = bodyOnlyCandidate
    .replace(/\[([^\]]+)\]\((https?:\/\/[^)]+)\)/g, (_match, text: string, url: string) => {
      // Links to unapproved URLs keep their text but lose the URL and get no citation.
      const matchedSource = matchApprovedSource(url, sources);
      const citationIndex = matchedSource ? sourceIndexByUrl.get(normalizeUrl(matchedSource.homeUrl)) : undefined;
      return citationIndex ? `${text} [${citationIndex}]` : text;
    })
    .replace(/^#{1,6}\s+.*$/gm, "")
    .replace(/^\|.*\|$/gm, "")
    .replace(/\*+/g, "")
    .replace(/\n{3,}/g, "\n\n")
    .trim();

  // A report made only of headings/tables leaves nothing to summarize; without this
  // guard the answer would be a bare "[1]" marker above the source list.
  if (!bodyOnly) {
    return sourceOnlyFallbackAnswer(query, sources);
  }

  const paragraphs = bodyOnly
    .split(/\n{2,}/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 0);

  const selected: string[] = [];
  let totalLength = 0;
  for (const paragraph of paragraphs) {
    if (selected.length >= 2) break;
    // The first paragraph is always kept, even if it alone exceeds the budget.
    if (totalLength + paragraph.length > 900 && selected.length > 0) break;
    selected.push(paragraph);
    totalLength += paragraph.length;
  }

  let summary = selected.join("\n\n").trim();

  // Guarantee at least one inline citation so the answer points at the source list.
  if (!/\[\d+\]/.test(summary)) {
    summary = `${summary} [1]`;
  }

  return `${summary}\n\n${formatSourcesSection(sources)}`;
};
|
| 473 |
-
|
| 474 |
-
/**
 * Model-free fallback for the brief reply: keeps only the first paragraph of the
 * report body (capped at 420 characters) and appends the formatted source list.
 *
 * The body is everything before the first "Sources"/"References" heading, with
 * Markdown links to approved sources turned into numeric citations and headings
 * and emphasis asterisks removed.
 *
 * @param query - The user's question; only used for the fallback messages.
 * @param report - Raw research report text.
 * @param sources - Sources that may be cited; list position defines the citation number.
 * @returns The short answer plus sources section. When `sources` is empty the
 *   degraded message is returned; when the report is the synthesis-failure message
 *   or has no usable body, the source-only fallback is returned.
 */
const fallbackDiscordBrief = (query: string, report: string, sources: readonly SourceDescriptor[]): string => {
  if (sources.length === 0) {
    return degradedAnswerFallback(query, sources);
  }
  const normalized = normalizeReport(report);
  if (/^i could not complete live archive synthesis\b/iu.test(normalized)) {
    return sourceOnlyFallbackAnswer(query, sources);
  }

  // Citation numbers are 1-based positions in `sources`, keyed by normalized home URL.
  const sourceIndexByUrl = new Map<string, number>(
    sources.map((source, index) => [normalizeUrl(source.homeUrl), index + 1]),
  );

  const [bodyOnlyCandidate = ""] = normalized.split(/\n(?:#{1,6}\s*)?(?:Sources|References)\s*\n/i, 1);
  const bodyOnly = bodyOnlyCandidate
    .replace(/\[([^\]]+)\]\((https?:\/\/[^)]+)\)/g, (_match, text: string, url: string) => {
      const matchedSource = matchApprovedSource(url, sources);
      const citationIndex = matchedSource ? sourceIndexByUrl.get(normalizeUrl(matchedSource.homeUrl)) : undefined;
      return citationIndex ? `${text} [${citationIndex}]` : text;
    })
    .replace(/^#{1,6}\s+.*$/gm, "")
    .replace(/\*+/g, "")
    .replace(/\n{3,}/g, "\n\n")
    .trim();

  // Nothing left to summarize: return the source-only message instead of a bare "[1]".
  if (!bodyOnly) {
    return sourceOnlyFallbackAnswer(query, sources);
  }

  // `bodyOnly` is trimmed and non-empty here, so its first paragraph is non-empty too.
  const firstChunk = bodyOnly.split(/\n{2,}/)[0]?.trim() ?? bodyOnly;
  let summary = firstChunk.slice(0, 420).trim();

  // Guarantee at least one inline citation so the reply points at the source list.
  if (!/\[\d+\]/.test(summary)) {
    summary = `${summary} [1]`;
  }

  return `${summary}\n\n${formatSourcesSection(sources)}`;
};
|
| 512 |
-
|
| 513 |
-
/**
 * Returns the fixed message used when no answer can be synthesized.
 * Both parameters are accepted for call-site symmetry and are not read.
 */
const degradedAnswerFallback = (_query: string, _approvedSources: readonly SourceDescriptor[]): string =>
  "I could not complete live archive synthesis for this question right now.";
|
| 516 |
-
|
| 517 |
-
/**
 * Parses a URL string after trimming whitespace and trailing slashes.
 *
 * @param url - Raw URL text from a preference or source descriptor.
 * @returns The parsed `URL`, or `undefined` when the text cannot be parsed.
 */
const normalizePreferenceUrl = (url: string): URL | undefined => {
  let parsed: URL | undefined;
  try {
    const cleaned = url.trim().replace(/\/+$/, "");
    parsed = new URL(cleaned);
  } catch {
    parsed = undefined;
  }
  return parsed;
};
|
| 524 |
-
|
| 525 |
-
/**
 * Decides whether a user source preference applies to an approved source.
 *
 * A preference matches when both URLs parse, their hosts are equal (ignoring a
 * leading `www.` and case), and the preference path is empty, equal to the
 * source path, or a parent directory of it. Otherwise the preference matches
 * when its name equals the source name after trimming and lowercasing.
 *
 * @param preference - Preference entry carrying a URL and an optional name.
 * @param source - Approved source to test.
 * @returns `true` when the preference targets this source.
 */
const preferenceMatchesSource = (preference: ResearchWizardSourcePreference, source: SourceDescriptor): boolean => {
  const preferenceUrl = normalizePreferenceUrl(preference.url);
  const sourceUrl = normalizePreferenceUrl(source.homeUrl);

  if (preferenceUrl && sourceUrl) {
    const preferenceHost = preferenceUrl.hostname.replace(/^www\./, "").toLowerCase();
    const sourceHost = sourceUrl.hostname.replace(/^www\./, "").toLowerCase();
    // A bare "/" pathname strips down to "", so a host-only preference is
    // covered by the `preferencePath === ""` case below.
    const preferencePath = preferenceUrl.pathname.replace(/\/+$/, "");
    const sourcePath = sourceUrl.pathname.replace(/\/+$/, "");

    const pathMatches =
      preferencePath === "" ||
      sourcePath === preferencePath ||
      // The trailing "/" keeps "/wiki" from matching "/wikipedia".
      sourcePath.startsWith(`${preferencePath}/`);

    if (preferenceHost === sourceHost && pathMatches) {
      return true;
    }
  }

  // URL match failed or was impossible: fall back to a case-insensitive name match.
  const preferenceName = preference.name?.trim().toLowerCase();
  return Boolean(preferenceName && preferenceName === source.name.trim().toLowerCase());
};
|
| 547 |
-
|
| 548 |
-
/**
 * Filters and reorders approved sources according to user preferences.
 *
 * Each source takes the first preference that matches it. Sources whose
 * matched preference is not enabled are dropped; the rest are ordered by
 * descending weight (1 when unmatched or when the weight is not finite),
 * with the original position breaking ties.
 *
 * @param approvedSources - Sources in their default order.
 * @param preferences - Optional preference entries; when absent or empty the
 *   input list is returned as-is.
 * @returns The enabled sources in preference order.
 */
const applySourcePreferences = (
  approvedSources: readonly SourceDescriptor[],
  preferences?: readonly ResearchWizardSourcePreference[],
): readonly SourceDescriptor[] => {
  if (!preferences?.length) return approvedSources;

  const kept: { source: SourceDescriptor; index: number; weight: number }[] = [];
  approvedSources.forEach((source, index) => {
    const matched = preferences.find((entry) => preferenceMatchesSource(entry, source));
    if (matched && !matched.enabled) return;
    const weight = matched && Number.isFinite(matched.weight) ? matched.weight : 1;
    kept.push({ source, index, weight });
  });

  kept.sort((left, right) => right.weight - left.weight || left.index - right.index);
  return kept.map((entry) => entry.source);
};
|
| 570 |
-
|
| 571 |
-
/** Coarse query categories produced by `classifyQueryIntent`. */
type ResearchQueryIntent =
  | "tooling"
  | "technical"
  | "lore"
  | "general";

/** Lower-case substrings that mark a query as "tooling" (checked first by `classifyQueryIntent`). */
const TOOLING_QUERY_TERMS = [
  "mdlops", "mdledit", "kotormax", "kotorblender", "pykotor",
  "xoreos", "reone", "tslpatcher", "toolchain", "modding",
  "tool", "script", "gff", "2da", "tlk",
  "nss", "ncs", "utc", "uti", "mdl",
  "mdx", "texture", "convert", "blender", "3ds",
];

/** Lower-case substrings that mark a query as "technical" (checked after the tooling terms). */
const TECHNICAL_QUERY_TERMS = [
  "widescreen", "resolution", "hud", "screen", "crash",
  "compatibility", "steam", "windows", "linux", "mac",
  "save", "saves", "install", "launcher", "driver",
  "movies", "cutscene", "graphics", "aspect",
];

/** Lower-case substrings that mark a query as "lore" (checked last). */
const LORE_QUERY_TERMS = [
  "bastila", "revan", "malak", "shan", "jedi",
  "sith", "rakata", "star forge", "temple summit", "companion",
  "romance", "story", "lore",
];

/** Source ids that `routeSourcesForQuery` removes for tooling/technical queries and moves first for lore queries. */
const LORE_SOURCE_IDS = new Set([
  "wikipedia-kotor",
  "strategywiki-kotor",
]);
|
| 640 |
-
|
| 641 |
-
/**
 * Reports whether the lower-cased query contains any of the given terms as a substring.
 * Terms are compared as given, so they are expected to be lower-case already.
 */
const queryIncludesAny = (query: string, terms: readonly string[]): boolean => {
  const haystack = query.toLowerCase();
  for (const term of terms) {
    if (haystack.includes(term)) return true;
  }
  return false;
};
|
| 645 |
-
|
| 646 |
-
/**
 * Maps a query onto a coarse intent by substring lookup against the term tables.
 * Tables are consulted in priority order (tooling, technical, lore) and the
 * first hit wins; a query matching none of them is "general".
 */
const classifyQueryIntent = (query: string): ResearchQueryIntent => {
  const lowered = query.toLowerCase();
  const rules: readonly (readonly [ResearchQueryIntent, readonly string[]])[] = [
    ["tooling", TOOLING_QUERY_TERMS],
    ["technical", TECHNICAL_QUERY_TERMS],
    ["lore", LORE_QUERY_TERMS],
  ];
  for (const [intent, terms] of rules) {
    if (queryIncludesAny(lowered, terms)) return intent;
  }
  return "general";
};
|
| 653 |
-
|
| 654 |
-
/**
 * Adjusts the approved source list to the query's intent.
 *
 * - tooling / technical: sources listed in `LORE_SOURCE_IDS` are removed, unless
 *   that would leave nothing, in which case the list is returned unchanged.
 * - lore: lore sources are moved to the front; relative order is otherwise kept.
 * - general: the list is returned unchanged.
 */
const routeSourcesForQuery = (
  query: string,
  approvedSources: readonly SourceDescriptor[],
): readonly SourceDescriptor[] => {
  const intent = classifyQueryIntent(query);
  if (intent === "general") return approvedSources;

  const loreSources: SourceDescriptor[] = [];
  const otherSources: SourceDescriptor[] = [];
  for (const source of approvedSources) {
    (LORE_SOURCE_IDS.has(source.id) ? loreSources : otherSources).push(source);
  }

  if (intent === "lore") return [...loreSources, ...otherSources];
  // Remaining intents are "tooling" and "technical".
  return otherSources.length > 0 ? otherSources : approvedSources;
};
|
| 671 |
-
|
| 672 |
-
/**
 * Concatenates the given source groups, keeping only the first occurrence of
 * each source as identified by its normalized home URL.
 *
 * @param groups - Source lists in priority order.
 * @returns A new array with duplicates removed and first-seen order preserved.
 */
const mergeSourcesPreserveOrder = (...groups: readonly (readonly SourceDescriptor[])[]): SourceDescriptor[] => {
  const seenKeys = new Set<string>();
  return groups.flat().filter((source) => {
    const key = normalizeUrl(source.homeUrl);
    if (seenKeys.has(key)) return false;
    seenKeys.add(key);
    return true;
  });
};
|
| 685 |
-
|
| 686 |
-
/**
 * Lower-cases a token and keeps at most its first six characters, so that
 * longer words are compared by a shared prefix.
 */
const normalizeMatchToken = (token: string): string => token.toLowerCase().slice(0, 6);
|
| 691 |
-
|
| 692 |
-
/**
 * Splits a query into unique match tokens.
 *
 * The query is lower-cased, every character that is not a letter, digit,
 * whitespace or hyphen becomes a space, words shorter than four characters are
 * dropped, and each remaining word is reduced by `normalizeMatchToken`.
 *
 * @returns Distinct tokens in first-seen order.
 */
const tokenizeQuery = (query: string): string[] => {
  const words = query
    .toLowerCase()
    .replace(/[^\p{L}\p{N}\s-]/gu, " ")
    .split(/\s+/);

  const unique = new Set<string>();
  for (const word of words) {
    if (word.length >= 4) unique.add(normalizeMatchToken(word));
  }
  return Array.from(unique);
};
|
| 701 |
-
|
| 702 |
-
/**
 * Citations must be real public web pages: this accepts only URLs that parse
 * and use the `http:` or `https:` scheme. `local://` and `discord://` URLs,
 * and anything unparseable, are rejected.
 */
const isPublicWebCitationUrl = (url: string): boolean => {
  const internalSchemes = ["local://", "discord://"];
  if (internalSchemes.some((scheme) => url.startsWith(scheme))) return false;

  let protocol: string;
  try {
    protocol = new URL(url).protocol;
  } catch {
    return false;
  }
  return protocol === "https:" || protocol === "http:";
};
|
| 712 |
-
|
| 713 |
-
/** Returns a new array holding only the sources whose home URL passes `isPublicWebCitationUrl`. */
const filterPublicWebCitationSources = (sources: readonly SourceDescriptor[]): SourceDescriptor[] => {
  const webSources: SourceDescriptor[] = [];
  for (const source of sources) {
    if (isPublicWebCitationUrl(source.homeUrl)) webSources.push(source);
  }
  return webSources;
};
|
| 715 |
-
|
| 716 |
-
/**
 * Minimum number of approved web sources an answer should be grounded on
 * (Holocron e2e and product policy).
 */
export const MIN_HOLOCRON_WEB_CITATIONS = 2;
|
| 718 |
-
|
| 719 |
-
/**
 * Gathers every source the research run touched and ranks the public-web ones.
 *
 * The pool is the de-duplicated union, in this order, of retrieved sources,
 * cited sources, and sources materialized from the payload's visited URLs.
 *
 * @returns Public-web sources from the pool, re-ranked against the query.
 */
const collectWebEvidenceSources = (
  query: string,
  report: string,
  approvedSources: readonly SourceDescriptor[],
  payload: ResearchWizardResponsePayload,
): readonly SourceDescriptor[] => {
  const retrieved = collectRetrievedSources(report, approvedSources, payload);
  const cited = collectCitedSources(report, approvedSources, payload);
  const visited = materializeSourcesFromUrls(
    collectVisitedUrlsFromPayload(payload, approvedSources),
    approvedSources,
  );

  const webPool = filterPublicWebCitationSources(mergeSourcesPreserveOrder(retrieved, cited, visited));
  return rerankEvidenceSources(query, webPool);
};
|
| 732 |
-
|
| 733 |
-
/**
 * Produces the final citation list, trying to reach `MIN_HOLOCRON_WEB_CITATIONS`
 * public-web sources and never returning more than eight.
 *
 * 1. Rank the union of cited, evidence and payload-backed sources and keep the
 *    public-web ones; return them if there are enough.
 * 2. Otherwise rank a web-only pool (previous result, web evidence, web
 *    payload-backed sources) and return it if it reaches the minimum.
 * 3. Otherwise return the ranked sources followed by any remaining web
 *    payload-backed sources, so that a citation found in step 2 is not lost.
 *
 * Every return path contains public-web sources only.
 *
 * @param query - The user's question, used for ranking.
 * @param cited - Sources cited by the answer or report.
 * @param evidence - Sources collected as evidence for the run.
 * @param payload - Optional research payload whose URL lists back extra sources.
 * @param approvedSources - Approved catalog used to materialize payload URLs.
 * @returns Up to eight public-web sources.
 */
const ensureMinimumWebCitations = (
  query: string,
  cited: readonly SourceDescriptor[],
  evidence: readonly SourceDescriptor[],
  payload?: ResearchWizardResponsePayload,
  approvedSources: readonly SourceDescriptor[] = [],
): readonly SourceDescriptor[] => {
  const info = payload?.research_information;
  const payloadBacked = payload
    ? materializeSourcesFromUrls(
      uniqueUrlsPreserveOrder([
        ...payloadUrls(info?.cited_urls),
        ...payloadUrls(info?.retrieved_urls),
        ...payloadUrls(info?.visited_urls),
        ...payloadUrls(info?.source_urls),
      ]),
      approvedSources,
    )
    : [];

  const merged = rerankEvidenceSources(
    query,
    mergeSourcesPreserveOrder(cited, evidence, payloadBacked),
  );
  const webOnly = filterPublicWebCitationSources(merged);
  if (webOnly.length >= MIN_HOLOCRON_WEB_CITATIONS) {
    return webOnly.slice(0, 8);
  }

  // Pad from web-only candidates so that non-web entries can neither be returned
  // nor counted towards the minimum.
  const webPayloadBacked = filterPublicWebCitationSources(payloadBacked);
  const padded = rerankEvidenceSources(
    query,
    mergeSourcesPreserveOrder(webOnly, filterPublicWebCitationSources(evidence), webPayloadBacked),
  );
  if (padded.length >= MIN_HOLOCRON_WEB_CITATIONS) {
    return padded.slice(0, 8);
  }

  // Last resort: keep whatever ranked citation exists, then the unranked
  // payload-backed web sources.
  return mergeSourcesPreserveOrder(padded, webPayloadBacked).slice(0, 8);
};
|
| 769 |
-
|
| 770 |
-
/**
 * Builds the source-only fallback answer from at most five public-web sources.
 * When none of the sources is a public-web one, the unfiltered list is passed on instead.
 */
const composeAnswerFromWebSources = (query: string, sources: readonly SourceDescriptor[]): string => {
  const webSources = filterPublicWebCitationSources(sources).slice(0, 5);
  const chosen = webSources.length === 0 ? sources : webSources;
  return sourceOnlyFallbackAnswer(query, chosen);
};
|
| 777 |
-
|
| 778 |
-
/**
 * Checks whether a source's name, description and home URL mention the query.
 *
 * @returns `true` when at least two query tokens (or the only token, if the
 *   query yields just one) occur in that text; `false` when the query yields no tokens.
 */
const sourceMatchesQuery = (source: SourceDescriptor, query: string): boolean => {
  const tokens = tokenizeQuery(query);
  if (tokens.length === 0) return false;

  const haystack = `${source.name} ${source.description ?? ""} ${source.homeUrl}`.toLowerCase();
  const hits = tokens.filter((token) => haystack.includes(token)).length;
  const required = Math.min(2, tokens.length);
  return hits >= required;
};
|
| 788 |
-
|
| 789 |
-
/**
 * Scores how well a source's metadata matches the query.
 *
 * Each query token found in the name, description, home URL or tags is worth
 * two points; one extra bonus of 2 applies if any token occurs in the name and
 * one of 1 if any token occurs in the home URL.
 *
 * @returns The score, or 1 when the query yields no tokens.
 */
const sourceRelevanceScore = (source: SourceDescriptor, query: string): number => {
  const tokens = tokenizeQuery(query);
  if (tokens.length === 0) return 1;

  const haystack = [
    source.name,
    source.description,
    source.homeUrl,
    ...(source.tags ?? []),
  ].join(" ").toLowerCase();
  const loweredName = source.name.toLowerCase();
  const loweredUrl = source.homeUrl.toLowerCase();

  const hits = tokens.filter((token) => haystack.includes(token)).length;
  const titleBonus = tokens.some((token) => loweredName.includes(token)) ? 2 : 0;
  const urlBonus = tokens.some((token) => loweredUrl.includes(token)) ? 1 : 0;
  return hits * 2 + titleBonus + urlBonus;
};
|
| 806 |
-
|
| 807 |
-
/**
 * Orders sources by descending relevance score, keeping input order for ties.
 *
 * When the query yields no tokens the first four ranked sources are returned.
 * Otherwise only sources scoring at least 2 are kept, capped at eight.
 */
const rerankEvidenceSources = (query: string, sources: readonly SourceDescriptor[]): readonly SourceDescriptor[] => {
  const scored = sources.map((source, index) => ({
    source,
    index,
    score: sourceRelevanceScore(source, query),
  }));
  scored.sort((left, right) => right.score - left.score || left.index - right.index);

  if (tokenizeQuery(query).length === 0) {
    return scored.slice(0, 4).map((entry) => entry.source);
  }
  return scored
    .filter((entry) => entry.score >= 2)
    .slice(0, 8)
    .map((entry) => entry.source);
};
|
| 822 |
-
|
| 823 |
-
/**
 * Picks up to five public-web sources to show when synthesis failed.
 * Sources that match the query are preferred; if none match, all public-web
 * candidates are used.
 */
const resolveWebSourcesForFailedSynthesis = (
  query: string,
  retrievedSources: readonly SourceDescriptor[],
): readonly SourceDescriptor[] => {
  const candidates = filterPublicWebCitationSources(retrievedSources);
  const matched = candidates.filter((source) => sourceMatchesQuery(source, query));
  const chosen = matched.length > 0 ? matched : candidates;
  return chosen.slice(0, 5);
};
|
| 831 |
-
|
| 832 |
-
/**
 * Collects the approved base hosts covered by the given sources.
 *
 * Each source's host (without a leading `www.`, lower-cased) is mapped to the
 * first entry of `traskApprovedResearchBaseHosts` it equals or is a subdomain
 * of. Sources with unparseable URLs or unapproved hosts contribute nothing.
 *
 * @returns Distinct base hosts in first-seen order.
 */
const researchDomainsForSources = (sources: readonly SourceDescriptor[]): string[] => {
  const baseHosts = new Set<string>();
  for (const { homeUrl } of sources) {
    try {
      const host = new URL(homeUrl).hostname.replace(/^www\./, "").toLowerCase();
      const approvedBase = traskApprovedResearchBaseHosts.find(
        (base) => host === base || host.endsWith(`.${base}`),
      );
      if (approvedBase) baseHosts.add(approvedBase);
    } catch {
      // Unparseable home URL: skip this source.
      continue;
    }
  }
  return Array.from(baseHosts);
};
|
| 845 |
-
|
| 846 |
-
/** Interval, in milliseconds, between progress heartbeats emitted by `withProgressHeartbeat`. */
const HEARTBEAT_MS = 8_000;
|
| 847 |
-
|
| 848 |
-
/**
 * Runs `work` while periodically reporting progress for the given phase.
 *
 * One event is emitted immediately and then at most one per `HEARTBEAT_MS`
 * window until `work` settles. Without an `onProgress` callback the work is
 * simply awaited.
 *
 * @param phase - Phase label attached to every emitted event.
 * @param makeDetail - Builds the event detail from the elapsed milliseconds.
 * @param onProgress - Optional progress sink.
 * @param work - The operation to run.
 * @returns Whatever `work` resolves with; rejections propagate unchanged.
 */
const withProgressHeartbeat = async <T>(
  phase: ResearchWizardProgressEvent["phase"],
  makeDetail: (elapsedMs: number) => string,
  onProgress: ((event: ResearchWizardProgressEvent) => void) | undefined,
  work: () => Promise<T>,
): Promise<T> => {
  if (!onProgress) {
    return await work();
  }

  const startedAt = Date.now();
  let lastBucket = -1;

  function beat(): void {
    const elapsedMs = Date.now() - startedAt;
    const bucket = Math.floor(elapsedMs / HEARTBEAT_MS);
    // Emit at most once per heartbeat window.
    if (bucket !== lastBucket) {
      lastBucket = bucket;
      onProgress!({ phase, detail: makeDetail(elapsedMs) });
    }
  }

  beat();
  const ticker = setInterval(beat, HEARTBEAT_MS);
  try {
    return await work();
  } finally {
    clearInterval(ticker);
  }
};
|
| 876 |
-
|
| 877 |
-
export class ResearchWizardClient implements ResearchWizardQueryHandler {
|
| 878 |
-
private readonly openAiClient: OpenAI | null;
|
| 879 |
-
|
| 880 |
-
/**
 * @param config - Runtime configuration passed to the headless researcher helpers.
 * @param aiConfig - Shared AI settings; an OpenAI client is created only when
 *   `openAiApiKey` is set, otherwise `openAiClient` is `null`.
 * @param approvedSources - Source catalog; defaults to `traskApprovedResearchSources`.
 */
public constructor(
  private readonly config: ResearchWizardRuntimeConfig,
  private readonly aiConfig: SharedAiConfig,
  private readonly approvedSources: readonly SourceDescriptor[] = traskApprovedResearchSources,
) {
  if (!aiConfig.openAiApiKey) {
    this.openAiClient = null;
    return;
  }

  // Optional settings are spread in only when present, so no key is set to undefined.
  const baseUrlOption = aiConfig.openAiBaseUrl ? { baseURL: aiConfig.openAiBaseUrl } : {};
  const headersOption = aiConfig.openAiDefaultHeaders ? { defaultHeaders: aiConfig.openAiDefaultHeaders } : {};
  this.openAiClient = new OpenAI({
    apiKey: aiConfig.openAiApiKey,
    ...baseUrlOption,
    ...headersOption,
  });
}
|
| 893 |
-
|
| 894 |
-
/**
 * Lists selectable models: the built-in defaults followed by any dynamically
 * discovered models whose id is not already present.
 *
 * @returns The merged list, or just the defaults when discovery fails.
 */
public async listModels(): Promise<readonly ResearchWizardModelOption[]> {
  try {
    const dynamicModels = await listHeadlessGptResearcherModels(this.config);
    const knownIds = new Set(DEFAULT_RESEARCH_WIZARD_MODELS.map((model) => model.id));
    const merged = [...DEFAULT_RESEARCH_WIZARD_MODELS];
    for (const model of dynamicModels) {
      if (knownIds.has(model.id)) continue;
      knownIds.add(model.id);
      merged.push(model);
    }
    return merged;
  } catch {
    // Discovery is best-effort; fall back to the static defaults.
    return DEFAULT_RESEARCH_WIZARD_MODELS;
  }
}
|
| 910 |
-
|
| 911 |
-
/**
 * Rewrites a research report into a concise, cited Discord answer.
 *
 * Up to `MAX_REWRITE_ATTEMPTS` distinct models are tried in order (preferred
 * model, configured chat model, configured fallbacks). A response is accepted
 * only if it contains a "Sources" line. If no client is configured, or every
 * attempt fails or lacks that line, `fallbackDiscordRewrite` is used.
 *
 * @param query - The user's question.
 * @param report - Research report to condense.
 * @param approvedSources - Sources the answer may cite, numbered by position.
 * @param preferredModel - Optional model id to try first.
 * @returns The rewritten answer text.
 */
private async rewriteForDiscord(
  query: string,
  report: string,
  approvedSources: readonly SourceDescriptor[],
  preferredModel?: string,
): Promise<string> {
  const client = this.openAiClient;
  if (!client) {
    return fallbackDiscordRewrite(query, report, approvedSources);
  }

  const allowedSources = approvedSources
    .map((source, index) => `${index + 1}. ${source.name} - ${source.homeUrl}`)
    .join("\n");

  const systemPrompt = [
    "Rewrite research reports into concise Discord answers.",
    "Do not mention research steps, indexing, tooling, or backend behavior.",
    "Use only the numbered sources provided by the user.",
    "Return plain Markdown with no headings except the final Sources heading.",
  ].join(" ");

  const userPrompt = [
    `Question: ${query}`,
    "Write a concise answer for Discord.",
    "Requirements:",
    "- Lead with the answer.",
    "- Use at most 3 short paragraphs or 5 compact bullets before sources.",
    "- Use inline numeric citations like [1], [2].",
    ' - End with the exact heading "Sources" on its own line.',
    "- Under Sources, include only the cited sources using the exact numbered lines provided below.",
    "Allowed Sources:",
    allowedSources,
    "Research Report:",
    report,
  ].join("\n\n");

  const preferredRewriteModel = normalizePreferredRewriteModel(preferredModel);
  const candidateModels = new Set([
    ...(preferredRewriteModel ? [preferredRewriteModel] : []),
    this.aiConfig.chatModel,
    ...this.aiConfig.chatModelFallbacks,
  ]);
  const modelsToTry = [...candidateModels].slice(0, MAX_REWRITE_ATTEMPTS);

  for (const model of modelsToTry) {
    try {
      const completion = await withTimeout(
        client.chat.completions.create({
          model,
          temperature: 0.2,
          messages: [
            { role: "system", content: systemPrompt },
            { role: "user", content: userPrompt },
          ],
        }),
        DEFAULT_REWRITE_TIMEOUT_MS,
      );

      const rewritten = completion.choices[0]?.message?.content?.trim();
      // Accept only output that carries a Sources line; otherwise try the next model.
      if (rewritten && /\nSources\s*\n/i.test(rewritten)) {
        return rewritten;
      }
    } catch {
      // Timeout or API failure: move on to the next candidate model.
      continue;
    }
  }

  return fallbackDiscordRewrite(query, report, approvedSources);
}
|
| 980 |
-
|
| 981 |
-
/**
 * Rewrites a research report into a very short, cited Discord reply.
 *
 * Up to `MAX_REWRITE_ATTEMPTS` distinct models are tried in order (configured
 * chat model, then configured fallbacks). A response is accepted only if it
 * contains a "Sources" line. If no client is configured, or every attempt
 * fails or lacks that line, `fallbackDiscordBrief` is used.
 *
 * @param query - The user's question.
 * @param report - Research report to condense.
 * @param approvedSources - Sources the reply may cite, numbered by position.
 * @returns The rewritten reply text.
 */
private async rewriteForDiscordBrief(
  query: string,
  report: string,
  approvedSources: readonly SourceDescriptor[],
): Promise<string> {
  const client = this.openAiClient;
  if (!client) {
    return fallbackDiscordBrief(query, report, approvedSources);
  }

  const allowedSources = approvedSources
    .map((source, index) => `${index + 1}. ${source.name} - ${source.homeUrl}`)
    .join("\n");

  const systemPrompt = [
    "Rewrite research into a very short Discord chat reply (like a quick DM).",
    "No preamble, no essay tone, no meta commentary about research.",
    "Use only the numbered sources provided.",
    "Plain sentences; at most 2 short sentences OR up to 3 compact bullets before Sources.",
    'End with the exact heading "Sources" on its own line, then cited sources only.',
  ].join(" ");

  const userPrompt = [
    `Question: ${query}`,
    "Write the shortest helpful answer.",
    "Allowed Sources:",
    allowedSources,
    "Research Report:",
    report,
  ].join("\n\n");

  const candidateModels = new Set([this.aiConfig.chatModel, ...this.aiConfig.chatModelFallbacks]);
  const modelsToTry = [...candidateModels].slice(0, MAX_REWRITE_ATTEMPTS);

  for (const model of modelsToTry) {
    try {
      const completion = await withTimeout(
        client.chat.completions.create({
          model,
          temperature: 0.15,
          max_tokens: 380,
          messages: [
            { role: "system", content: systemPrompt },
            { role: "user", content: userPrompt },
          ],
        }),
        DEFAULT_REWRITE_TIMEOUT_MS,
      );

      const rewritten = completion.choices[0]?.message?.content?.trim();
      // Accept only output that carries a Sources line; otherwise try the next model.
      if (rewritten && /\nSources\s*\n/i.test(rewritten)) {
        return rewritten;
      }
    } catch {
      // Timeout or API failure: move on to the next candidate model.
      continue;
    }
  }

  return fallbackDiscordBrief(query, report, approvedSources);
}
|
| 1042 |
-
|
| 1043 |
-
/**
 * Runs the headless researcher for a query, restricted to the approved sources.
 *
 * @param query - The user's question.
 * @param customPrompt - Prompt text forwarded as `custom_prompt`.
 * @param approvedSources - Sources whose hosts and home URLs bound the research.
 * @param options - Optional query options; a non-blank `model` is forwarded.
 * @returns The normalized report text and a payload holding the raw report plus
 *   a shallow copy of any research information.
 * @throws Error when no approved sources are given or the report is empty or not a string.
 */
private async fetchResearchReport(
  query: string,
  customPrompt: string,
  approvedSources: readonly SourceDescriptor[],
  options?: ResearchWizardQueryOptions,
): Promise<{ report: string; payload: ResearchWizardResponsePayload }> {
  if (approvedSources.length === 0) {
    throw new Error("No approved research sources are enabled.");
  }

  const requestedModel = options?.model?.trim();
  const raw = await runHeadlessGptResearcher(this.config, {
    query: buildResearchTask(query),
    custom_prompt: customPrompt,
    query_domains: researchDomainsForSources(approvedSources),
    allowed_url_prefixes: approvedSources.map((source) => source.homeUrl),
    ...(requestedModel ? { model: requestedModel } : {}),
    report_type: "research_report",
    report_source: "web",
  });

  const researchInformation =
    raw.research_information !== undefined
      ? { research_information: { ...raw.research_information } }
      : {};
  const payload: ResearchWizardResponsePayload = {
    report: raw.report,
    ...researchInformation,
  };

  const report = typeof raw.report === "string" ? normalizeReport(raw.report) : "";
  if (!report) {
    throw new Error("ai-researchwizard returned an empty report.");
  }

  return { report, payload };
}
|
| 1079 |
-
|
| 1080 |
-
/**
 * Answers a question from live research over the approved sources.
 *
 * Sources are first filtered by user preferences and routed by query intent.
 * The research report is fetched with progress heartbeats, evidence sources
 * are collected and ranked, and the answer is composed:
 * - no evidence sources: the degraded message;
 * - synthesis-failure report: a rewrite when enough web sources exist,
 *   otherwise the source-only fallback;
 * - otherwise: a rewrite of the report (which itself falls back to the local
 *   rewrite when no model client is configured).
 *
 * Failures are not thrown: they are reported through `onProgress` and returned
 * as an explanatory answer with empty source lists.
 *
 * @param query - The user's question.
 * @param onProgress - Optional sink for phase/progress events.
 * @param options - Optional model and source preferences.
 * @returns The answer with cited sources, retrieved sources and visited URLs.
 */
public async answerQuestion(
  query: string,
  onProgress?: (event: ResearchWizardProgressEvent) => void,
  options?: ResearchWizardQueryOptions,
): Promise<ResearchWizardAnswer> {
  const approvedSources = routeSourcesForQuery(
    query,
    applySourcePreferences(this.approvedSources, options?.sourcePreferences),
  );
  try {
    const allowedDomains = researchDomainsForSources(approvedSources);
    onProgress?.({
      phase: "gather",
      detail: `Scanning ${approvedSources.length} approved source root${approvedSources.length === 1 ? "" : "s"} across ${allowedDomains.length} host${allowedDomains.length === 1 ? "" : "s"}…`,
    });
    const { report, payload } = await withProgressHeartbeat(
      "gather",
      (elapsedMs) => {
        const seconds = Math.max(1, Math.floor(elapsedMs / 1000));
        return `Researching approved archive sources… (${seconds}s)`;
      },
      onProgress,
      async () => await this.fetchResearchReport(query, buildCustomPrompt(), approvedSources, options),
    );
    const rejectedUrls = collectRejectedUrlsFromPayload(payload);
    if (rejectedUrls.length > 0) {
      onProgress?.({
        phase: "gather",
        detail: `Rejected ${rejectedUrls.length} URL${rejectedUrls.length === 1 ? "" : "s"} outside approved source roots.`,
      });
    }
    emitArchiveProbeEvents(payload, approvedSources, onProgress);
    onProgress?.({
      phase: "report",
      detail: "Ranking passages and citations…",
    });
    const webEvidenceSources = collectWebEvidenceSources(query, report, approvedSources, payload);
    const retrievedSources = webEvidenceSources;
    const citedSourcesFromReport = rerankEvidenceSources(
      query,
      mergeSourcesPreserveOrder(
        collectCitedSources(report, approvedSources, payload),
        collectCitedSourcesFromText(report, approvedSources),
      ),
    );
    onProgress?.({
      phase: "sources",
      detail: retrievedSources.length ? `${retrievedSources.length} sources retrieved` : "Mapping hosts to archive catalog…",
      sources: retrievedSources,
    });
    onProgress?.({
      phase: "compose",
      detail: "Rendering Holocron answer…",
    });

    let answer: string;
    if (retrievedSources.length === 0) {
      answer = degradedAnswerFallback(query, approvedSources);
    } else if (isSynthesisFailureReport(report, payload)) {
      const webSources = resolveWebSourcesForFailedSynthesis(query, retrievedSources);
      if (webSources.length >= MIN_HOLOCRON_WEB_CITATIONS) {
        // rewriteForDiscord uses the local fallback rewrite itself when no client is configured.
        answer = await this.rewriteForDiscord(
          query,
          report,
          filterPublicWebCitationSources(webSources),
          options?.model,
        );
      } else {
        // retrievedSources is non-empty in this branch, so a source list is always available.
        answer = sourceOnlyFallbackAnswer(query, webSources.length > 0 ? webSources : retrievedSources);
      }
    } else {
      answer = await this.rewriteForDiscord(
        query,
        report,
        filterPublicWebCitationSources(retrievedSources),
        options?.model,
      );
    }

    const citedSources = ensureMinimumWebCitations(
      query,
      filterPublicWebCitationSources(
        mergeSourcesPreserveOrder(
          collectCitedSourcesFromText(answer, retrievedSources),
          citedSourcesFromReport,
        ),
      ),
      webEvidenceSources,
      payload,
      approvedSources,
    );

    return {
      answer,
      approvedSources: citedSources,
      retrievedSources,
      visitedUrls: collectVisitedUrlsFromPayload(payload, approvedSources),
    };
  } catch (error: unknown) {
    const detail = error instanceof Error ? error.message : String(error);
    // Bound the error text once so both the progress event and the answer stay short.
    const shortDetail = detail.slice(0, 240);
    onProgress?.({
      phase: "compose",
      detail: `Live web research failed: ${shortDetail}`,
    });
    const topic = query.trim().replace(/\?+$/u, "") || "this question";
    return {
      answer: `I could not complete live web research for "${topic}" right now (${shortDetail}). Ensure GPTR Python (TRASK_GPT_RESEARCHER_PYTHON), retriever keys (e.g. TAVILY_API_KEY), and TRASK_RESEARCHWIZARD_TIMEOUT_MS are configured, then retry.`,
      approvedSources: [],
      retrievedSources: [],
      visitedUrls: [],
    };
  }
}
|
| 1200 |
-
|
| 1201 |
-
/** Shorter rewrite for proactive/channel replies (still source-backed). */
|
| 1202 |
-
public async answerQuestionBrief(query: string): Promise<ResearchWizardBriefAnswer> {
|
| 1203 |
-
try {
|
| 1204 |
-
const approvedSources = routeSourcesForQuery(query, this.approvedSources);
|
| 1205 |
-
const { report, payload } = await this.fetchResearchReport(query, buildCustomPromptBrief(), approvedSources);
|
| 1206 |
-
const webEvidenceSources = collectWebEvidenceSources(query, report, approvedSources, payload);
|
| 1207 |
-
const retrievedSources = webEvidenceSources;
|
| 1208 |
-
const answer = retrievedSources.length > 0
|
| 1209 |
-
? await this.rewriteForDiscordBrief(query, report, retrievedSources)
|
| 1210 |
-
: degradedAnswerFallback(query, approvedSources);
|
| 1211 |
-
|
| 1212 |
-
return {
|
| 1213 |
-
answer,
|
| 1214 |
-
approvedSources: ensureMinimumWebCitations(
|
| 1215 |
-
query,
|
| 1216 |
-
filterPublicWebCitationSources(
|
| 1217 |
-
mergeSourcesPreserveOrder(
|
| 1218 |
-
collectCitedSourcesFromText(answer, retrievedSources),
|
| 1219 |
-
collectCitedSources(report, approvedSources, payload),
|
| 1220 |
-
),
|
| 1221 |
-
),
|
| 1222 |
-
webEvidenceSources,
|
| 1223 |
-
payload,
|
| 1224 |
-
approvedSources,
|
| 1225 |
-
),
|
| 1226 |
-
retrievedSources,
|
| 1227 |
-
visitedUrls: collectVisitedUrlsFromPayload(payload, approvedSources),
|
| 1228 |
-
researchReport: report,
|
| 1229 |
-
};
|
| 1230 |
-
} catch {
|
| 1231 |
-
const topic = query.trim().replace(/\?+$/u, "") || "this question";
|
| 1232 |
-
const answer = `I could not complete live web research for "${topic}" right now.`;
|
| 1233 |
-
return {
|
| 1234 |
-
answer,
|
| 1235 |
-
approvedSources: [],
|
| 1236 |
-
retrievedSources: [],
|
| 1237 |
-
visitedUrls: [],
|
| 1238 |
-
researchReport: answer,
|
| 1239 |
-
};
|
| 1240 |
-
}
|
| 1241 |
-
}
|
| 1242 |
-
}
|
| 1243 |
-
|
| 1244 |
-
export const createResearchWizardClient = (
|
| 1245 |
-
config: ResearchWizardRuntimeConfig,
|
| 1246 |
-
aiConfig: SharedAiConfig = loadSharedAiConfig(),
|
| 1247 |
-
): ResearchWizardClient => {
|
| 1248 |
-
return new ResearchWizardClient(config, aiConfig, traskApprovedResearchSources);
|
| 1249 |
-
};
|
| 1250 |
-
|
| 1251 |
-
// ---------------------------------------------------------------------------
|
| 1252 |
-
// Pure helpers exported for unit testing — not part of the public API surface.
|
| 1253 |
-
// ---------------------------------------------------------------------------
|
| 1254 |
export {
|
| 1255 |
-
|
| 1256 |
-
|
| 1257 |
-
|
| 1258 |
-
|
| 1259 |
-
|
| 1260 |
-
|
| 1261 |
-
|
| 1262 |
-
|
| 1263 |
-
|
| 1264 |
-
|
| 1265 |
-
|
| 1266 |
-
|
| 1267 |
-
|
| 1268 |
-
|
| 1269 |
-
classifyQueryIntent as _classifyQueryIntent,
|
| 1270 |
-
routeSourcesForQuery as _routeSourcesForQuery,
|
| 1271 |
-
};
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* @deprecated Import from `./web-research.js` instead. ResearchWizard naming is retired.
|
| 3 |
+
*/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
export {
|
| 5 |
+
WebResearchClient as ResearchWizardClient,
|
| 6 |
+
createWebResearchClient as createResearchWizardClient,
|
| 7 |
+
} from "./web-research.js";
|
| 8 |
+
|
| 9 |
+
export type {
|
| 10 |
+
WebResearchAnswer as ResearchWizardAnswer,
|
| 11 |
+
WebResearchBriefAnswer as ResearchWizardBriefAnswer,
|
| 12 |
+
WebResearchClientFactoryOptions as ResearchWizardClientFactoryOptions,
|
| 13 |
+
WebResearchModelOption as ResearchWizardModelOption,
|
| 14 |
+
WebResearchProgressEvent as ResearchWizardProgressEvent,
|
| 15 |
+
WebResearchQueryHandler as ResearchWizardQueryHandler,
|
| 16 |
+
WebResearchQueryOptions as ResearchWizardQueryOptions,
|
| 17 |
+
WebResearchSourcePreference as ResearchWizardSourcePreference,
|
| 18 |
+
} from "./web-research.js";
|
|
|
|
|
|
|
|
|
packages/trask/src/web-research-subprocess.ts
ADDED
|
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { spawn } from "node:child_process";
|
| 2 |
+
import { existsSync } from "node:fs";
|
| 3 |
+
import { dirname, join, resolve } from "node:path";
|
| 4 |
+
|
| 5 |
+
import type { WebResearchRuntimeConfig } from "@openkotor/config";
|
| 6 |
+
|
| 7 |
+
export interface HeadlessWebResearchResult {
|
| 8 |
+
readonly report: string;
|
| 9 |
+
readonly research_information?: {
|
| 10 |
+
readonly source_urls?: readonly string[] | null;
|
| 11 |
+
readonly cited_urls?: readonly string[] | null;
|
| 12 |
+
readonly retrieved_urls?: readonly string[] | null;
|
| 13 |
+
readonly visited_urls?: readonly string[] | null;
|
| 14 |
+
readonly query_domains?: readonly string[] | null;
|
| 15 |
+
readonly allowed_url_prefixes?: readonly string[] | null;
|
| 16 |
+
readonly rejected_source_urls?: readonly string[] | null;
|
| 17 |
+
};
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
export interface HeadlessWebResearchModelOption {
|
| 21 |
+
readonly id: string;
|
| 22 |
+
readonly label: string;
|
| 23 |
+
readonly provider: string;
|
| 24 |
+
readonly recommended?: boolean;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
/** stdin payload for `scripts/trask_web_research.py`. */
|
| 28 |
+
export interface HeadlessWebResearchRequestPayload {
|
| 29 |
+
readonly query: string;
|
| 30 |
+
readonly custom_prompt?: string;
|
| 31 |
+
readonly source_urls?: readonly string[];
|
| 32 |
+
readonly query_domains?: readonly string[];
|
| 33 |
+
readonly allowed_url_prefixes?: readonly string[];
|
| 34 |
+
readonly model?: string;
|
| 35 |
+
readonly report_type?: string;
|
| 36 |
+
readonly report_source?: string;
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
/** @deprecated Use HeadlessWebResearchResult */
|
| 40 |
+
export type HeadlessAiResearchWizardResult = HeadlessWebResearchResult;
|
| 41 |
+
|
| 42 |
+
/** @deprecated Use HeadlessWebResearchRequestPayload */
|
| 43 |
+
export type HeadlessAiResearchWizardRequestPayload = HeadlessWebResearchRequestPayload;
|
| 44 |
+
|
| 45 |
+
/** @deprecated Use HeadlessWebResearchModelOption */
|
| 46 |
+
export type HeadlessAiResearchWizardModelOption = HeadlessWebResearchModelOption;
|
| 47 |
+
|
| 48 |
+
/**
 * Walk upward from `startDir` looking for the directory that contains
 * `scripts/trask_web_research.py`.
 *
 * Returns that directory when found. Falls back to `process.cwd()` when the
 * filesystem root is reached, or when `maxHops` directories have been inspected
 * without a match.
 */
const findRepoRoot = (startDir: string, maxHops = 24): string => {
  let current = resolve(startDir);
  let remaining = maxHops;

  while (remaining > 0) {
    if (existsSync(join(current, "scripts", "trask_web_research.py"))) {
      return current;
    }

    const above = dirname(current);
    if (above === current) {
      // dirname() of the filesystem root is the root itself: nothing left to inspect.
      break;
    }

    current = above;
    remaining -= 1;
  }

  return process.cwd();
};
|
| 63 |
+
|
| 64 |
+
/** Path of the headless runner script (`scripts/trask_web_research.py`) under `repoRoot`. */
const defaultScriptPath = (repoRoot: string): string => {
  return join(repoRoot, "scripts", "trask_web_research.py");
};
|
| 65 |
+
|
| 66 |
+
/**
 * Run `python script` in `cwd`, send `payload` as JSON on stdin, and collect the output.
 *
 * The child inherits the current environment plus `TRASK_ALLOWED_QUERY_DOMAINS` and
 * `TRASK_ALLOWED_URL_PREFIXES` (newline-joined from the payload) and UTF-8 I/O settings.
 *
 * Resolves with trimmed stdout/stderr and the exit code once the child closes.
 * Rejects when the child cannot be spawned, when writing to stdin throws synchronously,
 * or when `timeoutMs` elapses (the child is then sent SIGTERM).
 */
const spawnHeadless = (
  python: string,
  script: string,
  cwd: string,
  payload: HeadlessWebResearchRequestPayload,
  timeoutMs: number,
): Promise<{ stdout: string; stderr: string; code: number | null }> => {
  return new Promise((resolvePromise, rejectPromise) => {
    const child = spawn(python, [script], {
      cwd,
      stdio: ["pipe", "pipe", "pipe"],
      env: {
        ...process.env,
        TRASK_ALLOWED_QUERY_DOMAINS: (payload.query_domains ?? []).join("\n"),
        TRASK_ALLOWED_URL_PREFIXES: (payload.allowed_url_prefixes ?? []).join("\n"),
        PYTHONIOENCODING: "utf-8",
        PYTHONUTF8: "1",
      },
    });

    const chunksOut: Buffer[] = [];
    const chunksErr: Buffer[] = [];
    // Guards against settling the promise twice (timeout vs. error vs. close).
    let settled = false;

    const timer = setTimeout(() => {
      if (settled) {
        return;
      }

      settled = true;
      child.kill("SIGTERM");
      rejectPromise(new Error(`Trask web research runner timed out after ${timeoutMs}ms`));
    }, timeoutMs);

    const rejectOnce = (error: unknown): void => {
      if (settled) {
        return;
      }

      settled = true;
      clearTimeout(timer);
      rejectPromise(error);
    };

    child.stdout?.on("data", (chunk: Buffer | string) => {
      chunksOut.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
    });
    child.stderr?.on("data", (chunk: Buffer | string) => {
      chunksErr.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
    });

    child.on("error", rejectOnce);

    child.on("close", (exitCode) => {
      if (settled) {
        return;
      }

      settled = true;
      clearTimeout(timer);
      resolvePromise({
        stdout: Buffer.concat(chunksOut).toString("utf8").trim(),
        stderr: Buffer.concat(chunksErr).toString("utf8").trim(),
        code: exitCode,
      });
    });

    // A child that exits before draining stdin makes the pipe emit an asynchronous
    // 'error' (typically EPIPE). Without a listener Node throws it as an uncaught
    // exception; the try/catch below only covers synchronous failures.
    child.stdin?.on("error", () => {
      // Intentionally not rejecting here: the 'close' handler reports the exit code and
      // stderr, which explain the failure better; the timeout covers a child that stays alive.
    });

    try {
      child.stdin?.write(Buffer.from(JSON.stringify(payload), "utf8"));
      child.stdin?.end();
    } catch (error) {
      rejectOnce(error);
    }
  });
};
|
| 143 |
+
|
| 144 |
+
/**
 * Run the headless web research script once and return its parsed JSON result.
 *
 * The repo root comes from `config.repoRoot` or is discovered from `process.cwd()`;
 * the script comes from `config.headlessScriptPath` or the default path under the repo
 * root; the interpreter comes from `config.pythonExecutable` or `"python"`.
 *
 * @throws Error when the script file is missing, the runner exits non-zero, stdout is not
 *   valid JSON, or the decoded value has no non-blank string `report`.
 */
export const runHeadlessWebResearch = async (
  config: WebResearchRuntimeConfig,
  payload: HeadlessWebResearchRequestPayload,
): Promise<HeadlessWebResearchResult> => {
  const repoRoot = config.repoRoot?.trim() || findRepoRoot(process.cwd());
  const script = (config.headlessScriptPath?.trim() || defaultScriptPath(repoRoot)).trim();

  if (!existsSync(script)) {
    throw new Error(
      `Trask web research script not found: ${script}. Run scripts/bootstrap_trask_research.sh or set TRASK_WEB_RESEARCH_SCRIPT.`,
    );
  }

  const python = config.pythonExecutable?.trim() || "python";
  const { stdout, stderr, code } = await spawnHeadless(python, script, repoRoot, payload, config.timeoutMs);

  if (code !== 0) {
    throw new Error(`Trask web research runner exited ${code ?? "unknown"}: ${stderr || stdout || "no output"}`);
  }

  let parsed: HeadlessWebResearchResult | null | undefined;
  try {
    parsed = JSON.parse(stdout) as HeadlessWebResearchResult | null;
  } catch (error) {
    if (error instanceof SyntaxError) {
      throw new Error(`Trask web research runner returned invalid JSON: ${stdout.slice(0, 400)}`);
    }

    throw error;
  }

  // Optional chaining: valid JSON such as `null` would otherwise raise a raw TypeError
  // on property access instead of the intended "empty report" error.
  const report: unknown = parsed?.report;
  if (typeof report !== "string" || !report.trim()) {
    throw new Error("Web research runner returned empty report.");
  }

  return parsed as HeadlessWebResearchResult;
};
|
| 180 |
+
|
| 181 |
+
/** @deprecated Use runHeadlessWebResearch */
|
| 182 |
+
export const runHeadlessGptResearcher = runHeadlessWebResearch;
|
| 183 |
+
|
| 184 |
+
/**
 * Build a human-readable label from a model id.
 *
 * A leading routing prefix (`prefix:` appearing before any `/`) is dropped first so it
 * does not leak into the label, then the segment after the last `/` is title-cased with
 * `-`/`_` runs turned into spaces and `Gpt`/`Ai` upper-cased.
 */
const labelFromModelId = (modelId: string): string => {
  // Only a colon that precedes every slash is a routing prefix; a later colon
  // (e.g. a `:free` variant suffix) belongs to the model name.
  const unprefixed = modelId.replace(/^[^/:]*:/u, "") || modelId;
  const tail = unprefixed.split("/").pop() ?? unprefixed;
  return tail
    .replace(/[-_]+/gu, " ")
    .replace(/\b\w/gu, (char) => char.toUpperCase())
    .replace(/\bGpt\b/gu, "GPT")
    .replace(/\bAi\b/gu, "AI");
};
|
| 192 |
+
|
| 193 |
+
/**
 * Derive a display name for the provider of a model id.
 *
 * After dropping a leading routing prefix (`prefix:` appearing before any `/`), the text
 * before the first `/` is the provider. Ids without a `/` use the generic
 * "Trask web research" name. The result is title-cased, with `Ai` rendered as `AI` and
 * `Openrouter` as `OpenRouter`.
 */
const providerFromModelId = (modelId: string): string => {
  // Only a colon that precedes every slash separates a routing prefix. Splitting on any
  // colon would mistake a variant suffix such as `:free` for the model part.
  const withoutPrefix = modelId.replace(/^[^/:]*:/u, "") || modelId;
  const slashAt = withoutPrefix.indexOf("/");
  const provider = slashAt >= 0 ? withoutPrefix.slice(0, slashAt) : "Trask web research";
  return provider
    .replace(/[-_]+/gu, " ")
    .replace(/\b\w/gu, (char) => char.toUpperCase())
    .replace(/\bAi\b/gu, "AI")
    .replace(/^Openrouter$/u, "OpenRouter");
};
|
| 202 |
+
|
| 203 |
+
/**
 * Normalize a raw model name into `prefix:model` form.
 *
 * Blank input yields `""`. Ids that already start with a routing prefix (a `:` appearing
 * before any `/`) are returned trimmed. Everything else is prefixed with `openrouter:`
 * when it starts with `openrouter/`, otherwise with `litellm:`.
 */
const normalizeWebResearchModelId = (modelId: string): string => {
  const trimmed = modelId.trim();
  if (!trimmed) return "";
  // A colon after the first slash (e.g. the `:free` variant suffix) is part of the
  // model name, not a routing prefix, so such ids still need a prefix.
  if (/^[^/:]*:/u.test(trimmed)) return trimmed;
  return trimmed.startsWith("openrouter/") ? `openrouter:${trimmed}` : `litellm:${trimmed}`;
};
|
| 209 |
+
|
| 210 |
+
/**
 * Decode the JSON array printed by the model-list helper into model options.
 *
 * Non-array JSON yields an empty list. Non-string entries, entries that normalize to an
 * empty id, and repeated ids are skipped; first-seen order is preserved.
 * Invalid JSON propagates the `SyntaxError` raised by `JSON.parse`.
 */
const parseModelList = (stdout: string): HeadlessWebResearchModelOption[] => {
  const decoded: unknown = JSON.parse(stdout);
  if (!Array.isArray(decoded)) {
    return [];
  }

  // A Map keeps insertion order, so it doubles as the "seen" set and the result list.
  const optionsById = new Map<string, HeadlessWebResearchModelOption>();
  for (const entry of decoded) {
    if (typeof entry !== "string") {
      continue;
    }

    const id = normalizeWebResearchModelId(entry);
    if (id === "" || optionsById.has(id)) {
      continue;
    }

    optionsById.set(id, {
      id,
      label: labelFromModelId(id),
      provider: providerFromModelId(id),
    });
  }

  return Array.from(optionsById.values());
};
|
| 229 |
+
|
| 230 |
+
/**
 * List the chat models offered by the vendored `llm_fallbacks` package.
 *
 * Runs an inline Python snippet (`python -c …`) that receives the repo root as its only
 * argument and prints at most 60 model names as a JSON array, then converts that output
 * with `parseModelList`. The wait is capped at `min(config.timeoutMs, 15000)` ms.
 *
 * @throws Error when the snippet times out, cannot be spawned, or exits non-zero.
 */
export const listHeadlessWebResearchModels = async (
  config: WebResearchRuntimeConfig,
): Promise<HeadlessWebResearchModelOption[]> => {
  const repoRoot = config.repoRoot?.trim() || findRepoRoot(process.cwd());
  const python = config.pythonExecutable?.trim() || "python";
  const pythonSource = [
    "import json, sys",
    "from pathlib import Path",
    "root = Path(sys.argv[1]).resolve()",
    "fallbacks = root / 'vendor' / 'llm_fallbacks' / 'src'",
    "sys.path.insert(0, str(fallbacks))",
    "try:",
    "    from llm_fallbacks.config import FREE_CHAT_MODELS",
    "    models = [name for name, _ in FREE_CHAT_MODELS]",
    "except Exception:",
    "    from llm_fallbacks import filter_models",
    "    models = list(filter_models(model_type='chat', free_only=True))",
    "print(json.dumps(models[:60]))",
  ].join("\n");
  const deadlineMs = Math.min(config.timeoutMs, 15_000);

  type Outcome = { stdout: string; stderr: string; code: number | null };

  const outcome = await new Promise<Outcome>((resolvePromise, rejectPromise) => {
    const child = spawn(python, ["-c", pythonSource, repoRoot], {
      cwd: process.cwd(),
      stdio: ["ignore", "pipe", "pipe"],
      env: {
        ...process.env,
        PYTHONIOENCODING: "utf-8",
        PYTHONUTF8: "1",
      },
    });

    const outParts: Buffer[] = [];
    const errParts: Buffer[] = [];
    const asBuffer = (chunk: Buffer | string): Buffer => (Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
    // Ensures only the first of timeout / error / close settles the promise.
    let done = false;

    const watchdog = setTimeout(() => {
      if (!done) {
        done = true;
        child.kill("SIGTERM");
        rejectPromise(new Error("Trask web research model list timed out"));
      }
    }, deadlineMs);

    child.stdout?.on("data", (chunk: Buffer | string) => {
      outParts.push(asBuffer(chunk));
    });
    child.stderr?.on("data", (chunk: Buffer | string) => {
      errParts.push(asBuffer(chunk));
    });

    child.on("error", (error) => {
      if (!done) {
        done = true;
        clearTimeout(watchdog);
        rejectPromise(error);
      }
    });

    child.on("close", (exitCode) => {
      if (!done) {
        done = true;
        clearTimeout(watchdog);
        resolvePromise({
          stdout: Buffer.concat(outParts).toString("utf8").trim(),
          stderr: Buffer.concat(errParts).toString("utf8").trim(),
          code: exitCode,
        });
      }
    });
  });

  if (outcome.code !== 0) {
    throw new Error(
      `Trask web research model list exited ${outcome.code ?? "unknown"}: ${outcome.stderr || outcome.stdout || "no output"}`,
    );
  }

  return parseModelList(outcome.stdout);
};
|
| 297 |
+
|
| 298 |
+
/** @deprecated Use listHeadlessWebResearchModels */
|
| 299 |
+
export const listHeadlessGptResearcherModels = listHeadlessWebResearchModels;
|
| 300 |
+
|
| 301 |
+
/**
 * Check whether the headless research script can start by running it with `--dry-run`.
 *
 * Returns `true` only when the script exists and the child exits with code 0.
 * A missing script, a spawn failure, a non-zero exit, and a probe that is still running
 * after 15 seconds (the child is then sent SIGTERM) all yield `false`; the probe never rejects.
 */
export const probeHeadlessWebResearchDryRun = async (config: WebResearchRuntimeConfig): Promise<boolean> => {
  const repoRoot = config.repoRoot?.trim() || findRepoRoot(process.cwd());
  const script = (config.headlessScriptPath?.trim() || defaultScriptPath(repoRoot)).trim();
  if (!existsSync(script)) {
    return false;
  }

  const python = config.pythonExecutable?.trim() || "python";
  const code = await new Promise<number | null>((resolvePromise) => {
    const child = spawn(python, [script, "--dry-run"], {
      cwd: repoRoot,
      stdio: ["ignore", "pipe", "pipe"],
      env: { ...process.env, PYTHONIOENCODING: "utf-8", PYTHONUTF8: "1" },
    });

    // Ensures only the first of timeout / error / close settles the promise.
    let settled = false;

    const timer = setTimeout(() => {
      if (settled) return;
      child.kill("SIGTERM");
      // A hung probe is reported as "not available" (null !== 0), matching how spawn
      // errors are reported, instead of surfacing as a rejection.
      finish(null);
    }, 15_000);

    const finish = (exitCode: number | null): void => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolvePromise(exitCode);
    };

    // The output is not needed, but the pipes must be drained so a chatty child cannot
    // block on a full pipe buffer.
    child.stdout?.resume();
    child.stderr?.resume();

    child.on("error", () => finish(1));
    child.on("close", (exitCode) => finish(exitCode));
  });

  return code === 0;
};
|
packages/trask/src/web-research.test.ts
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import assert from "node:assert/strict";
|
| 2 |
+
import { describe, test } from "node:test";
|
| 3 |
+
|
| 4 |
+
import { loadWebResearchRuntimeConfig } from "@openkotor/config";
|
| 5 |
+
|
| 6 |
+
import { createWebResearchClient } from "./web-research.js";
|
| 7 |
+
|
| 8 |
+
describe("WebResearchClient", () => {
|
| 9 |
+
test("createWebResearchClient accepts runtime config", () => {
|
| 10 |
+
const cfg = loadWebResearchRuntimeConfig({});
|
| 11 |
+
const client = createWebResearchClient(cfg, {
|
| 12 |
+
openAiApiKey: undefined,
|
| 13 |
+
openAiBaseUrl: undefined,
|
| 14 |
+
openAiDefaultHeaders: undefined,
|
| 15 |
+
firecrawlApiKey: undefined,
|
| 16 |
+
chatModel: "gpt-5.4-mini",
|
| 17 |
+
chatModelFallbacks: [],
|
| 18 |
+
embeddingModel: "text-embedding-3-large",
|
| 19 |
+
databaseUrl: undefined,
|
| 20 |
+
});
|
| 21 |
+
assert.ok(client);
|
| 22 |
+
});
|
| 23 |
+
});
|
| 24 |
+
|
| 25 |
+
describe("loadWebResearchRuntimeConfig", () => {
|
| 26 |
+
test("TRASK_WEB_RESEARCH_TIMEOUT_MS overrides legacy TRASK_RESEARCHWIZARD_TIMEOUT_MS", () => {
|
| 27 |
+
const cfg = loadWebResearchRuntimeConfig({
|
| 28 |
+
TRASK_WEB_RESEARCH_TIMEOUT_MS: "60000",
|
| 29 |
+
TRASK_RESEARCHWIZARD_TIMEOUT_MS: "900000",
|
| 30 |
+
});
|
| 31 |
+
assert.equal(cfg.timeoutMs, 60_000);
|
| 32 |
+
});
|
| 33 |
+
|
| 34 |
+
test("TRASK_WEB_RESEARCH_PYTHON is respected", () => {
|
| 35 |
+
const cfg = loadWebResearchRuntimeConfig({ TRASK_WEB_RESEARCH_PYTHON: "/custom/python" });
|
| 36 |
+
assert.equal(cfg.pythonExecutable, "/custom/python");
|
| 37 |
+
});
|
| 38 |
+
});
|
packages/trask/src/web-research.ts
ADDED
|
@@ -0,0 +1,1559 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import OpenAI from "openai";
|
| 2 |
+
|
| 3 |
+
import { loadSharedAiConfig, type WebResearchRuntimeConfig, type SharedAiConfig } from "@openkotor/config";
|
| 4 |
+
import {
|
| 5 |
+
isDiscordCitationUrl,
|
| 6 |
+
isTraskApprovedBaseUrl,
|
| 7 |
+
isTraskApprovedResearchUrl,
|
| 8 |
+
sourceUrlMatchesDescriptor,
|
| 9 |
+
traskApprovedResearchBaseHosts,
|
| 10 |
+
traskApprovedResearchSources,
|
| 11 |
+
type SearchHit,
|
| 12 |
+
type SearchProvider,
|
| 13 |
+
type SourceDescriptor,
|
| 14 |
+
} from "@openkotor/retrieval";
|
| 15 |
+
|
| 16 |
+
import {
|
| 17 |
+
buildCommunityKnowledgeDigest,
|
| 18 |
+
filterWebArchiveCitationSources,
|
| 19 |
+
mergeCommunityAndWebSources,
|
| 20 |
+
searchHitsToCommunitySources,
|
| 21 |
+
} from "./community-knowledge.js";
|
| 22 |
+
|
| 23 |
+
import {
|
| 24 |
+
listHeadlessWebResearchModels,
|
| 25 |
+
runHeadlessWebResearch,
|
| 26 |
+
type HeadlessWebResearchModelOption,
|
| 27 |
+
} from "./web-research-subprocess.js";
|
| 28 |
+
|
| 29 |
+
/** Result shape resolved by `WebResearchQueryHandler.answerQuestion`. */
export interface WebResearchAnswer {
  /** Answer text. */
  answer: string;
  /** Sources explicitly cited in the final answer shown to users. */
  approvedSources: ReadonlyArray<SourceDescriptor>;
  /** Sources retrieved as candidate evidence for the answer/rewrite stage. */
  retrievedSources: ReadonlyArray<SourceDescriptor>;
  /** Allowlisted URLs the headless researcher touched while gathering evidence. */
  visitedUrls: ReadonlyArray<string>;
}
|
| 38 |
+
|
| 39 |
+
/** A `WebResearchAnswer` that additionally carries the normalized report text. */
export interface WebResearchBriefAnswer extends WebResearchAnswer {
  /** Normalized research report text used for proactive semantic gating. */
  researchReport: string;
}
|
| 43 |
+
|
| 44 |
+
/**
 * Progress notification with fine-grained phases, intended for Holocron
 * clients polling thread history.
 */
export interface WebResearchProgressEvent {
  /** Which stage of the research pipeline this event belongs to. */
  phase: "gather" | "report" | "sources" | "compose";
  /** Optional human-readable detail for the event. */
  detail?: string;
  /** Optional sources associated with the event. */
  sources?: ReadonlyArray<SourceDescriptor>;
}
|
| 50 |
+
|
| 51 |
+
/** Optional per-request settings accepted by `answerQuestion`. */
export interface WebResearchQueryOptions {
  /** Preferred rewrite model id, e.g. `openrouter:openrouter/auto` or `litellm:moonshotai/kimi-k2`. */
  model?: string;
  /** Optional per-request source enablement and weight hints from Holocron's Source Prioritization dialog. */
  sourcePreferences?: ReadonlyArray<WebResearchSourcePreference>;
  /** Imported Discord chunks and/or live channel hits merged before web research. */
  localHits?: ReadonlyArray<SearchHit>;
}
|
| 59 |
+
|
| 60 |
+
/** Optional wiring for local (imported-history) search. */
export interface WebResearchClientFactoryOptions {
  /** When set, searches imported chunks when `localHits` are not passed per request. */
  localSearchProvider?: SearchProvider;
  /** Resolves discord:// chunk URLs when searching imported history. */
  discordGuildId?: string;
}
|
| 66 |
+
|
| 67 |
+
/** One entry of `WebResearchQueryOptions.sourcePreferences`. */
export interface WebResearchSourcePreference {
  /** Optional display name for the source. */
  name?: string;
  /** URL identifying the source. */
  url: string;
  /** Weight hint for the source (scale is not defined in this part of the file). */
  weight: number;
  /** Whether the source is enabled for the request. */
  enabled: boolean;
}
|
| 73 |
+
|
| 74 |
+
/** Model option exposed by this module; declares no members beyond `HeadlessWebResearchModelOption`. */
export interface WebResearchModelOption extends HeadlessWebResearchModelOption {}
|
| 75 |
+
|
| 76 |
+
/**
 * Structural type for adapters that only need full Q&A (e.g. Trask HTTP `/ask`).
 */
export interface WebResearchQueryHandler {
  /**
   * Answers `query`, optionally reporting progress events and honouring
   * per-request options.
   */
  answerQuestion(
    query: string,
    onProgress?: (event: WebResearchProgressEvent) => void,
    options?: WebResearchQueryOptions,
  ): Promise<WebResearchAnswer>;

  /** Optional: lists the model options the handler can offer. */
  listModels?(): Promise<ReadonlyArray<WebResearchModelOption>>;
}
|
| 85 |
+
|
| 86 |
+
/** Built-in model list holding a single recommended "auto" entry. */
const DEFAULT_WEB_RESEARCH_MODELS: readonly WebResearchModelOption[] = [
  {
    id: "auto",
    label: "Auto",
    provider: "Trask web research",
    recommended: true,
  },
];
|
| 89 |
+
|
| 90 |
+
/**
 * Shape of the payload read by the helpers in this module. Every field is
 * optional and the URL lists may be `null`, so readers must guard each access.
 */
interface WebResearchResponsePayload {
  /** Report text, when present. */
  report?: string | null;
  /** URL bookkeeping attached to the report. */
  research_information?: {
    source_urls?: ReadonlyArray<string> | null;
    cited_urls?: ReadonlyArray<string> | null;
    retrieved_urls?: ReadonlyArray<string> | null;
    visited_urls?: ReadonlyArray<string> | null;
    query_domains?: ReadonlyArray<string> | null;
    allowed_url_prefixes?: ReadonlyArray<string> | null;
    rejected_source_urls?: ReadonlyArray<string> | null;
  };
}
|
| 102 |
+
|
| 103 |
+
/** Returns the research task text: the query with surrounding whitespace removed. */
const buildResearchTask = (query: string): string => query.trim();
|
| 106 |
+
|
| 107 |
+
/**
 * Builds the instruction prompt for the full answer: an intro sentence, a
 * "Requirements:" header and the requirement bullets, joined with newlines.
 */
const buildCustomPrompt = (): string => {
  const intro =
    "Answer the user's question as a Discord-native KOTOR assistant reply using only the provided research context.";
  const requirements: readonly string[] = [
    "- Lead with the answer, not an introduction.",
    "- Sound direct, practical, and helpful.",
    "- Keep the answer concise: at most 3 short paragraphs or 5 compact bullets total before sources.",
    "- Do not describe your research process, retrieval steps, indexing, backend systems, or source policy unless the user explicitly asks.",
    "- Include inline numeric citations like [1] tied to concrete claims.",
    ' - End with the exact heading "Sources" on its own line.',
    "- Under Sources, list only the sources you cited, each on its own numbered line in the format: 1. Source Name - URL",
    "- Do not add markdown headings other than the final Sources heading.",
  ];
  return [intro, "Requirements:", ...requirements].join("\n");
};
|
| 121 |
+
|
| 122 |
+
/**
 * Builds the instruction prompt for the compact research digest: an intro
 * sentence, a "Constraints:" header and the constraint bullets, joined with
 * newlines.
 */
const buildCustomPromptBrief = (): string => {
  const intro =
    "Produce a compact research digest for Star Wars: Knights of the Old Republic (KOTOR 1/2) modding questions.";
  const constraints: readonly string[] = [
    "- Stay under ~900 words; bullet key facts when possible.",
    "- Do not narrate tooling, retrieval steps, or how you searched.",
    "- Prefer actionable answers over background essays.",
    "- Include inline numeric citations like [1] tied to concrete claims.",
    ' - End with the exact heading "Sources" on its own line.',
    "- Under Sources, list only cited sources as numbered lines: 1. Source Name - URL",
  ];
  return [intro, "Constraints:", ...constraints].join("\n");
};
|
| 134 |
+
|
| 135 |
+
/**
 * Removes from the end of `value` every character that occurs in `chars`.
 * Stops at the first trailing character not in `chars`; an empty `chars`
 * leaves `value` untouched.
 */
const stripTrailingChars = (value: string, chars: string): string => {
  let keep = value.length;
  for (; keep > 0; keep -= 1) {
    if (!chars.includes(value[keep - 1]!)) break;
  }
  return value.slice(0, keep);
};
|
| 140 |
+
|
| 141 |
+
/** Removes every trailing "/" from `value` (delegates to `stripTrailingChars`). */
const stripTrailingSlashes = (value: string): string => stripTrailingChars(value, "/");
|
| 142 |
+
|
| 143 |
+
/** Trims surrounding whitespace from `value`, then removes every trailing "?". */
const stripTrailingQuestionMarks = (value: string): string => stripTrailingChars(value.trim(), "?");
|
| 144 |
+
|
| 145 |
+
/**
 * Collapses each run of blank (empty or whitespace-only) lines into a single
 * empty line, then trims the whole result.
 */
const collapseExcessiveNewlines = (value: string): string => {
  const kept: string[] = [];
  let previousWasBlank = false;
  for (const row of value.split("\n")) {
    const blank = row.trim() === "";
    if (!blank) {
      kept.push(row);
    } else if (!previousWasBlank) {
      // Only the first blank line of a run survives, normalized to "".
      kept.push("");
    }
    previousWasBlank = blank;
  }
  return kept.join("\n").trim();
};
|
| 160 |
+
|
| 161 |
+
/**
 * True when the line, after trimming and dropping any leading run of `#`
 * (plus the whitespace after it), is exactly "sources" or "references"
 * (case-insensitive).
 */
const isSourcesHeadingLine = (line: string): boolean => {
  const bare = line.trim().replace(/^#+/u, "").trimStart();
  return /^sources$/iu.test(bare) || /^references$/iu.test(bare);
};
|
| 169 |
+
|
| 170 |
+
/**
 * Returns the text that precedes the first Sources/References heading line
 * (CRLF normalized to LF). When no such heading exists the whole normalized
 * text is returned.
 */
const splitAtSourcesHeading = (value: string): string => {
  const text = value.replace(/\r\n/g, "\n");
  const rows = text.split("\n");
  const headingAt = rows.findIndex((row) => isSourcesHeadingLine(row));
  return headingAt === -1 ? text : rows.slice(0, headingAt).join("\n");
};
|
| 180 |
+
|
| 181 |
+
/**
 * Extracts URLs from the lines after the first Sources/References heading.
 * When no heading is present, URLs are extracted from the whole text instead.
 */
const extractSourceSectionUrls = (value: string): string[] => {
  const text = value.replace(/\r\n/g, "\n");
  const rows = text.split("\n");
  const headingAt = rows.findIndex((row) => isSourcesHeadingLine(row));
  const scope = headingAt === -1 ? text : rows.slice(headingAt + 1).join("\n");
  return extractUrls(scope);
};
|
| 191 |
+
|
| 192 |
+
/** True when `ch` is whitespace or one of `)`, `>`, `]` — the characters at which `extractUrls` ends a URL. */
const isUrlTerminator = (ch: string): boolean => /\s/u.test(ch) || ch === ")" || ch === ">" || ch === "]";
|
| 193 |
+
|
| 194 |
+
/**
 * Extracts the distinct http(s) URLs found in `value`, in first-seen order.
 *
 * A URL starts at a case-insensitive `http://` or `https://` and runs until
 * the next terminator (see `isUrlTerminator`) or end of text; trailing
 * sentence punctuation (`.,;:!?`) is stripped.
 *
 * The scheme is located with a case-insensitive regex on `value` itself.
 * Searching a lower-cased copy and reusing its offsets is unsafe because
 * `toLowerCase()` can change the string length (e.g. U+0130), which would
 * shift every offset after that character.
 */
const extractUrls = (value: string): string[] => {
  const urls: string[] = [];
  // No `u` flag: with `i` alone only ASCII letters match the scheme letters.
  const schemePattern = /https?:\/\//gi;
  let match: RegExpExecArray | null;
  while ((match = schemePattern.exec(value)) !== null) {
    const start = match.index;
    let end = start;
    while (end < value.length && !isUrlTerminator(value[end]!)) end += 1;
    urls.push(stripTrailingChars(value.slice(start, end), ".,;:!?"));
    // Resume after this URL so a scheme embedded inside it is not re-matched.
    schemePattern.lastIndex = end;
  }
  return [...new Set(urls)];
};
|
| 214 |
+
|
| 215 |
+
/**
 * Replaces every markdown link `[label](url)` whose url starts with
 * `http://` or `https://` by the string returned from `onLink(label, url)`.
 * All other text — including bracket pairs that are not links and links with
 * other schemes — is copied through unchanged.
 *
 * Output matches a character-by-character scan, but plain text is copied in
 * slices and the position of the next `]` is reused between iterations, so
 * inputs with many unmatched `[` no longer trigger repeated full rescans.
 *
 * @param text   Text that may contain markdown links.
 * @param onLink Produces the replacement for one http(s) link.
 * @returns The rewritten text.
 */
const rewriteMarkdownLinks = (
  text: string,
  onLink: (label: string, url: string) => string,
): string => {
  let result = "";
  let i = 0;
  // Cached index of the first "]" after the current "["; valid while it is > open.
  let closeBracket = -1;
  while (i < text.length) {
    const open = text.indexOf("[", i);
    if (open === -1) break;
    result += text.slice(i, open);
    i = open;
    if (closeBracket <= open) closeBracket = text.indexOf("]", open + 1);
    // No "]" left: nothing after this point can be a link, copy the rest verbatim.
    if (closeBracket === -1) break;
    if (text[closeBracket + 1] !== "(") {
      result += "[";
      i = open + 1;
      continue;
    }
    const closeParen = text.indexOf(")", closeBracket + 2);
    // No ")" after this "](": later candidates search from further right and fail too.
    if (closeParen === -1) break;
    const label = text.slice(open + 1, closeBracket);
    const url = text.slice(closeBracket + 2, closeParen);
    if (url.startsWith("http://") || url.startsWith("https://")) {
      result += onLink(label, url);
    } else {
      result += text.slice(open, closeParen + 1);
    }
    i = closeParen + 1;
  }
  return result + text.slice(i);
};
|
| 251 |
+
|
| 252 |
+
/**
 * True when the line opens with 1–6 `#` characters immediately followed by a
 * Unicode whitespace character (ATX heading). Seven or more leading `#`, a
 * bare `#` run, or leading indentation all yield false.
 */
const isAtxMarkdownHeadingLine = (line: string): boolean => /^#{1,6}\s/u.test(line);
|
| 264 |
+
|
| 265 |
+
/** Drops every line that is an ATX markdown heading; all other lines are kept as-is. */
const stripMarkdownHeaders = (text: string): string => {
  const kept: string[] = [];
  for (const row of text.split("\n")) {
    if (!isAtxMarkdownHeadingLine(row)) kept.push(row);
  }
  return kept.join("\n");
};
|
| 270 |
+
|
| 271 |
+
/**
 * Heuristic for a pipe-delimited markdown table row: after trimming, the line
 * has at least two characters and both starts and ends with `|`.
 */
const looksLikeMarkdownTableRow = (line: string): boolean => {
  const cell = line.trim();
  if (cell.length < 2) return false;
  return cell.startsWith("|") && cell.endsWith("|");
};
|
| 276 |
+
|
| 277 |
+
/** Drops every line that looks like a markdown table row; all other lines are kept as-is. */
const stripMarkdownTableRows = (text: string): string => {
  const kept: string[] = [];
  for (const row of text.split("\n")) {
    if (!looksLikeMarkdownTableRow(row)) kept.push(row);
  }
  return kept.join("\n");
};
|
| 282 |
+
|
| 283 |
+
/** Removes every `*` character from `text` (so `**bold**` becomes `bold`). */
const stripAsteriskRuns = (text: string): string => text.split("*").join("");
|
| 296 |
+
|
| 297 |
+
/**
 * Splits `text` into paragraphs separated by one or more blank
 * (empty or whitespace-only) lines. Each paragraph keeps its internal line
 * breaks and is trimmed; empty paragraphs are never returned.
 */
const splitParagraphs = (text: string): string[] => {
  const groups: string[][] = [[]];
  for (const row of text.split("\n")) {
    if (row.trim() === "") {
      groups.push([]);
    } else {
      groups[groups.length - 1]!.push(row);
    }
  }
  return groups
    .filter((rows) => rows.length > 0)
    .map((rows) => rows.join("\n").trim())
    .filter((paragraph) => paragraph.length > 0);
};
|
| 313 |
+
|
| 314 |
+
/**
 * Canonical form of a URL string for comparison and de-duplication:
 * surrounding whitespace removed and trailing "/" characters stripped.
 *
 * Whitespace is trimmed before stripping slashes; otherwise an input such as
 * `"https://x.com/ "` would keep its trailing slash and compare unequal to
 * `"https://x.com"`. The final trim keeps results for inputs like
 * `"https://x.com /"` the same as before.
 */
const normalizeUrl = (value: string): string => stripTrailingSlashes(value.trim()).trim();
|
| 315 |
+
|
| 316 |
+
/**
 * Short host label for a URL: the hostname without a leading `www.`, in lower
 * case. When `url` cannot be parsed, its first 48 characters are returned.
 */
const hostnameHint = (url: string): string => {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return url.slice(0, 48);
  }
  return parsed.hostname.replace(/^www\./, "").toLowerCase();
};
|
| 323 |
+
|
| 324 |
+
/**
 * De-duplicates URLs by their normalized form, keeping first-seen order
 * (stable ordering for Holocron pulses). Entries that normalize to an empty
 * string are dropped; the returned values are the normalized URLs.
 */
const uniqueUrlsPreserveOrder = (urls: readonly string[]): string[] => {
  // A Set iterates in insertion order and ignores repeated adds.
  const ordered = new Set<string>();
  for (const raw of urls) {
    const normalized = normalizeUrl(raw);
    if (normalized) ordered.add(normalized);
  }
  return [...ordered];
};
|
| 336 |
+
|
| 337 |
+
/**
 * Defensive reader for a payload URL list: returns only the string entries of
 * `values`, or an empty array when `values` is not an array (null/undefined).
 */
const payloadUrls = (values: readonly string[] | null | undefined): string[] => {
  if (!Array.isArray(values)) return [];
  const strings: string[] = [];
  for (const entry of values) {
    if (typeof entry === "string") strings.push(entry);
  }
  return strings;
};
|
| 339 |
+
|
| 340 |
+
/**
 * True when `url` is a public web citation URL AND is accepted by at least
 * one of: a descriptor in `sourcePool`, the approved-research-URL check, or
 * the approved-base-URL check (evaluated in that order).
 */
const isAllowedSourceUrl = (url: string, sourcePool: readonly SourceDescriptor[]): boolean =>
  isPublicWebCitationUrl(url)
  && (
    sourcePool.some((source) => sourceUrlMatchesDescriptor(url, source))
    || isTraskApprovedResearchUrl(url, sourcePool)
    || isTraskApprovedBaseUrl(url)
  );
|
| 346 |
+
|
| 347 |
+
/**
 * Visited URLs from the web research payload (Holocron live facet pings):
 * `visited_urls` normalized and de-duplicated in order, restricted to URLs
 * allowed for `approvedSources`.
 */
const collectVisitedUrlsFromPayload = (
  payload: WebResearchResponsePayload,
  approvedSources: readonly SourceDescriptor[],
): string[] => {
  const visited = payloadUrls(payload.research_information?.visited_urls);
  const allowed: string[] = [];
  for (const url of uniqueUrlsPreserveOrder(visited)) {
    if (isAllowedSourceUrl(url, approvedSources)) allowed.push(url);
  }
  return allowed;
};
|
| 357 |
+
|
| 358 |
+
/**
 * Returns the payload's `rejected_source_urls` as normalized, de-duplicated
 * strings in first-seen order; empty when the field is missing or not an array.
 */
const collectRejectedUrlsFromPayload = (payload: WebResearchResponsePayload): string[] => {
  const rejected = payloadUrls(payload.research_information?.rejected_source_urls);
  return uniqueUrlsPreserveOrder(rejected);
};
|
| 364 |
+
|
| 365 |
+
/** Upper bound on the number of "gather" events emitted per payload. */
const MAX_ARCHIVE_PROBE_EVENTS = 28;

/**
 * Emits one "gather" progress event per visited, allowed URL in the payload,
 * capped at `MAX_ARCHIVE_PROBE_EVENTS`. URLs that match an approved source are
 * reported as `Facet · <source name>` with that source attached; others as
 * `Touch · <host>`. Does nothing when no `onProgress` callback is supplied.
 */
const emitArchiveProbeEvents = (
  payload: WebResearchResponsePayload,
  approvedSources: readonly SourceDescriptor[],
  onProgress?: (event: WebResearchProgressEvent) => void,
): void => {
  if (!onProgress) return;

  const visited = collectVisitedUrlsFromPayload(payload, approvedSources)
    .slice(0, MAX_ARCHIVE_PROBE_EVENTS * 2);

  for (const url of visited.slice(0, MAX_ARCHIVE_PROBE_EVENTS)) {
    const matched = matchApprovedSource(url, approvedSources);
    const host = hostnameHint(url);
    if (matched) {
      onProgress({ phase: "gather", detail: `Facet · ${matched.name}`, sources: [matched] });
    } else {
      onProgress({ phase: "gather", detail: `Touch · ${host}` });
    }
  }
};
|
| 389 |
+
|
| 390 |
+
/**
 * Finds the first approved source whose normalized `homeUrl` equals the
 * normalized `url` or is a path prefix of it (`homeUrl + "/"`).
 * Returns `undefined` when none matches.
 */
const matchApprovedSource = (
  url: string,
  approvedSources: readonly SourceDescriptor[],
): SourceDescriptor | undefined => {
  const candidate = normalizeUrl(url);
  for (const source of approvedSources) {
    const root = normalizeUrl(source.homeUrl);
    if (candidate === root || candidate.startsWith(`${root}/`)) return source;
  }
  return undefined;
};
|
| 401 |
+
|
| 402 |
+
/**
 * Builds a display label for `url` relative to `source`.
 *
 * - Same path as the source home (ignoring trailing slashes): the source name.
 * - Otherwise `"<source name>: <tail>"`, where the tail is the last two path
 *   segments below the home path with a leading `blob/<ref>/`, `tree/<ref>/`
 *   or `wiki/` removed and runs of `-`/`_` turned into spaces. A `#L12` or
 *   `#L12-L20` style hash is appended verbatim.
 * - Any parsing/decoding failure falls back to the source name.
 */
const sourceUrlLabel = (source: SourceDescriptor, url: string): string => {
  try {
    const target = new URL(url);
    const home = new URL(source.homeUrl);
    const targetPath = decodeURIComponent(stripTrailingSlashes(target.pathname));
    const homePath = decodeURIComponent(stripTrailingSlashes(home.pathname));
    if (targetPath === homePath) return source.name;

    const below = targetPath.startsWith(`${homePath}/`)
      ? targetPath.slice(homePath.length + 1)
      : targetPath;
    const segments = below
      .replace(/^blob\/[^/]+\//u, "")
      .replace(/^tree\/[^/]+\//u, "")
      .replace(/^wiki\//u, "")
      .split("/")
      .filter(Boolean);
    const tail = segments.slice(-2).join("/").replace(/[-_]+/gu, " ").trim();
    if (!tail) return source.name;

    // An empty hash never matches the pattern, so no separate emptiness check is needed.
    const lineAnchor = /^#L\d+(?:-L\d+)?$/iu.test(target.hash) ? target.hash : "";
    return `${source.name}: ${tail}${lineAnchor}`;
  } catch {
    return source.name;
  }
};
|
| 427 |
+
|
| 428 |
+
/**
 * Produces a source descriptor that points at the exact `url`.
 *
 * - When the URL falls under a catalog source, that source is cloned with
 *   `homeUrl` set to the normalized URL, a label from `sourceUrlLabel`, and an
 *   id suffixed with the URL unless it is the source's own root.
 * - Otherwise, when the URL is on an approved base host, a generic "website"
 *   descriptor named after the host is synthesized.
 * - Otherwise `undefined`.
 */
const exactSourceFromUrl = (url: string, approvedSources: readonly SourceDescriptor[]): SourceDescriptor | undefined => {
  const exactUrl = normalizeUrl(url);
  const catalogMatch = matchApprovedSource(url, approvedSources);

  if (!catalogMatch) {
    if (!isTraskApprovedBaseUrl(url)) return undefined;
    const host = hostnameHint(url);
    return {
      id: `approved-web:${exactUrl}`,
      name: host,
      kind: "website",
      homeUrl: exactUrl,
      description: `Approved web source (${host})`,
      freshnessPolicy: "live web research",
      approvalScope: "approved research host",
      tags: [host],
    };
  }

  const isCatalogRoot = exactUrl === normalizeUrl(catalogMatch.homeUrl);
  return {
    ...catalogMatch,
    id: isCatalogRoot ? catalogMatch.id : `${catalogMatch.id}:${exactUrl}`,
    name: sourceUrlLabel(catalogMatch, exactUrl),
    homeUrl: exactUrl,
  };
};
|
| 453 |
+
|
| 454 |
+
/** True when the normalized `url` equals the normalized `homeUrl` of any approved source. */
const isCatalogRootUrl = (url: string, approvedSources: readonly SourceDescriptor[]): boolean => {
  const target = normalizeUrl(url);
  for (const source of approvedSources) {
    if (normalizeUrl(source.homeUrl) === target) return true;
  }
  return false;
};
|
| 458 |
+
|
| 459 |
+
/**
 * Turns raw URLs into at most six source descriptors.
 *
 * Disallowed URLs are dropped and the rest de-duplicated in order. When at
 * least one URL is more specific than a catalog root, bare catalog-root URLs
 * are skipped. Descriptors are de-duplicated by normalized `homeUrl`.
 */
const materializeSourcesFromUrls = (
  urls: readonly string[],
  sourcePool: readonly SourceDescriptor[],
): readonly SourceDescriptor[] => {
  const allowed = urls.filter((url) => isAllowedSourceUrl(url, sourcePool));
  const candidates = uniqueUrlsPreserveOrder(allowed);
  const hasPreciseUrl = candidates.some((url) => !isCatalogRootUrl(url, sourcePool));

  const seenHomeUrls = new Set<string>();
  const picked: SourceDescriptor[] = [];
  for (const url of candidates) {
    if (hasPreciseUrl && isCatalogRootUrl(url, sourcePool)) continue;
    const source = exactSourceFromUrl(url, sourcePool);
    if (!source) continue;

    const key = normalizeUrl(source.homeUrl);
    if (seenHomeUrls.has(key)) continue;
    seenHomeUrls.add(key);
    picked.push(source);
  }

  return picked.slice(0, 6);
};
|
| 481 |
+
|
| 482 |
+
/**
 * Sources cited by a report: URLs from the report's Sources section first,
 * then the payload's `cited_urls` and `source_urls`, materialized against
 * `approvedSources`.
 */
const collectCitedSources = (
  report: string,
  approvedSources: readonly SourceDescriptor[],
  payload: WebResearchResponsePayload,
): readonly SourceDescriptor[] => {
  const info = payload.research_information;
  const fromReport = extractSourceSectionUrls(report);
  const cited = payloadUrls(info?.cited_urls);
  const listed = payloadUrls(info?.source_urls);
  return materializeSourcesFromUrls(fromReport.concat(cited, listed), approvedSources);
};
|
| 494 |
+
|
| 495 |
+
/**
 * Sources retrieved as candidate evidence: the payload's `retrieved_urls`,
 * `cited_urls` and `source_urls` (in that order), followed by URLs from the
 * report's Sources section, materialized against `approvedSources`.
 */
const collectRetrievedSources = (
  report: string,
  approvedSources: readonly SourceDescriptor[],
  payload: WebResearchResponsePayload,
): readonly SourceDescriptor[] => {
  const info = payload.research_information;
  const retrieved = payloadUrls(info?.retrieved_urls);
  const cited = payloadUrls(info?.cited_urls);
  const listed = payloadUrls(info?.source_urls);
  const fromReport = extractSourceSectionUrls(report);
  return materializeSourcesFromUrls(retrieved.concat(cited, listed, fromReport), approvedSources);
};
|
| 508 |
+
|
| 509 |
+
/** Sources cited in free text: URLs from its Sources section, materialized against `sourcePool`. */
const collectCitedSourcesFromText = (
  text: string,
  sourcePool: readonly SourceDescriptor[],
): readonly SourceDescriptor[] => {
  const sectionUrls = extractSourceSectionUrls(text);
  return materializeSourcesFromUrls(sectionUrls, sourcePool);
};
|
| 513 |
+
|
| 514 |
+
/**
 * True when the (already trimmed) line starts with `##`, optional whitespace,
 * then "table of contents" in any letter case. Deeper headings (`###`) do not
 * match.
 */
const startsWithTableOfContentsHeading = (trimmed: string): boolean =>
  /^##\s*table of contents/u.test(trimmed.toLowerCase());
|
| 521 |
+
|
| 522 |
+
/** True when the line starts with exactly `##` followed by a Unicode whitespace character. */
const startsWithH2WithSpace = (trimmed: string): boolean => /^##\s/u.test(trimmed);
|
| 525 |
+
|
| 526 |
+
/** Single-level ATX heading (H1 title line): one `#` followed by whitespace, never `## …`. */
const isH1AtxHeadingLine = (trimmed: string): boolean => /^#\s/u.test(trimmed);
|
| 532 |
+
|
| 533 |
+
/**
 * Cleans a raw research report:
 * - removes a `## Table of Contents` section (the heading and every line up to
 *   the next `## ` heading, Sources/References heading, or H1 line);
 * - removes all H1 title lines;
 * - collapses runs of blank lines and trims the result.
 */
const normalizeReport = (value: string): string => {
  const kept: string[] = [];
  let insideToc = false;

  for (const raw of value.replace(/\r\n/g, "\n").split("\n")) {
    const row = raw.trim();

    if (startsWithTableOfContentsHeading(row)) {
      insideToc = true;
      continue;
    }

    if (insideToc) {
      const endsToc =
        startsWithH2WithSpace(row)
        || isSourcesHeadingLine(raw)
        || isH1AtxHeadingLine(row);
      if (!endsToc) continue;
      insideToc = false;
    }

    // H1 lines are dropped even when they are what ended the TOC section.
    if (!isH1AtxHeadingLine(row)) kept.push(raw);
  }

  return collapseExcessiveNewlines(kept.join("\n"));
};
|
| 559 |
+
|
| 560 |
+
/**
 * Renders the trailing "Sources" block: the heading line followed by one
 * `N. Name - URL` line per source (1-based numbering).
 */
const formatSourcesSection = (sources: readonly SourceDescriptor[]): string => {
  const rows: string[] = ["Sources"];
  sources.forEach((source, index) => {
    rows.push(`${index + 1}. ${source.name} - ${source.homeUrl}`);
  });
  return rows.join("\n");
};
|
| 566 |
+
|
| 567 |
+
/**
 * Counts the distinct public web citation URLs across the payload's
 * `cited_urls`, `retrieved_urls`, `visited_urls` and `source_urls`.
 */
const countPayloadWebUrls = (payload: WebResearchResponsePayload): number => {
  const info = payload.research_information;
  const combined = payloadUrls(info?.cited_urls).concat(
    payloadUrls(info?.retrieved_urls),
    payloadUrls(info?.visited_urls),
    payloadUrls(info?.source_urls),
  );
  let count = 0;
  for (const url of uniqueUrlsPreserveOrder(combined)) {
    if (isPublicWebCitationUrl(url)) count += 1;
  }
  return count;
};
|
| 577 |
+
|
| 578 |
+
const LEGACY_APPROVED_ARCHIVE_BULLET_MARKER =
  "is an approved archive page that may answer questions about";

/**
 * Detects the legacy failure copy: text that starts with a markdown bullet
 * (`-`, optional whitespace, one non-whitespace token) and contains the fixed
 * marker phrase somewhere after that token. Matching is case-insensitive.
 *
 * The bullet prefix uses a pattern whose two parts (`\s*`, `\S+`) cannot
 * overlap, and the marker is located with `indexOf`, so there is no
 * `.*`-style backtracking on adversarial input.
 */
const hasLegacyApprovedArchiveFailureBullet = (normalized: string): boolean => {
  const lower = normalized.toLowerCase();
  const marker = LEGACY_APPROVED_ARCHIVE_BULLET_MARKER.toLowerCase();
  const bulletPrefix = /^-\s*\S+/u.exec(lower);
  if (bulletPrefix === null) return false;
  return lower.indexOf(marker, bulletPrefix[0].length) !== -1;
};
|
| 595 |
+
|
| 596 |
+
/**
 * Decide whether a report is failure copy rather than a real answer.
 *
 * - A report that begins with "I could not complete live archive synthesis"
 *   is always a failure.
 * - Otherwise, when the payload carries fewer public web URLs than
 *   `MIN_HOLOCRON_WEB_CITATIONS`, the legacy "approved archive page" bullet
 *   also counts as a failure.
 */
const isSynthesisFailureReport = (report: string, payload: WebResearchResponsePayload): boolean => {
  const text = report.trim();
  const hasEnoughWebUrls = countPayloadWebUrls(payload) >= MIN_HOLOCRON_WEB_CITATIONS;

  if (/^i could not complete live archive synthesis\b/iu.test(text)) return true;
  // With enough web URLs the legacy bullet alone is not treated as a failure.
  return !hasEnoughWebUrls && hasLegacyApprovedArchiveFailureBullet(text);
};
|
| 610 |
+
|
| 611 |
+
/**
 * Build the answer used when sources exist but no grounded text could be
 * produced: a two-line notice followed by the numbered Sources section.
 * With no sources at all, returns the generic "could not complete" message.
 */
const sourceOnlyFallbackAnswer = (query: string, sources: readonly SourceDescriptor[]): string => {
  if (sources.length === 0) {
    return "I could not complete live archive synthesis for this question right now.";
  }

  const subject = stripTrailingQuestionMarks(query) || "this question";
  const lead = `I found candidate sources for ${subject}, but I could not support a grounded answer from the retrieved evidence.`;
  const hint = "Review the sources below or try a narrower wording.";
  // One blank line separates the notice from the Sources heading.
  return `${lead}\n${hint}\n\n${formatSourcesSection(sources)}`;
};
|
| 621 |
+
|
| 622 |
+
/** Per-attempt time limit, in milliseconds, for a rewrite completion call. */
const DEFAULT_REWRITE_TIMEOUT_MS = 15_000;
/** Upper bound on how many models are tried for one rewrite. */
const MAX_REWRITE_ATTEMPTS = 2;

/**
 * Normalize a caller-supplied model id for the rewrite step.
 *
 * Trims the value and removes one leading `litellm:` or `openrouter:` prefix
 * (checked in that order). Returns `undefined` when nothing usable remains.
 */
const normalizePreferredRewriteModel = (model: string | undefined): string | undefined => {
  const candidate = model?.trim();
  if (!candidate) return undefined;

  for (const prefix of ["litellm:", "openrouter:"]) {
    if (candidate.startsWith(prefix)) {
      const bare = candidate.slice(prefix.length).trim();
      return bare === "" ? undefined : bare;
    }
  }
  return candidate;
};
|
| 632 |
+
|
| 633 |
+
/**
 * Settle with `promise`, or reject with `rewrite timed out after <n>ms` if it
 * has not settled within `timeoutMs`.
 *
 * The timer is cleared as soon as either side wins. The underlying work is
 * not cancelled; a late rejection from `promise` is still observed by the
 * race, so it does not surface as an unhandled rejection.
 */
const withTimeout = async <T>(promise: Promise<T>, timeoutMs: number): Promise<T> => {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const deadline = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`rewrite timed out after ${timeoutMs}ms`));
    }, timeoutMs);
  });

  try {
    return await Promise.race([promise, deadline]);
  } finally {
    clearTimeout(timer);
  }
};
|
| 651 |
+
|
| 652 |
+
/**
 * Deterministic (no-LLM) rewrite of a research report into a Discord answer.
 *
 * Steps:
 * 1. With no sources, return the degraded-answer message.
 * 2. If the report is itself failure copy, return the source-only fallback.
 * 3. Otherwise drop the report's Sources section, turn markdown links to
 *    approved sources into `text [n]` citations, strip headers, table rows
 *    and asterisk runs, and keep at most two leading paragraphs (about 900
 *    characters).
 * 4. If no body text survives, return the source-only fallback instead of a
 *    bare citation marker.
 * 5. Append ` [1]` when the summary has no numeric citation, then the
 *    numbered Sources section.
 */
const fallbackDiscordRewrite = (
  query: string,
  report: string,
  sources: readonly SourceDescriptor[],
): string => {
  if (sources.length === 0) {
    return degradedAnswerFallback(query, sources);
  }
  const normalized = normalizeReport(report);
  if (/^i could not complete live archive synthesis\b/iu.test(normalized)) {
    return sourceOnlyFallbackAnswer(query, sources);
  }

  // 1-based citation number for each source, keyed by normalized home URL.
  const sourceIndexByUrl = new Map<string, number>(
    sources.map((source, index) => [normalizeUrl(source.homeUrl), index + 1]),
  );

  const linkedBody = rewriteMarkdownLinks(splitAtSourcesHeading(normalized), (text, url) => {
    const matchedSource = matchApprovedSource(url, sources);
    const citationIndex = matchedSource ? sourceIndexByUrl.get(normalizeUrl(matchedSource.homeUrl)) : undefined;
    return citationIndex ? `${text} [${citationIndex}]` : text;
  });
  const bodyOnly = collapseExcessiveNewlines(
    stripAsteriskRuns(stripMarkdownTableRows(stripMarkdownHeaders(linkedBody))),
  );

  // Keep up to two paragraphs. The first is always kept; the second only if
  // the running total stays within 900 characters.
  const selected: string[] = [];
  let totalLength = 0;
  for (const paragraph of splitParagraphs(bodyOnly)) {
    if (selected.length >= 2) break;
    if (totalLength + paragraph.length > 900 && selected.length > 0) break;
    selected.push(paragraph);
    totalLength += paragraph.length;
  }

  const summary = selected.join("\n\n").trim() || bodyOnly.slice(0, 900).trim();
  if (!summary) {
    // Nothing readable survived stripping (for example a report that was only
    // headings, tables or a Sources list). Emitting just "[1]" would look like
    // an answer, so say explicitly that only sources are available.
    return sourceOnlyFallbackAnswer(query, sources);
  }

  const citedSummary = /\[\d+\]/.test(summary) ? summary : `${summary} [1]`;
  return `${citedSummary}\n\n${formatSourcesSection(sources)}`;
};
|
| 707 |
+
|
| 708 |
+
/**
 * Deterministic (no-LLM) short-form rewrite of a research report.
 *
 * Same pipeline as the full fallback rewrite, except that table rows are not
 * stripped and only the first paragraph is used, capped at 420 characters.
 * If no body text survives, the source-only fallback is returned instead of
 * a bare citation marker.
 */
const fallbackDiscordBrief = (query: string, report: string, sources: readonly SourceDescriptor[]): string => {
  if (sources.length === 0) {
    return degradedAnswerFallback(query, sources);
  }
  const normalized = normalizeReport(report);
  if (/^i could not complete live archive synthesis\b/iu.test(normalized)) {
    return sourceOnlyFallbackAnswer(query, sources);
  }

  // 1-based citation number for each source, keyed by normalized home URL.
  const sourceIndexByUrl = new Map<string, number>(
    sources.map((source, index) => [normalizeUrl(source.homeUrl), index + 1]),
  );

  const linkedBody = rewriteMarkdownLinks(splitAtSourcesHeading(normalized), (text, url) => {
    const matchedSource = matchApprovedSource(url, sources);
    const citationIndex = matchedSource ? sourceIndexByUrl.get(normalizeUrl(matchedSource.homeUrl)) : undefined;
    return citationIndex ? `${text} [${citationIndex}]` : text;
  });
  const bodyOnly = collapseExcessiveNewlines(stripAsteriskRuns(stripMarkdownHeaders(linkedBody)));

  const firstChunk = splitParagraphs(bodyOnly)[0] ?? bodyOnly;
  const summary = firstChunk.slice(0, 420).trim() || bodyOnly.slice(0, 420).trim();
  if (!summary) {
    // Nothing readable survived stripping. Returning just "[1]" would look
    // like an answer, so fall back to the explicit source-only message.
    return sourceOnlyFallbackAnswer(query, sources);
  }

  const citedSummary = /\[\d+\]/.test(summary) ? summary : `${summary} [1]`;
  return `${citedSummary}\n\n${formatSourcesSection(sources)}`;
};
|
| 746 |
+
|
| 747 |
+
/**
 * Fixed message returned when no answer can be produced at all.
 * Both parameters are accepted for call-site symmetry and are ignored.
 */
const degradedAnswerFallback = (_query: string, _approvedSources: readonly SourceDescriptor[]): string =>
  "I could not complete live archive synthesis for this question right now.";
|
| 750 |
+
|
| 751 |
+
/**
 * Parse a URL after trimming it and removing trailing slashes.
 * Returns `undefined` instead of throwing when the text is not a valid URL.
 */
const normalizePreferenceUrl = (url: string): URL | undefined => {
  let parsed: URL | undefined;
  try {
    parsed = new URL(stripTrailingSlashes(url.trim()));
  } catch {
    parsed = undefined;
  }
  return parsed;
};
|
| 758 |
+
|
| 759 |
+
/**
 * Decide whether a user preference applies to a source.
 *
 * URL rule (when both URLs parse): hosts must be equal ignoring a leading
 * `www.`, and the preference path must be empty or the root, equal to the
 * source path, or a parent directory of it.
 * Name rule (fallback): a non-empty preference name equal to the source name,
 * compared trimmed and case-insensitively.
 */
const preferenceMatchesSource = (preference: WebResearchSourcePreference, source: SourceDescriptor): boolean => {
  const wantedUrl = normalizePreferenceUrl(preference.url);
  const actualUrl = normalizePreferenceUrl(source.homeUrl);

  if (wantedUrl && actualUrl) {
    const bareHost = (parsed: URL): string => parsed.hostname.replace(/^www\./, "").toLowerCase();
    const wantedPath = stripTrailingSlashes(wantedUrl.pathname);
    const actualPath = stripTrailingSlashes(actualUrl.pathname);

    if (bareHost(wantedUrl) === bareHost(actualUrl)) {
      const pathCovers =
        wantedPath === "" || actualPath === wantedPath || actualPath.startsWith(`${wantedPath}/`);
      if (pathCovers || wantedUrl.pathname === "/") return true;
    }
  }

  const wantedName = preference.name?.trim().toLowerCase();
  return Boolean(wantedName && wantedName === source.name.trim().toLowerCase());
};
|
| 781 |
+
|
| 782 |
+
/**
 * Apply user source preferences to the approved source list.
 *
 * With no preferences the input is returned unchanged. Otherwise each source
 * takes the first preference that matches it: sources whose matching
 * preference is not enabled are dropped, and the rest are ordered by weight
 * (highest first; unmatched or non-finite weights count as 1), keeping the
 * original order among equal weights.
 */
const applySourcePreferences = (
  approvedSources: readonly SourceDescriptor[],
  preferences?: readonly WebResearchSourcePreference[],
): readonly SourceDescriptor[] => {
  if (!preferences?.length) return approvedSources;

  const kept: { source: SourceDescriptor; position: number; weight: number }[] = [];
  approvedSources.forEach((source, position) => {
    const match = preferences.find((entry) => preferenceMatchesSource(entry, source));
    if (match && !match.enabled) return;
    const weight = match && Number.isFinite(match.weight) ? match.weight : 1;
    kept.push({ source, position, weight });
  });

  kept.sort((left, right) => right.weight - left.weight || left.position - right.position);
  return kept.map((entry) => entry.source);
};
|
| 804 |
+
|
| 805 |
+
/** Coarse intent bucket used to route a question to sources. */
type ResearchQueryIntent = "tooling" | "technical" | "lore" | "general";

// Lowercase terms; each list is matched against the lowercased query.

/** Modding tools, file formats and asset-pipeline words. */
const TOOLING_QUERY_TERMS = [
  "mdlops", "mdledit", "kotormax", "kotorblender", "pykotor", "xoreos", "reone", "tslpatcher",
  "toolchain", "modding", "tool", "script",
  "gff", "2da", "tlk", "nss", "ncs", "utc", "uti", "mdl", "mdx",
  "texture", "convert", "blender", "3ds",
];

/** Display, platform, install and stability words. */
const TECHNICAL_QUERY_TERMS = [
  "widescreen", "resolution", "hud", "screen", "crash", "compatibility",
  "steam", "windows", "linux", "mac",
  "save", "saves", "install", "launcher", "driver",
  "movies", "cutscene", "graphics", "aspect",
];

/** Character, faction and story words. */
const LORE_QUERY_TERMS = [
  "bastila", "revan", "malak", "shan", "jedi", "sith", "rakata",
  "star forge", "temple summit", "companion", "romance", "story", "lore",
];

/** Ids of the sources treated as lore references when routing. */
const LORE_SOURCE_IDS = new Set(["wikipedia-kotor", "strategywiki-kotor"]);
|
| 874 |
+
|
| 875 |
+
/**
 * Return true when the lowercased query contains at least one of `terms`.
 *
 * This is a plain substring test, so a short term also matches inside a
 * longer word (for example "uti" matches "resolution"). Terms are compared
 * as given and are not lowercased here.
 */
const queryIncludesAny = (query: string, terms: readonly string[]): boolean => {
  const haystack = query.toLowerCase();
  for (const term of terms) {
    if (haystack.indexOf(term) !== -1) return true;
  }
  return false;
};
|
| 879 |
+
|
| 880 |
+
/**
 * Classify a question by the first term list it matches, checked in priority
 * order: tooling, then technical, then lore. Falls back to "general".
 */
const classifyQueryIntent = (query: string): ResearchQueryIntent => {
  const lowered = query.toLowerCase();
  const byPriority: readonly (readonly [ResearchQueryIntent, readonly string[]])[] = [
    ["tooling", TOOLING_QUERY_TERMS],
    ["technical", TECHNICAL_QUERY_TERMS],
    ["lore", LORE_QUERY_TERMS],
  ];

  for (const [intent, terms] of byPriority) {
    if (queryIncludesAny(lowered, terms)) return intent;
  }
  return "general";
};
|
| 887 |
+
|
| 888 |
+
/**
 * Reorder or filter approved sources according to the question's intent.
 *
 * - tooling / technical: drop lore sources, unless that would leave nothing.
 * - lore: lore sources first, then the rest, each group in original order.
 * - general: unchanged.
 */
const routeSourcesForQuery = (
  query: string,
  approvedSources: readonly SourceDescriptor[],
): readonly SourceDescriptor[] => {
  const isLoreSource = (source: SourceDescriptor): boolean => LORE_SOURCE_IDS.has(source.id);

  switch (classifyQueryIntent(query)) {
    case "tooling":
    case "technical": {
      const withoutLore = approvedSources.filter((source) => !isLoreSource(source));
      return withoutLore.length > 0 ? withoutLore : approvedSources;
    }
    case "lore":
      return [
        ...approvedSources.filter((source) => isLoreSource(source)),
        ...approvedSources.filter((source) => !isLoreSource(source)),
      ];
    default:
      return approvedSources;
  }
};
|
| 905 |
+
|
| 906 |
+
/**
 * Concatenate source groups, keeping only the first source seen for each
 * normalized home URL. Output order follows first appearance.
 */
const mergeSourcesPreserveOrder = (...groups: readonly (readonly SourceDescriptor[])[]): SourceDescriptor[] => {
  // A Map iterates in insertion order, so it deduplicates and orders at once.
  const firstByUrl = new Map<string, SourceDescriptor>();
  for (const group of groups) {
    for (const source of group) {
      const key = normalizeUrl(source.homeUrl);
      if (!firstByUrl.has(key)) firstByUrl.set(key, source);
    }
  }
  return [...firstByUrl.values()];
};
|
| 919 |
+
|
| 920 |
+
/**
 * Lowercase a token and keep at most its first six characters, so that
 * matching is effectively by six-character prefix.
 */
const normalizeMatchToken = (token: string): string => token.toLowerCase().slice(0, 6);
|
| 925 |
+
|
| 926 |
+
/**
 * Split a query into distinct match tokens.
 *
 * The query is lowercased; every character that is not a letter, digit,
 * whitespace or hyphen becomes a space; words shorter than four characters
 * are dropped; the rest are passed through `normalizeMatchToken` and
 * deduplicated in first-seen order.
 */
const tokenizeQuery = (query: string): string[] => {
  const words = query
    .toLowerCase()
    .replace(/[^\p{L}\p{N}\s-]/gu, " ")
    .split(/\s+/);

  const distinct = new Set<string>();
  for (const word of words) {
    if (word.length >= 4) distinct.add(normalizeMatchToken(word));
  }
  return Array.from(distinct);
};
|
| 935 |
+
|
| 936 |
+
/**
 * Return true for URLs usable as public web citations: the text must parse
 * as a URL with an `http:` or `https:` protocol, and must not be a `local://`
 * URL, a `discord://` URL, or anything `isDiscordCitationUrl` recognizes.
 */
const isPublicWebCitationUrl = (url: string): boolean => {
  const hasInternalScheme = url.startsWith("local://") || url.startsWith("discord://");
  if (hasInternalScheme || isDiscordCitationUrl(url)) return false;

  let protocol: string;
  try {
    protocol = new URL(url).protocol;
  } catch {
    // Not parseable as a URL at all.
    return false;
  }
  return protocol === "https:" || protocol === "http:";
};
|
| 946 |
+
|
| 947 |
+
/** Keep only the sources whose home URL passes `isPublicWebCitationUrl`. */
const filterPublicWebCitationSources = (sources: readonly SourceDescriptor[]): SourceDescriptor[] => {
  const publicSources: SourceDescriptor[] = [];
  for (const source of sources) {
    if (isPublicWebCitationUrl(source.homeUrl)) publicSources.push(source);
  }
  return publicSources;
};

/** Holocron e2e and product policy: answers must ground on multiple approved web sources. */
export const MIN_HOLOCRON_WEB_CITATIONS = 2;
|
| 952 |
+
|
| 953 |
+
/**
 * Gather the web sources that back a report: retrieved sources, cited
 * sources, and sources materialized from the payload's visited URLs are
 * merged (first occurrence wins), restricted to public web URLs, and then
 * re-ranked against the query.
 */
const collectWebEvidenceSources = (
  query: string,
  report: string,
  approvedSources: readonly SourceDescriptor[],
  payload: WebResearchResponsePayload,
): readonly SourceDescriptor[] => {
  const retrieved = collectRetrievedSources(report, approvedSources, payload);
  const cited = collectCitedSources(report, approvedSources, payload);
  const visited = materializeSourcesFromUrls(
    collectVisitedUrlsFromPayload(payload, approvedSources),
    approvedSources,
  );

  const publicPool = filterPublicWebCitationSources(mergeSourcesPreserveOrder(retrieved, cited, visited));
  return rerankEvidenceSources(query, publicPool);
};
|
| 966 |
+
|
| 967 |
+
/**
 * Choose the citation list for an answer, trying to reach
 * `MIN_HOLOCRON_WEB_CITATIONS` public web sources (at most 8 are returned).
 *
 * 1. Merge cited, evidence and payload-backed sources, re-rank them, and keep
 *    the public web ones. If that meets the minimum, return them.
 * 2. Otherwise re-rank a padded pool built from those results, the public web
 *    evidence and the public web payload-backed sources. Return it if it
 *    meets the minimum.
 * 3. Otherwise return the public web payload-backed sources as they are.
 *
 * Every source returned is a public web source. Payload-backed sources are
 * filtered before padding, so a non-web URL reported by the payload can no
 * longer count towards, or appear in, the web citation list.
 *
 * @param payload optional research payload whose URL lists are materialized
 *   against `approvedSources`; when omitted no payload-backed sources exist.
 */
const ensureMinimumWebCitations = (
  query: string,
  cited: readonly SourceDescriptor[],
  evidence: readonly SourceDescriptor[],
  payload?: WebResearchResponsePayload,
  approvedSources: readonly SourceDescriptor[] = [],
): readonly SourceDescriptor[] => {
  const info = payload?.research_information;
  const payloadBacked = payload
    ? materializeSourcesFromUrls(
        uniqueUrlsPreserveOrder([
          ...payloadUrls(info?.cited_urls),
          ...payloadUrls(info?.retrieved_urls),
          ...payloadUrls(info?.visited_urls),
          ...payloadUrls(info?.source_urls),
        ]),
        approvedSources,
      )
    : [];
  const payloadBackedWeb = filterPublicWebCitationSources(payloadBacked);

  const merged = rerankEvidenceSources(
    query,
    mergeSourcesPreserveOrder(cited, evidence, payloadBacked),
  );
  const webOnly = filterPublicWebCitationSources(merged);
  if (webOnly.length >= MIN_HOLOCRON_WEB_CITATIONS) {
    return webOnly.slice(0, 8);
  }

  // Pad with web-only candidates. The first ranking pass may have spent its
  // limited slots on non-web entries that were then filtered out.
  const padded = rerankEvidenceSources(
    query,
    mergeSourcesPreserveOrder(webOnly, filterPublicWebCitationSources(evidence), payloadBackedWeb),
  );
  return padded.length >= MIN_HOLOCRON_WEB_CITATIONS
    ? padded.slice(0, 8)
    : payloadBackedWeb.slice(0, 8);
};
|
| 1003 |
+
|
| 1004 |
+
/**
 * Build a source-only answer from up to five public web sources. When none
 * of the given sources is a public web source, the full original list is
 * passed to the source-only fallback instead.
 */
const composeAnswerFromWebSources = (query: string, sources: readonly SourceDescriptor[]): string => {
  const topWebSources = filterPublicWebCitationSources(sources).slice(0, 5);
  const listed = topWebSources.length === 0 ? sources : topWebSources;
  return sourceOnlyFallbackAnswer(query, listed);
};
|
| 1011 |
+
|
| 1012 |
+
/**
 * Return true when enough query tokens appear in the source's name,
 * description or home URL (case-insensitive substring match). "Enough" is
 * two tokens, or all of them when the query yields fewer than two. A query
 * with no tokens never matches.
 */
const sourceMatchesQuery = (source: SourceDescriptor, query: string): boolean => {
  const tokens = tokenizeQuery(query);
  if (tokens.length === 0) return false;

  const haystack = `${source.name} ${source.description ?? ""} ${source.homeUrl}`.toLowerCase();
  const matched = tokens.filter((token) => haystack.includes(token)).length;
  return matched >= Math.min(2, tokens.length);
};
|
| 1022 |
+
|
| 1023 |
+
/**
 * Score a source against a query.
 *
 * Each query token found in the source's name, description, home URL or tags
 * scores 2. A flat bonus of 2 is added if any token appears in the name, and
 * a bonus of 1 if any token appears in the home URL. A query with no tokens
 * scores 1 for every source.
 */
const sourceRelevanceScore = (source: SourceDescriptor, query: string): number => {
  const tokens = tokenizeQuery(query);
  if (tokens.length === 0) return 1;

  const haystack = [
    source.name,
    source.description,
    source.homeUrl,
    ...(source.tags ?? []),
  ].join(" ").toLowerCase();
  const loweredName = source.name.toLowerCase();
  const loweredUrl = source.homeUrl.toLowerCase();

  let hits = 0;
  let nameHit = false;
  let urlHit = false;
  for (const token of tokens) {
    if (haystack.includes(token)) hits += 1;
    if (!nameHit && loweredName.includes(token)) nameHit = true;
    if (!urlHit && loweredUrl.includes(token)) urlHit = true;
  }
  return hits * 2 + (nameHit ? 2 : 0) + (urlHit ? 1 : 0);
};
|
| 1040 |
+
|
| 1041 |
+
/**
 * Order sources by relevance to the query (highest score first, ties keep
 * input order).
 *
 * - Query without tokens: return the first four after sorting.
 * - Otherwise: keep only sources scoring at least 2 and return up to eight.
 */
const rerankEvidenceSources = (query: string, sources: readonly SourceDescriptor[]): readonly SourceDescriptor[] => {
  const scored = sources.map((source, index) => ({
    source,
    index,
    score: sourceRelevanceScore(source, query),
  }));
  scored.sort((left, right) => right.score - left.score || left.index - right.index);

  if (tokenizeQuery(query).length === 0) {
    return scored.slice(0, 4).map((entry) => entry.source);
  }
  return scored
    .filter((entry) => entry.score >= 2)
    .slice(0, 8)
    .map((entry) => entry.source);
};
|
| 1056 |
+
|
| 1057 |
+
/**
 * Pick up to five public web sources to show after synthesis has failed.
 * Sources that match the query are preferred; if none match, all public web
 * candidates are used.
 */
const resolveWebSourcesForFailedSynthesis = (
  query: string,
  retrievedSources: readonly SourceDescriptor[],
): readonly SourceDescriptor[] => {
  const webCandidates = filterPublicWebCitationSources(retrievedSources);
  const onTopic = webCandidates.filter((source) => sourceMatchesQuery(source, query));
  const pool = onTopic.length > 0 ? onTopic : webCandidates;
  return pool.slice(0, 5);
};
|
| 1065 |
+
|
| 1066 |
+
/**
 * Map sources to the distinct approved base hosts they belong to.
 *
 * A source belongs to a base host when its hostname (without a leading
 * `www.`) equals that host or is a subdomain of it. Sources with an
 * unparseable home URL, or with no matching base host, contribute nothing.
 * Hosts are returned in first-seen order.
 */
const researchDomainsForSources = (sources: readonly SourceDescriptor[]): string[] => {
  const baseHosts = new Set<string>();
  for (const { homeUrl } of sources) {
    let host: string;
    try {
      host = new URL(homeUrl).hostname.replace(/^www\./, "").toLowerCase();
    } catch {
      continue;
    }
    const owner = traskApprovedResearchBaseHosts.find(
      (base) => host === base || host.endsWith(`.${base}`),
    );
    if (owner) baseHosts.add(owner);
  }
  return Array.from(baseHosts);
};
|
| 1079 |
+
|
| 1080 |
+
/** Interval, in milliseconds, between progress heartbeats. */
const HEARTBEAT_MS = 8000;

/**
 * Run `work` while reporting progress for `phase`.
 *
 * One event is emitted immediately and then one per heartbeat interval until
 * `work` settles. Events are keyed by elapsed-time bucket, so two ticks that
 * fall in the same bucket emit only once. Without an `onProgress` callback
 * the work simply runs.
 */
const withProgressHeartbeat = async <T>(
  phase: WebResearchProgressEvent["phase"],
  makeDetail: (elapsedMs: number) => string,
  onProgress: ((event: WebResearchProgressEvent) => void) | undefined,
  work: () => Promise<T>,
): Promise<T> => {
  if (!onProgress) {
    return await work();
  }

  const startedAt = Date.now();
  let previousBucket = -1;
  const tick = (): void => {
    const elapsedMs = Date.now() - startedAt;
    const currentBucket = Math.floor(elapsedMs / HEARTBEAT_MS);
    if (currentBucket !== previousBucket) {
      previousBucket = currentBucket;
      onProgress({ phase, detail: makeDetail(elapsedMs) });
    }
  };

  tick();
  const interval = setInterval(tick, HEARTBEAT_MS);
  try {
    return await work();
  } finally {
    // Stop the heartbeat whether the work resolved or threw.
    clearInterval(interval);
  }
};
|
| 1110 |
+
|
| 1111 |
+
export class WebResearchClient implements WebResearchQueryHandler {
|
| 1112 |
+
private readonly openAiClient: OpenAI | null;
|
| 1113 |
+
|
| 1114 |
+
/**
 * Store the runtime configuration and create the OpenAI client used for
 * rewrites. The client is `null` when no API key is configured; base URL and
 * default headers are passed only when they are set.
 */
public constructor(
  private readonly config: WebResearchRuntimeConfig,
  private readonly aiConfig: SharedAiConfig,
  private readonly approvedSources: readonly SourceDescriptor[] = traskApprovedResearchSources,
  private readonly factoryOptions: WebResearchClientFactoryOptions = {},
) {
  const { openAiApiKey, openAiBaseUrl, openAiDefaultHeaders } = aiConfig;
  if (!openAiApiKey) {
    this.openAiClient = null;
    return;
  }

  const baseUrlOption = openAiBaseUrl ? { baseURL: openAiBaseUrl } : {};
  const headersOption = openAiDefaultHeaders ? { defaultHeaders: openAiDefaultHeaders } : {};
  this.openAiClient = new OpenAI({
    apiKey: openAiApiKey,
    ...baseUrlOption,
    ...headersOption,
  });
}
|
| 1128 |
+
|
| 1129 |
+
/**
 * List the selectable research models: the built-in defaults first, then any
 * dynamically discovered models whose id is not already present. If
 * discovery fails, only the defaults are returned.
 */
public async listModels(): Promise<readonly WebResearchModelOption[]> {
  try {
    const discovered = await listHeadlessWebResearchModels(this.config);
    const merged: WebResearchModelOption[] = [...DEFAULT_WEB_RESEARCH_MODELS];
    const knownIds = new Set(merged.map((model) => model.id));

    for (const model of discovered) {
      if (knownIds.has(model.id)) continue;
      knownIds.add(model.id);
      merged.push(model);
    }
    return merged;
  } catch {
    return DEFAULT_WEB_RESEARCH_MODELS;
  }
}
|
| 1145 |
+
|
| 1146 |
+
/**
 * Rewrite a research report into a concise Discord answer with an LLM.
 *
 * Models are tried in order (caller preference, configured chat model, then
 * configured fallbacks; duplicates removed; capped at MAX_REWRITE_ATTEMPTS),
 * each with a DEFAULT_REWRITE_TIMEOUT_MS limit. The first response containing
 * a "Sources" heading line is returned. If there is no OpenAI client, or no
 * attempt produces such a response, the deterministic fallback rewrite is
 * used.
 */
private async rewriteForDiscord(
  query: string,
  report: string,
  approvedSources: readonly SourceDescriptor[],
  preferredModel?: string,
  communityDigest = "",
): Promise<string> {
  const client = this.openAiClient;
  if (!client) {
    return fallbackDiscordRewrite(query, report, approvedSources);
  }

  const allowedSources = approvedSources
    .map((source, index) => `${index + 1}. ${source.name} - ${source.homeUrl}`)
    .join("\n");

  const systemPrompt = [
    "Rewrite research reports into concise Discord answers.",
    "Do not mention research steps, indexing, tooling, or backend behavior.",
    "Use only the numbered sources provided by the user.",
    "Return plain Markdown with no headings except the final Sources heading.",
  ].join(" ");

  const communitySection = communityDigest
    ? ["Community context (lower authority than web archives):", communityDigest]
    : [];
  const userPrompt = [
    `Question: ${query}`,
    "Write a concise answer for Discord.",
    "Requirements:",
    "- Lead with the answer.",
    "- Use at most 3 short paragraphs or 5 compact bullets before sources.",
    "- Use inline numeric citations like [1], [2].",
    ' - End with the exact heading "Sources" on its own line.',
    "- Under Sources, include only the cited sources using the exact numbered lines provided below.",
    "Allowed Sources:",
    allowedSources,
    ...communitySection,
    "Research Report:",
    report,
  ].join("\n\n");

  // The Set removes duplicates while keeping priority order.
  const preferredRewriteModel = normalizePreferredRewriteModel(preferredModel);
  const modelsToTry = [
    ...new Set([
      ...(preferredRewriteModel ? [preferredRewriteModel] : []),
      this.aiConfig.chatModel,
      ...this.aiConfig.chatModelFallbacks,
    ]),
  ].slice(0, MAX_REWRITE_ATTEMPTS);

  for (const model of modelsToTry) {
    let rewritten: string | undefined;
    try {
      const completion = await withTimeout(
        client.chat.completions.create({
          model,
          temperature: 0.2,
          messages: [
            { role: "system", content: systemPrompt },
            { role: "user", content: userPrompt },
          ],
        }),
        DEFAULT_REWRITE_TIMEOUT_MS,
      );
      rewritten = completion.choices[0]?.message?.content?.trim();
    } catch {
      // A failed or timed-out attempt moves on to the next model.
      continue;
    }

    if (rewritten && /\nSources\s*\n/i.test(rewritten)) {
      return rewritten;
    }
  }

  return fallbackDiscordRewrite(query, report, approvedSources);
}
|
| 1217 |
+
|
| 1218 |
+
/**
 * Resolve local (server-history) search hits for a query.
 *
 * Prefetched hits passed in `options.localHits` win, after dropping hits
 * whose URL is blank. Otherwise the configured local search provider is
 * asked for up to six hits, after emitting a "gather" progress event.
 * Returns an empty list when there is no provider or the search throws.
 */
private async resolveLocalHits(
  query: string,
  options: WebResearchQueryOptions | undefined,
  onProgress?: (event: WebResearchProgressEvent) => void,
): Promise<readonly SearchHit[]> {
  const prefetched = (options?.localHits ?? []).filter((hit) => hit.url.trim());
  if (prefetched.length > 0) return prefetched;

  const provider = this.factoryOptions.localSearchProvider;
  if (!provider) return [];

  onProgress?.({
    phase: "gather",
    detail: "Searching imported server history…",
  });

  try {
    return await provider.search(query, 6);
  } catch {
    // Local history is best-effort; a failed search yields no hits.
    return [];
  }
}
|
| 1244 |
+
|
| 1245 |
+
/**
 * Rewrite a research report into a very short Discord reply with an LLM.
 *
 * The configured chat model and its fallbacks are tried in order (duplicates
 * removed, capped at MAX_REWRITE_ATTEMPTS), each with a
 * DEFAULT_REWRITE_TIMEOUT_MS limit and a 380-token cap. The first response
 * containing a "Sources" heading line is returned. If there is no OpenAI
 * client, or no attempt produces such a response, the deterministic brief
 * fallback is used.
 */
private async rewriteForDiscordBrief(
  query: string,
  report: string,
  approvedSources: readonly SourceDescriptor[],
): Promise<string> {
  const client = this.openAiClient;
  if (!client) {
    return fallbackDiscordBrief(query, report, approvedSources);
  }

  const allowedSources = approvedSources
    .map((source, index) => `${index + 1}. ${source.name} - ${source.homeUrl}`)
    .join("\n");

  const systemPrompt = [
    "Rewrite research into a very short Discord chat reply (like a quick DM).",
    "No preamble, no essay tone, no meta commentary about research.",
    "Use only the numbered sources provided.",
    "Plain sentences; at most 2 short sentences OR up to 3 compact bullets before Sources.",
    'End with the exact heading "Sources" on its own line, then cited sources only.',
  ].join(" ");

  const userPrompt = [
    `Question: ${query}`,
    "Write the shortest helpful answer.",
    "Allowed Sources:",
    allowedSources,
    "Research Report:",
    report,
  ].join("\n\n");

  // The Set removes duplicates while keeping priority order.
  const modelsToTry = [
    ...new Set([this.aiConfig.chatModel, ...this.aiConfig.chatModelFallbacks]),
  ].slice(0, MAX_REWRITE_ATTEMPTS);

  for (const model of modelsToTry) {
    let rewritten: string | undefined;
    try {
      const completion = await withTimeout(
        client.chat.completions.create({
          model,
          temperature: 0.15,
          max_tokens: 380,
          messages: [
            { role: "system", content: systemPrompt },
            { role: "user", content: userPrompt },
          ],
        }),
        DEFAULT_REWRITE_TIMEOUT_MS,
      );
      rewritten = completion.choices[0]?.message?.content?.trim();
    } catch {
      // A failed or timed-out attempt moves on to the next model.
      continue;
    }

    if (rewritten && /\nSources\s*\n/i.test(rewritten)) {
      return rewritten;
    }
  }

  return fallbackDiscordBrief(query, report, approvedSources);
}
|
| 1306 |
+
|
| 1307 |
+
/**
 * Run headless web research restricted to the approved sources and return
 * the normalized report together with the response payload.
 *
 * The approved home URLs are sent both as source URLs and as allowed URL
 * prefixes, and the search domains are derived from them. A non-blank
 * `options.model` is forwarded, trimmed.
 *
 * @throws Error when no approved sources are enabled, or when the returned
 *   report is missing or empty after normalization.
 */
private async fetchResearchReport(
  query: string,
  customPrompt: string,
  approvedSources: readonly SourceDescriptor[],
  options?: WebResearchQueryOptions,
): Promise<{ report: string; payload: WebResearchResponsePayload }> {
  if (approvedSources.length === 0) {
    throw new Error("No approved research sources are enabled.");
  }

  const allowedDomains = researchDomainsForSources(approvedSources);
  const requestedModel = options?.model?.trim();
  const modelOption = requestedModel ? { model: requestedModel } : {};

  const raw = await runHeadlessWebResearch(this.config, {
    query: buildResearchTask(query),
    custom_prompt: customPrompt,
    source_urls: approvedSources.map((source) => source.homeUrl),
    query_domains: allowedDomains,
    allowed_url_prefixes: approvedSources.map((source) => source.homeUrl),
    ...modelOption,
    report_type: "research_report",
    report_source: "web",
  });

  // Shallow-copy research_information so the payload does not share the
  // object with the raw response.
  const payload: WebResearchResponsePayload =
    raw.research_information !== undefined
      ? { report: raw.report, research_information: { ...raw.research_information } }
      : { report: raw.report };

  const report = typeof raw.report === "string" ? normalizeReport(raw.report) : "";
  if (!report) {
    throw new Error("Trask web research returned an empty report.");
  }

  return { report, payload };
}
|
| 1344 |
+
|
| 1345 |
+
/**
 * Answer a question from approved web sources plus local server-history hits.
 *
 * Flow: route/filter the approved sources for this query, resolve local hits,
 * fetch the research report (with heartbeat progress events), collect and rank
 * evidence, then produce the answer through one of the rewrite/fallback paths.
 *
 * @param query      The user question.
 * @param onProgress Optional callback receiving phase/detail progress events.
 * @param options    Optional per-query settings (source preferences, model, …).
 * @returns The answer text with cited sources, retrieved sources and visited URLs.
 *          Failures inside the try block are not re-thrown: they are reported
 *          through `onProgress` and returned as an explanatory answer with
 *          empty source lists.
 */
public async answerQuestion(
  query: string,
  onProgress?: (event: WebResearchProgressEvent) => void,
  options?: WebResearchQueryOptions,
): Promise<WebResearchAnswer> {
  const approvedSources = routeSourcesForQuery(
    query,
    applySourcePreferences(this.approvedSources, options?.sourcePreferences),
  );
  try {
    const localHits = await this.resolveLocalHits(query, options, onProgress);
    const communitySources = searchHitsToCommunitySources(localHits);
    const communityDigest = buildCommunityKnowledgeDigest(localHits);
    if (localHits.length > 0) {
      onProgress?.({
        phase: "gather",
        detail: `Found ${localHits.length} relevant message${localHits.length === 1 ? "" : "s"} in server history…`,
      });
    }

    const allowedDomains = researchDomainsForSources(approvedSources);
    onProgress?.({
      phase: "gather",
      detail: `Scanning ${approvedSources.length} approved source root${approvedSources.length === 1 ? "" : "s"} across ${allowedDomains.length} host${allowedDomains.length === 1 ? "" : "s"}…`,
    });
    const { report, payload } = await withProgressHeartbeat(
      "gather",
      (elapsedMs) => {
        const seconds = Math.max(1, Math.floor(elapsedMs / 1000));
        return `Researching approved archive sources… (${seconds}s)`;
      },
      onProgress,
      async () => await this.fetchResearchReport(query, buildCustomPrompt(), approvedSources, options),
    );
    const rejectedUrls = collectRejectedUrlsFromPayload(payload);
    if (rejectedUrls.length > 0) {
      onProgress?.({
        phase: "gather",
        detail: `Rejected ${rejectedUrls.length} URL${rejectedUrls.length === 1 ? "" : "s"} outside approved source roots.`,
      });
    }
    emitArchiveProbeEvents(payload, approvedSources, onProgress);
    onProgress?.({
      phase: "report",
      detail: "Ranking passages and citations…",
    });
    const webEvidenceSources = collectWebEvidenceSources(query, report, approvedSources, payload);
    const retrievedSources = mergeCommunityAndWebSources(webEvidenceSources, communitySources);
    const citedSourcesFromReport = rerankEvidenceSources(
      query,
      mergeSourcesPreserveOrder(
        collectCitedSources(report, approvedSources, payload),
        collectCitedSourcesFromText(report, approvedSources),
      ),
    );
    onProgress?.({
      phase: "sources",
      detail: retrievedSources.length ? `${retrievedSources.length} sources retrieved` : "Mapping hosts to archive catalog…",
      sources: retrievedSources,
    });
    onProgress?.({
      phase: "compose",
      detail: "Rendering Holocron answer…",
    });
    const sourcesForRewrite = mergeCommunityAndWebSources(
      filterWebArchiveCitationSources(retrievedSources),
      communitySources,
    );
    const webSourcesForRewrite = filterPublicWebCitationSources(sourcesForRewrite);

    let answer: string;
    if (webSourcesForRewrite.length === 0 && communitySources.length === 0) {
      // Nothing citable at all: skip the rewrite entirely.
      answer = degradedAnswerFallback(query, approvedSources);
    } else if (isSynthesisFailureReport(report, payload)) {
      // The report itself is unusable; decide based on how much web evidence survived.
      const webSources = resolveWebSourcesForFailedSynthesis(query, webEvidenceSources);
      if (webSources.length >= MIN_HOLOCRON_WEB_CITATIONS) {
        const rewritePool = mergeCommunityAndWebSources(
          filterPublicWebCitationSources(webSources),
          communitySources,
        );
        answer = this.openAiClient
          ? await this.rewriteForDiscord(query, report, rewritePool, options?.model, communityDigest)
          : fallbackDiscordRewrite(query, report, rewritePool);
      } else if (webSources.length > 0 || communitySources.length > 0) {
        answer = sourceOnlyFallbackAnswer(query, sourcesForRewrite);
      } else {
        answer = degradedAnswerFallback(query, approvedSources);
      }
    } else if (this.openAiClient) {
      answer = await this.rewriteForDiscord(
        query,
        report,
        sourcesForRewrite,
        options?.model,
        communityDigest,
      );
    } else {
      answer = fallbackDiscordRewrite(
        query,
        report,
        sourcesForRewrite,
      );
    }

    const webCitedSources = ensureMinimumWebCitations(
      query,
      filterPublicWebCitationSources(
        mergeSourcesPreserveOrder(
          collectCitedSourcesFromText(answer, retrievedSources),
          citedSourcesFromReport,
        ),
      ),
      webEvidenceSources,
      payload,
      approvedSources,
    );
    // Community citations only count when the answer text actually links a Discord URL.
    const communityCited = collectCitedSourcesFromText(answer, communitySources).filter(
      (source) => isDiscordCitationUrl(source.homeUrl),
    );
    const citedSources = mergeCommunityAndWebSources(webCitedSources, communityCited);

    return {
      answer,
      approvedSources: citedSources,
      retrievedSources,
      visitedUrls: collectVisitedUrlsFromPayload(payload, approvedSources),
    };
  } catch (error: unknown) {
    const detail = error instanceof Error ? error.message : String(error);
    // Cap the error text once so the progress event and the returned answer
    // are bounded the same way (a raw subprocess error can be arbitrarily long).
    const shortDetail = detail.slice(0, 240);
    onProgress?.({
      phase: "compose",
      detail: `Live web research failed: ${shortDetail}`,
    });
    const topic = stripTrailingQuestionMarks(query) || "this question";
    return {
      answer: `I could not complete live web research for "${topic}" right now (${shortDetail}). Run scripts/bootstrap_trask_research.sh, set TRASK_WEB_RESEARCH_PYTHON, OPENAI_API_KEY or OPENROUTER_API_KEY, and TRASK_WEB_RESEARCH_TIMEOUT_MS, then retry.`,
      approvedSources: [],
      retrievedSources: [],
      visitedUrls: [],
    };
  }
}
|
| 1487 |
+
|
| 1488 |
+
/**
 * Shorter rewrite for proactive/channel replies (still source-backed).
 *
 * Uses the brief custom prompt and the brief rewrite; when no web evidence is
 * collected the degraded fallback answer is returned instead. Any failure is
 * swallowed and reported as a generic "could not complete" answer, which is
 * also echoed as `researchReport`.
 */
public async answerQuestionBrief(query: string): Promise<WebResearchBriefAnswer> {
  try {
    const routedSources = routeSourcesForQuery(query, this.approvedSources);
    const { report, payload } = await this.fetchResearchReport(query, buildCustomPromptBrief(), routedSources);
    const evidence = collectWebEvidenceSources(query, report, routedSources, payload);

    let answer: string;
    if (evidence.length > 0) {
      answer = await this.rewriteForDiscordBrief(query, report, evidence);
    } else {
      answer = degradedAnswerFallback(query, routedSources);
    }

    // Citations found in the answer come first, then those found in the report.
    const citationCandidates = filterPublicWebCitationSources(
      mergeSourcesPreserveOrder(
        collectCitedSourcesFromText(answer, evidence),
        collectCitedSources(report, routedSources, payload),
      ),
    );
    const citedSources = ensureMinimumWebCitations(
      query,
      citationCandidates,
      evidence,
      payload,
      routedSources,
    );
    const visitedUrls = collectVisitedUrlsFromPayload(payload, routedSources);

    return {
      answer,
      approvedSources: citedSources,
      retrievedSources: evidence,
      visitedUrls,
      researchReport: report,
    };
  } catch {
    const topic = stripTrailingQuestionMarks(query) || "this question";
    const failureText = `I could not complete live web research for "${topic}" right now.`;
    return {
      answer: failureText,
      approvedSources: [],
      retrievedSources: [],
      visitedUrls: [],
      researchReport: failureText,
    };
  }
}
|
| 1529 |
+
}
|
| 1530 |
+
|
| 1531 |
+
/**
 * Build a WebResearchClient bound to the shared approved-source catalog.
 *
 * @param config         Runtime configuration for web research.
 * @param aiConfig       Shared AI configuration; loaded via loadSharedAiConfig() when omitted.
 * @param factoryOptions Optional factory overrides; defaults to an empty object.
 */
export const createWebResearchClient = (
  config: WebResearchRuntimeConfig,
  aiConfig: SharedAiConfig = loadSharedAiConfig(),
  factoryOptions: WebResearchClientFactoryOptions = {},
): WebResearchClient =>
  new WebResearchClient(config, aiConfig, traskApprovedResearchSources, factoryOptions);
|
| 1538 |
+
|
| 1539 |
+
// ---------------------------------------------------------------------------
|
| 1540 |
+
// Pure helpers exported for unit testing — not part of the public API surface.
|
| 1541 |
+
// ---------------------------------------------------------------------------
|
| 1542 |
+
export {
|
| 1543 |
+
normalizeUrl as _normalizeUrl,
|
| 1544 |
+
extractUrls as _extractUrls,
|
| 1545 |
+
hostnameHint as _hostnameHint,
|
| 1546 |
+
uniqueUrlsPreserveOrder as _uniqueUrlsPreserveOrder,
|
| 1547 |
+
collectCitedSources as _collectCitedSources,
|
| 1548 |
+
collectRetrievedSources as _collectRetrievedSources,
|
| 1549 |
+
collectVisitedUrlsFromPayload as _collectVisitedUrlsFromPayload,
|
| 1550 |
+
collectCitedSourcesFromText as _collectCitedSourcesFromText,
|
| 1551 |
+
isSynthesisFailureReport as _isSynthesisFailureReport,
|
| 1552 |
+
countPayloadWebUrls as _countPayloadWebUrls,
|
| 1553 |
+
normalizeReport as _normalizeReport,
|
| 1554 |
+
formatSourcesSection as _formatSourcesSection,
|
| 1555 |
+
normalizePreferredRewriteModel as _normalizePreferredRewriteModel,
|
| 1556 |
+
matchApprovedSource as _matchApprovedSource,
|
| 1557 |
+
classifyQueryIntent as _classifyQueryIntent,
|
| 1558 |
+
routeSourcesForQuery as _routeSourcesForQuery,
|
| 1559 |
+
};
|
pnpm-lock.yaml
CHANGED
|
@@ -637,6 +637,8 @@ importers:
|
|
| 637 |
specifier: ^4.21.0
|
| 638 |
version: 4.21.0
|
| 639 |
|
|
|
|
|
|
|
| 640 |
infra/matchmaking-inducer:
|
| 641 |
dependencies:
|
| 642 |
http-proxy:
|
|
|
|
| 637 |
specifier: ^4.21.0
|
| 638 |
version: 4.21.0
|
| 639 |
|
| 640 |
+
infra/holocron-trask-api: {}
|
| 641 |
+
|
| 642 |
infra/matchmaking-inducer:
|
| 643 |
dependencies:
|
| 644 |
http-proxy:
|
requirements-trask-research.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Holocron / Trask web research runner (scripts/trask_web_research.py)
|
| 2 |
+
crawl4ai>=0.8.6,<0.9
|
| 3 |
+
duckduckgo-search>=7.0.0,<8
|
| 4 |
+
trafilatura>=2.0.0,<3
|
| 5 |
+
lxml_html_clean>=0.4.0
|
| 6 |
+
redis>=5.0.0,<6
|
scripts/trask_cache.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Redis cache for Trask web research (DuckDuckGo discovery + page scrape).
|
| 3 |
+
|
| 4 |
+
Optional: set REDIS_URL or TRASK_REDIS_URL. Disable with TRASK_CACHE_DISABLED=1.
|
| 5 |
+
|
| 6 |
+
Key layout (redis-development plugin conventions):
|
| 7 |
+
trask:search:{hash} — discovered URL list (JSON)
|
| 8 |
+
trask:page:{hash} — scraped markdown per normalized URL
|
| 9 |
+
trask:research:{hash} — full run_payload JSON result
trask:kb:{hash} — content hash per knowledge-base source id (ingest dedup)
|
| 10 |
+
|
| 11 |
+
All keys use SETEX with configurable TTLs.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import hashlib
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
from typing import Any, TYPE_CHECKING
|
| 20 |
+
|
| 21 |
+
if TYPE_CHECKING:
|
| 22 |
+
from redis import Redis
|
| 23 |
+
|
| 24 |
+
KEY_PREFIX = "trask"
|
| 25 |
+
|
| 26 |
+
DEFAULT_SEARCH_TTL = 6 * 60 * 60 # 6h — DDG results drift slowly
|
| 27 |
+
DEFAULT_PAGE_TTL = 7 * 24 * 60 * 60 # 7d — archive pages are fairly stable
|
| 28 |
+
DEFAULT_RESEARCH_TTL = 60 * 60 # 1h — full answer bundle; shorter for freshness
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def cache_enabled() -> bool:
    """Return True when the Redis cache should be used.

    The cache is off when ``TRASK_CACHE_DISABLED`` is ``1``, ``true`` or ``yes``
    (case-insensitive, surrounding whitespace ignored). Otherwise it is on
    exactly when ``_redis_url()`` yields a truthy value.
    """
    kill_switch = os.environ.get("TRASK_CACHE_DISABLED", "").strip().lower()
    return kill_switch not in ("1", "true", "yes") and bool(_redis_url())
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _redis_url() -> str | None:
|
| 38 |
+
return os.environ.get("TRASK_REDIS_URL") or os.environ.get("REDIS_URL")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _ttl(env_name: str, default: int) -> int:
|
| 42 |
+
raw = os.environ.get(env_name, "").strip()
|
| 43 |
+
if not raw:
|
| 44 |
+
return default
|
| 45 |
+
try:
|
| 46 |
+
return max(60, int(raw))
|
| 47 |
+
except ValueError:
|
| 48 |
+
return default
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def search_ttl() -> int:
    """TTL in seconds for cached search results (``TRASK_CACHE_SEARCH_TTL_SECONDS``)."""
    return _ttl("TRASK_CACHE_SEARCH_TTL_SECONDS", default=DEFAULT_SEARCH_TTL)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def page_ttl() -> int:
    """TTL in seconds for cached page bodies (``TRASK_CACHE_PAGE_TTL_SECONDS``)."""
    return _ttl("TRASK_CACHE_PAGE_TTL_SECONDS", default=DEFAULT_PAGE_TTL)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def research_ttl() -> int:
    """TTL in seconds for cached research results (``TRASK_CACHE_RESEARCH_TTL_SECONDS``)."""
    return _ttl("TRASK_CACHE_RESEARCH_TTL_SECONDS", default=DEFAULT_RESEARCH_TTL)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def get_client() -> Redis | None:
    """Create a Redis client for the configured URL, or None when caching is unavailable.

    Returns None when the cache is disabled, no URL is configured, the
    ``redis`` package is not installed, or the URL is rejected by
    ``redis.from_url``. The cache is optional, so none of these conditions
    raise. Responses are decoded to ``str`` (``decode_responses=True``).
    """
    if not cache_enabled():
        return None
    url = _redis_url()
    if not url:
        return None
    try:
        import redis
    except ImportError:
        return None
    try:
        return redis.from_url(url, decode_responses=True)
    except ValueError:
        # Malformed URL or unsupported scheme: treat like "no cache" instead of
        # crashing a research run over an optional dependency.
        return None
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def ping(client: Redis) -> bool:
    """Return True when ``client.ping()`` succeeds with a truthy reply.

    Any exception raised by the ping is treated as "unreachable" and yields False.
    """
    try:
        reply = client.ping()
    except Exception:
        return False
    return bool(reply)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _sha(parts: list[str]) -> str:
|
| 84 |
+
payload = "\x1f".join(parts).encode("utf-8")
|
| 85 |
+
return hashlib.sha256(payload).hexdigest()
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def _key(kind: str, digest: str) -> str:
    """Build a namespaced cache key of the form ``<KEY_PREFIX>:<kind>:<digest>``."""
    return ":".join((KEY_PREFIX, kind, digest))
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def _normalize_url(url: str) -> str:
|
| 93 |
+
return url.strip().rstrip("/").lower()
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def search_cache_key(query: str, query_domains: list[str]) -> str:
    """Return the cache key for a search: hash of the normalized query and domain set.

    The query is stripped and lower-cased. Domains are stripped, lower-cased,
    sorted and joined with ``|``; blank domains are ignored, so domain order
    does not affect the key.
    """
    cleaned_domains = sorted(
        domain.strip().lower() for domain in query_domains if domain.strip()
    )
    digest = _sha([query.strip().lower(), "|".join(cleaned_domains)])
    return _key("search", digest)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def page_cache_key(url: str) -> str:
    """Return the cache key under which the scraped body for ``url`` is stored."""
    digest = _sha([_normalize_url(url)])
    return _key("page", digest)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def research_cache_key(
    query: str,
    query_domains: list[str],
    allowed_prefixes: list[str],
    source_urls: list[str],
) -> str:
    """Return the cache key for a full research result.

    The key hashes the stripped, lower-cased query together with three sorted,
    ``|``-joined lists: domains, allowed URL prefixes (trailing slash removed,
    lower-cased) and source URLs (via ``_normalize_url``). Blank entries are
    ignored, so list order does not affect the key.
    """
    domain_part = "|".join(
        sorted(item.strip().lower() for item in query_domains if item.strip())
    )
    prefix_part = "|".join(
        sorted(item.strip().rstrip("/").lower() for item in allowed_prefixes if item.strip())
    )
    source_part = "|".join(
        sorted(_normalize_url(item) for item in source_urls if item.strip())
    )
    digest = _sha([query.strip().lower(), domain_part, prefix_part, source_part])
    return _key("research", digest)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def get_json(client: Redis, key: str) -> Any | None:
    """Fetch ``key`` and decode it as JSON.

    Returns None when the key is missing, its value is empty, or the value is
    not valid JSON.
    """
    stored = client.get(key)
    if stored:
        try:
            return json.loads(stored)
        except json.JSONDecodeError:
            pass
    return None
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def set_json(client: Redis, key: str, value: Any, ttl_seconds: int) -> None:
    """Store ``value`` as JSON under ``key`` with an expiry of ``ttl_seconds``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    encoded = json.dumps(value, ensure_ascii=False)
    client.setex(key, ttl_seconds, encoded)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def get_search(client: Redis, query: str, query_domains: list[str]) -> list[str] | None:
    """Return the cached URL list for a search, or None when there is no usable entry.

    Only a cached JSON list counts as a hit; its items are converted to ``str``.
    """
    cached = get_json(client, search_cache_key(query, query_domains))
    if not isinstance(cached, list):
        return None
    return [str(entry) for entry in cached]
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def set_search(client: Redis, query: str, query_domains: list[str], urls: list[str]) -> None:
    """Cache the discovered ``urls`` for this query/domain combination using the search TTL."""
    cache_key = search_cache_key(query, query_domains)
    set_json(client, cache_key, urls, search_ttl())
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def get_pages_bulk(client: Redis, urls: list[str]) -> dict[str, str]:
    """Return url -> markdown for cache hits (pipelined GET).

    Keys in the result are the URLs exactly as passed in. Misses and empty or
    non-string values are left out.
    """
    if not urls:
        return {}
    batch = client.pipeline()
    for url in urls:
        batch.get(page_cache_key(url))
    bodies = batch.execute()
    return {
        url: body
        for url, body in zip(urls, bodies, strict=True)
        if isinstance(body, str) and body
    }
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def set_page(client: Redis, url: str, markdown: str) -> None:
    """Cache the scraped ``markdown`` for ``url`` using the page TTL.

    Blank or whitespace-only bodies are not stored.
    """
    if markdown.strip():
        client.setex(page_cache_key(url), page_ttl(), markdown)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def set_pages_bulk(client: Redis, pages: dict[str, str]) -> None:
    """Cache several url -> markdown bodies through one pipeline.

    Entries whose body is blank are skipped. All writes share one page TTL,
    read once before queuing.
    """
    if not pages:
        return
    batch = client.pipeline()
    expiry = page_ttl()
    storable = ((url, body) for url, body in pages.items() if body.strip())
    for url, body in storable:
        batch.setex(page_cache_key(url), expiry, body)
    batch.execute()
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def get_research(client: Redis, key: str) -> dict[str, Any] | None:
    """Return the cached research result stored under ``key``.

    Anything other than a JSON object (including a miss) yields None.
    """
    cached = get_json(client, key)
    if isinstance(cached, dict):
        return cached
    return None
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def set_research(client: Redis, key: str, result: dict[str, Any]) -> None:
    """Cache a research ``result`` under ``key`` using the research TTL."""
    expiry = research_ttl()
    set_json(client, key, result, expiry)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def kb_doc_cache_key(source_id: str) -> str:
    """Stable key for KB ingest dedup (markdown file, URL, discord export id, …).

    ``source_id`` is stripped and lower-cased before hashing.
    """
    canonical_id = source_id.strip().lower()
    return _key("kb", _sha([canonical_id]))
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def get_kb_content_hash(client: Redis, source_id: str) -> str | None:
    """Return the content hash last recorded for ``source_id``, or None if absent."""
    stored = client.get(kb_doc_cache_key(source_id))
    if isinstance(stored, str):
        return stored
    return None
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def set_kb_content_hash(client: Redis, source_id: str, content_hash: str) -> None:
    """Record ``content_hash`` for ``source_id``.

    The expiry comes from ``TRASK_CACHE_KB_TTL_SECONDS`` and defaults to 30 days.
    """
    thirty_days = 30 * 24 * 60 * 60
    expiry = _ttl("TRASK_CACHE_KB_TTL_SECONDS", thirty_days)
    client.setex(kb_doc_cache_key(source_id), expiry, content_hash)
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def kb_needs_reindex(client: Redis, source_id: str, content_hash: str) -> bool:
    """True when document is new or content changed (for ingest pipelines).

    Note the side effect: when the hash differs from the stored one, the new
    hash is written immediately, before the caller has re-indexed anything.
    """
    changed = get_kb_content_hash(client, source_id) != content_hash
    if changed:
        set_kb_content_hash(client, source_id, content_hash)
    return changed
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def research_key_for_payload(payload: dict[str, Any]) -> str:
    """Derive the research cache key from a request payload.

    Reads ``query``, ``query_domains``, ``allowed_url_prefixes`` and
    ``source_urls``. Missing or falsy values count as empty, and list items
    are stringified with blank ones dropped. No other payload field
    contributes to the key.
    """

    def _string_items(field_name: str) -> list[str]:
        return [str(item) for item in (payload.get(field_name) or []) if str(item).strip()]

    return research_cache_key(
        str(payload.get("query") or ""),
        _string_items("query_domains"),
        _string_items("allowed_url_prefixes"),
        _string_items("source_urls"),
    )
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def annotate_cache_meta(result: dict[str, Any], stats: dict[str, int]) -> dict[str, Any]:
    """Attach cache stats under research_information for operators.

    Returns a shallow copy of ``result`` whose ``research_information`` is a
    copy of the existing mapping (or an empty one) with ``cache`` set to
    ``stats``. The input dict is not modified.
    """
    info = dict(result.get("research_information") or {})
    info.update(cache=stats)
    return {**result, "research_information": info}
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def _self_test() -> int:
    """Round-trip search, page and research entries through a real Redis.

    Prints a SKIP line and returns 0 when no client can be created or the
    server does not answer a ping. Otherwise writes three test entries, reads
    them back and returns 0 on success.

    Raises:
        AssertionError: when a value read back differs from what was written.
            The check is explicit rather than an ``assert`` statement, so it
            still runs under ``python -O``.
    """
    client = get_client()
    if not client or not ping(client):
        print("SKIP: Redis not configured or unreachable (set REDIS_URL to test)")
        return 0

    q = "__trask_cache_selftest__"
    domains = ["example.com"]
    urls = ["https://example.com/page-a", "https://example.com/page-b"]
    body = "# hello from self-test"
    research = {"report": "ok", "research_information": {}}
    rkey = research_cache_key(q, domains, ["https://example.com"], [])

    try:
        set_search(client, q, domains, urls)
        if get_search(client, q, domains) != urls:
            raise AssertionError("search cache round-trip mismatch")

        set_page(client, urls[0], body)
        if get_pages_bulk(client, urls).get(urls[0]) != body:
            raise AssertionError("page cache round-trip mismatch")

        set_research(client, rkey, research)
        if get_research(client, rkey) != research:
            raise AssertionError("research cache round-trip mismatch")
    finally:
        # Remove the test keys even when a check fails, so a broken run does
        # not leave self-test entries behind until their TTL expires.
        client.delete(search_cache_key(q, domains), page_cache_key(urls[0]), rkey)

    print("OK: trask_cache self-test passed")
    return 0
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
# Running this module as a script executes the Redis self-test and uses its
# return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(_self_test())
|
scripts/trask_web_research.py
ADDED
|
@@ -0,0 +1,511 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Headless web research for Trask / Holocron.
|
| 4 |
+
|
| 5 |
+
stdin: JSON payload (query, allowed_url_prefixes, query_domains, source_urls, …)
|
| 6 |
+
stdout: JSON { report, research_information }
|
| 7 |
+
|
| 8 |
+
Discovery via DuckDuckGo; scrape via Crawl4AI (markdown); trafilatura fallback.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
import asyncio
|
| 15 |
+
import contextlib
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
import re
|
| 19 |
+
import sys
|
| 20 |
+
from dataclasses import dataclass, field
|
| 21 |
+
from typing import Any
|
| 22 |
+
from urllib.parse import urlparse
|
| 23 |
+
|
| 24 |
+
MAX_CANDIDATE_URLS = 12
|
| 25 |
+
MAX_SCRAPE_URLS = 8
|
| 26 |
+
MAX_MARKDOWN_CHARS_PER_PAGE = 12_000
|
| 27 |
+
MIN_USABLE_BODY_CHARS = 280
|
| 28 |
+
SEARCH_RESULTS_PER_DOMAIN = 4
|
| 29 |
+
|
| 30 |
+
FORUM_CHROME_PATTERNS = [
|
| 31 |
+
re.compile(r"\bsign up\b", re.I),
|
| 32 |
+
re.compile(r"\ball activity\b", re.I),
|
| 33 |
+
re.compile(r"\bmark site read\b", re.I),
|
| 34 |
+
re.compile(r"\bactivity feed\b", re.I),
|
| 35 |
+
re.compile(r"\bexisting user\? sign in\b", re.I),
|
| 36 |
+
re.compile(r"\byour content feed\b", re.I),
|
| 37 |
+
]
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _normalize_prefix(value: str) -> str:
|
| 41 |
+
return value.strip().rstrip("/")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _url_allowed(url: str, prefixes: list[str]) -> bool:
    """Return True when ``url`` equals an allowed prefix or lies beneath one.

    Both sides are normalized with ``_normalize_prefix``. "Beneath" means the
    URL starts with the prefix followed by ``/``, so ``https://a.example``
    does not admit ``https://a.example.evil``. Blank prefixes are ignored and
    the comparison is case-sensitive.
    """
    target = _normalize_prefix(url)
    roots = (_normalize_prefix(raw) for raw in prefixes)
    return any(
        bool(root) and (target == root or target.startswith(root + "/"))
        for root in roots
    )
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _unique_urls(urls: list[str]) -> list[str]:
|
| 56 |
+
seen: set[str] = set()
|
| 57 |
+
out: list[str] = []
|
| 58 |
+
for url in urls:
|
| 59 |
+
u = url.strip()
|
| 60 |
+
if not u or not u.startswith(("http://", "https://")):
|
| 61 |
+
continue
|
| 62 |
+
key = u.rstrip("/").lower()
|
| 63 |
+
if key in seen:
|
| 64 |
+
continue
|
| 65 |
+
seen.add(key)
|
| 66 |
+
out.append(u)
|
| 67 |
+
return out
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _host_from_url(url: str) -> str:
|
| 71 |
+
try:
|
| 72 |
+
return urlparse(url).netloc.lower().replace("www.", "")
|
| 73 |
+
except Exception:
|
| 74 |
+
return ""
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _looks_like_forum_chrome(text: str) -> bool:
    """Heuristic: does ``text`` look like navigation boilerplate rather than content?

    Text shorter than 120 characters always counts as chrome. Longer text
    counts as chrome when at least two of ``FORUM_CHROME_PATTERNS`` match.
    """
    if len(text) < 120:
        return True
    matched = 0
    for pattern in FORUM_CHROME_PATTERNS:
        if pattern.search(text):
            matched += 1
            if matched >= 2:
                return True
    return False
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _extract_follow_up_links(markdown: str, query: str, allowed_prefixes: list[str]) -> list[str]:
    """Collect allowed markdown link targets that mention a query token.

    A link qualifies when any query token occurs as a substring of its
    lower-cased label or URL and the URL passes ``_url_allowed``. Returns an
    empty list when the query yields no tokens. The result is de-duplicated
    in first-seen order.
    """
    wanted = _query_tokens(query)
    if not wanted:
        return []
    link_pattern = r"\[([^\]]*)\]\((https?://[^)]+)\)"
    relevant = [
        target
        for label, target in re.findall(link_pattern, markdown)
        if any(word in f"{label} {target}".lower() for word in wanted)
        and _url_allowed(target, allowed_prefixes)
    ]
    return _unique_urls(relevant)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _truncate(text: str, limit: int) -> str:
|
| 99 |
+
if len(text) <= limit:
|
| 100 |
+
return text
|
| 101 |
+
return text[: limit - 3].rstrip() + "..."
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
@dataclass
class PageEvidence:
    """One scraped page kept as evidence."""

    # Address the page was fetched from.
    url: str
    # Page body as markdown text.
    markdown: str
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
@dataclass
class GatherResult:
    """Accumulated outcome of one gather run; every field starts out empty."""

    # Evidence pages collected during the run.
    pages: list[PageEvidence] = field(default_factory=list)
    # URL bookkeeping lists, named after the stage they record.
    visited_urls: list[str] = field(default_factory=list)
    retrieved_urls: list[str] = field(default_factory=list)
    rejected_urls: list[str] = field(default_factory=list)
    candidate_urls: list[str] = field(default_factory=list)
    # Integer counters keyed by name (e.g. cache hit/miss tallies).
    cache_stats: dict[str, int] = field(default_factory=dict)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def _query_tokens(query: str) -> set[str]:
|
| 121 |
+
return {t for t in re.findall(r"[a-z0-9]{3,}", query.lower()) if t not in {"what", "where", "when", "does", "the", "for", "and", "how"}}
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _rank_source_urls(query: str, source_urls: list[str]) -> list[str]:
    """Order ``source_urls`` by relevance to ``query``, best first.

    Each query token found in the lower-cased URL scores 2. URLs containing
    ``technical``, ``reference`` or ``neocities`` score 1 extra. Ties keep
    their input order.
    """
    tokens = _query_tokens(query)

    def relevance(url: str) -> int:
        lowered = url.lower()
        points = sum(2 for token in tokens if token in lowered)
        if "technical" in lowered or "reference" in lowered or "neocities" in lowered:
            points += 1
        return points

    return sorted(source_urls, key=relevance, reverse=True)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def discover_urls(
    query: str,
    query_domains: list[str],
    source_urls: list[str],
    allowed_prefixes: list[str],
    cache_client: Any | None = None,
    cache_stats: dict[str, int] | None = None,
) -> list[str]:
    """Build the list of candidate URLs to scrape for ``query``.

    Candidates are the allowed ``source_urls`` ranked by relevance, followed
    by DuckDuckGo results for up to six domains (``site:`` queries) and, when
    fewer than three candidates exist, an unrestricted search. The allowed
    sources are always kept as crawl seeds. The result contains only URLs
    under ``allowed_prefixes``, de-duplicated and capped at
    ``MAX_CANDIDATE_URLS``.

    Args:
        query: The research question.
        query_domains: Hosts to search. When empty, hosts are derived from
            ``allowed_prefixes``.
        source_urls: Catalog URLs to seed the candidate list with.
        allowed_prefixes: URL prefixes a candidate must fall under.
        cache_client: Optional Redis client. When given, a cached search
            result is reused and a fresh result is stored.
        cache_stats: Optional counter dict; ``search_hits`` and
            ``search_misses`` are incremented in place.

    Returns:
        Candidate URLs, possibly empty. Search and cache failures are
        swallowed (best effort) and never raised.
    """
    if cache_client is not None:
        try:
            from trask_cache import get_search

            cached = get_search(cache_client, query, query_domains)
            reusable = (
                [u for u in cached if _url_allowed(u, allowed_prefixes)]
                if cached is not None
                else []
            )
            # The cache key covers only query and domains, not the prefixes. A
            # cached list that the current prefixes reject completely is
            # treated as a miss, so discovery and catalog seeding still run
            # instead of returning nothing.
            if reusable:
                if cache_stats is not None:
                    cache_stats["search_hits"] = cache_stats.get("search_hits", 0) + 1
                return reusable[:MAX_CANDIDATE_URLS]
            if cache_stats is not None:
                cache_stats["search_misses"] = cache_stats.get("search_misses", 0) + 1
        except Exception:
            pass

    allowed_sources = [u for u in _unique_urls(source_urls) if _url_allowed(u, allowed_prefixes)]
    candidates: list[str] = _rank_source_urls(query, allowed_sources)

    domains = [d.strip() for d in query_domains if d.strip()]
    if not domains:
        domains = list({_host_from_url(p) for p in allowed_prefixes if _host_from_url(p)})

    def _collect(results: Any) -> None:
        # Append every non-empty link target; rows appended before a failure are kept.
        for item in results:
            href = (item.get("href") or item.get("url") or "").strip()
            if href:
                candidates.append(href)

    try:
        from duckduckgo_search import DDGS

        with DDGS() as ddgs:
            for domain in domains[:6]:
                site_query = f"{query} site:{domain}"
                try:
                    _collect(ddgs.text(site_query, max_results=SEARCH_RESULTS_PER_DOMAIN, backend="bing"))
                except Exception:
                    continue
            if len(candidates) < 3:
                try:
                    _collect(ddgs.text(query, max_results=10, backend="bing"))
                except Exception:
                    pass
    except Exception:
        pass

    # DuckDuckGo may rate-limit; always keep ranked catalog homes as crawl seeds.
    # Duplicates are removed by _unique_urls below, which keeps the first occurrence.
    candidates.extend(allowed_sources)

    filtered = [u for u in _unique_urls(candidates) if _url_allowed(u, allowed_prefixes)]
    result = filtered[:MAX_CANDIDATE_URLS]

    if cache_client is not None and result:
        try:
            from trask_cache import set_search

            set_search(cache_client, query, query_domains, result)
        except Exception:
            pass

    return result
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def _trafilatura_fetch(url: str) -> str:
    """Download ``url`` with trafilatura and return the extracted text.

    This is a best-effort helper: it returns an empty string when
    trafilatura cannot be imported, when the download or the extraction
    yields nothing, or when any step raises.

    Args:
        url: Address of the page to download.

    Returns:
        The extracted text with surrounding whitespace removed, or ``""``.
    """
    try:
        # Imported lazily so a missing package degrades to "no content".
        import trafilatura

        html = trafilatura.fetch_url(url)
        if html:
            extracted = trafilatura.extract(
                html,
                include_comments=False,
                include_tables=True,
            )
            return (extracted or "").strip()
    except Exception:
        pass
    return ""
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
async def _crawl_with_shared_crawler(crawler: Any, url: str) -> str:
    """Fetch ``url`` through an already-open crawler and return its markdown.

    The crawl bypasses crawl4ai's own cache. When crawl4ai cannot be
    imported, the crawl raises, or the crawl result is unsuccessful or has
    no markdown, the page is fetched with ``_trafilatura_fetch`` instead.

    Args:
        crawler: Object exposing an awaitable ``arun(url=..., config=...)``.
        url: Address of the page to crawl.

    Returns:
        Stripped markdown from the crawl, otherwise whatever the
        trafilatura fallback returns (possibly an empty string).
    """
    try:
        from crawl4ai import CacheMode, CrawlerRunConfig

        settings = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, word_count_threshold=10)
        outcome = await crawler.arun(url=url, config=settings)
        if outcome.success and outcome.markdown:
            return outcome.markdown.strip()
    except Exception:
        # Any crawl problem is handled by the plain-HTTP fallback below.
        pass
    return _trafilatura_fetch(url)
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
@contextlib.contextmanager
def _redirect_stdout_to_stderr():
    """Temporarily point ``sys.stdout`` at ``sys.stderr``.

    Anything printed while the context is active goes to stderr. The
    previous ``sys.stdout`` is restored on exit, including when the body
    raises.
    """
    with contextlib.redirect_stdout(sys.stderr):
        yield
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def _page_from_cache(url: str, cached_pages: dict[str, str]) -> str | None:
    """Return the usable cached body for ``url``, or ``None``.

    A cached body is ignored when it is missing or empty, shorter than
    ``MIN_USABLE_BODY_CHARS``, or flagged by ``_looks_like_forum_chrome``.

    Args:
        url: Page address used as the lookup key.
        cached_pages: Mapping of URL to previously cached page body.

    Returns:
        The cached body truncated to ``MAX_MARKDOWN_CHARS_PER_PAGE``, or
        ``None`` when there is no usable entry.
    """
    body = cached_pages.get(url)
    if not body:
        return None
    if len(body) < MIN_USABLE_BODY_CHARS:
        return None
    if _looks_like_forum_chrome(body):
        return None
    return _truncate(body, MAX_MARKDOWN_CHARS_PER_PAGE)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
async def _resolve_page_body(
    url: str,
    crawler: Any | None,
    cache_client: Any | None,
    cached_pages: dict[str, str],
    cache_stats: dict[str, int],
) -> str:
    """Return the page body for ``url``, preferring the cache over a fetch.

    Args:
        url: Page address to resolve.
        crawler: Shared crawler to fetch with; when ``None`` the page is
            fetched with ``_trafilatura_fetch`` instead.
        cache_client: Cache handle passed to ``trask_cache.set_page``, or
            ``None`` to skip write-back.
        cached_pages: Pre-fetched cache entries keyed by URL.
        cache_stats: Counter dict; ``page_hits`` or ``page_misses`` is
            incremented in place.

    Returns:
        The usable body truncated to ``MAX_MARKDOWN_CHARS_PER_PAGE``, or
        ``""`` when nothing usable could be obtained.
    """
    hit = _page_from_cache(url, cached_pages)
    if hit is not None:
        cache_stats["page_hits"] = cache_stats.get("page_hits", 0) + 1
        return hit

    cache_stats["page_misses"] = cache_stats.get("page_misses", 0) + 1
    if crawler is None:
        fetched = _trafilatura_fetch(url)
    else:
        fetched = await _crawl_with_shared_crawler(crawler, url)

    # Same usability rules as the cache path: non-empty, long enough, not chrome.
    if not fetched or len(fetched) < MIN_USABLE_BODY_CHARS or _looks_like_forum_chrome(fetched):
        return ""

    clipped = _truncate(fetched, MAX_MARKDOWN_CHARS_PER_PAGE)
    if cache_client is not None:
        try:
            from trask_cache import set_page

            set_page(cache_client, url, clipped)
        except Exception:
            # Cache write-back is optional; the fetched body is still returned.
            pass
    return clipped
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
async def gather_evidence(
    query: str,
    query_domains: list[str],
    source_urls: list[str],
    allowed_prefixes: list[str],
) -> GatherResult:
    """Discover candidate URLs for ``query`` and collect their page bodies.

    Steps:
      1. Obtain a cache client from ``trask_cache`` (skipped on any error or
         when ``ping`` fails).
      2. Ask ``discover_urls`` for candidates and keep the first
         ``MAX_SCRAPE_URLS`` as scrape targets.
      3. Bulk-load cached bodies for those targets.
      4. Crawl each target with one shared crawl4ai crawler, also following
         links returned by ``_extract_follow_up_links`` while the target
         budget allows.
      5. If the crawler pass raises (for example crawl4ai is missing), retry
         every target that has not been retrieved, without a crawler.

    Args:
        query: Research question.
        query_domains: Domains forwarded to ``discover_urls``.
        source_urls: Seed URLs forwarded to ``discover_urls``.
        allowed_prefixes: URL prefixes forwarded to ``discover_urls`` and
            ``_extract_follow_up_links``.

    Returns:
        A ``GatherResult`` whose ``candidate_urls``, ``visited_urls``,
        ``retrieved_urls``, ``rejected_urls``, ``pages`` and ``cache_stats``
        have been filled in.
    """
    result = GatherResult()
    cache_client = None
    try:
        from trask_cache import get_client, ping

        candidate = get_client()
        if candidate is not None and ping(candidate):
            cache_client = candidate
    except Exception:
        cache_client = None

    stats = result.cache_stats
    result.candidate_urls = discover_urls(
        query,
        query_domains,
        source_urls,
        allowed_prefixes,
        cache_client=cache_client,
        cache_stats=stats,
    )
    scrape_targets = list(result.candidate_urls[:MAX_SCRAPE_URLS])
    seen_targets = set(scrape_targets)

    cached_pages: dict[str, str] = {}
    if cache_client is not None and scrape_targets:
        try:
            from trask_cache import get_pages_bulk

            cached_pages = get_pages_bulk(cache_client, scrape_targets)
        except Exception:
            cached_pages = {}

    async def accept_url(url: str, crawler: Any | None) -> str | None:
        """Resolve one URL and record the outcome on ``result``."""
        # The fallback pass may attempt a URL a second time, so the
        # bookkeeping is idempotent: each URL is listed once as visited and
        # at most once as rejected.
        if url not in result.visited_urls:
            result.visited_urls.append(url)
        body = await _resolve_page_body(url, crawler, cache_client, cached_pages, stats)
        if not body:
            if url not in result.rejected_urls:
                result.rejected_urls.append(url)
            return None
        # A URL that failed earlier but succeeded on retry is no longer rejected.
        if url in result.rejected_urls:
            result.rejected_urls.remove(url)
        result.pages.append(PageEvidence(url=url, markdown=body))
        result.retrieved_urls.append(url)
        return body

    try:
        from crawl4ai import AsyncWebCrawler, BrowserConfig

        browser_config = BrowserConfig(headless=True, verbose=False)
        # Keep anything printed during the crawl off stdout.
        with _redirect_stdout_to_stderr():
            async with AsyncWebCrawler(config=browser_config) as crawler:
                # Iterate a snapshot: follow-ups appended below are fetched
                # immediately and must not be walked a second time.
                for url in list(scrape_targets):
                    body = await accept_url(url, crawler)
                    if not body:
                        continue
                    for follow_up in _extract_follow_up_links(body, query, allowed_prefixes):
                        if follow_up in seen_targets or len(scrape_targets) >= MAX_SCRAPE_URLS:
                            continue
                        seen_targets.add(follow_up)
                        scrape_targets.append(follow_up)
                        if cache_client is not None:
                            try:
                                from trask_cache import get_pages_bulk

                                cached_pages.update(get_pages_bulk(cache_client, [follow_up]))
                            except Exception:
                                pass
                        await accept_url(follow_up, crawler)
    except Exception:
        # Crawler unavailable or failed: retry the targets that have not
        # been retrieved, using the crawler-less fetch path.
        for url in scrape_targets:
            if url in result.retrieved_urls:
                continue
            await accept_url(url, None)

    return result
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
def build_report(query: str, gather: GatherResult) -> str:
    """Render the gathered pages as a markdown evidence report.

    Args:
        query: Research question; surrounding whitespace is stripped for
            the title line.
        gather: Gather result whose ``pages`` each provide ``url`` and
            ``markdown``.

    Returns:
        A fixed apology sentence when there are no pages. Otherwise a
        markdown document with a title, an intro line, and one
        ``## Evidence from <url>`` section per page, with outer whitespace
        stripped.
    """
    if not gather.pages:
        return "I could not complete live archive synthesis for this question right now."

    lines: list[str] = [
        f"# Research evidence for: {query.strip()}",
        "",
        "The following excerpts were retrieved from approved archive sources.",
        "",
    ]
    for page in gather.pages:
        lines.extend((f"## Evidence from {page.url}", "", page.markdown, ""))
    return "\n".join(lines).strip()
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
def run_payload(payload: dict[str, Any]) -> dict[str, Any]:
    """Run one research request and return the report with its metadata.

    Reads ``query``, ``query_domains``, ``allowed_url_prefixes`` and
    ``source_urls`` from ``payload``. When the payload gives no prefixes or
    domains, the newline-separated ``TRASK_ALLOWED_URL_PREFIXES`` and
    ``TRASK_ALLOWED_QUERY_DOMAINS`` environment variables are used instead.
    A cached research result is returned when available. Otherwise evidence
    is gathered, a report is built, and the result is stored in the cache
    when at least one page was retrieved. Every cache step is best-effort.

    Args:
        payload: Decoded request mapping.

    Returns:
        A dict with ``report`` and ``research_information`` keys, passed
        through ``annotate_cache_meta`` whenever cache statistics exist.

    Raises:
        ValueError: If ``query`` is missing or blank.
    """
    query = str(payload.get("query") or "").strip()
    if not query:
        raise ValueError("query is required")

    def string_items(field: str) -> list[str]:
        # Stringify every entry and drop the ones that are blank.
        return [str(item) for item in (payload.get(field) or []) if str(item).strip()]

    def non_blank_lines(text: str) -> list[str]:
        return [line.strip() for line in text.splitlines() if line.strip()]

    query_domains = string_items("query_domains")
    allowed_prefixes = string_items("allowed_url_prefixes")
    source_urls = string_items("source_urls")

    # Environment values only apply when the payload supplied nothing.
    env_prefixes = os.environ.get("TRASK_ALLOWED_URL_PREFIXES", "")
    if env_prefixes and not allowed_prefixes:
        allowed_prefixes = non_blank_lines(env_prefixes)
    env_domains = os.environ.get("TRASK_ALLOWED_QUERY_DOMAINS", "")
    if env_domains and not query_domains:
        query_domains = non_blank_lines(env_domains)

    def cache_key_fields() -> dict[str, Any]:
        # The same normalized fields key both the cache read and the write.
        return {
            "query": query,
            "query_domains": query_domains,
            "allowed_url_prefixes": allowed_prefixes,
            "source_urls": source_urls,
        }

    # Cache read: return a stored result immediately when there is one.
    try:
        from trask_cache import (
            annotate_cache_meta,
            get_client,
            get_research,
            ping,
            research_key_for_payload,
            set_research,
        )

        reader = get_client()
        if reader is not None and ping(reader):
            previous = get_research(reader, research_key_for_payload(cache_key_fields()))
            if previous is not None:
                info = previous.get("research_information") or {}
                hit_stats = dict(info.get("cache") or {})
                hit_stats["research_hits"] = hit_stats.get("research_hits", 0) + 1
                return annotate_cache_meta(previous, hit_stats)
    except Exception:
        pass

    gather = asyncio.run(
        gather_evidence(query, query_domains, source_urls, allowed_prefixes),
    )

    result = {
        "report": build_report(query, gather),
        "research_information": {
            "source_urls": gather.retrieved_urls,
            "cited_urls": gather.retrieved_urls,
            "retrieved_urls": gather.retrieved_urls,
            "visited_urls": gather.visited_urls,
            "query_domains": query_domains,
            "allowed_url_prefixes": allowed_prefixes,
            "rejected_source_urls": gather.rejected_urls,
        },
    }

    # Cache write: only results that contain at least one page are stored.
    try:
        from trask_cache import annotate_cache_meta, get_client, ping, research_key_for_payload, set_research

        writer = get_client()
        if writer is not None and ping(writer) and gather.pages:
            store_key = research_key_for_payload(cache_key_fields())
            miss_stats = dict(gather.cache_stats)
            miss_stats["research_misses"] = miss_stats.get("research_misses", 0) + 1
            annotated = annotate_cache_meta(result, miss_stats)
            set_research(writer, store_key, annotated)
            return annotated
    except Exception:
        pass

    if gather.cache_stats:
        try:
            from trask_cache import annotate_cache_meta

            return annotate_cache_meta(result, gather.cache_stats)
        except Exception:
            pass

    return result
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
def main() -> int:
    """Command-line entry point: read a JSON payload on stdin, write JSON on stdout.

    With ``--dry-run`` the third-party dependencies are imported, a small
    status object is printed, and the function returns 0. Otherwise the
    payload is decoded and passed to ``run_payload``, and the result is
    written to stdout as JSON.

    Returns:
        ``0`` on success. ``1`` when stdin is empty, the JSON is invalid, or
        ``run_payload`` raises; the error is printed to stderr as JSON.
    """
    parser = argparse.ArgumentParser(description="Trask headless web research (Crawl4AI + DDG)")
    parser.add_argument("--dry-run", action="store_true", help="Import dependencies and exit 0")
    options = parser.parse_args()

    def fail(message: str) -> int:
        # Errors go to stderr so stdout carries only the result document.
        print(json.dumps({"error": message}), file=sys.stderr)
        return 1

    if options.dry_run:
        import crawl4ai  # noqa: F401
        import duckduckgo_search  # noqa: F401
        import trafilatura  # noqa: F401

        print(json.dumps({"ok": True, "backend": "crawl4ai"}))
        return 0

    raw = sys.stdin.read()
    if not raw.strip():
        return fail("empty stdin")

    try:
        payload = json.loads(raw)
    except json.JSONDecodeError as exc:
        return fail(f"invalid json: {exc}")

    try:
        outcome = run_payload(payload)
    except Exception as exc:
        return fail(str(exc))

    sys.stdout.write(json.dumps(outcome, ensure_ascii=False))
    return 0
|
| 508 |
+
|
| 509 |
+
|
| 510 |
+
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|