ArunKr committed on
Commit 9849ff1 · verified · 1 Parent(s): d14f9f5

Upload folder using huggingface_hub

TASKS.md CHANGED
@@ -51,9 +51,9 @@ Legend:
 ## P3 — RAG + indexing (docs/web/GitHub) + “password manager”
 - [x] Clarify “password manager” scope and threat model (`docs/PASSWORD_MANAGER_SCOPE.md`).
 - [x] Document upload indexing connector (MVP: text-only, keyword search).
-- [ ] Website crawler indexing (depth/allowlist/robots/rate limits).
+- [~] Website crawler indexing (MVP: same-origin crawl with depth/pages, basic robots, private-host blocking).
 - [ ] GitHub repo indexing connector (branch/path filters + token support).
-- [ ] Jobs UI (progress/retries/errors/access controls).
+- [~] Jobs UI (MVP: start/cancel/list crawl jobs).
 
 ## P3 — P2P pubsub chat + account manager
 - [ ] Account manager: identities/devices, memberships, permissions, moderation.
app/indexing_jobs.py ADDED
@@ -0,0 +1,418 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import re
+import socket
+import uuid
+from dataclasses import asdict, dataclass, field
+from datetime import UTC, datetime
+from html import unescape
+from pathlib import Path
+from typing import Any
+from urllib.parse import urljoin, urlparse
+
+import httpx
+from fastapi import HTTPException
+
+from app.storage import user_data_dir
+
+
+def _jobs_path(user_id: str) -> Path:
+    return user_data_dir(user_id) / "indexing-jobs.json"
+
+
+def _now_iso() -> str:
+    return datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def _is_public_host(hostname: str) -> bool:
+    host = (hostname or "").strip().lower()
+    if not host:
+        return False
+    if host in {"localhost", "localhost.localdomain"}:
+        return False
+    if host.endswith(".local") or host.endswith(".internal"):
+        return False
+
+    try:
+        infos = socket.getaddrinfo(host, None, proto=socket.IPPROTO_TCP)
+    except Exception:
+        return False
+
+    import ipaddress
+
+    for info in infos:
+        addr = info[4][0]
+        try:
+            ip = ipaddress.ip_address(addr)
+        except Exception:
+            return False
+        if (
+            ip.is_private
+            or ip.is_loopback
+            or ip.is_link_local
+            or ip.is_multicast
+            or ip.is_reserved
+            or ip.is_unspecified
+        ):
+            return False
+    return True
+
+
+def _normalize_url(url: str) -> str:
+    u = (url or "").strip()
+    if not u:
+        raise HTTPException(status_code=400, detail={"code": "invalid_request", "message": "Missing URL"})
+    p = urlparse(u)
+    if p.scheme not in {"https", "http"}:
+        raise HTTPException(status_code=400, detail={"code": "invalid_request", "message": "URL must be http(s)"})
+    if not p.netloc:
+        raise HTTPException(status_code=400, detail={"code": "invalid_request", "message": "URL must include a host"})
+    if not _is_public_host(p.hostname or ""):
+        raise HTTPException(status_code=400, detail={"code": "invalid_request", "message": "Host is not allowed"})
+    # Normalize: strip fragment, keep query.
+    normalized = p._replace(fragment="").geturl()
+    return normalized
+
+
+def _extract_links(html: str) -> list[str]:
+    # Best-effort: handle common href patterns; ignore javascript:, mailto:, etc.
+    out: list[str] = []
+    for m in re.finditer(r"""href\s*=\s*['"]([^'"]+)['"]""", html, flags=re.IGNORECASE):
+        href = (m.group(1) or "").strip()
+        if not href:
+            continue
+        if href.startswith("#"):
+            continue
+        if href.lower().startswith(("javascript:", "mailto:", "tel:", "data:")):
+            continue
+        out.append(href)
+    return out
+
+
+def _html_to_text(html: str) -> str:
+    s = html or ""
+    s = re.sub(r"(?is)<script.*?>.*?</script>", " ", s)
+    s = re.sub(r"(?is)<style.*?>.*?</style>", " ", s)
+    s = re.sub(r"(?is)<noscript.*?>.*?</noscript>", " ", s)
+    s = re.sub(r"(?i)<br\s*/?>", "\n", s)
+    s = re.sub(r"(?i)</p\s*>", "\n\n", s)
+    s = re.sub(r"(?is)<[^>]+>", " ", s)
+    s = unescape(s)
+    s = re.sub(r"[ \t\r\f\v]+", " ", s)
+    s = re.sub(r"\n{3,}", "\n\n", s)
+    return s.strip()
+
+
+def _chunk_text(text: str, *, max_chars: int = 1200, overlap: int = 120) -> list[dict[str, str]]:
+    t = (text or "").strip()
+    if not t:
+        return []
+    chunks: list[dict[str, str]] = []
+    i = 0
+    n = len(t)
+    while i < n:
+        j = min(n, i + max_chars)
+        chunk = t[i:j].strip()
+        if chunk:
+            chunks.append({"id": str(uuid.uuid4()), "text": chunk})
+        if j >= n:
+            break
+        i = max(0, j - overlap)
+    return chunks
+
+
+def _rag_dir(user_id: str) -> Path:
+    root = user_data_dir(user_id) / "rag"
+    root.mkdir(parents=True, exist_ok=True)
+    return root
+
+
+def _rag_index_path(user_id: str) -> Path:
+    return _rag_dir(user_id) / "rag-index.json"
+
+
+def _load_rag_index(path: Path) -> dict[str, Any]:
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        return {"version": 1, "documents": []}
+
+
+def _save_rag_index(path: Path, data: dict[str, Any]) -> None:
+    path.write_text(json.dumps(data, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
+
+
+def add_rag_document(user_id: str, *, name: str, text: str, source: str | None = None) -> dict[str, Any]:
+    doc_id = str(uuid.uuid4())
+    chunks = _chunk_text(text)
+    rag_root = _rag_dir(user_id)
+    doc_path = rag_root / f"{doc_id}.txt"
+    doc_path.write_text(text, encoding="utf-8")
+
+    idx_path = _rag_index_path(user_id)
+    idx = _load_rag_index(idx_path)
+    docs = idx.get("documents")
+    if not isinstance(docs, list):
+        docs = []
+    idx["documents"] = docs
+
+    entry: dict[str, Any] = {
+        "id": doc_id,
+        "name": name,
+        "createdAt": _now_iso(),
+        "bytes": len(text.encode("utf-8")),
+        "path": doc_path.name,
+        "chunks": chunks,
+    }
+    if source:
+        entry["source"] = source
+
+    docs.append(entry)
+    _save_rag_index(idx_path, idx)
+    return {"id": doc_id, "chunks": len(chunks)}
+
+
+@dataclass
+class IndexJob:
+    id: str
+    type: str
+    createdAt: str
+    status: str = "queued"  # queued|running|succeeded|failed|canceled
+    params: dict[str, Any] = field(default_factory=dict)
+    progress: dict[str, Any] = field(default_factory=dict)
+    result: dict[str, Any] = field(default_factory=dict)
+    error: str | None = None
+
+
+class IndexJobStore:
+    def __init__(self) -> None:
+        self._locks: dict[str, asyncio.Lock] = {}
+        self._tasks: dict[str, dict[str, asyncio.Task]] = {}
+
+    def _lock(self, user_id: str) -> asyncio.Lock:
+        if user_id not in self._locks:
+            self._locks[user_id] = asyncio.Lock()
+        return self._locks[user_id]
+
+    def _task_map(self, user_id: str) -> dict[str, asyncio.Task]:
+        return self._tasks.setdefault(user_id, {})
+
+    async def list_jobs(self, user_id: str) -> list[dict[str, Any]]:
+        path = _jobs_path(user_id)
+        try:
+            data = json.loads(path.read_text(encoding="utf-8"))
+            jobs = data.get("jobs") if isinstance(data, dict) else None
+            if not isinstance(jobs, list):
+                return []
+            return [j for j in jobs if isinstance(j, dict)]
+        except FileNotFoundError:
+            return []
+        except json.JSONDecodeError:
+            return []
+
+    async def get_job(self, user_id: str, job_id: str) -> dict[str, Any] | None:
+        for j in await self.list_jobs(user_id):
+            if str(j.get("id") or "") == job_id:
+                return j
+        return None
+
+    async def _save_jobs(self, user_id: str, jobs: list[dict[str, Any]]) -> None:
+        path = _jobs_path(user_id)
+        payload = {"version": 1, "jobs": jobs}
+        tmp = path.with_suffix(".tmp")
+        tmp.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
+        tmp.replace(path)
+
+    async def _update_job(self, user_id: str, job: IndexJob) -> None:
+        async with self._lock(user_id):
+            jobs = await self.list_jobs(user_id)
+            updated = False
+            for i, j in enumerate(jobs):
+                if str(j.get("id") or "") == job.id:
+                    jobs[i] = asdict(job)
+                    updated = True
+                    break
+            if not updated:
+                jobs.append(asdict(job))
+            await self._save_jobs(user_id, jobs)
+
+    async def cancel_job(self, user_id: str, job_id: str) -> bool:
+        task = self._task_map(user_id).get(job_id)
+        if task and not task.done():
+            task.cancel()
+            return True
+        return False
+
+    async def create_web_crawl_job(
+        self,
+        user_id: str,
+        *,
+        start_url: str,
+        max_pages: int = 25,
+        max_depth: int = 2,
+        rate_limit_sec: float = 0.25,
+        respect_robots: bool = True,
+    ) -> IndexJob:
+        url = _normalize_url(start_url)
+        max_pages = max(1, min(int(max_pages), 150))
+        max_depth = max(0, min(int(max_depth), 6))
+        rate_limit_sec = float(rate_limit_sec)
+        if rate_limit_sec < 0:
+            rate_limit_sec = 0.0
+        if rate_limit_sec > 5:
+            rate_limit_sec = 5.0
+
+        job = IndexJob(
+            id=str(uuid.uuid4()),
+            type="web_crawl",
+            createdAt=_now_iso(),
+            params={
+                "startUrl": url,
+                "maxPages": max_pages,
+                "maxDepth": max_depth,
+                "rateLimitSec": rate_limit_sec,
+                "respectRobots": bool(respect_robots),
+            },
+        )
+        await self._update_job(user_id, job)
+
+        task = asyncio.create_task(self._run_web_crawl(user_id, job))
+        self._task_map(user_id)[job.id] = task
+        return job
+
+    async def _run_web_crawl(self, user_id: str, job: IndexJob) -> None:
+        job.status = "running"
+        job.progress = {"visited": 0, "indexedPages": 0, "queued": 0}
+        await self._update_job(user_id, job)
+
+        start_url = job.params.get("startUrl") or ""
+        max_pages = int(job.params.get("maxPages") or 25)
+        max_depth = int(job.params.get("maxDepth") or 2)
+        rate_limit_sec = float(job.params.get("rateLimitSec") or 0.25)
+        respect_robots = bool(job.params.get("respectRobots") is True)
+
+        origin = urlparse(start_url)
+        allowed_netloc = origin.netloc
+        base = f"{origin.scheme}://{origin.netloc}"
+
+        robots_disallow_all = False
+        if respect_robots:
+            try:
+                async with httpx.AsyncClient(timeout=10.0, follow_redirects=False, headers={"User-Agent": "autonomy-labs/1.0"}) as c:
+                    r = await c.get(f"{base}/robots.txt")
+                    if r.status_code == 200:
+                        txt = (r.text or "")
+                        # Very small parser: disallow all if user-agent * has Disallow: /
+                        in_star = False
+                        for line in txt.splitlines():
+                            line = line.strip()
+                            if not line or line.startswith("#"):
+                                continue
+                            if line.lower().startswith("user-agent:"):
+                                ua = line.split(":", 1)[1].strip()
+                                in_star = ua == "*"
+                            if in_star and line.lower().startswith("disallow:"):
+                                val = line.split(":", 1)[1].strip()
+                                if val == "/":
+                                    robots_disallow_all = True
+                                    break
+            except Exception:
+                robots_disallow_all = False
+
+        if robots_disallow_all:
+            job.status = "failed"
+            job.error = "robots.txt disallows crawling"
+            await self._update_job(user_id, job)
+            return
+
+        queue: list[tuple[str, int]] = [(start_url, 0)]
+        visited: set[str] = set()
+        pages: list[tuple[str, str]] = []
+
+        async with httpx.AsyncClient(timeout=15.0, follow_redirects=False, headers={"User-Agent": "autonomy-labs/1.0"}) as client:
+            try:
+                while queue and len(visited) < max_pages:
+                    url, depth = queue.pop(0)
+                    if url in visited:
+                        continue
+                    visited.add(url)
+                    job.progress = {"visited": len(visited), "indexedPages": len(pages), "queued": len(queue)}
+                    await self._update_job(user_id, job)
+
+                    parsed = urlparse(url)
+                    if parsed.scheme not in {"https", "http"}:
+                        continue
+                    if parsed.netloc != allowed_netloc:
+                        continue
+                    if not _is_public_host(parsed.hostname or ""):
+                        continue
+
+                    resp = await client.get(url)
+                    if resp.status_code in {301, 302, 303, 307, 308}:
+                        loc = resp.headers.get("location") or ""
+                        if loc:
+                            nxt = urljoin(url, loc)
+                            nxtp = urlparse(nxt)
+                            if nxtp.netloc == allowed_netloc and nxt not in visited:
+                                queue.append((nxt, depth))
+                        await asyncio.sleep(rate_limit_sec)
+                        continue
+                    if resp.status_code != 200:
+                        await asyncio.sleep(rate_limit_sec)
+                        continue
+
+                    ctype = (resp.headers.get("content-type") or "").lower()
+                    if "text/html" not in ctype:
+                        await asyncio.sleep(rate_limit_sec)
+                        continue
+                    content = resp.text
+                    if len(content) > 1_000_000:
+                        content = content[:1_000_000]
+
+                    text = _html_to_text(content)
+                    if text:
+                        pages.append((url, text))
+
+                    if depth < max_depth:
+                        for href in _extract_links(content):
+                            nxt = urljoin(url, href)
+                            try:
+                                nxt = _normalize_url(nxt)
+                            except HTTPException:
+                                continue
+                            nxtp = urlparse(nxt)
+                            if nxtp.netloc != allowed_netloc:
+                                continue
+                            if nxt not in visited:
+                                queue.append((nxt, depth + 1))
+
+                    await asyncio.sleep(rate_limit_sec)
+            except asyncio.CancelledError:
+                job.status = "canceled"
+                job.error = None
+                await self._update_job(user_id, job)
+                return
+            except Exception as e:
+                job.status = "failed"
+                job.error = str(e)
+                await self._update_job(user_id, job)
+                return
+
+        # Build a single RAG doc
+        combined = []
+        for url, text in pages:
+            combined.append(f"URL: {url}\n\n{text}\n\n---\n")
+        combined_text = "\n".join(combined).strip()
+        if not combined_text:
+            job.status = "failed"
+            job.error = "No indexable pages found"
+            await self._update_job(user_id, job)
+            return
+
+        result = add_rag_document(user_id, name=f"Website: {start_url}", text=combined_text, source=start_url)
+        job.status = "succeeded"
+        job.result = {"pages": len(pages), "ragDoc": result}
+        job.progress = {"visited": len(visited), "indexedPages": len(pages), "queued": 0}
+        await self._update_job(user_id, job)
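
Reviewer note: a minimal sketch (not part of the commit) of driving the new IndexJobStore directly from an asyncio script, which can be handy for smoke-testing the crawler without the HTTP layer. It assumes the repo's `app` package is importable and that `user_data_dir(...)` yields a writable per-user directory; the `demo-user` id and target URL are placeholders.

import asyncio

from app.indexing_jobs import IndexJobStore


async def main() -> None:
    store = IndexJobStore()
    user_id = "demo-user"  # hypothetical user id, for illustration only

    # Kick off a bounded, same-origin crawl of a public site.
    job = await store.create_web_crawl_job(
        user_id,
        start_url="https://example.com",
        max_pages=10,
        max_depth=1,
    )

    # Poll the persisted job record until it reaches a terminal status.
    while True:
        current = await store.get_job(user_id, job.id) or {}
        status = current.get("status")
        print(status, current.get("progress"))
        if status in {"succeeded", "failed", "canceled"}:
            break
        await asyncio.sleep(1.0)


asyncio.run(main())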
app/routes/indexing.py ADDED
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from fastapi import APIRouter, HTTPException, Request
+from pydantic import BaseModel
+
+from app.auth import require_user_from_request
+from app.settings import feature_enabled
+
+router = APIRouter()
+
+
+class WebCrawlRequest(BaseModel):
+    url: str
+    maxPages: int = 25
+    maxDepth: int = 2
+    rateLimitSec: float = 0.25
+    respectRobots: bool = True
+
+
+@router.get("/api/indexing/jobs")
+async def list_indexing_jobs(http_request: Request):
+    if not feature_enabled("indexing"):
+        raise HTTPException(status_code=403, detail={"code": "feature_disabled", "message": "Indexing is disabled"})
+    user = await require_user_from_request(http_request)
+    user_id = str(user.get("id") or "")
+    store = http_request.app.state.index_job_store
+    jobs = await store.list_jobs(user_id)
+    # Newest first
+    jobs.sort(key=lambda j: str(j.get("createdAt") or ""), reverse=True)
+    return {"jobs": jobs}
+
+
+@router.post("/api/indexing/jobs/web-crawl")
+async def start_web_crawl(body: WebCrawlRequest, http_request: Request):
+    if not feature_enabled("indexing"):
+        raise HTTPException(status_code=403, detail={"code": "feature_disabled", "message": "Indexing is disabled"})
+    user = await require_user_from_request(http_request)
+    user_id = str(user.get("id") or "")
+    store = http_request.app.state.index_job_store
+    job = await store.create_web_crawl_job(
+        user_id,
+        start_url=body.url,
+        max_pages=body.maxPages,
+        max_depth=body.maxDepth,
+        rate_limit_sec=body.rateLimitSec,
+        respect_robots=body.respectRobots,
+    )
+    return {"ok": True, "job": job.__dict__}
+
+
+@router.post("/api/indexing/jobs/{job_id}/cancel")
+async def cancel_job(job_id: str, http_request: Request):
+    if not feature_enabled("indexing"):
+        raise HTTPException(status_code=403, detail={"code": "feature_disabled", "message": "Indexing is disabled"})
+    user = await require_user_from_request(http_request)
+    user_id = str(user.get("id") or "")
+    store = http_request.app.state.index_job_store
+    ok = await store.cancel_job(user_id, job_id)
+    return {"ok": True, "canceled": ok}
+
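
Reviewer note: a hedged client-side sketch (not part of the commit) exercising the three new endpoints with httpx. The base URL and bearer-token header are assumptions — the dashboard goes through `authFetch`, and the exact auth scheme is not shown in this diff — and `ENABLE_INDEXING=1` must be set server-side.

import httpx

BASE_URL = "http://localhost:8000"            # assumed dev server address
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder credentials

with httpx.Client(base_url=BASE_URL, headers=HEADERS, timeout=30.0) as client:
    # Start a bounded same-origin crawl job.
    started = client.post(
        "/api/indexing/jobs/web-crawl",
        json={"url": "https://example.com", "maxPages": 10, "maxDepth": 1,
              "rateLimitSec": 0.25, "respectRobots": True},
    )
    started.raise_for_status()
    job_id = started.json()["job"]["id"]

    # List jobs (newest first), then cancel the one we just started.
    jobs = client.get("/api/indexing/jobs").json()["jobs"]
    print(f"{len(jobs)} job(s); latest status: {jobs[0]['status'] if jobs else 'n/a'}")
    client.post(f"/api/indexing/jobs/{job_id}/cancel").raise_for_status()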
app/server.py CHANGED
@@ -12,11 +12,13 @@ from fastapi.staticfiles import StaticFiles
 from starlette.exceptions import HTTPException as StarletteHTTPException
 
 from app.errors import normalize_error
+from app.indexing_jobs import IndexJobStore
 from app.mcp_client import McpStdioClient
 from app.routes.admin import router as admin_router
 from app.routes.base import router as base_router
 from app.routes.chat import router as chat_router
 from app.routes.codex import router as codex_router
+from app.routes.indexing import router as indexing_router
 from app.routes.mcp import router as mcp_router
 from app.routes.rag import router as rag_router
 from app.routes.terminal import router as terminal_router
@@ -28,6 +30,7 @@ _ROOT = Path(__file__).resolve().parent.parent
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     app.state.codex_mcp_client = McpStdioClient(["codex", "mcp-server"])
+    app.state.index_job_store = IndexJobStore()
     app.state.device_login_attempts = {}
     app.state.device_login_lock = asyncio.Lock()
     stop = asyncio.Event()
@@ -94,6 +97,7 @@ def create_app() -> FastAPI:
     app.include_router(base_router)
     app.include_router(chat_router)
     app.include_router(codex_router)
+    app.include_router(indexing_router)
     app.include_router(mcp_router)
     app.include_router(terminal_router)
     app.include_router(user_router)
docs/TROUBLESHOOTING.md CHANGED
@@ -29,6 +29,11 @@ This repo prefers env-based auth for provider CLIs (keep tokens out of git and U
 
 Set `ENABLE_INDEXING=1` in your environment and restart the container.
 
+## Website indexing fails (“Host is not allowed”)
+
+Website indexing blocks private/localhost targets to reduce SSRF risk.
+Use a public `http(s)` URL and keep indexing within the same origin.
+
 ## Terminal shows vertical/1-column text
 
 This usually means the terminal “fit” ran while the terminal view was hidden or at size 0.
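
Reviewer note: the check behind the “Host is not allowed” error resolves the hostname and rejects anything that lands on a private, loopback, or link-local address. A minimal standalone sketch of the same idea, simplified from `_is_public_host` in `app/indexing_jobs.py` (the hostnames below are only examples):

import ipaddress
import socket


def is_public_host(hostname: str) -> bool:
    # Resolve the name; if every resolved address is public, the host is allowed.
    try:
        infos = socket.getaddrinfo(hostname, None, proto=socket.IPPROTO_TCP)
    except OSError:
        return False
    for info in infos:
        ip = ipaddress.ip_address(info[4][0])
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
            return False
    return True


for host in ("example.com", "localhost", "127.0.0.1", "10.0.0.5"):
    print(host, "->", "allowed" if is_public_host(host) else "blocked")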
static/dashboard.html CHANGED
@@ -484,6 +484,43 @@
 </div>
 <div id="rag-search-results" class="mt-2 space-y-2"></div>
 </div>
+<div class="bg-gray-900/30 border border-gray-700 rounded-lg p-3 space-y-2">
+  <div class="flex items-center justify-between gap-2">
+    <div class="text-xs font-semibold text-gray-300 uppercase">Website Indexing</div>
+    <button onclick="loadIndexingJobs()"
+      class="bg-gray-700 hover:bg-gray-600 text-white px-2 py-1 rounded text-xs">Refresh Jobs</button>
+  </div>
+  <div class="text-xs text-gray-400">Indexes same-origin HTML pages into RAG with depth/page limits. Blocks private/localhost targets.</div>
+  <div class="grid grid-cols-1 md:grid-cols-3 gap-2">
+    <input id="crawl-url" type="text" placeholder="https://example.com"
+      class="md:col-span-2 bg-gray-700 text-sm rounded border border-gray-600 p-2 text-white outline-none focus:border-blue-500">
+    <button onclick="startWebCrawlJob()"
+      class="bg-blue-600 hover:bg-blue-700 text-white px-3 py-2 rounded text-xs">Start</button>
+  </div>
+  <div class="grid grid-cols-3 gap-2">
+    <div>
+      <label class="block text-xs font-semibold text-gray-400 mb-1 uppercase">Max Pages</label>
+      <input id="crawl-max-pages" type="number" min="1" max="150" value="25"
+        class="w-full bg-gray-700 text-sm rounded border border-gray-600 p-2 text-white outline-none focus:border-blue-500">
+    </div>
+    <div>
+      <label class="block text-xs font-semibold text-gray-400 mb-1 uppercase">Max Depth</label>
+      <input id="crawl-max-depth" type="number" min="0" max="6" value="2"
+        class="w-full bg-gray-700 text-sm rounded border border-gray-600 p-2 text-white outline-none focus:border-blue-500">
+    </div>
+    <div>
+      <label class="block text-xs font-semibold text-gray-400 mb-1 uppercase">Rate (sec)</label>
+      <input id="crawl-rate" type="number" min="0" max="5" step="0.05" value="0.25"
+        class="w-full bg-gray-700 text-sm rounded border border-gray-600 p-2 text-white outline-none focus:border-blue-500">
+    </div>
+  </div>
+  <div class="flex items-center gap-2">
+    <input id="crawl-respect-robots" type="checkbox" checked class="h-4 w-4 accent-blue-600">
+    <label for="crawl-respect-robots" class="text-xs text-gray-300">Respect robots.txt (basic)</label>
+  </div>
+  <div id="indexing-status" class="text-xs text-gray-500"></div>
+  <div id="indexing-jobs" class="space-y-2"></div>
+</div>
 <div>
 <div class="text-xs font-semibold text-gray-400 mb-1 uppercase">Documents</div>
 <div id="rag-documents" class="space-y-2"></div>
static/dashboard.js CHANGED
@@ -369,6 +369,113 @@ let supabase;
   if (status && !enabled) status.textContent = 'Enable indexing with ENABLE_INDEXING=1 and restart.';
 }
 
+function setIndexingStatus(text) {
+  const el = document.getElementById('indexing-status');
+  if (el) el.textContent = text || '';
+}
+
+function renderIndexingJobs(jobs) {
+  const el = document.getElementById('indexing-jobs');
+  if (!el) return;
+  el.innerHTML = '';
+  const list = Array.isArray(jobs) ? jobs : [];
+  if (!list.length) {
+    const empty = document.createElement('div');
+    empty.className = 'text-xs text-gray-500';
+    empty.textContent = 'No indexing jobs yet.';
+    el.appendChild(empty);
+    return;
+  }
+
+  for (const j of list.slice(0, 30)) {
+    const row = document.createElement('div');
+    row.className = 'bg-gray-800/40 border border-gray-700 rounded-lg px-3 py-2 flex items-start justify-between gap-3';
+
+    const left = document.createElement('div');
+    left.className = 'min-w-0';
+    const title = document.createElement('div');
+    title.className = 'text-sm text-gray-100 truncate';
+    title.textContent = `${j?.type || 'job'} • ${j?.status || 'unknown'}`;
+    const meta = document.createElement('div');
+    meta.className = 'text-xs text-gray-500 mt-0.5';
+    const p = j?.progress || {};
+    const visited = p?.visited ?? 0;
+    const indexed = p?.indexedPages ?? 0;
+    meta.textContent = `${j?.createdAt || ''} • visited ${visited} • indexed ${indexed}`;
+    left.appendChild(title);
+    left.appendChild(meta);
+
+    const right = document.createElement('div');
+    right.className = 'shrink-0 flex gap-2';
+    if (j?.status === 'running' || j?.status === 'queued') {
+      const btn = document.createElement('button');
+      btn.className = 'bg-red-600 hover:bg-red-700 text-white px-2 py-1 rounded text-xs';
+      btn.textContent = 'Cancel';
+      btn.onclick = () => cancelIndexingJob(String(j?.id || ''));
+      right.appendChild(btn);
+    }
+
+    row.appendChild(left);
+    row.appendChild(right);
+    el.appendChild(row);
+  }
+}
+
+async function loadIndexingJobs() {
+  try {
+    setIndexingStatus('Loading jobs...');
+    const res = await authFetch('/api/indexing/jobs');
+    if (!res.ok) throw new Error(await res.text());
+    const data = await res.json();
+    renderIndexingJobs(data?.jobs || []);
+    setIndexingStatus('');
+  } catch (e) {
+    setIndexingStatus(`Failed to load jobs: ${e?.message || e}`);
+  }
+}
+
+async function startWebCrawlJob() {
+  const url = (document.getElementById('crawl-url')?.value || '').trim();
+  const maxPages = Number(document.getElementById('crawl-max-pages')?.value || 25);
+  const maxDepth = Number(document.getElementById('crawl-max-depth')?.value || 2);
+  const rateLimitSec = Number(document.getElementById('crawl-rate')?.value || 0.25);
+  const respectRobots = !!document.getElementById('crawl-respect-robots')?.checked;
+  if (!url) {
+    setIndexingStatus('Enter a URL.');
+    return;
+  }
+  try {
+    setIndexingStatus('Starting job...');
+    const res = await authFetch('/api/indexing/jobs/web-crawl', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ url, maxPages, maxDepth, rateLimitSec, respectRobots }),
+    });
+    if (!res.ok) throw new Error(await res.text());
+    await res.json();
+    setIndexingStatus('Job started.');
+    loadIndexingJobs();
+  } catch (e) {
+    setIndexingStatus(`Failed to start job: ${e?.message || e}`);
+  }
+}
+
+async function cancelIndexingJob(jobId) {
+  const id = (jobId || '').trim();
+  if (!id) return;
+  if (!confirm('Cancel this job?')) return;
+  try {
+    setIndexingStatus('Canceling...');
+    const res = await authFetch(`/api/indexing/jobs/${encodeURIComponent(id)}/cancel`, { method: 'POST' });
+    if (!res.ok) throw new Error(await res.text());
+    await res.json();
+    setIndexingStatus('Canceled.');
+    loadIndexingJobs();
+  } catch (e) {
+    setIndexingStatus(`Failed to cancel: ${e?.message || e}`);
+  }
+}
+
 function setRagStatus(text) {
   const el = document.getElementById('rag-status');
   if (el) el.textContent = text || '';
@@ -595,6 +702,7 @@ let supabase;
 applyIndexingUi(me);
 if (me?.features?.indexing) {
   loadRagDocuments();
+  loadIndexingJobs();
 }
 
 // Chat sidebar collapse preference (desktop)