lamhieu commited on
Commit
6ed4967
·
1 Parent(s): b1a38a9

chore: update something

Browse files
docsifer/analytics/service.py CHANGED
@@ -165,9 +165,7 @@ class AnalyticsService:
165
  backoff = 1.0
166
  while not self._stopped.is_set():
167
  try:
168
- await asyncio.wait_for(
169
- self._stopped.wait(), timeout=self._sync_interval
170
- )
171
  return # stopped
172
  except asyncio.TimeoutError:
173
  pass
 
165
  backoff = 1.0
166
  while not self._stopped.is_set():
167
  try:
168
+ await asyncio.wait_for(self._stopped.wait(), timeout=self._sync_interval)
 
 
169
  return # stopped
170
  except asyncio.TimeoutError:
171
  pass
docsifer/analytics/store.py CHANGED
@@ -96,9 +96,7 @@ class UpstashStore:
96
  keys = scan[1] or []
97
  for key in keys:
98
  period = key.split(":", 2)[-1]
99
- data = await loop.run_in_executor(
100
- None, partial(self._client.hgetall, key)
101
- )
102
  for label, count in (data or {}).items():
103
  try:
104
  result[metric][period][label] = int(count)
@@ -123,6 +121,7 @@ class UpstashStore:
123
  # Prefer pipeline when available (single HTTP round trip)
124
  pipeline = getattr(self._client, "pipeline", None)
125
  if callable(pipeline):
 
126
  def _run() -> None:
127
  pipe = pipeline()
128
  for key, label, count in ops:
@@ -135,9 +134,7 @@ class UpstashStore:
135
  # Fallback: parallel single-op calls
136
  await asyncio.gather(
137
  *(
138
- loop.run_in_executor(
139
- None, partial(self._client.hincrby, key, label, count)
140
- )
141
  for key, label, count in ops
142
  )
143
  )
@@ -166,7 +163,7 @@ class logger_suppress: # noqa: N801 - tiny utility, lower-case name for context
166
  def __init__(self, message: str) -> None:
167
  self._message = message
168
 
169
- def __enter__(self) -> "logger_suppress":
170
  return self
171
 
172
  def __exit__(self, exc_type, exc, tb) -> bool: # type: ignore[no-untyped-def]
@@ -174,7 +171,7 @@ class logger_suppress: # noqa: N801 - tiny utility, lower-case name for context
174
  logger.warning("%s: %s", self._message, exc)
175
  return True
176
 
177
- async def __aenter__(self) -> "logger_suppress":
178
  return self
179
 
180
  async def __aexit__(self, exc_type, exc, tb) -> bool: # type: ignore[no-untyped-def]
 
96
  keys = scan[1] or []
97
  for key in keys:
98
  period = key.split(":", 2)[-1]
99
+ data = await loop.run_in_executor(None, partial(self._client.hgetall, key))
 
 
100
  for label, count in (data or {}).items():
101
  try:
102
  result[metric][period][label] = int(count)
 
121
  # Prefer pipeline when available (single HTTP round trip)
122
  pipeline = getattr(self._client, "pipeline", None)
123
  if callable(pipeline):
124
+
125
  def _run() -> None:
126
  pipe = pipeline()
127
  for key, label, count in ops:
 
134
  # Fallback: parallel single-op calls
135
  await asyncio.gather(
136
  *(
137
+ loop.run_in_executor(None, partial(self._client.hincrby, key, label, count))
 
 
138
  for key, label, count in ops
139
  )
140
  )
 
163
  def __init__(self, message: str) -> None:
164
  self._message = message
165
 
166
+ def __enter__(self) -> logger_suppress:
167
  return self
168
 
169
  def __exit__(self, exc_type, exc, tb) -> bool: # type: ignore[no-untyped-def]
 
171
  logger.warning("%s: %s", self._message, exc)
172
  return True
173
 
174
+ async def __aenter__(self) -> logger_suppress:
175
  return self
176
 
177
  async def __aexit__(self, exc_type, exc, tb) -> bool: # type: ignore[no-untyped-def]
docsifer/api/error_handlers.py CHANGED
@@ -59,9 +59,7 @@ async def _http_exception_handler(request: Request, exc: HTTPException) -> JSONR
59
  )
60
 
61
 
62
- async def _validation_handler(
63
- request: Request, exc: RequestValidationError
64
- ) -> JSONResponse:
65
  return JSONResponse(
66
  status_code=422,
67
  content=_payload(
 
59
  )
60
 
61
 
62
+ async def _validation_handler(request: Request, exc: RequestValidationError) -> JSONResponse:
 
 
63
  return JSONResponse(
64
  status_code=422,
65
  content=_payload(
docsifer/api/middleware.py CHANGED
@@ -3,7 +3,7 @@
3
  from __future__ import annotations
4
 
5
  import secrets
6
- from typing import Awaitable, Callable
7
 
8
  from fastapi import FastAPI, Request, Response
9
  from fastapi.responses import JSONResponse
 
3
  from __future__ import annotations
4
 
5
  import secrets
6
+ from collections.abc import Awaitable, Callable
7
 
8
  from fastapi import FastAPI, Request, Response
9
  from fastapi.responses import JSONResponse
docsifer/api/v1/convert.py CHANGED
@@ -147,60 +147,57 @@ async def convert_document(
147
  http_cfg = _parse_json_form("http", http, HTTPConfig)
148
  convert_cfg = _parse_json_form("settings", settings_form, ConvertSettings)
149
 
150
- async with per_ip.acquire(client_ip):
151
- async with gate.acquire():
152
- if file is not None:
153
- guard.check(0) # file size unknown until streamed
154
- _check_extension(file.filename or "", set(settings.allowed_extensions))
155
-
156
- tmp_root = Path(tempfile.mkdtemp(prefix="docsifer-", dir=settings.tmp_dir))
157
- try:
158
- safe_name = _safe_filename(file.filename)
159
- dst = tmp_root / safe_name
160
- size = await _stream_to_disk(
161
- file, dst, max_bytes=settings.max_upload_bytes
162
- )
163
- logger.info(
164
- "Convert file received",
165
- extra={
166
- "filename": safe_name,
167
- "size": size,
168
- "client_ip": client_ip,
169
- },
170
- )
171
- guard.check(size)
172
-
173
- result = await asyncio.wait_for(
174
- converter.convert_file(
175
- dst,
176
- openai_config=openai_cfg.to_dict(),
177
- http_config=http_cfg.to_dict(),
178
- cleanup_html=convert_cfg.cleanup,
179
- ),
180
- timeout=settings.request_timeout_sec,
181
- )
182
- finally:
183
- shutil.rmtree(tmp_root, ignore_errors=True)
184
- else:
185
- safe_url = validate_url(
186
- url or "",
187
- allowed_schemes=settings.url_allowed_schemes,
188
- allow_private_networks=settings.url_allow_private_networks,
189
- )
190
  logger.info(
191
- "Convert URL received",
192
- extra={"url": safe_url, "client_ip": client_ip},
 
 
 
 
193
  )
194
- guard.check(0)
 
195
  result = await asyncio.wait_for(
196
  converter.convert_file(
197
- safe_url,
198
  openai_config=openai_cfg.to_dict(),
199
  http_config=http_cfg.to_dict(),
200
  cleanup_html=convert_cfg.cleanup,
201
  ),
202
  timeout=settings.request_timeout_sec,
203
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
  background_tasks.add_task(_record_access_safe, analytics, result.token_count)
206
  return ConvertResponse(filename=result.filename, markdown=result.markdown)
 
147
  http_cfg = _parse_json_form("http", http, HTTPConfig)
148
  convert_cfg = _parse_json_form("settings", settings_form, ConvertSettings)
149
 
150
+ async with per_ip.acquire(client_ip), gate.acquire():
151
+ if file is not None:
152
+ guard.check(0) # file size unknown until streamed
153
+ _check_extension(file.filename or "", set(settings.allowed_extensions))
154
+
155
+ tmp_root = Path(tempfile.mkdtemp(prefix="docsifer-", dir=settings.tmp_dir))
156
+ try:
157
+ safe_name = _safe_filename(file.filename)
158
+ dst = tmp_root / safe_name
159
+ size = await _stream_to_disk(file, dst, max_bytes=settings.max_upload_bytes)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  logger.info(
161
+ "Convert file received",
162
+ extra={
163
+ "filename": safe_name,
164
+ "size": size,
165
+ "client_ip": client_ip,
166
+ },
167
  )
168
+ guard.check(size)
169
+
170
  result = await asyncio.wait_for(
171
  converter.convert_file(
172
+ dst,
173
  openai_config=openai_cfg.to_dict(),
174
  http_config=http_cfg.to_dict(),
175
  cleanup_html=convert_cfg.cleanup,
176
  ),
177
  timeout=settings.request_timeout_sec,
178
  )
179
+ finally:
180
+ shutil.rmtree(tmp_root, ignore_errors=True)
181
+ else:
182
+ safe_url = validate_url(
183
+ url or "",
184
+ allowed_schemes=settings.url_allowed_schemes,
185
+ allow_private_networks=settings.url_allow_private_networks,
186
+ )
187
+ logger.info(
188
+ "Convert URL received",
189
+ extra={"url": safe_url, "client_ip": client_ip},
190
+ )
191
+ guard.check(0)
192
+ result = await asyncio.wait_for(
193
+ converter.convert_file(
194
+ safe_url,
195
+ openai_config=openai_cfg.to_dict(),
196
+ http_config=http_cfg.to_dict(),
197
+ cleanup_html=convert_cfg.cleanup,
198
+ ),
199
+ timeout=settings.request_timeout_sec,
200
+ )
201
 
202
  background_tasks.add_task(_record_access_safe, analytics, result.token_count)
203
  return ConvertResponse(filename=result.filename, markdown=result.markdown)
docsifer/config.py CHANGED
@@ -133,10 +133,7 @@ class Settings(BaseSettings):
133
  """True when a real (non-empty, non-localhost) Redis URL is configured."""
134
  if not self.analytics_enabled:
135
  return False
136
- url = (self.redis_url or "").strip()
137
- if not url:
138
- return False
139
- return True
140
 
141
  # ---------------------------------------------------------------------
142
  # Quotas (anonymous, BYOK, authenticated)
 
133
  """True when a real (non-empty, non-localhost) Redis URL is configured."""
134
  if not self.analytics_enabled:
135
  return False
136
+ return bool((self.redis_url or "").strip())
 
 
 
137
 
138
  # ---------------------------------------------------------------------
139
  # Quotas (anonymous, BYOK, authenticated)
docsifer/core/llm_registry.py CHANGED
@@ -44,7 +44,7 @@ class LLMConfig:
44
  *,
45
  default_base_url: str,
46
  default_model: str,
47
- ) -> "LLMConfig | None":
48
  if not data:
49
  return None
50
  api_key = (data.get("api_key") or "").strip()
@@ -92,9 +92,7 @@ class LLMRegistry:
92
  client = OpenAI(
93
  api_key=config.api_key,
94
  base_url=config.base_url,
95
- timeout=httpx.Timeout(
96
- self._request_timeout, connect=self._connect_timeout
97
- ),
98
  max_retries=self._max_retries,
99
  )
100
  return MarkItDown(llm_client=client, llm_model=config.model)
 
44
  *,
45
  default_base_url: str,
46
  default_model: str,
47
+ ) -> LLMConfig | None:
48
  if not data:
49
  return None
50
  api_key = (data.get("api_key") or "").strip()
 
92
  client = OpenAI(
93
  api_key=config.api_key,
94
  base_url=config.base_url,
95
+ timeout=httpx.Timeout(self._request_timeout, connect=self._connect_timeout),
 
 
96
  max_retries=self._max_retries,
97
  )
98
  return MarkItDown(llm_client=client, llm_model=config.model)
docsifer/core/service.py CHANGED
@@ -107,9 +107,7 @@ class DocsiferService:
107
  async def shutdown(self) -> None:
108
  """Stop the worker pool gracefully (called from app lifespan)."""
109
  self._llm_registry.clear()
110
- await asyncio.get_running_loop().run_in_executor(
111
- None, self._executor.shutdown, True
112
- )
113
 
114
  # ------------------------------------------------------------------
115
  # Internal
 
107
  async def shutdown(self) -> None:
108
  """Stop the worker pool gracefully (called from app lifespan)."""
109
  self._llm_registry.clear()
110
+ await asyncio.get_running_loop().run_in_executor(None, self._executor.shutdown, True)
 
 
111
 
112
  # ------------------------------------------------------------------
113
  # Internal
docsifer/main.py CHANGED
@@ -18,8 +18,8 @@ from __future__ import annotations
18
  import asyncio
19
  import contextlib
20
  import logging
 
21
  from contextlib import asynccontextmanager
22
- from typing import AsyncIterator
23
 
24
  from fastapi import FastAPI
25
  from fastapi.middleware.cors import CORSMiddleware
@@ -54,9 +54,7 @@ def _build_analytics_store(settings: Settings) -> AnalyticsStore:
54
  logger.info("Analytics: Upstash store configured")
55
  return store
56
  except Exception as exc: # pragma: no cover - defensive
57
- logger.warning(
58
- "Could not initialize Upstash store (%s); falling back to in-memory", exc
59
- )
60
  return InMemoryStore()
61
 
62
 
 
18
  import asyncio
19
  import contextlib
20
  import logging
21
+ from collections.abc import AsyncIterator
22
  from contextlib import asynccontextmanager
 
23
 
24
  from fastapi import FastAPI
25
  from fastapi.middleware.cors import CORSMiddleware
 
54
  logger.info("Analytics: Upstash store configured")
55
  return store
56
  except Exception as exc: # pragma: no cover - defensive
57
+ logger.warning("Could not initialize Upstash store (%s); falling back to in-memory", exc)
 
 
58
  return InMemoryStore()
59
 
60
 
docsifer/safety/circuit_breaker.py CHANGED
@@ -4,7 +4,8 @@ from __future__ import annotations
4
 
5
  import asyncio
6
  import time
7
- from typing import Awaitable, Callable, TypeVar
 
8
 
9
  from ..exceptions import CircuitOpenError
10
 
 
4
 
5
  import asyncio
6
  import time
7
+ from collections.abc import Awaitable, Callable
8
+ from typing import TypeVar
9
 
10
  from ..exceptions import CircuitOpenError
11
 
docsifer/safety/conversion_gate.py CHANGED
@@ -4,7 +4,7 @@ from __future__ import annotations
4
 
5
  import asyncio
6
  import contextlib
7
- from typing import AsyncIterator
8
 
9
  from ..exceptions import QueueFullError
10
 
 
4
 
5
  import asyncio
6
  import contextlib
7
+ from collections.abc import AsyncIterator
8
 
9
  from ..exceptions import QueueFullError
10
 
docsifer/safety/disk_cleanup.py CHANGED
@@ -32,17 +32,22 @@ async def disk_cleanup_loop(
32
  pass
33
 
34
  try:
35
- now = time.time()
36
- removed = 0
37
- for entry in tmp_dir.glob(pattern):
38
- try:
39
- if now - entry.stat().st_mtime > ttl_sec:
40
- if entry.is_file():
41
- entry.unlink(missing_ok=True)
42
- removed += 1
43
- except OSError as exc:
44
- logger.debug("Could not remove %s: %s", entry, exc)
45
  if removed:
46
  logger.info("Disk cleanup removed %d stale file(s)", removed)
47
  except Exception as exc:
48
  logger.warning("Disk cleanup failed: %s", exc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  pass
33
 
34
  try:
35
+ removed = await asyncio.to_thread(_sweep_once, tmp_dir, pattern, ttl_sec)
 
 
 
 
 
 
 
 
 
36
  if removed:
37
  logger.info("Disk cleanup removed %d stale file(s)", removed)
38
  except Exception as exc:
39
  logger.warning("Disk cleanup failed: %s", exc)
40
+
41
+
42
+ def _sweep_once(tmp_dir: Path, pattern: str, ttl_sec: int) -> int:
43
+ """Synchronous sweep — meant to be run via ``asyncio.to_thread``."""
44
+ now = time.time()
45
+ removed = 0
46
+ for entry in tmp_dir.glob(pattern):
47
+ try:
48
+ if now - entry.stat().st_mtime > ttl_sec and entry.is_file():
49
+ entry.unlink(missing_ok=True)
50
+ removed += 1
51
+ except OSError as exc:
52
+ logger.debug("Could not remove %s: %s", entry, exc)
53
+ return removed
docsifer/safety/per_ip_limiter.py CHANGED
@@ -7,7 +7,7 @@ from __future__ import annotations
7
  import asyncio
8
  import contextlib
9
  import time
10
- from typing import AsyncIterator
11
 
12
  from ..exceptions import TooManyRequestsError
13
 
 
7
  import asyncio
8
  import contextlib
9
  import time
10
+ from collections.abc import AsyncIterator
11
 
12
  from ..exceptions import TooManyRequestsError
13
 
docsifer/ui/gradio_app.py CHANGED
@@ -136,7 +136,7 @@ def _status(text: str, *, level: str = "info") -> str:
136
  color = palette.get(level, palette["info"])
137
  return (
138
  f'<div style="padding:10px 14px;border-radius:10px;'
139
- f'background:{color}14;color:{color};font-size:0.9rem;'
140
  f'border:1px solid {color}30;">{text}</div>'
141
  )
142
 
@@ -209,13 +209,11 @@ def build_interface(settings: Settings, app: FastAPI) -> gr.Blocks:
209
  return "", None, _status(f"Conversion failed: {exc}", level="err")
210
 
211
  asyncio.create_task(_record_access(result.token_count))
212
- md_path = _write_markdown_file(
213
- result.markdown, tmp_dir=settings.tmp_dir, stem=stem
214
- )
215
  byte_len = len(result.markdown.encode("utf-8"))
216
  ok_msg = (
217
  f"Done — {len(result.markdown):,} chars · "
218
- f"{byte_len/1024:.1f} KB · ~{result.token_count:,} tokens"
219
  )
220
  return result.markdown, md_path, _status(ok_msg, level="ok")
221
  finally:
 
136
  color = palette.get(level, palette["info"])
137
  return (
138
  f'<div style="padding:10px 14px;border-radius:10px;'
139
+ f"background:{color}14;color:{color};font-size:0.9rem;"
140
  f'border:1px solid {color}30;">{text}</div>'
141
  )
142
 
 
209
  return "", None, _status(f"Conversion failed: {exc}", level="err")
210
 
211
  asyncio.create_task(_record_access(result.token_count))
212
+ md_path = _write_markdown_file(result.markdown, tmp_dir=settings.tmp_dir, stem=stem)
 
 
213
  byte_len = len(result.markdown.encode("utf-8"))
214
  ok_msg = (
215
  f"Done — {len(result.markdown):,} chars · "
216
+ f"{byte_len / 1024:.1f} KB · ~{result.token_count:,} tokens"
217
  )
218
  return result.markdown, md_path, _status(ok_msg, level="ok")
219
  finally:
tests/integration/test_convert.py CHANGED
@@ -1,4 +1,3 @@
1
- import io
2
  import json
3
 
4
  import pytest
 
 
1
  import json
2
 
3
  import pytest
tests/unit/test_html_cleaner.py CHANGED
@@ -2,8 +2,10 @@ from docsifer.core.html_cleaner import clean_html
2
 
3
 
4
  def test_removes_style_script_noscript() -> None:
5
- html = "<html><head><style>body{}</style><script>x()</script></head>" \
6
- "<body><noscript>nope</noscript><p>hi</p></body></html>"
 
 
7
  out = clean_html(html)
8
  assert "<style" not in out.lower()
9
  assert "<script" not in out.lower()
@@ -13,11 +15,11 @@ def test_removes_style_script_noscript() -> None:
13
 
14
  def test_removes_hidden_attribute_and_inline_styles() -> None:
15
  html = (
16
- '<div hidden>secret</div>'
17
  '<div style="display:none">also-hidden</div>'
18
  '<div style="display: none;">spaced</div>'
19
  '<div aria-hidden="true">aria</div>'
20
- '<p>visible</p>'
21
  )
22
  out = clean_html(html)
23
  assert "secret" not in out
 
2
 
3
 
4
  def test_removes_style_script_noscript() -> None:
5
+ html = (
6
+ "<html><head><style>body{}</style><script>x()</script></head>"
7
+ "<body><noscript>nope</noscript><p>hi</p></body></html>"
8
+ )
9
  out = clean_html(html)
10
  assert "<style" not in out.lower()
11
  assert "<script" not in out.lower()
 
15
 
16
  def test_removes_hidden_attribute_and_inline_styles() -> None:
17
  html = (
18
+ "<div hidden>secret</div>"
19
  '<div style="display:none">also-hidden</div>'
20
  '<div style="display: none;">spaced</div>'
21
  '<div aria-hidden="true">aria</div>'
22
+ "<p>visible</p>"
23
  )
24
  out = clean_html(html)
25
  assert "secret" not in out