chore: update something
Browse files- docsifer/analytics/service.py +1 -3
- docsifer/analytics/store.py +5 -8
- docsifer/api/error_handlers.py +1 -3
- docsifer/api/middleware.py +1 -1
- docsifer/api/v1/convert.py +41 -44
- docsifer/config.py +1 -4
- docsifer/core/llm_registry.py +2 -4
- docsifer/core/service.py +1 -3
- docsifer/main.py +2 -4
- docsifer/safety/circuit_breaker.py +2 -1
- docsifer/safety/conversion_gate.py +1 -1
- docsifer/safety/disk_cleanup.py +15 -10
- docsifer/safety/per_ip_limiter.py +1 -1
- docsifer/ui/gradio_app.py +3 -5
- tests/integration/test_convert.py +0 -1
- tests/unit/test_html_cleaner.py +6 -4
docsifer/analytics/service.py
CHANGED
|
@@ -165,9 +165,7 @@ class AnalyticsService:
|
|
| 165 |
backoff = 1.0
|
| 166 |
while not self._stopped.is_set():
|
| 167 |
try:
|
| 168 |
-
await asyncio.wait_for(
|
| 169 |
-
self._stopped.wait(), timeout=self._sync_interval
|
| 170 |
-
)
|
| 171 |
return # stopped
|
| 172 |
except asyncio.TimeoutError:
|
| 173 |
pass
|
|
|
|
| 165 |
backoff = 1.0
|
| 166 |
while not self._stopped.is_set():
|
| 167 |
try:
|
| 168 |
+
await asyncio.wait_for(self._stopped.wait(), timeout=self._sync_interval)
|
|
|
|
|
|
|
| 169 |
return # stopped
|
| 170 |
except asyncio.TimeoutError:
|
| 171 |
pass
|
docsifer/analytics/store.py
CHANGED
|
@@ -96,9 +96,7 @@ class UpstashStore:
|
|
| 96 |
keys = scan[1] or []
|
| 97 |
for key in keys:
|
| 98 |
period = key.split(":", 2)[-1]
|
| 99 |
-
data = await loop.run_in_executor(
|
| 100 |
-
None, partial(self._client.hgetall, key)
|
| 101 |
-
)
|
| 102 |
for label, count in (data or {}).items():
|
| 103 |
try:
|
| 104 |
result[metric][period][label] = int(count)
|
|
@@ -123,6 +121,7 @@ class UpstashStore:
|
|
| 123 |
# Prefer pipeline when available (single HTTP round trip)
|
| 124 |
pipeline = getattr(self._client, "pipeline", None)
|
| 125 |
if callable(pipeline):
|
|
|
|
| 126 |
def _run() -> None:
|
| 127 |
pipe = pipeline()
|
| 128 |
for key, label, count in ops:
|
|
@@ -135,9 +134,7 @@ class UpstashStore:
|
|
| 135 |
# Fallback: parallel single-op calls
|
| 136 |
await asyncio.gather(
|
| 137 |
*(
|
| 138 |
-
loop.run_in_executor(
|
| 139 |
-
None, partial(self._client.hincrby, key, label, count)
|
| 140 |
-
)
|
| 141 |
for key, label, count in ops
|
| 142 |
)
|
| 143 |
)
|
|
@@ -166,7 +163,7 @@ class logger_suppress: # noqa: N801 - tiny utility, lower-case name for context
|
|
| 166 |
def __init__(self, message: str) -> None:
|
| 167 |
self._message = message
|
| 168 |
|
| 169 |
-
def __enter__(self) ->
|
| 170 |
return self
|
| 171 |
|
| 172 |
def __exit__(self, exc_type, exc, tb) -> bool: # type: ignore[no-untyped-def]
|
|
@@ -174,7 +171,7 @@ class logger_suppress: # noqa: N801 - tiny utility, lower-case name for context
|
|
| 174 |
logger.warning("%s: %s", self._message, exc)
|
| 175 |
return True
|
| 176 |
|
| 177 |
-
async def __aenter__(self) ->
|
| 178 |
return self
|
| 179 |
|
| 180 |
async def __aexit__(self, exc_type, exc, tb) -> bool: # type: ignore[no-untyped-def]
|
|
|
|
| 96 |
keys = scan[1] or []
|
| 97 |
for key in keys:
|
| 98 |
period = key.split(":", 2)[-1]
|
| 99 |
+
data = await loop.run_in_executor(None, partial(self._client.hgetall, key))
|
|
|
|
|
|
|
| 100 |
for label, count in (data or {}).items():
|
| 101 |
try:
|
| 102 |
result[metric][period][label] = int(count)
|
|
|
|
| 121 |
# Prefer pipeline when available (single HTTP round trip)
|
| 122 |
pipeline = getattr(self._client, "pipeline", None)
|
| 123 |
if callable(pipeline):
|
| 124 |
+
|
| 125 |
def _run() -> None:
|
| 126 |
pipe = pipeline()
|
| 127 |
for key, label, count in ops:
|
|
|
|
| 134 |
# Fallback: parallel single-op calls
|
| 135 |
await asyncio.gather(
|
| 136 |
*(
|
| 137 |
+
loop.run_in_executor(None, partial(self._client.hincrby, key, label, count))
|
|
|
|
|
|
|
| 138 |
for key, label, count in ops
|
| 139 |
)
|
| 140 |
)
|
|
|
|
| 163 |
def __init__(self, message: str) -> None:
|
| 164 |
self._message = message
|
| 165 |
|
| 166 |
+
def __enter__(self) -> logger_suppress:
|
| 167 |
return self
|
| 168 |
|
| 169 |
def __exit__(self, exc_type, exc, tb) -> bool: # type: ignore[no-untyped-def]
|
|
|
|
| 171 |
logger.warning("%s: %s", self._message, exc)
|
| 172 |
return True
|
| 173 |
|
| 174 |
+
async def __aenter__(self) -> logger_suppress:
|
| 175 |
return self
|
| 176 |
|
| 177 |
async def __aexit__(self, exc_type, exc, tb) -> bool: # type: ignore[no-untyped-def]
|
docsifer/api/error_handlers.py
CHANGED
|
@@ -59,9 +59,7 @@ async def _http_exception_handler(request: Request, exc: HTTPException) -> JSONR
|
|
| 59 |
)
|
| 60 |
|
| 61 |
|
| 62 |
-
async def _validation_handler(
|
| 63 |
-
request: Request, exc: RequestValidationError
|
| 64 |
-
) -> JSONResponse:
|
| 65 |
return JSONResponse(
|
| 66 |
status_code=422,
|
| 67 |
content=_payload(
|
|
|
|
| 59 |
)
|
| 60 |
|
| 61 |
|
| 62 |
+
async def _validation_handler(request: Request, exc: RequestValidationError) -> JSONResponse:
|
|
|
|
|
|
|
| 63 |
return JSONResponse(
|
| 64 |
status_code=422,
|
| 65 |
content=_payload(
|
docsifer/api/middleware.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import secrets
|
| 6 |
-
from
|
| 7 |
|
| 8 |
from fastapi import FastAPI, Request, Response
|
| 9 |
from fastapi.responses import JSONResponse
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import secrets
|
| 6 |
+
from collections.abc import Awaitable, Callable
|
| 7 |
|
| 8 |
from fastapi import FastAPI, Request, Response
|
| 9 |
from fastapi.responses import JSONResponse
|
docsifer/api/v1/convert.py
CHANGED
|
@@ -147,60 +147,57 @@ async def convert_document(
|
|
| 147 |
http_cfg = _parse_json_form("http", http, HTTPConfig)
|
| 148 |
convert_cfg = _parse_json_form("settings", settings_form, ConvertSettings)
|
| 149 |
|
| 150 |
-
async with per_ip.acquire(client_ip):
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
size = await _stream_to_disk(
|
| 161 |
-
file, dst, max_bytes=settings.max_upload_bytes
|
| 162 |
-
)
|
| 163 |
-
logger.info(
|
| 164 |
-
"Convert file received",
|
| 165 |
-
extra={
|
| 166 |
-
"filename": safe_name,
|
| 167 |
-
"size": size,
|
| 168 |
-
"client_ip": client_ip,
|
| 169 |
-
},
|
| 170 |
-
)
|
| 171 |
-
guard.check(size)
|
| 172 |
-
|
| 173 |
-
result = await asyncio.wait_for(
|
| 174 |
-
converter.convert_file(
|
| 175 |
-
dst,
|
| 176 |
-
openai_config=openai_cfg.to_dict(),
|
| 177 |
-
http_config=http_cfg.to_dict(),
|
| 178 |
-
cleanup_html=convert_cfg.cleanup,
|
| 179 |
-
),
|
| 180 |
-
timeout=settings.request_timeout_sec,
|
| 181 |
-
)
|
| 182 |
-
finally:
|
| 183 |
-
shutil.rmtree(tmp_root, ignore_errors=True)
|
| 184 |
-
else:
|
| 185 |
-
safe_url = validate_url(
|
| 186 |
-
url or "",
|
| 187 |
-
allowed_schemes=settings.url_allowed_schemes,
|
| 188 |
-
allow_private_networks=settings.url_allow_private_networks,
|
| 189 |
-
)
|
| 190 |
logger.info(
|
| 191 |
-
"Convert
|
| 192 |
-
extra={
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
)
|
| 194 |
-
guard.check(
|
|
|
|
| 195 |
result = await asyncio.wait_for(
|
| 196 |
converter.convert_file(
|
| 197 |
-
|
| 198 |
openai_config=openai_cfg.to_dict(),
|
| 199 |
http_config=http_cfg.to_dict(),
|
| 200 |
cleanup_html=convert_cfg.cleanup,
|
| 201 |
),
|
| 202 |
timeout=settings.request_timeout_sec,
|
| 203 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
background_tasks.add_task(_record_access_safe, analytics, result.token_count)
|
| 206 |
return ConvertResponse(filename=result.filename, markdown=result.markdown)
|
|
|
|
| 147 |
http_cfg = _parse_json_form("http", http, HTTPConfig)
|
| 148 |
convert_cfg = _parse_json_form("settings", settings_form, ConvertSettings)
|
| 149 |
|
| 150 |
+
async with per_ip.acquire(client_ip), gate.acquire():
|
| 151 |
+
if file is not None:
|
| 152 |
+
guard.check(0) # file size unknown until streamed
|
| 153 |
+
_check_extension(file.filename or "", set(settings.allowed_extensions))
|
| 154 |
+
|
| 155 |
+
tmp_root = Path(tempfile.mkdtemp(prefix="docsifer-", dir=settings.tmp_dir))
|
| 156 |
+
try:
|
| 157 |
+
safe_name = _safe_filename(file.filename)
|
| 158 |
+
dst = tmp_root / safe_name
|
| 159 |
+
size = await _stream_to_disk(file, dst, max_bytes=settings.max_upload_bytes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
logger.info(
|
| 161 |
+
"Convert file received",
|
| 162 |
+
extra={
|
| 163 |
+
"filename": safe_name,
|
| 164 |
+
"size": size,
|
| 165 |
+
"client_ip": client_ip,
|
| 166 |
+
},
|
| 167 |
)
|
| 168 |
+
guard.check(size)
|
| 169 |
+
|
| 170 |
result = await asyncio.wait_for(
|
| 171 |
converter.convert_file(
|
| 172 |
+
dst,
|
| 173 |
openai_config=openai_cfg.to_dict(),
|
| 174 |
http_config=http_cfg.to_dict(),
|
| 175 |
cleanup_html=convert_cfg.cleanup,
|
| 176 |
),
|
| 177 |
timeout=settings.request_timeout_sec,
|
| 178 |
)
|
| 179 |
+
finally:
|
| 180 |
+
shutil.rmtree(tmp_root, ignore_errors=True)
|
| 181 |
+
else:
|
| 182 |
+
safe_url = validate_url(
|
| 183 |
+
url or "",
|
| 184 |
+
allowed_schemes=settings.url_allowed_schemes,
|
| 185 |
+
allow_private_networks=settings.url_allow_private_networks,
|
| 186 |
+
)
|
| 187 |
+
logger.info(
|
| 188 |
+
"Convert URL received",
|
| 189 |
+
extra={"url": safe_url, "client_ip": client_ip},
|
| 190 |
+
)
|
| 191 |
+
guard.check(0)
|
| 192 |
+
result = await asyncio.wait_for(
|
| 193 |
+
converter.convert_file(
|
| 194 |
+
safe_url,
|
| 195 |
+
openai_config=openai_cfg.to_dict(),
|
| 196 |
+
http_config=http_cfg.to_dict(),
|
| 197 |
+
cleanup_html=convert_cfg.cleanup,
|
| 198 |
+
),
|
| 199 |
+
timeout=settings.request_timeout_sec,
|
| 200 |
+
)
|
| 201 |
|
| 202 |
background_tasks.add_task(_record_access_safe, analytics, result.token_count)
|
| 203 |
return ConvertResponse(filename=result.filename, markdown=result.markdown)
|
docsifer/config.py
CHANGED
|
@@ -133,10 +133,7 @@ class Settings(BaseSettings):
|
|
| 133 |
"""True when a real (non-empty, non-localhost) Redis URL is configured."""
|
| 134 |
if not self.analytics_enabled:
|
| 135 |
return False
|
| 136 |
-
|
| 137 |
-
if not url:
|
| 138 |
-
return False
|
| 139 |
-
return True
|
| 140 |
|
| 141 |
# ---------------------------------------------------------------------
|
| 142 |
# Quotas (anonymous, BYOK, authenticated)
|
|
|
|
| 133 |
"""True when a real (non-empty, non-localhost) Redis URL is configured."""
|
| 134 |
if not self.analytics_enabled:
|
| 135 |
return False
|
| 136 |
+
return bool((self.redis_url or "").strip())
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
# ---------------------------------------------------------------------
|
| 139 |
# Quotas (anonymous, BYOK, authenticated)
|
docsifer/core/llm_registry.py
CHANGED
|
@@ -44,7 +44,7 @@ class LLMConfig:
|
|
| 44 |
*,
|
| 45 |
default_base_url: str,
|
| 46 |
default_model: str,
|
| 47 |
-
) ->
|
| 48 |
if not data:
|
| 49 |
return None
|
| 50 |
api_key = (data.get("api_key") or "").strip()
|
|
@@ -92,9 +92,7 @@ class LLMRegistry:
|
|
| 92 |
client = OpenAI(
|
| 93 |
api_key=config.api_key,
|
| 94 |
base_url=config.base_url,
|
| 95 |
-
timeout=httpx.Timeout(
|
| 96 |
-
self._request_timeout, connect=self._connect_timeout
|
| 97 |
-
),
|
| 98 |
max_retries=self._max_retries,
|
| 99 |
)
|
| 100 |
return MarkItDown(llm_client=client, llm_model=config.model)
|
|
|
|
| 44 |
*,
|
| 45 |
default_base_url: str,
|
| 46 |
default_model: str,
|
| 47 |
+
) -> LLMConfig | None:
|
| 48 |
if not data:
|
| 49 |
return None
|
| 50 |
api_key = (data.get("api_key") or "").strip()
|
|
|
|
| 92 |
client = OpenAI(
|
| 93 |
api_key=config.api_key,
|
| 94 |
base_url=config.base_url,
|
| 95 |
+
timeout=httpx.Timeout(self._request_timeout, connect=self._connect_timeout),
|
|
|
|
|
|
|
| 96 |
max_retries=self._max_retries,
|
| 97 |
)
|
| 98 |
return MarkItDown(llm_client=client, llm_model=config.model)
|
docsifer/core/service.py
CHANGED
|
@@ -107,9 +107,7 @@ class DocsiferService:
|
|
| 107 |
async def shutdown(self) -> None:
|
| 108 |
"""Stop the worker pool gracefully (called from app lifespan)."""
|
| 109 |
self._llm_registry.clear()
|
| 110 |
-
await asyncio.get_running_loop().run_in_executor(
|
| 111 |
-
None, self._executor.shutdown, True
|
| 112 |
-
)
|
| 113 |
|
| 114 |
# ------------------------------------------------------------------
|
| 115 |
# Internal
|
|
|
|
| 107 |
async def shutdown(self) -> None:
|
| 108 |
"""Stop the worker pool gracefully (called from app lifespan)."""
|
| 109 |
self._llm_registry.clear()
|
| 110 |
+
await asyncio.get_running_loop().run_in_executor(None, self._executor.shutdown, True)
|
|
|
|
|
|
|
| 111 |
|
| 112 |
# ------------------------------------------------------------------
|
| 113 |
# Internal
|
docsifer/main.py
CHANGED
|
@@ -18,8 +18,8 @@ from __future__ import annotations
|
|
| 18 |
import asyncio
|
| 19 |
import contextlib
|
| 20 |
import logging
|
|
|
|
| 21 |
from contextlib import asynccontextmanager
|
| 22 |
-
from typing import AsyncIterator
|
| 23 |
|
| 24 |
from fastapi import FastAPI
|
| 25 |
from fastapi.middleware.cors import CORSMiddleware
|
|
@@ -54,9 +54,7 @@ def _build_analytics_store(settings: Settings) -> AnalyticsStore:
|
|
| 54 |
logger.info("Analytics: Upstash store configured")
|
| 55 |
return store
|
| 56 |
except Exception as exc: # pragma: no cover - defensive
|
| 57 |
-
logger.warning(
|
| 58 |
-
"Could not initialize Upstash store (%s); falling back to in-memory", exc
|
| 59 |
-
)
|
| 60 |
return InMemoryStore()
|
| 61 |
|
| 62 |
|
|
|
|
| 18 |
import asyncio
|
| 19 |
import contextlib
|
| 20 |
import logging
|
| 21 |
+
from collections.abc import AsyncIterator
|
| 22 |
from contextlib import asynccontextmanager
|
|
|
|
| 23 |
|
| 24 |
from fastapi import FastAPI
|
| 25 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 54 |
logger.info("Analytics: Upstash store configured")
|
| 55 |
return store
|
| 56 |
except Exception as exc: # pragma: no cover - defensive
|
| 57 |
+
logger.warning("Could not initialize Upstash store (%s); falling back to in-memory", exc)
|
|
|
|
|
|
|
| 58 |
return InMemoryStore()
|
| 59 |
|
| 60 |
|
docsifer/safety/circuit_breaker.py
CHANGED
|
@@ -4,7 +4,8 @@ from __future__ import annotations
|
|
| 4 |
|
| 5 |
import asyncio
|
| 6 |
import time
|
| 7 |
-
from
|
|
|
|
| 8 |
|
| 9 |
from ..exceptions import CircuitOpenError
|
| 10 |
|
|
|
|
| 4 |
|
| 5 |
import asyncio
|
| 6 |
import time
|
| 7 |
+
from collections.abc import Awaitable, Callable
|
| 8 |
+
from typing import TypeVar
|
| 9 |
|
| 10 |
from ..exceptions import CircuitOpenError
|
| 11 |
|
docsifer/safety/conversion_gate.py
CHANGED
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
| 4 |
|
| 5 |
import asyncio
|
| 6 |
import contextlib
|
| 7 |
-
from
|
| 8 |
|
| 9 |
from ..exceptions import QueueFullError
|
| 10 |
|
|
|
|
| 4 |
|
| 5 |
import asyncio
|
| 6 |
import contextlib
|
| 7 |
+
from collections.abc import AsyncIterator
|
| 8 |
|
| 9 |
from ..exceptions import QueueFullError
|
| 10 |
|
docsifer/safety/disk_cleanup.py
CHANGED
|
@@ -32,17 +32,22 @@ async def disk_cleanup_loop(
|
|
| 32 |
pass
|
| 33 |
|
| 34 |
try:
|
| 35 |
-
|
| 36 |
-
removed = 0
|
| 37 |
-
for entry in tmp_dir.glob(pattern):
|
| 38 |
-
try:
|
| 39 |
-
if now - entry.stat().st_mtime > ttl_sec:
|
| 40 |
-
if entry.is_file():
|
| 41 |
-
entry.unlink(missing_ok=True)
|
| 42 |
-
removed += 1
|
| 43 |
-
except OSError as exc:
|
| 44 |
-
logger.debug("Could not remove %s: %s", entry, exc)
|
| 45 |
if removed:
|
| 46 |
logger.info("Disk cleanup removed %d stale file(s)", removed)
|
| 47 |
except Exception as exc:
|
| 48 |
logger.warning("Disk cleanup failed: %s", exc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
pass
|
| 33 |
|
| 34 |
try:
|
| 35 |
+
removed = await asyncio.to_thread(_sweep_once, tmp_dir, pattern, ttl_sec)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
if removed:
|
| 37 |
logger.info("Disk cleanup removed %d stale file(s)", removed)
|
| 38 |
except Exception as exc:
|
| 39 |
logger.warning("Disk cleanup failed: %s", exc)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _sweep_once(tmp_dir: Path, pattern: str, ttl_sec: int) -> int:
|
| 43 |
+
"""Synchronous sweep — meant to be run via ``asyncio.to_thread``."""
|
| 44 |
+
now = time.time()
|
| 45 |
+
removed = 0
|
| 46 |
+
for entry in tmp_dir.glob(pattern):
|
| 47 |
+
try:
|
| 48 |
+
if now - entry.stat().st_mtime > ttl_sec and entry.is_file():
|
| 49 |
+
entry.unlink(missing_ok=True)
|
| 50 |
+
removed += 1
|
| 51 |
+
except OSError as exc:
|
| 52 |
+
logger.debug("Could not remove %s: %s", entry, exc)
|
| 53 |
+
return removed
|
docsifer/safety/per_ip_limiter.py
CHANGED
|
@@ -7,7 +7,7 @@ from __future__ import annotations
|
|
| 7 |
import asyncio
|
| 8 |
import contextlib
|
| 9 |
import time
|
| 10 |
-
from
|
| 11 |
|
| 12 |
from ..exceptions import TooManyRequestsError
|
| 13 |
|
|
|
|
| 7 |
import asyncio
|
| 8 |
import contextlib
|
| 9 |
import time
|
| 10 |
+
from collections.abc import AsyncIterator
|
| 11 |
|
| 12 |
from ..exceptions import TooManyRequestsError
|
| 13 |
|
docsifer/ui/gradio_app.py
CHANGED
|
@@ -136,7 +136,7 @@ def _status(text: str, *, level: str = "info") -> str:
|
|
| 136 |
color = palette.get(level, palette["info"])
|
| 137 |
return (
|
| 138 |
f'<div style="padding:10px 14px;border-radius:10px;'
|
| 139 |
-
f
|
| 140 |
f'border:1px solid {color}30;">{text}</div>'
|
| 141 |
)
|
| 142 |
|
|
@@ -209,13 +209,11 @@ def build_interface(settings: Settings, app: FastAPI) -> gr.Blocks:
|
|
| 209 |
return "", None, _status(f"Conversion failed: {exc}", level="err")
|
| 210 |
|
| 211 |
asyncio.create_task(_record_access(result.token_count))
|
| 212 |
-
md_path = _write_markdown_file(
|
| 213 |
-
result.markdown, tmp_dir=settings.tmp_dir, stem=stem
|
| 214 |
-
)
|
| 215 |
byte_len = len(result.markdown.encode("utf-8"))
|
| 216 |
ok_msg = (
|
| 217 |
f"Done — {len(result.markdown):,} chars · "
|
| 218 |
-
f"{byte_len/1024:.1f} KB · ~{result.token_count:,} tokens"
|
| 219 |
)
|
| 220 |
return result.markdown, md_path, _status(ok_msg, level="ok")
|
| 221 |
finally:
|
|
|
|
| 136 |
color = palette.get(level, palette["info"])
|
| 137 |
return (
|
| 138 |
f'<div style="padding:10px 14px;border-radius:10px;'
|
| 139 |
+
f"background:{color}14;color:{color};font-size:0.9rem;"
|
| 140 |
f'border:1px solid {color}30;">{text}</div>'
|
| 141 |
)
|
| 142 |
|
|
|
|
| 209 |
return "", None, _status(f"Conversion failed: {exc}", level="err")
|
| 210 |
|
| 211 |
asyncio.create_task(_record_access(result.token_count))
|
| 212 |
+
md_path = _write_markdown_file(result.markdown, tmp_dir=settings.tmp_dir, stem=stem)
|
|
|
|
|
|
|
| 213 |
byte_len = len(result.markdown.encode("utf-8"))
|
| 214 |
ok_msg = (
|
| 215 |
f"Done — {len(result.markdown):,} chars · "
|
| 216 |
+
f"{byte_len / 1024:.1f} KB · ~{result.token_count:,} tokens"
|
| 217 |
)
|
| 218 |
return result.markdown, md_path, _status(ok_msg, level="ok")
|
| 219 |
finally:
|
tests/integration/test_convert.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
import io
|
| 2 |
import json
|
| 3 |
|
| 4 |
import pytest
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
import pytest
|
tests/unit/test_html_cleaner.py
CHANGED
|
@@ -2,8 +2,10 @@ from docsifer.core.html_cleaner import clean_html
|
|
| 2 |
|
| 3 |
|
| 4 |
def test_removes_style_script_noscript() -> None:
|
| 5 |
-
html =
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
out = clean_html(html)
|
| 8 |
assert "<style" not in out.lower()
|
| 9 |
assert "<script" not in out.lower()
|
|
@@ -13,11 +15,11 @@ def test_removes_style_script_noscript() -> None:
|
|
| 13 |
|
| 14 |
def test_removes_hidden_attribute_and_inline_styles() -> None:
|
| 15 |
html = (
|
| 16 |
-
|
| 17 |
'<div style="display:none">also-hidden</div>'
|
| 18 |
'<div style="display: none;">spaced</div>'
|
| 19 |
'<div aria-hidden="true">aria</div>'
|
| 20 |
-
|
| 21 |
)
|
| 22 |
out = clean_html(html)
|
| 23 |
assert "secret" not in out
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
def test_removes_style_script_noscript() -> None:
|
| 5 |
+
html = (
|
| 6 |
+
"<html><head><style>body{}</style><script>x()</script></head>"
|
| 7 |
+
"<body><noscript>nope</noscript><p>hi</p></body></html>"
|
| 8 |
+
)
|
| 9 |
out = clean_html(html)
|
| 10 |
assert "<style" not in out.lower()
|
| 11 |
assert "<script" not in out.lower()
|
|
|
|
| 15 |
|
| 16 |
def test_removes_hidden_attribute_and_inline_styles() -> None:
|
| 17 |
html = (
|
| 18 |
+
"<div hidden>secret</div>"
|
| 19 |
'<div style="display:none">also-hidden</div>'
|
| 20 |
'<div style="display: none;">spaced</div>'
|
| 21 |
'<div aria-hidden="true">aria</div>'
|
| 22 |
+
"<p>visible</p>"
|
| 23 |
)
|
| 24 |
out = clean_html(html)
|
| 25 |
assert "secret" not in out
|