DocuMaker / src /config.py
vivekchakraverty's picture
Surface Generate errors in the guide panel; raise LLM max_tokens to 4096
8b53847
Raw
History Blame Contribute Delete
5.63 kB
"""Central configuration for DocuMaker.
Every tunable is read from environment variables (optionally a local ``.env``
file), so model ids / devices can be swapped without touching code.
"""
from __future__ import annotations
import functools
import os
from pathlib import Path
from dotenv import load_dotenv
# Project root = parent of the ``src`` package directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Settings come from ``<project root>/.env`` when that file exists; a missing
# file is silently ignored.
load_dotenv(dotenv_path=PROJECT_ROOT.joinpath(".env"))
def _flag(name: str, default: str = "0") -> bool:
return os.getenv(name, default).strip().lower() not in ("0", "false", "no", "")
# --- Paths -------------------------------------------------------------------
# A blank DOCUMAKER_WORK_DIR (e.g. ``DOCUMAKER_WORK_DIR=`` in .env) counts as
# unset; otherwise ``Path("")`` would resolve to the current working directory.
WORK_DIR = Path(
    os.getenv("DOCUMAKER_WORK_DIR", "").strip() or PROJECT_ROOT / "work"
).resolve()
# Created at import time so later code can rely on the directory existing.
WORK_DIR.mkdir(parents=True, exist_ok=True)
# --- HuggingFace credentials -------------------------------------------------
def _token_candidates() -> list[str]:
seen: set[str] = set()
out: list[str] = []
for value in (
os.getenv("DOCUMAKER_HF_TOKEN"),
os.getenv("HF_TOKEN"),
os.getenv("HUGGINGFACEHUB_API_TOKEN"),
):
value = (value or "").strip()
if value and value not in seen:
seen.add(value)
out.append(value)
return out
# Whether a UI token may be mirrored into the process environment. True for the
# default local single-user app; turned off automatically for shared/multi-user
# launches so one user's token can't leak to another via the global environment.
_ALLOW_ENV_TOKEN: bool = True


def set_allow_env_token(allowed: bool) -> None:
    """Turn mirroring of UI tokens into the process environment on or off.

    *allowed* is coerced to a plain ``bool`` and stored in the module-level
    ``_ALLOW_ENV_TOKEN`` flag.
    """
    global _ALLOW_ENV_TOKEN
    _ALLOW_ENV_TOKEN = True if allowed else False
def apply_token(token: str | None) -> str | None:
    """Clean a UI-supplied token and optionally export it to the environment.

    The token is stripped of surrounding whitespace; ``None`` is returned when
    nothing is left. While ``_ALLOW_ENV_TOKEN`` is true (single-user mode) a
    non-empty token is also written to ``HF_TOKEN`` and
    ``HUGGINGFACEHUB_API_TOKEN`` so code that reads those variables sees it.
    When the flag is false the environment is left untouched and the caller
    only gets the cleaned token back, keeping it out of process-global state.
    """
    cleaned = (token or "").strip()
    if not cleaned:
        return None
    if _ALLOW_ENV_TOKEN:
        for env_name in ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN"):
            os.environ[env_name] = cleaned
    return cleaned
@functools.lru_cache(maxsize=1)
def resolve_hf_token() -> str | None:
    """Pick the HF token to use from the environment candidates.

    Each candidate from ``_token_candidates()`` is tried with
    ``huggingface_hub.whoami``; the first one that does not raise wins. If no
    candidate authenticates, or ``huggingface_hub`` cannot be imported, the
    first candidate is used as-is. The selected token is written back to
    ``HF_TOKEN`` and ``HUGGINGFACEHUB_API_TOKEN`` so both variables agree.

    Returns ``None`` when no candidate is configured. The result is memoised
    by ``lru_cache``, so the lookup runs once until the cache is cleared.
    """
    tokens = _token_candidates()
    if not tokens:
        return None

    try:
        from huggingface_hub import whoami
    except Exception:
        # Hub import issue: skip validation and keep the first candidate.
        whoami = None

    selected = tokens[0]
    if whoami is not None:
        for candidate in tokens:
            try:
                whoami(token=candidate)
            except Exception:
                # Rejected token or offline: try the next candidate.
                continue
            selected = candidate
            break

    for env_name in ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN"):
        os.environ[env_name] = selected
    return selected
# --- Text LLM (HF Inference API) --------------------------------------------
# Blank values (``KEY=`` in .env) fall back to the default instead of yielding
# an empty model id or crashing ``int()``/``float()`` at import time.
# Non-blank values that are not valid numbers still raise ValueError.
LLM_MODEL = os.getenv("DOCUMAKER_LLM_MODEL", "").strip() or "Qwen/Qwen2.5-7B-Instruct"
# None when the variable is unset or blank.
LLM_PROVIDER = os.getenv("DOCUMAKER_LLM_PROVIDER", "").strip() or None
LLM_MAX_TOKENS = int(os.getenv("DOCUMAKER_LLM_MAX_TOKENS", "").strip() or "4096")
LLM_TEMPERATURE = float(os.getenv("DOCUMAKER_LLM_TEMPERATURE", "").strip() or "0.3")
# Approx. characters of transcript per LLM chunk (keeps prompts within context).
LLM_CHUNK_CHARS = int(os.getenv("DOCUMAKER_LLM_CHUNK_CHARS", "").strip() or "6000")
# --- Vision LLM (HF Inference API) + local fallback -------------------------
# On by default; ``_flag`` decides which spellings switch it off.
ENABLE_VISION = _flag("DOCUMAKER_ENABLE_VISION", "1")
# Blank model ids (``KEY=`` in .env) fall back to the defaults rather than
# becoming empty strings.
VLM_MODEL = os.getenv("DOCUMAKER_VLM_MODEL", "").strip() or "Qwen/Qwen2-VL-7B-Instruct"
# None when the variable is unset or blank.
VLM_PROVIDER = os.getenv("DOCUMAKER_VLM_PROVIDER", "").strip() or None
LOCAL_CAPTION_MODEL = (
    os.getenv("DOCUMAKER_LOCAL_CAPTION_MODEL", "").strip()
    or "Salesforce/blip-image-captioning-base"
)
# --- Whisper (local faster-whisper) -----------------------------------------
# Blank model/device values fall back to the defaults ("small" / "auto")
# instead of becoming empty strings.
WHISPER_MODEL = os.getenv("DOCUMAKER_WHISPER_MODEL", "").strip() or "small"
WHISPER_DEVICE = os.getenv("DOCUMAKER_WHISPER_DEVICE", "").strip().lower() or "auto"
# Blank => choose automatically per device (int8_float16 on CUDA, int8 on CPU).
WHISPER_COMPUTE_TYPE = os.getenv("DOCUMAKER_WHISPER_COMPUTE_TYPE", "").strip()
# --- Frame extraction --------------------------------------------------------
# Blank values (``KEY=`` in .env) fall back to the defaults instead of crashing
# ``float()``/``int()`` at import time; non-numeric values still raise ValueError.
SCENE_THRESHOLD = float(os.getenv("DOCUMAKER_SCENE_THRESHOLD", "").strip() or "27.0")
SCENE_MIN_LEN_SEC = float(os.getenv("DOCUMAKER_SCENE_MIN_LEN_SEC", "").strip() or "1.0")
DEDUP_HASH_DISTANCE = int(os.getenv("DOCUMAKER_DEDUP_HASH_DISTANCE", "").strip() or "6")
# --- DOCX --------------------------------------------------------------------
# A blank value (``KEY=`` in .env) falls back to 5.5 instead of crashing
# ``float()`` at import time.
DOCX_IMAGE_WIDTH_INCHES = float(
    os.getenv("DOCUMAKER_DOCX_IMAGE_WIDTH_INCHES", "").strip() or "5.5"
)
# --- External binaries -------------------------------------------------------
# Blank values (``KEY=`` in .env) fall back to the bare command names rather
# than producing an empty executable name.
FFMPEG_BIN = os.getenv("DOCUMAKER_FFMPEG_BIN", "").strip() or "ffmpeg"
FFPROBE_BIN = os.getenv("DOCUMAKER_FFPROBE_BIN", "").strip() or "ffprobe"
def session_dir(session_id: str) -> Path:
    """Return the per-session working directory ``WORK_DIR / session_id``.

    The directory and its ``frames`` subdirectory are created on demand;
    calling this again for an existing session is harmless.
    """
    session_root = WORK_DIR / session_id
    frames_dir = session_root / "frames"
    # parents=True also creates the session directory itself.
    frames_dir.mkdir(parents=True, exist_ok=True)
    return session_root