imtrt004 committed
Commit: 6780118 · Parent(s): ac40983
fix: remove exAI

Changed files:
- Dockerfile          +3  -9
- app.py              +0  -5
- generation/llm.py   +24 -14
- model/loader.py     +4  -65
- requirements.txt    +3  -5
Dockerfile CHANGED

@@ -21,15 +21,9 @@ RUN pip install torch --index-url https://download.pytorch.org/whl/cpu \
 # -- Step 2: Everything else -------------------------------------------------
 RUN pip install -r requirements.txt --no-cache-dir
 
-# -- Step 3:
-#
-
-# we always have the same version LGAI tested their model files against.
-# git is available from Step 1 (apt-get install git).
-# The echo is intentional: changing this line text busts Docker's layer cache.
-RUN echo "transformers-pin: git-main build: 2026-03-05-v7" \
-    && pip install --force-reinstall --no-cache-dir \
-    "transformers @ git+https://github.com/huggingface/transformers.git"
+# -- Step 3: Pin transformers to a stable release ---------------------------
+# Llama 3.2 works with any recent PyPI release; no git-main needed.
+RUN pip install --force-reinstall --no-cache-dir "transformers>=4.43.0,<5.0.0"
 
 COPY . .
 
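Since the version constraint now lives only in the Dockerfile, a small runtime check can confirm the built image really carries a 4.x transformers. A minimal sketch, assuming the same bounds as the new Step 3 (the script name is hypothetical and nothing else in the repo asserts these bounds):

# check_transformers_pin.py – hypothetical sanity check mirroring Dockerfile Step 3.
import importlib.metadata
from packaging.version import Version

installed = Version(importlib.metadata.version("transformers"))
assert Version("4.43.0") <= installed < Version("5.0.0"), (
    f"Dockerfile pins transformers>=4.43.0,<5.0.0 but found {installed}"
)
print(f"transformers=={installed} matches the Dockerfile pin")

It could be run as a final RUN step so a bad resolve fails the build rather than the Space at startup.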
app.py CHANGED

@@ -67,11 +67,6 @@ async def lifespan(app: FastAPI):
         error("STARTUP", f"Embedding model failed: {exc}")
 
     section("STARTUP", "LLM")
-    try:
-        tf_version = importlib.metadata.version("transformers")
-        step("STARTUP", f"transformers=={tf_version}")
-    except Exception as exc:
-        warn("STARTUP", f"Could not read transformers version: {exc}")
     step("STARTUP", f"Loading {get_model_name()} in background thread…")
     loop = asyncio.get_event_loop()
     try:
generation/llm.py CHANGED

@@ -114,23 +114,33 @@ def stream_answer(
         tokenizer,
         skip_prompt=True,
         skip_special_tokens=True,
-        timeout=
+        timeout=None,  # No timeout – CPU prefill of large docs can take >120s
     )
 
-    thread
-    … (old lines 121-131 not captured in this view)
+    # Capture generate-thread exceptions so the streamer never hangs forever
+    _gen_exc: list = [None]
+
+    def _generate():
+        try:
+            model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                streamer=streamer,
+                max_new_tokens=2048,
+                do_sample=False,  # greedy – fastest on CPU, fully deterministic
+                pad_token_id=tokenizer.eos_token_id,
+            )
+        except Exception as exc:
+            _gen_exc[0] = exc
+            # Unblock the streamer consumer so it doesn't wait forever
+            streamer.text_queue.put(streamer.stop_signal)
+
+    thread = Thread(target=_generate, daemon=True)
     thread.start()
 
     yield from _strip_thinking_stream(streamer)
 
-    thread.join(
+    thread.join()
+
+    if _gen_exc[0] is not None:
+        raise RuntimeError(f"LLM generation failed: {_gen_exc[0]}") from _gen_exc[0]
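Taken out of the diff, the pattern the new code relies on (a TextIteratorStreamer fed by a daemon thread, with any exception smuggled back by pushing the stop signal) looks roughly like the self-contained sketch below. The tiny model id is an arbitrary public test model chosen only so the example runs quickly on CPU; it is not the Space's model, and the consumer loop here stands in for _strip_thinking_stream:

# Standalone sketch of the streaming pattern used above (illustrative model id).
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

inputs = tokenizer("Hello", return_tensors="pt")
streamer = TextIteratorStreamer(
    tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=None
)
gen_exc: list = [None]

def _generate():
    try:
        model.generate(**inputs, streamer=streamer, max_new_tokens=16)
    except Exception as exc:
        gen_exc[0] = exc
        # Same trick as the diff: push the stop signal so the consumer loop exits
        streamer.text_queue.put(streamer.stop_signal)

Thread(target=_generate, daemon=True).start()
for chunk in streamer:          # blocks until text arrives or stop_signal is seen
    print(chunk, end="", flush=True)
if gen_exc[0] is not None:
    raise gen_exc[0]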
model/loader.py CHANGED

@@ -1,23 +1,19 @@
 """
 Self-hosted LLM using HuggingFace Transformers – zero external API, no C++ compilation.
-All speeds measured
+All speeds measured on 2 vCPU / 16 GB RAM (HF Free Tier).
 
 Model options (set LLM_MODEL env var in HF Space to switch, no redeploy needed):
   #1 TinyLlama/TinyLlama-1.1B-Chat-v1.0      ~1 GB    40-60 tok/s  Apache 2.0  demos, prototypes
   #2 Qwen/Qwen3-0.6B                         ~0.5 GB  45-55 tok/s  Apache 2.0  speed-critical, Think mode
-  #3 meta-llama/Llama-3.2-1B-Instruct
+  #3 meta-llama/Llama-3.2-1B-Instruct [DEF]  ~1.5 GB  35-50 tok/s  Community   128K ctx, long-context
   #4 HuggingFaceTB/SmolLM2-1.7B-Instruct     ~2 GB    25-35 tok/s  Apache 2.0  good quality/size ratio
   #5 Qwen/Qwen2.5-1.5B-Instruct              ~2 GB    25-40 tok/s  Apache 2.0  multilingual, 32K ctx
   #6 stabilityai/stablelm-2-zephyr-1_6b      ~2 GB    25-40 tok/s  MIT         DPO-tuned chat feel
   #7 Qwen/Qwen2.5-Coder-1.5B-Instruct        ~2 GB    25-40 tok/s  Apache 2.0  code completion/review
   #8 microsoft/phi-2                         ~3 GB    18-28 tok/s  MIT         reasoning & logic
-  #9
-  #10 google/gemma-3-1b-it                   ~1.5 GB  35-48 tok/s  Gemma ToU   multilingual, 140+ langs
+  #9 google/gemma-3-1b-it                    ~1.5 GB  35-48 tok/s  Gemma ToU   multilingual, 140+ langs
 
 Note:
-  - EXAONE requires trust_remote_code=True (LG AI custom architecture).
-    RopeParameters was added in transformers 5.0 (March 2026); EXAONE's updated
-    configuration_exaone.py requires it. Pin transformers>=5.0.0,<6.0.0.
   - Llama 3.2 and Gemma 3 may require a HF_TOKEN env var (gated models).
   - Qwen3 supports /think and /no_think prefixes for reasoning depth control.
 """

@@ -26,36 +22,11 @@ import os
 import time
 import threading
 
-# ── transformers version guard ────────────────────────────────────────────────
-import importlib.metadata as _ilm
-from packaging.version import Version as _V
-_tv_str = _ilm.version("transformers")
-try:
-    _tv = _V(_tv_str)
-    # Git/dev installs report as e.g. "5.4.0.dev0" – base_version strips the dev suffix
-    _tv_base = _V(_tv.base_version)
-    if _tv_base < _V("5.0.0"):
-        raise RuntimeError(
-            f"[MODEL] transformers=={_tv_str} is too old.\n"
-            "EXAONE-3.5 requires transformers>=5.0.0 (installs from git main are fine).\n"
-            "The Dockerfile Step 3 should install from git+https://github.com/huggingface/transformers.git"
-        )
-except Exception as _ver_exc:
-    import warnings as _w
-    _w.warn(f"[MODEL] Could not check transformers version ({_ver_exc}); proceeding anyway.")
-# ── end guard ─────────────────────────────────────────────────────────────────
-
 import warnings
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from model.log import section, step, ok, warn, error
 
-# Suppress FutureWarning from EXAONE's cached modeling file re: input_embeds rename
-warnings.filterwarnings(
-    "ignore",
-    message=r".*input_embeds.*is deprecated.*Use.*inputs_embeds.*",
-    category=FutureWarning,
-)
 # Suppress torch_dtype deprecation warning from transformers dev build
 warnings.filterwarnings(
     "ignore",

@@ -63,13 +34,12 @@ warnings.filterwarnings(
     category=FutureWarning,
 )
 
-MODEL_ID = os.environ.get("LLM_MODEL", "
+MODEL_ID = os.environ.get("LLM_MODEL", "meta-llama/Llama-3.2-1B-Instruct")
 
 # Models that need trust_remote_code=True (custom architectures)
 _TRUST_REMOTE_CODE_MODELS = (
     "LiquidAI/",
     "DavidAU/LFM",
-    "LGAI-EXAONE/",
 )
 
 def _needs_trust_remote_code(model_id: str) -> bool:

@@ -98,26 +68,6 @@ def _load() -> None:
     if _trc:
         step("MODEL", "trust_remote_code=True (custom architecture)")
 
-    # Compatibility shim: some dev builds may not export check_model_inputs yet,
-    # but EXAONE's modeling_exaone.py expects it from transformers.utils.generic.
-    # IMPORTANT: must be a real pass-through decorator – returning None would
-    # replace any @check_model_inputs-decorated forward() with None, causing
-    # "TypeError: 'NoneType' object is not callable" during generate().
-    try:
-        import transformers.utils.generic as _tug  # type: ignore
-        if not hasattr(_tug, "check_model_inputs"):
-            def _check_model_inputs(func=None, **_kwargs):
-                # Handles both @check_model_inputs and @check_model_inputs(...)
-                if callable(func):
-                    return func  # used as bare decorator
-                def _decorator(f):
-                    return f  # used as decorator factory with args
-                return _decorator
-            setattr(_tug, "check_model_inputs", _check_model_inputs)
-            warn("MODEL", "Patched missing transformers.utils.generic.check_model_inputs")
-    except Exception as exc:
-        warn("MODEL", f"Could not apply transformers compatibility shim: {exc}")
-
     # ── Tokenizer ──────────────────────────────────────────────────────────────
     _loading_msg = f"Loading tokenizer…"
     step("MODEL", f"Fetching tokenizer…")

@@ -143,17 +93,6 @@ def _load() -> None:
         device_map="cpu",
         low_cpu_mem_usage=True,
     )
-    except ImportError as exc:
-        _hint = ""
-        if "RopeParameters" in str(exc):
-            _hint = (
-                "\n  Hint: EXAONE-3.5 requires transformers>=5.0.0\n"
-                "  (RopeParameters was added in transformers 5.0, March 2026).\n"
-                "  Ensure requirements.txt pins transformers>=5.0.0,<6.0.0\n"
-                "  and do a Factory Rebuild in the Space settings."
-            )
-        error("MODEL", f"{exc}{_hint}")
-        raise
     except Exception as exc:
         error("MODEL", str(exc))
         raise
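The shrunken _TRUST_REMOTE_CODE_MODELS tuple implies _needs_trust_remote_code is a plain prefix match; its body is outside this diff, so the following is a hedged reconstruction, paired with the LLM_MODEL override the docstring describes (the Qwen id is only an example value):

# Hypothetical reconstruction – _needs_trust_remote_code() is not shown in this diff.
import os

_TRUST_REMOTE_CODE_MODELS = ("LiquidAI/", "DavidAU/LFM")

def _needs_trust_remote_code(model_id: str) -> bool:
    # str.startswith accepts a tuple of prefixes
    return model_id.startswith(_TRUST_REMOTE_CODE_MODELS)

# Per the docstring, switching models needs no redeploy: set LLM_MODEL in the Space.
os.environ.setdefault("LLM_MODEL", "Qwen/Qwen2.5-1.5B-Instruct")
MODEL_ID = os.environ.get("LLM_MODEL", "meta-llama/Llama-3.2-1B-Instruct")
print(MODEL_ID, "trust_remote_code:", _needs_trust_remote_code(MODEL_ID))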
requirements.txt CHANGED

@@ -1,11 +1,9 @@
-# cache-bust: 2026-03-
+# cache-bust: 2026-03-06-v1
 fastapi
 uvicorn[standard]==0.34.0
 sentence-transformers>=3.0.0,<4.0.0
-# transformers is
-# sentence-transformers
-# Listing both causes ResolutionImpossible at pip resolve time.
-# The Dockerfile Step 3 force-reinstalls transformers>=5.0.0 after Step 2.
+# transformers is pinned separately in the Dockerfile to avoid ResolutionImpossible
+# between sentence-transformers and the version needed for the active LLM.
 accelerate>=0.26.0
 huggingface-hub>=0.31.0
 supabase==2.13.0
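One way to see why listing transformers here alongside sentence-transformers can trip pip's resolver is to inspect the constraint sentence-transformers itself declares; a short sketch (illustrative only, the exact bound depends on the installed release):

# Print the transformers requirement that the installed sentence-transformers declares.
from importlib.metadata import requires

for req in requires("sentence-transformers") or []:
    if req.startswith("transformers"):
        print(req)  # e.g. something like "transformers>=4.41.0,<5.0.0"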