imtrt004 committed
Commit 6780118 · 1 Parent(s): ac40983

fix: remove EXAONE

Files changed (5)
  1. Dockerfile +3 -9
  2. app.py +0 -5
  3. generation/llm.py +24 -14
  4. model/loader.py +4 -65
  5. requirements.txt +3 -5
Dockerfile CHANGED
@@ -21,15 +21,9 @@ RUN pip install torch --index-url https://download.pytorch.org/whl/cpu \
 # -- Step 2: Everything else -------------------------------------------------
 RUN pip install -r requirements.txt --no-cache-dir
 
-# -- Step 3: Install transformers from git main ----------------------------------
-# EXAONE's modeling_exaone.py uses check_model_inputs from transformers.utils.generic
-# which was added AFTER the latest PyPI release. Installing from git main ensures
-# we always have the same version LGAI tested their model files against.
-# git is available from Step 1 (apt-get install git).
-# The echo is intentional: changing this line text busts Docker's layer cache.
-RUN echo "transformers-pin: git-main build: 2026-03-05-v7" \
-    && pip install --force-reinstall --no-cache-dir \
-    "transformers @ git+https://github.com/huggingface/transformers.git"
+# -- Step 3: Pin transformers to a stable release ---------------------------
+# Llama 3.2 works with any recent PyPI release; no git-main needed.
+RUN pip install --force-reinstall --no-cache-dir "transformers>=4.43.0,<5.0.0"
 
 COPY . .
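Step 3 keeps --force-reinstall because Step 2 may already have pulled a different transformers in via sentence-transformers. A minimal sanity check that the pin survived the build, assuming it runs inside the built image (packaging is available as a transformers dependency; the file name is made up):

# check_pin.py (hypothetical helper, not part of this repo)
from importlib.metadata import version
from packaging.version import Version

tf = Version(version("transformers"))
assert Version("4.43.0") <= tf < Version("5.0.0"), f"unexpected transformers=={tf}"
print(f"OK: transformers=={tf} is inside the pinned range")
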
app.py CHANGED
@@ -67,11 +67,6 @@ async def lifespan(app: FastAPI):
         error("STARTUP", f"Embedding model failed: {exc}")
 
     section("STARTUP", "LLM")
-    try:
-        tf_version = importlib.metadata.version("transformers")
-        step("STARTUP", f"transformers=={tf_version}")
-    except Exception as exc:
-        warn("STARTUP", f"Could not read transformers version: {exc}")
     step("STARTUP", f"Loading {get_model_name()} in background thread…")
     loop = asyncio.get_event_loop()
     try:
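For context on the surviving lines: the blocking model load is handed to a worker thread through loop.run_in_executor, which keeps the FastAPI event loop responsive during startup. A minimal runnable sketch of that pattern, with illustrative names rather than the real app.py helpers:

import asyncio
import time

def load_model() -> str:
    # stand-in for the blocking AutoModelForCausalLM.from_pretrained(...) call
    time.sleep(0.5)
    return "fake-model"

async def startup() -> None:
    loop = asyncio.get_event_loop()
    # off-thread load: the loop stays free to serve health checks meanwhile
    model = await loop.run_in_executor(None, load_model)
    print(f"loaded: {model}")

asyncio.run(startup())
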
generation/llm.py CHANGED
@@ -114,23 +114,33 @@ def stream_answer(
         tokenizer,
         skip_prompt=True,
         skip_special_tokens=True,
-        timeout=120.0,
+        timeout=None,  # No timeout — CPU prefill of large docs can take >120s
     )
 
-    thread = Thread(
-        target=model.generate,
-        kwargs=dict(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            streamer=streamer,
-            max_new_tokens=2048,
-            do_sample=False,  # greedy – fastest on CPU, fully deterministic
-            pad_token_id=tokenizer.eos_token_id,
-        ),
-        daemon=True,
-    )
+    # Capture generate-thread exceptions so the streamer never hangs forever
+    _gen_exc: list = [None]
+
+    def _generate():
+        try:
+            model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                streamer=streamer,
+                max_new_tokens=2048,
+                do_sample=False,  # greedy – fastest on CPU, fully deterministic
+                pad_token_id=tokenizer.eos_token_id,
+            )
+        except Exception as exc:
+            _gen_exc[0] = exc
+            # Unblock the streamer consumer so it doesn't wait forever
+            streamer.text_queue.put(streamer.stop_signal)
+
+    thread = Thread(target=_generate, daemon=True)
     thread.start()
 
     yield from _strip_thinking_stream(streamer)
 
-    thread.join(timeout=120)
+    thread.join()
+
+    if _gen_exc[0] is not None:
+        raise RuntimeError(f"LLM generation failed: {_gen_exc[0]}") from _gen_exc[0]
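The fix relies on TextIteratorStreamer's queue-plus-sentinel design: the consumer blocks on text_queue.get() until stop_signal arrives, so an exception inside generate() must push the sentinel itself, or (with timeout=None) the consumer waits forever. The same pattern in isolation, as a runnable toy using only queue/threading; all names here are illustrative, not from the repo:

import queue
import threading

STOP = None  # sentinel, mirroring TextIteratorStreamer.stop_signal

def producer(q: queue.Queue, exc_box: list) -> None:
    try:
        for token in ("Hel", "lo"):
            q.put(token)
        raise RuntimeError("simulated generate() failure")
    except Exception as exc:
        exc_box[0] = exc
        q.put(STOP)  # unblock the blocked consumer instead of hanging it

exc_box: list = [None]
q: queue.Queue = queue.Queue()
t = threading.Thread(target=producer, args=(q, exc_box), daemon=True)
t.start()

while (item := q.get()) is not STOP:  # same blocking get() the streamer does
    print(item, end="", flush=True)
print()

t.join()
if exc_box[0] is not None:
    # surfaced only after the stream drains, exactly as in stream_answer above
    raise RuntimeError(f"generation failed: {exc_box[0]}") from exc_box[0]
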
model/loader.py CHANGED
@@ -1,23 +1,19 @@
 """
 Self-hosted LLM using HuggingFace Transformers — zero external API, no C++ compilation.
-All speeds measured with Q4_K_M GGUF on 2 vCPU / 16 GB RAM (HF Free Tier).
+All speeds measured on 2 vCPU / 16 GB RAM (HF Free Tier).
 
 Model options (set LLM_MODEL env var in HF Space to switch, no redeploy needed):
  #1  TinyLlama/TinyLlama-1.1B-Chat-v1.0     ~1 GB    40-60 tok/s  Apache 2.0  demos, prototypes
  #2  Qwen/Qwen3-0.6B                        ~0.5 GB  45-55 tok/s  Apache 2.0  speed-critical, Think mode
-#3  meta-llama/Llama-3.2-1B-Instruct       ~1.5 GB  35-50 tok/s  Community   128K ctx, long-context
+#3  meta-llama/Llama-3.2-1B-Instruct [DEF] ~1.5 GB  35-50 tok/s  Community   128K ctx, long-context
 #4  HuggingFaceTB/SmolLM2-1.7B-Instruct    ~2 GB    25-35 tok/s  Apache 2.0  good quality/size ratio
 #5  Qwen/Qwen2.5-1.5B-Instruct             ~2 GB    25-40 tok/s  Apache 2.0  multilingual, 32K ctx
 #6  stabilityai/stablelm-2-zephyr-1_6b     ~2 GB    25-40 tok/s  MIT         DPO-tuned chat feel
 #7  Qwen/Qwen2.5-Coder-1.5B-Instruct       ~2 GB    25-40 tok/s  Apache 2.0  code completion/review
 #8  microsoft/phi-2                        ~3 GB    18-28 tok/s  MIT         reasoning & logic
-#9  LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct [DEF] ~3 GB 20-30 tok/s Research    best quality under 3B
-#10 google/gemma-3-1b-it                   ~1.5 GB  35-48 tok/s  Gemma ToU   multilingual, 140+ langs
+#9  google/gemma-3-1b-it                   ~1.5 GB  35-48 tok/s  Gemma ToU   multilingual, 140+ langs
 
 Note:
-- EXAONE requires trust_remote_code=True (LG AI custom architecture).
-  RopeParameters was added in transformers 5.0 (March 2026); EXAONE's updated
-  configuration_exaone.py requires it. Pin transformers>=5.0.0,<6.0.0.
 - Llama 3.2 and Gemma 3 may require a HF_TOKEN env var (gated models).
 - Qwen3 supports /think and /no_think prefixes for reasoning depth control.
 """
@@ -26,36 +22,11 @@ import os
 import time
 import threading
 
-# ── transformers version guard ────────────────────────────────────────────────
-import importlib.metadata as _ilm
-from packaging.version import Version as _V
-_tv_str = _ilm.version("transformers")
-try:
-    _tv = _V(_tv_str)
-    # Git/dev installs report as e.g. "5.4.0.dev0" — base_version strips the dev suffix
-    _tv_base = _V(_tv.base_version)
-    if _tv_base < _V("5.0.0"):
-        raise RuntimeError(
-            f"[MODEL] transformers=={_tv_str} is too old.\n"
-            "EXAONE-3.5 requires transformers>=5.0.0 (installs from git main are fine).\n"
-            "The Dockerfile Step 3 should install from git+https://github.com/huggingface/transformers.git"
-        )
-except Exception as _ver_exc:
-    import warnings as _w
-    _w.warn(f"[MODEL] Could not check transformers version ({_ver_exc}); proceeding anyway.")
-# ── end guard ─────────────────────────────────────────────────────────────────
-
 import warnings
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from model.log import section, step, ok, warn, error
 
-# Suppress FutureWarning from EXAONE's cached modeling file re: input_embeds rename
-warnings.filterwarnings(
-    "ignore",
-    message=r".*input_embeds.*is deprecated.*Use.*inputs_embeds.*",
-    category=FutureWarning,
-)
 # Suppress torch_dtype deprecation warning from transformers dev build
 warnings.filterwarnings(
     "ignore",
@@ -63,13 +34,12 @@ warnings.filterwarnings(
     category=FutureWarning,
 )
 
-MODEL_ID = os.environ.get("LLM_MODEL", "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct")
+MODEL_ID = os.environ.get("LLM_MODEL", "meta-llama/Llama-3.2-1B-Instruct")
 
 # Models that need trust_remote_code=True (custom architectures)
 _TRUST_REMOTE_CODE_MODELS = (
     "LiquidAI/",
     "DavidAU/LFM",
-    "LGAI-EXAONE/",
 )
 
 def _needs_trust_remote_code(model_id: str) -> bool:
@@ -98,26 +68,6 @@ def _load() -> None:
     if _trc:
         step("MODEL", "trust_remote_code=True (custom architecture)")
 
-    # Compatibility shim: some dev builds may not export check_model_inputs yet,
-    # but EXAONE's modeling_exaone.py expects it from transformers.utils.generic.
-    # IMPORTANT: must be a real pass-through decorator — returning None would
-    # replace any @check_model_inputs-decorated forward() with None, causing
-    # "TypeError: 'NoneType' object is not callable" during generate().
-    try:
-        import transformers.utils.generic as _tug  # type: ignore
-        if not hasattr(_tug, "check_model_inputs"):
-            def _check_model_inputs(func=None, **_kwargs):
-                # Handles both @check_model_inputs and @check_model_inputs(...)
-                if callable(func):
-                    return func  # used as bare decorator
-                def _decorator(f):
-                    return f  # used as decorator factory with args
-                return _decorator
-            setattr(_tug, "check_model_inputs", _check_model_inputs)
-            warn("MODEL", "Patched missing transformers.utils.generic.check_model_inputs")
-    except Exception as exc:
-        warn("MODEL", f"Could not apply transformers compatibility shim: {exc}")
-
     # ── Tokenizer ─────────────────────────────────────────────────────────────
     _loading_msg = f"Loading tokenizer…"
     step("MODEL", f"Fetching tokenizer…")
@@ -143,17 +93,6 @@ def _load() -> None:
             device_map="cpu",
             low_cpu_mem_usage=True,
         )
-    except ImportError as exc:
-        _hint = ""
-        if "RopeParameters" in str(exc):
-            _hint = (
-                "\n  Hint: EXAONE-3.5 requires transformers>=5.0.0\n"
-                "  (RopeParameters was added in transformers 5.0, March 2026).\n"
-                "  Ensure requirements.txt pins transformers>=5.0.0,<6.0.0\n"
-                "  and do a Factory Rebuild in the Space settings."
-            )
-        error("MODEL", f"{exc}{_hint}")
-        raise
     except Exception as exc:
         error("MODEL", str(exc))
         raise
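One detail the hunks only hint at: with "LGAI-EXAONE/" removed, _needs_trust_remote_code reduces to a two-prefix check. Its body is not shown in this diff; a str.startswith prefix match is presumably all it needs (a sketch under that assumption, not the repo's actual implementation):

_TRUST_REMOTE_CODE_MODELS = (
    "LiquidAI/",
    "DavidAU/LFM",
)

def _needs_trust_remote_code(model_id: str) -> bool:
    # str.startswith accepts a tuple and matches any of the prefixes
    return model_id.startswith(_TRUST_REMOTE_CODE_MODELS)

assert _needs_trust_remote_code("LiquidAI/LFM2-1.2B")
assert not _needs_trust_remote_code("meta-llama/Llama-3.2-1B-Instruct")  # new default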
requirements.txt CHANGED
@@ -1,11 +1,9 @@
-# cache-bust: 2026-03-05-v7
+# cache-bust: 2026-03-06-v1
 fastapi
 uvicorn[standard]==0.34.0
 sentence-transformers>=3.0.0,<4.0.0
-# transformers is intentionally NOT listed here.
-# sentence-transformers 3.x requires transformers<5.0.0, but EXAONE needs 5.x.
-# Listing both causes ResolutionImpossible at pip resolve time.
-# The Dockerfile Step 3 force-reinstalls transformers>=5.0.0 after Step 2.
+# transformers is pinned separately in the Dockerfile to avoid ResolutionImpossible
+# between sentence-transformers and the version needed for the active LLM.
 accelerate>=0.26.0
 huggingface-hub>=0.31.0
 supabase==2.13.0
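The replacement comment compresses the old four-line explanation; the constraint it refers to can be checked mechanically after a build. A sketch, assuming the packaging library (installed as a transformers dependency):

from importlib.metadata import requires, version
from packaging.requirements import Requirement

tf_version = version("transformers")
# requires() returns PEP 508 strings; find the transformers bound declared
# by sentence-transformers and test the installed version against it
for spec in requires("sentence-transformers") or []:
    req = Requirement(spec)
    if req.name == "transformers":
        ok = req.specifier.contains(tf_version, prereleases=True)
        print(f"{spec!r} vs installed {tf_version}: {'OK' if ok else 'CONFLICT'}")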