SeaWolf-AI committed on
Commit
d422c24
Β·
verified Β·
1 Parent(s): 3e3322c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -185
app.py CHANGED
@@ -1,47 +1,48 @@
1
  """
2
- 🧬 Darwin-35B-A3B-Opus β€” ZeroGPU Direct Serving
3
- transformers + @spaces.GPU Β· Vision support Β· Streaming
4
  """
5
- import sys
6
  print(f"[BOOT] Python {sys.version}", flush=True)
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import base64, os, re, json, io
9
  from typing import Generator, Optional
10
- from threading import Thread
11
 
12
- # ── Core imports ──────────────────────────────────────────────────────────
13
- import torch
14
- import spaces
15
  import gradio as gr
16
- print(f"[BOOT] gradio {gr.__version__}, torch {torch.__version__}", flush=True)
17
-
18
- from transformers import (
19
- AutoProcessor,
20
- AutoModelForImageTextToText,
21
- AutoModelForCausalLM,
22
- AutoTokenizer,
23
- TextIteratorStreamer,
24
- )
25
- from PIL import Image
26
- import requests
27
- import httpx
28
  from fastapi import FastAPI, Request
29
  from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
30
  from urllib.parse import urlencode
31
  import pathlib, secrets
32
 
33
- # SSL κ²½κ³  λ¬΄μ‹œ
34
  import urllib3
35
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
36
 
37
  # ══════════════════════════════════════════════════════════════════════════════
38
  # 1. MODEL CONFIG
39
  # ══════════════════════════════════════════════════════════════════════════════
40
- MODEL_ID = "FINAL-Bench/Darwin-35B-A3B-Opus"
41
- MODEL_NAME = "Darwin-35B-A3B-Opus"
 
42
  MODEL_CAP = {
43
  "arch": "MoE", "active": "3B / 35B total",
44
- "ctx": "262K", "thinking": True, "vision": True,
45
  "max_tokens": 16384, "temp_max": 1.5,
46
  }
47
 
@@ -55,56 +56,44 @@ PRESETS = {
55
  }
56
 
57
  # ══════════════════════════════════════════════════════════════════════════════
58
- # 2. MODEL LOADING
59
  # ══════════════════════════════════════════════════════════════════════════════
60
- print(f"[MODEL] Loading {MODEL_ID} ...", flush=True)
61
-
62
- IS_VISION = True
63
- processor = None
64
- tokenizer = None
65
- model = None
66
-
67
- try:
68
- processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
69
- print("[MODEL] AutoProcessor loaded (vision mode)", flush=True)
70
- except Exception as e:
71
- print(f"[MODEL] AutoProcessor failed: {e}", flush=True)
72
- print("[MODEL] Falling back to AutoTokenizer (text-only mode)", flush=True)
73
- IS_VISION = False
74
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
75
-
76
- # λͺ¨λΈ λ‘œλ“œ β€” dtype= μš°μ„ , μ‹€νŒ¨ μ‹œ torch_dtype= 폴백, μ΅œμ’… 4bit
77
- _load_ok = False
78
- ModelClass = AutoModelForImageTextToText if IS_VISION else AutoModelForCausalLM
79
-
80
- for attempt, load_kwargs in enumerate([
81
- dict(dtype=torch.bfloat16, device_map="auto", trust_remote_code=True),
82
- dict(torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True),
83
- ]):
84
  try:
85
- model = ModelClass.from_pretrained(MODEL_ID, **load_kwargs)
86
- print(f"[MODEL] {ModelClass.__name__} loaded (attempt {attempt+1}) βœ“", flush=True)
87
- _load_ok = True
88
- break
 
 
 
 
 
 
 
 
 
 
 
89
  except Exception as e:
90
- print(f"[MODEL] Attempt {attempt+1} failed: {e}", flush=True)
91
-
92
- if not _load_ok:
93
- print("[MODEL] Retrying with 4-bit quantization...", flush=True)
94
- from transformers import BitsAndBytesConfig
95
- bnb_config = BitsAndBytesConfig(
96
- load_in_4bit=True, bnb_4bit_quant_type="nf4",
97
- bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True,
98
- )
99
- model = ModelClass.from_pretrained(
100
- MODEL_ID, quantization_config=bnb_config,
101
- device_map="auto", trust_remote_code=True,
102
- )
103
- print("[MODEL] 4-bit quantized model loaded βœ“", flush=True)
104
 
105
- # ν† ν¬λ‚˜μ΄μ € κ²°μ •
106
- _tok = processor.tokenizer if (processor and hasattr(processor, 'tokenizer')) else (processor or tokenizer)
107
- print(f"[MODEL] Ready β€” vision={IS_VISION}, dtype={model.dtype}", flush=True)
 
 
 
 
 
 
 
 
108
 
109
  # ══════════════════════════════════════════════════════════════════════════════
110
  # 3. THINKING MODE HELPERS
@@ -175,27 +164,8 @@ def format_response(raw: str) -> str:
175
  return raw
176
 
177
  # ══════════════════════════════════════════════════════════════════════════════
178
- # 4. IMAGE HELPERS
179
- # ══════════════════════════════════════════════════════════════════════════════
180
- def _load_image_from_source(src: str) -> Optional[Image.Image]:
181
- try:
182
- if src.startswith("data:"):
183
- _, b64 = src.split(",", 1)
184
- return Image.open(io.BytesIO(base64.b64decode(b64))).convert("RGB")
185
- elif src.startswith("http"):
186
- resp = requests.get(src, timeout=15)
187
- resp.raise_for_status()
188
- return Image.open(io.BytesIO(resp.content)).convert("RGB")
189
- except Exception as e:
190
- print(f"[IMG] Failed to load image: {e}", flush=True)
191
- return None
192
-
193
  # ══════════════════════════════════════════════════════════════════════════════
194
- # 5. GENERATION β€” β˜… @spaces.GPU on Gradio fn (핡심 μˆ˜μ •) β˜…
195
- # ZeroGPUλŠ” Gradio 이벀트 ν•¨μˆ˜μ— @spaces.GPUκ°€ μžˆμ–΄μ•Ό κ°μ§€ν•©λ‹ˆλ‹€.
196
- # λ‚΄λΆ€ μ„œλΈŒν•¨μˆ˜κ°€ μ•„λ‹Œ, ChatInterface의 fn에 직접 λ°μ½”λ ˆμ΄μ…˜!
197
- # ══════════════════════════════════════════════════════════════════════════════
198
- @spaces.GPU(duration=180)
199
  def generate_reply(
200
  message: str,
201
  history: list,
@@ -245,98 +215,44 @@ def generate_reply(
245
  _, clean = parse_think_blocks(at)
246
  messages.append({"role":"assistant","content":clean})
247
 
248
- # ── ν˜„μž¬ λ©”μ‹œμ§€ (이미지 포함 κ°€λŠ₯) ──
249
- has_image = False
250
- pil_image = None
251
-
252
- if image_input and isinstance(image_input, str) and image_input.strip():
253
- pil_image = _load_image_from_source(image_input)
254
- if pil_image:
255
- has_image = True
256
-
257
- if IS_VISION and has_image and pil_image:
258
- messages.append({
259
- "role": "user",
260
- "content": [
261
- {"type": "image", "image": pil_image},
262
- {"type": "text", "text": message},
263
- ]
264
- })
265
- else:
266
- messages.append({"role": "user", "content": message})
267
-
268
- # ── ν† ν¬λ‚˜μ΄μ¦ˆ ──
269
- try:
270
- if IS_VISION and processor is not None:
271
- text_prompt = processor.apply_chat_template(
272
- messages, tokenize=False, add_generation_prompt=True,
273
- )
274
- if has_image and pil_image:
275
- inputs = processor(
276
- text=[text_prompt], images=[pil_image],
277
- return_tensors="pt", padding=True,
278
- )
279
- else:
280
- inputs = processor(
281
- text=[text_prompt], return_tensors="pt", padding=True,
282
- )
283
- else:
284
- text_prompt = tokenizer.apply_chat_template(
285
- messages, tokenize=False, add_generation_prompt=True,
286
- )
287
- inputs = tokenizer(text_prompt, return_tensors="pt")
288
- except Exception as e:
289
- yield f"**❌ Tokenization error:** `{e}`"
290
- return
291
-
292
- # ── GPU둜 이동 ──
293
- inputs = {k: v.to(model.device) if hasattr(v, 'to') else v for k, v in inputs.items()}
294
-
295
- # ── Streamer ──
296
- streamer = TextIteratorStreamer(_tok, skip_special_tokens=True, skip_prompt=True)
297
-
298
- input_len = inputs["input_ids"].shape[-1]
299
- print(f"[GEN] tokens={input_len}, max_new={max_new_tokens}, "
300
- f"temp={temperature}, vision={has_image}", flush=True)
301
-
302
- # ── generate β†’ 별도 μŠ€λ ˆλ“œ (GPU μ»¨ν…μŠ€νŠΈλŠ” 이 ν•¨μˆ˜κ°€ μœ μ§€) ──
303
- gen_kwargs = dict(
304
- **inputs,
305
- max_new_tokens=max_new_tokens,
306
- do_sample=temperature > 0.01,
307
- temperature=max(temperature, 0.01) if temperature > 0.01 else 1.0,
308
- top_p=float(top_p),
309
- streamer=streamer,
310
- use_cache=True,
311
- )
312
 
313
- thread = Thread(target=model.generate, kwargs=gen_kwargs)
314
- thread.start()
315
 
316
- output = ""
317
  try:
318
- for text in streamer:
319
- output += text
320
- yield format_response(output)
321
- except Exception as e:
322
- if output:
323
- yield format_response(output)
324
- else:
325
- yield f"**❌ Generation error:** `{e}`"
326
 
327
- thread.join()
 
 
 
 
 
 
 
 
 
 
 
 
328
 
329
- if output:
330
- print(f"[GEN] Done β€” {len(output)} chars", flush=True)
331
- yield format_response(output)
332
- else:
333
- yield "**⚠️ λͺ¨λΈμ΄ 빈 응닡을 λ°˜ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.** λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."
334
 
335
 
336
  # ══════════════════════════════════════════════════════════════════════════════
337
- # 6. GRADIO BLOCKS
338
  # ══════════════════════════════════════════════════════════════════════════════
339
- with gr.Blocks(title="Darwin-35B-A3B-Opus") as gradio_demo:
340
  thinking_toggle = gr.Radio(
341
  choices=["⚑ Fast Mode (direct answer)",
342
  "🧠 Thinking Mode (chain-of-thought reasoning)"],
@@ -359,7 +275,7 @@ with gr.Blocks(title="Darwin-35B-A3B-Opus") as gradio_demo:
359
  )
360
 
361
  # ══════════════════════════════════════════════════════════════════════════════
362
- # 7. FASTAPI β€” index.html + OAuth + μœ ν‹Έ API
363
  # ══════════════════════════════════════════════════════════════════════════════
364
  fapp = FastAPI()
365
  SESSIONS: dict[str, dict] = {}
@@ -379,7 +295,6 @@ SCOPES = os.getenv("OAUTH_SCOPES", "openid profile")
379
 
380
  def _sid(req: Request) -> Optional[str]:
381
  return req.cookies.get("mc_session")
382
-
383
  def _user(req: Request) -> Optional[dict]:
384
  sid = _sid(req)
385
  return SESSIONS.get(sid) if sid else None
@@ -419,7 +334,6 @@ async def oauth_callback(code: str = "", error: str = "", state: str = ""):
419
  if uinfo.status_code != 200:
420
  return RedirectResponse("/?auth_error=1")
421
  user = uinfo.json()
422
-
423
  sid = secrets.token_urlsafe(32)
424
  SESSIONS[sid] = {
425
  "logged_in": True,
@@ -442,11 +356,9 @@ async def oauth_logout(request: Request):
442
 
443
  @fapp.get("/health")
444
  async def health():
445
- return {
446
- "status": "ok", "model": MODEL_ID,
447
- "vision": IS_VISION, "dtype": str(model.dtype),
448
- }
449
 
 
450
  BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
451
 
452
  @fapp.post("/api/search")
@@ -471,6 +383,7 @@ async def api_search(request: Request):
471
  except Exception as e:
472
  return JSONResponse({"error": str(e)}, status_code=500)
473
 
 
474
  @fapp.post("/api/extract-pdf")
475
  async def api_extract_pdf(request: Request):
476
  try:
@@ -494,14 +407,10 @@ async def api_extract_pdf(request: Request):
494
  return JSONResponse({"error": str(e)}, status_code=500)
495
 
496
  # ══════════════════════════════════════════════════════════════════════════════
497
- # 8. MOUNT & LAUNCH
498
- # @spaces.GPUλŠ” λͺ¨λ“ˆ λ‘œλ“œ μ‹œ μžλ™ 감지됨 (generate_reply에 λ°μ½”λ ˆμ΄μ…˜).
499
- # uvicorn.run()으둜 μ„œλ²„λ₯Ό μ‹œμž‘ν•΄μ•Ό ν”„λ‘œμ„ΈμŠ€κ°€ μœ μ§€λ©λ‹ˆλ‹€.
500
  # ══════════════════════════════════════════════════════════════════════════════
501
  app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
502
 
503
- print("[BOOT] Darwin-35B-A3B-Opus Β· ZeroGPU Direct Serving Β· Ready", flush=True)
504
-
505
  if __name__ == "__main__":
506
- import uvicorn
507
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  """
2
+ 🧬 Darwin-35B-A3B-Opus Q8 GGUF β€” llama-cpp-python Direct Serving
3
+ μ „μš© GPU Β· OpenAI-compatible streaming Β· μ»€μŠ€ν…€ ν”„λ‘ νŠΈμ—”λ“œ
4
  """
5
+ import sys, subprocess
6
  print(f"[BOOT] Python {sys.version}", flush=True)
7
 
8
+ # ── llama-cpp-python CUDA μ„€μΉ˜ 확인 ──
9
+ try:
10
+ from llama_cpp import Llama
11
+ print("[BOOT] llama-cpp-python already installed", flush=True)
12
+ except ImportError:
13
+ print("[BOOT] Installing llama-cpp-python with CUDA...", flush=True)
14
+ subprocess.check_call([
15
+ sys.executable, "-m", "pip", "install",
16
+ "llama-cpp-python", "--no-cache-dir", "--prefer-binary",
17
+ "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cu124",
18
+ ])
19
+ from llama_cpp import Llama
20
+ print("[BOOT] llama-cpp-python installed βœ“", flush=True)
21
+
22
  import base64, os, re, json, io
23
  from typing import Generator, Optional
 
24
 
 
 
 
25
  import gradio as gr
26
+ print(f"[BOOT] gradio {gr.__version__}", flush=True)
27
+
28
+ import requests, httpx, uvicorn
 
 
 
 
 
 
 
 
 
29
  from fastapi import FastAPI, Request
30
  from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
31
  from urllib.parse import urlencode
32
  import pathlib, secrets
33
 
 
34
  import urllib3
35
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
36
 
37
  # ══════════════════════════════════════════════════════════════════════════════
38
  # 1. MODEL CONFIG
39
  # ══════════════════════════════════════════════════════════════════════════════
40
+ REPO_ID = "FINAL-Bench/Darwin-35B-A3B-Opus-Q8-GGUF"
41
+ GGUF_FILE = "darwin-35b-a3b-opus-q8_0-00001-of-00003.gguf"
42
+ MODEL_NAME = "Darwin-35B-A3B-Opus-Q8"
43
  MODEL_CAP = {
44
  "arch": "MoE", "active": "3B / 35B total",
45
+ "ctx": "262K", "thinking": True, "vision": False,
46
  "max_tokens": 16384, "temp_max": 1.5,
47
  }
48
 
 
56
  }
57
 
58
  # ══════════════════════════════════════════════════════════════════════════════
59
+ # 2. VRAM 감지 + λͺ¨λΈ λ‘œλ”©
60
  # ══════════════════════════════════════════════════════════════════════════════
61
def detect_gpu_layers() -> int:
    """Pick ``n_gpu_layers`` for llama.cpp based on available VRAM.

    Returns:
        -1 to offload ALL layers on >=40 GB cards (e.g. A100 40GB),
        a partial layer count on smaller GPUs (24 GB -> 28, 16 GB -> 18,
        otherwise 10), and 0 (CPU-only) when CUDA is unavailable or
        detection fails.
    """
    try:
        import torch
        if torch.cuda.is_available():
            # BUGFIX: the attribute is `total_memory` (bytes); `total_mem`
            # does not exist, so the original raised AttributeError here,
            # which the broad except below swallowed — silently forcing
            # CPU-only mode (0 layers) on every CUDA machine.
            vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            print(f"[GPU] {torch.cuda.get_device_name(0)} β€” {vram_gb:.1f} GB VRAM", flush=True)
            if vram_gb >= 40:    # e.g. A100 40GB — offload every layer
                return -1        # -1 = all layers
            elif vram_gb >= 24:  # e.g. A10G 24GB — about 28 layers
                return 28
            elif vram_gb >= 16:  # e.g. T4 16GB — about 18 layers
                return 18
            else:
                return 10
        else:
            print("[GPU] No CUDA device found, CPU-only mode", flush=True)
            return 0
    except Exception as e:
        # Best-effort: any detection failure degrades to CPU-only.
        print(f"[GPU] Detection failed: {e}, using CPU", flush=True)
        return 0

# Environment variables override auto-detection.
N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", str(detect_gpu_layers())))
N_CTX = int(os.getenv("N_CTX", "32768"))
 
 
 
 
 
 
 
 
 
85
 
86
+ print(f"[MODEL] Loading {REPO_ID} ...", flush=True)
87
+ print(f"[MODEL] n_gpu_layers={N_GPU_LAYERS}, n_ctx={N_CTX}", flush=True)
88
+
89
+ llm = Llama.from_pretrained(
90
+ repo_id=REPO_ID,
91
+ filename=GGUF_FILE,
92
+ n_gpu_layers=N_GPU_LAYERS,
93
+ n_ctx=N_CTX,
94
+ verbose=True,
95
+ )
96
+ print(f"[MODEL] {MODEL_NAME} loaded βœ“", flush=True)
97
 
98
  # ══════════════════════════════════════════════════════════════════════════════
99
  # 3. THINKING MODE HELPERS
 
164
  return raw
165
 
166
  # ══════════════════════════════════════════════════════════════════════════════
167
+ # 4. GENERATION β€” llama-cpp-python 슀트리밍 (μ΄ˆκ°„λ‹¨)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  # ══════════════════════════════════════════════════════════════════════════════
 
 
 
 
 
169
  def generate_reply(
170
  message: str,
171
  history: list,
 
215
  _, clean = parse_think_blocks(at)
216
  messages.append({"role":"assistant","content":clean})
217
 
218
+ # PDF ν…μŠ€νŠΈκ°€ image_input에 λ“€μ–΄μ˜¬ 수 있음 (ν”„λ‘ νŠΈμ—”λ“œ ν˜Έν™˜)
219
+ messages.append({"role": "user", "content": message})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
+ print(f"[GEN] msgs={len(messages)}, max_new={max_new_tokens}, temp={temperature}", flush=True)
 
222
 
223
+ # ── llama-cpp 슀트리밍 β€” μ‹¬ν”Œ! ──
224
  try:
225
+ stream = llm.create_chat_completion(
226
+ messages=messages,
227
+ max_tokens=max_new_tokens,
228
+ temperature=max(temperature, 0.01) if temperature > 0.01 else 0.0,
229
+ top_p=float(top_p),
230
+ stream=True,
231
+ )
 
232
 
233
+ raw = ""
234
+ for chunk in stream:
235
+ delta = chunk.get("choices", [{}])[0].get("delta", {})
236
+ token = delta.get("content", "")
237
+ if token:
238
+ raw += token
239
+ yield format_response(raw)
240
+
241
+ if raw:
242
+ print(f"[GEN] Done β€” {len(raw)} chars", flush=True)
243
+ yield format_response(raw)
244
+ else:
245
+ yield "**⚠️ λͺ¨λΈμ΄ 빈 응닡을 λ°˜ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.** λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."
246
 
247
+ except Exception as e:
248
+ print(f"[GEN] Error: {e}", flush=True)
249
+ yield f"**❌ Generation error:** `{e}`"
 
 
250
 
251
 
252
  # ══════════════════════════════════════════════════════════════════════════════
253
+ # 5. GRADIO BLOCKS
254
  # ══════════════════════════════════════════════════════════════════════════════
255
+ with gr.Blocks(title=MODEL_NAME) as gradio_demo:
256
  thinking_toggle = gr.Radio(
257
  choices=["⚑ Fast Mode (direct answer)",
258
  "🧠 Thinking Mode (chain-of-thought reasoning)"],
 
275
  )
276
 
277
  # ══════════════════════════════════════════════════════════════════════════════
278
+ # 6. FASTAPI β€” index.html + OAuth + μœ ν‹Έ API
279
  # ══════════════════════════════════════════════════════════════════════════════
280
  fapp = FastAPI()
281
  SESSIONS: dict[str, dict] = {}
 
295
 
296
  def _sid(req: Request) -> Optional[str]:
297
  return req.cookies.get("mc_session")
 
298
  def _user(req: Request) -> Optional[dict]:
299
  sid = _sid(req)
300
  return SESSIONS.get(sid) if sid else None
 
334
  if uinfo.status_code != 200:
335
  return RedirectResponse("/?auth_error=1")
336
  user = uinfo.json()
 
337
  sid = secrets.token_urlsafe(32)
338
  SESSIONS[sid] = {
339
  "logged_in": True,
 
356
 
357
  @fapp.get("/health")
358
  async def health():
359
+ return {"status": "ok", "model": MODEL_NAME, "gpu_layers": N_GPU_LAYERS, "ctx": N_CTX}
 
 
 
360
 
361
+ # ── Web Search API (Brave) ──
362
  BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
363
 
364
  @fapp.post("/api/search")
 
383
  except Exception as e:
384
  return JSONResponse({"error": str(e)}, status_code=500)
385
 
386
+ # ── PDF Text Extraction ──
387
  @fapp.post("/api/extract-pdf")
388
  async def api_extract_pdf(request: Request):
389
  try:
 
407
  return JSONResponse({"error": str(e)}, status_code=500)
408
 
409
  # ══════════════════════════════════════════════════════════════════════════════
410
+ # 7. MOUNT & RUN β€” μ „μš© GPUμ΄λ―€λ‘œ uvicorn.run() 정상 μ‚¬μš©
 
 
411
  # ══════════════════════════════════════════════════════════════════════════════
412
  app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
413
 
 
 
414
  if __name__ == "__main__":
415
+ print(f"[BOOT] {MODEL_NAME} Β· llama-cpp Β· GPU layers: {N_GPU_LAYERS}", flush=True)
416
  uvicorn.run(app, host="0.0.0.0", port=7860)