SeaWolf-AI committed on
Commit
afc2199
·
verified ·
1 Parent(s): 4cbb479

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -120
app.py CHANGED
@@ -24,7 +24,7 @@ from transformers import (
24
  )
25
  from PIL import Image
26
  import requests
27
- import httpx, uvicorn
28
  from fastapi import FastAPI, Request
29
  from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
30
  from urllib.parse import urlencode
@@ -55,11 +55,11 @@ PRESETS = {
55
  }
56
 
57
  # ══════════════════════════════════════════════════════════════════════════════
58
- # 2. MODEL LOADING (ZeroGPU: CPU at import, GPU at inference)
59
  # ══════════════════════════════════════════════════════════════════════════════
60
  print(f"[MODEL] Loading {MODEL_ID} ...", flush=True)
61
 
62
- IS_VISION = True # λͺ¨λΈμ΄ vision μ§€μ›ν•˜λŠ”μ§€ μ—¬λΆ€
63
  processor = None
64
  tokenizer = None
65
  model = None
@@ -73,48 +73,41 @@ except Exception as e:
73
  IS_VISION = False
74
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
75
 
76
- try:
77
- if IS_VISION:
78
- model = AutoModelForImageTextToText.from_pretrained(
79
- MODEL_ID,
80
- torch_dtype=torch.bfloat16,
81
- device_map="auto",
82
- trust_remote_code=True,
83
- )
84
- print("[MODEL] AutoModelForImageTextToText loaded βœ“", flush=True)
85
- else:
86
- model = AutoModelForCausalLM.from_pretrained(
87
- MODEL_ID,
88
- torch_dtype=torch.bfloat16,
89
- device_map="auto",
90
- trust_remote_code=True,
91
- )
92
- print("[MODEL] AutoModelForCausalLM loaded βœ“", flush=True)
93
- except Exception as e:
94
- print(f"[MODEL] bfloat16 load failed: {e}", flush=True)
95
  print("[MODEL] Retrying with 4-bit quantization...", flush=True)
96
  from transformers import BitsAndBytesConfig
97
  bnb_config = BitsAndBytesConfig(
98
- load_in_4bit=True,
99
- bnb_4bit_quant_type="nf4",
100
- bnb_4bit_compute_dtype=torch.bfloat16,
101
- bnb_4bit_use_double_quant=True,
102
  )
103
- ModelClass = AutoModelForImageTextToText if IS_VISION else AutoModelForCausalLM
104
  model = ModelClass.from_pretrained(
105
- MODEL_ID,
106
- quantization_config=bnb_config,
107
- device_map="auto",
108
- trust_remote_code=True,
109
  )
110
  print("[MODEL] 4-bit quantized model loaded βœ“", flush=True)
111
 
112
  # 토크나이저 결정
113
  _tok = processor.tokenizer if (processor and hasattr(processor, 'tokenizer')) else (processor or tokenizer)
114
- print(f"[MODEL] Ready β€” device: {model.device}, dtype: {model.dtype}", flush=True)
115
 
116
  # ══════════════════════════════════════════════════════════════════════════════
117
- # 3. THINKING MODE HELPERS (기존 로직 유지)
118
  # ══════════════════════════════════════════════════════════════════════════════
119
  def parse_think_blocks(text: str) -> tuple[str, str]:
120
  m = re.search(r"<think>(.*?)</think>\s*", text, re.DOTALL)
@@ -156,20 +149,15 @@ def _split_thinking_answer(raw: str) -> tuple:
156
  answer_start = i
157
  break
158
  if answer_start > 0:
159
- thinking = "\n".join(lines[:answer_start]).strip()
160
- answer = "\n".join(lines[answer_start:]).strip()
161
- return thinking, answer
162
  return "", raw
163
 
164
  def format_response(raw: str) -> str:
165
  chain, answer = parse_think_blocks(raw)
166
  if chain:
167
  return (
168
- "<details>\n"
169
- "<summary>🧠 Reasoning Chain β€” click to expand</summary>\n\n"
170
- f"{chain}\n\n"
171
- "</details>\n\n"
172
- f"{answer}"
173
  )
174
  if "<think>" in raw and "</think>" not in raw:
175
  think_len = len(raw) - raw.index("<think>") - 7
@@ -179,11 +167,8 @@ def format_response(raw: str) -> str:
179
  thinking, answer = _split_thinking_answer(raw)
180
  if thinking and answer:
181
  return (
182
- f"<details>\n"
183
- f"<summary>🧠 Reasoning Chain ({len(thinking)} chars)</summary>\n\n"
184
- f"{thinking}\n\n"
185
- f"</details>\n\n"
186
- f"{answer}"
187
  )
188
  elif thinking and not answer:
189
  return f"🧠 Reasoning... ({len(raw)} chars)"
@@ -193,7 +178,6 @@ def format_response(raw: str) -> str:
193
  # 4. IMAGE HELPERS
194
  # ══════════════════════════════════════════════════════════════════════════════
195
  def _load_image_from_source(src: str) -> Optional[Image.Image]:
196
- """base64 data URI λ˜λŠ” URL β†’ PIL Image"""
197
  try:
198
  if src.startswith("data:"):
199
  _, b64 = src.split(",", 1)
@@ -207,32 +191,11 @@ def _load_image_from_source(src: str) -> Optional[Image.Image]:
207
  return None
208
 
209
  # ══════════════════════════════════════════════════════════════════════════════
210
- # 5. GENERATION — ZeroGPU + TextIteratorStreamer
 
 
211
  # ══════════════════════════════════════════════════════════════════════════════
212
  @spaces.GPU(duration=180)
213
- def _run_generation(input_ids, attention_mask, pixel_values, image_grid_thw,
214
- max_new_tokens, temperature, top_p, streamer):
215
- """GPU ν• λ‹Ή ν›„ μ‹€ν–‰λ˜λŠ” μ‹€μ œ 생성 ν•¨μˆ˜"""
216
- gen_kwargs = dict(
217
- input_ids=input_ids.to(model.device),
218
- attention_mask=attention_mask.to(model.device),
219
- max_new_tokens=max_new_tokens,
220
- do_sample=temperature > 0.01,
221
- temperature=max(temperature, 0.01) if temperature > 0.01 else 1.0,
222
- top_p=top_p,
223
- streamer=streamer,
224
- use_cache=True,
225
- )
226
- # vision inputs (있으면)
227
- if pixel_values is not None:
228
- gen_kwargs["pixel_values"] = pixel_values.to(model.device)
229
- if image_grid_thw is not None:
230
- gen_kwargs["image_grid_thw"] = image_grid_thw.to(model.device)
231
-
232
- with torch.inference_mode():
233
- model.generate(**gen_kwargs)
234
-
235
-
236
  def generate_reply(
237
  message: str,
238
  history: list,
@@ -252,7 +215,6 @@ def generate_reply(
252
  if system_prompt.strip():
253
  messages.append({"role": "system", "content": system_prompt.strip()})
254
 
255
- # history (ν”„λ‘ νŠΈμ—”λ“œ: [user, assistant] νŠœν”Œ 리슀트)
256
  for turn in history:
257
  if isinstance(turn, dict):
258
  role = turn.get("role", "")
@@ -292,8 +254,7 @@ def generate_reply(
292
  if pil_image:
293
  has_image = True
294
 
295
- if IS_VISION and has_image:
296
- # Vision λͺ¨λ“œ: 이미지 + ν…μŠ€νŠΈ
297
  messages.append({
298
  "role": "user",
299
  "content": [
@@ -308,62 +269,48 @@ def generate_reply(
308
  try:
309
  if IS_VISION and processor is not None:
310
  text_prompt = processor.apply_chat_template(
311
- messages,
312
- tokenize=False,
313
- add_generation_prompt=True,
314
  )
315
  if has_image and pil_image:
316
  inputs = processor(
317
- text=[text_prompt],
318
- images=[pil_image],
319
- return_tensors="pt",
320
- padding=True,
321
  )
322
  else:
323
  inputs = processor(
324
- text=[text_prompt],
325
- return_tensors="pt",
326
- padding=True,
327
  )
328
  else:
329
- # text-only λͺ¨λ“œ
330
  text_prompt = tokenizer.apply_chat_template(
331
- messages,
332
- tokenize=False,
333
- add_generation_prompt=True,
334
  )
335
  inputs = tokenizer(text_prompt, return_tensors="pt")
336
  except Exception as e:
337
  yield f"**❌ Tokenization error:** `{e}`"
338
  return
339
 
340
- # ── Streamer μ„€μ • ──
341
- decode_tok = _tok
342
- streamer = TextIteratorStreamer(decode_tok, skip_special_tokens=True, skip_prompt=True)
343
 
344
- # ── ν…μ„œ μΆ”μΆœ ──
345
- input_ids = inputs["input_ids"]
346
- attention_mask = inputs.get("attention_mask", torch.ones_like(input_ids))
347
- pixel_values = inputs.get("pixel_values", None)
348
- image_grid_thw = inputs.get("image_grid_thw", None)
349
 
350
- print(f"[GEN] tokens={input_ids.shape[-1]}, max_new={max_new_tokens}, "
 
351
  f"temp={temperature}, vision={has_image}", flush=True)
352
 
353
- # ── μŠ€λ ˆλ“œμ—μ„œ 생성 μ‹€ν–‰ ──
354
- thread = Thread(
355
- target=_run_generation,
356
- kwargs=dict(
357
- input_ids=input_ids,
358
- attention_mask=attention_mask,
359
- pixel_values=pixel_values,
360
- image_grid_thw=image_grid_thw,
361
- max_new_tokens=max_new_tokens,
362
- temperature=temperature,
363
- top_p=float(top_p),
364
- streamer=streamer,
365
- ),
366
  )
 
 
367
  thread.start()
368
 
369
  output = ""
@@ -496,14 +443,10 @@ async def oauth_logout(request: Request):
496
  @fapp.get("/health")
497
  async def health():
498
  return {
499
- "status": "ok",
500
- "model": MODEL_ID,
501
- "vision": IS_VISION,
502
- "device": str(model.device),
503
- "dtype": str(model.dtype),
504
  }
505
 
506
- # ── Web Search API (Brave) ──
507
  BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
508
 
509
  @fapp.post("/api/search")
@@ -528,7 +471,6 @@ async def api_search(request: Request):
528
  except Exception as e:
529
  return JSONResponse({"error": str(e)}, status_code=500)
530
 
531
- # ── PDF Text Extraction ──
532
  @fapp.post("/api/extract-pdf")
533
  async def api_extract_pdf(request: Request):
534
  try:
@@ -551,9 +493,17 @@ async def api_extract_pdf(request: Request):
551
  except Exception as e:
552
  return JSONResponse({"error": str(e)}, status_code=500)
553
 
554
- # ── Mount ──
 
 
 
 
 
555
  app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
556
 
557
- if __name__ == "__main__":
558
- print(f"[BOOT] Darwin-35B-A3B-Opus Β· ZeroGPU Direct Serving", flush=True)
 
 
 
559
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
24
  )
25
  from PIL import Image
26
  import requests
27
+ import httpx
28
  from fastapi import FastAPI, Request
29
  from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
30
  from urllib.parse import urlencode
 
55
  }
56
 
57
  # ══════════════════════════════════════════════════════════════════════════════
58
+ # 2. MODEL LOADING
59
  # ══════════════════════════════════════════════════════════════════════════════
60
  print(f"[MODEL] Loading {MODEL_ID} ...", flush=True)
61
 
62
+ IS_VISION = True
63
  processor = None
64
  tokenizer = None
65
  model = None
 
73
  IS_VISION = False
74
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
75
 
76
+ # 모델 로드 — dtype= 우선, 실패 시 torch_dtype= 폴백, 최종 4bit
77
+ _load_ok = False
78
+ ModelClass = AutoModelForImageTextToText if IS_VISION else AutoModelForCausalLM
79
+
80
+ for attempt, load_kwargs in enumerate([
81
+ dict(dtype=torch.bfloat16, device_map="auto", trust_remote_code=True),
82
+ dict(torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True),
83
+ ]):
84
+ try:
85
+ model = ModelClass.from_pretrained(MODEL_ID, **load_kwargs)
86
+ print(f"[MODEL] {ModelClass.__name__} loaded (attempt {attempt+1}) βœ“", flush=True)
87
+ _load_ok = True
88
+ break
89
+ except Exception as e:
90
+ print(f"[MODEL] Attempt {attempt+1} failed: {e}", flush=True)
91
+
92
+ if not _load_ok:
 
 
93
  print("[MODEL] Retrying with 4-bit quantization...", flush=True)
94
  from transformers import BitsAndBytesConfig
95
  bnb_config = BitsAndBytesConfig(
96
+ load_in_4bit=True, bnb_4bit_quant_type="nf4",
97
+ bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True,
 
 
98
  )
 
99
  model = ModelClass.from_pretrained(
100
+ MODEL_ID, quantization_config=bnb_config,
101
+ device_map="auto", trust_remote_code=True,
 
 
102
  )
103
  print("[MODEL] 4-bit quantized model loaded βœ“", flush=True)
104
 
105
  # 토크나이저 결정
106
  _tok = processor.tokenizer if (processor and hasattr(processor, 'tokenizer')) else (processor or tokenizer)
107
+ print(f"[MODEL] Ready β€” vision={IS_VISION}, dtype={model.dtype}", flush=True)
108
 
109
  # ══════════════════════════════════════════════════════════════════════════════
110
+ # 3. THINKING MODE HELPERS
111
  # ══════════════════════════════════════════════════════════════════════════════
112
  def parse_think_blocks(text: str) -> tuple[str, str]:
113
  m = re.search(r"<think>(.*?)</think>\s*", text, re.DOTALL)
 
149
  answer_start = i
150
  break
151
  if answer_start > 0:
152
+ return "\n".join(lines[:answer_start]).strip(), "\n".join(lines[answer_start:]).strip()
 
 
153
  return "", raw
154
 
155
  def format_response(raw: str) -> str:
156
  chain, answer = parse_think_blocks(raw)
157
  if chain:
158
  return (
159
+ "<details>\n<summary>🧠 Reasoning Chain β€” click to expand</summary>\n\n"
160
+ f"{chain}\n\n</details>\n\n{answer}"
 
 
 
161
  )
162
  if "<think>" in raw and "</think>" not in raw:
163
  think_len = len(raw) - raw.index("<think>") - 7
 
167
  thinking, answer = _split_thinking_answer(raw)
168
  if thinking and answer:
169
  return (
170
+ f"<details>\n<summary>🧠 Reasoning Chain ({len(thinking)} chars)</summary>\n\n"
171
+ f"{thinking}\n\n</details>\n\n{answer}"
 
 
 
172
  )
173
  elif thinking and not answer:
174
  return f"🧠 Reasoning... ({len(raw)} chars)"
 
178
  # 4. IMAGE HELPERS
179
  # ══════════════════════════════════════════════════════════════════════════════
180
  def _load_image_from_source(src: str) -> Optional[Image.Image]:
 
181
  try:
182
  if src.startswith("data:"):
183
  _, b64 = src.split(",", 1)
 
191
  return None
192
 
193
  # ══════════════════════════════════════════════════════════════════════════════
194
+ # 5. GENERATION — ★ @spaces.GPU on Gradio fn (핵심 수정) ★
195
+ # ZeroGPU는 Gradio 이벤트 함수에 @spaces.GPU가 있어야 감지합니다.
196
+ # 내부 서브함수가 아닌, ChatInterface의 fn에 직접 데코레이션!
197
  # ══════════════════════════════════════════════════════════════════════════════
198
  @spaces.GPU(duration=180)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  def generate_reply(
200
  message: str,
201
  history: list,
 
215
  if system_prompt.strip():
216
  messages.append({"role": "system", "content": system_prompt.strip()})
217
 
 
218
  for turn in history:
219
  if isinstance(turn, dict):
220
  role = turn.get("role", "")
 
254
  if pil_image:
255
  has_image = True
256
 
257
+ if IS_VISION and has_image and pil_image:
 
258
  messages.append({
259
  "role": "user",
260
  "content": [
 
269
  try:
270
  if IS_VISION and processor is not None:
271
  text_prompt = processor.apply_chat_template(
272
+ messages, tokenize=False, add_generation_prompt=True,
 
 
273
  )
274
  if has_image and pil_image:
275
  inputs = processor(
276
+ text=[text_prompt], images=[pil_image],
277
+ return_tensors="pt", padding=True,
 
 
278
  )
279
  else:
280
  inputs = processor(
281
+ text=[text_prompt], return_tensors="pt", padding=True,
 
 
282
  )
283
  else:
 
284
  text_prompt = tokenizer.apply_chat_template(
285
+ messages, tokenize=False, add_generation_prompt=True,
 
 
286
  )
287
  inputs = tokenizer(text_prompt, return_tensors="pt")
288
  except Exception as e:
289
  yield f"**❌ Tokenization error:** `{e}`"
290
  return
291
 
292
+ # ── GPU둜 이동 ──
293
+ inputs = {k: v.to(model.device) if hasattr(v, 'to') else v for k, v in inputs.items()}
 
294
 
295
+ # ── Streamer ──
296
+ streamer = TextIteratorStreamer(_tok, skip_special_tokens=True, skip_prompt=True)
 
 
 
297
 
298
+ input_len = inputs["input_ids"].shape[-1]
299
+ print(f"[GEN] tokens={input_len}, max_new={max_new_tokens}, "
300
  f"temp={temperature}, vision={has_image}", flush=True)
301
 
302
+ # ── generate β†’ 별도 μŠ€λ ˆλ“œ (GPU μ»¨ν…μŠ€νŠΈλŠ” 이 ν•¨μˆ˜κ°€ μœ μ§€) ──
303
+ gen_kwargs = dict(
304
+ **inputs,
305
+ max_new_tokens=max_new_tokens,
306
+ do_sample=temperature > 0.01,
307
+ temperature=max(temperature, 0.01) if temperature > 0.01 else 1.0,
308
+ top_p=float(top_p),
309
+ streamer=streamer,
310
+ use_cache=True,
 
 
 
 
311
  )
312
+
313
+ thread = Thread(target=model.generate, kwargs=gen_kwargs)
314
  thread.start()
315
 
316
  output = ""
 
443
  @fapp.get("/health")
444
  async def health():
445
  return {
446
+ "status": "ok", "model": MODEL_ID,
447
+ "vision": IS_VISION, "dtype": str(model.dtype),
 
 
 
448
  }
449
 
 
450
  BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
451
 
452
  @fapp.post("/api/search")
 
471
  except Exception as e:
472
  return JSONResponse({"error": str(e)}, status_code=500)
473
 
 
474
  @fapp.post("/api/extract-pdf")
475
  async def api_extract_pdf(request: Request):
476
  try:
 
493
  except Exception as e:
494
  return JSONResponse({"error": str(e)}, status_code=500)
495
 
496
+ # ══════════════════════════════════════════════════════════════════════════════
497
+ # 8. MOUNT & LAUNCH
498
+ # ★ 핵심: uvicorn.run() 사용 금지! ★
499
+ # HF Spaces ZeroGPU 런타임이 모듈 스캔 → 'app' 변수 감지 → 자동 서빙.
500
+ # uvicorn.run()을 호출하면 ZeroGPU wrapper를 우회하여 즉시 종료됨.
501
+ # ══════════════════════════════════════════════════════════════════════════════
502
  app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
503
 
504
+ print("[BOOT] Darwin-35B-A3B-Opus Β· ZeroGPU Direct Serving Β· Ready", flush=True)
505
+
506
+ # ── 둜컬 개발 μ „μš© (SPACE_ID 없을 λ•Œλ§Œ) ──
507
+ if __name__ == "__main__" and not os.getenv("SPACE_ID"):
508
+ import uvicorn
509
  uvicorn.run(app, host="0.0.0.0", port=7860)