aciang committed
Commit cb123e0 · verified · 1 Parent(s): a4432b4

S2 hotfix: app.py

Files changed (1)
  1. app.py +65 -78
app.py CHANGED
@@ -1,99 +1,86 @@
-import os, time, threading, torch, gradio as gr
-from huggingface_hub import snapshot_download
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer
-
-SPACE_TITLE = "LanguageBridge Multimodal Chatbot (Mistral-7B)"
-PRIMARY_MODEL = "aciang/mistral7b-tk-sft-20251019-merged"
-FALLBACK_MODEL = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit"
-
-# ---- Speed up downloads + pin the cache (/data is persisted on Spaces) ----
-os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
-os.environ.setdefault("HF_HOME", "/data/.cache/hf")  # persistent cache
-os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache/hf/transformers")
-os.makedirs(os.environ["HF_HOME"], exist_ok=True)
-
-# ---- Try to pull the tokenizer locally first (so the UI loads in seconds) ----
-def _ensure_tokenizer(model_id):
     try:
-        snapshot_download(model_id, allow_patterns=["tokenizer.*", "*tokenizer*", "special_tokens_map.json"], local_dir=None)
     except Exception as e:
-        print("[tok prefetch] skip:", e)
-
-# ---- Model loading (with 4-bit fallback) ----
-def load_llm(prefer_primary=True):
-    model_id = PRIMARY_MODEL if prefer_primary else FALLBACK_MODEL
-    use_4bit = (model_id != PRIMARY_MODEL)
-
-    if use_4bit:
-        bnb = BitsAndBytesConfig(
-            load_in_4bit=True, bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16
         )
-        kw = dict(device_map="auto", quantization_config=bnb, trust_remote_code=False)
-    else:
-        kw = dict(device_map="auto", trust_remote_code=False)
-
-    print(f"[load] try model = {model_id} | 4bit={use_4bit}")
-    t0 = time.time()
-    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
-    if tok.pad_token is None: tok.pad_token = tok.eos_token
-    tok.padding_side = "left"
-    mdl = AutoModelForCausalLM.from_pretrained(model_id, **kw)
-    mdl.eval()
-    print(f"[load] ok in {time.time()-t0:.1f}s")
-    return tok, mdl, model_id
-
-# ---- Boot logic: load PRIMARY in a background thread; switch to FALLBACK on timeout ----
-tokenizer = None
-llm = None
-active_model = None
-
-def boot():
-    global tokenizer, llm, active_model
-    _ensure_tokenizer(PRIMARY_MODEL)
-    deadline = time.time() + 14*60  # switch if not loaded within 14 minutes (16-minute buffer, under the 30-minute limit)
-    try:
-        tokenizer, llm, active_model = load_llm(prefer_primary=True)
-    except Exception as e:
-        print("[boot] primary failed early:", e)
-    if llm is None or time.time() > deadline:
-        print("[boot] switching to FALLBACK for fast availability...")
-        tokenizer, llm, active_model = load_llm(prefer_primary=False)
-
-boot_th = threading.Thread(target=boot); boot_th.start()
-
-SYSTEM = (
-    "你是語言橋助教。回覆重點:1) 條列步驟 2) 簡潔正確 3) 不確定就說明不足並提出假設。"
-)
-
-@torch.inference_mode()
-def stream_answer(q, mx=256, temp=0.6, top_p=0.95):
-    boot_th.join()  # make sure loading has finished
-    prompt = f"{SYSTEM}\n\n使用者:{q}\n助教:"
-    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     gen = dict(**inputs, streamer=streamer, max_new_tokens=int(mx),
                temperature=float(temp), top_p=float(top_p),
-               do_sample=True if float(temp) > 0 else False,
-               eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id)
-    t = threading.Thread(target=llm.generate, kwargs=gen); t.start()
     buf = ""
     for tok in streamer:
         buf += tok
         yield buf

-with gr.Blocks(title=SPACE_TITLE, fill_height=True) as demo:
-    gr.Markdown(f"### {SPACE_TITLE}\n目前模型:`{active_model or 'loading…'}`\n(首次啟動若超時將自動切到 4-bit 權重)")
-    q = gr.Textbox(label="你的問題 / 指令")
-    mx = gr.Slider(64, 1024, value=512, step=32, label="max_new_tokens")
-    tp = gr.Slider(0.0, 1.2, value=0.6, step=0.05, label="temperature")
-    top = gr.Slider(0.5, 1.0, value=0.95, step=0.01, label="top_p")
-    go = gr.Button("送出 🚀", variant="primary")
-    out = gr.Textbox(label="輸出", lines=12)
-    go.click(stream_answer, inputs=[q, mx, tp, top], outputs=out)
-    demo.queue(api_open=False)

 if __name__ == "__main__":
     demo.launch(share=False, show_error=True)
 
+import os, time, torch, gradio as gr
+os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # speed up the first download
+
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, BitsAndBytesConfig
+
+TITLE = os.getenv("SPACE_TITLE", "LanguageBridge Multimodal Chatbot (Mistral-7B)")
+MODEL_ID = os.getenv("MODEL_ID", "aciang/mistral7b-tk-sft-20251019-merged")
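+# Title and model are read from the SPACE_TITLE / MODEL_ID environment variables, with the defaults above.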
 
 
 
+SYSTEM_PROMPT = (
+    "你是『語言橋』助教。回答原則:條列、準確、可重現步驟;不足處要誠實說明。"
+)
+
+_tok, _llm = None, None
+def load_llm():
+    global _tok, _llm
+    if _llm is not None:
+        return _tok, _llm
+    # 4-bit first (falls back automatically on failure)
+    bnb = BitsAndBytesConfig(
+        load_in_4bit=True, bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16
+    )
+    kwargs = dict(device_map="auto", trust_remote_code=False, quantization_config=bnb)
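+    # If the 4-bit load raises (e.g. no GPU, bitsandbytes unavailable), the except branch below retries without quantization.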
     try:
+        _llm = AutoModelForCausalLM.from_pretrained(MODEL_ID, **kwargs)
     except Exception as e:
+        print("[4-bit failed] fallback:", e)
+        _llm = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            torch_dtype=(torch.float16 if torch.cuda.is_available() else torch.float32),
+            device_map=("auto" if torch.cuda.is_available() else None),
+            trust_remote_code=False
         )
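+    # Shared post-load setup: decoder-only checkpoints often ship without a pad token, so reuse EOS and pad on the left.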
+    _tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+    if _tok.pad_token is None: _tok.pad_token = _tok.eos_token
+    _tok.padding_side = "left"
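+    # Perf tweaks: allow TF32 matmuls on supported GPUs; keep the KV cache on for faster decoding.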
+    if torch.cuda.is_available(): torch.backends.cuda.matmul.allow_tf32 = True
+    _llm.config.use_cache = True
+    return _tok, _llm
+
+def format_prompt(user_text: str) -> str:
+    return f"{SYSTEM_PROMPT}\n\n使用者:{user_text}\n助教:"
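+# Streaming pattern: generate() runs in a background thread while this generator yields the accumulated text to Gradio as tokens arrive.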
 
+@torch.inference_mode()
+def generate(user_text, mx=256, temp=0.6, top_p=0.95):
+    global _tok, _llm
+    if _llm is None:
+        yield "(正在載入模型,首次需要數十秒到數分鐘,請稍候…)"
+        _tok, _llm = load_llm()
+        yield "(模型載入完成,開始回應…)"
+
+    prompt = format_prompt(user_text)
+    inputs = _tok(prompt, return_tensors="pt").to(_llm.device)
+
+    streamer = TextIteratorStreamer(_tok, skip_prompt=True, skip_special_tokens=True)
     gen = dict(**inputs, streamer=streamer, max_new_tokens=int(mx),
                temperature=float(temp), top_p=float(top_p),
+               do_sample=True, eos_token_id=_tok.eos_token_id, pad_token_id=_tok.pad_token_id)
+    import threading
+    t = threading.Thread(target=_llm.generate, kwargs=gen); t.start()
+
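+    # Each yield replaces the output Textbox with everything generated so far.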
     buf = ""
     for tok in streamer:
         buf += tok
         yield buf

+with gr.Blocks(title=TITLE, fill_height=True) as demo:
+    gr.Markdown(f"## {TITLE}\n模型:`{MODEL_ID}`(延遲載入)")
+    chat_in = gr.Textbox(label="你的問題 / 指令", placeholder="輸入文字…", lines=4)
+    with gr.Row():
+        mx = gr.Slider(64, 1024, value=256, step=32, label="max_new_tokens")
+        temp = gr.Slider(0.1, 1.0, value=0.6, step=0.05, label="temperature")
+        top = gr.Slider(0.5, 1.0, value=0.95, step=0.01, label="top_p")
+    go = gr.Button("送出 🚀", variant="primary")
+    out = gr.Textbox(label="輸出(流式)", lines=18)
+    clr = gr.Button("清除")
+
+    go.click(generate, inputs=[chat_in, mx, temp, top], outputs=out)
+    clr.click(lambda: "", outputs=out)
+
+demo.queue(max_size=32, api_open=False)
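+# max_size=32 bounds the request queue; api_open=False blocks direct API calls that would bypass it.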

 if __name__ == "__main__":
     demo.launch(share=False, show_error=True)