aciang committed
Commit 30ba118 · verified · 1 Parent(s): 272b97f

Init/Update LanguageBridge Multimodal Chatbot Space (final)

Files changed (3)
  1. README.md +16 -6
  2. app.py +85 -75
  3. requirements.txt +1 -3
README.md CHANGED
@@ -1,10 +1,20 @@
  # LanguageBridge — Multimodal Chatbot (Mistral-7B)
 
- Text comes first for stability; image/audio can be toggled with the USE_IMAGE/USE_AUDIO variables.
 
  - Core model: `aciang/mistral7b-tk-sft-20251019-merged`
- - To enable image/audio:
- - Go to **Settings → Variables** and add:
- - `USE_IMAGE=1`
- - `USE_AUDIO=1`
- - If you run into dependency conflicts, switch to **T4** or **A10G** under **Settings → Runtime**.
+ ---
+ title: LanguageBridge — Multimodal Chatbot (Mistral-7B)
+ emoji: 🌉
+ colorFrom: indigo
+ colorTo: blue
+ sdk: gradio
+ app_file: app.py
+ pinned: false
+ license: other
+ ---
+
  # LanguageBridge — Multimodal Chatbot (Mistral-7B)
 
+ Text-first for stable inference; image/audio can be toggled via Space Variables (USE_IMAGE/USE_AUDIO).
 
  - Core model: `aciang/mistral7b-tk-sft-20251019-merged`
+ - **Enable image/audio**: go to **Settings → Variables** and add
+   - `USE_IMAGE=1`
+   - `USE_AUDIO=1`
+ - If you hit CUDA/dependency conflicts, switch to T4 or A10G under **Settings → Hardware**.
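The **Settings → Variables** steps above can also be scripted instead of clicked through. A minimal sketch, assuming a recent `huggingface_hub` and a token with write access to the Space; the Space id here is hypothetical:

```python
# Set the feature toggles on the Space, then restart so app.py re-reads them.
from huggingface_hub import HfApi

api = HfApi()  # picks up HF_TOKEN from the environment
space_id = "aciang/LanguageBridge"  # hypothetical Space id, substitute your own

api.add_space_variable(space_id, key="USE_IMAGE", value="1")
api.add_space_variable(space_id, key="USE_AUDIO", value="1")
api.restart_space(space_id)  # variables are read via os.getenv at startup
```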
app.py CHANGED
@@ -1,96 +1,104 @@
 
- import os, torch, time
  import gradio as gr
  from transformers import AutoTokenizer, AutoModelForCausalLM
 
- TITLE = os.getenv("SPACE_TITLE", "LanguageBridge — Multimodal Chatbot (Mistral-7B)")
- MODEL_ID = os.getenv("MODEL_ID", "aciang/mistral7b-tk-sft-20251019-merged")
  SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT",
-     "你是語言橋助教,回答務必:條列、準確、可引用部落知識與科學。必要時給出步驟清單。")
 
- # Override via Space Variables: USE_IMAGE=1 / USE_AUDIO=1
- USE_IMAGE = os.getenv("USE_IMAGE", "0") in ("1", "true", "True")
- USE_AUDIO = os.getenv("USE_AUDIO", "0") in ("1", "true", "True")
 
  def load_llm():
      dtype = torch.float16 if torch.cuda.is_available() else torch.float32
      try:
-         model = AutoModelForCausalLM.from_pretrained(
-             MODEL_ID, torch_dtype=dtype, device_map="auto"
-         )
      except Exception as e:
-         print(f"[Fallback CPU] load error: {e}")
-         model = AutoModelForCausalLM.from_pretrained(
-             MODEL_ID, torch_dtype=torch.float32, device_map=None
-         )
      tok = AutoTokenizer.from_pretrained(MODEL_ID)
      return tok, model
 
  tokenizer, llm = load_llm()
  llm.eval()
 
- def lazy_load_captioner():
      try:
          from transformers import BlipProcessor, BlipForConditionalGeneration
          cap_id = os.getenv("CAPTION_MODEL_ID", "Salesforce/blip-image-captioning-base")
-         proc = BlipProcessor.from_pretrained(cap_id)
-         vmod = BlipForConditionalGeneration.from_pretrained(
              cap_id, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
          )
-         if torch.cuda.is_available():
-             vmod = vmod.to("cuda")
-         return proc, vmod
      except Exception as e:
          print(f"[Image OFF] {e}")
-         return None, None
-
- CAP_PROC, CAP_MOD = (None, None)
- if USE_IMAGE:
-     CAP_PROC, CAP_MOD = lazy_load_captioner()
-
- def lazy_load_asr():
      try:
          import whisper
          asr_id = os.getenv("ASR_MODEL_ID", "tiny")
-         return whisper.load_model(asr_id)
      except Exception as e:
          print(f"[Audio OFF] {e}")
-         return None
-
- ASR = lazy_load_asr() if USE_AUDIO else None
 
  @torch.inference_mode()
  def generate_reply(history, image, audio, max_new_tokens, temperature, top_p):
-     sys_prompt = SYSTEM_PROMPT
-     user_parts = []
-
-     if image is not None and USE_IMAGE and CAP_PROC and CAP_MOD:
-         try:
-             from PIL import Image
-             im = Image.open(image).convert("RGB")
-             inputs = CAP_PROC(im, return_tensors="pt").to(CAP_MOD.device)
-             out = CAP_MOD.generate(**inputs, max_new_tokens=64)
-             cap = CAP_PROC.decode(out[0], skip_special_tokens=True)
-             user_parts.append(f"[影像描述] {cap}")
-         except Exception as e:
-             user_parts.append(f"[影像處理失敗: {e}]")
-
-     if audio is not None and USE_AUDIO and ASR is not None:
-         try:
-             result = ASR.transcribe(audio, fp16=torch.cuda.is_available())
-             user_parts.append(f"[語音辨識] {result.get('text','')}")
-         except Exception as e:
-             user_parts.append(f"[語音處理失敗: {e}]")
-
-     text = ""
-     for role, msg in reversed(history or []):
-         if role == "user":
-             text = msg.get("content", "") if isinstance(msg, dict) else str(msg)
-             break
-     if user_parts:
-         text = (text + "\n" if text else "") + "\n".join(user_parts)
-
-     prompt = f"{sys_prompt}\n\n使用者:{text}\n助教:"
      inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
      out = llm.generate(
          **inputs,
@@ -102,23 +110,23 @@ def generate_reply(history, image, audio, max_new_tokens, temperature, top_p):
          pad_token_id=tokenizer.eos_token_id
      )
      ans = tokenizer.decode(out[0], skip_special_tokens=True)
-     if "助教:" in ans:
-         ans = ans.split("助教:", 1)[-1].strip()
      return ans
 
  with gr.Blocks(title=TITLE, fill_height=True) as demo:
-     gr.Markdown(f"## {TITLE}\n- Core model: `{MODEL_ID}`\n- Image/audio are off by default (enable via Space Variables: `USE_IMAGE=1` / `USE_AUDIO=1`)")
      with gr.Row():
-         chat = gr.Chatbot(height=450, type="messages", show_copy_button=True)
          with gr.Column(scale=0):
              user_txt = gr.Textbox(label="Your question / instruction", placeholder="Type text…", interactive=True)
-             img = gr.Image(label="(Optional) Upload an image", type="filepath", visible=bool(USE_IMAGE), interactive=True)
-             aud = gr.Audio(label="(Optional) Upload audio", type="filepath", sources=["upload","microphone"], visible=bool(USE_AUDIO), interactive=True)
              mx = gr.Slider(64, 1024, value=512, step=32, label="max_new_tokens")
-             tp = gr.Slider(0.1, 1.2, value=0.6, step=0.05, label="temperature")
-             top = gr.Slider(0.5, 1.0, value=0.95, step=0.01, label="top_p")
              btn = gr.Button("Send 🚀", variant="primary")
-             clr = gr.Button("Clear chat")
 
      def respond(history, text, image, audio, mx, tp, top):
          history = history or []
@@ -133,8 +141,10 @@ with gr.Blocks(title=TITLE, fill_height=True) as demo:
      btn.click(respond, inputs=[chat, user_txt, img, aud, mx, tp, top], outputs=[chat, user_txt])
      clr.click(lambda: ([], ""), outputs=[chat, user_txt])
 
- try:
-     demo.queue().launch(share=False, show_error=True, show_api=False)
- except Exception as e:
-     print(f"[local launch failed, retrying with share=True] {e}")
-     demo.queue().launch(share=True, show_error=True, show_api=False)
 
 
 
 
+ import os, torch
  import gradio as gr
  from transformers import AutoTokenizer, AutoModelForCausalLM
 
+ TITLE = os.getenv("SPACE_TITLE", "LanguageBridge — Multimodal Chatbot (Mistral-7B)")
+ MODEL_ID = os.getenv("MODEL_ID", "aciang/mistral7b-tk-sft-20251019-merged")
+
  SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT",
+     "你是語言橋助教,回答務必:條列、準確,能連結部落知識與科學方法;必要時提供清楚步驟。")
 
+ USE_IMAGE = os.getenv("USE_IMAGE", "0") in ("1", "true", "True")
+ USE_AUDIO = os.getenv("USE_AUDIO", "0") in ("1", "true", "True")
 
  def load_llm():
      dtype = torch.float16 if torch.cuda.is_available() else torch.float32
      try:
+         model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=dtype, device_map="auto")
      except Exception as e:
+         print(f"[LLM fallback CPU] {e}")
+         model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32, device_map=None)
      tok = AutoTokenizer.from_pretrained(MODEL_ID)
      return tok, model
 
  tokenizer, llm = load_llm()
  llm.eval()
 
+ # (Optional) lazily loaded image captioner
+ CAP_PROC, CAP_MOD = None, None
+ def ensure_captioner():
+     global CAP_PROC, CAP_MOD
+     if CAP_PROC is not None:
+         return CAP_PROC, CAP_MOD
      try:
          from transformers import BlipProcessor, BlipForConditionalGeneration
          cap_id = os.getenv("CAPTION_MODEL_ID", "Salesforce/blip-image-captioning-base")
+         CAP_PROC = BlipProcessor.from_pretrained(cap_id)
+         CAP_MOD = BlipForConditionalGeneration.from_pretrained(
              cap_id, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
          )
+         if torch.cuda.is_available():
+             CAP_MOD.to("cuda")
      except Exception as e:
          print(f"[Image OFF] {e}")
+         CAP_PROC, CAP_MOD = None, None
+     return CAP_PROC, CAP_MOD
+
+ # (Optional) lazily loaded Whisper ASR
+ ASR = None
+ def ensure_asr():
+     global ASR
+     if ASR is not None:
+         return ASR
      try:
          import whisper
          asr_id = os.getenv("ASR_MODEL_ID", "tiny")
+         ASR = whisper.load_model(asr_id)
      except Exception as e:
          print(f"[Audio OFF] {e}")
+         ASR = None
+     return ASR
 
  @torch.inference_mode()
  def generate_reply(history, image, audio, max_new_tokens, temperature, top_p):
+     # Grab the latest user turn's text; gr.Chatbot(type="messages") stores
+     # {"role": ..., "content": ...} dicts, legacy format stores (role, msg) pairs.
+     user_text = ""
+     for m in (history or [])[::-1]:
+         role = m.get("role") if isinstance(m, dict) else m[0]
+         if role == "user":
+             content = m.get("content", "") if isinstance(m, dict) else m[1]
+             user_text = content if isinstance(content, str) else str(content)
+             break
+
+     extra_parts = []
+
+     # Image → caption
+     if image and USE_IMAGE:
+         proc, vmod = ensure_captioner()
+         if proc and vmod:
+             try:
+                 from PIL import Image
+                 im = Image.open(image).convert("RGB")
+                 inputs = proc(im, return_tensors="pt").to(vmod.device)
+                 if vmod.dtype == torch.float16:
+                     # match the captioner's half-precision weights on GPU
+                     inputs["pixel_values"] = inputs["pixel_values"].half()
+                 out = vmod.generate(**inputs, max_new_tokens=64)
+                 cap = proc.decode(out[0], skip_special_tokens=True)
+                 extra_parts.append(f"[影像描述] {cap}")
+             except Exception as e:
+                 extra_parts.append(f"[影像處理失敗] {e}")
+
+     # Audio → transcript
+     if audio and USE_AUDIO:
+         asr = ensure_asr()
+         if asr:
+             try:
+                 res = asr.transcribe(audio, fp16=torch.cuda.is_available())
+                 extra_parts.append(f"[語音辨識] {res.get('text','')}")
+             except Exception as e:
+                 extra_parts.append(f"[語音處理失敗] {e}")
+
+     if extra_parts:
+         user_text = (user_text + "\n" if user_text else "") + "\n".join(extra_parts)
+
+     prompt = f"{SYSTEM_PROMPT}\n\n使用者:{user_text}\n助教:"
      inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
      out = llm.generate(
          **inputs,
 
          pad_token_id=tokenizer.eos_token_id
      )
      ans = tokenizer.decode(out[0], skip_special_tokens=True)
+     if "助教:" in ans:
+         ans = ans.split("助教:", 1)[-1].strip()
      return ans
 
  with gr.Blocks(title=TITLE, fill_height=True) as demo:
+     gr.Markdown(f"## {TITLE}\n- Model: `{MODEL_ID}`\n- Image/audio are off by default (enable via Variables `USE_IMAGE=1` / `USE_AUDIO=1`)")
+
      with gr.Row():
+         chat = gr.Chatbot(height=460, type="messages", show_copy_button=True)
          with gr.Column(scale=0):
              user_txt = gr.Textbox(label="Your question / instruction", placeholder="Type text…", interactive=True)
+             img = gr.Image(label="(Optional) Image", type="filepath", visible=bool(USE_IMAGE), interactive=True)
+             aud = gr.Audio(label="(Optional) Audio", type="filepath", sources=["upload","microphone"], visible=bool(USE_AUDIO), interactive=True)
              mx = gr.Slider(64, 1024, value=512, step=32, label="max_new_tokens")
+             tp = gr.Slider(0.1, 1.2, value=0.6, step=0.05, label="temperature")
+             top = gr.Slider(0.5, 1.0, value=0.95, step=0.01, label="top_p")
              btn = gr.Button("Send 🚀", variant="primary")
+             clr = gr.Button("Clear")
 
      def respond(history, text, image, audio, mx, tp, top):
          history = history or []
 
      btn.click(respond, inputs=[chat, user_txt, img, aud, mx, tp, top], outputs=[chat, user_txt])
      clr.click(lambda: ([], ""), outputs=[chat, user_txt])
 
+ # Launch only when this file is executed directly; HF Spaces runs it
+ # automatically, and no manual launch is needed from Colab.
+ if __name__ == "__main__":
+     try:
+         demo.queue().launch(share=False, show_error=True, show_api=False)
+     except Exception as e:
+         print(f"[local launch failed, retrying with share=True] {e}")
+         demo.queue().launch(share=True, show_error=True, show_api=False)
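One detail worth calling out in the rewritten `generate_reply`: `tokenizer.decode(out[0], ...)` returns the prompt together with the continuation, which is why the reply is cut at the `助教:` marker. A minimal illustration with a stubbed generation (the decoded text is invented for the example):

```python
# Mirrors the prompt template and the post-processing split in app.py.
SYSTEM_PROMPT = "你是語言橋助教……"
user_text = "請用三點說明小米田的輪作"

prompt = f"{SYSTEM_PROMPT}\n\n使用者:{user_text}\n助教:"

# Stand-in for tokenizer.decode(out[0], skip_special_tokens=True):
# the decoded sequence echoes the prompt, then the model's continuation.
decoded = prompt + "1. 輪作年限……\n2. 地力恢復……\n3. 品種輪替……"

ans = decoded.split("助教:", 1)[-1].strip()
print(ans)  # only the text after the 助教: marker
```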
requirements.txt CHANGED
@@ -2,10 +2,8 @@ gradio>=4.44.0
  transformers>=4.44.0
  accelerate>=0.31.0
  bitsandbytes
- torch
  huggingface_hub
- # To enable image/audio, set USE_IMAGE/USE_AUDIO=1 in the Space Variables,
- # and uncomment the dependencies below as needed (or pick T4/A10G under Runtime)
  # pillow
  # torchaudio
  # soundfile
 
  transformers>=4.44.0
  accelerate>=0.31.0
  bitsandbytes
  huggingface_hub
+ # Uncomment the lines below only when image/audio are needed (or install them in the Space's Docker environment)
  # pillow
  # torchaudio
  # soundfile
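Since pillow/torchaudio/soundfile stay commented out, it can save a restart cycle to check at runtime which optional pieces are importable before flipping `USE_IMAGE`/`USE_AUDIO`. A small sketch mirroring the try/except gating in app.py; it assumes `whisper` would come from the `openai-whisper` package:

```python
# Report which optional image/audio dependencies can actually be imported.
def optional_deps_status():
    status = {}
    for mod in ("PIL", "torchaudio", "soundfile", "whisper"):
        try:
            __import__(mod)
            status[mod] = "available"
        except Exception as exc:
            status[mod] = f"missing ({exc.__class__.__name__})"
    return status

if __name__ == "__main__":
    print(optional_deps_status())
```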