Files changed (1) hide show
  1. app.py +30 -2026
app.py CHANGED
@@ -1,2035 +1,39 @@
1
- import os
2
- import json
3
- import uuid
4
- import time
5
- import copy
6
- import base64
7
- import logging
8
- import argparse
9
- import math
10
- import multiprocessing as mp
11
- from io import BytesIO
12
- from typing import Generator, Any, Dict, Optional
13
-
14
- import spaces
15
- import torch
16
  import gradio as gr
17
- import numpy as np
18
- from PIL import Image
19
- from decord import VideoReader, cpu
20
- from scipy.spatial import cKDTree
21
- # import modelscope_studio as mgr
22
-
23
- # 导入模型相关模块
24
- try:
25
- from models import ModelMiniCPMV4_5
26
- except ImportError:
27
- print("Warning: models module not found. Please ensure models.py is available.")
28
- class ModelMiniCPMV4_5:
29
- def __init__(self, model_path):
30
- self.model_path = model_path
31
- self.model = None
32
-
33
- def __call__(self, query):
34
- return "Model not loaded", 0
35
-
36
- # 全局配置
37
- ERROR_MSG = "Error, please retry"
38
- model_name = 'MiniCPM-V 4.5'
39
- disable_text_only = False # 允许纯文本消息,便于测试
40
- DOUBLE_FRAME_DURATION = 30
41
- MAX_NUM_FRAMES = 180
42
- MAX_NUM_PACKING = 3
43
- TIME_SCALE = 0.1
44
- IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
45
- VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}
46
-
47
- ENABLE_PARALLEL_ENCODING = True
48
- PARALLEL_PROCESSES = None
49
-
50
- # 全局模型实例
51
- global_model = None
52
-
53
- # 日志配置
54
- logging.basicConfig(level=logging.INFO)
55
- logger = logging.getLogger(__name__)
56
-
57
-
58
- # 全局模型配置
59
- model_config = {
60
- 'model_path': None,
61
- 'model_type': None,
62
- 'instance_id': 0
63
- }
64
-
65
- # 全局模型缓存(在GPU进程中)
66
- _gpu_model_cache = None
67
-
68
- def _initialize_gpu_model():
69
- """在GPU进程中获取模型并移到GPU"""
70
- global _gpu_model_cache
71
- if _gpu_model_cache is None:
72
- logger.info(f"在GPU进程中初始化模型: {model_config['model_type']}")
73
-
74
- match model_config['model_type'].lower():
75
- case 'minicpmv4_5':
76
- _gpu_model_cache = ModelMiniCPMV4_5(model_config['model_path'])
77
- case _:
78
- raise ValueError(f"Unsupported model type: {model_config['model_type']}")
79
-
80
- logger.info(f"模型在CPU上初始化完成")
81
-
82
- # 每次推理时将模型移到GPU
83
- if hasattr(_gpu_model_cache, 'model') and hasattr(_gpu_model_cache.model, 'to'):
84
- logger.info("将模型移到GPU...")
85
- _gpu_model_cache.model.to('cuda')
86
- elif hasattr(_gpu_model_cache, 'model') and hasattr(_gpu_model_cache.model, 'model') and hasattr(_gpu_model_cache.model.model, 'to'):
87
- logger.info("将模型移到GPU(嵌套模型)...")
88
- _gpu_model_cache.model.model.to('cuda')
89
-
90
- return _gpu_model_cache
91
-
92
- @spaces.GPU
93
- def gpu_handler(query):
94
- """GPU推理处理器"""
95
- model = _initialize_gpu_model()
96
-
97
- res, output_tokens = model({
98
- "image": query["image"],
99
- "question": query["question"],
100
- "params": query.get("params", "{}"),
101
- "temporal_ids": query.get("temporal_ids", None)
102
- })
103
- return {
104
- "result": res,
105
- "usage": {"output_tokens": output_tokens}
106
- }
107
-
108
- @spaces.GPU
109
- def gpu_stream_handler(query):
110
- """GPU流式推理处理器"""
111
- model = _initialize_gpu_model()
112
-
113
- params = json.loads(query.get("params", "{}"))
114
- params["stream"] = True
115
- query["params"] = json.dumps(params)
116
-
117
- try:
118
- generator = model({
119
- "image": query["image"],
120
- "question": query["question"],
121
- "params": query["params"],
122
- "temporal_ids": query.get("temporal_ids", None)
123
- })
124
-
125
- # 收集生成器的所有输出,避免序列化问题
126
- full_response = ""
127
- for chunk in generator:
128
- full_response += chunk
129
-
130
- return full_response
131
- except Exception as e:
132
- logger.error(f"GPU stream handler error: {e}")
133
- return f"Stream error: {str(e)}"
134
-
135
- class Model:
136
- """模型封装类,不持有实际模型对象"""
137
-
138
- def __init__(self, model_path: str, model_type: str, instance_id: int = 0):
139
- self.instance_id = instance_id
140
- self.model_path = model_path
141
- self.model_type = model_type
142
-
143
- # 设置全局配置
144
- model_config['model_path'] = model_path
145
- model_config['model_type'] = model_type
146
- model_config['instance_id'] = instance_id
147
-
148
- logger.info(f"实例 {instance_id}: 配置模型类型 {model_type}")
149
- logger.info(f"实例 {instance_id}: 模型路径 {model_path}")
150
-
151
- def handler(self, query):
152
- """非流式推理处理器"""
153
- return gpu_handler(query)
154
-
155
- def stream_handler(self, query):
156
- """流式推理处理器"""
157
- return gpu_stream_handler(query)
158
-
159
-
160
- def initialize_model():
161
- """初始化全局模型"""
162
- global global_model, _gpu_model_cache
163
-
164
- # 默认配置
165
- model_path = os.getenv('MODEL_PATH', 'openbmb/MiniCPM-V-4_5')
166
- model_type = os.getenv('MODEL_TYPE', 'minicpmv4_5')
167
-
168
- logger.info(f"="*50)
169
- logger.info(f"启动MiniCPM-V服务")
170
- logger.info(f"模型路径: {model_path}")
171
- logger.info(f"模型类型: {model_type}")
172
- logger.info(f"="*50)
173
-
174
- # 创建模型封装类
175
- global_model = Model(model_path, model_type, 0)
176
-
177
- # 在主进程中预加载模型到CPU(可选,为了更快的首次推理)
178
- try:
179
- logger.info("在主进程中预加载模型到CPU...")
180
- match model_type.lower():
181
- case 'minicpmv4_5':
182
- _gpu_model_cache = ModelMiniCPMV4_5(model_path)
183
- case _:
184
- raise ValueError(f"Unsupported model type: {model_type}")
185
-
186
- logger.info("模型在主进程CPU上预加载完成")
187
- except Exception as e:
188
- logger.warning(f"主进程预加载模型失败,将在GPU进程中加载: {e}")
189
- _gpu_model_cache = None
190
-
191
- return global_model
192
-
193
-
194
- # 工具函数
195
- def get_file_extension(filename):
196
- return os.path.splitext(filename)[1].lower()
197
-
198
-
199
- def is_image(filename):
200
- return get_file_extension(filename) in IMAGE_EXTENSIONS
201
-
202
 
203
- def is_video(filename):
204
- return get_file_extension(filename) in VIDEO_EXTENSIONS
205
 
206
-
207
- def map_to_nearest_scale(values, scale):
208
- tree = cKDTree(np.asarray(scale)[:, None])
209
- _, indices = tree.query(np.asarray(values)[:, None])
210
- return np.asarray(scale)[indices]
211
-
212
-
213
- def group_array(arr, size):
214
- return [arr[i:i+size] for i in range(0, len(arr), size)]
215
-
216
-
217
- def encode_image(image):
218
- """编码单张图片"""
219
- if not isinstance(image, Image.Image):
220
- if hasattr(image, 'path'):
221
- image = Image.open(image.path)
222
- elif hasattr(image, 'file') and hasattr(image.file, 'path'):
223
- image = Image.open(image.file.path)
224
- elif hasattr(image, 'name'):
225
- image = Image.open(image.name)
226
- else:
227
- image_path = getattr(image, 'url', getattr(image, 'orig_name', str(image)))
228
- image = Image.open(image_path)
229
-
230
- # 调整图片大小
231
- max_size = 448*16
232
- if max(image.size) > max_size:
233
- w, h = image.size
234
- if w > h:
235
- new_w = max_size
236
- new_h = int(h * max_size / w)
237
- else:
238
- new_h = max_size
239
- new_w = int(w * max_size / h)
240
- image = image.resize((new_w, new_h), resample=Image.BICUBIC)
241
-
242
- # 转换为base64
243
- buffered = BytesIO()
244
- image.save(buffered, format="png")
245
- im_b64 = base64.b64encode(buffered.getvalue()).decode()
246
- return [{"type": "image", "pairs": im_b64}]
247
-
248
-
249
- def encode_image_parallel(image_data):
250
- """并行图片编码包装函数"""
251
- try:
252
- return encode_image(image_data)
253
- except Exception as e:
254
- print(f"[Parallel encoding error] Image encoding failed: {e}")
255
- return None
256
-
257
-
258
- def encode_images_parallel(frames, num_processes=None):
259
- """多进程并行图片编码"""
260
- if not ENABLE_PARALLEL_ENCODING:
261
- print(f"[Parallel encoding] Parallel encoding disabled, using serial processing")
262
- encoded_frames = []
263
- for frame in frames:
264
- encoded = encode_image(frame)
265
- if encoded:
266
- encoded_frames.extend(encoded)
267
- return encoded_frames
268
-
269
- if num_processes is None:
270
- cpu_cores = mp.cpu_count()
271
- if PARALLEL_PROCESSES:
272
- num_processes = PARALLEL_PROCESSES
273
- else:
274
- if len(frames) >= 50:
275
- num_processes = min(cpu_cores, len(frames), 32)
276
- elif len(frames) >= 20:
277
- num_processes = min(cpu_cores, len(frames), 16)
278
- else:
279
- num_processes = min(cpu_cores, len(frames), 8)
280
-
281
- print(f"[Parallel encoding] Starting parallel encoding of {len(frames)} frame images, using {num_processes} processes")
282
-
283
- if len(frames) <= 2:
284
- print(f"[Parallel encoding] Few images ({len(frames)} frames), using serial processing")
285
- encoded_frames = []
286
- for frame in frames:
287
- encoded = encode_image(frame)
288
- if encoded:
289
- encoded_frames.extend(encoded)
290
- return encoded_frames
291
-
292
- start_time = time.time()
293
- try:
294
- with mp.Pool(processes=num_processes) as pool:
295
- results = pool.map(encode_image_parallel, frames)
296
-
297
- encoded_frames = []
298
- for result in results:
299
- if result:
300
- encoded_frames.extend(result)
301
-
302
- total_time = time.time() - start_time
303
- print(f"[Parallel encoding] Parallel encoding completed, total time: {total_time:.3f}s, encoded {len(encoded_frames)} images")
304
-
305
- return encoded_frames
306
-
307
- except Exception as e:
308
- print(f"[Parallel encoding] Parallel processing failed, falling back to serial processing: {e}")
309
- encoded_frames = []
310
- for frame in frames:
311
- encoded = encode_image(frame)
312
- if encoded:
313
- encoded_frames.extend(encoded)
314
- return encoded_frames
315
-
316
-
317
- def encode_video(video, choose_fps=None):
318
- """编码视频文件"""
319
- def uniform_sample(l, n):
320
- gap = len(l) / n
321
- idxs = [int(i * gap + gap / 2) for i in range(n)]
322
- return [l[i] for i in idxs]
323
-
324
- if hasattr(video, 'path'):
325
- video_path = video.path
326
- elif hasattr(video, 'file') and hasattr(video.file, 'path'):
327
- video_path = video.file.path
328
- elif hasattr(video, 'name'):
329
- video_path = video.name
330
- else:
331
- video_path = getattr(video, 'url', getattr(video, 'orig_name', str(video)))
332
-
333
- vr = VideoReader(video_path, ctx=cpu(0))
334
- fps = vr.get_avg_fps()
335
- video_duration = len(vr) / fps
336
-
337
- frame_idx = [i for i in range(0, len(vr))]
338
-
339
- effective_fps = choose_fps if choose_fps else 1
340
-
341
- if video_duration < DOUBLE_FRAME_DURATION and effective_fps <= 5:
342
- effective_fps = effective_fps * 2
343
- packing_nums = 2
344
- choose_frames = round(min(effective_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
345
- elif effective_fps * int(video_duration) <= MAX_NUM_FRAMES:
346
- packing_nums = 1
347
- choose_frames = round(min(effective_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
348
  else:
349
- packing_size = math.ceil(video_duration * effective_fps / MAX_NUM_FRAMES)
350
- if packing_size <= MAX_NUM_PACKING:
351
- choose_frames = round(video_duration * effective_fps)
352
- packing_nums = packing_size
353
- else:
354
- choose_frames = round(MAX_NUM_FRAMES * MAX_NUM_PACKING)
355
- packing_nums = MAX_NUM_PACKING
356
-
357
- choose_idx = choose_frames
358
-
359
- frame_idx = np.array(uniform_sample(frame_idx, choose_idx))
360
- frames = vr.get_batch(frame_idx).asnumpy()
361
-
362
- frame_idx_ts = frame_idx / fps
363
- scale = np.arange(0, video_duration, TIME_SCALE)
364
-
365
- frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / TIME_SCALE
366
- frame_ts_id = frame_ts_id.astype(np.int32)
367
-
368
- assert len(frames) == len(frame_ts_id)
369
-
370
- frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
371
- frame_ts_id_group = group_array(frame_ts_id.tolist(), packing_nums)
372
-
373
- print(f"[Performance] Starting image encoding, total {len(frames)} frames")
374
-
375
- if ENABLE_PARALLEL_ENCODING:
376
- print(f"[Image encoding] Using multi-process parallel encoding, CPU cores: {mp.cpu_count()}")
377
- encoded_frames = encode_images_parallel(frames, PARALLEL_PROCESSES)
378
- else:
379
- print("[Warning] Parallel encoding disabled, using serial processing")
380
- encoded_frames = []
381
- for frame in frames:
382
- encoded = encode_image(frame)
383
- if encoded:
384
- encoded_frames.extend(encoded)
385
-
386
- return encoded_frames, frame_ts_id_group
387
-
388
-
389
- # 响应处理函数
390
- def parse_thinking_response(response_text):
391
- """解析包含<think>标签的响应文本,支持流式解析"""
392
- import re
393
-
394
- # 完整的thinking标签匹配
395
- complete_think_pattern = r'<think>(.*?)</think>'
396
- thinking_matches = re.findall(complete_think_pattern, response_text, re.DOTALL)
397
-
398
- if thinking_matches:
399
- # 有完整的thinking标签
400
- thinking_content = "\n\n".join(thinking_matches).strip()
401
- print("thinking_content---:", thinking_content)
402
- formal_answer = re.sub(complete_think_pattern, '', response_text, flags=re.DOTALL).strip()
403
- return thinking_content, formal_answer
404
- else:
405
- # 检查是否有未完成的thinking标签
406
- partial_think_match = re.search(r'<think>(.*?)$', response_text, re.DOTALL)
407
- if partial_think_match:
408
- # 有开始标签但没有结束标签,说明thinking内容正在输出中
409
- # 返回特殊标识,表示正在thinking过程中
410
- return "STREAMING", ""
411
- else:
412
- # 没有thinking标签,直接返回原文作为正式回答
413
- return "", response_text.strip()
414
-
415
 
416
- def parse_thinking_response_for_final(response_text):
417
- """最终解析thinking响应,用于完成时的格式化"""
418
- import re
419
-
420
- # 首先尝试匹配完整的thinking标签
421
- think_pattern = r'<think>(.*?)</think>'
422
- thinking_matches = re.findall(think_pattern, response_text, re.DOTALL)
423
-
424
- if thinking_matches:
425
- thinking_content = "\n\n".join(thinking_matches).strip()
426
- formal_answer = re.sub(think_pattern, '', response_text, flags=re.DOTALL).strip()
427
- print(f"[parse_final] 找到完整thinking标签, thinking长度: {len(thinking_content)}, answer长度: {len(formal_answer)}")
428
- else:
429
- # 如果没有完整标签,检查是否有未闭合的<think>标签
430
- if '<think>' in response_text:
431
- think_start = response_text.find('<think>')
432
- if think_start != -1:
433
- # 提取thinking内容(从<think>之后到字符串结束)
434
- thinking_content = response_text[think_start + 7:].strip() # 跳过<think>
435
- # formal_answer是<think>之前的内容
436
- formal_answer = response_text[:think_start].strip()
437
-
438
- # 如果formal_answer为空,说明整个响应都是thinking内容
439
- if not formal_answer:
440
- formal_answer = "" # 没有正式回答
441
-
442
- print(f"[parse_final] 找到未闭合thinking标签")
443
- print(f"[parse_final] thinking内容: '{thinking_content[:100]}...'")
444
- print(f"[parse_final] formal_answer: '{formal_answer[:100]}...'")
445
- else:
446
- thinking_content = ""
447
- formal_answer = response_text.strip()
448
- print(f"[parse_final] 无thinking标签, answer长度: {len(formal_answer)}")
449
- else:
450
- thinking_content = ""
451
- formal_answer = response_text.strip()
452
- print(f"[parse_final] 无thinking标签, answer长度: {len(formal_answer)}")
453
-
454
- return thinking_content, formal_answer
455
-
456
-
457
- def normalize_text_for_html(text):
458
- """轻量级文本规范化"""
459
- import re
460
-
461
- if not text:
462
- return ""
463
-
464
- text = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", text)
465
- lines = [line.strip() for line in text.split("\n")]
466
- text = "\n".join(lines)
467
- text = text.strip()
468
- return text
469
-
470
-
471
- def format_response_with_thinking(thinking_content, formal_answer):
472
- """格式化包含思考过程的响应"""
473
- print(f"[format_thinking] thinking_content长度: {len(thinking_content) if thinking_content else 0}")
474
- print(f"[format_thinking] formal_answer长度: {len(formal_answer) if formal_answer else 0}")
475
- print(f"[format_thinking] thinking_content前100字符: '{thinking_content[:100] if thinking_content else 'None'}...'")
476
- print(f"[format_thinking] formal_answer前100字符: '{formal_answer[:100] if formal_answer else 'None'}...'")
477
-
478
- # 检查内容是否为空
479
- if not thinking_content and not formal_answer:
480
- print("[format_thinking] 警告:thinking_content和formal_answer都为空!")
481
- elif not formal_answer:
482
- print("[format_thinking] 警告:formal_answer为空!")
483
- elif not thinking_content:
484
- print("[format_thinking] 注意:thinking_content为空,将使用简化格式")
485
-
486
- # 添加一个唯一的ID来强制前端重新渲染
487
- import uuid
488
- unique_id = uuid.uuid4().hex[:8]
489
-
490
- # 如果有thinking内容,显示完整的thinking格式
491
- if thinking_content and thinking_content.strip():
492
- formatted_response = f"""
493
- <div class="response-container" id="response-{unique_id}">
494
- <div class="thinking-section">
495
- <div class="thinking-header">🤔 think</div>
496
- <div class="thinking-content">{thinking_content}</div>
497
- </div>
498
- <div class="formal-section">
499
- <div class="formal-header">💡 answer</div>
500
- <div class="formal-content">{formal_answer if formal_answer else '(无正式回答)'}</div>
501
- </div>
502
- </div>
503
- """
504
- else:
505
- # 如果没有thinking内容,直接显示回答
506
- content_to_show = formal_answer if formal_answer and formal_answer.strip() else "(空回答)"
507
- formatted_response = f"""
508
- <div class="response-container" id="response-{unique_id}">
509
- <div class="formal-section">
510
- <div class="formal-content">{content_to_show}</div>
511
- </div>
512
- </div>
513
- """
514
-
515
- return "\n" + formatted_response.strip() + "\n"
516
-
517
-
518
- def check_mm_type(mm_file):
519
- """检查多媒体文件类型"""
520
- if hasattr(mm_file, 'path'):
521
- path = mm_file.path
522
- elif hasattr(mm_file, 'file') and hasattr(mm_file.file, 'path'):
523
- path = mm_file.file.path
524
- elif hasattr(mm_file, 'name'):
525
- path = mm_file.name
526
- else:
527
- path = getattr(mm_file, 'url', getattr(mm_file, 'orig_name', str(mm_file)))
528
-
529
- if is_image(path):
530
- return "image"
531
- if is_video(path):
532
- return "video"
533
- return None
534
-
535
-
536
- def encode_mm_file(mm_file, choose_fps=None):
537
- """编码多媒体文件"""
538
- if check_mm_type(mm_file) == 'image':
539
- return encode_image(mm_file), None
540
- if check_mm_type(mm_file) == 'video':
541
- encoded_frames, frame_ts_id_group = encode_video(mm_file, choose_fps)
542
- return encoded_frames, frame_ts_id_group
543
- return None, None
544
-
545
-
546
- def encode_message(_question, choose_fps=None):
547
- """编码消息"""
548
- import re
549
-
550
- files = _question.files if _question.files else []
551
- question = _question.text if _question.text else ""
552
- message = []
553
- temporal_ids = []
554
-
555
- # 检查是否使用旧的占位符格式
556
- pattern = r"\[mm_media\]\d+\[/mm_media\]"
557
- if re.search(pattern, question):
558
- # 旧格式:使用占位符
559
- matches = re.split(pattern, question)
560
-
561
- if len(matches) != len(files) + 1:
562
- gr.Warning("Number of Images not match the placeholder in text, please refresh the page to restart!")
563
- # 不使用 assert,而是处理不匹配的情况
564
- if len(matches) > len(files) + 1:
565
- matches = matches[:len(files) + 1]
566
- else:
567
- while len(matches) < len(files) + 1:
568
- matches.append("")
569
-
570
- text = matches[0].strip()
571
- if text:
572
- message.append({"type": "text", "pairs": text})
573
-
574
- for i in range(len(files)):
575
- encoded_content, frame_ts_id_group = encode_mm_file(files[i], choose_fps)
576
- if encoded_content:
577
- message += encoded_content
578
- if frame_ts_id_group:
579
- temporal_ids.extend(frame_ts_id_group)
580
-
581
- if i + 1 < len(matches):
582
- text = matches[i + 1].strip()
583
- if text:
584
- message.append({"type": "text", "pairs": text})
585
- else:
586
- # 新格式:简单的文本 + 文件列表
587
- if question.strip():
588
- message.append({"type": "text", "pairs": question.strip()})
589
-
590
- for file in files:
591
- encoded_content, frame_ts_id_group = encode_mm_file(file, choose_fps)
592
- if encoded_content:
593
- message += encoded_content
594
- if frame_ts_id_group:
595
- temporal_ids.extend(frame_ts_id_group)
596
-
597
- return message, temporal_ids if temporal_ids else None
598
-
599
-
600
- def check_has_videos(_question):
601
- """检查是否包含视频"""
602
- images_cnt = 0
603
- videos_cnt = 0
604
- files = _question.files if _question.files else []
605
- for file in files:
606
- if check_mm_type(file) == "image":
607
- images_cnt += 1
608
- else:
609
- videos_cnt += 1
610
- return images_cnt, videos_cnt
611
-
612
-
613
- def save_media_to_persistent_cache(_question, session_id):
614
- """将图片和视频保存到持久化缓存中,返回保存的路径信息"""
615
- import os
616
- import shutil
617
- import uuid
618
- from pathlib import Path
619
-
620
- files = _question.files if _question.files else []
621
- saved_media = []
622
-
623
- # 创建会话专用的媒体缓存目录
624
- cache_dir = Path("./media_cache") / session_id
625
- cache_dir.mkdir(parents=True, exist_ok=True)
626
-
627
- for file in files:
628
- file_type = check_mm_type(file)
629
- if file_type in ["image", "video"]:
630
- try:
631
- # 获取原始文件路径
632
- original_path = None
633
- if hasattr(file, 'name'):
634
- original_path = file.name
635
- elif hasattr(file, 'path'):
636
- original_path = file.path
637
- elif hasattr(file, 'file') and hasattr(file.file, 'path'):
638
- original_path = file.file.path
639
- else:
640
- continue
641
-
642
- if original_path and os.path.exists(original_path):
643
- # 生成唯一的文件名
644
- file_ext = os.path.splitext(original_path)[1]
645
- prefix = "img" if file_type == "image" else "vid"
646
- unique_filename = f"{prefix}_{uuid.uuid4().hex[:8]}{file_ext}"
647
- cached_path = cache_dir / unique_filename
648
-
649
- # 复制文件到缓存目录
650
- shutil.copy2(original_path, cached_path)
651
-
652
- saved_media.append({
653
- 'type': file_type,
654
- 'original_path': original_path,
655
- 'cached_path': str(cached_path),
656
- 'filename': unique_filename
657
- })
658
- print(f"[save_media_to_persistent_cache] {file_type}已保存: {cached_path}")
659
- except Exception as e:
660
- print(f"[save_media_to_persistent_cache] 保存{file_type}失败: {e}")
661
- continue
662
-
663
- return saved_media
664
-
665
-
666
- def format_user_message_with_files(_question, session_id=None):
667
- """格式化包含文件的用户消息,支持图片和视频显示"""
668
- user_text = _question.text if _question.text else ""
669
- files = _question.files if _question.files else []
670
-
671
- if not files:
672
- return user_text, []
673
-
674
- # 保存媒体文件到持久化缓存
675
- saved_media = []
676
- if session_id:
677
- saved_media = save_media_to_persistent_cache(_question, session_id)
678
-
679
- if len(files) == 1:
680
- file = files[0]
681
- file_type = check_mm_type(file)
682
-
683
- # 如果是图片或视频且已保存到缓存
684
- if file_type in ["image", "video"] and saved_media:
685
- media_info = saved_media[0]
686
- if file_type == "image":
687
- if user_text:
688
- return f"🖼️ {user_text}", saved_media
689
- else:
690
- return "🖼️ 图片", saved_media
691
- elif file_type == "video":
692
- if user_text:
693
- return f"🎬 {user_text}", saved_media
694
- else:
695
- return "🎬 视频", saved_media
696
- else:
697
- # 其他文件类型,使用文本描述
698
- return f"[1 file uploaded] {user_text}", saved_media
699
- else:
700
- # 多个文件,统计不同类型
701
- image_count = len([m for m in saved_media if m['type'] == 'image'])
702
- video_count = len([m for m in saved_media if m['type'] == 'video'])
703
- other_count = len(files) - image_count - video_count
704
-
705
- # 构建描述文本
706
- parts = []
707
- if image_count > 0:
708
- parts.append(f"{image_count} image{'s' if image_count > 1 else ''}")
709
- if video_count > 0:
710
- parts.append(f"{video_count} video{'s' if video_count > 1 else ''}")
711
- if other_count > 0:
712
- parts.append(f"{other_count} other file{'s' if other_count > 1 else ''}")
713
-
714
- if parts:
715
- files_desc = ", ".join(parts)
716
- return f"[{files_desc} uploaded] {user_text}", saved_media
717
- else:
718
- return f"[{len(files)} files uploaded] {user_text}", saved_media
719
-
720
-
721
- def update_media_gallery(app_session):
722
- """更新媒体画廊显示(图片和视频)"""
723
- import os
724
- media_cache = app_session.get('media_cache', [])
725
-
726
- if not media_cache:
727
- return gr.update(value=[], visible=False)
728
-
729
- # 获取所有缓存媒体文件的路径(图片和视频都支持)
730
- media_paths = [media_info['cached_path'] for media_info in media_cache if os.path.exists(media_info['cached_path'])]
731
-
732
- if media_paths:
733
- return gr.update(value=media_paths, visible=True)
734
- else:
735
- return gr.update(value=[], visible=False)
736
-
737
-
738
- def format_fewshot_user_message(image_path, user_text):
739
- """格式化FewShot用户消息,支持图片显示"""
740
- if image_path and user_text:
741
- return (user_text, image_path)
742
- elif image_path:
743
- return ("", image_path)
744
- else:
745
- return user_text
746
-
747
-
748
- # 主要的聊天函数
749
- def chat_direct(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, session_id=None):
750
- """直接调用模型进行聊天(非流式)"""
751
- default_params = {"num_beams": 3, "repetition_penalty": 1.2, "max_new_tokens": 16284}
752
- if params is None:
753
- params = default_params
754
-
755
- use_streaming = params.get('stream', False)
756
-
757
- if use_streaming:
758
- return chat_stream_direct(img_b64, msgs, ctx, params, vision_hidden_states, temporal_ids, session_id)
759
- else:
760
- # 构建请求数据
761
- query = {
762
- "image": img_b64,
763
- "question": json.dumps(msgs, ensure_ascii=True),
764
- "params": json.dumps(params, ensure_ascii=True),
765
- }
766
-
767
- if temporal_ids:
768
- query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)
769
-
770
- if session_id:
771
- query["session_id"] = session_id
772
-
773
- try:
774
- # 直接调用模型
775
- result = global_model.handler(query)
776
- raw_result = result['result']
777
-
778
- # 清理结果
779
- import re
780
- cleaned_result = re.sub(r'(<box>.*</box>)', '', raw_result)
781
- cleaned_result = cleaned_result.replace('<ref>', '')
782
- cleaned_result = cleaned_result.replace('</ref>', '')
783
- cleaned_result = cleaned_result.replace('<box>', '')
784
- cleaned_result = cleaned_result.replace('</box>', '')
785
-
786
- # 解析思考过程
787
- thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(cleaned_result)
788
- thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
789
- formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
790
- formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
791
-
792
- context_result = formal_answer_raw if formal_answer_raw else cleaned_result
793
- return 0, formatted_result, context_result, None
794
-
795
- except Exception as e:
796
- print(f"Chat error: {e}")
797
- import traceback
798
- traceback.print_exc()
799
- return -1, ERROR_MSG, None, None
800
-
801
-
802
- def chat_stream_direct(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, session_id=None):
803
- """直接调用模型进行流式聊天"""
804
  try:
805
- # 构建请求数据
806
- query = {
807
- "image": img_b64,
808
- "question": json.dumps(msgs, ensure_ascii=True),
809
- "params": json.dumps(params, ensure_ascii=True),
810
- }
811
-
812
- if temporal_ids:
813
- query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)
814
-
815
- if session_id:
816
- query["session_id"] = session_id
817
-
818
- # 直接调用流式处理器
819
- generator = global_model.stream_handler(query)
820
-
821
- full_response = ""
822
- for chunk in generator:
823
- full_response += chunk
824
-
825
- if not full_response:
826
- return -1, ERROR_MSG, None, None
827
-
828
- # 清理结果
829
- import re
830
- cleaned_result = re.sub(r'(<box>.*</box>)', '', full_response)
831
- cleaned_result = cleaned_result.replace('<ref>', '')
832
- cleaned_result = cleaned_result.replace('</ref>', '')
833
- cleaned_result = cleaned_result.replace('<box>', '')
834
- cleaned_result = cleaned_result.replace('</box>', '')
835
-
836
- # 解析思考过程
837
- thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(cleaned_result)
838
- thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
839
- formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
840
- formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
841
-
842
- context_result = formal_answer_raw if formal_answer_raw else cleaned_result
843
- return 0, formatted_result, context_result, None
844
-
845
  except Exception as e:
846
- print(f"Stream chat error: {e}")
847
- import traceback
848
- traceback.print_exc()
849
- return -1, ERROR_MSG, None, None
850
-
851
-
852
- def chat_stream_character_generator(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, stop_control=None, session_id=None):
853
- """字符级流式生成器"""
854
- print(f"[chat_stream_character_generator] Starting character-level streaming")
855
- print(f"[chat_stream_character_generator] stop_control: {stop_control}")
856
-
857
- try:
858
- # 构建请求数据
859
- query = {
860
- "image": img_b64,
861
- "question": json.dumps(msgs, ensure_ascii=True),
862
- "params": json.dumps(params, ensure_ascii=True),
863
- }
864
-
865
- if temporal_ids:
866
- query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)
867
-
868
- if session_id:
869
- query["session_id"] = session_id
870
-
871
- # 调用流式处理器 - 现在返回完整响应而不是生成器
872
- full_response = global_model.stream_handler(query)
873
-
874
- # 清理响应
875
- import re
876
- clean_response = re.sub(r'(<box>.*</box>)', '', full_response)
877
- clean_response = clean_response.replace('<ref>', '')
878
- clean_response = clean_response.replace('</ref>', '')
879
- clean_response = clean_response.replace('<box>', '')
880
- clean_response = clean_response.replace('</box>', '')
881
-
882
- # 逐字符yield以模拟流式输出
883
- char_count = 0
884
- for char in clean_response:
885
- # 检查停止标志
886
- if stop_control and stop_control.get('stop_streaming', False):
887
- print(f"[chat_stream_character_generator] *** 在第{char_count}个字符处收到停止信号 ***")
888
- break
889
-
890
- char_count += 1
891
- if char_count % 10 == 0:
892
- print(f"[chat_stream_character_generator] 已输出{char_count}个字符,stop_flag: {stop_control.get('stop_streaming', False) if stop_control else 'None'}")
893
-
894
- yield char
895
-
896
- # 添加小延迟以模拟流式效果
897
- import time
898
- time.sleep(0.01)
899
-
900
- print(f"[chat_stream_character_generator] 流式输出完成,总共输出{char_count}个字符")
901
-
902
- except Exception as e:
903
- print(f"[chat_stream_character_generator] 异常: {e}")
904
- error_msg = f"Stream error: {str(e)}"
905
- for char in error_msg:
906
- yield char
907
-
908
-
909
- # UI组件创建函数
910
- def create_component(params, comp='Slider'):
911
- if comp == 'Slider':
912
- return gr.Slider(
913
- minimum=params['minimum'],
914
- maximum=params['maximum'],
915
- value=params['value'],
916
- step=params['step'],
917
- interactive=params['interactive'],
918
- label=params['label']
919
- )
920
- elif comp == 'Radio':
921
- return gr.Radio(
922
- choices=params['choices'],
923
- value=params['value'],
924
- interactive=params['interactive'],
925
- label=params['label']
926
- )
927
- elif comp == 'Button':
928
- return gr.Button(
929
- value=params['value'],
930
- interactive=True
931
- )
932
- elif comp == 'Checkbox':
933
- return gr.Checkbox(
934
- value=params['value'],
935
- interactive=params['interactive'],
936
- label=params['label'],
937
- info=params.get('info', None)
938
- )
939
-
940
-
941
- def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=False):
942
- # 使用标准的 Gradio 组件替代 MultimodalInput,添加预览功能
943
- return gr.File(
944
- file_count="multiple",
945
- file_types=["image", "video"],
946
- label="Upload Images/Videos",
947
- interactive=not (upload_image_disabled and upload_video_disabled),
948
- show_label=True,
949
- height=200 # 设置高度以显示预览
950
- )
951
-
952
-
953
- # UI控制函数
954
- def update_streaming_mode_state(params_form):
955
- """根据解码类型更新流式模式状态"""
956
- if params_form == 'Beam Search':
957
- return gr.update(value=False, interactive=False, info="Beam Search mode does not support streaming output")
958
- else:
959
- return gr.update(value=True, interactive=True, info="Enable real-time streaming response")
960
-
961
-
962
- def stop_streaming(_app_cfg):
963
- """停止流式输出"""
964
- _app_cfg['stop_streaming'] = True
965
- print(f"[stop_streaming] Set stop flag to True")
966
- return _app_cfg
967
-
968
-
969
- def reset_stop_flag(_app_cfg):
970
- """重置停止标志"""
971
- _app_cfg['stop_streaming'] = False
972
- print(f"[reset_stop_flag] Reset stop flag to False")
973
- return _app_cfg
974
-
975
-
976
def check_and_handle_stop(_app_cfg, context="unknown"):
    """Return True when the session has been asked to stop streaming.

    ``context`` is only used for logging, to say where in the stream
    the stop signal was noticed.
    """
    should_stop = _app_cfg.get('stop_streaming', False)
    is_streaming = _app_cfg.get('is_streaming', False)

    if not should_stop:
        return False

    print(f"[check_and_handle_stop] *** Stop signal detected at {context} ***")
    print(f"[check_and_handle_stop] stop_streaming: {should_stop}, is_streaming: {is_streaming}")
    return True
986
-
987
-
988
def stop_button_clicked(_app_cfg):
    """Handle a Stop-button click: flag the stream to abort and hide the button."""
    print("[stop_button_clicked] *** Stop button clicked ***")
    print(f"[stop_button_clicked] Current state - is_streaming: {_app_cfg.get('is_streaming', False)}")
    print(f"[stop_button_clicked] Current state - stop_streaming: {_app_cfg.get('stop_streaming', False)}")

    # Flip both flags in one go, then log the new state.
    _app_cfg.update(stop_streaming=True, is_streaming=False)
    print(f"[stop_button_clicked] Set stop_streaming = True, is_streaming = False")

    return _app_cfg, gr.update(visible=False)
999
-
1000
-
1001
# Main response functions
def respond_stream(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
    """Streaming response generator for the Chat tab.

    Yields ``(multimodal_input, chat_bot, app_cfg, stop_button_update)``
    tuples so Gradio can refresh the UI after each chunk of output.

    Args:
        _question: object with ``.text`` / ``.files`` (see MockInput in create_app).
        _chat_bot: list of (user_message, assistant_message) tuples.
        _app_cfg: mutable session-state dict (ctx, counters, flags, session_id).
        params_form: 'Beam Search' or 'Sampling'.
        thinking_mode: whether to parse/format <think> sections.
        streaming_mode: requested streaming; forced off for beam search.
        fps_setting: frame rate used when sampling video frames.
    """
    print(f"[respond_stream] Called with streaming_mode: {streaming_mode}, fps_setting: {fps_setting}")

    _app_cfg['is_streaming'] = True
    _app_cfg['stop_streaming'] = False

    # Beam search cannot stream; fall back to one-shot generation.
    if params_form == 'Beam Search':
        streaming_mode = False
        print(f"[respond_stream] Beam Search模式,强制禁用流式模式")
        _app_cfg['is_streaming'] = False

    # Shallow copy: new turns are appended to the copy, so the stored
    # history only changes when we commit it back at the end.
    _context = _app_cfg['ctx'].copy()
    encoded_message, temporal_ids = encode_message(_question, fps_setting)
    _context.append({'role': 'user', 'contents': encoded_message})

    images_cnt = _app_cfg['images_cnt']
    videos_cnt = _app_cfg['videos_cnt']
    # (image_count, video_count) for the files attached to this turn.
    files_cnts = check_has_videos(_question)

    # Enforce: at most one video per conversation, and never video+images mixed.
    if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
        gr.Warning("Only supports single video file input right now!")
        yield create_multimodal_input(True, True), _chat_bot, _app_cfg, gr.update(visible=False)
        return

    if disable_text_only and files_cnts[1] + videos_cnt + files_cnts[0] + images_cnt <= 0:
        gr.Warning("Please chat with at least one image or video.")
        yield create_multimodal_input(False, False), _chat_bot, _app_cfg, gr.update(visible=False)
        return

    # Decoding parameters for the backend model.
    # NOTE(review): max_new_tokens=16284 looks like a typo for 16384 — confirm.
    if params_form == 'Beam Search':
        params = {
            'sampling': False,
            'num_beams': 3,
            'repetition_penalty': 1.2,
            "max_new_tokens": 16284,
            "enable_thinking": thinking_mode,
            "stream": False
        }
    else:
        params = {
            'sampling': True,
            'top_p': 0.8,
            'top_k': 100,
            'temperature': 0.7,
            'repetition_penalty': 1.03,
            "max_new_tokens": 16284,
            "enable_thinking": thinking_mode,
            "stream": streaming_mode
        }

    # Video input needs a longer context window and single-slice images.
    if files_cnts[1] + videos_cnt > 0:
        params["max_inp_length"] = 2048 * 10
        params["use_image_id"] = False
        params["max_slice_nums"] = 1

    images_cnt += files_cnts[0]
    videos_cnt += files_cnts[1]

    # Build the user-visible message (streaming path) and persist any
    # saved media so the gallery can show it later.
    user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))

    if saved_images:
        if 'media_cache' not in _app_cfg:
            _app_cfg['media_cache'] = []
        _app_cfg['media_cache'].extend(saved_images)

    _chat_bot.append((user_message, ""))
    # Placeholder assistant turn; its "pairs" text is filled in as chars arrive.
    _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": ""}]})

    # The generator gets the history WITHOUT the empty assistant placeholder.
    gen = chat_stream_character_generator("", _context[:-1], None, params, None, temporal_ids, _app_cfg, _app_cfg['session_id'])

    upload_image_disabled = videos_cnt > 0
    upload_video_disabled = videos_cnt > 0 or images_cnt > 0

    yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)

    print(f"[respond_stream] 开始字符级流式输出循环")
    char_count = 0
    accumulated_content = ""

    # Character-level streaming loop.
    for _char in gen:
        char_count += 1

        if check_and_handle_stop(_app_cfg, f"字符{char_count}"):
            break

        accumulated_content += _char
        _context[-1]["contents"][0]["pairs"] += _char

        # Live display; in thinking mode try to split <think>/answer on the fly.
        if thinking_mode:
            thinking_content_raw, formal_answer_raw = parse_thinking_response(accumulated_content)

            # Only format once a complete thinking section AND an answer exist.
            if thinking_content_raw and thinking_content_raw != "STREAMING" and formal_answer_raw:
                thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
                formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
                formatted_display = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
                _chat_bot[-1] = (user_message, formatted_display)
            else:
                # Still thinking / no closed tag yet: show raw text live.
                _chat_bot[-1] = (user_message, accumulated_content)
        else:
            # No thinking mode: just show the accumulated text.
            _chat_bot[-1] = (user_message, accumulated_content)

        if char_count % 5 == 0:  # periodic progress log + throttled update
            print(f"[respond_stream] 已处理{char_count}个字符,stop_flag: {_app_cfg.get('stop_streaming', False)}")
            yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)
            time.sleep(0.02)  # small delay to avoid overly frequent UI refreshes
        else:
            yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)

    if _app_cfg.get('stop_streaming', False):
        print("[respond_stream] 流式输出已停止")

    # Final pass: apply thinking formatting to the full response.
    final_content = accumulated_content
    if thinking_mode:
        thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(final_content)
        thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
        formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
        formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)

        _chat_bot[-1] = (user_message, formatted_result)
        # History keeps only the formal answer when one was extracted.
        _context[-1]["contents"][0]["pairs"] = formal_answer_raw if formal_answer_raw else final_content
    else:
        _chat_bot[-1] = (user_message, final_content)
        _context[-1]["contents"][0]["pairs"] = final_content

    # Commit the turn back into the session state.
    _app_cfg['ctx'] = _context
    _app_cfg['images_cnt'] = images_cnt
    _app_cfg['videos_cnt'] = videos_cnt
    _app_cfg['is_streaming'] = False

    upload_image_disabled = videos_cnt > 0
    upload_video_disabled = videos_cnt > 0 or images_cnt > 0
    yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
1143
-
1144
-
1145
def respond(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
    """Main response entry point for the Chat tab.

    Dispatches to :func:`respond_stream` when streaming is requested,
    otherwise performs a one-shot generation. Always yields
    ``(multimodal_input, chat_bot, app_cfg, stop_button_update)`` so it is
    interchangeable with the streaming path.
    """
    if 'session_id' not in _app_cfg:
        _app_cfg['session_id'] = uuid.uuid4().hex[:16]
        print(f"[会话] 为现有会话生成session_id: {_app_cfg['session_id']}")

    # Track thinking-mode toggles between turns.
    prev_thinking_mode = _app_cfg.get('last_thinking_mode', False)
    _app_cfg['last_thinking_mode'] = thinking_mode

    if prev_thinking_mode != thinking_mode:
        print(f"[respond] Thinking模式切换: {prev_thinking_mode} -> {thinking_mode}")
        # FIX: was `hasattr(_app_cfg, 'thinking_cache')` — hasattr on a dict
        # never matches a key, so the cleanup below was dead code.
        if 'thinking_cache' in _app_cfg:
            del _app_cfg['thinking_cache']
        # Extra state reset on transition.
        if thinking_mode and not prev_thinking_mode:
            print("[respond] 启用thinking模式,重置相关状态")
            _app_cfg['thinking_enabled'] = True
        elif not thinking_mode and prev_thinking_mode:
            print("[respond] 禁用thinking模式")
            _app_cfg['thinking_enabled'] = False

    # Beam search never streams.
    if params_form == 'Beam Search':
        streaming_mode = False
        print(f"[respond] Beam Search模式,强制禁用流式模式")

    if streaming_mode:
        print("[respond] 选择流式模式")
        yield from respond_stream(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting)
        return

    # ---- Non-streaming path ----
    _context = _app_cfg['ctx'].copy()
    encoded_message, temporal_ids = encode_message(_question, fps_setting)
    _context.append({'role': 'user', 'contents': encoded_message})

    images_cnt = _app_cfg['images_cnt']
    videos_cnt = _app_cfg['videos_cnt']
    # (image_count, video_count) for this turn's attachments.
    files_cnts = check_has_videos(_question)
    # At most one video per conversation, never mixed with images.
    if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
        gr.Warning("Only supports single video file input right now!")
        upload_image_disabled = videos_cnt > 0
        upload_video_disabled = videos_cnt > 0 or images_cnt > 0
        yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
        return
    if disable_text_only and files_cnts[1] + videos_cnt + files_cnts[0] + images_cnt <= 0:
        gr.Warning("Please chat with at least one image or video.")
        upload_image_disabled = videos_cnt > 0
        upload_video_disabled = videos_cnt > 0 or images_cnt > 0
        yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
        return

    # Decoding parameters.
    # NOTE(review): max_new_tokens=16284 looks like a typo for 16384 — confirm.
    if params_form == 'Beam Search':
        params = {
            'sampling': False,
            'num_beams': 3,
            'repetition_penalty': 1.2,
            "max_new_tokens": 16284,
            "enable_thinking": thinking_mode,
            "stream": False
        }
    else:
        params = {
            'sampling': True,
            'top_p': 0.8,
            'top_k': 100,
            'temperature': 0.7,
            'repetition_penalty': 1.03,
            "max_new_tokens": 16284,
            "enable_thinking": thinking_mode,
            "stream": False
        }

    # Video input needs a longer context window and single-slice images.
    if files_cnts[1] + videos_cnt > 0:
        params["max_inp_length"] = 2048 * 10
        params["use_image_id"] = False
        params["max_slice_nums"] = 1

    # One-shot generation.
    code, _answer, _context_answer, sts = chat_direct("", _context, None, params, None, temporal_ids, _app_cfg['session_id'])

    images_cnt += files_cnts[0]
    videos_cnt += files_cnts[1]

    if code == 0:
        context_content = _context_answer if _context_answer else _answer
        _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": context_content}]})

        # Apply thinking formatting only when the mode is on.
        if thinking_mode:
            thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(_answer)
            thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
            formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
            print(f"[respond] 非流式模式 - thinking_mode: {thinking_mode}, thinking_content: '{thinking_content_raw[:50]}...'")
            formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
        else:
            print(f"[respond] 非流式模式 - thinking_mode: {thinking_mode}, 使用原始回答")
            formatted_result = _answer

        # Build the user-visible message and persist saved media for the gallery.
        user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))

        if saved_images:
            if 'media_cache' not in _app_cfg:
                _app_cfg['media_cache'] = []
            _app_cfg['media_cache'].extend(saved_images)

        _chat_bot.append((user_message, formatted_result))

        _app_cfg['ctx'] = _context
        _app_cfg['sts'] = sts
    else:
        _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": "Error occurred during processing"}]})
        # Error path: still show the user's message with an error reply.
        user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))

        if saved_images:
            if 'media_cache' not in _app_cfg:
                _app_cfg['media_cache'] = []
            _app_cfg['media_cache'].extend(saved_images)

        _chat_bot.append((user_message, "Error occurred during processing"))

    _app_cfg['images_cnt'] = images_cnt
    _app_cfg['videos_cnt'] = videos_cnt
    _app_cfg['is_streaming'] = False

    upload_image_disabled = videos_cnt > 0
    upload_video_disabled = videos_cnt > 0 or images_cnt > 0

    # Yield (not return) so callers can treat both modes as generators.
    yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
1280
-
1281
-
1282
# Few-shot helpers
def fewshot_add_demonstration(_image, _user_message, _assistant_message, _chat_bot, _app_cfg):
    """Append one few-shot demonstration (user turn + assistant turn) to the
    session context, and mirror it in the chat display when both halves exist.

    Returns a tuple that clears the three input widgets and passes the
    updated chat history and session state back to Gradio.
    """
    if 'session_id' not in _app_cfg:
        _app_cfg['session_id'] = uuid.uuid4().hex[:16]
        print(f"[会话] 为FewShot示例生成session_id: {_app_cfg['session_id']}")

    # NOTE: mutates the stored context in place (no copy).
    ctx = _app_cfg["ctx"]

    # Build the user turn.
    user_msg = ""
    if _image is not None:
        image = Image.open(_image).convert("RGB")
        ctx.append({"role": "user", "contents": [
            *encode_image(image),
            {"type": "text", "pairs": _user_message}
        ]})
        user_msg = f"[Image uploaded] {_user_message}"
    else:
        if _user_message:
            ctx.append({"role": "user", "contents": [{"type": "text", "pairs": _user_message}]})
            user_msg = _user_message

    # Build the assistant turn.
    if _assistant_message:
        ctx.append({"role": "assistant", "contents": [{"type": "text", "pairs": _assistant_message}]})

    # Only show the pair in the chat window when both sides were provided.
    if user_msg and _assistant_message:
        formatted_user_msg = format_fewshot_user_message(_image, _user_message) if _image else user_msg
        _chat_bot.append([formatted_user_msg, _assistant_message])

    return None, "", "", _chat_bot, _app_cfg
1314
-
1315
-
1316
def fewshot_respond(_image, _user_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
    """Few-shot tab response generator.

    Yields ``(image, user_message, assistant_message, chat_bot, app_cfg)``
    tuples. Supports both a character-level streaming path and a one-shot
    path; beam search always uses the one-shot path.
    """
    print(f"[fewshot_respond] Called with streaming_mode: {streaming_mode}")

    if 'session_id' not in _app_cfg:
        _app_cfg['session_id'] = uuid.uuid4().hex[:16]
        print(f"[会话] 为FewShot会话生成session_id: {_app_cfg['session_id']}")

    # Beam search cannot stream.
    if params_form == 'Beam Search':
        streaming_mode = False
        print(f"[fewshot_respond] Beam Search模式,强制禁用流式模式")

    user_message_contents = []
    # Shallow copy; committed back only on success.
    _context = _app_cfg["ctx"].copy()
    images_cnt = _app_cfg["images_cnt"]
    temporal_ids = None  # no video input on the few-shot tab

    # Assemble the user turn from the optional image plus optional text.
    if _image:
        image = Image.open(_image).convert("RGB")
        user_message_contents += encode_image(image)
        images_cnt += 1
    if _user_message:
        user_message_contents += [{"type": "text", "pairs": _user_message}]
    if user_message_contents:
        _context.append({"role": "user", "contents": user_message_contents})

    # Decoding parameters.
    # NOTE(review): max_new_tokens=16284 looks like a typo for 16384 — confirm.
    if params_form == 'Beam Search':
        params = {
            'sampling': False,
            'num_beams': 3,
            'repetition_penalty': 1.2,
            "max_new_tokens": 16284,
            "enable_thinking": thinking_mode,
            "stream": False
        }
    else:
        params = {
            'sampling': True,
            'top_p': 0.8,
            'top_k': 100,
            'temperature': 0.7,
            'repetition_penalty': 1.03,
            "max_new_tokens": 16284,
            "enable_thinking": thinking_mode,
            "stream": streaming_mode
        }

    if disable_text_only and images_cnt == 0:
        gr.Warning("Please chat with at least one image or video.")
        yield _image, _user_message, '', _chat_bot, _app_cfg
        return

    if streaming_mode:
        print(f"[fewshot_respond] Using streaming mode")
        _app_cfg['is_streaming'] = True
        _app_cfg['stop_streaming'] = False

        # Show the user's turn immediately with an empty assistant slot.
        if _image:
            user_msg = format_fewshot_user_message(_image, _user_message)
            _chat_bot.append([user_msg, ""])
        else:
            _chat_bot.append([_user_message, ""])

        # Placeholder assistant turn, filled character by character below.
        _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": ""}]})

        _app_cfg['stop_streaming'] = False

        # History WITHOUT the empty assistant placeholder goes to the model.
        gen = chat_stream_character_generator("", _context[:-1], None, params, None, temporal_ids, _app_cfg, _app_cfg['session_id'])

        yield _image, _user_message, '', _chat_bot, _app_cfg

        accumulated_content = ""
        for _char in gen:
            if _app_cfg.get('stop_streaming', False):
                print("[fewshot_respond] 收到停止信号,中断流式响应")
                break

            accumulated_content += _char
            _context[-1]["contents"][0]["pairs"] += _char

            # Live thinking-mode parsing/formatting, same scheme as respond_stream.
            if thinking_mode:
                thinking_content_raw, formal_answer_raw = parse_thinking_response(accumulated_content)

                # Only format once a complete thinking section AND an answer exist.
                if thinking_content_raw and thinking_content_raw != "STREAMING" and formal_answer_raw:
                    thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
                    formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
                    formatted_display = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
                    _chat_bot[-1] = (_chat_bot[-1][0], formatted_display)
                else:
                    # Still thinking / no closed tag yet: show raw text live.
                    _chat_bot[-1] = (_chat_bot[-1][0], accumulated_content)
            else:
                _chat_bot[-1] = (_chat_bot[-1][0], accumulated_content)

            yield _image, _user_message, '', _chat_bot, _app_cfg

        # NOTE(review): final_content is computed but never used afterwards.
        final_content = _context[-1]["contents"][0]["pairs"]

        _app_cfg['ctx'] = _context
        _app_cfg['images_cnt'] = images_cnt
        _app_cfg['is_streaming'] = False

        yield _image, '', '', _chat_bot, _app_cfg

    else:
        # One-shot (non-streaming) generation.
        code, _answer, _context_answer, sts = chat_direct("", _context, None, params, None, temporal_ids, _app_cfg['session_id'])

        context_content = _context_answer if _context_answer else _answer
        _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": context_content}]})

        if _image:
            user_msg = format_fewshot_user_message(_image, _user_message)
            _chat_bot.append([user_msg, _answer])
        else:
            _chat_bot.append([_user_message, _answer])

        # Only commit the context on success.
        if code == 0:
            _app_cfg['ctx'] = _context
            _app_cfg['sts'] = sts
            _app_cfg['images_cnt'] = images_cnt

        _app_cfg['is_streaming'] = False
        yield None, '', '', _chat_bot, _app_cfg
1444
-
1445
-
1446
# Other UI handlers
def regenerate_button_clicked(_question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
    """Re-run the last exchange: pop the final turn from history/state and
    feed it back through respond() (Chat tab) or fewshot_respond() (few-shot).

    Yields the same tuple shape for both branches so the wiring is uniform.
    """
    print(f"[regenerate] streaming_mode: {streaming_mode}")
    print(f"[regenerate] thinking_mode: {thinking_mode}")
    print(f"[regenerate] chat_type: {_app_cfg.get('chat_type', 'unknown')}")

    # Beam search cannot stream.
    if params_form == 'Beam Search':
        streaming_mode = False
        print(f"[regenerate] Beam Search模式,强制禁用流式模式")

    # Nothing to regenerate: only the greeting, or the last turn has no answer.
    if len(_chat_bot) <= 1 or not _chat_bot[-1][1]:
        gr.Warning('No question for regeneration.')
        yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
        return

    if _app_cfg["chat_type"] == "Chat":
        # Undo the last turn: drop it from the display, drop user+assistant
        # entries from ctx, and roll back the media counters.
        images_cnt = _app_cfg['images_cnt']
        videos_cnt = _app_cfg['videos_cnt']
        _question = _chat_bot[-1][0]
        _chat_bot = _chat_bot[:-1]
        _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
        files_cnts = check_has_videos(_question)
        images_cnt -= files_cnts[0]
        videos_cnt -= files_cnts[1]
        _app_cfg['images_cnt'] = images_cnt
        _app_cfg['videos_cnt'] = videos_cnt

        print(f"[regenerate] About to call respond with streaming_mode: {streaming_mode}")
        for result in respond(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
            new_input, _chat_bot, _app_cfg, _stop_button = result
            _question = new_input
            yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
    else:
        # Few-shot branch. With the tuples chat format, _chat_bot[-1][0]
        # is a plain string.
        last_user_message = _chat_bot[-1][0]
        last_image = None

        # Strip the image marker; the original image file cannot be
        # recovered from the simplified tuples format.
        if "[Image uploaded]" in last_user_message:
            last_user_message = last_user_message.replace("[Image uploaded] ", "")
        _chat_bot = _chat_bot[:-1]
        _app_cfg['ctx'] = _app_cfg['ctx'][:-2]

        print(f"[regenerate] About to call fewshot_respond with streaming_mode: {streaming_mode}")
        for result in fewshot_respond(last_image, last_user_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
            _image, _user_message, _assistant_message, _chat_bot, _app_cfg = result
            yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
1496
-
1497
-
1498
def flushed():
    """Re-enable the input component once streaming has finished."""
    enabled = gr.update(interactive=True)
    return enabled
1500
-
1501
-
1502
def clear_media_cache(session_id):
    """Best-effort removal of one session's on-disk media cache directory."""
    import shutil
    from pathlib import Path

    try:
        session_dir = Path("./media_cache") / session_id
        if not session_dir.exists():
            return
        shutil.rmtree(session_dir)
        print(f"[clear_media_cache] 已清理会话 {session_id} 的媒体缓存")
    except Exception as e:
        # Deliberately swallow errors: cache cleanup must never break the UI.
        print(f"[clear_media_cache] 清理缓存失败: {e}")
1514
-
1515
-
1516
def clear(txt_input, file_upload, chat_bot, app_session):
    """Reset the chat UI and session state for a brand-new conversation.

    Purges the previous session's on-disk media cache, reinitializes every
    session field, mints a new session_id, and returns cleared values for
    all wired output components.
    """
    # Clean up the old session's cached media before discarding its id.
    if 'session_id' in app_session:
        clear_media_cache(app_session['session_id'])

    chat_bot = copy.deepcopy(init_conversation)
    app_session['sts'] = None
    app_session['ctx'] = []
    app_session['images_cnt'] = 0
    app_session['videos_cnt'] = 0
    app_session['stop_streaming'] = False
    app_session['is_streaming'] = False
    app_session['media_cache'] = []  # drop cached media metadata
    app_session['last_thinking_mode'] = False  # reset thinking-mode tracking
    app_session['session_id'] = uuid.uuid4().hex[:16]
    # FIX: log tag was mojibake ("[���话]"); restored to the "[会话]" tag
    # used by every other session log line.
    print(f"[会话] 生成新会话ID: {app_session['session_id']}")
    return "", None, gr.update(value=[], visible=False), gr.update(value=[], visible=False), chat_bot, app_session, None, '', ''
1533
-
1534
-
1535
def select_chat_type(_tab, _app_cfg):
    """Record which chat tab is currently active in the session state."""
    _app_cfg.update(chat_type=_tab)
    return _app_cfg
1538
-
1539
-
1540
# UI configuration: plain dicts consumed by create_component().
form_radio = {
    'choices': ['Beam Search', 'Sampling'],
    'value': 'Sampling',
    'interactive': True,
    'label': 'Decode Type'
}

# Checkbox config: show the model's reasoning process.
thinking_checkbox = {
    'value': False,
    'interactive': True,
    'label': 'Enable Thinking Mode',
}

# Checkbox config: stream responses token by token (default on).
streaming_checkbox = {
    'value': True,
    'interactive': True,
    'label': 'Enable Streaming Mode',
}

# Slider config: frame-sampling rate for video inputs.
fps_slider = {
    'minimum': 1,
    'maximum': 20,
    'value': 3,
    'step': 1,
    'interactive': True,
    'label': 'Custom FPS for Video Processing'
}

# Initial chatbot transcript shown before the user sends anything.
init_conversation = [
    ["", "You can talk to me now"]
]
1572
-
1573
- css = """
1574
- video { height: auto !important; }
1575
- .example label { font-size: 16px;}
1576
-
1577
- /* Current Media Gallery 滚动条样式 - 使用class选择器更安全 */
1578
- .current-media-gallery {
1579
- overflow-y: auto !important;
1580
- max-height: 600px !important;
1581
- position: relative !important;
1582
- }
1583
-
1584
- /* 确保只影响特定的Gallery容器内部 */
1585
- .current-media-gallery > div,
1586
- .current-media-gallery .gallery-container {
1587
- overflow-y: auto !important;
1588
- max-height: 580px !important;
1589
- }
1590
-
1591
- .current-media-gallery .gallery-item {
1592
- margin-bottom: 10px !important;
1593
- }
1594
-
1595
- /* 只为Current Media Gallery自定义滚动条样式 */
1596
- .current-media-gallery::-webkit-scrollbar,
1597
- .current-media-gallery > div::-webkit-scrollbar,
1598
- .current-media-gallery .gallery-container::-webkit-scrollbar {
1599
- width: 8px !important;
1600
- }
1601
-
1602
- .current-media-gallery::-webkit-scrollbar-track,
1603
- .current-media-gallery > div::-webkit-scrollbar-track,
1604
- .current-media-gallery .gallery-container::-webkit-scrollbar-track {
1605
- background: #f1f1f1 !important;
1606
- border-radius: 4px !important;
1607
- }
1608
-
1609
- .current-media-gallery::-webkit-scrollbar-thumb,
1610
- .current-media-gallery > div::-webkit-scrollbar-thumb,
1611
- .current-media-gallery .gallery-container::-webkit-scrollbar-thumb {
1612
- background: #c1c1c1 !important;
1613
- border-radius: 4px !important;
1614
- }
1615
-
1616
- .current-media-gallery::-webkit-scrollbar-thumb:hover,
1617
- .current-media-gallery > div::-webkit-scrollbar-thumb:hover,
1618
- .current-media-gallery .gallery-container::-webkit-scrollbar-thumb:hover {
1619
- background: #a8a8a8 !important;
1620
- }
1621
-
1622
- /* 隐藏Current Media的不必要元素 */
1623
- .current-media-gallery .upload-container,
1624
- .current-media-gallery .drop-zone,
1625
- .current-media-gallery .file-upload,
1626
- .current-media-gallery .upload-text,
1627
- .current-media-gallery .drop-text {
1628
- display: none !important;
1629
- }
1630
-
1631
- .current-media-gallery .clear-button,
1632
- .current-media-gallery .delete-button,
1633
- .current-media-gallery .remove-button {
1634
- display: none !important;
1635
- }
1636
-
1637
- /* 当Gallery为空时隐藏标签和占位文本 */
1638
- .current-media-gallery:not([style*="display: none"]) .gallery-container:empty::after {
1639
- content: "";
1640
- display: none;
1641
- }
1642
-
1643
- .current-media-gallery .empty-gallery-text,
1644
- .current-media-gallery .placeholder-text {
1645
- display: none !important;
1646
- }
1647
-
1648
- /* 确保滚动条不会影响到其他组件 */
1649
- .current-media-gallery {
1650
- isolation: isolate !important;
1651
- }
1652
-
1653
- /* 重置其他Gallery组件的滚动条样式,防止被污染 */
1654
- .gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar {
1655
- width: initial !important;
1656
- }
1657
-
1658
- .gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar-track {
1659
- background: initial !important;
1660
- border-radius: initial !important;
1661
- }
1662
-
1663
- .gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar-thumb {
1664
- background: initial !important;
1665
- border-radius: initial !important;
1666
- }
1667
-
1668
- /* 确保chatbot不受影响 */
1669
- .thinking-chatbot::-webkit-scrollbar {
1670
- width: initial !important;
1671
- }
1672
-
1673
- .thinking-chatbot::-webkit-scrollbar-track {
1674
- background: initial !important;
1675
- }
1676
-
1677
- .thinking-chatbot::-webkit-scrollbar-thumb {
1678
- background: initial !important;
1679
- }
1680
-
1681
- /* 思考过程和正式回答的样式 */
1682
- .response-container {
1683
- margin: 10px 0;
1684
- }
1685
-
1686
- .thinking-section {
1687
- background: linear-gradient(135deg, #f8f9ff 0%, #f0f4ff 100%);
1688
- border: 1px solid #d1d9ff;
1689
- border-radius: 12px;
1690
- padding: 16px;
1691
- margin-bottom: 0px;
1692
- box-shadow: 0 2px 8px rgba(67, 90, 235, 0.1);
1693
- }
1694
-
1695
- .thinking-header {
1696
- font-weight: 600;
1697
- color: #4c5aa3;
1698
- font-size: 14px;
1699
- margin-bottom: 12px;
1700
- display: flex;
1701
- align-items: center;
1702
- gap: 8px;
1703
- }
1704
-
1705
- .thinking-content {
1706
- color: #5a6ba8;
1707
- font-size: 13px;
1708
- line-height: 1;
1709
- font-style: italic;
1710
- background: rgba(255, 255, 255, 0.6);
1711
- padding: 12px;
1712
- border-radius: 8px;
1713
- border-left: 3px solid #4c5aa3;
1714
- white-space: pre-wrap;
1715
- }
1716
-
1717
- .formal-section {
1718
- background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%);
1719
- border: 1px solid #e9ecef;
1720
- border-radius: 12px;
1721
- padding: 16px;
1722
- box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
1723
- }
1724
-
1725
- .formal-header {
1726
- font-weight: 600;
1727
- color: #28a745;
1728
- font-size: 14px;
1729
- margin-bottom: 12px;
1730
- display: flex;
1731
- align-items: center;
1732
- gap: 8px;
1733
- }
1734
-
1735
- .formal-content {
1736
- color: #333;
1737
- font-size: 14px;
1738
- line-height: 1;
1739
- white-space: pre-wrap;
1740
- }
1741
-
1742
- /* 聊天机器人容器样式 */
1743
- .thinking-chatbot .message {
1744
- border-radius: 12px;
1745
- overflow: visible;
1746
- margin-top: 0 !important;
1747
- margin-bottom: 0 !important;
1748
- }
1749
-
1750
- .thinking-chatbot .message-wrap {
1751
- margin-top: 0 !important;
1752
- margin-bottom: 0 !important;
1753
- }
1754
-
1755
- .thinking-chatbot .message.bot {
1756
- background: transparent !important;
1757
- border: none !important;
1758
- padding: 8px !important;
1759
- }
1760
-
1761
- .thinking-chatbot .message.bot .content {
1762
- background: transparent !important;
1763
- }
1764
- """
1765
-
1766
- introduction = """
1767
- ## Features:
1768
- 1. Chat with single image
1769
- 2. Chat with multiple images
1770
- 3. Chat with video
1771
- 4. Streaming Mode: Real-time response streaming
1772
- 5. Thinking Mode: Show model reasoning process
1773
-
1774
- Click `How to use` tab to see examples.
1775
- """
1776
-
1777
-
1778
- # 主应用
1779
- def create_app():
1780
- with gr.Blocks(css=css) as demo:
1781
- with gr.Tab(model_name):
1782
- with gr.Row():
1783
- with gr.Column(scale=1, min_width=300):
1784
- gr.Markdown(value=introduction)
1785
- params_form = create_component(form_radio, comp='Radio')
1786
- thinking_mode = create_component(thinking_checkbox, comp='Checkbox')
1787
- streaming_mode = create_component(streaming_checkbox, comp='Checkbox')
1788
-
1789
- fps_setting = create_component(fps_slider, comp='Slider')
1790
- regenerate = create_component({'value': 'Regenerate'}, comp='Button')
1791
- clear_button = create_component({'value': 'Clear History'}, comp='Button')
1792
-
1793
- stop_button = gr.Button("Stop", visible=False)
1794
-
1795
- with gr.Column(scale=3, min_width=500):
1796
- initial_session_id = uuid.uuid4().hex[:16]
1797
- print(f"[会话] 初始化会话,生成session_id: {initial_session_id}")
1798
- app_session = gr.State({
1799
- 'sts': None, 'ctx': [], 'images_cnt': 0, 'videos_cnt': 0,
1800
- 'chat_type': 'Chat', 'stop_streaming': False, 'is_streaming': False,
1801
- 'session_id': initial_session_id, 'media_cache': [], 'last_thinking_mode': False
1802
- })
1803
- with gr.Row():
1804
- with gr.Column(scale=4):
1805
- chat_bot = gr.Chatbot(
1806
- label=f"Chat with {model_name}",
1807
- value=copy.deepcopy(init_conversation),
1808
- height=600,
1809
- elem_classes="thinking-chatbot"
1810
- )
1811
- with gr.Column(scale=1, min_width=200):
1812
- current_images = gr.Gallery(
1813
- label="Current Media",
1814
- show_label=True,
1815
- elem_id="current_media",
1816
- elem_classes="current-media-gallery",
1817
- columns=1,
1818
- rows=1, # 设为1行,让内容可以垂直滚动
1819
- height=600,
1820
- visible=False,
1821
- container=True, # 启用容器模式
1822
- allow_preview=True, # 允许预览
1823
- show_download_button=False, # 隐藏下载按钮
1824
- interactive=False, # 禁用交互,防止用户上传/删除
1825
- show_share_button=False # 隐藏分享按钮
1826
- )
1827
-
1828
- with gr.Tab("Chat") as chat_tab:
1829
- chat_tab_label = gr.Textbox(value="Chat", interactive=False, visible=False)
1830
-
1831
- with gr.Row():
1832
- with gr.Column(scale=4):
1833
- txt_input = gr.Textbox(
1834
- placeholder="Type your message here...",
1835
- label="Message",
1836
- lines=2
1837
- )
1838
- with gr.Column(scale=1):
1839
- submit_btn = gr.Button("Submit", variant="primary")
1840
-
1841
- with gr.Row():
1842
- with gr.Column():
1843
- file_upload = create_multimodal_input()
1844
- # 添加图片预览组件
1845
- file_preview = gr.Gallery(
1846
- label="Uploaded Files Preview",
1847
- show_label=True,
1848
- elem_id="file_preview",
1849
- columns=3,
1850
- rows=2,
1851
- height="auto",
1852
- visible=False
1853
- )
1854
-
1855
- # 添加文件上传时的预览更新
1856
- def update_file_preview(files):
1857
- if files:
1858
- # 过滤出图片文件进行预览
1859
- image_files = []
1860
- for file in files:
1861
- if hasattr(file, 'name'):
1862
- file_path = file.name
1863
- else:
1864
- file_path = str(file)
1865
-
1866
- # 检查是否是图片文件
1867
- if any(file_path.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']):
1868
- image_files.append(file_path)
1869
-
1870
- if image_files:
1871
- return gr.update(value=image_files, visible=True)
1872
-
1873
- return gr.update(value=[], visible=False)
1874
-
1875
- file_upload.change(
1876
- update_file_preview,
1877
- inputs=[file_upload],
1878
- outputs=[file_preview]
1879
- )
1880
-
1881
- # 创建一个包装函数来处理新的输入格式
1882
- def handle_submit(message, files, chat_bot, current_images_gallery, app_session, params_form, thinking_mode, streaming_mode, fps_setting):
1883
- print(f"[handle_submit] 收到输入: message='{message}', files={files}, chat_bot长度={len(chat_bot)}")
1884
-
1885
- # 如果消息为空且没有文件,直接返回
1886
- if not message and not files:
1887
- print("[handle_submit] 消息和文件都为空,直接返回")
1888
- return message, files, chat_bot, current_images_gallery, app_session, gr.update(visible=False)
1889
-
1890
- # 模拟原来的 MultimodalInput 格式
1891
- class MockInput:
1892
- def __init__(self, text, files):
1893
- self.text = text
1894
- self.files = files if files else []
1895
-
1896
- mock_question = MockInput(message, files)
1897
- print(f"[handle_submit] 创建MockInput: text='{mock_question.text}', files={len(mock_question.files)}")
1898
-
1899
- # respond 函数返回生成器,我们需要逐步yield结果
1900
- result_generator = respond(mock_question, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting)
1901
-
1902
- # 如果是生成器,逐步yield
1903
- if hasattr(result_generator, '__iter__') and not isinstance(result_generator, (str, bytes, tuple)):
1904
- print("[handle_submit] 使用生成器模式")
1905
- for result in result_generator:
1906
- new_file_input, updated_chat_bot, updated_app_session, stop_btn_update = result
1907
- print(f"[handle_submit] yield结果: chat_bot长度={len(updated_chat_bot)}")
1908
-
1909
- # 更新媒体显示
1910
- media_gallery_update = update_media_gallery(updated_app_session)
1911
-
1912
- # 返回正确的输出格式
1913
- yield "", None, updated_chat_bot, media_gallery_update, updated_app_session, stop_btn_update
1914
- else:
1915
- print("[handle_submit] 使用非生成器模式")
1916
- # 如果不是生成器,直接返回
1917
- new_file_input, updated_chat_bot, updated_app_session, stop_btn_update = result_generator
1918
- print(f"[handle_submit] 直接返回结果: chat_bot长度={len(updated_chat_bot)}")
1919
-
1920
- # 更新图片显示
1921
- image_gallery_update = update_image_gallery(updated_app_session)
1922
-
1923
- yield "", None, updated_chat_bot, image_gallery_update, updated_app_session, stop_btn_update
1924
-
1925
- submit_btn.click(
1926
- handle_submit,
1927
- [txt_input, file_upload, chat_bot, current_images, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
1928
- [txt_input, file_upload, chat_bot, current_images, app_session, stop_button]
1929
- )
1930
-
1931
- with gr.Tab("Few Shot", visible=False) as fewshot_tab:
1932
- fewshot_tab_label = gr.Textbox(value="Few Shot", interactive=False, visible=False)
1933
- with gr.Row():
1934
- with gr.Column(scale=1):
1935
- image_input = gr.Image(type="filepath", sources=["upload"])
1936
- with gr.Column(scale=3):
1937
- user_message = gr.Textbox(label="User")
1938
- assistant_message = gr.Textbox(label="Assistant")
1939
- with gr.Row():
1940
- add_demonstration_button = gr.Button("Add Example")
1941
- generate_button = gr.Button(value="Generate", variant="primary")
1942
-
1943
- add_demonstration_button.click(
1944
- fewshot_add_demonstration,
1945
- [image_input, user_message, assistant_message, chat_bot, app_session],
1946
- [image_input, user_message, assistant_message, chat_bot, app_session]
1947
- )
1948
- generate_button.click(
1949
- fewshot_respond,
1950
- [image_input, user_message, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
1951
- [image_input, user_message, assistant_message, chat_bot, app_session]
1952
- )
1953
-
1954
- chat_tab.select(
1955
- select_chat_type,
1956
- [chat_tab_label, app_session],
1957
- [app_session]
1958
- )
1959
- chat_tab.select(
1960
- clear,
1961
- [txt_input, file_upload, chat_bot, app_session],
1962
- [txt_input, file_upload, file_preview, chat_bot, app_session, image_input, user_message, assistant_message]
1963
- )
1964
- fewshot_tab.select(
1965
- select_chat_type,
1966
- [fewshot_tab_label, app_session],
1967
- [app_session]
1968
- )
1969
- fewshot_tab.select(
1970
- clear,
1971
- [txt_input, file_upload, chat_bot, app_session],
1972
- [txt_input, file_upload, file_preview, chat_bot, app_session, image_input, user_message, assistant_message]
1973
- )
1974
- # chat_bot.flushed(flushed, outputs=[txt_input]) # 标准 Chatbot 可能不支持 flushed
1975
-
1976
- params_form.change(
1977
- update_streaming_mode_state,
1978
- inputs=[params_form],
1979
- outputs=[streaming_mode]
1980
- )
1981
-
1982
- regenerate.click(
1983
- regenerate_button_clicked,
1984
- [txt_input, image_input, user_message, assistant_message, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
1985
- [txt_input, image_input, user_message, assistant_message, chat_bot, app_session]
1986
- )
1987
- clear_button.click(
1988
- clear,
1989
- [txt_input, file_upload, chat_bot, app_session],
1990
- [txt_input, file_upload, file_preview, current_images, chat_bot, app_session, image_input, user_message, assistant_message]
1991
- )
1992
-
1993
- stop_button.click(
1994
- stop_button_clicked,
1995
- [app_session],
1996
- [app_session, stop_button]
1997
- )
1998
-
1999
- return demo
2000
-
2001
 
2002
  if __name__ == "__main__":
2003
- # 解析命令行参数
2004
- parser = argparse.ArgumentParser(description='Web Demo for MiniCPM-V 4.5')
2005
- parser.add_argument('--port', type=int, default=7860, help='Port to run the web demo on')
2006
- parser.add_argument('--no-parallel-encoding', action='store_true', help='Disable parallel image encoding')
2007
- parser.add_argument('--parallel-processes', type=int, default=None, help='Number of parallel processes for image encoding')
2008
- args = parser.parse_args()
2009
-
2010
- # 配置并行编码
2011
- if args.no_parallel_encoding:
2012
- ENABLE_PARALLEL_ENCODING = False
2013
- print("[性能优化] 并行图像编码已禁用")
2014
- else:
2015
- ENABLE_PARALLEL_ENCODING = True
2016
- print("[性能优化] 并行图像编码已启用")
2017
-
2018
- if args.parallel_processes:
2019
- PARALLEL_PROCESSES = args.parallel_processes
2020
- print(f"[性能优化] 设置并行进程数为: {PARALLEL_PROCESSES}")
2021
- else:
2022
- print(f"[性能优化] 自动检测并行进程数,CPU核心数: {mp.cpu_count()}")
2023
-
2024
- # 初始化模型
2025
- initialize_model()
2026
-
2027
- # 创建并启动应用
2028
- demo = create_app()
2029
- demo.launch(
2030
- share=False,
2031
- debug=True,
2032
- show_api=False,
2033
- server_port=args.port,
2034
- server_name="0.0.0.0"
2035
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ from gpt4all import GPT4All
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
# Path to the local GGUF model weights.
# Requires the .gguf file to be placed in the "models/" folder; if it is
# missing, responder() catches the load failure and shows a friendly error.
MODEL_PATH = "models/mistral-7b-openorca.Q4_0.gguf"
6
 
7
def responder(pregunta, modo):
    """Answer a chemistry/physics question with the local GPT4All model.

    Args:
        pregunta: The user's question, free text.
        modo: Teaching mode — "Didáctico", "Paso a paso" or "Examen".
            Any other value (e.g. "Referencias") falls back to the
            "famous scientists references" prompt, matching the original
            else branch.

    Returns:
        str: The generated answer, or a user-facing error message when the
        model file is missing or generation fails.
    """
    # Prompt template per mode; dict dispatch replaces the if/elif chain.
    # The template text must stay byte-identical to the original prompts.
    plantillas = {
        "Didáctico": "Explica de manera sencilla para un estudiante: {}",
        "Paso a paso": "Resuelve paso a paso: {}",
        "Examen": "Responde de forma breve, como si fuera un examen: {}",
    }
    plantilla = plantillas.get(
        modo, "Responde con referencias de científicos famosos: {}"
    )
    prompt = plantilla.format(pregunta)

    try:
        # Fix: the original constructed GPT4All(MODEL_PATH) on EVERY call,
        # reloading the multi-GB .gguf weights per request. Cache the
        # instance on the function object so the load happens once.
        if getattr(responder, "_model", None) is None:
            responder._model = GPT4All(MODEL_PATH)
        model = responder._model
        # A fresh chat session per request keeps calls independent, as in
        # the original code.
        with model.chat_session() as session:
            # NOTE(review): assumes chat_session() yields an object exposing
            # .generate(), as the original code did — confirm against the
            # installed gpt4all version (some versions use model.generate()
            # inside the session instead).
            return session.generate(prompt, max_tokens=256)
    except Exception as e:
        # UI boundary handler: surface the failure as the response text
        # instead of crashing the Gradio app. Message is user-facing Spanish.
        return f"⚠️ Error: {str(e)}\nAsegúrate de poner el archivo .gguf en la carpeta 'models/'."
25
+
26
# Gradio interface: one question textbox plus a mode radio selector, wired
# to responder(). Labels, choices, title and description are user-facing
# strings and intentionally kept in Spanish. Note the "Referencias" choice
# is handled by responder()'s fallback branch.
demo = gr.Interface(
    fn=responder,
    inputs=[
        gr.Textbox(label="Tu pregunta de Química o Física"),
        gr.Radio(["Didáctico", "Paso a paso", "Examen", "Referencias"], label="Modo")
    ],
    outputs=gr.Textbox(label="Respuesta IA"),
    title="ExploraLab — Tutor IA de Química y Física",
    description="Tu asistente académico gratuito para aprender ciencias."
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
# Script entry point: launch the Gradio server (blocks until shutdown).
if __name__ == "__main__":
    demo.launch()