s880453 committed on
Commit
070f232
·
verified ·
1 Parent(s): bbb1469

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +197 -46
app.py CHANGED
@@ -225,9 +225,11 @@ def intelligent_routing_and_reranking(query: str, selected_speakers: List[str],
225
  if item['speaker'] in INTERVIEWERS:
226
  continue
227
 
228
- # 受訪者過濾
229
- if selected_speakers and item['speaker'] not in selected_speakers:
230
- continue
 
 
231
 
232
  # 計算向量相似度
233
  item_vector = np.array(item['embedding'])
@@ -305,8 +307,8 @@ def intelligent_routing_and_reranking(query: str, selected_speakers: List[str],
305
  # 按加權分數排序
306
  final_results.sort(key=lambda x: x.weighted_score, reverse=True)
307
 
308
- # Step 5: 上下文擴展(turn_index ±10)
309
- expanded_results = expand_context_by_turn_index(final_results[:5])
310
 
311
  return expanded_results
312
 
@@ -327,37 +329,52 @@ def intelligent_routing_and_reranking(query: str, selected_speakers: List[str],
327
  print(f"智慧路由失敗: {str(e)}")
328
  return []
329
 
330
- def expand_context_by_turn_index(search_results: List[SearchResult], context_window: int = 10) -> List[SearchResult]:
331
- """根據 turn_index 擴展上下文"""
332
  expanded_results = []
333
  added_indexes = set()
334
 
335
  for result in search_results:
336
  # 添加原始結果
337
- if result.turn_index not in added_indexes:
 
338
  expanded_results.append(result)
339
- added_indexes.add(result.turn_index)
340
 
341
- # 查找前後文
342
  target_turn = result.turn_index
 
 
343
  for item in dataset:
344
  item_turn = item.get('turn_index', 0)
 
345
 
 
 
 
 
346
  # 檢查是否在範圍內
347
- if abs(item_turn - target_turn) <= context_window and item_turn not in added_indexes:
348
- # 檢查是否為同一發言人或相關發言人
349
- if item['speaker'] not in INTERVIEWERS:
350
- context_result = SearchResult(
351
- text=item.get('text', ''),
352
- speaker=item.get('speaker', ''),
353
- turn_index=item_turn,
354
- file_id=item.get('file_id', ''),
355
- vector_score=0.0,
356
- llm_score=0.0,
357
- weighted_score=result.weighted_score * 0.5 # 上下文權重降低
358
- )
359
- expanded_results.append(context_result)
360
- added_indexes.add(item_turn)
 
 
 
 
 
 
 
361
 
362
  return expanded_results
363
 
@@ -424,8 +441,8 @@ def rag_chat(question, selected_speakers, history):
424
  for i, raw_context in enumerate(raw_contexts[:3], 1):
425
  # 確保 raw_context 是字串且有內容
426
  if raw_context and raw_context != "未能提取原始內容":
427
- # 截取前500個字元,如果內容較短則顯示全部
428
- display_text = raw_context if len(raw_context) <= 500 else f"{raw_context[:500]}..."
429
  answer_with_sources += f"\n**來源 {i}:**\n{display_text}\n"
430
  else:
431
  answer_with_sources += f"\n**來源 {i}:** 無內容\n"
@@ -458,12 +475,32 @@ def parse_word_document(file_path):
458
  print(f"解析文檔失敗: {str(e)}")
459
  return []
460
 
461
- def single_interviewee_guide_filling(file_path, selected_speakers):
 
 
 
 
 
 
 
 
 
 
 
 
 
462
  """單一受訪者訪綱填答 - 整合冠軍策略"""
463
  if not init_success:
464
  return None, "系統尚未初始化"
465
 
466
  try:
 
 
 
 
 
 
 
467
  # 解析 Word 訪綱
468
  questions = parse_word_document(file_path)
469
 
@@ -474,6 +511,7 @@ def single_interviewee_guide_filling(file_path, selected_speakers):
474
  output_doc = Document()
475
  output_doc.add_heading('訪談訪綱 - AI 智慧填答', 0)
476
  output_doc.add_paragraph(f'處理時間:{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
 
477
  output_doc.add_paragraph(f'選擇的受訪者:{", ".join(selected_speakers) if selected_speakers else "全部"}')
478
  output_doc.add_paragraph(f'使用技術:Multilingual-E5-Large + GPT-4o-mini + 冠軍級 RAG')
479
  output_doc.add_paragraph('')
@@ -539,7 +577,7 @@ def single_interviewee_guide_filling(file_path, selected_speakers):
539
  p = output_doc.add_paragraph()
540
  p.add_run(f"{j}. [{raw['speaker']} - Turn {raw['turn_index']}] ").bold = True
541
  p.add_run(f"(相關性: {raw['score']:.3f})\n")
542
- p.add_run(f"{raw['text'][:500]}...")
543
  else:
544
  output_doc.add_paragraph("未找到相關內容")
545
 
@@ -555,15 +593,78 @@ def single_interviewee_guide_filling(file_path, selected_speakers):
555
  output_doc.save(output_buffer)
556
  output_buffer.seek(0)
557
 
558
- output_filename = f"filled_guide_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
 
 
 
559
  with open(output_filename, 'wb') as f:
560
  f.write(output_buffer.getvalue())
561
 
562
- return output_filename, "訪綱填答完成!使用冠軍級 RAG 策略"
563
 
564
  except Exception as e:
565
  return None, f"處理失敗:{str(e)}"
566
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
  # ==========================================
568
  # Gradio 介面
569
  # ==========================================
@@ -644,37 +745,61 @@ def create_interface():
644
  ### 智慧訪綱填答系統
645
 
646
  **特色功能:**
 
 
647
  - 使用冠軍級 RAG 策略
648
  - 每個問題獨立處理
649
  - 顯示原始 RAG 檢索內容
650
- - 加權評分機制
 
 
 
651
  """)
652
 
653
  with gr.Row():
654
  with gr.Column():
655
  guide_speakers = gr.CheckboxGroup(
656
  choices=[],
657
- label="選擇受訪者",
658
- info="建議選擇單一受訪者以獲得最佳效果"
659
  )
660
 
661
- file_input = gr.File(
662
- label="上傳訪綱 (Word 格式)",
663
- file_types=[".docx"]
 
 
 
 
 
 
 
 
 
 
664
  )
665
 
666
- process_btn = gr.Button("開始處理", variant="primary")
667
 
668
  with gr.Column():
669
  process_status = gr.Textbox(
670
  label="處理狀態",
671
- interactive=False
 
672
  )
673
 
674
- download_file = gr.File(
675
- label="下載結果",
 
676
  visible=False
677
  )
 
 
 
 
 
 
 
678
 
679
  # 技術細節
680
  with gr.Accordion("🔧 技術細節", open=False):
@@ -708,17 +833,35 @@ def create_interface():
708
  def clear_chat():
709
  return []
710
 
711
- def process_guide(file, speakers):
 
712
  if not file:
713
  return "請上傳文件", gr.File(visible=False)
714
 
715
- result_file, status = single_interviewee_guide_filling(file.name, speakers)
716
 
717
  if result_file:
718
  return status, gr.File(value=result_file, visible=True)
719
  else:
720
  return status, gr.File(visible=False)
721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
  def update_status():
723
  success, message = initialize_system()
724
  if success:
@@ -744,10 +887,18 @@ def create_interface():
744
 
745
  clear_btn.click(clear_chat, outputs=[chatbot])
746
 
747
- process_btn.click(
748
- process_guide,
749
- inputs=[file_input, guide_speakers],
750
- outputs=[process_status, download_file]
 
 
 
 
 
 
 
 
751
  )
752
 
753
  init_btn.click(
 
225
  if item['speaker'] in INTERVIEWERS:
226
  continue
227
 
228
+ # 重要修正:嚴格的受訪者過濾
229
+ # 如果有選擇特定受訪者,只使用該受訪者的資料
230
+ if selected_speakers and len(selected_speakers) > 0:
231
+ if item['speaker'] not in selected_speakers:
232
+ continue # 跳過不在選擇列表中的受訪者
233
 
234
  # 計算向量相似度
235
  item_vector = np.array(item['embedding'])
 
307
  # 按加權分數排序
308
  final_results.sort(key=lambda x: x.weighted_score, reverse=True)
309
 
310
+ # Step 5: 上下文擴展(只擴展相同受訪者的內容)
311
+ expanded_results = expand_context_by_turn_index(final_results[:5], selected_speakers)
312
 
313
  return expanded_results
314
 
 
329
  print(f"智慧路由失敗: {str(e)}")
330
  return []
331
 
332
def expand_context_by_turn_index(search_results: List[SearchResult], selected_speakers: List[str] = None, context_window: int = 10) -> List[SearchResult]:
    """Expand search hits with nearby turns spoken by the same speaker.

    Each hit is kept once, then every ``dataset`` entry from the hit's
    speaker whose ``turn_index`` lies within ``context_window`` turns of
    the hit is appended as a context result. Context results carry zero
    vector/LLM scores and half of the originating hit's weighted score.

    Args:
        search_results: Ranked hits to expand.
        selected_speakers: Optional allow-list; when non-empty, the final
            list is restricted to these speakers.
        context_window: Maximum absolute turn distance for context entries.

    Returns:
        The hits plus their context entries, in insertion order.
    """
    collected = []
    seen_keys = set()  # "<speaker>_<turn_index>" keys already emitted

    for hit in search_results:
        # Keep the hit itself, unless an earlier pass already emitted it.
        hit_key = f"{hit.speaker}_{hit.turn_index}"
        if hit_key not in seen_keys:
            collected.append(hit)
            seen_keys.add(hit_key)

        # NOTE(review): the neighbour scan is assumed to run for every hit,
        # even one that was already collected — confirm against the file's
        # real indentation.
        anchor_turn = hit.turn_index
        anchor_speaker = hit.speaker

        for entry in dataset:
            entry_turn = entry.get('turn_index', 0)
            entry_speaker = entry.get('speaker', '')

            # Context is only ever taken from the hit's own speaker.
            if entry_speaker != anchor_speaker:
                continue

            # Outside the turn window: not context.
            if abs(entry_turn - anchor_turn) > context_window:
                continue

            entry_key = f"{entry_speaker}_{entry_turn}"
            # Skip duplicates and anything said by an interviewer.
            if entry_key in seen_keys or entry_speaker in INTERVIEWERS:
                continue

            collected.append(
                SearchResult(
                    text=entry.get('text', ''),
                    speaker=entry_speaker,
                    turn_index=entry_turn,
                    file_id=entry.get('file_id', ''),
                    vector_score=0.0,
                    llm_score=0.0,
                    weighted_score=hit.weighted_score * 0.5,  # context weighs less than a direct hit
                )
            )
            seen_keys.add(entry_key)

    # Final safety filter: only selected speakers may leave this function.
    if selected_speakers:
        return [entry for entry in collected if entry.speaker in selected_speakers]

    return collected
380
 
 
441
  for i, raw_context in enumerate(raw_contexts[:3], 1):
442
  # 確保 raw_context 是字串且有內容
443
  if raw_context and raw_context != "未能提取原始內容":
444
+ # 截取前300個字元,如果內容較短則顯示全部
445
+ display_text = raw_context if len(raw_context) <= 300 else f"{raw_context[:300]}..."
446
  answer_with_sources += f"\n**來源 {i}:**\n{display_text}\n"
447
  else:
448
  answer_with_sources += f"\n**來源 {i}:** 無內容\n"
 
475
  print(f"解析文檔失敗: {str(e)}")
476
  return []
477
 
478
def extract_speaker_from_filename(filename, available_speakers):
    """Return the interviewee whose name appears in a guide file's name.

    Only the final path component, minus its extension, is searched, so
    directory names never produce a match. When several known names occur
    in the file name, the longest one wins (ties keep the order of
    ``available_speakers``), so a name that is a substring of another name
    cannot shadow it.

    Args:
        filename: Path or bare name of the uploaded guide file.
        available_speakers: Iterable of known interviewee names; may be
            ``None`` or empty.

    Returns:
        A one-element list ``[speaker]`` for the best match, or ``None``
        when nothing matches or either argument is empty.
    """
    import os  # local import keeps the function self-contained

    if not filename or not available_speakers:
        return None

    stem = os.path.splitext(os.path.basename(str(filename)))[0]

    # Skip empty names: "" is a substring of every string and would
    # otherwise match every file.
    candidates = [name for name in available_speakers if name and name in stem]
    if not candidates:
        return None

    # max() returns the first of equally long names, preserving caller order.
    return [max(candidates, key=len)]
490
+
491
+ def single_interviewee_guide_filling(file_path, selected_speakers, file_name=None):
492
  """單一受訪者訪綱填答 - 整合冠軍策略"""
493
  if not init_success:
494
  return None, "系統尚未初始化"
495
 
496
  try:
497
+ # 如果提供了檔案名稱,嘗試從中提取受訪者
498
+ if file_name:
499
+ detected_speakers = extract_speaker_from_filename(file_name, all_speakers)
500
+ if detected_speakers:
501
+ selected_speakers = detected_speakers
502
+ print(f"從檔名 '{file_name}' 中檢測到受訪者: {detected_speakers[0]}")
503
+
504
  # 解析 Word 訪綱
505
  questions = parse_word_document(file_path)
506
 
 
511
  output_doc = Document()
512
  output_doc.add_heading('訪談訪綱 - AI 智慧填答', 0)
513
  output_doc.add_paragraph(f'處理時間:{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
514
+ output_doc.add_paragraph(f'原始檔案:{file_name if file_name else "未知"}')
515
  output_doc.add_paragraph(f'選擇的受訪者:{", ".join(selected_speakers) if selected_speakers else "全部"}')
516
  output_doc.add_paragraph(f'使用技術:Multilingual-E5-Large + GPT-4o-mini + 冠軍級 RAG')
517
  output_doc.add_paragraph('')
 
577
  p = output_doc.add_paragraph()
578
  p.add_run(f"{j}. [{raw['speaker']} - Turn {raw['turn_index']}] ").bold = True
579
  p.add_run(f"(相關性: {raw['score']:.3f})\n")
580
+ p.add_run(f"{raw['text'][:300]}...")
581
  else:
582
  output_doc.add_paragraph("未找到相關內容")
583
 
 
593
  output_doc.save(output_buffer)
594
  output_buffer.seek(0)
595
 
596
+ # 生成輸出檔名
597
+ speaker_suffix = f"_{selected_speakers[0]}" if len(selected_speakers) == 1 else "_多位"
598
+ output_filename = f"filled_guide{speaker_suffix}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
599
+
600
  with open(output_filename, 'wb') as f:
601
  f.write(output_buffer.getvalue())
602
 
603
+ return output_filename, f"訪綱填答完成!檔案:{output_filename}"
604
 
605
  except Exception as e:
606
  return None, f"處理失敗:{str(e)}"
607
 
608
def batch_process_guides(files, default_speakers):
    """Fill several interview guides in one run.

    For each uploaded file the interviewee is looked up in the file name;
    when none is found, ``default_speakers`` is used instead. Each file is
    handed to ``single_interviewee_guide_filling`` and a per-file status
    line is recorded. A failure on one file is logged and does not stop
    the remaining files.

    Args:
        files: Uploaded files; each is either an object with a ``name``
            attribute or a plain path.
        default_speakers: Speakers to use when the file name names nobody.

    Returns:
        ``(output_paths, summary)``: the generated output files and a
        human-readable report. Returns ``([], message)`` when the system is
        not initialised or the batch fails as a whole.
    """
    if not init_success:
        return [], "系統尚未初始化"

    log_lines = []
    outputs = []

    try:
        total_files = len(files)
        print(f"開始批量處理 {total_files} 個檔案")

        for position, upload in enumerate(files, 1):
            try:
                # Uploads may be file-like objects or bare paths.
                if hasattr(upload, 'name'):
                    source_path = upload.name
                    file_name = source_path
                else:
                    source_path = upload
                    file_name = str(upload)
                print(f"\n處理檔案 {position}/{total_files}: {file_name}")

                # A speaker named in the file name overrides the default selection.
                detected = extract_speaker_from_filename(file_name, all_speakers)
                if detected:
                    speakers_to_use = detected
                    status_msg = f"檔案 {position}: 從檔名檢測到受訪者 {detected[0]}"
                else:
                    speakers_to_use = default_speakers
                    status_msg = f"檔案 {position}: 使用預設受訪者"

                print(status_msg)
                log_lines.append(status_msg)

                output_file, process_status = single_interviewee_guide_filling(
                    source_path,
                    speakers_to_use,
                    file_name,
                )

                if output_file:
                    outputs.append(output_file)
                    log_lines.append(f"✅ {file_name} 處理成功 -> {output_file}")
                else:
                    log_lines.append(f"❌ {file_name} 處理失敗: {process_status}")

                # Pause between files to avoid overloading the system.
                time.sleep(1)

            except Exception as e:
                # Record the failure and carry on with the next file.
                error_msg = f"❌ 檔案 {position} 處理錯誤: {str(e)}"
                print(error_msg)
                log_lines.append(error_msg)

        # Assemble the final report.
        header = f"\n處理完成!\n成功: {len(outputs)}/{total_files} 個檔案\n"
        details = "\n詳細結果:\n" + "\n".join(log_lines)
        return outputs, header + details

    except Exception as e:
        return [], f"批量處理失敗:{str(e)}"
667
+
668
  # ==========================================
669
  # Gradio 介面
670
  # ==========================================
 
745
  ### 智慧訪綱填答系統
746
 
747
  **特色功能:**
748
+ - 支援批量處理(最多15個檔案)
749
+ - 自動從檔名識別受訪者
750
  - 使用冠軍級 RAG 策略
751
  - 每個問題獨立處理
752
  - 顯示原始 RAG 檢索內容
753
+
754
+ **檔名識別規則:**
755
+ - 如果檔名包含受訪者姓名(如:`訪綱_陳美玲.docx`),系統會自動使用該受訪者的資料
756
+ - 如果檔名未包含受訪者姓名,則使用您勾選的受訪者
757
  """)
758
 
759
  with gr.Row():
760
  with gr.Column():
761
  guide_speakers = gr.CheckboxGroup(
762
  choices=[],
763
+ label="預設受訪者(當檔名未指定時使用)",
764
+ info="檔名中有受訪者名稱時會自動覆蓋此選擇"
765
  )
766
 
767
+ # 單檔上傳(保留原功能)
768
+ with gr.Accordion("單檔處理", open=False):
769
+ single_file_input = gr.File(
770
+ label="上傳單個訪綱 (Word 格式)",
771
+ file_types=[".docx"]
772
+ )
773
+ single_process_btn = gr.Button("處理單檔", variant="secondary")
774
+
775
+ # 批量上傳(新功能)
776
+ batch_file_input = gr.File(
777
+ label="批量上傳訪綱(最多15個 Word 檔案)",
778
+ file_types=[".docx"],
779
+ file_count="multiple"
780
  )
781
 
782
+ batch_process_btn = gr.Button("🚀 批量處理所有檔案", variant="primary", size="lg")
783
 
784
  with gr.Column():
785
  process_status = gr.Textbox(
786
  label="處理狀態",
787
+ interactive=False,
788
+ lines=10
789
  )
790
 
791
+ # 單檔下載
792
+ single_download_file = gr.File(
793
+ label="下載單檔結果",
794
  visible=False
795
  )
796
+
797
+ # 批量下載
798
+ batch_download_files = gr.File(
799
+ label="下載所有結果",
800
+ visible=False,
801
+ file_count="multiple"
802
+ )
803
 
804
  # 技術細節
805
  with gr.Accordion("🔧 技術細節", open=False):
 
833
  def clear_chat():
834
  return []
835
 
836
def process_single_guide(file, speakers):
    """Fill one uploaded guide and return its status plus a download widget.

    The upload's ``name`` is passed to the filling routine as both the path
    to read and the file name used for speaker detection. The download
    component is shown only when an output file was produced.
    """
    if not file:
        return "請上傳文件", gr.File(visible=False)

    upload_path = file.name
    result_file, status = single_interviewee_guide_filling(upload_path, speakers, upload_path)

    if not result_file:
        return status, gr.File(visible=False)
    return status, gr.File(value=result_file, visible=True)
847
 
848
def process_batch_guides(files, speakers):
    """Fill a batch of uploaded guides and return status plus downloads.

    Rejects an empty upload and uploads of more than 15 files, then hands
    the batch to ``batch_process_guides``. The multi-file download
    component is shown only when at least one output file was produced.
    """
    hidden_download = gr.File(visible=False)

    if not files:
        return "請上傳至少一個檔案", hidden_download

    # Hard cap on the batch size.
    file_total = len(files)
    if file_total > 15:
        return f"檔案數量超過限制(最多15個),您上傳了 {file_total} 個", hidden_download

    processed_files, status = batch_process_guides(files, speakers)

    if not processed_files:
        return status, hidden_download
    return status, gr.File(value=processed_files, visible=True, file_count="multiple")
864
+
865
  def update_status():
866
  success, message = initialize_system()
867
  if success:
 
887
 
888
  clear_btn.click(clear_chat, outputs=[chatbot])
889
 
890
+ # 單檔處理
891
+ single_process_btn.click(
892
+ process_single_guide,
893
+ inputs=[single_file_input, guide_speakers],
894
+ outputs=[process_status, single_download_file]
895
+ )
896
+
897
+ # 批量處理
898
+ batch_process_btn.click(
899
+ process_batch_guides,
900
+ inputs=[batch_file_input, guide_speakers],
901
+ outputs=[process_status, batch_download_files]
902
  )
903
 
904
  init_btn.click(