Spaces:

s880453
/

netzerointerview-ragsystem

Sleeping

App Files Files Community

s880453 committed on Aug 13, 2025

Commit

070f232

verified ·

1 Parent(s): bbb1469

Update app.py

Browse files

Files changed (1) hide show

app.py +197 -46

app.py CHANGED Viewed

@@ -225,9 +225,11 @@ def intelligent_routing_and_reranking(query: str, selected_speakers: List[str],
             if item['speaker'] in INTERVIEWERS:
                 continue
-            # 受訪者過濾
-            if selected_speakers and item['speaker'] not in selected_speakers:
-                continue
             # 計算向量相似度
             item_vector = np.array(item['embedding'])
@@ -305,8 +307,8 @@ def intelligent_routing_and_reranking(query: str, selected_speakers: List[str],
             # 按加權分數排序
             final_results.sort(key=lambda x: x.weighted_score, reverse=True)
-            # Step 5: 上下文擴展（turn_index ±10）
-            expanded_results = expand_context_by_turn_index(final_results[:5])
             return expanded_results
@@ -327,37 +329,52 @@ def intelligent_routing_and_reranking(query: str, selected_speakers: List[str],
         print(f"智慧路由失敗: {str(e)}")
         return []
-def expand_context_by_turn_index(search_results: List[SearchResult], context_window: int = 10) -> List[SearchResult]:
-    """根據 turn_index 擴展上下文"""
     expanded_results = []
     added_indexes = set()
     for result in search_results:
         # 添加原始結果
-        if result.turn_index not in added_indexes:
             expanded_results.append(result)
-            added_indexes.add(result.turn_index)
-        # 查找前後文
         target_turn = result.turn_index
         for item in dataset:
             item_turn = item.get('turn_index', 0)
             # 檢查是否在範圍內
-            if abs(item_turn - target_turn) <= context_window and item_turn not in added_indexes:
-                # 檢查是否為同一發言人或相關發言人
-                if item['speaker'] not in INTERVIEWERS:
-                    context_result = SearchResult(
-                        text=item.get('text', ''),
-                        speaker=item.get('speaker', ''),
-                        turn_index=item_turn,
-                        file_id=item.get('file_id', ''),
-                        vector_score=0.0,
-                        llm_score=0.0,
-                        weighted_score=result.weighted_score * 0.5  # 上下文權重降低
-                    )
-                    expanded_results.append(context_result)
-                    added_indexes.add(item_turn)
     return expanded_results
@@ -424,8 +441,8 @@ def rag_chat(question, selected_speakers, history):
         for i, raw_context in enumerate(raw_contexts[:3], 1):
             # 確保 raw_context 是字串且有內容
             if raw_context and raw_context != "未能提取原始內容":
-                # 截取前500個字元，如果內容較短則顯示全部
-                display_text = raw_context if len(raw_context) <= 500 else f"{raw_context[:500]}..."
                 answer_with_sources += f"\n**來源 {i}:**\n{display_text}\n"
             else:
                 answer_with_sources += f"\n**來源 {i}:** 無內容\n"
@@ -458,12 +475,32 @@ def parse_word_document(file_path):
         print(f"解析文檔失敗: {str(e)}")
         return []
-def single_interviewee_guide_filling(file_path, selected_speakers):
     """單一受訪者訪綱填答 - 整合冠軍策略"""
     if not init_success:
         return None, "系統尚未初始化"
     try:
         # 解析 Word 訪綱
         questions = parse_word_document(file_path)
@@ -474,6 +511,7 @@ def single_interviewee_guide_filling(file_path, selected_speakers):
         output_doc = Document()
         output_doc.add_heading('訪談訪綱 - AI 智慧填答', 0)
         output_doc.add_paragraph(f'處理時間：{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
         output_doc.add_paragraph(f'選擇的受訪者：{", ".join(selected_speakers) if selected_speakers else "全部"}')
         output_doc.add_paragraph(f'使用技術：Multilingual-E5-Large + GPT-4o-mini + 冠軍級 RAG')
         output_doc.add_paragraph('')
@@ -539,7 +577,7 @@ def single_interviewee_guide_filling(file_path, selected_speakers):
                     p = output_doc.add_paragraph()
                     p.add_run(f"{j}. [{raw['speaker']} - Turn {raw['turn_index']}] ").bold = True
                     p.add_run(f"(相關性: {raw['score']:.3f})\n")
-                    p.add_run(f"{raw['text'][:500]}...")
             else:
                 output_doc.add_paragraph("未找到相關內容")
@@ -555,15 +593,78 @@ def single_interviewee_guide_filling(file_path, selected_speakers):
         output_doc.save(output_buffer)
         output_buffer.seek(0)
-        output_filename = f"filled_guide_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
         with open(output_filename, 'wb') as f:
             f.write(output_buffer.getvalue())
-        return output_filename, "訪綱填答完成！使用冠軍級 RAG 策略"
     except Exception as e:
         return None, f"處理失敗：{str(e)}"
 # ==========================================
 # Gradio 介面
 # ==========================================
@@ -644,37 +745,61 @@ def create_interface():
                 ### 智慧訪綱填答系統
                 **特色功能：**
                 - 使用冠軍級 RAG 策略
                 - 每個問題獨立處理
                 - 顯示原始 RAG 檢索內容
-                - 加權評分機制
                 """)
                 with gr.Row():
                     with gr.Column():
                         guide_speakers = gr.CheckboxGroup(
                             choices=[],
-                            label="選擇受訪者",
-                            info="建議選擇單一受訪者以獲得最佳效果"
                         )
-                        file_input = gr.File(
-                            label="上傳訪綱 (Word 格式)",
-                            file_types=[".docx"]
                         )
-                        process_btn = gr.Button("開始處理", variant="primary")
                     with gr.Column():
                         process_status = gr.Textbox(
                             label="處理狀態",
-                            interactive=False
                         )
-                        download_file = gr.File(
-                            label="下載結果",
                             visible=False
                         )
         # 技術細節
         with gr.Accordion("🔧 技術細節", open=False):
@@ -708,17 +833,35 @@ def create_interface():
         def clear_chat():
             return []
-        def process_guide(file, speakers):
             if not file:
                 return "請上傳文件", gr.File(visible=False)
-            result_file, status = single_interviewee_guide_filling(file.name, speakers)
             if result_file:
                 return status, gr.File(value=result_file, visible=True)
             else:
                 return status, gr.File(visible=False)
         def update_status():
             success, message = initialize_system()
             if success:
@@ -744,10 +887,18 @@ def create_interface():
         clear_btn.click(clear_chat, outputs=[chatbot])
-        process_btn.click(
-            process_guide,
-            inputs=[file_input, guide_speakers],
-            outputs=[process_status, download_file]
         )
         init_btn.click(

             if item['speaker'] in INTERVIEWERS:
                 continue
+            # 重要修正：嚴格的受訪者過濾
+            # 如果有選擇特定受訪者，只使用該受訪者的資料
+            if selected_speakers and len(selected_speakers) > 0:
+                if item['speaker'] not in selected_speakers:
+                    continue  # 跳過不在選擇列表中的受訪者
             # 計算向量相似度
             item_vector = np.array(item['embedding'])
             # 按加權分數排序
             final_results.sort(key=lambda x: x.weighted_score, reverse=True)
+            # Step 5: 上下文擴展（只擴展相同受訪者的內容）
+            expanded_results = expand_context_by_turn_index(final_results[:5], selected_speakers)
             return expanded_results
         print(f"智慧路由失敗: {str(e)}")
         return []
+def expand_context_by_turn_index(search_results: List[SearchResult], selected_speakers: List[str] = None, context_window: int = 10) -> List[SearchResult]:
+    """根據 turn_index 擴展上下文 - 只擴展相同受訪者的內容"""
     expanded_results = []
     added_indexes = set()
     for result in search_results:
         # 添加原始結果
+        key = f"{result.speaker}_{result.turn_index}"
+        if key not in added_indexes:
             expanded_results.append(result)
+            added_indexes.add(key)
+        # 查找前後文 - 只從相同受訪者的內容中查找
         target_turn = result.turn_index
+        target_speaker = result.speaker
         for item in dataset:
             item_turn = item.get('turn_index', 0)
+            item_speaker = item.get('speaker', '')
+            # 重要：確保是同一個受訪者的內容
+            if item_speaker != target_speaker:
+                continue
             # 檢查是否在範圍內
+            if abs(item_turn - target_turn) <= context_window:
+                key = f"{item_speaker}_{item_turn}"
+                if key not in added_indexes:
+                    # 確保不是採訪者
+                    if item_speaker not in INTERVIEWERS:
+                        context_result = SearchResult(
+                            text=item.get('text', ''),
+                            speaker=item_speaker,
+                            turn_index=item_turn,
+                            file_id=item.get('file_id', ''),
+                            vector_score=0.0,
+                            llm_score=0.0,
+                            weighted_score=result.weighted_score * 0.5  # 上下文權重降低
+                        )
+                        expanded_results.append(context_result)
+                        added_indexes.add(key)
+    # 再次過濾，確保只包含選定的受訪者
+    if selected_speakers and len(selected_speakers) > 0:
+        filtered_results = [r for r in expanded_results if r.speaker in selected_speakers]
+        return filtered_results
     return expanded_results
         for i, raw_context in enumerate(raw_contexts[:3], 1):
             # 確保 raw_context 是字串且有內容
             if raw_context and raw_context != "未能提取原始內容":
+                # 截取前300個字元，如果內容較短則顯示全部
+                display_text = raw_context if len(raw_context) <= 300 else f"{raw_context[:300]}..."
                 answer_with_sources += f"\n**來源 {i}:**\n{display_text}\n"
             else:
                 answer_with_sources += f"\n**來源 {i}:** 無內容\n"
         print(f"解析文檔失敗: {str(e)}")
         return []
+def extract_speaker_from_filename(filename, available_speakers):
+    """Return the known speaker whose name appears in the file name.
+
+    Only the base name without its extension is searched, so directory
+    components never produce a match. When several names match (for example
+    one name is contained in another), the longest one wins so the more
+    specific speaker is chosen; ties keep the order of ``available_speakers``.
+
+    Args:
+        filename: Path or bare name of the uploaded guide file.
+        available_speakers: Collection of candidate speaker names.
+
+    Returns:
+        A one-element list holding the matched speaker, or ``None`` when the
+        file name is empty, no candidates are given, or nothing matches.
+    """
+    import os
+    if not filename or not available_speakers:
+        return None
+    stem = os.path.splitext(os.path.basename(filename))[0]
+    # Empty names are skipped: "" is a substring of every string and would
+    # otherwise match any file.
+    matches = [name for name in available_speakers if name and name in stem]
+    if not matches:
+        return None
+    # max() returns the first maximal item, so equal-length matches fall back
+    # to the order of available_speakers.
+    return [max(matches, key=len)]
+def single_interviewee_guide_filling(file_path, selected_speakers, file_name=None):
     """單一受訪者訪綱填答 - 整合冠軍策略"""
     if not init_success:
         return None, "系統尚未初始化"
     try:
+        # 如果提供了檔案名稱，嘗試從中提取受訪者
+        if file_name:
+            detected_speakers = extract_speaker_from_filename(file_name, all_speakers)
+            if detected_speakers:
+                selected_speakers = detected_speakers
+                print(f"從檔名 '{file_name}' 中檢測到受訪者: {detected_speakers[0]}")
         # 解析 Word 訪綱
         questions = parse_word_document(file_path)
         output_doc = Document()
         output_doc.add_heading('訪談訪綱 - AI 智慧填答', 0)
         output_doc.add_paragraph(f'處理時間：{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
+        output_doc.add_paragraph(f'原始檔案：{file_name if file_name else "未知"}')
         output_doc.add_paragraph(f'選擇的受訪者：{", ".join(selected_speakers) if selected_speakers else "全部"}')
         output_doc.add_paragraph(f'使用技術：Multilingual-E5-Large + GPT-4o-mini + 冠軍級 RAG')
         output_doc.add_paragraph('')
                     p = output_doc.add_paragraph()
                     p.add_run(f"{j}. [{raw['speaker']} - Turn {raw['turn_index']}] ").bold = True
                     p.add_run(f"(相關性: {raw['score']:.3f})\n")
+                    p.add_run(f"{raw['text'][:300]}...")
             else:
                 output_doc.add_paragraph("未找到相關內容")
         output_doc.save(output_buffer)
         output_buffer.seek(0)
+        # 生成輸出檔名
+        speaker_suffix = f"_{selected_speakers[0]}" if len(selected_speakers) == 1 else "_多位"
+        output_filename = f"filled_guide{speaker_suffix}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
         with open(output_filename, 'wb') as f:
             f.write(output_buffer.getvalue())
+        return output_filename, f"訪綱填答完成！檔案：{output_filename}"
     except Exception as e:
         return None, f"處理失敗：{str(e)}"
+def batch_process_guides(files, default_speakers):
+    """Fill several interview-guide files one after another.
+
+    For each file the speaker is taken from the file name when
+    ``extract_speaker_from_filename`` finds one, otherwise
+    ``default_speakers`` is used. A failure on one file is recorded in the
+    report and does not stop the remaining files.
+
+    Args:
+        files: Uploaded files; each item is either an object with a ``name``
+            attribute or a plain path value.
+        default_speakers: Speakers to use when the file name names nobody.
+
+    Returns:
+        A ``(output_files, summary)`` tuple: the list of generated output
+        files and a multi-line status report. On an unexpected failure of
+        the batch itself the list is empty and the summary holds the error.
+    """
+    if not init_success:
+        return [], "系統尚未初始化"
+    report_lines = []
+    generated = []
+    try:
+        file_count = len(files)
+        print(f"開始批量處理 {file_count} 個檔案")
+        for position, upload in enumerate(files, 1):
+            try:
+                has_name = hasattr(upload, 'name')
+                display_name = upload.name if has_name else str(upload)
+                print(f"\n處理檔案 {position}/{file_count}: {display_name}")
+                # A speaker named in the file name takes precedence over the default selection
+                matched = extract_speaker_from_filename(display_name, all_speakers)
+                if matched:
+                    note = f"檔案 {position}: 從檔名檢測到受訪者 {matched[0]}"
+                else:
+                    note = f"檔案 {position}: 使用預設受訪者"
+                print(note)
+                report_lines.append(note)
+                # Delegate the actual filling to the single-file routine
+                produced, detail = single_interviewee_guide_filling(
+                    upload.name if has_name else upload,
+                    matched if matched else default_speakers,
+                    display_name,
+                )
+                if produced:
+                    generated.append(produced)
+                    report_lines.append(f"✅ {display_name} 處理成功 -> {produced}")
+                else:
+                    report_lines.append(f"❌ {display_name} 處理失敗: {detail}")
+                # Pause after each file to avoid overloading the system
+                time.sleep(1)
+            except Exception as err:
+                failure = f"❌ 檔案 {position} 處理錯誤: {str(err)}"
+                print(failure)
+                report_lines.append(failure)
+        # Build the overall summary
+        header = f"\n處理完成！\n成功: {len(generated)}/{file_count} 個檔案\n"
+        details = "\n詳細結果：\n" + "\n".join(report_lines)
+        return generated, header + details
+    except Exception as err:
+        return [], f"批量處理失敗：{str(err)}"
 # ==========================================
 # Gradio 介面
 # ==========================================
                 ### 智慧訪綱填答系統
                 **特色功能：**
+                - 支援批量處理（最多15個檔案）
+                - 自動從檔名識別受訪者
                 - 使用冠軍級 RAG 策略
                 - 每個問題獨立處理
                 - 顯示原始 RAG 檢索內容
+                **檔名識別規則：**
+                - 如果檔名包含受訪者姓名（如：`訪綱_陳美玲.docx`），系統會自動使用該受訪者的資料
+                - 如果檔名未包含受訪者姓名，則使用您勾選的受訪者
                 """)
                 with gr.Row():
                     with gr.Column():
                         guide_speakers = gr.CheckboxGroup(
                             choices=[],
+                            label="預設受訪者（當檔名未指定時使用）",
+                            info="檔名中有受訪者名稱時會自動覆蓋此選擇"
                         )
+                        # 單檔上傳（保留原功能）
+                        with gr.Accordion("單檔處理", open=False):
+                            single_file_input = gr.File(
+                                label="上傳單個訪綱 (Word 格式)",
+                                file_types=[".docx"]
+                            )
+                            single_process_btn = gr.Button("處理單檔", variant="secondary")
+                        # 批量上傳（新功能）
+                        batch_file_input = gr.File(
+                            label="批量上傳訪綱（最多15個 Word 檔案）",
+                            file_types=[".docx"],
+                            file_count="multiple"
                         )
+                        batch_process_btn = gr.Button("🚀 批量處理所有檔案", variant="primary", size="lg")
                     with gr.Column():
                         process_status = gr.Textbox(
                             label="處理狀態",
+                            interactive=False,
+                            lines=10
                         )
+                        # 單檔下載
+                        single_download_file = gr.File(
+                            label="下載單檔結果",
                             visible=False
                         )
+                        # 批量下載
+                        batch_download_files = gr.File(
+                            label="下載所有結果",
+                            visible=False,
+                            file_count="multiple"
+                        )
         # 技術細節
         with gr.Accordion("🔧 技術細節", open=False):
         def clear_chat():
             return []
+        def process_single_guide(file, speakers):
+            """處理單個檔案"""
             if not file:
                 return "請上傳文件", gr.File(visible=False)
+            result_file, status = single_interviewee_guide_filling(file.name, speakers, file.name)
             if result_file:
                 return status, gr.File(value=result_file, visible=True)
             else:
                 return status, gr.File(visible=False)
+        def process_batch_guides(files, speakers):
+            """Gradio callback for the batch button: fill every uploaded guide.
+
+            Returns a status string plus a ``gr.File`` that is visible and
+            holds the generated files only when at least one was produced.
+            """
+            # Nothing uploaded
+            if not files:
+                return "請上傳至少一個檔案", gr.File(visible=False)
+            # Enforce the 15-file limit
+            uploaded_count = len(files)
+            if uploaded_count > 15:
+                return f"檔案數量超過限制（最多15個），您上傳了 {uploaded_count} 個", gr.File(visible=False)
+            # Run the batch and expose the downloads only on success
+            generated, report = batch_process_guides(files, speakers)
+            if not generated:
+                return report, gr.File(visible=False)
+            return report, gr.File(value=generated, visible=True, file_count="multiple")
         def update_status():
             success, message = initialize_system()
             if success:
         clear_btn.click(clear_chat, outputs=[chatbot])
+        # 單檔處理
+        single_process_btn.click(
+            process_single_guide,
+            inputs=[single_file_input, guide_speakers],
+            outputs=[process_status, single_download_file]
+        )
+        # 批量處理
+        batch_process_btn.click(
+            process_batch_guides,
+            inputs=[batch_file_input, guide_speakers],
+            outputs=[process_status, batch_download_files]
         )
         init_btn.click(