Spaces:

JesseLiu
/

chatbot-mimic-notes

Sleeping

App Files Files Community

Jesse Liu committed on Sep 3, 2025

Commit

b3e7b51

1 Parent(s): 85eb95c

update

Browse files

Files changed (3) hide show

assets/llama-3.2-3b_io.jsonl +0 -0
chatgpt.py +301 -14
evaluation_module_readme.md +155 -0

assets/llama-3.2-3b_io.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

chatgpt.py CHANGED Viewed

@@ -7,6 +7,8 @@ import os
 import requests
 import time
 import threading
 from datetime import datetime, timedelta
 session = requests.Session()
@@ -177,10 +179,14 @@ def submit_text_and_respond(edited_text, api_key, username, selected_title, hist
     print(response)
     if isinstance(response, str):
         history.append((edited_text, response))
-        return history, "", []
     doctor_response = response['doctor_response']
     history.append((edited_text, doctor_response))
-    return history, ""  # Return memory_graph as output
 def set_initialize_button(api_key_input, username_input):
     message = asyncio.run(initialization(api_key_input, username_input))
@@ -200,6 +206,136 @@ def start_recording(audio_file):
     except Exception as e:
         return f"Failed to transcribe: {str(e)}"
 def update_methods(chapter):
     return gr.update(choices=interview_protocols[chapter], value=interview_protocols[chapter][0])
@@ -268,18 +404,11 @@ with gr.Blocks(css=css) as app:
                             data.append(json.loads(line))
                 return data
-            jsonl_path = "/Users/jinhaoduan/workspace/chatbot-mimic-notes/assets/structured_results_o3_with_inputs.jsonl"
-            structured_results = load_jsonl(jsonl_path)
-            notes_path = '/Users/jinhaoduan/workspace/chatbot-mimic-notes/assets/structured_results_o3_md.jsonl'
-            notes = load_jsonl(notes_path)
             options = []
-            for r in structured_results:
-                _note = 'Notes not found.'
-                for _note in notes:
-                    if _note['hadm_id'] == r['hadm_id']:
-                        _note = _note['content']
-                        break
-                options.append({'title': 'Patient:' + str(r['subject_id']), 'text': r['content'] + str(r['raw_note']) + str(r['medications']), 'note': _note})
@@ -351,7 +480,7 @@ with gr.Blocks(css=css) as app:
             submit_button.click(
                 submit_text_and_respond,
                 inputs=[transcription_box, api_key_state, username_input, selected_title, state, chatbot_type_state],
-                outputs=[chatbot, transcription_box]
             )
             # download_button.click(
@@ -396,6 +525,164 @@ with gr.Blocks(css=css) as app:
                 outputs=[predefined_option_text, markdown_display, selected_title]
             )
     app.queue()

 import requests
 import time
 import threading
+import json
+import csv
 from datetime import datetime, timedelta
 session = requests.Session()
     print(response)
     if isinstance(response, str):
         history.append((edited_text, response))
+        # Generate conversation ID for evaluation
+        conversation_id = f"{username}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+        return history, "", conversation_id, edited_text, response
     doctor_response = response['doctor_response']
     history.append((edited_text, doctor_response))
+    # Generate conversation ID for evaluation
+    conversation_id = f"{username}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+    return history, "", conversation_id, edited_text, doctor_response
 def set_initialize_button(api_key_input, username_input):
     message = asyncio.run(initialization(api_key_input, username_input))
     except Exception as e:
         return f"Failed to transcribe: {str(e)}"
+# Evaluation functions
def save_evaluation(conversation_id, user_input, bot_response, rating, feedback, expert_name, categories):
    """Persist one expert evaluation as a JSON file plus a row in the master CSV.

    Args:
        conversation_id: Identifier of the conversation being rated.
        user_input: Text the user sent to the chatbot.
        bot_response: Text the chatbot answered with.
        rating: Overall rating given by the expert.
        feedback: Free-text feedback from the expert.
        expert_name: Name of the evaluating expert.
        categories: Mapping of category name to rating. The CSV row reads the
            keys ``medical_accuracy``, ``clinical_relevance``,
            ``communication_clarity`` and ``safety_considerations``; missing
            keys (or ``None`` for the whole mapping) are written as empty cells.

    Returns:
        str: Confirmation message containing the ISO timestamp of the record.
    """
    timestamp = datetime.now().isoformat()
    # Tolerate a missing mapping instead of failing on ``.get`` below.
    categories = categories or {}
    evaluation = {
        "timestamp": timestamp,
        "conversation_id": conversation_id,
        "expert_name": expert_name,
        "user_input": user_input,
        "bot_response": bot_response,
        "overall_rating": rating,
        "feedback": feedback,
        "categories": categories,
    }

    # exist_ok avoids the check-then-create race when two submissions arrive
    # at the same time.
    eval_dir = "evaluations"
    os.makedirs(eval_dir, exist_ok=True)

    # One JSON file per evaluation; ':' is replaced because it is not a valid
    # filename character on every platform.
    eval_file = os.path.join(eval_dir, f"evaluation_{timestamp.replace(':', '-')}.json")
    with open(eval_file, 'w', encoding='utf-8') as f:
        json.dump(evaluation, f, ensure_ascii=False, indent=2)

    # Also append to a master CSV file for easier analysis. The header is
    # written when the file is new *or* exists but is still empty.
    csv_file = os.path.join(eval_dir, "evaluations_master.csv")
    needs_header = not os.path.isfile(csv_file) or os.path.getsize(csv_file) == 0
    with open(csv_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(['timestamp', 'conversation_id', 'expert_name', 'overall_rating',
                             'medical_accuracy', 'clinical_relevance', 'communication_clarity',
                             'safety_considerations', 'feedback'])
        writer.writerow([
            timestamp, conversation_id, expert_name, rating,
            categories.get('medical_accuracy', ''),
            categories.get('clinical_relevance', ''),
            categories.get('communication_clarity', ''),
            categories.get('safety_considerations', ''),
            feedback,
        ])
    return f"Evaluation saved successfully at {timestamp}"
def submit_evaluation(conversation_id, user_input, bot_response, overall_rating,
                      medical_accuracy, clinical_relevance, communication_clarity,
                      safety_considerations, feedback, expert_name):
    """Validate an evaluation form, save it, and return the new form values.

    Every path returns eight values in the order (status message, feedback,
    overall rating, medical accuracy, clinical relevance, communication
    clarity, safety considerations, expert name). A caller that binds the
    result to eight output components therefore always receives a
    matching number of values, including when validation fails.

    Args:
        conversation_id: Identifier of the conversation being rated.
        user_input: Text the user sent to the chatbot.
        bot_response: Text the chatbot answered with.
        overall_rating: Overall rating; a falsy value (0 / None) means "not rated".
        medical_accuracy: Category rating.
        clinical_relevance: Category rating.
        communication_clarity: Category rating.
        safety_considerations: Category rating.
        feedback: Free-text feedback.
        expert_name: Name of the evaluating expert; must not be blank.

    Returns:
        tuple: On success, the save confirmation followed by cleared form
        values. On a validation failure, the error message followed by the
        values that were submitted, so nothing the expert typed is lost.
    """
    def _rejected(message):
        # Keep the form contents untouched so the expert can correct and resubmit.
        return (message, feedback, overall_rating, medical_accuracy,
                clinical_relevance, communication_clarity,
                safety_considerations, expert_name)

    # ``expert_name`` may be None when the textbox was never touched.
    if not (expert_name or "").strip():
        return _rejected("Please enter your name before submitting evaluation.")
    if not overall_rating:
        return _rejected("Please provide an overall rating before submitting.")

    categories = {
        'medical_accuracy': medical_accuracy,
        'clinical_relevance': clinical_relevance,
        'communication_clarity': communication_clarity,
        'safety_considerations': safety_considerations,
    }
    result = save_evaluation(conversation_id, user_input, bot_response,
                             overall_rating, feedback, expert_name, categories)
    # Reset form after successful submission
    return result, "", 0, 0, 0, 0, 0, ""
def get_conversation_for_evaluation(history):
    """Return the (user_input, bot_response) pair of the most recent turn.

    Args:
        history: Sequence of conversation turns, each indexable as
            ``turn[0]`` (user text) and ``turn[1]`` (bot text).

    Returns:
        tuple: ``(user_input, bot_response)``; a missing element, or an
        empty/absent history, yields an empty string in its place.
    """
    # Guard clause: nothing recorded yet.
    if not history:
        return "", ""

    latest_turn = history[-1]
    turn_size = len(latest_turn)

    user_input = ""
    if turn_size > 0:
        user_input = latest_turn[0]

    bot_response = ""
    if turn_size > 1:
        bot_response = latest_turn[1]

    return user_input, bot_response
def export_evaluations():
    """Return the master evaluation CSV path together with a status message.

    Returns:
        tuple: ``(None, "No evaluation data found.")`` when the master CSV
        does not exist; otherwise ``(csv_path, message)`` where the message
        states how many evaluation rows the file holds.
    """
    eval_dir = "evaluations"
    csv_file = os.path.join(eval_dir, "evaluations_master.csv")
    if not os.path.exists(csv_file):
        return None, "No evaluation data found."

    # Count records with the csv parser rather than by physical lines:
    # feedback text may contain newlines inside a quoted field.
    with open(csv_file, 'r', newline='', encoding='utf-8') as f:
        row_count = sum(1 for row in csv.reader(f) if row)
    total_evaluations = max(row_count - 1, 0)  # Subtract header

    return csv_file, f"Evaluation data exported. Total evaluations in file: {total_evaluations}."
def get_evaluation_stats():
    """Summarize the master evaluation CSV as a short human-readable report.

    Returns:
        str: A Markdown statistics block when pandas can be imported, a
        one-line row count when it cannot, ``"No evaluation data available."``
        when the CSV is missing, empty, or holds only a header, or an
        ``"Error reading evaluation data: ..."`` message when the file cannot
        be read or parsed.
    """
    eval_dir = "evaluations"
    csv_file = os.path.join(eval_dir, "evaluations_master.csv")
    no_data_message = "No evaluation data available."
    # A zero-byte file is treated like a missing one instead of surfacing a
    # parser error to the user.
    if not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0:
        return no_data_message
    try:
        try:
            import pandas as pd
        except ImportError:
            # Fallback if pandas is not available. Rows are counted with the
            # csv parser because a quoted feedback field may span several lines.
            with open(csv_file, 'r', newline='', encoding='utf-8') as f:
                row_count = sum(1 for row in csv.reader(f) if row)
            total_evaluations = max(row_count - 1, 0)  # Subtract header
            if total_evaluations == 0:
                return no_data_message
            return f"Total evaluations: {total_evaluations} (Install pandas for detailed stats)"

        df = pd.read_csv(csv_file)
        # Header-only file: averages would be NaN, so report "no data" instead.
        if df.empty:
            return no_data_message

        total_evaluations = len(df)
        avg_overall_rating = df['overall_rating'].mean() if 'overall_rating' in df.columns else 0
        avg_medical_accuracy = df['medical_accuracy'].mean() if 'medical_accuracy' in df.columns else 0
        expert_count = df['expert_name'].nunique() if 'expert_name' in df.columns else 0
        latest_timestamp = df['timestamp'].iloc[-1]
        stats = f"""
📊 **Evaluation Statistics**
- **Total Evaluations**: {total_evaluations}
- **Average Overall Rating**: {avg_overall_rating:.2f}/5
- **Average Medical Accuracy**: {avg_medical_accuracy:.2f}/5
- **Number of Experts**: {expert_count}
- **Latest Evaluation**: {latest_timestamp}
        """
        return stats
    except Exception as e:
        # UI boundary: report the problem as text rather than raising into
        # the click handler.
        return f"Error reading evaluation data: {str(e)}"
 def update_methods(chapter):
     return gr.update(choices=interview_protocols[chapter], value=interview_protocols[chapter][0])
                             data.append(json.loads(line))
                 return data
+            jsonl_path = "/Users/liuzijie/Desktop/chatbot-mimic-notes/assets/llama-3.2-3b_io.jsonl"
+            llama_results = load_jsonl(jsonl_path)
             options = []
+            for r in llama_results:
+                options.append({'title': 'Patient:' + str(r['patient_id']), 'text': r['input'], 'note': r['output']})
             submit_button.click(
                 submit_text_and_respond,
                 inputs=[transcription_box, api_key_state, username_input, selected_title, state, chatbot_type_state],
+                outputs=[chatbot, transcription_box, conversation_id_state, eval_user_input_state, eval_bot_response_state]
             )
             # download_button.click(
                 outputs=[predefined_option_text, markdown_display, selected_title]
             )
+            # Evaluation Module
+            with gr.Box():
+                gr.Markdown("## 🔬 Expert Evaluation Module")
+                # Hidden fields to store conversation data for evaluation
+                conversation_id_state = gr.State("")
+                eval_user_input_state = gr.State("")
+                eval_bot_response_state = gr.State("")
+                with gr.Row():
+                    expert_name_input = gr.Textbox(
+                        label="Expert Name",
+                        placeholder="Enter your name",
+                        scale=2
+                    )
+                    evaluation_status = gr.Textbox(
+                        label="Status",
+                        interactive=False,
+                        scale=1
+                    )
+                # Overall Rating
+                overall_rating = gr.Slider(
+                    minimum=1,
+                    maximum=5,
+                    step=1,
+                    label="Overall Rating (1=Poor, 5=Excellent)",
+                    value=0
+                )
+                # Category-specific ratings
+                with gr.Row():
+                    medical_accuracy = gr.Slider(
+                        minimum=1,
+                        maximum=5,
+                        step=1,
+                        label="Medical Accuracy",
+                        value=0
+                    )
+                    clinical_relevance = gr.Slider(
+                        minimum=1,
+                        maximum=5,
+                        step=1,
+                        label="Clinical Relevance",
+                        value=0
+                    )
+                with gr.Row():
+                    communication_clarity = gr.Slider(
+                        minimum=1,
+                        maximum=5,
+                        step=1,
+                        label="Communication Clarity",
+                        value=0
+                    )
+                    safety_considerations = gr.Slider(
+                        minimum=1,
+                        maximum=5,
+                        step=1,
+                        label="Safety Considerations",
+                        value=0
+                    )
+                # Detailed feedback
+                feedback_text = gr.Textbox(
+                    label="Detailed Feedback",
+                    placeholder="Please provide specific feedback about the response...",
+                    lines=4
+                )
+                # Current conversation display (readonly)
+                with gr.Accordion("Current Conversation", open=False):
+                    current_user_input = gr.Textbox(
+                        label="User Input",
+                        interactive=False,
+                        lines=2
+                    )
+                    current_bot_response = gr.Textbox(
+                        label="Bot Response",
+                        interactive=False,
+                        lines=3
+                    )
+                # Submit evaluation button
+                submit_eval_button = gr.Button(
+                    "Submit Evaluation",
+                    variant="primary",
+                    size="large"
+                )
+            # Connect evaluation functionality
+            def update_eval_display(conversation_id, user_input, bot_response):
+                """Update the evaluation display with current conversation"""
+                return user_input, bot_response
+            # Update evaluation display when new conversation happens
+            submit_button.click(
+                fn=update_eval_display,
+                inputs=[conversation_id_state, eval_user_input_state, eval_bot_response_state],
+                outputs=[current_user_input, current_bot_response]
+            )
+            # Handle evaluation submission
+            submit_eval_button.click(
+                fn=submit_evaluation,
+                inputs=[
+                    conversation_id_state,
+                    eval_user_input_state,
+                    eval_bot_response_state,
+                    overall_rating,
+                    medical_accuracy,
+                    clinical_relevance,
+                    communication_clarity,
+                    safety_considerations,
+                    feedback_text,
+                    expert_name_input
+                ],
+                outputs=[
+                    evaluation_status,
+                    feedback_text,
+                    overall_rating,
+                    medical_accuracy,
+                    clinical_relevance,
+                    communication_clarity,
+                    safety_considerations,
+                    expert_name_input
+                ]
+            )
+            # Admin Panel for Evaluation Management
+            with gr.Box():
+                gr.Markdown("## 📈 Evaluation Analytics")
+                with gr.Row():
+                    refresh_stats_button = gr.Button("Refresh Statistics", variant="secondary")
+                    export_data_button = gr.Button("Export Data", variant="secondary")
+                evaluation_stats_display = gr.Markdown(
+                    value="Click 'Refresh Statistics' to view evaluation data.",
+                    label="Statistics"
+                )
+                export_file_output = gr.File(
+                    label="Download Evaluation Data",
+                    visible=False
+                )
+                # Connect admin functions
+                refresh_stats_button.click(
+                    fn=get_evaluation_stats,
+                    outputs=[evaluation_stats_display]
+                )
+                export_data_button.click(
+                    fn=export_evaluations,
+                    outputs=[export_file_output, evaluation_stats_display]
+                )
     app.queue()

evaluation_module_readme.md ADDED Viewed

	@@ -0,0 +1,155 @@

+# Chatbot Evaluation Module 🔬
+## 概述 (Overview)
+为您的医疗聊天机器人界面添加了一个专业的人类专家评估模块，允许医学专家对AI回复进行评分和反馈。
+## 功能特性 (Features)
+### 1. 多维度评估系统
+- **整体评分**: 1-5分制评估
+- **医学准确性** (Medical Accuracy): 评估医学信息的准确性
+- **临床相关性** (Clinical Relevance): 评估回复的临床价值
+- **沟通清晰度** (Communication Clarity): 评估表达的清晰程度
+- **安全考虑** (Safety Considerations): 评估安全性和风险
+### 2. 详细反馈系统
+- 文本框提供详细反馈意见
+- 专家姓名记录
+- 时间戳自动记录
+- 对话ID追踪
+### 3. 数据管理
+- 自动保存为JSON和CSV格式
+- 实时统计分析
+- 数据导出功能
+- 评估历史查看
+## 使用方法 (Usage)
+### 对于医学专家 (For Medical Experts)
+1. **进行对话**
+   - 与chatbot进行正常对话
+   - 每次回复后，评估模块会自动显示当前对话
+2. **填写评估**
+   - 输入您的姓名
+   - 使用滑块评分 (1-5分)
+   - 提供详细文字反馈
+3. **提交评估**
+   - 点击"Submit Evaluation"按钮
+   - 系统会确认保存成功
+### 对于管理员 (For Administrators)
+1. **查看统计**
+   - 点击"Refresh Statistics"查看评估数据
+   - 包括平均评分、专家数量等
+2. **导出数据**
+   - 点击"Export Data"下载CSV文件
+   - 用于进一步分析
+## 数据存储结构 (Data Structure)
+### JSON格式 (Individual Evaluations)
+```json
+{
+  "timestamp": "2024-01-01T12:00:00",
+  "conversation_id": "user123_20240101_120000",
+  "expert_name": "Dr. Smith",
+  "user_input": "用户输入...",
+  "bot_response": "机器人回复...",
+  "overall_rating": 4,
+  "feedback": "详细反馈...",
+  "categories": {
+    "medical_accuracy": 5,
+    "clinical_relevance": 4,
+    "communication_clarity": 4,
+    "safety_considerations": 5
+  }
+}
+```
+### CSV格式 (Master File)
+- timestamp: 时间戳
+- conversation_id: 对话ID
+- expert_name: 专家姓名
+- overall_rating: 整体评分
+- medical_accuracy: 医学准确性
+- clinical_relevance: 临床相关性
+- communication_clarity: 沟通清晰度
+- safety_considerations: 安全考虑
+- feedback: 详细反馈
+## 文件结构 (File Structure)
+```
+/evaluations/
+├── evaluation_YYYY-MM-DDTHH-mm-ss.json  # 单个评估记录
+├── evaluation_YYYY-MM-DDTHH-mm-ss.json
+├── ...
+└── evaluations_master.csv               # 汇总CSV文件
+```
+## 技术要求 (Requirements)
+### 必需依赖
+- gradio
+- json (内置)
+- csv (内置)
+- datetime (内置)
+- os (内置)
+### 可选依赖
+- pandas (用于高级统计分析)
+## 安装pandas (可选)
+```bash
+pip install pandas
+```
+## 评估标准建议 (Evaluation Guidelines)
+### 评分标准 (Rating Scale)
+- **5分 (Excellent)**: 完全准确，高度相关，表达清晰
+- **4分 (Good)**: 基本准确，相关性好，表达清楚
+- **3分 (Fair)**: 可接受，有些问题但不严重
+- **2分 (Poor)**: 明显问题，需要改进
+- **1分 (Very Poor)**: 严重错误，不可接受
+### 医学准确性评估要点
+- 医学事实是否正确
+- 诊断建议是否合理
+- 治疗方案是否适当
+- 药物信息是否准确
+### 安全考虑评估要点
+- 是否避免危险建议
+- 是否提醒就医
+- 是否注明AI限制
+- 风险评估是否合理
+## 故障排除 (Troubleshooting)
+### 常见问题
+1. **评估未保存**: 检查是否填写了专家姓名和评分
+2. **统计不显示**: 确保有评估数据存在
+3. **导出失败**: 检查文件权限和磁盘空间
+### 数据恢复
+所有评估数据都保存在应用工作目录下的`evaluations/`目录中（相对路径，而非文件系统根目录），可以手动备份或恢复。
+## 更新日志 (Changelog)
+### v1.0 (Current)
+- 基础评估功能
+- 多维度评分系统
+- 数据导出功能
+- 统计分析面板
+---
+**注意**: 此评估模块专为医学专家设计，用于提升AI医疗助手的质量和安全性。所有评估数据应严格按照医疗数据隐私法规处理。