Jesse Liu committed on
Commit
b3e7b51
·
1 Parent(s): 85eb95c
assets/llama-3.2-3b_io.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
chatgpt.py CHANGED
@@ -7,6 +7,8 @@ import os
7
  import requests
8
  import time
9
  import threading
 
 
10
  from datetime import datetime, timedelta
11
 
12
  session = requests.Session()
@@ -177,10 +179,14 @@ def submit_text_and_respond(edited_text, api_key, username, selected_title, hist
177
  print(response)
178
  if isinstance(response, str):
179
  history.append((edited_text, response))
180
- return history, "", []
 
 
181
  doctor_response = response['doctor_response']
182
  history.append((edited_text, doctor_response))
183
- return history, "" # Return memory_graph as output
 
 
184
 
185
  def set_initialize_button(api_key_input, username_input):
186
  message = asyncio.run(initialization(api_key_input, username_input))
@@ -200,6 +206,136 @@ def start_recording(audio_file):
200
  except Exception as e:
201
  return f"Failed to transcribe: {str(e)}"
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  def update_methods(chapter):
204
  return gr.update(choices=interview_protocols[chapter], value=interview_protocols[chapter][0])
205
 
@@ -268,18 +404,11 @@ with gr.Blocks(css=css) as app:
268
  data.append(json.loads(line))
269
  return data
270
 
271
- jsonl_path = "/Users/jinhaoduan/workspace/chatbot-mimic-notes/assets/structured_results_o3_with_inputs.jsonl"
272
- structured_results = load_jsonl(jsonl_path)
273
- notes_path = '/Users/jinhaoduan/workspace/chatbot-mimic-notes/assets/structured_results_o3_md.jsonl'
274
- notes = load_jsonl(notes_path)
275
  options = []
276
- for r in structured_results:
277
- _note = 'Notes not found.'
278
- for _note in notes:
279
- if _note['hadm_id'] == r['hadm_id']:
280
- _note = _note['content']
281
- break
282
- options.append({'title': 'Patient:' + str(r['subject_id']), 'text': r['content'] + str(r['raw_note']) + str(r['medications']), 'note': _note})
283
 
284
 
285
 
@@ -351,7 +480,7 @@ with gr.Blocks(css=css) as app:
351
  submit_button.click(
352
  submit_text_and_respond,
353
  inputs=[transcription_box, api_key_state, username_input, selected_title, state, chatbot_type_state],
354
- outputs=[chatbot, transcription_box]
355
  )
356
 
357
  # download_button.click(
@@ -396,6 +525,164 @@ with gr.Blocks(css=css) as app:
396
  outputs=[predefined_option_text, markdown_display, selected_title]
397
  )
398
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
 
400
 
401
  app.queue()
 
7
  import requests
8
  import time
9
  import threading
10
+ import json
11
+ import csv
12
  from datetime import datetime, timedelta
13
 
14
  session = requests.Session()
 
179
  print(response)
180
  if isinstance(response, str):
181
  history.append((edited_text, response))
182
+ # Generate conversation ID for evaluation
183
+ conversation_id = f"{username}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
184
+ return history, "", conversation_id, edited_text, response
185
  doctor_response = response['doctor_response']
186
  history.append((edited_text, doctor_response))
187
+ # Generate conversation ID for evaluation
188
+ conversation_id = f"{username}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
189
+ return history, "", conversation_id, edited_text, doctor_response
190
 
191
  def set_initialize_button(api_key_input, username_input):
192
  message = asyncio.run(initialization(api_key_input, username_input))
 
206
  except Exception as e:
207
  return f"Failed to transcribe: {str(e)}"
208
 
209
# Evaluation functions
def save_evaluation(conversation_id, user_input, bot_response, rating, feedback, expert_name, categories):
    """Persist one expert evaluation to disk.

    Each evaluation is written twice: as a standalone JSON record (full
    detail, one file per evaluation) and as an appended row in a master
    CSV for easy aggregate analysis.

    Args:
        conversation_id: Identifier linking the evaluation to a chat turn.
        user_input: The user's message that was evaluated.
        bot_response: The bot reply that was evaluated.
        rating: Overall 1-5 rating.
        feedback: Free-text expert feedback.
        expert_name: Name of the evaluating expert.
        categories: Dict of per-category ratings; expected keys are
            medical_accuracy, clinical_relevance, communication_clarity,
            safety_considerations (missing keys are stored as '').

    Returns:
        A human-readable confirmation string containing the timestamp.
    """
    timestamp = datetime.now().isoformat()

    evaluation = {
        "timestamp": timestamp,
        "conversation_id": conversation_id,
        "expert_name": expert_name,
        "user_input": user_input,
        "bot_response": bot_response,
        "overall_rating": rating,
        "feedback": feedback,
        "categories": categories
    }

    # exist_ok=True avoids the check-then-create race of the previous
    # os.path.exists() / os.makedirs() pair.
    eval_dir = "evaluations"
    os.makedirs(eval_dir, exist_ok=True)

    # Save to JSON file (':' is replaced because it is not a valid
    # filename character on Windows).
    eval_file = os.path.join(eval_dir, f"evaluation_{timestamp.replace(':', '-')}.json")
    with open(eval_file, 'w', encoding='utf-8') as f:
        json.dump(evaluation, f, ensure_ascii=False, indent=2)

    # Also append to a master CSV file for easier analysis.
    csv_file = os.path.join(eval_dir, "evaluations_master.csv")
    file_exists = os.path.isfile(csv_file)

    with open(csv_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if not file_exists:
            # Write the header only when creating the file.
            writer.writerow(['timestamp', 'conversation_id', 'expert_name', 'overall_rating',
                             'medical_accuracy', 'clinical_relevance', 'communication_clarity',
                             'safety_considerations', 'feedback'])

        writer.writerow([
            timestamp, conversation_id, expert_name, rating,
            categories.get('medical_accuracy', ''),
            categories.get('clinical_relevance', ''),
            categories.get('communication_clarity', ''),
            categories.get('safety_considerations', ''),
            feedback
        ])

    return f"Evaluation saved successfully at {timestamp}"
256
+
257
def submit_evaluation(conversation_id, user_input, bot_response, overall_rating,
                      medical_accuracy, clinical_relevance, communication_clarity,
                      safety_considerations, feedback, expert_name):
    """Validate and save an expert evaluation, then reset the form.

    Always returns an 8-tuple matching the Gradio outputs wired to this
    callback: (status, feedback, overall_rating, medical_accuracy,
    clinical_relevance, communication_clarity, safety_considerations,
    expert_name).

    Bug fix: the validation-failure paths previously returned a bare
    string while the success path returned 8 values — mismatching the 8
    output components and breaking the callback. Failures now return the
    full tuple and keep the user's current entries so nothing is lost.
    """
    if not expert_name.strip():
        return ("Please enter your name before submitting evaluation.",
                feedback, overall_rating, medical_accuracy, clinical_relevance,
                communication_clarity, safety_considerations, expert_name)

    if overall_rating == 0:
        return ("Please provide an overall rating before submitting.",
                feedback, overall_rating, medical_accuracy, clinical_relevance,
                communication_clarity, safety_considerations, expert_name)

    categories = {
        'medical_accuracy': medical_accuracy,
        'clinical_relevance': clinical_relevance,
        'communication_clarity': communication_clarity,
        'safety_considerations': safety_considerations
    }

    result = save_evaluation(conversation_id, user_input, bot_response,
                             overall_rating, feedback, expert_name, categories)

    # Reset form after successful submission.
    return result, "", 0, 0, 0, 0, 0, ""
279
+
280
def get_conversation_for_evaluation(history):
    """Return the (user_input, bot_response) pair of the latest turn.

    An empty/None history, or a last entry with fewer than two elements,
    yields empty strings for the missing parts.
    """
    if not history:
        return "", ""
    # Pad the most recent entry so both slots always exist.
    padded = list(history[-1][:2]) + ["", ""]
    return padded[0], padded[1]
290
+
291
def export_evaluations():
    """Export the master CSV of evaluations for download.

    Returns:
        (csv_path, message) when the master CSV exists, otherwise
        (None, message).

    Bug fix: the success message was an f-string with no placeholders
    that claimed to report a total; the row count is now computed and
    included.
    """
    eval_dir = "evaluations"
    csv_file = os.path.join(eval_dir, "evaluations_master.csv")

    if not os.path.exists(csv_file):
        return None, "No evaluation data found."

    # Count data rows (header excluded) so the status message is accurate.
    with open(csv_file, 'r', encoding='utf-8') as f:
        total = max(sum(1 for _ in csv.reader(f)) - 1, 0)

    return csv_file, f"Evaluation data exported. {total} evaluations in file."
300
+
301
def get_evaluation_stats():
    """Summarize saved evaluations as a markdown string for the admin panel.

    Reads evaluations/evaluations_master.csv. Uses pandas for averages and
    per-expert counts when available; otherwise falls back to a plain
    line count. Returns a message string in every case (never raises).
    """
    eval_dir = "evaluations"
    csv_file = os.path.join(eval_dir, "evaluations_master.csv")

    # No master CSV means no evaluation has been submitted yet.
    if not os.path.exists(csv_file):
        return "No evaluation data available."

    try:
        # Imported lazily: pandas is an optional dependency here.
        import pandas as pd
        df = pd.read_csv(csv_file)

        total_evaluations = len(df)
        # Guard each column access — older CSVs may lack some columns.
        avg_overall_rating = df['overall_rating'].mean() if 'overall_rating' in df.columns else 0
        avg_medical_accuracy = df['medical_accuracy'].mean() if 'medical_accuracy' in df.columns else 0

        expert_count = df['expert_name'].nunique() if 'expert_name' in df.columns else 0

        stats = f"""
📊 **Evaluation Statistics**

- **Total Evaluations**: {total_evaluations}
- **Average Overall Rating**: {avg_overall_rating:.2f}/5
- **Average Medical Accuracy**: {avg_medical_accuracy:.2f}/5
- **Number of Experts**: {expert_count}
- **Latest Evaluation**: {df['timestamp'].iloc[-1] if not df.empty else 'N/A'}
"""
        return stats

    except ImportError:
        # Fallback if pandas is not available: count lines directly.
        with open(csv_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        total_evaluations = len(lines) - 1  # Subtract header
        return f"Total evaluations: {total_evaluations} (Install pandas for detailed stats)"
    except Exception as e:
        # Broad catch is deliberate: this feeds a UI status box, so any
        # read/parse failure is reported as text rather than raised.
        return f"Error reading evaluation data: {str(e)}"
338
+
339
def update_methods(chapter):
    """Refresh the method dropdown to match the newly selected chapter."""
    # Look the protocol list up once instead of twice.
    methods = interview_protocols[chapter]
    return gr.update(choices=methods, value=methods[0])
341
 
 
404
  data.append(json.loads(line))
405
  return data
406
 
407
+ jsonl_path = "/Users/liuzijie/Desktop/chatbot-mimic-notes/assets/llama-3.2-3b_io.jsonl"
408
+ llama_results = load_jsonl(jsonl_path)
 
 
409
  options = []
410
+ for r in llama_results:
411
+ options.append({'title': 'Patient:' + str(r['patient_id']), 'text': r['input'], 'note': r['output']})
 
 
 
 
 
412
 
413
 
414
 
 
480
  submit_button.click(
481
  submit_text_and_respond,
482
  inputs=[transcription_box, api_key_state, username_input, selected_title, state, chatbot_type_state],
483
+ outputs=[chatbot, transcription_box, conversation_id_state, eval_user_input_state, eval_bot_response_state]
484
  )
485
 
486
  # download_button.click(
 
525
  outputs=[predefined_option_text, markdown_display, selected_title]
526
  )
527
 
528
+ # Evaluation Module
529
+ with gr.Box():
530
+ gr.Markdown("## 🔬 Expert Evaluation Module")
531
+
532
+ # Hidden fields to store conversation data for evaluation
533
+ conversation_id_state = gr.State("")
534
+ eval_user_input_state = gr.State("")
535
+ eval_bot_response_state = gr.State("")
536
+
537
+ with gr.Row():
538
+ expert_name_input = gr.Textbox(
539
+ label="Expert Name",
540
+ placeholder="Enter your name",
541
+ scale=2
542
+ )
543
+ evaluation_status = gr.Textbox(
544
+ label="Status",
545
+ interactive=False,
546
+ scale=1
547
+ )
548
+
549
+ # Overall Rating
550
+ overall_rating = gr.Slider(
551
+ minimum=1,
552
+ maximum=5,
553
+ step=1,
554
+ label="Overall Rating (1=Poor, 5=Excellent)",
555
+ value=0
556
+ )
557
+
558
+ # Category-specific ratings
559
+ with gr.Row():
560
+ medical_accuracy = gr.Slider(
561
+ minimum=1,
562
+ maximum=5,
563
+ step=1,
564
+ label="Medical Accuracy",
565
+ value=0
566
+ )
567
+ clinical_relevance = gr.Slider(
568
+ minimum=1,
569
+ maximum=5,
570
+ step=1,
571
+ label="Clinical Relevance",
572
+ value=0
573
+ )
574
+
575
+ with gr.Row():
576
+ communication_clarity = gr.Slider(
577
+ minimum=1,
578
+ maximum=5,
579
+ step=1,
580
+ label="Communication Clarity",
581
+ value=0
582
+ )
583
+ safety_considerations = gr.Slider(
584
+ minimum=1,
585
+ maximum=5,
586
+ step=1,
587
+ label="Safety Considerations",
588
+ value=0
589
+ )
590
+
591
+ # Detailed feedback
592
+ feedback_text = gr.Textbox(
593
+ label="Detailed Feedback",
594
+ placeholder="Please provide specific feedback about the response...",
595
+ lines=4
596
+ )
597
+
598
+ # Current conversation display (readonly)
599
+ with gr.Accordion("Current Conversation", open=False):
600
+ current_user_input = gr.Textbox(
601
+ label="User Input",
602
+ interactive=False,
603
+ lines=2
604
+ )
605
+ current_bot_response = gr.Textbox(
606
+ label="Bot Response",
607
+ interactive=False,
608
+ lines=3
609
+ )
610
+
611
+ # Submit evaluation button
612
+ submit_eval_button = gr.Button(
613
+ "Submit Evaluation",
614
+ variant="primary",
615
+ size="large"
616
+ )
617
+
618
+ # Connect evaluation functionality
619
def update_eval_display(conversation_id, user_input, bot_response):
    """Mirror the latest exchange into the read-only evaluation preview.

    The conversation id is accepted because the Gradio wiring passes it,
    but only the two message texts are displayed.
    """
    preview = (user_input, bot_response)
    return preview
622
+
623
+ # Update evaluation display when new conversation happens
624
+ submit_button.click(
625
+ fn=update_eval_display,
626
+ inputs=[conversation_id_state, eval_user_input_state, eval_bot_response_state],
627
+ outputs=[current_user_input, current_bot_response]
628
+ )
629
+
630
+ # Handle evaluation submission
631
+ submit_eval_button.click(
632
+ fn=submit_evaluation,
633
+ inputs=[
634
+ conversation_id_state,
635
+ eval_user_input_state,
636
+ eval_bot_response_state,
637
+ overall_rating,
638
+ medical_accuracy,
639
+ clinical_relevance,
640
+ communication_clarity,
641
+ safety_considerations,
642
+ feedback_text,
643
+ expert_name_input
644
+ ],
645
+ outputs=[
646
+ evaluation_status,
647
+ feedback_text,
648
+ overall_rating,
649
+ medical_accuracy,
650
+ clinical_relevance,
651
+ communication_clarity,
652
+ safety_considerations,
653
+ expert_name_input
654
+ ]
655
+ )
656
+
657
+ # Admin Panel for Evaluation Management
658
+ with gr.Box():
659
+ gr.Markdown("## 📈 Evaluation Analytics")
660
+
661
+ with gr.Row():
662
+ refresh_stats_button = gr.Button("Refresh Statistics", variant="secondary")
663
+ export_data_button = gr.Button("Export Data", variant="secondary")
664
+
665
+ evaluation_stats_display = gr.Markdown(
666
+ value="Click 'Refresh Statistics' to view evaluation data.",
667
+ label="Statistics"
668
+ )
669
+
670
+ export_file_output = gr.File(
671
+ label="Download Evaluation Data",
672
+ visible=False
673
+ )
674
+
675
+ # Connect admin functions
676
+ refresh_stats_button.click(
677
+ fn=get_evaluation_stats,
678
+ outputs=[evaluation_stats_display]
679
+ )
680
+
681
+ export_data_button.click(
682
+ fn=export_evaluations,
683
+ outputs=[export_file_output, evaluation_stats_display]
684
+ )
685
+
686
 
687
 
688
  app.queue()
evaluation_module_readme.md ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Chatbot Evaluation Module 🔬
2
+
3
+ ## 概述 (Overview)
4
+
5
+ 为您的医疗聊天机器人界面添加了一个专业的人类专家评估模块,允许医学专家对AI回复进行评分和反馈。
6
+
7
+ ## 功能特性 (Features)
8
+
9
+ ### 1. 多维度评估系统
10
+ - **整体评分**: 1-5分制度评估
11
+ - **医学准确性** (Medical Accuracy): 评估医学信息的准确性
12
+ - **临床相关性** (Clinical Relevance): 评估回复的临床价值
13
+ - **沟通清晰度** (Communication Clarity): 评估表达的清晰程度
14
+ - **安全考虑** (Safety Considerations): 评估安全性和风险
15
+
16
+ ### 2. 详细反馈系统
17
+ - 文本框提供详细反馈意见
18
+ - 专家姓名记录
19
+ - 时间戳自动记录
20
+ - 对话ID追踪
21
+
22
+ ### 3. 数据管理
23
+ - 自动保存为JSON和CSV格式
24
+ - 实时统计分析
25
+ - 数据导出功能
26
+ - 评估历史查看
27
+
28
+ ## 使用方法 (Usage)
29
+
30
+ ### 对于医学专家 (For Medical Experts)
31
+
32
+ 1. **进行对话**
33
+ - 与chatbot进行正常对话
34
+ - 每次回复后,评估模块会自动显示当前对话
35
+
36
+ 2. **填写评估**
37
+ - 输入您的姓名
38
+ - 使用滑块评分 (1-5分)
39
+ - 提供详细文字反馈
40
+
41
+ 3. **提交评估**
42
+ - 点击"Submit Evaluation"按钮
43
+ - 系统会确认保存成功
44
+
45
+ ### 对于管理员 (For Administrators)
46
+
47
+ 1. **查看统计**
48
+ - 点击"Refresh Statistics"查看评估数据
49
+ - 包括平均评分、专家数量等
50
+
51
+ 2. **导出数据**
52
+ - 点击"Export Data"下载CSV文件
53
+ - 用于进一步分析
54
+
55
+ ## 数据存储结构 (Data Structure)
56
+
57
+ ### JSON格式 (Individual Evaluations)
58
+ ```json
59
+ {
60
+ "timestamp": "2024-01-01T12:00:00",
61
+ "conversation_id": "user123_20240101_120000",
62
+ "expert_name": "Dr. Smith",
63
+ "user_input": "用户输入...",
64
+ "bot_response": "机器人回复...",
65
+ "overall_rating": 4,
66
+ "feedback": "详细反馈...",
67
+ "categories": {
68
+ "medical_accuracy": 5,
69
+ "clinical_relevance": 4,
70
+ "communication_clarity": 4,
71
+ "safety_considerations": 5
72
+ }
73
+ }
74
+ ```
75
+
76
+ ### CSV格式 (Master File)
77
+ - timestamp: 时间戳
78
+ - conversation_id: 对话ID
79
+ - expert_name: 专家姓名
80
+ - overall_rating: 整体评分
81
+ - medical_accuracy: 医学准确性
82
+ - clinical_relevance: 临床相关性
83
+ - communication_clarity: 沟通清晰度
84
+ - safety_considerations: 安全考虑
85
+ - feedback: 详细反馈
86
+
87
+ ## 文件结构 (File Structure)
88
+
89
+ ```
90
+ /evaluations/
91
+ ├── evaluation_YYYY-MM-DDTHH-mm-ss.json # 单个评估记录
92
+ ├── evaluation_YYYY-MM-DDTHH-mm-ss.json
93
+ ├── ...
94
+ └── evaluations_master.csv # 汇总CSV文件
95
+ ```
96
+
97
+ ## 技术要求 (Requirements)
98
+
99
+ ### 必需依赖
100
+ - gradio
101
+ - json (内置)
102
+ - csv (内置)
103
+ - datetime (内置)
104
+ - os (内置)
105
+
106
+ ### 可选依赖
107
+ - pandas (用于高级统计分析)
108
+
109
+ ## 安装pandas (可选)
110
+ ```bash
111
+ pip install pandas
112
+ ```
113
+
114
+ ## 评估标准建议 (Evaluation Guidelines)
115
+
116
+ ### 评分标准 (Rating Scale)
117
+ - **5分 (Excellent)**: 完全准确,高度相关,表达清晰
118
+ - **4分 (Good)**: 基本准确,相关性好,表达清楚
119
+ - **3分 (Fair)**: 可接受,有些问题但不严重
120
+ - **2分 (Poor)**: 明显问题,需要改进
121
+ - **1分 (Very Poor)**: 严重错误,不可接受
122
+
123
+ ### 医学准确性评估要点
124
+ - 医学事实是否正确
125
+ - 诊断建议是否合理
126
+ - 治疗方案是否适当
127
+ - 药物信息是否准确
128
+
129
+ ### 安全考虑评估要点
130
+ - 是否避免危险建议
131
+ - 是否提醒就医
132
+ - 是否注明AI限制
133
+ - 风险评估是否合理
134
+
135
+ ## 故障排除 (Troubleshooting)
136
+
137
+ ### 常见问题
138
+ 1. **评估未保存**: 检查是否填写了专家姓名和评分
139
+ 2. **统计不显示**: 确保有评估数据存在
140
+ 3. **导出失败**: 检查文件权限和磁盘空间
141
+
142
+ ### 数据恢复
143
+ 所有评估数据都保存在`/evaluations/`目录下,可以手动备份或恢复。
144
+
145
+ ## 更新日志 (Changelog)
146
+
147
+ ### v1.0 (Current)
148
+ - 基础评估功能
149
+ - 多维度评分系统
150
+ - 数据导出功能
151
+ - 统计分析面板
152
+
153
+ ---
154
+
155
+ **注意**: 此评估模块专为医学专家设计,用于提升AI医疗助手的质量和安全性。所有评估数据应严格按照医疗数据隐私法规处理。