tblaisaacliao committed on
Commit
710cde6
·
1 Parent(s): 4deab9b

support evaluating coach direct prompt

Browse files
src/app/admin/evaluations/[id]/page.tsx CHANGED
@@ -191,8 +191,10 @@ export default function EvaluationDetailPage({
191
  <div className="bg-white rounded-lg shadow p-4 mb-6">
192
  <div className="grid grid-cols-2 md:grid-cols-4 gap-4 text-sm">
193
  <div>
194
- <span className="text-gray-500">Student Prompt:</span>
195
- <p className="font-medium text-gray-900">{evaluation.studentPromptId}</p>
 
 
196
  </div>
197
  <div>
198
  <span className="text-gray-500">Model Used:</span>
@@ -214,9 +216,26 @@ export default function EvaluationDetailPage({
214
  {/* Score Sections */}
215
  <div className="grid grid-cols-1 lg:grid-cols-3 gap-6 mb-6">
216
  <ScoreSection
217
- title="Prompt Design"
218
  overall={evaluation.scores.promptDesign.overall}
219
- scores={[
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  {
221
  label: 'Clarity',
222
  value: evaluation.scores.promptDesign.clarity,
@@ -238,9 +257,26 @@ export default function EvaluationDetailPage({
238
  />
239
 
240
  <ScoreSection
241
- title="Training Effectiveness"
242
  overall={evaluation.scores.trainingEffectiveness.overall}
243
- scores={[
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  {
245
  label: 'Challenge Level',
246
  value: evaluation.scores.trainingEffectiveness.challengeLevel,
@@ -262,9 +298,22 @@ export default function EvaluationDetailPage({
262
  />
263
 
264
  <ScoreSection
265
- title="Conversation Quality"
266
  overall={evaluation.scores.conversationQuality.overall}
267
- scores={[
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  {
269
  label: 'Teacher Insights',
270
  value: evaluation.scores.conversationQuality.teacherInsights,
@@ -301,7 +350,7 @@ export default function EvaluationDetailPage({
301
  {/* Strengths */}
302
  <div className="bg-white rounded-lg shadow p-6">
303
  <h3 className="text-lg font-semibold text-gray-900 mb-4 flex items-center gap-2">
304
- <span className="text-green-500">✓</span> Prompt Strengths
305
  </h3>
306
  {evaluation.feedback.strengths.length > 0 ? (
307
  <ul className="space-y-2">
@@ -313,14 +362,14 @@ export default function EvaluationDetailPage({
313
  ))}
314
  </ul>
315
  ) : (
316
- <p className="text-gray-500 text-sm">No strengths identified</p>
317
  )}
318
  </div>
319
 
320
  {/* Improvement Areas */}
321
  <div className="bg-white rounded-lg shadow p-6">
322
  <h3 className="text-lg font-semibold text-gray-900 mb-4 flex items-center gap-2">
323
- <span className="text-yellow-500">!</span> Prompt Improvements
324
  </h3>
325
  {evaluation.feedback.improvementAreas.length > 0 ? (
326
  <ul className="space-y-2">
@@ -332,7 +381,7 @@ export default function EvaluationDetailPage({
332
  ))}
333
  </ul>
334
  ) : (
335
- <p className="text-gray-500 text-sm">No improvement areas identified</p>
336
  )}
337
  </div>
338
  </div>
@@ -340,7 +389,7 @@ export default function EvaluationDetailPage({
340
  {/* Prompt Suggestions */}
341
  <div className="bg-blue-50 border border-blue-200 rounded-lg p-6 mb-6">
342
  <h3 className="text-lg font-semibold text-blue-900 mb-4 flex items-center gap-2">
343
- <span>💡</span> Prompt Improvement Suggestions
344
  </h3>
345
  {evaluation.feedback.promptSuggestions.length > 0 ? (
346
  <ul className="space-y-2">
@@ -352,7 +401,7 @@ export default function EvaluationDetailPage({
352
  ))}
353
  </ul>
354
  ) : (
355
- <p className="text-blue-700 text-sm">No specific suggestions</p>
356
  )}
357
  </div>
358
 
 
191
  <div className="bg-white rounded-lg shadow p-4 mb-6">
192
  <div className="grid grid-cols-2 md:grid-cols-4 gap-4 text-sm">
193
  <div>
194
+ <span className="text-gray-500">對話類型:</span>
195
+ <p className="font-medium text-gray-900">
196
+ {evaluation.evaluationMode === 'coach_direct' ? '教練直接對話' : `學生對話 (${evaluation.studentPromptId})`}
197
+ </p>
198
  </div>
199
  <div>
200
  <span className="text-gray-500">Model Used:</span>
 
216
  {/* Score Sections */}
217
  <div className="grid grid-cols-1 lg:grid-cols-3 gap-6 mb-6">
218
  <ScoreSection
219
+ title={evaluation.evaluationMode === 'coach_direct' ? '教練指導品質' : 'Prompt Design'}
220
  overall={evaluation.scores.promptDesign.overall}
221
+ scores={evaluation.evaluationMode === 'coach_direct' ? [
222
+ {
223
+ label: '指導清晰度',
224
+ value: evaluation.scores.promptDesign.clarity,
225
+ },
226
+ {
227
+ label: '回應完整度',
228
+ value: evaluation.scores.promptDesign.completeness,
229
+ },
230
+ {
231
+ label: '建議具體性',
232
+ value: evaluation.scores.promptDesign.specificity,
233
+ },
234
+ {
235
+ label: '風格一致性',
236
+ value: evaluation.scores.promptDesign.consistency,
237
+ },
238
+ ] : [
239
  {
240
  label: 'Clarity',
241
  value: evaluation.scores.promptDesign.clarity,
 
257
  />
258
 
259
  <ScoreSection
260
+ title={evaluation.evaluationMode === 'coach_direct' ? '教師學習成效' : 'Training Effectiveness'}
261
  overall={evaluation.scores.trainingEffectiveness.overall}
262
+ scores={evaluation.evaluationMode === 'coach_direct' ? [
263
+ {
264
+ label: '洞察深度',
265
+ value: evaluation.scores.trainingEffectiveness.challengeLevel,
266
+ },
267
+ {
268
+ label: '學習機會',
269
+ value: evaluation.scores.trainingEffectiveness.learningOpportunities,
270
+ },
271
+ {
272
+ label: '實用價值',
273
+ value: evaluation.scores.trainingEffectiveness.realisticScenarios,
274
+ },
275
+ {
276
+ label: '互動深度',
277
+ value: evaluation.scores.trainingEffectiveness.engagementDepth,
278
+ },
279
+ ] : [
280
  {
281
  label: 'Challenge Level',
282
  value: evaluation.scores.trainingEffectiveness.challengeLevel,
 
298
  />
299
 
300
  <ScoreSection
301
+ title={evaluation.evaluationMode === 'coach_direct' ? '對話品質' : 'Conversation Quality'}
302
  overall={evaluation.scores.conversationQuality.overall}
303
+ scores={evaluation.evaluationMode === 'coach_direct' ? [
304
+ {
305
+ label: '教師收穫',
306
+ value: evaluation.scores.conversationQuality.teacherInsights,
307
+ },
308
+ {
309
+ label: '對話深度',
310
+ value: evaluation.scores.conversationQuality.interactionDepth,
311
+ },
312
+ {
313
+ label: '教育價值',
314
+ value: evaluation.scores.conversationQuality.educationalValue,
315
+ },
316
+ ] : [
317
  {
318
  label: 'Teacher Insights',
319
  value: evaluation.scores.conversationQuality.teacherInsights,
 
350
  {/* Strengths */}
351
  <div className="bg-white rounded-lg shadow p-6">
352
  <h3 className="text-lg font-semibold text-gray-900 mb-4 flex items-center gap-2">
353
+ <span className="text-green-500">✓</span> {evaluation.evaluationMode === 'coach_direct' ? '教練優點' : 'Prompt Strengths'}
354
  </h3>
355
  {evaluation.feedback.strengths.length > 0 ? (
356
  <ul className="space-y-2">
 
362
  ))}
363
  </ul>
364
  ) : (
365
+ <p className="text-gray-500 text-sm">{evaluation.evaluationMode === 'coach_direct' ? '無明確優點' : 'No strengths identified'}</p>
366
  )}
367
  </div>
368
 
369
  {/* Improvement Areas */}
370
  <div className="bg-white rounded-lg shadow p-6">
371
  <h3 className="text-lg font-semibold text-gray-900 mb-4 flex items-center gap-2">
372
+ <span className="text-yellow-500">!</span> {evaluation.evaluationMode === 'coach_direct' ? '教練改進空間' : 'Prompt Improvements'}
373
  </h3>
374
  {evaluation.feedback.improvementAreas.length > 0 ? (
375
  <ul className="space-y-2">
 
381
  ))}
382
  </ul>
383
  ) : (
384
+ <p className="text-gray-500 text-sm">{evaluation.evaluationMode === 'coach_direct' ? '無明確改進建議' : 'No improvement areas identified'}</p>
385
  )}
386
  </div>
387
  </div>
 
389
  {/* Prompt Suggestions */}
390
  <div className="bg-blue-50 border border-blue-200 rounded-lg p-6 mb-6">
391
  <h3 className="text-lg font-semibold text-blue-900 mb-4 flex items-center gap-2">
392
+ <span>💡</span> {evaluation.evaluationMode === 'coach_direct' ? '教練提示改進建議' : 'Prompt Improvement Suggestions'}
393
  </h3>
394
  {evaluation.feedback.promptSuggestions.length > 0 ? (
395
  <ul className="space-y-2">
 
401
  ))}
402
  </ul>
403
  ) : (
404
+ <p className="text-blue-700 text-sm">{evaluation.evaluationMode === 'coach_direct' ? '無具體建議' : 'No specific suggestions'}</p>
405
  )}
406
  </div>
407
 
src/lib/repositories/evaluation-repository.ts CHANGED
@@ -30,6 +30,7 @@ export class EvaluationRepository {
30
  conversationId: row.conversationId,
31
  studentPromptId: row.studentPromptId || undefined,
32
  evaluationType: row.evaluationType as Evaluation['evaluationType'],
 
33
  modelUsed: row.modelUsed,
34
  evaluatedAt: row.evaluatedAt,
35
  evaluatedBy: row.evaluatedBy || undefined,
 
30
  conversationId: row.conversationId,
31
  studentPromptId: row.studentPromptId || undefined,
32
  evaluationType: row.evaluationType as Evaluation['evaluationType'],
33
+ evaluationMode: row.studentPromptId === 'coach_direct' ? 'coach_direct' : 'student',
34
  modelUsed: row.modelUsed,
35
  evaluatedAt: row.evaluatedAt,
36
  evaluatedBy: row.evaluatedBy || undefined,
src/lib/services/evaluation-service.ts CHANGED
@@ -18,6 +18,93 @@ const openai = createOpenAI({
18
 
19
  const EVALUATION_MODEL = process.env.MODEL_NAME || 'gpt-4o-mini';
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  const EVALUATION_SYSTEM_PROMPT = `你是一位專業的社會情緒學習(SEL)教師培訓應用程式評估專家。
22
  你的目標是評估**教師在對話中的體驗**,以便提示工程師改進學生提示。
23
 
@@ -143,17 +230,25 @@ export class EvaluationService {
143
 
144
  private formatConversationForEvaluation(
145
  conversation: Conversation,
146
- messages: Message[]
 
147
  ): string {
148
  const formattedMessages = messages
149
  .filter((m) => m.role !== 'system')
150
  .map((m) => {
151
- const speaker =
152
- m.role === 'user'
153
- ? '老師'
154
- : m.speaker === 'student'
155
- ? '學生'
156
- : '教練';
 
 
 
 
 
 
 
157
  return `${speaker}: ${m.content}`;
158
  })
159
  .join('\n');
@@ -221,6 +316,9 @@ export class EvaluationService {
221
  }
222
  }
223
 
 
 
 
224
  // Get the system prompt from the conversation
225
  const systemPrompt =
226
  conversation.systemPrompt || '(No system prompt available)';
@@ -228,11 +326,21 @@ export class EvaluationService {
228
  // Format conversation for evaluation
229
  const formattedConversation = this.formatConversationForEvaluation(
230
  conversation,
231
- messages
 
232
  );
233
 
234
- // Build the user prompt
235
- const userPrompt = `
 
 
 
 
 
 
 
 
 
236
  學生系統提示:
237
  ${systemPrompt}
238
 
@@ -241,15 +349,20 @@ ${formattedConversation}
241
 
242
  請評估此對話並以 JSON 格式返回你的評估結果。`;
243
 
 
 
 
 
 
244
  console.log(
245
- `[Evaluation] Evaluating conversation ${conversationId} with ${messages.length} messages`
246
  );
247
 
248
  try {
249
  // Call the AI model
250
  const { text } = await generateText({
251
  model: openai(EVALUATION_MODEL),
252
- system: EVALUATION_SYSTEM_PROMPT,
253
  prompt: userPrompt,
254
  temperature: 0.3, // Lower temperature for more consistent results
255
  });
 
18
 
19
  const EVALUATION_MODEL = process.env.MODEL_NAME || 'gpt-4o-mini';
20
 
21
+ // Evaluation prompt for coach-direct conversations (no student involved)
22
+ const COACH_DIRECT_EVALUATION_SYSTEM_PROMPT = `你是一位專業的社會情緒學習(SEL)教練輔導評估專家。
23
+ 你的目標是評估**教練與教師之間的直接對話品質**,以便改進教練系統的輔導效能。
24
+
25
+ 你將會收到:
26
+ 1. 教練系統提示 - 定義教練角色的提示文字
27
+ 2. 教師與 AI 教練之間的對話
28
+
29
+ **重要說明:這是直接的教練-教師對話,沒有學生角色參與。**
30
+
31
+ **重要評估原則:**
32
+ - 以教師體驗為核心:教師在這次諮詢後感覺如何?有獲得幫助嗎?
33
+ - 即使教練提示設計良好,如果對話品質差,整體評分也應該低
34
+ - 如果教師輸入無意義內容(如隨機數字、亂碼),在 rationale 中註明並給予低分
35
+
36
+ 請評估以下維度(評分 1-5 分,1=差,5=優秀):
37
+
38
+ 1. 教練指導品質(權重 50%)
39
+ - 指導清晰度:教練的建議是否清楚且可執行?
40
+ - 回應完整度:教練是否充分回應教師的疑問和需求?
41
+ - 建議具體性:教練是否提供具體、實用的建議?
42
+ - 風格一致性:教練是否保持支持性、一致的溝通風格?
43
+
44
+ 2. 教師學習成效(權重 50%)
45
+ - 洞察深度:教師是否獲得有意義的洞察?
46
+ - 學習機會:對話是否創造了學習和成長的機會?
47
+ - 實用價值:教練的建議是否能應用於實際教學情境?
48
+ - 互動深度:教師是否有實質性地參與對話?
49
+
50
+ 3. 對話品質
51
+ - 教師收穫:教師是否從對話中有所收穫?
52
+ - 對話深度:對話是否有實質性內容?
53
+ - 教育價值:對話是否具有教育價值?
54
+
55
+ **整體評分計算:**
56
+ overallScore = 0.5 × coachingQuality.overall + 0.5 × teacherLearning.overall
57
+
58
+ **低分情況(應給 1-2 分):**
59
+ - 教師輸入無意義內容(隨機數字、單字、亂碼)
60
+ - 對話過短或缺乏實質互動
61
+ - 教師明顯沒有認真參與
62
+
63
+ 請以繁體中文回應,並返回以下 JSON 結構:
64
+
65
+ {
66
+ "teacherEngagement": {
67
+ "level": "<high|medium|low|none - 教師參與程度>",
68
+ "warning": "<如果 level 是 low 或 none,說明為什麼教師沒有認真參與,否則為空字串>"
69
+ },
70
+ "promptDesign": {
71
+ "clarity": <數字 1-5 - 指導清晰度>,
72
+ "completeness": <數字 1-5 - 回應完整度>,
73
+ "specificity": <數字 1-5 - 建議具體性>,
74
+ "consistency": <數字 1-5 - 風格一致性>,
75
+ "overall": <數字 1-5>,
76
+ "rationale": "<教練指導品質說明>"
77
+ },
78
+ "trainingEffectiveness": {
79
+ "challengeLevel": <數字 1-5 - 洞察深度>,
80
+ "learningOpportunities": <數字 1-5 - 學習機會>,
81
+ "realisticScenarios": <數字 1-5 - 實用價值>,
82
+ "engagementDepth": <數字 1-5 - 互動深度>,
83
+ "overall": <數字 1-5>,
84
+ "rationale": "<教師學習成效說明>"
85
+ },
86
+ "conversationQuality": {
87
+ "teacherInsights": <數字 1-5 - 教師收穫>,
88
+ "interactionDepth": <數字 1-5 - 對話深度>,
89
+ "educationalValue": <數字 1-5 - 教育價值>,
90
+ "overall": <數字 1-5>,
91
+ "rationale": "<對話品質說明>"
92
+ },
93
+ "overallScore": <數字 1-5,按權重計算>,
94
+ "strengths": ["<教練的優點>"],
95
+ "improvementAreas": ["<教練需要改進的地方>"],
96
+ "promptSuggestions": ["<具體的教練提示修改建議>"]
97
+ }
98
+
99
+ **teacherEngagement.level 判斷標準:**
100
+ - "high": 教師積極參與,提出有意義的問題和回應
101
+ - "medium": 教師有參與但互動較淺
102
+ - "low": 教師參與度低,回應簡短或缺乏深度
103
+ - "none": 教師輸入無意義內容(隨機數字、亂碼、測試文字)
104
+
105
+ 只返回有效的 JSON,不要其他文字。`;
106
+
107
+ // Evaluation prompt for student conversations (with simulated student)
108
  const EVALUATION_SYSTEM_PROMPT = `你是一位專業的社會情緒學習(SEL)教師培訓應用程式評估專家。
109
  你的目標是評估**教師在對話中的體驗**,以便提示工程師改進學生提示。
110
 
 
230
 
231
  private formatConversationForEvaluation(
232
  conversation: Conversation,
233
+ messages: Message[],
234
+ isCoachDirect: boolean = false
235
  ): string {
236
  const formattedMessages = messages
237
  .filter((m) => m.role !== 'system')
238
  .map((m) => {
239
+ let speaker: string;
240
+ if (isCoachDirect) {
241
+ // For coach-direct: only 老師 and 教練
242
+ speaker = m.role === 'user' ? '老師' : '教練';
243
+ } else {
244
+ // For student conversations: 老師, 學生, 教練
245
+ speaker =
246
+ m.role === 'user'
247
+ ? '老師'
248
+ : m.speaker === 'student'
249
+ ? '學生'
250
+ : '教練';
251
+ }
252
  return `${speaker}: ${m.content}`;
253
  })
254
  .join('\n');
 
316
  }
317
  }
318
 
319
+ // Detect conversation type
320
+ const isCoachDirect = conversation.studentPromptId === 'coach_direct';
321
+
322
  // Get the system prompt from the conversation
323
  const systemPrompt =
324
  conversation.systemPrompt || '(No system prompt available)';
 
326
  // Format conversation for evaluation
327
  const formattedConversation = this.formatConversationForEvaluation(
328
  conversation,
329
+ messages,
330
+ isCoachDirect
331
  );
332
 
333
+ // Build the user prompt based on conversation type
334
+ const userPrompt = isCoachDirect
335
+ ? `
336
+ 教練系統提示:
337
+ ${systemPrompt}
338
+
339
+ 對話內容(共 ${messages.length} 則訊息):
340
+ ${formattedConversation}
341
+
342
+ 請評估此教練-教師對話並以 JSON 格式返回你的評估結果。`
343
+ : `
344
  學生系統提示:
345
  ${systemPrompt}
346
 
 
349
 
350
  請評估此對話並以 JSON 格式返回你的評估結果。`;
351
 
352
+ // Select the appropriate evaluation system prompt
353
+ const evaluationSystemPrompt = isCoachDirect
354
+ ? COACH_DIRECT_EVALUATION_SYSTEM_PROMPT
355
+ : EVALUATION_SYSTEM_PROMPT;
356
+
357
  console.log(
358
+ `[Evaluation] Evaluating conversation ${conversationId} with ${messages.length} messages (mode: ${isCoachDirect ? 'coach_direct' : 'student'})`
359
  );
360
 
361
  try {
362
  // Call the AI model
363
  const { text } = await generateText({
364
  model: openai(EVALUATION_MODEL),
365
+ system: evaluationSystemPrompt,
366
  prompt: userPrompt,
367
  temperature: 0.3, // Lower temperature for more consistent results
368
  });
src/lib/types/models.ts CHANGED
@@ -126,6 +126,7 @@ export interface Evaluation {
126
  conversationId: string;
127
  studentPromptId?: string;
128
  evaluationType: EvaluationType;
 
129
  modelUsed: string;
130
  evaluatedAt: string;
131
  evaluatedBy?: string;
 
126
  conversationId: string;
127
  studentPromptId?: string;
128
  evaluationType: EvaluationType;
129
+ evaluationMode?: 'student' | 'coach_direct';
130
  modelUsed: string;
131
  evaluatedAt: string;
132
  evaluatedBy?: string;