chunchu-08 committed on
Commit d46a635 · 1 Parent(s): 4640243

Enhanced evaluation system

gradio_full_llm_eval.py CHANGED
@@ -1,336 +1,254 @@
 
1
  import gradio as gr
2
  import os
3
  import pandas as pd
4
- import plotly.graph_objects as go
5
  import plotly.express as px
6
- from datetime import datetime
 
 
7
  import json
 
 
8
 
9
- # Import modules from existing files
10
- from response_generator import generate_all_responses
11
- from round_robin_evaluator import comprehensive_round_robin_evaluation, save_comprehensive_results
12
  from realtime_detector import is_realtime_prompt
13
  from search_fallback import get_google_snippets
14
- from llm_prompt_eval_analysis import generate_visualizations, analyze_evaluation_data
15
 
16
- # Load environment variables
17
- from dotenv import load_dotenv
18
  load_dotenv()
 
19
 
20
- def check_api_keys():
21
- """Check if all required API keys are available."""
22
- keys_status = {}
23
-
24
- # Check OpenAI
25
- openai_key = os.getenv("OPENAI_API_KEY")
26
- keys_status["OpenAI (GPT-4)"] = "Available" if openai_key else "Missing"
27
-
28
- # Check Claude
29
- claude_key = os.getenv("CLAUDE_API_KEY")
30
- keys_status["Claude 3"] = "Available" if claude_key else "Missing"
31
-
32
- # Check Gemini
33
- gemini_key = os.getenv("GEMINI_API_KEY")
34
- keys_status["Gemini 1.5"] = "Available" if gemini_key else "Missing"
35
-
36
- # Check Google Search (optional)
37
- google_key = os.getenv("GOOGLE_API_KEY")
38
- google_cse = os.getenv("GOOGLE_CSE_ID")
39
- keys_status["Google Search"] = "Available" if (google_key and google_cse) else "Missing"
40
-
41
- return keys_status
42
-
43
- def process_prompt(prompt, enable_realtime_detection, enable_evaluation, enable_analysis):
44
- """Process a prompt through the complete pipeline."""
45
- if not prompt.strip():
46
- return "Please enter a prompt.", None, None, None, None, None
47
-
48
- results = {
49
- "prompt": prompt,
50
- "responses": {},
51
- "evaluation": None,
52
- "analysis": None,
53
- "search_results": None,
54
- "is_realtime": False
55
- }
56
-
57
- # Step 1: Check if real-time detection is needed
58
- if enable_realtime_detection:
59
- try:
60
- results["is_realtime"] = is_realtime_prompt(prompt)
61
- if results["is_realtime"]:
62
- # Get Google search results
63
- search_results = get_google_snippets(prompt)
64
- results["search_results"] = search_results
65
- # Enhance prompt with search results
66
- enhanced_prompt = f"{prompt}\n\nRecent information: {search_results}"
67
- else:
68
- enhanced_prompt = prompt
69
- except Exception as e:
70
- print(f"Real-time detection error: {e}")
71
- enhanced_prompt = prompt
72
- else:
73
- enhanced_prompt = prompt
74
-
75
- # Step 2: Generate responses from all models
76
  try:
77
- responses = generate_all_responses(enhanced_prompt)
78
- results["responses"] = responses
79
- except Exception as e:
80
- return f"Error generating responses: {e}", None, None, None, None, None
81
-
82
- # Step 3: Perform evaluation if requested
83
- if enable_evaluation and responses:
84
- try:
85
- evaluation_results = comprehensive_round_robin_evaluation(responses, prompt)
86
- results["evaluation"] = evaluation_results
87
-
88
- # Save results
89
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
90
- csv_file = save_comprehensive_results(evaluation_results, prompt, timestamp)
91
-
92
- except Exception as e:
93
- print(f"Evaluation error: {e}")
94
-
95
- # Step 4: Generate analysis if requested
96
- if enable_analysis and results["evaluation"]:
97
- try:
98
- # Create a temporary DataFrame for analysis
99
- analysis_data = []
100
- for model, data in results["evaluation"].items():
101
- for evaluator, eval_data in data.get('evaluations', {}).items():
102
  row = {
 
103
  'target_model': model,
104
  'evaluator': evaluator,
105
- 'helpfulness': eval_data.get('helpfulness', 0.5),
106
- 'correctness': eval_data.get('correctness', 0.5),
107
- 'coherence': eval_data.get('coherence', 0.5),
108
- 'clarity': eval_data.get('clarity', 0.5),
109
- 'response': data.get('response', '')
110
  }
111
- analysis_data.append(row)
112
-
113
- if analysis_data:
114
- df = pd.DataFrame(analysis_data)
115
- results["analysis"] = df
116
-
117
- except Exception as e:
118
- print(f"Analysis error: {e}")
119
-
120
- return format_results(results)
121
-
122
- def format_results(results):
123
- """Format results for Gradio display."""
124
- prompt = results["prompt"]
125
- responses = results["responses"]
126
- evaluation = results["evaluation"]
127
- analysis = results["analysis"]
128
- search_results = results["search_results"]
129
- is_realtime = results["is_realtime"]
130
-
131
- # Format responses
132
- responses_text = ""
133
- if responses:
134
- responses_text = "MODEL RESPONSES:\n" + "="*50 + "\n"
135
- for model, response in responses.items():
136
- responses_text += f"\n{model}:\n{'-'*20}\n{response}\n"
137
- else:
138
- responses_text = "No responses generated. Check API keys."
139
-
140
- # Format evaluation results
141
- evaluation_text = ""
142
- if evaluation:
143
- evaluation_text = "EVALUATION RESULTS:\n" + "="*50 + "\n"
144
- for model, data in evaluation.items():
145
- avg_scores = data.get('average_scores', {})
146
- evaluation_text += f"\n{model} Average Scores:\n"
147
- for metric, score in avg_scores.items():
148
- evaluation_text += f" {metric}: {score}\n"
149
- evaluation_text += f" Evaluated by: {list(data.get('evaluations', {}).keys())}\n"
150
- else:
151
- evaluation_text = "No evaluation performed."
152
-
153
- # Format search results
154
- search_text = ""
155
- if search_results and is_realtime:
156
- search_text = "REAL-TIME SEARCH RESULTS:\n" + "="*50 + "\n"
157
- search_text += search_results
158
- elif is_realtime:
159
- search_text = "Real-time query detected but search results unavailable."
160
- else:
161
- search_text = "Not a real-time query."
162
-
163
- # Create visualizations
164
- charts = []
165
- if analysis is not None and not analysis.empty:
166
- charts = create_visualizations(analysis)
167
-
168
- return responses_text, evaluation_text, search_text, charts
169
-
170
- def create_visualizations(df):
171
- """Create Plotly visualizations for the analysis."""
172
- charts = []
173
-
174
- try:
175
- # 1. Model Performance Comparison
176
- if 'target_model' in df.columns:
177
- metrics = ['helpfulness', 'correctness', 'coherence', 'clarity']
178
-
179
- for metric in metrics:
180
- if metric in df.columns:
181
- fig = px.box(df, x='target_model', y=metric,
182
- title=f'{metric.title()} Scores by Model',
183
- color='target_model')
184
- fig.update_layout(showlegend=False)
185
- charts.append(fig)
186
-
187
- # 2. Evaluator Bias Analysis
188
- if 'evaluator' in df.columns:
189
- metrics = ['helpfulness', 'correctness', 'coherence', 'clarity']
190
-
191
- for metric in metrics:
192
- if metric in df.columns:
193
- fig = px.box(df, x='evaluator', y=metric,
194
- title=f'{metric.title()} Scores by Evaluator',
195
- color='evaluator')
196
- fig.update_layout(showlegend=False)
197
- charts.append(fig)
198
-
199
- # 3. Heatmap of Cross-Evaluations
200
- if 'target_model' in df.columns and 'evaluator' in df.columns and 'helpfulness' in df.columns:
201
- pivot_data = df.pivot_table(
202
- values='helpfulness',
203
- index='target_model',
204
- columns='evaluator',
205
- aggfunc='mean'
206
- ).fillna(0)
207
-
208
- fig = px.imshow(pivot_data.values,
209
- x=pivot_data.columns,
210
- y=pivot_data.index,
211
- title='Cross-Evaluation Heatmap (Helpfulness)',
212
- color_continuous_scale='RdYlBu_r',
213
- aspect='auto')
214
- fig.update_layout(xaxis_title='Evaluator', yaxis_title='Target Model')
215
- charts.append(fig)
216
-
217
- except Exception as e:
218
- print(f"Visualization error: {e}")
219
-
220
- return charts
221
-
222
- def export_results(responses_text, evaluation_text, search_text):
223
- """Export results to a text file."""
224
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
225
- filename = f"results/export_{timestamp}.txt"
226
-
227
- os.makedirs("results", exist_ok=True)
228
-
229
- with open(filename, 'w', encoding='utf-8') as f:
230
- f.write("LLM COMPARISON RESULTS\n")
231
- f.write("="*50 + "\n")
232
- f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
233
-
234
- f.write(responses_text + "\n\n")
235
- f.write(evaluation_text + "\n\n")
236
- f.write(search_text + "\n\n")
237
-
238
- return f"Results exported to {filename}"
239
-
240
- # Create Gradio interface
241
  def create_interface():
242
- """Create the Gradio interface."""
243
-
244
- # Check API keys
245
- api_status = check_api_keys()
246
- api_status_text = "API KEY STATUS:\n" + "="*30 + "\n"
247
- for service, status in api_status.items():
248
- api_status_text += f"{service}: {status}\n"
249
-
250
- with gr.Blocks(title="LLM Comparison Hub", theme=gr.themes.Soft()) as interface:
251
- gr.Markdown("# LLM Comparison Hub")
252
- gr.Markdown("Compare responses from GPT-4, Claude 3, and Gemini 1.5 with comprehensive evaluation and analysis.")
253
-
254
  with gr.Row():
255
- with gr.Column(scale=2):
256
- # Input section
257
- gr.Markdown("## Input")
258
- prompt_input = gr.Textbox(
259
- label="Enter your prompt",
260
- placeholder="Type your question or prompt here...",
261
- lines=4
262
- )
263
-
264
- with gr.Row():
265
- realtime_checkbox = gr.Checkbox(label="Enable real-time detection", value=True)
266
- evaluation_checkbox = gr.Checkbox(label="Enable evaluation", value=True)
267
- analysis_checkbox = gr.Checkbox(label="Enable analysis", value=True)
268
-
269
- process_btn = gr.Button("Process Prompt", variant="primary")
270
-
271
- # API status
272
- gr.Markdown("## API Status")
273
- api_status_display = gr.Textbox(
274
- value=api_status_text,
275
- label="API Keys",
276
- lines=len(api_status) + 3,
277
- interactive=False
278
- )
279
-
280
- with gr.Column(scale=3):
281
- # Output section
282
- gr.Markdown("## Results")
283
-
284
  with gr.Tabs():
285
- with gr.TabItem("Responses"):
286
- responses_output = gr.Textbox(
287
- label="Model Responses",
288
- lines=15,
289
- interactive=False
290
- )
291
-
292
- with gr.TabItem("Evaluation"):
293
- evaluation_output = gr.Textbox(
294
- label="Evaluation Results",
295
- lines=15,
296
- interactive=False
297
- )
298
-
299
- with gr.TabItem("Search Results"):
300
- search_output = gr.Textbox(
301
- label="Real-time Search Results",
302
- lines=10,
303
- interactive=False
304
- )
305
-
306
- with gr.TabItem("Visualizations"):
307
- charts_output = gr.Plot(label="Analysis Charts")
308
-
309
- # Export button
310
- export_btn = gr.Button("Export Results")
311
- export_output = gr.Textbox(label="Export Status", interactive=False)
312
-
313
- # Event handlers
314
- process_btn.click(
315
  fn=process_prompt,
316
- inputs=[prompt_input, realtime_checkbox, evaluation_checkbox, analysis_checkbox],
317
- outputs=[responses_output, evaluation_output, search_output, charts_output]
318
  )
319
-
320
- export_btn.click(
321
- fn=export_results,
322
- inputs=[responses_output, evaluation_output, search_output],
323
- outputs=[export_output]
324
- )
325
-
326
- return interface
327
 
328
  if __name__ == "__main__":
329
- # Create and launch the interface
330
- interface = create_interface()
331
- interface.launch(
332
- server_name="0.0.0.0",
333
- server_port=7860,
334
- share=False,
335
- debug=True
336
- )
 
1
+ # gradio_full_llm_eval.py – Final Updated Version with ATS Scoring and Visualized UI
2
  import gradio as gr
3
  import os
4
  import pandas as pd
 
5
  import plotly.express as px
6
+ import plotly.graph_objects as go
7
+ import plotly.io as pio
8
+ import zipfile
9
  import json
10
+ from datetime import datetime
11
+ from dotenv import load_dotenv
12
 
13
+ from response_generator import generate_all_responses_with_reasoning
14
+ from round_robin_evaluator import comprehensive_round_robin_evaluation
 
15
  from realtime_detector import is_realtime_prompt
16
  from search_fallback import get_google_snippets
 
17
 
 
 
18
  load_dotenv()
19
+ pio.kaleido.scope.default_format = "png"
20
 
21
+ metrics = ['helpfulness', 'correctness', 'coherence', 'tone_score',
22
+ 'accuracy', 'relevance', 'completeness', 'clarity']
23
+
24
+ def extract_text_from_resume(file):
25
+ ext = os.path.splitext(file.name)[1].lower()
26
+ if ext == ".pdf":
27
+ import fitz
28
+ with fitz.open(file.name) as doc:
29
+ return "\n".join(page.get_text() for page in doc)
30
+ elif ext == ".docx":
31
+ import docx
32
+ doc = docx.Document(file.name)
33
+ return "\n".join(p.text for p in doc.paragraphs)
34
+ elif ext == ".txt":
35
+ return file.read().decode('utf-8')
36
+ return ""
37
+
38
+ def ats_score_advanced(response, resume, jd):
39
+ prompt = f"""
40
+ You are a professional ATS scoring engine. Compare the generated response to the candidate's resume and job description using:
41
+ 1. Keyword Matching
42
+ 2. Section Weighting
43
+ 3. Semantic Similarity
44
+ 4. Recency/Frequency
45
+ 5. Penalty Detection
46
+ 6. Aggregation
47
+
48
+ Resume:
49
+ {resume}
50
+
51
+ Job Description:
52
+ {jd}
53
+
54
+ Response:
55
+ {response}
56
+
57
+ Return JSON:
58
+ {{"ats_score": <0-100>, "strengths": ["..."], "gaps": ["..."], "suggestions": ["..."]}}
59
+ """
60
+ from openai import OpenAI
61
+ openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
62
  try:
63
+ res = openai_client.chat.completions.create(
64
+ model="gpt-4",
65
+ messages=[{"role": "user", "content": prompt}],
66
+ temperature=0
67
+ )
68
+ return json.loads(res.choices[0].message.content.strip())
69
+ except:
70
+ return {"ats_score": 50, "strengths": [], "gaps": [], "suggestions": ["Check formatting."]}
71
+
72
+ def create_visualizations(df, results_dir):
73
+ image_files = []
74
+ summary = df.groupby('target_model')[metrics].mean().reset_index()
75
+
76
+ heatmap = px.imshow(summary[metrics].values, x=metrics, y=summary['target_model'],
77
+ labels=dict(x="Metric", y="Model", color="Score"),
78
+ title="Heatmap: Metrics Across Models", color_continuous_scale='Viridis')
79
+ heatmap_path = os.path.join(results_dir, "heatmap.png")
80
+ heatmap.write_image(heatmap_path)
81
+ image_files.append(heatmap_path)
82
+
83
+ radar = go.Figure()
84
+ for _, row in summary.iterrows():
85
+ radar.add_trace(go.Scatterpolar(r=list(row[metrics]), theta=metrics, fill='toself', name=row['target_model']))
86
+ radar.update_layout(title="Radar Chart: Model Score Profiles", polar=dict(radialaxis=dict(visible=True, range=[0, 1])))
87
+ radar_path = os.path.join(results_dir, "radar.png")
88
+ radar.write_image(radar_path)
89
+ image_files.append(radar_path)
90
+
91
+ bar = px.bar(summary.melt(id_vars='target_model'), x='variable', y='value', color='target_model', barmode='group',
92
+ title="Bar Chart: Metric Comparison")
93
+ bar_path = os.path.join(results_dir, "barchart.png")
94
+ bar.write_image(bar_path)
95
+ image_files.append(bar_path)
96
+
97
+ return (heatmap, radar, bar), image_files
98
+
99
+ def format_ats_feedback(score, strengths, gaps, suggestions):
100
+ color = "🟢" if score >= 75 else "🟡" if score >= 50 else "🔴"
101
+ return f"""
102
+ ### ATS Match Score: ~{score}% {color}
103
+
104
+ #### **Strengths / High Matches:**
105
+ {chr(10).join([f"* {s}" for s in strengths]) if strengths else "* None found."}
106
+
107
+ #### **Partial or Missing:**
108
+ {chr(10).join([f"* {g}" for g in gaps]) if gaps else "* None mentioned."}
109
+
110
+ #### **How to Improve ATS Score:**
111
+ {chr(10).join([f"1. {s}" for s in suggestions]) if suggestions else "1. Add missing skills."}
112
+ """
113
+
114
+ def process_prompt(prompt, enable_realtime, enable_eval, enable_analysis, user_file, model_selection):
115
+ selected_models = [m for m, enabled in zip(["GPT-4", "Claude 3", "Gemini 1.5"], model_selection) if enabled]
116
+ resume_text = ""
117
+ batch_mode = user_file and user_file.name.endswith(".csv")
118
+ resume_mode = user_file and user_file.name.lower().endswith(('.pdf', '.docx', '.txt'))
119
+
120
+ prompts = [prompt]
121
+ ats_summary_texts = []
122
+ search_results = ""
123
+
124
+ if batch_mode:
125
+ df_batch = pd.read_csv(user_file.name)
126
+ prompts = df_batch['prompt'].dropna().tolist()
127
+ elif resume_mode:
128
+ resume_text = extract_text_from_resume(user_file)
129
+
130
+ all_rows, all_charts = [], []
131
+ zip_path, ats_table_markdown = None, ""
132
+
133
+ for prompt_text in prompts:
134
+ search_results = get_google_snippets(prompt_text) if enable_realtime and is_realtime_prompt(prompt_text) else ""
135
+ final_prompt = f"{prompt_text}\n\nRecent info: {search_results}" if search_results else prompt_text
136
+ responses = generate_all_responses_with_reasoning(final_prompt, selected_models)
137
+
138
+ ats_rows = []
139
+ for model in responses:
140
+ model_resp = responses[model]['response']
141
+ if resume_text:
142
+ ats_result = ats_score_advanced(model_resp, resume_text, prompt_text)
143
+ feedback = format_ats_feedback(ats_result['ats_score'], ats_result.get('strengths', []), ats_result.get('gaps', []), ats_result.get('suggestions', []))
144
+ responses[model]['ats_embed'] = f"### Response\n\n{model_resp}\n\n---\n\n### ATS Evaluation\n\n{feedback}"
145
+ ats_rows.append(f"| {model} | {ats_result['ats_score']} | {', '.join(ats_result.get('strengths', []))} | {', '.join(ats_result.get('suggestions', []))} |")
146
+ else:
147
+ responses[model]['ats_embed'] = f"### Response\n\n{model_resp}\n\n---\n\n**Explainability:**\n{responses[model]['reasoning']}"
148
+ if ats_rows:
149
+ ats_table_markdown = "| Model | Score | Strengths | Suggestions |\n|-------|-------|-----------|-------------|\n" + "\n".join(ats_rows)
150
+
151
+ if enable_eval:
152
+ compact = {k: v['response'] for k, v in responses.items()}
153
+ eval_result = comprehensive_round_robin_evaluation(compact, final_prompt)
154
+ for model, data in eval_result.items():
155
+ for evaluator, scores in data['evaluations'].items():
156
  row = {
157
+ 'prompt': prompt_text,
158
  'target_model': model,
159
  'evaluator': evaluator,
160
+ 'response': responses[model]['response'],
161
+ 'explainability': responses[model]['reasoning']
 
 
 
162
  }
163
+ row.update({k: scores.get(k, 0.5) for k in metrics})
164
+ row.update({f"avg_{k}": data['average_scores'].get(k, 0.5) for k in metrics})
165
+ all_rows.append(row)
166
+
167
+ df_all = pd.DataFrame(all_rows)
168
+ if not df_all.empty:
169
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
170
+ results_dir = f"results/batch_{timestamp}"
171
+ os.makedirs(results_dir, exist_ok=True)
172
+ csv_path = os.path.join(results_dir, "evaluation.csv")
173
+ df_all.to_csv(csv_path, index=False)
174
+ (heatmap, radar, bar), chart_paths = create_visualizations(df_all, results_dir)
175
+ all_charts = [heatmap, radar, bar]
176
+ zip_path = os.path.join(results_dir, "bundle.zip")
177
+ with zipfile.ZipFile(zip_path, 'w') as zipf:
178
+ zipf.write(csv_path, arcname="evaluation.csv")
179
+ for chart in chart_paths:
180
+ zipf.write(chart, arcname=os.path.basename(chart))
181
+ if batch_mode:
182
+ df_batch['ATS Summary'] = ats_summary_texts
183
+ df_batch.to_csv(os.path.join(results_dir, "batch_prompts_output.csv"), index=False)
184
+ zipf.write(os.path.join(results_dir, "batch_prompts_output.csv"), arcname="batch_prompts_output.csv")
185
+
186
+ return tuple(
187
+ responses[model].get('ats_embed', responses[model]['response']) for model in ["GPT-4", "Claude 3", "Gemini 1.5"]
188
+ ) + (
189
+ search_results or "N/A",
190
+ *all_charts,
191
+ df_all[['target_model', 'evaluator'] + metrics] if not df_all.empty else pd.DataFrame(),
192
+ ats_table_markdown,
193
+ zip_path
194
+ )
195
+
196
+ def download_results(path):
197
+ return path if path and os.path.exists(path) else None
198
+
199
  def create_interface():
200
+ with gr.Blocks(title="LLM Comparison Hub") as demo:
201
+ gr.Markdown("""
202
+ # LLM Comparison Hub
203
+ This app compares LLM responses using round-robin evaluations, with real-time query detection and comprehensive analysis.
204
+
205
+ **How to use:**
206
+ - Enter a prompt (JD or query)
207
+ - Upload a resume (PDF/DOCX/TXT) or a CSV with prompts
208
+ - Select models
209
+ - Click evaluate
210
+
211
+ **Features:**
212
+ - Real-time web search fallback
213
+ - Resume vs JD ATS scoring (optional)
214
+ - Batch CSV prompt evaluation
215
+ - Visualizations (Heatmap, Radar, Bar)
216
+ - ZIP export of all results
217
+ """)
218
  with gr.Row():
219
+ with gr.Column():
220
+ prompt = gr.Textbox(label="Enter Prompt", lines=4)
221
+ user_file = gr.File(label="Upload Resume or CSV", file_types=[".pdf", ".docx", ".txt", ".csv"])
222
+ model_selector = gr.CheckboxGroup(label="Select Models", choices=["GPT-4", "Claude 3", "Gemini 1.5"], value=["GPT-4", "Claude 3", "Gemini 1.5"])
223
+ enable_realtime = gr.Checkbox(label="Enable real-time detection", value=True)
224
+ enable_eval = gr.Checkbox(label="Enable evaluation", value=True)
225
+ enable_analysis = gr.Checkbox(label="Enable analysis", value=True)
226
+ submit = gr.Button("Run Evaluation")
227
+
228
+ with gr.Column():
229
  with gr.Tabs():
230
+ with gr.Tab("GPT-4"): gpt_out = gr.Markdown()
231
+ with gr.Tab("Claude 3"): claude_out = gr.Markdown()
232
+ with gr.Tab("Gemini 1.5"): gemini_out = gr.Markdown()
233
+ with gr.Tab("Evaluation Table"): df_out = gr.Dataframe()
234
+ with gr.Tab("ATS Evaluation"): ats_summary = gr.Markdown()
235
+ with gr.Tab("Search Results"): search_out = gr.Markdown()
236
+ with gr.Tab("Visualizations"):
237
+ heatmap_plot = gr.Plot()
238
+ radar_plot = gr.Plot()
239
+ bar_plot = gr.Plot()
240
+ export_btn = gr.Button("Download ZIP Bundle")
241
+ zip_output = gr.File(file_types=[".zip"], interactive=False, visible=True)
242
+
243
+ submit.click(
244
  fn=process_prompt,
245
+ inputs=[prompt, enable_realtime, enable_eval, enable_analysis, user_file, model_selector],
246
+ outputs=[gpt_out, claude_out, gemini_out, search_out, heatmap_plot, radar_plot, bar_plot, df_out, ats_summary, zip_output]
247
  )
248
+ export_btn.click(download_results, inputs=[zip_output], outputs=[zip_output])
249
+
250
+ return demo
251
 
252
  if __name__ == "__main__":
253
+ app = create_interface()
254
+ app.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True)
information ADDED
@@ -0,0 +1,136 @@
1
+ LLM-Compare-Hub Project File Structure and Use Cases
2
+ ====================================================
3
+
4
+ Core Application Files
5
+ ---------------------
6
+
7
+ gradio_full_llm_eval.py (1.0B)
8
+ - Use Case: Main Gradio web interface
9
+ - Function: Orchestrates the entire application, provides user-friendly web UI
10
+ - Features: Prompt input, response display, evaluation results, analysis generation
11
+ - Status: Modular - delegates all logic to other files
12
+
13
+ response_generator.py (8.6KB)
14
+ - Use Case: LLM response generation and comparison
15
+ - Function: Generates responses from GPT-4, Claude 3, and Gemini 1.5
16
+ - Features: Side-by-side response comparison, batch processing, optional evaluation
17
+ - Status: Standalone tool + used by Gradio app
18
+
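A minimal usage sketch, assuming the generate_all_responses_with_reasoning signature and return shape shown in the response_generator.py diff below (the prompt text is illustrative):

from response_generator import generate_all_responses_with_reasoning

# Each entry maps a model name to {"response": ..., "reasoning": ...}
results = generate_all_responses_with_reasoning(
    "Summarize the key requirements in this job description.",
    selected_models=["GPT-4", "Claude 3"],
)
for model_name, data in results.items():
    print(model_name)
    print("  response :", data["response"])
    print("  reasoning:", data["reasoning"])
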
19
+ round_robin_evaluator.py (9.3KB)
20
+ - Use Case: Comprehensive model evaluation system
21
+ - Function: Each model evaluates all other models (GPT-4 evaluates Claude/Gemini, etc.)
22
+ - Features: Multi-metric scoring, CSV export, detailed reasoning
23
+ - Status: Core evaluation engine used by all other components
24
+
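A minimal sketch of driving the evaluator directly, assuming the {model_name: response_text} input and the evaluations/average_scores output shown in the round_robin_evaluator.py diff below (responses and timestamp are illustrative):

from round_robin_evaluator import comprehensive_round_robin_evaluation, save_comprehensive_results

responses = {
    "GPT-4": "Example answer from GPT-4.",
    "Claude 3": "Example answer from Claude 3.",
    "Gemini 1.5": "Example answer from Gemini 1.5.",
}
prompt = "What is artificial intelligence?"

results = comprehensive_round_robin_evaluation(responses, prompt)
for model, data in results.items():
    print(model, data["average_scores"])  # averaged over the other two evaluators

# Writes results/comprehensive_eval_<timestamp>.csv
save_comprehensive_results(results, prompt, timestamp="20250101_120000")
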
25
+ llm_prompt_eval_analysis.py (12KB)
26
+ - Use Case: Data analysis and visualization
27
+ - Function: Analyzes evaluation results, generates charts and reports
28
+ - Features: Statistical analysis, correlation matrices, performance comparisons
29
+ - Status: Standalone analysis tool + used by Gradio app
30
+
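A small pandas sketch of the kind of analysis this module performs, assuming the evaluation.csv written by the updated Gradio app (column names per the process_prompt code above); the exact routines inside llm_prompt_eval_analysis.py may differ:

import pandas as pd

df = pd.read_csv("results/batch_20250101_120000/evaluation.csv")  # illustrative path
metrics = ['helpfulness', 'correctness', 'coherence', 'tone_score',
           'accuracy', 'relevance', 'completeness', 'clarity']

# Per-model performance comparison
print(df.groupby('target_model')[metrics].mean().round(3))

# Correlation matrix between the eight scoring metrics
print(df[metrics].corr().round(2))
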
31
+ llm_response_logger.py (6.3KB)
32
+ - Use Case: Quick testing and logging tool
33
+ - Function: Rapid testing of all models with single or batch prompts
34
+ - Features: Quick evaluation, CSV export, batch processing
35
+ - Status: Standalone testing tool + used by Gradio app
36
+
37
+ Supporting Modules
38
+ -----------------
39
+
40
+ realtime_detector.py (923B)
41
+ - Use Case: Real-time query detection
42
+ - Function: Determines if a prompt needs current information
43
+ - Features: Uses GPT-3.5-turbo to classify real-time vs. general queries
44
+ - Status: Utility module used by response generation
45
+
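realtime_detector.py itself is not part of this commit; a plausible sketch of the classifier described above (the model name and prompt wording are assumptions, only the is_realtime_prompt name comes from the imports in gradio_full_llm_eval.py):

import os
from openai import OpenAI

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def is_realtime_prompt(prompt: str) -> bool:
    """Ask GPT-3.5-turbo whether answering needs current information (yes/no)."""
    result = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{
            "role": "user",
            "content": (
                "Does answering the following prompt require current, real-time "
                "information? Reply with only 'yes' or 'no'.\n\n" + prompt
            ),
        }],
        temperature=0,
    )
    return result.choices[0].message.content.strip().lower().startswith("yes")
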
46
+ search_fallback.py (1.6KB)
47
+ - Use Case: Google search integration
48
+ - Function: Fetches current information for real-time queries
49
+ - Features: Google Custom Search API integration, result formatting
50
+ - Status: Utility module used by response generation
51
+
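search_fallback.py is likewise not shown in this diff; a plausible sketch of get_google_snippets against the Google Custom Search JSON API, using the GOOGLE_API_KEY and GOOGLE_CSE_ID variables the project already reads:

import os
import requests

def get_google_snippets(query: str, num: int = 5) -> str:
    """Return the top result snippets for a query as a newline-joined string."""
    resp = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params={
            "key": os.getenv("GOOGLE_API_KEY"),
            "cx": os.getenv("GOOGLE_CSE_ID"),
            "q": query,
            "num": num,
        },
        timeout=10,
    )
    resp.raise_for_status()
    items = resp.json().get("items", [])
    return "\n".join(item.get("snippet", "") for item in items)
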
52
+ Configuration & Documentation
53
+ ----------------------------
54
+
55
+ requirements.txt (232B)
56
+ - Use Case: Python dependencies
57
+ - Function: Lists all required packages and versions
58
+ - Features: Gradio, OpenAI, Anthropic, Google AI, pandas, matplotlib, etc.
59
+ - Status: Essential for project setup
60
+
61
+ .env (not shown - should exist)
62
+ - Use Case: API key configuration
63
+ - Function: Stores all API keys securely
64
+ - Features: OpenAI, Claude, Gemini, Google Search API keys
65
+ - Status: Essential for functionality
66
+
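An example .env layout using the variable names referenced in the project code (values are placeholders):

# Model API keys (required)
OPENAI_API_KEY=...
CLAUDE_API_KEY=...
GEMINI_API_KEY=...
# Google Custom Search (optional, enables the real-time fallback)
GOOGLE_API_KEY=...
GOOGLE_CSE_ID=...
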
67
+ .gitignore (661B)
68
+ - Use Case: Git version control
69
+ - Function: Excludes sensitive files from version control
70
+ - Features: API keys, results, cache files, etc.
71
+ - Status: Essential for security
72
+
73
+ README.md (4.1KB)
74
+ - Use Case: Project documentation
75
+ - Function: Setup instructions, usage guide, feature descriptions
76
+ - Features: Installation, configuration, usage examples
77
+ - Status: Essential for users and collaborators
78
+
79
+ Testing & Development
80
+ --------------------
81
+
82
+ test_standalone_tools.py (4.1KB)
83
+ - Use Case: Testing and demonstration
84
+ - Function: Shows how to use all standalone tools
85
+ - Features: Quick start guide, sample prompts, tool explanations
86
+ - Status: Development/testing tool
87
+
88
+ __pycache__/ (directory)
89
+ - Use Case: Python cache
90
+ - Function: Stores compiled Python bytecode
91
+ - Features: Improves import performance
92
+ - Status: Auto-generated, can be deleted
93
+
94
+ Generated Files (when running the app)
95
+ -------------------------------------
96
+
97
+ results/ (directory - created when needed)
98
+ - Use Case: Evaluation results storage
99
+ - Function: Stores CSV files with evaluation data
100
+ - Features: Timestamped files, comprehensive evaluation data
101
+ - Status: Auto-generated during evaluation
102
+
103
+ analysis_results/ (directory - created when needed)
104
+ - Use Case: Analysis output storage
105
+ - Function: Stores charts, reports, and visualizations
106
+ - Features: Performance charts, correlation matrices, analysis reports
107
+ - Status: Auto-generated during analysis
108
+
109
+ Project Summary
110
+ ==============
111
+
112
+ Your project has a clean, modular architecture with:
113
+ - 4 core functional modules (response generation, evaluation, analysis, logging)
114
+ - 2 utility modules (real-time detection, search integration)
115
+ - 1 main interface (Gradio web app)
116
+ - Complete configuration (requirements, environment, documentation)
117
+ - Testing tools for development and demonstration
118
+
119
+ All files serve specific purposes and work together to provide a comprehensive LLM comparison and evaluation system.
120
+
121
+ Key Features:
122
+ - Multi-model response generation (GPT-4, Claude 3, Gemini 1.5)
123
+ - Comprehensive round-robin evaluation system
124
+ - Real-time query detection and search integration
125
+ - Advanced data analysis and visualization
126
+ - Batch processing capabilities
127
+ - Clean, production-ready code without emojis
128
+ - Modular architecture for maintainability
129
+ - Complete web interface via Gradio
130
+ - Standalone tools for automation and testing
131
+
132
+ Usage:
133
+ 1. Set up API keys in .env file
134
+ 2. Install dependencies: pip install -r requirements.txt
135
+ 3. Run main app: python gradio_full_llm_eval.py
136
+ 4. Or use standalone tools for specific tasks
requirements.txt CHANGED
@@ -10,4 +10,7 @@ python-dotenv>=1.0.0
10
  requests>=2.31.0
11
  tqdm>=4.65.0
12
  scikit-learn>=1.3.0
13
- plotly>=5.18.0
10
  requests>=2.31.0
11
  tqdm>=4.65.0
12
  scikit-learn>=1.3.0
13
+ plotly>=5.18.0
14
+ kaleido>=0.2.1
15
+ PyMuPDF>=1.23.0
16
+ python-docx>=1.1.0
response_generator.py CHANGED
@@ -1,11 +1,8 @@
1
- import csv
2
  import os
3
  from dotenv import load_dotenv
4
  from openai import OpenAI
5
  import anthropic
6
  import google.generativeai as genai
7
- from round_robin_evaluator import comprehensive_round_robin_evaluation, save_comprehensive_results
8
- from datetime import datetime
9
 
10
  # Load API keys from .env
11
  load_dotenv()
@@ -14,20 +11,50 @@ anthropic_client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API_KEY"))
14
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
15
 
16
  def get_gpt4_response(prompt):
17
- """Get response from GPT-4."""
18
  try:
19
  response = openai_client.chat.completions.create(
20
  model="gpt-4",
21
- messages=[{"role": "user", "content": prompt}],
22
  temperature=0.7
23
  )
24
  return response.choices[0].message.content
 
25
  except Exception as e:
26
  print(f"Error with GPT-4: {e}")
27
- return None
28
 
29
  def get_claude_response(prompt):
30
- """Get response from Claude."""
31
  try:
32
  response = anthropic_client.messages.create(
33
  model="claude-3-opus-20240229",
@@ -38,226 +65,39 @@ def get_claude_response(prompt):
38
  return response.content[0].text
39
  except Exception as e:
40
  print(f"Error with Claude 3: {e}")
41
- return None
42
 
43
  def get_gemini_response(prompt):
44
- """Get response from Gemini."""
45
  try:
46
  model = genai.GenerativeModel("gemini-1.5-pro")
47
  response = model.generate_content(prompt)
48
  return response.text
49
  except Exception as e:
50
  print(f"Error with Gemini: {e}")
51
- return None
52
 
53
- def display_responses_side_by_side(responses, prompt):
54
- """Display responses in a formatted side-by-side comparison."""
55
- print("\n" + "="*80)
56
- print(f"PROMPT: {prompt}")
57
- print("="*80)
58
-
59
- models = list(responses.keys())
60
- if len(models) == 0:
61
- print("No responses to display")
62
- return
63
-
64
- # Display responses
65
- for i, model in enumerate(models, 1):
66
- response = responses[model]
67
- print(f"\n{i}. {model} RESPONSE:")
68
- print("-" * 40)
69
- print(response)
70
- print("-" * 40)
71
- print(f"Length: {len(response)} characters")
72
- print()
73
 
74
- def generate_and_compare_responses():
75
- """Generate responses from all models and display comparison."""
76
- print("=== Response Generator - Model Comparison Tool ===\n")
77
-
78
- # Get prompt from user
79
- prompt = input("Enter your prompt: ")
80
- if not prompt.strip():
81
- print("No prompt provided. Exiting.")
82
- return
83
-
84
- print(f"\nGenerating responses for: '{prompt}'")
85
- print("=" * 60)
86
-
87
- # Collect responses from all models
88
  responses = {}
89
-
90
- print("\n1. Generating GPT-4 response...")
91
- gpt_response = get_gpt4_response(prompt)
92
- if gpt_response:
93
- responses['GPT-4'] = gpt_response
94
- print("GPT-4 response generated")
95
- else:
96
- print("GPT-4 failed")
97
-
98
- print("\n2. Generating Claude response...")
99
- claude_response = get_claude_response(prompt)
100
- if claude_response:
101
- responses['Claude 3'] = claude_response
102
- print("Claude response generated")
103
- else:
104
- print("Claude failed")
105
-
106
- print("\n3. Generating Gemini response...")
107
- gemini_response = get_gemini_response(prompt)
108
- if gemini_response:
109
- responses['Gemini 1.5'] = gemini_response
110
- print("Gemini response generated")
111
- else:
112
- print("Gemini failed")
113
-
114
- if not responses:
115
- print("\nNo models generated responses. Check your API keys.")
116
- return
117
-
118
- print(f"\nSuccessfully generated {len(responses)} responses")
119
-
120
- # Display side-by-side comparison
121
- display_responses_side_by_side(responses, prompt)
122
-
123
- # Ask if user wants evaluation
124
- evaluate = input("\nDo you want to evaluate these responses? (y/n): ").strip().lower()
125
-
126
- if evaluate in ['y', 'yes']:
127
- print("\n4. Performing comprehensive evaluation...")
128
  try:
129
- comprehensive_results = comprehensive_round_robin_evaluation(responses, prompt)
130
-
131
- # Save results
132
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
133
- csv_file = save_comprehensive_results(comprehensive_results, prompt, timestamp)
134
-
135
- if csv_file:
136
- print(f"Evaluation results saved to: {csv_file}")
137
-
138
- # Display evaluation summary
139
- print("\n=== EVALUATION SUMMARY ===")
140
- for model, data in comprehensive_results.items():
141
- avg_scores = data.get('average_scores', {})
142
- print(f"\n{model} Scores:")
143
- print(f" Helpfulness: {avg_scores.get('helpfulness', 'N/A')}")
144
- print(f" Correctness: {avg_scores.get('correctness', 'N/A')}")
145
- print(f" Coherence: {avg_scores.get('coherence', 'N/A')}")
146
- print(f" Clarity: {avg_scores.get('clarity', 'N/A')}")
147
- print(f" Evaluated by: {list(data.get('evaluations', {}).keys())}")
148
-
149
  except Exception as e:
150
- print(f"Evaluation failed: {e}")
151
-
152
- print("\n=== Response generation completed ===")
153
 
154
- def batch_generate_from_file(filename):
155
- """Generate responses for multiple prompts from a file."""
156
- if not os.path.exists(filename):
157
- print(f"File {filename} not found.")
158
- return
159
-
160
- print(f"=== Batch Response Generation from {filename} ===")
161
-
162
- with open(filename, 'r', encoding='utf-8') as f:
163
- prompts = [line.strip() for line in f if line.strip()]
164
-
165
- print(f"Found {len(prompts)} prompts to process")
166
-
167
- all_results = []
168
-
169
- for i, prompt in enumerate(prompts, 1):
170
- print(f"\n--- Processing Prompt {i}/{len(prompts)} ---")
171
- print(f"Prompt: {prompt}")
172
-
173
- # Generate responses
174
- responses = {}
175
-
176
- gpt_response = get_gpt4_response(prompt)
177
- if gpt_response:
178
- responses['GPT-4'] = gpt_response
179
-
180
- claude_response = get_claude_response(prompt)
181
- if claude_response:
182
- responses['Claude 3'] = claude_response
183
-
184
- gemini_response = get_gemini_response(prompt)
185
- if gemini_response:
186
- responses['Gemini 1.5'] = gemini_response
187
-
188
- if responses:
189
- # Display comparison
190
- display_responses_side_by_side(responses, prompt)
191
-
192
- # Evaluate
193
- try:
194
- comprehensive_results = comprehensive_round_robin_evaluation(responses, prompt)
195
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
196
- csv_file = save_comprehensive_results(comprehensive_results, prompt, f"{timestamp}_batch_{i}")
197
- print(f"Results saved for prompt {i}")
198
- all_results.append((prompt, comprehensive_results))
199
- except Exception as e:
200
- print(f"Evaluation failed for prompt {i}: {e}")
201
- else:
202
- print(f"No responses for prompt {i}")
203
-
204
- # Save summary
205
- if all_results:
206
- summary_file = f"results/batch_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
207
- os.makedirs("results", exist_ok=True)
208
-
209
- with open(summary_file, 'w', encoding='utf-8') as f:
210
- f.write("BATCH RESPONSE GENERATION SUMMARY\n")
211
- f.write("=" * 50 + "\n\n")
212
-
213
- for prompt, results in all_results:
214
- f.write(f"PROMPT: {prompt}\n")
215
- f.write("-" * 30 + "\n")
216
-
217
- for model, data in results.items():
218
- avg_scores = data.get('average_scores', {})
219
- f.write(f"{model}:\n")
220
- f.write(f" Helpfulness: {avg_scores.get('helpfulness', 'N/A')}\n")
221
- f.write(f" Correctness: {avg_scores.get('correctness', 'N/A')}\n")
222
- f.write(f" Coherence: {avg_scores.get('coherence', 'N/A')}\n")
223
- f.write(f" Clarity: {avg_scores.get('clarity', 'N/A')}\n\n")
224
-
225
- f.write("\n" + "="*50 + "\n\n")
226
-
227
- print(f"\nBatch summary saved to: {summary_file}")
228
-
229
- print("\n=== Batch generation completed ===")
230
-
231
- def generate_all_responses(prompt):
232
- """Generate responses from all models for a given prompt."""
233
- responses = {}
234
-
235
- # Generate responses from all models
236
- gpt_response = get_gpt4_response(prompt)
237
- if gpt_response:
238
- responses['GPT-4'] = gpt_response
239
-
240
- claude_response = get_claude_response(prompt)
241
- if claude_response:
242
- responses['Claude 3'] = claude_response
243
-
244
- gemini_response = get_gemini_response(prompt)
245
- if gemini_response:
246
- responses['Gemini 1.5'] = gemini_response
247
-
248
  return responses
249
-
250
- if __name__ == "__main__":
251
- print("=== Response Generator Tool ===")
252
- print("1. Interactive mode")
253
- print("2. Batch mode from file")
254
-
255
- choice = input("Choose mode (1 or 2): ").strip()
256
-
257
- if choice == "1":
258
- generate_and_compare_responses()
259
- elif choice == "2":
260
- filename = input("Enter filename with prompts (one per line): ").strip()
261
- batch_generate_from_file(filename)
262
- else:
263
- print("Invalid choice. Exiting.")
 
 
1
  import os
2
  from dotenv import load_dotenv
3
  from openai import OpenAI
4
  import anthropic
5
  import google.generativeai as genai
 
 
6
 
7
  # Load API keys from .env
8
  load_dotenv()
 
11
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
12
 
13
  def get_gpt4_response(prompt):
 
14
  try:
15
+ if "Recent info:" in prompt:
16
+ user_prompt, realtime_info = prompt.split("Recent info:", 1)
17
+ messages = [
18
+ {
19
+ "role": "system",
20
+ "content": (
21
+ "You are an expert ATS evaluator. You are comparing a job description (JD) and a resume to produce an ATS score. "
22
+ "Highlight matches, gaps, suggestions for improvement, and an overall score."
23
+ )
24
+ },
25
+ {"role": "user", "content": user_prompt.strip()},
26
+ {
27
+ "role": "user",
28
+ "content": (
29
+ f"Here is some recent real-time context for your reference:\n\n{realtime_info.strip()}\n\n"
30
+ "Based on this, tailor your response as if the data is accurate."
31
+ )
32
+ }
33
+ ]
34
+ else:
35
+ messages = [
36
+ {
37
+ "role": "system",
38
+ "content": (
39
+ "You are an expert ATS evaluator. You are comparing a job description (JD) and a resume to produce an ATS score. "
40
+ "Highlight matches, gaps, suggestions for improvement, and an overall score."
41
+ )
42
+ },
43
+ {"role": "user", "content": prompt}
44
+ ]
45
+
46
  response = openai_client.chat.completions.create(
47
  model="gpt-4",
48
+ messages=messages,
49
  temperature=0.7
50
  )
51
  return response.choices[0].message.content
52
+
53
  except Exception as e:
54
  print(f"Error with GPT-4: {e}")
55
+ return "GPT-4 failed."
56
 
57
  def get_claude_response(prompt):
 
58
  try:
59
  response = anthropic_client.messages.create(
60
  model="claude-3-opus-20240229",
 
65
  return response.content[0].text
66
  except Exception as e:
67
  print(f"Error with Claude 3: {e}")
68
+ return "Claude 3 failed."
69
 
70
  def get_gemini_response(prompt):
 
71
  try:
72
  model = genai.GenerativeModel("gemini-1.5-pro")
73
  response = model.generate_content(prompt)
74
  return response.text
75
  except Exception as e:
76
  print(f"Error with Gemini: {e}")
77
+ return "Gemini 1.5 failed."
78
 
79
+ def generate_all_responses_with_reasoning(prompt, selected_models=None):
80
+ all_models = {
81
+ "GPT-4": get_gpt4_response,
82
+ "Claude 3": get_claude_response,
83
+ "Gemini 1.5": get_gemini_response
84
+ }
85
+ models_to_use = selected_models if selected_models else list(all_models.keys())
86
 
87
  responses = {}
88
+ for model_name in models_to_use:
89
+ fetch_fn = all_models[model_name]
90
  try:
91
+ response = fetch_fn(prompt)
92
+ reason_prompt = (
93
+ f"Why did you generate this response to the prompt:\n\n"
94
+ f"\"{prompt}\"\n\n"
95
+ f"Your Response:\n\"{response}\"\n\n"
96
+ "Explain your reasoning behind structuring or phrasing it that way."
97
+ )
98
+ reasoning = fetch_fn(reason_prompt)
99
+ responses[model_name] = {"response": response, "reasoning": reasoning}
100
  except Exception as e:
101
+ responses[model_name] = {"response": "Failed", "reasoning": str(e)}
 
 
102
 
103
  return responses
round_robin_evaluator.py CHANGED
@@ -5,6 +5,7 @@ import google.generativeai as genai
5
  from dotenv import load_dotenv
6
  import csv
7
  import json
 
8
 
9
  # Load environment variables
10
  load_dotenv()
@@ -12,6 +13,16 @@ openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
12
  anthropic_client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API_KEY"))
13
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
14
 
 
15
  def evaluate_response(evaluator_model, prompt, target_model, response_text):
16
  """Evaluate a response using the specified evaluator model."""
17
  evaluation_prompt = (
@@ -19,26 +30,20 @@ def evaluate_response(evaluator_model, prompt, target_model, response_text):
19
  f"Here is the original prompt: \"{prompt}\"\n"
20
  f"Here is the response from {target_model}: \"{response_text}\"\n\n"
21
  f"Evaluate this response on the following criteria from 0 (worst) to 1 (best):\n"
22
- f"- Helpfulness: How useful and informative is the response?\n"
23
- f"- Correctness: How accurate and factually correct is the response?\n"
24
- f"- Coherence: How well-structured and logical is the response?\n"
25
- f"- Tone: How appropriate and professional is the tone?\n"
26
- f"- Accuracy: How precise and detailed is the information?\n"
27
- f"- Relevance: How well does the response address the prompt?\n"
28
- f"- Completeness: How comprehensive is the response?\n"
29
- f"- Clarity: How clear and easy to understand is the response?\n\n"
30
- f"Return the result in this exact JSON format:\n\n"
31
  f"{{\n"
32
- f" \"helpfulness\": <0-1>,\n"
33
- f" \"correctness\": <0-1>,\n"
34
- f" \"coherence\": <0-1>,\n"
35
- f" \"tone_score\": <0-1>,\n"
36
- f" \"accuracy\": <0-1>,\n"
37
- f" \"relevance\": <0-1>,\n"
38
- f" \"completeness\": <0-1>,\n"
39
- f" \"clarity\": <0-1>,\n"
40
- f" \"reasoning\": \"detailed explanation for the scores\",\n"
41
- f" \"notes\": \"additional observations about the response\"\n"
42
  f"}}"
43
  )
44
 
@@ -65,106 +70,70 @@ def evaluate_response(evaluator_model, prompt, target_model, response_text):
65
  else:
66
  print(f"Unknown evaluator model: {evaluator_model}")
67
  return None
68
-
69
- # Try to parse JSON response
70
- try:
71
- if isinstance(result, str):
72
- parsed = json.loads(result)
73
- else:
74
- parsed = result
75
  return parsed
76
- except json.JSONDecodeError:
77
  print(f"Failed to parse JSON from {evaluator_model} evaluation")
78
  return None
79
-
80
  except Exception as e:
81
  print(f"Error in {evaluator_model} evaluation: {str(e)}")
82
  return None
83
 
84
  def comprehensive_round_robin_evaluation(responses_dict, prompt):
85
- """
86
- Perform comprehensive round-robin evaluation where each model evaluates all other models.
87
-
88
- Args:
89
- responses_dict: Dictionary with model names as keys and response texts as values
90
- prompt: The original prompt
91
-
92
- Returns:
93
- Dictionary with comprehensive evaluation results
94
- """
95
  print("\nStarting comprehensive round-robin evaluation...")
96
-
97
- # Define the evaluation matrix
98
  evaluation_matrix = {
99
  "GPT-4": ["Claude 3", "Gemini 1.5"],
100
- "Claude 3": ["GPT-4", "Gemini 1.5"],
101
  "Gemini 1.5": ["GPT-4", "Claude 3"]
102
  }
103
-
104
- # Initialize results structure
105
  comprehensive_results = {}
106
-
107
- # For each model, collect evaluations from other models
108
  for target_model, response_text in responses_dict.items():
109
  print(f"\nCollecting evaluations for {target_model}...")
110
-
111
- # Initialize target model data
112
  comprehensive_results[target_model] = {
113
  'response': response_text,
114
  'evaluations': {},
115
  'average_scores': {}
116
  }
117
-
118
- # Get evaluations from other models
119
- evaluators = evaluation_matrix[target_model]
120
- for evaluator in evaluators:
121
  print(f" {evaluator} evaluating {target_model}...")
122
  evaluation = evaluate_response(evaluator, prompt, target_model, response_text)
123
-
124
  if evaluation:
125
  comprehensive_results[target_model]['evaluations'][evaluator] = evaluation
126
  print(f" {evaluator} evaluation completed")
127
  else:
128
  print(f" {evaluator} evaluation failed")
129
-
130
- # Calculate average scores across all evaluators
131
  if comprehensive_results[target_model]['evaluations']:
132
- metrics = ['helpfulness', 'correctness', 'coherence', 'tone_score',
133
- 'accuracy', 'relevance', 'completeness', 'clarity']
134
-
135
  for metric in metrics:
136
- scores = []
137
- for evaluator, eval_data in comprehensive_results[target_model]['evaluations'].items():
138
- if metric in eval_data and isinstance(eval_data[metric], (int, float)):
139
- scores.append(eval_data[metric])
140
-
141
- if scores:
142
- avg_score = sum(scores) / len(scores)
143
- comprehensive_results[target_model]['average_scores'][metric] = round(avg_score, 3)
144
- else:
145
- comprehensive_results[target_model]['average_scores'][metric] = 0.5
146
-
147
  print(f"\nComprehensive evaluation completed for {len(comprehensive_results)} models")
148
  return comprehensive_results
149
 
150
  def save_comprehensive_results(comprehensive_results, prompt, timestamp=None):
151
- """Save comprehensive evaluation results to CSV."""
152
  if timestamp is None:
153
- from datetime import datetime
154
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
155
-
156
  filename = f"results/comprehensive_eval_{timestamp}.csv"
157
-
158
- # Ensure results directory exists
159
  os.makedirs("results", exist_ok=True)
160
-
161
- # Prepare data for CSV
162
  rows = []
163
  for model, data in comprehensive_results.items():
164
- # Get average scores
165
  avg_scores = data.get('average_scores', {})
166
-
167
- # Create row for each evaluator
168
  for evaluator, evaluation in data.get('evaluations', {}).items():
169
  row = {
170
  'timestamp': timestamp,
@@ -192,44 +161,15 @@ def save_comprehensive_results(comprehensive_results, prompt, timestamp=None):
192
  'avg_clarity': avg_scores.get('clarity', 0.5)
193
  }
194
  rows.append(row)
195
-
196
- # Write to CSV
197
  if rows:
198
  fieldnames = list(rows[0].keys())
199
- with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
200
- writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
201
  writer.writeheader()
202
  writer.writerows(rows)
203
-
204
  print(f"Results saved to {filename}")
205
  return filename
206
  else:
207
  print("No results to save")
208
  return None
209
-
210
- def round_robin_evaluate_and_log(responses):
211
- """Legacy function for backward compatibility."""
212
- print("This function is deprecated. Use comprehensive_round_robin_evaluation instead.")
213
- return comprehensive_round_robin_evaluation(responses, "Legacy prompt")
214
-
215
- if __name__ == "__main__":
216
- # Test the evaluation system
217
- test_responses = {
218
- "GPT-4": "This is a test response from GPT-4.",
219
- "Claude 3": "This is a test response from Claude 3.",
220
- "Gemini 1.5": "This is a test response from Gemini 1.5."
221
- }
222
-
223
- test_prompt = "What is artificial intelligence?"
224
-
225
- print("Testing round-robin evaluation system...")
226
- results = comprehensive_round_robin_evaluation(test_responses, test_prompt)
227
-
228
- if results:
229
- print("\nTest completed successfully!")
230
- for model, data in results.items():
231
- print(f"\n{model} average scores:")
232
- for metric, score in data.get('average_scores', {}).items():
233
- print(f" {metric}: {score}")
234
- else:
235
- print("Test failed!")
 
5
  from dotenv import load_dotenv
6
  import csv
7
  import json
8
+ import re
9
 
10
  # Load environment variables
11
  load_dotenv()
 
13
  anthropic_client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API_KEY"))
14
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
15
 
16
+ def safe_parse_json(text):
17
+ """Extract and parse JSON from a possibly noisy LLM output."""
18
+ try:
19
+ match = re.search(r'{.*}', text, re.DOTALL)
20
+ if match:
21
+ return json.loads(match.group())
22
+ except Exception as e:
23
+ print(f"[Safe JSON Parse Error] {e}")
24
+ return None
25
+
26
  def evaluate_response(evaluator_model, prompt, target_model, response_text):
27
  """Evaluate a response using the specified evaluator model."""
28
  evaluation_prompt = (
 
30
  f"Here is the original prompt: \"{prompt}\"\n"
31
  f"Here is the response from {target_model}: \"{response_text}\"\n\n"
32
  f"Evaluate this response on the following criteria from 0 (worst) to 1 (best):\n"
33
+ f"- Helpfulness\n- Correctness\n- Coherence\n- Tone\n- Accuracy\n"
34
+ f"- Relevance\n- Completeness\n- Clarity\n\n"
35
36
  f"{{\n"
37
+ f" \"helpfulness\": <float>,\n"
38
+ f" \"correctness\": <float>,\n"
39
+ f" \"coherence\": <float>,\n"
40
+ f" \"tone_score\": <float>,\n"
41
+ f" \"accuracy\": <float>,\n"
42
+ f" \"relevance\": <float>,\n"
43
+ f" \"completeness\": <float>,\n"
44
+ f" \"clarity\": <float>,\n"
45
+ f" \"reasoning\": \"explanation\",\n"
46
+ f" \"notes\": \"additional remarks\"\n"
47
  f"}}"
48
  )
49
 
 
70
  else:
71
  print(f"Unknown evaluator model: {evaluator_model}")
72
  return None
73
+
74
+ parsed = safe_parse_json(result)
75
+ if parsed:
 
 
 
 
76
  return parsed
77
+ else:
78
  print(f"Failed to parse JSON from {evaluator_model} evaluation")
79
  return None
80
+
81
  except Exception as e:
82
  print(f"Error in {evaluator_model} evaluation: {str(e)}")
83
  return None
84
 
85
  def comprehensive_round_robin_evaluation(responses_dict, prompt):
 
86
  print("\nStarting comprehensive round-robin evaluation...")
87
+
 
88
  evaluation_matrix = {
89
  "GPT-4": ["Claude 3", "Gemini 1.5"],
90
+ "Claude 3": ["GPT-4", "Gemini 1.5"],
91
  "Gemini 1.5": ["GPT-4", "Claude 3"]
92
  }
93
+
 
94
  comprehensive_results = {}
95
+
 
96
  for target_model, response_text in responses_dict.items():
97
  print(f"\nCollecting evaluations for {target_model}...")
 
 
98
  comprehensive_results[target_model] = {
99
  'response': response_text,
100
  'evaluations': {},
101
  'average_scores': {}
102
  }
103
+
104
+ for evaluator in evaluation_matrix[target_model]:
 
 
105
  print(f" {evaluator} evaluating {target_model}...")
106
  evaluation = evaluate_response(evaluator, prompt, target_model, response_text)
 
107
  if evaluation:
108
  comprehensive_results[target_model]['evaluations'][evaluator] = evaluation
109
  print(f" {evaluator} evaluation completed")
110
  else:
111
  print(f" {evaluator} evaluation failed")
112
+
 
113
  if comprehensive_results[target_model]['evaluations']:
114
+ metrics = ['helpfulness', 'correctness', 'coherence', 'tone_score',
115
+ 'accuracy', 'relevance', 'completeness', 'clarity']
 
116
  for metric in metrics:
117
+ scores = [
118
+ eval_data[metric]
119
+ for eval_data in comprehensive_results[target_model]['evaluations'].values()
120
+ if metric in eval_data and isinstance(eval_data[metric], (int, float))
121
+ ]
122
+ comprehensive_results[target_model]['average_scores'][metric] = round(sum(scores) / len(scores), 3) if scores else 0.5
123
+
 
 
 
 
124
  print(f"\nComprehensive evaluation completed for {len(comprehensive_results)} models")
125
  return comprehensive_results
126
 
127
  def save_comprehensive_results(comprehensive_results, prompt, timestamp=None):
 
128
  if timestamp is None:
 
129
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
130
+
131
  filename = f"results/comprehensive_eval_{timestamp}.csv"
 
 
132
  os.makedirs("results", exist_ok=True)
133
+
 
134
  rows = []
135
  for model, data in comprehensive_results.items():
 
136
  avg_scores = data.get('average_scores', {})
 
 
137
  for evaluator, evaluation in data.get('evaluations', {}).items():
138
  row = {
139
  'timestamp': timestamp,
 
161
  'avg_clarity': avg_scores.get('clarity', 0.5)
162
  }
163
  rows.append(row)
164
+
 
165
  if rows:
166
  fieldnames = list(rows[0].keys())
167
+ with open(filename, 'w', newline='', encoding='utf-8') as f:
168
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
169
  writer.writeheader()
170
  writer.writerows(rows)
 
171
  print(f"Results saved to {filename}")
172
  return filename
173
  else:
174
  print("No results to save")
175
  return None