Commit d46a635 · Parent(s): 4640243

Enhanced evaluation system

Files changed:
- gradio_full_llm_eval.py  +231 -313
- information              +136 -0
- requirements.txt         +4 -1
- response_generator.py    +55 -215
- round_robin_evaluator.py +50 -110
gradio_full_llm_eval.py
CHANGED
@@ -1,336 +1,254 @@
+# gradio_full_llm_eval.py – Final Updated Version with ATS Scoring and Visualized UI
 import gradio as gr
 import os
 import pandas as pd
-import plotly.graph_objects as go
 import plotly.express as px
+import plotly.graph_objects as go
+import plotly.io as pio
+import zipfile
 import json
+from datetime import datetime
+from dotenv import load_dotenv

-from
-from round_robin_evaluator import comprehensive_round_robin_evaluation, save_comprehensive_results
+from response_generator import generate_all_responses_with_reasoning
+from round_robin_evaluator import comprehensive_round_robin_evaluation
 from realtime_detector import is_realtime_prompt
 from search_fallback import get_google_snippets
-from llm_prompt_eval_analysis import generate_visualizations, analyze_evaluation_data

-# Load environment variables
-from dotenv import load_dotenv
 load_dotenv()
+pio.kaleido.scope.default_format = "png"

-        if results["is_realtime"]:
-            # Get Google search results
-            search_results = get_google_snippets(prompt)
-            results["search_results"] = search_results
-            # Enhance prompt with search results
-            enhanced_prompt = f"{prompt}\n\nRecent information: {search_results}"
-        else:
-            enhanced_prompt = prompt
-    except Exception as e:
-        print(f"Real-time detection error: {e}")
-        enhanced_prompt = prompt
-    else:
-        enhanced_prompt = prompt
-
-    # Step 2: Generate responses from all models
+metrics = ['helpfulness', 'correctness', 'coherence', 'tone_score',
+           'accuracy', 'relevance', 'completeness', 'clarity']
+
+def extract_text_from_resume(file):
+    ext = os.path.splitext(file.name)[1].lower()
+    if ext == ".pdf":
+        import fitz
+        with fitz.open(file.name) as doc:
+            return "\n".join(page.get_text() for page in doc)
+    elif ext == ".docx":
+        import docx
+        doc = docx.Document(file.name)
+        return "\n".join(p.text for p in doc.paragraphs)
+    elif ext == ".txt":
+        return file.read().decode('utf-8')
+    return ""
+
+def ats_score_advanced(response, resume, jd):
+    prompt = f"""
+You are a professional ATS scoring engine. Compare the generated response to the candidate's resume and job description using:
+1. Keyword Matching
+2. Section Weighting
+3. Semantic Similarity
+4. Recency/Frequency
+5. Penalty Detection
+6. Aggregation
+
+Resume:
+{resume}
+
+Job Description:
+{jd}
+
+Response:
+{response}
+
+Return JSON:
+{{"ats_score": <0-100>, "strengths": ["..."], "gaps": ["..."], "suggestions": ["..."]}}
+"""
+    from openai import OpenAI
+    openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
     try:
+        res = openai_client.chat.completions.create(
+            model="gpt-4",
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0
+        )
+        return json.loads(res.choices[0].message.content.strip())
+    except:
+        return {"ats_score": 50, "strengths": [], "gaps": [], "suggestions": ["Check formatting."]}
+
+def create_visualizations(df, results_dir):
+    image_files = []
+    summary = df.groupby('target_model')[metrics].mean().reset_index()
+
+    heatmap = px.imshow(summary[metrics].values, x=metrics, y=summary['target_model'],
+                        labels=dict(x="Metric", y="Model", color="Score"),
+                        title="Heatmap: Metrics Across Models", color_continuous_scale='Viridis')
+    heatmap_path = os.path.join(results_dir, "heatmap.png")
+    heatmap.write_image(heatmap_path)
+    image_files.append(heatmap_path)
+
+    radar = go.Figure()
+    for _, row in summary.iterrows():
+        radar.add_trace(go.Scatterpolar(r=list(row[metrics]), theta=metrics, fill='toself', name=row['target_model']))
+    radar.update_layout(title="Radar Chart: Model Score Profiles", polar=dict(radialaxis=dict(visible=True, range=[0, 1])))
+    radar_path = os.path.join(results_dir, "radar.png")
+    radar.write_image(radar_path)
+    image_files.append(radar_path)
+
+    bar = px.bar(summary.melt(id_vars='target_model'), x='variable', y='value', color='target_model', barmode='group',
+                 title="Bar Chart: Metric Comparison")
+    bar_path = os.path.join(results_dir, "barchart.png")
+    bar.write_image(bar_path)
+    image_files.append(bar_path)
+
+    return (heatmap, radar, bar), image_files
+
+def format_ats_feedback(score, strengths, gaps, suggestions):
+    color = "🟢" if score >= 75 else "🟡" if score >= 50 else "🔴"
+    return f"""
+### ATS Match Score: ~{score}% {color}
+
+#### **Strengths / High Matches:**
+{chr(10).join([f"* {s}" for s in strengths]) if strengths else "* None found."}
+
+#### **Partial or Missing:**
+{chr(10).join([f"* {g}" for g in gaps]) if gaps else "* None mentioned."}
+
+#### **How to Improve ATS Score:**
+{chr(10).join([f"1. {s}" for s in suggestions]) if suggestions else "1. Add missing skills."}
+"""
+
+def process_prompt(prompt, enable_realtime, enable_eval, enable_analysis, user_file, model_selection):
+    selected_models = [m for m, enabled in zip(["GPT-4", "Claude 3", "Gemini 1.5"], model_selection) if enabled]
+    resume_text = ""
+    batch_mode = user_file and user_file.name.endswith(".csv")
+    resume_mode = user_file and user_file.name.lower().endswith(('.pdf', '.docx', '.txt'))
+
+    prompts = [prompt]
+    ats_summary_texts = []
+    search_results = ""
+
+    if batch_mode:
+        df_batch = pd.read_csv(user_file.name)
+        prompts = df_batch['prompt'].dropna().tolist()
+    elif resume_mode:
+        resume_text = extract_text_from_resume(user_file)
+
+    all_rows, all_charts = [], []
+    zip_path, ats_table_markdown = None, ""
+
+    for prompt_text in prompts:
+        search_results = get_google_snippets(prompt_text) if enable_realtime and is_realtime_prompt(prompt_text) else ""
+        final_prompt = f"{prompt_text}\n\nRecent info: {search_results}" if search_results else prompt_text
+        responses = generate_all_responses_with_reasoning(final_prompt, selected_models)
+
+        ats_rows = []
+        for model in responses:
+            model_resp = responses[model]['response']
+            if resume_text:
+                ats_result = ats_score_advanced(model_resp, resume_text, prompt_text)
+                feedback = format_ats_feedback(ats_result['ats_score'], ats_result.get('strengths', []), ats_result.get('gaps', []), ats_result.get('suggestions', []))
+                responses[model]['ats_embed'] = f"### Response\n\n{model_resp}\n\n---\n\n### ATS Evaluation\n\n{feedback}"
+                ats_rows.append(f"| {model} | {ats_result['ats_score']} | {', '.join(ats_result.get('strengths', []))} | {', '.join(ats_result.get('suggestions', []))} |")
+            else:
+                responses[model]['ats_embed'] = f"### Response\n\n{model_resp}\n\n---\n\n**Explainability:**\n{responses[model]['reasoning']}"
+        if ats_rows:
+            ats_table_markdown = "| Model | Score | Strengths | Suggestions |\n|-------|-------|-----------|-------------|\n" + "\n".join(ats_rows)
+
+        if enable_eval:
+            compact = {k: v['response'] for k, v in responses.items()}
+            eval_result = comprehensive_round_robin_evaluation(compact, final_prompt)
+            for model, data in eval_result.items():
+                for evaluator, scores in data['evaluations'].items():
                     row = {
+                        'prompt': prompt_text,
                         'target_model': model,
                         'evaluator': evaluator,
-                        '
-                        '
-                        'coherence': eval_data.get('coherence', 0.5),
-                        'clarity': eval_data.get('clarity', 0.5),
-                        'response': data.get('response', '')
+                        'response': responses[model]['response'],
+                        'explainability': responses[model]['reasoning']
                     }
-        for metric, score in avg_scores.items():
-            evaluation_text += f"  {metric}: {score}\n"
-        evaluation_text += f"  Evaluated by: {list(data.get('evaluations', {}).keys())}\n"
-    else:
-        evaluation_text = "No evaluation performed."
-
-    # Format search results
-    search_text = ""
-    if search_results and is_realtime:
-        search_text = "REAL-TIME SEARCH RESULTS:\n" + "="*50 + "\n"
-        search_text += search_results
-    elif is_realtime:
-        search_text = "Real-time query detected but search results unavailable."
-    else:
-        search_text = "Not a real-time query."
-
-    # Create visualizations
-    charts = []
-    if analysis is not None and not analysis.empty:
-        charts = create_visualizations(analysis)
-
-    return responses_text, evaluation_text, search_text, charts
-
-def create_visualizations(df):
-    """Create Plotly visualizations for the analysis."""
-    charts = []
-
-    try:
-        # 1. Model Performance Comparison
-        if 'target_model' in df.columns:
-            metrics = ['helpfulness', 'correctness', 'coherence', 'clarity']
-
-            for metric in metrics:
-                if metric in df.columns:
-                    fig = px.box(df, x='target_model', y=metric,
-                                 title=f'{metric.title()} Scores by Model',
-                                 color='target_model')
-                    fig.update_layout(showlegend=False)
-                    charts.append(fig)
-
-        # 2. Evaluator Bias Analysis
-        if 'evaluator' in df.columns:
-            metrics = ['helpfulness', 'correctness', 'coherence', 'clarity']
-
-            for metric in metrics:
-                if metric in df.columns:
-                    fig = px.box(df, x='evaluator', y=metric,
-                                 title=f'{metric.title()} Scores by Evaluator',
-                                 color='evaluator')
-                    fig.update_layout(showlegend=False)
-                    charts.append(fig)
-
-        # 3. Heatmap of Cross-Evaluations
-        if 'target_model' in df.columns and 'evaluator' in df.columns and 'helpfulness' in df.columns:
-            pivot_data = df.pivot_table(
-                values='helpfulness',
-                index='target_model',
-                columns='evaluator',
-                aggfunc='mean'
-            ).fillna(0)
-
-            fig = px.imshow(pivot_data.values,
-                            x=pivot_data.columns,
-                            y=pivot_data.index,
-                            title='Cross-Evaluation Heatmap (Helpfulness)',
-                            color_continuous_scale='RdYlBu_r',
-                            aspect='auto')
-            fig.update_layout(xaxis_title='Evaluator', yaxis_title='Target Model')
-            charts.append(fig)
-
-    except Exception as e:
-        print(f"Visualization error: {e}")
-
-    return charts
-
-def export_results(responses_text, evaluation_text, search_text):
-    """Export results to a text file."""
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    filename = f"results/export_{timestamp}.txt"
-
-    os.makedirs("results", exist_ok=True)
-
-    with open(filename, 'w', encoding='utf-8') as f:
-        f.write("LLM COMPARISON RESULTS\n")
-        f.write("="*50 + "\n")
-        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
-
-        f.write(responses_text + "\n\n")
-        f.write(evaluation_text + "\n\n")
-        f.write(search_text + "\n\n")
-
-    return f"Results exported to {filename}"
-
-# Create Gradio interface
+                    row.update({k: scores.get(k, 0.5) for k in metrics})
+                    row.update({f"avg_{k}": data['average_scores'].get(k, 0.5) for k in metrics})
+                    all_rows.append(row)
+
+    df_all = pd.DataFrame(all_rows)
+    if not df_all.empty:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        results_dir = f"results/batch_{timestamp}"
+        os.makedirs(results_dir, exist_ok=True)
+        csv_path = os.path.join(results_dir, "evaluation.csv")
+        df_all.to_csv(csv_path, index=False)
+        (heatmap, radar, bar), chart_paths = create_visualizations(df_all, results_dir)
+        all_charts = [heatmap, radar, bar]
+        zip_path = os.path.join(results_dir, "bundle.zip")
+        with zipfile.ZipFile(zip_path, 'w') as zipf:
+            zipf.write(csv_path, arcname="evaluation.csv")
+            for chart in chart_paths:
+                zipf.write(chart, arcname=os.path.basename(chart))
+            if batch_mode:
+                df_batch['ATS Summary'] = ats_summary_texts
+                df_batch.to_csv(os.path.join(results_dir, "batch_prompts_output.csv"), index=False)
+                zipf.write(os.path.join(results_dir, "batch_prompts_output.csv"), arcname="batch_prompts_output.csv")
+
+    return tuple(
+        responses[model].get('ats_embed', responses[model]['response']) for model in ["GPT-4", "Claude 3", "Gemini 1.5"]
+    ) + (
+        search_results or "N/A",
+        *all_charts,
+        df_all[['target_model', 'evaluator'] + metrics] if not df_all.empty else pd.DataFrame(),
+        ats_table_markdown,
+        zip_path
+    )
+
+def download_results(path):
+    return path if path and os.path.exists(path) else None
+
 def create_interface():
-    ""
+    with gr.Blocks(title="LLM Comparison Hub") as demo:
+        gr.Markdown("""
+        # LLM Comparison Hub
+        This app compares LLM responses using round-robin evaluations, with real-time query detection and comprehensive analysis.
+
+        **How to use:**
+        - Enter a prompt (JD or query)
+        - Upload a resume (PDF/DOCX/TXT) or a CSV with prompts
+        - Select models
+        - Click evaluate
+
+        **Features:**
+        - Real-time web search fallback
+        - Resume vs JD ATS scoring (optional)
+        - Batch CSV prompt evaluation
+        - Visualizations (Heatmap, Radar, Bar)
+        - ZIP export of all results
+        """)
         with gr.Row():
-            with gr.Column(
-                gr.
-            )
-
-            realtime_checkbox = gr.Checkbox(label="Enable real-time detection", value=True)
-            evaluation_checkbox = gr.Checkbox(label="Enable evaluation", value=True)
-            analysis_checkbox = gr.Checkbox(label="Enable analysis", value=True)
-
-            process_btn = gr.Button("Process Prompt", variant="primary")
-
-            # API status
-            gr.Markdown("## API Status")
-            api_status_display = gr.Textbox(
-                value=api_status_text,
-                label="API Keys",
-                lines=len(api_status) + 3,
-                interactive=False
-            )
-
-        with gr.Column(scale=3):
-            # Output section
-            gr.Markdown("## Results")
-
+            with gr.Column():
+                prompt = gr.Textbox(label="Enter Prompt", lines=4)
+                user_file = gr.File(label="Upload Resume or CSV", file_types=[".pdf", ".docx", ".txt", ".csv"])
+                model_selector = gr.CheckboxGroup(label="Select Models", choices=["GPT-4", "Claude 3", "Gemini 1.5"], value=["GPT-4", "Claude 3", "Gemini 1.5"])
+                enable_realtime = gr.Checkbox(label="Enable real-time detection", value=True)
+                enable_eval = gr.Checkbox(label="Enable evaluation", value=True)
+                enable_analysis = gr.Checkbox(label="Enable analysis", value=True)
+                submit = gr.Button("Run Evaluation")
+
+            with gr.Column():
                 with gr.Tabs():
-            with gr.
-            with gr.TabItem("Search Results"):
-                search_output = gr.Textbox(
-                    label="Real-time Search Results",
-                    lines=10,
-                    interactive=False
-                )
-
-            with gr.TabItem("Visualizations"):
-                charts_output = gr.Plot(label="Analysis Charts")
-
-        # Export button
-        export_btn = gr.Button("Export Results")
-        export_output = gr.Textbox(label="Export Status", interactive=False)
-
-        # Event handlers
-        process_btn.click(
+                    with gr.Tab("GPT-4"): gpt_out = gr.Markdown()
+                    with gr.Tab("Claude 3"): claude_out = gr.Markdown()
+                    with gr.Tab("Gemini 1.5"): gemini_out = gr.Markdown()
+                    with gr.Tab("Evaluation Table"): df_out = gr.Dataframe()
+                    with gr.Tab("ATS Evaluation"): ats_summary = gr.Markdown()
+                    with gr.Tab("Search Results"): search_out = gr.Markdown()
+                    with gr.Tab("Visualizations"):
+                        heatmap_plot = gr.Plot()
+                        radar_plot = gr.Plot()
+                        bar_plot = gr.Plot()
+                export_btn = gr.Button("Download ZIP Bundle")
+                zip_output = gr.File(file_types=[".zip"], interactive=False, visible=True)
+
+        submit.click(
             fn=process_prompt,
-            inputs=[
-            outputs=[
+            inputs=[prompt, enable_realtime, enable_eval, enable_analysis, user_file, model_selector],
+            outputs=[gpt_out, claude_out, gemini_out, search_out, heatmap_plot, radar_plot, bar_plot, df_out, ats_summary, zip_output]
         )
-
-            inputs=[responses_output, evaluation_output, search_output],
-            outputs=[export_output]
-        )
-
-        return interface
+        export_btn.click(download_results, inputs=[zip_output], outputs=[zip_output])
+
+    return demo

 if __name__ == "__main__":
-    interface.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False,
-        debug=True
-    )
+    app = create_interface()
+    app.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True)
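To exercise the new ATS path outside the Gradio UI, a minimal sketch along these lines should work. It assumes the pinned requirements are installed, the same .env API keys the app loads at import time, and a local resume.pdf; the LocalFile wrapper and the JD string are hypothetical stand-ins, not part of the commit.

# Hypothetical smoke test for the new ATS helpers (illustrative only).
from dotenv import load_dotenv
load_dotenv()  # OPENAI_API_KEY must be set before the module-level clients are built
from gradio_full_llm_eval import extract_text_from_resume, ats_score_advanced, format_ats_feedback

class LocalFile:
    # Mimics the only interface extract_text_from_resume needs for PDF/DOCX: a .name attribute.
    def __init__(self, name):
        self.name = name

resume_text = extract_text_from_resume(LocalFile("resume.pdf"))
jd = "Senior Python engineer with LLM evaluation experience"  # placeholder JD
result = ats_score_advanced("Generated cover letter text...", resume_text, jd)
print(format_ats_feedback(result["ats_score"], result.get("strengths", []),
                          result.get("gaps", []), result.get("suggestions", [])))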
information
ADDED
@@ -0,0 +1,136 @@
+LLM-Compare-Hub Project File Structure and Use Cases
+====================================================
+
+Core Application Files
+----------------------
+
+gradio_full_llm_eval.py (1.0B)
+- Use Case: Main Gradio web interface
+- Function: Orchestrates the entire application, provides user-friendly web UI
+- Features: Prompt input, response display, evaluation results, analysis generation
+- Status: Modular - delegates all logic to other files
+
+response_generator.py (8.6KB)
+- Use Case: LLM response generation and comparison
+- Function: Generates responses from GPT-4, Claude 3, and Gemini 1.5
+- Features: Side-by-side response comparison, batch processing, optional evaluation
+- Status: Standalone tool + used by Gradio app
+
+round_robin_evaluator.py (9.3KB)
+- Use Case: Comprehensive model evaluation system
+- Function: Each model evaluates all other models (GPT-4 evaluates Claude/Gemini, etc.)
+- Features: Multi-metric scoring, CSV export, detailed reasoning
+- Status: Core evaluation engine used by all other components
+
+llm_prompt_eval_analysis.py (12KB)
+- Use Case: Data analysis and visualization
+- Function: Analyzes evaluation results, generates charts and reports
+- Features: Statistical analysis, correlation matrices, performance comparisons
+- Status: Standalone analysis tool + used by Gradio app
+
+llm_response_logger.py (6.3KB)
+- Use Case: Quick testing and logging tool
+- Function: Rapid testing of all models with single or batch prompts
+- Features: Quick evaluation, CSV export, batch processing
+- Status: Standalone testing tool + used by Gradio app
+
+Supporting Modules
+------------------
+
+realtime_detector.py (923B)
+- Use Case: Real-time query detection
+- Function: Determines if a prompt needs current information
+- Features: Uses GPT-3.5-turbo to classify real-time vs. general queries
+- Status: Utility module used by response generation
+
+search_fallback.py (1.6KB)
+- Use Case: Google search integration
+- Function: Fetches current information for real-time queries
+- Features: Google Custom Search API integration, result formatting
+- Status: Utility module used by response generation
+
+Configuration & Documentation
+-----------------------------
+
+requirements.txt (232B)
+- Use Case: Python dependencies
+- Function: Lists all required packages and versions
+- Features: Gradio, OpenAI, Anthropic, Google AI, pandas, matplotlib, etc.
+- Status: Essential for project setup
+
+.env (not shown - should exist)
+- Use Case: API key configuration
+- Function: Stores all API keys securely
+- Features: OpenAI, Claude, Gemini, Google Search API keys
+- Status: Essential for functionality
+
+.gitignore (661B)
+- Use Case: Git version control
+- Function: Excludes sensitive files from version control
+- Features: API keys, results, cache files, etc.
+- Status: Essential for security
+
+README.md (4.1KB)
+- Use Case: Project documentation
+- Function: Setup instructions, usage guide, feature descriptions
+- Features: Installation, configuration, usage examples
+- Status: Essential for users and collaborators
+
+Testing & Development
+---------------------
+
+test_standalone_tools.py (4.1KB)
+- Use Case: Testing and demonstration
+- Function: Shows how to use all standalone tools
+- Features: Quick start guide, sample prompts, tool explanations
+- Status: Development/testing tool
+
+__pycache__/ (directory)
+- Use Case: Python cache
+- Function: Stores compiled Python bytecode
+- Features: Improves import performance
+- Status: Auto-generated, can be deleted
+
+Generated Files (when running the app)
+--------------------------------------
+
+results/ (directory - created when needed)
+- Use Case: Evaluation results storage
+- Function: Stores CSV files with evaluation data
+- Features: Timestamped files, comprehensive evaluation data
+- Status: Auto-generated during evaluation
+
+analysis_results/ (directory - created when needed)
+- Use Case: Analysis output storage
+- Function: Stores charts, reports, and visualizations
+- Features: Performance charts, correlation matrices, analysis reports
+- Status: Auto-generated during analysis
+
+Project Summary
+===============
+
+Your project has a clean, modular architecture with:
+- 4 core functional modules (response generation, evaluation, analysis, logging)
+- 2 utility modules (real-time detection, search integration)
+- 1 main interface (Gradio web app)
+- Complete configuration (requirements, environment, documentation)
+- Testing tools for development and demonstration
+
+All files serve specific purposes and work together to provide a comprehensive LLM comparison and evaluation system.
+
+Key Features:
+- Multi-model response generation (GPT-4, Claude 3, Gemini 1.5)
+- Comprehensive round-robin evaluation system
+- Real-time query detection and search integration
+- Advanced data analysis and visualization
+- Batch processing capabilities
+- Clean, production-ready code without emojis
+- Modular architecture for maintainability
+- Complete web interface via Gradio
+- Standalone tools for automation and testing
+
+Usage:
+1. Set up API keys in the .env file
+2. Install dependencies: pip install -r requirements.txt
+3. Run the main app: python gradio_full_llm_eval.py
+4. Or use the standalone tools for specific tasks
requirements.txt
CHANGED
@@ -10,4 +10,7 @@ python-dotenv>=1.0.0
 requests>=2.31.0
 tqdm>=4.65.0
 scikit-learn>=1.3.0
-plotly>=5.18.0
+plotly>=5.18.0
+kaleido>=0.2.1
+PyMuPDF>=1.23.0
+python-docx>=1.1.0
response_generator.py
CHANGED
@@ -1,11 +1,8 @@
-import csv
 import os
 from dotenv import load_dotenv
 from openai import OpenAI
 import anthropic
 import google.generativeai as genai
-from round_robin_evaluator import comprehensive_round_robin_evaluation, save_comprehensive_results
-from datetime import datetime

 # Load API keys from .env
 load_dotenv()
@@ -14,20 +11,50 @@ anthropic_client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API_KEY"))
 genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

 def get_gpt4_response(prompt):
-    """Get response from GPT-4."""
     try:
+        if "Recent info:" in prompt:
+            user_prompt, realtime_info = prompt.split("Recent info:", 1)
+            messages = [
+                {
+                    "role": "system",
+                    "content": (
+                        "You are an expert ATS evaluator. You are comparing a job description (JD) and a resume to produce an ATS score. "
+                        "Highlight matches, gaps, suggestions for improvement, and an overall score."
+                    )
+                },
+                {"role": "user", "content": user_prompt.strip()},
+                {
+                    "role": "user",
+                    "content": (
+                        f"Here is some recent real-time context for your reference:\n\n{realtime_info.strip()}\n\n"
+                        "Based on this, tailor your response as if the data is accurate."
+                    )
+                }
+            ]
+        else:
+            messages = [
+                {
+                    "role": "system",
+                    "content": (
+                        "You are an expert ATS evaluator. You are comparing a job description (JD) and a resume to produce an ATS score. "
+                        "Highlight matches, gaps, suggestions for improvement, and an overall score."
+                    )
+                },
+                {"role": "user", "content": prompt}
+            ]
+
         response = openai_client.chat.completions.create(
             model="gpt-4",
-            messages=
+            messages=messages,
             temperature=0.7
         )
         return response.choices[0].message.content
+
     except Exception as e:
         print(f"Error with GPT-4: {e}")
-        return
+        return "GPT-4 failed."

 def get_claude_response(prompt):
-    """Get response from Claude."""
     try:
         response = anthropic_client.messages.create(
             model="claude-3-opus-20240229",
@@ -38,226 +65,39 @@ def get_claude_response(prompt):
         return response.content[0].text
     except Exception as e:
         print(f"Error with Claude 3: {e}")
-        return
+        return "Claude 3 failed."

 def get_gemini_response(prompt):
-    """Get response from Gemini."""
     try:
         model = genai.GenerativeModel("gemini-1.5-pro")
         response = model.generate_content(prompt)
         return response.text
     except Exception as e:
         print(f"Error with Gemini: {e}")
-        return
+        return "Gemini 1.5 failed."

-def
-    if len(models) == 0:
-        print("No responses to display")
-        return
-
-    # Display responses
-    for i, model in enumerate(models, 1):
-        response = responses[model]
-        print(f"\n{i}. {model} RESPONSE:")
-        print("-" * 40)
-        print(response)
-        print("-" * 40)
-        print(f"Length: {len(response)} characters")
-        print()

-def generate_and_compare_responses():
-    """Generate responses from all models and display comparison."""
-    print("=== Response Generator - Model Comparison Tool ===\n")
-
-    # Get prompt from user
-    prompt = input("Enter your prompt: ")
-    if not prompt.strip():
-        print("No prompt provided. Exiting.")
-        return
-
-    print(f"\nGenerating responses for: '{prompt}'")
-    print("=" * 60)
-
-    # Collect responses from all models
+def generate_all_responses_with_reasoning(prompt, selected_models=None):
+    all_models = {
+        "GPT-4": get_gpt4_response,
+        "Claude 3": get_claude_response,
+        "Gemini 1.5": get_gemini_response
+    }
+    models_to_use = selected_models if selected_models else list(all_models.keys())
+
     responses = {}
-
-    gpt_response = get_gpt4_response(prompt)
-    if gpt_response:
-        responses['GPT-4'] = gpt_response
-        print("GPT-4 response generated")
-    else:
-        print("GPT-4 failed")
-
-    print("\n2. Generating Claude response...")
-    claude_response = get_claude_response(prompt)
-    if claude_response:
-        responses['Claude 3'] = claude_response
-        print("Claude response generated")
-    else:
-        print("Claude failed")
-
-    print("\n3. Generating Gemini response...")
-    gemini_response = get_gemini_response(prompt)
-    if gemini_response:
-        responses['Gemini 1.5'] = gemini_response
-        print("Gemini response generated")
-    else:
-        print("Gemini failed")
-
-    if not responses:
-        print("\nNo models generated responses. Check your API keys.")
-        return
-
-    print(f"\nSuccessfully generated {len(responses)} responses")
-
-    # Display side-by-side comparison
-    display_responses_side_by_side(responses, prompt)
-
-    # Ask if user wants evaluation
-    evaluate = input("\nDo you want to evaluate these responses? (y/n): ").strip().lower()
-
-    if evaluate in ['y', 'yes']:
-        print("\n4. Performing comprehensive evaluation...")
+    for model_name in models_to_use:
+        fetch_fn = all_models[model_name]
         try:
-            # Display evaluation summary
-            print("\n=== EVALUATION SUMMARY ===")
-            for model, data in comprehensive_results.items():
-                avg_scores = data.get('average_scores', {})
-                print(f"\n{model} Scores:")
-                print(f"  Helpfulness: {avg_scores.get('helpfulness', 'N/A')}")
-                print(f"  Correctness: {avg_scores.get('correctness', 'N/A')}")
-                print(f"  Coherence: {avg_scores.get('coherence', 'N/A')}")
-                print(f"  Clarity: {avg_scores.get('clarity', 'N/A')}")
-                print(f"  Evaluated by: {list(data.get('evaluations', {}).keys())}")
-
+            response = fetch_fn(prompt)
+            reason_prompt = (
+                f"Why did you generate this response to the prompt:\n\n"
+                f"\"{prompt}\"\n\n"
+                f"Your Response:\n\"{response}\"\n\n"
+                "Explain your reasoning behind structuring or phrasing it that way."
+            )
+            reasoning = fetch_fn(reason_prompt)
+            responses[model_name] = {"response": response, "reasoning": reasoning}
         except Exception as e:
-
-    print("\n=== Response generation completed ===")
+            responses[model_name] = {"response": "Failed", "reasoning": str(e)}

-def batch_generate_from_file(filename):
-    """Generate responses for multiple prompts from a file."""
-    if not os.path.exists(filename):
-        print(f"File {filename} not found.")
-        return
-
-    print(f"=== Batch Response Generation from {filename} ===")
-
-    with open(filename, 'r', encoding='utf-8') as f:
-        prompts = [line.strip() for line in f if line.strip()]
-
-    print(f"Found {len(prompts)} prompts to process")
-
-    all_results = []
-
-    for i, prompt in enumerate(prompts, 1):
-        print(f"\n--- Processing Prompt {i}/{len(prompts)} ---")
-        print(f"Prompt: {prompt}")
-
-        # Generate responses
-        responses = {}
-
-        gpt_response = get_gpt4_response(prompt)
-        if gpt_response:
-            responses['GPT-4'] = gpt_response
-
-        claude_response = get_claude_response(prompt)
-        if claude_response:
-            responses['Claude 3'] = claude_response
-
-        gemini_response = get_gemini_response(prompt)
-        if gemini_response:
-            responses['Gemini 1.5'] = gemini_response
-
-        if responses:
-            # Display comparison
-            display_responses_side_by_side(responses, prompt)
-
-            # Evaluate
-            try:
-                comprehensive_results = comprehensive_round_robin_evaluation(responses, prompt)
-                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-                csv_file = save_comprehensive_results(comprehensive_results, prompt, f"{timestamp}_batch_{i}")
-                print(f"Results saved for prompt {i}")
-                all_results.append((prompt, comprehensive_results))
-            except Exception as e:
-                print(f"Evaluation failed for prompt {i}: {e}")
-        else:
-            print(f"No responses for prompt {i}")
-
-    # Save summary
-    if all_results:
-        summary_file = f"results/batch_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
-        os.makedirs("results", exist_ok=True)
-
-        with open(summary_file, 'w', encoding='utf-8') as f:
-            f.write("BATCH RESPONSE GENERATION SUMMARY\n")
-            f.write("=" * 50 + "\n\n")
-
-            for prompt, results in all_results:
-                f.write(f"PROMPT: {prompt}\n")
-                f.write("-" * 30 + "\n")
-
-                for model, data in results.items():
-                    avg_scores = data.get('average_scores', {})
-                    f.write(f"{model}:\n")
-                    f.write(f"  Helpfulness: {avg_scores.get('helpfulness', 'N/A')}\n")
-                    f.write(f"  Correctness: {avg_scores.get('correctness', 'N/A')}\n")
-                    f.write(f"  Coherence: {avg_scores.get('coherence', 'N/A')}\n")
-                    f.write(f"  Clarity: {avg_scores.get('clarity', 'N/A')}\n\n")
-
-                f.write("\n" + "="*50 + "\n\n")
-
-        print(f"\nBatch summary saved to: {summary_file}")
-
-    print("\n=== Batch generation completed ===")
-
-def generate_all_responses(prompt):
-    """Generate responses from all models for a given prompt."""
-    responses = {}
-
-    # Generate responses from all models
-    gpt_response = get_gpt4_response(prompt)
-    if gpt_response:
-        responses['GPT-4'] = gpt_response
-
-    claude_response = get_claude_response(prompt)
-    if claude_response:
-        responses['Claude 3'] = claude_response
-
-    gemini_response = get_gemini_response(prompt)
-    if gemini_response:
-        responses['Gemini 1.5'] = gemini_response
-
     return responses
-
-if __name__ == "__main__":
-    print("=== Response Generator Tool ===")
-    print("1. Interactive mode")
-    print("2. Batch mode from file")
-
-    choice = input("Choose mode (1 or 2): ").strip()
-
-    if choice == "1":
-        generate_and_compare_responses()
-    elif choice == "2":
-        filename = input("Enter filename with prompts (one per line): ").strip()
-        batch_generate_from_file(filename)
-    else:
-        print("Invalid choice. Exiting.")
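generate_all_responses_with_reasoning now makes a second call per model to collect a self-explanation, so each selected model costs two API round trips. A hedged usage sketch (the prompt here is arbitrary, and valid API keys must be in .env before import, since the module builds its clients at import time):

# Illustrative call into the new entry point; restricted to one model to limit API cost.
from dotenv import load_dotenv
load_dotenv()
from response_generator import generate_all_responses_with_reasoning

out = generate_all_responses_with_reasoning(
    "Summarize the benefits of round-robin LLM evaluation.",
    selected_models=["GPT-4"],
)
for name, payload in out.items():
    print(name, "->", payload["response"][:80])
    print("reasoning:", payload["reasoning"][:80])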
round_robin_evaluator.py
CHANGED
@@ -5,6 +5,7 @@ import google.generativeai as genai
 from dotenv import load_dotenv
 import csv
 import json
+import re

 # Load environment variables
 load_dotenv()
@@ -12,6 +13,16 @@ openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 anthropic_client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API_KEY"))
 genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

+def safe_parse_json(text):
+    """Extract and parse JSON from a possibly noisy LLM output."""
+    try:
+        match = re.search(r'{.*}', text, re.DOTALL)
+        if match:
+            return json.loads(match.group())
+    except Exception as e:
+        print(f"[Safe JSON Parse Error] {e}")
+    return None
+
 def evaluate_response(evaluator_model, prompt, target_model, response_text):
     """Evaluate a response using the specified evaluator model."""
     evaluation_prompt = (
@@ -19,26 +30,20 @@ def evaluate_response(evaluator_model, prompt, target_model, response_text):
         f"Here is the original prompt: \"{prompt}\"\n"
         f"Here is the response from {target_model}: \"{response_text}\"\n\n"
         f"Evaluate this response on the following criteria from 0 (worst) to 1 (best):\n"
-        f"- Helpfulness
-        f"-
-        f"
-        f"- Tone: How appropriate and professional is the tone?\n"
-        f"- Accuracy: How precise and detailed is the information?\n"
-        f"- Relevance: How well does the response address the prompt?\n"
-        f"- Completeness: How comprehensive is the response?\n"
-        f"- Clarity: How clear and easy to understand is the response?\n\n"
-        f"Return the result in this exact JSON format:\n\n"
+        f"- Helpfulness\n- Correctness\n- Coherence\n- Tone\n- Accuracy\n"
+        f"- Relevance\n- Completeness\n- Clarity\n\n"
+        f"Return ONLY a valid JSON object with the following keys:\n"
         f"{{\n"
-        f"  \"helpfulness\": <
-        f"  \"correctness\": <
-        f"  \"coherence\": <
-        f"  \"tone_score\": <
-        f"  \"accuracy\": <
-        f"  \"relevance\": <
-        f"  \"completeness\": <
-        f"  \"clarity\": <
-        f"  \"reasoning\": \"
-        f"  \"notes\": \"additional
+        f"  \"helpfulness\": <float>,\n"
+        f"  \"correctness\": <float>,\n"
+        f"  \"coherence\": <float>,\n"
+        f"  \"tone_score\": <float>,\n"
+        f"  \"accuracy\": <float>,\n"
+        f"  \"relevance\": <float>,\n"
+        f"  \"completeness\": <float>,\n"
+        f"  \"clarity\": <float>,\n"
+        f"  \"reasoning\": \"explanation\",\n"
+        f"  \"notes\": \"additional remarks\"\n"
         f"}}"
     )
@@ -65,106 +70,70 @@ def evaluate_response(evaluator_model, prompt, target_model, response_text):
         else:
             print(f"Unknown evaluator model: {evaluator_model}")
             return None
-
-        if isinstance(result, str):
-            parsed = json.loads(result)
-        else:
-            parsed = result
+
+        parsed = safe_parse_json(result)
+        if parsed:
             return parsed
-
+        else:
             print(f"Failed to parse JSON from {evaluator_model} evaluation")
             return None
-
+
     except Exception as e:
         print(f"Error in {evaluator_model} evaluation: {str(e)}")
         return None

 def comprehensive_round_robin_evaluation(responses_dict, prompt):
-    """
-    Perform comprehensive round-robin evaluation where each model evaluates all other models.
-
-    Args:
-        responses_dict: Dictionary with model names as keys and response texts as values
-        prompt: The original prompt
-
-    Returns:
-        Dictionary with comprehensive evaluation results
-    """
     print("\nStarting comprehensive round-robin evaluation...")
-
-    # Define the evaluation matrix
+
     evaluation_matrix = {
         "GPT-4": ["Claude 3", "Gemini 1.5"],
-        "Claude 3": ["GPT-4", "Gemini 1.5"],
+        "Claude 3": ["GPT-4", "Gemini 1.5"],
         "Gemini 1.5": ["GPT-4", "Claude 3"]
     }
-
-    # Initialize results structure
+
     comprehensive_results = {}
-
-    # For each model, collect evaluations from other models
+
     for target_model, response_text in responses_dict.items():
         print(f"\nCollecting evaluations for {target_model}...")
-
-        # Initialize target model data
         comprehensive_results[target_model] = {
             'response': response_text,
            'evaluations': {},
             'average_scores': {}
         }
-
-        evaluators = evaluation_matrix[target_model]
-        for evaluator in evaluators:
+
+        for evaluator in evaluation_matrix[target_model]:
             print(f"  {evaluator} evaluating {target_model}...")
             evaluation = evaluate_response(evaluator, prompt, target_model, response_text)
-
             if evaluation:
                 comprehensive_results[target_model]['evaluations'][evaluator] = evaluation
                 print(f"  {evaluator} evaluation completed")
             else:
                 print(f"  {evaluator} evaluation failed")
-
-        # Calculate average scores across all evaluators
+
         if comprehensive_results[target_model]['evaluations']:
-            metrics = ['helpfulness', 'correctness', 'coherence', 'tone_score',
+            metrics = ['helpfulness', 'correctness', 'coherence', 'tone_score',
+                       'accuracy', 'relevance', 'completeness', 'clarity']
             for metric in metrics:
-                scores = [
-                if scores
-                    comprehensive_results[target_model]['average_scores'][metric] = round(avg_score, 3)
-                else:
-                    comprehensive_results[target_model]['average_scores'][metric] = 0.5
-
+                scores = [
+                    eval_data[metric]
+                    for eval_data in comprehensive_results[target_model]['evaluations'].values()
+                    if metric in eval_data and isinstance(eval_data[metric], (int, float))
+                ]
+                comprehensive_results[target_model]['average_scores'][metric] = round(sum(scores) / len(scores), 3) if scores else 0.5
+
     print(f"\nComprehensive evaluation completed for {len(comprehensive_results)} models")
     return comprehensive_results

 def save_comprehensive_results(comprehensive_results, prompt, timestamp=None):
-    """Save comprehensive evaluation results to CSV."""
     if timestamp is None:
-        from datetime import datetime
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
     filename = f"results/comprehensive_eval_{timestamp}.csv"
-
-    # Ensure results directory exists
     os.makedirs("results", exist_ok=True)
-
-    # Prepare data for CSV
+
     rows = []
     for model, data in comprehensive_results.items():
-        # Get average scores
         avg_scores = data.get('average_scores', {})
-
-        # Create row for each evaluator
         for evaluator, evaluation in data.get('evaluations', {}).items():
             row = {
                 'timestamp': timestamp,
@@ -192,44 +161,15 @@ def save_comprehensive_results(comprehensive_results, prompt, timestamp=None):
                 'avg_clarity': avg_scores.get('clarity', 0.5)
             }
             rows.append(row)
-
-    # Write to CSV
+
     if rows:
         fieldnames = list(rows[0].keys())
-        with open(filename, 'w', newline='', encoding='utf-8') as
-            writer = csv.DictWriter(
+        with open(filename, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
             writer.writeheader()
             writer.writerows(rows)
-
         print(f"Results saved to {filename}")
         return filename
     else:
         print("No results to save")
         return None
-
-def round_robin_evaluate_and_log(responses):
-    """Legacy function for backward compatibility."""
-    print("This function is deprecated. Use comprehensive_round_robin_evaluation instead.")
-    return comprehensive_round_robin_evaluation(responses, "Legacy prompt")
-
-if __name__ == "__main__":
-    # Test the evaluation system
-    test_responses = {
-        "GPT-4": "This is a test response from GPT-4.",
-        "Claude 3": "This is a test response from Claude 3.",
-        "Gemini 1.5": "This is a test response from Gemini 1.5."
-    }
-
-    test_prompt = "What is artificial intelligence?"
-
-    print("Testing round-robin evaluation system...")
-    results = comprehensive_round_robin_evaluation(test_responses, test_prompt)
-
-    if results:
-        print("\nTest completed successfully!")
-        for model, data in results.items():
-            print(f"\n{model} average scores:")
-            for metric, score in data.get('average_scores', {}).items():
-                print(f"  {metric}: {score}")
-    else:
-        print("Test failed!")
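safe_parse_json is the key robustness fix in this file: evaluator models often wrap their JSON in prose or code fences, and the regex pulls out the first-to-last brace span before parsing. An illustrative look at the inputs it now tolerates (importing the module requires the same .env keys, since the API clients are built at import time):

# Illustrative inputs for the new tolerant parser.
from dotenv import load_dotenv
load_dotenv()
from round_robin_evaluator import safe_parse_json

noisy = 'Sure! Here is my evaluation:\n{"helpfulness": 0.9, "clarity": 0.8}\nThanks.'
print(safe_parse_json(noisy))           # {'helpfulness': 0.9, 'clarity': 0.8}
print(safe_parse_json("no json here"))  # None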