# 2_lab2.py
#
# A cleaned-up version of multi_model_evaluator.py: a standalone script that
# asks several LLMs the same challenging question, then has a judge model
# rank their answers.
#
# To use it:
#   1. Create a file named multi_model_evaluator.py and paste this code in.
#   2. Ensure your .env has the needed keys (OPENAI_API_KEY is required;
#      add the other provider keys only for the providers you want).
#   3. Run with: python multi_model_evaluator.py
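
# A sample .env layout (values are placeholders; the key names match the
# os.getenv() calls below, and only OPENAI_API_KEY is strictly required):
#
#   OPENAI_API_KEY=sk-...
#   ANTHROPIC_API_KEY=sk-ant-...
#   GOOGLE_API_KEY=...
#   DEEPSEEK_API_KEY=...
#   GROQ_API_KEY=...
#   OLLAMA_BASE_URL=http://localhost:11434/v1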

import json
import os
import time

from anthropic import Anthropic
from dotenv import load_dotenv
from openai import OpenAI
# =========================
# Environment setup
# =========================

# override=True lets .env values take precedence over variables already set
# in the environment.
load_dotenv(override=True)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
GEMINI_API_KEY = os.getenv("GOOGLE_API_KEY")
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Should point at Ollama's OpenAI-compatible endpoint, e.g. http://localhost:11434/v1
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL")
if not OPENAI_API_KEY:
    raise RuntimeError("OPENAI_API_KEY is required in your .env file.")

# Base OpenAI client (for OpenAI-hosted models, including oss models)
openai_client = OpenAI(api_key=OPENAI_API_KEY)

# =========================
# Helper: call different providers
# =========================

def _extract_text(response, provider: str) -> str:
    """
    Defensive helper that pulls the first text chunk out of a Responses API
    payload. Some providers return tool calls or non-text chunks, so we fall
    back to output_text (if available) before giving up.
    """
    # Try the structured Responses API shape first
    output = getattr(response, "output", None) or []
    for item in output:
        content_items = getattr(item, "content", None) or []
        for content in content_items:
            text = getattr(content, "text", None)
            if text:
                # text may come through as list[str]
                if isinstance(text, list):
                    return "".join(text)
                return text

    # Fall back to the convenience output_text field if present
    output_text = getattr(response, "output_text", None)
    if output_text:
        if isinstance(output_text, list):
            return output_text[0]
        return output_text

    return f"{provider} response did not include text content."

def call_openai_model(model: str, prompt: str) -> str:
    response = openai_client.responses.create(
        model=model,
        input=prompt,
    )
    return _extract_text(response, "openai")

def call_anthropic_model(model: str, prompt: str) -> str:
    if not ANTHROPIC_API_KEY:
        return "ANTHROPIC_API_KEY missing; cannot call Anthropic."
    client = Anthropic(api_key=ANTHROPIC_API_KEY)
    response = client.messages.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=4096,
    )
    return response.content[0].text

def call_gemini_model(model: str, prompt: str) -> str:
    if not GEMINI_API_KEY:
        return "GEMINI_API_KEY missing; cannot call Gemini."
    # Gemini exposes an OpenAI-compatible endpoint, so we reuse the OpenAI SDK.
    client = OpenAI(
        api_key=GEMINI_API_KEY,
        base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    )
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

def call_deepseek_model(model: str, prompt: str) -> str:
    if not DEEPSEEK_API_KEY:
        return "DEEPSEEK_API_KEY missing; cannot call DeepSeek."
    client = OpenAI(
        api_key=DEEPSEEK_API_KEY,
        base_url="https://api.deepseek.com/v1",
    )
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

def call_groq_model(model: str, prompt: str) -> str:
    if not GROQ_API_KEY:
        return "GROQ_API_KEY missing; cannot call Groq."
    client = OpenAI(
        api_key=GROQ_API_KEY,
        base_url="https://api.groq.com/openai/v1",
    )
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

def call_ollama_model(model: str, prompt: str) -> str:
    """
    Expects OLLAMA_BASE_URL to point to an Ollama server exposing an
    OpenAI-compatible /v1 API. If it is not set, this returns a message
    instead of failing hard.
    """
    if not OLLAMA_BASE_URL:
        return "OLLAMA_BASE_URL missing; cannot call Ollama."
    try:
        client = OpenAI(
            base_url=OLLAMA_BASE_URL,
            api_key="ollama",  # dummy token; Ollama usually ignores this
        )
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Ollama call failed: {e}"

# =========================
# Step 1: generate a single hard question
# =========================

QUESTION_GENERATOR_MODEL = "gpt-4.1-mini"  # or any OpenAI model you prefer

GENERATOR_SYSTEM_PROMPT = (
    "You are a question generation expert. "
    "Generate one challenging, real-world question that will test multiple LLMs. "
    "Make it complex enough that different LLMs might give different, nuanced answers. "
    "Output only the question text, nothing else."
)

def generate_challenge_question() -> str:
    response = openai_client.responses.create(
        model=QUESTION_GENERATOR_MODEL,
        input=[
            {
                "role": "system",
                "content": GENERATOR_SYSTEM_PROMPT,
            }
        ],
    )
    # Use the defensive extractor instead of indexing response.output[0]
    # directly, which breaks when the first output item is not a text message.
    return _extract_text(response, "openai").strip()

# =========================
# Step 2: define competitor models
# =========================

# Adjust or comment out entries depending on which APIs/keys you actually
# have; providers with missing keys will return an error string instead of
# an answer. A helper for filtering those out automatically follows the list.

COMPETITORS = [
    {
        "name": "Claude Sonnet",
        "provider": "anthropic",
        "model": "claude-sonnet-4-5",
    },
    {
        "name": "OpenAI gpt-5-nano",
        "provider": "openai",
        "model": "gpt-5-nano",
    },
    {
        "name": "Gemini 2.0 Flash",
        "provider": "gemini",
        "model": "gemini-2.0-flash",
    },
    {
        "name": "Local llama3.2 via Ollama",
        "provider": "ollama",
        "model": "llama3.2",
    },
    {
        "name": "DeepSeek Chat",
        "provider": "deepseek",
        "model": "deepseek-chat",
    },
    {
        "name": "Groq openai/gpt-oss-120b",
        "provider": "groq",
        "model": "openai/gpt-oss-120b",
    },
]
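
# Optional convenience (an addition, not part of the original flow): narrow
# COMPETITORS down to providers whose credentials are actually configured, so
# missing keys do not clutter the judge prompt with error strings. "openai"
# is always kept because the script already requires OPENAI_API_KEY.
def available_competitors() -> list:
    configured = {
        "openai": True,
        "anthropic": bool(ANTHROPIC_API_KEY),
        "gemini": bool(GEMINI_API_KEY),
        "deepseek": bool(DEEPSEEK_API_KEY),
        "groq": bool(GROQ_API_KEY),
        "ollama": bool(OLLAMA_BASE_URL),
    }
    return [c for c in COMPETITORS if configured.get(c["provider"], False)]

# To use it, iterate over available_competitors() instead of COMPETITORS in
# collect_competitor_answers() below.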

def call_competitor(provider: str, model: str, prompt: str) -> str:
    if provider == "openai":
        return call_openai_model(model, prompt)
    elif provider == "anthropic":
        return call_anthropic_model(model, prompt)
    elif provider == "gemini":
        return call_gemini_model(model, prompt)
    elif provider == "deepseek":
        return call_deepseek_model(model, prompt)
    elif provider == "groq":
        return call_groq_model(model, prompt)
    elif provider == "ollama":
        return call_ollama_model(model, prompt)
    else:
        return f"Unknown provider: {provider}"

# =========================
# Step 3: ask all competitors the same question
# =========================

def collect_competitor_answers(question: str):
    all_answers = []
    for idx, competitor in enumerate(COMPETITORS, start=1):
        name = competitor["name"]
        provider = competitor["provider"]
        model = competitor["model"]

        print(f"\n=== Asking competitor {idx}: {name} ===")
        start = time.time()
        answer = call_competitor(provider, model, question)
        elapsed = time.time() - start

        print(f"Answer from {name} (took {elapsed:.2f}s):\n")
        print(answer)
        print("\n" + "=" * 60 + "\n")

        all_answers.append(
            {
                "index": idx,
                "name": name,
                "provider": provider,
                "model": model,
                "answer": answer,
                "elapsed_seconds": elapsed,
            }
        )
    return all_answers

# =========================
# Step 4: create judge prompt with all answers
# =========================

def build_judge_prompt(question: str, responses: list) -> str:
    pieces = []
    pieces.append(
        "You are an expert judge comparing responses from multiple AI models to the same question.\n"
        "You will receive:\n"
        "1) The question.\n"
        "2) Several numbered responses from different competitors.\n\n"
        "Your task:\n"
        "- Carefully read each response.\n"
        "- Consider correctness, depth, clarity, helpfulness, and reasoning.\n"
        "- Produce a strict ranking from best to worst.\n\n"
        "Output format:\n"
        "Return ONLY valid JSON with this exact schema (no backticks, no explanation):\n"
        "{\n"
        '  "rankings": [\n'
        '    {"competitor_index": <number>, "score": <number from 0 to 10>, "justification": "<short text>"}\n'
        "  ]\n"
        "}\n"
        "The first element in rankings must be the best answer (highest score), then next best, etc.\n\n"
        "Here is the question:\n"
    )
    pieces.append(question)
    pieces.append("\n\nNow here are the competitor responses:\n")
    for r in responses:
        pieces.append(f"\n=== Response from competitor {r['index']} ({r['name']}) ===\n")
        pieces.append(r["answer"])
        pieces.append("\n")
    return "".join(pieces)

# =========================
# Step 5: ask a judge model to rank them
# =========================

JUDGE_MODEL = "o3-mini"  # or any OpenAI model suitable for judging

def judge_responses(question: str, responses: list) -> dict:
    judge_prompt = build_judge_prompt(question, responses)
    response = openai_client.responses.create(
        model=JUDGE_MODEL,
        input=judge_prompt,
    )
    # Get the plain-text output and parse the JSON ourselves, stripping code
    # fences first in case the judge wraps its JSON despite the instructions.
    raw_text = _extract_text(response, "openai").strip()
    if raw_text.startswith("```"):
        raw_text = raw_text.strip("`").removeprefix("json").strip()
    return json.loads(raw_text)
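
# The parsed result mirrors the schema requested in build_judge_prompt; for
# example (values here are illustrative, not real output):
#
#   {"rankings": [
#       {"competitor_index": 3, "score": 9, "justification": "Most complete."},
#       {"competitor_index": 1, "score": 7, "justification": "Good but terse."},
#   ]}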

def print_rankings(judge_result, responses):
    index_to_response = {r["index"]: r for r in responses}
    print("\n=== Final Rankings ===\n")
    for rank, entry in enumerate(judge_result["rankings"], start=1):
        idx = entry["competitor_index"]
        score = entry["score"]
        justification = entry["justification"]
        competitor = index_to_response.get(idx, {})
        name = competitor.get("name", f"Unknown (index {idx})")
        print(f"Rank {rank}: {name}")
        print(f"  Score: {score}")
        print(f"  Justification: {justification}")
        print()

# =========================
# Main entry point
# =========================

def main():
    print("Generating a single challenging question...\n")
    question = generate_challenge_question()
    print("Question:\n")
    print(question)
    print("\n" + "=" * 60 + "\n")

    print("Collecting competitor answers...\n")
    responses = collect_competitor_answers(question)

    print("Asking judge model for rankings...\n")
    judge_result = judge_responses(question, responses)
    print_rankings(judge_result, responses)


if __name__ == "__main__":
    main()