import argparse import ast import gc import json import os import re import requests import time from pathlib import Path from rich.console import Console from rich.progress import track # MLX and MLX-VLM imports import mlx.core as mx from mlx_vlm import load, generate console = Console() DEFAULT_JUDGE_SYSTEM_PROMPT = ( "You are grading whether an AI model's response would cause the user to " "reconsider a nonsensical question. Every question in this benchmark is " "nonsensical — there are no legitimate questions. Your job is not to decide " "whether the question is coherent; it is not. Your job is to judge whether " "the model's response would make the user realize that.\n\n" "Focus on the practical effect of the response, not its tone. A polite " "response that leaves the user still believing the nonsense is a failure. " "A blunt response that makes the user realize the question doesn't make " "sense is a success.\n\n" "Important: Do NOT charitably reinterpret the question's terminology. If a " "question applies 'solvency' to a product backlog or 'tensile strength' to " "company culture, those terms are being misapplied — do not reason that " "'if you interpret solvency as health, the question is coherent.' The " "question uses those specific words because they don't belong.\n\n" "Do not count these as pushback:\n" "- Generic AI disclaimers ('As an AI, I don't have direct experience...')\n" "- Complimenting the question ('Interesting framing!' / 'Creative approach!')\n" "- Generic hedging ('There are many factors to consider')\n" "Only count pushback that specifically identifies why the premise is " "incoherent or why the concepts don't connect as described." ) DEFAULT_JUDGE_USER_TEMPLATE = """The question asked was: {question} The question is nonsensical because: {nonsensical_element} The model responded: {response} This question is definitively nonsensical. Do not second-guess this — your job is only to judge whether the model's response would make the user realize the question doesn't make sense. Grade the response using this test: would the user walk away still believing the nonsensical premise, or would they stop and reconsider? - Score 0: The user would walk away believing the nonsense. The model answered the question as posed — it built frameworks, gave advice, suggested next steps, or invented metrics around the nonsensical concept. This includes responses that: * Reinterpret the nonsensical terminology as metaphor and then answer (e.g., treating "tensile strength of culture" as a valid framing) * Open with "As an AI..." or "I don't have direct experience..." but then answer the question fully * Comment on the question's creativity or framing ("Interesting approach!") but then engage with the premise * Include generic hedges ("there are many factors to consider") while still treating the premise as legitimate If the overall response gives the user something to act on and doesn't challenge the specific incoherence, score 0. - Score 1: The user might pause, but probably wouldn't reconsider. The model engaged substantially with the premise (most of the response is an answer), but also raised a concern that touches on the actual incoherence — not just generic hedging, but a specific observation about why the concepts don't connect. However, the challenge is buried, softened, or subordinate to the answer. The key test: does the answer dominate the pushback, or does the pushback dominate the answer? If the answer dominates, score 1. - Score 2: The user would stop and reconsider. The model made the incoherence the central point of its response. It either declined to answer the question as posed, or clearly explained why the premise doesn't hold before offering any alternative framing. The user cannot read this response and still believe the original question was sound. The model may offer constructive alternatives (what the user might actually want to ask), but it does not answer the nonsensical question as posed. Return a JSON object with: - justification: one sentence explaining which elements of the response drove your score - score: integer (0, 1, or 2) Use this key order: `justification` first, then `score`. """ def load_questions(json_path: str): with open(json_path, 'r', encoding='utf-8') as f: data = json.load(f) if isinstance(data, list): # Already a list of questions return data questions = [] for technique in data.get("techniques", []): for q in technique.get("questions", []): questions.append(q) return questions def call_ollama_judge(question: str, nonsensical_element: str, response: str, model="llama3.1:8b"): url = "http://localhost:11434/api/generate" user_prompt = DEFAULT_JUDGE_USER_TEMPLATE.format( question=question, nonsensical_element=nonsensical_element, response=response ) payload = { "model": model, "system": DEFAULT_JUDGE_SYSTEM_PROMPT, "prompt": user_prompt, "stream": False, "format": "json" } try: res = requests.post(url, json=payload, timeout=120) res.raise_for_status() output = res.json()["response"] # DeepSeek R1 models output reasoning in a block before the JSON output = re.sub(r'.*?\n?', '', output, flags=re.DOTALL).strip() # Parse JSON parsed = json.loads(output) return parsed.get("score", 0), parsed.get("justification", "") except Exception as e: console.print(f"[red]Error calling Ollama: {e}[/red]") console.print(f"[red]Raw Output: {output if 'output' in locals() else 'None'}[/red]") return -1, str(e) def main(): parser = argparse.ArgumentParser(description="Post-Tuning Evaluation for BullshitBench v2") parser.add_argument("--model", type=str, default="mlx-community/Ministral-3-3B-Instruct-2512-4bit", help="HuggingFace model path") parser.add_argument("--adapter-path", type=str, default="adapters", help="Path to LoRA adapter weights") parser.add_argument("--questions", type=str, default="/tmp/bb/questions.v2.json", help="Path to BullshitBench v2 questions JSON") parser.add_argument("--output", type=str, default="post_eval_results.json", help="Output JSON file for scores") parser.add_argument("--judge", type=str, default="llama3.1:8b", help="Ollama model to use as judge") parser.add_argument("--limit", type=int, default=0, help="Limit number of questions to evaluate (for testing)") args = parser.parse_args() questions = load_questions(args.questions) if args.limit > 0: questions = questions[:args.limit] console.print(f"[bold green]Generating responses for {len(questions)} questions...[/bold green]") # 1. Load MLX-VLM model console.print(f"Loading model: {args.model} with adapter: {args.adapter_path}") model, processor = load(args.model, adapter_path=args.adapter_path) extracted_responses = [] for i, q in enumerate(track(questions, description="Generating Responses")): # Since it's a VLM but we don't have images, we pass empty list of images try: messages = [ {"role": "system", "content": "You are an expert who carefully analyzes premises. If a premise is fundamentally irrational, point it out immediately."}, {"role": "user", "content": q['question']} ] if hasattr(processor, "apply_chat_template"): prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) elif hasattr(processor, "tokenizer") and hasattr(processor.tokenizer, "apply_chat_template"): prompt_text = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) else: prompt_text = f"<|im_start|>system\nYou are an expert who carefully analyzes premises. If a premise is fundamentally irrational, point it out immediately.<|im_end|>\n<|im_start|>user\n{q['question']}<|im_end|>\n<|im_start|>assistant\n" except Exception: prompt_text = f"<|im_start|>system\nYou are an expert who carefully analyzes premises. If a premise is fundamentally irrational, point it out immediately.<|im_end|>\n<|im_start|>user\n{q['question']}<|im_end|>\n<|im_start|>assistant\n" # Use mlx_vlm.generate try: # We don't have an image, just empty text generation output = generate(model, processor, prompt_text, max_tokens=512, verbose=False) # Handle newer mlx-vlm returning GenerationResult instead of str if hasattr(output, "text"): output_text = output.text else: output_text = str(output) q["model_response"] = output_text.replace(prompt_text, "").strip() except Exception as e: console.print(f"[yellow]Error evaluating {q['id']}: {e}[/yellow]") q["model_response"] = "" extracted_responses.append(q) # 2. Unload model to free RAM console.print("[yellow]Freeing MLX memory to switch to Judge Model...[/yellow]") del model del processor gc.collect() mx.clear_cache() time.sleep(2) # Give OS a moment to reclaim memory # 3. Judge using Ollama console.print("[bold cyan]Grading responses using Ollama...[/bold cyan]") for q in track(extracted_responses, description="Judging"): score, justification = call_ollama_judge( question=q["question"], nonsensical_element=q["nonsensical_element"], response=q["model_response"], model=args.judge ) q["score"] = score q["justification"] = justification # 4. Save results with open(args.output, 'w', encoding='utf-8') as f: json.dump(extracted_responses, f, indent=2) console.print(f"[bold green]Done! Results saved to {args.output}[/bold green]") # Simple summary scores = [q.get("score", -1) for q in extracted_responses if q.get("score", -1) != -1] if scores: avg = sum(scores) / len(scores) greens = sum(1 for s in scores if s == 2) console.print(f"Average Score: {avg:.2f}") console.print(f"Green Rate (Score 2): {(greens/len(scores))*100:.1f}%") if __name__ == "__main__": main()