Spaces:

nad707
/

llm-workbench

Runtime error

App Files Files Community

llm-workbench / test_app.py

nad707

feat: flatten repo and rebootstrap hf workspace

bf96836 2 months ago

raw

history blame contribute delete

39.4 kB

	"""
	TDD tests for app.py — Reasoning Model Comparison Gradio App.

	Test order matches implementation phases:
	Phase 1: parse_think_block, call_openrouter, extract_usage
	Phase 2: run_comparison (mocked HTTP)
	"""

	import json
	import inspect
	import pytest
	from unittest.mock import patch, MagicMock


	# ---------------------------------------------------------------------------
	# Phase 1 — parse_think_block
	# ---------------------------------------------------------------------------

	class TestParseThinkBlock:
	    """Behavioural checks for parse_think_block(text) -> (reasoning, answer)."""

	    @staticmethod
	    def _parse(raw):
	        """Import parse_think_block at call time and apply it to *raw*."""
	        from app import parse_think_block

	        return parse_think_block(raw)

	    def test_normal_case_returns_reasoning_and_answer(self):
	        """A complete think block yields its body plus the text after it."""
	        thoughts, reply = self._parse("<think>step 1\nstep 2</think>The answer is 42.")

	        assert thoughts == "step 1\nstep 2"
	        assert reply == "The answer is 42."

	    def test_no_think_tags_returns_empty_reasoning(self):
	        """Without any think tag the whole text is the answer."""
	        thoughts, reply = self._parse("Plain response with no thinking.")

	        assert thoughts == ""
	        assert reply == "Plain response with no thinking."

	    def test_empty_think_block(self):
	        """An empty think block gives empty reasoning and keeps the answer."""
	        thoughts, reply = self._parse("<think></think>Short answer.")

	        assert thoughts == ""
	        assert reply == "Short answer."

	    def test_strips_opening_think_tag_from_reasoning(self):
	        """The opening tag itself never leaks into the reasoning text."""
	        thoughts, _reply = self._parse("<think>Some reasoning here</think>Final answer.")

	        assert "<think>" not in thoughts
	        assert "Some reasoning here" in thoughts

	    def test_answer_has_no_closing_think_tag(self):
	        """The closing tag itself never leaks into the answer text."""
	        _thoughts, reply = self._parse("<think>reasoning</think>answer text")

	        assert "</think>" not in reply
	        assert reply == "answer text"

	    def test_answer_leading_whitespace_stripped(self):
	        """Newlines between the closing tag and the answer are dropped."""
	        _thoughts, reply = self._parse("<think>reasoning</think>\n\nThe answer.")

	        assert reply == "The answer."

	    def test_empty_string_returns_empty_reasoning_and_empty_answer(self):
	        """Empty input produces two empty strings."""
	        thoughts, reply = self._parse("")

	        assert thoughts == ""
	        assert reply == ""

	    def test_only_think_opening_tag_no_closing(self):
	        """An unclosed think tag is treated as plain text, tag included."""
	        malformed = "<think>incomplete reasoning"
	        thoughts, reply = self._parse(malformed)

	        assert thoughts == ""
	        assert reply == "<think>incomplete reasoning"

	    def test_returns_tuple_of_two_strings(self):
	        """The result is a tuple holding exactly two str objects."""
	        outcome = self._parse("anything")

	        assert isinstance(outcome, tuple)
	        assert len(outcome) == 2
	        assert all(isinstance(part, str) for part in outcome)

	    def test_multiline_reasoning_preserved(self):
	        """Every line of a multi-line think body survives parsing."""
	        body = "line 1\nline 2\nline 3"
	        thoughts, _reply = self._parse("<think>" + body + "</think>Done.")

	        assert thoughts == body

	    def test_multiple_closing_tags_splits_on_first(self):
	        """Only the first closing tag delimits the reasoning."""
	        thoughts, reply = self._parse("<think>reasoning</think>answer with </think> inside")

	        assert thoughts == "reasoning"
	        assert "answer with </think> inside" in reply


	class TestComparisonUiLabels:
	    """Source-text checks on app._build_comparison_blocks."""

	    @staticmethod
	    def _builder_source():
	        """Return the source code of app._build_comparison_blocks as a string."""
	        import app

	        return inspect.getsource(app._build_comparison_blocks)

	    def test_input_prompt_label_present_and_preset_label_removed(self):
	        """The builder mentions 'Input Prompt' and no longer 'Preset Questions'."""
	        builder_code = self._builder_source()

	        assert "Input Prompt" in builder_code
	        assert "Preset Questions" not in builder_code

	    def test_server_key_is_never_serialized_into_component_value(self):
	        """The key lives in an empty gr.State and is never passed as value=SERVER_KEY."""
	        builder_code = self._builder_source()

	        assert 'api_key = gr.State("")' in builder_code
	        assert "value=SERVER_KEY" not in builder_code


	# ---------------------------------------------------------------------------
	# Phase 1 — extract_usage
	# ---------------------------------------------------------------------------

	class TestExtractUsage:
	    """Checks on the token counts returned by extract_usage(response)."""

	    def _r1_response(self, prompt_tokens=10, completion_tokens=500, reasoning_tokens=300):
	        """Build a response whose usage includes completion_tokens_details.reasoning_tokens."""
	        usage_block = {
	            "prompt_tokens": prompt_tokens,
	            "completion_tokens": completion_tokens,
	            "completion_tokens_details": {
	                "reasoning_tokens": reasoning_tokens,
	            },
	        }
	        return {"usage": usage_block}

	    def _llama_response(self, prompt_tokens=10, completion_tokens=80):
	        """Build a response whose usage has no completion_tokens_details entry."""
	        usage_block = {
	            "prompt_tokens": prompt_tokens,
	            "completion_tokens": completion_tokens,
	        }
	        return {"usage": usage_block}

	    @staticmethod
	    def _usage_for(payload):
	        """Import extract_usage at call time and apply it to *payload*."""
	        from app import extract_usage

	        return extract_usage(payload)

	    def test_r1_response_extracts_all_fields(self):
	        """All three counts are read from a response that carries them."""
	        counts = self._usage_for(self._r1_response())

	        assert counts["prompt_tokens"] == 10
	        assert counts["completion_tokens"] == 500
	        assert counts["reasoning_tokens"] == 300

	    def test_llama_response_reasoning_tokens_defaults_to_zero(self):
	        """A response without reasoning details reports 0 reasoning tokens."""
	        counts = self._usage_for(self._llama_response())

	        assert counts["prompt_tokens"] == 10
	        assert counts["completion_tokens"] == 80
	        assert counts["reasoning_tokens"] == 0

	    def test_missing_usage_key_returns_all_zeros(self):
	        """An empty response yields zeros for every count instead of raising."""
	        counts = self._usage_for({})

	        assert counts["prompt_tokens"] == 0
	        assert counts["completion_tokens"] == 0
	        assert counts["reasoning_tokens"] == 0

	    def test_missing_completion_tokens_details_returns_zero_reasoning(self):
	        """Usage without completion_tokens_details reports 0 reasoning tokens."""
	        payload = self._llama_response(prompt_tokens=5, completion_tokens=50)

	        counts = self._usage_for(payload)

	        assert counts["reasoning_tokens"] == 0

	    def test_returns_dict_with_required_keys(self):
	        """The three expected keys are present even for an empty response."""
	        counts = self._usage_for({})

	        for expected_key in ("prompt_tokens", "completion_tokens", "reasoning_tokens"):
	            assert expected_key in counts

	    def test_reasoning_tokens_present_but_none_defaults_to_zero(self):
	        """A reasoning_tokens value of None is reported as 0."""
	        payload = self._r1_response(
	            prompt_tokens=10, completion_tokens=100, reasoning_tokens=None
	        )

	        counts = self._usage_for(payload)

	        assert counts["reasoning_tokens"] == 0


	# ---------------------------------------------------------------------------
	# Phase 1 — call_openrouter
	# ---------------------------------------------------------------------------

	class TestCallOpenrouter:
	    """Checks on call_openrouter(api_key, model, prompt) with requests.post patched."""

	    @staticmethod
	    def _call_with_stubbed_post(json_body, *call_args):
	        """Call call_openrouter(*call_args) against a patched post.

	        The patched response returns *json_body* from .json() and has a no-op
	        raise_for_status. Returns (call_openrouter result, post call_args).
	        """
	        from app import call_openrouter

	        with patch("openrouter.requests.post") as fake_post:
	            fake_post.return_value.json.return_value = json_body
	            fake_post.return_value.raise_for_status = MagicMock()

	            returned = call_openrouter(*call_args)

	        return returned, fake_post.call_args

	    def test_posts_to_correct_url(self):
	        """The first positional argument to post is the chat completions URL."""
	        _, recorded = self._call_with_stubbed_post(
	            {"choices": []}, "sk-test", "some/model", "question?"
	        )

	        positional, _keywords = recorded
	        assert positional[0] == "https://openrouter.ai/api/v1/chat/completions"

	    def test_sends_authorization_header(self):
	        """The headers kwarg has an Authorization entry containing the key."""
	        _, recorded = self._call_with_stubbed_post({}, "my-api-key", "some/model", "hi")

	        _, keywords = recorded
	        sent_headers = keywords["headers"]
	        assert "Authorization" in sent_headers
	        assert "my-api-key" in sent_headers["Authorization"]

	    def test_sends_model_in_payload(self):
	        """The json kwarg carries the requested model ID."""
	        _, recorded = self._call_with_stubbed_post(
	            {}, "key", "deepseek/deepseek-r1", "question"
	        )

	        _, keywords = recorded
	        assert keywords["json"]["model"] == "deepseek/deepseek-r1"

	    def test_sends_user_message_in_payload(self):
	        """Some message in the json kwarg has role 'user' and contains the prompt."""
	        _, recorded = self._call_with_stubbed_post({}, "key", "model", "What is 2+2?")

	        _, keywords = recorded
	        sent_messages = keywords["json"]["messages"]
	        assert any(
	            msg["role"] == "user" and "What is 2+2?" in msg["content"]
	            for msg in sent_messages
	        )

	    def test_returns_json_response(self):
	        """The value returned equals what the response's .json() produced."""
	        canned = {"choices": [{"message": {"content": "4"}}]}

	        returned, _ = self._call_with_stubbed_post(canned, "key", "model", "2+2?")

	        assert returned == canned

	    def test_raises_on_http_error(self):
	        """An HTTPError from raise_for_status reaches the caller."""
	        import requests as req_lib
	        from app import call_openrouter

	        with patch("openrouter.requests.post") as fake_post:
	            fake_post.return_value.raise_for_status.side_effect = req_lib.HTTPError("401")

	            with pytest.raises(req_lib.HTTPError):
	                call_openrouter("bad-key", "model", "question")


	# ---------------------------------------------------------------------------
	# Phase 2 — run_comparison
	# ---------------------------------------------------------------------------

	class TestRunComparison:
	    """Checks on run_comparison(api_key, question) with app.call_openrouter patched."""

	    def _make_response(self, content, reasoning_tokens=0, completion_tokens=50, prompt_tokens=10):
	        """Build a response dict with one choice and a usage block.

	        completion_tokens_details is attached only when reasoning_tokens > 0.
	        """
	        usage_block = {
	            "prompt_tokens": prompt_tokens,
	            "completion_tokens": completion_tokens,
	        }
	        payload = {
	            "choices": [{"message": {"content": content}}],
	            "usage": usage_block,
	        }
	        if reasoning_tokens > 0:
	            payload["usage"]["completion_tokens_details"] = {
	                "reasoning_tokens": reasoning_tokens
	            }
	        return payload

	    @staticmethod
	    def _run_with_replies(replies, question):
	        """Run run_comparison("key", question) with the stub returning *replies* in order.

	        Returns (run_comparison result, the mock that replaced call_openrouter).
	        """
	        from app import run_comparison

	        with patch("app.call_openrouter") as fake_call:
	            fake_call.side_effect = replies

	            outcome = run_comparison("key", question)

	        return outcome, fake_call

	    def test_returns_two_results(self):
	        """The result unpacks into two non-None values."""
	        first_reply = self._make_response("<think>thinking</think>Answer A", reasoning_tokens=200)
	        second_reply = self._make_response("Answer B")

	        outcome, _ = self._run_with_replies([first_reply, second_reply], "question?")
	        r1_side, llama_side = outcome

	        assert r1_side is not None
	        assert llama_side is not None

	    def test_r1_result_has_reasoning_field(self):
	        """The first result has a 'reasoning' entry holding the think-block text."""
	        first_reply = self._make_response("<think>step by step</think>Final.", reasoning_tokens=100)
	        second_reply = self._make_response("Direct answer.")

	        outcome, _ = self._run_with_replies([first_reply, second_reply], "question?")
	        r1_side, _ = outcome

	        assert "reasoning" in r1_side
	        assert "step by step" in r1_side["reasoning"]

	    def test_r1_result_has_answer_field(self):
	        """The first result has an 'answer' entry holding the text after the think block."""
	        first_reply = self._make_response("<think>thinking</think>The answer is 5.", reasoning_tokens=50)
	        second_reply = self._make_response("5")

	        outcome, _ = self._run_with_replies([first_reply, second_reply], "question?")
	        r1_side, _ = outcome

	        assert "answer" in r1_side
	        assert "The answer is 5." in r1_side["answer"]

	    def test_llama_result_has_answer_field(self):
	        """The second result has an 'answer' entry holding the reply content."""
	        first_reply = self._make_response("<think>x</think>R1 answer", reasoning_tokens=10)
	        second_reply = self._make_response("Llama answer here.")

	        outcome, _ = self._run_with_replies([first_reply, second_reply], "q?")
	        _, llama_side = outcome

	        assert "answer" in llama_side
	        assert "Llama answer here." in llama_side["answer"]

	    def test_results_include_usage(self):
	        """Both results expose a 'usage' entry with the expected token counts."""
	        first_reply = self._make_response("<think>t</think>a", reasoning_tokens=300, completion_tokens=320)
	        second_reply = self._make_response("b", completion_tokens=40)

	        outcome, _ = self._run_with_replies([first_reply, second_reply], "q")
	        r1_side, llama_side = outcome

	        assert "usage" in r1_side
	        assert "usage" in llama_side
	        assert r1_side["usage"]["reasoning_tokens"] == 300
	        assert llama_side["usage"]["completion_tokens"] == 40

	    def test_r1_failure_does_not_crash_llama_result(self):
	        """An exception for the 'stepfun' model becomes an error entry; the other result survives."""
	        from app import run_comparison

	        healthy_reply = self._make_response("Llama is fine.")

	        def fail_for_stepfun(api_key, model, prompt):
	            if "stepfun" in model:
	                raise Exception("R1 network error")
	            return healthy_reply

	        with patch("app.call_openrouter", side_effect=fail_for_stepfun):
	            r1_side, llama_side = run_comparison("key", "question?")

	        assert "error" in r1_side
	        assert "answer" in llama_side

	    def test_llama_failure_does_not_crash_r1_result(self):
	        """An exception for the 'llama' model becomes an error entry; the other result survives."""
	        from app import run_comparison

	        healthy_reply = self._make_response("<think>ok</think>R1 answer", reasoning_tokens=50)

	        def fail_for_llama(api_key, model, prompt):
	            if "llama" in model:
	                raise Exception("Llama timeout")
	            return healthy_reply

	        with patch("app.call_openrouter", side_effect=fail_for_llama):
	            r1_side, llama_side = run_comparison("key", "question?")

	        assert "answer" in r1_side
	        assert "error" in llama_side

	    def test_calls_both_models(self):
	        """The stub is called with a 'stepfun' model ID and with a 'llama' model ID."""
	        first_reply = self._make_response("<think>t</think>a")
	        second_reply = self._make_response("b")

	        _, fake_call = self._run_with_replies([first_reply, second_reply], "q?")

	        requested = [invocation.args[1] for invocation in fake_call.call_args_list]
	        assert any("stepfun" in model_id for model_id in requested)
	        assert any("llama" in model_id for model_id in requested)


	# ---------------------------------------------------------------------------
	# Phase 3 — _format_usage
	# ---------------------------------------------------------------------------


	class TestFormatUsage:
	    """Unit tests for _format_usage(usage) — Markdown formatter.

	    Every test passes a usage dict (possibly empty) and inspects the
	    returned string with substring checks only.
	    """

	    def test_basic_output_contains_prompt_and_completion(self):
	        """Must include prompt_tokens and completion_tokens in output."""
	        from app import _format_usage

	        out = _format_usage({"prompt_tokens": 10, "completion_tokens": 50, "reasoning_tokens": 0})

	        assert "10" in out
	        assert "50" in out

	    def test_no_reasoning_tokens_omits_reasoning_line(self):
	        """If reasoning_tokens is 0, no 'Reasoning tokens' line appears."""
	        from app import _format_usage

	        out = _format_usage({"prompt_tokens": 5, "completion_tokens": 30, "reasoning_tokens": 0})

	        assert "Reasoning" not in out

	    def test_with_reasoning_tokens_includes_reasoning_and_answer_lines(self):
	        """If reasoning_tokens > 0, both reasoning and answer-token lines appear."""
	        from app import _format_usage

	        out = _format_usage({"prompt_tokens": 10, "completion_tokens": 400, "reasoning_tokens": 350})

	        assert "Reasoning" in out
	        assert "Answer" in out
	        assert "350" in out

	    def test_answer_tokens_computed_as_completion_minus_reasoning(self):
	        """Answer tokens = completion - reasoning when reasoning > 0."""
	        from app import _format_usage

	        # The inputs are chosen so the expected difference (75) is not a
	        # substring of any input count. With 400/350 the expected "50" is
	        # already contained in "350", so the check could never fail.
	        out = _format_usage({"prompt_tokens": 10, "completion_tokens": 400, "reasoning_tokens": 325})

	        # answer tokens = 400 - 325 = 75
	        assert "75" in out

	    def test_returns_string(self):
	        """Return type is always str."""
	        from app import _format_usage

	        result = _format_usage({})

	        assert isinstance(result, str)

	    def test_empty_dict_returns_string_with_zeros(self):
	        """Empty dict — all counts default to 0, no crash."""
	        from app import _format_usage

	        out = _format_usage({})

	        assert "0" in out


	# ---------------------------------------------------------------------------
	# Phase 3 — compare (Gradio handler)
	# ---------------------------------------------------------------------------


	class TestCompare:
	    """Checks on compare(api_key, preset, custom) with app.run_comparison patched."""

	    def _r1_result(self, reasoning="step", answer="R1 answer"):
	        """Build a successful first-model result with reasoning, answer and usage."""
	        usage_block = {"prompt_tokens": 10, "completion_tokens": 100, "reasoning_tokens": 80}
	        return {"reasoning": reasoning, "answer": answer, "usage": usage_block}

	    def _llama_result(self, answer="Llama answer"):
	        """Build a successful second-model result with answer and usage."""
	        usage_block = {"prompt_tokens": 10, "completion_tokens": 40, "reasoning_tokens": 0}
	        return {"answer": answer, "usage": usage_block}

	    @staticmethod
	    def _compare_with_stub(outcome, *compare_args):
	        """Call compare(*compare_args) while run_comparison is stubbed to return *outcome*.

	        Returns (compare result, the mock that replaced run_comparison).
	        """
	        from app import compare

	        with patch("app.run_comparison") as fake_run:
	            fake_run.return_value = outcome

	            rendered = compare(*compare_args)

	        return rendered, fake_run

	    def test_missing_api_key_returns_error_message(self):
	        """An empty key produces 'No API key' in both answer slots."""
	        from app import compare

	        outputs = compare("", "preset question", "")
	        r1_reasoning, r1_answer, r1_stats, llama_answer, llama_stats = outputs

	        assert "No API key" in r1_answer
	        assert "No API key" in llama_answer

	    def test_custom_question_overrides_preset(self):
	        """A non-empty custom question is what run_comparison receives."""
	        stub_outcome = (self._r1_result(), self._llama_result())

	        _, fake_run = self._compare_with_stub(stub_outcome, "key", "preset", "custom question")

	        _, asked = fake_run.call_args.args
	        assert asked == "custom question"

	    def test_preset_used_when_custom_is_empty(self):
	        """With an empty custom question, run_comparison receives the preset."""
	        stub_outcome = (self._r1_result(), self._llama_result())

	        _, fake_run = self._compare_with_stub(stub_outcome, "key", "preset question", "")

	        _, asked = fake_run.call_args.args
	        assert asked == "preset question"

	    def test_returns_five_values(self):
	        """The return value has length 5."""
	        stub_outcome = (self._r1_result(), self._llama_result())

	        rendered, _ = self._compare_with_stub(stub_outcome, "key", "preset", "")

	        assert len(rendered) == 5

	    def test_r1_error_result_surfaces_error_text(self):
	        """An error dict for the first model shows up in the second output slot."""
	        stub_outcome = ({"error": "timeout"}, self._llama_result())

	        rendered, _ = self._compare_with_stub(stub_outcome, "key", "preset", "")
	        _, r1_answer, _, _, _ = rendered

	        assert "Error" in r1_answer
	        assert "timeout" in r1_answer

	    def test_llama_error_result_surfaces_error_text(self):
	        """An error dict for the second model shows up in the fourth output slot."""
	        stub_outcome = (self._r1_result(), {"error": "503"})

	        rendered, _ = self._compare_with_stub(stub_outcome, "key", "preset", "")
	        _, _, _, llama_answer, _ = rendered

	        assert "Error" in llama_answer
	        assert "503" in llama_answer

	    def test_custom_whitespace_only_falls_back_to_preset(self):
	        """A custom question made only of whitespace is ignored in favour of the preset."""
	        stub_outcome = (self._r1_result(), self._llama_result())

	        _, fake_run = self._compare_with_stub(stub_outcome, "key", "preset q", " ")

	        _, asked = fake_run.call_args.args
	        assert asked == "preset q"

	    def test_no_question_returns_error_message(self):
	        """With neither preset nor custom text, both answer slots say 'No question'."""
	        from app import compare

	        outputs = compare("valid-key", "", "")
	        r1_reasoning, r1_answer, r1_stats, llama_answer, llama_stats = outputs

	        assert "No question" in r1_answer
	        assert "No question" in llama_answer


	# ---------------------------------------------------------------------------
	# Phase 3 — build_ui (smoke test)
	# ---------------------------------------------------------------------------


	class TestBuildUi:
	    """Smoke check that build_ui() constructs a Gradio container."""

	    def test_build_ui_returns_gradio_blocks(self):
	        """build_ui() returns a gr.Blocks or gr.TabbedInterface instance."""
	        import gradio as gr
	        from app import build_ui

	        built = build_ui()
	        accepted_types = (gr.Blocks, gr.TabbedInterface)

	        assert isinstance(built, accepted_types)


	# ---------------------------------------------------------------------------
	# Phase 3 — _stats_to_html
	# ---------------------------------------------------------------------------


	class TestStatsToHtml:
	    """Unit tests for _stats_to_html(stats_md).

	    The inputs are small Markdown fragments; the checks look for the HTML
	    tags expected in the returned string.
	    """

	    def test_bold_markdown_becomes_strong_tag(self):
	        """**Key:** should become <strong>Key:</strong>."""
	        from app import _stats_to_html

	        # The input must carry the literal ** markers. Without them there is
	        # no bold Markdown to convert and the test does not match its name.
	        result = _stats_to_html("**Prompt tokens:** 10")

	        assert "<strong>Prompt tokens:</strong>" in result

	    def test_double_newline_becomes_br(self):
	        """Two-space + newline (markdown line break) becomes <br>."""
	        from app import _stats_to_html

	        # A Markdown hard line break is two trailing spaces then a newline.
	        result = _stats_to_html("line1  \nline2")

	        assert "<br>" in result

	    def test_returns_string(self):
	        """Return type is always str."""
	        from app import _stats_to_html

	        assert isinstance(_stats_to_html(""), str)


	# ---------------------------------------------------------------------------
	# Phase 3 — _build_card
	# ---------------------------------------------------------------------------


	class TestBuildCard:
	    """Unit tests for _build_card(...).

	    The tests call _build_card with six positional strings (question,
	    reasoning, answer A, stats A, answer B, stats B) and optional
	    model_a_label / model_b_label keywords.
	    """

	    def test_returns_string(self):
	        """_build_card must return a string."""
	        from app import _build_card

	        result = _build_card("Q?", "reasoning", "answer A", "stats A", "answer B", "stats B")

	        assert isinstance(result, str)

	    def test_question_appears_in_output(self):
	        """The question text must appear in the card HTML."""
	        from app import _build_card

	        result = _build_card("What is 2+2?", "", "4", "", "4", "")

	        assert "What is 2+2?" in result

	    def test_model_labels_appear_in_output(self):
	        """model_a_label and model_b_label must appear in the card HTML."""
	        from app import _build_card

	        result = _build_card(
	            "Q?", "", "a", "", "b", "",
	            model_a_label="MyModelA", model_b_label="MyModelB",
	        )

	        assert "MyModelA" in result
	        assert "MyModelB" in result

	    def test_html_escapes_question(self):
	        """HTML special characters in the question must be escaped."""
	        from app import _build_card

	        result = _build_card("<script>", "", "a", "", "b", "")

	        # The raw tag must be absent and its entity-escaped form present.
	        # Asserting the raw tag both absent and present can never pass.
	        assert "<script>" not in result
	        assert "&lt;script&gt;" in result


	# ---------------------------------------------------------------------------
	# Phase 5 — FREE_MODELS registry
	# ---------------------------------------------------------------------------


	class TestFreeModels:
	    """Shape and content checks for the FREE_MODELS registry."""

	    def test_free_models_is_non_empty(self):
	        """The registry holds at least one entry."""
	        from app import FREE_MODELS

	        entry_count = len(FREE_MODELS)
	        assert entry_count > 0

	    def test_free_models_contains_2_tuples_of_strings(self):
	        """Each entry is a tuple of exactly two str values."""
	        from app import FREE_MODELS

	        for record in FREE_MODELS:
	            assert isinstance(record, tuple)
	            assert len(record) == 2
	            assert all(isinstance(field, str) for field in record)

	    def test_free_models_labels_are_non_empty(self):
	        """No first element is blank after stripping whitespace."""
	        from app import FREE_MODELS

	        for display_name, _ in FREE_MODELS:
	            is_blank = display_name.strip() == ""
	            assert not is_blank

	    def test_free_models_ids_are_non_empty(self):
	        """No second element is blank after stripping whitespace."""
	        from app import FREE_MODELS

	        for _, identifier in FREE_MODELS:
	            is_blank = identifier.strip() == ""
	            assert not is_blank

	    def test_model_r1_default_present_in_free_models(self):
	        """MODEL_R1 is among the second elements of FREE_MODELS."""
	        from app import FREE_MODELS, MODEL_R1

	        known_ids = []
	        for _, identifier in FREE_MODELS:
	            known_ids.append(identifier)
	        assert MODEL_R1 in known_ids

	    def test_model_llama_default_present_in_free_models(self):
	        """MODEL_LLAMA is among the second elements of FREE_MODELS."""
	        from app import FREE_MODELS, MODEL_LLAMA

	        known_ids = []
	        for _, identifier in FREE_MODELS:
	            known_ids.append(identifier)
	        assert MODEL_LLAMA in known_ids


	# ---------------------------------------------------------------------------
	# Phase 5 — extended call_openrouter (temperature / max_tokens)
	# ---------------------------------------------------------------------------


	class TestCallOpenrouterInferenceParams:
	    """Checks on how call_openrouter's temperature / max_tokens kwargs reach the POST body."""

	    @staticmethod
	    def _sent_payload(**inference_kwargs):
	        """Call call_openrouter("key", "model", "q", **inference_kwargs) and return the json kwarg posted."""
	        from app import call_openrouter

	        with patch("openrouter.requests.post") as fake_post:
	            fake_post.return_value.json.return_value = {}
	            fake_post.return_value.raise_for_status = MagicMock()

	            call_openrouter("key", "model", "q", **inference_kwargs)

	        _, keywords = fake_post.call_args
	        return keywords["json"]

	    def test_temperature_included_in_payload_when_set(self):
	        """A supplied temperature is present in the posted body."""
	        body = self._sent_payload(temperature=0.7)

	        assert body["temperature"] == 0.7

	    def test_max_tokens_included_in_payload_when_set(self):
	        """A supplied max_tokens is present in the posted body."""
	        body = self._sent_payload(max_tokens=512)

	        assert body["max_tokens"] == 512

	    def test_temperature_absent_when_none(self):
	        """Without a temperature argument the body has no 'temperature' key."""
	        body = self._sent_payload()

	        assert "temperature" not in body

	    def test_max_tokens_absent_when_none(self):
	        """Without a max_tokens argument the body has no 'max_tokens' key."""
	        body = self._sent_payload()

	        assert "max_tokens" not in body

	    def test_both_params_sent_when_both_set(self):
	        """Supplying both arguments puts both keys in the posted body."""
	        body = self._sent_payload(temperature=1.0, max_tokens=256)

	        assert body["temperature"] == 1.0
	        assert body["max_tokens"] == 256


	# ---------------------------------------------------------------------------
	# Phase 5 — _call_model
	# ---------------------------------------------------------------------------


	class TestCallModel:
	    """Checks on _call_model(api_key, model_id, prompt, ...) with app.call_openrouter patched."""

	    def _make_response(self, content, reasoning=None, reasoning_tokens=0):
	        """Build a response dict; optionally add message.reasoning and reasoning token details."""
	        message = {"content": content}
	        payload = {
	            "choices": [{"message": message}],
	            "usage": {
	                "prompt_tokens": 10,
	                "completion_tokens": 50,
	            },
	        }
	        if reasoning is not None:
	            payload["choices"][0]["message"]["reasoning"] = reasoning
	        if reasoning_tokens > 0:
	            payload["usage"]["completion_tokens_details"] = {"reasoning_tokens": reasoning_tokens}
	        return payload

	    @staticmethod
	    def _invoke(reply, model_id="some/model", **extra_kwargs):
	        """Call _call_model("key", model_id, "q?", **extra_kwargs) with the stub returning *reply*.

	        Returns (_call_model result, the mock that replaced call_openrouter).
	        """
	        from app import _call_model

	        with patch("app.call_openrouter", return_value=reply) as fake_call:
	            produced = _call_model("key", model_id, "q?", **extra_kwargs)

	        return produced, fake_call

	    def test_returns_dict_with_answer_and_usage(self):
	        """The result contains both 'answer' and 'usage'."""
	        produced, _ = self._invoke(self._make_response("plain answer"))

	        assert "answer" in produced
	        assert "usage" in produced

	    def test_reasoning_from_dedicated_field(self):
	        """message.reasoning, when present, becomes the result's reasoning."""
	        reply = self._make_response("The answer.", reasoning="deep thought")

	        produced, _ = self._invoke(reply)

	        assert produced["reasoning"] == "deep thought"
	        assert produced["answer"] == "The answer."

	    def test_reasoning_falls_back_to_think_block(self):
	        """Without message.reasoning, a think block in the content supplies the reasoning."""
	        reply = self._make_response("<think>my reasoning</think>My answer.")

	        produced, _ = self._invoke(reply)

	        assert "my reasoning" in produced["reasoning"]
	        assert produced["answer"] == "My answer."

	    def test_no_reasoning_gives_empty_reasoning(self):
	        """Plain content with neither source of reasoning gives reasoning ''."""
	        reply = self._make_response("Just the answer.")

	        produced, _ = self._invoke(reply)

	        assert produced["reasoning"] == ""
	        assert produced["answer"] == "Just the answer."

	    def test_forwards_temperature_to_call_openrouter(self):
	        """temperature reaches call_openrouter as a keyword argument."""
	        _, fake_call = self._invoke(self._make_response("answer"), temperature=0.5)

	        _, keywords = fake_call.call_args
	        assert keywords.get("temperature") == 0.5

	    def test_forwards_max_tokens_to_call_openrouter(self):
	        """max_tokens reaches call_openrouter as a keyword argument."""
	        _, fake_call = self._invoke(self._make_response("answer"), max_tokens=128)

	        _, keywords = fake_call.call_args
	        assert keywords.get("max_tokens") == 128

	    def test_calls_openrouter_with_correct_model_id(self):
	        """The model ID is the second positional argument given to call_openrouter."""
	        _, fake_call = self._invoke(self._make_response("answer"), model_id="my/custom-model")

	        assert fake_call.call_args.args[1] == "my/custom-model"


	# ---------------------------------------------------------------------------
	# Phase 5 — updated run_comparison (explicit model IDs + params)
	# ---------------------------------------------------------------------------


	class TestRunComparisonWithModels:
	    """Tests for run_comparison with explicit model_a, model_b, params.

	    app.call_openrouter is patched in every test, so no HTTP request is made.
	    """

	    def _make_response(self, content):
	        """Build a minimal response dict with one choice and a usage block."""
	        return {
	            "choices": [{"message": {"content": content}}],
	            "usage": {"prompt_tokens": 5, "completion_tokens": 20},
	        }

	    def _run_and_record(self, **comparison_kwargs):
	        """Run run_comparison("key", "q?", **comparison_kwargs) and record stub calls.

	        Returns a list of (model_id, kwargs) pairs, one per call made to the
	        patched call_openrouter.
	        """
	        from app import run_comparison

	        resp = self._make_response("answer")
	        calls_seen = []

	        # The stub must take *args and **kwargs: it is called with keyword
	        # arguments (temperature / max_tokens), and the model ID is read from
	        # positional index 1, as elsewhere in this file.
	        def capture(*args, **kwargs):
	            calls_seen.append((args[1], kwargs))
	            return resp

	        with patch("app.call_openrouter", side_effect=capture):
	            run_comparison("key", "q?", **comparison_kwargs)

	        return calls_seen

	    def test_uses_provided_model_a_id(self):
	        """run_comparison must call _call_model with the model_a ID."""
	        from app import run_comparison

	        resp = self._make_response("answer")
	        with patch("app.call_openrouter", return_value=resp) as mock_call:
	            run_comparison("key", "q?", model_a="custom/model-a", model_b="custom/model-b")

	        called_models = [call.args[1] for call in mock_call.call_args_list]
	        assert "custom/model-a" in called_models

	    def test_uses_provided_model_b_id(self):
	        """run_comparison must call _call_model with the model_b ID."""
	        from app import run_comparison

	        resp = self._make_response("answer")
	        with patch("app.call_openrouter", return_value=resp) as mock_call:
	            run_comparison("key", "q?", model_a="custom/model-a", model_b="custom/model-b")

	        called_models = [call.args[1] for call in mock_call.call_args_list]
	        assert "custom/model-b" in called_models

	    def test_params_a_forwarded_to_model_a(self):
	        """params_a temperature/max_tokens must reach call_openrouter for model_a."""
	        calls_seen = self._run_and_record(
	            model_a="model-a", model_b="model-b",
	            params_a={"temperature": 0.3, "max_tokens": 64},
	        )

	        model_a_call = next(kw for m, kw in calls_seen if m == "model-a")
	        assert model_a_call.get("temperature") == 0.3
	        assert model_a_call.get("max_tokens") == 64

	    def test_params_b_forwarded_to_model_b(self):
	        """params_b temperature/max_tokens must reach call_openrouter for model_b."""
	        calls_seen = self._run_and_record(
	            model_a="model-a", model_b="model-b",
	            params_b={"temperature": 1.5, "max_tokens": 256},
	        )

	        model_b_call = next(kw for m, kw in calls_seen if m == "model-b")
	        assert model_b_call.get("temperature") == 1.5
	        assert model_b_call.get("max_tokens") == 256

	    def test_defaults_use_model_r1_and_model_llama(self):
	        """When model_a/model_b omitted, defaults to MODEL_R1 and MODEL_LLAMA."""
	        from app import run_comparison, MODEL_R1, MODEL_LLAMA

	        resp = self._make_response("answer")
	        with patch("app.call_openrouter", return_value=resp) as mock_call:
	            run_comparison("key", "q?")

	        called_models = [call.args[1] for call in mock_call.call_args_list]
	        assert MODEL_R1 in called_models
	        assert MODEL_LLAMA in called_models