Applicant-Task-Submission-Test

Running

App Files Files Community

Applicant-Task-Submission-Test / app.py

GreenStar24

small

67eefaa about 1 month ago

raw

history blame contribute delete

79.2 kB

	import json
	import logging
	import os
	import re
	import threading
	import time
	import traceback
	from collections import Counter
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from datetime import datetime, timezone

	import gradio as gr
	import gspread
	from google.oauth2.service_account import Credentials
	from openai import OpenAI

	# OAuth scopes handed to the service-account credentials in get_google_sheet():
	# one URL for Google Sheets and one for Google Drive.
	SCOPES = [
	"https://www.googleapis.com/auth/spreadsheets",
	"https://www.googleapis.com/auth/drive",
	]

	# Model used when OPENAI_MODEL_NAME is not set (see get_model_name).
	DEFAULT_MODEL_NAME = "gpt-5-mini"
	# Fallback worker/concurrency limits; each is overridable through the
	# environment variable read by the matching get_*() helper.
	DEFAULT_EVAL_MAX_WORKERS = 4
	DEFAULT_LLM2_MAX_WORKERS = 8
	DEFAULT_OPENAI_MAX_CONCURRENT_REQUESTS = 16
	# NOTE(review): presumably how many times each case is evaluated; its use is
	# not visible in this part of the file - confirm.
	EVAL_REPEAT_COUNT = 3
	# NOTE(review): presumably the accepted submission kinds; the visible message
	# builders compare against "practice" directly - confirm where this set is used.
	SUBMISSION_TYPES = {"practice", "final"}
	# Tag pair that extract_user_message() looks for in LLM 1 output; an output
	# containing the pair is treated as the "irrelevant" route.
	IRRELEVANT_TAG_START = "<user_message>"
	IRRELEVANT_TAG_END = "</user_message>"
	# Divider line placed between blocks by join_sheet_sections().
	SHEET_SECTION_DIVIDER = "=" * 50
	# Character budgets used when truncating text for spreadsheet cells.
	# NOTE(review): the stage-one and stage-three limits are not referenced in the
	# visible part of the file.
	STAGE_ONE_OUTPUT_MAX_CHARS = 4000
	STAGE_TWO_OUTPUT_MAX_CHARS = 2000
	STAGE_THREE_OUTPUT_MAX_CHARS = 4000
	PROMPT_CELL_MAX_CHARS = 8000
	EXPECTED_OUTPUT_MAX_CHARS = 4000
	SHEET_CELL_MAX_CHARS = 49000

	# Final-answer fields that each name the counsel for one party.
	COUNSEL_FIELDS = ("buyer_counsel", "seller_counsel", "third_party_counsel")
	TARGET_FIRM_FIELD = "target_firm"
	USER_QUESTION_FIELD = "user_question"
	# Keys graded in the final JSON answer: the counsel fields plus the user question.
	FINAL_SCHEMA_KEYS = (*COUNSEL_FIELDS, USER_QUESTION_FIELD)
	# Matches one footnote-style citation such as "[^3]" and captures its number.
	CITATION_PATTERN = re.compile(r"\[\^(\d+)\]")
	# Matches an answer that is exactly true/false/yes/no (case-insensitive).
	# The alternation must use a bare "|": an escaped "\|" matches a literal pipe
	# character, which would make the pattern reject every real boolean answer.
	BOOLEAN_ANSWER_PATTERN = re.compile(r"^(true|false|yes|no)\b$", re.IGNORECASE)

	# NOTE(review): not referenced in the visible part of the file; presumably an
	# in-memory record of attempts - confirm against the submission handler.
	LOCAL_ATTEMPTS = {}
	# OpenAI client created on first use by get_openai_client().
	_OPENAI_CLIENT = None
	# Semaphore bounding concurrent OpenAI requests and the limit it was built
	# with; both are (re)assigned by get_openai_request_semaphore() while holding
	# _OPENAI_REQUEST_SEMAPHORE_LOCK.
	_OPENAI_REQUEST_SEMAPHORE = None
	_OPENAI_REQUEST_SEMAPHORE_LIMIT = None
	_OPENAI_REQUEST_SEMAPHORE_LOCK = threading.Lock()
	# Log level name comes from LOG_LEVEL (default "INFO"); a name that is not an
	# attribute of the logging module falls back to logging.INFO.
	_LOG_LEVEL_NAME = os.environ.get("LOG_LEVEL", "INFO").upper()
	logging.basicConfig(level=getattr(logging, _LOG_LEVEL_NAME, logging.INFO))
	logger = logging.getLogger(__name__)


	# Ordered (internal key, header label) pairs for the submissions worksheet.
	SHEET_COLUMNS = [
	    ("timestamp", "Submitted At"),
	    ("submission_type", "Submission Type"),
	    ("name", "Candidate Name"),
	    ("email", "Candidate Email"),
	    ("results", "Results"),
	    ("results_without_llm_1", "Results LLM3"),
	    ("llm_score_breakdown", "LLM Score Breakdown"),
	    ("llm_output_vs_expected", "LLM Output vs Expected"),
	    ("llm_2_output", "LLM 2 Output"),
	    ("prompts", "Prompts"),
	]
	# Header labels in column order, derived from SHEET_COLUMNS.
	SHEET_HEADERS = [column[1] for column in SHEET_COLUMNS]

	# Stylesheet text for the ".submission-note" box: light-theme defaults come
	# from the var() fallbacks, and the same three custom properties are overridden
	# both under the prefers-color-scheme: dark media query and under a ".dark"
	# ancestor class.
	APP_CSS = """
	.submission-note {
	background: var(--submission-note-bg, #fff7e6);
	border: 1px solid var(--submission-note-border, #ffe5b4);
	border-radius: 8px;
	color: var(--submission-note-text, #3d2b00);
	margin-bottom: 1em;
	padding: 16px;
	}

	@media (prefers-color-scheme: dark) {
	.submission-note {
	--submission-note-bg: #2b2111;
	--submission-note-border: #6b4b18;
	--submission-note-text: #f7e8c5;
	}
	}

	.dark .submission-note {
	--submission-note-bg: #2b2111;
	--submission-note-border: #6b4b18;
	--submission-note-text: #f7e8c5;
	}
	"""


	def get_model_name():
	    """Return the model name from OPENAI_MODEL_NAME, or DEFAULT_MODEL_NAME if unset."""
	    configured = os.environ.get("OPENAI_MODEL_NAME")
	    if configured is None:
	        return DEFAULT_MODEL_NAME
	    return configured


	def get_positive_int_env(name, default):
	    """Read environment variable *name* as an integer of at least 1.

	    When the variable is unset, *default* is used. When the text cannot be
	    parsed as an integer, a warning is logged and *default* is returned
	    unchanged. Parsed values below 1 are raised to 1.
	    """
	    raw_value = os.environ.get(name)
	    if raw_value is None:
	        raw_value = str(default)
	    try:
	        parsed = int(raw_value)
	    except ValueError:
	        logger.warning("Invalid integer for %s=%r; using default %s.", name, raw_value, default)
	        return default
	    return parsed if parsed > 1 else 1


	def get_eval_max_workers():
	    """Return the evaluation worker count.

	    EVAL_CASE_MAX_WORKERS takes precedence whenever it is present in the
	    environment; otherwise EVAL_MAX_WORKERS is consulted. Both fall back to
	    DEFAULT_EVAL_MAX_WORKERS.
	    """
	    if "EVAL_CASE_MAX_WORKERS" in os.environ:
	        env_name = "EVAL_CASE_MAX_WORKERS"
	    else:
	        env_name = "EVAL_MAX_WORKERS"
	    return get_positive_int_env(env_name, DEFAULT_EVAL_MAX_WORKERS)


	def get_llm2_max_workers():
	    """Return the LLM 2 worker count from LLM2_MAX_WORKERS (default DEFAULT_LLM2_MAX_WORKERS)."""
	    worker_count = get_positive_int_env("LLM2_MAX_WORKERS", DEFAULT_LLM2_MAX_WORKERS)
	    return worker_count


	def get_openai_max_concurrent_requests():
	    """Return the concurrent-request cap from OPENAI_MAX_CONCURRENT_REQUESTS."""
	    env_name = "OPENAI_MAX_CONCURRENT_REQUESTS"
	    return get_positive_int_env(env_name, DEFAULT_OPENAI_MAX_CONCURRENT_REQUESTS)


	def get_openai_request_semaphore():
	    """Return the shared semaphore that bounds concurrent OpenAI requests.

	    The semaphore is created on first use and replaced with a fresh one
	    whenever the configured limit differs from the limit it was built with.
	    """
	    global _OPENAI_REQUEST_SEMAPHORE, _OPENAI_REQUEST_SEMAPHORE_LIMIT

	    configured_limit = get_openai_max_concurrent_requests()
	    with _OPENAI_REQUEST_SEMAPHORE_LOCK:
	        needs_rebuild = (
	            _OPENAI_REQUEST_SEMAPHORE is None
	            or _OPENAI_REQUEST_SEMAPHORE_LIMIT != configured_limit
	        )
	        if needs_rebuild:
	            _OPENAI_REQUEST_SEMAPHORE = threading.BoundedSemaphore(configured_limit)
	            _OPENAI_REQUEST_SEMAPHORE_LIMIT = configured_limit
	        return _OPENAI_REQUEST_SEMAPHORE


	def get_openai_client():
	    """Return the module-wide OpenAI client, creating it on first use.

	    Raises:
	        RuntimeError: if OPENAI_API_KEY is unset or empty when the client
	            has to be created.
	    """
	    global _OPENAI_CLIENT

	    if _OPENAI_CLIENT is not None:
	        return _OPENAI_CLIENT

	    api_key = os.environ.get("OPENAI_API_KEY")
	    if not api_key:
	        raise RuntimeError("OPENAI_API_KEY is not set.")
	    _OPENAI_CLIENT = OpenAI(api_key=api_key)
	    return _OPENAI_CLIENT


	def get_google_sheet(ensure_headers=True):
	    """Open the "Submissions" worksheet of the configured spreadsheet.

	    Credentials are read from GOOGLE_CREDS_JSON (service-account JSON) and
	    the spreadsheet key from SPREADSHEET_ID. When *ensure_headers* is true
	    the header row is written before the worksheet is returned.

	    Raises:
	        RuntimeError: if either environment variable is missing or empty.
	    """
	    creds_json = os.environ.get("GOOGLE_CREDS_JSON")
	    spreadsheet_id = os.environ.get("SPREADSHEET_ID")
	    if not (creds_json and spreadsheet_id):
	        raise RuntimeError("GOOGLE_CREDS_JSON and SPREADSHEET_ID must be set.")

	    service_account_info = json.loads(creds_json)
	    credentials = Credentials.from_service_account_info(
	        service_account_info,
	        scopes=SCOPES,
	    )
	    client = gspread.authorize(credentials)
	    worksheet = client.open_by_key(spreadsheet_id).worksheet("Submissions")
	    if ensure_headers:
	        ensure_submission_sheet_headers(worksheet)
	    return worksheet


	def load_eval_set(prefix):
	    """Load an evaluation set from three ``{prefix}_*_JSON`` environment variables.

	    Returns ``{"cases": [...]}`` where each case holds the question text, the
	    normalized document list and the normalized expected answer. If any of
	    the three variables is missing or empty, the case list is empty.

	    Raises:
	        ValueError: if any decoded value is not a list, or the three lists
	            differ in length.
	    """
	    questions_str = os.environ.get(f"{prefix}_QUESTIONS_JSON")
	    documents_str = os.environ.get(f"{prefix}_DOCUMENTS_JSON")
	    expected_str = os.environ.get(f"{prefix}_EXPECTED_JSON")
	    if not all((questions_str, documents_str, expected_str)):
	        return {"cases": []}

	    questions = json.loads(questions_str)
	    documents = json.loads(documents_str)
	    expected = json.loads(expected_str)

	    if not all(isinstance(field, list) for field in (questions, documents, expected)):
	        raise ValueError(f"{prefix} dataset must be a JSON list for all fields.")

	    if not (len(questions) == len(documents) == len(expected)):
	        raise ValueError(
	            f"{prefix} dataset lengths do not match for questions, documents, and expected answers."
	        )

	    cases = []
	    rows = zip(questions, documents, expected)
	    for case_index, (question, docs_entry, expected_entry) in enumerate(rows, start=1):
	        question_text = str(question)
	        docs = normalize_documents_entry(docs_entry)
	        expected_value = normalize_expected_entry(
	            expected_entry,
	            prefix,
	            case_index,
	            question=question_text,
	            docs=docs,
	        )
	        cases.append({"question": question_text, "docs": docs, "expected": expected_value})

	    return {"cases": cases}


	def build_submission_response(
	    ok,
	    message,
	    notice="",
	    disable_practice=False,
	    disable_final=False,
	):
	    """Bundle the outcome of a submission into a plain dict with five fixed keys."""
	    return dict(
	        ok=ok,
	        message=message,
	        notice=notice,
	        disable_practice=disable_practice,
	        disable_final=disable_final,
	    )


	def build_internal_error_message(submission_type):
	    """Return the generic internal-error text, worded for a practice run or a submission."""
	    if submission_type == "practice":
	        label = "practice run"
	    else:
	        label = "submission"
	    message = (
	        f"We hit an internal error while processing your {label}. "
	        "Please try again in a minute."
	    )
	    return message


	def build_stage_error_message(submission_type, stage):
	    """Return the internal-error text that also names the failing *stage*."""
	    if submission_type == "practice":
	        label = "practice run"
	    else:
	        label = "submission"
	    message = (
	        f"We hit an internal error while processing your {label}. "
	        f"Stage: {stage}."
	    )
	    return message


	def build_save_error_message(submission_type, stage="sheet_write"):
	    """Return the text shown when a run could not be saved, naming the failing *stage*."""
	    if submission_type == "practice":
	        label = "practice run"
	    else:
	        label = "submission"
	    message = (
	        f"We could not save your {label} right now. "
	        f"Stage: {stage}. Please check the Space logs and spreadsheet permissions, then try again."
	    )
	    return message


	def build_bypass_save_notice():
	    """Return the notice explaining that the run was graded but not saved."""
	    notice = "Debug mode is active: this run was graded but not saved to Google Sheets."
	    return notice


	def should_bypass_sheet_save():
	    """Return True when BYPASS_SHEET_SAVE is "true" (ignoring case and surrounding whitespace)."""
	    flag = os.environ.get("BYPASS_SHEET_SAVE", "")
	    return flag.strip().lower() == "true"


	def log_stage_exception(message, *args):
	    """Log the active exception through the module logger and also print its traceback to stdout."""
	    logger.exception(message, *args)
	    trace_text = traceback.format_exc()
	    print(trace_text, flush=True)


	def safe_evaluate_case(case, prompts, case_index):
	    """Run evaluate_case() and convert any failure into a RuntimeError.

	    The original exception is logged with its traceback and chained as the
	    cause of the RuntimeError.
	    """
	    try:
	        result = evaluate_case(case, prompts, case_index=case_index)
	    except Exception as exc:
	        logger.exception("Case %s failed during evaluation.", case_index)
	        raise RuntimeError(f"Evaluation failed for case {case_index}.") from exc
	    else:
	        return result


	def normalize_documents_entry(entry):
	    """Turn one raw documents entry into a list of non-empty snippet strings.

	    Args:
	        entry: either a list of snippets, or a single value whose string
	            form may hold several snippets separated by ``---``.

	    Returns:
	        A list of stripped, non-empty strings. ``None`` (for example a JSON
	        ``null``) yields no snippet at all instead of the literal text
	        ``"None"``, both as the whole entry and as a list item.
	    """
	    if entry is None:
	        return []

	    if isinstance(entry, list):
	        stripped_items = (str(item).strip() for item in entry if item is not None)
	        return [item for item in stripped_items if item]

	    text = str(entry).strip()
	    if not text:
	        return []

	    # Splitting text without a "---" separator yields the text itself, so the
	    # single-snippet case needs no separate branch.
	    parts = (part.strip() for part in text.split("---"))
	    return [part for part in parts if part]


	def get_sheet_column_letter(index):
	    """Convert a 1-based column number to spreadsheet letters (1 -> "A", 27 -> "AA").

	    An index of 0 or less yields an empty string.
	    """
	    reversed_letters = []
	    remaining = index
	    while remaining > 0:
	        remaining, offset = divmod(remaining - 1, 26)
	        reversed_letters.append(chr(ord("A") + offset))
	    return "".join(reversed(reversed_letters))


	def ensure_submission_sheet_headers(sheet):
	    """Write SHEET_HEADERS into row 1 of *sheet*, starting at column A."""
	    last_column = get_sheet_column_letter(len(SHEET_HEADERS))
	    header_range = f"A1:{last_column}1"
	    sheet.update(header_range, [SHEET_HEADERS])


	def get_next_submission_row_index(sheet):
	    """Return the 1-based row number just below the last non-blank data row.

	    Row 1 is skipped as the header row, so the result is never less than 2.
	    """
	    last_data_row = 1
	    for row_number, row in enumerate(sheet.get_all_values(), start=1):
	        is_header = row_number == 1
	        if not is_header and any(str(cell).strip() for cell in row):
	            last_data_row = row_number
	    return last_data_row + 1


	def serialize_sheet_value(value):
	    """Render *value* as cell text: JSON for dicts/lists, "" for None, str() otherwise."""
	    if value is None:
	        return ""
	    if isinstance(value, (dict, list)):
	        return serialize_json(value)
	    return str(value)


	def normalize_expected_mapping(entry, prefix, case_index):
	    """Validate and normalize an expected-answer mapping.

	    Returns a dict holding the whitespace-normalized target firm and one
	    entry per counsel field; counsel values that are missing, None or blank
	    become "unknown".

	    Raises:
	        ValueError: if the target firm is missing, None, blank, or the
	            text "none" (case-insensitive).
	    """
	    raw_target_firm = entry.get(TARGET_FIRM_FIELD)
	    target_firm = "" if raw_target_firm is None else normalize_whitespace(raw_target_firm)
	    if not target_firm or target_firm.lower() == "none":
	        raise ValueError(
	            f"{prefix} case {case_index} is missing a non-empty '{TARGET_FIRM_FIELD}'."
	        )

	    normalized = {TARGET_FIRM_FIELD: target_firm}
	    for field_name in COUNSEL_FIELDS:
	        field_value = entry.get(field_name, "unknown")
	        cleaned = "" if field_value is None else str(field_value).strip()
	        normalized[field_name] = cleaned or "unknown"
	    return normalized


	def extract_target_candidate_from_question(question):
	    """Pull the entity named in an "Is <entity> ...?" question.

	    The specific phrasings are tried first, then a catch-all that takes
	    everything after "Is". Returns "" when the question does not start
	    with "Is".
	    """
	    question_text = normalize_whitespace(question)
	    specific_patterns = (
	        r"^Is\s+(.+?)\s+present\s+in\s+the\s+agreement\??$",
	        r"^Is\s+(.+?)\s+mentioned\s+in\s+the\s+agreement\??$",
	        r"^Is\s+(.+?)\s+acting\s+as\s+counsel.*\??$",
	        r"^Is\s+(.+?)\s+anywhere\s+in\s+this\s+Asset\s+Purchase\s+Agreement\??$",
	        r"^Is\s+(.+?)\s+in\s+the\s+agreement\??$",
	    )
	    catch_all_pattern = r"^Is\s+(.+?)\??$"

	    for pattern in (*specific_patterns, catch_all_pattern):
	        found = re.match(pattern, question_text, re.IGNORECASE)
	        if found is not None:
	            return normalize_whitespace(found.group(1))
	    return ""


	def extract_entity_candidates(text):
	    """Find capitalized names that end in a company suffix.

	    A candidate is a run of capitalized words (optionally joined by "&")
	    followed by one of LLP, LLC, LP, Inc, Corporation, Corp or Ltd, for
	    example "Kirkland & Ellis LLP".

	    Returns:
	        The whitespace-normalized matches, in order of appearance.
	    """
	    # Two details matter here: the alternations use a bare "|" (an escaped
	    # "\|" would only match a literal pipe), and each name token carries a
	    # "*" so that words of any length are accepted rather than exactly two
	    # characters.
	    pattern = re.compile(
	        r"\b([A-Z][A-Za-z0-9'.,-]*(?:\s+(?:&|[A-Z][A-Za-z0-9'.,-]*))*\s+"
	        r"(?:LLP|LLC|LP|Inc\.?|Corporation|Corp\.?|Ltd\.?))\b"
	    )
	    return [normalize_whitespace(match.group(1)) for match in pattern.finditer(str(text))]


	def build_legacy_expected_mapping(entry, question, docs):
	"""Convert a legacy expected entry into the current expected-answer mapping.

	The legacy keys buyer_firm / seller_firm / third_party are copied to
	buyer_counsel / seller_counsel / third_party_counsel (blank values become
	"unknown"), and a target firm is derived from the question text.

	NOTE(review): indentation is lost in this copy of the file. The two
	``else:`` clauses below read most naturally as ``for ... else`` clauses
	(run only when the loop finishes without ``break``) - confirm against
	the original source before restructuring.
	"""
	normalized = {
	"buyer_counsel": normalize_whitespace(entry.get("buyer_firm", "unknown")) or "unknown",
	"seller_counsel": normalize_whitespace(entry.get("seller_firm", "unknown")) or "unknown",
	"third_party_counsel": normalize_whitespace(entry.get("third_party", "unknown")) or "unknown",
	}

	# Entity named in the question, plus its normalized form for substring matching.
	target_candidate = extract_target_candidate_from_question(question)
	target_norm = normalize_counsel_value(target_candidate)

	# First choice: a counsel value whose normalized text contains the target.
	for candidate in normalized.values():
	candidate_text = normalize_whitespace(candidate)
	if not candidate_text:
	continue
	if target_norm and target_norm in normalize_counsel_value(candidate_text):
	# A key is added while iterating normalized.values(); the break that
	# follows stops the iteration before the iterator is advanced again.
	normalized[TARGET_FIRM_FIELD] = candidate_text
	break
	else:
	# Second choice: an entity name extracted from the documents.
	doc_candidates = []
	for doc in docs:
	doc_candidates.extend(extract_entity_candidates(doc))

	for candidate_text in doc_candidates:
	if target_norm and target_norm in normalize_counsel_value(candidate_text):
	normalized[TARGET_FIRM_FIELD] = candidate_text
	break
	else:
	# Last resort: the raw text from the question, or a fixed placeholder.
	normalized[TARGET_FIRM_FIELD] = target_candidate or "unknown target"

	return normalized


	def normalize_expected_entry(entry, prefix, case_index, question="", docs=None):
	    """Normalize one expected-answer entry.

	    A dict, or a string holding a JSON object, is converted to the expected
	    mapping (via the legacy converter when it has legacy keys and no target
	    firm). Any other string is returned unchanged.

	    Raises:
	        ValueError: if *entry* is neither a string nor a dict, or if a
	            brace-delimited string is not valid JSON.
	    """
	    docs = docs or []
	    legacy_keys = {"buyer_firm", "seller_firm", "third_party", "contains_target_firm"}

	    def normalize_mapping(mapping):
	        # Legacy layout: no target firm key, but at least one legacy key.
	        is_legacy = TARGET_FIRM_FIELD not in mapping and not legacy_keys.isdisjoint(mapping.keys())
	        if is_legacy:
	            return build_legacy_expected_mapping(mapping, question, docs)
	        return normalize_expected_mapping(mapping, prefix, case_index)

	    if isinstance(entry, dict):
	        return normalize_mapping(entry)

	    if not isinstance(entry, str):
	        raise ValueError(
	            f"{prefix} case {case_index} expected entry must be a string or object."
	        )

	    candidate = entry.strip()
	    looks_like_object = candidate.startswith("{") and candidate.endswith("}")
	    if not looks_like_object:
	        return entry

	    try:
	        parsed = json.loads(candidate)
	    except json.JSONDecodeError as exc:
	        raise ValueError(
	            f"{prefix} case {case_index} contains invalid JSON in expected entry: {exc}"
	        ) from exc

	    if isinstance(parsed, dict):
	        return normalize_mapping(parsed)
	    return entry


	def sanitize_input(text, max_length=500):
	    """Drop characters outside the allowed set, strip the ends, and cap the length.

	    Allowed: ASCII letters and digits, whitespace, and ``.,!?@:-+'&()/``.
	    """
	    disallowed = re.compile(r"[^a-zA-Z0-9\s.,!?@:\-+'&()/]")
	    cleaned = disallowed.sub("", text).strip()
	    return cleaned[:max_length]


	def sanitize_prompt(text):
	    """Strip surrounding whitespace and keep at most the first 8000 characters."""
	    trimmed = text.strip()
	    return trimmed[:8000]


	def normalize_email(email):
	    """Return *email* lower-cased with surrounding whitespace removed."""
	    trimmed = email.strip()
	    return trimmed.lower()


	def validate_email(email):
	    """Return True when *email* has the shape ``local@domain.tld``.

	    The whole string must match. ``re.fullmatch`` is used because ``$`` with
	    ``re.match`` also matches just before a trailing newline, which would
	    let an address such as ``"user@example.com\\n"`` pass.
	    """
	    email_regex = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
	    return re.fullmatch(email_regex, email) is not None


	def extract_user_message(text):
	    """Return the text between the first user-message tag pair, or None.

	    The tags are IRRELEVANT_TAG_START / IRRELEVANT_TAG_END; matching is
	    case-insensitive and may span several lines. Empty or missing *text*,
	    and text without a complete tag pair, yield None.
	    """
	    if not text:
	        return None

	    # The flags are combined with the bitwise-or operator; the escaped form
	    # "\|" is not valid Python.
	    pattern = re.compile(
	        rf"{re.escape(IRRELEVANT_TAG_START)}(.*?){re.escape(IRRELEVANT_TAG_END)}",
	        re.IGNORECASE | re.DOTALL,
	    )
	    match = pattern.search(text)
	    if match is None:
	        return None
	    return match.group(1).strip()


	def normalize_whitespace(text):
	    """Return str(text) with each whitespace run collapsed to one space and the ends stripped."""
	    collapsed = re.sub(r"\s+", " ", str(text))
	    return collapsed.strip()


	def normalize_counsel_value(value):
	    """Return a comparison key for a firm name.

	    The text is lower-cased, "&" is spelled out as "and", every character
	    other than a-z, 0-9 and whitespace becomes a space, and whitespace is
	    collapsed. None yields "".
	    """
	    if value is None:
	        return ""

	    lowered = normalize_whitespace(value).lower().replace("&", " and ")
	    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
	    return " ".join(alnum_only.split())


	def value_is_unknown(value):
	    """Return True when *value* is the word "unknown", ignoring case and extra whitespace."""
	    lowered = normalize_whitespace(value).lower()
	    return lowered == "unknown"


	def expected_is_unknown(expected_value):
	    """Return True when the expected value is "unknown" or blank."""
	    if value_is_unknown(expected_value):
	        return True
	    return normalize_whitespace(expected_value) == ""


	def truncate_text(text, max_chars=2000):
	    """Return str(text) shortened to at most *max_chars* characters.

	    Falsy input becomes "". Text that has to be shortened ends in "...",
	    which counts toward the limit. When the limit is too small to hold the
	    ellipsis (below 3), the text is cut hard instead, so the result never
	    exceeds *max_chars*.
	    """
	    text = str(text or "")
	    if len(text) <= max_chars:
	        return text
	    if max_chars < 3:
	        # A negative slice bound would count from the end of the string and
	        # return far more than max_chars characters.
	        return text[: max(0, max_chars)]
	    return f"{text[: max_chars - 3]}..."


	def serialize_json(value):
	    """Serialize *value* to ASCII-only JSON text with object keys sorted."""
	    encoded = json.dumps(value, sort_keys=True, ensure_ascii=True)
	    return encoded


	def normalize_newlines(text):
	    """Return str(text) with every CRLF and lone CR turned into LF; falsy input becomes ""."""
	    raw = str(text or "")
	    # "\r\n?" consumes a CR together with an immediately following LF, so a
	    # CRLF pair becomes a single LF and a lone CR becomes an LF as well.
	    return re.sub(r"\r\n?", "\n", raw)


	def normalize_sheet_block_text(text):
	    """Normalize newlines, drop trailing whitespace on every line, and strip the whole block."""
	    unified = normalize_newlines(text)
	    trimmed_lines = (line.rstrip() for line in unified.split("\n"))
	    return "\n".join(trimmed_lines).strip()


	def format_score(value):
	    """Format a number with up to four decimals, trimming trailing zeros but keeping one decimal place."""
	    fixed = format(float(value), ".4f")
	    trimmed = fixed.rstrip("0").rstrip(".")
	    return trimmed if "." in trimmed else f"{trimmed}.0"


	def join_sheet_sections(blocks):
	    """Join the non-empty *blocks* with the section divider on its own line between them."""
	    separator = f"\n\n{SHEET_SECTION_DIVIDER}\n\n"
	    return separator.join(filter(None, blocks))


	def build_case_run_header(case_index, run_index):
	    """Return the header line for one run of one case, e.g. "CASE 2 | RUN 3".

	    The separator is a plain pipe; writing it as "\\|" inside the f-string
	    would put a stray backslash into the header.
	    """
	    return f"CASE {case_index} | RUN {run_index}"


	def format_run_indices(run_indices):
	    """Render run numbers as "1", "1 AND 2" or "1, 2 AND 3"; no runs gives ""."""
	    labels = list(map(str, run_indices))
	    if len(labels) < 2:
	        return labels[0] if labels else ""
	    *leading, final = labels
	    return f"{', '.join(leading)} AND {final}"


	def build_grouped_case_run_header(case_index, run_indices):
	    """Return the header line for several runs of one case, e.g. "CASE 2 | RUN 1 AND 3".

	    The separator is a plain pipe; writing it as "\\|" inside the f-string
	    would put a stray backslash into the header.
	    """
	    return f"CASE {case_index} | RUN {format_run_indices(run_indices)}"


	def truncate_sheet_block(text, max_chars):
	    """Normalize *text* as a sheet block, then truncate it to *max_chars* characters."""
	    return truncate_text(normalize_sheet_block_text(text), max_chars=max_chars)


	def format_pretty_expected_answer(expected):
	    """Describe the expected final answer for display.

	    A string *expected* marks an irrelevant case and yields a fixed
	    sentence; anything else is pretty-printed as JSON and truncated to
	    EXPECTED_OUTPUT_MAX_CHARS.
	    """
	    if isinstance(expected, str):
	        return "Irrelevant case. No final JSON should be produced because LLM 1 should stop the pipeline."

	    pretty = json.dumps(expected, indent=2, sort_keys=True, ensure_ascii=True)
	    return truncate_text(pretty, max_chars=EXPECTED_OUTPUT_MAX_CHARS)


	def format_llm1_expected_text(expected):
	    """Return the sentence describing what LLM 1 must output for this case.

	    A string *expected* marks an irrelevant case; anything else is a
	    relevant case.
	    """
	    is_irrelevant_case = isinstance(expected, str)
	    if not is_irrelevant_case:
	        return "Relevant case: output must be exactly TARGET_FIRM: <canonical firm name>."
	    return "Irrelevant case: output must be wrapped in <user_message>...</user_message>."


	def format_llm2_user_output(llm2_outputs):
	    """Render the per-snippet LLM 2 outputs as "<id>:" blocks separated by blank lines.

	    Each output is normalized and truncated to STAGE_TWO_OUTPUT_MAX_CHARS.
	    An empty or missing mapping yields "Not run.".
	    """
	    if not llm2_outputs:
	        return "Not run."

	    return "\n\n".join(
	        f"{snippet_id}:\n{truncate_sheet_block(snippet_output, max_chars=STAGE_TWO_OUTPUT_MAX_CHARS)}"
	        for snippet_id, snippet_output in llm2_outputs.items()
	    )


	def format_teacher_forced_note(run_result):
	    """Explain which Step 1 context LLM 2 and LLM 3 ran with.

	    Reads ``used_teacher_forced_step1`` and ``effective_step1_context``
	    from *run_result*.
	    """
	    was_teacher_forced = run_result["used_teacher_forced_step1"]
	    if not was_teacher_forced:
	        return f"LLM 2 and LLM 3 used the submitted Step 1 context: {run_result['effective_step1_context']}"
	    return (
	        "LLM 1 did not match. LLM 2 and LLM 3 were re-run with the teacher-forced "
	        f"Step 1 context: {run_result['effective_step1_context']}"
	    )


	def parse_cited_value(value):
	    """Split an answer value into its text and its ``[^n]`` citations.

	    Returns a dict with the whitespace-normalized raw text, the text with
	    well-formed citations removed, the cited numbers in order, and a flag
	    set when a ``[^...]`` token is not a plain number or a ``[^`` is never
	    closed.
	    """
	    raw_text = normalize_whitespace(value)
	    bracket_tokens = re.findall(r"\[\^[^\]]*\]", raw_text)

	    has_invalid_token = any(
	        CITATION_PATTERN.fullmatch(token) is None for token in bracket_tokens
	    )
	    has_unclosed_marker = "[^" in raw_text and not bracket_tokens

	    return {
	        "raw": raw_text,
	        "base_value": normalize_whitespace(CITATION_PATTERN.sub("", raw_text)),
	        "citations": [int(number) for number in CITATION_PATTERN.findall(raw_text)],
	        "malformed_citation": has_invalid_token or has_unclosed_marker,
	    }


	def snippet_supports_value(snippet_text, value):
	    """Return True when the normalized *value* is non-empty and occurs inside the normalized snippet."""
	    needle = normalize_counsel_value(value)
	    return bool(needle) and needle in normalize_counsel_value(snippet_text)


	def get_supporting_snippet_numbers(docs, value):
	    """Return the 1-based positions of the snippets in *docs* that contain *value*."""
	    return [
	        number
	        for number, doc in enumerate(docs, start=1)
	        if snippet_supports_value(doc, value)
	    ]


	def append_citations(value, snippet_numbers):
	    """Append ``[^n]`` markers for *snippet_numbers* to *value*.

	    The value is returned untouched when it is "unknown" or when there are
	    no snippet numbers.
	    """
	    if not snippet_numbers or value_is_unknown(value):
	        return value
	    markers = [f"[^{snippet_number}]" for snippet_number in snippet_numbers]
	    return f"{value} {' '.join(markers)}".strip()


	def dedupe_preserving_order(values):
	    """Return the whitespace-normalized *values* without duplicates, first occurrence first.

	    Two values count as duplicates when their normalized counsel keys are
	    equal; values whose key is empty are dropped.
	    """
	    seen_keys = set()
	    unique_values = []
	    for value in values:
	        key = normalize_counsel_value(value)
	        if key and key not in seen_keys:
	            seen_keys.add(key)
	            unique_values.append(normalize_whitespace(value))
	    return unique_values


	def find_contradicting_entity(expected, docs, target_firm):
	    """Find an entity other than the target firm that appears in the snippets.

	    Expected counsel values are tried first, in COUNSEL_FIELDS order; then
	    entity names extracted from the documents. Returns
	    ``(entity, snippet_numbers)`` for the first hit, or ``("", [])``.
	    """
	    target_norm = normalize_counsel_value(target_firm)

	    for field_name in COUNSEL_FIELDS:
	        candidate = normalize_whitespace(expected.get(field_name, "unknown"))
	        if expected_is_unknown(candidate) or normalize_counsel_value(candidate) == target_norm:
	            continue
	        snippet_numbers = get_supporting_snippet_numbers(docs, candidate)
	        if snippet_numbers:
	            return candidate, snippet_numbers

	    extracted_names = [name for doc in docs for name in extract_entity_candidates(doc)]
	    for candidate in dedupe_preserving_order(extracted_names):
	        if normalize_counsel_value(candidate) == target_norm:
	            continue
	        snippet_numbers = get_supporting_snippet_numbers(docs, candidate)
	        if snippet_numbers:
	            return candidate, snippet_numbers

	    return "", []


	def build_expected_user_question_spec(expected, docs, target_firm):
	    """Work out the expected verdict for the user's question.

	    "true" when the target firm appears in a snippet, "false" when only a
	    different entity can be found, otherwise "unknown". The result also
	    carries the company the verdict rests on and the supporting snippet
	    numbers.
	    """
	    target_hits = get_supporting_snippet_numbers(docs, target_firm)
	    if target_hits:
	        return {
	            "verdict": "true",
	            "company": normalize_whitespace(target_firm),
	            "citations": target_hits,
	        }

	    other_company, other_hits = find_contradicting_entity(expected, docs, target_firm)
	    if not (other_company and other_hits):
	        return {"verdict": "unknown", "company": "", "citations": []}

	    return {
	        "verdict": "false",
	        "company": other_company,
	        "citations": other_hits,
	    }


	def format_expected_user_question(spec):
	    """Render a user-question spec as answer text: "unknown", or the verdict followed by its citations."""
	    verdict = spec["verdict"]
	    if verdict == "unknown":
	        return "unknown"
	    return append_citations(verdict, spec.get("citations", []))


	def parse_user_question_answer(value):
	    """Parse the user-question answer into citation details plus a verdict.

	    The result extends parse_cited_value() with ``verdict``: "unknown",
	    "true" (for true/yes), "false" (for false/no), or None when the text
	    is none of those. The "unknown" result additionally carries an empty
	    ``evidence`` entry.
	    """
	    parsed = parse_cited_value(value)
	    base_value = parsed["base_value"]

	    if value_is_unknown(base_value):
	        return dict(parsed, verdict="unknown", evidence="")

	    match = BOOLEAN_ANSWER_PATTERN.match(base_value)
	    if match is None:
	        return dict(parsed, verdict=None)

	    is_affirmative = match.group(1).lower() in {"true", "yes"}
	    return dict(parsed, verdict="true" if is_affirmative else "false")


	def validate_citations_for_value(parsed_value, docs, expected_value):
	    """Check each cited snippet number against *docs*.

	    Returns ``(bad_references, unsupported_citations)``: numbers that do
	    not point at an existing snippet, and numbers whose snippet does not
	    contain *expected_value*.
	    """
	    snippet_total = len(docs)
	    out_of_range = []
	    unsupported = []
	    for number in parsed_value["citations"]:
	        if not 1 <= number <= snippet_total:
	            out_of_range.append(number)
	        elif not snippet_supports_value(docs[number - 1], expected_value):
	            unsupported.append(number)
	    return out_of_range, unsupported


	def build_expected_step1_output(expected):
	    """Return the reference Step 1 output: a tagged "irrelevant" message, or "TARGET_FIRM: <firm>"."""
	    if not isinstance(expected, str):
	        return f"TARGET_FIRM: {expected[TARGET_FIRM_FIELD]}"
	    return f"{IRRELEVANT_TAG_START}irrelevant{IRRELEVANT_TAG_END}"


	def build_expected_llm3_answer(expected, docs, target_firm):
	    """Build the reference final answer with citations attached.

	    A string *expected* (irrelevant case) is returned as is. Otherwise each
	    counsel value gets the numbers of the snippets that contain it, and the
	    user-question field gets the expected verdict text.
	    """
	    if isinstance(expected, str):
	        return expected

	    formatted = {}
	    for field_name in COUNSEL_FIELDS:
	        counsel_value = str(expected.get(field_name, "unknown"))
	        supporting = get_supporting_snippet_numbers(docs, counsel_value)
	        formatted[field_name] = append_citations(counsel_value, supporting)

	    question_spec = build_expected_user_question_spec(expected, docs, target_firm)
	    formatted[USER_QUESTION_FIELD] = format_expected_user_question(question_spec)
	    return formatted


	def format_prompts_cell(prompts):
	    """Render the three submitted prompts as one cell of text.

	    Each prompt is truncated to PROMPT_CELL_MAX_CHARS and shown under its
	    "LLM n" label; the joined result is truncated to SHEET_CELL_MAX_CHARS.
	    """
	    labelled_keys = (
	        ("LLM 1", "prompt_1"),
	        ("LLM 2", "prompt_2"),
	        ("LLM 3", "prompt_3"),
	    )
	    blocks = [
	        f"{label}\nPrompt:\n{truncate_sheet_block(prompts[prompt_key], max_chars=PROMPT_CELL_MAX_CHARS)}"
	        for label, prompt_key in labelled_keys
	    ]
	    return truncate_text(join_sheet_sections(blocks), max_chars=SHEET_CELL_MAX_CHARS)


	def parse_relevant_stage_one_output(text):
	    """Parse a Step 1 output of the exact form "TARGET_FIRM: <name>".

	    Returns ``{"target_firm": name}``, or None when the normalized text
	    does not match that single-line form or the name is blank.
	    """
	    normalized = normalize_sheet_block_text(text)
	    match = re.fullmatch(r"TARGET_FIRM:\s*(.+)", normalized)
	    target_firm = match.group(1).strip() if match else ""
	    if not target_firm:
	        return None
	    return {"target_firm": target_firm}


	def is_error_output(text, stage_name):
	    """Return True when *text* starts with the "Error during <stage> call:" marker."""
	    error_prefix = f"Error during {stage_name} call:"
	    return str(text).startswith(error_prefix)


	def run_chat_completion(system_prompt, user_prompt):
	    """Send one system + user message pair to the chat model and return the stripped reply text.

	    The call is made while holding the shared request semaphore, which is
	    released again whether or not the call succeeds.
	    """
	    conversation = [
	        {"role": "system", "content": system_prompt},
	        {"role": "user", "content": user_prompt},
	    ]
	    with get_openai_request_semaphore():
	        response = get_openai_client().chat.completions.create(
	            model=get_model_name(),
	            messages=conversation,
	        )
	        return response.choices[0].message.content.strip()


	def run_step_one(prompts, question):
	    """Run LLM 1 on *question* and classify its output.

	    A failed call is captured as "Error during LLM1 call: ..." text rather
	    than raised. The case counts as irrelevant when the output contains a
	    user-message tag pair.
	    """
	    try:
	        llm1_output = run_chat_completion(prompts["prompt_1"], question)
	    except Exception as exc:
	        llm1_output = f"Error during LLM1 call: {exc}"

	    user_message = extract_user_message(llm1_output)
	    return dict(
	        llm1_output=llm1_output,
	        is_irrelevant=user_message is not None,
	        user_message=user_message,
	    )


	def build_llm2_user_prompt(step1_context, snippet_id, doc):
	    """Compose the LLM 2 user message: target-firm context, then the snippet id and text."""
	    sections = [
	        f"Target firm context:\n{step1_context}",
	        "",
	        f"Snippet ID: {snippet_id}",
	        f"Snippet Text:\n{doc}",
	    ]
	    return "\n".join(sections)


	def run_llm2_snippet(prompts, step1_context, snippet_id, doc):
	    """Run LLM 2 on one snippet and return ``(snippet_id, output)``.

	    A failure is captured as "Error during LLM2 call: ..." text instead of
	    being raised.
	    """
	    try:
	        user_prompt = build_llm2_user_prompt(step1_context, snippet_id, doc)
	        output = run_chat_completion(prompts["prompt_2"], user_prompt)
	    except Exception as exc:
	        output = f"Error during LLM2 call: {exc}"
	    return snippet_id, output


	def run_downstream_stages(prompts, docs, step1_context):
	"""Run LLM 2 on every snippet, then LLM 3 on the collected analyses.

	Returns a dict with the snippet count, the per-snippet LLM 2 outputs
	keyed "S1".."Sn", and the LLM 3 output text. A failed LLM 3 call is
	recorded as "Error during LLM3 call: ..." text instead of being raised.

	NOTE(review): indentation is lost in this copy of the file, so it is not
	visible whether the LLM 3 call sits inside ``if docs:`` or runs even
	when there are no snippets - confirm against the original source.
	"""
	# Keys are created up front in snippet order, so the dict keeps that order
	# even when the threaded results arrive out of order.
	llm2_outputs = {f"S{index}": "" for index in range(1, len(docs) + 1)}
	llm3_output = ""

	if docs:
	max_workers = min(get_llm2_max_workers(), len(docs))
	if max_workers == 1:
	# One worker: call LLM 2 in a plain loop, without a thread pool.
	for index, doc in enumerate(docs, start=1):
	snippet_id = f"S{index}"
	_, snippet_output = run_llm2_snippet(prompts, step1_context, snippet_id, doc)
	llm2_outputs[snippet_id] = snippet_output
	else:
	with ThreadPoolExecutor(max_workers=max_workers) as executor:
	futures = [
	executor.submit(
	run_llm2_snippet,
	prompts,
	step1_context,
	f"S{index}",
	doc,
	)
	for index, doc in enumerate(docs, start=1)
	]
	# run_llm2_snippet returns its snippet id, so each result can be
	# stored under the right key regardless of completion order.
	for future in as_completed(futures):
	snippet_id, snippet_output = future.result()
	llm2_outputs[snippet_id] = snippet_output

	# One "<id>\n<analysis>" entry per snippet for the LLM 3 prompt.
	serialized_llm2 = []
	for snippet_id, snippet_output in llm2_outputs.items():
	serialized_llm2.append(f"{snippet_id}\n{snippet_output}")

	try:
	llm3_output = run_chat_completion(
	prompts["prompt_3"],
	(
	f"Target firm context:\n{step1_context}\n\n"
	"Per-snippet analyses:\n"
	# chr(10) is the newline character; the entries are joined one per line.
	f"{chr(10).join(serialized_llm2)}"
	),
	)
	except Exception as exc:
	llm3_output = f"Error during LLM3 call: {exc}"

	return {
	"snippet_count": len(docs),
	"llm2_outputs": llm2_outputs,
	"llm3_output": llm3_output,
	}


	def parse_final_answer(answer):
	    """Decode the final answer as a JSON object.

	    Returns ``(parsed_dict, None)`` on success, or ``(None, message)`` when
	    the text is not valid JSON or does not decode to an object.
	    """
	    try:
	        decoded = json.loads(answer)
	    except json.JSONDecodeError as exc:
	        return None, f"Invalid JSON: {exc}"

	    if isinstance(decoded, dict):
	        return decoded, None
	    return None, "Final answer must be a JSON object."


	def build_field_result(correct, issues=None, penalty=None):
	    """Build the grading record for one field.

	    When *penalty* is not given it is 0.0 for a correct field and 0.2
	    otherwise. The stored penalty is rounded to four decimals and a missing
	    issue list becomes an empty list.
	    """
	    if penalty is None:
	        penalty = 0.2 if not correct else 0.0
	    return dict(
	        correct=correct,
	        issues=issues or [],
	        penalty=round(float(penalty), 4),
	    )


	def build_prefixed_failure_tags(field_name, issues):
	    """Return one "<issue>_<field_name>" tag for every issue, in order."""
	    tags = []
	    for issue in issues:
	        tags.append(f"{issue}_{field_name}")
	    return tags


	def grade_counsel_field(field_name, parsed_answer, expected_answer, docs):
	    """Grade one counsel field of the final answer.

	    Returns a build_field_result() record. A field is correct only when no
	    issue is recorded. Possible issues: missing_field, malformed_citation,
	    unexpected_citation, used_unknown, hallucinated_value, wrong_value,
	    missing_citation, bad_citation_reference, unsupported_citation.
	    """
	    if field_name not in parsed_answer:
	        return build_field_result(False, ["missing_field"])

	    parsed_value = parse_cited_value(parsed_answer.get(field_name))
	    expected_value = str(expected_answer.get(field_name, "unknown"))
	    expects_unknown = expected_is_unknown(expected_value)
	    answered_unknown = value_is_unknown(parsed_value["base_value"])

	    issues = ["malformed_citation"] if parsed_value["malformed_citation"] else []

	    if answered_unknown or expects_unknown:
	        if answered_unknown and expects_unknown:
	            # "unknown" was right; it just must not come with citations.
	            if parsed_value["citations"]:
	                issues.append("unexpected_citation")
	            return build_field_result(not issues, issues)
	        issues.append("used_unknown" if answered_unknown else "hallucinated_value")
	        return build_field_result(False, issues)

	    answered_key = normalize_counsel_value(parsed_value["base_value"])
	    if answered_key != normalize_counsel_value(expected_value):
	        issues.append("wrong_value")

	    if not parsed_value["citations"]:
	        issues.append("missing_citation")
	    else:
	        # Citations are checked against the value the answer actually gave.
	        bad_references, unsupported_citations = validate_citations_for_value(
	            parsed_value,
	            docs,
	            parsed_value["base_value"],
	        )
	        if bad_references:
	            issues.append("bad_citation_reference")
	        if unsupported_citations:
	            issues.append("unsupported_citation")

	    return build_field_result(not issues, issues)


	def grade_user_question_field(parsed_answer, expected_user_question, docs, target_firm):
	"""Grade the user-question field of the final answer.

	Returns a build_field_result() record. The penalty is accumulated in
	steps of 0.1 and capped at 0.2; a missing field or an "unknown" answer
	where a verdict was expected costs the full 0.2.

	NOTE(review): indentation is lost in this copy of the file; the nesting
	described in the comments below is inferred from which names are
	defined where - confirm against the original source.
	"""
	if USER_QUESTION_FIELD not in parsed_answer:
	return build_field_result(False, ["missing_field"], penalty=0.2)

	parsed_value = parse_user_question_answer(parsed_answer.get(USER_QUESTION_FIELD))
	issues = []
	penalty = 0.0
	# A malformed citation is recorded as an issue here; it only adds to the
	# penalty further down, in the branch that validates citations.
	if parsed_value["malformed_citation"]:
	issues.append("malformed_citation")

	expected_verdict = expected_user_question["verdict"]
	expected_company = normalize_whitespace(expected_user_question.get("company", ""))

	# Expected answer is "unknown": any other verdict and any citation each cost 0.1.
	if expected_verdict == "unknown":
	if parsed_value["verdict"] != "unknown":
	issues.append("wrong_value")
	penalty += 0.1
	if parsed_value["citations"]:
	issues.append("unexpected_citation")
	penalty += 0.1
	return build_field_result(not issues, issues, penalty=min(0.2, penalty))

	# A verdict was expected but the answer said "unknown": full penalty.
	if parsed_value["verdict"] == "unknown":
	issues.append("used_unknown")
	return build_field_result(False, issues, penalty=0.2)

	if parsed_value["verdict"] != expected_verdict:
	issues.append("wrong_value")
	penalty += 0.1

	if not parsed_value["citations"]:
	issues.append("missing_citation")
	penalty += 0.1
	else:
	# Citations must point at snippets containing the company the expected
	# verdict rests on, falling back to the target firm when that is empty.
	citation_target = expected_company or target_firm
	bad_references, unsupported_citations = validate_citations_for_value(
	parsed_value,
	docs,
	citation_target,
	)
	if bad_references:
	issues.append("bad_citation_reference")
	if unsupported_citations:
	issues.append("unsupported_citation")
	# All citation problems together cost a single 0.1.
	# NOTE(review): presumably still inside the else branch, since
	# bad_references is only defined there.
	if parsed_value["malformed_citation"] or bad_references or unsupported_citations:
	penalty += 0.1

	return build_field_result(not issues, issues, penalty=min(0.2, penalty))


	def grade_case(case_run, expected):
	    """Grade one case run against its expected answer.

	    Irrelevant cases (string *expected*) score 1.0 when the run was routed
	    as irrelevant and 0.0 otherwise. Relevant cases start from 1.0 and lose
	    each field's penalty, floored at 0.0; an unparseable final answer
	    scores 0.0 with every field marked "invalid_json".
	    """
	    case_result = {"case_score": 0.0, "failure_tags": [], "field_results": {}}

	    if isinstance(expected, str):
	        if case_run["is_irrelevant"]:
	            case_result["case_score"] = 1.0
	        else:
	            case_result["failure_tags"] = ["relevance_false_positive"]
	        case_result["passed"] = case_result["case_score"] == 1.0
	        return case_result

	    parsed_answer, parse_error = parse_final_answer(case_run["llm3_output"])
	    if parse_error:
	        case_result.update(
	            failure_tags=["invalid_json"],
	            field_results={
	                field_name: build_field_result(False, ["invalid_json"])
	                for field_name in FINAL_SCHEMA_KEYS
	            },
	            passed=False,
	        )
	        return case_result

	    docs = case_run["docs"]
	    field_results = {
	        field_name: grade_counsel_field(field_name, parsed_answer, expected, docs)
	        for field_name in COUNSEL_FIELDS
	    }
	    target_firm = case_run["effective_target_firm"]
	    field_results[USER_QUESTION_FIELD] = grade_user_question_field(
	        parsed_answer,
	        build_expected_user_question_spec(expected, docs, target_firm),
	        docs,
	        target_firm,
	    )

	    total_penalty = 0.0
	    failure_tags = []
	    for field_name, field_result in field_results.items():
	        total_penalty += field_result["penalty"]
	        failure_tags.extend(build_prefixed_failure_tags(field_name, field_result["issues"]))

	    case_result["case_score"] = round(max(0.0, 1.0 - total_penalty), 4)
	    case_result["failure_tags"] = sorted(set(failure_tags))
	    case_result["field_results"] = field_results
	    case_result["passed"] = total_penalty == 0.0
	    return case_result


	def grade_llm1_stage(llm1_output, expected):
	    """Grade the LLM 1 output for routing and target-firm extraction.

	    Returns a dict with ``score`` (0.0, 0.5 or 1.0), a ``reasoning`` label,
	    ``failure_tags`` (empty on a match, otherwise ["llm1_<reasoning>"]) and
	    the parsed output when the output had the required format.
	    """

	    def outcome(score, reasoning, parsed=None):
	        # Every reasoning label except "matched" doubles as a failure tag.
	        tags = [] if reasoning == "matched" else [f"llm1_{reasoning}"]
	        return {
	            "score": score,
	            "reasoning": reasoning,
	            "failure_tags": tags,
	            "parsed_output": parsed,
	        }

	    routed_irrelevant = extract_user_message(llm1_output) is not None
	    parsed_output = parse_relevant_stage_one_output(llm1_output)

	    if isinstance(expected, str):
	        if routed_irrelevant:
	            return outcome(1.0, "matched")
	        if is_error_output(llm1_output, "LLM1"):
	            return outcome(0.0, "call_failed")
	        return outcome(0.0, "wrong_route")

	    if routed_irrelevant:
	        return outcome(0.0, "wrong_route")

	    if parsed_output is None:
	        if is_error_output(llm1_output, "LLM1"):
	            return outcome(0.0, "call_failed")
	        expected_target = normalize_counsel_value(expected[TARGET_FIRM_FIELD])
	        # Half credit when the right firm is named but the format is wrong.
	        if expected_target and expected_target in normalize_counsel_value(llm1_output):
	            return outcome(0.5, "right_answer_wrong_format")
	        return outcome(0.0, "wrong_answer_wrong_format")

	    parsed_target = normalize_counsel_value(parsed_output["target_firm"])
	    if parsed_target != normalize_counsel_value(expected[TARGET_FIRM_FIELD]):
	        return outcome(0.5, "wrong_answer_right_format", parsed_output)

	    return outcome(1.0, "matched", parsed_output)


	def summarize_llm2_stage(case_run, expected):
	    """Describe how many Step 2 snippet calls produced a usable output.

	    Returns a fixed sentence when the case is expected to be irrelevant
	    (``expected`` is a string) or when the run has no snippets; otherwise a
	    "completed/total" summary followed by the IDs of snippets whose output
	    is missing, empty, or flagged by ``is_error_output``.
	    """
	    if isinstance(expected, str):
	        return "Not run. Irrelevant case."

	    total_snippets = case_run["snippet_count"]
	    if total_snippets == 0:
	        return "No snippets were provided."

	    # Snippet IDs are S1..Sn; record a success flag for each in order.
	    succeeded = {}
	    for position in range(total_snippets):
	        snippet_id = f"S{position + 1}"
	        text = case_run["llm2_outputs"].get(snippet_id, "")
	        succeeded[snippet_id] = bool(text) and not is_error_output(text, "LLM2")

	    failed_ids = [snippet_id for snippet_id, ok in succeeded.items() if not ok]
	    completed = len(succeeded) - len(failed_ids)

	    pieces = [f"{completed}/{total_snippets} snippet calls completed successfully."]
	    if failed_ids:
	        pieces.append(f"Errored snippets: {', '.join(failed_ids)}.")
	    return " ".join(pieces)


	def grade_llm3_stage(case_run, expected, semantic_result):
	    """Turn the semantic grading result into a Step 3 score and reasoning label.

	    The reasoning is the comma-joined failure tags when there are any,
	    ``"skipped_irrelevant"`` when an expected-irrelevant case was routed as
	    irrelevant, and ``"clean"`` otherwise. The score is passed through from
	    ``semantic_result["case_score"]``.
	    """
	    tags = semantic_result["failure_tags"]
	    if tags:
	        label = ", ".join(tags)
	    else:
	        skipped = isinstance(expected, str) and case_run["is_irrelevant"]
	        label = "skipped_irrelevant" if skipped else "clean"

	    return {"score": semantic_result["case_score"], "reasoning": label}


	def summarize_field_accuracy(case_results):
	    """Aggregate per-field correctness over a sequence of graded results.

	    Each item must expose ``field_results``: a mapping of field name to a
	    dict with a ``correct`` flag. Returns, for every key in
	    ``FINAL_SCHEMA_KEYS``, the ``correct`` count, the ``total`` count and
	    the ``accuracy`` percentage rounded to two decimals (0.0 when the
	    field was never graded).
	    """
	    graded = {field_name: 0 for field_name in FINAL_SCHEMA_KEYS}
	    hits = {field_name: 0 for field_name in FINAL_SCHEMA_KEYS}

	    for item in case_results:
	        for field_name, outcome in item["field_results"].items():
	            graded[field_name] += 1
	            if outcome["correct"]:
	                hits[field_name] += 1

	    report = {}
	    for field_name, total in graded.items():
	        correct = hits[field_name]
	        if total:
	            accuracy = round((correct / total) * 100, 2)
	        else:
	            accuracy = 0.0
	        report[field_name] = {
	            "correct": correct,
	            "total": total,
	            "accuracy": accuracy,
	        }
	    return report


	def average_run_score(runs, key):
	    """Return the mean of ``run[key]`` over ``runs``, rounded to 4 decimals.

	    An empty ``runs`` yields 0.0.
	    """
	    if runs:
	        combined = sum(entry[key] for entry in runs)
	        return round(combined / len(runs), 4)
	    return 0.0


	def total_run_score(runs, key):
	    """Return the sum of ``run[key]`` over ``runs``, rounded to 4 decimals.

	    An empty ``runs`` yields 0.0.
	    """
	    if runs:
	        return round(sum(entry[key] for entry in runs), 4)
	    return 0.0


	def summarize_reasoning_counts(runs, key):
	    """Summarize how often each ``run[key]`` label occurs across ``runs``.

	    Returns a string such as ``"matched x2; wrong_route x1"`` with labels
	    listed in order of first appearance, or an empty string when ``runs``
	    is empty.
	    """
	    # Counter is a dict subclass and keeps first-insertion order, so a
	    # separate (quadratic) pass to recover the label order is unnecessary.
	    counts = Counter(run[key] for run in runs)
	    return "; ".join(f"{label} x{count}" for label, count in counts.items())


	def explain_reasoning_label(label):
	    """Translate a reasoning label into a plain-English explanation.

	    A label containing ``", "`` is treated as a comma-separated list and
	    each non-empty part is explained separately, joined with ``"; "``.
	    Unknown labels fall back to the label with underscores replaced by
	    spaces.
	    """
	    explanations = {
	        "matched": "The model followed the required instruction and produced the expected type of output.",
	        "wrong_format": "The model answered the right task but did not use the exact output format the evaluator requires.",
	        "right_answer_wrong_format": "The model identified the right answer but did not use the exact required Step 1 format.",
	        "wrong_answer_right_format": "The model used the required Step 1 format but identified the wrong target firm.",
	        "wrong_answer_wrong_format": "The model gave neither the right answer nor the required Step 1 format.",
	        "wrong_route": "The model chose the wrong kind of response for this question, such as treating an irrelevant query as relevant or the reverse.",
	        "call_failed": "The model call failed, so no valid answer was produced for this step.",
	        "invalid_json": "The Step 3 answer was not valid JSON in the required schema.",
	        "relevance_false_positive": "The system treated a question as relevant even though it should have been rejected as irrelevant.",
	        "skipped_irrelevant": "This step was skipped because the question was correctly identified as irrelevant.",
	        "clean": "The answer passed the evaluator checks for this step.",
	    }

	    def describe(token):
	        return explanations.get(token, token.replace("_", " "))

	    if ", " not in label:
	        return describe(label)

	    tokens = (chunk.strip() for chunk in label.split(","))
	    return "; ".join(describe(token) for token in tokens if token)


	def summarize_run_groups(runs, key):
	    """Collapse consecutive runs sharing the same ``run[key]`` into one line each.

	    Runs are numbered from 1. Returns one ``"RUN <indices>: <value>"`` string
	    per group of adjacent runs with an equal value.
	    """
	    groups = []  # (value, [run numbers]) in order of appearance
	    for position, entry in enumerate(runs, start=1):
	        current = entry[key]
	        if groups and groups[-1][0] == current:
	            groups[-1][1].append(position)
	        else:
	            groups.append((current, [position]))

	    lines = []
	    for value, positions in groups:
	        lines.append(f"RUN {format_run_indices(positions)}: {value}")
	    return lines


	def build_explained_run_groups(runs, key):
	    """Build a three-line explained block for each group of consecutive equal labels.

	    Runs are numbered from 1. Each block lists the run indices, the raw
	    label taken from ``run[key]``, and its explanation from
	    ``explain_reasoning_label``.
	    """
	    groups = []  # (label, [run numbers]) in order of appearance
	    for position, entry in enumerate(runs, start=1):
	        current = entry[key]
	        if groups and groups[-1][0] == current:
	            groups[-1][1].append(position)
	        else:
	            groups.append((current, [position]))

	    rendered = []
	    for label, positions in groups:
	        rendered.append(
	            "\n".join(
	                (
	                    f"RUN {format_run_indices(positions)}",
	                    f"Label: {label}",
	                    f"Meaning: {explain_reasoning_label(label)}",
	                )
	            )
	        )
	    return rendered


	def build_score_breakdown_cell(case_results):
	    """Render the per-case LLM 1 / LLM 3 score breakdown for the sheet cell.

	    For every case (numbered from 1) the block lists the run indices and,
	    for each scored stage, the summed score out of the run count plus the
	    explained reasoning groups. The joined text is truncated to
	    ``SHEET_CELL_MAX_CHARS``.
	    """
	    stages = (
	        ("LLM 1", "llm1_score", "llm1_reasoning"),
	        ("LLM 3", "llm3_score", "llm3_reasoning"),
	    )

	    sections = []
	    for case_number, case in enumerate(case_results, start=1):
	        run_numbers = list(range(1, len(case["runs"]) + 1))
	        lines = [
	            f"CASE {case_number}",
	            f"RUNS: {format_run_indices(run_numbers)}",
	        ]
	        for title, score_key, reasoning_key in stages:
	            stage_total = total_run_score(case["runs"], score_key)
	            lines.append("")
	            lines.append(title)
	            lines.append(f"Score: {format_score(stage_total)} / {len(run_numbers)}")
	            lines.append("Reasoning:")
	            lines.extend(build_explained_run_groups(case["runs"], reasoning_key))
	        sections.append("\n".join(lines))

	    combined = join_sheet_sections(sections)
	    return truncate_text(combined, max_chars=SHEET_CELL_MAX_CHARS)


	def build_llm2_output_cell(case_results):
	    """Render every run's Step 2 summary and outputs for the sheet cell.

	    One section is produced per (case, run) pair, both numbered from 1.
	    Cases whose expected answer is a string get a fixed "skipped" note in
	    place of the teacher-forcing note. The joined text is truncated to
	    ``SHEET_CELL_MAX_CHARS``.
	    """
	    skipped_note = "Downstream stages were skipped because this was graded as irrelevant."

	    sections = []
	    for case_number, case in enumerate(case_results, start=1):
	        for run_number, run in enumerate(case["runs"], start=1):
	            header = build_case_run_header(case_number, run_number)
	            if isinstance(case["expected"], str):
	                note = skipped_note
	            else:
	                note = format_teacher_forced_note(run)
	            lines = (
	                header,
	                note,
	                "",
	                f"LLM 2 Summary: {run['llm2_reasoning']}",
	                "",
	                "LLM 2 Output:",
	                format_llm2_user_output(run["llm2_outputs"]),
	            )
	            sections.append("\n".join(lines))

	    combined = join_sheet_sections(sections)
	    return truncate_text(combined, max_chars=SHEET_CELL_MAX_CHARS)


	def build_output_vs_expected_cell(case_results):
	    """Render LLM 1 / LLM 3 outputs next to their expected values for the sheet cell.

	    Within a case, runs whose normalized block text is identical are merged
	    under a single grouped header, in order of first appearance. The joined
	    text is truncated to ``SHEET_CELL_MAX_CHARS``.
	    """
	    skipped_note = "Downstream stages were skipped because this case was irrelevant."

	    sections = []
	    for case_number, case in enumerate(case_results, start=1):
	        llm1_expected = format_llm1_expected_text(case["expected"])

	        # Normalized body text -> run numbers that produced it
	        # (dict insertion order keeps first-seen ordering).
	        runs_by_body = {}
	        for run_number, run in enumerate(case["runs"], start=1):
	            llm3_output = run["llm3_output"] or "Not run."
	            llm1_shown = truncate_sheet_block(
	                run["llm1_output"],
	                max_chars=STAGE_ONE_OUTPUT_MAX_CHARS,
	            )
	            if isinstance(case["expected"], str):
	                stage_note = skipped_note
	            else:
	                stage_note = format_teacher_forced_note(run)
	            llm3_shown = truncate_sheet_block(
	                llm3_output,
	                max_chars=STAGE_THREE_OUTPUT_MAX_CHARS,
	            )
	            body_lines = (
	                "LLM 1",
	                f"Status: {run['llm1_reasoning']}",
	                "User Output:",
	                llm1_shown,
	                "",
	                "Expected:",
	                llm1_expected,
	                "",
	                stage_note,
	                "",
	                "LLM 3",
	                f"Reasoning: {run['llm3_reasoning']}",
	                "User Output:",
	                llm3_shown,
	                "",
	                "Expected:",
	                format_pretty_expected_answer(run["llm3_expected"]),
	            )
	            body = normalize_sheet_block_text("\n".join(body_lines))
	            runs_by_body.setdefault(body, []).append(run_number)

	        for body, run_numbers in runs_by_body.items():
	            header = build_grouped_case_run_header(case_number, run_numbers)
	            sections.append("\n".join((header, body)))

	    combined = join_sheet_sections(sections)
	    return truncate_text(combined, max_chars=SHEET_CELL_MAX_CHARS)


	def estimate_case_llm_call_count(case):
	    """Estimate the number of model calls needed to evaluate one case.

	    Expected-irrelevant cases (string ``expected``) count one call per
	    repeat; other cases count one call per document plus two per repeat.
	    """
	    if isinstance(case["expected"], str):
	        calls_per_repeat = 1
	    else:
	        calls_per_repeat = len(case["docs"]) + 2
	    return EVAL_REPEAT_COUNT * calls_per_repeat


	def estimate_submission_llm_call_count(cases):
	    """Return the summed per-case call estimate for all ``cases``."""
	    total_calls = 0
	    for case in cases:
	        total_calls += estimate_case_llm_call_count(case)
	    return total_calls


	def evaluate_case(case, prompts, case_index=None):
	"""Run one case EVAL_REPEAT_COUNT times and aggregate the per-run grades.

	Args:
	case: mapping read for its "question", "docs" and "expected" keys.
	"expected" is a string for cases that should be routed as irrelevant.
	prompts: candidate prompts, passed through to the stage runners.
	case_index: optional case number, used only in the completion log line.

	Returns:
	A dict with "expected", the list of per-run dicts under "runs", the
	averaged LLM 1 and LLM 3 scores, their sum ("average_total_points",
	also stored as "case_score"), and "passed", which is true only when
	that sum equals 2.0 (within 1e-9).
	"""
	started_at = time.perf_counter()
	runs = []
	for _ in range(EVAL_REPEAT_COUNT):
	step_one_result = run_step_one(prompts, case["question"])
	llm1_result = grade_llm1_stage(step_one_result["llm1_output"], case["expected"])
	expected_relevant = not isinstance(case["expected"], str)
	used_teacher_forced_step1 = False
	effective_step1_context = step_one_result["llm1_output"]

	if expected_relevant:
	# Teacher forcing: when Step 1 did not earn full credit, downstream
	# stages are fed the expected Step 1 output instead of the model's own.
	if llm1_result["score"] < 1.0:
	effective_step1_context = build_expected_step1_output(case["expected"])
	used_teacher_forced_step1 = True
	downstream_result = run_downstream_stages(
	prompts,
	case["docs"],
	effective_step1_context,
	)
	else:
	# Expected-irrelevant cases never run Steps 2 and 3; use empty outputs.
	downstream_result = {
	"snippet_count": len(case["docs"]),
	"llm2_outputs": {},
	"llm3_output": "",
	}

	effective_target_firm = ""
	if expected_relevant:
	# The context here is either a full-score Step 1 output or the
	# teacher-forced expected output.
	effective_target_firm = parse_relevant_stage_one_output(effective_step1_context)["target_firm"]

	# Untruncated view of the run, used only for grading below.
	case_run = {
	"question": case["question"],
	"docs": case["docs"],
	"snippet_count": downstream_result["snippet_count"],
	"llm1_output": step_one_result["llm1_output"],
	"llm2_outputs": downstream_result["llm2_outputs"],
	"llm3_output": downstream_result["llm3_output"],
	"is_irrelevant": step_one_result["is_irrelevant"],
	"user_message": step_one_result["user_message"],
	"effective_step1_context": effective_step1_context,
	"effective_target_firm": effective_target_firm,
	"used_teacher_forced_step1": used_teacher_forced_step1,
	}

	semantic_result = grade_case(case_run, case["expected"])
	llm3_result = grade_llm3_stage(case_run, case["expected"], semantic_result)
	llm2_reasoning = summarize_llm2_stage(case_run, case["expected"])
	llm3_expected = build_expected_llm3_answer(
	case["expected"],
	case["docs"],
	effective_target_firm,
	)
	# De-duplicated, sorted union of Step 1 and semantic failure tags.
	failure_tags = sorted(set(llm1_result["failure_tags"] + semantic_result["failure_tags"]))

	# Stored record of the run; model outputs are truncated to the per-stage limits.
	runs.append(
	{
	"llm1_output": truncate_sheet_block(
	step_one_result["llm1_output"],
	max_chars=STAGE_ONE_OUTPUT_MAX_CHARS,
	),
	"llm2_outputs": {
	snippet_id: truncate_sheet_block(
	snippet_output,
	max_chars=STAGE_TWO_OUTPUT_MAX_CHARS,
	)
	for snippet_id, snippet_output in case_run["llm2_outputs"].items()
	},
	"llm3_output": truncate_sheet_block(
	case_run["llm3_output"],
	max_chars=STAGE_THREE_OUTPUT_MAX_CHARS,
	),
	"is_irrelevant": step_one_result["is_irrelevant"],
	"failure_tags": failure_tags,
	"field_results": semantic_result["field_results"],
	"passed": llm1_result["score"] == 1.0 and llm3_result["score"] == 1.0,
	"llm1_score": llm1_result["score"],
	"llm1_reasoning": llm1_result["reasoning"],
	"llm2_reasoning": llm2_reasoning,
	"llm3_score": llm3_result["score"],
	"llm3_reasoning": llm3_result["reasoning"],
	"used_teacher_forced_step1": used_teacher_forced_step1,
	"effective_step1_context": truncate_sheet_block(
	effective_step1_context,
	max_chars=STAGE_ONE_OUTPUT_MAX_CHARS,
	),
	"effective_target_firm": effective_target_firm,
	"llm3_expected": llm3_expected,
	"total_points": round(llm1_result["score"] + llm3_result["score"], 4),
	"total_points_without_llm1": round(llm3_result["score"], 4),
	}
	)

	average_llm1_score = average_run_score(runs, "llm1_score")
	average_llm3_score = average_run_score(runs, "llm3_score")
	average_total_points = round(average_llm1_score + average_llm3_score, 4)

	result = {
	"expected": case["expected"],
	"runs": runs,
	"average_llm1_score": average_llm1_score,
	"average_llm3_score": average_llm3_score,
	"average_total_points": average_total_points,
	"case_score": average_total_points,
	# Each stage contributes at most 1 point, so 2.0 means every run was perfect.
	"passed": abs(average_total_points - 2.0) < 1e-9,
	}

	elapsed_seconds = time.perf_counter() - started_at
	logger.info(
	"Completed case %s: relevant=%s snippets=%s repeat_count=%s "
	"expected_llm_calls=%s elapsed_seconds=%.2f",
	case_index if case_index is not None else "unknown",
	not isinstance(case["expected"], str),
	len(case["docs"]),
	EVAL_REPEAT_COUNT,
	estimate_case_llm_call_count(case),
	elapsed_seconds,
	)
	return result


	def grade_submission(eval_set, prompts, mode):
	    """Evaluate every case in ``eval_set`` and aggregate a submission result.

	    Cases are evaluated through ``safe_evaluate_case``, either sequentially
	    (one worker) or on a thread pool; results keep the order of
	    ``eval_set["cases"]`` regardless of completion order.

	    Returns a dict with the submission type, percentage score (each case is
	    worth up to 2 points), passed/total case counts, a sorted failure-tag
	    histogram, per-field accuracy over all runs, the case results, and the
	    point totals with and without LLM 1.
	    """
	    cases = eval_set["cases"]

	    if not cases:
	        return {
	            "submission_type": mode,
	            "score_percent": 0.0,
	            "passed_cases": 0,
	            "total_cases": 0,
	            "failure_summary": {},
	            "field_accuracy": summarize_field_accuracy([]),
	            "case_results": [],
	            "results": 0.0,
	            "results_without_llm_1": 0.0,
	        }

	    case_count = len(cases)
	    max_workers = min(get_eval_max_workers(), case_count)
	    started_at = time.perf_counter()
	    expected_llm_calls = estimate_submission_llm_call_count(cases)
	    snippet_counts = [len(case["docs"]) for case in cases]
	    logger.info(
	        "Starting %s grading: cases=%s repeat_count=%s snippet_counts=%s "
	        "expected_llm_calls=%s case_workers=%s llm2_workers=%s openai_global_limit=%s",
	        mode,
	        case_count,
	        EVAL_REPEAT_COUNT,
	        snippet_counts,
	        expected_llm_calls,
	        max_workers,
	        get_llm2_max_workers(),
	        get_openai_max_concurrent_requests(),
	    )

	    if max_workers == 1:
	        case_results = [
	            safe_evaluate_case(case, prompts, case_number)
	            for case_number, case in enumerate(cases, start=1)
	        ]
	    else:
	        # Futures finish in arbitrary order; write each result back into
	        # the slot of the case it belongs to.
	        case_results = [None] * case_count
	        with ThreadPoolExecutor(max_workers=max_workers) as pool:
	            slot_by_future = {}
	            for slot, case in enumerate(cases):
	                future = pool.submit(safe_evaluate_case, case, prompts, slot + 1)
	                slot_by_future[future] = slot
	            for finished in as_completed(slot_by_future):
	                case_results[slot_by_future[finished]] = finished.result()

	    total_cases = len(case_results)
	    total_results = round(
	        sum(entry["average_total_points"] for entry in case_results),
	        4,
	    )
	    total_results_without_llm_1 = round(
	        sum(entry["average_llm3_score"] for entry in case_results),
	        4,
	    )
	    run_results = []
	    for entry in case_results:
	        run_results.extend(entry["runs"])

	    passed_cases = sum(1 for entry in case_results if entry["passed"])
	    max_points = total_cases * 2.0
	    if max_points:
	        score_percent = round((total_results / max_points) * 100, 2)
	    else:
	        score_percent = 0.0

	    tag_counts = Counter(
	        tag for run in run_results for tag in run["failure_tags"]
	    )
	    failure_summary = {tag: tag_counts[tag] for tag in sorted(tag_counts)}
	    field_accuracy = summarize_field_accuracy(run_results)

	    elapsed_seconds = time.perf_counter() - started_at
	    logger.info(
	        "Completed %s grading: cases=%s repeat_count=%s expected_llm_calls=%s "
	        "elapsed_seconds=%.2f",
	        mode,
	        total_cases,
	        EVAL_REPEAT_COUNT,
	        expected_llm_calls,
	        elapsed_seconds,
	    )

	    return {
	        "submission_type": mode,
	        "score_percent": score_percent,
	        "passed_cases": passed_cases,
	        "total_cases": total_cases,
	        "failure_summary": failure_summary,
	        "field_accuracy": field_accuracy,
	        "case_results": case_results,
	        "results": total_results,
	        "results_without_llm_1": total_results_without_llm_1,
	    }


	def get_attempt_status(email):
	    """Report whether the practice and final attempts are used for ``email``.

	    Combines the in-process ``LOCAL_ATTEMPTS`` cache with the rows read from
	    columns B:D of the Google Sheet (the first row is skipped). A row whose
	    first cell is a known submission type is read as (type, _, email); any
	    other non-empty row is treated as a final submission with the email in
	    its first cell.
	    """
	    normalized_email = normalize_email(email)
	    cached = LOCAL_ATTEMPTS.get(normalized_email, {})
	    status = {
	        "practice_used": bool(cached.get("practice_used")),
	        "final_used": bool(cached.get("final_used")),
	    }

	    sheet = get_google_sheet(ensure_headers=False)
	    rows = sheet.get("B:D")

	    for row in rows[1:]:
	        if len(row) >= 3 and row[0] in SUBMISSION_TYPES:
	            row_type, raw_email = row[0], row[2]
	        elif row:
	            row_type, raw_email = "final", row[0]
	        else:
	            # Empty row: nothing to match.
	            continue

	        if normalize_email(raw_email) != normalized_email:
	            continue

	        if row_type == "practice":
	            status["practice_used"] = True
	        elif row_type == "final":
	            status["final_used"] = True

	    return status


	def save_attempt(record):
	    """Append ``record`` to the Google Sheet as one row.

	    Cells follow the key order of ``SHEET_COLUMNS``; keys missing from
	    ``record`` are written as empty strings.
	    """
	    worksheet = get_google_sheet()
	    cells = []
	    for column_key, _header in SHEET_COLUMNS:
	        cells.append(serialize_sheet_value(record.get(column_key, "")))
	    worksheet.append_row(
	        cells,
	        value_input_option="RAW",
	        insert_data_option="INSERT_ROWS",
	        table_range="A1",
	    )


	def record_local_attempt(email, submission_type):
	    """Mark ``submission_type`` as used for ``email`` in ``LOCAL_ATTEMPTS``.

	    Flags already recorded for the email are preserved.
	    """
	    previous = LOCAL_ATTEMPTS.get(email, {})
	    practice_used = previous.get("practice_used", False) or submission_type == "practice"
	    final_used = previous.get("final_used", False) or submission_type == "final"
	    LOCAL_ATTEMPTS[email] = {
	        "practice_used": practice_used,
	        "final_used": final_used,
	    }


	def build_attempt_record(name, email, prompts, submission_result):
	    """Assemble the record dict that is saved for one graded submission.

	    The timestamp is the current UTC time in ISO format; the remaining
	    values come from ``submission_result`` and the formatted prompts.
	    """
	    case_results = submission_result["case_results"]
	    record = {
	        "timestamp": datetime.now(timezone.utc).isoformat(),
	        "submission_type": submission_result["submission_type"],
	        "name": name,
	        "email": email,
	        "results": submission_result["results"],
	        "results_without_llm_1": submission_result["results_without_llm_1"],
	    }
	    record["llm_score_breakdown"] = build_score_breakdown_cell(case_results)
	    record["llm_output_vs_expected"] = build_output_vs_expected_cell(case_results)
	    record["llm_2_output"] = build_llm2_output_cell(case_results)
	    record["prompts"] = format_prompts_cell(prompts)
	    return record


	def format_field_accuracy(field_accuracy):
	    """Format per-field accuracy as one bullet line per ``FINAL_SCHEMA_KEYS`` entry."""
	    labels = {
	        "buyer_counsel": "Buyer counsel",
	        "seller_counsel": "Seller counsel",
	        "third_party_counsel": "Third-party counsel",
	        "user_question": "User question",
	    }

	    def render(field_name):
	        stats = field_accuracy[field_name]
	        return (
	            f"- {labels[field_name]}: {stats['accuracy']}% "
	            f"({stats['correct']}/{stats['total']})"
	        )

	    return "\n".join(render(field_name) for field_name in FINAL_SCHEMA_KEYS)


	def build_user_facing_reasoning_message(label, llm_name):
	    """Return the candidate-facing explanation for a reasoning label.

	    ``"LLM 1"`` labels use a dedicated message table and fall back to
	    ``explain_reasoning_label``. Other labels use fixed messages for the
	    well-known values; anything else is read as a comma-separated list of
	    issue tags, each explained individually and joined with ``"; "``.
	    """
	    if llm_name == "LLM 1":
	        llm1_messages = {
	            "matched": "No issue in these runs.",
	            "right_answer_wrong_format": "The model identified the right answer but did not use the required Step 1 format.",
	            "wrong_answer_right_format": "The model used the required Step 1 format but identified the wrong target firm.",
	            "wrong_answer_wrong_format": "The model gave neither the right output nor the right format.",
	            "wrong_route": "The model chose the wrong route for this question and should have handled relevance differently.",
	            "call_failed": "The Step 1 model call failed, so no valid answer was produced.",
	        }
	        generic = explain_reasoning_label(label)
	        return llm1_messages.get(label, generic)

	    fixed_messages = (
	        ("matched", "No issue in these runs."),
	        ("clean", "The final JSON was valid and passed the evaluator checks."),
	        ("skipped_irrelevant", "This case was correctly rejected as irrelevant before Step 3."),
	        ("invalid_json", "The final answer was not valid JSON in the required schema."),
	        (
	            "relevance_false_positive",
	            "This question should have been rejected as irrelevant earlier, but the pipeline continued as if it were relevant.",
	        ),
	    )
	    for known_label, message in fixed_messages:
	        if label == known_label:
	            return message

	    # Composite label: explain every tag, preferring the field-specific wording.
	    details = []
	    for raw_tag in str(label).split(","):
	        tag = raw_tag.strip()
	        if not tag:
	            continue
	        field_detail = explain_output_issue_tag(tag)
	        details.append(field_detail if field_detail else explain_reasoning_label(tag))

	    if not details:
	        return "The final output had evaluator-detected issues."
	    return "; ".join(details)


	def split_field_issue_tag(tag):
	    """Split a tag of the form ``<issue>_<field>`` into ``(issue, field)``.

	    Field names from ``FINAL_SCHEMA_KEYS`` are tried longest first. Returns
	    ``(None, None)`` when the tag does not end with any known field name.
	    """
	    candidates = sorted(FINAL_SCHEMA_KEYS, key=len, reverse=True)
	    matched_field = next(
	        (name for name in candidates if tag.endswith(f"_{name}")),
	        None,
	    )
	    if matched_field is None:
	        return None, None
	    # Drop the field name and the underscore that precedes it.
	    return tag[: -(len(matched_field) + 1)], matched_field


	def explain_output_issue_tag(tag):
	    """Explain a field-level issue tag as ``"<Field label>: <message>"``.

	    Returns ``None`` when the tag cannot be split into an issue and a known
	    field, or when the issue name has no message.
	    """
	    issue_name, field_name = split_field_issue_tag(tag)
	    if not (issue_name and field_name):
	        return None

	    field_label = {
	        "buyer_counsel": "Buyer counsel",
	        "seller_counsel": "Seller counsel",
	        "third_party_counsel": "Third-party counsel",
	        "user_question": "User question",
	    }[field_name]

	    message = {
	        "missing_field": "the field was missing from the final JSON.",
	        "malformed_citation": "the citation format was malformed.",
	        "unexpected_citation": "a citation was included where it should not have been.",
	        "used_unknown": 'the answer said "unknown" even though the evidence supported a conclusion.',
	        "hallucinated_value": "a value was given even though the expected answer was unknown.",
	        "wrong_value": "the answer value was wrong.",
	        "missing_citation": "the answer needed a citation but did not include one.",
	        "bad_citation_reference": "the citation pointed to a snippet number that does not exist.",
	        "unsupported_citation": "the citation pointed to a snippet that does not support the answer.",
	    }.get(issue_name)

	    if message:
	        return f"{field_label}: {message}"
	    return None


	def build_practice_run_issue_lines(runs, key, llm_name):
	    """List the issues found across runs as bullet lines for practice feedback.

	    Runs labelled ``matched``, ``clean`` or ``skipped_irrelevant`` are left
	    out. The remaining runs (numbered from 1) are grouped while successive
	    kept labels are equal. Returns a single "no issues" bullet when nothing
	    remains.
	    """
	    issue_free = {"matched", "clean", "skipped_irrelevant"}

	    groups = []  # (label, [run numbers]) in order of appearance
	    for position, entry in enumerate(runs, start=1):
	        label = entry[key]
	        if label in issue_free:
	            continue
	        if groups and groups[-1][0] == label:
	            groups[-1][1].append(position)
	        else:
	            groups.append((label, [position]))

	    if not groups:
	        return ["- No issues detected."]

	    lines = []
	    for label, positions in groups:
	        lines.append(
	            f"- RUN {format_run_indices(positions)}: "
	            f"{build_user_facing_reasoning_message(label, llm_name)}"
	        )
	    return lines


	def build_practice_case_feedback(case_results):
	    """Render per-case LLM 1 / LLM 3 scores and issue bullets for practice feedback."""
	    stages = (
	        ("LLM 1", "llm1_score", "llm1_reasoning"),
	        ("LLM 3", "llm3_score", "llm3_reasoning"),
	    )

	    sections = []
	    for case_number, case in enumerate(case_results, start=1):
	        runs = case["runs"]
	        run_count = len(runs)
	        stage_totals = [total_run_score(runs, score_key) for _, score_key, _ in stages]

	        lines = [f"CASE {case_number}"]
	        for (title, _score_key, reasoning_key), stage_total in zip(stages, stage_totals):
	            lines.append("")
	            lines.append(title)
	            lines.append(f"Score: {format_score(stage_total)} / {run_count}")
	            lines.append("Issues:")
	            lines.extend(build_practice_run_issue_lines(runs, reasoning_key, title))
	        sections.append("\n".join(lines))

	    return join_sheet_sections(sections)


	def build_practice_scoring_note(submission_result):
	    """Return the sentence explaining how the practice score was computed.

	    The repeat count is read from ``EVAL_REPEAT_COUNT`` rather than being
	    hard-coded, so the note stays correct if the constant changes.
	    """
	    total_cases = submission_result["total_cases"]
	    return (
	        f"This practice score is based on {total_cases} hidden calibration cases. "
	        f"Each case is run {EVAL_REPEAT_COUNT} times to check prompt consistency."
	    )


	def format_practice_feedback(name, submission_result):
	    """Compose the full practice-run message shown to the candidate."""
	    case_results = submission_result["case_results"]
	    lines = [
	        f"Practice run complete for {name}.",
	        "",
	        f"Score: {submission_result['score_percent']}%",
	        f"{build_practice_scoring_note(submission_result)}",
	        "",
	        f"{build_practice_case_feedback(case_results)}",
	        "",
	        "Your final submission is still available.",
	    ]
	    return "\n".join(lines)


	def load_eval_set_for_mode(submission_type):
	    """Load the evaluation set for the mode: ``PRACTICE`` for practice, else ``FINAL``."""
	    if submission_type == "practice":
	        return load_eval_set("PRACTICE")
	    return load_eval_set("FINAL")


	def validate_submission_inputs(email, name):
	    """Return an error message for an invalid email or blank name, else ``None``.

	    The email is checked first, so its message wins when both are invalid.
	    """
	    email_ok = validate_email(email)
	    if not email_ok:
	        return "Invalid email address. Please enter a valid email."

	    stripped_name = name.strip()
	    if not stripped_name:
	        return "Please enter your full name."

	    return None


	def submit_attempt(submission_type, email, name, prompt_1, prompt_2, prompt_3):
	"""Validate, grade and record one practice or final submission.

	Stages run in order: input validation, attempt-status lookup, dataset
	load, grading, record build, and sheet save (unless bypassed). A failure
	in any stage after validation is logged and returned as an unsuccessful
	response built by build_submission_response.

	Raises:
	ValueError: if submission_type is not one of SUBMISSION_TYPES.
	"""
	if submission_type not in SUBMISSION_TYPES:
	raise ValueError(f"Unsupported submission type: {submission_type}")

	validation_error = validate_submission_inputs(email, name)
	if validation_error:
	return build_submission_response(False, validation_error)

	normalized_email = normalize_email(email)
	clean_name = sanitize_input(name)
	prompts = {
	"prompt_1": sanitize_prompt(prompt_1),
	"prompt_2": sanitize_prompt(prompt_2),
	"prompt_3": sanitize_prompt(prompt_3),
	}
	bypass_sheet_save = should_bypass_sheet_save()

	# Stage: read which attempts this email has already used.
	try:
	attempt_status = get_attempt_status(normalized_email)
	except Exception:
	log_stage_exception(
	"Unexpected error while reading sheet state for %s submission for %s.",
	submission_type,
	normalized_email,
	)
	return build_submission_response(
	False,
	build_stage_error_message(submission_type, "sheet_read"),
	)

	# Practice is refused once either the final or the practice attempt is used.
	if submission_type == "practice":
	if attempt_status["final_used"]:
	return build_submission_response(
	False,
	(
	f"A final submission has already been received for {normalized_email}. "
	"Practice is no longer available."
	),
	disable_practice=True,
	disable_final=True,
	)
	if attempt_status["practice_used"]:
	return build_submission_response(
	False,
	(
	f"Practice has already been used for {normalized_email}. "
	"Your final submission is still available."
	),
	disable_practice=True,
	disable_final=False,
	)

	# A final submission is refused only when a final was already recorded.
	if submission_type == "final" and attempt_status["final_used"]:
	return build_submission_response(
	False,
	(
	f"Final submission already received for {normalized_email}. "
	"You can only submit the final once."
	),
	disable_practice=True,
	disable_final=True,
	)

	# Stage: load the hidden dataset for this mode.
	try:
	eval_set = load_eval_set_for_mode(submission_type)
	except Exception:
	log_stage_exception(
	"Unexpected error while loading %s dataset for %s.",
	submission_type,
	normalized_email,
	)
	return build_submission_response(
	False,
	build_stage_error_message(submission_type, "dataset_load"),
	)

	if not eval_set["cases"]:
	return build_submission_response(
	False,
	f"No hidden cases configured for the {submission_type} dataset.",
	)

	# Stage: grade the prompts against the dataset.
	try:
	submission_result = grade_submission(eval_set, prompts, submission_type)
	except Exception:
	log_stage_exception(
	"Unexpected error while grading %s submission for %s.",
	submission_type,
	normalized_email,
	)
	return build_submission_response(
	False,
	build_stage_error_message(submission_type, "grading"),
	)

	# Stage: build the record that is saved for this submission.
	try:
	record = build_attempt_record(clean_name, normalized_email, prompts, submission_result)
	except Exception:
	log_stage_exception(
	"Unexpected error while building %s record for %s.",
	submission_type,
	normalized_email,
	)
	return build_submission_response(
	False,
	build_stage_error_message(submission_type, "record_build"),
	)

	bypass_notice = build_bypass_save_notice() if bypass_sheet_save else ""

	# Stage: persist the record. The local attempt flag is set only after the
	# sheet write succeeds; with bypass enabled nothing is saved or flagged.
	if not bypass_sheet_save:
	try:
	save_attempt(record)
	record_local_attempt(normalized_email, submission_type)
	except Exception:
	log_stage_exception(
	"Unexpected error while saving %s submission for %s.",
	submission_type,
	normalized_email,
	)
	return build_submission_response(
	False,
	build_save_error_message(submission_type, "sheet_write"),
	)
	else:
	print(
	f"BYPASS_SHEET_SAVE enabled for {submission_type} submission {normalized_email}.",
	flush=True,
	)

	# Practice returns the detailed feedback and leaves the final button enabled.
	if submission_type == "practice":
	notice = "Practice run saved. Your final submission is still available."
	if bypass_notice:
	notice = bypass_notice
	return build_submission_response(
	True,
	format_practice_feedback(clean_name, submission_result),
	notice=notice,
	disable_practice=not bypass_sheet_save,
	disable_final=False,
	)

	# Final returns only a thank-you message; both buttons are disabled unless
	# the save was bypassed.
	notice = "Final submission received. You can close the page."
	if bypass_notice:
	notice = bypass_notice
	return build_submission_response(
	True,
	f"Thank you for your submission, {clean_name}!",
	notice=notice,
	disable_practice=not bypass_sheet_save,
	disable_final=not bypass_sheet_save,
	)


	def build_submission_callback_result(result):
	    """Convert a submission response dict into the 4-tuple returned to Gradio.

	    The tuple holds the message text, interactivity updates for the
	    practice and final buttons, and an update for the notice, which is
	    shown only when the notice text is non-empty.
	    """
	    message = result["message"]
	    practice_update = gr.update(interactive=not result["disable_practice"])
	    final_update = gr.update(interactive=not result["disable_final"])
	    notice_update = gr.update(value=result["notice"], visible=bool(result["notice"]))
	    return (message, practice_update, final_update, notice_update)


	def handle_submission(submission_type, email, name, s1, s2, s3):
	    """Run ``submit_attempt`` and always return Gradio callback outputs.

	    Any exception raised by ``submit_attempt`` is logged and replaced with a
	    generic "callback" stage error response.
	    """
	    try:
	        outcome = submit_attempt(submission_type, email, name, s1, s2, s3)
	    except Exception:
	        log_stage_exception("Unhandled callback failure for %s submission.", submission_type)
	        error_text = build_stage_error_message(submission_type, "callback")
	        outcome = build_submission_response(False, error_text)
	    return build_submission_callback_result(outcome)


	def build_interface():
	with gr.Blocks(css=APP_CSS) as demo:
	gr.Markdown(
	"""
	# Applicant Task: Target Company & Law Firm Identification
	Draft prompts for a strict 3-step legal review pipeline over snippets from Asset Purchase Agreements [SEC Agreement Example](https://www.sec.gov/Archives/edgar/data/28452/000119312505012401/dex101.htm)
	> This evaluation system uses (default: `gpt-5-mini`) for all LLM steps.

	## What you need to do

	- Decide whether the query is relevant and normalize the target firm name.
	- Inspect each snippet independently and record only what that snippet actually supports.
	- Combine those snippet-level findings into one final JSON answer with citations, including a cited answer to the original user question.

	## The 3-step pipeline

	### Step 1: Relevance and target-firm normalization
	Decide whether the question belongs in this deal at all, and if it does, standardize the firm name you will track through the rest of the pipeline.

	- If the query is irrelevant, return `<user_message>...</user_message>`.
	- If the query is relevant, return only:

	```text
	TARGET_FIRM: <canonical firm name>
	```
	Please ensure your final output uses the exact key `TARGET_FIRM:` as shown above, alongside the firm name.

	### Step 2: Snippet-by-snippet analysis
	Step 2 is not the final answer; it is a working note for one snippet at a time.

	- Your Step 2 prompt runs independently on each evidence unit, so the model only sees one snippet per call.
	- The app passes:
	- the Step 1 output
	- `Snippet ID: S1..Sn`
	- the snippet text
	- A good Step 2 prompt says what this snippet supports, what it does not support, and where the evidence is still uncertain.

	### Step 3: Reconciliation and final answer
	Step 3 receives the Step 1 output plus all Step 2 notes and must turn them into one answer that could survive a hostile redline, including a direct answer to the original user question.

	- Step 3 receives the Step 1 output plus all Step 2 outputs.
	- It must return valid JSON with this exact schema:

	```json
	{
	"buyer_counsel": "string with citations like \"Firm Name [^2]\" or \"unknown\"",
	"seller_counsel": "string with citations like \"Firm Name [^4]\" or \"unknown\"",
	"third_party_counsel": "string with citations like \"Firm Name [^1]\" or \"unknown\"",
	"user_question": "string with citations like \"true [^2]\", \"false [^4]\", or \"unknown\""
	}
	```

	Use `"unknown"` when a counsel field or the user-question answer cannot be supported by the evidence. When you provide a firm name, include snippet citations such as `[^2]` that point to the relevant Step 2 snippet IDs. The `user_question` field should answer the original query using only `true`, `false`, or `unknown`: if the answer is `true` or `false`, include supporting snippet citations and do not add any extra text.
	"""
	)

	gr.Markdown(
	"""
	## Workflow At A Glance

	This visual shows how the query moves from `LLM1` to `LLM2` to `LLM3`, and how the final JSON is assembled from the APA snippets.
	"""
	)
	gr.Image(
	value="mermaid_diagram.png",
	label="Pipeline overview",
	show_label=True,
	interactive=False,
	)

	with gr.Accordion("Example Workflow", open=False):
	gr.Markdown(
	"""
	User query
	```text
	Is Kirkland & Ellis LLP acting as counsel anywhere in this Asset Purchase Agreement?
	```

	Step 1 relevant output
	```text
	TARGET_FIRM: Kirkland & Ellis LLP
	```

	Example evidence units
	- `S1`: the opening paragraph names the parties and mentions Kirkland & Ellis LLP as transaction counsel to the buyer.
	- `S2`: the notices section identifies buyer counsel as Kirkland & Ellis LLP and seller counsel as Wachtell, Lipton, Rosen & Katz.
	- `S3`: a boilerplate clause contains no counsel information and should not drive the final answer.
	- `S4`: a representative provision states that Gibson, Dunn & Crutcher LLP advises the securityholders' representative.
	- `S5`: a later notice block again confirms seller counsel as Wachtell, Lipton, Rosen & Katz.

	Final JSON shape
	```json
	{
	"buyer_counsel": "Kirkland & Ellis LLP [^1] [^2]",
	"seller_counsel": "Wachtell, Lipton, Rosen & Katz [^2] [^5]",
	"third_party_counsel": "Gibson, Dunn & Crutcher LLP [^4]",
	"user_question": "true [^1] [^2]"
	}
	```
	"""
	)

	with gr.Accordion("Practice and Final Submission", open=False):
	gr.Markdown(
	"""
	- You may use one optional practice run per email to test your prompts against a hidden calibration set.
	- The practice run uses 3 hidden calibration cases.
	- Each case is run 3 times to check prompt consistency.
	- For each run, LLM 1 can earn up to 1 point for correct routing and target-firm normalization, and LLM 3 can earn up to 1 point for a correct final JSON answer with supported citations.
	- Step 2 is not scored directly, but it strongly affects the LLM 3 score because Step 3 relies on the snippet-level analysis.
	- Practice returns aggregate feedback only: score percentage, an LLM 1 summary, and an LLM 3 summary.
	- You may then revise your prompts or keep them as they are.
	- You may submit one final submission per email against a separate hidden holdout set.
	- After the final submission, practice is no longer available.
	- No structured decoding is used for you, so your prompts must make Step 3 produce reliable JSON on their own.
	"""
	)

	gr.Markdown(
	"""
	Enter your name and email exactly as listed in your CV. Both buttons below use the same three prompt boxes.

	You have one chance to run the practice set and get feedback, and one chance to run the final set. After you click a button, wait for the results to load before clicking again or refreshing the page.

	Good Luck!
	"""
	)

	email_input = gr.Textbox(label="Email", placeholder="your.email@example.com")
	name_input = gr.Textbox(label="First Name, Last Name", placeholder="John Smith")
	system_prompt_input_1 = gr.Textbox(
	label="System Prompt for Step 1",
	placeholder="Enter your Step 1 prompt here...",
	lines=6,
	)
	system_prompt_input_2 = gr.Textbox(
	label="System Prompt for Step 2",
	placeholder="Enter your Step 2 prompt here...",
	lines=10,
	)
	system_prompt_input_3 = gr.Textbox(
	label="System Prompt for Step 3",
	placeholder="Enter your Step 3 prompt here...",
	lines=6,
	)

	gr.Markdown(
	"""
	<div class="submission-note">
	<b>Please note:</b><br>
	Each run may take a couple of minutes.<br>
	After you click a button, wait for the result and do not click it again.
	</div>
	"""
	)

	with gr.Row():
	practice_button = gr.Button("Practice Run")
	final_button = gr.Button("Submit Final")

	output_text = gr.Textbox(label="Results", lines=18)
	feedback_md = gr.Markdown("", visible=False)

	def practice_submit_and_update(email, name, s1, s2, s3):
	    """Click handler for the "Practice Run" button.
	    Passes the candidate's email, name and the three step prompts on to
	    ``handle_submission`` with the submission type fixed to "practice",
	    and returns whatever that call returns so Gradio can apply it to the
	    components listed in the button's ``outputs``.
	    """
	    submission_type = "practice"
	    step_prompts = (s1, s2, s3)
	    return handle_submission(submission_type, email, name, *step_prompts)

	def final_submit_and_update(email, name, s1, s2, s3):
	    """Click handler for the "Submit Final" button.
	    Passes the candidate's email, name and the three step prompts on to
	    ``handle_submission`` with the submission type fixed to "final",
	    and returns whatever that call returns so Gradio can apply it to the
	    components listed in the button's ``outputs``.
	    """
	    submission_type = "final"
	    step_prompts = (s1, s2, s3)
	    return handle_submission(submission_type, email, name, *step_prompts)

	practice_button.click(
	fn=practice_submit_and_update,
	inputs=[
	email_input,
	name_input,
	system_prompt_input_1,
	system_prompt_input_2,
	system_prompt_input_3,
	],
	outputs=[output_text, practice_button, final_button, feedback_md],
	)

	final_button.click(
	fn=final_submit_and_update,
	inputs=[
	email_input,
	name_input,
	system_prompt_input_1,
	system_prompt_input_2,
	system_prompt_input_3,
	],
	outputs=[output_text, practice_button, final_button, feedback_md],
	)

	return demo


	if __name__ == "__main__":
	    # Script entry point: build the Gradio UI, then serve it on every
	    # network interface at port 7860 with Gradio's SSR mode turned off.
	    interface = build_interface()
	    interface.launch(
	        server_name="0.0.0.0",
	        server_port=7860,
	        ssr_mode=False,
	    )