from __future__ import annotations

import re

# Closing pleasantries that never carry the answer. Entries are compared
# against a line after normalize_basic_answer() and lower-casing.
_FLUFF_LINES = {
    "i hope this helps",
    "hope this helps",
    "let me know if you need anything else",
    "thanks",
    "thank you",
}

# Explicit "here is the answer" markers, tried in this order. Each pattern
# captures everything from the marker to the end of the text.
_EXPLICIT_ANSWER_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"(?is)\bfinal answer\s*:\s*(.+)$",
        r"(?is)\banswer\s*:\s*(.+)$",
        r"(?is)\bthe answer is\s*:\s*(.+)$",
        r"(?is)\bthe answer is\s+(.+)$",
    )
)

# Leading "Answer:" / "Final answer:" label.
_LEADING_LABEL_RE = re.compile(r"(?i)^(final answer|answer)\s*:\s*")

# Lines longer than this are not picked by the end-of-text heuristic.
_MAX_ANSWER_LINE_LENGTH = 200

# Question phrasings that request a comma-joined list without spaces.
_COMMA_LIST_MARKERS = (
    "comma separated",
    "comma-separated list",
    "comma delimited list",
    "comma-delimited list",
)

# Normalised, lower-cased outputs that count as "no real answer".
_PLACEHOLDER_ANSWERS = frozenset({"", "placeholder", "n/a", "unknown"})


def _nonempty_lines(text: str) -> list[str]:
    """Return the stripped, non-blank lines of *text*."""
    return [line.strip() for line in text.splitlines() if line.strip()]


def _comma_parts(text: str) -> list[str]:
    """Split *text* on commas and return the stripped, non-empty pieces."""
    return [part.strip() for part in text.split(",") if part.strip()]


def _last_tokens(text: str) -> list[str]:
    """Return the last whitespace-separated token of each comma-separated item."""
    return [part.split()[-1] for part in _comma_parts(text)]


def extract_final_answer(text: str) -> str:
    """
    Extract the most likely final answer from raw model output.

    Strategy:
    - strip a leading/trailing code fence
    - prefer explicit markers like 'Final answer:' and return the first
      non-blank line that follows the marker
    - otherwise return the last non-blank line that is not a pleasantry and
      is at most 200 characters long after basic normalisation
    - if no line qualifies, return the last non-blank line

    Returns "" for None or blank input. The returned line is stripped but
    otherwise not normalised.
    """
    if text is None:
        return ""

    text = str(text).strip()
    if not text:
        return ""

    text = re.sub(r"^```[a-zA-Z0-9_-]*\s*", "", text)
    text = re.sub(r"\s*```$", "", text)

    for pattern in _EXPLICIT_ANSWER_PATTERNS:
        match = pattern.search(text)
        if match:
            candidate_lines = _nonempty_lines(match.group(1))
            if candidate_lines:
                return candidate_lines[0]

    lines = _nonempty_lines(text)
    if not lines:
        return ""

    for line in reversed(lines):
        normalized = normalize_basic_answer(line).lower()
        if (
            normalized
            and normalized not in _FLUFF_LINES
            and len(normalized) <= _MAX_ANSWER_LINE_LENGTH
        ):
            return line

    return lines[-1]


def normalize_basic_answer(text: str) -> str:
    """
    Basic cleanup independent of question format.

    Collapses whitespace runs to single spaces, removes one leading
    'Answer:' / 'Final answer:' label, removes one pair of matching outer
    quotes, and drops a single trailing period. Returns "" for None or
    blank input.
    """
    if text is None:
        return ""

    text = str(text).strip()
    if not text:
        return ""

    text = re.sub(r"\s+", " ", text).strip()
    text = _LEADING_LABEL_RE.sub("", text).strip()

    if len(text) >= 2 and text[0] == text[-1] and text[0] in {'"', "'"}:
        text = text[1:-1].strip()

    # A decimal such as "3.14" does not end in ".", so it is left untouched;
    # "3.14." becomes "3.14".
    if text.endswith("."):
        text = text[:-1].strip()

    return text


def normalize_final_answer(*args: str) -> str:
    """
    Backward-compatible normalizer.

    Supports:
    - normalize_final_answer(text)
    - normalize_final_answer(question, text)

    Any other number of arguments returns "".

    The text is first cleaned with normalize_basic_answer(); then formatting
    rules are applied according to phrases found in the lower-cased question
    (first name, last names, surname, city, IOC code, algebraic notation,
    comma-separated lists, ascending/alphabetical ordering, two decimal
    places, NASA award number). Rules are applied in a fixed order and each
    one works on the output of the previous one.
    """
    if len(args) == 1:
        question, text = "", args[0]
    elif len(args) == 2:
        question, text = args
    else:
        return ""

    text = normalize_basic_answer(text)
    if not text:
        return ""

    q = (question or "").lower()

    # Remove outer labels once more, conservatively. This can leave the text
    # empty (e.g. "Answer: answer:"), so the rules below must tolerate "".
    text = _LEADING_LABEL_RE.sub("", text).strip()

    # first name only
    if "give only the first name" in q or "first name only" in q:
        tokens = text.split()
        if tokens:
            text = tokens[0]

    # last name only ("use their last names only" also contains this phrase)
    if "last names only" in q:
        last_names = _last_tokens(text)
        if last_names:
            text = ", ".join(last_names)

    # surname only
    if "what is the surname" in q or "surname of" in q:
        tokens = text.split()
        if tokens:
            text = tokens[-1]

    # city only: keep what precedes the first comma, semicolon, bracket or hyphen
    if "city name without abbreviations" in q or "just give me the city name" in q:
        text = re.split(r"[,;()\-]", text)[0].strip()

    # IOC code
    if "ioc country code" in q:
        text = text.strip().upper()

    # algebraic notation: keep the first token; empty text stays empty
    # instead of raising IndexError
    if "algebraic notation" in q:
        tokens = text.split()
        if tokens:
            text = tokens[0]

    # comma-separated list formatting
    if any(marker in q for marker in _COMMA_LIST_MARKERS):
        parts = [p.strip() for p in re.split(r",|\n", text) if p.strip()]
        text = ",".join(parts)

    # ascending order: only applied when every item parses as an integer
    if "ascending order" in q:
        try:
            nums = [int(item) for item in _comma_parts(text)]
        except ValueError:
            pass  # best effort: leave non-integer lists untouched
        else:
            text = ",".join(str(n) for n in sorted(nums))

    # alphabetical order (case-insensitive)
    if "alphabetical order" in q or "alphabetize" in q or "alphabetized" in q:
        parts = _comma_parts(text)
        if parts:
            text = ",".join(sorted(parts, key=str.lower))

    # two decimal places: format the first number found, ignoring thousands commas
    if "two decimal places" in q:
        match = re.search(r"-?\d+(?:\.\d+)?", text.replace(",", ""))
        if match:
            text = f"{float(match.group(0)):.2f}"

    if "nasa award number" in q:
        text = text.replace("NASA award number", "").strip()

    if "city name without abbreviations" in q:
        text = text.replace("St. Petersburg", "Saint Petersburg").strip()

    # For this exact phrasing the last names are re-joined without spaces.
    if "use their last names only" in q:
        last_names = _last_tokens(text)
        if last_names:
            text = ",".join(last_names)

    return text.strip()


def is_placeholder_answer(text: str) -> bool:
    """
    Detect placeholder/fallback outputs.

    Returns True for None and for text that, after basic normalisation and
    lower-casing, is empty or one of "placeholder", "n/a", "unknown".
    """
    if text is None:
        return True

    return normalize_basic_answer(text).lower() in _PLACEHOLDER_ANSWERS