"""Score model-generated reasoning traces against reference solutions.

For every ``.jsonl`` results file listed in ``non_reasoning_models`` the
script computes step-level recall/precision, a final-answer match flag,
ROUGE F-measures and a BERTScore F1, and writes one JSON line of metrics per
input line into ``evals_dir``.
"""

import functools
import json
import math
import os
import random
import re
import warnings
from collections import Counter
from typing import Optional

import evaluate
import numpy as np
import pandas as pd
import torch
from bert_score import BERTScorer
from langchain.schema import HumanMessage
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

warnings.filterwarnings("ignore")

results_dir = '../results/'
evals_dir = '../evals/'

reasoning_models = [
    # List all files with model generations here
]

non_reasoning_models = [
    # List all files with model generations here
]


def load_jsonl(file_path):
    """Read a JSON-lines file and return its records as a list.

    Args:
        file_path: Path of the file; every line must hold one JSON document.

    Returns:
        A list with one decoded object per line, in file order.
    """
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]


# Sentence-embedding model used to compare reasoning steps.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
bertscorer = BERTScorer(
    model_type='allenai/longformer-base-4096',
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
rougescorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True
)


def convert_to_float(text):
    """Parse the first number found in *text*, applying a magnitude word.

    ``~``, ``$`` and ``,`` are stripped first. The value is multiplied by
    1e9 / 1e6 / 1e3 when the text contains "billion" / "million" /
    "thousand" (case-insensitive, checked in that order).

    Args:
        text: Raw string that may contain a number.

    Returns:
        The parsed value as a float, or ``None`` when no digit/dot run in the
        text can be parsed as a float.
    """
    cleaned = text.replace('~', '').strip()  # Remove approximate symbol
    cleaned = cleaned.replace('$', '').replace(',', '')  # Remove dollar sign and commas

    number = None
    for token in re.findall(r'[\d.]+', cleaned):
        try:
            number = float(token)
        except ValueError:
            # A lone "." (e.g. a sentence period before the number) also
            # matches the pattern; skip it instead of giving up.
            continue
        break
    if number is None:
        return None

    # Handle multipliers
    lowered = cleaned.lower()
    if 'billion' in lowered:
        number *= 1000000000
    elif 'million' in lowered:
        number *= 1000000
    elif 'thousand' in lowered:
        number *= 1000
    return number


# Cos similarity on two bags of words over given threshold
def cos_similarity(a: str, b: str, min_val: float = 0.0, max_val: float = 1.0) -> Optional[float]:
    """Bag-of-words cosine similarity between two raw strings.

    Both strings are split on whitespace and compared by token counts.

    Args:
        a: First string.
        b: Second string.
        min_val: Inclusive lower bound for an accepted similarity.
        max_val: Inclusive upper bound for an accepted similarity.

    Returns:
        The similarity when it lies in ``[min_val, max_val]``, otherwise
        ``None``. Two strings where either has no tokens score 0.0.
    """
    counts_a, counts_b = Counter(a.split()), Counter(b.split())
    dot = sum(counts_a[w] * counts_b[w] for w in counts_a.keys() & counts_b.keys())
    norm_a = math.sqrt(sum(c ** 2 for c in counts_a.values()))
    norm_b = math.sqrt(sum(c ** 2 for c in counts_b.values()))
    val = dot / (norm_a * norm_b) if norm_a and norm_b else 0.0
    # Rounding in sqrt can push identical inputs to 1.0000000000000002,
    # which would wrongly fail the max_val check; clamp to the true maximum.
    val = min(val, 1.0)
    if min_val <= val <= max_val:
        return val
    return None


def extract_final_answer(step):
    """Extract the answer stated in one reasoning step.

    Whitespace is collapsed, then four strategies are tried in order:

    1. an explicit ``Answer:`` marker followed by a number,
    2. the number after the last ``=`` in the step,
    3. the last number-like token in the step,
    4. a number following ``.`` or ``:`` (optionally prefixed by
       "answer: " / "final answer: " / "final answer is: ").

    Args:
        step: Text of a single step.

    Returns:
        A float for strategies 1-3 (or ``None`` if the matched text cannot be
        parsed), the raw matched string for strategy 4, or ``None`` when
        nothing matches.
    """
    # Replace all multiple spaces with a single space
    step = re.sub(r'\s+', ' ', step).strip()
    lowered = step.lower()

    # Try capturing explicit "Answer: $" pattern
    answer_match = re.search(
        r'\**Answer:\**\s*.*?(?:USD|\$)?\s*([\d,]+(?:\.\d{1,2})?)', step
    )
    if answer_match:
        return convert_to_float(answer_match.group(0))

    # Fallback: last valid dollar-format number after =
    final_answer_match = re.search(
        r'=\s*(?!.*=)(?:USD|\$)?\s*[\d,]+(?:\.\d{1,2})?\s*(million|billion|thousand)?',
        lowered,
    )
    if final_answer_match:
        return convert_to_float(final_answer_match.group(0))

    # Fallback: last valid number in the step (Georgi)
    # NOTE(review): the "." after "\d" is unescaped and matches any character;
    # confirm whether a literal separator was intended.
    matches = list(re.finditer(
        r'\d.[\d,]+(?:\.\d{1,2})?\s*(million|billion|thousand)?', lowered
    ))
    if matches:
        return convert_to_float(matches[-1].group(0))

    # Fallback: extract last sentence (Georgi). Could start with "Answer: " or
    # "Final Answer: " or "Final Answer is: " or \n or .
    last_sentence_match = re.search(
        r'(?<=\n|\.|:)\s*(answer: |final answer: |final answer is: )?([\d,]+(?:\.\d{1,2})?\s*(million|billion|thousand)?)',
        lowered,
    )
    if last_sentence_match:
        # This branch returns the matched text itself, not a float.
        return last_sentence_match.group(0)
    return None


def extract_steps(text):
    """Split a solution into steps and extract each step's answer.

    Anything after a ``"\\nuser\\n"`` marker is discarded. Steps start at
    lines matching "Step <n>" (optionally decorated with ``*``/``#``); when
    none exist, lines starting with a number are used instead.

    Args:
        text: Full solution or generation text.

    Returns:
        A tuple ``(step_strings, step_final_answers, final_answer)`` where
        ``step_strings`` are the whitespace-normalised steps,
        ``step_final_answers`` holds one ``extract_final_answer`` result per
        step, and ``final_answer`` is the last non-``None`` step answer (or
        ``None``).
    """
    if '\nuser\n' in text:
        text = text.split('\nuser\n')[0]

    # Find all instances of "Step" at the beginning of a line or after newline
    step_starts = [
        m.span()[0]
        for m in re.finditer(r'(?:^|\n)(\s*)(\**)(#*)(\s*)Step(-*)(\s*)(\d+)', text)
    ]
    if not step_starts:
        step_starts = [
            m.span()[0] for m in re.finditer(r'(?:^|\n)\s*(\**)(\d+)(.*)', text)
        ]

    # Each step runs from its start to the next start; the last one runs to
    # the end of the text. With no starts this yields no steps.
    step_ends = step_starts[1:] + [len(text)]
    step_strings = [text[start:end].strip() for start, end in zip(step_starts, step_ends)]

    # Remove leading "Step X" from each step
    step_strings = [
        re.sub(r'(?i)Step\s+\d+\s*:', '', step).strip() for step in step_strings
    ]

    # Find stepwise final answers
    step_final_answers = [extract_final_answer(step) for step in step_strings]

    # Find final answer: the last step that yielded one
    final_answer = next(
        (answer for answer in reversed(step_final_answers) if answer is not None),
        None,
    )

    # Remove extra spaces
    step_strings = [re.sub(r'\s+', ' ', step) for step in step_strings]
    return step_strings, step_final_answers, final_answer


def preprocess_document(document):
    """Collapse every run of whitespace in *document* into a single space."""
    return re.sub(r"\s+", " ", document)


MAX_TOKENS = 4096  # Longformer's limit


@functools.lru_cache(maxsize=None)
def _get_truncation_tokenizer():
    """Load the Longformer tokenizer once and reuse it for every call."""
    from transformers import AutoTokenizer

    return AutoTokenizer.from_pretrained("allenai/longformer-base-4096")


def safe_score(bertscorer, gt, pred):
    """Compute a BERTScore F1 for one pair, returning 0.0 on bad input.

    Both texts are truncated to ``MAX_TOKENS`` tokens with the Longformer
    tokenizer and decoded back to text before scoring.

    Args:
        bertscorer: Scorer exposing ``score(cands, refs)``.
        gt: Reference text.
        pred: Generated text.

    Returns:
        The F1 value as a float, or 0.0 when either input is not a string,
        is empty/whitespace, or decoding/scoring raises.
    """
    if not isinstance(gt, str) or not isinstance(pred, str):
        print("Non-string input:", gt, pred)
        return 0.0
    if not gt.strip() or not pred.strip():
        print("Empty input:", gt, pred)
        return 0.0

    # Optional: truncate by tokens using the model's tokenizer
    tokenizer = _get_truncation_tokenizer()
    gt_tokens = tokenizer(gt, return_tensors="pt", truncation=True, max_length=MAX_TOKENS)
    pred_tokens = tokenizer(pred, return_tensors="pt", truncation=True, max_length=MAX_TOKENS)

    try:
        # Re-decode to text after truncation (for BERTScore input)
        gt_truncated = tokenizer.batch_decode(gt_tokens["input_ids"], skip_special_tokens=True)[0]
        pred_truncated = tokenizer.batch_decode(pred_tokens["input_ids"], skip_special_tokens=True)[0]
        return bertscorer.score([gt_truncated], [pred_truncated])[2][0].item()
    except Exception as e:
        print("BERTScore error:", e)
        return 0.0


def _answers_match(gt_answer, pred_answer, min_cos, rel_tol):
    """Tell whether two extracted (non-None) answers agree.

    If either answer is a string, both are stringified and must reach a
    bag-of-words cosine similarity of at least ``min_cos``. Otherwise the
    error relative to ``gt_answer`` must be below ``rel_tol``.
    """
    if isinstance(gt_answer, str) or isinstance(pred_answer, str):
        similarity = cos_similarity(str(gt_answer), str(pred_answer), min_val=min_cos)
        return similarity is not None
    # The small constant avoids dividing by zero when the reference is 0.
    return abs(gt_answer - pred_answer) / (gt_answer + 0.0001) < rel_tol


def evaluate_trace(gt, pred):
    """Score one generated reasoning trace against its reference.

    Args:
        gt: Reference solution text, or ``None``.
        pred: Generated text, or ``None``.

    Returns:
        A 7-tuple ``(recall, precision, final_answer_match, rouge2, rougeL,
        rougeLsum, bert_score)``. All seven are ``None`` when either input is
        ``None``. When no steps can be extracted from either text, recall,
        precision and final_answer_match are 0 and the text metrics are still
        reported. Every path returns seven values so callers can unpack the
        result unconditionally.
    """
    if gt is None or pred is None:
        return None, None, None, None, None, None, None

    # Calculate Rouge scores
    rouge_score = rougescorer.score(gt, pred)
    rouge2 = rouge_score['rouge2'].fmeasure
    rougeL = rouge_score['rougeL'].fmeasure
    rougeLsum = rouge_score['rougeLsum'].fmeasure

    # Calculate BERTScore
    bert_score = safe_score(bertscorer, gt, pred)

    # Extract steps and final answers
    gt_steps, gt_step_final_answers, gt_final_answer = extract_steps(gt)
    pred_steps, pred_step_final_answers, pred_final_answer = extract_steps(pred)

    # If no steps or final answers, return zeros for the step-based metrics
    if not gt_steps or not pred_steps:
        return 0, 0, 0, rouge2, rougeL, rougeLsum, bert_score

    # Convert steps to embeddings
    gt_steps_embeddings = model.encode(gt_steps)
    pred_steps_embeddings = model.encode(pred_steps)

    # answer_agreement[i][j] == 1 marks the first predicted step j whose
    # answer agrees with reference step i; all other cells stay 0.
    answer_agreement = [[0] * len(pred_steps) for _ in gt_steps]
    for row, gt_step_answer in enumerate(gt_step_final_answers):
        if gt_step_answer is None:
            continue
        for col, pred_step_answer in enumerate(pred_step_final_answers):
            if pred_step_answer is None:
                continue
            if _answers_match(gt_step_answer, pred_step_answer, min_cos=0.5, rel_tol=0.1):
                answer_agreement[row][col] = 1
                break

    # Calculate cosine similarity between step embeddings, keeping only the
    # pairs whose answers agree
    similarity_matrix = cosine_similarity(gt_steps_embeddings, pred_steps_embeddings)
    similarity_matrix = np.multiply(similarity_matrix, answer_agreement)

    # Calculate recall and precision
    matched_gt_steps = np.max(similarity_matrix, axis=1) > 0.6
    matched_pred_steps = np.max(similarity_matrix, axis=0) > 0.6
    recall = float(np.sum(matched_gt_steps) / len(gt_steps))
    precision = float(np.sum(matched_pred_steps) / len(pred_steps))

    # Check final answer match
    if gt_final_answer is None or pred_final_answer is None:
        final_answer_match = 0
    else:
        final_answer_match = int(
            _answers_match(gt_final_answer, pred_final_answer, min_cos=0.1, rel_tol=0.05)
        )

    return recall, precision, final_answer_match, rouge2, rougeL, rougeLsum, bert_score


# Non-reasoning models
for model_file in os.listdir(f'{results_dir}'):
    if not model_file.endswith('.jsonl'):
        continue
    if model_file not in non_reasoning_models:
        print(f"Skipping {model_file} for now.")
        continue

    with open(f'{evals_dir}{model_file}', 'w') as f_eval, \
            open(f'{results_dir}{model_file}', 'r') as f_pred:
        # NOTE(review): total=2700 is presumably the number of examples per
        # results file; confirm, as it is hard-coded for the progress bar.
        for line in tqdm(f_pred, desc=model_file, total=2700):
            json_line = json.loads(line)
            (recall, precision, final_answer_match,
             rouge2, rougeL, rougeLsum, bertscore) = evaluate_trace(
                json_line['solution'], json_line['generation']
            )

            result_json_line = {
                'seed': json_line['seed'],
                'id': json_line.get('id', None),
                'level': json_line['level'],
                'topic': json_line['topic'],
                'subtopic': json_line['subtopic'],
                'model': json_line['model'],
                'recall': recall,
                'precision': precision,
                'final_answer_match': final_answer_match,
                'rouge2': rouge2,
                'rougeL': rougeL,
                'rougeLsum': rougeLsum,
                'bertscore': bertscore,
            }
            f_eval.write(json.dumps(result_json_line) + '\n')

print("Non-reasoning models evaluation completed.")

# ## Reasoning models
# for model_file in os.listdir(f'{results_dir}reasoning/'):
#     if model_file.endswith('.jsonl'):
#         if model_file not in reasoning_models:
#             print(f"Skipping {model_file} as it is not a reasoning model.")
#             continue
#         with open(f'{evals_dir}reasoning/{model_file}', 'w') as f_eval:
#             with open(f'{results_dir}reasoning/{model_file}', 'r') as f_pred:
#                 for line in tqdm(f_pred, desc=model_file, total=2700):
#                     json_line = json.loads(line)
#                     recall, precision, final_answer_match, rouge2, rougeL, rougeLsum, bertscore = evaluate_trace(json_line['solution'], json_line['generation_parsed'])
#                     result_json_line = {}
#                     result_json_line['seed'] = json_line['seed']
#                     result_json_line['id'] = json_line.get('id', None)
#                     result_json_line['level'] = json_line['level']
#                     result_json_line['topic'] = json_line['topic']
#                     result_json_line['subtopic'] = json_line['subtopic']
#                     result_json_line['model'] = json_line['model']
#                     result_json_line['recall'] = recall
#                     result_json_line['precision'] = precision
#                     result_json_line['final_answer_match'] = final_answer_match
#                     result_json_line['rouge2'] = rouge2
#                     result_json_line['rougeL'] = rougeL
#                     result_json_line['rougeLsum'] = rougeLsum
#                     result_json_line['bertscore'] = bertscore
#                     f_eval.write(json.dumps(result_json_line) + '\n')