Spaces:
Sleeping
Sleeping
| import random, torch | |
| import re | |
| import json | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| from langchain.schema import HumanMessage | |
| import os | |
| import pandas as pd | |
| from tqdm import tqdm | |
| from rouge_score import rouge_scorer | |
| from bert_score import BERTScorer | |
| import evaluate | |
| import re | |
| import warnings | |
| from collections import Counter | |
| import math | |
| warnings.filterwarnings("ignore") | |
# Directory containing raw model generations (one .jsonl file per model).
results_dir = '../results/'
# Directory where per-example evaluation records are written.
evals_dir = '../evals/'
# File names (inside results_dir) of reasoning-model outputs to evaluate.
reasoning_models = [
    # List all files with model generations here
]
# File names (inside results_dir) of non-reasoning-model outputs to evaluate.
non_reasoning_models = [
    # List all files with model generations here
]
def load_jsonl(file_path):
    """Read a JSON-Lines file and return its records as a list of Python objects."""
    with open(file_path, 'r') as handle:
        return [json.loads(row) for row in handle]
# Sentence embedder used for step-by-step semantic matching in evaluate_trace.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# BERTScore with a long-context (4096-token) backbone so whole traces fit; GPU when available.
bertscorer = BERTScorer(model_type='allenai/longformer-base-4096', device='cuda' if torch.cuda.is_available() else 'cpu')
# ROUGE F-measures computed over the full generation vs. the reference solution.
rougescorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
def convert_to_float(text):
    """Parse the first number out of a free-form money/quantity string.

    Strips the '~' approximation marker, '$' currency signs, and comma
    thousands separators, takes the first numeric token, and applies a
    spelled-out 'thousand'/'million'/'billion' multiplier when present.

    Args:
        text: Raw string such as "~$1,234.56" or "2 million".

    Returns:
        The parsed value as a float, or None when no valid number is found.
    """
    text = text.replace('~', '').strip()           # drop "approximately" marker
    text = text.replace('$', '').replace(',', '')  # drop currency sign and separators
    numbers = re.findall(r'[\d.]+', text)
    if not numbers:
        return None
    try:
        number = float(numbers[0])
    except ValueError:  # was a bare `except:`; e.g. a lone '.' or '1.2.3' token
        return None
    # Scale by any spelled-out magnitude word.
    lowered = text.lower()
    if 'billion' in lowered:
        number *= 1_000_000_000
    elif 'million' in lowered:
        number *= 1_000_000
    elif 'thousand' in lowered:
        number *= 1_000
    return number
| # Cos similarity on two bags of words over given threshold | |
| def cos_similarity(a: str, b: str, min_val: float = 0.0, max_val: float = 1.0) -> float: | |
| """Pure cosine similarity between two raw strings (bag-of-words).""" | |
| c1, c2 = Counter(a.split()), Counter(b.split()) | |
| vocab = set(c1) | set(c2) | |
| dot = sum(c1[w] * c2[w] for w in vocab) | |
| n1 = math.sqrt(sum(c1[w] ** 2 for w in vocab)) | |
| n2 = math.sqrt(sum(c2[w] ** 2 for w in vocab)) | |
| val = dot / (n1 * n2) if n1 and n2 else 0.0 | |
| if val >= min_val and val <= max_val: | |
| return val | |
| return None | |
def extract_final_answer(step):
    """Pull a final numeric answer out of one reasoning-step string.

    Tries progressively looser patterns: an explicit "Answer:" marker, the
    expression after the last '=', the last number anywhere, and finally the
    last sentence. The first three fallbacks return a float (via
    convert_to_float); the last returns the raw matched string, and callers
    compare such strings with cos_similarity. Returns None when nothing matches.
    """
    # Replace all multiple spaces with a single space
    step = re.sub(r'\s+', ' ', step).strip()
    # Try capturing explicit "Answer: $" pattern
    answer_match = re.search(r'\**Answer:\**\s*.*?(?:USD|\$)?\s*([\d,]+(?:\.\d{1,2})?)', step)
    if answer_match:
        return convert_to_float(answer_match.group(0))
    # Fallback: last valid dollar-format number after = ; (?!.*=) anchors the LAST '='
    final_answer_match = re.search(r'=\s*(?!.*=)(?:USD|\$)?\s*[\d,]+(?:\.\d{1,2})?\s*(million|billion|thousand)?', step.lower())
    if final_answer_match:
        return convert_to_float(final_answer_match.group(0))
    # Fallback: last valid number in the step (Georgi)
    # NOTE(review): the '.' in r'\d.' is unescaped, so it matches ANY character
    # between digits — presumably a literal '.' was intended; confirm before changing.
    matches = list(re.finditer(r'\d.[\d,]+(?:\.\d{1,2})?\s*(million|billion|thousand)?', step.lower()))
    if matches:
        last_number_match = matches[-1]
        return convert_to_float(last_number_match.group(0))
    # Fallback: extract last sentence (Georgi). Could start with "Answer: " or "Final Answer: " or "Final Answer is: " or \n or .
    # NOTE(review): whitespace was collapsed above, so the \n lookbehind can no longer fire here.
    last_sentence_match = re.search(r'(?<=\n|\.|:)\s*(answer: |final answer: |final answer is: )?([\d,]+(?:\.\d{1,2})?\s*(million|billion|thousand)?)', step.lower())
    if last_sentence_match:
        # Returns the raw matched STRING (not a float), unlike the branches above;
        # downstream string handling in evaluate_trace relies on this.
        return last_sentence_match.group(0)
    return None
def extract_steps(text):
    """Split a generation into step strings and per-step/final numeric answers.

    Returns a 3-tuple: (step_strings, step_final_answers, final_answer) where
    final_answer is the last non-None per-step answer, or None.
    """
    # Keep only the assistant turn if a user turn was echoed back.
    if '\nuser\n' in text:
        text = text.split('\nuser\n')[0]
    # Locate "Step N" headers at the start of the text or after a newline.
    starts = [m.start() for m in re.finditer(r'(?:^|\n)(\s*)(\**)(#*)(\s*)Step(-*)(\s*)(\d+)', text)]
    if not starts:
        # Fallback: numbered lines without the word "Step".
        starts = [m.start() for m in re.finditer(r'(?:^|\n)\s*(\**)(\d+)(.*)', text)]
    # Each step runs from its header to the next header (or the end of text).
    bounds = list(zip(starts, starts[1:] + [len(text)])) if starts else []
    chunks = [text[lo:hi].strip() for lo, hi in bounds]
    # Strip the leading "Step X:" label from every chunk.
    chunks = [re.sub(r'(?i)Step\s+\d+\s*:', '', chunk).strip() for chunk in chunks]
    # Per-step numeric answers (None when a step has no extractable answer).
    answers = [extract_final_answer(chunk) for chunk in chunks]
    # The overall answer is the last step's non-None answer.
    final_answer = next((a for a in reversed(answers) if a is not None), None)
    # Normalize internal whitespace for downstream embedding/comparison.
    chunks = [re.sub(r'\s+', ' ', chunk) for chunk in chunks]
    return chunks, answers, final_answer
def preprocess_document(document):
    """Collapse every run of whitespace in *document* into a single space.

    Leading/trailing whitespace becomes a single space rather than being
    stripped, matching the regex substitution semantics.
    """
    collapsed = re.sub(r"\s+", " ", document)
    return collapsed
MAX_TOKENS = 4096  # Longformer's positional-embedding limit

# Cache for the truncation tokenizer — previously re-loaded from disk/hub on
# EVERY safe_score call, which was the dominant per-call cost.
_TRUNC_TOKENIZER = None

def _get_trunc_tokenizer():
    """Lazily load (once) and return the Longformer tokenizer used for truncation."""
    global _TRUNC_TOKENIZER
    if _TRUNC_TOKENIZER is None:
        from transformers import AutoTokenizer
        _TRUNC_TOKENIZER = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
    return _TRUNC_TOKENIZER

def safe_score(bertscorer, gt, pred):
    """Compute BERTScore F1 between gt and pred, defensively.

    Non-string or empty/whitespace-only inputs score 0.0. Both texts are
    truncated to MAX_TOKENS with the Longformer tokenizer so the scorer's
    backbone never overflows its context.

    Returns:
        BERTScore F1 as a float, or 0.0 on invalid input or scorer failure.
    """
    if not isinstance(gt, str) or not isinstance(pred, str):
        print("Non-string input:", gt, pred)
        return 0.0
    if len(gt.strip()) == 0 or len(pred.strip()) == 0:
        print("Empty input:", gt, pred)
        return 0.0
    # Truncate by tokens using the model's tokenizer (loaded once, cached above).
    tokenizer = _get_trunc_tokenizer()
    gt_tokens = tokenizer(gt, return_tensors="pt", truncation=True, max_length=MAX_TOKENS)
    pred_tokens = tokenizer(pred, return_tensors="pt", truncation=True, max_length=MAX_TOKENS)
    try:
        # Re-decode to text after truncation (for BERTScore input)
        gt_truncated = tokenizer.batch_decode(gt_tokens["input_ids"], skip_special_tokens=True)[0]
        pred_truncated = tokenizer.batch_decode(pred_tokens["input_ids"], skip_special_tokens=True)[0]
        # BERTScorer.score returns (P, R, F1); take F1 of the single pair.
        return bertscorer.score([gt_truncated], [pred_truncated])[2][0].item()
    except Exception as e:
        print("BERTScore error:", e)
        return 0.0
def evaluate_trace(gt, pred):
    """Score a predicted reasoning trace against the ground-truth trace.

    Computes ROUGE and BERTScore over the full texts, then aligns individual
    steps by sentence-embedding similarity gated on agreeing per-step answers,
    yielding step-level recall/precision plus a final-answer match flag.

    Returns:
        A 7-tuple (recall, precision, final_answer_match, rouge2, rougeL,
        rougeLsum, bert_score). All None when either input is None; the first
        three are 0 when either trace has no extractable steps.
    """
    if gt is None or pred is None:
        # Fixed: previously returned 10 values, breaking the caller's 7-way unpack.
        return None, None, None, None, None, None, None
    # Calculate ROUGE F-measures (rouge1 is computed by the scorer but unused here)
    rouge_score = rougescorer.score(gt, pred)
    rouge2 = rouge_score['rouge2'].fmeasure
    rougeL = rouge_score['rougeL'].fmeasure
    rougeLsum = rouge_score['rougeLsum'].fmeasure
    # Calculate BERTScore
    bert_score = safe_score(bertscorer, gt, pred)
    # Extract steps and final answers
    gt_steps, gt_step_final_answers, gt_final_answer = extract_steps(gt)
    pred_steps, pred_step_final_answers, pred_final_answer = extract_steps(pred)
    # If either side has no steps, step metrics and the answer match are 0.
    if len(gt_steps) == 0 or len(pred_steps) == 0:
        # Fixed: previously returned 9 values (including rouge1), breaking the 7-way unpack.
        return 0, 0, 0, rouge2, rougeL, rougeLsum, bert_score
    # Convert steps to embeddings
    gt_steps_embeddings = model.encode(gt_steps)
    pred_steps_embeddings = model.encode(pred_steps)
    # correctness[i][j] == 1 when gt step i and pred step j agree on their answer.
    step_final_answer_correctness = [[0 for _ in range(len(pred_steps))] for _ in range(len(gt_steps))]
    for id_i, gt_step_answer in enumerate(gt_step_final_answers):
        if gt_step_answer is None:
            continue
        for id_j, pred_step_answer in enumerate(pred_step_final_answers):
            if pred_step_answer is None:
                continue
            # String answers: fuzzy bag-of-words match, similarity >= 0.5.
            if isinstance(gt_step_answer, str) or isinstance(pred_step_answer, str):
                similarity = cos_similarity(str(gt_step_answer), str(pred_step_answer), min_val=0.5)
                if similarity is not None:
                    step_final_answer_correctness[id_i][id_j] = 1
                    break
            # Numeric answers: within 10% relative tolerance.
            elif abs(gt_step_answer - pred_step_answer) / (gt_step_answer + 0.0001) < 0.1:
                step_final_answer_correctness[id_i][id_j] = 1
                break
            else:
                step_final_answer_correctness[id_i][id_j] = 0
    # Embedding similarity gated by answer agreement.
    similarity_matrix = cosine_similarity(gt_steps_embeddings, pred_steps_embeddings)
    similarity_matrix = np.multiply(similarity_matrix, step_final_answer_correctness)
    # Recall: gt steps recovered in pred; precision: pred steps grounded in gt.
    max_similarities_backward = np.max(similarity_matrix, axis=1)
    max_similarities_forward = np.max(similarity_matrix, axis=0)
    binarized_similarity_backward = max_similarities_backward > 0.6
    binarized_similarity_forward = max_similarities_forward > 0.6
    recall = float(np.sum(binarized_similarity_backward) / len(gt_steps))
    precision = float(np.sum(binarized_similarity_forward) / len(pred_steps))
    # Check final answer match
    if gt_final_answer is None or pred_final_answer is None:
        final_answer_match = 0
    # String answers: fuzzy match, any similarity >= 0.1 counts.
    elif isinstance(gt_final_answer, str) or isinstance(pred_final_answer, str):
        similarity = cos_similarity(str(gt_final_answer), str(pred_final_answer), min_val=0.1)
        final_answer_match = 1 if similarity is not None else 0
    # Numeric answers: within 5% relative tolerance.
    else:
        final_answer_match = int(abs(gt_final_answer - pred_final_answer) / (gt_final_answer + 0.0001) < 0.05)
    return recall, precision, final_answer_match, rouge2, rougeL, rougeLsum, bert_score
# Non-reasoning models: evaluate every listed .jsonl output file and write one
# metrics record per example into the evals directory.
for model_file in os.listdir(f'{results_dir}'):
    if not model_file.endswith('.jsonl'):
        continue
    if model_file not in non_reasoning_models:
        print(f"Skipping {model_file} for now.")
        continue
    with open(f'{evals_dir}{model_file}', 'w') as f_eval, open(f'{results_dir}{model_file}', 'r') as f_pred:
        for line in tqdm(f_pred, desc=model_file, total=2700):
            json_line = json.loads(line)
            recall, precision, final_answer_match, rouge2, rougeL, rougeLsum, bertscore = evaluate_trace(json_line['solution'], json_line['generation'])
            # Carry over the example's identifying metadata alongside its scores.
            result_json_line = {
                'seed': json_line['seed'],
                'id': json_line.get('id', None),
                'level': json_line['level'],
                'topic': json_line['topic'],
                'subtopic': json_line['subtopic'],
                'model': json_line['model'],
                'recall': recall,
                'precision': precision,
                'final_answer_match': final_answer_match,
                'rouge2': rouge2,
                'rougeL': rougeL,
                'rougeLsum': rougeLsum,
                'bertscore': bertscore,
            }
            f_eval.write(json.dumps(result_json_line) + '\n')
print("Non-reasoning models evaluation completed.")
| # ## Reasoning models | |
| # for model_file in os.listdir(f'{results_dir}reasoning/'): | |
| # if model_file.endswith('.jsonl'): | |
| # if model_file not in reasoning_models: | |
| # print(f"Skipping {model_file} as it is not a reasoning model.") | |
| # continue | |
| # with open(f'{evals_dir}reasoning/{model_file}', 'w') as f_eval: | |
| # with open(f'{results_dir}reasoning/{model_file}', 'r') as f_pred: | |
| # for line in tqdm(f_pred, desc=model_file, total=2700): | |
| # json_line = json.loads(line) | |
| # recall, precision, final_answer_match, rouge2, rougeL, rougeLsum, bertscore = evaluate_trace(json_line['solution'], json_line['generation_parsed']) | |
| # result_json_line = {} | |
| # result_json_line['seed'] = json_line['seed'] | |
| # result_json_line['id'] = json_line.get('id', None) | |
| # result_json_line['level'] = json_line['level'] | |
| # result_json_line['topic'] = json_line['topic'] | |
| # result_json_line['subtopic'] = json_line['subtopic'] | |
| # result_json_line['model'] = json_line['model'] | |
| # result_json_line['recall'] = recall | |
| # result_json_line['precision'] = precision | |
| # result_json_line['final_answer_match'] = final_answer_match | |
| # result_json_line['rouge2'] = rouge2 | |
| # result_json_line['rougeL'] = rougeL | |
| # result_json_line['rougeLsum'] = rougeLsum | |
| # result_json_line['bertscore'] = bertscore | |
| # f_eval.write(json.dumps(result_json_line) + '\n') |