import json
import math
import os
import re
import warnings
from collections import Counter

import numpy as np
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
from bert_score import BERTScorer
from transformers import AutoTokenizer

warnings.filterwarnings("ignore")
results_dir = '../results/'
evals_dir = '../evals/'
reasoning_models = [
# List all files with model generations here
]
non_reasoning_models = [
# List all files with model generations here
]
def load_jsonl(file_path):
data = []
with open(file_path, 'r') as f:
for line in f:
data.append(json.loads(line))
return data
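# Illustrative usage (a sketch; the exact field set is inferred from the
# evaluation loop below, not guaranteed by this function):
#   rows = load_jsonl('../results/some_model.jsonl')
#   rows[0] -> {'seed': ..., 'level': ..., 'topic': ..., 'subtopic': ...,
#               'model': ..., 'solution': ..., 'generation': ...}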
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
bertscorer = BERTScorer(model_type='allenai/longformer-base-4096', device='cuda' if torch.cuda.is_available() else 'cpu')
rougescorer = rouge_scorer.RougeScorer(['rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
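# Note on the model choices: all-MiniLM-L6-v2 provides the per-step sentence
# embeddings used for step-matching recall/precision in evaluate_trace, while
# the Longformer-based BERTScorer handles whole-trace scoring (its 4096-token
# window suits long reasoning traces).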
def convert_to_float(text):
    """Extract the first number from text, handling '~', '$', commas, and word multipliers."""
    text = text.replace('~', '').strip()  # Remove approximation symbol
    text = text.replace('$', '').replace(',', '')  # Remove dollar sign and thousands separators
    numbers = re.findall(r'\d+(?:\.\d+)?', text)
    if not numbers:
        return None
    try:
        number = float(numbers[0])
    except ValueError:
        return None
# Handle multipliers
if 'billion' in text.lower():
number *= 1000000000
elif 'million' in text.lower():
number *= 1000000
elif 'thousand' in text.lower():
number *= 1000
return number
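# Worked examples (a sketch of the expected behavior, not exhaustive):
#   convert_to_float("~ $1.2 million") -> 1200000.0  ('~' and '$' stripped, 'million' applied)
#   convert_to_float("$3,450.75")      -> 3450.75    (commas removed before parsing)
#   convert_to_float("no numbers")     -> None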
# Bag-of-words cosine similarity, returned only when it lies within [min_val, max_val]
def cos_similarity(a: str, b: str, min_val: float = 0.0, max_val: float = 1.0):
    """Cosine similarity between two raw strings (bag-of-words); None if out of bounds."""
c1, c2 = Counter(a.split()), Counter(b.split())
vocab = set(c1) | set(c2)
dot = sum(c1[w] * c2[w] for w in vocab)
n1 = math.sqrt(sum(c1[w] ** 2 for w in vocab))
n2 = math.sqrt(sum(c2[w] ** 2 for w in vocab))
val = dot / (n1 * n2) if n1 and n2 else 0.0
    return val if min_val <= val <= max_val else None
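# Worked example (hedged): with a="net income" and b="net income growth",
# dot = 2, |a| = sqrt(2), |b| = sqrt(3), so cos = 2/sqrt(6) ~= 0.816;
# with min_val=0.5 this value is returned, while min_val=0.9 would yield None.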
def extract_final_answer(step):
# Replace all multiple spaces with a single space
step = re.sub(r'\s+', ' ', step).strip()
# Try capturing explicit "Answer: $" pattern
answer_match = re.search(r'\**Answer:\**\s*.*?(?:USD|\$)?\s*([\d,]+(?:\.\d{1,2})?)', step)
if answer_match:
return convert_to_float(answer_match.group(0))
# Fallback: last valid dollar-format number after =
final_answer_match = re.search(r'=\s*(?!.*=)(?:USD|\$)?\s*[\d,]+(?:\.\d{1,2})?\s*(million|billion|thousand)?', step.lower())
if final_answer_match:
return convert_to_float(final_answer_match.group(0))
    # Fallback: last valid number anywhere in the step
    matches = list(re.finditer(r'\d[\d,]*(?:\.\d{1,2})?\s*(million|billion|thousand)?', step.lower()))
if matches:
last_number_match = matches[-1]
return convert_to_float(last_number_match.group(0))
    # Fallback: number from the last sentence, optionally prefixed with
    # "Answer:" / "Final Answer:" / "Final Answer is:"
    last_sentence_match = re.search(r'(?<=\n|\.|:)\s*(answer: |final answer: |final answer is: )?([\d,]+(?:\.\d{1,2})?\s*(million|billion|thousand)?)', step.lower())
    if last_sentence_match:
        return last_sentence_match.group(2).strip()
return None
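# Worked examples (assumed behavior on typical trace endings):
#   extract_final_answer("**Answer:** $1,250.50")   -> 1250.5  (explicit Answer pattern)
#   extract_final_answer("Profit = 100 - 40 = $60") -> 60.0    (last '=' fallback)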
def extract_steps(text):
if '\nuser\n' in text:
text = text.split('\nuser\n')[0]
# Find all instances of "Step" at the beginning of a line or after newline
steps = re.finditer(r'(?:^|\n)(\s*)(\**)(#*)(\s*)Step(-*)(\s*)(\d+)', text)
step_starts = [step.span()[0] for step in steps]
if len(step_starts) == 0:
steps = re.finditer(r'(?:^|\n)\s*(\**)(\d+)(.*)', text)
step_starts = [step.span()[0] for step in steps]
step_spans = [(step_starts[idx], step_starts[idx+1]) for idx in range(len(step_starts) - 1)]
# Add the last step span
if len(step_starts) > 0:
step_spans.append((step_starts[-1], len(text)))
step_strings = [text[start:end].strip() for start, end in step_spans]
# Remove leading "Step X" from each step
step_strings = [re.sub(r'(?i)Step\s+\d+\s*:', '', step).strip() for step in step_strings]
# Find stepwise final answers
step_final_answers = []
for step in step_strings:
        # Extract the step's final answer, if any
        step_final_answer = extract_final_answer(step)
step_final_answers.append(step_final_answer)
# Find final answer
final_answer = None
for step_final_answer in reversed(step_final_answers):
if step_final_answer is not None:
final_answer = step_final_answer
break
# Remove extra spaces
step_strings = [re.sub(r'\s+', ' ', step) for step in step_strings]
return step_strings, step_final_answers, final_answer
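# Worked example (a small sketch):
#   text = "Step 1: Revenue = $100\nStep 2: Costs = $40\nStep 3: Profit = 100 - 40 = $60"
#   extract_steps(text) -> (['Revenue = $100', 'Costs = $40', 'Profit = 100 - 40 = $60'],
#                           [100.0, 40.0, 60.0], 60.0)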
def preprocess_document(document):
return re.sub(r"\s+", " ", document)
MAX_TOKENS = 4096  # Longformer's context limit
# Load the Longformer tokenizer once; re-loading it inside safe_score on every call is slow.
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
def safe_score(bertscorer, gt, pred):
if not isinstance(gt, str) or not isinstance(pred, str):
print("Non-string input:", gt, pred)
return 0.0
if len(gt.strip()) == 0 or len(pred.strip()) == 0:
print("Empty input:", gt, pred)
return 0.0
    # Truncate both inputs by tokens so Longformer never sees an over-long sequence
    gt_tokens = tokenizer(gt, return_tensors="pt", truncation=True, max_length=MAX_TOKENS)
    pred_tokens = tokenizer(pred, return_tensors="pt", truncation=True, max_length=MAX_TOKENS)
try:
# Re-decode to text after truncation (for BERTScore input)
gt_truncated = tokenizer.batch_decode(gt_tokens["input_ids"], skip_special_tokens=True)[0]
pred_truncated = tokenizer.batch_decode(pred_tokens["input_ids"], skip_special_tokens=True)[0]
return bertscorer.score([gt_truncated], [pred_truncated])[2][0].item()
except Exception as e:
print("BERTScore error:", e)
return 0.0
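# Usage note: safe_score guards the BERTScore call against non-string or empty
# inputs and truncates both texts to MAX_TOKENS before scoring; it returns the
# F1 component (index [2]) of the (P, R, F1) tuple from BERTScorer.score.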
def evaluate_trace(gt, pred):
    if gt is None or pred is None:
        return None, None, None, None, None, None, None
    # Calculate ROUGE F-measures
    rouge_score = rougescorer.score(gt, pred)
    rouge2, rougeL, rougeLsum = rouge_score['rouge2'].fmeasure, rouge_score['rougeL'].fmeasure, rouge_score['rougeLsum'].fmeasure
# Calculate BERTScore
bert_score = safe_score(bertscorer, gt, pred)
# Extract steps and final answers
gt_steps, gt_step_final_answers, gt_final_answer = extract_steps(gt)
pred_steps, pred_step_final_answers, pred_final_answer = extract_steps(pred)
    # If no steps were found on either side, zero out the step metrics
    if len(gt_steps) == 0 or len(pred_steps) == 0:
        return 0, 0, 0, rouge2, rougeL, rougeLsum, bert_score
# Convert steps to embeddings
gt_steps_embeddings = model.encode(gt_steps)
pred_steps_embeddings = model.encode(pred_steps)
step_final_answer_correctness = [[0 for _ in range(len(pred_steps))] for _ in range(len(gt_steps))]
for id_i, gt_step_answer in enumerate(gt_step_final_answers):
if gt_step_answer is None:
continue
for id_j, pred_step_answer in enumerate(pred_step_final_answers):
if pred_step_answer is None:
continue
# Handle string values
if isinstance(gt_step_answer, str) or isinstance(pred_step_answer, str):
# Calculate cosine similarity
similarity = cos_similarity(str(gt_step_answer), str(pred_step_answer), min_val=0.5)
if similarity is not None:
step_final_answer_correctness[id_i][id_j] = 1
break
            # Else handle numeric values with a 10% relative tolerance
            elif abs(gt_step_answer - pred_step_answer) / (abs(gt_step_answer) + 1e-4) < 0.1:
step_final_answer_correctness[id_i][id_j] = 1
break
else:
step_final_answer_correctness[id_i][id_j] = 0
# Calculate cosine similarity between step embeddings
similarity_matrix = cosine_similarity(gt_steps_embeddings, pred_steps_embeddings)
similarity_matrix = np.multiply(similarity_matrix, step_final_answer_correctness)
# Calculate recall and precision
max_similarities_backward = np.max(similarity_matrix, axis=1)
max_similarities_forward = np.max(similarity_matrix, axis=0)
binarized_similarity_backward = max_similarities_backward > 0.6
binarized_similarity_forward = max_similarities_forward > 0.6
recall = float(np.sum(binarized_similarity_backward) / len(gt_steps))
precision = float(np.sum(binarized_similarity_forward) / len(pred_steps))
# Check final answer match
if gt_final_answer is None or pred_final_answer is None:
final_answer_match = 0
# Handle string values
elif isinstance(gt_final_answer, str) or isinstance(pred_final_answer, str):
# Calculate cosine similarity
similarity = cos_similarity(str(gt_final_answer), str(pred_final_answer), min_val=0.1)
final_answer_match = 1 if similarity is not None else 0
    # Else handle numeric values with a 5% relative tolerance
    else:
        final_answer_match = int(abs(gt_final_answer - pred_final_answer) / (abs(gt_final_answer) + 1e-4) < 0.05)
return recall, precision, final_answer_match, rouge2, rougeL, rougeLsum, bert_score
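# Return contract (hedged summary): a 7-tuple of
#   (recall, precision, final_answer_match, rouge2, rougeL, rougeLsum, bert_score),
# where recall/precision measure step coverage in each direction (embedding
# cosine > 0.6, masked by per-step answer agreement) and final_answer_match
# allows a 5% relative tolerance for numeric answers.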
# Non-reasoning models
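# Each input JSONL line is expected to carry at least: 'seed', 'level', 'topic',
# 'subtopic', 'model', 'solution' (ground truth), and 'generation' (model
# output); 'id' is optional. total=2700 below is the assumed benchmark size.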
for model_file in os.listdir(results_dir):
    if not model_file.endswith('.jsonl'):
        continue
    if model_file not in non_reasoning_models:
        print(f"Skipping {model_file} for now.")
        continue
    with open(f'{evals_dir}{model_file}', 'w') as f_eval, open(f'{results_dir}{model_file}', 'r') as f_pred:
        for line in tqdm(f_pred, desc=model_file, total=2700):
            json_line = json.loads(line)
            recall, precision, final_answer_match, rouge2, rougeL, rougeLsum, bertscore = evaluate_trace(json_line['solution'], json_line['generation'])
            result_json_line = {
                'seed': json_line['seed'],
                'id': json_line.get('id', None),
                'level': json_line['level'],
                'topic': json_line['topic'],
                'subtopic': json_line['subtopic'],
                'model': json_line['model'],
                'recall': recall,
                'precision': precision,
                'final_answer_match': final_answer_match,
                'rouge2': rouge2,
                'rougeL': rougeL,
                'rougeLsum': rougeLsum,
                'bertscore': bertscore,
            }
            f_eval.write(json.dumps(result_json_line) + '\n')
print("Non-reasoning models evaluation completed.")
# Reasoning models (same pipeline over results_dir/reasoning/, currently disabled)
# for model_file in os.listdir(f'{results_dir}reasoning/'):
# if model_file.endswith('.jsonl'):
# if model_file not in reasoning_models:
# print(f"Skipping {model_file} as it is not a reasoning model.")
# continue
# with open(f'{evals_dir}reasoning/{model_file}', 'w') as f_eval:
# with open(f'{results_dir}reasoning/{model_file}', 'r') as f_pred:
# for line in tqdm(f_pred, desc=model_file, total=2700):
# json_line = json.loads(line)
# recall, precision, final_answer_match, rouge2, rougeL, rougeLsum, bertscore = evaluate_trace(json_line['solution'], json_line['generation_parsed'])
# result_json_line = {}
# result_json_line['seed'] = json_line['seed']
# result_json_line['id'] = json_line.get('id', None)
# result_json_line['level'] = json_line['level']
# result_json_line['topic'] = json_line['topic']
# result_json_line['subtopic'] = json_line['subtopic']
# result_json_line['model'] = json_line['model']
# result_json_line['recall'] = recall
# result_json_line['precision'] = precision
# result_json_line['final_answer_match'] = final_answer_match
# result_json_line['rouge2'] = rouge2
# result_json_line['rougeL'] = rougeL
# result_json_line['rougeLsum'] = rougeLsum
# result_json_line['bertscore'] = bertscore
# f_eval.write(json.dumps(result_json_line) + '\n')