Spaces:
Sleeping
Sleeping
| import random, torch | |
| import re | |
| import json | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| from langchain.schema import HumanMessage | |
| import os | |
| import pandas as pd | |
| from tqdm import tqdm | |
| from rouge_score import rouge_scorer | |
| from bert_score import BERTScorer | |
| import evaluate | |
| import re | |
| import warnings | |
| from collections import Counter | |
| import math | |
| warnings.filterwarnings("ignore") | |
# Directory containing raw model generations (one .jsonl file per model).
results_dir = '../results/'
# Directory where per-example evaluation records are written.
evals_dir = '../evals/'
# File names (inside results_dir) of reasoning-model outputs to evaluate.
reasoning_models = [
    # List all files with model generations here
]
# File names (inside results_dir) of non-reasoning-model outputs to evaluate.
non_reasoning_models = [
    # List all files with model generations here
]
def load_jsonl(file_path):
    """Read a JSON-Lines file and return its records as a list of Python objects."""
    with open(file_path, 'r') as handle:
        return [json.loads(row) for row in handle]
# Sentence embedder used for step-by-step semantic matching in evaluate_trace.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# BERTScore with a long-context (4096-token) backbone so whole traces fit; GPU when available.
bertscorer = BERTScorer(model_type='allenai/longformer-base-4096', device='cuda' if torch.cuda.is_available() else 'cpu')
# ROUGE F-measures computed over the full generation vs. the reference solution.
rougescorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
def convert_to_float(text):
    """Parse the first number out of a free-form money/quantity string.

    Strips the '~' approximation marker, '$' currency signs, and comma
    thousands separators, takes the first numeric token, and applies a
    spelled-out 'thousand'/'million'/'billion' multiplier when present.

    Args:
        text: Raw string such as "~$1,234.56" or "2 million".

    Returns:
        The parsed value as a float, or None when no valid number is found.
    """
    text = text.replace('~', '').strip()           # drop "approximately" marker
    text = text.replace('$', '').replace(',', '')  # drop currency sign and separators
    numbers = re.findall(r'[\d.]+', text)
    if not numbers:
        return None
    try:
        number = float(numbers[0])
    except ValueError:  # was a bare `except:`; e.g. a lone '.' or '1.2.3' token
        return None
    # Scale by any spelled-out magnitude word.
    lowered = text.lower()
    if 'billion' in lowered:
        number *= 1_000_000_000
    elif 'million' in lowered:
        number *= 1_000_000
    elif 'thousand' in lowered:
        number *= 1_000
    return number
| # Cos similarity on two bags of words over given threshold | |
| def cos_similarity(a: str, b: str, min_val: float = 0.0, max_val: float = 1.0) -> float: | |
| """Pure cosine similarity between two raw strings (bag-of-words).""" | |
| c1, c2 = Counter(a.split()), Counter(b.split()) | |
| vocab = set(c1) | set(c2) | |
| dot = sum(c1[w] * c2[w] for w in vocab) | |
| n1 = math.sqrt(sum(c1[w] ** 2 for w in vocab)) | |
| n2 = math.sqrt(sum(c2[w] ** 2 for w in vocab)) | |
| val = dot / (n1 * n2) if n1 and n2 else 0.0 | |
| if val >= min_val and val <= max_val: | |
| return val | |
| return None | |
def extract_final_answer(step):
    """Pull a final numeric answer out of one reasoning-step string.

    Tries progressively looser patterns: an explicit "Answer:" marker, the
    expression after the last '=', the last number anywhere, and finally the
    last sentence. The first three fallbacks return a float (via
    convert_to_float); the last returns the raw matched string, and callers
    compare such strings with cos_similarity. Returns None when nothing matches.
    """
    # Replace all multiple spaces with a single space
    step = re.sub(r'\s+', ' ', step).strip()
    # Try capturing explicit "Answer: $" pattern
    answer_match = re.search(r'\**Answer:\**\s*.*?(?:USD|\$)?\s*([\d,]+(?:\.\d{1,2})?)', step)
    if answer_match:
        return convert_to_float(answer_match.group(0))
    # Fallback: last valid dollar-format number after = ; (?!.*=) anchors the LAST '='
    final_answer_match = re.search(r'=\s*(?!.*=)(?:USD|\$)?\s*[\d,]+(?:\.\d{1,2})?\s*(million|billion|thousand)?', step.lower())
    if final_answer_match:
        return convert_to_float(final_answer_match.group(0))
    # Fallback: last valid number in the step (Georgi)
    # NOTE(review): the '.' in r'\d.' is unescaped, so it matches ANY character
    # between digits — presumably a literal '.' was intended; confirm before changing.
    matches = list(re.finditer(r'\d.[\d,]+(?:\.\d{1,2})?\s*(million|billion|thousand)?', step.lower()))
    if matches:
        last_number_match = matches[-1]
        return convert_to_float(last_number_match.group(0))
    # Fallback: extract last sentence (Georgi). Could start with "Answer: " or "Final Answer: " or "Final Answer is: " or \n or .
    # NOTE(review): whitespace was collapsed above, so the \n lookbehind can no longer fire here.
    last_sentence_match = re.search(r'(?<=\n|\.|:)\s*(answer: |final answer: |final answer is: )?([\d,]+(?:\.\d{1,2})?\s*(million|billion|thousand)?)', step.lower())
    if last_sentence_match:
        # Returns the raw matched STRING (not a float), unlike the branches above;
        # downstream string handling in evaluate_trace relies on this.
        return last_sentence_match.group(0)
    return None
def extract_steps(text):
    """Split a generation into step strings and per-step/final numeric answers.

    Returns a 3-tuple: (step_strings, step_final_answers, final_answer) where
    final_answer is the last non-None per-step answer, or None.
    """
    # Keep only the assistant turn if a user turn was echoed back.
    if '\nuser\n' in text:
        text = text.split('\nuser\n')[0]
    # Locate "Step N" headers at the start of the text or after a newline.
    starts = [m.start() for m in re.finditer(r'(?:^|\n)(\s*)(\**)(#*)(\s*)Step(-*)(\s*)(\d+)', text)]
    if not starts:
        # Fallback: numbered lines without the word "Step".
        starts = [m.start() for m in re.finditer(r'(?:^|\n)\s*(\**)(\d+)(.*)', text)]
    # Each step runs from its header to the next header (or the end of text).
    bounds = list(zip(starts, starts[1:] + [len(text)])) if starts else []
    chunks = [text[lo:hi].strip() for lo, hi in bounds]
    # Strip the leading "Step X:" label from every chunk.
    chunks = [re.sub(r'(?i)Step\s+\d+\s*:', '', chunk).strip() for chunk in chunks]
    # Per-step numeric answers (None when a step has no extractable answer).
    answers = [extract_final_answer(chunk) for chunk in chunks]
    # The overall answer is the last step's non-None answer.
    final_answer = next((a for a in reversed(answers) if a is not None), None)
    # Normalize internal whitespace for downstream embedding/comparison.
    chunks = [re.sub(r'\s+', ' ', chunk) for chunk in chunks]
    return chunks, answers, final_answer
def preprocess_document(document):
    """Collapse every run of whitespace in *document* into a single space.

    Leading/trailing whitespace becomes a single space rather than being
    stripped, matching the regex substitution semantics.
    """
    collapsed = re.sub(r"\s+", " ", document)
    return collapsed
MAX_TOKENS = 4096  # Longformer's positional-embedding limit

# Cache for the truncation tokenizer — previously re-loaded from disk/hub on
# EVERY safe_score call, which was the dominant per-call cost.
_TRUNC_TOKENIZER = None

def _get_trunc_tokenizer():
    """Lazily load (once) and return the Longformer tokenizer used for truncation."""
    global _TRUNC_TOKENIZER
    if _TRUNC_TOKENIZER is None:
        from transformers import AutoTokenizer
        _TRUNC_TOKENIZER = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
    return _TRUNC_TOKENIZER

def safe_score(bertscorer, gt, pred):
    """Compute BERTScore F1 between gt and pred, defensively.

    Non-string or empty/whitespace-only inputs score 0.0. Both texts are
    truncated to MAX_TOKENS with the Longformer tokenizer so the scorer's
    backbone never overflows its context.

    Returns:
        BERTScore F1 as a float, or 0.0 on invalid input or scorer failure.
    """
    if not isinstance(gt, str) or not isinstance(pred, str):
        print("Non-string input:", gt, pred)
        return 0.0
    if len(gt.strip()) == 0 or len(pred.strip()) == 0:
        print("Empty input:", gt, pred)
        return 0.0
    # Truncate by tokens using the model's tokenizer (loaded once, cached above).
    tokenizer = _get_trunc_tokenizer()
    gt_tokens = tokenizer(gt, return_tensors="pt", truncation=True, max_length=MAX_TOKENS)
    pred_tokens = tokenizer(pred, return_tensors="pt", truncation=True, max_length=MAX_TOKENS)
    try:
        # Re-decode to text after truncation (for BERTScore input)
        gt_truncated = tokenizer.batch_decode(gt_tokens["input_ids"], skip_special_tokens=True)[0]
        pred_truncated = tokenizer.batch_decode(pred_tokens["input_ids"], skip_special_tokens=True)[0]
        # BERTScorer.score returns (P, R, F1); take F1 of the single pair.
        return bertscorer.score([gt_truncated], [pred_truncated])[2][0].item()
    except Exception as e:
        print("BERTScore error:", e)
        return 0.0
def evaluate_trace(gt, pred):
    """Score a predicted reasoning trace against the ground-truth trace.

    Computes ROUGE and BERTScore over the full texts, then aligns individual
    steps by sentence-embedding similarity gated on agreeing per-step answers,
    yielding step-level recall/precision plus a final-answer match flag.

    Returns:
        A 7-tuple (recall, precision, final_answer_match, rouge2, rougeL,
        rougeLsum, bert_score). All None when either input is None; the first
        three are 0 when either trace has no extractable steps.
    """
    if gt is None or pred is None:
        # Fixed: previously returned 10 values, breaking the caller's 7-way unpack.
        return None, None, None, None, None, None, None
    # Calculate ROUGE F-measures (rouge1 is computed by the scorer but unused here)
    rouge_score = rougescorer.score(gt, pred)
    rouge2 = rouge_score['rouge2'].fmeasure
    rougeL = rouge_score['rougeL'].fmeasure
    rougeLsum = rouge_score['rougeLsum'].fmeasure
    # Calculate BERTScore
    bert_score = safe_score(bertscorer, gt, pred)
    # Extract steps and final answers
    gt_steps, gt_step_final_answers, gt_final_answer = extract_steps(gt)
    pred_steps, pred_step_final_answers, pred_final_answer = extract_steps(pred)
    # If either side has no steps, step metrics and the answer match are 0.
    if len(gt_steps) == 0 or len(pred_steps) == 0:
        # Fixed: previously returned 9 values (including rouge1), breaking the 7-way unpack.
        return 0, 0, 0, rouge2, rougeL, rougeLsum, bert_score
    # Convert steps to embeddings
    gt_steps_embeddings = model.encode(gt_steps)
    pred_steps_embeddings = model.encode(pred_steps)
    # correctness[i][j] == 1 when gt step i and pred step j agree on their answer.
    step_final_answer_correctness = [[0 for _ in range(len(pred_steps))] for _ in range(len(gt_steps))]
    for id_i, gt_step_answer in enumerate(gt_step_final_answers):
        if gt_step_answer is None:
            continue
        for id_j, pred_step_answer in enumerate(pred_step_final_answers):
            if pred_step_answer is None:
                continue
            # String answers: fuzzy bag-of-words match, similarity >= 0.5.
            if isinstance(gt_step_answer, str) or isinstance(pred_step_answer, str):
                similarity = cos_similarity(str(gt_step_answer), str(pred_step_answer), min_val=0.5)
                if similarity is not None:
                    step_final_answer_correctness[id_i][id_j] = 1
                    break
            # Numeric answers: within 10% relative tolerance.
            elif abs(gt_step_answer - pred_step_answer) / (gt_step_answer + 0.0001) < 0.1:
                step_final_answer_correctness[id_i][id_j] = 1
                break
            else:
                step_final_answer_correctness[id_i][id_j] = 0
    # Embedding similarity gated by answer agreement.
    similarity_matrix = cosine_similarity(gt_steps_embeddings, pred_steps_embeddings)
    similarity_matrix = np.multiply(similarity_matrix, step_final_answer_correctness)
    # Recall: gt steps recovered in pred; precision: pred steps grounded in gt.
    max_similarities_backward = np.max(similarity_matrix, axis=1)
    max_similarities_forward = np.max(similarity_matrix, axis=0)
    binarized_similarity_backward = max_similarities_backward > 0.6
    binarized_similarity_forward = max_similarities_forward > 0.6
    recall = float(np.sum(binarized_similarity_backward) / len(gt_steps))
    precision = float(np.sum(binarized_similarity_forward) / len(pred_steps))
    # Check final answer match
    if gt_final_answer is None or pred_final_answer is None:
        final_answer_match = 0
    # String answers: fuzzy match, any similarity >= 0.1 counts.
    elif isinstance(gt_final_answer, str) or isinstance(pred_final_answer, str):
        similarity = cos_similarity(str(gt_final_answer), str(pred_final_answer), min_val=0.1)
        final_answer_match = 1 if similarity is not None else 0
    # Numeric answers: within 5% relative tolerance.
    else:
        final_answer_match = int(abs(gt_final_answer - pred_final_answer) / (gt_final_answer + 0.0001) < 0.05)
    return recall, precision, final_answer_match, rouge2, rougeL, rougeLsum, bert_score
# Non-reasoning models: evaluate every listed .jsonl output file and write one
# metrics record per example into the evals directory.
for model_file in os.listdir(f'{results_dir}'):
    if not model_file.endswith('.jsonl'):
        continue
    if model_file not in non_reasoning_models:
        print(f"Skipping {model_file} for now.")
        continue
    with open(f'{evals_dir}{model_file}', 'w') as f_eval, open(f'{results_dir}{model_file}', 'r') as f_pred:
        for line in tqdm(f_pred, desc=model_file, total=2700):
            json_line = json.loads(line)
            recall, precision, final_answer_match, rouge2, rougeL, rougeLsum, bertscore = evaluate_trace(json_line['solution'], json_line['generation'])
            # Carry over the example's identifying metadata alongside its scores.
            result_json_line = {
                'seed': json_line['seed'],
                'id': json_line.get('id', None),
                'level': json_line['level'],
                'topic': json_line['topic'],
                'subtopic': json_line['subtopic'],
                'model': json_line['model'],
                'recall': recall,
                'precision': precision,
                'final_answer_match': final_answer_match,
                'rouge2': rouge2,
                'rougeL': rougeL,
                'rougeLsum': rougeLsum,
                'bertscore': bertscore,
            }
            f_eval.write(json.dumps(result_json_line) + '\n')
print("Non-reasoning models evaluation completed.")
| # ## Reasoning models | |
| # for model_file in os.listdir(f'{results_dir}reasoning/'): | |
| # if model_file.endswith('.jsonl'): | |
| # if model_file not in reasoning_models: | |
| # print(f"Skipping {model_file} as it is not a reasoning model.") | |
| # continue | |
| # with open(f'{evals_dir}reasoning/{model_file}', 'w') as f_eval: | |
| # with open(f'{results_dir}reasoning/{model_file}', 'r') as f_pred: | |
| # for line in tqdm(f_pred, desc=model_file, total=2700): | |
| # json_line = json.loads(line) | |
| # recall, precision, final_answer_match, rouge2, rougeL, rougeLsum, bertscore = evaluate_trace(json_line['solution'], json_line['generation_parsed']) | |
| # result_json_line = {} | |
| # result_json_line['seed'] = json_line['seed'] | |
| # result_json_line['id'] = json_line.get('id', None) | |
| # result_json_line['level'] = json_line['level'] | |
| # result_json_line['topic'] = json_line['topic'] | |
| # result_json_line['subtopic'] = json_line['subtopic'] | |
| # result_json_line['model'] = json_line['model'] | |
| # result_json_line['recall'] = recall | |
| # result_json_line['precision'] = precision | |
| # result_json_line['final_answer_match'] = final_answer_match | |
| # result_json_line['rouge2'] = rouge2 | |
| # result_json_line['rougeL'] = rougeL | |
| # result_json_line['rougeLsum'] = rougeLsum | |
| # result_json_line['bertscore'] = bertscore | |
| # f_eval.write(json.dumps(result_json_line) + '\n') |