Spaces:

Concepta
/

metrics_analyzer

No application file

App Files Files Community

metrics_analyzer / metrics.py

Concepta

Create metrics.py

3d97611 verified almost 2 years ago

raw

history blame contribute delete

10.9 kB

	import nltk
	import mlflow
	import hyperopt
	from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
	from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM
	import torch
	from sentence_transformers import SentenceTransformer, util
	from bert_score import score
	from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
	from rouge import Rouge
	from tqdm import tqdm
	from datasets import load_metric

	# Download necessary NLTK data.
	# These calls sit at module top level, so they run every time this module is imported.
	nltk.download('punkt')
	nltk.download('stopwords')

	# --- Load pre-trained models ---
	# Research and update these with the most recent and powerful Portuguese models
	# NOTE(review): this checkpoint name looks like a plain BERT model rather than a
	# sentence-transformers model — confirm the embeddings/pooling it yields are suitable.
	semantic_similarity_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')
	perplexity_model_name = "unicamp-dl/ptt5-base-portuguese-vocab" # NOTE(review): the name suggests a T5-style checkpoint — confirm it loads as a causal LM
	perplexity_model = AutoModelForCausalLM.from_pretrained(perplexity_model_name)
	perplexity_tokenizer = AutoTokenizer.from_pretrained(perplexity_model_name)

	# Load Hugging Face metrics (used by calculate_metrics).
	# NOTE(review): load_metric is reportedly deprecated in newer `datasets` releases in
	# favour of the `evaluate` package — confirm against the installed version.
	bertscore_metric = load_metric("bertscore")
	bleu_metric = load_metric("bleu")
	rouge_metric = load_metric("rouge")
	meteor_metric = load_metric("meteor") # Additional metric

	# Load a powerful LLM for generating and judging content
	# NOTE(review): "gpt-3.5-turbo" is an OpenAI API model name; confirm that
	# transformers.pipeline can actually resolve it as a model identifier.
	generator_model_name = "gpt-3.5-turbo" # Or GPT-4 or Gemini if available
	generator = pipeline("text-generation", model=generator_model_name)
	judge_model_name = generator_model_name # Using the same model for judging
	# A second, separate pipeline object is built here even though the model name is the same.
	judge = pipeline("text-generation", model=judge_model_name)

	# --- Helper Functions ---
	def calculate_perplexity(text):
	    """Calculate the perplexity of ``text`` with the module-level language model.

	    The text is encoded with ``perplexity_tokenizer`` and passed to
	    ``perplexity_model`` with the input ids doubling as labels; the
	    exponential of the first element of the model output (the loss) is
	    returned.

	    Args:
	        text: The text to score.

	    Returns:
	        The perplexity as a float, or ``float('inf')`` when ``text`` is
	        empty/whitespace-only or when any step fails (the error is printed,
	        not raised).
	    """
	    try:
	        # Nothing to score: report the same worst-case value as the error
	        # path instead of asking the model for a loss over no content.
	        if not text or not text.strip():
	            return float('inf')
	        with torch.no_grad():
	            input_ids = perplexity_tokenizer.encode(text, return_tensors='pt')
	            loss = perplexity_model(input_ids, labels=input_ids)[0]
	            return torch.exp(loss).item()
	    except Exception as e:
	        # Best-effort by design: callers receive "infinitely bad" rather than an exception.
	        print(f"Error calculating perplexity: {e}")
	        return float('inf')


	def estimate_semantic_similarity(generated_text, reference_text):
	    """Return the cosine similarity between the embeddings of the two texts.

	    Both texts are embedded with the module-level ``semantic_similarity_model``
	    (generated text first, then reference) and compared with
	    ``util.pytorch_cos_sim``.

	    Returns:
	        The similarity as a Python float, or ``0.0`` if anything fails
	        (the error is printed, not raised).
	    """
	    try:
	        generated_vec, reference_vec = (
	            semantic_similarity_model.encode(text, convert_to_tensor=True)
	            for text in (generated_text, reference_text)
	        )
	        return util.pytorch_cos_sim(generated_vec, reference_vec).item()
	    except Exception as err:
	        print(f"Error calculating semantic similarity: {err}")
	        return 0.0


	def _rouge_fmeasure(value):
	    """Reduce a ROUGE result to its F-measure when it is an aggregate object."""
	    # Aggregated results expose `.mid.fmeasure`; a plain number passes through unchanged.
	    mid = getattr(value, 'mid', value)
	    return getattr(mid, 'fmeasure', mid)


	def calculate_metrics(generated_text, reference_text):
	    """Calculate BERTScore, BLEU, ROUGE-L and METEOR for one text pair.

	    Each metric is computed independently with the module-level metric
	    objects. A failure in one metric is printed and recorded as ``None``
	    without affecting the others.

	    Args:
	        generated_text: The candidate text.
	        reference_text: The text it is compared against.

	    Returns:
	        A dict with the keys ``'bertscore'``, ``'bleu'``, ``'rougeL'`` and
	        ``'meteor'``; each value is the score, or ``None`` if that metric
	        could not be computed. ``'rougeL'`` is reduced to its F-measure
	        when the metric returns an aggregate object, so that callers can
	        compare it against a numeric threshold.
	    """
	    # (result key, label used in the error message, deferred computation)
	    computations = (
	        ('bertscore', 'BERTScore', lambda: bertscore_metric.compute(
	            predictions=[generated_text], references=[reference_text], lang="pt")['f1'][0]),
	        # BLEU is fed whitespace-split tokens, with one reference list per prediction.
	        ('bleu', 'BLEU', lambda: bleu_metric.compute(
	            predictions=[generated_text.split()], references=[[reference_text.split()]])['bleu']),
	        ('rougeL', 'ROUGE', lambda: _rouge_fmeasure(rouge_metric.compute(
	            predictions=[generated_text], references=[reference_text])['rougeL'])),
	        ('meteor', 'METEOR', lambda: meteor_metric.compute(
	            predictions=[generated_text], references=[reference_text])['meteor']),
	    )

	    results = {}
	    for key, label, compute in computations:
	        try:
	            results[key] = compute()
	        except Exception as e:
	            # Best-effort by design: a broken metric must not abort the rest.
	            print(f"Error calculating {label}: {e}")
	            results[key] = None
	    return results


	def get_llm_judgment(generated_text, reference_text):
	    """Ask the judge LLM to grade the generated text against the reference.

	    The module-level ``judge`` pipeline is prompted to answer with one of
	    three labels. Only the model's continuation is inspected (the prompt
	    itself lists all three labels, so an echoed prompt would always match).

	    Args:
	        generated_text: The text being evaluated.
	        reference_text: The text it is evaluated against.

	    Returns:
	        ``"no issues"``, ``"minor issues"`` or ``"major issues"`` when the
	        reply mentions one of them (the earliest mention wins); otherwise
	        the raw reply with surrounding whitespace stripped.
	    """
	    prompt = f"""
	You are an expert in evaluating educational content.
	Please evaluate the following generated text based on its accuracy, relevance, and clarity,
	compared to the provided reference text.

	Reference Text:
	{reference_text}

	Generated Text:
	{generated_text}

	Provide your judgment as one of the following categories:
	- "no issues": The generated text is accurate, relevant, and clear.
	- "minor issues": The generated text has some minor issues, but is mostly acceptable.
	- "major issues": The generated text has significant issues and needs substantial revision.
	"""
	    # max_new_tokens bounds the reply only; a total-length cap of 50 would be
	    # smaller than the prompt. return_full_text=False asks for the continuation
	    # without the prompt prepended.
	    reply = judge(prompt, max_new_tokens=50, return_full_text=False)[0]['generated_text']
	    # Defensive: drop the prompt if the pipeline echoed it anyway.
	    if reply.startswith(prompt):
	        reply = reply[len(prompt):]
	    reply = reply.strip()

	    lowered = reply.lower()
	    mentions = [
	        (lowered.find(label), label)
	        for label in ("no issues", "minor issues", "major issues")
	        if label in lowered
	    ]
	    if mentions:
	        return min(mentions)[1]
	    return reply


	# --- Content Analysis Function ---
	def analyze_content_for_review(generated_text, reference_text,
	                               similarity_threshold,
	                               bertscore_threshold,
	                               bleu_threshold,
	                               rouge_threshold,
	                               meteor_threshold):
	    """Analyze generated content and decide whether it needs human review.

	    Semantic similarity and the automatic metrics are compared against the
	    given thresholds to build a list of issue descriptions. The review
	    decision itself is taken from the LLM judgment alone; the thresholds
	    only influence the text of ``explanation``.

	    Args:
	        generated_text: The text being evaluated.
	        reference_text: The text it is evaluated against.
	        similarity_threshold: Minimum acceptable semantic similarity.
	        bertscore_threshold: Minimum acceptable BERTScore.
	        bleu_threshold: Minimum acceptable BLEU score.
	        rouge_threshold: Minimum acceptable ROUGE-L score.
	        meteor_threshold: Minimum acceptable METEOR score.

	    Returns:
	        A dict with the keys ``'review_flag'`` (bool), ``'explanation'``
	        (str), ``'semantic_similarity'``, ``'metrics'``, ``'llm_judgment'``
	        (the judgment as returned by ``get_llm_judgment``),
	        ``'generated_text'`` and ``'reference_text'``.
	    """
	    similarity = estimate_semantic_similarity(generated_text, reference_text)
	    metrics = calculate_metrics(generated_text, reference_text)
	    llm_judgment = get_llm_judgment(generated_text, reference_text)

	    issues = []
	    if similarity < similarity_threshold:
	        issues.append(f"- Low Semantic Similarity: ({similarity:.2f}) Content might be off-topic or not factually aligned.")

	    # (metrics key, threshold, message template)
	    metric_checks = (
	        ('bertscore', bertscore_threshold,
	         "- Low BERTScore: ({:.2f}) There might be factual inaccuracies or significant paraphrasing."),
	        ('bleu', bleu_threshold,
	         "- Low BLEU Score: ({:.2f}) The generated text might not be fluent or use appropriate wording."),
	        ('rougeL', rouge_threshold,
	         "- Low ROUGE-L Score: ({:.2f}) The generated text might not cover important information from the reference."),
	        ('meteor', meteor_threshold,
	         "- Low METEOR Score: ({:.2f}) The generated text might have poor word alignment with the reference."),
	    )
	    for key, threshold, template in metric_checks:
	        score = metrics.get(key)
	        # `is not None` rather than truthiness: a score of exactly 0.0 is the
	        # worst possible result and must be reported, not skipped.
	        if score is not None and score < threshold:
	            issues.append(template.format(score))

	    # Use LLM judgment as the primary decision-maker.
	    # Compare case-insensitively and ignore surrounding quotes/periods so that
	    # a reply such as '"Major issues".' is still recognised.
	    verdict = str(llm_judgment).strip().strip('"\'.').lower()
	    headings = {
	        "major issues": "LLM Judgment: Major Issues",
	        "minor issues": "LLM Judgment: Minor Issues",
	    }
	    if verdict in headings:
	        review_flag = True
	        explanation = headings[verdict] + "\n" + "\n".join(issues)
	    else:
	        review_flag = False
	        explanation = "LLM Judgment: No Issues"

	    return {
	        'review_flag': review_flag,
	        'explanation': explanation,
	        'semantic_similarity': similarity,
	        'metrics': metrics,
	        'llm_judgment': llm_judgment,
	        'generated_text': generated_text,
	        'reference_text': reference_text,
	    }


	# --- Threshold Optimization Functions ---
	def generate_educational_content(topic, num_sections=3):
	    """Generate a chapter of educational content with the module-level generator LLM.

	    Args:
	        topic: Subject of the chapter; interpolated into the prompt.
	        num_sections: Number of sections requested in the prompt.

	    Returns:
	        The text produced by the model, without the instruction prompt
	        prepended.
	    """
	    prompt = f"""
	Generate a chapter of educational content on the topic of "{topic}".
	The chapter should include {num_sections} sections, each with at least
	one subsection. The content should be factually accurate, well-organized,
	and written in clear and concise Portuguese.
	"""
	    # return_full_text=False: without it the pipeline prepends the prompt to the
	    # output, so every "chapter" would begin with these instructions and any
	    # two generations would share that prefix when compared by the metrics.
	    outputs = generator(prompt, max_length=1000, return_full_text=False)
	    return outputs[0]['generated_text']

	def objective(params):
	    """Hyperopt objective: score one set of thresholds.

	    For each hard-coded topic two texts are generated (reference first, then
	    candidate), every pair is analysed with the thresholds from ``params``,
	    and the number of pairs that were flagged for review while the LLM
	    judgment is "no issues" is used as the loss. Parameters and loss are
	    logged to a new MLflow run.

	    Args:
	        params: Mapping holding the five ``*_threshold`` values.

	    Returns:
	        ``{'loss': total_errors, 'status': STATUS_OK}``.
	    """
	    # Read all thresholds up front so that a missing key fails before any generation.
	    threshold_values = tuple(
	        params[name]
	        for name in (
	            'similarity_threshold',
	            'bertscore_threshold',
	            'bleu_threshold',
	            'rouge_threshold',
	            'meteor_threshold',
	        )
	    )

	    # Generate AI-created data: one (reference, candidate) pair per topic.
	    subjects = ["Astronomia", "Biologia", "História", "Matemática", "Física", "Química"]
	    text_pairs = [
	        (generate_educational_content(subject), generate_educational_content(subject))
	        for subject in subjects
	    ]

	    # NOTE(review): with the analyze_content_for_review visible in this file,
	    # review_flag is only True for "major issues"/"minor issues", so this
	    # condition looks unreachable and the loss would always be 0 — confirm intent.
	    total_errors = 0
	    for ref_text, gen_text in text_pairs:
	        outcome = analyze_content_for_review(gen_text, ref_text, *threshold_values)
	        if not outcome['review_flag']:
	            continue
	        if outcome['llm_judgment'] == "no issues":
	            total_errors += 1

	    # Log metrics and parameters to MLflow
	    with mlflow.start_run():
	        mlflow.log_params(params)
	        mlflow.log_metric("total_errors", total_errors)

	    return {'loss': total_errors, 'status': STATUS_OK}


	# --- Main Execution ---
	if __name__ == "__main__":
	    # 1. Threshold Optimization Phase
	    mlflow.set_tracking_uri("http://localhost:5000")  # Or your MLflow server URI

	    # Search range (low, high) for every threshold, in the positional order
	    # expected by analyze_content_for_review.
	    threshold_bounds = {
	        'similarity_threshold': (0.5, 0.9),
	        'bertscore_threshold': (0.7, 0.95),
	        'bleu_threshold': (0.4, 0.8),
	        'rouge_threshold': (0.4, 0.7),
	        'meteor_threshold': (0.3, 0.7),
	    }
	    search_space = {
	        name: hp.uniform(name, low, high)
	        for name, (low, high) in threshold_bounds.items()
	    }
	    trials = Trials()
	    best_thresholds = fmin(
	        fn=objective,
	        space=search_space,
	        algo=tpe.suggest,
	        max_evals=50,  # Adjust the number of evaluations as needed
	        trials=trials,
	    )
	    print("Best thresholds found:", best_thresholds)

	    # 2. Content Evaluation Phase (using the best thresholds)
	    new_generated_text = generate_educational_content("Matemática")  # Example
	    new_reference_text = "Content from your educational material..."

	    evaluation_result = analyze_content_for_review(
	        new_generated_text,
	        new_reference_text,
	        *(best_thresholds[name] for name in threshold_bounds),
	    )

	    print("\n----- Evaluation Result -----")
	    print(f"Review Flag: {evaluation_result['review_flag']}")
	    print(f"Explanation: {evaluation_result['explanation']}")