Spaces:
No application file
No application file
| import nltk | |
| import mlflow | |
| import hyperopt | |
| from hyperopt import fmin, tpe, hp, STATUS_OK, Trials | |
| from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM | |
| import torch | |
| from sentence_transformers import SentenceTransformer, util | |
| from bert_score import score | |
| from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction | |
| from rouge import Rouge | |
| from tqdm import tqdm | |
| from datasets import load_metric | |
# Download necessary NLTK data
# These calls run at import time, before any function in this module is used.
nltk.download('punkt')
nltk.download('stopwords')

# --- Load pre-trained models ---
# Research and update these with the most recent and powerful Portuguese models
# Embedding model used by estimate_semantic_similarity().
semantic_similarity_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')
# Model/tokenizer pair used by calculate_perplexity().
# NOTE(review): the checkpoint name ("ptt5") suggests a T5-style encoder-decoder
# model, which may not load through AutoModelForCausalLM — confirm.
perplexity_model_name = "unicamp-dl/ptt5-base-portuguese-vocab"  # Example: More recent GPT-like model
perplexity_model = AutoModelForCausalLM.from_pretrained(perplexity_model_name)
perplexity_tokenizer = AutoTokenizer.from_pretrained(perplexity_model_name)

# Load Hugging Face metrics
# Metric objects used by calculate_metrics().
# NOTE(review): `datasets.load_metric` is deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# pinned version before relying on it.
bertscore_metric = load_metric("bertscore")
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")
meteor_metric = load_metric("meteor")  # Additional metric

# Load a powerful LLM for generating and judging content
# NOTE(review): "gpt-3.5-turbo" is the name of an OpenAI API model; verify that
# it resolves to a checkpoint that `transformers.pipeline` can actually load.
generator_model_name = "gpt-3.5-turbo"  # Or GPT-4 or Gemini if available
generator = pipeline("text-generation", model=generator_model_name)
# The judge is built from the same model name as the generator, so two
# separate pipelines are constructed for one model.
judge_model_name = generator_model_name  # Using the same model for judging
judge = pipeline("text-generation", model=judge_model_name)
| # --- Helper Functions --- | |
def calculate_perplexity(text):
    """Calculates perplexity of text using a Portuguese LLM model.

    The text is encoded with the module-level ``perplexity_tokenizer`` and
    passed to ``perplexity_model`` with the inputs doubling as labels. The
    first element of the model output is exponentiated and returned as a
    Python float (presumably that element is the mean token loss — confirm
    for the model class that is actually loaded).

    Args:
        text: The text to score.

    Returns:
        The perplexity as a float, or ``float('inf')`` if anything fails
        (the error is printed, not raised).
    """
    try:
        # Inference only: no gradients are needed for scoring.
        with torch.no_grad():
            tensor_input = perplexity_tokenizer.encode(text, return_tensors='pt')
            loss = perplexity_model(tensor_input, labels=tensor_input)[0]
        return torch.exp(loss).item()
    except Exception as e:
        # Deliberate best-effort: a failed score is reported as "infinitely
        # perplexing" so callers can keep going.
        print(f"Error calculating perplexity: {e}")
        return float('inf')
def estimate_semantic_similarity(generated_text, reference_text):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are embedded with the module-level
    ``semantic_similarity_model`` (generated text first, then reference) and
    compared with ``util.pytorch_cos_sim``.

    Args:
        generated_text: The text under evaluation.
        reference_text: The text it is compared against.

    Returns:
        The similarity as a Python float, or ``0.0`` if any step raises
        (the error is printed rather than propagated).
    """
    try:
        generated_vec, reference_vec = (
            semantic_similarity_model.encode(passage, convert_to_tensor=True)
            for passage in (generated_text, reference_text)
        )
        return util.pytorch_cos_sim(generated_vec, reference_vec).item()
    except Exception as err:
        print(f"Error calculating semantic similarity: {err}")
        return 0.0
def _rouge_l_f1(rouge_l):
    """Reduce a ROUGE-L result to a plain F-measure number.

    An aggregated result exposes ``.mid.fmeasure``; anything without a
    ``mid`` attribute (e.g. an already-plain float) is passed through.
    """
    mid = getattr(rouge_l, 'mid', None)
    if mid is not None:
        return mid.fmeasure
    return rouge_l


def _safe_metric(label, compute):
    """Run one metric computation; on failure print the error and return None."""
    try:
        return compute()
    except Exception as e:
        print(f"Error calculating {label}: {e}")
        return None


def calculate_metrics(generated_text, reference_text):
    """Calculates BERTScore, BLEU, ROUGE, and METEOR metrics.

    Each metric is computed independently with the module-level metric
    objects; a failure in one does not prevent the others from running.

    Args:
        generated_text: The text under evaluation.
        reference_text: The text it is compared against.

    Returns:
        A dict with keys ``'bertscore'``, ``'bleu'``, ``'rougeL'`` and
        ``'meteor'``. Each value is the metric's score, or ``None`` if that
        metric raised (the error is printed).
    """
    predictions = [generated_text]
    references = [reference_text]

    return {
        'bertscore': _safe_metric(
            'BERTScore',
            lambda: bertscore_metric.compute(
                predictions=predictions, references=references, lang="pt"
            )['f1'][0],
        ),
        # BLEU is fed whitespace-split tokens, with one reference list per
        # prediction.
        'bleu': _safe_metric(
            'BLEU',
            lambda: bleu_metric.compute(
                predictions=[generated_text.split()],
                references=[[reference_text.split()]],
            )['bleu'],
        ),
        # The raw 'rougeL' entry can be an aggregate object rather than a
        # number; callers compare and format this value as a float.
        'rougeL': _safe_metric(
            'ROUGE',
            lambda: _rouge_l_f1(
                rouge_metric.compute(
                    predictions=predictions, references=references
                )['rougeL']
            ),
        ),
        'meteor': _safe_metric(
            'METEOR',
            lambda: meteor_metric.compute(
                predictions=predictions, references=references
            )['meteor'],
        ),
    }
def _normalize_judgment(raw_reply):
    """Map a free-form judge reply onto one of the three category labels.

    Categories are checked from most to least severe, so a reply that
    mentions several is treated conservatively. If no category is found the
    stripped reply is returned unchanged so the caller can inspect it.
    """
    lowered = raw_reply.lower()
    for category in ("major issues", "minor issues", "no issues"):
        if category in lowered:
            return category
    return raw_reply.strip()


def get_llm_judgment(generated_text, reference_text):
    """Gets a judgment from a powerful LLM on the quality of the generated text.

    Builds an evaluation prompt containing both texts, sends it to the
    module-level ``judge`` pipeline and reduces the reply to a category.

    Args:
        generated_text: The text under evaluation.
        reference_text: The text it is compared against.

    Returns:
        ``"no issues"``, ``"minor issues"`` or ``"major issues"`` when the
        reply contains one of those labels; otherwise the stripped raw reply.
    """
    prompt = f"""
    You are an expert in evaluating educational content.
    Please evaluate the following generated text based on its accuracy, relevance, and clarity,
    compared to the provided reference text.
    Reference Text:
    {reference_text}
    Generated Text:
    {generated_text}
    Provide your judgment as one of the following categories:
    - "no issues": The generated text is accurate, relevant, and clear.
    - "minor issues": The generated text has some minor issues, but is mostly acceptable.
    - "major issues": The generated text has significant issues and needs substantial revision.
    """
    # return_full_text=False keeps the prompt (which itself lists all three
    # categories) out of the reply. max_new_tokens bounds only the
    # continuation, whereas a total max_length of 50 is shorter than the
    # prompt itself.
    reply = judge(prompt, max_new_tokens=20, return_full_text=False)[0]['generated_text']
    return _normalize_judgment(reply)
| # --- Content Analysis Function --- | |
def analyze_content_for_review(generated_text, reference_text,
                               similarity_threshold,
                               bertscore_threshold,
                               bleu_threshold,
                               rouge_threshold,
                               meteor_threshold):
    """Analyzes content and flags potential issues based on provided thresholds and LLM judgment.

    The automatic scores only contribute explanatory bullet points; whether
    the content is flagged for review is decided by the LLM judgment alone.

    Args:
        generated_text: The text under evaluation.
        reference_text: The text it is compared against.
        similarity_threshold: Semantic similarity below this adds an issue.
        bertscore_threshold: BERTScore below this adds an issue.
        bleu_threshold: BLEU below this adds an issue.
        rouge_threshold: ROUGE-L below this adds an issue.
        meteor_threshold: METEOR below this adds an issue.

    Returns:
        A dict with keys ``'review_flag'`` (bool), ``'explanation'`` (str),
        ``'semantic_similarity'``, ``'metrics'``, ``'llm_judgment'`` (as
        returned by the judge), ``'generated_text'`` and ``'reference_text'``.
    """
    similarity = estimate_semantic_similarity(generated_text, reference_text)
    metrics = calculate_metrics(generated_text, reference_text)
    llm_judgment = get_llm_judgment(generated_text, reference_text)

    issues = []
    if similarity < similarity_threshold:
        issues.append(f"- **Low Semantic Similarity:** ({similarity:.2f}) Content might be off-topic or not factually aligned.")

    metric_checks = (
        ('bertscore', bertscore_threshold,
         "- **Low BERTScore:** ({value:.2f}) There might be factual inaccuracies or significant paraphrasing."),
        ('bleu', bleu_threshold,
         "- **Low BLEU Score:** ({value:.2f}) The generated text might not be fluent or use appropriate wording."),
        ('rougeL', rouge_threshold,
         "- **Low ROUGE-L Score:** ({value:.2f}) The generated text might not cover important information from the reference."),
        ('meteor', meteor_threshold,
         "- **Low METEOR Score:** ({value:.2f}) The generated text might have poor word alignment with the reference."),
    )
    for key, threshold, template in metric_checks:
        value = metrics[key]
        # None means the metric failed to compute. A score of exactly 0.0 is
        # a real (and maximally bad) result, so it must not be skipped by a
        # truthiness test.
        if value is not None and value < threshold:
            issues.append(template.format(value=value))

    # Use LLM judgment as the primary decision-maker. The comparison ignores
    # case, surrounding whitespace, quotes and a trailing period so that
    # replies such as '"Major issues".' are still recognised.
    verdict = str(llm_judgment).strip().strip('"\'.').lower()
    heading = {
        "major issues": "Major Issues",
        "minor issues": "Minor Issues",
    }.get(verdict)

    if heading is not None:
        review_flag = True
        explanation = f"LLM Judgment: **{heading}**\n" + "\n".join(issues)
    else:
        review_flag = False
        explanation = "LLM Judgment: **No Issues**"

    return {
        'review_flag': review_flag,
        'explanation': explanation,
        'semantic_similarity': similarity,
        'metrics': metrics,
        'llm_judgment': llm_judgment,
        'generated_text': generated_text,
        'reference_text': reference_text
    }
| # --- Threshold Optimization Functions --- | |
def generate_educational_content(topic, num_sections=3):
    """Generates educational content with chapters, topics, sections, and subsections.

    Sends an instruction prompt to the module-level ``generator`` pipeline
    and returns only the generated continuation.

    Args:
        topic: Subject of the chapter, interpolated into the prompt.
        num_sections: Number of sections requested in the prompt.

    Returns:
        The generated text of the first returned sequence, without the
        instruction prompt.
    """
    prompt = f"""
    Generate a chapter of educational content on the topic of "{topic}".
    The chapter should include {num_sections} sections, each with at least
    one subsection. The content should be factually accurate, well-organized,
    and written in clear and concise Portuguese.
    """
    # By default a text-generation pipeline returns prompt + continuation.
    # The English instructions must not become part of the "content" that is
    # later scored against a reference.
    outputs = generator(prompt, max_length=1000, return_full_text=False)
    return outputs[0]['generated_text']
def _flagged_by_thresholds(result, params):
    """Return True when any automatic score in *result* is below its threshold."""
    if result['semantic_similarity'] < params['similarity_threshold']:
        return True
    metric_thresholds = (
        ('bertscore', 'bertscore_threshold'),
        ('bleu', 'bleu_threshold'),
        ('rougeL', 'rouge_threshold'),
        ('meteor', 'meteor_threshold'),
    )
    for metric_key, param_key in metric_thresholds:
        value = result['metrics'][metric_key]
        # None marks a metric that failed to compute; it cannot flag anything.
        if value is not None and value < params[param_key]:
            return True
    return False


def objective(params):
    """Objective function for Hyperopt to minimize.

    For each topic, two pieces of content are generated (one used as the
    reference, one as the candidate) and analysed. An error is counted when
    the thresholds in *params* flag a candidate although the LLM judge
    reported "no issues". The count is logged to MLflow and returned as the
    loss.

    The previous condition combined ``review_flag`` with a "no issues"
    judgment; ``review_flag`` is only ever true for major/minor judgments,
    so that condition could never hold and the loss was always 0.

    Args:
        params: Dict with the keys ``similarity_threshold``,
            ``bertscore_threshold``, ``bleu_threshold``, ``rouge_threshold``
            and ``meteor_threshold``.

    Returns:
        ``{'loss': <error count>, 'status': STATUS_OK}``.
    """
    # Generate AI-created data.
    # NOTE(review): none of the generation or scoring depends on `params`, so
    # it is repeated unchanged on every evaluation; consider computing it once.
    topics = ["Astronomia", "Biologia", "História", "Matemática", "Física", "Química"]  # More topics
    text_pairs = []
    for topic in topics:
        reference_text = generate_educational_content(topic)
        generated_text = generate_educational_content(topic)
        text_pairs.append((generated_text, reference_text))

    total_errors = 0
    for gen_text, ref_text in text_pairs:
        result = analyze_content_for_review(gen_text, ref_text,
                                            params['similarity_threshold'],
                                            params['bertscore_threshold'],
                                            params['bleu_threshold'],
                                            params['rouge_threshold'],
                                            params['meteor_threshold'])
        verdict = str(result['llm_judgment']).strip().strip('"\'.').lower()
        if verdict == "no issues" and _flagged_by_thresholds(result, params):
            total_errors += 1

    # Log metrics and parameters to MLflow
    with mlflow.start_run():
        mlflow.log_params(params)
        mlflow.log_metric("total_errors", total_errors)

    return {'loss': total_errors, 'status': STATUS_OK}
| # --- Main Execution --- | |
if __name__ == "__main__":
    # Phase 1: search for thresholds with Hyperopt, logging each run to MLflow.
    mlflow.set_tracking_uri("http://localhost:5000")  # Or your MLflow server URI

    # (lower, upper) bounds of the uniform prior for every threshold, listed
    # in the positional order expected by analyze_content_for_review().
    threshold_bounds = {
        'similarity_threshold': (0.5, 0.9),
        'bertscore_threshold': (0.7, 0.95),
        'bleu_threshold': (0.4, 0.8),
        'rouge_threshold': (0.4, 0.7),
        'meteor_threshold': (0.3, 0.7),
    }
    search_space = {
        label: hp.uniform(label, lower, upper)
        for label, (lower, upper) in threshold_bounds.items()
    }

    trials = Trials()
    best_thresholds = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=50,  # Adjust the number of evaluations as needed
        trials=trials,
    )
    print("Best thresholds found:", best_thresholds)

    # Phase 2: evaluate one new piece of content with the thresholds found.
    new_generated_text = generate_educational_content("Matemática")  # Example
    new_reference_text = "Content from your educational material..."
    evaluation_result = analyze_content_for_review(
        new_generated_text,
        new_reference_text,
        *(best_thresholds[label] for label in threshold_bounds),
    )

    print("\n----- Evaluation Result -----")
    print(f"Review Flag: {evaluation_result['review_flag']}")
    print(f"Explanation: {evaluation_result['explanation']}")