jaothan committed on
Commit 71af9aa · verified · 1 Parent(s): 33cabe0

Update evaluate_prompts.py

Files changed (1): evaluate_prompts.py (+58 -42)
evaluate_prompts.py CHANGED
@@ -1,42 +1,58 @@
- import json
- import torch
- from transformers import pipeline
- from datasets import load_metric
-
- # Load evaluation metric
- rouge = load_metric("rouge")
-
- # Load summarization model
- summarizer = pipeline("summarization", model="facebook/bart-base")
-
- # Example prompts & expected outputs
- test_cases = [
-     {"input": "The Eiffel Tower is a landmark in Paris, built in 1889.", "expected_summary": "The Eiffel Tower was built in 1889 in Paris."},
-     {"input": "AI is changing industries by automating tasks and providing insights.", "expected_summary": "AI is transforming industries with automation."}
- ]
-
- def evaluate():
-     results = []
-     for case in test_cases:
-         model_output = summarizer(case["input"], max_length=50, min_length=5, do_sample=False)[0]["summary_text"]
-         score = rouge.compute(predictions=[model_output], references=[case["expected_summary"]])
-
-         results.append({"input": case["input"], "generated_summary": model_output, "rouge_score": score})
-
-     # Save evaluation results
-     with open("evaluation_results.json", "w") as f:
-         json.dump(results, f, indent=4)
-
-     avg_rouge_l = sum(res["rouge_score"]["rougeL"].mid.fmeasure for res in results) / len(results)
-
-     if avg_rouge_l >= 0.4:
-         print("✅ Model passed evaluation.")
-         return True
-     else:
-         print("❌ Model failed evaluation. Improve prompts or model.")
-         return False
-
- if __name__ == "__main__":
-     success = evaluate()
-     if not success:
-         exit(1) # Prevent deployment if evaluation fails
+ import json
+ import evaluate
+ import nltk
+ from transformers import pipeline
+
+ # Download NLTK tokenizer for ROUGE evaluation
+ nltk.download("punkt")
+
+ # Load the ROUGE evaluation metric
+ rouge = evaluate.load("rouge")
+
+ # Load a small foundation model
+ summarizer = pipeline("summarization", model="facebook/bart-base")
+
+ # Example test cases
+ test_cases = [
+     {
+         "input": "The Eiffel Tower is one of the most famous landmarks in the world. Built in 1889, it stands in Paris.",
+         "expected_summary": "The Eiffel Tower was built in 1889 in Paris."
+     },
+     {
+         "input": "Artificial Intelligence is transforming industries by automating tasks and providing data-driven insights.",
+         "expected_summary": "AI is revolutionizing industries with automation and insights."
+     }
+ ]
+
+ # Run the evaluation (named to avoid shadowing the imported evaluate module)
+ def run_evaluation():
+     results = []
+     for case in test_cases:
+         model_output = summarizer(case["input"], max_length=50, min_length=5, do_sample=False)[0]["summary_text"]
+         scores = rouge.compute(predictions=[model_output], references=[case["expected_summary"]], use_stemmer=True)
+
+         results.append({
+             "input": case["input"],
+             "generated_summary": model_output,
+             "expected_summary": case["expected_summary"],
+             "rouge_scores": scores
+         })
+
+     # Save evaluation results
+     with open("evaluation_results.json", "w") as f:
+         json.dump(results, f, indent=4)
+
+     # Compute average ROUGE-L (evaluate's rouge returns plain floats, not AggregateScore)
+     avg_rouge_l = sum(res["rouge_scores"]["rougeL"] for res in results) / len(results)
+
+     if avg_rouge_l >= 0.4:
+         print("✅ Model passed evaluation.")
+         return True
+     else:
+         print("❌ Model failed evaluation.")
+         return False
+
+ if __name__ == "__main__":
+     success = run_evaluation()
+     if not success:
+         exit(1) # Prevents deployment if evaluation fails
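
Note on the metric API migration in this commit: the old datasets.load_metric("rouge") (removed in recent datasets releases) returned AggregateScore objects, so scores were read as scores["rougeL"].mid.fmeasure; the replacement evaluate.load("rouge") returns plain floats by default, which is why the averaging line above indexes the dict directly. A minimal sketch of the new return shape (output values illustrative):

import evaluate

rouge = evaluate.load("rouge")
scores = rouge.compute(
    predictions=["The Eiffel Tower was built in 1889."],
    references=["The Eiffel Tower was built in 1889 in Paris."],
    use_stemmer=True,
)
# scores is a dict of plain floats, e.g.
# {"rouge1": 0.9, "rouge2": 0.8, "rougeL": 0.9, "rougeLsum": 0.9}
print(scores["rougeL"])  # a float; no .mid.fmeasure access needed

Because the values are plain floats, the results list also serializes cleanly with json.dump, which the old AggregateScore objects would not.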