jaothan committed on
Commit 71af9aa · verified · 1 Parent(s): 33cabe0

Update evaluate_prompts.py

Files changed (1): evaluate_prompts.py (+58 -42)
evaluate_prompts.py CHANGED
@@ -1,42 +1,58 @@
- import json
- import torch
- from transformers import pipeline
- from datasets import load_metric
-
- # Load evaluation metric
- rouge = load_metric("rouge")
-
- # Load summarization model
- summarizer = pipeline("summarization", model="facebook/bart-base")
-
- # Example prompts & expected outputs
- test_cases = [
-     {"input": "The Eiffel Tower is a landmark in Paris, built in 1889.", "expected_summary": "The Eiffel Tower was built in 1889 in Paris."},
-     {"input": "AI is changing industries by automating tasks and providing insights.", "expected_summary": "AI is transforming industries with automation."}
- ]
-
- def evaluate():
-     results = []
-     for case in test_cases:
-         model_output = summarizer(case["input"], max_length=50, min_length=5, do_sample=False)[0]["summary_text"]
-         score = rouge.compute(predictions=[model_output], references=[case["expected_summary"]])
-
-         results.append({"input": case["input"], "generated_summary": model_output, "rouge_score": score})
-
-     # Save evaluation results
-     with open("evaluation_results.json", "w") as f:
-         json.dump(results, f, indent=4)
-
-     avg_rouge_l = sum(res["rouge_score"]["rougeL"].mid.fmeasure for res in results) / len(results)
-
-     if avg_rouge_l >= 0.4:
-         print("✅ Model passed evaluation.")
-         return True
-     else:
-         print("❌ Model failed evaluation. Improve prompts or model.")
-         return False
-
- if __name__ == "__main__":
-     success = evaluate()
-     if not success:
-         exit(1) # Prevent deployment if evaluation fails
+ import json
+ import evaluate
+ import nltk
+ from transformers import pipeline
+
+ # Download NLTK tokenizer for ROUGE evaluation
+ nltk.download("punkt")
+
+ # Load the ROUGE evaluation metric
+ rouge = evaluate.load("rouge")
+
+ # Load a small foundation model
+ summarizer = pipeline("summarization", model="facebook/bart-base")
+
+ # Example test cases
+ test_cases = [
+     {
+         "input": "The Eiffel Tower is one of the most famous landmarks in the world. Built in 1889, it stands in Paris.",
+         "expected_summary": "The Eiffel Tower was built in 1889 in Paris."
+     },
+     {
+         "input": "Artificial Intelligence is transforming industries by automating tasks and providing data-driven insights.",
+         "expected_summary": "AI is revolutionizing industries with automation and insights."
+     }
+ ]
+
+ # Run the evaluation (named to avoid shadowing the imported evaluate module)
+ def run_evaluation():
+     results = []
+     for case in test_cases:
+         model_output = summarizer(case["input"], max_length=50, min_length=5, do_sample=False)[0]["summary_text"]
+         scores = rouge.compute(predictions=[model_output], references=[case["expected_summary"]], use_stemmer=True)
+
+         results.append({
+             "input": case["input"],
+             "generated_summary": model_output,
+             "expected_summary": case["expected_summary"],
+             "rouge_scores": scores
+         })
+
+     # Save evaluation results
+     with open("evaluation_results.json", "w") as f:
+         json.dump(results, f, indent=4)
+
+     # Compute average ROUGE-L (evaluate's rouge returns plain floats, not AggregateScore)
+     avg_rouge_l = sum(res["rouge_scores"]["rougeL"] for res in results) / len(results)
+
+     if avg_rouge_l >= 0.4:
+         print("✅ Model passed evaluation.")
+         return True
+     else:
+         print("❌ Model failed evaluation.")
+         return False
+
+ if __name__ == "__main__":
+     success = run_evaluation()
+     if not success:
+         exit(1) # Prevents deployment if evaluation fails
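
Note on the metric API migration in this commit: the old datasets.load_metric("rouge") (removed in recent datasets releases) returned AggregateScore objects, so scores were read as scores["rougeL"].mid.fmeasure; the replacement evaluate.load("rouge") returns plain floats by default, which is why the averaging line above indexes the dict directly. A minimal sketch of the new return shape (output values illustrative):

import evaluate

rouge = evaluate.load("rouge")
scores = rouge.compute(
    predictions=["The Eiffel Tower was built in 1889."],
    references=["The Eiffel Tower was built in 1889 in Paris."],
    use_stemmer=True,
)
# scores is a dict of plain floats, e.g.
# {"rouge1": 0.9, "rouge2": 0.8, "rougeL": 0.9, "rougeLsum": 0.9}
print(scores["rougeL"])  # a float; no .mid.fmeasure access needed

Because the values are plain floats, the results list also serializes cleanly with json.dump, which the old AggregateScore objects would not.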