Junaidb commited on
Commit
ef9830d
·
verified ·
1 Parent(s): dad9b95

Create llmeval.py

Browse files
Files changed (1) hide show
  1. llmeval.py +90 -0
llmeval.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
import re

from groq import Groq
4
+
5
# SECURITY: the API key must never be committed to source control. It is read
# from the GROQ_API_KEY environment variable instead; any key that was
# previously hard-coded here should be treated as leaked and revoked.
AK = os.environ.get("GROQ_API_KEY")

# Shared Groq client used by the evaluator class below. Groq() raises at
# construction time when no API key is available, so a missing environment
# variable fails fast at import rather than on the first request.
client = Groq(api_key=AK)
7
+
8
+
9
+
10
+
11
class LLM_as_Evaluator():
    """Use a Groq-hosted chat model as a judge for agent outputs.

    The class relies on the module-level ``client`` (a ``Groq`` instance)
    for all network calls and keeps no per-instance state.
    """

    def __init__(self):
        pass

    def ___engine_core(self, messages):
        """Send ``messages`` to the chat-completions endpoint.

        Args:
            messages: List of ``{"role": ..., "content": ...}`` dicts in the
                chat-completions format.

        Returns:
            The text content of the first completion choice, unmodified.
        """
        completion = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=messages,
            # temperature 0 keeps the grading as repeatable as the API allows.
            temperature=0.0,
            max_completion_tokens=5000,
            top_p=1,
            stream=False,
            stop=None,
        )
        return completion.choices[0].message.content

    def Paradigm_LLM_Evaluator(self, data_to_evaluate):
        """Score a prompt / research-data / response triplet with the LLM.

        Args:
            data_to_evaluate: Mapping with the keys ``"prompt"``,
                ``"research"`` and ``"response"``.

        Returns:
            The raw model reply. The system prompt asks the model for a
            ``Score:`` line followed by a ``Reasoning:`` line, but the reply
            is returned as-is and is not parsed or validated here.

        Raises:
            KeyError: If any of the required keys is missing. The message
                lists every missing key, and the check runs before any
                network call is made.
        """
        required_keys = ("prompt", "research", "response")
        missing = [key for key in required_keys if key not in data_to_evaluate]
        if missing:
            raise KeyError(
                "data_to_evaluate is missing required key(s): "
                + ", ".join(missing)
            )

        # NOTE(review): the prompt both demands that the output begin with
        # "Score:" and ends with "Think step by step"; these instructions
        # pull in different directions. Wording left unchanged - confirm intent.
        SYSTEM = '''
        Task:
        Evaluate the biological quality of a prompt-research data-response triplet on a 0–1 continuous scale.

        Goal:
        Assess:

        Whether the Prompt is clear, biologically specific, and aligned with the Research Data.

        Whether the Response is biologically relevant, mechanistically coherent, and experimentally actionable based on the Research Data.

        Scoring Guide (0–1 continuous scale):

        Score 1.0 if:

        Prompt is clear, biologically detailed, and correctly aligned to the research context.

        Response correctly identifies a biologically valid paradigm consistent with the Research Data.

        Lower scores if:

        The prompt is vague or misaligned.

        The response is biologically inaccurate, irrelevant, or mechanistically implausible.

        EXAMPLE:
        Input:
        Prompt: Identify a paradigm explaining the functional impact of BRCA1 mutations in ovarian cancer, focusing on DNA repair mechanisms.
        Research Data: BRCA1 loss-of-function mutations are associated with impaired homologous recombination repair, leading to genomic instability in ovarian epithelial cells.
        Agent's Response: BRCA1 mutations inhibit non-homologous end joining, which causes increased apoptosis in neurons, suggesting a neurodegeneration model.

        Your output must begin with Score: and contain only two fields: Score: and Reasoning:. No extra commentary, no markdown, no explanations before or after.:
        Score: 0.3
        Reasoning: The prompt and research data focus on ovarian cancer and homologous recombination, but the response incorrectly shifts to neurons and the wrong DNA repair pathway (non-homologous end joining instead of homologous recombination). Misalignment between response and biological context.


        Think step by step
        '''

        prompt_text = data_to_evaluate["prompt"]
        research_text = data_to_evaluate["research"]
        response_text = data_to_evaluate["response"]

        # Labels mirror the ones used in the system prompt's example so the
        # model sees a consistent input format (the misspelled
        # "Reseaerch Data" label and stray trailing commas were removed).
        user_content = (
            "\n"
            f"Prompt: {prompt_text}\n"
            f"Research Data: {research_text}\n"
            f"Agent's Response: {response_text}\n"
        )

        messages = [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": user_content},
        ]

        return self.___engine_core(messages=messages)