theRealNG commited on
Commit
26ce782
·
unverified ·
2 Parent(s): 1d8926b7ec107d

Merge pull request #18 from beautiful-code/llm_testing

Browse files

workflow(til): Migrate to categorization instead of score

__init__.py ADDED
File without changes
crew/__init__.py ADDED
File without changes
crew/til.py CHANGED
@@ -1,21 +1,22 @@
1
- from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
 
 
 
2
  from langchain_core.messages import SystemMessage
3
- from pydantic import BaseModel, Field, UUID4
4
  from langchain_core.output_parsers import JsonOutputParser
 
5
  from langchain_openai import ChatOpenAI
6
- from langchain import callbacks
7
  from typing import List, Optional
8
- import pprint
9
  import os
10
-
11
- HIGH_IMPACT_THRESHOLD = 8
12
- LOW_IMPACT_THRESHOLD = 7
13
 
14
  class TilCrew:
15
  def kickoff(self, inputs={}):
16
  print("Human Message:")
17
  pprint.pp(inputs)
18
  self.content = inputs["content"]
 
19
  self._gather_feedback()
20
  return self._final_call_on_feedback()
21
 
@@ -28,26 +29,26 @@ class TilCrew:
28
  "til": feedback.get('til', ""),
29
  "feedback": "not_ok",
30
  }
31
- if feedback["factuality_score"] < HIGH_IMPACT_THRESHOLD:
32
  result["feedback_criteria"] = "factuality_feedback"
33
  result["reason"] = feedback["factuality_reason"]
34
  final_results = final_results + [result]
35
  continue
36
 
37
- if feedback["insightful_score"] < HIGH_IMPACT_THRESHOLD:
38
  result["feedback_criteria"] = "insightful_feedback"
39
  result["reason"] = feedback["insightful_reason"]
40
  final_results = final_results + [result]
41
  continue
42
 
43
- if feedback["simplicity_score"] < LOW_IMPACT_THRESHOLD:
44
  result["feedback_criteria"] = "simplicity_feedback"
45
  result["reason"] = feedback["simplicity_reason"]
46
  result["suggestion"] = feedback["final_suggestion"]
47
  final_results = final_results + [result]
48
  continue
49
 
50
- if feedback["grammatical_score"] < LOW_IMPACT_THRESHOLD:
51
  result["feedback_criteria"] = "grammatical_feedback"
52
  result["reason"] = feedback["grammatical_reason"]
53
  result["suggestion"] = feedback["final_suggestion"]
@@ -73,19 +74,40 @@ class TilCrew:
73
  print("Feedback: ")
74
  pprint.pp(self.feedback_results)
75
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  def _build_feedback_chain(self):
77
  feedback_parser = JsonOutputParser(pydantic_object=TilFeedbackResults)
78
  feedback_prompt = ChatPromptTemplate.from_messages([
79
  SystemMessage(
80
- "You are a 'Personal TIL Reviewer' who works in a Product Engineering Services company. Your responsibility is to guide the user to write better TILs. "
81
- "Your personal goal is to review a user's list of TILs and suggeste edits based on the following criteria:\n"
 
82
  "1. Is the TIL insightful?"
83
  "2. Is the TIL factually correct and accurate?"
84
  "3. Is the TIL written in simple english?"
85
- "4. Is the TIL grammatically correct?\n"
 
 
 
 
 
 
 
 
86
 
87
- "Can you provide a score for on the scale of 10 for each of the TIL on each of these criteria and provide reasons for the score, "
88
- " the reason/feedback should be presented in the Point Of View of the Reviewer and the feedback should be direct."
89
  f"Formatting Instructions: {feedback_parser.get_format_instructions()}"
90
  ),
91
  HumanMessagePromptTemplate.from_template("{til_content}")
@@ -108,20 +130,20 @@ class TilCrew:
108
 
109
  class TilFeedbackResult(BaseModel):
110
  til: str = Field(description="TIL as exactly captured by the user without any modifications.")
111
- insightful_score: int = Field(
112
- description="TIL scores should be based solely on insightful criteria, with no other factors considered.")
113
- insightful_reason: str = Field(description="Feedback for low insightful_score if it is not 10")
114
- factuality_score: int = Field(
115
- description="TIL scores should be based solely on factuality criteria, with no other factors considered.")
116
- factuality_reason: str = Field(description="Feedback for low factuality_score if it is not 10")
117
- simplicity_score: int = Field(
118
- description="TIL scores should be based solely on simplicity criteria, with no other factors considered.")
119
- simplicity_reason: str = Field(description="Feedback for low simplicity_score if it is not 10")
120
- grammatical_score: int = Field(
121
- description="TIL scores should be based solely on grammatical criteria, with no other factors considered.")
122
- grammatical_reason: str = Field(description="Feedback for low grammatical_score if it is not 10")
123
  final_suggestion: str = Field(
124
- description="Final suggested version of the TIL")
125
 
126
 
127
  class TilFeedbackResults(BaseModel):
 
1
+ from langchain import callbacks
2
+ from langchain import hub
3
+ from langchain.agents import AgentExecutor, create_react_agent
4
+ from langchain_community.tools.tavily_search import TavilyAnswer
5
  from langchain_core.messages import SystemMessage
 
6
  from langchain_core.output_parsers import JsonOutputParser
7
+ from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate
8
  from langchain_openai import ChatOpenAI
9
+ from pydantic import BaseModel, Field, UUID4
10
  from typing import List, Optional
 
11
  import os
12
+ import pprint
 
 
13
 
14
  class TilCrew:
15
  def kickoff(self, inputs={}):
16
  print("Human Message:")
17
  pprint.pp(inputs)
18
  self.content = inputs["content"]
19
+ # self._gather_facts()
20
  self._gather_feedback()
21
  return self._final_call_on_feedback()
22
 
 
29
  "til": feedback.get('til', ""),
30
  "feedback": "not_ok",
31
  }
32
+ if feedback["factuality_categorization"] != 'High':
33
  result["feedback_criteria"] = "factuality_feedback"
34
  result["reason"] = feedback["factuality_reason"]
35
  final_results = final_results + [result]
36
  continue
37
 
38
+ if feedback["insightful_categorization"] != 'High':
39
  result["feedback_criteria"] = "insightful_feedback"
40
  result["reason"] = feedback["insightful_reason"]
41
  final_results = final_results + [result]
42
  continue
43
 
44
+ if feedback["simplicity_categorization"] == 'Low':
45
  result["feedback_criteria"] = "simplicity_feedback"
46
  result["reason"] = feedback["simplicity_reason"]
47
  result["suggestion"] = feedback["final_suggestion"]
48
  final_results = final_results + [result]
49
  continue
50
 
51
+ if feedback["grammatical_categorization"] == 'Low':
52
  result["feedback_criteria"] = "grammatical_feedback"
53
  result["reason"] = feedback["grammatical_reason"]
54
  result["suggestion"] = feedback["final_suggestion"]
 
74
  print("Feedback: ")
75
  pprint.pp(self.feedback_results)
76
 
77
+ # Deprecated: Not using this as we are getting similar results by using or without using this
78
+ def _gather_facts(self):
79
+ facts_prompt = PromptTemplate.from_template("What are the facts on the topics mentioned in the following user's TILs: {content}")
80
+ tools = [TavilyAnswer()]
81
+ llm = ChatOpenAI(model=os.environ['OPENAI_MODEL'], temperature=0.2)
82
+ prompt = hub.pull("hwchase17/react")
83
+ agent = create_react_agent(llm, tools, prompt)
84
+ agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
85
+ self.facts = agent_executor.invoke({"input": facts_prompt.format(content=self.content)})['output']
86
+ print("Gathered Facts: ")
87
+ pprint.pp(self.facts)
88
+
89
  def _build_feedback_chain(self):
90
  feedback_parser = JsonOutputParser(pydantic_object=TilFeedbackResults)
91
  feedback_prompt = ChatPromptTemplate.from_messages([
92
  SystemMessage(
93
+ "You are a 'Personal TIL Reviewer' who works in a Product Engineering Services company. "
94
+ "You are an expert in writing TILs which are Insightful, Factually correct, Easy to read and grammatically correct."
95
+ "Your goal is to review user's TILs and categorize their correctness as High, Medium, or Low based on the following metrics:"
96
  "1. Is the TIL insightful?"
97
  "2. Is the TIL factually correct and accurate?"
98
  "3. Is the TIL written in simple english?"
99
+ "4. Is the TIL grammatically correct?\n\n"
100
+
101
+ "The criteria to use for assessing if they are insightful or not are:\n"
102
+ "* The TIL shouldn't just be an outright statement; it should also contain the reason why the statement is true."
103
+ "* It should showcase the understanding of the user on the subject.\n\n"
104
+
105
+ "The criteria to use for assessing if they are factual or not are:\n"
106
+ "* They are related to facts."
107
+ "* You are able to find a source which agrees to the fact from reputable websites.\n\n"
108
 
109
+ "Give reason for your assessment in one or two sentences for each metric, and also rewrite the TIL if you were given the option to write it. "
110
+ "Evaluate each TIL in the context of all the user's TILs."
111
  f"Formatting Instructions: {feedback_parser.get_format_instructions()}"
112
  ),
113
  HumanMessagePromptTemplate.from_template("{til_content}")
 
130
 
131
  class TilFeedbackResult(BaseModel):
132
  til: str = Field(description="TIL as exactly captured by the user without any modifications.")
133
+ insightful_categorization: str = Field(
134
+ description="TIL categorization as High/Medium/Low based on correctness on the insightful metric.")
135
+ insightful_reason: str = Field(description="Reason for your assessment in one or two sentences on insightful metric for the user.")
136
+ factuality_categorization: str = Field(
137
+ description="TIL categorization as High/Medium/Low based on correctness on the factuality metric.")
138
+ factuality_reason: str = Field(description="Reason for your assessment in one or two sentences on factuality metric for the user.")
139
+ simplicity_categorization: str = Field(
140
+ description="TIL categorization as High/Medium/Low based on correctness on the simplicity metric.")
141
+ simplicity_reason: str = Field(description="Reason for your assessment in one or two sentences on simplicity metric for the user.")
142
+ grammatical_categorization: str = Field(
143
+ description="TIL categorization as High/Medium/Low based on correctness on the grammatical metric.")
144
+ grammatical_reason: str = Field(description="Reason for your assessment in one or two sentences on grammatical metric for the user.")
145
  final_suggestion: str = Field(
146
+ description="Rewrite the TIL if you were given the option to write it which should score High on all the metrics.")
147
 
148
 
149
  class TilFeedbackResults(BaseModel):
requirements.txt CHANGED
@@ -13,3 +13,4 @@ fastapi
13
  uvicorn
14
  fastapi_cors
15
  langsmith
 
 
13
  uvicorn
14
  fastapi_cors
15
  langsmith
16
+ pytest
tests/__init__.py ADDED
File without changes
tests/til_test.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from growthy_agents.crew.til import TilCrew # type: ignore
3
+
4
+
5
+ examples = [
6
+ ("The sun rises in the east.", [
7
+ {"insightful_categorization": 'Low', "factuality_categorization": 'High', "simplicity_categorization": 'High', "grammatical_categorization": 'High'}]),
8
+ ("* Quantization is the process of reducing the size of LLM models by reducing the underlying weights.\n"
9
+ "* In quantization the weights are reduced by scaling up the datatypes from a datatype that takes smaller space to a data type that takes a larger space, this is also known as downcasting for example downcasting from int8 to float32.\n"
10
+ "* Advantages: takes lesser space and increases compute speed.\n"
11
+ "* Disadvantages: Answers are less precise because of the loss of precision in the LLM model weights.\n", [
12
+ {"insightful_categorization": 'Medium', "factuality_categorization": 'High',
13
+ "simplicity_categorization": 'High', "grammatical_categorization": 'High'},
14
+ {"insightful_categorization": 'High', "factuality_categorization": 'Low',
15
+ "simplicity_categorization": 'High', "grammatical_categorization": 'High'},
16
+ {"insightful_categorization": 'High', "factuality_categorization": 'High',
17
+ "simplicity_categorization": 'High', "grammatical_categorization": 'High'},
18
+ {"insightful_categorization": 'High', "factuality_categorization": 'High',
19
+ "simplicity_categorization": 'High', "grammatical_categorization": 'High'},
20
+ ]),
21
+ ]
22
+
23
+
24
+ @pytest.mark.parametrize("input_text, expected_categorizations", examples)
25
+ def test_llm_evaluation(input_text, expected_categorizations):
26
+ til_crew = TilCrew()
27
+ til_crew.content = input_text
28
+ til_crew._gather_feedback()
29
+ response = til_crew.feedback_results
30
+
31
+ for idx, feedback in enumerate(response):
32
+ assert feedback["insightful_categorization"] == (
33
+ expected_categorizations[idx]["insightful_categorization"])
34
+ assert feedback["factuality_categorization"] == (
35
+ expected_categorizations[idx]["factuality_categorization"])
36
+ assert feedback["simplicity_categorization"] == (
37
+ expected_categorizations[idx]["simplicity_categorization"])
38
+ assert feedback["grammatical_categorization"] == (
39
+ expected_categorizations[idx]["grammatical_categorization"])
ui/til_feedback.py CHANGED
@@ -21,8 +21,9 @@ def main():
21
  til_content = st.text_area('Enter what you learnt today:',
22
  "* Quantization is the process of reducing the size of LLM models by reducing the underlying weights.\n"
23
  "* The weights are reduced by scaling down the datatypes from a datatype that takes larger space to a data type that takes a smaller space, this is also known as downcasting.\n"
24
- "* Advantages: takes lesser space and increases compute speed\n"
25
- "* Disadvantages: Answers are less precise\n",
 
26
  key='til_content', help='Enter what you learnt today')
27
 
28
  if st.button("Get Feedback"):
 
21
  til_content = st.text_area('Enter what you learnt today:',
22
  "* Quantization is the process of reducing the size of LLM models by reducing the underlying weights.\n"
23
  "* The weights are reduced by scaling down the datatypes from a datatype that takes larger space to a data type that takes a smaller space, this is also known as downcasting.\n"
24
+ "* Quantization offers benefits such as reduced storage space usage and faster computation.\n"
25
+ "* Disadvantages: Answers are less precise\n"
26
+ "* I learnt how to use Go Routines to handle concurrency in React.\n",
27
  key='til_content', help='Enter what you learnt today')
28
 
29
  if st.button("Get Feedback"):