theRealNG commited on
Commit
26ce782
·
unverified ·
2 Parent(s): 1d8926b7ec107d

Merge pull request #18 from beautiful-code/llm_testing

Browse files

workflow(til): Migrate to categorization instead of score

__init__.py ADDED
File without changes
crew/__init__.py ADDED
File without changes
crew/til.py CHANGED
@@ -1,21 +1,22 @@
1
- from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
 
 
 
2
  from langchain_core.messages import SystemMessage
3
- from pydantic import BaseModel, Field, UUID4
4
  from langchain_core.output_parsers import JsonOutputParser
 
5
  from langchain_openai import ChatOpenAI
6
- from langchain import callbacks
7
  from typing import List, Optional
8
- import pprint
9
  import os
10
-
11
- HIGH_IMPACT_THRESHOLD = 8
12
- LOW_IMPACT_THRESHOLD = 7
13
 
14
  class TilCrew:
15
  def kickoff(self, inputs={}):
16
  print("Human Message:")
17
  pprint.pp(inputs)
18
  self.content = inputs["content"]
 
19
  self._gather_feedback()
20
  return self._final_call_on_feedback()
21
 
@@ -28,26 +29,26 @@ class TilCrew:
28
  "til": feedback.get('til', ""),
29
  "feedback": "not_ok",
30
  }
31
- if feedback["factuality_score"] < HIGH_IMPACT_THRESHOLD:
32
  result["feedback_criteria"] = "factuality_feedback"
33
  result["reason"] = feedback["factuality_reason"]
34
  final_results = final_results + [result]
35
  continue
36
 
37
- if feedback["insightful_score"] < HIGH_IMPACT_THRESHOLD:
38
  result["feedback_criteria"] = "insightful_feedback"
39
  result["reason"] = feedback["insightful_reason"]
40
  final_results = final_results + [result]
41
  continue
42
 
43
- if feedback["simplicity_score"] < LOW_IMPACT_THRESHOLD:
44
  result["feedback_criteria"] = "simplicity_feedback"
45
  result["reason"] = feedback["simplicity_reason"]
46
  result["suggestion"] = feedback["final_suggestion"]
47
  final_results = final_results + [result]
48
  continue
49
 
50
- if feedback["grammatical_score"] < LOW_IMPACT_THRESHOLD:
51
  result["feedback_criteria"] = "grammatical_feedback"
52
  result["reason"] = feedback["grammatical_reason"]
53
  result["suggestion"] = feedback["final_suggestion"]
@@ -73,19 +74,40 @@ class TilCrew:
73
  print("Feedback: ")
74
  pprint.pp(self.feedback_results)
75
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  def _build_feedback_chain(self):
77
  feedback_parser = JsonOutputParser(pydantic_object=TilFeedbackResults)
78
  feedback_prompt = ChatPromptTemplate.from_messages([
79
  SystemMessage(
80
- "You are a 'Personal TIL Reviewer' who works in a Product Engineering Services company. Your responsibility is to guide the user to write better TILs. "
81
- "Your personal goal is to review a user's list of TILs and suggeste edits based on the following criteria:\n"
 
82
  "1. Is the TIL insightful?"
83
  "2. Is the TIL factually correct and accurate?"
84
  "3. Is the TIL written in simple english?"
85
- "4. Is the TIL grammatically correct?\n"
 
 
 
 
 
 
 
 
86
 
87
- "Can you provide a score for on the scale of 10 for each of the TIL on each of these criteria and provide reasons for the score, "
88
- " the reason/feedback should be presented in the Point Of View of the Reviewer and the feedback should be direct."
89
  f"Formatting Instructions: {feedback_parser.get_format_instructions()}"
90
  ),
91
  HumanMessagePromptTemplate.from_template("{til_content}")
@@ -108,20 +130,20 @@ class TilCrew:
108
 
109
  class TilFeedbackResult(BaseModel):
110
  til: str = Field(description="TIL as exactly captured by the user without any modifications.")
111
- insightful_score: int = Field(
112
- description="TIL scores should be based solely on insightful criteria, with no other factors considered.")
113
- insightful_reason: str = Field(description="Feedback for low insightful_score if it is not 10")
114
- factuality_score: int = Field(
115
- description="TIL scores should be based solely on factuality criteria, with no other factors considered.")
116
- factuality_reason: str = Field(description="Feedback for low factuality_score if it is not 10")
117
- simplicity_score: int = Field(
118
- description="TIL scores should be based solely on simplicity criteria, with no other factors considered.")
119
- simplicity_reason: str = Field(description="Feedback for low simplicity_score if it is not 10")
120
- grammatical_score: int = Field(
121
- description="TIL scores should be based solely on grammatical criteria, with no other factors considered.")
122
- grammatical_reason: str = Field(description="Feedback for low grammatical_score if it is not 10")
123
  final_suggestion: str = Field(
124
- description="Final suggested version of the TIL")
125
 
126
 
127
  class TilFeedbackResults(BaseModel):
 
1
+ from langchain import callbacks
2
+ from langchain import hub
3
+ from langchain.agents import AgentExecutor, create_react_agent
4
+ from langchain_community.tools.tavily_search import TavilyAnswer
5
  from langchain_core.messages import SystemMessage
 
6
  from langchain_core.output_parsers import JsonOutputParser
7
+ from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate
8
  from langchain_openai import ChatOpenAI
9
+ from pydantic import BaseModel, Field, UUID4
10
  from typing import List, Optional
 
11
  import os
12
+ import pprint
 
 
13
 
14
  class TilCrew:
15
  def kickoff(self, inputs={}):
16
  print("Human Message:")
17
  pprint.pp(inputs)
18
  self.content = inputs["content"]
19
+ # self._gather_facts()
20
  self._gather_feedback()
21
  return self._final_call_on_feedback()
22
 
 
29
  "til": feedback.get('til', ""),
30
  "feedback": "not_ok",
31
  }
32
+ if feedback["factuality_categorization"] != 'High':
33
  result["feedback_criteria"] = "factuality_feedback"
34
  result["reason"] = feedback["factuality_reason"]
35
  final_results = final_results + [result]
36
  continue
37
 
38
+ if feedback["insightful_categorization"] != 'High':
39
  result["feedback_criteria"] = "insightful_feedback"
40
  result["reason"] = feedback["insightful_reason"]
41
  final_results = final_results + [result]
42
  continue
43
 
44
+ if feedback["simplicity_categorization"] == 'Low':
45
  result["feedback_criteria"] = "simplicity_feedback"
46
  result["reason"] = feedback["simplicity_reason"]
47
  result["suggestion"] = feedback["final_suggestion"]
48
  final_results = final_results + [result]
49
  continue
50
 
51
+ if feedback["grammatical_categorization"] == 'Low':
52
  result["feedback_criteria"] = "grammatical_feedback"
53
  result["reason"] = feedback["grammatical_reason"]
54
  result["suggestion"] = feedback["final_suggestion"]
 
74
  print("Feedback: ")
75
  pprint.pp(self.feedback_results)
76
 
77
+ # Deprecated: Not using this as we are getting similar results by using or without using this
78
+ def _gather_facts(self):
79
+ facts_prompt = PromptTemplate.from_template("What are the facts on the topics mentioned in the following user's TILs: {content}")
80
+ tools = [TavilyAnswer()]
81
+ llm = ChatOpenAI(model=os.environ['OPENAI_MODEL'], temperature=0.2)
82
+ prompt = hub.pull("hwchase17/react")
83
+ agent = create_react_agent(llm, tools, prompt)
84
+ agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
85
+ self.facts = agent_executor.invoke({"input": facts_prompt.format(content=self.content)})['output']
86
+ print("Gathered Facts: ")
87
+ pprint.pp(self.facts)
88
+
89
  def _build_feedback_chain(self):
90
  feedback_parser = JsonOutputParser(pydantic_object=TilFeedbackResults)
91
  feedback_prompt = ChatPromptTemplate.from_messages([
92
  SystemMessage(
93
+ "You are a 'Personal TIL Reviewer' who works in a Product Engineering Services company. "
94
+ "You are an expert in writing TILs which are Insightful, Factually correct, Easy to read and grammatically correct."
95
+ "Your goal is to review user's TILs and categorize their correctness as High, Medium, or Low based on the following metrics:"
96
  "1. Is the TIL insightful?"
97
  "2. Is the TIL factually correct and accurate?"
98
  "3. Is the TIL written in simple english?"
99
+ "4. Is the TIL grammatically correct?\n\n"
100
+
101
+ "The criteria to use for assessing if they are insightful or not are:\n"
102
+ "* The TIL shouldn't just be an outright statement; it should also contain the reason why the statement is true."
103
+ "* It should showcase the understanding of the user on the subject.\n\n"
104
+
105
+ "The criteria to use for assessing if they are factual or not are:\n"
106
+ "* They are related to facts."
107
+ "* You are able to find a source which agrees to the fact from reputable websites.\n\n"
108
 
109
+ "Give reason for your assessment in one or two sentences for each metric, and also rewrite the TIL if you were given the option to write it. "
110
+ "Evaluate each TIL in the context of all the user's TILs."
111
  f"Formatting Instructions: {feedback_parser.get_format_instructions()}"
112
  ),
113
  HumanMessagePromptTemplate.from_template("{til_content}")
 
130
 
131
  class TilFeedbackResult(BaseModel):
132
  til: str = Field(description="TIL as exactly captured by the user without any modifications.")
133
+ insightful_categorization: str = Field(
134
+ description="TIL categorization as High/Medium/Low based on correctness on the insightful metric.")
135
+ insightful_reason: str = Field(description="Reason for your assessment in one or two sentences on insightful metric for the user.")
136
+ factuality_categorization: str = Field(
137
+ description="TIL categorization as High/Medium/Low based on correctness on the factuality metric.")
138
+ factuality_reason: str = Field(description="Reason for your assessment in one or two sentences on factuality metric for the user.")
139
+ simplicity_categorization: str = Field(
140
+ description="TIL categorization as High/Medium/Low based on correctness on the simplicity metric.")
141
+ simplicity_reason: str = Field(description="Reason for your assessment in one or two sentences on simplicity metric for the user.")
142
+ grammatical_categorization: str = Field(
143
+ description="TIL categorization as High/Medium/Low based on correctness on the grammatical metric.")
144
+ grammatical_reason: str = Field(description="Reason for your assessment in one or two sentences on grammatical metric for the user.")
145
  final_suggestion: str = Field(
146
+ description="Rewrite the TIL if you were given the option to write it which should score High on all the metrics.")
147
 
148
 
149
  class TilFeedbackResults(BaseModel):
requirements.txt CHANGED
@@ -13,3 +13,4 @@ fastapi
13
  uvicorn
14
  fastapi_cors
15
  langsmith
 
 
13
  uvicorn
14
  fastapi_cors
15
  langsmith
16
+ pytest
tests/__init__.py ADDED
File without changes
tests/til_test.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from growthy_agents.crew.til import TilCrew # type: ignore
3
+
4
+
5
+ examples = [
6
+ ("The sun rises in the east.", [
7
+ {"insightful_categorization": 'Low', "factuality_categorization": 'High', "simplicity_categorization": 'High', "grammatical_categorization": 'High'}]),
8
+ ("* Quantization is the process of reducing the size of LLM models by reducing the underlying weights.\n"
9
+ "* In quantization the weights are reduced by scaling up the datatypes from a datatype that takes smaller space to a data type that takes a larger space, this is also known as downcasting for example downcasting from int8 to float32.\n"
10
+ "* Advantages: takes lesser space and increases compute speed.\n"
11
+ "* Disadvantages: Answers are less precise because of the loss of precision in the LLM model weights.\n", [
12
+ {"insightful_categorization": 'Medium', "factuality_categorization": 'High',
13
+ "simplicity_categorization": 'High', "grammatical_categorization": 'High'},
14
+ {"insightful_categorization": 'High', "factuality_categorization": 'Low',
15
+ "simplicity_categorization": 'High', "grammatical_categorization": 'High'},
16
+ {"insightful_categorization": 'High', "factuality_categorization": 'High',
17
+ "simplicity_categorization": 'High', "grammatical_categorization": 'High'},
18
+ {"insightful_categorization": 'High', "factuality_categorization": 'High',
19
+ "simplicity_categorization": 'High', "grammatical_categorization": 'High'},
20
+ ]),
21
+ ]
22
+
23
+
24
+ @pytest.mark.parametrize("input_text, expected_categorizations", examples)
25
+ def test_llm_evaluation(input_text, expected_categorizations):
26
+ til_crew = TilCrew()
27
+ til_crew.content = input_text
28
+ til_crew._gather_feedback()
29
+ response = til_crew.feedback_results
30
+
31
+ for idx, feedback in enumerate(response):
32
+ assert feedback["insightful_categorization"] == (
33
+ expected_categorizations[idx]["insightful_categorization"])
34
+ assert feedback["factuality_categorization"] == (
35
+ expected_categorizations[idx]["factuality_categorization"])
36
+ assert feedback["simplicity_categorization"] == (
37
+ expected_categorizations[idx]["simplicity_categorization"])
38
+ assert feedback["grammatical_categorization"] == (
39
+ expected_categorizations[idx]["grammatical_categorization"])
ui/til_feedback.py CHANGED
@@ -21,8 +21,9 @@ def main():
21
  til_content = st.text_area('Enter what you learnt today:',
22
  "* Quantization is the process of reducing the size of LLM models by reducing the underlying weights.\n"
23
  "* The weights are reduced by scaling down the datatypes from a datatype that takes larger space to a data type that takes a smaller space, this is also known as downcasting.\n"
24
- "* Advantages: takes lesser space and increases compute speed\n"
25
- "* Disadvantages: Answers are less precise\n",
 
26
  key='til_content', help='Enter what you learnt today')
27
 
28
  if st.button("Get Feedback"):
 
21
  til_content = st.text_area('Enter what you learnt today:',
22
  "* Quantization is the process of reducing the size of LLM models by reducing the underlying weights.\n"
23
  "* The weights are reduced by scaling down the datatypes from a datatype that takes larger space to a data type that takes a smaller space, this is also known as downcasting.\n"
24
+ "* Quantization offers benefits such as reduced storage space usage and faster computation.\n"
25
+ "* Disadvantages: Answers are less precise\n"
26
+ "* I learnt how to use Go Routines to handle concurrency in React.\n",
27
  key='til_content', help='Enter what you learnt today')
28
 
29
  if st.button("Get Feedback"):