Spaces:
Runtime error
Runtime error
workflow(til): Migrate to categorization instead of score
Browse files- __init__.py +0 -0
- crew/__init__.py +0 -0
- crew/til.py +51 -29
- requirements.txt +1 -0
- tests/__init__.py +0 -0
- tests/til_test.py +39 -0
- ui/til_feedback.py +3 -2
__init__.py
ADDED
|
File without changes
|
crew/__init__.py
ADDED
|
File without changes
|
crew/til.py
CHANGED
|
@@ -1,21 +1,22 @@
|
|
| 1 |
-
from
|
|
|
|
|
|
|
|
|
|
| 2 |
from langchain_core.messages import SystemMessage
|
| 3 |
-
from pydantic import BaseModel, Field, UUID4
|
| 4 |
from langchain_core.output_parsers import JsonOutputParser
|
|
|
|
| 5 |
from langchain_openai import ChatOpenAI
|
| 6 |
-
from
|
| 7 |
from typing import List, Optional
|
| 8 |
-
import pprint
|
| 9 |
import os
|
| 10 |
-
|
| 11 |
-
HIGH_IMPACT_THRESHOLD = 8
|
| 12 |
-
LOW_IMPACT_THRESHOLD = 7
|
| 13 |
|
| 14 |
class TilCrew:
|
| 15 |
def kickoff(self, inputs={}):
|
| 16 |
print("Human Message:")
|
| 17 |
pprint.pp(inputs)
|
| 18 |
self.content = inputs["content"]
|
|
|
|
| 19 |
self._gather_feedback()
|
| 20 |
return self._final_call_on_feedback()
|
| 21 |
|
|
@@ -28,26 +29,26 @@ class TilCrew:
|
|
| 28 |
"til": feedback.get('til', ""),
|
| 29 |
"feedback": "not_ok",
|
| 30 |
}
|
| 31 |
-
if feedback["
|
| 32 |
result["feedback_criteria"] = "factuality_feedback"
|
| 33 |
result["reason"] = feedback["factuality_reason"]
|
| 34 |
final_results = final_results + [result]
|
| 35 |
continue
|
| 36 |
|
| 37 |
-
if feedback["
|
| 38 |
result["feedback_criteria"] = "insightful_feedback"
|
| 39 |
result["reason"] = feedback["insightful_reason"]
|
| 40 |
final_results = final_results + [result]
|
| 41 |
continue
|
| 42 |
|
| 43 |
-
if feedback["
|
| 44 |
result["feedback_criteria"] = "simplicity_feedback"
|
| 45 |
result["reason"] = feedback["simplicity_reason"]
|
| 46 |
result["suggestion"] = feedback["final_suggestion"]
|
| 47 |
final_results = final_results + [result]
|
| 48 |
continue
|
| 49 |
|
| 50 |
-
if feedback["
|
| 51 |
result["feedback_criteria"] = "grammatical_feedback"
|
| 52 |
result["reason"] = feedback["grammatical_reason"]
|
| 53 |
result["suggestion"] = feedback["final_suggestion"]
|
|
@@ -73,19 +74,40 @@ class TilCrew:
|
|
| 73 |
print("Feedback: ")
|
| 74 |
pprint.pp(self.feedback_results)
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
def _build_feedback_chain(self):
|
| 77 |
feedback_parser = JsonOutputParser(pydantic_object=TilFeedbackResults)
|
| 78 |
feedback_prompt = ChatPromptTemplate.from_messages([
|
| 79 |
SystemMessage(
|
| 80 |
-
"You are a 'Personal TIL Reviewer' who works in a Product Engineering Services company.
|
| 81 |
-
"
|
|
|
|
| 82 |
"1. Is the TIL insightful?"
|
| 83 |
"2. Is the TIL factually correct and accurate?"
|
| 84 |
"3. Is the TIL written in simple english?"
|
| 85 |
-
"4. Is the TIL grammatically correct?\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
"
|
| 88 |
-
"
|
| 89 |
f"Formatting Instructions: {feedback_parser.get_format_instructions()}"
|
| 90 |
),
|
| 91 |
HumanMessagePromptTemplate.from_template("{til_content}")
|
|
@@ -108,20 +130,20 @@ class TilCrew:
|
|
| 108 |
|
| 109 |
class TilFeedbackResult(BaseModel):
|
| 110 |
til: str = Field(description="TIL as exactly captured by the user without any modifications.")
|
| 111 |
-
|
| 112 |
-
description="TIL
|
| 113 |
-
insightful_reason: str = Field(description="
|
| 114 |
-
|
| 115 |
-
description="TIL
|
| 116 |
-
factuality_reason: str = Field(description="
|
| 117 |
-
|
| 118 |
-
description="TIL
|
| 119 |
-
simplicity_reason: str = Field(description="
|
| 120 |
-
|
| 121 |
-
description="TIL
|
| 122 |
-
grammatical_reason: str = Field(description="
|
| 123 |
final_suggestion: str = Field(
|
| 124 |
-
description="
|
| 125 |
|
| 126 |
|
| 127 |
class TilFeedbackResults(BaseModel):
|
|
|
|
| 1 |
+
from langchain import callbacks
|
| 2 |
+
from langchain import hub
|
| 3 |
+
from langchain.agents import AgentExecutor, create_react_agent
|
| 4 |
+
from langchain_community.tools.tavily_search import TavilyAnswer
|
| 5 |
from langchain_core.messages import SystemMessage
|
|
|
|
| 6 |
from langchain_core.output_parsers import JsonOutputParser
|
| 7 |
+
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate
|
| 8 |
from langchain_openai import ChatOpenAI
|
| 9 |
+
from pydantic import BaseModel, Field, UUID4
|
| 10 |
from typing import List, Optional
|
|
|
|
| 11 |
import os
|
| 12 |
+
import pprint
|
|
|
|
|
|
|
| 13 |
|
| 14 |
class TilCrew:
|
| 15 |
def kickoff(self, inputs={}):
|
| 16 |
print("Human Message:")
|
| 17 |
pprint.pp(inputs)
|
| 18 |
self.content = inputs["content"]
|
| 19 |
+
# self._gather_facts()
|
| 20 |
self._gather_feedback()
|
| 21 |
return self._final_call_on_feedback()
|
| 22 |
|
|
|
|
| 29 |
"til": feedback.get('til', ""),
|
| 30 |
"feedback": "not_ok",
|
| 31 |
}
|
| 32 |
+
if feedback["factuality_categorization"] != 'High':
|
| 33 |
result["feedback_criteria"] = "factuality_feedback"
|
| 34 |
result["reason"] = feedback["factuality_reason"]
|
| 35 |
final_results = final_results + [result]
|
| 36 |
continue
|
| 37 |
|
| 38 |
+
if feedback["insightful_categorization"] != 'High':
|
| 39 |
result["feedback_criteria"] = "insightful_feedback"
|
| 40 |
result["reason"] = feedback["insightful_reason"]
|
| 41 |
final_results = final_results + [result]
|
| 42 |
continue
|
| 43 |
|
| 44 |
+
if feedback["simplicity_categorization"] == 'Low':
|
| 45 |
result["feedback_criteria"] = "simplicity_feedback"
|
| 46 |
result["reason"] = feedback["simplicity_reason"]
|
| 47 |
result["suggestion"] = feedback["final_suggestion"]
|
| 48 |
final_results = final_results + [result]
|
| 49 |
continue
|
| 50 |
|
| 51 |
+
if feedback["grammatical_categorization"] == 'Low':
|
| 52 |
result["feedback_criteria"] = "grammatical_feedback"
|
| 53 |
result["reason"] = feedback["grammatical_reason"]
|
| 54 |
result["suggestion"] = feedback["final_suggestion"]
|
|
|
|
| 74 |
print("Feedback: ")
|
| 75 |
pprint.pp(self.feedback_results)
|
| 76 |
|
| 77 |
+
# Deprecated: Not using this as we are getting similar results by using or without using this
|
| 78 |
+
def _gather_facts(self):
|
| 79 |
+
facts_prompt = PromptTemplate.from_template("What are the facts on the topics mentioned in the following user's TILs: {content}")
|
| 80 |
+
tools = [TavilyAnswer()]
|
| 81 |
+
llm = ChatOpenAI(model=os.environ['OPENAI_MODEL'], temperature=0.2)
|
| 82 |
+
prompt = hub.pull("hwchase17/react")
|
| 83 |
+
agent = create_react_agent(llm, tools, prompt)
|
| 84 |
+
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
|
| 85 |
+
self.facts = agent_executor.invoke({"input": facts_prompt.format(content=self.content)})['output']
|
| 86 |
+
print("Gathered Facts: ")
|
| 87 |
+
pprint.pp(self.facts)
|
| 88 |
+
|
| 89 |
def _build_feedback_chain(self):
|
| 90 |
feedback_parser = JsonOutputParser(pydantic_object=TilFeedbackResults)
|
| 91 |
feedback_prompt = ChatPromptTemplate.from_messages([
|
| 92 |
SystemMessage(
|
| 93 |
+
"You are a 'Personal TIL Reviewer' who works in a Product Engineering Services company. "
|
| 94 |
+
"You are an expert in writing TILs which are Insightful, Factually correct, Easy to read and grammatically correct."
|
| 95 |
+
"Your goal is to review user's TILs and categorize their correctness as High, Medium, or Low based on the following metrics:"
|
| 96 |
"1. Is the TIL insightful?"
|
| 97 |
"2. Is the TIL factually correct and accurate?"
|
| 98 |
"3. Is the TIL written in simple english?"
|
| 99 |
+
"4. Is the TIL grammatically correct?\n\n"
|
| 100 |
+
|
| 101 |
+
"The criteria to use for assessing if they are insightful or not are:\n"
|
| 102 |
+
"* The TIL shouldn't just be an outright statement; it should also contain the reason why the statement is true."
|
| 103 |
+
"* It should showcase the understanding of the user on the subject.\n\n"
|
| 104 |
+
|
| 105 |
+
"The criteria to use for assessing if they are factual or not are:\n"
|
| 106 |
+
"* They are related to facts."
|
| 107 |
+
"* You are able to find a source which agrees to the fact from reputable websites.\n\n"
|
| 108 |
|
| 109 |
+
"Give a reason for your assessment in one or two sentences for each metric, and also rewrite the TIL if you were given the option to write it. "
|
| 110 |
+
"Evaluate each TIL in the context of all the user's TILs."
|
| 111 |
f"Formatting Instructions: {feedback_parser.get_format_instructions()}"
|
| 112 |
),
|
| 113 |
HumanMessagePromptTemplate.from_template("{til_content}")
|
|
|
|
| 130 |
|
| 131 |
class TilFeedbackResult(BaseModel):
|
| 132 |
til: str = Field(description="TIL as exactly captured by the user without any modifications.")
|
| 133 |
+
insightful_categorization: str = Field(
|
| 134 |
+
description="TIL categorization as High/Medium/Low based on correctness on the insightful metric.")
|
| 135 |
+
insightful_reason: str = Field(description="Reason for your assessment in one or two sentences on insightful metric for the user.")
|
| 136 |
+
factuality_categorization: str = Field(
|
| 137 |
+
description="TIL categorization as High/Medium/Low based on correctness on the factuality metric.")
|
| 138 |
+
factuality_reason: str = Field(description="Reason for your assessment in one or two sentences on factuality metric for the user.")
|
| 139 |
+
simplicity_categorization: str = Field(
|
| 140 |
+
description="TIL categorization as High/Medium/Low based on correctness on the simplicity metric.")
|
| 141 |
+
simplicity_reason: str = Field(description="Reason for your assessment in one or two sentences on simplicity metric for the user.")
|
| 142 |
+
grammatical_categorization: str = Field(
|
| 143 |
+
description="TIL categorization as High/Medium/Low based on correctness on the grammatical metric.")
|
| 144 |
+
grammatical_reason: str = Field(description="Reason for your assessment in one or two sentences on grammatical metric for the user.")
|
| 145 |
final_suggestion: str = Field(
|
| 146 |
+
description="Rewrite the TIL if you were given the option to write it which should score High on all the metrics.")
|
| 147 |
|
| 148 |
|
| 149 |
class TilFeedbackResults(BaseModel):
|
requirements.txt
CHANGED
|
@@ -13,3 +13,4 @@ fastapi
|
|
| 13 |
uvicorn
|
| 14 |
fastapi_cors
|
| 15 |
langsmith
|
|
|
|
|
|
| 13 |
uvicorn
|
| 14 |
fastapi_cors
|
| 15 |
langsmith
|
| 16 |
+
pytest
|
tests/__init__.py
ADDED
|
File without changes
|
tests/til_test.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from growthy_agents.crew.til import TilCrew # type: ignore
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
examples = [
|
| 6 |
+
("The sun rises in the east.", [
|
| 7 |
+
{"insightful_categorization": 'Low', "factuality_categorization": 'High', "simplicity_categorization": 'High', "grammatical_categorization": 'High'}]),
|
| 8 |
+
("* Quantization is the process of reducing the size of LLM models by reducing the underlying weights.\n"
|
| 9 |
+
"* In quantization the weights are reduced by scaling up the datatypes from a datatype that takes smaller space to a data type that takes a larger space, this is also known as downcasting for example downcasting from int8 to float32.\n"
|
| 10 |
+
"* Advantages: takes lesser space and increases compute speed.\n"
|
| 11 |
+
"* Disadvantages: Answers are less precise because of the loss of precision in the LLM model weights.\n", [
|
| 12 |
+
{"insightful_categorization": 'Medium', "factuality_categorization": 'High',
|
| 13 |
+
"simplicity_categorization": 'High', "grammatical_categorization": 'High'},
|
| 14 |
+
{"insightful_categorization": 'High', "factuality_categorization": 'Low',
|
| 15 |
+
"simplicity_categorization": 'High', "grammatical_categorization": 'High'},
|
| 16 |
+
{"insightful_categorization": 'High', "factuality_categorization": 'High',
|
| 17 |
+
"simplicity_categorization": 'High', "grammatical_categorization": 'High'},
|
| 18 |
+
{"insightful_categorization": 'High', "factuality_categorization": 'High',
|
| 19 |
+
"simplicity_categorization": 'High', "grammatical_categorization": 'High'},
|
| 20 |
+
]),
|
| 21 |
+
]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@pytest.mark.parametrize("input_text, expected_categorizations", examples)
|
| 25 |
+
def test_llm_evaluation(input_text, expected_categorizations):
|
| 26 |
+
til_crew = TilCrew()
|
| 27 |
+
til_crew.content = input_text
|
| 28 |
+
til_crew._gather_feedback()
|
| 29 |
+
response = til_crew.feedback_results
|
| 30 |
+
|
| 31 |
+
for idx, feedback in enumerate(response):
|
| 32 |
+
assert feedback["insightful_categorization"] == pytest.approx(
|
| 33 |
+
expected_categorizations[idx]["insightful_categorization"], abs=2.0)
|
| 34 |
+
assert feedback["factuality_categorization"] == pytest.approx(
|
| 35 |
+
expected_categorizations[idx]["factuality_categorization"], abs=2.0)
|
| 36 |
+
assert feedback["simplicity_categorization"] == pytest.approx(
|
| 37 |
+
expected_categorizations[idx]["simplicity_categorization"], abs=2.0)
|
| 38 |
+
assert feedback["grammatical_categorization"] == pytest.approx(
|
| 39 |
+
expected_categorizations[idx]["grammatical_categorization"], abs=2.0)
|
ui/til_feedback.py
CHANGED
|
@@ -21,8 +21,9 @@ def main():
|
|
| 21 |
til_content = st.text_area('Enter what you learnt today:',
|
| 22 |
"* Quantization is the process of reducing the size of LLM models by reducing the underlying weights.\n"
|
| 23 |
"* The weights are reduced by scaling down the datatypes from a datatype that takes larger space to a data type that takes a smaller space, this is also known as downcasting.\n"
|
| 24 |
-
"*
|
| 25 |
-
"* Disadvantages: Answers are less precise\n"
|
|
|
|
| 26 |
key='til_content', help='Enter what you learnt today')
|
| 27 |
|
| 28 |
if st.button("Get Feedback"):
|
|
|
|
| 21 |
til_content = st.text_area('Enter what you learnt today:',
|
| 22 |
"* Quantization is the process of reducing the size of LLM models by reducing the underlying weights.\n"
|
| 23 |
"* The weights are reduced by scaling down the datatypes from a datatype that takes larger space to a data type that takes a smaller space, this is also known as downcasting.\n"
|
| 24 |
+
"* Quantization offers benefits such as reduced storage space usage and faster computation.\n"
|
| 25 |
+
"* Disadvantages: Answers are less precise\n"
|
| 26 |
+
"* I learnt how to use Go Routines to handle concurrency in React.\n",
|
| 27 |
key='til_content', help='Enter what you learnt today')
|
| 28 |
|
| 29 |
if st.button("Get Feedback"):
|