File size: 2,443 Bytes
8c27dd8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import pytest
from growthy_agents.crew.til import TilCrew  # type: ignore


# Parametrization data for test_llm_evaluation: each item is a
# (input_text, expected_categorizations) pair. expected_categorizations holds
# one dict per feedback entry, matched to the feedback results by position.
# The categories used here are 'Low', 'Medium' and 'High'.
examples = [
    (
        "The sun rises in the east.",
        [
            {
                "insightful_categorization": 'Low',
                "factuality_categorization": 'High',
                "simplicity_categorization": 'High',
                "grammatical_categorization": 'High',
            },
        ],
    ),
    (
        # NOTE(review): the second bullet looks deliberately inaccurate (its
        # expected factuality is 'Low'); do not "correct" the input text.
        "* Quantization is the process of reducing the size of LLM models by reducing the underlying weights.\n"
        "* In quantization the weights are reduced by scaling up the datatypes from a datatype that takes smaller space to a data type that takes a larger space, this is also known as downcasting for example downcasting from int8 to float32.\n"
        "* Advantages: takes lesser space and increases compute speed.\n"
        "* Disadvantages: Answers are less precise because of the loss of precision in the LLM model weights.\n",
        [
            {
                # Was misspelled 'Meidum', which could never equal a real
                # category value under the test's equality comparison.
                "insightful_categorization": 'Medium',
                "factuality_categorization": 'High',
                "simplicity_categorization": 'High',
                "grammatical_categorization": 'High',
            },
            {
                "insightful_categorization": 'High',
                "factuality_categorization": 'Low',
                "simplicity_categorization": 'High',
                "grammatical_categorization": 'High',
            },
            {
                "insightful_categorization": 'High',
                "factuality_categorization": 'High',
                "simplicity_categorization": 'High',
                "grammatical_categorization": 'High',
            },
            {
                "insightful_categorization": 'High',
                "factuality_categorization": 'High',
                "simplicity_categorization": 'High',
                "grammatical_categorization": 'High',
            },
        ],
    ),
]


@pytest.mark.parametrize("input_text, expected_categorizations", examples)
def test_llm_evaluation(input_text, expected_categorizations):
    """Check the feedback categorizations TilCrew produces for ``input_text``.

    The text is assigned to ``TilCrew.content``, ``_gather_feedback()`` is
    run, and each entry of ``feedback_results`` is compared, by position,
    with the matching dict in ``expected_categorizations``.

    Args:
        input_text: The text handed to the crew as its content.
        expected_categorizations: One dict per expected feedback entry,
            mapping each ``*_categorization`` key to its expected category.
    """
    categorization_keys = (
        "insightful_categorization",
        "factuality_categorization",
        "simplicity_categorization",
        "grammatical_categorization",
    )

    til_crew = TilCrew()
    til_crew.content = input_text
    til_crew._gather_feedback()
    feedback_results = list(til_crew.feedback_results)

    # Without this check an empty or short result would pass vacuously, and a
    # longer one would fail with an unhelpful IndexError.
    assert len(feedback_results) == len(expected_categorizations), (
        f"expected {len(expected_categorizations)} feedback entries, "
        f"got {len(feedback_results)}"
    )

    for idx, (feedback, expected) in enumerate(
            zip(feedback_results, expected_categorizations)):
        for key in categorization_keys:
            # Categories are strings, so compare them directly:
            # pytest.approx ignores tolerances for non-numeric values and
            # performs plain equality anyway.
            assert feedback[key] == expected[key], (
                f"feedback[{idx}][{key!r}]: expected {expected[key]!r}, "
                f"got {feedback[key]!r}"
            )