File size: 2,861 Bytes
e9aab05
 
 
 
 
 
cbe459e
e9aab05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b3da3c9
e9aab05
 
 
b3da3c9
 
 
 
 
e9aab05
b3da3c9
e9aab05
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json 
import sys
import unicodedata

from pydantic import ValidationError, BaseModel
from tinytroupe.utils import logger
from tinytroupe.utils.llm import extract_json

################################################################################
# Validation
################################################################################
def check_valid_fields(obj: dict, valid_fields: list) -> None:
    """
    Verifies that every key of the given dict appears in the list of valid fields.

    Keys are examined in the dict's iteration order; a ValueError is raised for the first key
    that is not found in `valid_fields`. When all keys are valid, nothing is returned.
    """
    # A private sentinel is used (rather than None) because None can itself be a dict key.
    _none_found = object()
    key = next((candidate for candidate in obj if candidate not in valid_fields), _none_found)

    if key is _none_found:
        return

    raise ValueError(f"Invalid key {key} in dictionary. Valid keys are: {valid_fields}")

def sanitize_raw_string(value: str) -> str:
    """
    Returns a cleaned-up copy of the specified string. The cleaning steps are:
      - dropping any characters that cannot be encoded as UTF-8 (e.g., lone surrogates);
      - normalizing the result to Unicode NFC form;
      - capping the length at the maximum Python string length.

    This is a defensive measure, taken out of caution with security.
    """

    # Round-trip through UTF-8 with the "ignore" error handler: anything that cannot be
    # encoded is silently discarded instead of raising.
    utf8_bytes = value.encode("utf-8", "ignore")
    cleaned = utf8_bytes.decode("utf-8")

    # Use composed (NFC) forms so that equivalent sequences end up identical.
    composed = unicodedata.normalize("NFC", cleaned)

    # Cap at sys.maxsize characters.
    max_length = sys.maxsize
    return composed[:max_length]

def sanitize_dict(value: dict) -> dict:
    """
    Sanitizes the specified dictionary, in place, by passing every string value it contains
    through `sanitize_raw_string`. String values are sanitized at any nesting level: nested
    dictionaries and lists are traversed as well.

    Dictionary keys and non-string values (numbers, booleans, None, tuples, etc.) are left untouched.
    No limit on nesting depth is enforced; instead, the traversal is iterative, so deep nesting
    does not consume call stack, and each container is visited only once, so self-referencing
    structures terminate.

    Returns the same dictionary object that was passed in.
    """

    pending = [value]   # containers still to be processed
    visited = set()     # ids of containers already processed (guards against cycles)

    while pending:
        container = pending.pop()
        if id(container) in visited:
            continue
        visited.add(id(container))

        # lists are addressed by index, dictionaries by key; only existing slots are
        # reassigned, so the container's size never changes while it is being iterated
        entries = enumerate(container) if isinstance(container, list) else container.items()
        for key, item in entries:
            if isinstance(item, str):
                container[key] = sanitize_raw_string(item)
            elif isinstance(item, (dict, list)):
                pending.append(item)

    return value

def to_pydantic_or_sanitized_dict(value: dict, model: BaseModel=None) -> dict:
    """
    Turns a model response dictionary into either a validated Pydantic object or a sanitized dictionary.

    When `model` is a subclass of `BaseModel`, JSON is extracted from `value['content']`, sanitized and
    validated against that model; on success, the resulting model instance is returned (note that this
    is not a dict, despite the return annotation). If validation fails, a warning is logged and the
    sanitized `value` dictionary is returned instead. If no JSON can be extracted at all, a warning is
    logged and a `NonTerminalError` is raised.

    When `model` is None, or is anything other than a `BaseModel` subclass, the sanitized `value`
    dictionary is returned.

    The `content` key is only read when a usable model is given.
    """
    from tinytroupe.openai_utils import NonTerminalError

    model_is_usable = model is not None and isinstance(model, type) and issubclass(model, BaseModel)
    if not model_is_usable:
        # Nothing to validate against: sanitizing is all that can be done.
        return sanitize_dict(value)

    payload = extract_json(value['content'])
    if payload is None:
        logger.warning(f"Failed to extract JSON from LLM response: {value['content']}")
        raise NonTerminalError(f"Failed to extract JSON from LLM response")

    try:
        return model.model_validate(sanitize_dict(payload))
    except ValidationError as e:
        # The payload does not fit the model; fall back to the sanitized raw response.
        logger.warning(f"Validation error: {e}")
        return sanitize_dict(value)