File size: 12,547 Bytes

5374a2d

import os 
import yaml
import json
import regex
from uuid import uuid4
from datetime import datetime, date 
from pydantic import BaseModel
from pydantic_core import PydanticUndefined, ValidationError
from typing import Union, Type, Any, List, Dict, get_origin, get_args

from .logging import logger 

def make_parent_folder(path: str):
    """
    Make sure the directory that will contain `path` exists.

    Args:
        path (str): A file path. When it has no directory component
            (e.g. "data.json"), nothing is created.
    """
    parent = os.path.dirname(path)
    has_parent = bool(parent.strip())
    # The explicit existence check is kept so that an existing non-directory
    # entry at `parent` is left alone instead of making os.makedirs raise.
    if has_parent and not os.path.exists(parent):
        os.makedirs(parent, exist_ok=True)

def generate_id():
    """
    Create a new random identifier.

    Returns:
        str: The 32-character lowercase hexadecimal form of a fresh UUID4.
    """
    identifier = uuid4()
    return identifier.hex

def get_timestamp():
    """
    Format the current local time as a string.

    Returns:
        str: The current time as "YYYY-MM-DD HH:MM:SS".
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    now = datetime.now()
    return now.strftime(timestamp_format)

def load_json(path: str, type: str="json"):
    """
    Load data from a JSON or JSON Lines file.

    Args:
        path (str): The path of the file to read.
        type (str): The file format, chosen from ["json", "jsonl"].

    Returns:
        The parsed content when type=="json", or a list holding one parsed
        object per non-blank line when type=="jsonl".

    Raises:
        FileNotFoundError: If `path` does not exist.
        Exception: When type=="json", any error raised while reading or parsing
            the file is logged and then re-raised unchanged.
        json.JSONDecodeError: When type=="jsonl" and a non-blank line is not valid JSON.
    """
    assert type in ["json", "jsonl"] # only support json or jsonl format

    if not os.path.exists(path):
        logger.error(f"File \"{path}\" does not exists!")
        # Stop here: continuing used to end in an unrelated UnboundLocalError
        # (json) or an unlogged error from open() (jsonl).
        raise FileNotFoundError(f"File \"{path}\" does not exist")

    if type == "json":
        try:
            with open(path, "r", encoding="utf-8") as file:
                # The json module is used instead of yaml.safe_load because
                # yaml.safe_load is very slow on large files.
                outputs = json.load(file)
        except Exception:
            logger.error(f"File \"{path}\" is not a valid json file!")
            # Re-raise the real cause; previously `outputs` was left unbound
            # and the function crashed on `return outputs`.
            raise
    elif type == "jsonl":
        with open(path, "r", encoding="utf-8") as fin:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record and would make json.loads fail, so skip them.
            outputs = [json.loads(line) for line in fin if line.strip()]
    else:
        # Only reachable when assertions are disabled (python -O).
        outputs = []

    return outputs

def save_json(data, path: str, type: str="json", use_indent: bool=True) -> str:
    """
    Write data to a JSON or JSON Lines file, creating the parent folder if needed.

    Args:
        data: What to save. For type=="json" this is a JSON string or a
            serializable object; for type=="jsonl" it is an iterable of JSON
            strings or serializable objects (one per output line).
        path (str): The path of the file to write.
        type (str): The file format, chosen from ["json", "jsonl"].
        use_indent (bool): Whether to indent the output by 4 spaces. Only
            applies to type=="json" when `data` is not already a string.

    Returns:
        str: The path the data was written to.
    """
    assert type in ["json", "jsonl"] # only support json or jsonl format
    make_parent_folder(path)

    def _as_text(obj, **dump_options):
        # Strings are assumed to be JSON already and are written verbatim.
        if isinstance(obj, str):
            return obj
        return json.dumps(obj, **dump_options)

    if type == "json":
        dump_options = {"indent": 4} if use_indent else {}
        with open(path, "w", encoding="utf-8") as fout:
            fout.write(_as_text(data, **dump_options))
    elif type == "jsonl":
        with open(path, "w", encoding="utf-8") as fout:
            for record in data:
                line = _as_text(record)
                fout.write("{}\n".format(line))

    return path

def escape_json_values(string: str) -> str:
    """
    Best-effort repair of a JSON-like string whose quotes or newlines are not properly escaped.

    If `string` already parses with `json.loads` it is returned unchanged. Otherwise a
    fixed sequence of regex rewrites is applied: every unescaped double quote is escaped,
    the quotes delimiting keys and values are restored, raw newlines inside values are
    turned into the two-character sequence `\\n`, and values that contain nested JSON
    are patched.

    Args:
        string (str): The candidate JSON text.

    Returns:
        str: The input text when it is already valid JSON, otherwise the rewritten text.
            NOTE(review): the rewritten text is returned even when it still fails
            `json.loads` (the second JSONDecodeError is swallowed) -- confirm that
            callers expect the modified rather than the original text in that case.
    """

    def escape_value(match):
        # Replacement callback for `pattern_value`: wrap the captured value in plain
        # double quotes and replace raw newlines with the literal characters \n.
        raw_value = match.group(1)
        raw_value = raw_value.replace('\n', '\\n')
        return f'"{raw_value}"'
    
    def fix_json(match):
        # Replacement callback for `pattern_nested_json`: rebuild a `"key": "value"` pair,
        # escaping newlines and any still-unescaped double quotes inside the value.
        # Inside this function the name hides the module-level `fix_json`.
        raw_key = match.group(1)
        raw_value = match.group(2)
        raw_value = raw_value.replace("\n", "\\n")
        raw_value = regex.sub(r'(?<!\\)"', '\\\"', raw_value)
        return f'"{raw_key}": "{raw_value}"'
    
    # Fast path: the text is already valid JSON, so leave it untouched.
    try:
        json.loads(string)
        return string
    except json.JSONDecodeError:
        pass

    # Repair path. The order of the substitutions matters: each step relies on the
    # escaping state produced by the previous one.
    try:
        string = regex.sub(r'(?<!\\)"', '\\\"', string) # escape every double quote that is not already preceded by a backslash
        pattern_key = r'\\"([^"]+)\\"(?=\s*:\s*)'
        string = regex.sub(pattern_key, r'"\1"', string) # an escaped-quoted token followed by a colon is a key: restore its plain quotes
        pattern_value = r'(?<=:\s*)\\"((?:\\.|[^"\\])*)\\"'
        string = regex.sub(pattern_value, escape_value, string, flags=regex.DOTALL) # an escaped-quoted token preceded by a colon is a value: restore its quotes and escape raw newlines
        pattern_nested_json = r'"([^"]+)"\s*:\s*\\"([^"]*\{+[\S\s]*?\}+)[\r\n\\n]*"' # handle nested json in value; the trailing character class accepts CR, LF, backslash and the letter n
        string = regex.sub(pattern_nested_json, fix_json, string, flags=regex.DOTALL)
        json.loads(string)
        return string
    except json.JSONDecodeError:
        # The repair did not produce valid JSON; fall through and return the rewritten text.
        pass
    
    return string

def fix_json_booleans(string: str) -> str:
    """
    Lowercase Python-style boolean literals so that they are valid JSON.

    Each standalone word "True" / "False" is rewritten to "true" / "false".
    Word boundaries (`\\b`) keep substrings such as the "True" in "IsTrue"
    untouched. The substitution is applied to the whole text, so standalone
    occurrences inside quoted string values are rewritten as well.

    Args:
        string (str): The input JSON string.

    Returns:
        str: The text with standalone booleans in lowercase.
    """
    replacements = (
        (r'\bTrue\b', 'true'),
        (r'\bFalse\b', 'false'),
    )
    for pattern, lowered in replacements:
        string = regex.sub(pattern, lowered, string)
    return string


def fix_json(string: str) -> str:
    """
    Apply the JSON repair steps to a candidate JSON string.

    Booleans are lowercased first (`fix_json_booleans`), then quotes and
    newlines are repaired (`escape_json_values`).

    Args:
        string (str): The candidate JSON text.

    Returns:
        str: The repaired text.
    """
    with_json_booleans = fix_json_booleans(string)
    return escape_json_values(with_json_booleans)


def parse_json_from_text(text: str) -> List[str]:
    """
    Extract every top-level JSON object or array embedded in a text.

    A recursive pattern (`(?R)`) is used so that balanced, nested braces and
    brackets are matched as a single candidate. Each candidate is passed
    through `fix_json` before being returned.

    Args:
        text (str): A text that includes JSON data.

    Returns:
        List[str]: The repaired JSON strings, in order of appearance.
    """
    json_pattern = r"""(?:\{(?:[^{}]*|(?R))*\}|\[(?:[^\[\]]*|(?R))*\])"""
    matcher = regex.compile(json_pattern, regex.VERBOSE)
    # The pattern has no capturing groups, so group() is the whole candidate.
    return [fix_json(found.group()) for found in matcher.finditer(text)]


def parse_xml_from_text(text: str, label: str) -> List[str]:
    """
    Extract the content of every `<label>...</label>` pair found in a text.

    Args:
        text (str): The text to search. Content may span multiple lines.
        label (str): The tag name, matched literally.

    Returns:
        List[str]: The whitespace-stripped content of each pair, in order of
            appearance; an empty list when the tag does not occur.
    """
    # Escape the label so that regex metacharacters in a tag name (e.g. the
    # "." in "a.b") are matched literally instead of acting as wildcards.
    tag = regex.escape(label)
    pattern = rf"<{tag}>(.*?)</{tag}>"
    matches: List[str] = regex.findall(pattern, text, regex.DOTALL)
    return [match.strip() for match in matches]

def parse_data_from_text(text: str, datatype: str):
    """
    Convert a text into the Python value described by `datatype`.

    Args:
        text (str): The raw text to convert.
        datatype (str): One of "str", "int", "float", "bool", "list", "dict".

    Returns:
        The converted value. For "bool" the result is True when the text,
        ignoring case and surrounding whitespace, is one of "true", "yes",
        "1" or "on". For an unrecognised `datatype` the raw text is returned
        unchanged.

    Raises:
        ValueError: If the text cannot be converted to "int" or "float".
        Exception: Whatever `eval` raises for a malformed "list"/"dict" text.
    """
    if datatype == "str":
        return text
    if datatype == "int":
        return int(text)
    if datatype == "float":
        return float(text)
    if datatype == "bool":
        # Strip first so that values such as " True\n" are recognised, in line
        # with int()/float(), which already tolerate surrounding whitespace.
        return text.strip().lower() in ("true", "yes", "1", "on")
    if datatype in ("list", "dict"):
        # SECURITY NOTE(review): eval executes arbitrary Python. If `text` can
        # come from an untrusted source (e.g. model output), consider
        # ast.literal_eval; kept as-is here to preserve existing behaviour.
        return eval(text)
    # Unknown datatype: fall back to the raw text instead of raising.
    return text

def parse_json_from_llm_output(text: str) -> dict:
    """
    Extract the first JSON candidate from LLM output and parse it.

    Args:
        text (str): The generated text that is expected to contain JSON.

    Returns:
        dict: The parsed data of the first JSON candidate found by
            `parse_json_from_text`.

    Raises:
        ValueError: If the text contains no JSON candidate, or if the first
            candidate cannot be parsed. In the latter case the underlying
            parser error is attached as `__cause__`.
    """
    json_list = parse_json_from_text(text=text)
    if not json_list:
        raise ValueError(f"The following generated text does not contain JSON string!\n{text}")

    json_text = json_list[0]
    try:
        # NOTE(review): parsed with yaml.safe_load rather than json.loads --
        # presumably to tolerate slightly malformed JSON; confirm before changing.
        return yaml.safe_load(json_text)
    except Exception as error:
        raise ValueError(f"The following generated text is not a valid JSON string!\n{json_text}") from error

def extract_code_blocks(text: str, return_type: bool = False) -> Union[List[str], List[tuple]]:
    """
    Collect the code blocks fenced by triple backticks in a text.

    When the text contains no fenced block, the whole (stripped) text is
    treated as a single block with no language.

    Args:
        text (str): The text containing code blocks
        return_type (bool): If True, each item is a (language, code) tuple,
            where language is None when the fence names none; otherwise each
            item is just the code.

    Returns:
        Union[List[str], List[tuple]]: Either list of code blocks or list of (language, code) tuples
    """
    # Group 1: optional alphabetic language tag; group 2: the fenced content.
    code_block_pattern = r"```((?:[a-zA-Z]*)?)\n*(.*?)\n*```"
    fenced = regex.findall(code_block_pattern, text, regex.DOTALL)

    if not fenced:
        whole_text = text.strip()
        return [(None, whole_text)] if return_type else [whole_text]

    blocks = []
    for language, body in fenced:
        snippet = body.strip()
        if return_type:
            blocks.append((language.strip() or None, snippet))
        else:
            blocks.append(snippet)
    return blocks

def remove_repr_quotes(json_string):
    """
    Strip the double quotes around values that look like a call expression.

    A quoted value of the form `"Name(...)"` (an identifier immediately
    followed by a parenthesised part) is replaced by its unquoted content.

    Args:
        json_string: The text to process.

    Returns:
        The text with the quotes around such values removed.
    """
    quoted_call = r'"([A-Za-z_]\w*\(.*\))"'
    return regex.sub(quoted_call, lambda found: found.group(1), json_string)

def custom_serializer(obj: Any): 
    """
    Fallback serializer for `json.dumps(..., default=custom_serializer)`.

    Args:
        obj (Any): An object the json module cannot serialize natively.

    Returns:
        A JSON-serializable stand-in for `obj`:
            - bytes / bytearray: decoded text (undecodable bytes are replaced);
            - datetime / date: a "YYYY-MM-DD HH:MM:SS" string;
            - set: a list of its items;
            - objects with both `read` and `name` attributes: "<FileObject name=...>";
            - callables: their `__name__` when they have one;
            - anything else: `repr(obj)`.
    """
    if isinstance(obj, (bytes, bytearray)):
        # errors="replace" keeps non-UTF-8 payloads from raising while an
        # (error) message is being formatted.
        return obj.decode(errors="replace")
    if isinstance(obj, (datetime, date)):
        return obj.strftime("%Y-%m-%d %H:%M:%S")
    if isinstance(obj, set):
        return list(obj)
    if hasattr(obj, "read") and hasattr(obj, "name"):
        return f"<FileObject name={getattr(obj, 'name', 'unknown')}>"
    if callable(obj):
        # Not every callable has __name__ (e.g. functools.partial objects or
        # instances defining __call__); those fall through to repr().
        name = getattr(obj, "__name__", None)
        if isinstance(name, str):
            return name
    # Every object has a repr, so this is the universal fallback.
    return repr(obj)

# def get_type_name(type):
#     """
#     return the name of a type.
#     """
#     origin = get_origin(type)
#     args = get_args(type)
#     if origin:
#         type_name = f"{origin.__name__}[{', '.join(arg.__name__ for arg in args)}]"
#     else:
#         type_name = getattr(type, "__name__", str(type))

#     return type_name

def get_type_name(typ):
    """
    Build a readable name for a type annotation.

    Args:
        typ: A class or a typing annotation (e.g. `int`, `List[int]`,
            `Optional[str]`, `Dict[str, Any]`, `Type[Foo]`).

    Returns:
        str: The readable name:
            - plain classes: their `__name__` (or `str(typ)` without one);
            - unions: member names joined by " | ";
            - `Type[X]`: "Type[X]", or "Type[Any]" when unparameterized;
            - list / tuple: "list[...]" / "tuple[...]", or the bare name
              when unparameterized;
            - dict: "dict[K, V]", or "dict" when unparameterized;
            - any other generic: `str(origin)`.
    """
    import types  # local import: only needed to recognise `X | Y` unions

    origin = get_origin(typ)
    if origin is None:
        return getattr(typ, "__name__", str(typ))

    # Resolved once up front; previously the `Type[...]` branch read `args`
    # before it was assigned and crashed.
    args = get_args(typ)

    # `X | Y` (PEP 604) reports types.UnionType as its origin on Python
    # versions where it is distinct from typing.Union.
    if origin is Union or origin is getattr(types, "UnionType", None):
        return " | ".join(get_type_name(arg) for arg in args)

    if origin is type:
        return f"Type[{get_type_name(args[0])}]" if args else "Type[Any]"

    if origin in (list, tuple):
        if not args:
            return origin.__name__
        return f"{origin.__name__}[{', '.join(get_type_name(arg) for arg in args)}]"

    if origin is dict:
        # A bare `Dict` has no arguments to unpack.
        if len(args) != 2:
            return "dict"
        key_type, value_type = args
        return f"dict[{get_type_name(key_type)}, {get_type_name(value_type)}]"

    return str(origin)

def get_pydantic_field_types(model: Type[BaseModel]) -> Dict[str, Union[str, dict]]:
    """
    Describe the type of every field declared on a pydantic model.

    Args:
        model (Type[BaseModel]): The model class to inspect.

    Returns:
        Dict[str, Union[str, dict]]: Maps each field name to the readable name
            of its annotation, or to a nested mapping of the same shape when
            the annotation itself exposes `model_fields`.
    """
    def describe(annotation):
        # Annotations that expose `model_fields` are expanded recursively.
        if hasattr(annotation, "model_fields"):
            return get_pydantic_field_types(annotation)
        return get_type_name(annotation)

    return {
        name: describe(info.annotation)
        for name, info in model.model_fields.items()
    }

def get_pydantic_required_field_types(model: Type[BaseModel]) -> Dict[str, str]:
    """
    Describe the type of every required field declared on a pydantic model.

    A field is kept only when it reports itself as required and has neither
    a default value nor a default factory.

    Args:
        model (Type[BaseModel]): The model class to inspect.

    Returns:
        Dict[str, str]: Maps each required field name to the readable name of
            its annotation.
    """
    return {
        name: get_type_name(info.annotation)
        for name, info in model.model_fields.items()
        if info.is_required()
        and info.default is PydanticUndefined
        and info.default_factory is None
    }

def format_pydantic_field_types(field_types: Dict[str, str]) -> str:
    """
    Render a field-name to type-name mapping as a brace-delimited string.

    Args:
        field_types (Dict[str, str]): Maps field names to type names.

    Returns:
        str: Text such as `{"a": int, "b": str}`; field names are quoted,
            type names are not. An empty mapping gives `{}`.
    """
    entries = [
        f"\"{field_name}\": {field_type}"
        for field_name, field_type in field_types.items()
    ]
    return "{" + ", ".join(entries) + "}"

def get_error_message(errors: List[Union[ValidationError, Exception]]) -> str: 
    """
    Format a collection of errors as one human-readable message.

    Validation errors are listed first under their own header, followed by
    all other exceptions (each prefixed with its class name). Sections and
    entries are separated by blank lines.

    Args:
        errors: A list of errors; a single error is also accepted and is
            treated as a one-element list.

    Returns:
        str: The formatted message, or an empty string when `errors` is empty.
    """
    error_list = errors if isinstance(errors, list) else [errors]

    validation_errors = [err for err in error_list if isinstance(err, ValidationError)]
    exceptions = [err for err in error_list if not isinstance(err, ValidationError)]

    sections = []
    if validation_errors:
        header = f" >>>>>>>> {len(validation_errors)} Validation Errors: <<<<<<<<\n\n"
        sections.append(header + "\n\n".join(str(err) for err in validation_errors))
    if exceptions:
        header = f">>>>>>>> {len(exceptions)} Exception Errors: <<<<<<<<\n\n"
        entries = [str(type(err).__name__) + ": " + str(err) for err in exceptions]
        sections.append(header + "\n\n".join(entries))

    return "\n\n".join(sections)

def get_base_module_init_error_message(cls, data: Dict[str, Any], errors: List[Union[ValidationError, Exception]]) -> str:
    """
    Build the message reported when a class cannot be instantiated from data.

    The message names the class, shows the input data as indented JSON
    (non-serializable values go through `custom_serializer`, and quotes
    around repr-like values are removed), and ends with the formatted errors.

    Args:
        cls: The class that failed to instantiate; only `__name__` is used.
        data (Dict[str, Any]): The data the instantiation was attempted with.
        errors: A list of errors, or a single error.

    Returns:
        str: The complete error message.
    """
    error_list = errors if isinstance(errors, list) else [errors]

    header = f"Can not instantiate {cls.__name__} from: "
    pretty_data = remove_repr_quotes(
        json.dumps(data, indent=4, default=custom_serializer)
    )
    details = get_error_message(error_list)
    return header + pretty_data + "\n\n" + details