File size: 2,507 Bytes
f3e6f32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0778ffc
 
 
f3e6f32
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import pandas as pd
import tiktoken
from typing import List, Optional
from email._parseaddr import AddressList as _AddressList

from schemas import BenchmarkItem, EvaluateData, EvaluateItem
from datasets import DatasetDict

def truncate_text(text: str, model: str = "gpt-4.1", max_tokens: Optional[int] = None) -> str:
    """
    Truncate text to at most ``max_tokens`` tokens using tiktoken.

    Args:
        text: Text to be truncated.
        model: Model name used to select the tokenizer, defaults to "gpt-4.1".
            If tiktoken does not recognise the model name, the "cl100k_base"
            encoding is used instead.
        max_tokens: Maximum token count. ``None``, zero, or a negative value
            disables truncation and ``text`` is returned unchanged.

    Returns:
        ``text`` itself when it already fits (or truncation is disabled),
        otherwise the decoded first ``max_tokens`` tokens.
    """
    # A non-positive limit means "no limit". Without this guard a negative
    # value would be used as a slice end and silently drop tokens from the
    # tail instead of keeping a prefix.
    if max_tokens is None or max_tokens <= 0:
        return text

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Fall back to the cl100k_base encoder when the model is unknown.
        encoding = tiktoken.get_encoding("cl100k_base")

    # disallowed_special=() lets special-token markers (e.g. "<|endoftext|>")
    # that appear in the input be encoded as ordinary text rather than
    # raising ValueError.
    tokens = encoding.encode(text, disallowed_special=())
    if len(tokens) <= max_tokens:
        return text

    return encoding.decode(tokens[:max_tokens])

def count_tokens(text: str, model: str = "gpt-4.1") -> int:
    """
    Count the number of tokens in a text using tiktoken.

    Args:
        text: Text whose tokens are counted.
        model: Model name used to select the tokenizer, defaults to "gpt-4.1".
            If tiktoken does not recognise the model name, the "cl100k_base"
            encoding is used instead.

    Returns:
        Number of tokens in the text; 0 for empty text.
    """
    # Empty text has no tokens, so skip loading the tokenizer altogether.
    if not text:
        return 0

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Fall back to the cl100k_base encoder when the model is unknown.
        encoding = tiktoken.get_encoding("cl100k_base")

    # disallowed_special=() lets special-token markers (e.g. "<|endoftext|>")
    # that appear in the input be counted as ordinary text rather than
    # raising ValueError.
    return len(encoding.encode(text, disallowed_special=()))



def parseaddr(addr):
    """
    Split an address string into its (realname, email address) pair.

    Only the first address found in ``addr`` is returned. When nothing can
    be parsed, the result is the 2-tuple ('', '').
    """
    parsed = _AddressList(addr).addresslist
    return parsed[0] if parsed else ('', '')


def parse_eval_dataset(dataset: DatasetDict) -> List[BenchmarkItem]:
    """
    Build a list of ``BenchmarkItem`` objects from the 'train' split of ``dataset``.

    The split is loaded into a pandas DataFrame and each row is mapped to one
    ``BenchmarkItem``. Every entry of ``row['evaluate']['items']`` is unpacked
    as keyword arguments into an ``EvaluateItem``, and the resulting list is
    wrapped in an ``EvaluateData``.

    Args:
        dataset: Dataset mapping that contains a 'train' split with the
            columns task_id, question, evaluate, category and level.

    Returns:
        One ``BenchmarkItem`` per row, in row order.
    """
    train_frame = pd.DataFrame(dataset['train'])

    def _to_item(record) -> BenchmarkItem:
        # Translate a single DataFrame row into its schema object.
        return BenchmarkItem(
            task_id=record['task_id'],
            question=record['question'],
            evaluate=EvaluateData(
                items=[EvaluateItem(**spec) for spec in record['evaluate']['items']]
            ),
            category=record['category'],
            level=record['level'],
        )

    # The row index from iterrows() is not needed, only the row itself.
    return [_to_item(record) for _, record in train_frame.iterrows()]