fromozu committed on
Commit
654854d
·
verified ·
1 Parent(s): d5a5570

Upload bilingual_book_maker/book_maker/utils.py with huggingface_hub

Browse files
bilingual_book_maker/book_maker/utils.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+
3
# Borrowed from : https://github.com/openai/whisper
# Maps a language code to its lowercase English language name.
# NOTE: "zh-hans" and "zh" both map to "simplified chinese"; because "zh" is
# listed later, the reverse lookup in TO_LANGUAGE_CODE resolves that name to
# "zh".
LANGUAGES = {
    "en": "english",
    "zh-hans": "simplified chinese",
    "zh": "simplified chinese",
    "zh-hant": "traditional chinese",
    "zh-yue": "cantonese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}

# language code lookup by name, with a few language aliases
# The inverted LANGUAGES mapping comes first so the aliases below are added
# on top of it; when two codes share a name, the later code in LANGUAGES wins.
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}
124
+
125
+
126
def prompt_config_to_kwargs(prompt_config):
    """Convert a prompt config mapping into a keyword-argument dict.

    Args:
        prompt_config: Mapping that may contain the keys ``"user"`` and
            ``"system"``. Any falsy value (``None``, ``{}``) is treated as
            an empty mapping.

    Returns:
        A dict with exactly two keys: ``prompt_template`` (the ``"user"``
        entry) and ``prompt_sys_msg`` (the ``"system"`` entry). A missing
        entry yields ``None``.
    """
    prompt_config = prompt_config or {}
    return {
        "prompt_template": prompt_config.get("user"),
        "prompt_sys_msg": prompt_config.get("system"),
    }
132
+
133
+
134
# ref: https://platform.openai.com/docs/guides/chat/introduction
def num_tokens_from_text(text, model="gpt-3.5-turbo-0301"):
    """Return the number of tokens used by ``text`` sent as one user message.

    The text is counted as the ``content`` of a single message whose ``role``
    is ``"user"``, plus the fixed per-message overhead and the tokens that
    prime the assistant's reply.

    Args:
        text: The message content to count.
        model: Model name. Only ``"gpt-3.5-turbo-0301"`` is supported.

    Returns:
        The total token count as an int.

    Raises:
        NotImplementedError: If ``model`` is not ``"gpt-3.5-turbo-0301"``.
    """
    # Reject unsupported models before doing any encoding work.
    if model != "gpt-3.5-turbo-0301":  # note: future models may deviate from this
        raise NotImplementedError(
            f"num_tokens_from_text() is not presently implemented for model {model}.\n"
            "See https://github.com/openai/openai-python/blob/main/chatml.md "
            "for information on how messages are converted to tokens."
        )

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Model name unknown to tiktoken: fall back to the cl100k_base encoding.
        encoding = tiktoken.get_encoding("cl100k_base")

    message = {"role": "user", "content": text}

    # every message follows <im_start>{role/name}\n{content}<im_end>\n
    num_tokens = 4
    for value in message.values():
        # disallowed_special=() makes special-token strings that happen to
        # appear in the text (e.g. "<|endoftext|>") count as ordinary text
        # instead of raising ValueError.
        num_tokens += len(encoding.encode(value, disallowed_special=()))

    # every reply is primed with <im_start>assistant
    return num_tokens + 2