Anshul Prasad committed on
Commit
e59f184
·
1 Parent(s): 3708a87

Initial commit

Browse files
Files changed (1) hide show
  1. utils/token.py +28 -0
utils/token.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+ from config import MODEL
3
+
4
+
5
# Shared tokenizer, used to limit the length of combined transcripts so they
# are ready for ChatGPT.
def _select_encoder(model_name):
    """Return the tiktoken encoding for *model_name*, or ``cl100k_base`` on KeyError."""
    try:
        return tiktoken.encoding_for_model(model_name)
    except KeyError:
        # fallback for custom or unrecognized model names
        return tiktoken.get_encoding("cl100k_base")


encoder = _select_encoder(MODEL)
11
+
12
+
13
def count_tokens(text: str) -> int:
    """Return the number of tokens in *text* using the module-level ``encoder``.

    Args:
        text: String to measure. Any falsy value (``""`` or ``None``) counts
            as zero tokens and the encoder is not invoked.

    Returns:
        The length of the token sequence produced by ``encoder.encode``.
    """
    if not text:
        return 0
    # tiktoken's default ``disallowed_special="all"`` raises ValueError when
    # the input merely contains the literal text of a special token (for
    # example "<|endoftext|>"). Passing an empty tuple makes the encoder
    # treat such markers as ordinary text instead of aborting the count.
    return len(encoder.encode(text, disallowed_special=()))
18
+
19
+
20
def trim_to_token_limit(text: str, max_tokens: int) -> str:
    """Return *text* cut down to at most *max_tokens* tokens.

    Args:
        text: String to trim. Falsy input (``""`` or ``None``) yields ``""``.
        max_tokens: Maximum number of tokens to keep. Zero or a negative
            value yields ``""``.

    Returns:
        *text* itself when it already fits within the limit; otherwise the
        string decoded from its first *max_tokens* tokens.
    """
    # Guard first: a negative limit would otherwise turn the slice below into
    # "drop the last N tokens" rather than "keep nothing".
    if not text or max_tokens <= 0:
        return ""
    # ``disallowed_special=()`` encodes literal special-token text (such as
    # "<|endoftext|>") as ordinary text instead of raising ValueError.
    tokens = encoder.encode(text, disallowed_special=())
    if len(tokens) <= max_tokens:
        return text
    # Decode only the first max_tokens tokens back into a string.
    # NOTE(review): the cut may land inside a multi-byte character, in which
    # case decoding can be lossy at the very end of the result -- confirm
    # whether callers care about a trailing replacement character.
    return encoder.decode(tokens[:max_tokens])