Wilame Lima committed on
Commit
45e0afe
·
1 Parent(s): 90263a4

Trim content of the url if it's too long

Browse files
Files changed (3) hide show
  1. config.py +1 -0
  2. functions.py +21 -0
  3. requirements.txt +2 -1
config.py CHANGED
@@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
6
  import requests
7
  import re
8
  import time
 
9
 
10
  # load variables from the env file
11
  load_dotenv()
 
6
  import requests
7
  import re
8
  import time
9
+ import tiktoken
10
 
11
  # load variables from the env file
12
  load_dotenv()
functions.py CHANGED
@@ -116,11 +116,32 @@ def look_for_actions(user_input:str, message:str):
116
 
117
  # add the content of the website to the message
118
  url_content = f'{user_input}. Content of the site {url}:\n{get_site_content(url)}'
 
 
 
 
119
 
120
  return (True, url_content)
121
 
122
  # if there is no action to perform, return None
123
  return (False, None)
 
 
 
 
 
 
 
 
 
 
 
124
 
 
 
 
125
 
 
 
 
126
 
 
116
 
117
  # add the content of the website to the message
118
  url_content = f'{user_input}. Content of the site {url}:\n{get_site_content(url)}'
119
+
120
+ # check if the url_content is too long. If so, keep trimming the text until it is not too long
121
+ while get_token_amount(url_content) > 5000:
122
+ url_content = url_content[:-100]
123
 
124
  return (True, url_content)
125
 
126
  # if there is no action to perform, return None
127
  return (False, None)
128
+
129
+ @st.cache_data(ttl=3600)
130
+ def get_token_amount(text,
131
+ model_name="gpt-4") -> int:
132
+
133
+ """Uses the tiktoken library to count the number of tokens in a text for a given model.
134
+ Even though we are using a Llama model, we are using the GPT-4 model as an approximation.
135
+
136
+ Args:
137
+ text (str): The text to check.
138
+ model_name (str): The name of the model to check. Defaults to "gpt-4".
139
 
140
+ Returns:
141
+ int: The number of tokens in the text.
142
+ """
143
 
144
+ encoding = tiktoken.encoding_for_model(model_name)
145
+ tokens = encoding.encode(text)
146
+ return len(tokens)
147
 
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  streamlit
2
  python-dotenv
3
  huggingface_hub
4
- beautifulsoup4
 
 
1
  streamlit
2
  python-dotenv
3
  huggingface_hub
4
+ beautifulsoup4
5
+ tiktoken