Spaces:
Sleeping
Sleeping
Wilame Lima committed on
Commit ·
45e0afe
1
Parent(s): 90263a4
Trim content of the url if it's too long
Browse files- config.py +1 -0
- functions.py +21 -0
- requirements.txt +2 -1
config.py
CHANGED
|
@@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
|
|
| 6 |
import requests
|
| 7 |
import re
|
| 8 |
import time
|
|
|
|
| 9 |
|
| 10 |
# load variables from the env file
|
| 11 |
load_dotenv()
|
|
|
|
| 6 |
import requests
|
| 7 |
import re
|
| 8 |
import time
|
| 9 |
+
import tiktoken
|
| 10 |
|
| 11 |
# load variables from the env file
|
| 12 |
load_dotenv()
|
functions.py
CHANGED
|
@@ -116,11 +116,32 @@ def look_for_actions(user_input:str, message:str):
|
|
| 116 |
|
| 117 |
# add the content of the website to the message
|
| 118 |
url_content = f'{user_input}. Content of the site {url}:\n{get_site_content(url)}'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
return (True, url_content)
|
| 121 |
|
| 122 |
# if there is no action to perform, return None
|
| 123 |
return (False, None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
|
|
|
|
|
|
|
|
|
| 125 |
|
|
|
|
|
|
|
|
|
|
| 126 |
|
|
|
|
| 116 |
|
| 117 |
# add the content of the website to the message
|
| 118 |
url_content = f'{user_input}. Content of the site {url}:\n{get_site_content(url)}'
|
| 119 |
+
|
| 120 |
+
# check if the url_content is too long. If so, keep trimming the text until it is not too long
|
| 121 |
+
while get_token_amount(url_content) > 5000:
|
| 122 |
+
url_content = url_content[:-100]
|
| 123 |
|
| 124 |
return (True, url_content)
|
| 125 |
|
| 126 |
# if there is no action to perform, return None
|
| 127 |
return (False, None)
|
| 128 |
+
|
| 129 |
+
@st.cache_data(ttl=3600)
|
| 130 |
+
def get_token_amount(text,
|
| 131 |
+
model_name="gpt-4") -> int:
|
| 132 |
+
|
| 133 |
+
"""Uses the tiktoken library to check if a text is too long for a given model.
|
| 134 |
+
Even though we are using a Llama model, we are using the GPT-4 model as an approximation.
|
| 135 |
+
|
| 136 |
+
Args:
|
| 137 |
+
text (str): The text to check.
|
| 138 |
+
model_name (str): The name of the model to check. Defaults to "gpt-4".
|
| 139 |
|
| 140 |
+
Returns:
|
| 141 |
+
int: The number of tokens in the text.
|
| 142 |
+
"""
|
| 143 |
|
| 144 |
+
encoding = tiktoken.encoding_for_model(model_name)
|
| 145 |
+
tokens = encoding.encode(text)
|
| 146 |
+
return len(tokens)
|
| 147 |
|
requirements.txt
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
streamlit
|
| 2 |
python-dotenv
|
| 3 |
huggingface_hub
|
| 4 |
-
beautifulsoup4
|
|
|
|
|
|
| 1 |
streamlit
|
| 2 |
python-dotenv
|
| 3 |
huggingface_hub
|
| 4 |
+
beautifulsoup4
|
| 5 |
+
tiktoken
|