from config import *
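
# NOTE: this module assumes config re-exports its dependencies. As an
# assumption (config is not shown here), it is expected to provide at least:
# InferenceClient (huggingface_hub), st (streamlit), requests, BeautifulSoup
# (bs4), re, os, time, tiktoken, and the constants MODEL_PATH,
# HUGGING_FACE_API_KEY, and SYSTEM_PROMPT_NO_URL.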

def make_request(user_input: str,
                 short_history: list,
                 chat_history: list):
    """Makes a request to the Hugging Face API"""
    client = InferenceClient(
        MODEL_PATH,
        token=HUGGING_FACE_API_KEY,
    )
    try:
        response = client.chat_completion(
            messages=short_history,
            max_tokens=5000,
            stream=False,
        )
        # get the response
        message = response.choices[0].message['content']
        # analyse the content to see if there is an action to perform
        try:
            perform_actions = look_for_actions(user_input, message)
        except Exception as e:
            st.info(f"An error occurred while looking for actions: {e}")
            perform_actions = (False, None)
        # if there was an action to perform, resubmit the question to the chatbot
        if perform_actions[0]:
            # replace the last message in the short history with the new message
            short_history[-1] = {'role': 'user', 'content': perform_actions[1]}
            # replace the first message with the system prompt without URL analysis
            short_history[0] = {'role': 'system', 'content': SYSTEM_PROMPT_NO_URL}
            # wait a little bit to avoid the API rate limit
            time.sleep(1)
            # make the request again
            response = client.chat_completion(
                messages=short_history,
                max_tokens=5000,
                stream=False,
            )
        # append the user message and the response to the history
        chat_history.append({'content': user_input, 'role': 'user'})
        chat_history.append(response.choices[0].message)
        return chat_history
    except Exception as e:
        st.error(f"An error occurred: {e}")
        st.stop()
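
# Minimal usage sketch (illustrative; SYSTEM_PROMPT and the Streamlit session
# state layout are assumptions, not part of this module):
#
#   short_history = [
#       {'role': 'system', 'content': SYSTEM_PROMPT},
#       {'role': 'user', 'content': user_input},
#   ]
#   st.session_state.chat_history = make_request(
#       user_input, short_history, st.session_state.chat_history
#   )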

def get_site_content(url: str):
    """Receives a URL and returns the content of the site"""
    # set a browser-like user agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    # get the site content
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # remove styles and scripts
    for script in soup(["script", "style"]):
        script.extract()
    # keep the meta tags of the header and all the content inside the body;
    # for the meta tags, get the tag itself and its attributes
    meta_tags = soup.head.find_all('meta')
    meta_tags_text = ''
    for tag in meta_tags:
        meta_tags_text += f'<{tag.name} {tag.attrs}>\n'
    # get the body text
    body_text = soup.body.get_text()
    # join the meta tags and the body text
    text = f'{meta_tags_text}\n{body_text}'
    # remove empty lines
    text = os.linesep.join([s for s in text.splitlines() if s])
    return text
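
# Illustrative example (network access and the page's markup are assumptions):
#
#   >>> text = get_site_content("https://example.com")
#   # 'text' now holds the page's <meta> tags followed by its visible body
#   # text, with scripts, styles, and blank lines stripped.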

def look_for_actions(user_input: str, message: str):
    """Receives a message and looks for the pattern ###ACTION###function###URL###"""
    # check if the pattern is in the message
    if '###' in message:
        # split the message by '###' to get the action and the URL
        split_string = message.split('###')
        if 'getSiteContent' in message:
            st.info("I need to visit the site to provide the answer. Please wait...")
            url = split_string[3].strip()
            # remove everything between ### markers (including the ###) from the user_input
            user_input = re.sub(r'###.*?###', '', user_input)
            # add the content of the website to the message
            url_content = f'{user_input}. Content of the site {url}:\n{get_site_content(url)}'
            # check if the url_content is too long; if so, keep trimming until it fits
            while get_token_amount(url_content) > 5000:
                url_content = url_content[:-100]
            return (True, url_content)
    # if there is no action to perform, return None
    return (False, None)
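
# Illustrative example of the marker this function expects (the exact wording
# comes from the system prompt in config, which is not shown here). A model
# reply such as:
#
#   "###ACTION###getSiteContent###https://example.com###"
#
# makes look_for_actions return (True, "<user question>. Content of the site
# https://example.com:\n<page text>"); a reply without '###' returns
# (False, None).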

def get_token_amount(text,
                     model_name="gpt-4") -> int:
    """Uses the tiktoken library to count the tokens of a text for a given model.

    Even though we are using a Llama model, we use the GPT-4 encoding as an approximation.

    Args:
        text (str): The text to check.
        model_name (str): The name of the model to check. Defaults to "gpt-4".

    Returns:
        int: The number of tokens in the text.
    """
    encoding = tiktoken.encoding_for_model(model_name)
    tokens = encoding.encode(text)
    return len(tokens)
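
if __name__ == "__main__":
    # Minimal sanity check (illustrative): count the tokens of a short string
    # and confirm it is well under the 5000-token budget used in look_for_actions.
    sample = "How many tokens does this sentence use?"
    print(get_token_amount(sample))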