# app.py — "cohescrap" Hugging Face Space: Cohere Aya 23 chatbot over text extracted from a URL.
import gradio as gr
import cohere
import os
import uuid
import secrets
import nltk
from unstructured.documents.html import HTMLDocument
import requests
from bs4 import BeautifulSoup
# Download NLTK data required by the `unstructured` HTML tooling.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Initialize Cohere client; COHERE_API_KEY must be set in the environment
# (e.g. as a Space secret) or the client will fail on first request.
co = cohere.Client(os.getenv("COHERE_API_KEY"), client_name="huggingface-aya-23")
# Function to process HTML content from a given URL
def process_html_from_url(url):
    """Fetch *url* and return its visible text, or None on any request failure.

    Args:
        url: The page to download.

    Returns:
        The page text with HTML tags stripped (via BeautifulSoup), or None
        if the request failed — callers branch on the None return.
    """
    try:
        # timeout= keeps the Gradio worker from hanging forever on an
        # unresponsive host (the original call had no timeout).
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text()
    except requests.RequestException as e:
        # Best-effort: log and signal failure rather than crash the UI.
        print(f"Error retrieving HTML content: {e}")
        return None
# Function to generate response using Cohere chatbot
def generate_response(user_message, extracted_text, cid, token, history=None):
    """Stream one chat turn from Cohere Aya 23.

    Yields ``(chat, history, cid)`` after every stream event so the Gradio
    Chatbot updates incrementally.

    Args:
        user_message: The user's question.
        extracted_text: Page text prepended to the question as context.
        cid: Conversation id; a fresh UUID is minted when falsy.
        token: Per-session token set by ``demo.load``; falsy means the page
            did not finish loading, so we abort with a UI error.
        history: Flat list alternating [user_msg, bot_msg, ...]; mutated
            in place. Defaults to a new list.

    Raises:
        gr.Error: If *token* is falsy.
    """
    if not token:
        raise gr.Error("Error loading.")
    if history is None:
        history = []
    if not cid:
        cid = str(uuid.uuid4())
    # Context + question go to the model (and into history) as one message.
    combined_message = f"{extracted_text}\n\n{user_message}"
    history.append(combined_message)
    stream = co.chat_stream(message=combined_message, conversation_id=cid, model='c4ai-aya-23', connectors=[], temperature=0.3)
    output = ""
    for idx, response in enumerate(stream):
        if response.event_type == "text-generation":
            output += response.text
        # First event (usually stream-start) appends a placeholder bot slot;
        # every later event overwrites that last slot with the growing text.
        if idx == 0:
            history.append(" " + output)
        else:
            history[-1] = output
        # Re-pair the flat history into (user, bot) tuples for the Chatbot.
        chat = [
            (history[i].strip(), history[i + 1].strip())
            for i in range(0, len(history) - 1, 2)
        ]
        yield chat, history, cid
# Function to clear chat
def clear_chat():
    """Reset the session: empty chat pane, empty history, fresh conversation id, cleared extracted text."""
    fresh_conversation_id = str(uuid.uuid4())
    return [], [], fresh_conversation_id, ""
# Function to handle URL input, unstructure the text, and submit to Cohere
def handle_unstructure_and_submit(url, user_message, cid, token, history):
    """Fetch *url*, extract its text, and run one chat turn over it.

    Returns ``(chat, history, cid, extracted_text)`` matching the Gradio
    outputs ``[chatbot, history, cid, extracted_text]``. On fetch failure the
    error is shown in the chat pane and session state is preserved (the
    original returned bare strings, which the Chatbot cannot render, and
    wiped the state).
    """
    page_content = process_html_from_url(url)
    if not page_content:
        return [(user_message, "Failed to retrieve HTML content")], history, cid, ""
    # Guard against an empty stream leaving `chat` unbound (NameError in the
    # original), and capture the yielded history/cid so a freshly minted
    # conversation id actually persists into session state.
    chat = []
    for chat, history, cid in generate_response(user_message, page_content, cid, token, history):
        pass
    return chat, history, cid, page_content
# Function to continue the conversation using the last extracted text
def continue_conversation(user_message, extracted_text, cid, token, history):
    """Run another chat turn re-using the previously extracted page text.

    Returns ``(chat, history, cid)`` matching the Gradio outputs
    ``[chatbot, history, cid]``. When no text has been extracted yet the
    notice is shown in the chat pane and state is preserved (the original
    returned bare strings, which the Chatbot cannot render, and wiped the
    state).
    """
    if not extracted_text:
        return [(user_message, "No text extracted to continue the conversation.")], history, cid
    # Guard against an empty stream leaving `chat` unbound, and capture the
    # yielded history/cid so a freshly minted conversation id persists.
    chat = []
    for chat, history, cid in generate_response(user_message, extracted_text, cid, token, history):
        pass
    return chat, history, cid
# Custom CSS for the Gradio app: centered logo/header text and larger fonts
# for the chatbot pane and the message input box.
custom_css = """
#logo-img {
display: block;
margin-left: auto;
margin-right: auto;
width: 50%;
}
#chatbot {
font-size: 16px;
min-height: 400px;
}
#user-message {
font-size: 16px;
}
.center-text {
text-align: center;
font-family: Arial, sans-serif;
}
.center-text h1 {
font-size: 2em;
font-weight: bold;
}
.center-text p {
font-size: 1.2em;
font-weight: bold;
}
"""
# Create Gradio interface
with gr.Blocks(analytics_enabled=False, css=custom_css) as demo:
    # Per-session state: conversation id, load token, flat chat history,
    # and the last URL-extracted text (re-used by "Continue Conversation").
    cid = gr.State("")
    token = gr.State(value=None)
    history = gr.State([])
    extracted_text = gr.State("")
    with gr.Row():
        gr.Markdown("""
<div class="center-text">
<h1>Cohere Chatbot</h1>
<p><strong>Note</strong>: Aya 23 using Unstructured to extract text from web and process it.</p>
<p><strong>Cohere Aya 23</strong>: <a href="https://cohere.com/research" target="_blank">Cohere for AI</a> and <a href="https://cohere.com/" target="_blank">Cohere</a></p>
<p><strong>Unstructured</strong>: Open-Source Pre-Processing Tools for Unstructured Data</p>
</div>
""")
    with gr.Row():
        url_input = gr.Textbox(placeholder="Enter URL ...", label="URL", show_label=False, elem_id="url-input")
    with gr.Row():
        chatbot = gr.Chatbot(elem_id="chatbot", show_label=False)
    with gr.Row():
        user_message = gr.Textbox(placeholder="Ask anything ...", label="Input", show_label=False, elem_id="user-message")
    with gr.Row():
        submit_button = gr.Button("Unstructure Text and Submit to Cohere")
        continue_button = gr.Button("Continue Conversation")
        clear_button = gr.Button("Clear chat")
    submit_button.click(fn=handle_unstructure_and_submit, inputs=[url_input, user_message, cid, token, history], outputs=[chatbot, history, cid, extracted_text], concurrency_limit=32)
    continue_button.click(fn=continue_conversation, inputs=[user_message, extracted_text, cid, token, history], outputs=[chatbot, history, cid], concurrency_limit=32)
    clear_button.click(fn=clear_chat, inputs=None, outputs=[chatbot, history, cid, extracted_text], concurrency_limit=32)
    # With inputs=None Gradio invokes the callback with NO arguments, so the
    # input-clearing lambdas must take no parameters (the original
    # `lambda x: ...` raised TypeError at runtime).
    user_message.submit(lambda: gr.update(value=""), None, [user_message], queue=False)
    submit_button.click(lambda: gr.update(value=""), None, [user_message], queue=False)
    continue_button.click(lambda: gr.update(value=""), None, [user_message], queue=False)
    clear_button.click(lambda: gr.update(value=""), None, [user_message], queue=False)
    # Mint a per-session token on page load; generate_response refuses to
    # run until it is set.
    demo.load(lambda: secrets.token_hex(16), None, token)
if __name__ == "__main__":
    # Queue requests (capped at 40 pending) and hide the API surface.
    try:
        app = demo.queue(api_open=False, max_size=40)
        app.launch(show_api=False)
    except Exception as exc:
        print(f"Error: {exc}")