# app.py — "cohescrap" Hugging Face Space: Cohere Aya 23 chatbot over text extracted from a URL.
import gradio as gr
import cohere
import os
import uuid
import secrets
import nltk
from unstructured.documents.html import HTMLDocument
import requests
from bs4 import BeautifulSoup
# Download NLTK data required by the `unstructured` HTML tooling.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Initialize Cohere client; COHERE_API_KEY must be set in the environment
# (e.g. as a Space secret) or the client will fail on first request.
co = cohere.Client(os.getenv("COHERE_API_KEY"), client_name="huggingface-aya-23")
# Function to process HTML content from a given URL
def process_html_from_url(url):
    """Fetch *url* and return its visible text, or None on any request failure.

    Args:
        url: The page to download.

    Returns:
        The page text with HTML tags stripped (via BeautifulSoup), or None
        if the request failed — callers branch on the None return.
    """
    try:
        # timeout= keeps the Gradio worker from hanging forever on an
        # unresponsive host (the original call had no timeout).
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text()
    except requests.RequestException as e:
        # Best-effort: log and signal failure rather than crash the UI.
        print(f"Error retrieving HTML content: {e}")
        return None
# Function to generate response using Cohere chatbot
def generate_response(user_message, extracted_text, cid, token, history=None):
    """Stream one chat turn from Cohere Aya 23.

    Yields ``(chat, history, cid)`` after every stream event so the Gradio
    Chatbot updates incrementally.

    Args:
        user_message: The user's question.
        extracted_text: Page text prepended to the question as context.
        cid: Conversation id; a fresh UUID is minted when falsy.
        token: Per-session token set by ``demo.load``; falsy means the page
            did not finish loading, so we abort with a UI error.
        history: Flat list alternating [user_msg, bot_msg, ...]; mutated
            in place. Defaults to a new list.

    Raises:
        gr.Error: If *token* is falsy.
    """
    if not token:
        raise gr.Error("Error loading.")
    if history is None:
        history = []
    if not cid:
        cid = str(uuid.uuid4())
    # Context + question go to the model (and into history) as one message.
    combined_message = f"{extracted_text}\n\n{user_message}"
    history.append(combined_message)
    stream = co.chat_stream(message=combined_message, conversation_id=cid, model='c4ai-aya-23', connectors=[], temperature=0.3)
    output = ""
    for idx, response in enumerate(stream):
        if response.event_type == "text-generation":
            output += response.text
        # First event (usually stream-start) appends a placeholder bot slot;
        # every later event overwrites that last slot with the growing text.
        if idx == 0:
            history.append(" " + output)
        else:
            history[-1] = output
        # Re-pair the flat history into (user, bot) tuples for the Chatbot.
        chat = [
            (history[i].strip(), history[i + 1].strip())
            for i in range(0, len(history) - 1, 2)
        ]
        yield chat, history, cid
# Function to clear chat
def clear_chat():
    """Reset the session: empty chat pane, empty history, fresh conversation id, cleared extracted text."""
    fresh_conversation_id = str(uuid.uuid4())
    return [], [], fresh_conversation_id, ""
# Function to handle URL input, unstructure the text, and submit to Cohere
def handle_unstructure_and_submit(url, user_message, cid, token, history):
    """Fetch *url*, extract its text, and run one chat turn over it.

    Returns ``(chat, history, cid, extracted_text)`` matching the Gradio
    outputs ``[chatbot, history, cid, extracted_text]``. On fetch failure the
    error is shown in the chat pane and session state is preserved (the
    original returned bare strings, which the Chatbot cannot render, and
    wiped the state).
    """
    page_content = process_html_from_url(url)
    if not page_content:
        return [(user_message, "Failed to retrieve HTML content")], history, cid, ""
    # Guard against an empty stream leaving `chat` unbound (NameError in the
    # original), and capture the yielded history/cid so a freshly minted
    # conversation id actually persists into session state.
    chat = []
    for chat, history, cid in generate_response(user_message, page_content, cid, token, history):
        pass
    return chat, history, cid, page_content
# Function to continue the conversation using the last extracted text
def continue_conversation(user_message, extracted_text, cid, token, history):
    """Run another chat turn re-using the previously extracted page text.

    Returns ``(chat, history, cid)`` matching the Gradio outputs
    ``[chatbot, history, cid]``. When no text has been extracted yet the
    notice is shown in the chat pane and state is preserved (the original
    returned bare strings, which the Chatbot cannot render, and wiped the
    state).
    """
    if not extracted_text:
        return [(user_message, "No text extracted to continue the conversation.")], history, cid
    # Guard against an empty stream leaving `chat` unbound, and capture the
    # yielded history/cid so a freshly minted conversation id persists.
    chat = []
    for chat, history, cid in generate_response(user_message, extracted_text, cid, token, history):
        pass
    return chat, history, cid
# Custom CSS for the Gradio app: centered logo/header text and larger fonts
# for the chatbot pane and the message input box.
custom_css = """
#logo-img {
display: block;
margin-left: auto;
margin-right: auto;
width: 50%;
}
#chatbot {
font-size: 16px;
min-height: 400px;
}
#user-message {
font-size: 16px;
}
.center-text {
text-align: center;
font-family: Arial, sans-serif;
}
.center-text h1 {
font-size: 2em;
font-weight: bold;
}
.center-text p {
font-size: 1.2em;
font-weight: bold;
}
"""
# Create Gradio interface
with gr.Blocks(analytics_enabled=False, css=custom_css) as demo:
    # Per-session state: conversation id, load token, flat chat history,
    # and the last URL-extracted text (re-used by "Continue Conversation").
    cid = gr.State("")
    token = gr.State(value=None)
    history = gr.State([])
    extracted_text = gr.State("")
    with gr.Row():
        gr.Markdown("""
<div class="center-text">
<h1>Cohere Chatbot</h1>
<p><strong>Note</strong>: Aya 23 using Unstructured to extract text from web and process it.</p>
<p><strong>Cohere Aya 23</strong>: <a href="https://cohere.com/research" target="_blank">Cohere for AI</a> and <a href="https://cohere.com/" target="_blank">Cohere</a></p>
<p><strong>Unstructured</strong>: Open-Source Pre-Processing Tools for Unstructured Data</p>
</div>
""")
    with gr.Row():
        url_input = gr.Textbox(placeholder="Enter URL ...", label="URL", show_label=False, elem_id="url-input")
    with gr.Row():
        chatbot = gr.Chatbot(elem_id="chatbot", show_label=False)
    with gr.Row():
        user_message = gr.Textbox(placeholder="Ask anything ...", label="Input", show_label=False, elem_id="user-message")
    with gr.Row():
        submit_button = gr.Button("Unstructure Text and Submit to Cohere")
        continue_button = gr.Button("Continue Conversation")
        clear_button = gr.Button("Clear chat")
    submit_button.click(fn=handle_unstructure_and_submit, inputs=[url_input, user_message, cid, token, history], outputs=[chatbot, history, cid, extracted_text], concurrency_limit=32)
    continue_button.click(fn=continue_conversation, inputs=[user_message, extracted_text, cid, token, history], outputs=[chatbot, history, cid], concurrency_limit=32)
    clear_button.click(fn=clear_chat, inputs=None, outputs=[chatbot, history, cid, extracted_text], concurrency_limit=32)
    # With inputs=None Gradio invokes the callback with NO arguments, so the
    # input-clearing lambdas must take no parameters (the original
    # `lambda x: ...` raised TypeError at runtime).
    user_message.submit(lambda: gr.update(value=""), None, [user_message], queue=False)
    submit_button.click(lambda: gr.update(value=""), None, [user_message], queue=False)
    continue_button.click(lambda: gr.update(value=""), None, [user_message], queue=False)
    clear_button.click(lambda: gr.update(value=""), None, [user_message], queue=False)
    # Mint a per-session token on page load; generate_response refuses to
    # run until it is set.
    demo.load(lambda: secrets.token_hex(16), None, token)
if __name__ == "__main__":
    # Queue requests (capped at 40 pending) and hide the API surface.
    try:
        app = demo.queue(api_open=False, max_size=40)
        app.launch(show_api=False)
    except Exception as exc:
        print(f"Error: {exc}")