Spaces:
Runtime error
Runtime error
| from pydantic import NoneStr | |
| import os | |
| from langchain.document_loaders import UnstructuredFileLoader | |
| import mimetypes | |
| import validators | |
| import requests | |
| import tempfile | |
| import gradio as gr | |
| import openai | |
| import re | |
| import urllib.parse | |
class WebpageSummarizer:
    """
    Summarize the chemistry-related content of a webpage with the OpenAI API.

    The pipeline is: download the URL into a temporary file
    (``upload_via_url``), parse that file into documents (``save_content``),
    and send the extracted text to the completion endpoint
    (``generate_summary``). ``summarize_webpage`` chains these steps and
    ``gradio_interface`` exposes them through a small Gradio UI.
    """

    # Seconds to wait for the remote server before giving up on a download.
    REQUEST_TIMEOUT = 30
    # Rough character budget for the page text placed in the prompt.
    # NOTE(review): heuristic meant to keep the prompt inside the model's
    # context window; tune it for the model actually in use.
    MAX_INPUT_CHARS = 10000

    def __init__(self):
        """
        Set the OpenAI API key from the ``OPENAI_API_KEY`` environment variable.
        """
        openai.api_key = os.getenv("OPENAI_API_KEY")

    @staticmethod
    def _guess_suffix(content_type):
        """
        Map a ``Content-Type`` header value to a file extension.

        Args:
            content_type: Raw header value such as ``"text/html; charset=utf-8"``,
                or ``None`` when the header is missing.
        Returns:
            The extension including the leading dot, or ``None`` when the
            header is missing or the MIME type is unknown.
        """
        if not content_type:
            return None
        # Parameters such as "; charset=utf-8" are not part of the MIME type
        # and prevent mimetypes from finding a match, so drop them first.
        mime_type = content_type.split(";", 1)[0].strip().lower()
        return mimetypes.guess_extension(mime_type)

    @staticmethod
    def _documents_to_text(documents) -> str:
        """
        Join the ``page_content`` of the loaded documents into one string.

        Args:
            documents: Iterable of objects exposing ``page_content`` (as
                returned by ``save_content``); ``None`` is treated as empty.
        Returns:
            The non-empty contents, stripped and separated by blank lines.
        """
        contents = (getattr(doc, "page_content", "") or "" for doc in documents or [])
        stripped = (content.strip() for content in contents)
        return "\n\n".join(content for content in stripped if content)

    # The return annotation is quoted so the pydantic alias is not evaluated
    # when the class body is executed.
    def upload_via_url(self, url: str) -> "NoneStr":
        """
        Downloads the resource behind a URL into a temporary file.

        Args:
            url (str): The URL of the webpage.
        Returns:
            NoneStr: Path of the temporary file holding the response body, or
            None when ``url`` is not a valid URL. The caller is responsible
            for deleting the file.
        Raises:
            requests.RequestException: If the download fails, times out, or
            the server answers with an HTTP error status.
        """
        if not validators.url(url):
            return None
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        }
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Do not go on to summarize an HTTP error page as if it were content.
        response.raise_for_status()
        suffix = self._guess_suffix(response.headers.get("content-type"))
        # Closing the file (via ``with``) flushes the body to disk before the
        # path is handed to the loader; delete=False keeps it on disk.
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
            temp_file.write(response.content)
        return temp_file.name

    def save_content(self, file_path: str) -> list:
        """
        Loads the file at the given path into documents.

        Args:
            file_path (str): The path of the file to load.
        Returns:
            list: The documents produced by ``UnstructuredFileLoader.load()``.
        """
        # strategy="fast" is forwarded to the loader as in the original code.
        loader = UnstructuredFileLoader(file_path, strategy="fast")
        return loader.load()

    def generate_summary(self, text: str) -> str:
        """
        Generates a summary using OpenAI API.

        Args:
            text (str): The text to be summarized.
        Returns:
            str: The generated summary, stripped of surrounding whitespace.
        """
        prompt = f"Summarize the chemical related parts from given text. if text has other language return the summary as english. text: {text}"
        # NOTE(review): "text-davinci-003" is a legacy completions model;
        # confirm it is still served for the account this runs under.
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            temperature=0,
            max_tokens=500,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0.6,
        )
        return response.choices[0].text.strip()

    def summarize_webpage(self, url: str) -> str:
        """
        Summarizes a webpage using OpenAI API.

        Args:
            url (str): The URL of the webpage.
        Returns:
            str: The generated summary, or a human-readable message when the
            URL is invalid, the page has no extractable text, or a step of
            the pipeline fails.
        """
        temporary_file_path = None
        try:
            temporary_file_path = self.upload_via_url(url)
            if temporary_file_path is None:
                return "Please enter a valid URL."
            documents = self.save_content(temporary_file_path)
            # Send the page text, not the repr of the document objects, and
            # cap its length so an oversized page does not fail the request.
            text = self._documents_to_text(documents)[: self.MAX_INPUT_CHARS]
            if not text:
                return "No readable text was found at this URL."
            return self.generate_summary(text)
        except Exception as error:
            # UI boundary: report what actually failed instead of blaming the
            # URL for network, parsing, or API errors.
            return f"Could not summarize the page ({type(error).__name__}): {error}"
        finally:
            # The download is only needed while parsing; remove it so repeated
            # requests do not accumulate temporary files.
            if temporary_file_path is not None:
                try:
                    os.remove(temporary_file_path)
                except OSError:
                    # Best-effort cleanup; a leftover temp file is harmless.
                    pass

    def gradio_interface(self):
        """
        Builds the Gradio UI (URL input, button, summary output) and launches it.
        """
        with gr.Blocks(css="style.css", theme=gr.themes.Soft()) as demo:
            gr.HTML(
                '<img class="leftimage" align="left" src="https://templates.images.credential.net/1612472097627370951721412474196.png" alt="Image" width="210" height="210">\n'
                '<img class="rightimage" align="right" src="https://logos-download.com/wp-content/uploads/2016/06/Syngenta_logo.png" alt="Image" width="150" height="140">'
            )
            with gr.Row():
                with gr.Column(elem_id="col-container"):
                    gr.HTML("""<center><h1>Syngenta Chemical Identifier</h1></center>""")
                    inputs = gr.Textbox(label="URL")
                    # NOTE(review): confirm that the installed Gradio version
                    # accepts the `label` keyword on gr.Button.
                    btn = gr.Button(label="Submit", value="Analyse")
                    outputs = gr.Textbox(label="Summary", lines=6)
                    btn.click(fn=self.summarize_webpage, inputs=inputs, outputs=outputs)
        # Launch the Gradio interface
        demo.launch()
if __name__ == "__main__":
    # Script entry point: build the summarizer and start its Gradio UI.
    WebpageSummarizer().gradio_interface()