Spaces:
Sleeping
Sleeping
| ############################################################################################################################################################### | |
| # _____ _ ___ _ ___ | |
| # |_ _|| |_ ___ | _ )(_) __ _ / __| __ _ _ __ _ _ __ ___ _ _ | |
| # | | | ' \ / -_) | _ \| |/ _` | \__ \/ _|| '_|/ _` || '_ \/ -_)| '_| | |
| # |_| |_||_|\___| |___/|_|\__, | |___/\__||_| \__,_|| .__/\___||_| | |
| # |___/ |_| | |
| # | |
| ############################################################################################################################################################## | |
| # _ ______ _ _ _______ _ _ | |
| # _ | | (_____ \ | | (_) (_______) (_) (_) | |
| # _____ _ _ _| |_ | |__ ___ ____ _ _____) ) ___ __| | ____ _ ____ ___ _ _ _ _____ ___ _ ____ _ | |
| # (____ || | | |(_ _)| _ \ / _ \ / ___)(_) | __ / / _ \ / _ | / ___)| | / _ | / _ \ | ||_|| |(____ | /___)| || _ \ | | | |
| # / ___ || |_| | | |_ | | | || |_| || | _ | | \ \ | |_| |( (_| || | | |( (_| || |_| | | | | |/ ___ ||___ || || | | || | | |
| # \_____||____/ \__)|_| |_| \___/ |_| (_) |_| |_| \___/ \____||_| |_| \___ | \___/ |_| |_|\_____|(___/ |_||_| |_||_| | |
| # (_____| | |
| ############################################################################################################################################################### | |
| # | |
| # Last updated in: 8/15/2024 | |
| # | |
| ############################################################################################################################################################### | |
| # ------------------------------------------------------------------------------ | |
| # IMPORTS | |
| # ------------------------------------------------------------------------------ | |
| import os | |
| import subprocess | |
| from typing import Tuple | |
| import gradio as gr | |
| from bs4 import BeautifulSoup as Soup | |
| from dotenv import load_dotenv | |
| from scrapegraphai.graphs import SmartScraperGraph | |
| from scrapegraphai.utils import prettify_exec_info | |
| from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings | |
| from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint | |
| from langchain_community.document_loaders import (AsyncHtmlLoader, | |
| NewsURLLoader, PubMedLoader, | |
| PlaywrightURLLoader, | |
| RecursiveUrlLoader, | |
| SeleniumURLLoader, | |
| UnstructuredURLLoader, | |
| WebBaseLoader) | |
| # ------------------------------------------------------------------------------ | |
| # DEV ENVIRONMENT SETUP | |
| # ------------------------------------------------------------------------------ | |
# Load environment variables before reading the Hugging Face API token.
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.environ.get('HUGGINGFACEHUB_API_TOKEN')

# Hugging Face repo IDs of the foundational (LLM) model and the embedding model.
FM_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
EM_REPO_ID = "sentence-transformers/all-MiniLM-l6-v2"

# Text-generation settings forwarded to the LLM endpoint.
_GENERATION_SETTINGS = dict(
    max_new_tokens=8192,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.1,
    repetition_penalty=1.03,
)

# Model instances shared by the scraping graph.
llm_model_instance = HuggingFaceEndpoint(
    repo_id=FM_REPO_ID,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
    **_GENERATION_SETTINGS,
)
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN,
    model_name=EM_REPO_ID,
)

# Configuration handed to SmartScraperGraph: pre-built model instances for
# both the LLM and the embedder.
graph_config = dict(
    llm=dict(model_instance=llm_model_instance),
    embeddings=dict(model_instance=embedder_model_instance),
)
| # ------------------------------------------------------------------------------ | |
| # THE BIG SCRAPER | |
| # ------------------------------------------------------------------------------ | |
def extractDataFromUrls(urls: str, loader_type: str) -> Tuple[list, list]:
    """Extract documents from the given URLs using the selected loader.

    Args:
        urls (str): Comma-separated URLs. Whitespace around each entry is
            ignored and empty entries are dropped. The 'RecursiveURL' and
            'PubMed' loaders only receive the first entry.
        loader_type (str): Name of the loader to use; must be one of the
            keys of the factory table below.

    Returns:
        tuple: On success, ``(json_data, documents)`` where ``documents`` is
        the value returned by ``loader.load()`` and ``json_data`` is the
        list of each item's ``to_json()`` result. For an unsupported loader,
        an empty URL list, or any exception, a pair of message strings is
        returned instead (no exception propagates to the caller).
    """
    # Factories are lambdas so a loader is only constructed after it has been
    # selected and the input has been validated.
    loader_factories = {
        'AsyncHtmlLoader': lambda targets: AsyncHtmlLoader(targets),
        'UnstructuredURL': lambda targets: UnstructuredURLLoader(urls=targets),
        'RecursiveURL': lambda targets: RecursiveUrlLoader(
            url=targets[0],
            max_depth=2,
            extractor=lambda x: Soup(x, "html.parser").text,
        ),
        'SeleniumURL': lambda targets: SeleniumURLLoader(urls=targets),
        'SeleniumURLH': lambda targets: SeleniumURLLoader(urls=targets, headless=False),
        'PlaywrightURL': lambda targets: PlaywrightURLLoader(urls=targets),
        'PubMed': lambda targets: PubMedLoader(targets[0]),
        'NewsURL': lambda targets: NewsURLLoader(targets),
        'WebBaseLoader': lambda targets: WebBaseLoader(targets),
    }
    try:
        build_loader = loader_factories.get(loader_type)
        if build_loader is None:
            return "Not Implemented. Development in Progress", "Work In Progress"

        # "a, b" must yield ["a", "b"], not ["a", " b"]; blank entries
        # (e.g. from a trailing comma) are discarded.
        url_list = [part.strip() for part in urls.split(',') if part.strip()]
        if not url_list:
            return "No URLs provided. Enter at least one URL.", "No URLs provided"

        documents = build_loader(url_list).load()
        json_data = [document.to_json() for document in documents]
        return json_data, documents
    except Exception as err:
        # Deliberate catch-all: the function is used as a UI callback and
        # reports failures as text rather than raising.
        # NOTE(review): the first element is wired to a gr.JSON output in the
        # UI; confirm that component accepts a plain message string.
        return "An Error Occurred. Contact Developer" + str(err), "Error Occured. Boom"
def scrapeAndSummarize(prompt: str, source: str) -> Tuple[dict, str]:
    """Run a SmartScraperGraph over a website and report how the run went.

    Args:
        prompt (str): Instruction passed to the scraping graph.
        source (str): URL of the website to scrape.

    Returns:
        tuple: ``(result, exec_info)`` where ``result`` is the value returned
        by the graph's ``run()`` and ``exec_info`` is the graph's execution
        info after formatting with ``prettify_exec_info``.
    """
    scraper = SmartScraperGraph(prompt=prompt, source=source, config=graph_config)
    answer = scraper.run()
    # Execution info is read only after run() has completed.
    return answer, prettify_exec_info(scraper.get_execution_info())
| # ------------------------------------------------------------------------------ | |
| # TABBED GRADIO UI | |
| # ------------------------------------------------------------------------------ | |
# Loader names offered in the dropdown. Entries that extractDataFromUrls does
# not handle ('Scrapy', 'PySpider', 'Beautiful Soup') produce its
# "Not Implemented" message.
choices = [
    'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed',
    'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup',
    'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL',
]

# Build the tabbed Gradio interface.
with gr.Blocks(theme="sudeepshouche/minimalist") as demo:
    gr.Markdown("# THE BIG SCRAPER")
    with gr.Tabs():
        # Tab 1: Data Extraction
        with gr.TabItem("Data Extraction"):
            gr.Markdown("## Extract data from URLs using various loaders")
            with gr.Row():
                url_input = gr.Textbox(label="Enter your comma separated URLs here")
                loader_dropdown = gr.Dropdown(choices=choices, label="Pick your Loader from here")
            extract_button = gr.Button("Extract Data")
            with gr.Row():
                extracted_data_json = gr.JSON(label="Extracted Data (JSON)")
                extracted_data_text = gr.Textbox(label="Extracted Data (Text)")
            extract_button.click(
                extractDataFromUrls,
                inputs=[url_input, loader_dropdown],
                outputs=[extracted_data_json, extracted_data_text],
            )
        # Tab 2: Website Scraping and Summarization
        with gr.TabItem("Scraping & Summarization"):
            with gr.Row():
                with gr.Column():
                    # Informational only: this field is not an input to the
                    # callback, so it is read-only and shows the model that
                    # is actually configured instead of a hard-coded label.
                    model_dropdown = gr.Textbox(
                        label="Model",
                        value=FM_REPO_ID.split("/")[-1],
                        interactive=False,
                    )
                    prompt_input = gr.Textbox(label="Prompt", value="List me all the press releases with their headlines and urls.")
                    source_input = gr.Textbox(label="Source URL", value="https://www.whitehouse.gov/")
                    scrape_button = gr.Button("Scrape and Summarize")
                with gr.Column():
                    result_output = gr.JSON(label="Result")
                    exec_info_output = gr.Textbox(label="Execution Info")
            scrape_button.click(
                scrapeAndSummarize,
                inputs=[prompt_input, source_input],
                outputs=[result_output, exec_info_output],
            )

# Launch the Gradio interface
demo.launch()