Spaces:

ResearchMAGIC
/

the-big-scraper

Sleeping

App Files Files Community

the-big-scraper / alternative.py

rodrigomasini

Update alternative.py

3f9fa90 verified over 1 year ago

raw

history blame contribute delete

9.29 kB

	###############################################################################################################################################################
	# _____ _ ___ _ ___
	# \|_ _\|\| \|_ ___ \| _ )(_) __ _ / __\| __ _ _ __ _ _ __ ___ _ _
	# \| \| \| ' \ / -_) \| _ \\| \|/ _` \| \__ \/ _\|\| '_\|/ _` \|\| '_ \/ -_)\| '_\|
	# \|_\| \|_\|\|_\|\___\| \|___/\|_\|\__, \| \|___/\__\|\|_\| \__,_\|\| .__/\___\|\|_\|
	# \|___/ \|_\|
	#
	##############################################################################################################################################################
	# _ ______ _ _ _______ _ _
	# _ \| \| (_____ \ \| \| (_) (_______) (_) (_)
	# _____ _ _ _\| \|_ \| \|__ ___ ____ _ _____) ) ___ __\| \| ____ _ ____ ___ _ _ _ _____ ___ _ ____ _
	# (____ \|\| \| \| \|(_ _)\| _ \ / _ \ / ___)(_) \| __ / / _ \ / _ \| / ___)\| \| / _ \| / _ \ \| \|\|_\|\| \|(____ \| /___)\| \|\| _ \ \| \|
	# / ___ \|\| \|_\| \| \| \|_ \| \| \| \|\| \|_\| \|\| \| _ \| \| \ \ \| \|_\| \|( (_\| \|\| \| \| \|( (_\| \|\| \|_\| \| \| \| \| \|/ ___ \|\|___ \|\| \|\| \| \| \|\| \|
	# \_____\|\|____/ \__)\|_\| \|_\| \___/ \|_\| (_) \|_\| \|_\| \___/ \____\|\|_\| \|_\| \___ \| \___/ \|_\| \|_\|\_____\|(___/ \|_\|\|_\| \|_\|\|_\|
	# (_____\|
	###############################################################################################################################################################
	#
	# Last updated on: 8/15/2024
	#
	###############################################################################################################################################################

	# ------------------------------------------------------------------------------
	# IMPORTS
	# ------------------------------------------------------------------------------

	import os
	import subprocess
	from typing import Tuple

	import gradio as gr
	from bs4 import BeautifulSoup as Soup
	from dotenv import load_dotenv

	from scrapegraphai.graphs import SmartScraperGraph
	from scrapegraphai.utils import prettify_exec_info

	from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
	from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint
	from langchain_community.document_loaders import (AsyncHtmlLoader,
	NewsURLLoader, PubMedLoader,
	PlaywrightURLLoader,
	RecursiveUrlLoader,
	SeleniumURLLoader,
	UnstructuredURLLoader,
	WebBaseLoader)


	# ------------------------------------------------------------------------------
	# DEV ENVIRONMENT SETUP
	# ------------------------------------------------------------------------------

	# Load environment variables (python-dotenv) and read the Hugging Face token
	load_dotenv()
	HUGGINGFACEHUB_API_TOKEN = os.environ.get('HUGGINGFACEHUB_API_TOKEN')

	# Hugging Face repo IDs of the foundation (LLM) model and the embedding model
	FM_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
	EM_REPO_ID = "sentence-transformers/all-MiniLM-l6-v2"

	# Text-generation settings forwarded to the LLM endpoint
	_llm_generation_settings = {
	    "max_new_tokens": 8192,
	    "top_k": 10,
	    "top_p": 0.95,
	    "typical_p": 0.95,
	    "temperature": 0.1,
	    "repetition_penalty": 1.03,
	}

	# LLM client bound to the foundation model repo
	llm_model_instance = HuggingFaceEndpoint(
	    repo_id=FM_REPO_ID,
	    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
	    **_llm_generation_settings,
	)

	# Embeddings client bound to the embedding model repo
	embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
	    model_name=EM_REPO_ID,
	    api_key=HUGGINGFACEHUB_API_TOKEN,
	)

	# Configuration handed to SmartScraperGraph: pre-built model instances
	graph_config = dict(
	    llm={"model_instance": llm_model_instance},
	    embeddings={"model_instance": embedder_model_instance},
	)

	# ------------------------------------------------------------------------------
	# THE BIG SCRAPER
	# ------------------------------------------------------------------------------

	def extractDataFromUrls(urls: str, loader_type: str) -> Tuple[list, list]:
	    """Extract documents from the given URLs with the selected loader.

	    Args:
	        urls (str): Comma-separated URLs. Whitespace around each entry is
	            ignored and empty entries are dropped.
	        loader_type (str): Name of the loader to use, e.g. 'AsyncHtmlLoader'.

	    Returns:
	        tuple: On success, ``(json_data, documents)`` where ``documents`` is
	        what the loader's ``load()`` returned and ``json_data`` holds the
	        ``to_json()`` value of each item. For an unsupported loader, missing
	        URLs, or any exception, a pair of human-readable message strings is
	        returned instead (nothing is raised).
	    """
	    try:
	        # Normalise the input: "a, b,," -> ["a", "b"]
	        url_list = [part.strip() for part in urls.split(',') if part.strip()]

	        # Loader name -> zero-argument factory. The factories are lazy so that
	        # only the selected loader is ever constructed.
	        loader_factories = {
	            'AsyncHtmlLoader': lambda: AsyncHtmlLoader(url_list),
	            'UnstructuredURL': lambda: UnstructuredURLLoader(urls=url_list),
	            # Crawls from the first URL only
	            'RecursiveURL': lambda: RecursiveUrlLoader(
	                url=url_list[0],
	                max_depth=2,
	                extractor=lambda x: Soup(x, "html.parser").text,
	            ),
	            'SeleniumURL': lambda: SeleniumURLLoader(urls=url_list),
	            'SeleniumURLH': lambda: SeleniumURLLoader(urls=url_list, headless=False),
	            'PlaywrightURL': lambda: PlaywrightURLLoader(urls=url_list),
	            # Only the first entry is passed to the PubMed loader
	            'PubMed': lambda: PubMedLoader(url_list[0]),
	            'NewsURL': lambda: NewsURLLoader(url_list),
	            'WebBaseLoader': lambda: WebBaseLoader(url_list),
	        }

	        make_loader = loader_factories.get(loader_type)
	        if make_loader is None:
	            return "Not Implemented. Development in Progress", "Work In Progress"

	        # Do not hand an empty URL list to a loader; report it explicitly.
	        if not url_list:
	            return "No URLs Provided. Enter at least one URL", "No URLs Provided"

	        documents = make_loader().load()
	        json_data = [document.to_json() for document in documents]
	        return json_data, documents

	    except Exception as err:
	        # UI callback boundary: report the failure as output instead of raising.
	        return "An Error Occurred. Contact Developer: " + str(err), "Error Occured. Boom"


	def scrapeAndSummarize(prompt: str, source: str) -> Tuple[dict, str]:
	    """Scrape a website guided by a prompt and return the result with run info.

	    Args:
	        prompt (str): The prompt to guide the scraping process.
	        source (str): The URL of the website to scrape.

	    Returns:
	        tuple: ``(result, execution_info)`` where ``result`` is what
	        ``SmartScraperGraph.run()`` returned and ``execution_info`` is the
	        prettified execution information. If the prompt or source is blank, or
	        any exception occurs, ``({"error": <message>}, <short status>)`` is
	        returned instead (nothing is raised).
	    """
	    # Tolerate None and surrounding whitespace coming from the UI fields.
	    prompt = (prompt or "").strip()
	    source = (source or "").strip()
	    if not prompt or not source:
	        return (
	            {"error": "Both a prompt and a source URL are required."},
	            "Nothing Executed. Missing Input",
	        )

	    try:
	        smart_scraper_graph = SmartScraperGraph(
	            prompt=prompt,
	            source=source,
	            config=graph_config,
	        )
	        result = smart_scraper_graph.run()
	        exec_info = smart_scraper_graph.get_execution_info()
	        return result, prettify_exec_info(exec_info)
	    except Exception as err:
	        # UI callback boundary: report the failure as output instead of raising,
	        # matching how extractDataFromUrls reports its errors.
	        return (
	            {"error": "An Error Occurred. Contact Developer: " + str(err)},
	            "Error Occurred",
	        )

	# ------------------------------------------------------------------------------
	# TABBED GRADIO UI
	# ------------------------------------------------------------------------------

	# Loader names offered in the dropdown. Names without a matching loader in
	# extractDataFromUrls yield its "Not Implemented" response.
	choices = [
	    'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed',
	    'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup',
	    'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL',
	]

	# Create the Gradio interface with tabs
	with gr.Blocks(theme="sudeepshouche/minimalist") as demo:
	    gr.Markdown("# THE BIG SCRAPER")

	    with gr.Tabs():
	        # Tab 1: Data Extraction
	        with gr.TabItem("Data Extraction"):
	            gr.Markdown("## Extract data from URLs using various loaders")
	            # NOTE(review): the source's indentation was lost; placing the
	            # button inside this row is an assumption - confirm the layout.
	            with gr.Row():
	                url_input = gr.Textbox(label="Enter your comma separated URLs here")
	                loader_dropdown = gr.Dropdown(choices=choices, label="Pick your Loader from here")
	                extract_button = gr.Button("Extract Data")
	            with gr.Row():
	                extracted_data_json = gr.JSON(label="Extracted Data (JSON)")
	                extracted_data_text = gr.Textbox(label="Extracted Data (Text)")
	            extract_button.click(
	                extractDataFromUrls,
	                inputs=[url_input, loader_dropdown],
	                outputs=[extracted_data_json, extracted_data_text],
	            )

	        # Tab 2: Website Scraping and Summarization
	        with gr.TabItem("Scraping & Summarization"):
	            with gr.Row():
	                with gr.Column():
	                    # Informational only: this field is not an input of the
	                    # scrape callback, so it is read-only and shows the model
	                    # that is actually configured (FM_REPO_ID).
	                    model_dropdown = gr.Textbox(
	                        label="Model",
	                        value=FM_REPO_ID.split("/")[-1],
	                        interactive=False,
	                    )
	                    prompt_input = gr.Textbox(label="Prompt", value="List me all the press releases with their headlines and urls.")
	                    source_input = gr.Textbox(label="Source URL", value="https://www.whitehouse.gov/")
	                    scrape_button = gr.Button("Scrape and Summarize")
	                with gr.Column():
	                    result_output = gr.JSON(label="Result")
	                    exec_info_output = gr.Textbox(label="Execution Info")

	            scrape_button.click(
	                scrapeAndSummarize,
	                inputs=[prompt_input, source_input],
	                outputs=[result_output, exec_info_output],
	            )

	# Launch the Gradio interface
	demo.launch()