Spaces:
Sleeping
Sleeping
| import os | |
| from dotenv import load_dotenv | |
| from scrapegraphai.graphs import SmartScraperGraph | |
| from scrapegraphai.utils import prettify_exec_info | |
| from langchain_community.llms import HuggingFaceEndpoint | |
| from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings | |
| import gradio as gr | |
| import subprocess | |
| import json | |
# Install the browsers Playwright needs before any scraping is attempted.
# The return code is not inspected, so a failed install does not stop startup.
subprocess.run(["playwright", "install"])
# subprocess.run(["playwright", "install-deps"])

# Pull settings from a local .env file (if any) into the process environment.
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

# Hosted text-generation model used by the scraper graph.
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_length=128,
    temperature=0.3,
    token=HUGGINGFACEHUB_API_TOKEN,
)

# Hosted embedding model used by the scraper graph.
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN,
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)

# Configuration handed to every SmartScraperGraph built by this app.
graph_config = {
    "llm": {
        "model_instance": llm_model_instance,
    },
    "embeddings": {
        "model_instance": embedder_model_instance,
    },
}
def _parse_json_output(raw):
    """Coerce the scraper's raw output into a JSON-compatible Python value.

    Args:
        raw: The value returned by the scraper graph. Dicts and lists are
            returned untouched; ``str``/``bytes`` are parsed as JSON.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If ``raw`` is not a supported type, or no JSON document
            can be decoded from it.
    """
    # Already-structured results need no parsing.
    if isinstance(raw, (dict, list)):
        return raw
    if isinstance(raw, (bytes, bytearray)):
        raw = raw.decode("utf-8", errors="replace")
    if not isinstance(raw, str):
        raise ValueError(f"Invalid JSON output: {raw}")

    try:
        return json.loads(raw)
    except json.JSONDecodeError as exc:
        last_error = exc

    # The model may wrap the JSON in prose. Collect the widest "[...]" and
    # "{...}" spans; `end > start` rejects a closer that precedes its opener.
    spans = []
    for opener, closer in (("[", "]"), ("{", "}")):
        start = raw.find(opener)
        end = raw.rfind(closer)
        if start != -1 and end > start:
            spans.append((start, end))

    # Try the span that opens earliest first so the outermost structure wins
    # (an object that contains an array is returned whole, not just the array).
    for start, end in sorted(spans):
        try:
            return json.loads(raw[start:end + 1])
        except json.JSONDecodeError as exc:
            last_error = exc

    raise ValueError(f"Invalid JSON output: {raw}") from last_error


def scrape_and_summarize(prompt, source):
    """Run a SmartScraperGraph for ``prompt`` on ``source`` and return JSON.

    Args:
        prompt: Natural-language description of what to extract.
        source: Source passed straight to ``SmartScraperGraph`` (the UI
            supplies a URL).

    Returns:
        The graph's result as a JSON-compatible value. Dict and list results
        are returned as-is; textual results are parsed, falling back to the
        first bracketed JSON array or object embedded in the text.

    Raises:
        ValueError: If the graph's output cannot be interpreted as JSON.
    """
    smart_scraper_graph = SmartScraperGraph(
        prompt=prompt,
        source=source,
        config=graph_config,
    )
    return _parse_json_output(smart_scraper_graph.run())
# Gradio UI: a prompt/URL form on the left, the JSON result on the right.
with gr.Blocks() as demo:
    gr.Markdown("<h1>Websites Scraper using Mistral AI</h1>")
    gr.Markdown(
        """This is a no code ML app for scraping <br> 1. Just provide the Prompt, ie., the items you wanna Scrap from the website <br> 2. Provide the url for the site you wanna Scrap, click Generate<br> And BOOM 💥 you can copy the result and view the execution details in the right side pannel """
    )

    with gr.Row():
        # Left column: user inputs and the trigger button.
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Prompt",
                value="List me all the hospital or clinic names and their opening closing time, if the mobile number is present provide it too.",
            )
            source_input = gr.Textbox(
                label="Source URL",
                value="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist",
            )
            scrape_button = gr.Button("Generate")

        # Right column: scraped result rendered as JSON.
        with gr.Column():
            result_output = gr.JSON(label="Result")

    # Clicking the button sends both textbox values to the scraper.
    scrape_button.click(
        scrape_and_summarize,
        inputs=[prompt_input, source_input],
        outputs=[result_output],
    )

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()