Spaces:

IamVicky111
/

MistralScrapy

Build error

App Files Files Community

IamVicky111 committed on Aug 13, 2024

Commit

d2eaed3

verified ·

1 Parent(s): 028a1e6

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -37

app.py CHANGED Viewed

@@ -1,29 +1,47 @@
 import os
 from dotenv import load_dotenv
 from scrapegraphai.graphs import SmartScraperGraph
 from scrapegraphai.utils import prettify_exec_info
 from langchain_community.llms import HuggingFaceEndpoint
 from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
-import gradio as gr
-import subprocess
-import json
 # Ensure Playwright installs required browsers and dependencies
-subprocess.run(["playwright", "install"])
-#subprocess.run(["playwright", "install-deps"])
 # Load environment variables
 load_dotenv()
 HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
 # Initialize the model instances
-repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
 llm_model_instance = HuggingFaceEndpoint(
-    repo_id=repo_id, max_length=128, temperature=0.3, token=HUGGINGFACEHUB_API_TOKEN
 )
 embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
-    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
 )
 graph_config = {
@@ -31,55 +49,84 @@ graph_config = {
     "embeddings": {"model_instance": embedder_model_instance}
 }
-def scrape_and_summarize(prompt, source):
     smart_scraper_graph = SmartScraperGraph(
         prompt=prompt,
         source=source,
         config=graph_config
     )
     result = smart_scraper_graph.run()
     # Ensure the result is properly formatted as JSON
     if isinstance(result, dict):
-        result_json = result
-    else:
-        try:
-            result_json = json.loads(result)
-        except json.JSONDecodeError as e:
-            # Attempt to extract JSON from the result
-            start_index = result.find("[")
-            end_index = result.rfind("]")
-            if start_index != -1 and end_index != -1:
-                json_str = result[start_index:end_index+1]
-                try:
-                    result_json = json.loads(json_str)
-                except json.JSONDecodeError as inner_e:
-                    raise ValueError(f"Invalid JSON output: {result}") from inner_e
-            else:
-                raise ValueError(f"Invalid JSON output: {result}") from e
-    return result_json
-# Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("<h1>Websites Scraper using Mistral AI</h1>")
-    gr.Markdown("""This is a no code ML app for scraping <br> 1. Just provide the Prompt, ie., the items you wanna Scrap from the website <br> 2. Provide the url for the site you wanna Scrap, click Generate<br> And BOOM 💥 you can copy the result and view the execution details in the right side pannel """)
     with gr.Row():
         with gr.Column():
-            prompt_input = gr.Textbox(label="Prompt", value="List me all the hospital or clinic names and their opening closing time, if the mobile number is present provide it too.")
-            source_input = gr.Textbox(label="Source URL", value="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist")
             scrape_button = gr.Button("Generate")
         with gr.Column():
             result_output = gr.JSON(label="Result")
     scrape_button.click(
         scrape_and_summarize,
         inputs=[prompt_input, source_input],
         outputs=[result_output]
     )
-# Launch the Gradio app
 if __name__ == "__main__":
     demo.launch()

+"""
+Web Scraper and Summarizer using Mistral AI.
+This module provides a Gradio-based web application for scraping websites
+and summarizing content using the Mistral AI language model. It allows users
+to input a prompt and a source URL, then generates a JSON output of the
+scraped and summarized information.
+Developer: Vicky_111
+LinkedIn: https://www.linkedin.com/in/itz-me-vicky111/
+"""
 import os
+import json
+import subprocess
+from typing import Dict, Any
+import gradio as gr
 from dotenv import load_dotenv
 from scrapegraphai.graphs import SmartScraperGraph
 from scrapegraphai.utils import prettify_exec_info
 from langchain_community.llms import HuggingFaceEndpoint
 from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
 # Ensure Playwright installs required browsers and dependencies
+subprocess.run(["playwright", "install"], check=True)
+# subprocess.run(["playwright", "install-deps"])
 # Load environment variables
 load_dotenv()
 HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
 # Initialize the model instances
+REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"
 llm_model_instance = HuggingFaceEndpoint(
+    repo_id=REPO_ID,
+    max_length=128,
+    temperature=0.3,
+    token=HUGGINGFACEHUB_API_TOKEN
 )
+# Embed using the Hugging Face Inference API embeddings
 embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
+    api_key=HUGGINGFACEHUB_API_TOKEN,
+    model_name="sentence-transformers/all-MiniLM-l6-v2"
 )
 graph_config = {
     "embeddings": {"model_instance": embedder_model_instance}
 }
+# Scrape and summarise the content using the smart scraper graph
+def scrape_and_summarize(prompt: str, source: str) -> Dict[str, Any]:
+    """
+    Scrape a website and summarize its content based on a given prompt.
+    This function uses the SmartScraperGraph to scrape the provided URL
+    and generate a summary based on the given prompt. It ensures the output
+    is in a valid JSON format.
+    Args:
+        prompt (str): The prompt to guide the scraping and summarization.
+        source (str): The URL of the website to scrape.
+    Returns:
+        Dict[str, Any]: A JSON-formatted dictionary containing the scraped
+        and summarized information.
+    Raises:
+        ValueError: If the output cannot be parsed as valid JSON.
+    """
     smart_scraper_graph = SmartScraperGraph(
         prompt=prompt,
         source=source,
         config=graph_config
     )
     result = smart_scraper_graph.run()
     # Ensure the result is properly formatted as JSON
     if isinstance(result, dict):
+        return result
+    try:
+        return json.loads(result)
+    except json.JSONDecodeError as e:
+        # Attempt to extract JSON from the result
+        start_index = result.find("[")
+        end_index = result.rfind("]")
+        if start_index != -1 and end_index != -1:
+            json_str = result[start_index:end_index+1]
+            try:
+                return json.loads(json_str)
+            except json.JSONDecodeError as inner_e:
+                raise ValueError(f"Invalid JSON output: {result}") from inner_e
+        else:
+            raise ValueError(f"Invalid JSON output: {result}") from e
+# Gradio User interface
 with gr.Blocks() as demo:
     gr.Markdown("<h1>Websites Scraper using Mistral AI</h1>")
+    gr.Markdown("""
+    This is a no code ML app for scraping <br>
+    1. Just provide the Prompt, i.e., the items you want to scrape from the website <br>
+    2. Provide the URL for the site you want to scrape, click Generate<br>
+    And BOOM 💥 you can copy the result and view the execution details in the right side panel
+    """)
     with gr.Row():
         with gr.Column():
+            prompt_input = gr.Textbox(
+                label="Prompt",
+                value="List me all the hospital or clinic names and their opening closing time, if the mobile number is present provide it too."
+            )
+            source_input = gr.Textbox(
+                label="Source URL",
+                value="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist"
+            )
             scrape_button = gr.Button("Generate")
         with gr.Column():
             result_output = gr.JSON(label="Result")
     scrape_button.click(
         scrape_and_summarize,
         inputs=[prompt_input, source_input],
         outputs=[result_output]
     )
 if __name__ == "__main__":
     demo.launch()