Spaces:

hadadrjt
/

ai

Paused

App Files Files Community

hadadrjt committed on Jun 23

Commit

bc90a07

1 Parent(s): 76f7f20

ai: Switch to Docker container.

Browse files

* And use async for Deep Search.

Files changed (4) hide show

Dockerfile +27 -0
README.md +6 -5
requirements.txt +6 -0
src/tools/deep_search.py +95 -58

Dockerfile ADDED Viewed

	@@ -0,0 +1,27 @@

+# Use the latest personal Ubuntu image as the starting point
+FROM hadadrjt/ubuntu:latest
+# Set the user to root to have full permissions during build and runtime
+USER root
+# Set the working directory inside the container to /usr/src/app
+# All subsequent commands will be run in this directory
+WORKDIR /usr/src/app
+# Copy all files from the current directory on the host machine to the working directory in the container
+COPY . .
+# Install Python dependencies listed in requirements.txt without using cache to reduce image size
+RUN pip install --no-cache-dir -r requirements.txt
+# Expose port 7860 so that it can be accessed from outside the container
+EXPOSE 7860
+# Set an environment variable to configure the Gradio server to listen on all network interfaces
+ENV GRADIO_SERVER_NAME="0.0.0.0"
+# Clear any default entrypoint to allow CMD to run directly
+ENTRYPOINT []
+# Specify the default command to run the Python application when the container starts
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -2,12 +2,13 @@
 title: J.A.R.V.I.S.
 license: apache-2.0
 license_link: https://huggingface.co/hadadrjt/JARVIS/blob/main/LICENSE
-colorFrom: yellow
 colorTo: purple
-emoji: 🌍
-sdk: gradio
-sdk_version: 5.34.2
-app_file: app.py
 pinned: true
 short_description: Just a Rather Very Intelligent System
 models:

 title: J.A.R.V.I.S.
 license: apache-2.0
 license_link: https://huggingface.co/hadadrjt/JARVIS/blob/main/LICENSE
+colorFrom: green
 colorTo: purple
+emoji: ⚡
+thumbnail: >-
+  https://cdn-uploads.huggingface.co/production/uploads/67b426629ec6943726101b92/ptiPI3_NVFdo2yaDtpvH3.jpeg
+sdk: docker
+app_port: 7860
 pinned: true
 short_description: Just a Rather Very Intelligent System
 models:

requirements.txt CHANGED Viewed

@@ -1,4 +1,10 @@
 anyio
 pydantic
 starlette
 uvicorn

+aiohttp[speedups]
 anyio
+gradio
+httpx
+httpx[http2]
 pydantic
+Pygments
 starlette
+urllib3
 uvicorn

src/tools/deep_search.py CHANGED Viewed

@@ -3,22 +3,24 @@
 # SPDX-License-Identifier: Apache-2.0
 #
-import requests  # Import the requests library to perform HTTP requests synchronously
 from src.utils.ip_generator import generate_ip  # Import function to generate random IP addresses for request headers
-# Define a class named SearchTools to encapsulate functionalities related to deep search
 class SearchTools:
     # This class provides methods to connect to the web
     """
-    A class providing tools to perform web searches and read content from URLs using various search engines
-    and a reader API service.
     Attributes:
-        searxng_url (str): Base URL for the SearXNG search proxy service.
-        baidu_url (str): Base URL for Baidu search engine.
-        timeout (int): Timeout duration in seconds for HTTP requests.
-        reader_api (str): Base URL for the reader API service used to extract content from URLs.
     Methods:
         read_url(url): Asynchronously reads and returns the textual content of the specified URL using the reader API.
@@ -26,73 +28,108 @@ class SearchTools:
                                returning the raw HTML response text.
     """
     def __init__(self):
         """
         Initialize the SearchTools instance with predefined URLs and timeout settings.
         """
-        self.searxng_url = "https://paulgo.io/search"  # URL for the SearXNG search proxy service
-        self.baidu_url = "https://www.baidu.com/s"  # URL for Baidu search engine
-        self.timeout = 30  # Timeout in seconds for HTTP requests to avoid long hanging connections
-        self.reader_api = "https://r.jina.ai/"  # Reader API endpoint to extract readable content from URLs
     async def read_url(self, url: str) -> str:
         """
-        Asynchronously read and retrieve the textual content of a given URL using the reader API.
         Args:
-            url (str): The URL of the webpage to read content from.
         Returns:
-            str: The textual content extracted from the URL if successful.
-            None: If the request fails or an exception occurs.
         """
-        try:
-            data = {"url": url}  # Prepare POST data with the target URL
-            # Send a synchronous POST request to the reader API with the URL data and timeout
-            response = requests.post(self.reader_api, data=data, timeout=self.timeout)
-            response.raise_for_status()  # Raise an exception if the response status is an HTTP error
-            return response.text  # Return the textual content of the response
-        except Exception:
-            # Return None if any error occurs during the request or response processing
-            return None
     async def search(self, query: str, engine: str = "google") -> str:
         """
-        Asynchronously perform a web search for the given query using the specified search engine.
         Args:
-            query (str): The search query string.
-            engine (str, optional): The search engine to use. Supported values are "google" and "baidu".
-                                    Defaults to "google".
         Returns:
-            str: The raw HTML content of the search results page if successful.
-            None: If the request fails or an exception occurs.
         """
-        try:
-            if engine == "baidu":
-                # Construct the URL for Baidu search by appending the query parameter 'wd' with the search term
-                url = f"{self.reader_api}{self.baidu_url}?wd={query}"
-                # Set the HTTP header to target the main content container of Baidu search results
-                headers = {
-                    "X-Target-Selector": "#content_left",
-                    "X-Forwarded-For": generate_ip()  # Random IP address to simulate different client origins
-                }
-            else:
-                # For Google or other engines, define a prefix for the search command (!go for Google, !bi for Bing)
-                prefix = "!go" if engine == "google" else "!bi"
-                # Construct the URL for SearXNG search proxy with the prefixed query
-                url = f"{self.reader_api}{self.searxng_url}?q={prefix} {query}"
-                # Set the HTTP header to target the URLs container in the search results
-                headers = {
-                    "X-Target-Selector": "#urls",
-                    "X-Forwarded-For": generate_ip()  # Random IP address to simulate different client origins
-                }
-            # Send a synchronous GET request to the constructed URL with headers and timeout
-            response = requests.get(url, headers=headers, timeout=self.timeout)
-            response.raise_for_status()  # Raise an exception if the response status is an HTTP error
-            return response.text  # Return the raw HTML content of the search results
-        except Exception:
-            # Return None if any error occurs during the request or response processing
-            return None

 # SPDX-License-Identifier: Apache-2.0
 #
+import aiohttp  # Import the aiohttp library to perform asynchronous HTTP requests
+import asyncio  # Import asyncio library to handle asynchronous operations and implement delay mechanisms
 from src.utils.ip_generator import generate_ip  # Import function to generate random IP addresses for request headers
+# Define the main SearchTools class that provides web searching and URL reading capabilities
 class SearchTools:
     # This class provides methods to connect to the web
     """
+    A comprehensive class providing tools to perform web searches and read content from URLs using various search engines
+    and a reader API service. This class implements full asynchronous operations with robust retry mechanisms to ensure
+    connections remain active even when encountering errors.
     Attributes:
+        searxng_url (str): Base URL for the SearXNG search proxy service that handles Google and other search engines.
+        baidu_url (str): Base URL for Baidu search engine for Chinese language searches.
+        timeout (int): Timeout duration in seconds for HTTP requests to prevent indefinite hanging.
+        reader_api (str): Base URL for the reader API service used to extract clean content from URLs.
     Methods:
         read_url(url): Asynchronously reads and returns the textual content of the specified URL using the reader API.
                                returning the raw HTML response text.
     """
+    # Constructor method to initialize the SearchTools instance with all necessary configuration values
     def __init__(self):
         """
         Initialize the SearchTools instance with predefined URLs and timeout settings.
+        This method sets up all the base URLs and configuration parameters needed for web searching and content reading.
         """
+        # Set the base URL for SearXNG search proxy service which provides access to multiple search engines
+        self.searxng_url = "https://paulgo.io/search"
+        # Set the base URL for Baidu search engine for handling Chinese language queries
+        self.baidu_url = "https://www.baidu.com/s"
+        # Set timeout duration to 30 seconds to balance between allowing slow responses and preventing infinite waits
+        self.timeout = 30
+        # Set the reader API endpoint that converts web pages into clean, readable text format
+        self.reader_api = "https://r.jina.ai/"
+    # Private helper method that implements the core retry logic for all HTTP requests
+    async def _fetch_with_retry(self, session, method, url, **kwargs):
+        """
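+        Helper method to perform HTTP requests, retrying indefinitely until a successful response is obtained.
+        Every exception (including HTTP error statuses raised by raise_for_status) triggers another attempt after a
+        5-second pause, so this coroutine never returns if the failure is permanent (for example, an HTTP 404).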
+        Args:
+            session (aiohttp.ClientSession): The aiohttp session object to use for making HTTP requests.
+            method (str): HTTP method to use for the request (e.g., 'get', 'post', 'put', 'delete').
+            url (str): The complete URL to send the request to.
+            **kwargs: Additional keyword arguments to pass to the aiohttp request method (headers, data, etc.).
+        Returns:
+            str: The response text content when a successful request is finally achieved.
+        """
+        # Create an infinite loop that will only break when a successful response is received
+        while True:
+            # Use a try-except block to catch any type of exception that might occur during the request
+            try:
+                # Make the actual HTTP request using the provided session, method, URL and additional arguments
+                async with session.request(method, url, **kwargs) as response:
+                    # Check if the response status indicates success, raise exception if it's an error status
+                    response.raise_for_status()
+                    # Return the text content of the successful response
+                    return await response.text()
+            # Catch any exception that occurs during the request process
+            except Exception:
+                # Retry on any exception without stopping the loop or raising the error
+                # Wait for 5 seconds before attempting the next retry to avoid overwhelming the server
+                await asyncio.sleep(5)
+    # Public method to read and extract content from any given URL
     async def read_url(self, url: str) -> str:
         """
+        Asynchronously read and retrieve the textual content of a given URL using the reader API with infinite retry.
+        This method will keep trying until it successfully retrieves the content from the specified URL.
         Args:
+            url (str): The complete URL of the webpage to read content from.
         Returns:
+            str: The clean textual content extracted from the URL by the reader API service.
         """
+        # Prepare the POST data payload containing the target URL for the reader API
+        data = {"url": url}
+        # Create an aiohttp client session with the configured timeout settings
+        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
+            # Use the retry helper method to POST the URL to the reader API and get the content
+            return await self._fetch_with_retry(session, 'post', self.reader_api, data=data)
+    # Public method to perform web searches using different search engines
     async def search(self, query: str, engine: str = "google") -> str:
         """
+        Asynchronously perform a web search for the given query using the specified search engine with infinite retry.
+        This method will keep trying until it successfully retrieves search results from the chosen search engine.
         Args:
+            query (str): The search query string containing the terms to search for.
+            engine (str, optional): The search engine to use for the search. "baidu" targets the Baidu URL and "google"
+                                    uses the SearXNG "!go" prefix; any other value falls back to the "!bi" prefix.
+                                    Defaults to "google" if not specified.
         Returns:
+            str: The raw HTML content of the search results page from the specified search engine.
         """
+        # Check if the user wants to use Baidu search engine for the query
+        if engine == "baidu":
+            # Construct the full URL by combining reader API, Baidu URL and the search query parameter
+            url = f"{self.reader_api}{self.baidu_url}?wd={query}"
+            # Set HTTP headers specific to Baidu search results extraction
+            headers = {
+                # Target the main content container where Baidu displays search results
+                "X-Target-Selector": "#content_left",
+                "X-Forwarded-For": generate_ip()  # Random IP address to simulate different client origins
+            }
+        # Handle all other search engines (Google, Bing, etc.) through SearXNG proxy
+        else:
+            # Determine the search prefix based on the requested engine (Google or Bing)
+            prefix = "!go" if engine == "google" else "!bi"
+            # Construct the full URL by combining reader API, SearXNG URL, prefix and query
+            url = f"{self.reader_api}{self.searxng_url}?q={prefix} {query}"
+            # Set HTTP headers specific to SearXNG search results extraction
+            headers = {
+                # Target the URLs container where SearXNG displays search result links
+                "X-Target-Selector": "#urls",
+                "X-Forwarded-For": generate_ip()  # Random IP address to simulate different client origins
+            }
+        # Create an aiohttp client session with the configured timeout settings
+        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
+            # Use the retry helper method to GET the search results and return the HTML content
+            return await self._fetch_with_retry(session, 'get', url, headers=headers)