Spaces:

Shreyas094
/

SearXNG-Engine

Running

App Files Community

Shreyas094 committed on Sep 27, 2024

Commit

2769ea6

verified ·

1 Parent(s): 426506c

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -13

app.py CHANGED Viewed

@@ -2,10 +2,34 @@ import gradio as gr
 import requests
 import time
 import random
-def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10):
     """
-    Perform a search using the Searx API with error handling, retry logic, and limited results.
     """
     search_endpoint = f"{instance_url}/search"
     params = {
@@ -16,7 +40,7 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
         'time_range': '',
         'engines': '',
         'safesearch': '0',
-        'results': str(num_results)  # Limit the number of results
     }
     headers = {
@@ -42,12 +66,14 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
             for idx, result in enumerate(data['results'][:num_results], start=1):
                 title = result.get('title', 'No Title')
                 url = result.get('url', 'No URL')
-                snippet = result.get('content', 'No Description')
-                # Try to get a longer snippet if available
-                long_content = result.get('long_content', snippet)
-                formatted_results += f"**{idx}. {title}**\n[{url}]({url})\n{long_content}\n\n"
             return formatted_results
         except requests.exceptions.RequestException as e:
@@ -64,9 +90,9 @@ def create_gradio_interface():
     Creates and returns the Gradio interface.
     """
     with gr.Blocks() as demo:
-        gr.Markdown("# 🕵️‍♂️ Private Search with Searx and Gradio")
         gr.Markdown(
-            "This application allows you to perform private searches using the [Searx](https://searx.org/) metasearch engine."
         )
         with gr.Row():
             with gr.Column():
@@ -94,23 +120,25 @@ def create_gradio_interface():
                     step=1,
                     label="Number of Results"
                 )
                 search_button = gr.Button("Search")
             with gr.Column():
                 results = gr.Markdown("### Search Results will appear here...")
-        def perform_search(q, url, cats, num):
-            return search_searx(q, instance_url=url, categories=cats, num_results=int(num))
         search_button.click(
             perform_search,
-            inputs=[query, instance_url, categories, num_results],
             outputs=results
         )
         gr.Markdown(
             """
             ---
-            **Note:** This application uses the Searx metasearch engine to fetch results from multiple sources while preserving your privacy.
             """
         )

 import requests
 import time
 import random
+from bs4 import BeautifulSoup
+import trafilatura
+def extract_content_bs4(url):
+    try:
+        response = requests.get(url, timeout=10)
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # This is a simple extraction and might need to be adjusted based on the structure of the websites you're scraping
+        paragraphs = soup.find_all('p')
+        content = ' '.join([p.text for p in paragraphs])
+        return content[:1000] + "..." if len(content) > 1000 else content
+    except Exception as e:
+        return f"Error extracting content: {str(e)}"
+def extract_content_trafilatura(url):
+    try:
+        downloaded = trafilatura.fetch_url(url)
+        content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
+        return content[:1000] + "..." if content and len(content) > 1000 else content
+    except Exception as e:
+        return f"Error extracting content: {str(e)}"
+def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10, use_trafilatura=False):
     """
+    Perform a search using the Searx API with error handling, retry logic, limited results, and content extraction.
     """
     search_endpoint = f"{instance_url}/search"
     params = {
         'time_range': '',
         'engines': '',
         'safesearch': '0',
+        'results': str(num_results)
     }
     headers = {
             for idx, result in enumerate(data['results'][:num_results], start=1):
                 title = result.get('title', 'No Title')
                 url = result.get('url', 'No URL')
+                # Extract content using the selected method
+                if use_trafilatura:
+                    content = extract_content_trafilatura(url)
+                else:
+                    content = extract_content_bs4(url)
+                formatted_results += f"**{idx}. {title}**\n[{url}]({url})\n{content}\n\n"
             return formatted_results
         except requests.exceptions.RequestException as e:
     Creates and returns the Gradio interface.
     """
     with gr.Blocks() as demo:
+        gr.Markdown("# 🕵️‍♂️ Private Search with Searx and Content Extraction")
         gr.Markdown(
+            "This application allows you to perform private searches using the [Searx](https://searx.org/) metasearch engine and extract content from the results."
         )
         with gr.Row():
             with gr.Column():
                     step=1,
                     label="Number of Results"
                 )
+                use_trafilatura = gr.Checkbox(label="Use Trafilatura for extraction (instead of BeautifulSoup)")
                 search_button = gr.Button("Search")
             with gr.Column():
                 results = gr.Markdown("### Search Results will appear here...")
+        def perform_search(q, url, cats, num, use_traf):
+            return search_searx(q, instance_url=url, categories=cats, num_results=int(num), use_trafilatura=use_traf)
         search_button.click(
             perform_search,
+            inputs=[query, instance_url, categories, num_results, use_trafilatura],
             outputs=results
         )
         gr.Markdown(
             """
             ---
+            **Note:** This application uses the Searx metasearch engine to fetch results from multiple sources while preserving your privacy.
+            It then attempts to extract content from the original sources, which may be subject to the terms of service of those websites.
             """
         )