Upload 7 files

- Dockerfile +48 -0
- README.md +129 -12
- main.py +80 -0
- pyproject.toml +20 -0
- requirements.txt +3 -0
- search.py +99 -0
- website_viewer.py +64 -0
Dockerfile
ADDED
@@ -0,0 +1,48 @@

```dockerfile
# Use Python 3.11 as the base image
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system libraries required by Playwright's headless Chromium
RUN apt-get update && apt-get install -y \
    gcc \
    libglib2.0-0 \
    libnss3 \
    libnspr4 \
    libdbus-1-3 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libcups2 \
    libdrm2 \
    libxkbcommon0 \
    libatspi2.0-0 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxrandr2 \
    libgbm1 \
    libpango-1.0-0 \
    libcairo2 \
    libasound2 \
    && rm -rf /var/lib/apt/lists/*

# Copy the application code first so the editable install below
# can see the local modules declared in pyproject.toml
COPY . .

# Install Python dependencies
RUN pip install -e .

# Install Playwright browsers
RUN playwright install

# Set environment variables
ENV PYTHONUNBUFFERED=1

# Expose the port the app runs on
EXPOSE 8000

# Command to run the FastAPI application with uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
```
README.md
CHANGED
@@ -1,12 +1,129 @@
# Search Engine API

A FastAPI-based search engine that uses DuckDuckGo's lite search interface.

## Setup

1. Install the required dependencies:
```bash
pip install -r requirements.txt
```

2. Run the API server:
```bash
uvicorn main:app --reload
```

The API will be available at `http://localhost:8000`

## API Documentation

Once the server is running, you can access the interactive API documentation at:
- Swagger UI: `http://localhost:8000/` (the app serves Swagger at the root path)
- ReDoc: `http://localhost:8000/redoc`

## API Endpoints

### POST /search

Search for content using a search phrase.

Request body:
```json
{
    "search_phrase": "your search query"
}
```

Response:
```json
{
    "results": {
        "Item_1": {
            "title": "Result title",
            "snippet": "Result snippet",
            "linkText": "Link text"
        },
        // ... more items
    }
}
```
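For example, the endpoint can be called from Python (a minimal sketch using the `requests` library, which is not among this project's pinned dependencies, and assuming the server runs locally on port 8000):

```python
import requests

# Query the /search endpoint on a locally running server
response = requests.post(
    "http://localhost:8000/search",
    json={"search_phrase": "open source search engines"},
)
response.raise_for_status()

# Results are keyed Item_1, Item_2, ... as described above
for key, item in response.json()["results"].items():
    print(f"{key}: {item['title']} ({item['linkText']})")
```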
### POST /websiteView

View and browse a website's content in Markdown format.

Request body:
```json
{
    "url": "https://example.com"
}
```

Response:
```json
{
    "title": "Website Title",
    "markdown": "# Website Title\n\nMain content in Markdown format...\n\n## Links\n\n- [Link text](https://example.com/link)\n\n## Images\n\n![Image description](https://example.com/image.jpg)",
    "links": [
        {
            "text": "Link text",
            "url": "https://example.com/link",
            "markdown": "[Link text](https://example.com/link)"
        }
    ],
    "images": [
        {
            "src": "https://example.com/image.jpg",
            "alt": "Image description",
            "markdown": "![Image description](https://example.com/image.jpg)"
        }
    ],
    "url": "https://example.com"
}
```
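The returned `markdown` field can be inspected directly; a sketch under the same local-server assumption:

```python
import requests

# Fetch a page as structured Markdown via the local API server
response = requests.post(
    "http://localhost:8000/websiteView",
    json={"url": "https://example.com"},
)
response.raise_for_status()

page = response.json()
print(page["title"])
print(page["markdown"])  # the page content rendered as Markdown
```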
### POST /searchWithContent

Search for content and retrieve the full content of the top N results.

Request body:
```json
{
    "search_phrase": "your search query",
    "top_n": 5
}
```

Response:
```json
{
    "results": {
        "Item_1": {
            "title": "Result title",
            "snippet": "Result snippet",
            "linkText": "https://example.com",
            "content": {
                "title": "Website Title",
                "markdown": "# Website Title\n\nMain content in Markdown format...",
                "links": [...],
                "images": [...],
                "url": "https://example.com"
            }
        },
        // ... more items up to top_n
    }
}
```

The `content` field for each result contains the full website content in Markdown format, including:
- Title as a level 1 heading
- Main content
- Links section with Markdown-formatted links
- Images section with Markdown-formatted images
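Putting it together, a sketch that fetches the top three results with their content (same local-server assumption as above); results whose pages could not be fetched carry an `error` field instead of `content`:

```python
import requests

# Search and pull full page content for the top 3 hits
response = requests.post(
    "http://localhost:8000/searchWithContent",
    json={"search_phrase": "python web scraping", "top_n": 3},
)
response.raise_for_status()

for key, item in response.json()["results"].items():
    content = item.get("content")
    if content:
        print(f"{key}: {content['title']} ({len(content['markdown'])} chars of Markdown)")
    else:
        print(f"{key}: {item.get('error', 'no content returned')}")
```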
## Error Handling

The API returns appropriate HTTP status codes and error messages on failure:
- 422: Validation error (invalid request body)
- 500: Internal server error
main.py
ADDED
@@ -0,0 +1,80 @@

```python
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Dict, List

from search import search_n_browse, search_with_content
from website_viewer import fetch_website_content


app = FastAPI(
    title="VerboAI Search Engine API",
    description="""
VerboAI Search Engine API provides three core functionalities:
1. Web search with browsing capabilities
2. Website content extraction with structured data (title, markdown, links, and images)
3. Content-based search that returns the top N most relevant results from websites

The API is built with FastAPI, supports CORS, and is ready for cloud deployment. Powered by VerboAI.
""",
    version="1.0.0",
    docs_url="/",
    redoc_url="/redoc",
)

# Enable CORS for all origins
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class SearchRequest(BaseModel):
    search_phrase: str


class SearchResponse(BaseModel):
    results: dict


class WebsiteViewRequest(BaseModel):
    url: str


class WebsiteViewResponse(BaseModel):
    title: str
    markdown: str
    links: Dict[str, List[Dict]] = {}
    images: Dict[str, List[Dict]] = {}
    url: str


class SearchWithContentRequest(BaseModel):
    search_phrase: str
    top_n: int = 5


class SearchWithContentResponse(BaseModel):
    results: dict


@app.post("/search", response_model=SearchResponse)
async def search(request: SearchRequest):
    try:
        results = await search_n_browse(request.search_phrase)
        return SearchResponse(results=results)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/websiteView", response_model=WebsiteViewResponse)
async def view_website(request: WebsiteViewRequest):
    try:
        website_content = await fetch_website_content(request.url)
        return WebsiteViewResponse(**website_content)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/searchWithContent", response_model=SearchWithContentResponse)
async def search_with_content_endpoint(request: SearchWithContentRequest):
    try:
        results = await search_with_content(request.search_phrase, request.top_n)
        return SearchWithContentResponse(results=results)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
```
pyproject.toml
ADDED
@@ -0,0 +1,20 @@

```toml
[project]
name = "search-engine-aapi"
version = "0.1.0"
description = "A search engine API"
authors = [
    { name = "Your Name", email = "your.email@example.com" }
]
dependencies = [
    "fastapi==0.109.2",
    "uvicorn==0.27.1",
    "crawl4ai==0.3.3",
    # Imported directly by search.py and website_viewer.py,
    # so declared explicitly rather than relied on transitively
    "aiohttp",
    "beautifulsoup4",
]
requires-python = ">=3.9,<3.13"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
# The application modules live at the repository root
# (there is no src/search_engine_aapi package directory)
include = ["main.py", "search.py", "website_viewer.py"]
```
requirements.txt
ADDED
@@ -0,0 +1,3 @@

```text
fastapi==0.109.2
uvicorn==0.27.1
crawl4ai==0.3.3
# Imported directly by search.py and website_viewer.py
aiohttp
beautifulsoup4
```
search.py
ADDED
@@ -0,0 +1,99 @@

```python
import aiohttp
from bs4 import BeautifulSoup

from website_viewer import fetch_website_content


async def search_n_browse(search_phrase: str) -> dict:
    """Query DuckDuckGo's lite interface and return parsed results."""
    url = "https://lite.duckduckgo.com/lite/"
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
    }
    data = {
        "q": search_phrase,
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(url, data=data, headers=headers) as response:
                if response.status == 200:
                    body = await response.text()
                    soup = BeautifulSoup(body, 'html.parser')

                    # Extract result titles, snippets, and displayed link texts
                    titles = [a.text.strip() for a in soup.select('tr > td > a')]
                    snippets = [td.text.strip() for td in soup.select('td.result-snippet')]
                    link_texts = [span.text for span in soup.select('td > span.link-text')]

                    # Assemble results keyed as Item_1, Item_2, ...
                    json_data = {}
                    if snippets:
                        for index, snippet in enumerate(snippets):
                            json_data[f"Item_{index + 1}"] = {
                                "title": titles[index] if index < len(titles) else "",
                                "snippet": snippet,
                                "linkText": link_texts[index] if index < len(link_texts) else "",
                            }
                    else:
                        json_data["Result"] = {
                            "title": "No results were found.",
                            "snippet": "Our search engine could not find what you are looking for.",
                            "linkText": "Thank you for using our search engine.",
                        }

                    return json_data
                else:
                    raise Exception(f"Search API error. Status code: {response.status}")
    except Exception as e:
        raise Exception(f"Error during search: {str(e)}") from e


async def search_with_content(search_phrase: str, top_n: int = 5) -> dict:
    """
    Search for content and retrieve the content of the top N results.
    """
    try:
        # First, get the search results
        search_results = await search_n_browse(search_phrase)

        # Process only the top N results
        top_results = {}
        count = 0

        for key, result in search_results.items():
            if key.startswith("Item_") and count < top_n:
                try:
                    # The displayed link text holds the result URL
                    url = result.get("linkText", "")
                    if url:
                        # Fetch the content of the page
                        content = await fetch_website_content(url)

                        # Attach the fetched content to the result
                        top_results[key] = {
                            "title": result.get("title", ""),
                            "snippet": result.get("snippet", ""),
                            "linkText": result.get("linkText", ""),
                            "content": content,
                        }
                        count += 1
                except Exception as e:
                    # If the content cannot be fetched, still include the basic result
                    top_results[key] = {
                        "title": result.get("title", ""),
                        "snippet": result.get("snippet", ""),
                        "linkText": result.get("linkText", ""),
                        "error": f"Could not fetch content: {str(e)}",
                    }
                    count += 1

        return top_results
    except Exception as e:
        raise Exception(f"Error during search with content: {str(e)}") from e
```
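For local testing outside FastAPI, the search function can be driven directly (a minimal sketch; the query string is just an example):

```python
import asyncio

from search import search_n_browse

async def main():
    # Run a one-off search against DuckDuckGo lite
    results = await search_n_browse("duckduckgo lite scraping")
    for key, item in results.items():
        print(f"{key}: {item['title']} ({item['linkText']})")

asyncio.run(main())
```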
website_viewer.py
ADDED
@@ -0,0 +1,64 @@

```python
import logging
from typing import Dict

from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler


# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class WebsiteViewerError(Exception):
    pass


async def fetch_website_content(url: str) -> Dict:
    """
    Fetch website content using crawl4ai's AsyncWebCrawler.

    Args:
        url (str): The URL of the website to fetch

    Returns:
        Dict: A dictionary containing:
            - title: The page title (str)
            - markdown: The main content in Markdown format (str)
            - links: Links found on the page, grouped by crawl4ai
              (e.g. internal/external) as lists of dicts
            - images: Media found on the page, grouped by crawl4ai
              (e.g. images/videos) as lists of dicts
            - url: The fetched URL (str)
    """
    try:
        # Default to HTTPS when no scheme is given
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url)

        soup = BeautifulSoup(result.html, 'html.parser')

        # Get the page title from the raw HTML
        title = soup.title.string if soup.title else ''

        return {
            "title": title,
            "markdown": result.markdown,
            "links": result.links,
            # Keyed as "images" to match the WebsiteViewResponse model in main.py
            "images": result.media,
            "url": url,
        }

    except Exception as e:
        logger.error(f"Error fetching website content: {str(e)}")
        raise WebsiteViewerError(f"Failed to fetch website content: {str(e)}") from e
```
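A corresponding sketch for exercising the viewer on its own (any reachable URL works; example.com is a placeholder):

```python
import asyncio

from website_viewer import fetch_website_content

async def main():
    # The https:// scheme is prepended automatically when missing
    page = await fetch_website_content("example.com")
    print(page["title"])
    print(page["markdown"][:300])  # first few hundred characters

asyncio.run(main())
```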