Spaces:
Running
Running
Remove useless endpoint
Browse files
- app.py +25 -41
- requirements.txt +0 -1
- search.py → serp.py +2 -8
app.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
| 1 |
from contextlib import asynccontextmanager
|
| 2 |
from typing import Optional
|
| 3 |
-
import expiringdict
|
| 4 |
from fastapi import APIRouter, FastAPI
|
| 5 |
-
from fastapi.routing import APIRouter
|
| 6 |
import httpx
|
| 7 |
from pydantic import BaseModel, Field
|
| 8 |
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
|
@@ -10,7 +9,7 @@ import logging
|
|
| 10 |
import uvicorn
|
| 11 |
|
| 12 |
from scrap import scrap_patent_async, scrap_patent_bulk_async
|
| 13 |
-
from
|
| 14 |
|
| 15 |
logging.basicConfig(
|
| 16 |
level=logging.INFO,
|
|
@@ -37,42 +36,31 @@ async def api_lifespan(app: FastAPI):
|
|
| 37 |
await pw_browser.close()
|
| 38 |
await playwright.stop()
|
| 39 |
|
| 40 |
-
app = FastAPI(lifespan=api_lifespan)
|
| 41 |
-
backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
|
| 42 |
|
| 43 |
# Router for scrapping related endpoints
|
| 44 |
scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
|
| 45 |
-
# Router for
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
@app.get('/')
|
| 50 |
-
async def status():
|
| 51 |
-
backend_keys = [k[0] for k in backend_status.items()]
|
| 52 |
-
backend_status_dict = {}
|
| 53 |
-
|
| 54 |
-
for k in backend_keys:
|
| 55 |
-
backend_status_dict[k] = backend_status.get(k)
|
| 56 |
-
return {"status": "running", "backend_status": backend_status_dict}
|
| 57 |
|
| 58 |
# ===================== Search endpoints =====================
|
| 59 |
|
| 60 |
|
| 61 |
-
class
|
| 62 |
queries: list[str] = Field(...,
|
| 63 |
description="The list of queries to search for")
|
| 64 |
n_results: int = Field(
|
| 65 |
10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
|
| 66 |
|
| 67 |
|
| 68 |
-
@
|
| 69 |
-
async def query_google_scholar(params:
|
| 70 |
"""Queries google scholar for the specified query"""
|
| 71 |
return {"error": "Unimplemented"}
|
| 72 |
|
| 73 |
|
| 74 |
-
@
|
| 75 |
-
async def search_patents(params: APISearchParams) -> APIPatentResults:
|
| 76 |
"""Searches google patents for the specified queries and returns the found documents."""
|
| 77 |
results = []
|
| 78 |
for q in params.queries:
|
|
@@ -81,14 +69,13 @@ async def search_patents(params: APISearchParams) -> APIPatentResults:
|
|
| 81 |
res = await query_google_patents(pw_browser, q, params.n_results)
|
| 82 |
results.extend(res)
|
| 83 |
except Exception as e:
|
| 84 |
-
backend_status["gpatents"] = "rate-limited"
|
| 85 |
logging.error(
|
| 86 |
f"Failed to query Google Patents with query `{q}`: {e}")
|
| 87 |
-
return
|
| 88 |
|
| 89 |
|
| 90 |
-
@
|
| 91 |
-
async def search_brave(params: APISearchParams) -> APISearchResults:
|
| 92 |
"""Searches brave search for the specified queries and returns the found documents."""
|
| 93 |
results = []
|
| 94 |
last_exception: Optional[Exception] = None
|
|
@@ -99,15 +86,14 @@ async def search_brave(params: APISearchParams) -> APISearchResults:
|
|
| 99 |
results.extend(res)
|
| 100 |
except Exception as e:
|
| 101 |
last_exception = e
|
| 102 |
-
backend_status["brave"] = "rate-limited"
|
| 103 |
logging.error(
|
| 104 |
f"Failed to query Brave search with query `{q}`: {e}")
|
| 105 |
|
| 106 |
-
return
|
| 107 |
|
| 108 |
|
| 109 |
-
@
|
| 110 |
-
async def search_bing(params: APISearchParams) -> APISearchResults:
|
| 111 |
"""Searches Bing search for the specified queries and returns the found documents."""
|
| 112 |
results = []
|
| 113 |
last_exception: Optional[Exception] = None
|
|
@@ -118,15 +104,14 @@ async def search_bing(params: APISearchParams) -> APISearchResults:
|
|
| 118 |
results.extend(res)
|
| 119 |
except Exception as e:
|
| 120 |
last_exception = e
|
| 121 |
-
backend_status["bing"] = "rate-limited"
|
| 122 |
logging.error(
|
| 123 |
f"Failed to query Bing search with query `{q}`: {e}")
|
| 124 |
|
| 125 |
-
return
|
| 126 |
|
| 127 |
|
| 128 |
-
@
|
| 129 |
-
async def search_duck(params: APISearchParams) -> APISearchResults:
|
| 130 |
"""Searches duckduckgo for the specified queries and returns the found documents"""
|
| 131 |
results = []
|
| 132 |
last_exception: Optional[Exception] = None
|
|
@@ -138,15 +123,14 @@ async def search_duck(params: APISearchParams) -> APISearchResults:
|
|
| 138 |
results.extend(res)
|
| 139 |
except Exception as e:
|
| 140 |
last_exception = e
|
| 141 |
-
backend_status["duckduckgo"] = "rate-limited"
|
| 142 |
logging.error(f"Failed to query DDG with query `{q}`: {e}")
|
| 143 |
|
| 144 |
-
return
|
| 145 |
|
| 146 |
|
| 147 |
-
@
|
| 148 |
@app.post("/search")
|
| 149 |
-
async def search(params: APISearchParams):
|
| 150 |
"""Attempts to search the specified queries using ALL backends"""
|
| 151 |
results = []
|
| 152 |
|
|
@@ -180,9 +164,9 @@ async def search(params: APISearchParams):
|
|
| 180 |
logging.info("Trying with next browser backend.")
|
| 181 |
|
| 182 |
if len(results) == 0:
|
| 183 |
-
return
|
| 184 |
|
| 185 |
-
return
|
| 186 |
|
| 187 |
# =========================== Scrapping endpoints ===========================
|
| 188 |
|
|
@@ -209,7 +193,7 @@ async def scrap_patents(params: ScrapPatentsRequest):
|
|
| 209 |
|
| 210 |
# ===============================================================================
|
| 211 |
|
| 212 |
-
app.include_router(
|
| 213 |
app.include_router(scrap_router)
|
| 214 |
|
| 215 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
|
| 1 |
from contextlib import asynccontextmanager
|
| 2 |
from typing import Optional
|
|
|
|
| 3 |
from fastapi import APIRouter, FastAPI
|
| 4 |
+
from fastapi.routing import APIRouter
|
| 5 |
import httpx
|
| 6 |
from pydantic import BaseModel, Field
|
| 7 |
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
|
|
|
| 9 |
import uvicorn
|
| 10 |
|
| 11 |
from scrap import scrap_patent_async, scrap_patent_bulk_async
|
| 12 |
+
from serp import SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents
|
| 13 |
|
| 14 |
logging.basicConfig(
|
| 15 |
level=logging.INFO,
|
|
|
|
| 36 |
await pw_browser.close()
|
| 37 |
await playwright.stop()
|
| 38 |
|
| 39 |
+
app = FastAPI(lifespan=api_lifespan, docs_url="/")
|
|
|
|
| 40 |
|
| 41 |
# Router for scrapping related endpoints
|
| 42 |
scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
|
| 43 |
+
# Router for SERP-scrapping related endpoints
|
| 44 |
+
serp_router = APIRouter(prefix="/serp", tags=["serp scrapping"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
# ===================== Search endpoints =====================
|
| 47 |
|
| 48 |
|
| 49 |
+
class SerpQuery(BaseModel):
|
| 50 |
queries: list[str] = Field(...,
|
| 51 |
description="The list of queries to search for")
|
| 52 |
n_results: int = Field(
|
| 53 |
10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
|
| 54 |
|
| 55 |
|
| 56 |
+
@serp_router.post("/search_scholar")
|
| 57 |
+
async def query_google_scholar(params: SerpQuery):
|
| 58 |
"""Queries google scholar for the specified query"""
|
| 59 |
return {"error": "Unimplemented"}
|
| 60 |
|
| 61 |
|
| 62 |
+
@serp_router.post("/search_patents")
|
| 63 |
+
async def search_patents(params: SerpQuery) -> SerpResults:
|
| 64 |
"""Searches google patents for the specified queries and returns the found documents."""
|
| 65 |
results = []
|
| 66 |
for q in params.queries:
|
|
|
|
| 69 |
res = await query_google_patents(pw_browser, q, params.n_results)
|
| 70 |
results.extend(res)
|
| 71 |
except Exception as e:
|
|
|
|
| 72 |
logging.error(
|
| 73 |
f"Failed to query Google Patents with query `{q}`: {e}")
|
| 74 |
+
return SerpResults(results=results, error=None)
|
| 75 |
|
| 76 |
|
| 77 |
+
@serp_router.post("/search_brave")
|
| 78 |
+
async def search_brave(params: SerpQuery) -> SerpResults:
|
| 79 |
"""Searches brave search for the specified queries and returns the found documents."""
|
| 80 |
results = []
|
| 81 |
last_exception: Optional[Exception] = None
|
|
|
|
| 86 |
results.extend(res)
|
| 87 |
except Exception as e:
|
| 88 |
last_exception = e
|
|
|
|
| 89 |
logging.error(
|
| 90 |
f"Failed to query Brave search with query `{q}`: {e}")
|
| 91 |
|
| 92 |
+
return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
| 93 |
|
| 94 |
|
| 95 |
+
@serp_router.post("/search_bing")
|
| 96 |
+
async def search_bing(params: SerpQuery) -> SerpResults:
|
| 97 |
"""Searches Bing search for the specified queries and returns the found documents."""
|
| 98 |
results = []
|
| 99 |
last_exception: Optional[Exception] = None
|
|
|
|
| 104 |
results.extend(res)
|
| 105 |
except Exception as e:
|
| 106 |
last_exception = e
|
|
|
|
| 107 |
logging.error(
|
| 108 |
f"Failed to query Bing search with query `{q}`: {e}")
|
| 109 |
|
| 110 |
+
return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
| 111 |
|
| 112 |
|
| 113 |
+
@serp_router.post("/search_duck")
|
| 114 |
+
async def search_duck(params: SerpQuery) -> SerpResults:
|
| 115 |
"""Searches duckduckgo for the specified queries and returns the found documents"""
|
| 116 |
results = []
|
| 117 |
last_exception: Optional[Exception] = None
|
|
|
|
| 123 |
results.extend(res)
|
| 124 |
except Exception as e:
|
| 125 |
last_exception = e
|
|
|
|
| 126 |
logging.error(f"Failed to query DDG with query `{q}`: {e}")
|
| 127 |
|
| 128 |
+
return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
| 129 |
|
| 130 |
|
| 131 |
+
@serp_router.post("/search")
|
| 132 |
@app.post("/search")
|
| 133 |
+
async def search(params: SerpQuery):
|
| 134 |
"""Attempts to search the specified queries using ALL backends"""
|
| 135 |
results = []
|
| 136 |
|
|
|
|
| 164 |
logging.info("Trying with next browser backend.")
|
| 165 |
|
| 166 |
if len(results) == 0:
|
| 167 |
+
return SerpResults(results=[], error="All backends are rate-limited.")
|
| 168 |
|
| 169 |
+
return SerpResults(results=results, error=None)
|
| 170 |
|
| 171 |
# =========================== Scrapping endpoints ===========================
|
| 172 |
|
|
|
|
| 193 |
|
| 194 |
# ===============================================================================
|
| 195 |
|
| 196 |
+
app.include_router(serp_router)
|
| 197 |
app.include_router(scrap_router)
|
| 198 |
|
| 199 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
requirements.txt
CHANGED
|
@@ -3,6 +3,5 @@ uvicorn
|
|
| 3 |
pydantic
|
| 4 |
playwright
|
| 5 |
duckduckgo_search
|
| 6 |
-
expiringdict
|
| 7 |
beautifulsoup4
|
| 8 |
httpx
|
|
|
|
| 3 |
pydantic
|
| 4 |
playwright
|
| 5 |
duckduckgo_search
|
|
|
|
| 6 |
beautifulsoup4
|
| 7 |
httpx
|
search.py → serp.py
RENAMED
|
@@ -7,14 +7,8 @@ from urllib.parse import quote_plus
|
|
| 7 |
import logging
|
| 8 |
import re
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
"""Response of /search_patents endpoint"""
|
| 13 |
-
error: Optional[str]
|
| 14 |
-
results: Optional[list[dict]]
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
class APISearchResults(BaseModel):
|
| 18 |
error: Optional[str]
|
| 19 |
results: Optional[list[dict]]
|
| 20 |
|
|
|
|
| 7 |
import logging
|
| 8 |
import re
|
| 9 |
|
| 10 |
+
class SerpResults(BaseModel):
|
| 11 |
+
"""Model for SERP scrapping results"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
error: Optional[str]
|
| 13 |
results: Optional[list[dict]]
|
| 14 |
|