Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,93 +1,145 @@
|
|
| 1 |
from fastapi import FastAPI, HTTPException, Query
|
| 2 |
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
-
from duckduckgo_search import DDGS
|
| 4 |
import requests
|
|
|
|
| 5 |
import uvicorn
|
|
|
|
| 6 |
|
| 7 |
-
app = FastAPI(title="Glimpse
|
| 8 |
|
| 9 |
-
# Habilitar CORS para que tu app frontend pueda consumir esta API sin errores
|
| 10 |
app.add_middleware(
|
| 11 |
CORSMiddleware,
|
| 12 |
-
allow_origins=["*"],
|
| 13 |
allow_credentials=True,
|
| 14 |
allow_methods=["*"],
|
| 15 |
allow_headers=["*"],
|
| 16 |
)
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
|
| 20 |
-
"
|
| 21 |
-
"
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
@app.get("/")
|
| 26 |
def read_root():
|
| 27 |
-
return {"status": "Glimpse API is running
|
| 28 |
|
| 29 |
@app.get("/search")
|
| 30 |
def search(
|
| 31 |
q: str = Query(..., description="T茅rmino de b煤squeda"),
|
| 32 |
-
engine: str = Query("duckduckgo", description="duckduckgo,
|
| 33 |
):
|
| 34 |
-
results = []
|
| 35 |
engine = engine.lower()
|
|
|
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
params = {
|
| 60 |
-
"q": q,
|
| 61 |
-
"format": "json",
|
| 62 |
-
"engines": engine,
|
| 63 |
-
"language": "es-ES"
|
| 64 |
-
}
|
| 65 |
-
|
| 66 |
-
# Intentar en nuestras instancias de fallback si alguna falla
|
| 67 |
-
for instance in SEARXNG_INSTANCES:
|
| 68 |
-
try:
|
| 69 |
-
resp = requests.get(instance, params=params, timeout=5)
|
| 70 |
-
resp.raise_for_status()
|
| 71 |
-
data = resp.json()
|
| 72 |
-
|
| 73 |
-
for r in data.get("results", []):
|
| 74 |
-
results.append({
|
| 75 |
-
"title": r.get("title", ""),
|
| 76 |
-
"url": r.get("url", ""),
|
| 77 |
-
"content": r.get("content", "")
|
| 78 |
-
})
|
| 79 |
-
success = True
|
| 80 |
-
break # Si tuvo 茅xito, salimos del bucle
|
| 81 |
-
except Exception as e:
|
| 82 |
-
print(f"Fall贸 la instancia {instance} para el motor {engine}: {e}")
|
| 83 |
-
continue
|
| 84 |
-
|
| 85 |
-
if not success:
|
| 86 |
-
raise HTTPException(status_code=503, detail=f"Todos los servidores proxy para el motor {engine} fallaron temporalmente.")
|
| 87 |
|
| 88 |
-
# Devolvemos exactamente el formato que espera tu HTML
|
| 89 |
return {"results": results}
|
| 90 |
|
| 91 |
if __name__ == "__main__":
|
| 92 |
-
# Hugging Face expone los puertos en el 7860 por defecto
|
| 93 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
|
| 1 |
from fastapi import FastAPI, HTTPException, Query
|
| 2 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 3 |
import requests
|
| 4 |
+
from bs4 import BeautifulSoup
|
| 5 |
import uvicorn
|
| 6 |
+
import urllib.parse
|
| 7 |
|
| 8 |
+
# Application object; title and description show up in the generated docs.
app = FastAPI(
    title="Glimpse Scraping API",
    description="API de Scraping directo para buscadores",
)

# Let any frontend origin call this API from the browser.
# Credentials are disabled: a wildcard origin may not be combined with
# credentialed requests, and none of the endpoints visible in this file
# read cookies or authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 17 |
|
| 18 |
+
# Request headers shared by the scrapers. A real browser User-Agent is used
# to reduce the chance of being blocked; Spanish is the preferred language.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
}
|
| 23 |
+
|
| 24 |
+
def _resolve_ddg_href(href: str) -> str:
    """Return the destination URL behind a DuckDuckGo redirect link.

    Result links on the HTML endpoint look like
    ``//duckduckgo.com/l/?uddg=<percent-encoded target>&rut=...``. The
    target is read from the ``uddg`` query parameter. Any other href (for
    example a direct link that happens to contain ``&``) is returned
    untouched instead of being truncated or unquoted.
    """
    parts = urllib.parse.urlparse(href)
    host = parts.hostname or ""
    is_ddg_host = host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
    if is_ddg_host and parts.path.startswith("/l/"):
        # parse_qs already percent-decodes the value.
        targets = urllib.parse.parse_qs(parts.query).get("uddg", [])
        if targets and targets[0].startswith("http"):
            return targets[0]
    return href


def scrape_duckduckgo(query: str):
    """Scrape DuckDuckGo's plain-HTML endpoint for ``query``.

    The HTML-only version of DuckDuckGo is used because it is easier to
    scrape.

    Args:
        query: Search term, sent as the ``q`` query parameter.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``content``;
        ``content`` is an empty string when a result has no snippet.

    Raises:
        requests.exceptions.HTTPError: If the response status is an error.
        requests.exceptions.RequestException: On timeout or connection
            failure.
    """
    resp = requests.get(
        "https://html.duckduckgo.com/html/",
        params={"q": query},  # requests takes care of URL-encoding
        headers=HEADERS,
        timeout=10,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    results = []
    for result in soup.select(".result__body"):
        title_tag = result.select_one(".result__title a")
        if not title_tag:
            continue
        snippet_tag = result.select_one(".result__snippet")
        results.append({
            "title": title_tag.text.strip(),
            # Strip DuckDuckGo's redirect wrapper from the link.
            "url": _resolve_ddg_href(title_tag.get("href", "")),
            "content": snippet_tag.text.strip() if snippet_tag else "",
        })
    return results
|
| 47 |
+
|
| 48 |
+
def scrape_mojeek(query: str):
    """Scrape Mojeek's HTML results page for ``query``.

    Args:
        query: Search term, sent as the ``q`` query parameter.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``content``;
        ``content`` is an empty string when a result has no snippet.

    Raises:
        requests.exceptions.HTTPError: If the response status is an error.
        requests.exceptions.RequestException: On timeout or connection
            failure.
    """
    resp = requests.get(
        "https://www.mojeek.com/search",
        params={"q": query, "fmt": "html"},  # requests URL-encodes these
        headers=HEADERS,
        timeout=10,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    results = []
    for item in soup.select("ul.results-standard > li"):
        link = item.select_one("a.ob")
        if not link:
            continue
        snippet = item.select_one("p.s")
        results.append({
            "title": link.text.strip(),
            "url": link.get("href", ""),
            "content": snippet.text.strip() if snippet else "",
        })
    return results
|
| 66 |
+
|
| 67 |
+
def scrape_qwant(query: str):
    """Scrape Qwant Lite for ``query``.

    Qwant Lite is used because it does not require JavaScript rendering.

    Args:
        query: Search term, sent as the ``q`` query parameter.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``content``;
        ``content`` is an empty string when a result has no snippet.

    Raises:
        requests.exceptions.HTTPError: If the response status is an error.
        requests.exceptions.RequestException: On timeout or connection
            failure.
    """
    resp = requests.get(
        "https://lite.qwant.com/",
        params={"q": query},  # requests takes care of URL-encoding
        headers=HEADERS,
        timeout=10,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    results = []
    for article in soup.select("article.result"):
        link = article.select_one("h2 a")
        if not link:
            continue
        snippet = article.select_one(".result-snippet")
        results.append({
            "title": link.text.strip(),
            "url": link.get("href", ""),
            "content": snippet.text.strip() if snippet else "",
        })
    return results
|
| 86 |
+
|
| 87 |
+
def scrape_brave(query: str):
    """Scrape Brave Search's results page for ``query``.

    Args:
        query: Search term, sent as the ``q`` query parameter.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``content``.
        A snippet is only included when it has both a heading element and
        an anchor; ``content`` is an empty string when no description
        element is found.

    Raises:
        requests.exceptions.HTTPError: If the response status is an error.
        requests.exceptions.RequestException: On timeout or connection
            failure.
    """
    resp = requests.get(
        "https://search.brave.com/search",
        params={"q": query},  # requests takes care of URL-encoding
        headers=HEADERS,
        timeout=10,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    results = []
    for snippet in soup.select(".snippet"):
        heading = snippet.select_one(".heading")
        link = snippet.select_one("a")  # first anchor inside the snippet
        if not (heading and link):
            continue
        description = snippet.select_one(".snippet-content, .snippet-description")
        results.append({
            "title": heading.text.strip(),
            "url": link.get("href", ""),
            "content": description.text.strip() if description else "",
        })
    return results
|
| 106 |
|
| 107 |
@app.get("/")
def read_root():
    """Return a fixed status payload showing that the API is up."""
    return {"status": "Glimpse Scraping API is running!"}
|
| 110 |
|
| 111 |
@app.get("/search")
def search(
    q: str = Query(..., description="Término de búsqueda"),
    engine: str = Query("duckduckgo", description="duckduckgo, mojeek, qwant, brave"),
):
    """Run ``q`` against the requested search engine and return the hits.

    Args:
        q: Search term (required).
        engine: Engine name, matched case-insensitively. ``tapnav`` is an
            alias for ``qwant``. ``startpage`` and any unrecognised name
            fall back to DuckDuckGo.

    Returns:
        ``{"results": [...]}`` where each item has the keys ``title``,
        ``url`` and ``content``.

    Raises:
        HTTPException: 403 when the engine answers 403 (blocked), 504 when
            the engine times out, 502 when it cannot be reached, and 500
            for any other upstream HTTP error or unexpected failure.
    """
    engine = engine.lower()

    scrapers = {
        "duckduckgo": scrape_duckduckgo,
        "mojeek": scrape_mojeek,
        "qwant": scrape_qwant,
        "tapnav": scrape_qwant,
        "brave": scrape_brave,
    }
    # Startpage aggressively blocks raw scraping (it uses tokens), so it is
    # silently served by DuckDuckGo, as is any unknown engine name.
    scraper = scrapers.get(engine, scrape_duckduckgo)

    try:
        results = scraper(q)
    except requests.exceptions.HTTPError as e:
        # HTTPError must be handled before the broader RequestException.
        if e.response is not None and e.response.status_code == 403:
            raise HTTPException(
                status_code=403,
                detail=f"El motor {engine} nos bloqueó (Protección Anti-Bot/CORS).",
            ) from e
        raise HTTPException(status_code=500, detail=str(e)) from e
    except requests.exceptions.Timeout as e:
        # A slow upstream is not an HTML-parsing problem; report it as such.
        raise HTTPException(
            status_code=504,
            detail=f"El motor {engine} no respondió a tiempo.",
        ) from e
    except requests.exceptions.RequestException as e:
        raise HTTPException(
            status_code=502,
            detail=f"No se pudo conectar con el motor {engine}: {e}",
        ) from e
    except Exception as e:
        # Boundary handler: anything else is treated as a parsing failure.
        raise HTTPException(
            status_code=500,
            detail=f"Error al procesar el HTML de {engine}: {str(e)}",
        ) from e

    return {"results": results}
|
| 143 |
|
| 144 |
if __name__ == "__main__":
    # Listen on all interfaces. Hugging Face exposes port 7860 by default.
    uvicorn.run(app, host="0.0.0.0", port=7860)
|