# glimpse-api / app.py
# Juanoto2012 — "Update app.py" (commit 5a251e0, verified)
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
import requests
from bs4 import BeautifulSoup
import uvicorn
import urllib.parse
import json
from duckduckgo_search import DDGS
# Application instance; the title and description are the metadata FastAPI
# exposes for this service.
app = FastAPI(
    title="Glimpse Scraping API",
    description="API de Scraping directo e Imágenes multi-motor",
)

# Open CORS policy: any origin, method and header may call this API.
# Credentials stay disabled because a wildcard origin must not be combined
# with credentialed requests, and no endpoint visible in this file reads
# cookies or authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Browser-like headers passed to the requests.get calls in this file.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
}
# --- FUNCIONES DE SCRAPING WEB (TEXTO) ---
def scrape_duckduckgo(query: str):
    """Scrape text results from DuckDuckGo's HTML-only endpoint.

    Args:
        query: Search terms; percent-encoded before being placed in the URL.

    Returns:
        A list of dicts with ``title``, ``url`` and ``content`` keys, one per
        result block that contains a title link.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
        requests.RequestException: On connection errors or the 10 s timeout.
    """
    url = f"https://html.duckduckgo.com/html/?q={urllib.parse.quote(query)}"
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')
    results = []
    for result in soup.select('.result__body'):
        title_tag = result.select_one('.result__title a')
        if not title_tag:
            continue
        snippet_tag = result.select_one('.result__snippet')
        raw_url = title_tag.get('href', '')

        # Result links are wrapped in a redirector of the form
        # //duckduckgo.com/l/?uddg=<percent-encoded target>&...  Parse the
        # href properly instead of string-splitting on '&': splitting would
        # truncate direct links that carry several query parameters and
        # mangle redirect links that include an explicit scheme.
        parsed = urllib.parse.urlparse(raw_url)
        is_redirect = (
            parsed.netloc in ('', 'duckduckgo.com', 'www.duckduckgo.com')
            and parsed.path.rstrip('/') == '/l'
        )
        target = ''
        if is_redirect:
            # parse_qs already percent-decodes the value.
            target = urllib.parse.parse_qs(parsed.query).get('uddg', [''])[0]

        results.append({
            "title": title_tag.text.strip(),
            # Fall back to the untouched href when no usable target was found.
            "url": target if target.startswith('http') else raw_url,
            "content": snippet_tag.text.strip() if snippet_tag else ""
        })
    return results
def scrape_mojeek(query: str):
    """Scrape text results from Mojeek's HTML search page.

    Args:
        query: Search terms; percent-encoded before being placed in the URL.

    Returns:
        A list of dicts with ``title``, ``url`` and ``content`` keys, one per
        list item that contains an ``a.ob`` link.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
    """
    encoded = urllib.parse.quote(query)
    response = requests.get(
        f"https://www.mojeek.com/search?q={encoded}&fmt=html",
        headers=HEADERS,
        timeout=10,
    )
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'lxml')

    hits = []
    for entry in page.select('ul.results-standard > li'):
        link = entry.select_one('a.ob')
        if not link:
            # Items without a result link are skipped.
            continue
        summary = entry.select_one('p.s')
        hits.append({
            "title": link.text.strip(),
            "url": link.get('href', ''),
            "content": summary.text.strip() if summary else "",
        })
    return hits
def scrape_qwant(query: str):
    """Scrape text results from the Qwant Lite HTML page.

    Args:
        query: Search terms; percent-encoded before being placed in the URL.

    Returns:
        A list of dicts with ``title``, ``url`` and ``content`` keys, one per
        ``article.result`` element that contains an ``h2 a`` link.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
    """
    target = "https://lite.qwant.com/?q=" + urllib.parse.quote(query)
    reply = requests.get(target, headers=HEADERS, timeout=10)
    reply.raise_for_status()
    document = BeautifulSoup(reply.text, 'lxml')

    entries = []
    for node in document.select('article.result'):
        anchor = node.select_one('h2 a')
        if not anchor:
            continue
        blurb = node.select_one('.result-snippet')
        if blurb:
            text = blurb.text.strip()
        else:
            text = ""
        entries.append(dict(
            title=anchor.text.strip(),
            url=anchor.get('href', ''),
            content=text,
        ))
    return entries
def scrape_brave(query: str):
    """Scrape text results from the Brave Search HTML page.

    Args:
        query: Search terms; percent-encoded before being placed in the URL.

    Returns:
        A list of dicts with ``title``, ``url`` and ``content`` keys, one per
        ``.snippet`` element that has both a ``.heading`` and a link.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
    """
    search_url = "https://search.brave.com/search?q={}".format(urllib.parse.quote(query))
    response = requests.get(search_url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    dom = BeautifulSoup(response.text, 'lxml')

    def to_record(block):
        """Return the result dict for one snippet, or None if it is incomplete."""
        heading = block.select_one('.heading')
        anchor = block.select_one('a')
        description = block.select_one('.snippet-content, .snippet-description')
        if not (heading and anchor):
            return None
        return {
            "title": heading.text.strip(),
            "url": anchor.get('href', ''),
            "content": description.text.strip() if description else "",
        }

    records = (to_record(block) for block in dom.select('.snippet'))
    return [record for record in records if record is not None]
# --- ENDPOINTS ---
@app.get("/")
def read_root():
    """Report that the service is running and list its endpoints."""
    available = ["/search", "/images"]
    return {"status": "Glimpse API is running!", "endpoints": available}
@app.get("/search")
def search(
    q: str = Query(..., description="Término de búsqueda"),
    engine: str = Query("duckduckgo", description="Motor de búsqueda")
):
    """Run a text search with the selected engine.

    Args:
        q: Search terms (required).
        engine: Engine name, case-insensitive. ``mojeek``, ``qwant``/``tapnav``
            and ``brave`` pick their scrapers; any other value uses DuckDuckGo.

    Returns:
        ``{"results": [...]}`` with the scraper's list of result dicts.

    Raises:
        HTTPException: 500 with the engine name and the error text when the
            scraper raises.
    """
    engine = engine.lower()
    # Engine name -> scraper; names missing from the table use DuckDuckGo.
    scrapers = {
        "mojeek": scrape_mojeek,
        "qwant": scrape_qwant,
        "tapnav": scrape_qwant,
        "brave": scrape_brave,
    }
    scraper = scrapers.get(engine, scrape_duckduckgo)
    try:
        found = scraper(q)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error en {engine}: {str(e)}")
    return {"results": found}
def _qwant_images(q: str, limit: int) -> list:
    """Fetch image results from the Qwant JSON API; return [] on a non-200 reply."""
    resp = requests.get(
        "https://api.qwant.com/v3/search/images",
        params={"q": q, "t": "images", "locale": "es_ES", "count": limit},
        headers=HEADERS,
        timeout=5,
    )
    if resp.status_code != 200:
        return []
    items = resp.json().get("data", {}).get("result", {}).get("items", [])
    return [
        {
            "title": item.get("title", "Imagen"),
            "image_url": item.get("media", ""),
            "thumbnail_url": item.get("thumbnail", ""),
            "source_url": item.get("url", ""),
            "source_name": item.get("domain", ""),
        }
        for item in items
        if item.get("media")
    ]


def _yahoo_images(q: str, limit: int) -> list:
    """Scrape Yahoo Images, reading the JSON stored in each ``li.ld`` ``data`` attribute.

    ``limit`` is accepted only to share a signature with the other providers;
    the caller truncates the list.
    """
    url = f"https://images.search.yahoo.com/search/images?p={urllib.parse.quote(q)}"
    resp = requests.get(url, headers=HEADERS, timeout=5)
    if resp.status_code != 200:
        return []
    soup = BeautifulSoup(resp.text, 'lxml')
    results = []
    for li in soup.select('li.ld'):
        data_attr = li.get('data')
        if not data_attr:
            continue
        try:
            item = json.loads(data_attr)
        except (ValueError, TypeError):
            # Malformed JSON on one element must not discard the others.
            continue
        if not isinstance(item, dict) or not item.get('iurl'):
            continue
        results.append({
            "title": item.get('title', 'Imagen'),
            "image_url": item.get('iurl', ''),
            "thumbnail_url": item.get('ith', '') or item.get('iurl', ''),
            "source_url": item.get('rurl', ''),
            "source_name": item.get('surl', ''),
        })
    return results


def _ddg_images(q: str, limit: int) -> list:
    """Fetch image results through the duckduckgo_search client."""
    with DDGS() as ddgs:
        # Materialize inside the context so the client is still open.
        ddg_images = list(ddgs.images(keywords=q, max_results=limit))
    results = []
    for img in ddg_images:
        img_url = img.get("image", "") or img.get("url", "")
        if img_url:
            results.append({
                "title": img.get("title", "Imagen"),
                "image_url": img_url,
                "thumbnail_url": img.get("thumbnail", "") or img_url,
                "source_url": img.get("url", ""),
                "source_name": img.get("source", ""),
            })
    return results


@app.get("/images")
def search_images(
    q: str = Query(..., description="Término de búsqueda para imágenes"),
    max_results: int = Query(40, description="Cantidad máxima de imágenes")
):
    """Search images, trying Qwant, then Yahoo, then DuckDuckGo.

    The first provider that yields at least one image wins. A provider that
    raises is logged to stdout and skipped; its partial results are discarded
    so they cannot leak into the next provider's response.

    Args:
        q: Search terms (required).
        max_results: Maximum number of images to return; values below 1 are
            treated as 1.

    Returns:
        ``{"results": [...]}`` where each item has ``title``, ``image_url``,
        ``thumbnail_url``, ``source_url`` and ``source_name`` keys, truncated
        to ``max_results`` items.

    Raises:
        HTTPException: 503 when every provider fails or returns nothing.
    """
    # A zero or negative limit would make the slice below drop items from the
    # end of the list (or return nothing), so clamp it.
    limit = max(1, max_results)

    # Providers in priority order; labels reproduce the existing log lines.
    providers = (
        ("Intento 1 (Qwant)", _qwant_images),
        ("Intento 2 (Yahoo)", _yahoo_images),
        ("Intento 3 (DDG)", _ddg_images),
    )
    for label, fetch in providers:
        try:
            results = fetch(q, limit)
        except Exception as e:
            # Best-effort fallback chain: report and move to the next provider.
            print(f"{label} falló: {e}")
            continue
        if results:
            return {"results": results[:limit]}

    # Every provider failed or came back empty: surface the error to the frontend.
    raise HTTPException(status_code=503, detail="Los servidores proxy de imágenes están temporalmente bloqueados. Intenta en unos minutos.")
if __name__ == "__main__":
    # Start the server only when this module is executed as a script.
    uvicorn.run(app=app, host="0.0.0.0", port=7860)