"""Glimpse Scraping API.

FastAPI service exposing two endpoints:

* ``/search`` -- text results scraped from DuckDuckGo, Mojeek, Qwant or Brave.
* ``/images`` -- image results, trying Qwant, then Yahoo, then DuckDuckGo.
"""

import json
import logging
import urllib.parse
from typing import Any, Callable, Dict, List, Tuple

import requests
import uvicorn
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware

logger = logging.getLogger(__name__)

# One scraped item; keys differ between text results and image results.
Result = Dict[str, Any]

app = FastAPI(title="Glimpse Scraping API", description="API de Scraping directo e Imágenes multi-motor")

# NOTE(review): wildcard origins combined with allow_credentials=True lets any
# site issue credentialed requests to this API. Kept as-is to avoid breaking
# existing front-ends; restrict allow_origins if cookies/auth are ever used.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Browser-like headers sent with every outgoing request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8"
}


# --- WEB SCRAPING HELPERS (TEXT) ---

def _fetch_soup(url: str, timeout: float = 10) -> BeautifulSoup:
    """Download *url* with the shared headers and parse it with lxml.

    Raises:
        requests.RequestException: on network errors or a non-2xx status.
    """
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'lxml')


def _text_result(title_tag, url: str, snippet_tag) -> Result:
    """Build the ``{"title", "url", "content"}`` dict returned by ``/search``.

    *snippet_tag* may be ``None``; the content is then an empty string.
    """
    return {
        "title": title_tag.text.strip(),
        "url": url,
        "content": snippet_tag.text.strip() if snippet_tag else ""
    }


def _resolve_ddg_redirect(raw_url: str) -> str:
    """Return the real target of a DuckDuckGo ``/l/?uddg=...`` redirect link.

    Links that are not DuckDuckGo redirects (or whose target does not look
    like an http(s) URL) are returned unchanged. The previous string-replace
    approach truncated direct URLs at their first ``&`` and produced
    ``https:https://...`` when the redirect link carried a scheme.
    """
    parsed = urllib.parse.urlparse(raw_url)
    host = parsed.netloc
    is_ddg_host = host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
    if not is_ddg_host or parsed.path.rstrip("/") != "/l":
        return raw_url

    # Split by hand and use unquote() rather than parse_qs() so that a literal
    # '+' in the target is preserved instead of being turned into a space.
    for pair in parsed.query.split("&"):
        key, _, value = pair.partition("=")
        if key == "uddg":
            target = urllib.parse.unquote(value)
            if target.startswith("http"):
                return target
            break
    return raw_url


# --- WEB SCRAPING FUNCTIONS (TEXT) ---
# NOTE(review): the CSS selectors below target the markup each scraper was
# written against; confirm they still match if an engine returns no results.

def scrape_duckduckgo(query: str) -> List[Result]:
    """Scrape text results for *query* from DuckDuckGo's HTML endpoint.

    Returns:
        A list of ``{"title", "url", "content"}`` dicts, with redirect links
        resolved to their real target where possible.

    Raises:
        requests.RequestException: on network errors or a non-2xx status.
    """
    soup = _fetch_soup(f"https://html.duckduckgo.com/html/?q={urllib.parse.quote(query)}")
    results = []
    for result in soup.select('.result__body'):
        title_tag = result.select_one('.result__title a')
        if not title_tag:
            continue
        url = _resolve_ddg_redirect(title_tag.get('href', ''))
        results.append(_text_result(title_tag, url, result.select_one('.result__snippet')))
    return results


def scrape_mojeek(query: str) -> List[Result]:
    """Scrape text results for *query* from Mojeek.

    Returns:
        A list of ``{"title", "url", "content"}`` dicts.

    Raises:
        requests.RequestException: on network errors or a non-2xx status.
    """
    soup = _fetch_soup(f"https://www.mojeek.com/search?q={urllib.parse.quote(query)}&fmt=html")
    results = []
    for li in soup.select('ul.results-standard > li'):
        a_tag = li.select_one('a.ob')
        if not a_tag:
            continue
        results.append(_text_result(a_tag, a_tag.get('href', ''), li.select_one('p.s')))
    return results


def scrape_qwant(query: str) -> List[Result]:
    """Scrape text results for *query* from Qwant Lite.

    Returns:
        A list of ``{"title", "url", "content"}`` dicts.

    Raises:
        requests.RequestException: on network errors or a non-2xx status.
    """
    soup = _fetch_soup(f"https://lite.qwant.com/?q={urllib.parse.quote(query)}")
    results = []
    for article in soup.select('article.result'):
        title_tag = article.select_one('h2 a')
        if not title_tag:
            continue
        results.append(
            _text_result(title_tag, title_tag.get('href', ''), article.select_one('.result-snippet'))
        )
    return results


def scrape_brave(query: str) -> List[Result]:
    """Scrape text results for *query* from Brave Search.

    A snippet is kept only when it has both a heading and a link.

    Returns:
        A list of ``{"title", "url", "content"}`` dicts.

    Raises:
        requests.RequestException: on network errors or a non-2xx status.
    """
    soup = _fetch_soup(f"https://search.brave.com/search?q={urllib.parse.quote(query)}")
    results = []
    for snippet in soup.select('.snippet'):
        title_tag = snippet.select_one('.heading')
        link_tag = snippet.select_one('a')
        if not (title_tag and link_tag):
            continue
        desc_tag = snippet.select_one('.snippet-content, .snippet-description')
        results.append(_text_result(title_tag, link_tag.get('href', ''), desc_tag))
    return results


# Engine name -> scraper. Any name not listed here falls back to DuckDuckGo.
_TEXT_ENGINES: Dict[str, Callable[[str], List[Result]]] = {
    "mojeek": scrape_mojeek,
    "qwant": scrape_qwant,
    "tapnav": scrape_qwant,
    "brave": scrape_brave,
}


# --- IMAGE PROVIDERS ---
# Each provider returns its own fresh list so that a provider failing midway
# cannot leak partial results into the next provider's response.

def _images_from_qwant(q: str, max_results: int) -> List[Result]:
    """Query the Qwant images API; return ``[]`` on a non-200 response."""
    params = {"q": q, "t": "images", "locale": "es_ES", "count": max_results}
    resp = requests.get(
        "https://api.qwant.com/v3/search/images", params=params, headers=HEADERS, timeout=5
    )
    if resp.status_code != 200:
        return []
    items = resp.json().get("data", {}).get("result", {}).get("items", [])
    return [
        {
            "title": item.get("title", "Imagen"),
            "image_url": item.get("media", ""),
            "thumbnail_url": item.get("thumbnail", ""),
            "source_url": item.get("url", ""),
            "source_name": item.get("domain", "")
        }
        for item in items
        if item.get("media")
    ]


def _images_from_yahoo(q: str, max_results: int) -> List[Result]:
    """Scrape Yahoo Images via the JSON stored in each ``li.ld`` ``data`` attribute.

    *max_results* is accepted for a uniform provider signature; the caller
    applies the limit. Returns ``[]`` on a non-200 response.
    """
    url = f"https://images.search.yahoo.com/search/images?p={urllib.parse.quote(q)}"
    resp = requests.get(url, headers=HEADERS, timeout=5)
    if resp.status_code != 200:
        return []
    soup = BeautifulSoup(resp.text, 'lxml')
    results = []
    for li in soup.select('li.ld'):
        data_attr = li.get('data')
        if not data_attr:
            continue
        try:
            item = json.loads(data_attr)
            image_url = item.get('iurl')
            entry = {
                "title": item.get('title', 'Imagen'),
                "image_url": image_url,
                "thumbnail_url": item.get('ith', '') or image_url,
                "source_url": item.get('rurl', ''),
                "source_name": item.get('surl', '')
            }
        except (ValueError, TypeError, AttributeError):
            # Malformed JSON or an unexpected payload shape: skip this item
            # only, without swallowing KeyboardInterrupt/SystemExit.
            continue
        if image_url:
            results.append(entry)
    return results


def _images_from_ddg(q: str, max_results: int) -> List[Result]:
    """Fetch images through the ``duckduckgo_search`` client."""
    results = []
    with DDGS() as ddgs:
        for img in ddgs.images(keywords=q, max_results=max_results):
            img_url = img.get("image", "") or img.get("url", "")
            if not img_url:
                continue
            results.append({
                "title": img.get("title", "Imagen"),
                "image_url": img_url,
                "thumbnail_url": img.get("thumbnail", "") or img_url,
                "source_url": img.get("url", ""),
                "source_name": img.get("source", "")
            })
    return results


# Providers in fallback order; the label is used in the failure log message.
_IMAGE_PROVIDERS: Tuple[Tuple[str, Callable[[str, int], List[Result]]], ...] = (
    ("Intento 1 (Qwant)", _images_from_qwant),
    ("Intento 2 (Yahoo)", _images_from_yahoo),
    ("Intento 3 (DDG)", _images_from_ddg),
)


# --- ENDPOINTS ---

@app.get("/")
def read_root():
    """Health check listing the available endpoints."""
    return {"status": "Glimpse API is running!", "endpoints": ["/search", "/images"]}


@app.get("/search")
def search(
    q: str = Query(..., description="Término de búsqueda"),
    engine: str = Query("duckduckgo", description="Motor de búsqueda")
):
    """Return text search results for *q* from the requested engine.

    The engine name is case-insensitive; unknown names fall back to
    DuckDuckGo.

    Raises:
        HTTPException: 500 with the underlying error message when the
            selected scraper fails.
    """
    engine = engine.lower()
    scraper = _TEXT_ENGINES.get(engine, scrape_duckduckgo)
    try:
        results = scraper(q)
    except Exception as e:
        # Chain the cause so the original traceback reaches the server logs.
        raise HTTPException(status_code=500, detail=f"Error en {engine}: {str(e)}") from e
    return {"results": results}


@app.get("/images")
def search_images(
    q: str = Query(..., description="Término de búsqueda para imágenes"),
    max_results: int = Query(40, description="Cantidad máxima de imágenes")
):
    """Return image results for *q*, trying each provider in order.

    The first provider that yields at least one image wins; its results are
    capped at *max_results*. A provider that raises is logged and skipped.

    Raises:
        HTTPException: 503 when no provider returns any image.
    """
    # Clamp so a negative value cannot slice from the end of the list.
    limit = max(max_results, 0)

    for label, provider in _IMAGE_PROVIDERS:
        try:
            results = provider(q, max_results)
        except Exception as e:
            # Best-effort fallback chain: record the failure and move on.
            logger.warning("%s falló: %s", label, e)
            continue
        if results:
            return {"results": results[:limit]}

    # Every provider failed or returned nothing: report it to the front-end.
    raise HTTPException(status_code=503, detail="Los servidores proxy de imágenes están temporalmente bloqueados. Intenta en unos minutos.")


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)