Spaces:
Running
Running
| from fastapi import FastAPI, HTTPException, Query | |
| from fastapi.middleware.cors import CORSMiddleware | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import uvicorn | |
| import urllib.parse | |
| import json | |
| from duckduckgo_search import DDGS | |
# ASGI application instance; it is handed to uvicorn.run() in the __main__ guard.
app = FastAPI(
    title="Glimpse Scraping API",
    description="API de Scraping directo e Imágenes multi-motor",
)

# CORS: any origin may call this API, with any method and any header.
# Credentials stay disabled: combining a wildcard origin with
# allow_credentials=True would let every website issue credentialed
# cross-origin requests, and nothing in this module relies on cookies or
# Authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Request headers sent with the outgoing requests.get() calls in this module:
# a desktop Chrome User-Agent string and a Spanish-first language preference.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
}
# --- WEB SCRAPING FUNCTIONS (TEXT) ---
def _resolve_ddg_href(raw_url):
    """Return the destination carried by a DuckDuckGo redirect href, else *raw_url*.

    Redirect links have the path ``/l/`` and carry the real destination,
    percent-encoded, in the ``uddg`` query parameter. Any other href is
    returned untouched, so direct URLs that contain ``&`` are neither
    truncated nor percent-decoded.
    """
    try:
        parts = urllib.parse.urlsplit(raw_url)
        targets = urllib.parse.parse_qs(parts.query).get('uddg', [])
    except ValueError:
        # urlsplit rejects some malformed hrefs; keep the href as found.
        return raw_url

    is_ddg_host = (
        parts.netloc in ('', 'duckduckgo.com')
        or parts.netloc.endswith('.duckduckgo.com')
    )
    if is_ddg_host and parts.path == '/l/' and targets and targets[0].startswith('http'):
        # parse_qs has already percent-decoded the value.
        return targets[0]
    return raw_url


def scrape_duckduckgo(query: str) -> list:
    """Fetch DuckDuckGo's HTML results page for *query* and parse the hits.

    Args:
        query: Search terms; percent-encoded into the request URL.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``content``,
        one per ``.result__body`` element that has a title link.
        ``content`` is an empty string when the result has no snippet.

    Raises:
        Whatever ``requests.get`` raises, and the error raised by
        ``raise_for_status()`` on an HTTP error status.
    """
    url = f"https://html.duckduckgo.com/html/?q={urllib.parse.quote(query)}"
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')

    results = []
    for result in soup.select('.result__body'):
        title_tag = result.select_one('.result__title a')
        if not title_tag:
            continue
        snippet_tag = result.select_one('.result__snippet')
        results.append({
            "title": title_tag.text.strip(),
            # Unwrap DuckDuckGo's redirect link into the real destination.
            "url": _resolve_ddg_href(title_tag.get('href', '')),
            "content": snippet_tag.text.strip() if snippet_tag else "",
        })
    return results
def scrape_mojeek(query: str):
    """Fetch Mojeek's HTML results page for *query* and parse the hits.

    Returns a list of dicts with the keys ``title``, ``url`` and ``content``,
    one per ``ul.results-standard > li`` item that contains an ``a.ob`` link.
    ``content`` is an empty string when the item has no ``p.s`` paragraph.
    Errors from ``requests.get`` and ``raise_for_status()`` propagate.
    """
    encoded_query = urllib.parse.quote(query)
    response = requests.get(
        f"https://www.mojeek.com/search?q={encoded_query}&fmt=html",
        headers=HEADERS,
        timeout=10,
    )
    response.raise_for_status()
    document = BeautifulSoup(response.text, 'lxml')

    hits = []
    for entry in document.select('ul.results-standard > li'):
        anchor = entry.select_one('a.ob')
        if not anchor:
            # Items without a result link are skipped.
            continue
        summary = entry.select_one('p.s')
        hits.append({
            "title": anchor.text.strip(),
            "url": anchor.get('href', ''),
            "content": summary.text.strip() if summary else "",
        })
    return hits
def scrape_qwant(query: str):
    """Fetch Qwant Lite's results page for *query* and parse the hits.

    Returns a list of dicts with the keys ``title``, ``url`` and ``content``,
    one per ``article.result`` element that has an ``h2 a`` link.
    ``content`` is an empty string when no ``.result-snippet`` is present.
    Errors from ``requests.get`` and ``raise_for_status()`` propagate.
    """
    page_url = f"https://lite.qwant.com/?q={urllib.parse.quote(query)}"
    response = requests.get(page_url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'lxml')

    # Pair each article's title link with its (optional) snippet element.
    pairs = (
        (node.select_one('h2 a'), node.select_one('.result-snippet'))
        for node in page.select('article.result')
    )
    return [
        {
            "title": heading.text.strip(),
            "url": heading.get('href', ''),
            "content": blurb.text.strip() if blurb else "",
        }
        for heading, blurb in pairs
        if heading
    ]
def scrape_brave(query: str):
    """Fetch Brave Search's results page for *query* and parse the hits.

    Returns a list of dicts with the keys ``title``, ``url`` and ``content``,
    one per ``.snippet`` element that has both a ``.heading`` element and an
    ``a`` element. ``content`` is an empty string when neither
    ``.snippet-content`` nor ``.snippet-description`` is found.
    Errors from ``requests.get`` and ``raise_for_status()`` propagate.
    """
    search_url = f"https://search.brave.com/search?q={urllib.parse.quote(query)}"
    response = requests.get(search_url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    markup = BeautifulSoup(response.text, 'lxml')

    collected = []
    for block in markup.select('.snippet'):
        heading = block.select_one('.heading')
        anchor = block.select_one('a')
        if not (heading and anchor):
            # Both a heading and a link are required for a usable entry.
            continue
        description = block.select_one('.snippet-content, .snippet-description')
        if description:
            body_text = description.text.strip()
        else:
            body_text = ""
        collected.append({
            "title": heading.text.strip(),
            "url": anchor.get('href', ''),
            "content": body_text,
        })
    return collected
# --- ENDPOINTS ---
@app.get("/")
def read_root():
    """Return a fixed status payload that lists the API's endpoint paths.

    Registered as the GET handler for ``/`` so the payload is actually
    reachable over HTTP.
    """
    return {"status": "Glimpse API is running!", "endpoints": ["/search", "/images"]}
@app.get("/search")
def search(
    q: str = Query(..., description="Término de búsqueda"),
    engine: str = Query("duckduckgo", description="Motor de búsqueda")
):
    """Run a text search for *q* on the selected engine.

    Registered as the GET handler for ``/search``.

    Args:
        q: Search terms (required).
        engine: Engine name, matched case-insensitively. ``mojeek``,
            ``qwant`` (alias ``tapnav``) and ``brave`` select their own
            scraper; any other value falls back to DuckDuckGo.

    Returns:
        ``{"results": [...]}`` where the list is whatever the chosen
        scraper returned.

    Raises:
        HTTPException: status 500 when the scraper raises; the detail
            carries the engine name and the error text, and the original
            exception is kept as the cause.
    """
    engine = engine.lower()

    # Dispatch table; engines not listed here use the DuckDuckGo scraper.
    scrapers = {
        "mojeek": scrape_mojeek,
        "qwant": scrape_qwant,
        "tapnav": scrape_qwant,
        "brave": scrape_brave,
    }
    scraper = scrapers.get(engine, scrape_duckduckgo)

    try:
        results = scraper(q)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error en {engine}: {str(e)}") from e
    return {"results": results}
def _images_from_qwant(q, max_results):
    """Query the Qwant image API and return normalized entries ([] on a non-200 reply)."""
    resp = requests.get(
        "https://api.qwant.com/v3/search/images",
        params={"q": q, "t": "images", "locale": "es_ES", "count": max_results},
        headers=HEADERS,
        timeout=5,
    )
    if resp.status_code != 200:
        return []
    items = resp.json().get("data", {}).get("result", {}).get("items", [])
    return [
        {
            "title": item.get("title", "Imagen"),
            "image_url": item.get("media", ""),
            "thumbnail_url": item.get("thumbnail", ""),
            "source_url": item.get("url", ""),
            "source_name": item.get("domain", ""),
        }
        for item in items
        if item.get("media")
    ]


def _images_from_yahoo(q, max_results):
    """Scrape Yahoo Images: each ``li.ld`` carries a JSON blob in its ``data`` attribute."""
    url = f"https://images.search.yahoo.com/search/images?p={urllib.parse.quote(q)}"
    resp = requests.get(url, headers=HEADERS, timeout=5)
    if resp.status_code != 200:
        return []
    soup = BeautifulSoup(resp.text, 'lxml')

    found = []
    for li in soup.select('li.ld'):
        data_attr = li.get('data')
        if not data_attr:
            continue
        try:
            item = json.loads(data_attr)
        except (TypeError, ValueError):
            # Skip entries whose attribute is not valid JSON.
            continue
        if not isinstance(item, dict) or not item.get('iurl'):
            continue
        found.append({
            "title": item.get('title', 'Imagen'),
            "image_url": item.get('iurl', ''),
            "thumbnail_url": item.get('ith', '') or item.get('iurl', ''),
            "source_url": item.get('rurl', ''),
            "source_name": item.get('surl', ''),
        })
    return found


def _images_from_ddg(q, max_results):
    """Fetch images through the duckduckgo_search client and normalize them."""
    found = []
    with DDGS() as ddgs:
        for img in ddgs.images(keywords=q, max_results=max_results):
            img_url = img.get("image", "") or img.get("url", "")
            if not img_url:
                continue
            found.append({
                "title": img.get("title", "Imagen"),
                "image_url": img_url,
                "thumbnail_url": img.get("thumbnail", "") or img_url,
                "source_url": img.get("url", ""),
                "source_name": img.get("source", ""),
            })
    return found


@app.get("/images")
def search_images(
    q: str = Query(..., description="Término de búsqueda para imágenes"),
    max_results: int = Query(40, ge=1, description="Cantidad máxima de imágenes")
):
    """Search images for *q*, trying several providers in order.

    Registered as the GET handler for ``/images``. Providers are tried in
    the order Qwant, Yahoo, DuckDuckGo; the first one that yields at least
    one image wins. Each provider builds its own list, so a provider that
    fails half-way cannot leak partial results into the next one.

    Args:
        q: Search terms (required).
        max_results: Upper bound on returned images; must be >= 1.

    Returns:
        ``{"results": [...]}`` with at most ``max_results`` dicts, each
        having the keys ``title``, ``image_url``, ``thumbnail_url``,
        ``source_url`` and ``source_name``.

    Raises:
        HTTPException: status 503 when every provider fails or returns
            nothing.
    """
    providers = (
        ("Intento 1 (Qwant)", _images_from_qwant),
        ("Intento 2 (Yahoo)", _images_from_yahoo),
        ("Intento 3 (DDG)", _images_from_ddg),
    )
    for label, fetch in providers:
        try:
            results = fetch(q, max_results)
        except Exception as e:
            # Best effort: report the failure and move on to the next provider.
            print(f"{label} falló: {e}")
            continue
        if results:
            return {"results": results[:max_results]}

    # Every provider failed or came back empty: surface that to the client.
    raise HTTPException(status_code=503, detail="Los servidores proxy de imágenes están temporalmente bloqueados. Intenta en unos minutos.")
# Script entry point: when this file is executed directly, serve the app
# with uvicorn on every network interface, port 7860.
if __name__ == "__main__":
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )