# Spaces: Running
# File size: 7,918 Bytes
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
import requests
from bs4 import BeautifulSoup
import uvicorn
import urllib.parse
import json
from duckduckgo_search import DDGS
# Application instance that the route decorators in this file register on.
app = FastAPI(
    title="Glimpse Scraping API",
    description="API de Scraping directo e Imágenes multi-motor",
)

# CORS is left wide open: every origin, method and header is accepted.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# worth confirming against what the frontend actually needs.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Headers passed to the outbound `requests.get` calls in this file: a
# desktop Chrome User-Agent and a Spanish-first Accept-Language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
}
# --- FUNCIONES DE SCRAPING WEB (TEXTO) ---
def _resolve_ddg_redirect(raw_url: str) -> str:
    """Return the destination behind a DuckDuckGo ``/l/?uddg=`` redirect link.

    Only links that point at DuckDuckGo's ``/l/`` redirector are unwrapped;
    any other URL is returned untouched, so direct links that carry their
    own ``&``-separated query parameters are no longer truncated.

    Args:
        raw_url: The ``href`` value taken from a result title link.

    Returns:
        The decoded ``uddg`` target when it is present and starts with
        ``http``; otherwise ``raw_url`` unchanged.
    """
    parsed = urllib.parse.urlparse(raw_url)
    host = parsed.netloc.lower()
    is_ddg_host = host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
    if is_ddg_host and parsed.path.startswith("/l/"):
        # parse_qs percent-decodes the value, so no extra unquote is needed.
        targets = urllib.parse.parse_qs(parsed.query).get("uddg", [])
        if targets and targets[0].startswith("http"):
            return targets[0]
    return raw_url


def scrape_duckduckgo(query: str):
    """Scrape DuckDuckGo's HTML endpoint and return its text results.

    Args:
        query: The search terms; they are URL-quoted before being sent.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``content``
        (``content`` is an empty string when a result has no snippet).

    Raises:
        Whatever ``requests.get`` or ``raise_for_status`` raise, e.g. on a
        network failure or a non-success HTTP status.
    """
    url = f"https://html.duckduckgo.com/html/?q={urllib.parse.quote(query)}"
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')

    results = []
    for result in soup.select('.result__body'):
        title_tag = result.select_one('.result__title a')
        if not title_tag:
            # Entries without a title link carry nothing usable.
            continue
        snippet_tag = result.select_one('.result__snippet')
        results.append({
            "title": title_tag.text.strip(),
            "url": _resolve_ddg_redirect(title_tag.get('href', '')),
            "content": snippet_tag.text.strip() if snippet_tag else "",
        })
    return results
def scrape_mojeek(query: str):
    """Scrape Mojeek's HTML results page and return its text results.

    Args:
        query: The search terms; they are URL-quoted before being sent.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``content``
        (``content`` is an empty string when a result has no summary).

    Raises:
        Whatever ``requests.get`` or ``raise_for_status`` raise.
    """
    encoded_query = urllib.parse.quote(query)
    response = requests.get(
        f"https://www.mojeek.com/search?q={encoded_query}&fmt=html",
        headers=HEADERS,
        timeout=10,
    )
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'lxml')

    hits = []
    for entry in page.select('ul.results-standard > li'):
        link = entry.select_one('a.ob')
        if not link:
            # List items without the result anchor are skipped.
            continue
        summary = entry.select_one('p.s')
        hits.append({
            "title": link.text.strip(),
            "url": link.get('href', ''),
            "content": summary.text.strip() if summary else "",
        })
    return hits
def scrape_qwant(query: str):
    """Scrape Qwant Lite and return its text results.

    Args:
        query: The search terms; they are URL-quoted before being sent.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``content``
        (``content`` is an empty string when a result has no snippet).

    Raises:
        Whatever ``requests.get`` or ``raise_for_status`` raise.
    """
    target = "https://lite.qwant.com/?q=" + urllib.parse.quote(query)
    reply = requests.get(target, headers=HEADERS, timeout=10)
    reply.raise_for_status()
    articles = BeautifulSoup(reply.text, 'lxml').select('article.result')

    def to_record(heading, blurb):
        # Shape one result the same way the other scrapers do.
        return {
            "title": heading.text.strip(),
            "url": heading.get('href', ''),
            "content": blurb.text.strip() if blurb else "",
        }

    pairs = (
        (node.select_one('h2 a'), node.select_one('.result-snippet'))
        for node in articles
    )
    # Articles without a heading link are dropped.
    return [to_record(heading, blurb) for heading, blurb in pairs if heading]
def scrape_brave(query: str):
    """Scrape Brave Search's results page and return its text results.

    Args:
        query: The search terms; they are URL-quoted before being sent.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``content``
        (``content`` is an empty string when no description is found).

    Raises:
        Whatever ``requests.get`` or ``raise_for_status`` raise.
    """
    search_url = "https://search.brave.com/search?q={}".format(
        urllib.parse.quote(query)
    )
    response = requests.get(search_url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    document = BeautifulSoup(response.text, 'lxml')

    collected = []
    for card in document.select('.snippet'):
        heading = card.select_one('.heading')
        anchor = card.select_one('a')
        if not (heading and anchor):
            # Both a heading and a link are required for a usable result.
            continue
        description = card.select_one('.snippet-content, .snippet-description')
        body_text = description.text.strip() if description else ""
        collected.append({
            "title": heading.text.strip(),
            "url": anchor.get('href', ''),
            "content": body_text,
        })
    return collected
# --- ENDPOINTS ---
@app.get("/")
def read_root():
    """Return a status message together with the list of API endpoints."""
    available = ["/search", "/images"]
    return {"status": "Glimpse API is running!", "endpoints": available}
@app.get("/search")
def search(
    q: str = Query(..., description="Término de búsqueda"),
    engine: str = Query("duckduckgo", description="Motor de búsqueda"),
):
    """Run a text search on the requested engine and return its results.

    Args:
        q: The search terms (required).
        engine: Engine name, matched case-insensitively. ``mojeek``,
            ``qwant``/``tapnav`` and ``brave`` select their scrapers; any
            other value falls back to DuckDuckGo.

    Returns:
        ``{"results": [...]}`` with the list produced by the scraper.

    Raises:
        HTTPException: Status 500 carrying the engine name and the error
            text when the selected scraper raises.
    """
    engine = engine.lower()
    # "tapnav" is served by the Qwant scraper.
    scrapers = {
        "mojeek": scrape_mojeek,
        "qwant": scrape_qwant,
        "tapnav": scrape_qwant,
        "brave": scrape_brave,
    }
    scraper = scrapers.get(engine, scrape_duckduckgo)

    try:
        found = scraper(q)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error en {engine}: {str(e)}")
    return {"results": found}
def _images_from_qwant(q: str, max_results: int) -> list:
    """Fetch image hits from Qwant's JSON image-search endpoint."""
    url = "https://api.qwant.com/v3/search/images"
    params = {"q": q, "t": "images", "locale": "es_ES", "count": max_results}
    resp = requests.get(url, params=params, headers=HEADERS, timeout=5)
    if resp.status_code != 200:
        return []
    items = resp.json().get("data", {}).get("result", {}).get("items", [])
    # Items without a "media" URL have no image to show and are skipped.
    return [
        {
            "title": item.get("title", "Imagen"),
            "image_url": item.get("media", ""),
            "thumbnail_url": item.get("thumbnail", ""),
            "source_url": item.get("url", ""),
            "source_name": item.get("domain", ""),
        }
        for item in items
        if item.get("media")
    ]


def _images_from_yahoo(q: str, max_results: int) -> list:
    """Scrape Yahoo Images, reading the JSON stored in each ``li.ld`` ``data`` attribute."""
    url = f"https://images.search.yahoo.com/search/images?p={urllib.parse.quote(q)}"
    resp = requests.get(url, headers=HEADERS, timeout=5)
    if resp.status_code != 200:
        return []
    soup = BeautifulSoup(resp.text, 'lxml')

    found = []
    for li in soup.select('li.ld'):
        data_attr = li.get('data')
        if not data_attr:
            continue
        try:
            item = json.loads(data_attr)
            image_url = item.get('iurl')
        except (ValueError, TypeError, AttributeError):
            # Malformed JSON, or JSON that is not an object: skip this entry
            # only, without hiding unrelated errors behind a bare except.
            continue
        if not image_url:
            continue
        found.append({
            "title": item.get('title', 'Imagen'),
            "image_url": image_url,
            "thumbnail_url": item.get('ith', '') or image_url,
            "source_url": item.get('rurl', ''),
            "source_name": item.get('surl', ''),
        })
    # The page is not asked for a specific count, so trim here.
    return found[:max_results]


def _images_from_ddg(q: str, max_results: int) -> list:
    """Fetch image hits through the ``duckduckgo_search`` client."""
    with DDGS() as ddgs:
        ddg_images = list(ddgs.images(keywords=q, max_results=max_results))

    found = []
    for img in ddg_images:
        img_url = img.get("image", "") or img.get("url", "")
        if not img_url:
            continue
        found.append({
            "title": img.get("title", "Imagen"),
            "image_url": img_url,
            "thumbnail_url": img.get("thumbnail", "") or img_url,
            "source_url": img.get("url", ""),
            "source_name": img.get("source", ""),
        })
    return found


@app.get("/images")
def search_images(
    q: str = Query(..., description="Término de búsqueda para imágenes"),
    max_results: int = Query(40, description="Cantidad máxima de imágenes"),
):
    """Search for images, trying Qwant, then Yahoo, then DuckDuckGo.

    Each provider is tried in order; the first one that yields at least one
    image wins. A provider that raises is logged to stdout and skipped, and
    its partial output is discarded so results from different providers are
    never mixed in one response.

    Args:
        q: The search terms (required).
        max_results: Maximum number of images requested from each provider.

    Returns:
        ``{"results": [...]}`` where every entry has the keys ``title``,
        ``image_url``, ``thumbnail_url``, ``source_url`` and ``source_name``.

    Raises:
        HTTPException: Status 503 when no provider returned any image.
    """
    providers = (
        ("Intento 1 (Qwant)", _images_from_qwant),
        ("Intento 2 (Yahoo)", _images_from_yahoo),
        ("Intento 3 (DDG)", _images_from_ddg),
    )
    for label, fetch in providers:
        try:
            results = fetch(q, max_results)
        except Exception as e:
            # Best-effort fallback chain: record the failure and move on.
            print(f"{label} falló: {e}")
            continue
        if results:
            return {"results": results}

    # Every provider failed or came back empty: report it to the frontend.
    raise HTTPException(status_code=503, detail="Los servidores proxy de imágenes están temporalmente bloqueados. Intenta en unos minutos.")
if __name__ == "__main__":
    # When executed as a script, serve the app on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)