Spaces:
Paused
Paused
Update routers/search.py
Browse files- routers/search.py +339 -345
routers/search.py
CHANGED
|
@@ -12,415 +12,409 @@ import struct
|
|
| 12 |
from typing import Optional, Tuple, List, Dict
|
| 13 |
import base64
|
| 14 |
from functools import lru_cache
|
|
|
|
|
|
|
| 15 |
import time
|
| 16 |
-
from concurrent.futures import ProcessPoolExecutor
|
| 17 |
-
import multiprocessing
|
| 18 |
-
|
| 19 |
-
# Use uvloop se disponível para melhor performance async
|
| 20 |
-
try:
|
| 21 |
-
import uvloop
|
| 22 |
-
uvloop.install()
|
| 23 |
-
except ImportError:
|
| 24 |
-
pass
|
| 25 |
-
|
| 26 |
-
def _init_worker():
|
| 27 |
-
"""Inicializa worker do processo filho"""
|
| 28 |
-
# Configurações específicas do worker se necessário
|
| 29 |
-
import signal
|
| 30 |
-
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
| 31 |
|
| 32 |
router = APIRouter()
|
| 33 |
|
| 34 |
-
# Pool de
|
| 35 |
-
|
| 36 |
-
max_workers=min(
|
| 37 |
-
|
| 38 |
)
|
| 39 |
|
| 40 |
-
# Cache
|
| 41 |
_url_cache = {}
|
| 42 |
-
|
| 43 |
-
_cache_max_size = 2000
|
| 44 |
-
_cache_ttl = 3600 # 1 hora
|
| 45 |
|
| 46 |
-
@
|
| 47 |
-
def
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
try:
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
slash_count = 0
|
| 60 |
-
end_idx = 0
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
slash_count += 1
|
| 65 |
-
if slash_count == 3:
|
| 66 |
-
end_idx = i
|
| 67 |
-
break
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
return url
|
| 76 |
|
| 77 |
-
# Regex pré-compilado para máxima performance
|
| 78 |
-
_IMAGE_PATTERN = re.compile(
|
| 79 |
-
r'https?://[^\s"\'<>]+?\.(?:jpg|png|webp|jpeg)(?:\?[^\s"\'<>]*)?',
|
| 80 |
-
re.IGNORECASE | re.MULTILINE
|
| 81 |
-
)
|
| 82 |
|
| 83 |
-
def
|
| 84 |
-
"""
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
#
|
| 89 |
-
|
| 90 |
-
unique_urls = []
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
seen.add(cleaned)
|
| 96 |
-
unique_urls.append(cleaned)
|
| 97 |
-
|
| 98 |
-
if len(unique_urls) >= 80: # Para cedo se já temos suficientes
|
| 99 |
-
break
|
| 100 |
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
-
# Estruturas de dados otimizadas para parsing
|
| 104 |
-
_JPEG_MARKERS = frozenset([b'\xff\xc0', b'\xff\xc1', b'\xff\xc2', b'\xff\xc3'])
|
| 105 |
-
_PNG_SIGNATURE = b'\x89PNG\r\n\x1a\n'
|
| 106 |
-
_WEBP_SIGNATURE = b'RIFF'
|
| 107 |
|
| 108 |
-
def
|
| 109 |
-
"""
|
|
|
|
|
|
|
| 110 |
if len(data) < 24:
|
| 111 |
return None
|
| 112 |
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
pass
|
| 135 |
-
|
| 136 |
-
# WebP - parsing simplificado
|
| 137 |
-
elif data[:4] == _WEBP_SIGNATURE and len(data) >= 30:
|
| 138 |
-
try:
|
| 139 |
if data[12:16] == b'VP8 ':
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
if
|
| 143 |
-
return
|
| 144 |
-
|
| 145 |
-
|
| 146 |
|
| 147 |
return None
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
| 151 |
if not image_data or len(image_data) < 100:
|
| 152 |
return None
|
| 153 |
|
| 154 |
try:
|
| 155 |
-
#
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
# Cálculo de proporções com operações inteiras
|
| 171 |
-
w, h = image.size
|
| 172 |
-
if w <= max_size and h <= max_size:
|
| 173 |
-
thumbnail = image # Não redimensiona se já é pequena
|
| 174 |
-
else:
|
| 175 |
-
# Proporções com divisão inteira otimizada
|
| 176 |
if w > h:
|
| 177 |
-
new_h = (h * max_size) // w
|
| 178 |
-
new_w = max_size if new_h > 0 else max_size
|
| 179 |
-
new_h = max(1, new_h)
|
| 180 |
else:
|
| 181 |
-
new_w = (w * max_size) // h
|
| 182 |
-
new_h = max_size if new_w > 0 else max_size
|
| 183 |
-
new_w = max(1, new_w)
|
| 184 |
|
| 185 |
-
# Resize com filtro mais rápido
|
| 186 |
-
thumbnail = image.resize((new_w, new_h), Image.Resampling.
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
return base64.b64encode(buffer.getvalue()).decode('ascii')
|
| 197 |
-
|
| 198 |
-
except Exception:
|
| 199 |
return None
|
| 200 |
|
| 201 |
-
def _is_cache_valid(key: str) -> bool:
|
| 202 |
-
"""Verifica se cache ainda é válido"""
|
| 203 |
-
if key not in _cache_timestamps:
|
| 204 |
-
return False
|
| 205 |
-
return (time.time() - _cache_timestamps[key]) < _cache_ttl
|
| 206 |
|
| 207 |
-
def
|
| 208 |
-
"""
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
-
|
| 213 |
-
expired_keys = [
|
| 214 |
-
key for key, timestamp in _cache_timestamps.items()
|
| 215 |
-
if (current_time - timestamp) > _cache_ttl
|
| 216 |
-
]
|
| 217 |
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
_http_client = httpx.AsyncClient(
|
| 230 |
-
timeout=httpx.Timeout(8.0, connect=3.0),
|
| 231 |
-
limits=httpx.Limits(
|
| 232 |
-
max_keepalive_connections=50,
|
| 233 |
-
max_connections=80,
|
| 234 |
-
keepalive_expiry=30.0
|
| 235 |
-
),
|
| 236 |
-
http2=False,
|
| 237 |
-
headers={
|
| 238 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
| 239 |
-
'Accept': 'image/*,*/*;q=0.8',
|
| 240 |
-
'Accept-Encoding': 'gzip, deflate',
|
| 241 |
-
'Connection': 'keep-alive'
|
| 242 |
-
}
|
| 243 |
-
)
|
| 244 |
-
return _http_client
|
| 245 |
-
|
| 246 |
-
async def _process_image_ultra_fast(url: str, include_thumbnail: bool, semaphore: asyncio.Semaphore) -> Dict:
|
| 247 |
-
"""Processamento ultra-otimizado de uma imagem"""
|
| 248 |
-
async with semaphore:
|
| 249 |
-
# Cache check
|
| 250 |
-
cache_key = f"{url}_{include_thumbnail}"
|
| 251 |
-
if _is_cache_valid(cache_key):
|
| 252 |
-
return _url_cache[cache_key].copy()
|
| 253 |
-
|
| 254 |
-
client = await _get_http_client()
|
| 255 |
-
clean_url = url.replace('\\u003d', '=').replace('\\u0026', '&').replace('\\\\', '').replace('\\/', '/')
|
| 256 |
-
|
| 257 |
-
width, height, thumbnail_b64 = None, None, None
|
| 258 |
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
range_size = 16384 if include_thumbnail else 4096
|
| 262 |
-
headers = {'Range': f'bytes=0-{range_size}'}
|
| 263 |
-
|
| 264 |
-
response = await client.get(clean_url, headers=headers, timeout=5.0)
|
| 265 |
-
|
| 266 |
-
if response.status_code in (200, 206) and len(response.content) > 200:
|
| 267 |
-
data = response.content
|
| 268 |
-
|
| 269 |
-
# Parsing de dimensões
|
| 270 |
-
dimensions = _get_image_dimensions_lightning(data)
|
| 271 |
-
if dimensions:
|
| 272 |
-
width, height = dimensions
|
| 273 |
-
|
| 274 |
-
# Thumbnail em processo separado se necessário
|
| 275 |
-
if include_thumbnail and len(data) > 1000:
|
| 276 |
-
loop = asyncio.get_event_loop()
|
| 277 |
-
thumbnail_b64 = await loop.run_in_executor(
|
| 278 |
-
_process_pool,
|
| 279 |
-
_create_thumbnail_process,
|
| 280 |
-
data
|
| 281 |
-
)
|
| 282 |
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
|
|
|
| 289 |
if dimensions:
|
| 290 |
width, height = dimensions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
thumbnail_b64 = await loop.run_in_executor(
|
| 295 |
-
_process_pool,
|
| 296 |
-
_create_thumbnail_process,
|
| 297 |
-
response.content
|
| 298 |
-
)
|
| 299 |
-
except:
|
| 300 |
-
pass
|
| 301 |
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
"""Busca imagens com performance máxima"""
|
| 326 |
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
"tbm": "isch",
|
| 330 |
-
"q": q,
|
| 331 |
-
"tbs": "isz:l,iar:w", # Imagens grandes, formato wide
|
| 332 |
-
"safe": "off"
|
| 333 |
-
}
|
| 334 |
|
| 335 |
-
|
| 336 |
-
"
|
| 337 |
-
"
|
| 338 |
-
"
|
| 339 |
-
"Accept-Encoding": "gzip, deflate",
|
| 340 |
-
"Referer": "https://www.google.com/"
|
| 341 |
}
|
| 342 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
try:
|
| 344 |
-
#
|
| 345 |
-
|
| 346 |
-
response = await client.get(
|
| 347 |
-
"https://www.google.com/search",
|
| 348 |
-
params=params,
|
| 349 |
-
headers=headers
|
| 350 |
-
)
|
| 351 |
-
|
| 352 |
-
if response.status_code != 200:
|
| 353 |
-
raise HTTPException(status_code=response.status_code, detail="Erro na busca")
|
| 354 |
-
|
| 355 |
-
# Extração ultra-rápida
|
| 356 |
-
image_urls = _extract_images_vectorized(response.text)
|
| 357 |
-
|
| 358 |
-
if not image_urls:
|
| 359 |
-
return JSONResponse(content={
|
| 360 |
-
"query": q,
|
| 361 |
-
"total_found": 0,
|
| 362 |
-
"images": []
|
| 363 |
-
})
|
| 364 |
|
| 365 |
-
#
|
| 366 |
-
semaphore = asyncio.Semaphore(25) # Concorrência controlada
|
| 367 |
-
|
| 368 |
-
# Cria tasks limitadas a 50 imagens máximo
|
| 369 |
-
tasks = [
|
| 370 |
-
_process_image_ultra_fast(url, include_thumbnails, semaphore)
|
| 371 |
-
for url in image_urls[:60] # Processa um pouco mais para compensar falhas
|
| 372 |
-
]
|
| 373 |
-
|
| 374 |
-
# Processa tudo em paralelo
|
| 375 |
results = await asyncio.gather(*tasks, return_exceptions=True)
|
| 376 |
|
| 377 |
-
#
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
if
|
| 381 |
-
|
| 382 |
-
result.get('height', 0) > 0)
|
| 383 |
-
]
|
| 384 |
|
| 385 |
-
|
| 386 |
-
valid_images.sort(key=lambda x: x.get('width', 0) * x.get('height', 0), reverse=True)
|
| 387 |
-
final_images = valid_images[:50]
|
| 388 |
|
| 389 |
-
return JSONResponse(content={
|
| 390 |
-
"query": q,
|
| 391 |
-
"min_width_filter": min_width,
|
| 392 |
-
"total_found": len(final_images),
|
| 393 |
-
"thumbnails_included": include_thumbnails,
|
| 394 |
-
"images": final_images
|
| 395 |
-
})
|
| 396 |
-
|
| 397 |
-
except httpx.TimeoutException:
|
| 398 |
-
raise HTTPException(status_code=408, detail="Timeout na requisição")
|
| 399 |
except Exception as e:
|
| 400 |
-
|
|
|
|
|
|
|
|
|
|
| 401 |
|
|
|
|
| 402 |
@router.get("/thumbnail")
|
| 403 |
async def get_thumbnail_fast(
|
| 404 |
-
url: str = Query(..., description="URL da imagem"),
|
| 405 |
-
size: int = Query(
|
| 406 |
):
|
| 407 |
-
"""
|
|
|
|
|
|
|
| 408 |
try:
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
if result.get('thumbnail'):
|
| 413 |
-
return JSONResponse(content={
|
| 414 |
-
"url": result['url'],
|
| 415 |
-
"thumbnail": result['thumbnail'],
|
| 416 |
-
"dimensions": f"{result.get('width', 0)}x{result.get('height', 0)}"
|
| 417 |
-
})
|
| 418 |
-
else:
|
| 419 |
-
raise HTTPException(status_code=500, detail="Erro ao criar miniatura")
|
| 420 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
except Exception as e:
|
| 422 |
raise HTTPException(status_code=500, detail=f"Erro: {str(e)}")
|
| 423 |
|
| 424 |
-
|
|
|
|
| 425 |
import atexit
|
| 426 |
-
atexit.register(lambda:
|
|
|
|
| 12 |
from typing import Optional, Tuple, List, Dict
|
| 13 |
import base64
|
| 14 |
from functools import lru_cache
|
| 15 |
+
import aiofiles
|
| 16 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 17 |
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
router = APIRouter()
|
| 20 |
|
| 21 |
+
# Thread pool that runs thumbnail generation off the event loop.
# Worker count is CPU count + 4, capped at 32.
thumbnail_executor = ThreadPoolExecutor(
    thread_name_prefix="thumbnail_",
    max_workers=min((os.cpu_count() or 1) + 4, 32),
)

# In-memory cache of already-processed image URLs and its size limit.
_cache_max_size = 1000
_url_cache = {}
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
@router.get("/search")
async def search(
    q: str = Query(..., description="Termo de pesquisa para imagens"),
    min_width: int = Query(1200, description="Largura mínima das imagens (padrão: 1200px)"),
    include_thumbnails: bool = Query(True, description="Incluir miniaturas base64 nas respostas")
):
    """Search Google Images for ``q`` and return up to 50 images, widest first.

    The result page is scraped for image URLs, each URL is probed for its
    real dimensions (and optionally a base64 thumbnail), and only images at
    least ``min_width`` pixels wide are kept. When fewer than 20 images
    survive, a second query with a different size filter is issued and its
    new results are merged in.

    Raises:
        HTTPException: with the upstream status code when the first Google
            request does not return 200, 408 on timeout, 500 on any other
            failure.
    """
    started = time.perf_counter()

    google_images_url = "http://www.google.com/search"

    params = {
        "tbm": "isch",
        "q": q,
        "start": 0,
        "sa": "N",
        "asearch": "arc",
        "cs": "1",
        "tbs": "isz:l",
        "async": "arc_id:srp_GgSMaOPQOtL_5OUPvbSTOQ_110,ffilt:all,ve_name:MoreResultsContainer,inf:1,_id:arc-srp_GgSMaOPQOtL_5OUPvbSTOQ_110,_pms:s,_fmt:pc"
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Referer": "https://www.google.com/"
    }

    def is_large_enough(img: Dict) -> bool:
        # `or 0` guards against entries whose dimensions are present but None.
        return (img.get('width') or 0) >= min_width and (img.get('height') or 0) > 0

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(google_images_url, params=params, headers=headers)

        if response.status_code != 200:
            raise HTTPException(status_code=response.status_code, detail="Erro ao buscar no Google Imagens")

        images = extract_images_from_response_optimized(response.text)
        enriched_images = await enrich_images_ultra_fast(images, include_thumbnails)
        valid_images = [img for img in enriched_images if is_large_enough(img)]

        # Too few hits: retry with a different size filter and merge the new ones.
        if len(valid_images) < 20:
            fallback_params = {**params, "tbs": "isz:lt,islt:4mp"}

            async with httpx.AsyncClient(timeout=30.0) as client:
                response2 = await client.get(google_images_url, params=fallback_params, headers=headers)

            if response2.status_code == 200:
                additional_images = extract_images_from_response_optimized(response2.text)
                additional_enriched = await enrich_images_ultra_fast(additional_images, include_thumbnails)

                seen_urls = {img.get('url') for img in valid_images}
                for img in additional_enriched:
                    if img.get('url') not in seen_urls and is_large_enough(img):
                        valid_images.append(img)
                        seen_urls.add(img.get('url'))

        valid_images.sort(key=lambda x: x.get('width', 0), reverse=True)
        final_images = valid_images[:50]

        total_time = time.perf_counter() - started

        return JSONResponse(content={
            "query": q,
            "min_width_filter": min_width,
            "total_found": len(final_images),
            "thumbnails_included": include_thumbnails,
            "processing_time": round(total_time, 2),
            "images": final_images
        })

    except HTTPException:
        # Already carries the intended status/detail; do not rewrap it as a 500.
        raise
    except httpx.TimeoutException as exc:
        raise HTTPException(status_code=408, detail="Timeout na requisição ao Google") from exc
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erro ao executar a busca: {str(e)}") from e
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
@lru_cache(maxsize=500)
def clean_wikimedia_url_cached(url: str) -> str:
    """Rewrite a Wikimedia thumbnail URL to point at the original file.

    A URL such as ``.../commons/thumb/a/ab/File.jpg/220px-File.jpg`` becomes
    ``.../commons/a/ab/File.jpg``: the ``/thumb`` component is dropped and
    only the first three path components after it are kept. Any URL that is
    not a Wikimedia thumbnail, contains ``/thumb/`` more than once, or has
    fewer than three components after ``/thumb/`` is returned unchanged.

    Results are memoised (up to 500 distinct URLs).
    """
    if 'wikimedia.org' not in url or '/thumb/' not in url:
        return url

    parts = url.split('/thumb/')
    if len(parts) != 2:
        return url

    before_thumb, after_thumb = parts
    path_parts = after_thumb.split('/')
    if len(path_parts) < 3:
        return url

    original_path = '/'.join(path_parts[:3])
    return f"{before_thumb}/{original_path}"
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
+
def extract_images_from_response_optimized(response_text: str) -> List[Dict]:
    """Collect the distinct image URLs found in a search response body.

    Only the first 200 regex matches (jpg/png/webp/jpeg links) are looked
    at. Each one is passed through ``clean_wikimedia_url_cached`` and
    duplicates are dropped while keeping first-seen order. Every returned
    entry has ``width`` and ``height`` set to ``None``.
    """
    matcher = re.compile(r'https?://[^\s"\'<>]+?\.(?:jpg|png|webp|jpeg)\b', re.IGNORECASE)
    candidates = matcher.findall(response_text)[:200]

    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_urls = dict.fromkeys(clean_wikimedia_url_cached(raw) for raw in candidates)

    return [{"url": link, "width": None, "height": None} for link in unique_urls]
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
+
# JPEG start-of-frame markers (SOF0..SOF15) minus DHT (C4), JPG (C8) and DAC (CC),
# which share the numeric range but are not frame headers.
_JPEG_SOF_MARKERS = frozenset(range(0xC0, 0xD0)) - {0xC4, 0xC8, 0xCC}
_PNG_MAGIC = b'\x89PNG\r\n\x1a\n'


def _jpeg_dimensions(data: bytes) -> Optional[Tuple[int, int]]:
    """Walk JPEG marker segments and read the size from the first frame header."""
    end = len(data)
    pos = 2  # skip the SOI marker
    while pos + 4 <= end:
        if data[pos] != 0xFF:
            return None  # lost marker sync; not a well-formed header
        marker = data[pos + 1]
        if marker == 0xFF:
            pos += 1  # fill byte before a marker
            continue
        if marker == 0x01 or 0xD0 <= marker <= 0xD7:
            pos += 2  # TEM / RSTn carry no length field
            continue
        if marker in (0xD8, 0xD9, 0xDA):
            return None  # SOI/EOI/SOS before any frame header
        (segment_length,) = struct.unpack_from('>H', data, pos + 2)
        if segment_length < 2:
            return None
        if marker in _JPEG_SOF_MARKERS:
            if pos + 9 > end:
                return None  # frame header is cut off; caller may fetch more bytes
            height, width = struct.unpack_from('>HH', data, pos + 5)
            return (width, height) if width > 0 and height > 0 else None
        # Skipping whole segments avoids matching SOF bytes inside payloads,
        # e.g. the thumbnail embedded in an EXIF block.
        pos += 2 + segment_length
    return None


def _webp_dimensions(data: bytes) -> Optional[Tuple[int, int]]:
    """Read the size from a WebP header; the caller guarantees at least 30 bytes."""
    chunk = data[12:16]
    if chunk == b'VP8 ':  # lossy bitstream
        width, height = struct.unpack_from('<HH', data, 26)
        width &= 0x3FFF
        height &= 0x3FFF
    elif chunk == b'VP8L':  # lossless bitstream
        if data[20] != 0x2F:
            return None
        (bits,) = struct.unpack_from('<I', data, 21)
        width = (bits & 0x3FFF) + 1
        height = ((bits >> 14) & 0x3FFF) + 1
    elif chunk == b'VP8X':  # extended format: 24-bit canvas size minus one
        width = int.from_bytes(data[24:27], 'little') + 1
        height = int.from_bytes(data[27:30], 'little') + 1
    else:
        return None
    return (width, height) if width > 0 and height > 0 else None


def get_image_size_super_fast(data: bytes) -> Optional[Tuple[int, int]]:
    """Return ``(width, height)`` parsed from the leading bytes of an image.

    Supports JPEG, PNG and WebP (``VP8 ``, ``VP8L`` and ``VP8X`` chunks).
    ``data`` may be a truncated prefix of the file. ``None`` is returned when
    the format is not recognised, the header is incomplete, or fewer than 24
    bytes are supplied. The function never raises on malformed input.
    """
    if len(data) < 24:
        return None

    try:
        if data[:2] == b'\xff\xd8':
            return _jpeg_dimensions(data)

        if data[:8] == _PNG_MAGIC:
            width, height = struct.unpack_from('>II', data, 16)
            return (width, height) if width > 0 and height > 0 else None

        if data[:4] == b'RIFF' and data[8:12] == b'WEBP' and len(data) >= 30:
            return _webp_dimensions(data)
    except (struct.error, IndexError):
        return None

    return None
|
| 208 |
|
| 209 |
+
|
| 210 |
+
def create_thumbnail_cpu_optimized(image_data: bytes, max_size: int = 200) -> Optional[str]:
    """Build a JPEG thumbnail and return it as a ``data:image/jpeg;base64`` URI.

    The image is flattened to RGB (alpha channels are composited over a
    white background), then resized so that its longer side equals
    ``max_size`` while the aspect ratio is kept. Returns ``None`` when fewer
    than 100 bytes are supplied or when the image cannot be processed.
    """
    if not image_data or len(image_data) < 100:
        return None

    try:
        with Image.open(io.BytesIO(image_data)) as source:
            if source.mode in ('RGBA', 'LA'):
                # Composite over white using the alpha band as the paste mask.
                canvas = Image.new('RGB', source.size, (255, 255, 255))
                canvas.paste(source, mask=source.split()[-1])
                rgb = canvas
            elif source.mode != 'RGB':
                rgb = source.convert('RGB')
            else:
                rgb = source

            width, height = rgb.size
            if width > height:
                target = (max_size, max(1, (height * max_size) // width))
            else:
                target = (max(1, (width * max_size) // height), max_size)

            resized = rgb.resize(target, Image.Resampling.BILINEAR)

            out = io.BytesIO()
            resized.save(out, format='JPEG', quality=80, optimize=False)
            encoded = base64.b64encode(out.getvalue()).decode('utf-8')
            return f"data:image/jpeg;base64,{encoded}"

    except Exception:
        return None
|
| 248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
+
async def download_and_process_image(session: httpx.AsyncClient, url: str, include_thumbnail: bool) -> Dict:
    """Fetch one image and return its URL, dimensions and optional thumbnail.

    The image is first requested with increasingly large ``Range`` headers
    so that dimensions can usually be read from a small prefix. If that is
    not enough, the whole file is downloaded once and accepted only when it
    is under 2 MB. Thumbnails are generated in ``thumbnail_executor`` so the
    event loop is not blocked.

    Network and decoding errors are treated as best-effort failures: the
    function does not raise for them and reports ``None`` for whatever could
    not be determined. Task cancellation is propagated.

    Returns:
        ``{"url", "width", "height"}`` plus ``"thumbnail"`` when
        ``include_thumbnail`` is true. Only complete results are cached, so a
        transient failure is retried on the next call.
    """
    cache_key = f"{url}_{include_thumbnail}"
    cached = _url_cache.get(cache_key)
    if cached is not None:
        return cached.copy()

    # Undo the JSON/JS escaping found in URLs scraped from the result page.
    clean_url = url.replace('\\u003d', '=').replace('\\u0026', '&').replace('\\\\', '').replace('\\/', '/')

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'image/*',
        'Connection': 'close'
    }

    width, height, thumbnail_b64 = None, None, None
    loop = asyncio.get_running_loop()

    def is_complete() -> bool:
        return bool(width and height and (thumbnail_b64 or not include_thumbnail))

    ranges = ('0-8192', '0-32768', '0-131072') if include_thumbnail else ('0-2048',)

    for byte_range in ranges:
        try:
            response = await session.get(
                clean_url,
                headers={**headers, 'Range': f'bytes={byte_range}'},
                timeout=6.0,
            )
            if response.status_code not in (200, 206) or len(response.content) <= 100:
                continue

            if not (width and height):
                dimensions = get_image_size_super_fast(response.content)
                if dimensions:
                    width, height = dimensions

            if include_thumbnail and not thumbnail_b64:
                thumbnail_b64 = await loop.run_in_executor(
                    thumbnail_executor,
                    create_thumbnail_cpu_optimized,
                    response.content
                )
        except Exception:
            # Best effort: try the next, larger range. `Exception` (not a bare
            # except) so that asyncio.CancelledError still cancels the task.
            continue

        if is_complete():
            break

    # Last resort: download the whole file once, without a Range header.
    if not is_complete():
        try:
            response = await session.get(clean_url, headers=headers, timeout=8.0)
            if response.status_code == 200 and len(response.content) < 2000000:  # max 2 MB
                if not (width and height):
                    try:
                        with Image.open(io.BytesIO(response.content)) as img:
                            width, height = img.size
                    except Exception:
                        pass  # undecodable image: leave the dimensions unknown

                if include_thumbnail and not thumbnail_b64:
                    thumbnail_b64 = await loop.run_in_executor(
                        thumbnail_executor,
                        create_thumbnail_cpu_optimized,
                        response.content
                    )
        except Exception:
            pass  # best effort: report whatever was gathered so far

    result = {
        "url": clean_url,
        "width": width,
        "height": height
    }

    if include_thumbnail:
        result["thumbnail"] = thumbnail_b64

    # Cache only complete results so that failures are not remembered forever;
    # when the cache is full, evict the oldest entry instead of refusing new ones.
    if is_complete():
        if len(_url_cache) >= _cache_max_size:
            _url_cache.pop(next(iter(_url_cache)), None)
        _url_cache[cache_key] = result.copy()

    return result
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
async def enrich_images_ultra_fast(images: List[Dict], include_thumbnails: bool = True) -> List[Dict]:
    """Download and measure many images concurrently.

    Each entry of ``images`` must carry a ``"url"`` key. All URLs are
    processed through ``download_and_process_image`` over one shared HTTP/1.1
    client, with at most 30 downloads in flight at a time.

    Returns:
        The processed entries for which both width and height were
        determined. Entries that failed are dropped. An empty input, or an
        unexpected error while setting up the batch, yields ``[]``.
    """
    if not images:
        return []

    limits = httpx.Limits(
        max_keepalive_connections=100,
        max_connections=150,
        keepalive_expiry=30.0
    )
    semaphore = asyncio.Semaphore(30)

    try:
        # http2 is deliberately off: HTTP/1.1 with a large connection pool is used.
        async with httpx.AsyncClient(
            timeout=httpx.Timeout(10.0),
            limits=limits,
            http2=False
        ) as client:

            async def process_single_image(image_data: Dict) -> Dict:
                async with semaphore:
                    return await download_and_process_image(client, image_data["url"], include_thumbnails)

            results = await asyncio.gather(
                *(process_single_image(img) for img in images),
                return_exceptions=True
            )
    except Exception:
        # Best effort: callers treat "no enriched images" as an empty result.
        return []

    # With return_exceptions=True a slot may hold any BaseException (including
    # CancelledError), so keep only real result dicts.
    return [
        result for result in results
        if isinstance(result, dict) and result.get('width') and result.get('height')
    ]
|
| 389 |
+
|
| 390 |
|
| 391 |
+
# Endpoint adicional otimizado
|
| 392 |
@router.get("/thumbnail")
async def get_thumbnail_fast(
    url: str = Query(..., description="URL da imagem para gerar miniatura"),
    size: int = Query(200, description="Tamanho máximo da miniatura em pixels")
):
    """Return a base64 thumbnail and the dimensions of a single image URL.

    Raises:
        HTTPException: 500 when the image cannot be fetched or no thumbnail
            could be produced.
    """
    # NOTE(review): `size` is only echoed back in the response; it is not
    # passed to download_and_process_image, so the thumbnail size is not
    # controlled by this parameter.
    # NOTE(review): `url` is caller-supplied and fetched server-side with no
    # host restriction (SSRF exposure) - confirm this endpoint is not public.
    try:
        async with httpx.AsyncClient(timeout=8.0) as client:
            result = await download_and_process_image(client, url, True)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erro: {str(e)}") from e

    # Raised outside the try block so the message is not rewrapped as "Erro: ...".
    if not result.get('thumbnail'):
        raise HTTPException(status_code=500, detail="Erro ao criar miniatura")

    return JSONResponse(content={
        "url": result['url'],
        "thumbnail": result['thumbnail'],
        "dimensions": f"{result.get('width') or 0}x{result.get('height') or 0}",
        "size": size
    })
|
| 416 |
|
| 417 |
+
|
| 418 |
+
# Shut the thumbnail thread pool down at interpreter exit without waiting
# for pending work.
import atexit

atexit.register(thumbnail_executor.shutdown, wait=False)
|