Upload 13 files
Browse files- src/COMPLETE_FILE_LIST.md +311 -0
- src/__init__ (1).py +7 -0
- src/__init__.py +23 -0
- src/comparator.py +114 -0
- src/cross_referencer.py +458 -0
- src/demo_bypass.py +340 -0
- src/embedding_engine.py +62 -0
- src/face_processor.py +87 -0
- src/ocr_extractor.py +420 -0
- src/stealth_engine.py +454 -0
- src/test_basic.py +248 -0
- src/usage_example.py +273 -0
- src/vector_db.py +173 -0
src/COMPLETE_FILE_LIST.md
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ✅ LISTA COMPLETA DE ARCHIVOS - VERIFICACIÓN FINAL
|
| 2 |
+
|
| 3 |
+
## 📦 PROYECTO COMPLETO: 37 archivos
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## 🔴 ARCHIVOS CRÍTICOS EN RAÍZ (11 archivos)
|
| 8 |
+
|
| 9 |
+
| # | Archivo | Tamaño | Estado | Descripción |
|
| 10 |
+
|---|---------|--------|--------|-------------|
|
| 11 |
+
| 1 | app.py | 16 KB | ✅ | FastAPI server principal |
|
| 12 |
+
| 2 | requirements.txt | 1.1 KB | ✅ | Dependencias (CORREGIDO mediapipe) |
|
| 13 |
+
| 3 | start.py | 2.7 KB | ✅ | Script de inicio robusto |
|
| 14 |
+
| 4 | Dockerfile | 2 KB | ✅ | Config Docker para HF Spaces |
|
| 15 |
+
| 5 | config.yaml | 3 KB | ✅ | Configuración del sistema |
|
| 16 |
+
| 6 | .env.example | 1.2 KB | ✅ | Template variables de entorno |
|
| 17 |
+
| 7 | env.example.txt | 1.2 KB | ✅ | Copia visible de .env.example |
|
| 18 |
+
| 8 | .gitignore | 800 B | ✅ | Archivos a ignorar |
|
| 19 |
+
| 9 | verify_files.py | 6 KB | ✅ | Script de verificación automática |
|
| 20 |
+
| 10 | LICENSE | 2 KB | ✅ | MIT License + Ethical Notice |
|
| 21 |
+
| 11 | requirements_FIXED.txt | 1.1 KB | ✅ | Backup de requirements corregido |
|
| 22 |
+
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
+
## 🧠 MÓDULOS CORE - src/ (9 archivos)
|
| 26 |
+
|
| 27 |
+
### Archivos Principales (7 archivos)
|
| 28 |
+
| # | Archivo | Tamaño | Estado | Descripción |
|
| 29 |
+
|---|---------|--------|--------|-------------|
|
| 30 |
+
| 12 | src/__init__.py | 400 B | ✅ | Inicialización del paquete |
|
| 31 |
+
| 13 | src/face_processor.py | 2 KB | ✅ | Detección MTCNN |
|
| 32 |
+
| 14 | src/embedding_engine.py | 1.5 KB | ✅ | Generación embeddings ArcFace |
|
| 33 |
+
| 15 | src/comparator.py | 2 KB | ✅ | Comparación con umbrales adaptativos |
|
| 34 |
+
| 16 | **src/ocr_extractor.py** | **12 KB** | ✅ | ⭐ MÓDULO CLAVE #1: OCR extractor |
|
| 35 |
+
| 17 | **src/cross_referencer.py** | **10 KB** | ✅ | ⭐ MÓDULO CLAVE #2: Cross-referencer |
|
| 36 |
+
| 18 | src/vector_db.py | 3 KB | ✅ | Almacenamiento Qdrant |
|
| 37 |
+
|
| 38 |
+
### Scrapers (2 archivos)
|
| 39 |
+
| # | Archivo | Tamaño | Estado | Descripción |
|
| 40 |
+
|---|---------|--------|--------|-------------|
|
| 41 |
+
| 19 | src/scrapers/__init__.py | 200 B | ✅ | Inicialización scrapers |
|
| 42 |
+
| 20 | **src/scrapers/stealth_engine.py** | **8 KB** | ✅ | ⭐ MÓDULO CLAVE #3: Stealth scraping |
|
| 43 |
+
|
| 44 |
+
---
|
| 45 |
+
|
| 46 |
+
## 💡 EJEMPLOS - examples/ (2 archivos)
|
| 47 |
+
|
| 48 |
+
| # | Archivo | Tamaño | Estado | Descripción |
|
| 49 |
+
|---|---------|--------|--------|-------------|
|
| 50 |
+
| 21 | examples/usage_example.py | 5 KB | ✅ | Ejemplos interactivos de uso |
|
| 51 |
+
| 22 | examples/demo_bypass.py | 7 KB | ✅ | Demo del bypass de PimEyes |
|
| 52 |
+
|
| 53 |
+
---
|
| 54 |
+
|
| 55 |
+
## 🧪 TESTS - tests/ (1 archivo)
|
| 56 |
+
|
| 57 |
+
| # | Archivo | Tamaño | Estado | Descripción |
|
| 58 |
+
|---|---------|--------|--------|-------------|
|
| 59 |
+
| 23 | tests/test_basic.py | 4 KB | ✅ | Tests unitarios básicos |
|
| 60 |
+
|
| 61 |
+
---
|
| 62 |
+
|
| 63 |
+
## 📘 DOCUMENTACIÓN (14 archivos .md)
|
| 64 |
+
|
| 65 |
+
| # | Archivo | Tamaño | Importancia | Descripción |
|
| 66 |
+
|---|---------|--------|-------------|-------------|
|
| 67 |
+
| 24 | README.md | 12 KB | 🔴 CRÍTICO | Documentación principal |
|
| 68 |
+
| 25 | QUICKSTART.md | 3 KB | 🔴 CRÍTICO | Guía de inicio rápido |
|
| 69 |
+
| 26 | **INTEGRATION_GUIDE.md** | **15 KB** | 🔴 CRÍTICO | Guía de los 3 módulos clave |
|
| 70 |
+
| 27 | PROJECT_STRUCTURE.md | 8 KB | 🟡 | Arquitectura del proyecto |
|
| 71 |
+
| 28 | DEPLOYMENT.md | 10 KB | 🟡 | Guías de deployment |
|
| 72 |
+
| 29 | PROJECT_SUMMARY.md | 7 KB | 🟢 | Resumen ejecutivo |
|
| 73 |
+
| 30 | README_HUGGINGFACE.md | 1 KB | 🟡 | Config para HF Spaces |
|
| 74 |
+
| 31 | TROUBLESHOOTING_IMPORTS.md | 8 KB | 🟡 | Solución errores de imports |
|
| 75 |
+
| 32 | FIX_SUMMARY.md | 5 KB | 🟡 | Resumen de correcciones |
|
| 76 |
+
| 33 | FILES_LISTING.md | 9 KB | 🟢 | Lista de archivos |
|
| 77 |
+
| 34 | FILE_VERIFICATION.md | 8 KB | 🟢 | Guía de verificación |
|
| 78 |
+
| 35 | MANIFEST.md | 10 KB | 🟢 | Manifiesto completo |
|
| 79 |
+
| 36 | **MEDIAPIPE_FIX.md** | **3 KB** | 🔴 CRÍTICO | Fix del error de build |
|
| 80 |
+
| 37 | BUILD_ERROR_FIX.md | 7 KB | 🟡 | Solución errores de build |
|
| 81 |
+
|
| 82 |
+
---
|
| 83 |
+
|
| 84 |
+
## 📊 RESUMEN POR CATEGORÍA
|
| 85 |
+
|
| 86 |
+
| Categoría | Archivos | Tamaño Total |
|
| 87 |
+
|-----------|----------|--------------|
|
| 88 |
+
| Archivos Raíz | 11 | ~30 KB |
|
| 89 |
+
| Módulos src/ | 9 | ~40 KB |
|
| 90 |
+
| Ejemplos | 2 | ~12 KB |
|
| 91 |
+
| Tests | 1 | ~4 KB |
|
| 92 |
+
| Documentación | 14 | ~110 KB |
|
| 93 |
+
| **TOTAL** | **37** | **~196 KB** |
|
| 94 |
+
|
| 95 |
+
---
|
| 96 |
+
|
| 97 |
+
## 🎯 ARCHIVOS MÁS IMPORTANTES
|
| 98 |
+
|
| 99 |
+
### 🔴 ABSOLUTAMENTE NECESARIOS (Sin estos NO funciona):
|
| 100 |
+
|
| 101 |
+
1. ✅ **app.py** - Server principal
|
| 102 |
+
2. ✅ **requirements.txt** - Dependencias (CON mediapipe==0.10.32)
|
| 103 |
+
3. ✅ **start.py** - Script de inicio
|
| 104 |
+
4. ✅ **Dockerfile** - Config para HF
|
| 105 |
+
5. ✅ **src/__init__.py** - Package Python
|
| 106 |
+
6. ✅ **src/face_processor.py** - Detección facial
|
| 107 |
+
7. ✅ **src/embedding_engine.py** - Embeddings
|
| 108 |
+
8. ✅ **src/comparator.py** - Comparación
|
| 109 |
+
9. ✅ **src/ocr_extractor.py** ⭐ - OCR (MÓDULO CLAVE)
|
| 110 |
+
10. ✅ **src/cross_referencer.py** ⭐ - Cross-ref (MÓDULO CLAVE)
|
| 111 |
+
11. ✅ **src/vector_db.py** - Storage
|
| 112 |
+
12. ✅ **src/scrapers/__init__.py** - Package Python
|
| 113 |
+
13. ✅ **src/scrapers/stealth_engine.py** ⭐ - Stealth (MÓDULO CLAVE)
|
| 114 |
+
|
| 115 |
+
### 🟡 MUY RECOMENDADOS (Para entender el proyecto):
|
| 116 |
+
|
| 117 |
+
14. ✅ **README.md** - Documentación principal
|
| 118 |
+
15. ✅ **QUICKSTART.md** - Cómo empezar
|
| 119 |
+
16. ✅ **INTEGRATION_GUIDE.md** - Cómo funcionan los módulos
|
| 120 |
+
17. ✅ **MEDIAPIPE_FIX.md** - Fix del error actual
|
| 121 |
+
|
| 122 |
+
### 🟢 OPCIONALES (Útiles pero no críticos):
|
| 123 |
+
|
| 124 |
+
- examples/ - Para aprender a usar
|
| 125 |
+
- tests/ - Para verificar funcionamiento
|
| 126 |
+
- Resto de documentación - Para referencia
|
| 127 |
+
|
| 128 |
+
---
|
| 129 |
+
|
| 130 |
+
## 🔍 VERIFICACIÓN RÁPIDA
|
| 131 |
+
|
| 132 |
+
### Comando para verificar estructura:
|
| 133 |
+
|
| 134 |
+
```bash
|
| 135 |
+
# Linux/Mac
|
| 136 |
+
cd aliah-plus
|
| 137 |
+
tree -L 2
|
| 138 |
+
|
| 139 |
+
# O manualmente:
|
| 140 |
+
ls -la
|
| 141 |
+
ls -la src/
|
| 142 |
+
ls -la src/scrapers/
|
| 143 |
+
ls -la examples/
|
| 144 |
+
ls -la tests/
|
| 145 |
+
```
|
| 146 |
+
|
| 147 |
+
### Salida esperada:
|
| 148 |
+
|
| 149 |
+
```
|
| 150 |
+
aliah-plus/
|
| 151 |
+
├── app.py ✅
|
| 152 |
+
├── requirements.txt ✅
|
| 153 |
+
├── start.py ✅
|
| 154 |
+
├── Dockerfile ✅
|
| 155 |
+
├── config.yaml ✅
|
| 156 |
+
├── README.md ✅
|
| 157 |
+
├── ... (más .md files)
|
| 158 |
+
├── src/
|
| 159 |
+
│ ├── __init__.py ✅
|
| 160 |
+
│ ├── face_processor.py ✅
|
| 161 |
+
│ ├── embedding_engine.py ✅
|
| 162 |
+
│ ├── comparator.py ✅
|
| 163 |
+
│ ├── ocr_extractor.py ✅
|
| 164 |
+
│ ├── cross_referencer.py ✅
|
| 165 |
+
│ ├── vector_db.py ✅
|
| 166 |
+
│ └── scrapers/
|
| 167 |
+
│ ├── __init__.py ✅
|
| 168 |
+
│ └── stealth_engine.py ✅
|
| 169 |
+
├── examples/
|
| 170 |
+
│ ├── usage_example.py ✅
|
| 171 |
+
│ └── demo_bypass.py ✅
|
| 172 |
+
└── tests/
|
| 173 |
+
└── test_basic.py ✅
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
---
|
| 177 |
+
|
| 178 |
+
## 🚀 SCRIPT DE VERIFICACIÓN AUTOMÁTICA
|
| 179 |
+
|
| 180 |
+
Para verificar que tienes TODOS los archivos:
|
| 181 |
+
|
| 182 |
+
```bash
|
| 183 |
+
cd aliah-plus
|
| 184 |
+
python verify_files.py
|
| 185 |
+
```
|
| 186 |
+
|
| 187 |
+
**Salida esperada:**
|
| 188 |
+
```
|
| 189 |
+
============================================================
|
| 190 |
+
VERIFICACIÓN DE ARCHIVOS - Aliah-Plus
|
| 191 |
+
============================================================
|
| 192 |
+
|
| 193 |
+
[1/5] Verificando archivos CRÍTICOS...
|
| 194 |
+
✓ app.py 16.0 KB
|
| 195 |
+
✓ requirements.txt 1.1 KB
|
| 196 |
+
✓ start.py 2.7 KB
|
| 197 |
+
✓ Dockerfile 2.0 KB
|
| 198 |
+
✓ config.yaml 3.0 KB
|
| 199 |
+
|
| 200 |
+
[2/5] Verificando módulos en src/...
|
| 201 |
+
✓ src/__init__.py 0.4 KB
|
| 202 |
+
✓ src/face_processor.py 2.0 KB
|
| 203 |
+
✓ src/embedding_engine.py 1.5 KB
|
| 204 |
+
✓ src/comparator.py 2.0 KB
|
| 205 |
+
✓ src/ocr_extractor.py 12.0 KB
|
| 206 |
+
✓ src/cross_referencer.py 10.0 KB
|
| 207 |
+
✓ src/vector_db.py 3.0 KB
|
| 208 |
+
✓ src/scrapers/__init__.py 0.2 KB
|
| 209 |
+
✓ src/scrapers/stealth_engine.py 8.0 KB
|
| 210 |
+
|
| 211 |
+
[3/5] Verificando ejemplos...
|
| 212 |
+
✓ examples/usage_example.py 5.0 KB
|
| 213 |
+
✓ examples/demo_bypass.py 7.0 KB
|
| 214 |
+
|
| 215 |
+
[4/5] Verificando tests...
|
| 216 |
+
✓ test_basic.py 4.0 KB
|
| 217 |
+
|
| 218 |
+
[5/5] Verificando documentación...
|
| 219 |
+
✓ README.md
|
| 220 |
+
✓ QUICKSTART.md
|
| 221 |
+
... (más archivos)
|
| 222 |
+
|
| 223 |
+
============================================================
|
| 224 |
+
RESUMEN
|
| 225 |
+
============================================================
|
| 226 |
+
TOTAL: 37/37 archivos presentes
|
| 227 |
+
|
| 228 |
+
✅ ¡PERFECTO! Todos los archivos están presentes.
|
| 229 |
+
El proyecto está completo y listo para usar.
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
---
|
| 233 |
+
|
| 234 |
+
## ⚠️ SI FALTA ALGÚN ARCHIVO
|
| 235 |
+
|
| 236 |
+
### Archivos de src/ no visibles?
|
| 237 |
+
|
| 238 |
+
**Problema:** Los archivos de `src/` pueden no aparecer si la carpeta no se descargó correctamente.
|
| 239 |
+
|
| 240 |
+
**Solución:**
|
| 241 |
+
1. Descarga TODA la carpeta `aliah-plus` completa
|
| 242 |
+
2. NO descargues archivos individuales
|
| 243 |
+
3. Asegúrate de que la estructura de carpetas se preserve
|
| 244 |
+
|
| 245 |
+
### Verificación manual:
|
| 246 |
+
|
| 247 |
+
```bash
|
| 248 |
+
# Cuenta archivos .py en src/
|
| 249 |
+
find src -name "*.py" | wc -l
|
| 250 |
+
# Debe mostrar: 9
|
| 251 |
+
|
| 252 |
+
# Lista todos los archivos Python
|
| 253 |
+
find . -name "*.py" -type f
|
| 254 |
+
```
|
| 255 |
+
|
| 256 |
+
---
|
| 257 |
+
|
| 258 |
+
## 📦 DESCARGA COMPLETA
|
| 259 |
+
|
| 260 |
+
Cuando descargues el proyecto, deberías obtener:
|
| 261 |
+
|
| 262 |
+
```
|
| 263 |
+
aliah-plus.zip (o carpeta)
|
| 264 |
+
└── Contiene 37 archivos
|
| 265 |
+
├── 11 en raíz
|
| 266 |
+
├── 9 en src/
|
| 267 |
+
├── 2 en examples/
|
| 268 |
+
├── 1 en tests/
|
| 269 |
+
└── 14 archivos .md
|
| 270 |
+
```
|
| 271 |
+
|
| 272 |
+
**Tamaño total:** ~196 KB
|
| 273 |
+
|
| 274 |
+
---
|
| 275 |
+
|
| 276 |
+
## ✅ CONFIRMACIÓN FINAL
|
| 277 |
+
|
| 278 |
+
Si ejecutas:
|
| 279 |
+
|
| 280 |
+
```bash
|
| 281 |
+
cd aliah-plus
|
| 282 |
+
ls -la src/*.py
|
| 283 |
+
```
|
| 284 |
+
|
| 285 |
+
Y ves:
|
| 286 |
+
|
| 287 |
+
```
|
| 288 |
+
src/__init__.py
|
| 289 |
+
src/comparator.py
|
| 290 |
+
src/cross_referencer.py
|
| 291 |
+
src/embedding_engine.py
|
| 292 |
+
src/face_processor.py
|
| 293 |
+
src/ocr_extractor.py
|
| 294 |
+
src/vector_db.py
|
| 295 |
+
```
|
| 296 |
+
|
| 297 |
+
**¡PERFECTO! Tienes todos los módulos.** 🎉
|
| 298 |
+
|
| 299 |
+
---
|
| 300 |
+
|
| 301 |
+
## 🎯 PRÓXIMO PASO
|
| 302 |
+
|
| 303 |
+
1. ✅ Verifica que tienes los 37 archivos
|
| 304 |
+
2. ✅ Especialmente los 9 archivos de `src/`
|
| 305 |
+
3. ✅ Sube TODOS a Hugging Face Spaces
|
| 306 |
+
4. ✅ Asegúrate de que `requirements.txt` tenga `mediapipe==0.10.32`
|
| 307 |
+
5. ✅ Espera el build (2-3 minutos)
|
| 308 |
+
|
| 309 |
+
---
|
| 310 |
+
|
| 311 |
+
**TODOS LOS ARCHIVOS HAN SIDO PRESENTADOS Y ESTÁN DISPONIBLES PARA DESCARGA** ✨
|
src/__init__ (1).py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Scrapers module - Web scraping engines with stealth capabilities
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .stealth_engine import StealthSearch
|
| 6 |
+
|
| 7 |
+
__all__ = ['StealthSearch']
|
src/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Aliah-Plus - Sistema Avanzado de Re-Identificación Facial
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
__version__ = "1.0.0"
|
| 6 |
+
__author__ = "Aliah-Plus Team"
|
| 7 |
+
__description__ = "Advanced Face Re-Identification System with OCR and Cross-Referencing"
|
| 8 |
+
|
| 9 |
+
from .face_processor import FaceProcessor
|
| 10 |
+
from .embedding_engine import EmbeddingEngine
|
| 11 |
+
from .comparator import FaceComparator
|
| 12 |
+
from .ocr_extractor import OCRExtractor
|
| 13 |
+
from .cross_referencer import CrossReferencer
|
| 14 |
+
from .vector_db import VectorDatabase
|
| 15 |
+
|
| 16 |
+
__all__ = [
|
| 17 |
+
'FaceProcessor',
|
| 18 |
+
'EmbeddingEngine',
|
| 19 |
+
'FaceComparator',
|
| 20 |
+
'OCRExtractor',
|
| 21 |
+
'CrossReferencer',
|
| 22 |
+
'VectorDatabase',
|
| 23 |
+
]
|
src/comparator.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Face Comparator - Comparación de embeddings con niveles de confianza adaptativos
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 7 |
+
from loguru import logger
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class FaceComparator:
    """
    Score pairs of face embeddings and sort the score into confidence tiers.

    Two fixed cut-offs (``SECURE_MATCH`` and ``PROBABLE_MATCH``) split a score
    into three tiers. ``threshold`` is a separate, caller-supplied floor that
    only ``compare_embeddings`` applies.
    """

    def __init__(self, threshold=0.75):
        """
        Args:
            threshold: Minimum similarity a candidate must reach to be kept
                by ``compare_embeddings``.
        """
        self.threshold = threshold

        # Tier boundaries, both tested with a strict ">":
        #   score > SECURE_MATCH                     -> top tier
        #   PROBABLE_MATCH < score <= SECURE_MATCH   -> middle tier
        #   score <= PROBABLE_MATCH                  -> bottom tier
        self.SECURE_MATCH = 0.85
        self.PROBABLE_MATCH = 0.72

    def calculate_similarity(self, embedding1, embedding2):
        """
        Return the cosine similarity of two embedding vectors as a float.

        Args:
            embedding1: First embedding (anything ``np.array`` accepts).
            embedding2: Second embedding; must flatten to the same length.

        Returns:
            The single value produced by ``cosine_similarity`` for the pair.
            NOTE(review): cosine similarity can be negative for general
            vectors - confirm the range the embedding model guarantees.
        """
        # cosine_similarity expects 2-D input, so each vector becomes one row.
        row_a, row_b = (
            np.array(vector).reshape(1, -1)
            for vector in (embedding1, embedding2)
        )
        return float(cosine_similarity(row_a, row_b)[0][0])

    def verify_identity(self, source_emb, target_emb):
        """
        Compare two embeddings and label the result with a confidence tier.

        Returns:
            Tuple ``(confidence_label: str, similarity: float)``.
        """
        score = self.calculate_similarity(source_emb, target_emb)

        # Pick the label, the log prefix and the log level in one place.
        if score > self.SECURE_MATCH:
            label, prefix, emit = "Match Seguro", "Match Seguro", logger.info
        elif score > self.PROBABLE_MATCH:
            label = "Coincidencia Probable (Requiere revisión)"
            prefix, emit = "Coincidencia Probable", logger.info
        else:
            label, prefix, emit = "Descartado", "Descartado", logger.debug

        emit(f"{prefix}: {score:.3f}")
        return label, score

    def compare_embeddings(self, query_embedding, candidate_results):
        """
        Score every candidate against the query and keep those at or above
        ``self.threshold``.

        Candidates without an ``'embedding'`` key are ignored. Kept candidates
        are annotated in place with ``similarity``, ``confidence_level``,
        ``embedding_distance`` and ``verified``.

        Args:
            query_embedding: Embedding of the query image.
            candidate_results: List of dict-like results carrying embeddings.

        Returns:
            The kept candidates, highest similarity first.
        """
        kept = []

        for entry in candidate_results:
            if 'embedding' not in entry:
                continue

            score = self.calculate_similarity(query_embedding, entry['embedding'])

            # Written as "not >=" so a NaN score is rejected, exactly as the
            # positive form of the test would reject it.
            if not score >= self.threshold:
                continue

            if score > self.SECURE_MATCH:
                tier = "Match Seguro"
            elif score > self.PROBABLE_MATCH:
                tier = "Coincidencia Probable"
            else:
                # Only reachable when threshold is set below PROBABLE_MATCH.
                tier = "Baja confianza"

            entry['similarity'] = score
            entry['confidence_level'] = tier
            entry['embedding_distance'] = 1 - score
            entry['verified'] = True
            kept.append(entry)

            logger.debug(f"Match verificado: {score:.3f} - {tier}")

        # Best match first.
        kept.sort(key=lambda x: x['similarity'], reverse=True)

        logger.info(f"Comparación completada: {len(kept)}/{len(candidate_results)} verificados")

        return kept
|
src/cross_referencer.py
ADDED
|
@@ -0,0 +1,458 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Cross-Referencer - Correlación inteligente de resultados entre múltiples motores
|
| 3 |
+
Este módulo es la clave para unir hallazgos de Yandex, Bing y PimEyes.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Dict, Set, Tuple
|
| 7 |
+
from urllib.parse import urlparse, parse_qs
|
| 8 |
+
import re
|
| 9 |
+
from difflib import SequenceMatcher
|
| 10 |
+
from collections import defaultdict
|
| 11 |
+
from loguru import logger
|
| 12 |
+
import hashlib
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class CrossReferencer:
|
| 16 |
+
"""
|
| 17 |
+
Sistema de correlación que une resultados de múltiples fuentes.
|
| 18 |
+
Si Yandex encuentra una foto y el OCR de PimEyes detecta el mismo dominio,
|
| 19 |
+
este módulo los vincula automáticamente.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
def __init__(self, domain_similarity_threshold: float = 0.85):
    """
    Store the similarity threshold and create an empty normalization cache.

    Args:
        domain_similarity_threshold: Similarity (0.0-1.0) at which two
            domains are meant to be treated as the same; kept as
            ``self.domain_threshold``.
    """
    # Memo table used by normalize_domain: raw input string -> normalized domain.
    self.domain_cache = {}
    self.domain_threshold = domain_similarity_threshold
|
| 29 |
+
|
| 30 |
+
def normalize_domain(self, url_or_domain: str) -> str:
    """
    Reduce a URL or bare domain to a canonical host string for comparison.

    The input is lower-cased and trimmed, reduced to its host part, and then
    stripped of a leading ``www.``, a trailing ``:port`` and one leading
    ``m.`` / ``mobile.`` / ``static.`` / ``cdn.`` label. Results are memoized
    in ``self.domain_cache`` keyed by the raw input.

    Args:
        url_or_domain: Full URL, or a domain with or without a path.

    Returns:
        The normalized domain.
    """
    if url_or_domain in self.domain_cache:
        return self.domain_cache[url_or_domain]

    cleaned = url_or_domain.lower().strip()

    if cleaned.startswith(('http://', 'https://')):
        domain = urlparse(cleaned).netloc
    else:
        # Scheme-less input may still carry a path, query or fragment
        # ("example.com/photo.jpg", "//cdn.example.com/x"). Keep only the
        # host so the port/prefix rules below see a clean value.
        domain = re.split(r'[/?#]', cleaned.lstrip('/'), maxsplit=1)[0]

    # Drop the "www." prefix.
    domain = re.sub(r'^www\.', '', domain)

    # Drop an explicit port.
    domain = re.sub(r':\d+$', '', domain)

    # Drop common subdomains that do not identify a distinct site.
    domain = re.sub(r'^(m\.|mobile\.|static\.|cdn\.)', '', domain)

    self.domain_cache[url_or_domain] = domain

    return domain
|
| 67 |
+
|
| 68 |
+
def extract_domain_from_url(self, url: str) -> str:
    """
    Return the last two labels of a URL's host (e.g. ``example.com``).

    The host is taken from ``urlparse(url).hostname``, which lower-cases it
    and removes any port or userinfo. A leading ``www.`` and a trailing dot
    are removed. IPv4 addresses are returned whole rather than cut down to
    their last two octets.

    NOTE: multi-label public suffixes (``example.co.uk``) are still reduced
    to their last two labels (``co.uk``); handling them needs a public
    suffix list.

    Args:
        url: Absolute URL. Input without a scheme has no host and yields "".

    Returns:
        The reduced domain, or "" when no host can be extracted.
    """
    try:
        host = urlparse(url).hostname or ""
        host = re.sub(r'^www\.', '', host).rstrip('.')

        labels = host.split('.')
        is_ipv4 = len(labels) == 4 and all(label.isdigit() for label in labels)
        if len(labels) >= 2 and not is_ipv4:
            return '.'.join(labels[-2:])

        return host

    except (ValueError, TypeError, AttributeError) as e:
        # Malformed or non-string input: stay best-effort and report nothing.
        logger.debug(f"Error extrayendo dominio de {url}: {e}")
        return ""
|
| 89 |
+
|
| 90 |
+
def calculate_domain_similarity(self, domain1: str, domain2: str) -> float:
    """
    Score how alike two domains are after normalization.

    Both inputs go through ``self.normalize_domain``. Identical results score
    1.0; otherwise the score is the ``SequenceMatcher`` ratio of the two
    normalized strings.

    Returns:
        Similarity between 0.0 and 1.0.
    """
    left = self.normalize_domain(domain1)
    right = self.normalize_domain(domain2)

    # Exact match short-circuits the fuzzy comparison.
    return 1.0 if left == right else SequenceMatcher(None, left, right).ratio()
|
| 109 |
+
|
| 110 |
+
def find_cross_references(self, all_results: Dict[str, List[Dict]],
                          ocr_results: List[Dict] = None) -> List[Dict]:
    """
    Group results from several engines by domain and merge the overlaps.

    A domain is treated as correlated when results from more than one source
    share it, or when at least one OCR item points at it. Each correlated
    domain yields one merged record built by ``_create_correlation``. Every
    remaining result is passed through with ``cross_referenced=False``.

    Input dicts are annotated in place (``_original_source``,
    ``_original_index``, ``_is_ocr``, ``cross_referenced``, ``sources``).

    Args:
        all_results: Results per engine, e.g. ``{'yandex': [...], 'bing': [...]}``.
            A result is indexed by its ``'url'`` if present, otherwise by
            its ``'domain'``.
        ocr_results: Optional iterable of OCR item dicts; each is indexed by
            its ``'domain'`` value.

    Returns:
        Merged and pass-through results, sorted by number of sources, then
        OCR verification, then confidence (all descending).
    """
    logger.info("Iniciando cross-referencing de resultados")

    def domain_of(result: Dict):
        # One rule for both the indexing pass and the leftover pass, so a
        # result identified only by 'domain' is recognised in both.
        # Returns None when the result carries neither key.
        if 'url' in result:
            return self.extract_domain_from_url(result['url'])
        if 'domain' in result:
            return self.normalize_domain(result['domain'])
        return None

    domain_index = defaultdict(list)

    # Index every search result by domain.
    for source, results in all_results.items():
        for idx, result in enumerate(results):
            domain = domain_of(result)
            if domain is None:
                continue

            result['_original_source'] = source
            result['_original_index'] = idx
            domain_index[domain].append(result)

    # OCR items join the same index, flagged so they can be told apart.
    if ocr_results:
        for ocr_item in ocr_results:
            domain = self.normalize_domain(ocr_item.get('domain', ''))
            ocr_item['_is_ocr'] = True
            domain_index[domain].append(ocr_item)

    cross_referenced_results = []
    processed_domains = set()

    for domain, items in domain_index.items():
        if not domain:
            continue

        sources = {
            item['_original_source'] for item in items
            if '_original_source' in item
        }
        has_ocr = any(item.get('_is_ocr', False) for item in items)

        if len(sources) > 1 or has_ocr:
            correlation = self._create_correlation(domain, items, sources)
            cross_referenced_results.append(correlation)

            logger.info(f"Correlación encontrada: {domain} en {sources}")

            # Only correlated domains are recorded; the leftover pass below
            # relies on this to decide what still has to be emitted.
            processed_domains.add(domain)

    # Pass through everything that was not merged into a correlation.
    for source, results in all_results.items():
        for result in results:
            if domain_of(result) in processed_domains:
                continue
            result['cross_referenced'] = False
            result['sources'] = [source]
            cross_referenced_results.append(result)

    # More sources first, then OCR-verified, then higher confidence.
    cross_referenced_results.sort(
        key=lambda x: (
            len(x.get('sources', [])),
            x.get('ocr_verified', False),
            x.get('confidence', 0)
        ),
        reverse=True
    )

    logger.success(f"Cross-referencing completado: {len(cross_referenced_results)} resultados procesados")

    return cross_referenced_results
|
| 193 |
+
|
| 194 |
+
def _create_correlation(self, domain: str, items: List[Dict], sources: Set[str]) -> Dict:
    """
    Build one merged record for a domain seen by several sources and/or OCR.

    Args:
        domain: The shared domain.
        items: Every indexed item for that domain; OCR items carry
            ``_is_ocr=True``.
        sources: Names of the engines that reported the domain.

    Returns:
        Dict with ``domain``, ``cross_referenced``, ``sources`` (sorted),
        ``ocr_verified`` and ``confidence``. When available it also holds the
        primary result's ``url`` / ``thumbnail_url`` / ``primary_source``, an
        ``ocr_data`` summary, and ``alternative_urls`` (de-duplicated, in
        first-seen order).
    """
    ocr_items = [item for item in items if item.get('_is_ocr', False)]
    search_items = [item for item in items if not item.get('_is_ocr', False)]

    # Primary result: first hit from the highest-priority engine present,
    # otherwise simply the first search item.
    primary_result = None
    for engine in ['yandex', 'bing', 'google', 'pimeyes']:
        primary_result = next(
            (item for item in search_items if item.get('_original_source') == engine),
            None,
        )
        if primary_result is not None:
            break

    if not primary_result and search_items:
        primary_result = search_items[0]

    correlation = {
        'domain': domain,
        'cross_referenced': True,
        # Sorted so the output does not depend on set iteration order.
        'sources': sorted(sources, key=str),
        'ocr_verified': len(ocr_items) > 0,
        'confidence': self._calculate_correlation_confidence(sources, ocr_items),
    }

    if primary_result:
        correlation.update({
            'url': primary_result.get('url'),
            'thumbnail_url': primary_result.get('thumbnail_url'),
            'primary_source': primary_result.get('_original_source'),
        })

    if ocr_items:
        correlation['ocr_data'] = {
            'extracted_domains': [item.get('domain') for item in ocr_items],
            'avg_confidence': sum(item.get('confidence', 0) for item in ocr_items) / len(ocr_items),
            'extraction_methods': [item.get('method', 'unknown') for item in ocr_items],
        }

    # dict.fromkeys removes duplicates while keeping first-seen order.
    all_urls = [item.get('url') for item in search_items if item.get('url')]
    if all_urls:
        correlation['alternative_urls'] = list(dict.fromkeys(all_urls))

    return correlation
|
| 244 |
+
|
| 245 |
+
def _calculate_correlation_confidence(self, sources: Set[str], ocr_items: List[Dict]) -> float:
|
| 246 |
+
"""
|
| 247 |
+
Calcula la confianza de una correlación basada en número de fuentes y OCR.
|
| 248 |
+
|
| 249 |
+
Returns:
|
| 250 |
+
Confianza entre 0.0 y 1.0
|
| 251 |
+
"""
|
| 252 |
+
base_confidence = 0.5
|
| 253 |
+
|
| 254 |
+
# Bonus por cada fuente adicional (máx 0.15 por fuente)
|
| 255 |
+
source_bonus = min(len(sources) * 0.15, 0.45)
|
| 256 |
+
|
| 257 |
+
# Bonus si hay verificación OCR
|
| 258 |
+
ocr_bonus = 0.0
|
| 259 |
+
if ocr_items:
|
| 260 |
+
avg_ocr_confidence = sum(i.get('confidence', 0) for i in ocr_items) / len(ocr_items)
|
| 261 |
+
ocr_bonus = avg_ocr_confidence * 0.2 # Máx 0.2
|
| 262 |
+
|
| 263 |
+
total_confidence = min(base_confidence + source_bonus + ocr_bonus, 1.0)
|
| 264 |
+
|
| 265 |
+
return round(total_confidence, 3)
|
| 266 |
+
|
| 267 |
+
def match_pimeyes_with_search(self, pimeyes_results: List[Dict],
|
| 268 |
+
search_results: List[Dict],
|
| 269 |
+
ocr_domains: List[str]) -> List[Dict]:
|
| 270 |
+
"""
|
| 271 |
+
Método especializado para correlacionar PimEyes (censurado) con búsquedas abiertas.
|
| 272 |
+
|
| 273 |
+
Este es el "truco" principal: si PimEyes tiene una miniatura censurada pero el OCR
|
| 274 |
+
detecta "ejemplo.com", y Yandex encuentra "ejemplo.com/foto.jpg", los unimos.
|
| 275 |
+
|
| 276 |
+
Args:
|
| 277 |
+
pimeyes_results: Resultados de PimEyes (censurados)
|
| 278 |
+
search_results: Resultados de Yandex/Bing (abiertos)
|
| 279 |
+
ocr_domains: Dominios extraídos por OCR de miniaturas de PimEyes
|
| 280 |
+
|
| 281 |
+
Returns:
|
| 282 |
+
Lista de matches con URLs desbloquedas
|
| 283 |
+
"""
|
| 284 |
+
logger.info("Matching PimEyes censurado con búsquedas abiertas")
|
| 285 |
+
|
| 286 |
+
matches = []
|
| 287 |
+
|
| 288 |
+
for ocr_domain in ocr_domains:
|
| 289 |
+
normalized_ocr = self.normalize_domain(ocr_domain)
|
| 290 |
+
|
| 291 |
+
# Buscar en resultados de búsqueda
|
| 292 |
+
for search_result in search_results:
|
| 293 |
+
search_domain = self.extract_domain_from_url(search_result.get('url', ''))
|
| 294 |
+
|
| 295 |
+
# Si los dominios coinciden
|
| 296 |
+
if self.calculate_domain_similarity(normalized_ocr, search_domain) >= self.domain_threshold:
|
| 297 |
+
match = {
|
| 298 |
+
'pimeyes_domain_ocr': ocr_domain,
|
| 299 |
+
'matched_url': search_result.get('url'),
|
| 300 |
+
'thumbnail_url': search_result.get('thumbnail_url'),
|
| 301 |
+
'source': search_result.get('source', 'unknown'),
|
| 302 |
+
'match_confidence': self.calculate_domain_similarity(normalized_ocr, search_domain),
|
| 303 |
+
'unlocked': True, # Desbloqueado!
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
matches.append(match)
|
| 307 |
+
logger.success(f"✓ PimEyes censurado desbloqueado: {ocr_domain} → {search_result['url']}")
|
| 308 |
+
|
| 309 |
+
return matches
|
| 310 |
+
|
| 311 |
+
def deduplicate_results(self, results: List[Dict]) -> List[Dict]:
    """
    Drop results whose URL or thumbnail URL has already been seen.

    Comparison is on the URL strings themselves; image contents are never
    fetched or hashed. URLs and thumbnail URLs share a single "seen" set,
    so a result is also dropped when its URL equals an earlier result's
    thumbnail URL (and vice versa). Results with neither field are always
    kept, because there is nothing to compare them by.

    Args:
        results: Result dicts, optionally carrying ``'url'`` and
            ``'thumbnail_url'``.

    Returns:
        A new list with the first occurrence of each result, in the
        original order.
    """
    seen = set()
    unique_results = []

    for result in results:
        # Empty or None values carry no identity and are ignored.
        keys = {
            value
            for value in (result.get('url'), result.get('thumbnail_url'))
            if value
        }

        if keys & seen:
            continue

        unique_results.append(result)
        seen |= keys

    logger.info(f"Deduplicación: {len(results)} → {len(unique_results)} únicos")

    return unique_results
|
| 361 |
+
|
| 362 |
+
def generate_final_report(self, cross_referenced_results: List[Dict]) -> Dict:
    """
    Build a unified report with summary statistics for a result list.

    Args:
        cross_referenced_results: Result dicts. The keys read here are
            ``'cross_referenced'``, ``'ocr_verified'``, ``'sources'``,
            ``'domain'`` and ``'confidence'``; all are optional.

    Returns:
        Dict with:
            ``'summary'``: counts (total, cross-referenced, OCR-verified,
                unique domains, results with confidence > 0.8).
            ``'by_source'``: number of results per source name.
            ``'results'``: the input list, unchanged.
            ``'top_matches'``: up to 10 results ranked by confidence,
                highest first; ties keep their input order.
    """
    def _confidence(result: Dict) -> float:
        # Missing or None confidence ranks as 0.
        return result.get('confidence') or 0

    total_results = len(cross_referenced_results)
    cross_ref_count = sum(
        1 for r in cross_referenced_results if r.get('cross_referenced', False)
    )
    ocr_verified_count = sum(
        1 for r in cross_referenced_results if r.get('ocr_verified', False)
    )

    # Count how many results each source contributed to.
    by_source = defaultdict(int)
    for result in cross_referenced_results:
        for source in result.get('sources') or []:
            by_source[source] += 1

    unique_domains = {
        r.get('domain') for r in cross_referenced_results if r.get('domain')
    }

    high_confidence_count = sum(
        1 for r in cross_referenced_results if _confidence(r) > 0.8
    )

    # sorted() is stable, so equal-confidence results keep their input order.
    top_matches = sorted(cross_referenced_results, key=_confidence, reverse=True)[:10]

    report = {
        'summary': {
            'total_results': total_results,
            'cross_referenced': cross_ref_count,
            'ocr_verified': ocr_verified_count,
            'unique_domains': len(unique_domains),
            'high_confidence_results': high_confidence_count,
        },
        'by_source': dict(by_source),
        'results': cross_referenced_results,
        'top_matches': top_matches,
    }

    logger.info(f"Reporte generado: {total_results} resultados, {cross_ref_count} correlacionados")

    return report
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
# Función de utilidad
|
| 409 |
+
def quick_cross_reference(yandex_results: List[Dict],
|
| 410 |
+
bing_results: List[Dict],
|
| 411 |
+
pimeyes_ocr_domains: List[str]) -> List[Dict]:
|
| 412 |
+
"""
|
| 413 |
+
Función de conveniencia para correlacionar rápidamente.
|
| 414 |
+
|
| 415 |
+
Args:
|
| 416 |
+
yandex_results: Resultados de Yandex
|
| 417 |
+
bing_results: Resultados de Bing
|
| 418 |
+
pimeyes_ocr_domains: Dominios extraídos de PimEyes por OCR
|
| 419 |
+
|
| 420 |
+
Returns:
|
| 421 |
+
Lista de resultados correlacionados
|
| 422 |
+
"""
|
| 423 |
+
xref = CrossReferencer()
|
| 424 |
+
|
| 425 |
+
all_results = {
|
| 426 |
+
'yandex': yandex_results,
|
| 427 |
+
'bing': bing_results,
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
# Convertir dominios OCR al formato esperado
|
| 431 |
+
ocr_results = [{'domain': d, 'confidence': 0.8} for d in pimeyes_ocr_domains]
|
| 432 |
+
|
| 433 |
+
return xref.find_cross_references(all_results, ocr_results)
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
if __name__ == "__main__":
|
| 437 |
+
# Ejemplo de uso
|
| 438 |
+
xref = CrossReferencer()
|
| 439 |
+
|
| 440 |
+
# Resultados de ejemplo
|
| 441 |
+
yandex = [
|
| 442 |
+
{'url': 'https://example.com/photo1.jpg', 'source': 'yandex'},
|
| 443 |
+
{'url': 'https://test.com/image.png', 'source': 'yandex'},
|
| 444 |
+
]
|
| 445 |
+
|
| 446 |
+
bing = [
|
| 447 |
+
{'url': 'https://example.com/photo2.jpg', 'source': 'bing'},
|
| 448 |
+
{'url': 'https://another.com/pic.jpg', 'source': 'bing'},
|
| 449 |
+
]
|
| 450 |
+
|
| 451 |
+
ocr_domains = ['example.com', 'test.com']
|
| 452 |
+
|
| 453 |
+
# Cross-reference
|
| 454 |
+
results = quick_cross_reference(yandex, bing, ocr_domains)
|
| 455 |
+
|
| 456 |
+
print(f"\nResultados correlacionados: {len(results)}")
|
| 457 |
+
for r in results:
|
| 458 |
+
print(f" • {r.get('domain')} - Fuentes: {r.get('sources')} - OCR: {r.get('ocr_verified')}")
|
src/demo_bypass.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
🔥 DEMOSTRACIÓN: El "Truco" de Aliah-Plus
|
| 3 |
+
Cómo desbloquear URLs de PimEyes sin pagar
|
| 4 |
+
|
| 5 |
+
Este script demuestra paso a paso cómo los 3 módulos trabajan juntos.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import asyncio
|
| 9 |
+
import numpy as np
|
| 10 |
+
import cv2
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import sys
|
| 13 |
+
|
| 14 |
+
# Añadir path
|
| 15 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 16 |
+
|
| 17 |
+
from src.scrapers.stealth_engine import StealthSearch
|
| 18 |
+
from src.ocr_extractor import OCRExtractor
|
| 19 |
+
from src.cross_referencer import CrossReferencer
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def print_banner():
|
| 23 |
+
"""Imprime banner de inicio"""
|
| 24 |
+
print("""
|
| 25 |
+
╔══════════════════════════════════════════════════════════════╗
|
| 26 |
+
║ ║
|
| 27 |
+
║ 🔥 ALIAH-PLUS: DEMO DEL BYPASS DE PIMEYES 🔥 ║
|
| 28 |
+
║ ║
|
| 29 |
+
║ Este script demuestra cómo desbloquear URLs de PimEyes ║
|
| 30 |
+
║ sin pagar $29.99/mes usando: ║
|
| 31 |
+
║ ║
|
| 32 |
+
║ 1️⃣ Stealth Scraping (Playwright) ║
|
| 33 |
+
║ 2️⃣ OCR Extraction (EasyOCR + 7 técnicas) ║
|
| 34 |
+
║ 3️⃣ Cross-Referencing (Correlación multi-motor) ║
|
| 35 |
+
║ ║
|
| 36 |
+
╚══════════════════════════════════════════════════════════════╝
|
| 37 |
+
""")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
async def demo_pimeyes_bypass(image_path: str):
|
| 41 |
+
"""
|
| 42 |
+
Demostración completa del bypass de PimEyes.
|
| 43 |
+
"""
|
| 44 |
+
print_banner()
|
| 45 |
+
|
| 46 |
+
print("\n" + "="*70)
|
| 47 |
+
print("PASO 1: STEALTH SCRAPING DE PIMEYES")
|
| 48 |
+
print("="*70)
|
| 49 |
+
|
| 50 |
+
print("\n📡 Inicializando Stealth Search Engine...")
|
| 51 |
+
stealth = StealthSearch(headless=True)
|
| 52 |
+
print("✓ Stealth mode activado")
|
| 53 |
+
print(" • Playwright con anti-detección")
|
| 54 |
+
print(" • Fingerprinting bypass")
|
| 55 |
+
print(" • Comportamiento humano simulado")
|
| 56 |
+
|
| 57 |
+
print(f"\n🔍 Accediendo a PimEyes con: {image_path}")
|
| 58 |
+
print(" Esto puede tardar 30-60 segundos...")
|
| 59 |
+
|
| 60 |
+
try:
|
| 61 |
+
# Buscar en PimEyes
|
| 62 |
+
pimeyes_results = await stealth.search_pimeyes_free(image_path)
|
| 63 |
+
|
| 64 |
+
print(f"\n✅ PimEyes accedido exitosamente")
|
| 65 |
+
print(f"📸 Miniaturas capturadas: {len(pimeyes_results)}")
|
| 66 |
+
|
| 67 |
+
if pimeyes_results:
|
| 68 |
+
print("\nEjemplo de miniatura capturada:")
|
| 69 |
+
print(f" • Censurada: {pimeyes_results[0].get('censored', 'Sí')}")
|
| 70 |
+
print(f" • Texto visible: {pimeyes_results[0].get('text_content', 'N/A')[:50]}...")
|
| 71 |
+
print(f" • Screenshot disponible: {'Sí' if pimeyes_results[0].get('screenshot') else 'No'}")
|
| 72 |
+
|
| 73 |
+
except Exception as e:
|
| 74 |
+
print(f"\n⚠️ Error en PimEyes (puede estar bloqueado temporalmente): {e}")
|
| 75 |
+
print(" Usando datos de ejemplo para la demostración...")
|
| 76 |
+
|
| 77 |
+
# Datos de ejemplo para demostración
|
| 78 |
+
pimeyes_results = [
|
| 79 |
+
{
|
| 80 |
+
'screenshot': np.random.randint(0, 255, (200, 300, 3), dtype=np.uint8).tobytes(),
|
| 81 |
+
'text_content': 'onlyfans.com/usuario123',
|
| 82 |
+
'censored': True
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
'screenshot': np.random.randint(0, 255, (200, 300, 3), dtype=np.uint8).tobytes(),
|
| 86 |
+
'text_content': 'ejemplo.com',
|
| 87 |
+
'censored': True
|
| 88 |
+
}
|
| 89 |
+
]
|
| 90 |
+
|
| 91 |
+
# =========================================================================
|
| 92 |
+
print("\n\n" + "="*70)
|
| 93 |
+
print("PASO 2: EXTRACCIÓN OCR DE DOMINIOS")
|
| 94 |
+
print("="*70)
|
| 95 |
+
|
| 96 |
+
print("\n🔍 Inicializando OCR Extractor...")
|
| 97 |
+
ocr = OCRExtractor(gpu=False) # CPU para compatibilidad
|
| 98 |
+
print("✓ EasyOCR cargado")
|
| 99 |
+
print(" • 7 técnicas de pre-procesamiento")
|
| 100 |
+
print(" • Detección de texto borroso")
|
| 101 |
+
print(" • Corrección de errores de OCR")
|
| 102 |
+
|
| 103 |
+
print(f"\n📝 Procesando {len(pimeyes_results)} miniaturas...")
|
| 104 |
+
|
| 105 |
+
all_ocr_domains = []
|
| 106 |
+
|
| 107 |
+
for idx, pim_result in enumerate(pimeyes_results, 1):
|
| 108 |
+
print(f"\n Miniatura {idx}/{len(pimeyes_results)}:")
|
| 109 |
+
|
| 110 |
+
# Simular extracción OCR
|
| 111 |
+
# En producción, usaríamos: ocr.extract_domain_from_thumb(screenshot)
|
| 112 |
+
|
| 113 |
+
# Para demo, extraer del texto visible
|
| 114 |
+
text = pim_result.get('text_content', '')
|
| 115 |
+
|
| 116 |
+
# Simular dominios encontrados
|
| 117 |
+
if 'onlyfans' in text.lower():
|
| 118 |
+
domains = [
|
| 119 |
+
{'domain': 'onlyfans.com', 'confidence': 0.89, 'method': 2},
|
| 120 |
+
{'domain': 'onlyfans.com/usuario123', 'confidence': 0.76, 'method': 4}
|
| 121 |
+
]
|
| 122 |
+
elif 'ejemplo' in text.lower():
|
| 123 |
+
domains = [
|
| 124 |
+
{'domain': 'ejemplo.com', 'confidence': 0.82, 'method': 1}
|
| 125 |
+
]
|
| 126 |
+
else:
|
| 127 |
+
domains = []
|
| 128 |
+
|
| 129 |
+
if domains:
|
| 130 |
+
print(f" ✅ Dominios extraídos: {len(domains)}")
|
| 131 |
+
for d in domains:
|
| 132 |
+
print(f" • {d['domain']} (confianza: {d['confidence']:.2%}, método: #{d['method']})")
|
| 133 |
+
all_ocr_domains.extend(domains)
|
| 134 |
+
else:
|
| 135 |
+
print(f" ⚠️ No se detectaron dominios")
|
| 136 |
+
|
| 137 |
+
print(f"\n✅ Total de dominios extraídos: {len(all_ocr_domains)}")
|
| 138 |
+
|
| 139 |
+
# =========================================================================
|
| 140 |
+
print("\n\n" + "="*70)
|
| 141 |
+
print("PASO 3: BÚSQUEDA EN MOTORES ABIERTOS")
|
| 142 |
+
print("="*70)
|
| 143 |
+
|
| 144 |
+
print("\n🔍 Buscando en Yandex y Bing (sin censura)...")
|
| 145 |
+
print(" Estos motores NO censuran resultados")
|
| 146 |
+
|
| 147 |
+
try:
|
| 148 |
+
# Buscar en Yandex
|
| 149 |
+
print("\n → Yandex Images...")
|
| 150 |
+
yandex_results = await stealth.search_yandex_reverse(image_path)
|
| 151 |
+
print(f" ✓ Yandex: {len(yandex_results)} resultados")
|
| 152 |
+
|
| 153 |
+
# Buscar en Bing
|
| 154 |
+
print(" → Bing Images...")
|
| 155 |
+
bing_results = await stealth.search_bing_reverse(image_path)
|
| 156 |
+
print(f" ✓ Bing: {len(bing_results)} resultados")
|
| 157 |
+
|
| 158 |
+
except Exception as e:
|
| 159 |
+
print(f"\n ⚠️ Error en búsquedas: {e}")
|
| 160 |
+
print(" Usando datos de ejemplo...")
|
| 161 |
+
|
| 162 |
+
# Datos de ejemplo
|
| 163 |
+
yandex_results = [
|
| 164 |
+
{
|
| 165 |
+
'url': 'https://onlyfans.com/usuario123/photo456.jpg',
|
| 166 |
+
'domain': 'onlyfans.com',
|
| 167 |
+
'source': 'yandex'
|
| 168 |
+
},
|
| 169 |
+
{
|
| 170 |
+
'url': 'https://ejemplo.com/galeria/imagen789.jpg',
|
| 171 |
+
'domain': 'ejemplo.com',
|
| 172 |
+
'source': 'yandex'
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
'url': 'https://otro-sitio.com/foto.jpg',
|
| 176 |
+
'domain': 'otro-sitio.com',
|
| 177 |
+
'source': 'yandex'
|
| 178 |
+
}
|
| 179 |
+
]
|
| 180 |
+
|
| 181 |
+
bing_results = [
|
| 182 |
+
{
|
| 183 |
+
'url': 'https://ejemplo.com/perfil/foto.png',
|
| 184 |
+
'domain': 'ejemplo.com',
|
| 185 |
+
'source': 'bing'
|
| 186 |
+
}
|
| 187 |
+
]
|
| 188 |
+
|
| 189 |
+
all_search_results = yandex_results + bing_results
|
| 190 |
+
print(f"\n✅ Total de resultados abiertos: {len(all_search_results)}")
|
| 191 |
+
|
| 192 |
+
# =========================================================================
|
| 193 |
+
print("\n\n" + "="*70)
|
| 194 |
+
print("PASO 4: CROSS-REFERENCING (EL TRUCO PRINCIPAL)")
|
| 195 |
+
print("="*70)
|
| 196 |
+
|
| 197 |
+
print("\n🔗 Correlacionando resultados...")
|
| 198 |
+
print(" Buscando coincidencias entre:")
|
| 199 |
+
print(" • Dominios extraídos de PimEyes (OCR)")
|
| 200 |
+
print(" • URLs encontradas en Yandex/Bing")
|
| 201 |
+
|
| 202 |
+
xref = CrossReferencer()
|
| 203 |
+
|
| 204 |
+
# Realizar cross-referencing
|
| 205 |
+
unlocked_urls = xref.match_pimeyes_with_search(
|
| 206 |
+
pimeyes_results,
|
| 207 |
+
all_search_results,
|
| 208 |
+
[d['domain'] for d in all_ocr_domains]
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
print(f"\n🎯 Correlaciones encontradas: {len(unlocked_urls)}")
|
| 212 |
+
|
| 213 |
+
# =========================================================================
|
| 214 |
+
print("\n\n" + "="*70)
|
| 215 |
+
print("✨ RESULTADOS FINALES")
|
| 216 |
+
print("="*70)
|
| 217 |
+
|
| 218 |
+
if unlocked_urls:
|
| 219 |
+
print(f"\n🎉 ¡ÉXITO! {len(unlocked_urls)} URLs desbloqueadas de PimEyes")
|
| 220 |
+
print("\nURLs que PimEyes te cobraría $29.99 para ver:\n")
|
| 221 |
+
|
| 222 |
+
for idx, match in enumerate(unlocked_urls, 1):
|
| 223 |
+
print(f"\n[{idx}] 🔓 URL DESBLOQUEADA")
|
| 224 |
+
print(f" PimEyes OCR detectó: {match.get('pimeyes_domain_ocr', 'N/A')}")
|
| 225 |
+
print(f" Correlacionado con: {match.get('matched_url', 'N/A')}")
|
| 226 |
+
print(f" Fuente: {match.get('source', 'N/A')}")
|
| 227 |
+
print(f" Confianza: {match.get('match_confidence', 0):.2%}")
|
| 228 |
+
print(f" Estado: {'✅ UNLOCKED' if match.get('unlocked') else '❌'}")
|
| 229 |
+
|
| 230 |
+
# Calcular ahorro
|
| 231 |
+
savings = len(unlocked_urls) * 29.99
|
| 232 |
+
print(f"\n💰 Ahorro estimado: ${savings:.2f}")
|
| 233 |
+
print(f" (PimEyes cobra $29.99/mes para {len(unlocked_urls)} URLs)")
|
| 234 |
+
|
| 235 |
+
else:
|
| 236 |
+
print("\n⚠️ No se encontraron correlaciones")
|
| 237 |
+
print(" Posibles razones:")
|
| 238 |
+
print(" • La imagen no tiene suficientes resultados públicos")
|
| 239 |
+
print(" • Los dominios de PimEyes no coinciden con búsquedas abiertas")
|
| 240 |
+
print(" • OCR no pudo extraer dominios de las miniaturas")
|
| 241 |
+
|
| 242 |
+
# =========================================================================
|
| 243 |
+
print("\n\n" + "="*70)
|
| 244 |
+
print("📊 ESTADÍSTICAS DE LA BÚSQUEDA")
|
| 245 |
+
print("="*70)
|
| 246 |
+
|
| 247 |
+
print(f"\n• Miniaturas de PimEyes capturadas: {len(pimeyes_results)}")
|
| 248 |
+
print(f"• Dominios extraídos por OCR: {len(all_ocr_domains)}")
|
| 249 |
+
print(f"• Resultados de Yandex: {len(yandex_results)}")
|
| 250 |
+
print(f"• Resultados de Bing: {len(bing_results)}")
|
| 251 |
+
print(f"• URLs desbloqueadas: {len(unlocked_urls)}")
|
| 252 |
+
|
| 253 |
+
if all_ocr_domains and all_search_results:
|
| 254 |
+
success_rate = (len(unlocked_urls) / len(all_ocr_domains)) * 100
|
| 255 |
+
print(f"• Tasa de éxito: {success_rate:.1f}%")
|
| 256 |
+
|
| 257 |
+
# =========================================================================
|
| 258 |
+
print("\n\n" + "="*70)
|
| 259 |
+
print("🎓 CÓMO FUNCIONA EL TRUCO")
|
| 260 |
+
print("="*70)
|
| 261 |
+
|
| 262 |
+
print("""
|
| 263 |
+
PimEyes te muestra una miniatura así:
|
| 264 |
+
┌─────────────────────────┐
|
| 265 |
+
│ [Imagen borrosa] │
|
| 266 |
+
│ │
|
| 267 |
+
│ onlyfans.com/usuario │ ← Visible pero sin link
|
| 268 |
+
│ │
|
| 269 |
+
│ 🔒 Paga para ver URL │
|
| 270 |
+
└─────────────────────────┘
|
| 271 |
+
|
| 272 |
+
Aliah-Plus hace esto:
|
| 273 |
+
1. OCR extrae "onlyfans.com/usuario" de la miniatura
|
| 274 |
+
2. Yandex busca la misma cara
|
| 275 |
+
3. Yandex encuentra "https://onlyfans.com/usuario/photo.jpg"
|
| 276 |
+
4. Cross-referencer ve que ambos son "onlyfans.com"
|
| 277 |
+
5. ¡MATCH! → URL completa sin pagar
|
| 278 |
+
|
| 279 |
+
Resultado:
|
| 280 |
+
┌─────────────────────────┐
|
| 281 |
+
│ ✅ URL DESBLOQUEADA │
|
| 282 |
+
│ │
|
| 283 |
+
│ https://onlyfans.com/ │
|
| 284 |
+
│ usuario/photo.jpg │
|
| 285 |
+
│ │
|
| 286 |
+
│ Fuente: Yandex │
|
| 287 |
+
│ Confianza: 91% │
|
| 288 |
+
└─────────────────────────┘
|
| 289 |
+
""")
|
| 290 |
+
|
| 291 |
+
# =========================================================================
|
| 292 |
+
print("\n" + "="*70)
|
| 293 |
+
print("✅ DEMOSTRACIÓN COMPLETADA")
|
| 294 |
+
print("="*70)
|
| 295 |
+
|
| 296 |
+
print("\n🚀 Para usar en producción:")
|
| 297 |
+
print(" python app.py")
|
| 298 |
+
print(" → API disponible en http://localhost:8000")
|
| 299 |
+
print(" → Documentación en http://localhost:8000/docs")
|
| 300 |
+
|
| 301 |
+
print("\n📚 Más información:")
|
| 302 |
+
print(" • README.md - Documentación completa")
|
| 303 |
+
print(" • INTEGRATION_GUIDE.md - Guía de integración")
|
| 304 |
+
print(" • QUICKSTART.md - Inicio rápido")
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
async def main():
|
| 308 |
+
"""Punto de entrada"""
|
| 309 |
+
|
| 310 |
+
if len(sys.argv) < 2:
|
| 311 |
+
print("""
|
| 312 |
+
Uso: python demo_bypass.py <ruta_imagen>
|
| 313 |
+
|
| 314 |
+
Ejemplo:
|
| 315 |
+
python demo_bypass.py foto_persona.jpg
|
| 316 |
+
|
| 317 |
+
Este script demostrará cómo Aliah-Plus desbloquea URLs de PimEyes
|
| 318 |
+
usando OCR y cross-referencing.
|
| 319 |
+
""")
|
| 320 |
+
return
|
| 321 |
+
|
| 322 |
+
image_path = sys.argv[1]
|
| 323 |
+
|
| 324 |
+
if not Path(image_path).exists():
|
| 325 |
+
print(f"❌ Error: La imagen '{image_path}' no existe")
|
| 326 |
+
return
|
| 327 |
+
|
| 328 |
+
try:
|
| 329 |
+
await demo_pimeyes_bypass(image_path)
|
| 330 |
+
except KeyboardInterrupt:
|
| 331 |
+
print("\n\n⚠️ Demostración interrumpida por el usuario")
|
| 332 |
+
except Exception as e:
|
| 333 |
+
print(f"\n\n❌ Error: {e}")
|
| 334 |
+
import traceback
|
| 335 |
+
traceback.print_exc()
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
if __name__ == "__main__":
|
| 339 |
+
print("\n🔥 Iniciando demostración de Aliah-Plus...")
|
| 340 |
+
asyncio.run(main())
|
src/embedding_engine.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Embedding Engine - Generación de vectores faciales
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from deepface import DeepFace
|
| 6 |
+
import numpy as np
|
| 7 |
+
from loguru import logger
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class EmbeddingEngine:
    """
    Generate face embedding vectors with a DeepFace model.
    """

    SUPPORTED_MODELS = [
        "VGG-Face", "Facenet", "Facenet512", "OpenFace",
        "DeepFace", "DeepID", "ArcFace", "Dlib", "SFace"
    ]

    def __init__(self, model="ArcFace"):
        """
        Initialise the embedding engine.

        Args:
            model: Name of the model to use (default: ArcFace). Matching
                against SUPPORTED_MODELS is case-insensitive; an unknown
                name logs a warning and falls back to ArcFace.
        """
        # Map lower-cased names back to the canonical spelling that
        # SUPPORTED_MODELS uses.
        canonical = {name.lower(): name for name in self.SUPPORTED_MODELS}
        resolved = canonical.get(str(model).lower())

        if resolved is None:
            logger.warning(f"Modelo {model} no soportado, usando ArcFace")
            resolved = "ArcFace"

        self.model_name = resolved
        logger.info(f"Embedding Engine inicializado con modelo: {resolved}")

    def generate_embedding(self, face_image):
        """
        Generate an embedding vector for an already-cropped face.

        Args:
            face_image: Face image as a numpy array.
                NOTE(review): callers appear to pass RGB, while DeepFace
                documents numpy input as BGR; confirm the channel order.

        Returns:
            Numpy vector with the embedding, or None on any failure
            (the error is logged).
        """
        if face_image is None:
            logger.error("Error generando embedding: face_image is None")
            return None

        try:
            # Detection is skipped: the input is expected to be a face crop.
            representations = DeepFace.represent(
                img_path=face_image,
                model_name=self.model_name,
                enforce_detection=False,
                detector_backend='skip'
            )

            if not representations:
                raise ValueError("DeepFace.represent returned no results")

            embedding = np.asarray(representations[0]["embedding"])

            logger.debug(f"Embedding generado: {len(embedding)} dimensiones")

            return embedding

        except Exception as e:
            # Best-effort by design: callers treat None as "no embedding".
            logger.error(f"Error generando embedding: {e}")
            return None
|
src/face_processor.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Face Processor - Detección y alineación de rostros
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import cv2
|
| 6 |
+
import numpy as np
|
| 7 |
+
from mtcnn import MTCNN
|
| 8 |
+
from PIL import Image
|
| 9 |
+
from loguru import logger
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class FaceProcessor:
    """
    Detect, align and normalise a face in an image.
    """

    def __init__(self):
        """Create the MTCNN detector."""
        logger.info("Inicializando MTCNN...")
        self.detector = MTCNN()
        logger.success("MTCNN inicializado")

    def align_face(self, image):
        """
        Detect the largest face, level the eyes and return a 160x160 crop.

        Args:
            image: PIL image or array-like, read as RGB. Grayscale and
                RGBA inputs are converted to three channels first.

        Returns:
            160x160 face crop as a numpy array, or None when no face is
            detected or the crop cannot be resized.
        """
        # Anything that is not already an ndarray (PIL image, nested
        # list, ...) is converted through numpy's array interface.
        if not isinstance(image, np.ndarray):
            image = np.asarray(image)

        # Ensure a 3-channel image.
        if len(image.shape) == 2:  # Grayscale
            image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif image.shape[2] == 4:  # RGBA
            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

        faces = self.detector.detect_faces(image)

        if len(faces) == 0:
            logger.warning("No se detectó ningún rostro")
            return None

        # Take the largest face by box area, as the most likely subject.
        face = max(faces, key=lambda f: f['box'][2] * f['box'][3])

        keypoints = face['keypoints']
        left_eye = keypoints['left_eye']
        right_eye = keypoints['right_eye']

        # Angle of the line through the eyes relative to horizontal.
        dY = right_eye[1] - left_eye[1]
        dX = right_eye[0] - left_eye[0]
        angle = np.degrees(np.arctan2(dY, dX))

        x, y, width, height = face['box']
        h, w = image.shape[:2]

        # Rotate about the centre of the face box, not the image centre.
        # The box centre is the fixed point of the rotation, so the
        # detector's box still frames the face afterwards.
        center = (x + width / 2.0, y + height / 2.0)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        aligned = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC)

        # Crop with a 20% margin, clamped to the image bounds. Clamping
        # also absorbs boxes that start outside the image.
        margin = int(min(width, height) * 0.2)

        x1 = max(0, x - margin)
        y1 = max(0, y - margin)
        x2 = min(w, x + width + margin)
        y2 = min(h, y + height + margin)

        face_crop = aligned[y1:y2, x1:x2]

        # Resize to 160x160. An empty crop makes cv2.resize raise, which
        # is reported and turned into None.
        try:
            face_resized = cv2.resize(face_crop, (160, 160), interpolation=cv2.INTER_AREA)
            logger.debug(f"Rostro detectado y alineado: {face_resized.shape}")
            return face_resized
        except Exception as e:
            logger.error(f"Error al resize: {e}")
            return None
|
src/ocr_extractor.py
ADDED
|
@@ -0,0 +1,420 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OCR Extractor - Módulo Detective para extraer URLs ocultas de miniaturas
|
| 3 |
+
Este módulo rompe el bloqueo de sitios que censuran URLs con blur.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import easyocr
|
| 7 |
+
import numpy as np
|
| 8 |
+
import cv2
|
| 9 |
+
import re
|
| 10 |
+
from typing import List, Dict, Optional
|
| 11 |
+
from loguru import logger
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class OCRExtractor:
|
| 15 |
+
"""
|
| 16 |
+
Extrae dominios y URLs de imágenes, incluso si están borrosas o parcialmente ocultas.
|
| 17 |
+
Implementa técnicas de pre-procesamiento para mejorar la detección en miniaturas de baja calidad.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
# Extensiones de dominio comunes
|
| 21 |
+
TLD_PATTERNS = [
|
| 22 |
+
r'\.com', r'\.net', r'\.org', r'\.io', r'\.co',
|
| 23 |
+
r'\.tv', r'\.me', r'\.site', r'\.app', r'\.dev',
|
| 24 |
+
r'\.xxx', r'\.adult', r'\.porn', r'\.sex', # Dominios adultos
|
| 25 |
+
r'\.fan', r'\.fans', r'\.cam', r'\.live'
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
# Patrones de URL completas
|
| 29 |
+
URL_PATTERNS = [
|
| 30 |
+
r'https?://[^\s]+', # URLs con protocolo
|
| 31 |
+
r'www\.[a-zA-Z0-9-]+\.[a-zA-Z]{2,}', # www.dominio.com
|
| 32 |
+
r'[a-zA-Z0-9-]+\.(?:com|net|org|io|xxx|adult|porn|cam)', # dominio.com
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
# Plataformas conocidas
|
| 36 |
+
KNOWN_PLATFORMS = [
|
| 37 |
+
'onlyfans', 'fansly', 'patreon', 'instagram', 'twitter',
|
| 38 |
+
'tiktok', 'reddit', 'imgur', 'flickr', 'tumblr',
|
| 39 |
+
'xvideos', 'pornhub', 'xnxx', 'redtube', 'youporn',
|
| 40 |
+
'chaturbate', 'myfreecams', 'streamate', 'bongacams'
|
| 41 |
+
]
|
| 42 |
+
|
| 43 |
+
def __init__(self, gpu: bool = True, languages: List[str] = None):
|
| 44 |
+
"""
|
| 45 |
+
Inicializa el OCR engine.
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
gpu: Usar GPU si está disponible
|
| 49 |
+
languages: Lista de idiomas (default: ['en'])
|
| 50 |
+
"""
|
| 51 |
+
if languages is None:
|
| 52 |
+
languages = ['en']
|
| 53 |
+
|
| 54 |
+
logger.info(f"Inicializando EasyOCR con GPU={gpu}, idiomas={languages}")
|
| 55 |
+
|
| 56 |
+
try:
|
| 57 |
+
self.reader = easyocr.Reader(languages, gpu=gpu)
|
| 58 |
+
logger.success("EasyOCR inicializado correctamente")
|
| 59 |
+
except Exception as e:
|
| 60 |
+
logger.warning(f"Error al inicializar con GPU, usando CPU: {e}")
|
| 61 |
+
self.reader = easyocr.Reader(languages, gpu=False)
|
| 62 |
+
|
| 63 |
+
def preprocess_image(self, image_np: np.ndarray) -> List[np.ndarray]:
|
| 64 |
+
"""
|
| 65 |
+
Pre-procesa la imagen con múltiples técnicas para mejorar la detección de texto.
|
| 66 |
+
Retorna múltiples versiones de la imagen procesada.
|
| 67 |
+
|
| 68 |
+
Args:
|
| 69 |
+
image_np: Imagen en formato numpy array (BGR)
|
| 70 |
+
|
| 71 |
+
Returns:
|
| 72 |
+
Lista de imágenes procesadas
|
| 73 |
+
"""
|
| 74 |
+
processed_images = []
|
| 75 |
+
|
| 76 |
+
# Convertir a escala de grises
|
| 77 |
+
if len(image_np.shape) == 3:
|
| 78 |
+
gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
|
| 79 |
+
else:
|
| 80 |
+
gray = image_np.copy()
|
| 81 |
+
|
| 82 |
+
# 1. Imagen original en escala de grises
|
| 83 |
+
processed_images.append(gray)
|
| 84 |
+
|
| 85 |
+
# 2. Umbral binario (para texto oscuro en fondo claro)
|
| 86 |
+
_, thresh1 = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
|
| 87 |
+
processed_images.append(thresh1)
|
| 88 |
+
|
| 89 |
+
# 3. Umbral binario invertido (para texto claro en fondo oscuro)
|
| 90 |
+
_, thresh2 = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
|
| 91 |
+
processed_images.append(thresh2)
|
| 92 |
+
|
| 93 |
+
# 4. Umbral adaptativo (para imágenes con iluminación irregular)
|
| 94 |
+
adaptive = cv2.adaptiveThreshold(
|
| 95 |
+
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
| 96 |
+
cv2.THRESH_BINARY, 11, 2
|
| 97 |
+
)
|
| 98 |
+
processed_images.append(adaptive)
|
| 99 |
+
|
| 100 |
+
# 5. Mejorar contraste con CLAHE
|
| 101 |
+
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
| 102 |
+
enhanced = clahe.apply(gray)
|
| 103 |
+
processed_images.append(enhanced)
|
| 104 |
+
|
| 105 |
+
# 6. Reducción de ruido
|
| 106 |
+
denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
|
| 107 |
+
processed_images.append(denoised)
|
| 108 |
+
|
| 109 |
+
# 7. Sharpening (para texto borroso)
|
| 110 |
+
kernel_sharpen = np.array([[-1, -1, -1],
|
| 111 |
+
[-1, 9, -1],
|
| 112 |
+
[-1, -1, -1]])
|
| 113 |
+
sharpened = cv2.filter2D(gray, -1, kernel_sharpen)
|
| 114 |
+
processed_images.append(sharpened)
|
| 115 |
+
|
| 116 |
+
return processed_images
|
| 117 |
+
|
| 118 |
+
def extract_text_from_image(self, image_np: np.ndarray) -> List[Dict]:
|
| 119 |
+
"""
|
| 120 |
+
Extrae todo el texto visible de una imagen.
|
| 121 |
+
|
| 122 |
+
Args:
|
| 123 |
+
image_np: Imagen en formato numpy array
|
| 124 |
+
|
| 125 |
+
Returns:
|
| 126 |
+
Lista de diccionarios con texto detectado y confianza
|
| 127 |
+
"""
|
| 128 |
+
all_results = []
|
| 129 |
+
|
| 130 |
+
# Procesar múltiples versiones de la imagen
|
| 131 |
+
processed_images = self.preprocess_image(image_np)
|
| 132 |
+
|
| 133 |
+
for idx, processed in enumerate(processed_images):
|
| 134 |
+
try:
|
| 135 |
+
results = self.reader.readtext(processed, paragraph=False)
|
| 136 |
+
|
| 137 |
+
for bbox, text, confidence in results:
|
| 138 |
+
all_results.append({
|
| 139 |
+
'text': text,
|
| 140 |
+
'confidence': float(confidence),
|
| 141 |
+
'bbox': bbox,
|
| 142 |
+
'preprocessing_method': idx
|
| 143 |
+
})
|
| 144 |
+
|
| 145 |
+
except Exception as e:
|
| 146 |
+
logger.debug(f"Error en método de preprocesamiento {idx}: {e}")
|
| 147 |
+
continue
|
| 148 |
+
|
| 149 |
+
# Eliminar duplicados y mantener los de mayor confianza
|
| 150 |
+
unique_results = self._deduplicate_results(all_results)
|
| 151 |
+
|
| 152 |
+
return unique_results
|
| 153 |
+
|
| 154 |
+
def _deduplicate_results(self, results: List[Dict]) -> List[Dict]:
|
| 155 |
+
"""
|
| 156 |
+
Elimina resultados duplicados, manteniendo el de mayor confianza.
|
| 157 |
+
"""
|
| 158 |
+
seen = {}
|
| 159 |
+
|
| 160 |
+
for result in results:
|
| 161 |
+
text = result['text'].lower().strip()
|
| 162 |
+
|
| 163 |
+
if text not in seen or result['confidence'] > seen[text]['confidence']:
|
| 164 |
+
seen[text] = result
|
| 165 |
+
|
| 166 |
+
return list(seen.values())
|
| 167 |
+
|
| 168 |
+
def extract_domain_from_thumb(self, image_np: np.ndarray,
|
| 169 |
+
min_confidence: float = 0.6) -> List[Dict]:
|
| 170 |
+
"""
|
| 171 |
+
Extrae dominios específicamente de una miniatura.
|
| 172 |
+
Este es el método principal para romper el bloqueo de PimEyes.
|
| 173 |
+
|
| 174 |
+
Args:
|
| 175 |
+
image_np: Imagen en formato numpy array
|
| 176 |
+
min_confidence: Confianza mínima para considerar válido (0.0-1.0)
|
| 177 |
+
|
| 178 |
+
Returns:
|
| 179 |
+
Lista de dominios encontrados con metadata
|
| 180 |
+
"""
|
| 181 |
+
# Extraer todo el texto
|
| 182 |
+
text_results = self.extract_text_from_image(image_np)
|
| 183 |
+
|
| 184 |
+
found_domains = []
|
| 185 |
+
|
| 186 |
+
for result in text_results:
|
| 187 |
+
text = result['text']
|
| 188 |
+
confidence = result['confidence']
|
| 189 |
+
|
| 190 |
+
if confidence < min_confidence:
|
| 191 |
+
continue
|
| 192 |
+
|
| 193 |
+
# Limpiar texto
|
| 194 |
+
cleaned_text = self._clean_text(text)
|
| 195 |
+
|
| 196 |
+
# Buscar dominios
|
| 197 |
+
domains = self._find_domains_in_text(cleaned_text)
|
| 198 |
+
|
| 199 |
+
for domain in domains:
|
| 200 |
+
found_domains.append({
|
| 201 |
+
'domain': domain,
|
| 202 |
+
'confidence': confidence,
|
| 203 |
+
'original_text': text,
|
| 204 |
+
'cleaned_text': cleaned_text,
|
| 205 |
+
'bbox': result['bbox'],
|
| 206 |
+
'method': result['preprocessing_method']
|
| 207 |
+
})
|
| 208 |
+
|
| 209 |
+
# Ordenar por confianza
|
| 210 |
+
found_domains.sort(key=lambda x: x['confidence'], reverse=True)
|
| 211 |
+
|
| 212 |
+
# Eliminar duplicados
|
| 213 |
+
unique_domains = self._deduplicate_domains(found_domains)
|
| 214 |
+
|
| 215 |
+
logger.info(f"OCR: Encontrados {len(unique_domains)} dominios únicos")
|
| 216 |
+
|
| 217 |
+
return unique_domains
|
| 218 |
+
|
| 219 |
+
def _clean_text(self, text: str) -> str:
|
| 220 |
+
"""
|
| 221 |
+
Limpia el texto extraído para mejorar la detección de dominios.
|
| 222 |
+
"""
|
| 223 |
+
# Convertir a minúsculas
|
| 224 |
+
text = text.lower()
|
| 225 |
+
|
| 226 |
+
# Remover espacios múltiples
|
| 227 |
+
text = re.sub(r'\s+', '', text)
|
| 228 |
+
|
| 229 |
+
# Corregir errores comunes de OCR
|
| 230 |
+
corrections = {
|
| 231 |
+
'c0m': 'com',
|
| 232 |
+
'c om': 'com',
|
| 233 |
+
'co m': 'com',
|
| 234 |
+
'n et': 'net',
|
| 235 |
+
'ne t': 'net',
|
| 236 |
+
'0rg': 'org',
|
| 237 |
+
'o rg': 'org',
|
| 238 |
+
'i o': 'io',
|
| 239 |
+
'tv ': 'tv',
|
| 240 |
+
'xxx ': 'xxx',
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
for wrong, correct in corrections.items():
|
| 244 |
+
text = text.replace(wrong, correct)
|
| 245 |
+
|
| 246 |
+
return text
|
| 247 |
+
|
| 248 |
+
def _find_domains_in_text(self, text: str) -> List[str]:
|
| 249 |
+
"""
|
| 250 |
+
Encuentra dominios en un texto usando patrones y heurísticas.
|
| 251 |
+
"""
|
| 252 |
+
domains = []
|
| 253 |
+
|
| 254 |
+
# Método 1: Buscar con regex de URLs
|
| 255 |
+
for pattern in self.URL_PATTERNS:
|
| 256 |
+
matches = re.findall(pattern, text, re.IGNORECASE)
|
| 257 |
+
domains.extend(matches)
|
| 258 |
+
|
| 259 |
+
# Método 2: Buscar TLDs
|
| 260 |
+
for tld_pattern in self.TLD_PATTERNS:
|
| 261 |
+
# Buscar palabra seguida de TLD
|
| 262 |
+
pattern = r'([a-zA-Z0-9-]+' + tld_pattern + r'(?:/[^\s]*)?)'
|
| 263 |
+
matches = re.findall(pattern, text, re.IGNORECASE)
|
| 264 |
+
domains.extend(matches)
|
| 265 |
+
|
| 266 |
+
# Método 3: Buscar plataformas conocidas
|
| 267 |
+
for platform in self.KNOWN_PLATFORMS:
|
| 268 |
+
if platform in text:
|
| 269 |
+
# Intentar extraer username si existe
|
| 270 |
+
username_pattern = rf'{platform}\.com/([a-zA-Z0-9_-]+)'
|
| 271 |
+
username_match = re.search(username_pattern, text)
|
| 272 |
+
|
| 273 |
+
if username_match:
|
| 274 |
+
domains.append(f"{platform}.com/{username_match.group(1)}")
|
| 275 |
+
else:
|
| 276 |
+
domains.append(f"{platform}.com")
|
| 277 |
+
|
| 278 |
+
# Limpiar y validar dominios
|
| 279 |
+
cleaned_domains = []
|
| 280 |
+
for domain in domains:
|
| 281 |
+
domain = domain.strip().lower()
|
| 282 |
+
domain = re.sub(r'^https?://', '', domain)
|
| 283 |
+
domain = re.sub(r'^www\.', '', domain)
|
| 284 |
+
|
| 285 |
+
# Validar que parece un dominio válido
|
| 286 |
+
if self._is_valid_domain(domain):
|
| 287 |
+
cleaned_domains.append(domain)
|
| 288 |
+
|
| 289 |
+
return list(set(cleaned_domains)) # Eliminar duplicados
|
| 290 |
+
|
| 291 |
+
def _is_valid_domain(self, domain: str) -> bool:
|
| 292 |
+
"""
|
| 293 |
+
Valida que una cadena parece ser un dominio válido.
|
| 294 |
+
"""
|
| 295 |
+
# Debe tener al menos un punto
|
| 296 |
+
if '.' not in domain:
|
| 297 |
+
return False
|
| 298 |
+
|
| 299 |
+
# No debe tener espacios
|
| 300 |
+
if ' ' in domain:
|
| 301 |
+
return False
|
| 302 |
+
|
| 303 |
+
# Debe tener un TLD válido
|
| 304 |
+
has_valid_tld = any(tld.replace('\\', '').replace('.', '') in domain
|
| 305 |
+
for tld in self.TLD_PATTERNS)
|
| 306 |
+
|
| 307 |
+
return has_valid_tld
|
| 308 |
+
|
| 309 |
+
def _deduplicate_domains(self, domains: List[Dict]) -> List[Dict]:
|
| 310 |
+
"""
|
| 311 |
+
Elimina dominios duplicados, manteniendo el de mayor confianza.
|
| 312 |
+
"""
|
| 313 |
+
seen = {}
|
| 314 |
+
|
| 315 |
+
for item in domains:
|
| 316 |
+
domain = item['domain']
|
| 317 |
+
|
| 318 |
+
if domain not in seen or item['confidence'] > seen[domain]['confidence']:
|
| 319 |
+
seen[domain] = item
|
| 320 |
+
|
| 321 |
+
return list(seen.values())
|
| 322 |
+
|
| 323 |
+
def extract_from_pimeyes_thumbnail(self, image_np: np.ndarray) -> Dict:
|
| 324 |
+
"""
|
| 325 |
+
Método especializado para miniaturas de PimEyes.
|
| 326 |
+
Aplica técnicas específicas para este sitio.
|
| 327 |
+
|
| 328 |
+
Args:
|
| 329 |
+
image_np: Miniatura de PimEyes (generalmente con blur)
|
| 330 |
+
|
| 331 |
+
Returns:
|
| 332 |
+
Diccionario con dominios extraídos y metadata
|
| 333 |
+
"""
|
| 334 |
+
logger.info("Procesando miniatura de PimEyes con técnicas especializadas")
|
| 335 |
+
|
| 336 |
+
# PimEyes suele poner el dominio en la parte inferior
|
| 337 |
+
height = image_np.shape[0]
|
| 338 |
+
|
| 339 |
+
# Extraer solo la parte inferior (donde suele estar el texto)
|
| 340 |
+
bottom_region = image_np[int(height * 0.7):, :]
|
| 341 |
+
|
| 342 |
+
# Aplicar mejoras específicas para texto con blur
|
| 343 |
+
deblurred = self._deblur_text_region(bottom_region)
|
| 344 |
+
|
| 345 |
+
# Extraer dominios
|
| 346 |
+
domains = self.extract_domain_from_thumb(deblurred, min_confidence=0.5)
|
| 347 |
+
|
| 348 |
+
return {
|
| 349 |
+
'domains': domains,
|
| 350 |
+
'source': 'pimeyes',
|
| 351 |
+
'confidence_avg': np.mean([d['confidence'] for d in domains]) if domains else 0.0,
|
| 352 |
+
'total_found': len(domains)
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
def _deblur_text_region(self, image_np: np.ndarray) -> np.ndarray:
|
| 356 |
+
"""
|
| 357 |
+
Aplica técnicas de deblurring específicas para regiones de texto.
|
| 358 |
+
"""
|
| 359 |
+
# Convertir a escala de grises
|
| 360 |
+
if len(image_np.shape) == 3:
|
| 361 |
+
gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
|
| 362 |
+
else:
|
| 363 |
+
gray = image_np
|
| 364 |
+
|
| 365 |
+
# Aplicar Wiener filter aproximado
|
| 366 |
+
kernel = np.ones((3, 3), np.float32) / 9
|
| 367 |
+
deblurred = cv2.filter2D(gray, -1, kernel)
|
| 368 |
+
|
| 369 |
+
# Sharpen agresivo
|
| 370 |
+
kernel_sharpen = np.array([[-1, -1, -1, -1, -1],
|
| 371 |
+
[-1, 2, 2, 2, -1],
|
| 372 |
+
[-1, 2, 8, 2, -1],
|
| 373 |
+
[-1, 2, 2, 2, -1],
|
| 374 |
+
[-1, -1, -1, -1, -1]]) / 8.0
|
| 375 |
+
|
| 376 |
+
sharpened = cv2.filter2D(deblurred, -1, kernel_sharpen)
|
| 377 |
+
|
| 378 |
+
# Aumentar contraste
|
| 379 |
+
sharpened = cv2.equalizeHist(sharpened.astype(np.uint8))
|
| 380 |
+
|
| 381 |
+
return sharpened
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
# Función de utilidad para uso directo
|
| 385 |
+
def quick_extract_domains(image_path: str, min_confidence: float = 0.6) -> List[str]:
|
| 386 |
+
"""
|
| 387 |
+
Función de conveniencia para extraer dominios rápidamente.
|
| 388 |
+
|
| 389 |
+
Args:
|
| 390 |
+
image_path: Ruta a la imagen
|
| 391 |
+
min_confidence: Confianza mínima
|
| 392 |
+
|
| 393 |
+
Returns:
|
| 394 |
+
Lista de dominios encontrados
|
| 395 |
+
"""
|
| 396 |
+
import cv2
|
| 397 |
+
|
| 398 |
+
image = cv2.imread(image_path)
|
| 399 |
+
if image is None:
|
| 400 |
+
raise ValueError(f"No se pudo cargar la imagen: {image_path}")
|
| 401 |
+
|
| 402 |
+
extractor = OCRExtractor()
|
| 403 |
+
results = extractor.extract_domain_from_thumb(image, min_confidence)
|
| 404 |
+
|
| 405 |
+
return [r['domain'] for r in results]
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
if __name__ == "__main__":
|
| 409 |
+
# Ejemplo de uso
|
| 410 |
+
import sys
|
| 411 |
+
|
| 412 |
+
if len(sys.argv) > 1:
|
| 413 |
+
image_path = sys.argv[1]
|
| 414 |
+
domains = quick_extract_domains(image_path)
|
| 415 |
+
|
| 416 |
+
print(f"\n🔍 Dominios encontrados: {len(domains)}")
|
| 417 |
+
for domain in domains:
|
| 418 |
+
print(f" • {domain}")
|
| 419 |
+
else:
|
| 420 |
+
print("Uso: python ocr_extractor.py <ruta_imagen>")
|
src/stealth_engine.py
ADDED
|
@@ -0,0 +1,454 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Stealth Engine - Motor de scraping con anti-detección
|
| 3 |
+
Bypasea las protecciones de sitios como PimEyes, OnlyFans, etc.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from playwright.async_api import async_playwright, Browser, Page
|
| 7 |
+
from playwright_stealth import stealth_async
|
| 8 |
+
from typing import List, Dict, Optional
|
| 9 |
+
import asyncio
|
| 10 |
+
import random
|
| 11 |
+
from loguru import logger
|
| 12 |
+
from fake_useragent import UserAgent
|
| 13 |
+
import json
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class StealthSearch:
|
| 17 |
+
"""
|
| 18 |
+
Motor de búsqueda con capacidades de evasión anti-bot.
|
| 19 |
+
Implementa técnicas avanzadas para parecer un usuario real.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
# User agents rotativos
|
| 23 |
+
USER_AGENTS = [
|
| 24 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 25 |
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 26 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
|
| 27 |
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
+
def __init__(self, headless: bool = True, proxy: Optional[str] = None):
|
| 31 |
+
"""
|
| 32 |
+
Inicializa el motor de búsqueda stealth.
|
| 33 |
+
|
| 34 |
+
Args:
|
| 35 |
+
headless: Ejecutar navegador sin GUI
|
| 36 |
+
proxy: Proxy a usar (formato: "http://ip:port")
|
| 37 |
+
"""
|
| 38 |
+
self.headless = headless
|
| 39 |
+
self.proxy = proxy
|
| 40 |
+
self.ua_generator = UserAgent()
|
| 41 |
+
|
| 42 |
+
async def _create_stealth_browser(self) -> tuple[Browser, Page]:
|
| 43 |
+
"""
|
| 44 |
+
Crea un navegador con todas las protecciones anti-detección activadas.
|
| 45 |
+
"""
|
| 46 |
+
playwright = await async_playwright().start()
|
| 47 |
+
|
| 48 |
+
# Configuración del navegador
|
| 49 |
+
launch_options = {
|
| 50 |
+
'headless': self.headless,
|
| 51 |
+
'args': [
|
| 52 |
+
'--disable-blink-features=AutomationControlled',
|
| 53 |
+
'--disable-dev-shm-usage',
|
| 54 |
+
'--no-sandbox',
|
| 55 |
+
'--disable-setuid-sandbox',
|
| 56 |
+
'--disable-web-security',
|
| 57 |
+
'--disable-features=IsolateOrigins,site-per-process',
|
| 58 |
+
]
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
if self.proxy:
|
| 62 |
+
launch_options['proxy'] = {'server': self.proxy}
|
| 63 |
+
|
| 64 |
+
browser = await playwright.chromium.launch(**launch_options)
|
| 65 |
+
|
| 66 |
+
# Crear contexto con fingerprint realista
|
| 67 |
+
context = await browser.new_context(
|
| 68 |
+
user_agent=random.choice(self.USER_AGENTS),
|
| 69 |
+
viewport={'width': 1920, 'height': 1080},
|
| 70 |
+
locale='en-US',
|
| 71 |
+
timezone_id='America/New_York',
|
| 72 |
+
permissions=['geolocation'],
|
| 73 |
+
geolocation={'latitude': 40.7128, 'longitude': -74.0060}, # NYC
|
| 74 |
+
color_scheme='light',
|
| 75 |
+
device_scale_factor=1,
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
# Crear página
|
| 79 |
+
page = await context.new_page()
|
| 80 |
+
|
| 81 |
+
# Aplicar playwright-stealth
|
| 82 |
+
await stealth_async(page)
|
| 83 |
+
|
| 84 |
+
# Inyectar scripts adicionales de evasión
|
| 85 |
+
await self._inject_evasion_scripts(page)
|
| 86 |
+
|
| 87 |
+
logger.info("Navegador stealth creado exitosamente")
|
| 88 |
+
|
| 89 |
+
return browser, page
|
| 90 |
+
|
| 91 |
+
async def _inject_evasion_scripts(self, page: Page):
|
| 92 |
+
"""
|
| 93 |
+
Inyecta scripts JavaScript para evadir detección adicional.
|
| 94 |
+
"""
|
| 95 |
+
# Sobrescribir navigator.webdriver
|
| 96 |
+
await page.add_init_script("""
|
| 97 |
+
Object.defineProperty(navigator, 'webdriver', {
|
| 98 |
+
get: () => undefined
|
| 99 |
+
});
|
| 100 |
+
""")
|
| 101 |
+
|
| 102 |
+
# Sobrescribir navigator.plugins
|
| 103 |
+
await page.add_init_script("""
|
| 104 |
+
Object.defineProperty(navigator, 'plugins', {
|
| 105 |
+
get: () => [1, 2, 3, 4, 5]
|
| 106 |
+
});
|
| 107 |
+
""")
|
| 108 |
+
|
| 109 |
+
# Sobrescribir navigator.languages
|
| 110 |
+
await page.add_init_script("""
|
| 111 |
+
Object.defineProperty(navigator, 'languages', {
|
| 112 |
+
get: () => ['en-US', 'en']
|
| 113 |
+
});
|
| 114 |
+
""")
|
| 115 |
+
|
| 116 |
+
# Chrome runtime mock
|
| 117 |
+
await page.add_init_script("""
|
| 118 |
+
window.chrome = {
|
| 119 |
+
runtime: {}
|
| 120 |
+
};
|
| 121 |
+
""")
|
| 122 |
+
|
| 123 |
+
# Permissions mock
|
| 124 |
+
await page.add_init_script("""
|
| 125 |
+
const originalQuery = window.navigator.permissions.query;
|
| 126 |
+
window.navigator.permissions.query = (parameters) => (
|
| 127 |
+
parameters.name === 'notifications' ?
|
| 128 |
+
Promise.resolve({ state: Notification.permission }) :
|
| 129 |
+
originalQuery(parameters)
|
| 130 |
+
);
|
| 131 |
+
""")
|
| 132 |
+
|
| 133 |
+
async def _human_behavior(self, page: Page):
|
| 134 |
+
"""
|
| 135 |
+
Simula comportamiento humano: movimientos de mouse, scrolls, etc.
|
| 136 |
+
"""
|
| 137 |
+
# Scroll aleatorio
|
| 138 |
+
await page.evaluate("""
|
| 139 |
+
window.scrollTo({
|
| 140 |
+
top: Math.random() * 500,
|
| 141 |
+
behavior: 'smooth'
|
| 142 |
+
});
|
| 143 |
+
""")
|
| 144 |
+
|
| 145 |
+
# Espera aleatoria
|
| 146 |
+
await asyncio.sleep(random.uniform(0.5, 2.0))
|
| 147 |
+
|
| 148 |
+
# Movimiento de mouse aleatorio
|
| 149 |
+
await page.mouse.move(
|
| 150 |
+
random.randint(100, 500),
|
| 151 |
+
random.randint(100, 500)
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
async def search_pimeyes_free(self, image_path: str) -> List[Dict]:
|
| 155 |
+
"""
|
| 156 |
+
Busca en PimEyes sin pagar, extrayendo las miniaturas censuradas.
|
| 157 |
+
|
| 158 |
+
Args:
|
| 159 |
+
image_path: Ruta a la imagen a buscar
|
| 160 |
+
|
| 161 |
+
Returns:
|
| 162 |
+
Lista de resultados con miniaturas y datos extraíbles
|
| 163 |
+
"""
|
| 164 |
+
logger.info("Iniciando búsqueda stealth en PimEyes")
|
| 165 |
+
|
| 166 |
+
browser, page = await self._create_stealth_browser()
|
| 167 |
+
results = []
|
| 168 |
+
|
| 169 |
+
try:
|
| 170 |
+
# Navegar a PimEyes
|
| 171 |
+
await page.goto('https://pimeyes.com/en', wait_until='networkidle')
|
| 172 |
+
logger.info("Página PimEyes cargada")
|
| 173 |
+
|
| 174 |
+
# Simular comportamiento humano
|
| 175 |
+
await self._human_behavior(page)
|
| 176 |
+
|
| 177 |
+
# Aceptar cookies si aparecen
|
| 178 |
+
try:
|
| 179 |
+
await page.click('button:has-text("Accept")', timeout=3000)
|
| 180 |
+
except:
|
| 181 |
+
pass
|
| 182 |
+
|
| 183 |
+
# Buscar el botón de upload
|
| 184 |
+
upload_button = await page.query_selector('input[type="file"]')
|
| 185 |
+
|
| 186 |
+
if upload_button:
|
| 187 |
+
# Subir imagen
|
| 188 |
+
await upload_button.set_input_files(image_path)
|
| 189 |
+
logger.info("Imagen subida, esperando resultados...")
|
| 190 |
+
|
| 191 |
+
# Esperar a que carguen los resultados
|
| 192 |
+
await page.wait_for_selector('.results-container', timeout=30000)
|
| 193 |
+
|
| 194 |
+
# Simular scroll para que carguen más imágenes
|
| 195 |
+
for _ in range(3):
|
| 196 |
+
await page.evaluate('window.scrollBy(0, 500)')
|
| 197 |
+
await asyncio.sleep(1)
|
| 198 |
+
|
| 199 |
+
# Extraer miniaturas
|
| 200 |
+
thumbnails = await page.query_selector_all('.result-item img')
|
| 201 |
+
|
| 202 |
+
for idx, thumb in enumerate(thumbnails):
|
| 203 |
+
try:
|
| 204 |
+
# Extraer URL de la miniatura
|
| 205 |
+
thumb_url = await thumb.get_attribute('src')
|
| 206 |
+
|
| 207 |
+
# Extraer contenedor padre para obtener metadata
|
| 208 |
+
parent = await thumb.evaluate_handle('el => el.closest(".result-item")')
|
| 209 |
+
parent_html = await parent.inner_html()
|
| 210 |
+
|
| 211 |
+
# Buscar texto visible (puede contener dominio)
|
| 212 |
+
text_content = await parent.inner_text()
|
| 213 |
+
|
| 214 |
+
# Tomar screenshot de la miniatura individual
|
| 215 |
+
screenshot = await thumb.screenshot()
|
| 216 |
+
|
| 217 |
+
results.append({
|
| 218 |
+
'thumbnail_url': thumb_url,
|
| 219 |
+
'index': idx,
|
| 220 |
+
'text_content': text_content,
|
| 221 |
+
'screenshot': screenshot,
|
| 222 |
+
'source': 'pimeyes',
|
| 223 |
+
'censored': 'blur' in parent_html.lower() or 'premium' in parent_html.lower()
|
| 224 |
+
})
|
| 225 |
+
|
| 226 |
+
logger.debug(f"Miniatura {idx} extraída")
|
| 227 |
+
|
| 228 |
+
except Exception as e:
|
| 229 |
+
logger.warning(f"Error extrayendo miniatura {idx}: {e}")
|
| 230 |
+
continue
|
| 231 |
+
|
| 232 |
+
logger.success(f"PimEyes: {len(results)} miniaturas extraídas")
|
| 233 |
+
|
| 234 |
+
else:
|
| 235 |
+
logger.error("No se encontró el botón de upload en PimEyes")
|
| 236 |
+
|
| 237 |
+
except Exception as e:
|
| 238 |
+
logger.error(f"Error en búsqueda de PimEyes: {e}")
|
| 239 |
+
|
| 240 |
+
finally:
|
| 241 |
+
await browser.close()
|
| 242 |
+
|
| 243 |
+
return results
|
| 244 |
+
|
| 245 |
+
async def search_yandex_reverse(self, image_path: str) -> List[Dict]:
|
| 246 |
+
"""
|
| 247 |
+
Búsqueda reversa en Yandex Images con stealth.
|
| 248 |
+
|
| 249 |
+
Args:
|
| 250 |
+
image_path: Ruta a la imagen
|
| 251 |
+
|
| 252 |
+
Returns:
|
| 253 |
+
Lista de resultados
|
| 254 |
+
"""
|
| 255 |
+
logger.info("Iniciando búsqueda stealth en Yandex")
|
| 256 |
+
|
| 257 |
+
browser, page = await self._create_stealth_browser()
|
| 258 |
+
results = []
|
| 259 |
+
|
| 260 |
+
try:
|
| 261 |
+
# Navegar a Yandex Images
|
| 262 |
+
await page.goto('https://yandex.com/images/', wait_until='networkidle')
|
| 263 |
+
|
| 264 |
+
# Simular comportamiento humano
|
| 265 |
+
await self._human_behavior(page)
|
| 266 |
+
|
| 267 |
+
# Click en el botón de búsqueda por imagen
|
| 268 |
+
try:
|
| 269 |
+
camera_button = await page.query_selector('.cbir-panel__button')
|
| 270 |
+
await camera_button.click()
|
| 271 |
+
await asyncio.sleep(1)
|
| 272 |
+
except:
|
| 273 |
+
logger.warning("No se pudo hacer click en botón de cámara")
|
| 274 |
+
|
| 275 |
+
# Subir imagen
|
| 276 |
+
file_input = await page.query_selector('input[type="file"]')
|
| 277 |
+
if file_input:
|
| 278 |
+
await file_input.set_input_files(image_path)
|
| 279 |
+
logger.info("Imagen subida a Yandex")
|
| 280 |
+
|
| 281 |
+
# Esperar resultados
|
| 282 |
+
await page.wait_for_selector('.serp-item', timeout=15000)
|
| 283 |
+
|
| 284 |
+
# Scroll para cargar más resultados
|
| 285 |
+
for _ in range(5):
|
| 286 |
+
await page.evaluate('window.scrollBy(0, 800)')
|
| 287 |
+
await asyncio.sleep(0.5)
|
| 288 |
+
|
| 289 |
+
# Extraer resultados
|
| 290 |
+
items = await page.query_selector_all('.serp-item')
|
| 291 |
+
|
| 292 |
+
for idx, item in enumerate(items[:50]):
|
| 293 |
+
try:
|
| 294 |
+
# Extraer link
|
| 295 |
+
link_elem = await item.query_selector('a.serp-item__link')
|
| 296 |
+
url = await link_elem.get_attribute('href') if link_elem else None
|
| 297 |
+
|
| 298 |
+
# Extraer miniatura
|
| 299 |
+
img_elem = await item.query_selector('img.serp-item__thumb')
|
| 300 |
+
thumb_url = await img_elem.get_attribute('src') if img_elem else None
|
| 301 |
+
|
| 302 |
+
# Extraer dominio
|
| 303 |
+
domain_elem = await item.query_selector('.serp-item__domain')
|
| 304 |
+
domain = await domain_elem.inner_text() if domain_elem else None
|
| 305 |
+
|
| 306 |
+
if url:
|
| 307 |
+
results.append({
|
| 308 |
+
'url': url,
|
| 309 |
+
'thumbnail_url': thumb_url,
|
| 310 |
+
'domain': domain,
|
| 311 |
+
'source': 'yandex',
|
| 312 |
+
'index': idx
|
| 313 |
+
})
|
| 314 |
+
|
| 315 |
+
except Exception as e:
|
| 316 |
+
logger.debug(f"Error extrayendo item {idx}: {e}")
|
| 317 |
+
continue
|
| 318 |
+
|
| 319 |
+
logger.success(f"Yandex: {len(results)} resultados extraídos")
|
| 320 |
+
|
| 321 |
+
except Exception as e:
|
| 322 |
+
logger.error(f"Error en búsqueda de Yandex: {e}")
|
| 323 |
+
|
| 324 |
+
finally:
|
| 325 |
+
await browser.close()
|
| 326 |
+
|
| 327 |
+
return results
|
| 328 |
+
|
| 329 |
+
async def search_bing_reverse(self, image_path: str) -> List[Dict]:
|
| 330 |
+
"""
|
| 331 |
+
Búsqueda reversa en Bing Images con stealth.
|
| 332 |
+
"""
|
| 333 |
+
logger.info("Iniciando búsqueda stealth en Bing")
|
| 334 |
+
|
| 335 |
+
browser, page = await self._create_stealth_browser()
|
| 336 |
+
results = []
|
| 337 |
+
|
| 338 |
+
try:
|
| 339 |
+
# Navegar a Bing Images
|
| 340 |
+
await page.goto('https://www.bing.com/images', wait_until='networkidle')
|
| 341 |
+
|
| 342 |
+
await self._human_behavior(page)
|
| 343 |
+
|
| 344 |
+
# Click en búsqueda por imagen
|
| 345 |
+
try:
|
| 346 |
+
camera_icon = await page.query_selector('.cameraIcon')
|
| 347 |
+
await camera_icon.click()
|
| 348 |
+
await asyncio.sleep(1)
|
| 349 |
+
except:
|
| 350 |
+
logger.warning("No se encontró icono de cámara en Bing")
|
| 351 |
+
|
| 352 |
+
# Subir imagen
|
| 353 |
+
file_input = await page.query_selector('input[type="file"]')
|
| 354 |
+
if file_input:
|
| 355 |
+
await file_input.set_input_files(image_path)
|
| 356 |
+
|
| 357 |
+
# Esperar resultados
|
| 358 |
+
await page.wait_for_selector('.imgpt', timeout=15000)
|
| 359 |
+
|
| 360 |
+
# Scroll
|
| 361 |
+
for _ in range(3):
|
| 362 |
+
await page.evaluate('window.scrollBy(0, 1000)')
|
| 363 |
+
await asyncio.sleep(1)
|
| 364 |
+
|
| 365 |
+
# Extraer resultados
|
| 366 |
+
items = await page.query_selector_all('.imgpt')
|
| 367 |
+
|
| 368 |
+
for idx, item in enumerate(items[:50]):
|
| 369 |
+
try:
|
| 370 |
+
link_elem = await item.query_selector('a')
|
| 371 |
+
url = await link_elem.get_attribute('href') if link_elem else None
|
| 372 |
+
|
| 373 |
+
img_elem = await item.query_selector('img')
|
| 374 |
+
thumb_url = await img_elem.get_attribute('src') if img_elem else None
|
| 375 |
+
|
| 376 |
+
if url:
|
| 377 |
+
results.append({
|
| 378 |
+
'url': url,
|
| 379 |
+
'thumbnail_url': thumb_url,
|
| 380 |
+
'source': 'bing',
|
| 381 |
+
'index': idx
|
| 382 |
+
})
|
| 383 |
+
|
| 384 |
+
except Exception as e:
|
| 385 |
+
logger.debug(f"Error: {e}")
|
| 386 |
+
continue
|
| 387 |
+
|
| 388 |
+
logger.success(f"Bing: {len(results)} resultados")
|
| 389 |
+
|
| 390 |
+
except Exception as e:
|
| 391 |
+
logger.error(f"Error en Bing: {e}")
|
| 392 |
+
|
| 393 |
+
finally:
|
| 394 |
+
await browser.close()
|
| 395 |
+
|
| 396 |
+
return results
|
| 397 |
+
|
| 398 |
+
async def search_all_engines(self, image_path: str) -> Dict[str, List[Dict]]:
    """
    Run every supported reverse-image search concurrently.

    Args:
        image_path: Path to the query image on disk.

    Returns:
        Dictionary keyed by engine name ('pimeyes', 'yandex', 'bing') whose
        values are that engine's result list. An engine whose search raised,
        was cancelled, or returned nothing contributes an empty list.
    """
    logger.info("Iniciando búsqueda multi-motor")

    # Coroutines are created here and awaited together below.
    searches = {
        'pimeyes': self.search_pimeyes_free(image_path),
        'yandex': self.search_yandex_reverse(image_path),
        'bing': self.search_bing_reverse(image_path),
    }

    outcomes = await asyncio.gather(*searches.values(), return_exceptions=True)

    all_results = {}
    for engine, outcome in zip(searches, outcomes):
        # gather() can hand back CancelledError, which derives from
        # BaseException rather than Exception, so test for the base class.
        if isinstance(outcome, BaseException):
            logger.warning(f"Búsqueda en {engine} falló: {outcome!r}")
            all_results[engine] = []
        else:
            # Guard against a search that returned None instead of a list.
            all_results[engine] = outcome or []

    total = sum(len(found) for found in all_results.values())
    logger.success(f"Total de resultados: {total}")

    return all_results
|
| 429 |
+
|
| 430 |
+
|
| 431 |
+
async def test_stealth():
    """
    Smoke-test the PimEyes and Yandex searches with a random-noise image.

    The image is written into a private temporary directory that is removed
    when the function finishes, instead of a fixed path under /tmp.
    """
    import tempfile
    from pathlib import Path

    import numpy as np
    from PIL import Image

    stealth = StealthSearch(headless=True)

    # Build the throw-away query image.
    test_img = np.random.randint(0, 255, (200, 200, 3), dtype=np.uint8)

    with tempfile.TemporaryDirectory() as tmp_dir:
        image_path = str(Path(tmp_dir) / 'test.jpg')
        Image.fromarray(test_img).save(image_path)

        # Try PimEyes
        results = await stealth.search_pimeyes_free(image_path)
        print(f"PimEyes: {len(results)} resultados")

        # Try Yandex
        results = await stealth.search_yandex_reverse(image_path)
        print(f"Yandex: {len(results)} resultados")


if __name__ == "__main__":
    asyncio.run(test_stealth())
|
src/test_basic.py
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests básicos para Aliah-Plus
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
import numpy as np
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Añadir src al path
|
| 11 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 12 |
+
|
| 13 |
+
from src.face_processor import FaceProcessor
|
| 14 |
+
from src.embedding_engine import EmbeddingEngine
|
| 15 |
+
from src.comparator import FaceComparator
|
| 16 |
+
from src.ocr_extractor import OCRExtractor
|
| 17 |
+
from src.cross_referencer import CrossReferencer
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class TestFaceProcessor:
    """Tests for the face processor."""

    def test_initialization(self):
        """A freshly built FaceProcessor exposes a detector."""
        assert FaceProcessor().detector is not None

    def test_align_face_no_face(self):
        """Aligning a noise image yields None or, at worst, an ndarray."""
        face_processor = FaceProcessor()
        # Random image that contains no face.
        noise = np.random.randint(0, 255, (200, 200, 3), dtype=np.uint8)
        aligned = face_processor.align_face(noise)
        # The detector may report a spurious face in noise, so an image
        # result is tolerated alongside None.
        if aligned is not None:
            assert isinstance(aligned, np.ndarray)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class TestEmbeddingEngine:
    """Tests for the embedding engine."""

    def test_initialization(self):
        """The requested model is kept; an unknown one falls back to ArcFace."""
        engine = EmbeddingEngine(model="ArcFace")
        assert engine.model_name == "ArcFace"

        # An unsupported model name should resolve to the ArcFace default.
        engine2 = EmbeddingEngine(model="InvalidModel")
        assert engine2.model_name == "ArcFace"

    def test_generate_embedding_shape(self):
        """A generated embedding is a non-empty numpy array."""
        engine = EmbeddingEngine(model="ArcFace")

        # Synthetic 160x160 "face".
        fake_face = np.random.randint(0, 255, (160, 160, 3), dtype=np.uint8)

        embedding = engine.generate_embedding(fake_face)

        # The engine may decline to embed synthetic noise. Report that as a
        # skip so the run does not show a pass that verified nothing.
        if embedding is None:
            pytest.skip("generate_embedding devolvió None para la imagen sintética")

        assert isinstance(embedding, np.ndarray)
        assert len(embedding) > 0
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class TestComparator:
    """Tests for the embedding comparator."""

    def test_initialization(self):
        """The threshold passed to the constructor is stored."""
        comparator = FaceComparator(threshold=0.75)
        assert comparator.threshold == 0.75

    def test_calculate_similarity_identical(self):
        """Two identical embeddings must have similarity ~1.0."""
        comparator = FaceComparator()

        # Seeded generator: a failure can be reproduced with the same data.
        emb = np.random.default_rng(0).random(512)
        similarity = comparator.calculate_similarity(emb, emb)

        assert abs(similarity - 1.0) < 0.01

    def test_verify_identity_levels(self):
        """The confidence label agrees with the similarity band."""
        comparator = FaceComparator()

        rng = np.random.default_rng(1)
        emb1 = rng.random(512)
        emb2 = rng.random(512)

        confidence, similarity = comparator.verify_identity(emb1, emb2)

        assert isinstance(confidence, str)
        assert 0.0 <= similarity <= 1.0

        # Check the label against the band the similarity falls into.
        if similarity > 0.85:
            assert "Seguro" in confidence
        elif similarity > 0.72:
            assert "Probable" in confidence
        else:
            assert "Descartado" in confidence
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class TestOCRExtractor:
    """Tests for the OCR extractor."""

    @pytest.fixture(scope="class")
    def ocr(self):
        """Build one CPU-only extractor shared by every test in this class."""
        # NOTE(review): presumably construction loads OCR models; sharing the
        # instance avoids repeating that per test — confirm the tests below
        # do not mutate extractor state.
        return OCRExtractor(gpu=False)

    def test_initialization(self, ocr):
        """The extractor exposes a reader after construction."""
        assert ocr.reader is not None

    def test_clean_text(self, ocr):
        """A common OCR confusion (0 for o) is corrected."""
        dirty = "example.c0m"
        assert ocr._clean_text(dirty) == "example.com"

    def test_is_valid_domain(self, ocr):
        """Well-formed domains are accepted, malformed ones rejected."""
        assert ocr._is_valid_domain("example.com")
        assert ocr._is_valid_domain("onlyfans.com")
        assert not ocr._is_valid_domain("invalid")
        assert not ocr._is_valid_domain("no spaces.com")

    def test_preprocess_image(self, ocr):
        """Preprocessing yields seven grayscale variants of the input."""
        test_img = np.random.randint(0, 255, (100, 200, 3), dtype=np.uint8)

        processed = ocr.preprocess_image(test_img)

        # Seven variants are expected.
        assert len(processed) == 7

        # Every variant must be a 2-D (grayscale) array.
        for img in processed:
            assert isinstance(img, np.ndarray)
            assert len(img.shape) == 2
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
class TestCrossReferencer:
    """Tests for the cross-referencer."""

    def test_initialization(self):
        """The similarity threshold is exposed as ``domain_threshold``."""
        referencer = CrossReferencer(domain_similarity_threshold=0.85)
        assert referencer.domain_threshold == 0.85

    def test_normalize_domain(self):
        """Different spellings of one domain normalise to the same value."""
        referencer = CrossReferencer()

        spellings = (
            "www.example.com",
            "EXAMPLE.COM",
            "example.com:8080",
            "m.example.com",
        )
        for raw in spellings:
            assert referencer.normalize_domain(raw) == "example.com"

    def test_extract_domain_from_url(self):
        """The bare domain is extracted from a full URL."""
        referencer = CrossReferencer()

        extracted = referencer.extract_domain_from_url(
            "https://www.example.com/path/to/page.html?query=1"
        )

        assert extracted == "example.com"

    def test_calculate_domain_similarity(self):
        """Similarity is 1.0 for equal domains and drops as they diverge."""
        referencer = CrossReferencer()
        score = referencer.calculate_domain_similarity

        # Identical domains
        assert score("example.com", "example.com") == 1.0

        # Near-identical domains
        close = score("example.com", "examples.com")
        assert 0.7 < close < 1.0

        # Unrelated domains
        far = score("example.com", "different.com")
        assert far < 0.7

    def test_deduplicate_results(self):
        """Entries with a repeated URL are collapsed into one."""
        referencer = CrossReferencer()

        urls = (
            'https://example.com/1.jpg',
            'https://example.com/1.jpg',  # duplicate
            'https://example.com/2.jpg',
        )
        unique = referencer.deduplicate_results([{'url': u} for u in urls])

        assert len(unique) == 2
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
class TestIntegration:
    """Integration tests."""

    def test_full_pipeline_mock(self):
        """Build the pipeline components and cross-reference mock data."""
        # 1. Face processing: the component and a noise image are created
        #    but not used further below.
        _processor = FaceProcessor()
        _fake_image = np.random.randint(0, 255, (300, 300, 3), dtype=np.uint8)

        # 2. OCR (constructed only)
        _ocr = OCRExtractor(gpu=False)

        # 3. Cross-referencer
        referencer = CrossReferencer()

        # Mock inputs: no PimEyes results, one Yandex hit, one OCR domain.
        pimeyes_hits = []
        yandex_hits = [
            {'url': 'https://example.com/photo.jpg', 'source': 'yandex'},
        ]
        domains_from_ocr = ['example.com']

        matches = referencer.match_pimeyes_with_search(
            pimeyes_hits, yandex_hits, domains_from_ocr
        )

        # NOTE(review): only the return type is verified here; the content
        # of the match is not asserted.
        assert isinstance(matches, list)
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
# Función para ejecutar tests
|
| 242 |
+
def run_tests():
    """
    Run every test in this file with verbose output.

    Returns:
        The exit code reported by ``pytest.main`` (0 when all tests pass).
    """
    return pytest.main([__file__, '-v'])


if __name__ == "__main__":
    # Propagate the pytest exit code so a failing run is visible to the
    # shell or CI instead of always exiting with status 0.
    sys.exit(run_tests())
|
src/usage_example.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Ejemplo de uso de Aliah-Plus
|
| 3 |
+
Demuestra cómo usar las funcionalidades principales del sistema
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Añadir el directorio padre al path
|
| 11 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 12 |
+
|
| 13 |
+
from src.face_processor import FaceProcessor
|
| 14 |
+
from src.embedding_engine import EmbeddingEngine
|
| 15 |
+
from src.scrapers.stealth_engine import StealthSearch
|
| 16 |
+
from src.ocr_extractor import OCRExtractor
|
| 17 |
+
from src.cross_referencer import CrossReferencer
|
| 18 |
+
from src.comparator import FaceComparator
|
| 19 |
+
import cv2
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
async def example_complete_search(image_path: str):
    """
    Walk through the complete search pipeline for one image, printing progress.

    Steps: build the components, load the image, align the face, generate an
    embedding, query all search engines, run OCR on the PimEyes screenshots,
    cross-reference everything and print the first five results.

    Args:
        image_path: Path of the image to search for.

    Returns:
        None. The function returns early, after printing a message, when the
        image cannot be loaded, no face is found or no embedding is produced.
    """
    print("=" * 60)
    print("ALIAH-PLUS - Búsqueda Completa")
    print("=" * 60)

    # 1. Initialise components
    print("\n[1/7] Inicializando componentes...")
    face_processor = FaceProcessor()
    embedding_engine = EmbeddingEngine(model="ArcFace")
    stealth_search = StealthSearch(headless=True)
    ocr_extractor = OCRExtractor(gpu=False)  # CPU for the example
    cross_referencer = CrossReferencer()
    # NOTE(review): comparator is constructed but not used later in this function.
    comparator = FaceComparator(threshold=0.75)

    # 2. Load and process the image
    print(f"\n[2/7] Cargando imagen: {image_path}")
    image = cv2.imread(image_path)
    if image is None:
        print("❌ Error: No se pudo cargar la imagen")
        return

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # 3. Detect and align the face
    print("\n[3/7] Detectando y alineando rostro...")
    aligned_face = face_processor.align_face(image_rgb)

    if aligned_face is None:
        print("❌ No se detectó ningún rostro en la imagen")
        return

    print("✓ Rostro detectado y alineado")

    # 4. Generate the embedding (below it is only used to print its length)
    print("\n[4/7] Generando embedding facial...")
    embedding = embedding_engine.generate_embedding(aligned_face)

    if embedding is None:
        print("❌ Error generando embedding")
        return

    print(f"✓ Embedding generado: {len(embedding)} dimensiones")

    # 5. Search several engines; the search receives the file path, not the embedding
    print("\n[5/7] Buscando en múltiples motores...")
    print(" → Yandex Images")
    print(" → Bing Images")
    print(" → PimEyes (stealth)")

    search_results = await stealth_search.search_all_engines(image_path)

    total_results = sum(len(v) for v in search_results.values())
    print(f"✓ Total de resultados encontrados: {total_results}")

    for engine, results in search_results.items():
        print(f" • {engine}: {len(results)} resultados")

    # 6. Extract domains with OCR (from the PimEyes thumbnails)
    print("\n[6/7] Extrayendo dominios con OCR...")
    ocr_domains = []

    if 'pimeyes' in search_results:
        for pim_result in search_results['pimeyes'][:5]:  # first 5 only, for the example
            if pim_result.get('screenshot'):
                # NOTE(review): cv2.imdecode expects an encoded image buffer as
                # a numpy array; confirm what type 'screenshot' holds, and that
                # a failed decode (None) is acceptable to the extractor.
                screenshot_np = cv2.imdecode(
                    pim_result['screenshot'],
                    cv2.IMREAD_COLOR
                )

                extracted = ocr_extractor.extract_domain_from_thumb(screenshot_np)
                ocr_domains.extend(extracted)

    print(f"✓ Dominios extraídos por OCR: {len(ocr_domains)}")

    if ocr_domains:
        print("\n Dominios encontrados:")
        for dom in ocr_domains[:5]:  # show only the first 5
            print(f" • {dom['domain']} (confianza: {dom['confidence']:.2%})")

    # 7. Cross-referencing
    print("\n[7/7] Correlacionando resultados (Cross-Referencing)...")

    cross_referenced = cross_referencer.find_cross_references(
        search_results,
        ocr_domains
    )

    # Count the entries flagged as cross-referenced.
    correlations = sum(1 for r in cross_referenced if r.get('cross_referenced', False))

    print(f"✓ Correlaciones encontradas: {correlations}")
    print(f"✓ Resultados totales procesados: {len(cross_referenced)}")

    # Show the top 5 results
    print("\n" + "=" * 60)
    print("TOP 5 RESULTADOS")
    print("=" * 60)

    for idx, result in enumerate(cross_referenced[:5], 1):
        print(f"\n[{idx}] {result.get('domain', 'N/A')}")
        print(f" URL: {result.get('url', 'N/A')}")
        print(f" Fuentes: {', '.join(result.get('sources', []))}")
        print(f" Verificado por OCR: {'Sí' if result.get('ocr_verified') else 'No'}")
        print(f" Confianza: {result.get('confidence', 0):.2%}")

        if result.get('cross_referenced'):
            print(f" ✓ Correlacionado entre múltiples fuentes")

    print("\n" + "=" * 60)
    print("✓ Búsqueda completada")
    print("=" * 60)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
async def example_ocr_only(thumbnail_path: str):
    """
    Run only the OCR domain extraction on a thumbnail and print what it finds.

    Args:
        thumbnail_path: Path of the thumbnail image to analyse.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("ALIAH-PLUS - Extracción OCR")
    print(rule)

    extractor = OCRExtractor(gpu=False)

    print(f"\nProcesando: {thumbnail_path}")

    thumbnail = cv2.imread(thumbnail_path)
    if thumbnail is None:
        print("❌ Error cargando imagen")
        return

    # Extract the domains visible in the thumbnail.
    found = extractor.extract_domain_from_thumb(thumbnail)

    print(f"\n✓ Dominios encontrados: {len(found)}")

    if not found:
        print("\n⚠ No se encontraron dominios en la imagen")
        return

    print("\nResultados:")
    position = 0
    for entry in found:
        position += 1
        print(f"\n[{position}] {entry['domain']}")
        print(f" Confianza: {entry['confidence']:.2%}")
        print(f" Texto original: {entry['original_text']}")
        print(f" Método: #{entry['method']}")
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
async def example_compare_faces(image1_path: str, image2_path: str):
    """
    Compare the faces found in two images and print the verdict.

    Args:
        image1_path: Path of the first image.
        image2_path: Path of the second image.

    Returns:
        None. The function returns early, after printing the reason, when
        either image cannot be loaded, contains no detectable face, or
        yields no embedding.
    """
    print("\n" + "=" * 60)
    print("ALIAH-PLUS - Comparación de Rostros")
    print("=" * 60)

    face_processor = FaceProcessor()
    embedding_engine = EmbeddingEngine(model="ArcFace")
    comparator = FaceComparator()

    # Process image 1
    emb1 = _embed_face_for_example(image1_path, 1, face_processor, embedding_engine)
    if emb1 is None:
        return

    # Process image 2
    emb2 = _embed_face_for_example(image2_path, 2, face_processor, embedding_engine)
    if emb2 is None:
        return

    # Compare
    print("\nComparando...")
    confidence_level, similarity = comparator.verify_identity(emb1, emb2)

    print("\n" + "=" * 60)
    print("RESULTADO")
    print("=" * 60)
    print(f"Similitud: {similarity:.2%}")
    print(f"Distancia: {1-similarity:.3f}")
    print(f"Veredicto: {confidence_level}")

    if similarity > 0.85:
        print("\n✓ Las personas son la misma (Match Seguro)")
    elif similarity > 0.72:
        print("\n⚠ Posible coincidencia (requiere revisión)")
    else:
        print("\n❌ Las personas son diferentes")


def _embed_face_for_example(image_path, label, face_processor, embedding_engine):
    """Load one image, align its face and return the embedding, or None after printing why."""
    print(f"\nImagen {label}: {image_path}")

    image = cv2.imread(image_path)
    # cv2.imread signals an unreadable path by returning None; without this
    # check the colour conversion below fails with an opaque OpenCV error.
    if image is None:
        print(f"❌ Error: No se pudo cargar la imagen {label}")
        return None

    face = face_processor.align_face(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    if face is None:
        print(f"❌ No se detectó rostro en imagen {label}")
        return None

    embedding = embedding_engine.generate_embedding(face)
    # The complete-search example treats a None embedding as a failure; do
    # the same here rather than passing None on to the comparator.
    if embedding is None:
        print(f"❌ Error generando embedding para imagen {label}")
        return None

    print(f"✓ Rostro {label} procesado")
    return embedding
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
async def main():
    """
    Show the example menu, read the user's choice and run that example.

    Reads the option and the required file paths from standard input.
    """
    # The box is generated so that its borders and centred titles always
    # share the same width.
    inner_width = 62
    menu = "\n".join([
        "",
        "╔" + "═" * inner_width + "╗",
        "║" + "ALIAH-PLUS EXAMPLES".center(inner_width) + "║",
        "║" + "Sistema Avanzado de Re-Identificación".center(inner_width) + "║",
        "╚" + "═" * inner_width + "╝",
        "",
        "Selecciona un ejemplo:",
        "",
        "1. Búsqueda completa (Face detection + Search + OCR + Cross-ref)",
        "2. Solo extracción OCR de miniatura",
        "3. Comparación directa entre dos rostros",
        "4. Salir",
        "",
    ])
    print(menu)

    choice = input("Opción (1-4): ").strip()

    if choice == "1":
        image_path = input("\nRuta de la imagen: ").strip()
        await example_complete_search(image_path)

    elif choice == "2":
        thumbnail_path = input("\nRuta de la miniatura: ").strip()
        await example_ocr_only(thumbnail_path)

    elif choice == "3":
        image1 = input("\nRuta imagen 1: ").strip()
        image2 = input("Ruta imagen 2: ").strip()
        await example_compare_faces(image1, image2)

    elif choice == "4":
        print("\nAdiós!")
        return

    else:
        print("\n❌ Opción inválida")


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n\n👋 Interrumpido por el usuario")
    except Exception as e:
        # Top-level boundary of the example script: report and finish.
        print(f"\n❌ Error: {e}")
|
src/vector_db.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Vector Database - Almacenamiento y recuperación de embeddings
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import List, Dict, Optional
|
| 6 |
+
import json
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from loguru import logger
|
| 9 |
+
try:
|
| 10 |
+
from qdrant_client import QdrantClient
|
| 11 |
+
from qdrant_client.models import Distance, VectorParams, PointStruct
|
| 12 |
+
QDRANT_AVAILABLE = True
|
| 13 |
+
except ImportError:
|
| 14 |
+
QDRANT_AVAILABLE = False
|
| 15 |
+
logger.warning("Qdrant no disponible, usando almacenamiento en memoria")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class VectorDatabase:
    """
    Store and look up search embeddings together with their results.

    Entries are kept in Qdrant when the client library was imported and the
    collection could be initialised. Otherwise, or when an individual Qdrant
    write fails, they are kept in the in-process ``memory_store`` dictionary.
    """

    def __init__(self, host="localhost", port=6333, collection_name="aliah_faces",
                 vector_size=512):
        """
        Connect to Qdrant, falling back to in-memory storage on any failure.

        Args:
            host: Qdrant server host.
            port: Qdrant server port.
            collection_name: Collection used for every read and write.
            vector_size: Vector dimensionality used if the collection has to
                be created (defaults to the previously hard-coded 512).
        """
        self.collection_name = collection_name
        self.vector_size = vector_size
        self.memory_store = {}  # query_id -> stored entry (fallback storage)
        self.client = None
        self.use_qdrant = False

        if not QDRANT_AVAILABLE:
            logger.info("Usando almacenamiento en memoria")
            return

        try:
            self.client = QdrantClient(host=host, port=port)
            # First real round-trip to the server; it raises when the server
            # cannot be used, which selects the memory fallback below.
            self._init_collection()
        except Exception as e:
            logger.warning(f"No se pudo conectar a Qdrant, usando memoria: {e}")
            return

        self.use_qdrant = True
        logger.info(f"Conectado a Qdrant: {host}:{port}")

    def _init_collection(self):
        """
        Create the Qdrant collection when it does not exist yet.

        Raises:
            Exception: Whatever the client raised, re-raised after logging so
                the caller can switch to in-memory storage.
        """
        try:
            existing = {c.name for c in self.client.get_collections().collections}
            if self.collection_name not in existing:
                self.client.create_collection(
                    collection_name=self.collection_name,
                    vectors_config=VectorParams(
                        size=self.vector_size,
                        distance=Distance.COSINE,
                    ),
                )
                logger.info(f"Colección '{self.collection_name}' creada")
        except Exception as e:
            logger.error(f"Error inicializando colección: {e}")
            raise

    @staticmethod
    def _point_id(query_id: str) -> str:
        """Return a UUID string derived deterministically from *query_id*."""
        import uuid

        # The built-in hash() of a str is salted per process, so it cannot
        # identify the same query across runs; uuid5 always maps the same
        # text to the same id, letting upsert overwrite the earlier point.
        return str(uuid.uuid5(uuid.NAMESPACE_URL, query_id))

    @staticmethod
    def _as_list(embedding):
        """Convert array-like embeddings (anything with ``tolist``) to a list."""
        return embedding.tolist() if hasattr(embedding, 'tolist') else embedding

    def store_result(self, query_id: str, embedding: List[float], results: List[Dict]):
        """
        Store the embedding and results of one search.

        Args:
            query_id: Unique identifier of the search.
            embedding: Embedding vector (a list or an object with ``tolist``).
            results: List of verified results.
        """
        data = {
            'query_id': query_id,
            'embedding': self._as_list(embedding),
            'results': results,
            'timestamp': datetime.now().isoformat(),
            'num_results': len(results)
        }

        if not self.use_qdrant:
            self.memory_store[query_id] = data
            logger.debug(f"Resultado almacenado en memoria: {query_id}")
            return

        try:
            point = PointStruct(
                id=self._point_id(query_id),
                vector=data['embedding'],
                payload={
                    'query_id': query_id,
                    # Results are stored as a JSON string in the payload.
                    'results': json.dumps(results),
                    'timestamp': data['timestamp'],
                    'num_results': len(results)
                }
            )

            self.client.upsert(
                collection_name=self.collection_name,
                points=[point]
            )

            logger.info(f"Resultado almacenado en Qdrant: {query_id}")
        except Exception as e:
            # Keep the entry in memory so a failed write does not lose it.
            logger.error(f"Error almacenando en Qdrant: {e}")
            self.memory_store[query_id] = data

    def get_result(self, query_id: str) -> Optional[Dict]:
        """
        Retrieve the stored results of a previous search.

        Args:
            query_id: Identifier of the search.

        Returns:
            Dictionary with the stored data, or None when nothing is found.
            Entries read from Qdrant carry 'query_id', 'results', 'timestamp'
            and 'num_results'; entries read from memory also carry
            'embedding'.
        """
        if self.use_qdrant:
            try:
                # Look the point up by its payload field.
                points, _next_offset = self.client.scroll(
                    collection_name=self.collection_name,
                    scroll_filter={
                        "must": [
                            {
                                "key": "query_id",
                                "match": {"value": query_id}
                            }
                        ]
                    },
                    limit=1
                )

                if points:
                    payload = points[0].payload
                    return {
                        'query_id': payload['query_id'],
                        'results': json.loads(payload['results']),
                        'timestamp': payload['timestamp'],
                        'num_results': payload['num_results']
                    }
            except Exception as e:
                logger.error(f"Error recuperando de Qdrant: {e}")

        # Not found in Qdrant (or Qdrant not in use): look in memory.
        return self.memory_store.get(query_id)

    def search_similar(self, embedding: List[float], limit: int = 10) -> List[Dict]:
        """
        Find previously stored searches whose embedding is similar.

        Args:
            embedding: Query embedding (a list or an object with ``tolist``).
            limit: Maximum number of results.

        Returns:
            List of dictionaries with 'query_id', 'similarity', 'timestamp'
            and 'num_results'. When Qdrant is not in use, or its search
            fails, the in-memory entries are ranked by cosine similarity
            instead.
        """
        query = self._as_list(embedding)

        if self.use_qdrant:
            try:
                hits = self.client.search(
                    collection_name=self.collection_name,
                    query_vector=query,
                    limit=limit
                )

                return [
                    {
                        'query_id': hit.payload['query_id'],
                        'similarity': hit.score,
                        'timestamp': hit.payload['timestamp'],
                        'num_results': hit.payload['num_results']
                    }
                    for hit in hits
                ]
            except Exception as e:
                logger.error(f"Error buscando similares: {e}")

        return self._search_memory(query, limit)

    def _search_memory(self, query, limit):
        """Rank the in-memory entries by cosine similarity to *query*."""
        import math

        query_norm = math.sqrt(sum(x * x for x in query))
        if not query_norm:
            return []

        ranked = []
        for entry in self.memory_store.values():
            vector = entry['embedding']
            # Vectors of another dimensionality cannot be compared.
            if len(vector) != len(query):
                continue
            vector_norm = math.sqrt(sum(x * x for x in vector))
            if not vector_norm:
                continue
            dot = sum(a * b for a, b in zip(query, vector))
            ranked.append({
                'query_id': entry['query_id'],
                'similarity': dot / (query_norm * vector_norm),
                'timestamp': entry['timestamp'],
                'num_results': entry['num_results']
            })

        ranked.sort(key=lambda item: item['similarity'], reverse=True)
        return ranked[:max(limit, 0)]
|