Update app.py
app.py
CHANGED
@@ -32,27 +32,7 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
-def setup_spacy_model():
-    """Download the spaCy model if it is not installed"""
-    try:
-        spacy.load("es_core_news_lg")
-        logger.info("Modelo spaCy 'es_core_news_lg' cargado correctamente")
-    except OSError:
-        logger.info("Descargando modelo spaCy 'es_core_news_lg'...")
-        try:
-            subprocess.run(
-                [sys.executable, "-m", "spacy", "download", "es_core_news_lg"],
-                check=True,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE
-            )
-            logger.info("Modelo descargado exitosamente")
-        except subprocess.CalledProcessError as e:
-            logger.error(f"Error al descargar modelo: {e.stderr.decode()}")
-            raise RuntimeError("No se pudo descargar el modelo spaCy") from e

-# Configure the model before starting
-setup_spacy_model()
 class SEOSpaceAnalyzer:
     def __init__(self):
         self.session = self._configure_session()
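Dropping the import-time download is sensible for a Space: network access at import slows startup and can crash the build. If the self-healing behavior is still wanted, a lazy, cached loader is one option. A minimal sketch, not part of this commit; the helper name `get_nlp` and the `lru_cache` choice are assumptions:

```python
import functools

import spacy


@functools.lru_cache(maxsize=1)
def get_nlp():
    """Load es_core_news_lg lazily, downloading it on first use if missing."""
    try:
        return spacy.load("es_core_news_lg")
    except OSError:
        from spacy.cli import download
        download("es_core_news_lg")  # same effect as `python -m spacy download ...`
        return spacy.load("es_core_news_lg")
```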
@@ -60,20 +40,21 @@
         self.base_dir = Path("content_storage")
         self.base_dir.mkdir(parents=True, exist_ok=True)
         self.current_analysis = {}
+
     def _load_models(self) -> Dict:
         """Load models optimized for Hugging Face"""
         try:
             device = 0 if torch.cuda.is_available() else -1
             return {
                 'summarizer': pipeline("summarization", model="facebook/bart-large-cnn", device=device),
-                'ner': pipeline("ner", model="dslim/bert-base-NER", device=device),
+                'ner': pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple", device=device),
                 'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
-                'spacy': spacy.load("es_core_news_lg")
+                'spacy': spacy.load("es_core_news_lg")
             }
         except Exception as e:
             logger.error(f"Error loading models: {e}")
             raise
-
+
     def _configure_session(self) -> requests.Session:
         """Configure an HTTP session with retries"""
         session = requests.Session()
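The added `aggregation_strategy="simple"` is what makes the NER output usable downstream: without it, the pipeline returns one item per BERT subword piece rather than whole-entity spans. A small usage sketch, assuming the standard transformers API:

```python
from transformers import pipeline

# With aggregation_strategy="simple", subword pieces (e.g. "Hu", "##gging")
# are merged into single entity spans with a combined score.
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

for ent in ner("Angela Merkel visitó la oficina de Hugging Face en París."):
    # aggregated results expose entity_group, score, word, start and end
    print(ent["entity_group"], ent["word"], round(float(ent["score"]), 3))
```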
@@ -92,39 +73,15 @@
         })
         return session

-    def _load_models(self) -> Dict:
-        """Load models optimized for Hugging Face"""
-        try:
-            device = 0 if torch.cuda.is_available() else -1
-            return {
-                'summarizer': pipeline(
-                    "summarization",
-                    model="facebook/bart-large-cnn",
-                    device=device
-                ),
-                'ner': pipeline(
-                    "ner",
-                    model="dslim/bert-base-NER",
-                    aggregation_strategy="simple",
-                    device=device
-                ),
-                'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
-                'spacy': spacy.load("es_core_news_lg")
-            }
-        except Exception as e:
-            logger.error(f"Error loading models: {e}")
-            raise
-
-    def analyze_sitemap(self, sitemap_url: str) -> Dict:
-        """Analyze a complete sitemap"""
+    def analyze_sitemap(self, sitemap_url: str) -> Tuple[Dict, List[str], Dict, Dict]:
+        """Analyze a complete sitemap and return its components separately"""
         try:
             urls = self._parse_sitemap(sitemap_url)
             if not urls:
-                return {"error": "No se pudieron extraer URLs del sitemap"}
+                return {"error": "No se pudieron extraer URLs del sitemap"}, [], {}, {}

             results = []
             with ThreadPoolExecutor(max_workers=4) as executor:
-                # Fixed: changed ] to } in the dict comprehension
                 futures = {executor.submit(self._process_url, url): url for url in urls[:20]}  # limit for the demo
                 for future in as_completed(futures):
                     try:
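The dict comprehension that maps each future back to its URL (the `]` → `}` fix mentioned in the removed comment) is the standard pattern for attributing failures to their inputs. A self-contained sketch, with a hypothetical `fetch` standing in for `_process_url`:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed


def fetch(url: str) -> dict:
    # hypothetical stand-in for SEOSpaceAnalyzer._process_url
    return {"url": url, "status": "ok"}


urls = ["https://example.com/a", "https://example.com/b"]
results = []
with ThreadPoolExecutor(max_workers=4) as executor:
    # braces build a dict: future -> originating URL
    futures = {executor.submit(fetch, url): url for url in urls}
    for future in as_completed(futures):
        try:
            results.append(future.result())
        except Exception as exc:
            # the mapping recovers which URL a failed future belonged to
            print(f"{futures[future]} failed: {exc}")
```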
@@ -142,11 +99,16 @@
                 'timestamp': datetime.now().isoformat()
             }

-            return
+            return (
+                self.current_analysis['stats'],
+                self.current_analysis['recommendations'],
+                self.current_analysis['content_analysis'],
+                self.current_analysis['links']
+            )

         except Exception as e:
             logger.error(f"Error en análisis: {str(e)}")
-            return {"error": str(e)}
+            return {"error": str(e)}, [], {}, {}

     def _process_url(self, url: str) -> Dict:
         """Process a single URL"""
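Returning a 4-tuple (also on the error path) matches how Gradio dispatches results: a handler that returns N values fills a list of N output components positionally. A minimal sketch of that wiring; the component names and types here are assumptions, not the app's actual layout:

```python
import gradio as gr


def analyze(sitemap_url: str):
    stats, recs, content, links = {"urls": 0}, ["✅ ok"], {}, {}
    return stats, recs, content, links  # one value per output component


with gr.Blocks() as demo:
    url_in = gr.Textbox(label="Sitemap URL")
    analyze_btn = gr.Button("Analizar")
    stats_output, recommendations_output = gr.JSON(), gr.JSON()
    content_output, links_output = gr.JSON(), gr.JSON()
    analyze_btn.click(
        fn=analyze,
        inputs=url_in,
        outputs=[stats_output, recommendations_output, content_output, links_output],
    )
```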
@@ -332,8 +294,9 @@

         # Analysis of main topics
         try:
+            stop_words = list(self.models['spacy'].Defaults.stop_words)
             vectorizer = TfidfVectorizer(
-                stop_words=
+                stop_words=stop_words,
                 max_features=50,
                 ngram_range=(1, 2)
             )
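This completes the dangling `stop_words=` on the old side: scikit-learn's `TfidfVectorizer` expects a list of stop words, while spaCy exposes them as a set, hence the `list(...)` conversion. A runnable sketch of the same hand-off:

```python
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load("es_core_news_lg")
stop_words = list(nlp.Defaults.stop_words)  # spaCy stores stop words as a set

vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50, ngram_range=(1, 2))
X = vectorizer.fit_transform(["el análisis de contenido", "contenido optimizado para SEO"])
print(vectorizer.get_feature_names_out())
```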
@@ -404,9 +367,9 @@
         all_links = [link for r in results for link in r.get('links', [])]
         if all_links:
             df_links = pd.DataFrame(all_links)
-
-            if
-                recs.append(f"🔗
+            internal_links = df_links[df_links['type'] == 'internal']
+            if len(internal_links) > 100:  # arbitrary threshold
+                recs.append(f"🔗 Optimizar estructura de enlaces internos ({len(internal_links)} enlaces)")

         return recs if recs else ["✅ No se detectaron problemas críticos de SEO"]
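The recommendation now keys off a `type` column in the links DataFrame. A minimal sketch of that filter, assuming each link dict carries `type` set to `'internal'` or `'external'` as the diff implies:

```python
import pandas as pd

all_links = [
    {"url": "/blog/post-1", "type": "internal"},
    {"url": "/blog/post-2", "type": "internal"},
    {"url": "https://otro-sitio.example/pagina", "type": "external"},
]
df_links = pd.DataFrame(all_links)
internal_links = df_links[df_links["type"] == "internal"]
print(f"{len(internal_links)} internal out of {len(df_links)} total links")
```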
@@ -463,7 +426,7 @@ def create_interface():
         ### Documentos Encontrados
         Los documentos descargados se guardan en la carpeta `content_storage/`
         """)
-
+        # FileExplorer replaced with an informative Markdown block

         # Event handlers
         analyze_btn.click(
@@ -478,20 +441,27 @@
         outputs=[stats_output, recommendations_output, content_output, links_output]
     )

+    # To download the report, it must be generated first
+    def generate_report():
+        if analyzer.current_analysis:
+            report_path = "content_storage/seo_report.json"
+            with open(report_path, 'w') as f:
+                json.dump(analyzer.current_analysis, f, indent=2)
+            return report_path
+        return None
+
     download_btn.click(
-        fn=
-        outputs=gr.File()
+        fn=generate_report,
+        outputs=gr.File(label="Descargar Reporte")
     )

     return interface

 if __name__ == "__main__":
-    # Check the model before starting
     try:
         spacy.load("es_core_news_lg")
     except OSError:
-        logger.error("Modelo spaCy 'es_core_news_lg' no encontrado. Ejecute:")
-        logger.error("python -m spacy download es_core_news_lg")
+        logger.error("Modelo spaCy 'es_core_news_lg' no encontrado. Ejecute: python -m spacy download es_core_news_lg")
         exit(1)

     app = create_interface()
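The download handler now follows Gradio's file-download pattern: the callback writes the report to disk and returns its path, which `gr.File` renders as a downloadable file (returning `None` leaves it empty). A self-contained sketch of the same pattern; the `makedirs` guard and `ensure_ascii=False` are suggested hardening, not part of this commit:

```python
import json
import os

import gradio as gr


def generate_report():
    # stand-in data; the real handler serializes analyzer.current_analysis
    data = {"stats": {"urls_analyzed": 20}}
    os.makedirs("content_storage", exist_ok=True)  # guard: directory may not exist yet
    report_path = "content_storage/seo_report.json"
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)  # keeps accented text readable
    return report_path  # gr.File turns the returned path into a download


with gr.Blocks() as demo:
    download_btn = gr.Button("Generar reporte")
    report_file = gr.File(label="Descargar Reporte")
    download_btn.click(fn=generate_report, outputs=report_file)
```

Declaring the output component in the layout, as above, keeps the download target visible in the UI rather than instantiating it inline in the `click(...)` call.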