Spaces:

Merlintxu
/

SEO

Runtime error

App Files Files Community

Merlintxu committed on Apr 12, 2025

Commit

bb43f76

verified ·

1 Parent(s): 6564f20

Update app.py

Browse files

Files changed (1) hide show

app.py +218 -161

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ import PyPDF2
 import numpy as np
 import pandas as pd
 from io import BytesIO
-from typing import List, Dict, Optional, Tuple
 from urllib.parse import urlparse, urljoin
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
@@ -24,7 +24,6 @@ import torch
 import subprocess
 import sys
 import spacy
-import logging
 import gradio as gr
 import matplotlib.pyplot as plt
@@ -35,30 +34,53 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 class SEOSpaceAnalyzer:
-    def __init__(self):
         self.session = self._configure_session()
         self.models = self._load_models()
         self.base_dir = Path("content_storage")
         self.base_dir.mkdir(parents=True, exist_ok=True)
-        self.current_analysis = {}
-    def _load_models(self) -> Dict:
-        """Carga modelos optimizados para Hugging Face"""
         try:
             device = 0 if torch.cuda.is_available() else -1
-            return {
                 'summarizer': pipeline("summarization", model="facebook/bart-large-cnn", device=device),
                 'ner': pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple", device=device),
                 'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
                 'spacy': spacy.load("es_core_news_lg")
             }
         except Exception as e:
-            logger.error(f"Error loading models: {e}")
             raise
     def _configure_session(self) -> requests.Session:
-        """Configura sesión HTTP con reintentos"""
         session = requests.Session()
         retry = Retry(
             total=3,
@@ -74,25 +96,33 @@ class SEOSpaceAnalyzer:
             'Accept-Language': 'es-ES,es;q=0.9'
         })
         return session
     def analyze_sitemap(self, sitemap_url: str) -> Tuple[Dict, List[str], Dict, Dict]:
-        """Analiza un sitemap completo y devuelve componentes por separado"""
         try:
             urls = self._parse_sitemap(sitemap_url)
             if not urls:
                 return {"error": "No se pudieron extraer URLs del sitemap"}, [], {}, {}
-            results = []
-            with ThreadPoolExecutor(max_workers=4) as executor:
-                futures = {executor.submit(self._process_url, url): url for url in urls[:20]}  # Limitar para demo
                 for future in as_completed(futures):
                     try:
-                        results.append(future.result())
                     except Exception as e:
-                        url = futures[future]
-                        logger.error(f"Error processing {url}: {e}")
                         results.append({'url': url, 'status': 'error', 'error': str(e)})
             self.current_analysis = {
                 'stats': self._calculate_stats(results),
                 'content_analysis': self._analyze_content(results),
@@ -100,43 +130,42 @@ class SEOSpaceAnalyzer:
                 'recommendations': self._generate_seo_recommendations(results),
                 'timestamp': datetime.now().isoformat()
             }
-            return (
-                self.current_analysis['stats'],
-                self.current_analysis['recommendations'],
-                self.current_analysis['content_analysis'],
-                self.current_analysis['links']
-            )
         except Exception as e:
-            logger.error(f"Error en análisis: {str(e)}")
             return {"error": str(e)}, [], {}, {}
     def _process_url(self, url: str) -> Dict:
-        """Procesa una URL individual"""
         try:
             response = self.session.get(url, timeout=15)
             response.raise_for_status()
             content_type = response.headers.get('Content-Type', '')
-            result = {'url': url, 'status': 'success'}
             if 'application/pdf' in content_type:
                 result.update(self._process_pdf(response.content))
             elif 'text/html' in content_type:
                 result.update(self._process_html(response.text, url))
             self._save_content(url, response.content)
             return result
         except requests.exceptions.RequestException as e:
             logger.warning(f"Error procesando {url}: {str(e)}")
             return {'url': url, 'status': 'error', 'error': str(e)}
     def _process_html(self, html: str, base_url: str) -> Dict:
-        """Procesa contenido HTML"""
         soup = BeautifulSoup(html, 'html.parser')
         clean_text = self._clean_text(soup.get_text())
         return {
             'type': 'html',
             'content': clean_text,
@@ -144,16 +173,16 @@ class SEOSpaceAnalyzer:
             'links': self._extract_links(soup, base_url),
             'metadata': self._extract_metadata(soup)
         }
     def _process_pdf(self, content: bytes) -> Dict:
-        """Procesa documentos PDF"""
         try:
             text = ""
             with BytesIO(content) as pdf_file:
                 reader = PyPDF2.PdfReader(pdf_file)
                 for page in reader.pages:
-                    text += page.extract_text() or ""  # Handle None return
             clean_text = self._clean_text(text)
             return {
                 'type': 'pdf',
@@ -162,30 +191,28 @@ class SEOSpaceAnalyzer:
                 'page_count': len(reader.pages)
             }
         except PyPDF2.PdfReadError as e:
-            logger.error(f"Error reading PDF: {e}")
             return {'type': 'pdf', 'error': str(e)}
     def _clean_text(self, text: str) -> str:
-        """Limpieza avanzada de texto"""
         if not text:
             return ""
         text = re.sub(r'\s+', ' ', text)
         return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
     def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
-        """Extrae y clasifica enlaces"""
-        links = []
         base_netloc = urlparse(base_url).netloc
         for tag in soup.find_all('a', href=True):
             try:
                 href = tag['href'].strip()
                 if not href or href.startswith('javascript:'):
                     continue
                 full_url = urljoin(base_url, href)
                 parsed = urlparse(full_url)
                 links.append({
                     'url': full_url,
                     'type': 'internal' if parsed.netloc == base_netloc else 'external',
@@ -193,55 +220,54 @@ class SEOSpaceAnalyzer:
                     'file_type': self._get_file_type(parsed.path)
                 })
             except Exception as e:
-                logger.warning(f"Error processing link {tag.get('href')}: {e}")
                 continue
         return links
     def _get_file_type(self, path: str) -> str:
-        """Determina tipo de archivo por extensión"""
         ext = Path(path).suffix.lower()
         return ext[1:] if ext else 'html'
     def _extract_metadata(self, soup: BeautifulSoup) -> Dict:
-        """Extrae metadatos SEO"""
-        metadata = {
             'title': '',
             'description': '',
             'keywords': [],
             'og': {}
         }
         if soup.title and soup.title.string:
             metadata['title'] = soup.title.string.strip()[:200]
         for meta in soup.find_all('meta'):
             name = meta.get('name', '').lower()
             property_ = meta.get('property', '').lower()
             content = meta.get('content', '')
             if name == 'description':
                 metadata['description'] = content[:300]
             elif name == 'keywords':
                 metadata['keywords'] = [kw.strip() for kw in content.split(',') if kw.strip()]
             elif property_.startswith('og:'):
                 metadata['og'][property_[3:]] = content
         return metadata
     def _parse_sitemap(self, sitemap_url: str) -> List[str]:
-        """Parsea sitemap XML básico"""
         try:
             response = self.session.get(sitemap_url, timeout=10)
             response.raise_for_status()
             if 'xml' not in response.headers.get('Content-Type', ''):
                 logger.warning(f"El sitemap no parece ser XML: {sitemap_url}")
                 return []
-            urls = []
-            soup = BeautifulSoup(response.text, 'lxml-xml')  # Usar parser XML específico
-            # Handle sitemap index
             if soup.find('sitemapindex'):
                 for sitemap in soup.find_all('loc'):
                     url = sitemap.text.strip()
@@ -249,80 +275,92 @@ class SEOSpaceAnalyzer:
                         urls.extend(self._parse_sitemap(url))
             else:
                 urls = [loc.text.strip() for loc in soup.find_all('loc')]
-            return list(set(url for url in urls if url.startswith('http')))
         except Exception as e:
-            logger.error(f"Error parsing sitemap {sitemap_url}: {e}")
             return []
     def _save_content(self, url: str, content: bytes) -> None:
-        """Almacena el contenido descargado"""
         try:
             parsed = urlparse(url)
             domain_dir = self.base_dir / parsed.netloc
             path = parsed.path.lstrip('/')
             if not path or path.endswith('/'):
-                path = path + 'index.html'
-            save_path = domain_dir / path
             save_path.parent.mkdir(parents=True, exist_ok=True)
             with open(save_path, 'wb') as f:
                 f.write(content)
         except Exception as e:
-            logger.error(f"Error saving content for {url}: {e}")
     def _calculate_stats(self, results: List[Dict]) -> Dict:
-        """Calcula estadísticas básicas"""
         successful = [r for r in results if r.get('status') == 'success']
         return {
             'total_urls': len(results),
             'successful': len(successful),
             'failed': len(results) - len(successful),
-            'content_types': pd.Series([r.get('type', 'unknown') for r in successful]).value_counts().to_dict(),
-            'avg_word_count': round(np.mean([r.get('word_count', 0) for r in successful]), 1),
             'failed_urls': [r['url'] for r in results if r.get('status') != 'success']
         }
     def _analyze_content(self, results: List[Dict]) -> Dict:
-        """Analiza contenido con NLP"""
         successful = [r for r in results if r.get('status') == 'success' and r.get('content')]
-        texts = [r['content'] for r in successful if len(r['content'].split()) > 10]  # Filtrar contenido muy corto
         if not texts:
             return {'top_keywords': [], 'content_samples': []}
-        # Análisis de temas principales
         try:
             stop_words = list(self.models['spacy'].Defaults.stop_words)
-            vectorizer = TfidfVectorizer(
-                stop_words=stop_words,
-                max_features=50,
-                ngram_range=(1, 2)
-            )
             tfidf = vectorizer.fit_transform(texts)
             feature_names = vectorizer.get_feature_names_out()
-            sorted_indices = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[-10:]  # Top 10 índices
-            top_keywords = feature_names[sorted_indices][::-1].tolist()  # Orden descendente
         except Exception as e:
-            logger.error(f"Error en análisis TF-IDF: {str(e)}")
             top_keywords = []
         return {
             'top_keywords': top_keywords,
-            'content_samples': [{'url': r['url'], 'sample': r['content'][:500] + '...'}
-                              for r in successful[:3]]  # Muestras de contenido
         }
     def _analyze_links(self, results: List[Dict]) -> Dict:
-        """Analiza estructura de enlaces"""
         all_links = []
         for result in results:
             if result.get('links'):
                 all_links.extend(result['links'])
         if not all_links:
             return {
                 'internal_links': {},
@@ -330,9 +368,7 @@ class SEOSpaceAnalyzer:
                 'common_anchors': {},
                 'file_types': {}
             }
         df = pd.DataFrame(all_links)
         return {
             'internal_links': df[df['type'] == 'internal']['url'].value_counts().head(20).to_dict(),
             'external_domains': df[df['type'] == 'external']['url']
@@ -341,43 +377,59 @@ class SEOSpaceAnalyzer:
             'common_anchors': df['anchor'].value_counts().head(10).to_dict(),
             'file_types': df['file_type'].value_counts().to_dict()
         }
     def _generate_seo_recommendations(self, results: List[Dict]) -> List[str]:
-        """Genera recomendaciones SEO"""
         successful = [r for r in results if r.get('status') == 'success']
         if not successful:
             return ["No se pudo analizar ningún contenido exitosamente"]
         recs = []
-        # Revisar metadatos
         missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
         if missing_titles:
             recs.append(f"📌 Añadir títulos a {missing_titles} páginas")
-        short_descriptions = sum(1 for r in successful
-                               if not r.get('metadata', {}).get('description'))
         if short_descriptions:
             recs.append(f"📌 Añadir meta descripciones a {short_descriptions} páginas")
-        # Revisar contenido corto
         short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
         if short_content:
             recs.append(f"📝 Ampliar contenido en {short_content} páginas (menos de 300 palabras)")
-        # Analizar enlaces
         all_links = [link for r in results for link in r.get('links', [])]
         if all_links:
             df_links = pd.DataFrame(all_links)
             internal_links = df_links[df_links['type'] == 'internal']
-            if len(internal_links) > 100:  # Umbral arbitrario
                 recs.append(f"🔗 Optimizar estructura de enlaces internos ({len(internal_links)} enlaces)")
         return recs if recs else ["✅ No se detectaron problemas críticos de SEO"]
-def create_interface():
     analyzer = SEOSpaceAnalyzer()
     with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
         gr.Markdown("""
         # 🕵️ SEO Analyzer Pro
@@ -385,84 +437,88 @@ def create_interface():
         Sube la URL de un sitemap.xml para analizar todo el sitio web.
         """)
         with gr.Row():
             with gr.Column():
-                sitemap_input = gr.Textbox(
-                    label="URL del Sitemap",
-                    placeholder="https://ejemplo.com/sitemap.xml",
-                    interactive=True
-                )
                 analyze_btn = gr.Button("Analizar Sitio", variant="primary")
                 with gr.Row():
                     clear_btn = gr.Button("Limpiar")
                     download_btn = gr.Button("Descargar Reporte", variant="secondary")
             with gr.Column():
                 status_output = gr.Textbox(label="Estado del Análisis", interactive=False)
                 progress_bar = gr.Progress()
         with gr.Tabs():
             with gr.Tab("📊 Resumen"):
                 stats_output = gr.JSON(label="Estadísticas Generales")
                 recommendations_output = gr.JSON(label="Recomendaciones SEO")
             with gr.Tab("📝 Contenido"):
                 content_output = gr.JSON(label="Análisis de Contenido")
                 gr.Examples(
-                    examples=[
-                        {"content": "Ejemplo de análisis de contenido..."}
-                    ],
                     inputs=[content_output],
                     label="Ejemplos de Salida"
                 )
             with gr.Tab("🔗 Enlaces"):
                 links_output = gr.JSON(label="Análisis de Enlaces")
-                with gr.Accordion("Visualización de Enlaces", open=False):
-                    links_plot = gr.Plot()
             with gr.Tab("📂 Documentos"):
                 gr.Markdown("""
                 ### Documentos Encontrados
                 Los documentos descargados se guardan en la carpeta `content_storage/`
                 """)
-                # Reemplazado FileExplorer por Markdown informativo
-        # Event handlers
         analyze_btn.click(
             fn=analyzer.analyze_sitemap,
             inputs=sitemap_input,
             outputs=[stats_output, recommendations_output, content_output, links_output],
             show_progress=True
         )
         clear_btn.click(
-            fn=lambda: [None]*4,
             outputs=[stats_output, recommendations_output, content_output, links_output]
         )
-        # Para descargar el reporte, primero se debe generar
-        def generate_report():
-            if analyzer.current_analysis:
-                report_path = "content_storage/seo_report.json"
-                with open(report_path, 'w') as f:
-                    json.dump(analyzer.current_analysis, f, indent=2)
-                return report_path
-            return None
         download_btn.click(
             fn=generate_report,
             outputs=gr.File(label="Descargar Reporte")
         )
     return interface
-def setup_spacy_model():
-    """Descarga el modelo de spaCy si no está instalado"""
     try:
         spacy.load("es_core_news_lg")
-        logger.info("Modelo spaCy 'es_core_news_lg' cargado correctamente")
     except OSError:
         logger.info("Descargando modelo spaCy 'es_core_news_lg'...")
         try:
@@ -472,17 +528,18 @@ def setup_spacy_model():
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE
             )
-            logger.info("Modelo descargado exitosamente")
         except subprocess.CalledProcessError as e:
             logger.error(f"Error al descargar modelo: {e.stderr.decode()}")
             raise RuntimeError("No se pudo descargar el modelo spaCy") from e
 if __name__ == "__main__":
     setup_spacy_model()
     app = create_interface()
     app.launch(
         server_name="0.0.0.0",
         server_port=7860,
         show_error=True,
         share=False
-    )

 import numpy as np
 import pandas as pd
 from io import BytesIO
+from typing import List, Dict, Optional, Tuple, Any
 from urllib.parse import urlparse, urljoin
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
 import subprocess
 import sys
 import spacy
 import gradio as gr
 import matplotlib.pyplot as plt
 )
 logger = logging.getLogger(__name__)
+def sanitize_filename(filename: str) -> str:
+    """
+    Sanitiza el nombre de un archivo eliminando o reemplazando caracteres no permitidos.
+    """
+    filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
+    filename = re.sub(r'\s+', '_', filename)
+    return filename
 class SEOSpaceAnalyzer:
+    """
+    Clase principal que encapsula la lógica para analizar un sitio web a partir de su sitemap.
+    """
+    def __init__(self, max_urls: int = 20, max_workers: int = 4) -> None:
+        """
+        Inicializa la sesión, carga los modelos y configura parámetros.
+        :param max_urls: Número máximo de URLs a procesar en un análisis.
+        :param max_workers: Número de hilos para la ejecución concurrente.
+        """
+        self.max_urls = max_urls
+        self.max_workers = max_workers
         self.session = self._configure_session()
         self.models = self._load_models()
         self.base_dir = Path("content_storage")
         self.base_dir.mkdir(parents=True, exist_ok=True)
+        self.current_analysis: Dict[str, Any] = {}
+    def _load_models(self) -> Dict[str, Any]:
+        """Carga modelos optimizados para Hugging Face y spaCy."""
         try:
             device = 0 if torch.cuda.is_available() else -1
+            logger.info("Cargando modelos NLP...")
+            models = {
                 'summarizer': pipeline("summarization", model="facebook/bart-large-cnn", device=device),
                 'ner': pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple", device=device),
                 'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
                 'spacy': spacy.load("es_core_news_lg")
             }
+            logger.info("Modelos cargados correctamente.")
+            return models
         except Exception as e:
+            logger.error(f"Error cargando modelos: {e}")
             raise
     def _configure_session(self) -> requests.Session:
+        """Configura una sesión HTTP con reintentos y headers personalizados."""
         session = requests.Session()
         retry = Retry(
             total=3,
             'Accept-Language': 'es-ES,es;q=0.9'
         })
         return session
     def analyze_sitemap(self, sitemap_url: str) -> Tuple[Dict, List[str], Dict, Dict]:
+        """
+        Analiza un sitemap completo, procesando URLs en paralelo y generando estadísticas, análisis de contenido, enlaces y recomendaciones SEO.
+        :param sitemap_url: URL del sitemap XML.
+        :return: Tuple con estadísticas, recomendaciones, análisis de contenido y análisis de enlaces.
+        """
         try:
+            logger.info(f"Parseando sitemap: {sitemap_url}")
             urls = self._parse_sitemap(sitemap_url)
             if not urls:
+                logger.warning("No se pudieron extraer URLs del sitemap.")
                 return {"error": "No se pudieron extraer URLs del sitemap"}, [], {}, {}
+            results: List[Dict] = []
+            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                futures = {executor.submit(self._process_url, url): url for url in urls[:self.max_urls]}
                 for future in as_completed(futures):
+                    url = futures[future]
                     try:
+                        res = future.result()
+                        results.append(res)
+                        logger.info(f"Procesado: {url}")
                     except Exception as e:
+                        logger.error(f"Error procesando {url}: {e}")
                         results.append({'url': url, 'status': 'error', 'error': str(e)})
             self.current_analysis = {
                 'stats': self._calculate_stats(results),
                 'content_analysis': self._analyze_content(results),
                 'recommendations': self._generate_seo_recommendations(results),
                 'timestamp': datetime.now().isoformat()
             }
+            return (self.current_analysis['stats'],
+                    self.current_analysis['recommendations'],
+                    self.current_analysis['content_analysis'],
+                    self.current_analysis['links'])
         except Exception as e:
+            logger.error(f"Error en análisis: {e}")
             return {"error": str(e)}, [], {}, {}
     def _process_url(self, url: str) -> Dict:
         """Download one URL and extract its content according to the response type.

         PDF responses go through ``_process_pdf`` and HTML responses through
         ``_process_html``; any other content type is recorded as ``unknown``
         with empty content. The raw body is handed to ``_save_content`` in
         every successful case.

         :param url: Address to fetch.
         :return: Dict holding ``url`` and ``status`` (``success`` or ``error``)
             plus either the extracted fields or an ``error`` message.
         """
         try:
             reply = self.session.get(url, timeout=15)
             reply.raise_for_status()
             mime = reply.headers.get('Content-Type', '')
             if 'application/pdf' in mime:
                 extracted = self._process_pdf(reply.content)
             elif 'text/html' in mime:
                 extracted = self._process_html(reply.text, url)
             else:
                 extracted = {'type': 'unknown', 'content': '', 'word_count': 0}
             # Extracted fields are merged last so they win over the defaults.
             outcome: Dict[str, Any] = {'url': url, 'status': 'success', **extracted}
             self._save_content(url, reply.content)
             return outcome
         except requests.exceptions.RequestException as e:
             logger.warning(f"Error procesando {url}: {str(e)}")
             return {'url': url, 'status': 'error', 'error': str(e)}
         except Exception as e:
             logger.error(f"Error inesperado en {url}: {str(e)}")
             return {'url': url, 'status': 'error', 'error': str(e)}
     def _process_html(self, html: str, base_url: str) -> Dict:
+        """Procesa contenido HTML: extrae y limpia el texto, enlaces y metadatos."""
         soup = BeautifulSoup(html, 'html.parser')
         clean_text = self._clean_text(soup.get_text())
         return {
             'type': 'html',
             'content': clean_text,
             'links': self._extract_links(soup, base_url),
             'metadata': self._extract_metadata(soup)
         }
     def _process_pdf(self, content: bytes) -> Dict:
+        """Procesa documentos PDF extrayendo texto de cada página."""
         try:
             text = ""
             with BytesIO(content) as pdf_file:
                 reader = PyPDF2.PdfReader(pdf_file)
                 for page in reader.pages:
+                    extracted = page.extract_text()
+                    text += extracted if extracted else ""
             clean_text = self._clean_text(text)
             return {
                 'type': 'pdf',
                 'page_count': len(reader.pages)
             }
         except PyPDF2.PdfReadError as e:
+            logger.error(f"Error leyendo PDF: {e}")
             return {'type': 'pdf', 'error': str(e)}
     def _clean_text(self, text: str) -> str:
+        """Realiza la limpieza y normalización del texto."""
         if not text:
             return ""
         text = re.sub(r'\s+', ' ', text)
         return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
     def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
+        """Extrae y clasifica enlaces presentes en el HTML."""
+        links: List[Dict] = []
         base_netloc = urlparse(base_url).netloc
         for tag in soup.find_all('a', href=True):
             try:
                 href = tag['href'].strip()
                 if not href or href.startswith('javascript:'):
                     continue
                 full_url = urljoin(base_url, href)
                 parsed = urlparse(full_url)
                 links.append({
                     'url': full_url,
                     'type': 'internal' if parsed.netloc == base_netloc else 'external',
                     'file_type': self._get_file_type(parsed.path)
                 })
             except Exception as e:
+                logger.warning(f"Error procesando enlace {tag.get('href')}: {e}")
                 continue
         return links
     def _get_file_type(self, path: str) -> str:
+        """Determina el tipo de archivo según la extensión encontrada en la URL."""
         ext = Path(path).suffix.lower()
         return ext[1:] if ext else 'html'
     def _extract_metadata(self, soup: BeautifulSoup) -> Dict:
         """Collect the SEO-relevant metadata of a parsed HTML document.

         :param soup: Parsed document.
         :return: Dict with ``title`` (at most 200 characters), ``description``
             (at most 300 characters), ``keywords`` (list of non-empty,
             stripped entries) and ``og`` (Open Graph properties keyed without
             the ``og:`` prefix).
         """
         page_title = ''
         if soup.title and soup.title.string:
             page_title = soup.title.string.strip()[:200]

         description = ''
         keywords = []
         open_graph = {}
         for tag in soup.find_all('meta'):
             meta_name = tag.get('name', '').lower()
             meta_property = tag.get('property', '').lower()
             value = tag.get('content', '')
             if meta_name == 'description':
                 description = value[:300]
                 continue
             if meta_name == 'keywords':
                 keywords = [part.strip() for part in value.split(',') if part.strip()]
                 continue
             if meta_property.startswith('og:'):
                 open_graph[meta_property[3:]] = value

         return {
             'title': page_title,
             'description': description,
             'keywords': keywords,
             'og': open_graph
         }
     def _parse_sitemap(self, sitemap_url: str) -> List[str]:
+        """
+        Parsea un sitemap XML e incluso maneja índices de sitemaps.
+        :return: Lista de URLs encontradas en el sitemap.
+        """
         try:
             response = self.session.get(sitemap_url, timeout=10)
             response.raise_for_status()
             if 'xml' not in response.headers.get('Content-Type', ''):
                 logger.warning(f"El sitemap no parece ser XML: {sitemap_url}")
                 return []
+            soup = BeautifulSoup(response.text, 'lxml-xml')
+            urls: List[str] = []
+            # Manejo de sitemap index
             if soup.find('sitemapindex'):
                 for sitemap in soup.find_all('loc'):
                     url = sitemap.text.strip()
                         urls.extend(self._parse_sitemap(url))
             else:
                 urls = [loc.text.strip() for loc in soup.find_all('loc')]
+            # Filtrar URLs que empiezan por http y eliminar duplicados
+            filtered_urls = list({url for url in urls if url.startswith('http')})
+            return filtered_urls
         except Exception as e:
+            logger.error(f"Error al parsear el sitemap {sitemap_url}: {e}")
             return []
     def _save_content(self, url: str, content: bytes) -> None:
         """Persist a downloaded body under ``self.base_dir``, best effort.

         The target is ``<base_dir>/<host>/<flattened URL path>``. Both the
         host and the path go through ``sanitize_filename``, which also
         replaces ``/``, so the whole URL path becomes one file name. An
         empty path or one ending in ``/`` is stored as its ``index.html``.
         Nothing is written when an identical file already exists.

         Failures are logged and swallowed so that a storage problem never
         fails the analysis of the URL itself.

         :param url: Address the content was fetched from.
         :param content: Raw response body.
         """
         try:
             parsed = urlparse(url)
             path = parsed.path.lstrip('/')
             if not path or path.endswith('/'):
                 path += 'index.html'
             host = sanitize_filename(parsed.netloc)
             name = sanitize_filename(path)
             # A dot-only component would resolve to the directory itself or
             # its parent and let a crafted URL leave the storage tree.
             if not host.strip('.'):
                 host = '_'
             if not name.strip('.'):
                 name = '_'
             save_path = self.base_dir / host / name
             save_path.parent.mkdir(parents=True, exist_ok=True)
             # The existing file has to be read in full anyway, so comparing
             # the bytes directly is cheaper than hashing both sides.
             if save_path.is_file() and save_path.read_bytes() == content:
                 logger.debug(f"El contenido de {url} ya está guardado y es idéntico.")
                 return
             save_path.write_bytes(content)
             logger.info(f"Contenido guardado en: {save_path}")
         except Exception as e:
             logger.error(f"Error al guardar contenido para {url}: {e}")
     def _calculate_stats(self, results: List[Dict]) -> Dict:
+        """Calcula estadísticas básicas sobre el conjunto de resultados procesados."""
         successful = [r for r in results if r.get('status') == 'success']
+        content_types = [r.get('type', 'unknown') for r in successful]
+        avg_word_count = round(np.mean([r.get('word_count', 0) for r in successful]) if successful else 0, 1)
         return {
             'total_urls': len(results),
             'successful': len(successful),
             'failed': len(results) - len(successful),
+            'content_types': pd.Series(content_types).value_counts().to_dict(),
+            'avg_word_count': avg_word_count,
             'failed_urls': [r['url'] for r in results if r.get('status') != 'success']
         }
     def _analyze_content(self, results: List[Dict]) -> Dict:
+        """
+        Analiza el contenido extraído usando TF-IDF y muestra algunas muestras.
+        :return: Diccionario con keywords y ejemplos de contenido.
+        """
         successful = [r for r in results if r.get('status') == 'success' and r.get('content')]
+        texts = [r['content'] for r in successful if len(r['content'].split()) > 10]
         if not texts:
             return {'top_keywords': [], 'content_samples': []}
         try:
             stop_words = list(self.models['spacy'].Defaults.stop_words)
+            vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50, ngram_range=(1, 2))
             tfidf = vectorizer.fit_transform(texts)
             feature_names = vectorizer.get_feature_names_out()
+            sorted_indices = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[-10:]
+            top_keywords = feature_names[sorted_indices][::-1].tolist()
         except Exception as e:
+            logger.error(f"Error en análisis TF-IDF: {e}")
             top_keywords = []
         return {
             'top_keywords': top_keywords,
+            'content_samples': [{'url': r['url'], 'sample': (r['content'][:500] + '...') if len(r['content']) > 500 else r['content']}
+                                for r in successful[:3]]
         }
     def _analyze_links(self, results: List[Dict]) -> Dict:
+        """
+        Analiza la estructura de enlaces en el contenido procesado.
+        :return: Estadísticas de enlaces internos, dominios externos, anclas y tipos de archivos.
+        """
         all_links = []
         for result in results:
             if result.get('links'):
                 all_links.extend(result['links'])
         if not all_links:
             return {
                 'internal_links': {},
                 'common_anchors': {},
                 'file_types': {}
             }
         df = pd.DataFrame(all_links)
         return {
             'internal_links': df[df['type'] == 'internal']['url'].value_counts().head(20).to_dict(),
             'external_domains': df[df['type'] == 'external']['url']
             'common_anchors': df['anchor'].value_counts().head(10).to_dict(),
             'file_types': df['file_type'].value_counts().to_dict()
         }
     def _generate_seo_recommendations(self, results: List[Dict]) -> List[str]:
+        """
+        Genera recomendaciones SEO basadas en metadatos, cantidad de contenido y estructura de enlaces.
+        :return: Lista de recomendaciones.
+        """
         successful = [r for r in results if r.get('status') == 'success']
         if not successful:
             return ["No se pudo analizar ningún contenido exitosamente"]
         recs = []
         missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
         if missing_titles:
             recs.append(f"📌 Añadir títulos a {missing_titles} páginas")
+        short_descriptions = sum(1 for r in successful if not r.get('metadata', {}).get('description'))
         if short_descriptions:
             recs.append(f"📌 Añadir meta descripciones a {short_descriptions} páginas")
         short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
         if short_content:
             recs.append(f"📝 Ampliar contenido en {short_content} páginas (menos de 300 palabras)")
         all_links = [link for r in results for link in r.get('links', [])]
         if all_links:
             df_links = pd.DataFrame(all_links)
             internal_links = df_links[df_links['type'] == 'internal']
+            if len(internal_links) > 100:
                 recs.append(f"🔗 Optimizar estructura de enlaces internos ({len(internal_links)} enlaces)")
         return recs if recs else ["✅ No se detectaron problemas críticos de SEO"]
+    def _plot_internal_links(self, links_data: Dict) -> Optional[plt.Figure]:
+        """
+        Genera un gráfico de barras para la distribución de enlaces internos.
+        :param links_data: Diccionario con los enlaces internos.
+        :return: Figura de matplotlib o None si no hay datos.
+        """
+        internal_links = links_data.get('internal_links', {})
+        if not internal_links:
+            return None
+        fig, ax = plt.subplots()
+        names = list(internal_links.keys())
+        counts = list(internal_links.values())
+        ax.barh(names, counts)
+        ax.set_xlabel("Cantidad de enlaces")
+        ax.set_title("Top 20 Enlaces Internos")
+        plt.tight_layout()
+        return fig
+def create_interface() -> gr.Blocks:
+    """
+    Crea la interfaz de usuario utilizando Gradio.
+    """
     analyzer = SEOSpaceAnalyzer()
     with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
         gr.Markdown("""
         # 🕵️ SEO Analyzer Pro
         Sube la URL de un sitemap.xml para analizar todo el sitio web.
         """)
         with gr.Row():
             with gr.Column():
+                sitemap_input = gr.Textbox(label="URL del Sitemap",
+                                           placeholder="https://ejemplo.com/sitemap.xml",
+                                           interactive=True)
                 analyze_btn = gr.Button("Analizar Sitio", variant="primary")
                 with gr.Row():
                     clear_btn = gr.Button("Limpiar")
                     download_btn = gr.Button("Descargar Reporte", variant="secondary")
+                    plot_btn = gr.Button("Visualizar Enlaces Internos", variant="secondary")
             with gr.Column():
                 status_output = gr.Textbox(label="Estado del Análisis", interactive=False)
                 progress_bar = gr.Progress()
         with gr.Tabs():
             with gr.Tab("📊 Resumen"):
                 stats_output = gr.JSON(label="Estadísticas Generales")
                 recommendations_output = gr.JSON(label="Recomendaciones SEO")
             with gr.Tab("📝 Contenido"):
                 content_output = gr.JSON(label="Análisis de Contenido")
                 gr.Examples(
+                    examples=[{"content": "Ejemplo de análisis de contenido..."}],
                     inputs=[content_output],
                     label="Ejemplos de Salida"
                 )
             with gr.Tab("🔗 Enlaces"):
                 links_output = gr.JSON(label="Análisis de Enlaces")
+                links_plot = gr.Plot(label="Visualización de Enlaces Internos")
             with gr.Tab("📂 Documentos"):
                 gr.Markdown("""
                 ### Documentos Encontrados
                 Los documentos descargados se guardan en la carpeta `content_storage/`
                 """)
+        # Función que genera el reporte y lo guarda en disco
+        def generate_report() -> Optional[str]:
+            if analyzer.current_analysis:
+                report_path = "content_storage/seo_report.json"
+                try:
+                    with open(report_path, 'w', encoding='utf-8') as f:
+                        json.dump(analyzer.current_analysis, f, indent=2, ensure_ascii=False)
+                    return report_path
+                except Exception as e:
+                    logger.error(f"Error generando reporte: {e}")
+                    return None
+            return None
+        # Callback para generar gráfico de enlaces internos a partir del análisis almacenado
+        def generate_internal_links_plot(links_json: Dict) -> Any:
+            fig = analyzer._plot_internal_links(links_json)
+            return fig if fig is not None else {}
+        # Asignación de acciones a botones y otros eventos
         analyze_btn.click(
             fn=analyzer.analyze_sitemap,
             inputs=sitemap_input,
             outputs=[stats_output, recommendations_output, content_output, links_output],
             show_progress=True
         )
         clear_btn.click(
+            fn=lambda: [None] * 4,
             outputs=[stats_output, recommendations_output, content_output, links_output]
         )
         download_btn.click(
             fn=generate_report,
             outputs=gr.File(label="Descargar Reporte")
         )
+        plot_btn.click(
+            fn=generate_internal_links_plot,
+            inputs=links_output,
+            outputs=links_plot
+        )
     return interface
+def setup_spacy_model() -> None:
+    """
+    Verifica y descarga el modelo de spaCy 'es_core_news_lg' si no está instalado.
+    """
     try:
         spacy.load("es_core_news_lg")
+        logger.info("Modelo spaCy 'es_core_news_lg' cargado correctamente.")
     except OSError:
         logger.info("Descargando modelo spaCy 'es_core_news_lg'...")
         try:
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE
             )
+            logger.info("Modelo descargado exitosamente.")
         except subprocess.CalledProcessError as e:
             logger.error(f"Error al descargar modelo: {e.stderr.decode()}")
             raise RuntimeError("No se pudo descargar el modelo spaCy") from e
 if __name__ == "__main__":
     # Make sure the spaCy model is available before the analyzer loads it.
     setup_spacy_model()
     app = create_interface()
     # Listen on all interfaces, port 7860, without a public share link.
     app.launch(
         server_name="0.0.0.0",
         server_port=7860,
         show_error=True,
         share=False
     )