Update app.py
app.py CHANGED
Removed (old side of the diff):

@@ -15,38 +15,18 @@ from bs4 import BeautifulSoup
 from pathlib import Path
 from datetime import datetime
 from collections import defaultdict
-
-import gradio as gr
-import matplotlib.pyplot as plt
-from sklearn.feature_extraction.text import TfidfVectorizer
-import json
-import logging
-import re
-import requests
-import hashlib
-import PyPDF2
-import numpy as np
-import pandas as pd
-from io import BytesIO
-from typing import List, Dict, Optional
-from urllib.parse import urlparse, urljoin
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from bs4 import BeautifulSoup
-from pathlib import Path
-from datetime import datetime
-from collections import defaultdict
-
-import gradio as gr
-import matplotlib.pyplot as plt
 from sklearn.feature_extraction.text import TfidfVectorizer
 from requests.adapters import HTTPAdapter
-from …
 from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
 from sentence_transformers import SentenceTransformer
 import spacy
 import torch

-
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -55,12 +35,11 @@ class SEOSpaceAnalyzer:
         self.session = self._configure_session()
         self.models = self._load_models()
         self.base_dir = Path("content_storage")
-        self.…
-        self.documents = []
         self.current_analysis = {}

     def _configure_session(self):
-        """…
         session = requests.Session()
         retry = Retry(
             total=3,
@@ -76,201 +55,270 @@ class SEOSpaceAnalyzer:
         return session

     def _load_models(self):
-        """Loads models…
         device = 0 if torch.cuda.is_available() else -1
         return {
             'summarizer': pipeline("summarization",
                                    model="facebook/bart-large-cnn",
                                    device=device),
             'ner': pipeline("ner",
-                            …
-                            device=device),
-            'qa': pipeline("question-answering",
-                           model="deepset/roberta-base-squad2",
-                           device=device),
             'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
             'spacy': spacy.load("es_core_news_lg")
         }

-    def …
-        """…
         try:
-            response = self.session.get(url, timeout=…
             response.raise_for_status()

             content_type = response.headers.get('Content-Type', '')
-            result = {'url': url, '…

             if 'application/pdf' in content_type:
                 result.update(self._process_pdf(response.content))
             elif 'text/html' in content_type:
                 result.update(self._process_html(response.text, url))
-
-            self._save_content(url, response.content)
-            return result

         except Exception as e:
-            logger.…
-            return {'url': url, 'error': str(e)}

-    def _process_html(self, html, base_url):
         """Processes HTML content"""
         soup = BeautifulSoup(html, 'lxml')
         return {
-            'content': self._clean_text(soup.get_text()),
             'type': 'html',
-            '…
-            '…
         }

-    def _process_pdf(self, content):
         """Processes PDF documents"""
         text = ""
         with BytesIO(content) as pdf_file:
             reader = PyPDF2.PdfReader(pdf_file)
             for page in reader.pages:
                 text += page.extract_text()
-
         return {
-            'content': self._clean_text(text),
             'type': 'pdf',
-            '…
         }

-    def …
         """Extracts and classifies links"""
         links = []
         for tag in soup.find_all('a', href=True):
-            …
         return links

-    def _get_file_type(self, …
-        """Determines…
-        ext = Path(…
         return ext[1:] if ext else 'html'

-    def …
-        """…
-        …
-        return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
-
-    def _save_content(self, url, content):
-        """Stores the downloaded content"""
-        path = urlparse(url).path.lstrip('/')
-        save_path = self.base_dir / urlparse(url).netloc / path
-        save_path.parent.mkdir(parents=True, exist_ok=True)
-        …

-    def analyze_sitemap(self, sitemap_url):
-        """Analyzes the whole sitemap and generates reports"""
-        urls = self._parse_sitemap(sitemap_url)
-        results = []
-        …
-            'basic_stats': self._calculate_stats(results),
-            'content_analysis': self._analyze_content(results),
-            'link_analysis': self._analyze_links(results),
-            'seo_recommendations': self._generate_recommendations(results)
-        }
-
-        return self.current_analysis

-    def _parse_sitemap(self, sitemap_url):
-        """Parses…
-        …

-    def _calculate_stats(self, results):
-        """Computes basic statistics…
         return {
             'total_urls': len(results),
-            '…
-            '…
         }

-    def …
-        """…
-        …
         }

-        …
-        with open(json_path, 'w') as f:
-            json.dump(report, f)
-
-        # Create a CSV with the links
-        df = pd.DataFrame([link for result in self.current_analysis['link_analysis'] for link in result['links']])
-        csv_path = self.base_dir / 'links_analysis.csv'
-        df.to_csv(csv_path, index=False)

-    def …
-        """Generates…
-        …
-        )

-    # …
 def create_interface():
     analyzer = SEOSpaceAnalyzer()

     with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
-        gr.Markdown("…

         with gr.Row():
-            …

-        with gr.…
-            …

         analyze_btn.click(
             fn=analyzer.analyze_sitemap,
             inputs=sitemap_url,
-            outputs=[…
-        download_btn.click(
-            fn=analyzer.create_report,
-            outputs=report_download
         )

     return interface

 if __name__ == "__main__":
-    …
app.py after the change (new side of the diff):

@@ -15,38 +15,18 @@
 from pathlib import Path
 from datetime import datetime
 from collections import defaultdict
 from sklearn.feature_extraction.text import TfidfVectorizer
 from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
 from sentence_transformers import SentenceTransformer
 import spacy
 import torch

+import gradio as gr
+import matplotlib.pyplot as plt
+
+# Logging configuration
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -55,12 +35,11 @@ class SEOSpaceAnalyzer:
         self.session = self._configure_session()
         self.models = self._load_models()
         self.base_dir = Path("content_storage")
+        self.base_dir.mkdir(exist_ok=True)
         self.current_analysis = {}

     def _configure_session(self):
+        """Configures an HTTP session with retries"""
         session = requests.Session()
         retry = Retry(
             total=3,
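The hunk stops at total=3, so the rest of _configure_session (new lines 46-54) is not shown in this diff. Given the HTTPAdapter and Retry imports above, the usual completion mounts the retry policy on both schemes; everything beyond total=3 below is an assumption for illustration, not lines from this commit:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# total=3 comes from the diff; backoff_factor and status_forcelist are illustrative guesses
retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))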
@@ -76,201 +55,270 @@ class SEOSpaceAnalyzer:
         return session

     def _load_models(self):
+        """Loads models optimized for Hugging Face"""
         device = 0 if torch.cuda.is_available() else -1
         return {
             'summarizer': pipeline("summarization",
                                    model="facebook/bart-large-cnn",
                                    device=device),
             'ner': pipeline("ner",
+                            model="dslim/bert-base-NER",
+                            device=device),
             'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
             'spacy': spacy.load("es_core_news_lg")
         }

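_load_models assumes es_core_news_lg is already installed in the Space; spacy.load raises OSError otherwise. A defensive variant, sketched here as an option rather than something in this commit, downloads the model on first use:

import spacy
from spacy.cli import download

try:
    nlp = spacy.load("es_core_news_lg")
except OSError:
    # assumes downloading at build/start time is acceptable for the Space
    download("es_core_news_lg")
    nlp = spacy.load("es_core_news_lg")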
+    def analyze_sitemap(self, sitemap_url: str):
+        """Analyzes a complete sitemap"""
+        try:
+            urls = self._parse_sitemap(sitemap_url)
+            if not urls:
+                return {"error": "No se pudieron extraer URLs del sitemap"}
+
+            results = []
+            with ThreadPoolExecutor(max_workers=4) as executor:
+                futures = [executor.submit(self._process_url, url) for url in urls[:50]]  # limit for the demo
+                for future in as_completed(futures):
+                    results.append(future.result())
+
+            self.current_analysis = {
+                'stats': self._calculate_stats(results),
+                'content_analysis': self._analyze_content(results),
+                'links': self._analyze_links(results),
+                'recommendations': self._generate_seo_recommendations(results)
+            }
+
+            return self.current_analysis
+
+        except Exception as e:
+            logger.error(f"Error en análisis: {str(e)}")
+            return {"error": str(e)}
+
+    def _process_url(self, url: str):
+        """Processes a single URL"""
         try:
+            response = self.session.get(url, timeout=10)
             response.raise_for_status()

             content_type = response.headers.get('Content-Type', '')
+            result = {'url': url, 'status': 'success'}

             if 'application/pdf' in content_type:
                 result.update(self._process_pdf(response.content))
             elif 'text/html' in content_type:
                 result.update(self._process_html(response.text, url))

+            return result
         except Exception as e:
+            logger.warning(f"Error procesando {url}: {str(e)}")
+            return {'url': url, 'status': 'error', 'error': str(e)}

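For reference, a successful HTML result from _process_url combines the base fields with the keys returned by _process_html; the values below are illustrative only:

# Illustrative shape only; the keys match _process_url and _process_html above
example_result = {
    'url': 'https://ejemplo.com/',
    'status': 'success',
    'type': 'html',
    'content': 'texto limpio de la página ...',
    'word_count': 742,
    'links': [{'url': 'https://ejemplo.com/blog', 'type': 'internal',
               'anchor': 'Blog', 'file_type': 'html'}],
    'metadata': {'title': 'Inicio', 'description': '...', 'keywords': []}
}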
+    def _process_html(self, html: str, base_url: str):
         """Processes HTML content"""
         soup = BeautifulSoup(html, 'lxml')
+        clean_text = self._clean_text(soup.get_text())
+
         return {
             'type': 'html',
+            'content': clean_text,
+            'word_count': len(clean_text.split()),
+            'links': self._extract_links(soup, base_url),
+            'metadata': self._extract_metadata(soup)
         }

+    def _process_pdf(self, content: bytes):
         """Processes PDF documents"""
         text = ""
         with BytesIO(content) as pdf_file:
             reader = PyPDF2.PdfReader(pdf_file)
+            page_count = len(reader.pages)
             for page in reader.pages:
                 text += page.extract_text()
+
+        clean_text = self._clean_text(text)
         return {
             'type': 'pdf',
+            'content': clean_text,
+            'word_count': len(clean_text.split()),
+            'page_count': page_count
         }

+    def _clean_text(self, text: str):
+        """Advanced text cleanup"""
+        text = re.sub(r'\s+', ' ', text)
+        return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
+
+    def _extract_links(self, soup: BeautifulSoup, base_url: str):
         """Extracts and classifies links"""
         links = []
         for tag in soup.find_all('a', href=True):
+            try:
+                full_url = urljoin(base_url, tag['href'])
+                parsed = urlparse(full_url)
+
+                links.append({
+                    'url': full_url,
+                    'type': 'internal' if parsed.netloc == urlparse(base_url).netloc else 'external',
+                    'anchor': self._clean_text(tag.text)[:100],
+                    'file_type': self._get_file_type(parsed.path)
+                })
+            except Exception:
+                continue
         return links

+    def _get_file_type(self, path: str):
+        """Determines the file type from the extension"""
+        ext = Path(path).suffix.lower()
         return ext[1:] if ext else 'html'

+    def _extract_metadata(self, soup: BeautifulSoup):
+        """Extracts SEO metadata"""
+        metadata = {'title': '', 'description': '', 'keywords': []}

+        # Title
+        if soup.title:
+            metadata['title'] = (soup.title.string or '').strip()

+        # Meta tags
+        for meta in soup.find_all('meta'):
+            if meta.get('name') == 'description':
+                metadata['description'] = meta.get('content', '')[:500]
+            elif meta.get('name') == 'keywords':
+                metadata['keywords'] = [kw.strip() for kw in meta.get('content', '').split(',')]

+        return metadata
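A quick way to see what _extract_metadata produces, using an inline page rather than a crawled one (illustrative, not part of the commit):

from bs4 import BeautifulSoup

html = ("<html><head><title>Inicio</title>"
        "<meta name='description' content='Tienda online de ejemplo'>"
        "<meta name='keywords' content='seo, tienda, ejemplo'>"
        "</head><body></body></html>")
soup = BeautifulSoup(html, 'lxml')
# With the method above, _extract_metadata(soup) returns:
# {'title': 'Inicio', 'description': 'Tienda online de ejemplo',
#  'keywords': ['seo', 'tienda', 'ejemplo']}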
+    def _parse_sitemap(self, sitemap_url: str):
+        """Parses a basic XML sitemap"""
+        try:
+            response = self.session.get(sitemap_url)
+            response.raise_for_status()
+
+            urls = []
+            soup = BeautifulSoup(response.text, 'lxml')
+
+            # Sitemap index
+            for loc in soup.find_all('loc'):
+                url = loc.text.strip()
+                if url.endswith('.xml') and url != sitemap_url:
+                    urls.extend(self._parse_sitemap(url))
+                else:
+                    urls.append(url)
+
+            return list(set(urls))
+        except Exception as e:
+            logger.error(f"Error parsing sitemap: {str(e)}")
+            return []

+    def _calculate_stats(self, results: List[Dict]):
+        """Computes basic statistics"""
+        successful = [r for r in results if r.get('status') == 'success']
+
         return {
             'total_urls': len(results),
+            'successful': len(successful),
+            'failed': len(results) - len(successful),
+            'content_types': pd.Series([r.get('type', 'unknown') for r in successful]).value_counts().to_dict(),
+            'avg_word_count': np.mean([r.get('word_count', 0) for r in successful])
         }
+    def _analyze_content(self, results: List[Dict]):
+        """Analyzes content with NLP"""
+        successful = [r for r in results if r.get('status') == 'success']
+        texts = [r.get('content', '') for r in successful]
+
+        # Main topic analysis
+        vectorizer = TfidfVectorizer(stop_words=list(spacy.lang.es.stop_words.STOP_WORDS))
+        try:
+            tfidf = vectorizer.fit_transform(texts)
+            top_keywords = vectorizer.get_feature_names_out()[np.argsort(tfidf.sum(axis=0).A1)][-10:][::-1]
+        except Exception:
+            top_keywords = []
+
+        return {
+            'top_keywords': list(top_keywords),
+            'content_samples': [t[:500] + '...' for t in texts[:3]]  # content samples
         }
+
+    def _analyze_links(self, results: List[Dict]):
+        """Analyzes link structure"""
+        all_links = []
+        for result in results:
+            if result.get('links'):
+                all_links.extend(result['links'])

+        if not all_links:
+            return {}

+        df = pd.DataFrame(all_links)
+        return {
+            'internal_links': df[df['type'] == 'internal']['url'].value_counts().to_dict(),
+            'external_domains': df[df['type'] == 'external']['url'].apply(lambda x: urlparse(x).netloc).value_counts().to_dict(),
+            'common_anchors': df['anchor'].value_counts().head(10).to_dict()
+        }

+    def _generate_seo_recommendations(self, results: List[Dict]):
+        """Generates SEO recommendations"""
+        successful = [r for r in results if r.get('status') == 'success']
+
+        recs = []
+
+        # Check metadata
+        missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
+        if missing_titles:
+            recs.append(f"Añadir títulos a {missing_titles} páginas")
+
+        # Check for thin content
+        short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
+        if short_content:
+            recs.append(f"Ampliar contenido en {short_content} páginas (menos de 300 palabras)")
+
+        return recs if recs else ["No se detectaron problemas críticos de SEO"]

+# Gradio interface
 def create_interface():
     analyzer = SEOSpaceAnalyzer()

     with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
+        gr.Markdown("""
+        # 🕵️ SEO Analyzer Pro
+        *Analizador SEO avanzado con modelos de lenguaje*
+        """)

         with gr.Row():
+            with gr.Column():
+                sitemap_url = gr.Textbox(
+                    label="URL del Sitemap",
+                    placeholder="https://ejemplo.com/sitemap.xml",
+                    interactive=True
+                )
+                analyze_btn = gr.Button("Analizar", variant="primary")
+
+            with gr.Column():
+                status = gr.Textbox(label="Estado", interactive=False)

+        with gr.Tabs():
+            with gr.Tab("Resumen"):
+                stats = gr.JSON(label="Estadísticas")
+                recommendations = gr.JSON(label="Recomendaciones SEO")

+            with gr.Tab("Contenido"):
+                content_analysis = gr.JSON(label="Análisis de Contenido")
+                content_samples = gr.JSON(label="Muestras de Contenido")

+            with gr.Tab("Enlaces"):
+                links_analysis = gr.JSON(label="Análisis de Enlaces")
+                links_plot = gr.Plot()

+        # Event handlers
         analyze_btn.click(
             fn=analyzer.analyze_sitemap,
             inputs=sitemap_url,
+            outputs=[stats, recommendations, content_analysis, links_analysis],
+            api_name="analyze"
         )

     return interface

 if __name__ == "__main__":
+    app = create_interface()
+    app.launch(server_name="0.0.0.0", server_port=7860)
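Outside the Gradio UI the analyzer can be exercised directly. A minimal sketch, assuming app.py is importable and that network access and the models are available:

from app import SEOSpaceAnalyzer

analyzer = SEOSpaceAnalyzer()
report = analyzer.analyze_sitemap("https://ejemplo.com/sitemap.xml")
print(report.get('stats'))
print(report.get('recommendations'))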