import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import pandas as pd
from difflib import SequenceMatcher
from xml.etree import ElementTree as ET
from openpyxl import Workbook
from openpyxl.styles import PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows
import gradio as gr

# Sitemap XML namespace used by the <urlset>/<url>/<loc> elements.
_SITEMAP_NS = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
# Per-request timeout (seconds) so one hung server cannot stall the crawl.
_REQUEST_TIMEOUT = 10
# Two titles count as "similar" above this SequenceMatcher ratio.
_SIMILARITY_THRESHOLD = 0.8


def _similar_title_rows(titles, threshold=_SIMILARITY_THRESHOLD):
    """Return the set of 0-based indices of titles that near-duplicate another.

    A title is flagged when its SequenceMatcher ratio against any *other*
    title in ``titles`` exceeds ``threshold``.
    """
    flagged = set()
    for i, first in enumerate(titles):
        for j in range(i + 1, len(titles)):
            if SequenceMatcher(None, first, titles[j]).ratio() > threshold:
                flagged.add(i)
                flagged.add(j)
    return flagged


def crawl_website_from_sitemap(sitemap_url):
    """Crawl pages reachable from *sitemap_url* and export SEO data to Excel.

    Fetches the sitemap, visits each listed URL (plus same-domain
    ``.html``/``.htm`` links discovered on those pages), collects the page
    title and the ``keywords``/``description`` meta tags, and writes
    ``<domain>.xlsx`` with near-duplicate titles highlighted in yellow.

    Returns the Excel file path on success, or the string
    ``"Error fetching sitemap"`` when the sitemap cannot be retrieved.
    """
    visited_urls = set()
    data = {'URLs': [], 'Title': [], 'Keywords': [], 'Description': []}

    def crawl(url):
        # Best-effort page visit: skip duplicates, swallow network errors.
        if url in visited_urls:
            return
        visited_urls.add(url)

        parsed_url = urlparse(url)
        try:
            response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        except requests.exceptions.RequestException:
            return  # unreachable page; keep crawling the rest
        if response.status_code != 200:
            return

        soup = BeautifulSoup(response.content, 'html.parser')

        title = soup.title.string if soup.title else ''

        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        meta_keywords = (meta_keywords['content']
                         if meta_keywords and 'content' in meta_keywords.attrs
                         else '')
        meta_description = soup.find('meta', attrs={'name': 'description'})
        meta_description = (meta_description['content']
                            if meta_description and 'content' in meta_description.attrs
                            else '')

        data['URLs'].append(url)
        data['Title'].append(title)
        data['Keywords'].append(meta_keywords)
        data['Description'].append(meta_description)

        # Follow same-domain links that look like HTML pages.
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href or href.startswith('#'):
                continue
            absolute_url = urljoin(url, href)
            parsed_absolute_url = urlparse(absolute_url)
            if (parsed_absolute_url.netloc == parsed_url.netloc
                    and parsed_absolute_url.path.endswith(('.html', '.htm'))):
                # crawl() handles its own request errors; no try needed here.
                crawl(absolute_url)

    # Fetch and parse the sitemap.
    try:
        response = requests.get(sitemap_url, timeout=_REQUEST_TIMEOUT)
    except requests.exceptions.RequestException:
        return "Error fetching sitemap"
    if response.status_code != 200:
        return "Error fetching sitemap"

    # NOTE(review): ET.fromstring parses untrusted remote XML; consider
    # defusedxml if hostile input is a concern.
    root = ET.fromstring(response.content)
    for url_element in root.iter(_SITEMAP_NS + 'url'):
        loc_element = url_element.find(_SITEMAP_NS + 'loc')
        if loc_element is not None:
            crawl(loc_element.text)

    df = pd.DataFrame(data)
    domain_name = urlparse(sitemap_url).netloc

    # Write the collected rows to a workbook.
    wb = Workbook()
    ws = wb.active
    for row in dataframe_to_rows(df, index=False, header=True):
        ws.append(row)

    # BUG FIX: the original tested ``cell.value in df['Title']``, which checks
    # the Series *index* (0, 1, 2, ...) rather than the title strings, so no
    # cell was ever highlighted. Highlight titles that near-duplicate another
    # (SequenceMatcher ratio > 0.8, the threshold the original computed into
    # an unused variable).
    fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
    for row_idx in _similar_title_rows(list(df['Title'])):
        # +2 accounts for the header row and openpyxl's 1-based rows;
        # column 2 is "Title".
        ws.cell(row=row_idx + 2, column=2).fill = fill

    file_path = f'{domain_name}.xlsx'
    wb.save(file_path)
    return file_path


# Gradio UI: takes a sitemap URL, returns the generated Excel file.
iface = gr.Interface(
    fn=crawl_website_from_sitemap,
    inputs="text",
    outputs="file",
    title="Sitemap to SEO Tracking Excel",
    description=(
        "To collect SEO data (Page URL, Title, Description, Keywords) from a "
        "sitemap URL, use a Python script with BeautifulSoup and pandas. The "
        "script crawls the sitemap, extracts data from each page, and exports "
        "the results to an Excel file for analysis."
    ),
    allow_flagging=False,
    examples=[["http://www.embedded-innovations.com/sitemap.xml"]],
)

if __name__ == "__main__":
    iface.launch()