import gradio as gr
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import re
import os

# Column order shared by the parser output, the seed DataFrame, and the UI grid.
COLUMNS = ('Title', 'Date', 'Author', 'Views', 'Reviews', 'Content')


def parse_content(html_content):
    """Parse an article HTML snippet into a flat dict of strings.

    Looks for a ``div.ArticleContentBox`` container (Naver-cafe-style markup —
    assumed from the class names; confirm against the actual pages) and pulls
    out title, date, author, view count, review count, and body text.  Any
    piece that cannot be found or parsed degrades to the string ``"null"``
    rather than raising.

    Args:
        html_content: Raw HTML string pasted by the user.

    Returns:
        dict with keys Title, Date, Author, Views, Reviews, Content
        (all string values).
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    data = soup.find('div', class_='ArticleContentBox')
    if data is None:
        # No recognizable article container: return a full "null" row instead
        # of crashing with AttributeError on data.find(...) below.
        return {key: "null" for key in COLUMNS}

    article_title = data.find("h3", {"class": "title_text"})
    article_date = data.find("span", {"class": "date"})
    article_views = data.find("span", {"class": "count"})
    article_reviews = data.find("strong", {"class": "num"})
    article_content = data.find("div", {"class": "se-main-container"})
    article_author = data.find("button", {"class": "nickname"})

    article_title = article_title.text.strip() if article_title else "null"

    if article_date:
        article_date_str = article_date.text.strip()
        try:
            # Site format looks like "2024.01.31. 12:34"; normalize to ISO-ish.
            article_date_obj = datetime.strptime(article_date_str, '%Y.%m.%d. %H:%M')
            article_date = article_date_obj.strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            # Unexpected date layout: keep the raw text rather than crash.
            article_date = article_date_str
    else:
        article_date = "null"

    if article_views:
        # Count text is expected to look like "<label> <number>"; guard the
        # index so a label-less count does not raise IndexError.
        parts = article_views.text.strip().split(' ')
        article_views = parts[1] if len(parts) > 1 else parts[0]
    else:
        article_views = "null"

    article_reviews = article_reviews.text.strip() if article_reviews else "null"

    if article_content:
        # Collapse all runs of whitespace (including newlines) to single
        # spaces; str.split() handles Unicode whitespace like the old
        # re.split(r"\s+", ..., flags=re.UNICODE) did.
        article_content = " ".join(article_content.text.split())
    else:
        article_content = "null"

    article_author = article_author.text.strip() if article_author else "null"

    return {
        'Title': article_title,
        'Date': article_date,
        'Author': article_author,
        'Views': article_views,
        'Reviews': article_reviews,
        'Content': article_content,
    }


def update_dataframe(html_content, dataframe):
    """Parse one HTML snippet, append it to the running DataFrame, export Excel.

    Args:
        html_content: Raw HTML string from the textbox.
        dataframe: Accumulated DataFrame held in the Gradio State.

    Returns:
        (updated DataFrame, path to the freshly written Excel file).
    """
    parsed_data = parse_content(html_content)
    new_row = pd.DataFrame([parsed_data])

    # Append without reusing old index labels so rows stay uniquely numbered.
    dataframe = pd.concat([dataframe, new_row], ignore_index=True)

    # Re-export the full sheet on every parse so the download link is current.
    excel_file_path = "/tmp/parsed_data.xlsx"
    dataframe.to_excel(excel_file_path, index=False)

    return dataframe, excel_file_path


# Seed an empty frame so the grid and the State share one column schema.
initial_df = pd.DataFrame(columns=list(COLUMNS))

with gr.Blocks() as interface:
    with gr.Column():
        gr.Markdown("## HTML Content Parser")

        # Input and Button
        html_input = gr.Textbox(
            label="Paste HTML Content",
            placeholder="Paste your HTML content here...",
            lines=2,
            max_lines=2
        )
        parse_button = gr.Button("Parse Content")

        # Dataframe display; Content gets the wide column.
        parsed_dataframe = gr.Dataframe(
            label="Parsed Data",
            interactive=False,
            show_label=False,
            row_count=5,
            column_widths=['10%', '10%', '10%', '10%', '10%', '50%']
        )

        # File output for download
        file_output = gr.File(label="Download Excel File")

        # Hidden state accumulating all parsed rows across clicks.
        dataframe_state = gr.State(value=initial_df)

        parse_button.click(
            fn=update_dataframe,
            inputs=[html_input, dataframe_state],
            outputs=[parsed_dataframe, file_output]
        )

if __name__ == "__main__":
    # Guarded so importing this module does not start the server.
    interface.launch()