import gradio as gr
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import re
import os

# Column order shared by the parser output, the seed DataFrame, and the UI grid.
COLUMNS = ('Title', 'Date', 'Author', 'Views', 'Reviews', 'Content')


def parse_content(html_content):
    """Parse an article HTML snippet into a flat dict of strings.

    Looks for a ``div.ArticleContentBox`` container (Naver-cafe-style markup —
    assumed from the class names; confirm against the actual pages) and pulls
    out title, date, author, view count, review count, and body text.  Any
    piece that cannot be found or parsed degrades to the string ``"null"``
    rather than raising.

    Args:
        html_content: Raw HTML string pasted by the user.

    Returns:
        dict with keys Title, Date, Author, Views, Reviews, Content
        (all string values).
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    data = soup.find('div', class_='ArticleContentBox')
    if data is None:
        # No recognizable article container: return a full "null" row instead
        # of crashing with AttributeError on data.find(...) below.
        return {key: "null" for key in COLUMNS}

    article_title = data.find("h3", {"class": "title_text"})
    article_date = data.find("span", {"class": "date"})
    article_views = data.find("span", {"class": "count"})
    article_reviews = data.find("strong", {"class": "num"})
    article_content = data.find("div", {"class": "se-main-container"})
    article_author = data.find("button", {"class": "nickname"})

    article_title = article_title.text.strip() if article_title else "null"

    if article_date:
        article_date_str = article_date.text.strip()
        try:
            # Site format looks like "2024.01.31. 12:34"; normalize to ISO-ish.
            article_date_obj = datetime.strptime(article_date_str, '%Y.%m.%d. %H:%M')
            article_date = article_date_obj.strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            # Unexpected date layout: keep the raw text rather than crash.
            article_date = article_date_str
    else:
        article_date = "null"

    if article_views:
        # Count text is expected to look like "<label> <number>"; guard the
        # index so a label-less count does not raise IndexError.
        parts = article_views.text.strip().split(' ')
        article_views = parts[1] if len(parts) > 1 else parts[0]
    else:
        article_views = "null"

    article_reviews = article_reviews.text.strip() if article_reviews else "null"

    if article_content:
        # Collapse all runs of whitespace (including newlines) to single
        # spaces; str.split() handles Unicode whitespace like the old
        # re.split(r"\s+", ..., flags=re.UNICODE) did.
        article_content = " ".join(article_content.text.split())
    else:
        article_content = "null"

    article_author = article_author.text.strip() if article_author else "null"

    return {
        'Title': article_title,
        'Date': article_date,
        'Author': article_author,
        'Views': article_views,
        'Reviews': article_reviews,
        'Content': article_content,
    }


def update_dataframe(html_content, dataframe):
    """Parse one HTML snippet, append it to the running DataFrame, export Excel.

    Args:
        html_content: Raw HTML string from the textbox.
        dataframe: Accumulated DataFrame held in the Gradio State.

    Returns:
        (updated DataFrame, path to the freshly written Excel file).
    """
    parsed_data = parse_content(html_content)
    new_row = pd.DataFrame([parsed_data])

    # Append without reusing old index labels so rows stay uniquely numbered.
    dataframe = pd.concat([dataframe, new_row], ignore_index=True)

    # Re-export the full sheet on every parse so the download link is current.
    excel_file_path = "/tmp/parsed_data.xlsx"
    dataframe.to_excel(excel_file_path, index=False)

    return dataframe, excel_file_path


# Seed an empty frame so the grid and the State share one column schema.
initial_df = pd.DataFrame(columns=list(COLUMNS))

with gr.Blocks() as interface:
    with gr.Column():
        gr.Markdown("## HTML Content Parser")

        # Input and Button
        html_input = gr.Textbox(
            label="Paste HTML Content",
            placeholder="Paste your HTML content here...",
            lines=2,
            max_lines=2
        )
        parse_button = gr.Button("Parse Content")

        # Dataframe display; Content gets the wide column.
        parsed_dataframe = gr.Dataframe(
            label="Parsed Data",
            interactive=False,
            show_label=False,
            row_count=5,
            column_widths=['10%', '10%', '10%', '10%', '10%', '50%']
        )

        # File output for download
        file_output = gr.File(label="Download Excel File")

        # Hidden state accumulating all parsed rows across clicks.
        dataframe_state = gr.State(value=initial_df)

        parse_button.click(
            fn=update_dataframe,
            inputs=[html_input, dataframe_state],
            outputs=[parsed_dataframe, file_output]
        )

if __name__ == "__main__":
    # Guarded so importing this module does not start the server.
    interface.launch()