# app.py -- last change: "Update app.py" by rodolphethinks1 (commit 1970c33, verified)
import gradio as gr
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import re
import os
# Function to parse HTML content and extract details
def parse_content(html_content):
    """Extract article fields from a pasted HTML document.

    Fields are looked up inside the first ``<div class="ArticleContentBox">``.
    Any field that cannot be found is reported as the string ``"null"``; if
    the container itself is absent, every field is ``"null"``.

    Args:
        html_content: HTML markup as a string. ``None`` or an empty string is
            treated as an empty document.

    Returns:
        A dict with the keys ``'Title'``, ``'Date'``, ``'Author'``,
        ``'Views'``, ``'Reviews'`` and ``'Content'``, all string-valued.
        ``'Date'`` is reformatted from ``YYYY.MM.DD. HH:MM`` to
        ``YYYY-MM-DD HH:MM:SS``; when the text is not in that format it is
        returned unchanged. ``'Content'`` has every run of whitespace
        collapsed to a single space.
    """
    soup = BeautifulSoup(html_content or "", "html.parser")
    container = soup.find("div", class_="ArticleContentBox")

    def text_of(tag_name, css_class):
        """Return the stripped text of the first matching tag, or None."""
        # Without the container there is nothing to search; the original
        # code crashed with AttributeError here.
        if container is None:
            return None
        tag = container.find(tag_name, {"class": css_class})
        return None if tag is None else tag.text.strip()

    title = text_of("h3", "title_text")
    raw_date = text_of("span", "date")
    raw_views = text_of("span", "count")
    reviews = text_of("strong", "num")
    raw_content = text_of("div", "se-main-container")
    author = text_of("button", "nickname")

    if raw_date is None:
        date = "null"
    else:
        try:
            parsed = datetime.strptime(raw_date, "%Y.%m.%d. %H:%M")
        except ValueError:
            # Unknown date layout: keep the raw text instead of failing
            # the whole parse.
            date = raw_date
        else:
            date = parsed.strftime("%Y-%m-%d %H:%M:%S")

    if raw_views is None:
        views = "null"
    else:
        # The text is expected to be "<label> <count>"; take the second
        # token, but tolerate a lone token or empty text.
        tokens = raw_views.split()
        if len(tokens) > 1:
            views = tokens[1]
        elif tokens:
            views = tokens[0]
        else:
            views = "null"

    if raw_content is None:
        content = "null"
    else:
        content = re.sub(r"\s+", " ", raw_content)

    return {
        'Title': "null" if title is None else title,
        'Date': date,
        'Author': "null" if author is None else author,
        'Views': views,
        'Reviews': "null" if reviews is None else reviews,
        'Content': content,
    }
# Function to update the dataframe and create an Excel file
def update_dataframe(html_content, dataframe):
    """Append the parsed article as a new row and export the table to Excel.

    Args:
        html_content: HTML markup passed straight to ``parse_content``.
        dataframe: The existing table; it is not modified in place.

    Returns:
        A ``(dataframe, path)`` tuple: the new dataframe with the parsed row
        appended, and the path of the Excel file it was written to.
    """
    output_path = "/tmp/parsed_data.xlsx"

    # One parsed article becomes a one-row frame appended to the table.
    record = parse_content(html_content)
    combined = pd.concat(
        [dataframe, pd.DataFrame([record])],
        ignore_index=True,
    )

    # The file is overwritten on every call so it always mirrors the table.
    combined.to_excel(output_path, index=False)
    return combined, output_path
# Initialize an empty dataframe (initially with no rows)
# Column order matches the keys of the dict returned by parse_content.
initial_df = pd.DataFrame(columns=['Title', 'Date', 'Author', 'Views', 'Reviews', 'Content'])


def _parse_and_store(html_content, dataframe):
    """Run update_dataframe and also return the new table for the state.

    Returns the updated dataframe twice (once for display, once for the
    gr.State component) with the Excel file path in between, matching the
    order of the click event's outputs.
    """
    updated_df, excel_path = update_dataframe(html_content, dataframe)
    return updated_df, excel_path, updated_df


with gr.Blocks() as interface:
    with gr.Column():
        gr.Markdown("## HTML Content Parser")

        # Input and Button
        html_input = gr.Textbox(
            label="Paste HTML Content",
            placeholder="Paste your HTML content here...",
            lines=2,
            max_lines=2,
        )
        parse_button = gr.Button("Parse Content")

        # Read-only table of everything parsed so far
        parsed_dataframe = gr.Dataframe(
            label="Parsed Data",
            interactive=False,
            show_label=False,  # Remove label if you don't want it to show
            row_count=5,
            column_widths=['10%', '10%', '10%', '10%', '10%', '50%'],
        )

        # File output for download
        file_output = gr.File(label="Download Excel File")

        # Hidden state to store the dataframe
        dataframe_state = gr.State(value=initial_df)

    # The state must be listed in outputs as well as inputs; otherwise it
    # keeps its initial empty value and each click replaces the previous
    # row instead of adding to it.
    parse_button.click(
        fn=_parse_and_store,
        inputs=[html_input, dataframe_state],
        outputs=[parsed_dataframe, file_output, dataframe_state],
    )

interface.launch()