# Spaces: Runtime error — File size: 3,770 Bytes (page header captured with the source)
import gradio as gr
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import re
import os
# Function to parse HTML content and extract details
def _text_or_null(tag):
    """Return the stripped text of *tag*, or the string "null" when *tag* is None."""
    return "null" if tag is None else tag.text.strip()


def parse_content(html_content):
    """Extract article fields from an HTML snippet.

    The fields are looked up inside the first ``<div class="ArticleContentBox">``
    element of the document.

    Args:
        html_content: HTML markup as a string. ``None`` is treated as an
            empty document.

    Returns:
        A dict with the keys ``'Title'``, ``'Date'``, ``'Author'``,
        ``'Views'``, ``'Reviews'`` and ``'Content'``. Every value is a string;
        a field whose element cannot be found is the literal string
        ``"null"``. If the ``ArticleContentBox`` wrapper itself is missing,
        all six values are ``"null"``.
    """
    columns = ('Title', 'Date', 'Author', 'Views', 'Reviews', 'Content')

    soup = BeautifulSoup(html_content or "", 'html.parser')
    data = soup.find('div', class_='ArticleContentBox')
    if data is None:
        # find() returns None when nothing matches; without this guard every
        # lookup below would raise AttributeError on pasted HTML that lacks
        # the wrapper element.
        return {column: "null" for column in columns}

    # Locate the elements that carry each field.
    title_tag = data.find("h3", {"class": "title_text"})
    date_tag = data.find("span", {"class": "date"})
    views_tag = data.find("span", {"class": "count"})
    reviews_tag = data.find("strong", {"class": "num"})
    content_tag = data.find("div", {"class": "se-main-container"})
    author_tag = data.find("button", {"class": "nickname"})

    # Date: convert 'YYYY.MM.DD. HH:MM' to 'YYYY-MM-DD HH:MM:SS'.
    if date_tag is None:
        article_date = "null"
    else:
        raw_date = date_tag.text.strip()
        try:
            parsed_date = datetime.strptime(raw_date, '%Y.%m.%d. %H:%M')
        except ValueError:
            # Unexpected layout: keep the raw text instead of losing the row.
            article_date = raw_date
        else:
            article_date = parsed_date.strftime("%Y-%m-%d %H:%M:%S")

    # Views: the second whitespace-separated token is used (presumably the
    # text is a label followed by the number -- confirm against real pages).
    # A single-token text is kept as-is instead of raising IndexError.
    if views_tag is None:
        article_views = "null"
    else:
        view_tokens = views_tag.text.split()
        if len(view_tokens) > 1:
            article_views = view_tokens[1]
        elif view_tokens:
            article_views = view_tokens[0]
        else:
            article_views = "null"

    # Content: collapse every run of whitespace into a single space.
    if content_tag is None:
        article_content = "null"
    else:
        article_content = re.sub(r"\s+", " ", content_tag.text.strip())

    return {
        'Title': _text_or_null(title_tag),
        'Date': article_date,
        'Author': _text_or_null(author_tag),
        'Views': article_views,
        'Reviews': _text_or_null(reviews_tag),
        'Content': article_content,
    }
# Function to update the dataframe and create an Excel file
def update_dataframe(html_content, dataframe):
    """Append the parsed article to *dataframe* and export the result to Excel.

    Args:
        html_content: HTML markup handed to ``parse_content``.
        dataframe: The DataFrame holding the rows collected so far.

    Returns:
        A ``(dataframe, path)`` tuple: the DataFrame with the new row appended
        (a new object; the index is renumbered) and the path of the Excel file
        that was just written.
    """
    excel_file_path = "/tmp/parsed_data.xlsx"

    # One-row frame built from the parsed fields.
    row_frame = pd.DataFrame([parse_content(html_content)])

    # Stack the existing rows and the new one under a fresh 0..n-1 index.
    combined = pd.concat([dataframe, row_frame], ignore_index=True)

    # Overwrite the export file with the full, updated table.
    combined.to_excel(excel_file_path, index=False)

    return combined, excel_file_path
# Initialize an empty dataframe (initially with no rows)
initial_df = pd.DataFrame(columns=['Title', 'Date', 'Author', 'Views', 'Reviews', 'Content'])


def _parse_and_store(html_content, dataframe):
    """Run ``update_dataframe`` and also hand the new frame back to the state.

    Returns ``(dataframe, excel_path, dataframe)``: the first copy feeds the
    visible table, the last one is written to the ``gr.State`` so the next
    click appends to the accumulated rows instead of the empty initial frame.
    """
    updated_df, excel_path = update_dataframe(html_content, dataframe)
    return updated_df, excel_path, updated_df


with gr.Blocks() as interface:
    with gr.Column():
        gr.Markdown("## HTML Content Parser")

        # Input and button
        html_input = gr.Textbox(
            label="Paste HTML Content",
            placeholder="Paste your HTML content here...",
            lines=2,
            max_lines=2,
        )
        parse_button = gr.Button("Parse Content")

        # Read-only table of everything parsed so far
        parsed_dataframe = gr.Dataframe(
            label="Parsed Data",
            interactive=False,
            show_label=False,  # Remove label if you don't want it to show
            row_count=5,
            column_widths=['10%', '10%', '10%', '10%', '10%', '50%'],
        )

        # File output for download
        file_output = gr.File(label="Download Excel File")

    # Hidden state holding the accumulated dataframe
    dataframe_state = gr.State(value=initial_df)

    # The state must be listed in `outputs` as well as `inputs`; otherwise it
    # keeps its initial (empty) value and every click starts from scratch.
    parse_button.click(
        fn=_parse_and_store,
        inputs=[html_input, dataframe_state],
        outputs=[parsed_dataframe, file_output, dataframe_state],
    )

interface.launch()
|