Spaces:

rodolphethinks1
/

ArticleContentBox

Runtime error

App Files Files Community

ArticleContentBox / app.py

rodolphethinks1

Update app.py

1970c33 verified about 1 year ago

raw

history blame contribute delete

3.77 kB

	import gradio as gr
	from bs4 import BeautifulSoup
	from datetime import datetime
	import pandas as pd
	import re
	import os

	# Function to parse HTML content and extract details
	def parse_content(html_content):
	    """Extract article metadata and body text from a pasted HTML snippet.

	    The lookup is scoped to the ``div.ArticleContentBox`` element when the
	    snippet contains one; otherwise the whole snippet is searched, so a
	    paste that omits the wrapper no longer raises ``AttributeError``.

	    Args:
	        html_content: HTML markup as a string. ``None`` is treated as an
	            empty document.

	    Returns:
	        A dict with the keys ``'Title'``, ``'Date'``, ``'Author'``,
	        ``'Views'``, ``'Reviews'`` and ``'Content'``. Every value is a
	        string; a field whose element is not found is the literal
	        ``"null"``.
	    """
	    soup = BeautifulSoup(html_content or "", 'html.parser')
	    data = soup.find('div', class_='ArticleContentBox')
	    if data is None:
	        # Wrapper not present in the paste: search the whole document.
	        data = soup

	    def stripped_text(name, css_class):
	        """Return the stripped text of the first match, or None if absent."""
	        node = data.find(name, {"class": css_class})
	        return None if node is None else node.text.strip()

	    title_text = stripped_text("h3", "title_text")
	    date_text = stripped_text("span", "date")
	    views_text = stripped_text("span", "count")
	    reviews_text = stripped_text("strong", "num")
	    content_text = stripped_text("div", "se-main-container")
	    author_text = stripped_text("button", "nickname")

	    # Date: convert 'YYYY.MM.DD. HH:MM' to 'YYYY-MM-DD HH:MM:SS'.
	    if date_text is None:
	        article_date = "null"
	    else:
	        try:
	            parsed = datetime.strptime(date_text, '%Y.%m.%d. %H:%M')
	        except ValueError:
	            # Unexpected format: keep the raw text instead of failing the
	            # whole parse.
	            article_date = date_text or "null"
	        else:
	            article_date = parsed.strftime("%Y-%m-%d %H:%M:%S")

	    # Views: the counter text is expected to be '<label> <number>'. Take the
	    # second token when there is one, and tolerate a bare number so a
	    # single-token counter does not raise IndexError.
	    if views_text is None:
	        article_views = "null"
	    else:
	        tokens = views_text.split()
	        if len(tokens) > 1:
	            article_views = tokens[1]
	        elif tokens:
	            article_views = tokens[0]
	        else:
	            article_views = "null"

	    # Content: collapse every run of whitespace into a single space.
	    if content_text is None:
	        article_content = "null"
	    else:
	        article_content = " ".join(content_text.split())

	    return {
	        'Title': "null" if title_text is None else title_text,
	        'Date': article_date,
	        'Author': "null" if author_text is None else author_text,
	        'Views': article_views,
	        'Reviews': "null" if reviews_text is None else reviews_text,
	        'Content': article_content,
	    }

	# Function to update the dataframe and create an Excel file
	def update_dataframe(html_content, dataframe):
	    """Append the parsed article to ``dataframe`` and export it to Excel.

	    Args:
	        html_content: HTML markup to hand to ``parse_content``.
	        dataframe: The rows accumulated so far (a pandas DataFrame).

	    Returns:
	        A ``(dataframe, excel_file_path)`` tuple: the new DataFrame with the
	        parsed row appended, and the path of the ``.xlsx`` file that was
	        just written with its contents.
	    """
	    # Local import keeps this function self-contained; the file's import
	    # block does not bring in tempfile.
	    import tempfile

	    # Parse the content and wrap it as a one-row frame.
	    new_row = pd.DataFrame([parse_content(html_content)])

	    # pd.concat builds a new frame; the caller's object is not mutated.
	    dataframe = pd.concat([dataframe, new_row], ignore_index=True)

	    # Write into the platform temp directory rather than a hard-coded
	    # '/tmp', which does not exist on every OS.
	    excel_file_path = os.path.join(tempfile.gettempdir(), "parsed_data.xlsx")
	    dataframe.to_excel(excel_file_path, index=False)

	    # Return the updated dataframe and the path of the Excel file for download.
	    return dataframe, excel_file_path

	# Empty dataframe (columns only, no rows) used as the initial session state.
	initial_df = pd.DataFrame(columns=['Title', 'Date', 'Author', 'Views', 'Reviews', 'Content'])


	def _parse_and_store(html_content, dataframe):
	    """Run ``update_dataframe`` and also return the table for the state.

	    The click event needs three outputs (table display, download file,
	    stored state). Without writing the updated table back into the state,
	    every click would start again from the empty initial dataframe and
	    rows would never accumulate.
	    """
	    updated, excel_file_path = update_dataframe(html_content, dataframe)
	    return updated, excel_file_path, updated


	with gr.Blocks() as interface:
	    with gr.Column():
	        gr.Markdown("## HTML Content Parser")

	        # Input and Button
	        html_input = gr.Textbox(
	            label="Paste HTML Content",
	            placeholder="Paste your HTML content here...",
	            lines=2,
	            max_lines=2
	        )
	        parse_button = gr.Button("Parse Content")

	        # Read-only table showing the rows parsed so far.
	        parsed_dataframe = gr.Dataframe(
	            label="Parsed Data",
	            interactive=False,
	            show_label=False,  # Hide the component label
	            # NOTE(review): row_count is passed as 5; whether this caps the
	            # displayed rows depends on the installed Gradio version -- confirm.
	            row_count=5,
	            column_widths=['10%', '10%', '10%', '10%', '10%', '50%']
	        )

	        # File output for download
	        file_output = gr.File(label="Download Excel File")

	        # Hidden state holding the accumulated dataframe
	        dataframe_state = gr.State(value=initial_df)

	        # On click: parse, refresh the table and the download, and write
	        # the updated dataframe back into the state so the next click
	        # appends to it.
	        parse_button.click(
	            fn=_parse_and_store,
	            inputs=[html_input, dataframe_state],
	            outputs=[parsed_dataframe, file_output, dataframe_state]
	        )

	interface.launch()