Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from bs4 import BeautifulSoup | |
| from datetime import datetime | |
| import pandas as pd | |
| import re | |
| import os | |
# Function to parse HTML content and extract details
def _stripped_text(element):
    """Return the element's stripped text, or "null" when the element is absent."""
    return element.text.strip() if element is not None else "null"


def _normalize_date(raw_date):
    """Convert 'YYYY.MM.DD. HH:MM' to 'YYYY-MM-DD HH:MM:SS'.

    Text that does not match the expected format is returned unchanged so a
    single odd timestamp does not abort the whole parse.
    """
    try:
        parsed = datetime.strptime(raw_date, '%Y.%m.%d. %H:%M')
    except ValueError:
        return raw_date
    return parsed.strftime("%Y-%m-%d %H:%M:%S")


def _second_token(text):
    """Return the second whitespace-separated token of *text*.

    Falls back to the whole text when there is no second token, instead of
    raising IndexError.
    """
    tokens = text.split()
    return tokens[1] if len(tokens) > 1 else text


def parse_content(html_content):
    """Extract article details from an HTML string.

    Elements are looked up inside the ``div`` with class ``ArticleContentBox``.
    Every field whose element cannot be found is reported as the string
    ``"null"``; if the container itself is missing, all fields are ``"null"``.

    Args:
        html_content: HTML markup as a string. ``None`` is treated as empty.

    Returns:
        A dict with the string-valued keys 'Title', 'Date', 'Author',
        'Views', 'Reviews' and 'Content'.
    """
    soup = BeautifulSoup(html_content or "", 'html.parser')
    box = soup.find('div', class_='ArticleContentBox')
    if box is None:
        # Without the container none of the fields can be located.
        return {
            'Title': "null",
            'Date': "null",
            'Author': "null",
            'Views': "null",
            'Reviews': "null",
            'Content': "null",
        }

    # Locate the individual elements
    title_el = box.find("h3", {"class": "title_text"})
    date_el = box.find("span", {"class": "date"})
    views_el = box.find("span", {"class": "count"})
    reviews_el = box.find("strong", {"class": "num"})
    content_el = box.find("div", {"class": "se-main-container"})
    author_el = box.find("button", {"class": "nickname"})

    if date_el is not None:
        article_date = _normalize_date(date_el.text.strip())
    else:
        article_date = "null"

    if views_el is not None:
        # The count is taken from the second token of the element text.
        article_views = _second_token(views_el.text.strip())
    else:
        article_views = "null"

    if content_el is not None:
        # Collapse every run of whitespace (including newlines) to one space.
        article_content = re.sub(r"\s+", " ", content_el.text.strip())
    else:
        article_content = "null"

    return {
        'Title': _stripped_text(title_el),
        'Date': article_date,
        'Author': _stripped_text(author_el),
        'Views': article_views,
        'Reviews': _stripped_text(reviews_el),
        'Content': article_content,
    }
# Function to update the dataframe and create an Excel file
def update_dataframe(html_content, dataframe):
    """Append the parsed article as a new row and export the table to Excel.

    Args:
        html_content: HTML markup handed to ``parse_content``.
        dataframe: The rows collected so far. ``None`` is treated as
            "no rows yet".

    Returns:
        A ``(dataframe, excel_file_path)`` tuple: the table including the new
        row, and the path of the ``.xlsx`` file written from that table.
    """
    import tempfile  # local import: only this function needs it

    # Parse the content and create a new one-row frame
    new_row = pd.DataFrame([parse_content(html_content)])

    # Concatenate the new row with the existing dataframe
    if dataframe is None:
        dataframe = new_row
    else:
        dataframe = pd.concat([dataframe, new_row], ignore_index=True)

    # Write into a fresh directory on every call so that concurrent callers
    # cannot overwrite each other's export, while the file name offered for
    # download stays "parsed_data.xlsx". The directory is not removed here.
    excel_file_path = os.path.join(tempfile.mkdtemp(), "parsed_data.xlsx")
    dataframe.to_excel(excel_file_path, index=False)

    # Return the updated dataframe and the path of the Excel file for download
    return dataframe, excel_file_path
# Initialize an empty dataframe (initially with no rows)
initial_df = pd.DataFrame(columns=['Title', 'Date', 'Author', 'Views', 'Reviews', 'Content'])


def _parse_and_remember(html_content, dataframe):
    """Run ``update_dataframe`` and return the table twice.

    The first copy feeds the visible table, the second is written back to the
    session state so the next click appends to it instead of starting from
    the empty frame again.
    """
    updated, excel_file_path = update_dataframe(html_content, dataframe)
    return updated, updated, excel_file_path


with gr.Blocks() as interface:
    with gr.Column():
        gr.Markdown("## HTML Content Parser")

        # Input and Button
        html_input = gr.Textbox(
            label="Paste HTML Content",
            placeholder="Paste your HTML content here...",
            lines=2,
            max_lines=2
        )
        parse_button = gr.Button("Parse Content")

        # Read-only table showing the rows parsed so far
        parsed_dataframe = gr.Dataframe(
            label="Parsed Data",
            interactive=False,
            show_label=False,  # Remove label if you don't want it to show
            # NOTE(review): intended to limit the display to 5 rows; confirm
            # against the Gradio docs that row_count has that effect.
            row_count=5,
            column_widths=['10%', '10%', '10%', '10%', '10%', '50%']
        )

        # File output for download
        file_output = gr.File(label="Download Excel File")

        # Hidden state to store the dataframe
        dataframe_state = gr.State(value=initial_df)

        # Define the button click event. The state must be listed in
        # `outputs` as well, otherwise it keeps its initial value and every
        # click would produce a table with only the latest row.
        parse_button.click(
            fn=_parse_and_remember,
            inputs=[html_input, dataframe_state],
            outputs=[parsed_dataframe, dataframe_state, file_output]
        )

interface.launch()