File size: 3,770 Bytes
b811d74
 
 
 
 
e21bfa3
b811d74
075ec1e
b811d74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
075ec1e
b811d74
 
 
 
 
 
 
 
e21bfa3
075ec1e
7b3f563
075ec1e
0fdd412
0873404
 
0fdd412
7b3f563
e21bfa3
 
 
 
 
 
075ec1e
0873404
 
 
592392b
 
 
075ec1e
 
8cd8cd6
 
 
1970c33
 
8cd8cd6
27371a9
075ec1e
773783d
d019187
 
 
773783d
0873404
e21bfa3
d019187
e21bfa3
 
 
592392b
0873404
 
 
075ec1e
 
 
 
e21bfa3
075ec1e
b811d74
592392b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import re
import tempfile
from datetime import datetime

import gradio as gr
import pandas as pd
from bs4 import BeautifulSoup

# Function to parse HTML content and extract details
def parse_content(html_content):
    """Parse one article's HTML and extract its metadata and body text.

    Looks for a <div class="ArticleContentBox"> container (Naver-cafe-style
    markup, judging by the class names — TODO confirm against real pages)
    and pulls out title, date, view count, review count, body and author.

    Parameters:
        html_content (str): Raw HTML of a single article page/fragment.

    Returns:
        dict: Keys 'Title', 'Date', 'Author', 'Views', 'Reviews', 'Content'.
              Any field that is missing or unparsable is the string "null"
              (kept as a string sentinel so the downstream dataframe/Excel
              export stays uniform).
    """
    fields = ('Title', 'Date', 'Author', 'Views', 'Reviews', 'Content')

    soup = BeautifulSoup(html_content, 'html.parser')
    data = soup.find('div', class_='ArticleContentBox')
    if data is None:
        # No article container at all: return an all-"null" row instead of
        # crashing with AttributeError on data.find(...) below.
        return dict.fromkeys(fields, "null")

    # Extract information (each is None when the element is absent)
    article_title = data.find("h3", {"class": "title_text"})
    article_date = data.find("span", {"class": "date"})
    article_views = data.find("span", {"class": "count"})
    article_reviews = data.find("strong", {"class": "num"})
    article_content = data.find("div", {"class": "se-main-container"})
    article_author = data.find("button", {"class": "nickname"})

    # Handle missing data
    article_title = article_title.text.strip() if article_title else "null"

    if article_date:
        article_date_str = article_date.text.strip()
        try:
            # Source format e.g. "2024.01.31. 12:34" -> "2024-01-31 12:34:00"
            article_date_obj = datetime.strptime(article_date_str, '%Y.%m.%d. %H:%M')
            article_date = article_date_obj.strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            # Unexpected date layout: keep the raw text rather than crash.
            article_date = article_date_str
    else:
        article_date = "null"

    if article_views:
        # Element text looks like "<label> <number>"; take the number part,
        # guarding against a missing separator (old code raised IndexError).
        parts = article_views.text.strip().split(' ')
        article_views = parts[1] if len(parts) > 1 else "null"
    else:
        article_views = "null"

    article_reviews = article_reviews.text.strip() if article_reviews else "null"

    if article_content:
        # Collapse every run of (Unicode) whitespace to a single space;
        # str.split() with no args is equivalent to the old
        # re.split(r"\s+", ...) on stripped text, without the regex.
        article_content = " ".join(article_content.text.split())
    else:
        article_content = "null"

    article_author = article_author.text.strip() if article_author else "null"

    return {
        'Title': article_title,
        'Date': article_date,
        'Author': article_author,
        'Views': article_views,
        'Reviews': article_reviews,
        'Content': article_content,
    }

# Function to update the dataframe and create an Excel file
def update_dataframe(html_content, dataframe):
    """Parse *html_content*, append it as a new row, and export to Excel.

    Parameters:
        html_content (str): Raw HTML of one article, forwarded to
            parse_content().
        dataframe (pd.DataFrame): Rows accumulated so far. Not mutated in
            place — pd.concat returns a new frame.

    Returns:
        tuple[pd.DataFrame, str]: The extended dataframe and the path of
        the .xlsx file written for download.
    """
    # Parse the content and create a new row
    parsed_data = parse_content(html_content)
    new_row = pd.DataFrame([parsed_data])

    # Concatenate the new row with the existing dataframe
    dataframe = pd.concat([dataframe, new_row], ignore_index=True)

    # Save under the platform temp directory instead of a hard-coded
    # "/tmp" so the app also works on Windows/macOS (same path on POSIX).
    excel_file_path = os.path.join(tempfile.gettempdir(), "parsed_data.xlsx")
    dataframe.to_excel(excel_file_path, index=False)

    # Return the updated dataframe and the path of the Excel file for download
    return dataframe, excel_file_path

# Initialize an empty dataframe (initially with no rows); the column order
# must match the dict keys returned by parse_content().
initial_df = pd.DataFrame(columns=['Title', 'Date', 'Author', 'Views', 'Reviews', 'Content'])


def _parse_and_accumulate(html_content, dataframe):
    """Click handler: parse, append, and write the state back.

    update_dataframe() returns (new_df, excel_path). A gr.State component
    only updates when it is also listed in `outputs`, so the new dataframe
    is returned twice — once for the display and once for the state.
    Without this, every click restarted from the empty initial_df and rows
    never actually accumulated.
    """
    new_df, excel_path = update_dataframe(html_content, dataframe)
    return new_df, excel_path, new_df


with gr.Blocks() as interface:
    with gr.Column():
        gr.Markdown("## HTML Content Parser")

        # Input and Button
        html_input = gr.Textbox(
            label="Paste HTML Content",
            placeholder="Paste your HTML content here...",
            lines=2,
            max_lines=2
        )
        parse_button = gr.Button("Parse Content")

        # Dataframe display with horizontal scrolling allowed and vertical scrolling disabled
        parsed_dataframe = gr.Dataframe(
            label="Parsed Data",
            interactive=False,
            show_label=False,  # Remove label if you don't want it to show
            row_count=5,  # Limit the display to 5 rows
            column_widths=['10%', '10%', '10%', '10%', '10%', '50%']
        )

        # File output for download
        file_output = gr.File(label="Download Excel File")

        # Hidden per-session state holding the accumulated dataframe
        dataframe_state = gr.State(value=initial_df)

    # Define the button click event. dataframe_state appears in BOTH
    # inputs and outputs so the accumulated rows round-trip between clicks.
    parse_button.click(
        fn=_parse_and_accumulate,
        inputs=[html_input, dataframe_state],
        outputs=[parsed_dataframe, file_output, dataframe_state]
    )

interface.launch()