rodolphethinks1 committed on
Commit
b811d74
·
verified ·
1 Parent(s): ff15a4d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -0
app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from bs4 import BeautifulSoup
3
+ from datetime import datetime
4
+ import pandas as pd
5
+ import re
6
+
7
def parse_content(html_content):
    """Extract article metadata and body text from a pasted HTML page.

    The function looks up the ``div.ArticleContentBox`` container and reads
    the title, date, view count, review count, body text and author from
    inside it. A field whose element is absent is reported as the string
    ``"null"``.

    Args:
        html_content: Raw HTML markup as a string.

    Returns:
        A single-row ``pandas.DataFrame`` with the columns ``Title``,
        ``Date``, ``Author``, ``Views``, ``Reviews`` and ``Content``.

    Raises:
        ValueError: If the markup contains no ``div.ArticleContentBox``
            element, so there is nothing to extract from.
    """
    # Parse the HTML using BeautifulSoup (treat a missing value as empty markup)
    soup = BeautifulSoup(html_content or "", 'html.parser')
    data = soup.find('div', class_='ArticleContentBox')
    if data is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(
            "No <div class=\"ArticleContentBox\"> element found in the HTML content"
        )

    def _text(tag_name, css_class):
        """Return the stripped text of the first matching tag, or None."""
        node = data.find(tag_name, {"class": css_class})
        return node.text.strip() if node is not None else None

    # Extract information
    title_text = _text("h3", "title_text")
    date_text = _text("span", "date")
    views_text = _text("span", "count")
    reviews_text = _text("strong", "num")
    content_text = _text("div", "se-main-container")
    author_text = _text("button", "nickname")

    # Handle missing data
    article_title = title_text if title_text is not None else "null"
    article_reviews = reviews_text if reviews_text is not None else "null"
    article_author = author_text if author_text is not None else "null"

    if date_text is None:
        article_date = "null"
    else:
        try:
            parsed_date = datetime.strptime(date_text, '%Y.%m.%d. %H:%M')
            article_date = parsed_date.strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            # Unexpected date layout: keep the raw text rather than
            # discarding every other field of the article.
            article_date = date_text

    # The count element is expected to read "<label> <number>"; take the
    # second whitespace-separated token, tolerating repeated spaces and
    # text that has no label at all.
    view_tokens = views_text.split() if views_text is not None else []
    if len(view_tokens) > 1:
        article_views = view_tokens[1]
    elif view_tokens:
        article_views = view_tokens[0]
    else:
        article_views = "null"

    if content_text is None:
        article_content = "null"
    else:
        # Collapse every run of whitespace (including newlines) to one space.
        article_content = re.sub(r"\s+", " ", content_text)

    # Create a DataFrame
    parsed_data = {
        'Title': article_title,
        'Date': article_date,
        'Author': article_author,
        'Views': article_views,
        'Reviews': article_reviews,
        'Content': article_content,
    }
    return pd.DataFrame([parsed_data])
48
+
49
+ # Define the Gradio interface
50
def process_input(html_content):
    """Gradio callback: parse pasted HTML and return a table for display.

    Args:
        html_content: Raw HTML markup pasted into the input textbox.

    Returns:
        The single-row ``pandas.DataFrame`` produced by ``parse_content``.
        If parsing fails, a one-row DataFrame with a single ``Error`` column
        holding the failure message is returned instead, so the Dataframe
        output component always receives tabular data.
    """
    try:
        return parse_content(html_content)
    except Exception as e:
        # UI boundary: surface the failure in the output table. A bare string
        # is not valid tabular data for the Dataframe output component.
        return pd.DataFrame([{'Error': str(e)}])
56
+
57
# Input widget: a multi-line textbox for the pasted HTML
html_box = gr.Textbox(
    label="Paste HTML Content",
    lines=10,
    placeholder="Paste your HTML content here...",
)

# Output widget: a table showing the parsed fields
result_table = gr.Dataframe(label="Parsed Data")

# Wire the parsing callback to the widgets
interface = gr.Interface(fn=process_input, inputs=html_box, outputs=result_table)

# Launch the app
interface.launch()