Spaces:

rodolphethinks1
/

ArticleContentBox

Runtime error

App Files Files Community

rodolphethinks1 committed on Jan 24, 2025

Commit

b811d74

verified ·

1 Parent(s): ff15a4d

Create app.py

Browse files

Files changed (1) hide show

app.py +64 -0

app.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import gradio as gr
+from bs4 import BeautifulSoup
+from datetime import datetime
+import pandas as pd
+import re
+def parse_content(html_content):
+    # Parse the HTML using BeautifulSoup
+    soup = BeautifulSoup(html_content, 'html.parser')
+    data = soup.find('div', class_='ArticleContentBox')
+    # Extract information
+    article_title = data.find("h3", {"class": "title_text"})
+    article_date = data.find("span", {"class": "date"})
+    article_views = data.find("span", {"class": "count"})
+    article_reviews = data.find("strong", {"class": "num"})
+    article_content = data.find("div", {"class": "se-main-container"})
+    article_author = data.find("button", {"class": "nickname"})
+    # Handle missing data
+    article_title = article_title.text.strip() if article_title else "null"
+    if article_date:
+        article_date_str = article_date.text.strip()
+        article_date_obj = datetime.strptime(article_date_str, '%Y.%m.%d. %H:%M')
+        article_date = article_date_obj.strftime("%Y-%m-%d %H:%M:%S")
+    else:
+        article_date = "null"
+    article_views = article_views.text.strip().split(' ')[1] if article_views else "null"
+    article_reviews = article_reviews.text.strip() if article_reviews else "null"
+    if article_content:
+        article_content = article_content.text.strip()
+        article_content = " ".join(re.split("\s+", article_content, flags=re.UNICODE))
+    else:
+        article_content = "null"
+    article_author = article_author.text.strip() if article_author else "null"
+    # Create a DataFrame
+    parsed_data = {
+        'Title': article_title,
+        'Date': article_date,
+        'Author': article_author,
+        'Views': article_views,
+        'Reviews': article_reviews,
+        'Content': article_content,
+    }
+    df = pd.DataFrame([parsed_data])
+    return df
+# Define the Gradio interface
+def process_input(html_content):
+    try:
+        df = parse_content(html_content)
+        return df
+    except Exception as e:
+        return f"Error: {e}"
+interface = gr.Interface(
+    fn=process_input,
+    inputs=gr.Textbox(label="Paste HTML Content", lines=10, placeholder="Paste your HTML content here..."),
+    outputs=gr.Dataframe(label="Parsed Data")
+)
+# Launch the app
+interface.launch()