Spaces:

abdulllah01
/

articles

Sleeping

App Files Files Community

abdulllah01 committed on Jan 7, 2025

Commit

36702ae

verified ·

1 Parent(s): f2bcb62

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -42

app.py CHANGED Viewed

@@ -1,12 +1,22 @@
-import streamlit as st
 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 from io import BytesIO
 def extract_article_info(url):
     try:
-        response = requests.get(url)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
@@ -19,15 +29,15 @@ def extract_article_info(url):
         if meta_tag and meta_tag.get('content'):
             meta_description = meta_tag['content']
-        # Extract heading
         heading = soup.find('h1').get_text(strip=True) if soup.find('h1') else None
-        # Extract subheadings
         subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]
         # Extract all text from <p> tags and add two breaks between paragraphs
         all_paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
-        article_text = "\n\n".join(all_paragraphs)
         # Combine heading and subheadings with article text
         full_article_text = f"{heading}\n\n" if heading else ""
@@ -42,43 +52,59 @@ def extract_article_info(url):
     except Exception as e:
         return f"Error processing the content: {e}"
-def process_excel(file):
-    # Read the uploaded Excel file
-    df = pd.read_excel(file)
-    if 'URL' in df.columns:
-        # Apply extract_article_info to each URL and store the result in a new column
-        df['Article Text'] = df['URL'].apply(extract_article_info)
-        # Save the updated DataFrame to a BytesIO object to prepare it for download
-        output = BytesIO()
-        df.to_excel(output, index=False)
-        output.seek(0)
-        return output
     else:
-        return None
-def main():
-    st.title("Excel URL Processor")
-    st.markdown("Upload an Excel file with a column named 'URL' to extract article information.")
-    # Upload Excel file
-    uploaded_file = st.file_uploader("Choose an Excel file", type=["xlsx"])
-    if uploaded_file:
-        # Process the file
-        processed_file = process_excel(uploaded_file)
-        if processed_file:
-            st.success("File processed successfully!")
-            st.download_button(
-                label="Download Modified Excel File",
-                data=processed_file,
-                file_name="updated_file.xlsx",
-                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-            )
-        else:
-            st.error("The uploaded file does not contain a column named 'URL'.")
-if __name__ == "__main__":
-    main()

 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+import streamlit as st
 from io import BytesIO
 def extract_article_info(url):
+    """
+    Extracts meta title, meta description, heading, subheadings, and all text in <p> tags from a blog post URL.
+    Args:
+        url (str): The URL of the blog post.
+    Returns:
+        str: A string containing the extracted information.
+    """
     try:
+        # Fetch the HTML content of the URL
+        response = requests.get(url, timeout=10)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
         if meta_tag and meta_tag.get('content'):
             meta_description = meta_tag['content']
+        # Extract heading (Assuming <h1> is used for the main heading)
         heading = soup.find('h1').get_text(strip=True) if soup.find('h1') else None
+        # Extract subheadings (Assuming <h2> tags are used for subheadings)
         subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]
         # Extract all text from <p> tags and add two breaks between paragraphs
         all_paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
+        article_text = "\n\n".join(all_paragraphs)  # Add two breaks between paragraphs
         # Combine heading and subheadings with article text
         full_article_text = f"{heading}\n\n" if heading else ""
     except Exception as e:
         return f"Error processing the content: {e}"
+def process_file(uploaded_file):
+    # Load the Excel file
+    df = pd.read_excel(uploaded_file)
+    # Check if 'URL' column exists
+    if 'URL' not in df.columns:
+        return None, "The 'URL' column is missing from the Excel file."
+    # List to hold results
+    results = []
+    # Use ThreadPoolExecutor for parallel processing
+    with ThreadPoolExecutor() as executor:
+        # Submit tasks to the executor
+        future_to_url = {executor.submit(extract_article_info, url): url for url in df['URL']}
+        for future in as_completed(future_to_url):
+            url = future_to_url[future]
+            try:
+                # Append the result to the results list
+                results.append(future.result())
+            except Exception as e:
+                # Handle exceptions during execution
+                results.append(f"Error processing the URL {url}: {e}")
+    # Add the results to a new column in the DataFrame
+    df['Article Text'] = results
+    # Save the updated DataFrame to a BytesIO object
+    output = BytesIO()
+    df.to_excel(output, index=False, engine='openpyxl')
+    output.seek(0)
+    return output, None
+# Streamlit App
+st.title("Web Article Extractor")
+st.markdown("Upload an Excel file with a column named 'URL' containing the links to process.")
+# File upload
+uploaded_file = st.file_uploader("Upload Excel file", type=["xlsx"])
+if uploaded_file is not None:
+    with st.spinner("Processing your file..."):
+        output, error = process_file(uploaded_file)
+    if error:
+        st.error(error)
     else:
+        st.success("File processed successfully!")
+        st.download_button(
+            label="Download Processed File",
+            data=output,
+            file_name="processed_file.xlsx",
+            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+        )