# NOTE: the lines "Spaces:" / "Sleeping" here were Hugging Face Spaces page
# residue (the Space's sleep status), not program text — kept only as a comment.
| import pandas as pd | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from tqdm import tqdm | |
| import streamlit as st | |
| from io import BytesIO | |
def extract_article_info(url):
    """
    Extract meta title, meta description, main heading, subheadings, and all
    <p> text from a blog post URL.

    Args:
        url (str): The URL of the blog post.

    Returns:
        str: The extracted pieces joined by blank lines, or an error message
        string if the URL cannot be fetched or parsed.
    """
    try:
        # Fetch the HTML content of the URL (bounded wait so a dead host
        # cannot hang a worker thread indefinitely)
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract meta title (single lookup instead of calling find() twice)
        title_tag = soup.find('title')
        meta_title = title_tag.get_text(strip=True) if title_tag else None

        # Extract meta description
        meta_description = None
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag and meta_tag.get('content'):
            meta_description = meta_tag['content']

        # Extract heading (assuming <h1> is used for the main heading)
        h1_tag = soup.find('h1')
        heading = h1_tag.get_text(strip=True) if h1_tag else None

        # Extract subheadings (assuming <h2> tags are used for subheadings)
        subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]

        # Extract all text from <p> tags, two breaks between paragraphs
        article_text = "\n\n".join(p.get_text(strip=True) for p in soup.find_all('p'))

        # BUG FIX: meta_title and meta_description were previously extracted
        # but never included in the output (dead variables), despite the
        # docstring promising them. Include every non-empty piece, separated
        # by blank lines, in document order: title, description, heading,
        # subheadings, then paragraph text.
        pieces = [meta_title, meta_description, heading, *subheadings, article_text]
        return "\n\n".join(piece for piece in pieces if piece)
    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"
    except Exception as e:
        return f"Error processing the content: {e}"
def process_file(uploaded_file):
    """
    Read an Excel file with a 'URL' column, scrape each URL in parallel, and
    return an in-memory Excel file with a new 'Article Text' column.

    Args:
        uploaded_file: File-like object containing .xlsx data (as provided by
            Streamlit's file uploader).

    Returns:
        tuple: (BytesIO, None) with the processed workbook on success, or
        (None, str) with an error message on failure.
    """
    # Load the Excel file
    df = pd.read_excel(uploaded_file)

    # Check if 'URL' column exists
    if 'URL' not in df.columns:
        return None, "The 'URL' column is missing from the Excel file."

    # BUG FIX: the original appended results in *completion* order
    # (as_completed yields futures as they finish), then assigned that list
    # to the DataFrame — misaligning article text with its URL row whenever
    # requests finished out of order. Preallocate and place each result at
    # its source row index instead.
    results = [None] * len(df)

    # Use ThreadPoolExecutor for parallel processing (I/O-bound fetches)
    with ThreadPoolExecutor() as executor:
        future_to_index = {
            executor.submit(extract_article_info, url): i
            for i, url in enumerate(df['URL'])
        }
        for future in as_completed(future_to_index):
            i = future_to_index[future]
            try:
                results[i] = future.result()
            except Exception as e:
                # Record the failure in the matching row rather than aborting
                results[i] = f"Error processing the URL {df['URL'].iloc[i]}: {e}"

    # Add the results to a new column in the DataFrame
    df['Article Text'] = results

    # Save the updated DataFrame to a BytesIO object for download
    output = BytesIO()
    df.to_excel(output, index=False, engine='openpyxl')
    output.seek(0)
    return output, None
# --- Streamlit App (top-level script; Streamlit re-runs this on interaction) ---
st.title("Web Article Extractor")
st.markdown("Upload an Excel file with a column named 'URL' containing the links to process.")

# File upload widget, restricted to .xlsx so pd.read_excel can parse it
uploaded_file = st.file_uploader("Upload Excel file", type=["xlsx"])

if uploaded_file is not None:
    # Show a spinner while URLs are fetched and the workbook is rebuilt
    with st.spinner("Processing your file..."):
        output, error = process_file(uploaded_file)
    if error:
        # process_file returns (None, message) on validation failure
        st.error(error)
    else:
        st.success("File processed successfully!")
        # Serve the in-memory workbook back to the user
        st.download_button(
            label="Download Processed File",
            data=output,
            file_name="processed_file.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )