# Hugging Face Spaces app: Streamlit news scraper + summarizer.
# (Previous build failed; the source below was recovered from a table-formatted paste.)
import csv

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from transformers import pipeline

# Text summarization model: DistilBART distilled from BART, fine-tuned on
# CNN/DailyMail. Loaded once at module import so every scraper reuses it.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
def scrape_dawn():
    """Scrape up to 5 business stories from Dawn and summarize each.

    Returns:
        list[dict]: one dict per article with keys
        'title', 'link', 'content', 'summary'.
    """
    url = 'https://www.dawn.com/business'
    # NOTE(review): verify=False disables TLS certificate validation —
    # confirm this is intentional (e.g. corporate proxy) before shipping.
    response = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    articles = []
    max_articles = 5  # cap the number of scraped stories

    for item in soup.find_all('article', class_='story'):
        if len(articles) >= max_articles:
            break
        title_tag = item.find('h2', class_='story__title')
        if title_tag is None:
            continue
        anchor = title_tag.find('a')
        if anchor is None or not anchor.get('href'):
            # Headline without a usable link — skip instead of crashing
            # on a None subscription.
            continue
        title = title_tag.get_text(strip=True)
        link = anchor['href']
        full_text = get_full_article_dawn(link)
        # The summarization model accepts a limited input length; truncate
        # the raw text to stay within bounds.
        summary_obj = summarizer(full_text[:1020])
        summary = summary_obj[0]['summary_text'] if summary_obj else ""
        articles.append({'title': title, 'link': link,
                         'content': full_text, 'summary': summary})
    return articles
def get_full_article_dawn(url):
    """Fetch a Dawn article page and return its body text.

    Args:
        url: absolute URL of the article page.

    Returns:
        str: all paragraph text joined with spaces, or the sentinel
        "Content not found." when the content container is missing.
    """
    # NOTE(review): verify=False disables TLS validation — confirm intent.
    response = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div is None:
        return "Content not found."
    paragraphs = content_div.find_all('p')
    return ' '.join(para.get_text(strip=True) for para in paragraphs)
def scrape_brecorder():
    """Scrape up to 5 business-finance stories from Business Recorder.

    Mirrors scrape_dawn(): same result shape, same 5-article cap.

    Returns:
        list[dict]: one dict per article with keys
        'title', 'link', 'content', 'summary'.
    """
    url = 'https://www.brecorder.com/business-finance'
    # NOTE(review): verify=False disables TLS certificate validation —
    # confirm this is intentional before shipping.
    response = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    articles = []
    max_articles = 5  # cap the number of scraped stories

    for item in soup.find_all('article', class_='story'):
        if len(articles) >= max_articles:
            break
        title_tag = item.find('h2', class_='story__title')
        if title_tag is None:
            continue
        anchor = title_tag.find('a')
        if anchor is None or not anchor.get('href'):
            # Headline without a usable link — skip instead of crashing.
            continue
        title = title_tag.get_text(strip=True)
        link = anchor['href']
        full_text = get_full_article_brecorder(link)
        # Truncate to the summarizer's input budget.
        summary_obj = summarizer(full_text[:1020])
        summary = summary_obj[0]['summary_text'] if summary_obj else ""
        articles.append({'title': title, 'link': link,
                         'content': full_text, 'summary': summary})
    return articles
def get_full_article_brecorder(url):
    """Fetch a Business Recorder article page and return its body text.

    Unlike the Dawn scraper, list items (<li>) are included alongside
    paragraphs because B.R. articles frequently use bullet lists.

    Args:
        url: absolute URL of the article page.

    Returns:
        str: joined text, or "Content not found." when the container
        is missing.
    """
    # NOTE(review): verify=False disables TLS validation — confirm intent.
    response = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div is None:
        return "Content not found."
    parts = content_div.find_all(['p', 'li'])
    return ' '.join(part.get_text(strip=True) for part in parts)
| # | |
| # def scrape_tnews(): | |
| # url = 'https://www.thenews.com.pk/latest/category/business' | |
| # response = requests.get(url, verify=False) | |
| # soup = BeautifulSoup(response.text, 'html.parser') | |
| # articles = [] | |
| # | |
| # count = 0 # Counter to track the number of articles scraped | |
| # | |
| # for item in soup.find_all('div', class_='most-popular-box'): | |
| # if count >= 2: # Stop after 10 articles | |
| # break | |
| # | |
| # title_tag = item.find('h2', class_='most-popular-list') | |
| # if title_tag: | |
| # title = title_tag.get_text(strip=True) | |
| # link = title_tag.find('a')['href'] | |
| # full_text = get_full_article_tnews(link) | |
| # # Summarize the full article | |
| # summary_obj = summarizer(full_text[:1020]) | |
| # | |
| # # Convert the summary object to a string | |
| # summary = summary_obj[0]['summary_text'] if summary_obj else "" | |
| # articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary}) | |
| # | |
| # count += 1 # Increment the counter | |
| # | |
| # return articles | |
def scrape_tnews():
    """Scrape up to 5 business stories from The News International.

    Returns:
        list[dict]: one dict per article with keys
        'title', 'link', 'content', 'summary'.
    """
    url = 'https://www.thenews.com.pk/latest/category/business'
    # NOTE(review): verify=False disables TLS certificate validation —
    # confirm this is intentional before shipping.
    response = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    articles = []
    max_articles = 5  # cap the number of scraped stories

    for item in soup.find_all('div', class_='most-popular-box'):
        if len(articles) >= max_articles:
            break
        title_tag = item.find('h2')
        if title_tag is None:
            continue
        anchor = item.find('a')
        if anchor is None or not anchor.get('href'):
            # Box without a usable link — skip instead of crashing on
            # a None subscription.
            continue
        title = title_tag.get_text(strip=True)
        link = anchor['href']
        full_text = get_full_article_tnews(link)
        # Truncate to the summarizer's input budget.
        summary_obj = summarizer(full_text[:1020])
        summary = summary_obj[0]['summary_text'] if summary_obj else ""
        articles.append({'title': title, 'link': link,
                         'content': full_text, 'summary': summary})
    return articles
def get_full_article_tnews(url):
    """Fetch a The News article page and return its body text.

    Includes both paragraphs and list items from the detail container.

    Args:
        url: absolute URL of the article page.

    Returns:
        str: joined text, or "Content not found." when the container
        is missing.
    """
    # NOTE(review): verify=False disables TLS validation — confirm intent.
    response = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='detail-content')
    if content_div is None:
        return "Content not found."
    parts = content_div.find_all(['p', 'li'])
    return ' '.join(part.get_text(strip=True) for part in parts)
def save_to_csv(filename, articles):
    """Write a list of article dicts to a CSV file.

    Column order is taken from the first article's keys; every article is
    assumed to share the same keys.

    Args:
        filename: destination path for the CSV file.
        articles: list of dicts; may be empty, in which case nothing is
            written and a message is printed.
    """
    if not articles:
        # Bug fix: the original f-string had no placeholder and always
        # printed the literal "(unknown)" instead of the filename.
        print(f"No articles found to save in {filename}.")
        return
    fieldnames = articles[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(articles)
| # # Main function to scrape articles from both Dawn and Business Recorder, and save to CSV | |
| # def main(): | |
| # # Scraping articles from Dawn | |
| # dawn_articles = scrape_tnews() | |
| # save_to_csv('tnews_articles_full.csv', dawn_articles) | |
| # print("tnews articles saved to CSV file successfully.") | |
| # | |
| # # Scraping articles from Business Recorder | |
| # # brecorder_articles = scrape_brecorder() | |
| # # save_to_csv('brecorder_articles_full.csv', brecorder_articles) | |
| # # print("Business Recorder articles saved to CSV file successfully.") | |
| # | |
| # | |
| # if __name__ == '__main__': | |
| # main() | |
| # url = 'https://www.thenews.com.pk/latest/category/business' | |
| # response = requests.get(url, verify=False) | |
| # soup = BeautifulSoup(response.text, 'html.parser') | |
| # s = soup.find_all('div', class_='most-popular-box') | |
| # print(s) |