""" @author: Edward R Jones @version 1.34 @copyright 2020 - Edward R Jones, all rights reserved. """ import sys import warnings import pandas as pd import re import requests # install using conda install requests from time import time from datetime import date try: import newspaper # install using conda install newspaper3k from newspaper import Article except: warnings.warn("AdvancedAnalytics.Scrape.newspaper_stories "+\ "missing NEWSPAPER3K package") try: # newsapi requires tiny\segmenter: pip install tinysegmenter==0.3 # Install newsapi using: pip install newsapi-python from newsapi import NewsApiClient # Needed for using API Feed except: warnings.warn("AdvancedAnalytics.Scrape.newsapi_get_urls "+\ "missing NEWSAPI package") class scrape(object): def newspaper_stories(words, search_type='or', search_level=0, urls=None, display=True, memorize=False, language='en'): config = newspaper.Config() config.memoize_articles = memorize config.language = language config.fetch_images = False config.request_timeout = 20 config.MIN_WORD_COUNT = 300 config.MIN_SENT_COUNT = 10 if urls == None or urls =='top_news': news_urls = { 'huffington': 'http://huffingtonpost.com', 'reuters': 'http://www.reuters.com', 'cbs-news': 'http://www.cbsnews.com', 'usa-today': 'http://usatoday.com', 'cnn': 'http://cnn.com', 'npr': 'http://www.npr.org', 'abc-news': 'http://abcnews.com', 'us-news': 'http://www.usnews.com', 'msn': 'http://msn.com', 'pbs': 'http://www.pbs.org', 'nbc-news': 'http://www.nbcnews.com', 'msnbc': 'http://www.msnbc.com', 'fox': 'http://www.foxnews.com'} elif urls=='all_us_news': news_urls = { 'abc-news': 'https://abcnews.go.com', 'al-jazeera-english': 'http://www.aljazeera.com', 'ars-technica': 'http://arstechnica.com', 'associated-press': 'https://apnews.com/', 'axios': 'https://www.axios.com', 'bleacher-report': 'http://www.bleacherreport.com', 'bloomberg': 'http://www.bloomberg.com', 'breitbart-news': 'http://www.breitbart.com', 'business-insider': 
'http://www.businessinsider.com', 'buzzfeed': 'https://www.buzzfeed.com', 'cbs-news': 'http://www.cbsnews.com', 'cnbc': 'http://www.cnbc.com', 'cnn': 'http://us.cnn.com', 'crypto-coins-news': 'https://www.ccn.com', 'engadget': 'https://www.engadget.com', 'entertainment-weekly': 'http://www.ew.com', 'espn': 'http://espn.go.com', 'espn-cric-info': 'http://www.espncricinfo.com/', 'fortune': 'http://fortune.com', 'fox-news': 'http://www.foxnews.com', 'fox-sports': 'http://www.foxsports.com', 'google-news': 'https://news.google.com', 'hacker-news': 'https://news.ycombinator.com', 'ign': 'http://www.ign.com', 'mashable': 'http://mashable.com', 'medical-news-today': 'http://www.medicalnewstoday.com', 'msnbc': 'http://www.msnbc.com', 'mtv-news': 'http://www.mtv.com/news', 'national-geographic': 'http://news.nationalgeographic.com', 'national-review': 'https://www.nationalreview.com/', 'nbc-news': 'http://www.nbcnews.com', 'new-scientist': 'https://www.newscientist.com/section/news', 'newsweek': 'http://www.newsweek.com', 'new-york-magazine': 'http://nymag.com', 'next-big-future': 'https://www.nextbigfuture.com', 'nfl-news': 'http://www.nfl.com/news', 'nhl-news': 'https://www.nhl.com/news', 'politico': 'https://www.politico.com', 'polygon': 'http://www.polygon.com', 'recode': 'http://www.recode.net', 'reddit-r-all': 'https://www.reddit.com/r/all', 'reuters': 'http://www.reuters.com', 'techcrunch': 'https://techcrunch.com', 'techradar': 'http://www.techradar.com', 'american-conservative': 'http://www.theamericanconservative.com/', 'hill': 'http://thehill.com', 'huffington-post': 'http://www.huffingtonpost.com', 'next-web': 'http://thenextweb.com', 'verge': 'http://www.theverge.com', 'wall-street-journal': 'http://www.wsj.com', 'washington-post': 'https://www.washingtonpost.com', 'washington-times': 'https://www.washingtontimes.com/', 'time': 'http://time.com', 'usa-today': 'http://www.usatoday.com/news', 'vice-news': 'https://news.vice.com', 'wired': 'https://www.wired.com'} 
elif urls == "texas_universities": news_urls = { 'A&M': 'http://www.tamu.edu', 'A&M-Commerce': 'http://www.tamuc.edu', 'A&M-Corpus': 'http://www.tamucc.edu', 'A&M-Kingsville': 'http://www.tamuk.edu', 'A&M-Galveston': 'http://www.tamug.edu', 'A&M-PrairieView': 'http://www.pvamu.edu', 'A&M-International': 'http://www.tamiu.edu', 'A&M-WestTexas': 'http://www.wtamu.edu', 'Baylor': 'http://www.baylor.edu', 'Rice': 'http://www.rice.edu', 'SFAustin': 'http://www.sfasu.edu', 'SMU': 'http://www.smu.edu', 'SulRoss': 'http://www.sulross.edu', 'TexasState': 'http://www.txstate.edu', 'Texas_Tech': 'http://www.ttu.edu', 'UDallas': 'http://www.udallas.edu', 'UHouston': 'http://www.uh.edu', 'UTexas': 'http://www.utexas.edu', 'UT_Dallas': 'http://www.utdallas.edu', 'UT_ElPaso': 'http://www.utep.edu', 'UT_Houston': 'http://www.uth.edu', 'UT_NorthTexas': 'http://www.unt.edu', 'UT_SanAntonio': 'http://www.utsa.edu'} elif urls == 'popular': news_urls = {} agency_urls = newspaper.popular_urls() for i in range(len(agency_urls)): val = agency_urls[i] url = agency_urls[i].replace("http://", "") url = url.replace("www.", "") url = url.replace("blog.", "") url = url.replace("blogs.", "") url = url.replace(".com", "") url = url.replace(".net", "") url = url.replace(".au", "") url = url.replace(".org", "") url = url.replace(".co.uk", "") url = url.replace("the", "") url = url.replace(".", "-") url = url.replace('usa', 'usa-') if url=='berkeley-edu': continue if url=='beta-na-leagueoflegends': continue if url=='bottomline-as-ucsb-edu': continue news_urls[url] = val else: news_urls = urls print("\nSearch Level {:6d} Articles available from {:=0 : # secure-fly are duplicates of http if article.url.find('secure-fly')>=0: continue if agency=='usa-today': if url_lower.find('tunein.com') >= 0: continue if agency=='huffington': # Ignore huffington if it's not .com if url_lower.find('.com') < 0: continue # Filter Articles that are primarily video, film or not en if url_lower.find('.video/') >=0 or \ 
url_lower.find('/video') >=0 or \ url_lower.find('/picture') >=0 or \ url_lower.find('.pictures/')>=0 or \ url_lower.find('/photo') >=0 or \ url_lower.find('.photos/') >=0 or \ url_lower.find('espanol') >=0 or \ url_lower.find('.mx/' ) >=0 or \ url_lower.find('/mx.' ) >=0 or \ url_lower.find('.fr/' ) >=0 or \ url_lower.find('/fr.' ) >=0 or \ url_lower.find('.de/' ) >=0 or \ url_lower.find('/de.' ) >=0 or \ url_lower.find('.it/' ) >=0 or \ url_lower.find('/it.' ) >=0 or \ url_lower.find('.gr/' ) >=0 or \ url_lower.find('/gr.' ) >=0 or \ url_lower.find('.se/' ) >=0 or \ url_lower.find('/se.' ) >=0 or \ url_lower.find('.es/' ) >=0 or \ url_lower.find('/es.' ) >=0 or \ url_lower.find('?button') >=0 or \ url_lower.find('calendar.') >=0 or \ url_lower.find('calendar/') >=0 or \ url_lower.find('/event/') >=0 or \ url_lower.find('engr.utexas') >=0 or \ url_lower.find('sites.smu.') >=0: continue # Filter if search_level == 0, URL quick search if search_level == 0: # Verify url contains at least one of the key words found_it = False for word in words: j = url_lower.find(word) if j>= 0: found_it = True break if found_it: # Article contains words and passes filters # Save this article for full review article_collection.append(article.url) else: # No URL screening, Save for full review article_collection.append(article.url) n_to_review = len(article_collection) if display: print("{:>6d} Selected for download".format(n_to_review)) for article_url in article_collection: article = Article(article_url, config=config) try: article.download() except: if display: print("Cannot download:", article_url[0:79]) continue n = 0 # Limit download failures stop_sec=1 # Initial max wait time in seconds while n<2: try: article.parse() n = 99 except: n += 1 # Initiate download again before new parse attempt article.download() # Timeout for 5 seconds waiting for download t0 = time() tlapse = 0 while tlapse= 0: found_it = True break else: # search type 'and' found_it = True for word in words: j = 
text_lower_case.find(word) if j < 0: found_it = False break if found_it: # Article contains words and passes filters # Save this article for later full review length = len(text) df_story = pd.DataFrame([[agency, article_url, length, keywords, title, summary, text]], columns=['agency', 'url', 'length', 'keywords', 'title', 'summary', 'text']) # Check for an identical already in the file if df_articles.shape[0]==0: #df_articles = df_articles.append(df_story) df_articles = pd.concat([df_articles, df_story]) else: # Verify this story is not already in df_articles same_story = False for i in range(df_articles.shape[0]): if text==df_articles['text'].iloc[i]: same_story = True n_to_review -= 1 continue if not(same_story): #df_articles = df_articles.append(df_story) df_articles = pd.concat([df_articles, df_story]) else: n_to_review -= 1 print("=", end='') n_articles[agency] = [n_to_review, len(article_collection)] if display: print("\n\nArticles Selected by Agency:") for agency in news_urls: ratio = str(n_articles[agency][0]) + "/" + \ str(n_articles[agency][1]) ratio = ratio print("{:>10s} Articles from {:78s}*".format("-")) for i in range(df_articles.shape[0]): k = len(df_articles['title'].iloc[i]) if k > 63: for j in range(25): k = 63-j if df_articles['title'].iloc[i][k] == " ": break print("{:>5d} {:<10s} {:<63s}". format(df_articles['length'].iloc[i], df_articles['agency'].iloc[i], df_articles['title' ].iloc[i][0:k])) if len(df_articles['title'].iloc[i])>63: print(" {:<60s}". format(df_articles['title'].iloc[i][k:120])) else: print("{:>5d} {:<10s} {:78s}*".format("-")) return df_articles def clean_html(html): # First we remove inline JavaScript/CSS: pg = re.sub(r"(?is)<(script|style).*?>.*?()", "", html.strip()) # Then we remove html comments. This has to be done before removing regular # tags since comments can contain '>' characters. 
def newsapi_get_urls(apikey, search_words, urls=None):
    """Collect article URLs from NewsAPI for every agency / search word pair.

    The function is defined without a ``self`` parameter, so it is called
    as a plain function (e.g. through the enclosing class).

    Parameters
    ----------
    apikey : str
        Key handed to ``NewsApiClient``. Each user needs their own key
        from https://newsapi.org/account.
    search_words : str or sequence of str
        Words to query, one request per word and agency. A single string
        is treated as one word rather than being iterated character by
        character.
    urls : None, 'top_news', 'all_us_news' or dict, optional
        ``None`` or ``'top_news'`` selects a built-in table of agencies.
        ``'all_us_news'`` asks the API for its sources and keeps those
        whose language is ``'en'`` and country is ``'us'``. Any other
        value is used as-is and is expected to map a NewsAPI source id to
        that agency's URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``agency``, ``word`` and ``url``; rows with a repeated
        ``url`` are dropped.

    Raises
    ------
    RuntimeError
        ``"No Search Words"`` when ``search_words`` is ``None`` or empty,
        ``"APIKEY Invalid"`` when the client cannot be created.
    """
    # Validate first. The None test has to precede len(): len(None) raises
    # TypeError, which would hide the intended RuntimeError.
    if search_words is None or len(search_words) == 0:
        raise RuntimeError("No Search Words")

    try:
        api = NewsApiClient(api_key=apikey)
    except Exception as err:
        raise RuntimeError("APIKEY Invalid") from err

    print("Searching agencies for pages containing:", search_words)
    if isinstance(search_words, str):
        # A bare string would otherwise be searched one letter at a time.
        search_words = [search_words]

    # Note that newsapi only draws articles from registered sources.
    # These require a particular key/value combination in news_urls:
    # even if the url is correct, if the key is not what is registered
    # the search will be rejected for that agency.
    if urls is None or urls == 'top_news':
        news_urls = {
            'al-jazeera-english':      'http://www.aljazeera.com',
            'the-huffington-post':     'http://www.huffingtonpost.com',
            'bloomberg':               'http://www.bloomberg.com',
            'reuters':                 'http://www.reuters.com',
            'cbs-news':                'http://www.cbsnews.com',
            'usa-today':               'http://www.usatoday.com/news',
            'cnn':                     'http://us.cnn.com',
            'abc-news':                'https://abcnews.go.com',
            'msnbc':                   'http://www.msnbc.com',
            'nbc-news':                'http://www.nbcnews.com',
            'the-wall-street-journal': 'http://www.wsj.com',
            'fox-news':                'http://www.foxnews.com',
            'associated-press':        'https://apnews.com/',
        }
    elif urls == 'all_us_news':
        news_urls = {
            source['id']: source['url']
            for source in api.get_sources()['sources']
            if source['language'] == 'en' and source['country'] == 'us'
        }
    else:
        news_urls = urls

    # Iterate over agencies and search words to pull more url's.
    # Original author's note: limited to 300 requests/day - likely to be
    # exceeded.
    api_urls = []
    for agency in news_urls:
        # Strip either scheme. The earlier code restarted from the full
        # URL for the second replace, so "http://" was never removed.
        domain = news_urls[agency].replace("http://", "")
        domain = domain.replace("https://", "")
        print("{:.<30s} {:<50s}".format(agency, domain))
        for word in search_words:
            # Original author's note: results are limited to 20 URLs.
            try:
                articles = api.get_everything(q=word, language='en',
                                              sources=agency,
                                              domains=domain)
            except Exception:
                # Best effort: report and move on to the next word.
                print("--->Unable to pull news from:", agency, "for", word)
                continue
            api_urls.extend([agency, word, item['url']]
                            for item in articles['articles'])

    df_urls = pd.DataFrame(api_urls, columns=['agency', 'word', 'url'])
    n_total = len(df_urls)
    # Remove duplicates
    df_urls = df_urls.drop_duplicates('url')
    n_unique = len(df_urls)
    print("\nFound a total of", n_total, " URLs, of which", n_unique,
          " were unique.")
    return df_urls