Spaces:

anly656
/

dr_jones

Sleeping

File size: 26,643 Bytes

8643b59

"""

@author: Edward R Jones
@version 1.34
@copyright 2020 - Edward R Jones, all rights reserved.
"""

import sys
import warnings
import pandas as pd

import re
import requests  # install using conda install requests

from time import time
from datetime import date

try:
    import newspaper # install using conda install newspaper3k
    from newspaper import Article
except:
    warnings.warn("AdvancedAnalytics.Scrape.newspaper_stories "+\
                  "missing NEWSPAPER3K package")
try:
# newsapi requires tiny\segmenter:  pip install tinysegmenter==0.3
# Install newsapi using:  pip install newsapi-python
    from newsapi import NewsApiClient # Needed for using API Feed
except:
    warnings.warn("AdvancedAnalytics.Scrape.newsapi_get_urls "+\
                  "missing NEWSAPI package")
   
class scrape(object):
        
    def newspaper_stories(words, search_type='or', search_level=0, urls=None, 
                      display=True, memorize=False, language='en'):
        config = newspaper.Config()
        config.memoize_articles = memorize
        config.language         = language
        config.fetch_images     = False
        config.request_timeout  = 20
        config.MIN_WORD_COUNT   = 300
        config.MIN_SENT_COUNT   = 10
        if urls == None or urls =='top_news':
            news_urls = {
            'huffington': 'http://huffingtonpost.com', 
            'reuters':    'http://www.reuters.com', 
            'cbs-news':   'http://www.cbsnews.com',
            'usa-today':  'http://usatoday.com',
            'cnn':        'http://cnn.com',
            'npr':        'http://www.npr.org',
            'abc-news':   'http://abcnews.com',
            'us-news':    'http://www.usnews.com',
            'msn':        'http://msn.com',
            'pbs':        'http://www.pbs.org',
            'nbc-news':   'http://www.nbcnews.com',
            'msnbc':      'http://www.msnbc.com',
            'fox':        'http://www.foxnews.com'}
        elif urls=='all_us_news':
            news_urls = {
            'abc-news': 'https://abcnews.go.com',
            'al-jazeera-english': 'http://www.aljazeera.com',
            'ars-technica': 'http://arstechnica.com',
            'associated-press': 'https://apnews.com/',
            'axios': 'https://www.axios.com',
            'bleacher-report': 'http://www.bleacherreport.com',
            'bloomberg': 'http://www.bloomberg.com',
            'breitbart-news': 'http://www.breitbart.com',
            'business-insider': 'http://www.businessinsider.com',
            'buzzfeed': 'https://www.buzzfeed.com',
            'cbs-news': 'http://www.cbsnews.com',
            'cnbc': 'http://www.cnbc.com',
            'cnn': 'http://us.cnn.com',
            'crypto-coins-news': 'https://www.ccn.com',
            'engadget': 'https://www.engadget.com',
            'entertainment-weekly': 'http://www.ew.com',
            'espn': 'http://espn.go.com',
            'espn-cric-info': 'http://www.espncricinfo.com/',
            'fortune': 'http://fortune.com',
            'fox-news': 'http://www.foxnews.com',
            'fox-sports': 'http://www.foxsports.com',
            'google-news': 'https://news.google.com',
            'hacker-news': 'https://news.ycombinator.com',
            'ign': 'http://www.ign.com',
            'mashable': 'http://mashable.com',
            'medical-news-today': 'http://www.medicalnewstoday.com',
            'msnbc': 'http://www.msnbc.com',
            'mtv-news': 'http://www.mtv.com/news',
            'national-geographic': 'http://news.nationalgeographic.com',
            'national-review': 'https://www.nationalreview.com/',
            'nbc-news': 'http://www.nbcnews.com',
            'new-scientist': 'https://www.newscientist.com/section/news',
            'newsweek': 'http://www.newsweek.com',
            'new-york-magazine': 'http://nymag.com',
            'next-big-future': 'https://www.nextbigfuture.com',
            'nfl-news': 'http://www.nfl.com/news',
            'nhl-news': 'https://www.nhl.com/news',
            'politico': 'https://www.politico.com',
            'polygon': 'http://www.polygon.com',
            'recode': 'http://www.recode.net',
            'reddit-r-all': 'https://www.reddit.com/r/all',
            'reuters': 'http://www.reuters.com',
            'techcrunch': 'https://techcrunch.com',
            'techradar': 'http://www.techradar.com',
            'american-conservative': 'http://www.theamericanconservative.com/',
            'hill': 'http://thehill.com',
            'huffington-post': 'http://www.huffingtonpost.com',
            'next-web': 'http://thenextweb.com',
            'verge': 'http://www.theverge.com',
            'wall-street-journal': 'http://www.wsj.com',
            'washington-post': 'https://www.washingtonpost.com',
            'washington-times': 'https://www.washingtontimes.com/',
            'time': 'http://time.com',
            'usa-today': 'http://www.usatoday.com/news',
            'vice-news': 'https://news.vice.com',
            'wired': 'https://www.wired.com'}
        elif urls == "texas_universities":
            news_urls = {
            'A&M':               'http://www.tamu.edu',
            'A&M-Commerce':      'http://www.tamuc.edu',
            'A&M-Corpus':        'http://www.tamucc.edu',
            'A&M-Kingsville':    'http://www.tamuk.edu',
            'A&M-Galveston':     'http://www.tamug.edu',
            'A&M-PrairieView':   'http://www.pvamu.edu',
            'A&M-International': 'http://www.tamiu.edu',
            'A&M-WestTexas':     'http://www.wtamu.edu',
            'Baylor':            'http://www.baylor.edu',
            'Rice':              'http://www.rice.edu',
            'SFAustin':          'http://www.sfasu.edu',
            'SMU':               'http://www.smu.edu',
            'SulRoss':           'http://www.sulross.edu',
            'TexasState':        'http://www.txstate.edu',
            'Texas_Tech':        'http://www.ttu.edu',
            'UDallas':           'http://www.udallas.edu',
            'UHouston':          'http://www.uh.edu',
            'UTexas':            'http://www.utexas.edu', 
            'UT_Dallas':         'http://www.utdallas.edu',
            'UT_ElPaso':         'http://www.utep.edu',
            'UT_Houston':        'http://www.uth.edu',
            'UT_NorthTexas':     'http://www.unt.edu',
            'UT_SanAntonio':     'http://www.utsa.edu'}
        elif urls == 'popular':
            news_urls = {}
            agency_urls = newspaper.popular_urls()
            for i in range(len(agency_urls)):
                val = agency_urls[i]
                url = agency_urls[i].replace("http://", "")
                url = url.replace("www.", "")
                url = url.replace("blog.", "")
                url = url.replace("blogs.", "")
                url = url.replace(".com", "")
                url = url.replace(".net", "")
                url = url.replace(".au", "")
                url = url.replace(".org", "")
                url = url.replace(".co.uk", "")
                url = url.replace("the", "")
                url = url.replace(".", "-")
                url = url.replace('usa', 'usa-')
                if url=='berkeley-edu':
                    continue
                if url=='beta-na-leagueoflegends':
                    continue
                if url=='bottomline-as-ucsb-edu':
                    continue
                news_urls[url] = val
        else:
            news_urls = urls
        
        print("\nSearch Level {:<d}:".format(search_level), end="")
        if search_level==0:
            print(" Screening URLs for search words")
            print("   URLs must contain one or more of:", end="")
        else:
            print(" No URL Screening")
            print("   Deep Search for Articles containing: ", 
                  end="")
        i=0
        for word in words:
            i += 1
            if i < len(words):
                if search_type == 'or':
                    print(word+" or ", end="")
                else:
                    print(word+" & ", end="")
            else:
                print(word)
                    
        df_articles = pd.DataFrame(columns=['agency', 'url', 'length', 
                                            'keywords', 'title', 'summary', 
                                            'text'])
        n_articles  = {}
        today       = str(date.today())
        for agency, url in news_urls.items():
            paper   = newspaper.build(url, config=config)
            if display:
                print("\n{:>6d} Articles available from {:<s} on {:<10s}:".
                      format(paper.size(), agency.upper(), today))
            article_collection = []
            for article in paper.articles:
                url_lower = article.url.lower()
                # Exclude articles that are in a language other then en
                # or contains mostly video or pictures
                # search_level 0 only downloads articles with at least
                # one of the key words in its URL
                # search_level 1 download all articles that appear to be
                # appear to be in English and are not mainly photos or
                # videos.
                # With either search level, if an article is downloaded
                # it is scanned to see if it contains the search words
                # It is also compared to other articles to verify that
                # it is not a duplicate of another article.
                
                 # Special Filters for some Agencies
                if agency=='cbs-news':
                    if  url_lower.find('.com') >=0 :
                        # secure-fly are duplicates of http
                        if article.url.find('secure-fly')>=0:
                            continue
                if agency=='usa-today':
                    if url_lower.find('tunein.com') >= 0:
                        continue
                if agency=='huffington':
                    # Ignore huffington if it's not .com
                    if url_lower.find('.com') < 0:
                        continue
                    
                # Filter Articles that are primarily video, film or not en
                if url_lower.find('.video/')   >=0 or \
                   url_lower.find('/video')    >=0 or \
                   url_lower.find('/picture')  >=0 or \
                   url_lower.find('.pictures/')>=0 or \
                   url_lower.find('/photo')    >=0 or \
                   url_lower.find('.photos/')  >=0 or \
                   url_lower.find('espanol')   >=0 or \
                   url_lower.find('.mx/' )     >=0 or \
                   url_lower.find('/mx.' )     >=0 or \
                   url_lower.find('.fr/' )     >=0 or \
                   url_lower.find('/fr.' )     >=0 or \
                   url_lower.find('.de/' )     >=0 or \
                   url_lower.find('/de.' )     >=0 or \
                   url_lower.find('.it/' )     >=0 or \
                   url_lower.find('/it.' )     >=0 or \
                   url_lower.find('.gr/' )     >=0 or \
                   url_lower.find('/gr.' )     >=0 or \
                   url_lower.find('.se/' )     >=0 or \
                   url_lower.find('/se.' )     >=0 or \
                   url_lower.find('.es/' )     >=0 or \
                   url_lower.find('/es.' )     >=0 or \
                   url_lower.find('?button')   >=0 or \
                   url_lower.find('calendar.') >=0 or \
                   url_lower.find('calendar/') >=0 or \
                   url_lower.find('/event/')   >=0 or \
                   url_lower.find('engr.utexas') >=0 or \
                   url_lower.find('sites.smu.')  >=0:
                       continue

                # Filter if search_level == 0, URL quick search
                if search_level == 0:
                    # Verify url contains at least one of the key words
                    found_it = False
                    for word in words:
                        j = url_lower.find(word)
                        if j>= 0:
                            found_it = True
                            break
                    if found_it:
                        # Article contains words and passes filters
                        # Save this article for full review
                        article_collection.append(article.url)
                else:
                    #  No URL screening, Save for full review
                    article_collection.append(article.url)
            n_to_review = len(article_collection)
            if display:
                print("{:>6d} Selected for download".format(n_to_review))
            
            for article_url in article_collection:
                article = Article(article_url, config=config)
                try:
                    article.download()
                except: 
                    if display:
                        print("Cannot download:", article_url[0:79])
                    continue
                n = 0
                # Limit download failures
                stop_sec=1 # Initial max wait time in seconds
                while n<2:
                    try:
                        article.parse()
                        n = 99
                    except:
                        n += 1
                        # Initiate download again before new parse attempt
                        article.download()
                        # Timeout for 5 seconds waiting for download
                        t0     = time()
                        tlapse = 0
                        while tlapse<stop_sec:
                            tlapse = time()-t0
                        # Double wait time if needed for next exception
                        stop_sec = stop_sec+1
                if n != 99:
                    if display:
                        print("Cannot download:", article_url[0:79])
                    n_to_review -= 1
                    continue
                article.nlp()
                keywords = article.keywords
                title    = article.title
                summary  = article.summary
                text     = article.text
                text_lower_case = text.lower()
                if search_type == 'or':
                    found_it = False
                    # Verify the url contains at least one of the key words
                    for word in words:
                        j = text_lower_case.find(word)
                        if j>= 0:
                            found_it = True
                            break
                else: 
                    # search type 'and'
                    found_it = True
                    for word in words:
                        j = text_lower_case.find(word)
                        if j < 0:
                            found_it = False
                            break
                if found_it:
                    # Article contains words and passes filters
                    # Save this article for later full review
                    length   = len(text)
                    df_story = pd.DataFrame([[agency, article_url, length, 
                                              keywords, title, summary, 
                                              text]], 
                                columns=['agency', 'url', 'length', 'keywords',
                                             'title', 'summary', 'text'])
                    # Check for an identical already in the file
                    if df_articles.shape[0]==0:
                        #df_articles  = df_articles.append(df_story)
                        df_articles  = pd.concat([df_articles, df_story])
                    else:
                        # Verify this story is not already in df_articles
                        same_story = False
                        for i in range(df_articles.shape[0]):
                            if text==df_articles['text'].iloc[i]:
                                same_story   = True
                                n_to_review -= 1
                                continue
                        if not(same_story):
                            #df_articles  = df_articles.append(df_story)
                            df_articles  = pd.concat([df_articles, df_story])
                else:
                    n_to_review -= 1
                    
                print("=", end='')
            n_articles[agency] = [n_to_review, len(article_collection)]
        if display:
            print("\n\nArticles Selected by Agency:")
            for agency in news_urls:
                ratio = str(n_articles[agency][0]) + "/" + \
                        str(n_articles[agency][1])
                ratio = ratio
                print("{:>10s} Articles from {:<s}".
                      format(ratio, agency.upper()))
            print("\nArticles Collected on "+today+":", 
                      df_articles.shape[0],'from', 
                      df_articles['agency'].nunique(), "Agencies.")
            print("\nSize    Agency    Title")
            print("*{:->78s}*".format("-"))
            for i in range(df_articles.shape[0]):
                k = len(df_articles['title'].iloc[i])
                if k > 63:
                    for j in range(25):
                        k = 63-j
                        if df_articles['title'].iloc[i][k] == " ":
                            break
                
                    print("{:>5d} {:<10s} {:<63s}".
                          format(df_articles['length'].iloc[i], 
                                 df_articles['agency'].iloc[i],
                                 df_articles['title' ].iloc[i][0:k]))
                    if len(df_articles['title'].iloc[i])>63:
                        print("                {:<60s}".
                              format(df_articles['title'].iloc[i][k:120]))
                else:
                    print("{:>5d} {:<10s} {:<s}".
                          format(df_articles['length'].iloc[i], 
                                 df_articles['agency'].iloc[i],
                                 df_articles['title' ].iloc[i]))
                print("")
            print("*{:->78s}*".format("-"))
        return df_articles

    def clean_html(html):
        # First we remove inline JavaScript/CSS:
        pg = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
        # Then we remove html comments. This has to be done before removing regular
        # tags since comments can contain '>' characters.
        pg = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", pg)
        # Next we can remove the remaining tags:
        pg = re.sub(r"(?s)<.*?>", " ", pg)
        # Finally, we deal with whitespace
        pg = re.sub(r"&nbsp;", " ", pg)
        pg = re.sub(r"&rsquo;", "'", pg)
        pg = re.sub(r"&#x27;", "'", pg)
        pg = re.sub(r"&ldquo;", '"', pg)
        pg = re.sub(r"&rdquo;", '"', pg)
        pg = re.sub(r"&quot;", '"', pg)
        pg = re.sub(r"&amp;", '&', pg)
        pg = re.sub(r"\n", " ", pg)
        pg = re.sub(r"\t", " ", pg)
        pg = re.sub(r"/>", " ", pg)
        pg = re.sub(r'/">', " ", pg)
        k = 1
        m = len(pg)
        while k>0:
            pg = re.sub(r"  ", " ", pg)
            k = m - len(pg)
            m = len(pg)
        return pg.strip()

    def newsapi_get_urls(apikey, search_words, urls=None):
        try:
            api = NewsApiClient(api_key=apikey)
        except:
            raise RuntimeError("APIKEY Invalid")
        if len(search_words)==0 or search_words==None:
            raise RuntimeError("No Search Words")
        print("Searching agencies for pages containing:", search_words)
        # This is my API key, each user must request their own
        # API key from https://newsapi.org/account
        api = NewsApiClient(api_key=apikey)
        api_urls  = []
        # Note that newsapi only draws articles from registered sources
        # These require a particular key/value combination in news_urls
        # Even if the url is correct, if the key is not what is registered
        # the search will be rejected for that agency
        if urls == None or urls == 'top_news':
               news_urls = { 
                    'al-jazeera-english':      'http://www.aljazeera.com',
                    'the-huffington-post':     'http://www.huffingtonpost.com',
                    'bloomberg':               'http://www.bloomberg.com',
                    'reuters':                 'http://www.reuters.com', 
                    'cbs-news':                'http://www.cbsnews.com',
                    'usa-today':               'http://www.usatoday.com/news',
                    'cnn':                     'http://us.cnn.com',
                    'abc-news':                'https://abcnews.go.com',
                    'msnbc':                   'http://www.msnbc.com',
                    'nbc-news':                'http://www.nbcnews.com',
                    'the-wall-street-journal': 'http://www.wsj.com',
                    'fox-news':                'http://www.foxnews.com',
                    'associated-press':        'https://apnews.com/'}
        elif urls=='all_us_news':
                news_urls = {}
                sources = api.get_sources()
                n_sources = len(sources['sources'])
                for i in range(n_sources):
                    cay  = sources['sources'][i]['id']
                    val  = sources['sources'][i]['url']
                    lang = sources['sources'][i]['language']
                    ctry = sources['sources'][i]['country']
                    if lang == 'en' and ctry == 'us':
                        news_urls[cay] = val
        else:
                news_urls = urls
        # Iterate over agencies and search words to pull more url's
        # Limited to 300 requests/day - Likely to be exceeded 
        for agency in news_urls:
            domain = news_urls[agency].replace("http://" , "")
            domain = news_urls[agency].replace("https://", "")
            print("{:.<30s} {:<50s}".format(agency, domain))
            for word in search_words:
                # Get articles with q= in them, Limits to 20 URLs
                try:
                    articles = api.get_everything(q=word, language='en',
                                        sources=agency, domains=domain)
                except:
                    print("--->Unable to pull news from:", agency, "for", word)
                    continue
                # Pull the URL from these articles (limited to 20)
                d = articles['articles']
                for i in range(len(d)):
                    url = d[i]['url']
                    api_urls.append([agency, word, url])
        df_urls  = pd.DataFrame(api_urls, columns=['agency', 'word', 'url'])
        n_total  = len(df_urls)
        # Remove duplicates
        df_urls  = df_urls.drop_duplicates('url')
        n_unique = len(df_urls)
        print("\nFound a total of", n_total, " URLs, of which", n_unique,
              " were unique.")
        return df_urls
    
    def request_pages(df_urls):
        web_pages = []
        for i in range(len(df_urls)):
            u   = df_urls.iloc[i]
            url = u[2]
            short_url = url[0:50]
            short_url = short_url.replace("https//", "")
            short_url = short_url.replace("http//", "")
            n = 0
            # Allow for a maximum of 2 download failures
            stop_sec=1 # Initial max wait time in seconds
            while n<2:
                try:
                    r = requests.get(url, timeout=(stop_sec))
                    if r.status_code == 404:
                        print("-->HTML ERROR 404", short_url)
                        raise ValueError()
                    if r.status_code == 200:
                        print("Obtained: "+short_url)
                    else:
                        print("-->Web page: "+short_url+" status code:", \
                                  r.status_code)
                    n=99
                    continue # Skip this page
                except:
                    if r.status_code == 404:
                        n=99
                        continue
                    n += 1
                    # Timeout waiting for download
                    t0 = time()
                    tlapse = 0
                    print("Waiting", stop_sec, "sec")
                    while tlapse<stop_sec:
                        tlapse = time()-t0
                    # Double wait time if needed for next exception
                    stop_sec = stop_sec*2
            if n != 99:
                # download failed skip this page
                continue
            # Page obtained successfully
            html_page = r.text
            page_text = scrape.clean_html(html_page)
            web_pages.append([url, page_text])
        df_www  = pd.DataFrame(web_pages, columns=['url', 'text'])
        n_total  = len(df_urls)
        # Remove duplicates
        df_www  = df_www.drop_duplicates('url')
        n_unique = len(df_urls)
        print("Found a total of", n_total, " web pages, of which", n_unique,\
              " were unique.")
        return df_www
    
class Metrics:
    # Function for calculating loss and confusion matrix
    def binary_loss(y, y_predict, fn_cost, fp_cost, display=True):
        loss     = [0, 0]       #False Neg Cost, False Pos Cost
        conf_mat = [[0, 0], [0, 0]] #tn, fp, fn, tp
        for j in range(len(y)):
            if y[j]==0:
                if y_predict[j]==0:
                    conf_mat[0][0] += 1 #True Negative
                else:
                    conf_mat[0][1] += 1 #False Positive
                    loss[1] += fp_cost[j]
            else:
                if y_predict[j]==1:
                    conf_mat[1][1] += 1 #True Positive
                else:
                    conf_mat[1][0] += 1 #False Negative
                    loss[0] += fn_cost[j]
        if display:
            fn_loss = loss[0]
            fp_loss = loss[1]
            total_loss = fn_loss + fp_loss
            misc    = conf_mat[0][1] + conf_mat[1][0]
            misc    = misc/len(y)
            print("{:.<23s}{:10.4f}".format("Misclassification Rate", misc))
            print("{:.<23s}{:10.0f}".format("False Negative Loss", fn_loss))
            print("{:.<23s}{:10.0f}".format("False Positive Loss", fp_loss))
            print("{:.<23s}{:10.0f}".format("Total Loss", total_loss))
        return loss, conf_mat