Commit a76c1ab
Parent(s): eb67193
Update app.py

app.py CHANGED
@@ -5,9 +5,16 @@ import black
 import flair
 import time
 from bs4 import BeautifulSoup
+import re
+import numpy as np
 
+from flair.data import Sentence
+from flair.models import SequenceTagger
+from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
 
 
+import string
+
 URL = "https://www.formula1.com/content/fom-website/en/latest/all.xml"
 
 def get_xml(url):
@@ -15,37 +22,122 @@ def get_xml(url):
     # use urllib.parse to check for formula1.com website or other news
     xml = pd.read_xml(url,xpath='channel/item')
 
-
+
+
+# care taken to only consider results where there are more words not a single word quotes
+def extract_quote(string):
+    # Use the re.findall function to extract the quoted text
+    results = re.findall(r'[“\"](.*?)[”\"]', string)
+    quotes = []
+    for result in results:
+        split_result = result.split()
+        if len(split_result) >3:
+            quotes.append(result)
+
+    return quotes
+
+
+
+def get_names(text):
+    # # load the NER tagger
+    tagger = SequenceTagger.load('ner')
+
+    sentence = Sentence(text)
+    tagger.predict(sentence)
+
+    names = []
+    for label in sentence.get_labels('ner'):
+        if label.value == "PER":
+            names.append(f"{label.data_point.text}")
+
+    # convert to a set to remove some of the repetitions
+    names = list(set(names))
+
+    return names
+
+def get_text(new_articles_df):
+    """
+    quotes outputs a list of quotes
+    """
+
+    dfs_dict = {}
+
+    for article in tqdm(new_articles_df.iterrows()):
+
+        link = article[1]["guid"]
+        request = requests.get(link)
+        soup = BeautifulSoup(request.content, "html.parser")
+        # class_ below will be different for different websites
+        s = soup.find("div", class_="col-lg-8 col-xl-7 offset-xl-1 f1-article--content")
+        lines = s.find_all("p")
+        text_content = pd.DataFrame(data={"text": []})
+        for i, line in enumerate(lines):
+            df = pd.DataFrame(data={"text": [line.text]})
+            text_content = pd.concat([text_content, df], ignore_index=True)
+
+        strongs = s.find_all("strong")
+        strong_content = pd.DataFrame(data={"text": []})
+        for i, strong in enumerate(strongs):
+            if i > 0:
+                df = pd.DataFrame(data={"text": [strong.text]})
+                strong_content = pd.concat([strong_content, df], ignore_index=True)
+        # df has content
+        df = text_content[~text_content["text"].isin(strong_content["text"])].reset_index(
+            drop=True
+        )
+        # df["quote"] = df["text"].apply(lambda row: extract_quote(row))
+        # # combine all rows into context
+
+        context = ""
+
+        for i,row in df.iterrows():
+            context += f" {row['text']}"
+
+
+        quotes = extract_quote(context)
+        # to save some time not computing unnecessary NER
+        if len(quotes) != 0:
+            speakers = get_names(context)
+        else:
+            speakers = ()
+
+        dfs_dict[link] = {'context':context, 'quotes':quotes, 'speakers':speakers}
+
+    return dfs_dict
+
+def load_speaker_model():
+
+    model_name = f"microsoft/deberta-v2-large"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+
+    question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)
+
+    return question_answerer
+
+
+
+def remove_punctuations(text):
+
+    modified_text = "".join([character for character in text if character not in string.punctuation])
+    modified_text = modified_text.lstrip(" ")
+    modified_text = modified_text.rstrip(" ")
+
+    return modified_text
+
+
+def check_updates(every=300):
     while True:
         time.sleep(every)
         latest_xml = get_xml()
         if ~previous_xml.equals(latest_xml):
             print('New articles found')
             new_articles_df = latest_xml[~latest_xml["guid"].isin(previous_xml["guid"])]
-
-
-
-            soup = BeautifulSoup(request.content, "html.parser")
-            # class_ below will be different for different websites
-            s = soup.find("div", class_="col-lg-8 col-xl-7 offset-xl-1 f1-article--content")
-            lines = s.find_all("p")
-            text_content = pd.DataFrame(data={"text": []})
-            for i, line in enumerate(lines):
-                df = pd.DataFrame(data={"text": [line.text]})
-                text_content = pd.concat([text_content, df], ignore_index=True)
-
-            strongs = s.find_all("strong")
-            strong_content = pd.DataFrame(data={"text": []})
-            for i, strong in enumerate(strongs):
-                if i > 0:
-                    df = pd.DataFrame(data={"text": [strong.text]})
-                    strong_content = pd.concat([strong_content, df], ignore_index=True)
-            # df has content
-            df = text_content[~text_content["text"].isin(strong_content["text"])].reset_index(
-                drop=True
-            )
-
-            return df
+
+            # loops through new articles and gets the necessary text, quotes and speakers
+            dfs_dict = get_text(new_articles_df)
 
 
         else:
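Notes on the new helpers, with minimal standalone sketches. Sample strings below are illustrative, not part of the commit.

get_xml wraps pandas.read_xml, which needs lxml available and flattens each item element under channel into one DataFrame row. A quick sketch against the same feed; guid is the column the commit relies on later:

import pandas as pd

URL = "https://www.formula1.com/content/fom-website/en/latest/all.xml"

# each <channel>/<item> element becomes a row; columns come from the item's child tags
xml = pd.read_xml(URL, xpath="channel/item")
print(xml["guid"].head())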
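extract_quote matches opening and closing marks from one character class, so curly and straight quotes pair interchangeably, and the split() check then drops short scare quotes. Note the committed version names its parameter string, which shadows the string module imported above; a different name avoids that. A standalone version:

import re

def extract_quote(text):
    # pull quoted spans, then keep only quotes longer than three words
    results = re.findall(r'[“"](.*?)[”"]', text)
    return [r for r in results if len(r.split()) > 3]

sample = 'Hamilton said: “We will come back stronger next year.” He called it "tough".'
print(extract_quote(sample))  # ['We will come back stronger next year.']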
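get_names reloads the flair NER tagger on every call; SequenceTagger.load downloads and deserializes the model, so hoisting it to module level (as the commit already does for the QA pipeline) would likely save considerable time per article. The PER filter in isolation:

from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load("ner")  # downloads the model on first use

sentence = Sentence("Lewis Hamilton praised Toto Wolff after the Silverstone race.")
tagger.predict(sentence)

# keep person entities only; a set collapses repeated mentions
names = list({label.data_point.text
              for label in sentence.get_labels("ner")
              if label.value == "PER"})
print(names)  # e.g. ['Lewis Hamilton', 'Toto Wolff']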
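load_speaker_model builds a question-answering pipeline, but nothing in this commit calls it yet, and microsoft/deberta-v2-large is a base checkpoint rather than one fine-tuned for QA, so answers may be weak until a QA checkpoint is swapped in. Presumably each extracted quote becomes a question over the article context; the "Who said" phrasing below is a guess, not from the diff:

from transformers import pipeline

# the commit loads microsoft/deberta-v2-large; any extractive-QA checkpoint fits here
question_answerer = pipeline("question-answering", model="microsoft/deberta-v2-large")

context = "We will come back stronger, Hamilton told reporters after the race."
quote = "We will come back stronger"

# hypothetical usage: ask who said each quote, read back the answer span and confidence
result = question_answerer(question=f'Who said "{quote}"?', context=context)
print(result["answer"], result["score"])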
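remove_punctuations filters characters one by one and then trims both ends; str.translate does the deletion in a single pass and .strip() covers the lstrip/rstrip pair. An equivalent form (curly quotes are not in string.punctuation, so they survive either version):

import string

def remove_punctuations(text):
    # delete all ASCII punctuation, then trim surrounding whitespace
    return text.translate(str.maketrans("", "", string.punctuation)).strip()

print(remove_punctuations(' "We will come back stronger," he said. '))
# We will come back stronger he said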
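Two details in check_updates are worth flagging. DataFrame.equals returns a plain Python bool, and ~ on a bool does integer bit-flipping (~True is -2, ~False is -1, both truthy), so the branch always fires; not is the safe negation. Also, latest_xml = get_xml() omits the url argument that get_xml requires, presumably URL. The anti-join on guid itself is sound:

import pandas as pd

previous_xml = pd.DataFrame({"guid": ["a", "b"]})
latest_xml = pd.DataFrame({"guid": ["a", "b", "c"]})

# DataFrame.equals returns a bool, so negate with `not`, never `~`
if not previous_xml.equals(latest_xml):
    # keep rows whose guid is absent from the previous pull
    new_articles_df = latest_xml[~latest_xml["guid"].isin(previous_xml["guid"])]
    print(new_articles_df)  # the row with guid 'c'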