Spaces:

awinml
/

2-qa-earnings-sentencewise

Build error

App Files Community

awinml committed on Mar 19, 2023

Commit

9c49e99

1 Parent(s): e375940

Upload 2 files

Browse files

Files changed (2) hide show

app.py +64 -22
utils.py +126 -5

app.py CHANGED Viewed

@@ -1,17 +1,22 @@
 import openai
 import streamlit_scrollable_textbox as stx
-import pinecone
 import streamlit as st
 from utils import (
     create_dense_embeddings,
     create_sparse_embeddings,
     format_query,
-    generate_prompt,
     get_data,
     get_flan_t5_model,
     get_mpnet_embedding_model,
     get_sgpt_embedding_model,
     get_splade_sparse_embedding_model,
     get_t5_model,
     gpt_model,
@@ -24,7 +29,7 @@ from utils import (
     text_lookup,
 )
-st.set_page_config(layout="wide")
 st.title("Abstractive Question Answering")
@@ -36,21 +41,31 @@ st.write(
 col1, col2 = st.columns([3, 3], gap="medium")
 with col1:
     st.subheader("Question")
     query_text = st.text_input(
         "Input Query",
-        value="What was discussed regarding Wearables revenue performance?",
     )
 with col1:
     years_choice = ["2020", "2019", "2018", "2017", "2016", "All"]
 with col1:
-    year = st.selectbox("Year", years_choice)
 with col1:
-    quarter = st.selectbox("Quarter", ["Q1", "Q2", "Q3", "Q4", "All"])
 with col1:
     participant_type = st.selectbox("Speaker", ["Company Speaker", "Analyst"])
@@ -69,7 +84,7 @@ ticker_choice = [
 ]
 with col1:
-    ticker = st.selectbox("Company", ticker_choice)
 with st.sidebar:
     st.subheader("Select Options:")
@@ -189,9 +204,8 @@ else:
     context_list = format_query(query_results)
-prompt = generate_prompt(query_text, context_list)
 if decoder_model == "GPT3 - (text-davinci-003)":
     with col2:
         with st.form("my_form"):
             edited_prompt = st.text_area(
@@ -208,29 +222,57 @@ if decoder_model == "GPT3 - (text-davinci-003)":
                 api_key = save_key(openai_key)
                 openai.api_key = api_key
                 generated_text = gpt_model(edited_prompt)
-                with col2:
-                    st.subheader("Answer:")
-                    st.write(generated_text)
 elif decoder_model == "T5":
     t5_pipeline = get_t5_model()
     output_text = []
-    for context_text in context_list:
-        output_text.append(t5_pipeline(context_text)[0]["summary_text"])
     with col2:
-        st.subheader("Answer:")
-        for text in output_text:
-            st.markdown(f"- {text}")
 elif decoder_model == "FLAN-T5":
     flan_t5_pipeline = get_flan_t5_model()
     output_text = []
-    for context_text in context_list:
-        output_text.append(flan_t5_pipeline(context_text)[0]["summary_text"])
     with col2:
-        st.subheader("Answer:")
-        for text in output_text:
-            st.markdown(f"- {text}")
 with col1:
     with st.expander("See Retrieved Text"):

 import openai
+import pinecone
 import streamlit_scrollable_textbox as stx
 import streamlit as st
 from utils import (
+    clean_entities,
     create_dense_embeddings,
     create_sparse_embeddings,
+    extract_entities,
     format_query,
+    generate_flant5_prompt,
+    generate_gpt_prompt,
+    get_context_list_prompt,
     get_data,
     get_flan_t5_model,
     get_mpnet_embedding_model,
     get_sgpt_embedding_model,
+    get_spacy_model,
     get_splade_sparse_embedding_model,
     get_t5_model,
     gpt_model,
     text_lookup,
 )
+st.set_page_config(layout="wide")  # isort: skip
 st.title("Abstractive Question Answering")
 col1, col2 = st.columns([3, 3], gap="medium")
+spacy_model = get_spacy_model()
 with col1:
     st.subheader("Question")
     query_text = st.text_input(
         "Input Query",
+        value="What was discussed regarding Wearables revenue performance in Q1 2020?",
     )
+company_ent, quarter_ent, year_ent = extract_entities(query_text, spacy_model)
+ticker_index, quarter_index, year_index = clean_entities(
+    company_ent, quarter_ent, year_ent
+)
 with col1:
     years_choice = ["2020", "2019", "2018", "2017", "2016", "All"]
 with col1:
+    year = st.selectbox("Year", years_choice, index=year_index)
 with col1:
+    quarter = st.selectbox(
+        "Quarter", ["Q1", "Q2", "Q3", "Q4", "All"], index=quarter_index
+    )
 with col1:
     participant_type = st.selectbox("Speaker", ["Company Speaker", "Analyst"])
 ]
 with col1:
+    ticker = st.selectbox("Company", ticker_choice, ticker_index)
 with st.sidebar:
     st.subheader("Select Options:")
     context_list = format_query(query_results)
 if decoder_model == "GPT3 - (text-davinci-003)":
+    prompt = generate_gpt_prompt(query_text, context_list)
     with col2:
         with st.form("my_form"):
             edited_prompt = st.text_area(
                 api_key = save_key(openai_key)
                 openai.api_key = api_key
                 generated_text = gpt_model(edited_prompt)
+                st.subheader("Answer:")
+                st.write(generated_text)
 elif decoder_model == "T5":
+    prompt = generate_flant5_prompt(query_text, context_list)
     t5_pipeline = get_t5_model()
     output_text = []
     with col2:
+        with st.form("my_form"):
+            edited_prompt = st.text_area(
+                label="Model Prompt", value=prompt, height=270
+            )
+            context_list = get_context_list_prompt(edited_prompt)
+            submitted = st.form_submit_button("Submit")
+            if submitted:
+                for context_text in context_list:
+                    output_text.append(
+                        t5_pipeline(context_text)[0]["summary_text"]
+                    )
+                st.subheader("Answer:")
+                for text in output_text:
+                    st.markdown(f"- {text}")
 elif decoder_model == "FLAN-T5":
+    prompt = generate_flant5_prompt(query_text, context_list)
     flan_t5_pipeline = get_flan_t5_model()
     output_text = []
     with col2:
+        with st.form("my_form"):
+            edited_prompt = st.text_area(
+                label="Model Prompt", value=prompt, height=270
+            )
+            context_list = get_context_list_prompt(edited_prompt)
+            submitted = st.form_submit_button("Submit")
+            if submitted:
+                for context_text in context_list:
+                    output_text.append(
+                        flan_t5_pipeline(
+                            "Question:"
+                            + query_text
+                            + "\nContext:"
+                            + context_text
+                            + "\nAnswer?"
+                        )[0]["summary_text"]
+                    )
+                st.subheader("Answer:")
+                for text in output_text:
+                    if "(iii)" not in text:
+                        st.markdown(f"- {text}")
 with col1:
     with st.expander("See Retrieved Text"):

utils.py CHANGED Viewed

@@ -1,5 +1,9 @@
 import openai
 import pandas as pd
 import streamlit_scrollable_textbox as stx
 import torch
 from sentence_transformers import SentenceTransformer
@@ -11,7 +15,6 @@ from transformers import (
     pipeline,
 )
-import pinecone
 import streamlit as st
@@ -21,6 +24,14 @@ def get_data():
     return data
 # Initialize models from HuggingFace
@@ -33,8 +44,8 @@ def get_t5_model():
 def get_flan_t5_model():
     return pipeline(
         "summarization",
-        model="google/flan-t5-small",
-        tokenizer="google/flan-t5-small",
         max_length=512,
         # length_penalty = 0
     )
@@ -320,7 +331,7 @@ def text_lookup(data, sentence_ids):
     return context
-def generate_prompt(query_text, context_list):
     context = " ".join(context_list)
     prompt = f"""Answer the question in 6 long detailed points as accurately as possible using the provided context. Include as many key details as possible.
 Context: {context}
@@ -329,7 +340,7 @@ Answer:"""
     return prompt
-def generate_prompt_2(query_text, context_list):
     context = " ".join(context_list)
     prompt = f"""
     Context information is below:
@@ -342,6 +353,24 @@ def generate_prompt_2(query_text, context_list):
     return prompt
 def gpt_model(prompt):
     response = openai.Completion.create(
         model="text-davinci-003",
@@ -355,6 +384,98 @@ def gpt_model(prompt):
     return response.choices[0].text
 # Transcript Retrieval

+import re
 import openai
 import pandas as pd
+import pinecone
+import spacy
 import streamlit_scrollable_textbox as stx
 import torch
 from sentence_transformers import SentenceTransformer
     pipeline,
 )
 import streamlit as st
     return data
+# Initialize Spacy Model
+@st.experimental_singleton
+def get_spacy_model():
+    return spacy.load("en_core_web_sm")
 # Initialize models from HuggingFace
 def get_flan_t5_model():
     return pipeline(
         "summarization",
+        model="google/flan-t5-xl",
+        tokenizer="google/flan-t5-xl",
         max_length=512,
         # length_penalty = 0
     )
     return context
+def generate_gpt_prompt(query_text, context_list):
     context = " ".join(context_list)
     prompt = f"""Answer the question in 6 long detailed points as accurately as possible using the provided context. Include as many key details as possible.
 Context: {context}
     return prompt
+def generate_gpt_prompt_2(query_text, context_list):
     context = " ".join(context_list)
     prompt = f"""
     Context information is below:
     return prompt
+def generate_flant5_prompt(query_text, context_list):
+    context = " \n".join(context_list)
+    prompt = f"""Given the context information and prior knowledge, answer this question:
+{query_text}
+Context information is below:
+---------------------
+{context}
+---------------------"""
+    return prompt
+def get_context_list_prompt(prompt):
+    prompt_list = prompt.split("---------------------")
+    context = prompt_list[-2].strip()
+    context_list = context.split(" \n")
+    return context_list
 def gpt_model(prompt):
     response = openai.Completion.create(
         model="text-davinci-003",
     return response.choices[0].text
+# Entity Extraction
+def extract_quarter_year(string):
+    # Extract year from string
+    year_match = re.search(r"\d{4}", string)
+    if year_match:
+        year = year_match.group()
+    else:
+        return None, None
+    # Extract quarter from string
+    quarter_match = re.search(r"Q\d", string)
+    if quarter_match:
+        quarter = "Q" + quarter_match.group()[1]
+    else:
+        return None, None
+    return quarter, year
+def extract_entities(query, model):
+    doc = model(query)
+    entities = {ent.label_: ent.text for ent in doc.ents}
+    if "ORG" in entities.keys():
+        company = entities["ORG"].lower()
+        if "DATE" in entities.keys():
+            quarter, year = extract_quarter_year(entities["DATE"])
+            return company, quarter, year
+        else:
+            return company, None, None
+    else:
+        if "DATE" in entities.keys():
+            quarter, year = extract_quarter_year(entities["DATE"])
+            return None, quarter, year
+        else:
+            return None, None, None
+def clean_entities(company, quarter, year):
+    company_ticker_map = {
+        "apple": "AAPL",
+        "amd": "AMD",
+        "amazon": "AMZN",
+        "cisco": "CSCO",
+        "google": "GOOGL",
+        "microsoft": "MSFT",
+        "nvidia": "NVDA",
+        "asml": "ASML",
+        "intel": "INTC",
+        "micron": "MU",
+    }
+    ticker_choice = [
+        "AAPL",
+        "CSCO",
+        "MSFT",
+        "ASML",
+        "NVDA",
+        "GOOGL",
+        "MU",
+        "INTC",
+        "AMZN",
+        "AMD",
+    ]
+    year_choice = ["2020", "2019", "2018", "2017", "2016", "All"]
+    quarter_choice = ["Q1", "Q2", "Q3", "Q4", "All"]
+    if company is not None:
+        if company in company_ticker_map.keys():
+            ticker = company_ticker_map[company]
+            ticker_index = ticker_choice.index(ticker)
+        else:
+            ticker_index = 0
+    else:
+        ticker_index = 0
+    if quarter is not None:
+        if quarter in quarter_choice:
+            quarter_index = quarter_choice.index(quarter)
+        else:
+            quarter_index = len(quarter_choice) - 1
+    else:
+        quarter_index = len(quarter_choice) - 1
+    if year is not None:
+        if year in year_choice:
+            year_index = year_choice.index(year)
+        else:
+            year_index = len(year_choice) - 1
+    else:
+        year_index = len(year_choice) - 1
+    return ticker_index, quarter_index, year_index
 # Transcript Retrieval