Spaces:
Running
Running
added initial summarization
Browse files- app.py +50 -3
- poetry.lock +4 -4
- pyproject.toml +1 -0
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
from collections import Counter
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
from datasets import load_dataset
|
| 4 |
from nltk.util import ngrams
|
|
@@ -8,6 +9,10 @@ import plotly.graph_objects as go
|
|
| 8 |
from plotly.subplots import make_subplots
|
| 9 |
from matplotlib import pyplot as plt
|
| 10 |
from wordcloud import WordCloud
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def load_transform_dataset():
|
|
@@ -25,6 +30,12 @@ def load_transform_dataset():
|
|
| 25 |
+ (0.5 * (len(x.split()) / len(x.split("."))))
|
| 26 |
- 21.43
|
| 27 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
# Sort the dataframe by date because Plotly doesn't do any of this automatically
|
| 29 |
_df = _df.sort_values(by="date")
|
| 30 |
_written = _df[_df["categories"] == "Written"]
|
|
@@ -64,6 +75,8 @@ def plotly_ngrams(n_grams, potus, _df):
|
|
| 64 |
trigrams = [" ".join(trigram) for trigram in trigrams]
|
| 65 |
# create a dataframe from the trigrams and counts
|
| 66 |
trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
|
|
|
|
|
|
|
| 67 |
fig4 = px.bar(
|
| 68 |
trigrams_df,
|
| 69 |
x="counts",
|
|
@@ -124,9 +137,34 @@ def plt_wordcloud(president, _df):
|
|
| 124 |
return fig6
|
| 125 |
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
# Create a Gradio interface with blocks
|
| 128 |
with gr.Blocks() as demo:
|
| 129 |
df, written, spoken = load_transform_dataset()
|
|
|
|
|
|
|
|
|
|
| 130 |
# Build out the top level static charts and content
|
| 131 |
gr.Markdown(
|
| 132 |
"""
|
|
@@ -208,6 +246,16 @@ with gr.Blocks() as demo:
|
|
| 208 |
The drop off is quite noticeable, don't you think? ;)
|
| 209 |
"""
|
| 210 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
gr.Markdown(
|
| 212 |
"""
|
| 213 |
## Dive Deeper on Each President
|
|
@@ -227,16 +275,15 @@ with gr.Blocks() as demo:
|
|
| 227 |
presidents = df["potus"].unique()
|
| 228 |
presidents = presidents.tolist()
|
| 229 |
presidents.append("All")
|
|
|
|
| 230 |
# create a dropdown to select a president
|
| 231 |
president = gr.Dropdown(label="Select a President", choices=presidents, value="All")
|
|
|
|
| 232 |
# create a slider for number of word grams
|
| 233 |
grams = gr.Slider(
|
| 234 |
minimum=1, maximum=4, step=1, label="N-grams", interactive=True, value=1
|
| 235 |
)
|
| 236 |
|
| 237 |
-
# store the dataframe in a state object before passing to plots
|
| 238 |
-
df_state = gr.State(df)
|
| 239 |
-
|
| 240 |
# show a bar chart of the top n-grams for a selected president
|
| 241 |
gr.Plot(plotly_ngrams, inputs=[grams, president, df_state])
|
| 242 |
|
|
|
|
| 1 |
from collections import Counter
|
| 2 |
+
import math
|
| 3 |
import gradio as gr
|
| 4 |
from datasets import load_dataset
|
| 5 |
from nltk.util import ngrams
|
|
|
|
| 9 |
from plotly.subplots import make_subplots
|
| 10 |
from matplotlib import pyplot as plt
|
| 11 |
from wordcloud import WordCloud
|
| 12 |
+
from huggingface_hub import InferenceClient
|
| 13 |
+
import matplotlib
|
| 14 |
+
|
| 15 |
+
matplotlib.use("agg")
|
| 16 |
|
| 17 |
|
| 18 |
def load_transform_dataset():
|
|
|
|
| 30 |
+ (0.5 * (len(x.split()) / len(x.split("."))))
|
| 31 |
- 21.43
|
| 32 |
)
|
| 33 |
+
# create a column that is the year the speach was given from the date column
|
| 34 |
+
_df["year"] = _df["date"].dt.year
|
| 35 |
+
# create a column that is a concatenation of the president's name, year, and category
|
| 36 |
+
_df["speech_key"] = (
|
| 37 |
+
_df["potus"] + " - " + _df["year"].astype(str) + " (" + _df["categories"] + ")"
|
| 38 |
+
)
|
| 39 |
# Sort the dataframe by date because Plotly doesn't do any of this automatically
|
| 40 |
_df = _df.sort_values(by="date")
|
| 41 |
_written = _df[_df["categories"] == "Written"]
|
|
|
|
| 75 |
trigrams = [" ".join(trigram) for trigram in trigrams]
|
| 76 |
# create a dataframe from the trigrams and counts
|
| 77 |
trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
|
| 78 |
+
if potus == "All":
|
| 79 |
+
potus = "All Presidents"
|
| 80 |
fig4 = px.bar(
|
| 81 |
trigrams_df,
|
| 82 |
x="counts",
|
|
|
|
| 137 |
return fig6
|
| 138 |
|
| 139 |
|
| 140 |
+
def summarization(speech_key, _df):
    """Summarize a speech with facebook/bart-large-cnn via the HF Inference API.

    The speech text is split into ``chunk_len``-character chunks (the model
    cannot take the full speech at once); each chunk is summarized
    independently and the chunk summaries are joined with blank lines.

    Args:
        speech_key: Value from the ``speech_key`` column identifying the speech
            (e.g. "Abraham Lincoln - 1863 (Spoken)").
        _df: DataFrame with ``speech_key`` and ``speech_html`` columns.

    Returns:
        str: Concatenated chunk summaries separated by blank lines. Chunks
        whose API call fails are skipped (the error is printed).
    """
    client = InferenceClient(model="facebook/bart-large-cnn")
    chunk_len = 4000
    speech = _df[_df["speech_key"] == speech_key]["speech_html"].values[0]
    # math.ceil already returns an int; count covers the final partial chunk.
    n_chunks = math.ceil(len(speech) / chunk_len)
    response = []
    for chunk in range(1, n_chunks + 1):
        # Single slice is enough: Python slicing clamps past-the-end indices,
        # so the last (possibly short) chunk needs no special case.
        # (Original compared against a hard-coded 4000 instead of chunk_len.)
        chunk_text = speech[(chunk - 1) * chunk_len : chunk * chunk_len]
        try:
            summarization_chunk = client.summarization(
                chunk_text, parameters={"truncation": "do_not_truncate"}
            )
        except Exception as e:
            # Skip a failed chunk rather than appending a stale/unbound
            # result (the original appended even after the call raised,
            # which is an UnboundLocalError on a first-chunk failure).
            print(e)
            continue
        response.append(summarization_chunk.summary_text)
    return "\n\n".join(response)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
# Create a Gradio interface with blocks
|
| 163 |
with gr.Blocks() as demo:
|
| 164 |
df, written, spoken = load_transform_dataset()
|
| 165 |
+
# store the dataframe in a state object before passing to component functions
|
| 166 |
+
df_state = gr.State(df)
|
| 167 |
+
|
| 168 |
# Build out the top level static charts and content
|
| 169 |
gr.Markdown(
|
| 170 |
"""
|
|
|
|
| 246 |
The drop off is quite noticeable, don't you think? ;)
|
| 247 |
"""
|
| 248 |
)
|
| 249 |
+
gr.Markdown("## Summarize a Speech")
|
| 250 |
+
speeches = df["speech_key"].unique()
|
| 251 |
+
speeches = speeches.tolist()
|
| 252 |
+
speech = gr.Dropdown(label="Select a Speech", choices=speeches)
|
| 253 |
+
# create a dropdown to select a speech from a president
|
| 254 |
+
run_summarization = gr.Button(value="Summarize")
|
| 255 |
+
fin_speech = gr.Textbox(label="Summarized Speech", type="text", lines=10)
|
| 256 |
+
run_summarization.click(
|
| 257 |
+
summarization, inputs=[speech, df_state], outputs=[fin_speech]
|
| 258 |
+
)
|
| 259 |
gr.Markdown(
|
| 260 |
"""
|
| 261 |
## Dive Deeper on Each President
|
|
|
|
| 275 |
presidents = df["potus"].unique()
|
| 276 |
presidents = presidents.tolist()
|
| 277 |
presidents.append("All")
|
| 278 |
+
|
| 279 |
# create a dropdown to select a president
|
| 280 |
president = gr.Dropdown(label="Select a President", choices=presidents, value="All")
|
| 281 |
+
# create a text area to display the summarized speech
|
| 282 |
# create a slider for number of word grams
|
| 283 |
grams = gr.Slider(
|
| 284 |
minimum=1, maximum=4, step=1, label="N-grams", interactive=True, value=1
|
| 285 |
)
|
| 286 |
|
|
|
|
|
|
|
|
|
|
| 287 |
# show a bar chart of the top n-grams for a selected president
|
| 288 |
gr.Plot(plotly_ngrams, inputs=[grams, president, df_state])
|
| 289 |
|
poetry.lock
CHANGED
|
@@ -851,13 +851,13 @@ zstd = ["zstandard (>=0.18.0)"]
|
|
| 851 |
|
| 852 |
[[package]]
|
| 853 |
name = "huggingface-hub"
|
| 854 |
-
version = "0.24.
|
| 855 |
description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
|
| 856 |
optional = false
|
| 857 |
python-versions = ">=3.8.0"
|
| 858 |
files = [
|
| 859 |
-
{file = "huggingface_hub-0.24.
|
| 860 |
-
{file = "huggingface_hub-0.24.
|
| 861 |
]
|
| 862 |
|
| 863 |
[package.dependencies]
|
|
@@ -2738,4 +2738,4 @@ multidict = ">=4.0"
|
|
| 2738 |
[metadata]
|
| 2739 |
lock-version = "2.0"
|
| 2740 |
python-versions = "^3.12"
|
| 2741 |
-
content-hash = "
|
|
|
|
| 851 |
|
| 852 |
[[package]]
|
| 853 |
name = "huggingface-hub"
|
| 854 |
+
version = "0.24.7"
|
| 855 |
description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
|
| 856 |
optional = false
|
| 857 |
python-versions = ">=3.8.0"
|
| 858 |
files = [
|
| 859 |
+
{file = "huggingface_hub-0.24.7-py3-none-any.whl", hash = "sha256:a212c555324c8a7b1ffdd07266bb7e7d69ca71aa238d27b7842d65e9a26ac3e5"},
|
| 860 |
+
{file = "huggingface_hub-0.24.7.tar.gz", hash = "sha256:0ad8fb756e2831da0ac0491175b960f341fe06ebcf80ed6f8728313f95fc0207"},
|
| 861 |
]
|
| 862 |
|
| 863 |
[package.dependencies]
|
|
|
|
| 2738 |
[metadata]
|
| 2739 |
lock-version = "2.0"
|
| 2740 |
python-versions = "^3.12"
|
| 2741 |
+
content-hash = "6140858cd5057fd978c2f09d1ec90bfda474a4ff6cb96ae17e67d134dae2bc4d"
|
pyproject.toml
CHANGED
|
@@ -14,6 +14,7 @@ nltk = "^3.9.1"
|
|
| 14 |
plotly = "^5.23.0"
|
| 15 |
matplotlib = "^3.9.2"
|
| 16 |
wordcloud = "^1.9.3"
|
|
|
|
| 17 |
|
| 18 |
[build-system]
|
| 19 |
requires = ["poetry-core"]
|
|
|
|
| 14 |
plotly = "^5.23.0"
|
| 15 |
matplotlib = "^3.9.2"
|
| 16 |
wordcloud = "^1.9.3"
|
| 17 |
+
huggingface-hub = "^0.24.7"
|
| 18 |
|
| 19 |
[build-system]
|
| 20 |
requires = ["poetry-core"]
|