Spaces:
Running
Running
added initial summarization
Browse files- app.py +50 -3
- poetry.lock +4 -4
- pyproject.toml +1 -0
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
from collections import Counter
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
from datasets import load_dataset
|
| 4 |
from nltk.util import ngrams
|
|
@@ -8,6 +9,10 @@ import plotly.graph_objects as go
|
|
| 8 |
from plotly.subplots import make_subplots
|
| 9 |
from matplotlib import pyplot as plt
|
| 10 |
from wordcloud import WordCloud
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def load_transform_dataset():
|
|
@@ -25,6 +30,12 @@ def load_transform_dataset():
|
|
| 25 |
+ (0.5 * (len(x.split()) / len(x.split("."))))
|
| 26 |
- 21.43
|
| 27 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
# Sort the dataframe by date because Plotly doesn't do any of this automatically
|
| 29 |
_df = _df.sort_values(by="date")
|
| 30 |
_written = _df[_df["categories"] == "Written"]
|
|
@@ -64,6 +75,8 @@ def plotly_ngrams(n_grams, potus, _df):
|
|
| 64 |
trigrams = [" ".join(trigram) for trigram in trigrams]
|
| 65 |
# create a dataframe from the trigrams and counts
|
| 66 |
trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
|
|
|
|
|
|
|
| 67 |
fig4 = px.bar(
|
| 68 |
trigrams_df,
|
| 69 |
x="counts",
|
|
@@ -124,9 +137,34 @@ def plt_wordcloud(president, _df):
|
|
| 124 |
return fig6
|
| 125 |
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
# Create a Gradio interface with blocks
|
| 128 |
with gr.Blocks() as demo:
|
| 129 |
df, written, spoken = load_transform_dataset()
|
|
|
|
|
|
|
|
|
|
| 130 |
# Build out the top level static charts and content
|
| 131 |
gr.Markdown(
|
| 132 |
"""
|
|
@@ -208,6 +246,16 @@ with gr.Blocks() as demo:
|
|
| 208 |
The drop off is quite noticeable, don't you think? ;)
|
| 209 |
"""
|
| 210 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
gr.Markdown(
|
| 212 |
"""
|
| 213 |
## Dive Deeper on Each President
|
|
@@ -227,16 +275,15 @@ with gr.Blocks() as demo:
|
|
| 227 |
presidents = df["potus"].unique()
|
| 228 |
presidents = presidents.tolist()
|
| 229 |
presidents.append("All")
|
|
|
|
| 230 |
# create a dropdown to select a president
|
| 231 |
president = gr.Dropdown(label="Select a President", choices=presidents, value="All")
|
|
|
|
| 232 |
# create a slider for number of word grams
|
| 233 |
grams = gr.Slider(
|
| 234 |
minimum=1, maximum=4, step=1, label="N-grams", interactive=True, value=1
|
| 235 |
)
|
| 236 |
|
| 237 |
-
# store the dataframe in a state object before passing to plots
|
| 238 |
-
df_state = gr.State(df)
|
| 239 |
-
|
| 240 |
# show a bar chart of the top n-grams for a selected president
|
| 241 |
gr.Plot(plotly_ngrams, inputs=[grams, president, df_state])
|
| 242 |
|
|
|
|
| 1 |
from collections import Counter
|
| 2 |
+
import math
|
| 3 |
import gradio as gr
|
| 4 |
from datasets import load_dataset
|
| 5 |
from nltk.util import ngrams
|
|
|
|
| 9 |
from plotly.subplots import make_subplots
|
| 10 |
from matplotlib import pyplot as plt
|
| 11 |
from wordcloud import WordCloud
|
| 12 |
+
from huggingface_hub import InferenceClient
|
| 13 |
+
import matplotlib
|
| 14 |
+
|
| 15 |
+
matplotlib.use("agg")
|
| 16 |
|
| 17 |
|
| 18 |
def load_transform_dataset():
|
|
|
|
| 30 |
+ (0.5 * (len(x.split()) / len(x.split("."))))
|
| 31 |
- 21.43
|
| 32 |
)
|
| 33 |
+
# create a column that is the year the speach was given from the date column
|
| 34 |
+
_df["year"] = _df["date"].dt.year
|
| 35 |
+
# create a column that is a concatenation of the president's name, year, and category
|
| 36 |
+
_df["speech_key"] = (
|
| 37 |
+
_df["potus"] + " - " + _df["year"].astype(str) + " (" + _df["categories"] + ")"
|
| 38 |
+
)
|
| 39 |
# Sort the dataframe by date because Plotly doesn't do any of this automatically
|
| 40 |
_df = _df.sort_values(by="date")
|
| 41 |
_written = _df[_df["categories"] == "Written"]
|
|
|
|
| 75 |
trigrams = [" ".join(trigram) for trigram in trigrams]
|
| 76 |
# create a dataframe from the trigrams and counts
|
| 77 |
trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
|
| 78 |
+
if potus == "All":
|
| 79 |
+
potus = "All Presidents"
|
| 80 |
fig4 = px.bar(
|
| 81 |
trigrams_df,
|
| 82 |
x="counts",
|
|
|
|
| 137 |
return fig6
|
| 138 |
|
| 139 |
|
| 140 |
+
def summarization(speech_key, _df):
    """Summarize a speech with facebook/bart-large-cnn via the HF Inference API.

    The speech text is split into ``chunk_len``-character chunks (the model
    cannot take the full speech at once); each chunk is summarized
    independently and the chunk summaries are joined with blank lines.

    Args:
        speech_key: Value from the ``speech_key`` column identifying the speech
            (e.g. "Abraham Lincoln - 1863 (Spoken)").
        _df: DataFrame with ``speech_key`` and ``speech_html`` columns.

    Returns:
        str: Concatenated chunk summaries separated by blank lines. Chunks
        whose API call fails are skipped (the error is printed).
    """
    client = InferenceClient(model="facebook/bart-large-cnn")
    chunk_len = 4000
    speech = _df[_df["speech_key"] == speech_key]["speech_html"].values[0]
    # math.ceil already returns an int; count covers the final partial chunk.
    n_chunks = math.ceil(len(speech) / chunk_len)
    response = []
    for chunk in range(1, n_chunks + 1):
        # Single slice is enough: Python slicing clamps past-the-end indices,
        # so the last (possibly short) chunk needs no special case.
        # (Original compared against a hard-coded 4000 instead of chunk_len.)
        chunk_text = speech[(chunk - 1) * chunk_len : chunk * chunk_len]
        try:
            summarization_chunk = client.summarization(
                chunk_text, parameters={"truncation": "do_not_truncate"}
            )
        except Exception as e:
            # Skip a failed chunk rather than appending a stale/unbound
            # result (the original appended even after the call raised,
            # which is an UnboundLocalError on a first-chunk failure).
            print(e)
            continue
        response.append(summarization_chunk.summary_text)
    return "\n\n".join(response)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
# Create a Gradio interface with blocks
|
| 163 |
with gr.Blocks() as demo:
|
| 164 |
df, written, spoken = load_transform_dataset()
|
| 165 |
+
# store the dataframe in a state object before passing to component functions
|
| 166 |
+
df_state = gr.State(df)
|
| 167 |
+
|
| 168 |
# Build out the top level static charts and content
|
| 169 |
gr.Markdown(
|
| 170 |
"""
|
|
|
|
| 246 |
The drop off is quite noticeable, don't you think? ;)
|
| 247 |
"""
|
| 248 |
)
|
| 249 |
+
gr.Markdown("## Summarize a Speech")
|
| 250 |
+
speeches = df["speech_key"].unique()
|
| 251 |
+
speeches = speeches.tolist()
|
| 252 |
+
speech = gr.Dropdown(label="Select a Speech", choices=speeches)
|
| 253 |
+
# create a dropdown to select a speech from a president
|
| 254 |
+
run_summarization = gr.Button(value="Summarize")
|
| 255 |
+
fin_speech = gr.Textbox(label="Summarized Speech", type="text", lines=10)
|
| 256 |
+
run_summarization.click(
|
| 257 |
+
summarization, inputs=[speech, df_state], outputs=[fin_speech]
|
| 258 |
+
)
|
| 259 |
gr.Markdown(
|
| 260 |
"""
|
| 261 |
## Dive Deeper on Each President
|
|
|
|
| 275 |
presidents = df["potus"].unique()
|
| 276 |
presidents = presidents.tolist()
|
| 277 |
presidents.append("All")
|
| 278 |
+
|
| 279 |
# create a dropdown to select a president
|
| 280 |
president = gr.Dropdown(label="Select a President", choices=presidents, value="All")
|
| 281 |
+
# create a text area to display the summarized speech
|
| 282 |
# create a slider for number of word grams
|
| 283 |
grams = gr.Slider(
|
| 284 |
minimum=1, maximum=4, step=1, label="N-grams", interactive=True, value=1
|
| 285 |
)
|
| 286 |
|
|
|
|
|
|
|
|
|
|
| 287 |
# show a bar chart of the top n-grams for a selected president
|
| 288 |
gr.Plot(plotly_ngrams, inputs=[grams, president, df_state])
|
| 289 |
|
poetry.lock
CHANGED
|
@@ -851,13 +851,13 @@ zstd = ["zstandard (>=0.18.0)"]
|
|
| 851 |
|
| 852 |
[[package]]
|
| 853 |
name = "huggingface-hub"
|
| 854 |
-
version = "0.24.
|
| 855 |
description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
|
| 856 |
optional = false
|
| 857 |
python-versions = ">=3.8.0"
|
| 858 |
files = [
|
| 859 |
-
{file = "huggingface_hub-0.24.
|
| 860 |
-
{file = "huggingface_hub-0.24.
|
| 861 |
]
|
| 862 |
|
| 863 |
[package.dependencies]
|
|
@@ -2738,4 +2738,4 @@ multidict = ">=4.0"
|
|
| 2738 |
[metadata]
|
| 2739 |
lock-version = "2.0"
|
| 2740 |
python-versions = "^3.12"
|
| 2741 |
-
content-hash = "
|
|
|
|
| 851 |
|
| 852 |
[[package]]
|
| 853 |
name = "huggingface-hub"
|
| 854 |
+
version = "0.24.7"
|
| 855 |
description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
|
| 856 |
optional = false
|
| 857 |
python-versions = ">=3.8.0"
|
| 858 |
files = [
|
| 859 |
+
{file = "huggingface_hub-0.24.7-py3-none-any.whl", hash = "sha256:a212c555324c8a7b1ffdd07266bb7e7d69ca71aa238d27b7842d65e9a26ac3e5"},
|
| 860 |
+
{file = "huggingface_hub-0.24.7.tar.gz", hash = "sha256:0ad8fb756e2831da0ac0491175b960f341fe06ebcf80ed6f8728313f95fc0207"},
|
| 861 |
]
|
| 862 |
|
| 863 |
[package.dependencies]
|
|
|
|
| 2738 |
[metadata]
|
| 2739 |
lock-version = "2.0"
|
| 2740 |
python-versions = "^3.12"
|
| 2741 |
+
content-hash = "6140858cd5057fd978c2f09d1ec90bfda474a4ff6cb96ae17e67d134dae2bc4d"
|
pyproject.toml
CHANGED
|
@@ -14,6 +14,7 @@ nltk = "^3.9.1"
|
|
| 14 |
plotly = "^5.23.0"
|
| 15 |
matplotlib = "^3.9.2"
|
| 16 |
wordcloud = "^1.9.3"
|
|
|
|
| 17 |
|
| 18 |
[build-system]
|
| 19 |
requires = ["poetry-core"]
|
|
|
|
| 14 |
plotly = "^5.23.0"
|
| 15 |
matplotlib = "^3.9.2"
|
| 16 |
wordcloud = "^1.9.3"
|
| 17 |
+
huggingface-hub = "^0.24.7"
|
| 18 |
|
| 19 |
[build-system]
|
| 20 |
requires = ["poetry-core"]
|