Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -220,20 +220,35 @@ def summ_inference_tokenize(input_: list, n_tokens: int):
|
|
| 220 |
tokenized_data = summ_tokenizer(text=input_, max_length=SUMM_TARGET_N_TOKENS, truncation=True, padding="max_length", return_tensors="tf")
|
| 221 |
return summ_tokenizer, tokenized_data
|
| 222 |
|
| 223 |
-
def
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
pred = summ_model.generate(**tokenized_data, max_new_tokens=SUMM_TARGET_N_TOKENS)
|
| 228 |
-
result = inference_tokenizer.decode(pred
|
| 229 |
-
result = re.sub("<.*?>", "", result).strip()
|
| 230 |
return result
|
| 231 |
############ SUMMARIZATION MODEL & VARS INITIALIZATION END ####################
|
| 232 |
|
| 233 |
############## ENTRY POINT START #######################
|
| 234 |
def main():
|
| 235 |
st.markdown('''<h3>News Summarizer and NER</h3>
|
| 236 |
-
<p><a href="https://huggingface.co/spaces/ksvmuralidhar/news_summarizer_ner/blob/main/README.md#new-summarization-and-ner" target="_blank">README</a
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
''', unsafe_allow_html=True)
|
| 238 |
input_type = st.radio('Select an option:', ['Paste news URL', 'Paste news text'],
|
| 239 |
horizontal=True)
|
|
|
|
| 220 |
tokenized_data = summ_tokenizer(text=input_, max_length=SUMM_TARGET_N_TOKENS, truncation=True, padding="max_length", return_tensors="tf")
|
| 221 |
return summ_tokenizer, tokenized_data
|
| 222 |
|
| 223 |
+
def clean_summary(summary: str) -> str:
    """Post-process a decoded model summary into a clean, complete-sentence string.

    Steps:
      1. Strip surrounding whitespace.
      2. If the summary does not end with a period, drop the trailing
         (presumably truncated) sentence fragment and re-terminate with ".".
      3. Remove a single leading "-" artifact sometimes emitted by the model.
      4. Collapse to "" when what remains is too short (<= 5 chars) to be a
         meaningful summary.

    Args:
        summary: Raw text decoded from the summarization model.

    Returns:
        The cleaned summary, or "" when nothing usable remains.
    """
    summary = summary.strip()
    # Guard the [-1] index: an empty decoded summary would raise IndexError here.
    if summary and summary[-1] != '.':
        # Last sentence was cut off mid-generation; keep only complete sentences.
        sents = summary.split(". ")
        summary = ". ".join(sents[:-1])
        summary += "."
    # Strip a leading hyphen artifact, then re-trim.
    summary = re.sub(r'^-', "", summary)
    summary = summary.strip()
    if len(summary) <= 5:
        # Too short to be a real summary (e.g. just "." after truncation).
        summary = ""
    return summary
|
| 234 |
+
|
| 235 |
+
def summ_inference(txts: list) -> list:
    """Generate a cleaned summary for each raw article text in a batch.

    Args:
        txts: List of raw article texts. (Original annotation said ``str``,
            but the body maps and zips over it element-wise — it is a list.)

    Returns:
        List of cleaned summary strings, aligned with ``txts``; an empty
        input text yields an empty summary string.
    """
    # Apply the shared preprocessing step to every article.
    txts = [*map(summ_preprocess, txts)]
    inference_tokenizer, tokenized_data = summ_inference_tokenize(input_=txts, n_tokens=SUMM_INPUT_N_TOKENS)
    pred = summ_model.generate(**tokenized_data, max_new_tokens=SUMM_TARGET_N_TOKENS)
    # Skip decoding for empty inputs; otherwise decode (dropping special
    # tokens) and normalize via clean_summary.
    result = ["" if t == "" else clean_summary(inference_tokenizer.decode(p, skip_special_tokens=True)) for t, p in zip(txts, pred)]
    return result
|
| 241 |
############ SUMMARIZATION MODEL & VARS INITIALIZATION END ####################
|
| 242 |
|
| 243 |
############## ENTRY POINT START #######################
|
| 244 |
def main():
|
| 245 |
st.markdown('''<h3>News Summarizer and NER</h3>
|
| 246 |
+
<p><a href="https://huggingface.co/spaces/ksvmuralidhar/news_summarizer_ner/blob/main/README.md#new-summarization-and-ner" target="_blank">README</a>
|
| 247 |
+
<br>
|
| 248 |
+
The app works best in summarizing <a href="https://edition.cnn.com/">CNN</a> and <a href="https://www.dailymail.co.uk/home/index.html">Daily Mail</a> news articles,
|
| 249 |
+
as the BART model is fine-tuned on them.
|
| 250 |
+
</p>
|
| 251 |
+
|
| 252 |
''', unsafe_allow_html=True)
|
| 253 |
input_type = st.radio('Select an option:', ['Paste news URL', 'Paste news text'],
|
| 254 |
horizontal=True)
|