Spaces:

pszemraj
/

summarize-long-text

Running on CPU Upgrade

App Files Files Community

Update README.md

by uragankatrrin - opened Oct 14, 2022

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+68

-154

Files changed (5) hide show

README.md +0 -1
app.py +41 -98
requirements.txt +2 -2
summarize.py +25 -39
utils.py +0 -14

README.md CHANGED Viewed

@@ -4,7 +4,6 @@ emoji: 📚
 colorFrom: red
 colorTo: purple
 sdk: gradio
-sdk_version: 3.32.0
 app_file: app.py
 pinned: true
 license: apache-2.0

 colorFrom: red
 colorTo: purple
 sdk: gradio
 app_file: app.py
 pinned: true
 license: apache-2.0

app.py CHANGED Viewed

@@ -1,7 +1,3 @@
-"""
-app.py - the main application file for the gradio app
-"""
-import gc
 import logging
 import random
 import re
@@ -10,7 +6,6 @@ from pathlib import Path
 import gradio as gr
 import nltk
-import torch
 from cleantext import clean
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
@@ -18,61 +13,22 @@ from utils import load_example_filenames, truncate_word_count
 _here = Path(__file__).parent
-nltk.download("stopwords", quiet=True)
 logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - [%(levelname)s] %(name)s: %(message)s"
 )
-MODEL_OPTIONS = [
-    "pszemraj/led-large-book-summary",
-    "pszemraj/led-base-book-summary",
-]
-def predict(
-    input_text: str,
-    model_name: str,
-    token_batch_length: int = 2048,
-    empty_cache: bool = True,
-    **settings,
-) -> list:
-    """
-    predict - helper fn to support multiple models for summarization at once
-    :param str input_text: the input text to summarize
-    :param str model_name: model name to use
-    :param int token_batch_length: the length of the token batches to use
-    :param bool empty_cache: whether to empty the cache before loading a new= model
-    :return: list of dicts with keys "summary" and "score"
-    """
-    if torch.cuda.is_available() and empty_cache:
-        torch.cuda.empty_cache()
-    model, tokenizer = load_model_and_tokenizer(model_name)
-    summaries = summarize_via_tokenbatches(
-        input_text,
-        model,
-        tokenizer,
-        batch_length=token_batch_length,
-        **settings,
-    )
-    del model
-    del tokenizer
-    gc.collect()
-    return summaries
 def proc_submission(
     input_text: str,
-    model_name: str,
-    num_beams: int,
-    token_batch_length: int,
-    length_penalty: float,
-    repetition_penalty: float,
-    no_repeat_ngram_size: int,
-    max_input_length: int = 2560,
 ):
     """
     proc_submission - a helper function for the gradio module to process submissions
@@ -85,14 +41,12 @@ def proc_submission(
         length_penalty (float): the length penalty to use
         repetition_penalty (float): the repetition penalty to use
         no_repeat_ngram_size (int): the no-repeat ngram size to use
-        max_input_length (int, optional): the maximum input length to use. Defaults to 2560.
     Returns:
         str in HTML format, string of the summary, str of score
     """
-    logger = logging.getLogger(__name__)
-    logger.info("Processing submission")
     settings = {
         "length_penalty": float(length_penalty),
         "repetition_penalty": float(repetition_penalty),
@@ -104,19 +58,14 @@ def proc_submission(
         "early_stopping": True,
         "do_sample": False,
     }
-    if "base" in model_name:
-        logger.info("Updating max_input_length to for base model")
-        max_input_length = 4096
-    logger.info(f"max_input_length: {max_input_length}")
     st = time.perf_counter()
     history = {}
     clean_text = clean(input_text, lower=False)
     processed = truncate_word_count(clean_text, max_input_length)
     if processed["was_truncated"]:
-        truncated_input = processed["truncated_text"]
         # create elaborate HTML warning
         input_wc = re.split(r"\s+", input_text)
         msg = f"""
@@ -128,7 +77,7 @@ def proc_submission(
         logging.warning(msg)
         history["WARNING"] = msg
     else:
-        truncated_input = input_text
         msg = None
     if len(input_text) < 50:
@@ -146,25 +95,24 @@ def proc_submission(
         return msg, "", []
-    _summaries = predict(
-        input_text=truncated_input,
-        model_name=model_name,
-        token_batch_length=token_batch_length,
         **settings,
     )
-    sum_text = [
-        f"\nBatch {i}:\n\t" + s["summary"][0] for i, s in enumerate(_summaries, start=1)
-    ]
     sum_scores = [
-        f"\n- Batch {i}:\n\t{round(s['summary_score'],4)}"
-        for i, s in enumerate(_summaries, start=1)
     ]
     sum_text_out = "\n".join(sum_text)
     history["Summary Scores"] = "<br><br>"
     scores_out = "\n".join(sum_scores)
     rt = round((time.perf_counter() - st) / 60, 2)
-    logger.info(f"Runtime: {rt} minutes")
     html = ""
     html += f"<p>Runtime: {rt} minutes on CPU</p>"
     if msg is not None:
@@ -221,38 +169,36 @@ def load_uploaded_file(file_obj):
 if __name__ == "__main__":
-    logger = logging.getLogger(__name__)
-    logger.info("Starting up app")
     name_to_path = load_example_filenames(_here / "examples")
     logging.info(f"Loaded {len(name_to_path)} examples")
-    demo = gr.Blocks(
-        title="Summarize Long-Form Text",
-    )
     _examples = list(name_to_path.keys())
     with demo:
         gr.Markdown("# Long-Form Summarization: LED & BookSum")
         gr.Markdown(
             "LED models ([model card](https://huggingface.co/pszemraj/led-large-book-summary)) fine-tuned to summarize long-form text. A [space with other models can be found here](https://huggingface.co/spaces/pszemraj/document-summarization)"
         )
         with gr.Column():
             gr.Markdown("## Load Inputs & Select Parameters")
             gr.Markdown(
                 "Enter or upload text below, and it will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate). "
             )
             with gr.Row():
-                model_name = gr.Dropdown(
-                    choices=MODEL_OPTIONS,
-                    value=MODEL_OPTIONS[0],
-                    label="Model Name",
                 )
                 num_beams = gr.Radio(
                     choices=[2, 3, 4],
                     label="Beam Search: # of Beams",
                     value=2,
                 )
-            gr.Markdown(
-                "Load a a .txt - example or your own (_You may find [this OCR space](https://huggingface.co/spaces/pszemraj/pdf-ocr) useful_)"
-            )
             with gr.Row():
                 example_name = gr.Dropdown(
                     _examples,
@@ -267,8 +213,7 @@ if __name__ == "__main__":
             with gr.Row():
                 input_text = gr.Textbox(
                     lines=4,
-                    max_lines=12,
-                    label="Text to Summarize",
                     placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
                 )
                 with gr.Column():
@@ -305,11 +250,11 @@ if __name__ == "__main__":
         with gr.Column():
             gr.Markdown("### Advanced Settings")
             with gr.Row():
-                length_penalty = gr.Slider(
                     minimum=0.5,
                     maximum=1.0,
                     label="length penalty",
-                    value=0.7,
                     step=0.05,
                 )
                 token_batch_length = gr.Radio(
@@ -319,11 +264,11 @@ if __name__ == "__main__":
                 )
             with gr.Row():
-                repetition_penalty = gr.Slider(
                     minimum=1.0,
                     maximum=5.0,
                     label="repetition penalty",
-                    value=3.5,
                     step=0.1,
                 )
                 no_repeat_ngram_size = gr.Radio(
@@ -337,10 +282,10 @@ if __name__ == "__main__":
                 "- [This model](https://huggingface.co/pszemraj/led-large-book-summary) is a fine-tuned checkpoint of [allenai/led-large-16384](https://huggingface.co/allenai/led-large-16384) on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
             )
             gr.Markdown(
-                "- The model can be used with tag [pszemraj/led-large-book-summary](https://huggingface.co/pszemraj/led-large-book-summary). See the model card for details on usage & a Colab notebook for a tutorial."
             )
             gr.Markdown(
-                "- **Update May 1, 2023:** Enabled faster inference times via `use_cache=True`, the number of words the model will processed has been increased! Not on this demo, but there is a [test model](https://huggingface.co/pszemraj/led-large-book-summary-continued) available: an extension of `led-large-book-summary`."
             )
             gr.Markdown("---")
@@ -356,7 +301,7 @@ if __name__ == "__main__":
             fn=proc_submission,
             inputs=[
                 input_text,
-                model_name,
                 num_beams,
                 token_batch_length,
                 length_penalty,
@@ -366,6 +311,4 @@ if __name__ == "__main__":
             outputs=[output_text, summary_text, summary_scores],
         )
-    demo.launch(
-        enable_queue=True,
-    )

 import logging
 import random
 import re
 import gradio as gr
 import nltk
 from cleantext import clean
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
 _here = Path(__file__).parent
+nltk.download("stopwords")  # TODO: find where this requirement originates from
 logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )
 def proc_submission(
     input_text: str,
+    model_size: str,
+    num_beams,
+    token_batch_length,
+    length_penalty,
+    repetition_penalty,
+    no_repeat_ngram_size,
+    max_input_length: int = 1024,
 ):
     """
     proc_submission - a helper function for the gradio module to process submissions
         length_penalty (float): the length penalty to use
         repetition_penalty (float): the repetition penalty to use
         no_repeat_ngram_size (int): the no-repeat ngram size to use
+        max_input_length (int, optional): the maximum input length to use. Defaults to 1024.
     Returns:
         str in HTML format, string of the summary, str of score
     """
     settings = {
         "length_penalty": float(length_penalty),
         "repetition_penalty": float(repetition_penalty),
         "early_stopping": True,
         "do_sample": False,
     }
     st = time.perf_counter()
     history = {}
     clean_text = clean(input_text, lower=False)
+    max_input_length = 2048 if model_size == "base" else max_input_length
     processed = truncate_word_count(clean_text, max_input_length)
     if processed["was_truncated"]:
+        tr_in = processed["truncated_text"]
         # create elaborate HTML warning
         input_wc = re.split(r"\s+", input_text)
         msg = f"""
         logging.warning(msg)
         history["WARNING"] = msg
     else:
+        tr_in = input_text
         msg = None
     if len(input_text) < 50:
         return msg, "", []
+    _summaries = summarize_via_tokenbatches(
+        tr_in,
+        model_sm if "base" in model_size.lower() else model,
+        tokenizer_sm if "base" in model_size.lower() else tokenizer,
+        batch_length=token_batch_length,
         **settings,
     )
+    sum_text = [f"Section {i}: " + s["summary"][0] for i, s in enumerate(_summaries)]
     sum_scores = [
+        f" - Section {i}: {round(s['summary_score'],4)}"
+        for i, s in enumerate(_summaries)
     ]
     sum_text_out = "\n".join(sum_text)
     history["Summary Scores"] = "<br><br>"
     scores_out = "\n".join(sum_scores)
     rt = round((time.perf_counter() - st) / 60, 2)
+    print(f"Runtime: {rt} minutes")
     html = ""
     html += f"<p>Runtime: {rt} minutes on CPU</p>"
     if msg is not None:
 if __name__ == "__main__":
+    model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
+    model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/led-base-book-summary")
     name_to_path = load_example_filenames(_here / "examples")
     logging.info(f"Loaded {len(name_to_path)} examples")
+    demo = gr.Blocks()
     _examples = list(name_to_path.keys())
     with demo:
         gr.Markdown("# Long-Form Summarization: LED & BookSum")
         gr.Markdown(
             "LED models ([model card](https://huggingface.co/pszemraj/led-large-book-summary)) fine-tuned to summarize long-form text. A [space with other models can be found here](https://huggingface.co/spaces/pszemraj/document-summarization)"
         )
         with gr.Column():
             gr.Markdown("## Load Inputs & Select Parameters")
             gr.Markdown(
                 "Enter or upload text below, and it will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate). "
             )
             with gr.Row():
+                model_size = gr.Radio(
+                    choices=["base", "large"], label="Model Variant", value="large"
                 )
                 num_beams = gr.Radio(
                     choices=[2, 3, 4],
                     label="Beam Search: # of Beams",
                     value=2,
                 )
+            gr.Markdown("Load a a .txt - example or your own (_You may find [this OCR space](https://huggingface.co/spaces/pszemraj/pdf-ocr) useful_)")
             with gr.Row():
                 example_name = gr.Dropdown(
                     _examples,
             with gr.Row():
                 input_text = gr.Textbox(
                     lines=4,
+                    label="Input Text (for summarization)",
                     placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
                 )
                 with gr.Column():
         with gr.Column():
             gr.Markdown("### Advanced Settings")
             with gr.Row():
+                length_penalty = gr.inputs.Slider(
                     minimum=0.5,
                     maximum=1.0,
                     label="length penalty",
+                    default=0.7,
                     step=0.05,
                 )
                 token_batch_length = gr.Radio(
                 )
             with gr.Row():
+                repetition_penalty = gr.inputs.Slider(
                     minimum=1.0,
                     maximum=5.0,
                     label="repetition penalty",
+                    default=3.5,
                     step=0.1,
                 )
                 no_repeat_ngram_size = gr.Radio(
                 "- [This model](https://huggingface.co/pszemraj/led-large-book-summary) is a fine-tuned checkpoint of [allenai/led-large-16384](https://huggingface.co/allenai/led-large-16384) on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
             )
             gr.Markdown(
+                "- The two most important parameters-empirically-are the `num_beams` and `token_batch_length`.  "
             )
             gr.Markdown(
+                "- The model can be used with tag [pszemraj/led-large-book-summary](https://huggingface.co/pszemraj/led-large-book-summary). See the model card for details on usage & a Colab notebook for a tutorial."
             )
             gr.Markdown("---")
             fn=proc_submission,
             inputs=[
                 input_text,
+                model_size,
                 num_beams,
                 token_batch_length,
                 length_penalty,
             outputs=[output_text, summary_text, summary_scores],
         )
+    demo.launch(enable_queue=True, share=True)

requirements.txt CHANGED Viewed

@@ -1,8 +1,8 @@
-clean-text
 gradio
 natsort
 nltk
 torch
 tqdm
 transformers
-accelerate

+clean-text[gpl]
 gradio
 natsort
 nltk
 torch
 tqdm
 transformers
+accelerate

summarize.py CHANGED Viewed

@@ -1,40 +1,30 @@
 import logging
-import pprint as pp
-from utils import validate_pytorch2
-logging.basicConfig(level=logging.INFO)
 import torch
 from tqdm.auto import tqdm
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-def load_model_and_tokenizer(model_name: str) -> tuple:
     """
-    load_model_and_tokenizer - load a model and tokenizer from a model name/ID on the hub
-    :param str model_name: the model name/ID on the hub
-    :return tuple: a tuple containing the model and tokenizer
     """
-    logger = logging.getLogger(__name__)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
     model = AutoModelForSeq2SeqLM.from_pretrained(
         model_name,
-    ).to(device)
-    model = model.eval()
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    logger.info(f"Loaded model {model_name} to {device}")
-    if validate_pytorch2():
-        try:
-            logger.info("Compiling model with Torch 2.0")
-            model = torch.compile(model)
-        except Exception as e:
-            logger.warning(f"Could not compile model with Torch 2.0: {e}")
-    else:
-        logger.info("Torch 2.0 not detected, skipping compilation")
     return model, tokenizer
@@ -86,7 +76,6 @@ def summarize_via_tokenbatches(
     tokenizer,
     batch_length=2048,
     batch_stride=16,
-    min_batch_length: int = 512,
     **kwargs,
 ):
     """
@@ -94,7 +83,7 @@ def summarize_via_tokenbatches(
     Args:
         input_text (str): the text to summarize
-        model (): the model to use for summarization
         tokenizer (): the tokenizer to use for summarization
         batch_length (int, optional): the length of each batch. Defaults to 2048.
         batch_stride (int, optional): the stride of each batch. Defaults to 16. The stride is the number of tokens that overlap between batches.
@@ -103,16 +92,12 @@ def summarize_via_tokenbatches(
         str: the summary
     """
     # log all input parameters
-    logger = logging.getLogger(__name__)
-    # log all input parameters
-    if batch_length < min_batch_length:
-        logger.warning(
-            f"batch_length must be at least {min_batch_length}. Setting batch_length to {min_batch_length}"
-        )
-        batch_length = min_batch_length
-    logger.info(f"input parameters:\n{pp.pformat(kwargs)}")
-    logger.info(f"batch_length: {batch_length}, batch_stride: {batch_stride}")
     encoded_input = tokenizer(
         input_text,
         padding="max_length",
@@ -127,9 +112,10 @@ def summarize_via_tokenbatches(
     in_id_arr, att_arr = encoded_input.input_ids, encoded_input.attention_mask
     gen_summaries = []
-    pbar = tqdm(total=len(in_id_arr), desc="Summarizing")
     for _id, _mask in zip(in_id_arr, att_arr):
         result, score = summarize_and_score(
             ids=_id,
             mask=_mask,
@@ -144,9 +130,9 @@ def summarize_via_tokenbatches(
             "summary_score": score,
         }
         gen_summaries.append(_sum)
-        logger.info(f"SCore {score} for summary:\n\t{result}")
         pbar.update()
     pbar.close()
-    logger.debug(f"Generated summaries:\n{pp.pformat(gen_summaries)}")
     return gen_summaries

 import logging
 import torch
 from tqdm.auto import tqdm
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+def load_model_and_tokenizer(model_name):
     """
+    load_model_and_tokenizer - a function that loads a model and tokenizer from huggingface
+    Args:
+        model_name (str): the name of the model to load
+    Returns:
+        AutoModelForSeq2SeqLM: the model
+        AutoTokenizer: the tokenizer
     """
     model = AutoModelForSeq2SeqLM.from_pretrained(
         model_name,
+        # low_cpu_mem_usage=True,
+        # use_cache=False,
+    )
     tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = model.to("cuda") if torch.cuda.is_available() else model
+    logging.info(f"Loaded model {model_name}")
     return model, tokenizer
     tokenizer,
     batch_length=2048,
     batch_stride=16,
     **kwargs,
 ):
     """
     Args:
         input_text (str): the text to summarize
+        model (): the model to use for summarization
         tokenizer (): the tokenizer to use for summarization
         batch_length (int, optional): the length of each batch. Defaults to 2048.
         batch_stride (int, optional): the stride of each batch. Defaults to 16. The stride is the number of tokens that overlap between batches.
         str: the summary
     """
     # log all input parameters
+    if batch_length < 512:
+        batch_length = 512
+        print("WARNING: batch_length was set to 512")
+    print(
+        f"input parameters: {kwargs}, batch_length={batch_length}, batch_stride={batch_stride}"
+    )
     encoded_input = tokenizer(
         input_text,
         padding="max_length",
     in_id_arr, att_arr = encoded_input.input_ids, encoded_input.attention_mask
     gen_summaries = []
+    pbar = tqdm(total=len(in_id_arr))
     for _id, _mask in zip(in_id_arr, att_arr):
         result, score = summarize_and_score(
             ids=_id,
             mask=_mask,
             "summary_score": score,
         }
         gen_summaries.append(_sum)
+        print(f"\t{result[0]}\nScore:\t{score}")
         pbar.update()
     pbar.close()
     return gen_summaries

utils.py CHANGED Viewed

@@ -2,26 +2,12 @@
     utils.py - Utility functions for the project.
 """
-import logging
 import re
 from pathlib import Path
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-    level=logging.INFO,
-)
-import torch
 from natsort import natsorted
-def validate_pytorch2(torch_version: str = None):
-    torch_version = torch.__version__ if torch_version is None else torch_version
-    pattern = r"^2\.\d+(\.\d+)*"
-    return True if re.match(pattern, torch_version) else False
 def truncate_word_count(text, max_words=512):
     """
     truncate_word_count - a helper function for the gradio module

     utils.py - Utility functions for the project.
 """
 import re
 from pathlib import Path
 from natsort import natsorted
 def truncate_word_count(text, max_words=512):
     """
     truncate_word_count - a helper function for the gradio module