Files changed (5)
  1. README.md +0 -1
  2. app.py +41 -98
  3. requirements.txt +2 -2
  4. summarize.py +25 -39
  5. utils.py +0 -14
README.md CHANGED
@@ -4,7 +4,6 @@ emoji: 📚
 colorFrom: red
 colorTo: purple
 sdk: gradio
-sdk_version: 3.32.0
 app_file: app.py
 pinned: true
 license: apache-2.0
app.py CHANGED
@@ -1,7 +1,3 @@
-"""
-app.py - the main application file for the gradio app
-"""
-import gc
 import logging
 import random
 import re
@@ -10,7 +6,6 @@ from pathlib import Path
 
 import gradio as gr
 import nltk
-import torch
 from cleantext import clean
 
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
@@ -18,61 +13,22 @@ from utils import load_example_filenames, truncate_word_count
 
 _here = Path(__file__).parent
 
-nltk.download("stopwords", quiet=True)
+nltk.download("stopwords")  # TODO: find where this requirement originates from
 
 logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - [%(levelname)s] %(name)s: %(message)s"
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )
 
-MODEL_OPTIONS = [
-    "pszemraj/led-large-book-summary",
-    "pszemraj/led-base-book-summary",
-]
-
-
-def predict(
-    input_text: str,
-    model_name: str,
-    token_batch_length: int = 2048,
-    empty_cache: bool = True,
-    **settings,
-) -> list:
-    """
-    predict - helper fn to support multiple models for summarization at once
-    :param str input_text: the input text to summarize
-    :param str model_name: model name to use
-    :param int token_batch_length: the length of the token batches to use
-    :param bool empty_cache: whether to empty the cache before loading a new model
-    :return: list of dicts with keys "summary" and "score"
-    """
-    if torch.cuda.is_available() and empty_cache:
-        torch.cuda.empty_cache()
-
-    model, tokenizer = load_model_and_tokenizer(model_name)
-    summaries = summarize_via_tokenbatches(
-        input_text,
-        model,
-        tokenizer,
-        batch_length=token_batch_length,
-        **settings,
-    )
-
-    del model
-    del tokenizer
-    gc.collect()
-
-    return summaries
-
 
 def proc_submission(
     input_text: str,
-    model_name: str,
-    num_beams: int,
-    token_batch_length: int,
-    length_penalty: float,
-    repetition_penalty: float,
-    no_repeat_ngram_size: int,
-    max_input_length: int = 2560,
+    model_size: str,
+    num_beams,
+    token_batch_length,
+    length_penalty,
+    repetition_penalty,
+    no_repeat_ngram_size,
+    max_input_length: int = 1024,
 ):
     """
     proc_submission - a helper function for the gradio module to process submissions
@@ -85,14 +41,12 @@ def proc_submission(
         length_penalty (float): the length penalty to use
         repetition_penalty (float): the repetition penalty to use
         no_repeat_ngram_size (int): the no-repeat ngram size to use
-        max_input_length (int, optional): the maximum input length to use. Defaults to 2560.
+        max_input_length (int, optional): the maximum input length to use. Defaults to 1024.
 
     Returns:
         str in HTML format, string of the summary, str of score
     """
 
-    logger = logging.getLogger(__name__)
-    logger.info("Processing submission")
     settings = {
         "length_penalty": float(length_penalty),
         "repetition_penalty": float(repetition_penalty),
@@ -104,19 +58,14 @@
         "early_stopping": True,
        "do_sample": False,
    }
-
-    if "base" in model_name:
-        logger.info("Updating max_input_length to for base model")
-        max_input_length = 4096
-
-    logger.info(f"max_input_length: {max_input_length}")
     st = time.perf_counter()
     history = {}
     clean_text = clean(input_text, lower=False)
+    max_input_length = 2048 if model_size == "base" else max_input_length
     processed = truncate_word_count(clean_text, max_input_length)
 
     if processed["was_truncated"]:
-        truncated_input = processed["truncated_text"]
+        tr_in = processed["truncated_text"]
         # create elaborate HTML warning
         input_wc = re.split(r"\s+", input_text)
         msg = f"""
@@ -128,7 +77,7 @@
         logging.warning(msg)
         history["WARNING"] = msg
     else:
-        truncated_input = input_text
+        tr_in = input_text
         msg = None
 
     if len(input_text) < 50:
@@ -146,25 +95,24 @@
 
         return msg, "", []
 
-    _summaries = predict(
-        input_text=truncated_input,
-        model_name=model_name,
-        token_batch_length=token_batch_length,
+    _summaries = summarize_via_tokenbatches(
+        tr_in,
+        model_sm if "base" in model_size.lower() else model,
+        tokenizer_sm if "base" in model_size.lower() else tokenizer,
+        batch_length=token_batch_length,
         **settings,
     )
-    sum_text = [
-        f"\nBatch {i}:\n\t" + s["summary"][0] for i, s in enumerate(_summaries, start=1)
-    ]
+    sum_text = [f"Section {i}: " + s["summary"][0] for i, s in enumerate(_summaries)]
     sum_scores = [
-        f"\n- Batch {i}:\n\t{round(s['summary_score'],4)}"
-        for i, s in enumerate(_summaries, start=1)
+        f" - Section {i}: {round(s['summary_score'],4)}"
+        for i, s in enumerate(_summaries)
     ]
 
     sum_text_out = "\n".join(sum_text)
     history["Summary Scores"] = "<br><br>"
     scores_out = "\n".join(sum_scores)
     rt = round((time.perf_counter() - st) / 60, 2)
-    logger.info(f"Runtime: {rt} minutes")
+    print(f"Runtime: {rt} minutes")
     html = ""
     html += f"<p>Runtime: {rt} minutes on CPU</p>"
     if msg is not None:
@@ -221,38 +169,36 @@ def load_uploaded_file(file_obj):
 
 
 if __name__ == "__main__":
-    logger = logging.getLogger(__name__)
-    logger.info("Starting up app")
+
+    model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
+    model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/led-base-book-summary")
+
     name_to_path = load_example_filenames(_here / "examples")
     logging.info(f"Loaded {len(name_to_path)} examples")
-    demo = gr.Blocks(
-        title="Summarize Long-Form Text",
-    )
+    demo = gr.Blocks()
     _examples = list(name_to_path.keys())
     with demo:
+
         gr.Markdown("# Long-Form Summarization: LED & BookSum")
         gr.Markdown(
             "LED models ([model card](https://huggingface.co/pszemraj/led-large-book-summary)) fine-tuned to summarize long-form text. A [space with other models can be found here](https://huggingface.co/spaces/pszemraj/document-summarization)"
         )
         with gr.Column():
+
             gr.Markdown("## Load Inputs & Select Parameters")
             gr.Markdown(
                 "Enter or upload text below, and it will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate). "
             )
            with gr.Row():
-                model_name = gr.Dropdown(
-                    choices=MODEL_OPTIONS,
-                    value=MODEL_OPTIONS[0],
-                    label="Model Name",
+                model_size = gr.Radio(
+                    choices=["base", "large"], label="Model Variant", value="large"
                 )
                num_beams = gr.Radio(
                    choices=[2, 3, 4],
                    label="Beam Search: # of Beams",
                    value=2,
                )
-            gr.Markdown(
-                "Load a a .txt - example or your own (_You may find [this OCR space](https://huggingface.co/spaces/pszemraj/pdf-ocr) useful_)"
-            )
+            gr.Markdown("Load a a .txt - example or your own (_You may find [this OCR space](https://huggingface.co/spaces/pszemraj/pdf-ocr) useful_)")
            with gr.Row():
                example_name = gr.Dropdown(
                    _examples,
@@ -267,8 +213,7 @@ if __name__ == "__main__":
            with gr.Row():
                input_text = gr.Textbox(
                    lines=4,
-                    max_lines=12,
-                    label="Text to Summarize",
+                    label="Input Text (for summarization)",
                    placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
                )
        with gr.Column():
@@ -305,11 +250,11 @@
        with gr.Column():
            gr.Markdown("### Advanced Settings")
            with gr.Row():
-                length_penalty = gr.Slider(
+                length_penalty = gr.inputs.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    label="length penalty",
-                    value=0.7,
+                    default=0.7,
                    step=0.05,
                )
                token_batch_length = gr.Radio(
@@ -319,11 +264,11 @@
                )
 
            with gr.Row():
-                repetition_penalty = gr.Slider(
+                repetition_penalty = gr.inputs.Slider(
                    minimum=1.0,
                    maximum=5.0,
                    label="repetition penalty",
-                    value=3.5,
+                    default=3.5,
                    step=0.1,
                )
                no_repeat_ngram_size = gr.Radio(
@@ -337,10 +282,10 @@
                "- [This model](https://huggingface.co/pszemraj/led-large-book-summary) is a fine-tuned checkpoint of [allenai/led-large-16384](https://huggingface.co/allenai/led-large-16384) on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
            )
            gr.Markdown(
-                "- The model can be used with tag [pszemraj/led-large-book-summary](https://huggingface.co/pszemraj/led-large-book-summary). See the model card for details on usage & a Colab notebook for a tutorial."
+                "- The two most important parameters-empirically-are the `num_beams` and `token_batch_length`. "
            )
            gr.Markdown(
-                "- **Update May 1, 2023:** Enabled faster inference times via `use_cache=True`, the number of words the model will processed has been increased! Not on this demo, but there is a [test model](https://huggingface.co/pszemraj/led-large-book-summary-continued) available: an extension of `led-large-book-summary`."
+                "- The model can be used with tag [pszemraj/led-large-book-summary](https://huggingface.co/pszemraj/led-large-book-summary). See the model card for details on usage & a Colab notebook for a tutorial."
            )
            gr.Markdown("---")
 
@@ -356,7 +301,7 @@
        fn=proc_submission,
        inputs=[
            input_text,
-            model_name,
+            model_size,
            num_beams,
            token_batch_length,
            length_penalty,
@@ -366,6 +311,4 @@
        outputs=[output_text, summary_text, summary_scores],
    )
 
-    demo.launch(
-        enable_queue=True,
-    )
+    demo.launch(enable_queue=True, share=True)
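The main structural change in app.py: the removed `predict()` wrapper loaded a checkpoint on every request and freed it afterwards with `gc.collect()` / `torch.cuda.empty_cache()`, while the incoming side loads both checkpoints once at startup and dispatches per submission. A minimal sketch of that pattern, reusing the names from the diff (`run_summarization` is a hypothetical helper for illustration, not part of the commit):

```python
from summarize import load_model_and_tokenizer, summarize_via_tokenbatches

# Load both checkpoints once at startup, as the new __main__ block does.
model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/led-base-book-summary")


def run_summarization(text: str, model_size: str = "large", **settings) -> list:
    """Dispatch to the preloaded base or large model, mirroring proc_submission."""
    use_base = "base" in model_size.lower()
    return summarize_via_tokenbatches(
        text,
        model_sm if use_base else model,
        tokenizer_sm if use_base else tokenizer,
        **settings,
    )
```

The trade-off: both models stay resident in memory for the life of the Space, in exchange for zero model-load latency per request, which also makes the removed cache-emptying bookkeeping unnecessary.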
requirements.txt CHANGED
@@ -1,8 +1,8 @@
-clean-text
+clean-text[gpl]
 gradio
 natsort
 nltk
 torch
 tqdm
 transformers
-accelerate
+accelerate
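The substantive change here is the switch to `clean-text[gpl]`, which feeds app.py's `clean(input_text, lower=False)` call. To my understanding, the `[gpl]` extra pulls in the GPL-licensed `unidecode` package (hence the extra's name), which clean-text uses for more thorough transliteration of non-ASCII input; without it the package falls back to a weaker built-in mapping. A quick check, assuming nothing beyond the package itself:

```python
from cleantext import clean

# With clean-text[gpl] installed, unidecode transliterates accented and
# typographic characters; with the plain package a fallback mapping is used.
print(clean("Zürich, «déjà vu»", lower=False))
```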
summarize.py CHANGED
@@ -1,40 +1,30 @@
 import logging
-import pprint as pp
 
-from utils import validate_pytorch2
-
-logging.basicConfig(level=logging.INFO)
 import torch
 from tqdm.auto import tqdm
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 
 
-def load_model_and_tokenizer(model_name: str) -> tuple:
+def load_model_and_tokenizer(model_name):
     """
-    load_model_and_tokenizer - load a model and tokenizer from a model name/ID on the hub
-    :param str model_name: the model name/ID on the hub
-    :return tuple: a tuple containing the model and tokenizer
+    load_model_and_tokenizer - a function that loads a model and tokenizer from huggingface
+
+    Args:
+        model_name (str): the name of the model to load
+    Returns:
+        AutoModelForSeq2SeqLM: the model
+        AutoTokenizer: the tokenizer
     """
-    logger = logging.getLogger(__name__)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+
     model = AutoModelForSeq2SeqLM.from_pretrained(
         model_name,
-    ).to(device)
-    model = model.eval()
-
+        # low_cpu_mem_usage=True,
+        # use_cache=False,
+    )
     tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = model.to("cuda") if torch.cuda.is_available() else model
 
-    logger.info(f"Loaded model {model_name} to {device}")
-
-    if validate_pytorch2():
-        try:
-            logger.info("Compiling model with Torch 2.0")
-            model = torch.compile(model)
-        except Exception as e:
-            logger.warning(f"Could not compile model with Torch 2.0: {e}")
-    else:
-        logger.info("Torch 2.0 not detected, skipping compilation")
-
+    logging.info(f"Loaded model {model_name}")
     return model, tokenizer
 
 
@@ -86,7 +76,6 @@ def summarize_via_tokenbatches(
    tokenizer,
    batch_length=2048,
    batch_stride=16,
-    min_batch_length: int = 512,
    **kwargs,
 ):
    """
@@ -94,7 +83,7 @@
 
    Args:
        input_text (str): the text to summarize
-        model (): the model to use for summarization
+        model (): the model to use for summarizationz
        tokenizer (): the tokenizer to use for summarization
        batch_length (int, optional): the length of each batch. Defaults to 2048.
        batch_stride (int, optional): the stride of each batch. Defaults to 16. The stride is the number of tokens that overlap between batches.
@@ -103,16 +92,12 @@
        str: the summary
    """
    # log all input parameters
-    logger = logging.getLogger(__name__)
-    # log all input parameters
-    if batch_length < min_batch_length:
-        logger.warning(
-            f"batch_length must be at least {min_batch_length}. Setting batch_length to {min_batch_length}"
-        )
-        batch_length = min_batch_length
-
-    logger.info(f"input parameters:\n{pp.pformat(kwargs)}")
-    logger.info(f"batch_length: {batch_length}, batch_stride: {batch_stride}")
+    if batch_length < 512:
+        batch_length = 512
+        print("WARNING: batch_length was set to 512")
+    print(
+        f"input parameters: {kwargs}, batch_length={batch_length}, batch_stride={batch_stride}"
+    )
    encoded_input = tokenizer(
        input_text,
        padding="max_length",
@@ -127,9 +112,10 @@
    in_id_arr, att_arr = encoded_input.input_ids, encoded_input.attention_mask
    gen_summaries = []
 
-    pbar = tqdm(total=len(in_id_arr), desc="Summarizing")
+    pbar = tqdm(total=len(in_id_arr))
 
    for _id, _mask in zip(in_id_arr, att_arr):
+
        result, score = summarize_and_score(
            ids=_id,
            mask=_mask,
@@ -144,9 +130,9 @@
            "summary_score": score,
        }
        gen_summaries.append(_sum)
-        logger.info(f"SCore {score} for summary:\n\t{result}")
+        print(f"\t{result[0]}\nScore:\t{score}")
        pbar.update()
 
    pbar.close()
-    logger.debug(f"Generated summaries:\n{pp.pformat(gen_summaries)}")
+
    return gen_summaries
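A note on `batch_length` and `batch_stride`: `summarize_via_tokenbatches` splits the input into overlapping token windows and summarizes each one in the `for _id, _mask in zip(in_id_arr, att_arr)` loop. The full tokenizer call is elided in the hunk, but Hugging Face fast tokenizers can produce this windowing directly; a sketch under that assumption (the exact arguments in summarize.py may differ):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("pszemraj/led-base-book-summary")

long_text = "some very long document ... " * 2000

# Overlapping windows: each row is batch_length tokens, and consecutive
# rows share batch_stride tokens of context at the boundary.
encoded = tokenizer(
    long_text,
    max_length=2048,                # batch_length
    stride=16,                      # batch_stride
    truncation=True,
    padding="max_length",
    return_overflowing_tokens=True,
    return_tensors="pt",
)
print(encoded.input_ids.shape)      # (num_windows, 2048)
```

This also gives some context for the clamp at the top of the function: windows shorter than 512 tokens are rejected, presumably because they give the model too little context per batch to summarize coherently.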
utils.py CHANGED
@@ -2,26 +2,12 @@
 utils.py - Utility functions for the project.
 """
 
-import logging
 import re
 from pathlib import Path
 
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-    level=logging.INFO,
-)
-import torch
 from natsort import natsorted
 
 
-def validate_pytorch2(torch_version: str = None):
-    torch_version = torch.__version__ if torch_version is None else torch_version
-
-    pattern = r"^2\.\d+(\.\d+)*"
-
-    return True if re.match(pattern, torch_version) else False
-
-
 def truncate_word_count(text, max_words=512):
     """
     truncate_word_count - a helper function for the gradio module
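The body of `truncate_word_count` falls outside the hunk, but its contract is pinned down by the call sites in app.py (`processed["was_truncated"]` and `processed["truncated_text"]`). A minimal sketch consistent with that contract; the actual implementation may differ:

```python
import re


def truncate_word_count(text: str, max_words: int = 512) -> dict:
    """Truncate text to max_words whitespace-delimited words.

    Returns the keys proc_submission in app.py relies on:
    "was_truncated" and "truncated_text".
    """
    words = re.split(r"\s+", text)
    return {
        "was_truncated": len(words) > max_words,
        "truncated_text": " ".join(words[:max_words]),
    }
```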