eungyu kim
committed on
Commit
·
bfde142
1
Parent(s):
8bf942b
feat: Add T5 Text Summarizer code and README documentation.
Browse files- README.md +22 -0
- model.py +59 -0
- requirements.txt +26 -0
README.md
CHANGED
|
@@ -1,3 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
license: apache-2.0
|
| 3 |
---
|
|
|
|
| 1 |
+
# T5 Text Summarizer
|
| 2 |
+
|
| 3 |
+
This repository contains a simple text summarization script using a pre-trained T5 model from the Hugging Face Transformers library. The script demonstrates how to use prompt-based summarization to generate a concise summary of an input text.
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
The main script (`model.py`) defines a function `summarize_text` that:
|
| 8 |
+
- Loads the T5 tokenizer and T5 model.
|
| 9 |
+
- Adds a summarization prompt (`"summarize: "`) to the input text.
|
| 10 |
+
- Tokenizes the input text and truncates it to a maximum length.
|
| 11 |
+
- Generates a summary using beam search.
|
| 12 |
+
- Decodes the generated token sequence back into human-readable text while skipping special tokens.
|
| 13 |
+
|
| 14 |
+
## Code Explanation
|
| 15 |
+
|
| 16 |
+
### Tokenization and Decoding
|
| 17 |
+
|
| 18 |
+
- **Tokenization:**
|
| 19 |
+
The input text is first prefixed with the summarization prompt and then tokenized using:
|
| 20 |
+
```python
|
| 21 |
+
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
|
| 22 |
+
|
| 23 |
---
|
| 24 |
license: apache-2.0
|
| 25 |
---
|
model.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
| 2 |
+
|
| 3 |
+
# Cache of loaded (tokenizer, model) pairs keyed by model name, so repeated
# calls to summarize_text() do not reload the checkpoint from disk each time.
_MODEL_CACHE = {}


def _load_summarizer(model_name: str):
    """Return a cached (tokenizer, model) pair for *model_name*, loading on first use."""
    if model_name not in _MODEL_CACHE:
        # legacy=False opts into the new T5 tokenizer behavior.
        tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
        model = T5ForConditionalGeneration.from_pretrained(model_name)
        _MODEL_CACHE[model_name] = (tokenizer, model)
    return _MODEL_CACHE[model_name]


def summarize_text(text: str,
                   model_name: str = "t5-base",
                   max_length: int = 150,
                   min_length: int = 40,
                   num_beams: int = 4) -> str:
    """
    Summarizes the given text using a T5 model.

    Parameters:
    - text: The long input text to be summarized. Must be non-empty.
    - model_name: The pre-trained T5 model to use (e.g., "t5-base", "t5-small", etc.)
    - max_length: The maximum length (in tokens) of the generated summary.
    - min_length: The minimum length (in tokens) of the generated summary.
    - num_beams: The number of beams for beam search (affects summary quality).

    Returns:
    - The summarized text (str)

    Raises:
    - ValueError: If *text* is empty or contains only whitespace.
    """
    cleaned = text.strip()
    if not cleaned:
        # An empty prompt would still make the model emit a spurious "summary".
        raise ValueError("text must be a non-empty string")

    # Load (or fetch from cache) the tokenizer and model.
    tokenizer, model = _load_summarizer(model_name)

    # T5 is prompt-based: prepend the task prefix it expects for summarization.
    input_text = "summarize: " + cleaned

    # Tokenize the input text, truncating to the 512-token encoder limit.
    input_ids = tokenizer.encode(input_text,
                                 return_tensors="pt",
                                 max_length=512,
                                 truncation=True)

    # Generate summary token ids with beam search; early_stopping ends a beam
    # as soon as it produces the end-of-sequence token.
    summary_ids = model.generate(input_ids,
                                 max_length=max_length,
                                 min_length=min_length,
                                 num_beams=num_beams,
                                 early_stopping=True)

    # Decode the best beam back into text, skipping special tokens like </s>.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
| 44 |
+
|
| 45 |
+
def _demo() -> None:
    """Summarize a sample passage and print the result to stdout."""
    # Example English text to summarize.
    sample = (
        "In recent years, the global economy has faced various challenges. Trade tensions, "
        "inflationary pressures, and rapid technological advancements have contributed to "
        "significant changes in market dynamics. Experts believe that these factors will continue "
        "to influence economic trends, while governments around the world are exploring policies "
        "to stabilize the economy. Meanwhile, the rise of the digital economy and the transition "
        "to green energy are emerging as key drivers of future economic growth."
    )

    # Run the summarizer first, then print, preserving the output ordering.
    result = summarize_text(sample)
    print("Summary:")
    print(result)


if __name__ == "__main__":
    _demo()
|
requirements.txt
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
certifi==2025.1.31
|
| 2 |
+
charset-normalizer==3.4.1
|
| 3 |
+
filelock==3.17.0
|
| 4 |
+
fsspec==2025.2.0
|
| 5 |
+
huggingface-hub==0.28.1
|
| 6 |
+
idna==3.10
|
| 7 |
+
Jinja2==3.1.5
|
| 8 |
+
MarkupSafe==3.0.2
|
| 9 |
+
mpmath==1.3.0
|
| 10 |
+
networkx==3.4.2
|
| 11 |
+
numpy==2.2.2
|
| 12 |
+
packaging==24.2
|
| 13 |
+
protobuf==5.29.3
|
| 14 |
+
PyYAML==6.0.2
|
| 15 |
+
regex==2024.11.6
|
| 16 |
+
requests==2.32.3
|
| 17 |
+
safetensors==0.5.2
|
| 18 |
+
sentencepiece==0.2.0
|
| 19 |
+
setuptools==75.8.0
|
| 20 |
+
sympy==1.13.1
|
| 21 |
+
tokenizers==0.21.0
|
| 22 |
+
torch==2.6.0
|
| 23 |
+
tqdm==4.67.1
|
| 24 |
+
transformers==4.48.2
|
| 25 |
+
typing_extensions==4.12.2
|
| 26 |
+
urllib3==2.3.0
|