final_project / app.py
beepeen244586's picture
Update app.py
ee7550d verified
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer, PegasusForConditionalGeneration, PegasusTokenizer
import evaluate
import nltk
# Ensure that the NLTK sentence tokenizer data is available.
# Recent NLTK releases load the "punkt_tab" resource inside sent_tokenize,
# while older releases load "punkt"; fetching both keeps sentence splitting
# working regardless of which NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')
# Load the T5 model and tokenizer
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
# Load the PEGASUS model and tokenizer
pegasus_model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
pegasus_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
# Load the ROUGE metric
# NOTE(review): `rouge` is not referenced by any function visible in this
# file; it is kept so existing importers of this module still find it.
rouge = evaluate.load("rouge")
def generate_t5_summary(text):
    """Summarize *text* with the module-level T5 model using beam search.

    The input is prefixed with ``"summarize: "``, truncated to 512 tokens and
    decoded deterministically (sampling is disabled).

    Args:
        text: Source text to summarize.

    Returns:
        The decoded summary with special tokens removed, or an empty string
        when *text* is ``None``, empty, or only whitespace.
    """
    # Nothing to summarize: with min_length=80 the model would otherwise be
    # forced to produce at least 80 tokens from the bare task prefix.
    if not text or not text.strip():
        return ""

    t5_inputs = t5_tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    t5_summary_ids = t5_model.generate(
        t5_inputs,
        max_length=150,          # upper bound on generated tokens
        min_length=80,           # lower bound on generated tokens
        num_beams=25,            # wide beam search: more candidates, slower decoding
        length_penalty=1.0,      # neutral: neither favors long nor short beams
        no_repeat_ngram_size=2,  # no bigram may occur twice in the output
        do_sample=False,         # pure beam search, no random sampling
        early_stopping=True,
    )
    return t5_tokenizer.decode(t5_summary_ids[0], skip_special_tokens=True)
def generate_pegasus_summary(text):
    """Summarize *text* with the module-level PEGASUS model using beam search.

    The input is truncated to 512 tokens and decoded deterministically
    (sampling is disabled).

    Args:
        text: Source text to summarize.

    Returns:
        The decoded summary with special tokens removed, or an empty string
        when *text* is ``None``, empty, or only whitespace.
    """
    # Nothing to summarize: with min_length=80 the model would otherwise be
    # forced to produce at least 80 tokens from an empty input.
    if not text or not text.strip():
        return ""

    pegasus_inputs = pegasus_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="longest",
        max_length=512,
    )
    # Unpack the whole encoding so the attention mask produced by the
    # tokenizer reaches generate() together with the input ids.
    pegasus_summary_ids = pegasus_model.generate(
        **pegasus_inputs,
        max_length=150,          # upper bound on generated tokens
        min_length=80,           # lower bound on generated tokens
        num_beams=20,            # beam width
        length_penalty=1.2,      # values above 1.0 favor longer beams
        no_repeat_ngram_size=2,  # no bigram may occur twice in the output
        do_sample=False,         # pure beam search, no random sampling
        early_stopping=True,
    )
    return pegasus_tokenizer.decode(pegasus_summary_ids[0], skip_special_tokens=True)
def _take_weighted_prefix(sentences, weight):
    """Return the leading ``weight`` share of *sentences*, at least one if any."""
    if not sentences or weight <= 0:
        return []
    # int() floors, so short summaries (e.g. 1 sentence * 0.6) would yield
    # zero sentences; keep at least one so the model still contributes.
    keep = max(1, int(len(sentences) * weight))
    return sentences[:keep]


def generate_weighted_combined_summary(text, weight_t5=0.4, weight_pegasus=0.6):
    """Build one summary from the leading sentences of the T5 and PEGASUS outputs.

    Each model's summary is split into sentences with NLTK. The first
    ``weight`` fraction of each list (rounded down, but never fewer than one
    sentence when the weight is positive and the summary is non-empty) is
    kept, T5 sentences first, then PEGASUS sentences. No reordering or
    n-gram analysis is performed.

    Args:
        text: Source text to summarize.
        weight_t5: Fraction of the T5 sentences to keep; ``<= 0`` keeps none.
        weight_pegasus: Fraction of the PEGASUS sentences to keep; ``<= 0``
            keeps none.

    Returns:
        The selected sentences joined by single spaces; an empty string when
        neither model contributes a sentence.
    """
    t5_summary = generate_t5_summary(text)
    pegasus_summary = generate_pegasus_summary(text)

    # Tokenize summaries into sentences
    t5_sentences = nltk.sent_tokenize(t5_summary)
    pegasus_sentences = nltk.sent_tokenize(pegasus_summary)

    combined_sentences = [
        *_take_weighted_prefix(t5_sentences, weight_t5),
        *_take_weighted_prefix(pegasus_sentences, weight_pegasus),
    ]
    return " ".join(combined_sentences)
# Build the Gradio UI: a single input textbox is passed to the combined
# summarizer and its return value is shown in a single output textbox.
iface = gr.Interface(
    title="Text Summarizer with T5 and PEGASUS",
    description="Enter a text to generate its summary using a combined T5 and PEGASUS model.",
    fn=generate_weighted_combined_summary,
    inputs="textbox",
    outputs="textbox",
)

# Start serving the interface
iface.launch()