import gradio as gr
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    PegasusForConditionalGeneration,
    PegasusTokenizer,
)
import evaluate
import nltk

# Ensure the NLTK sentence tokenizer is available before any sent_tokenize call.
nltk.download('punkt')

# Load the T5 model and tokenizer (small checkpoint keeps memory modest).
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Load the PEGASUS model and tokenizer (XSum checkpoint, tuned for abstractive summaries).
pegasus_model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
pegasus_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

# NOTE(review): the ROUGE metric is loaded but never used below — keep it if
# summary evaluation is planned, otherwise this load can be removed.
rouge = evaluate.load("rouge")


def generate_t5_summary(text):
    """Generate an abstractive summary of *text* with T5-small.

    The input is prefixed with the "summarize:" task tag (required by T5)
    and truncated to 512 tokens. Deterministic beam search is used.

    Args:
        text: Plain-text document to summarize.

    Returns:
        The decoded summary string.
    """
    inputs = t5_tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    summary_ids = t5_model.generate(
        inputs,
        max_length=150,          # concise yet informative summaries
        min_length=80,           # ensure the core content is covered
        num_beams=25,            # wide beam search for more diverse candidates
        length_penalty=1.0,      # neutral, balances summary length
        no_repeat_ngram_size=2,  # allow broader bigram coverage without loops
        do_sample=False,         # deterministic decoding
        early_stopping=True,
    )
    return t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def generate_pegasus_summary(text):
    """Generate an abstractive summary of *text* with PEGASUS-XSum.

    The input is truncated to 512 tokens; deterministic beam search is used.

    Args:
        text: Plain-text document to summarize.

    Returns:
        The decoded summary string.
    """
    inputs = pegasus_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="longest",
        max_length=512,
    )
    summary_ids = pegasus_model.generate(
        inputs["input_ids"],
        # Bug fix: the attention mask was previously dropped, letting the
        # model attend to padding tokens.
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=80,
        num_beams=20,
        length_penalty=1.2,
        no_repeat_ngram_size=2,
        do_sample=False,
        early_stopping=True,
    )
    return pegasus_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def generate_weighted_combined_summary(text, weight_t5=0.4, weight_pegasus=0.6):
    """Combine T5 and PEGASUS summaries into a single weighted summary.

    A weighted prefix of sentences is taken from each model's summary and
    concatenated (T5 first, then PEGASUS).

    Args:
        text: Plain-text document to summarize.
        weight_t5: Fraction of the T5 summary's sentences to keep.
        weight_pegasus: Fraction of the PEGASUS summary's sentences to keep.

    Returns:
        The combined summary as a single string.
    """
    t5_sentences = nltk.sent_tokenize(generate_t5_summary(text))
    pegasus_sentences = nltk.sent_tokenize(generate_pegasus_summary(text))

    # Bug fix: int(len * weight) truncated short summaries (1-2 sentences,
    # typical for PEGASUS-XSum) to zero sentences, which could yield an
    # empty combined summary. Keep at least one sentence from each
    # non-empty summary.
    combined_sentences = []
    if t5_sentences:
        keep = max(1, int(len(t5_sentences) * weight_t5))
        combined_sentences.extend(t5_sentences[:keep])
    if pegasus_sentences:
        keep = max(1, int(len(pegasus_sentences) * weight_pegasus))
        combined_sentences.extend(pegasus_sentences[:keep])

    return " ".join(combined_sentences)


# Define the Gradio interface around the combined summarizer.
iface = gr.Interface(
    fn=generate_weighted_combined_summary,
    inputs="textbox",
    outputs="textbox",
    title="Text Summarizer with T5 and PEGASUS",
    description="Enter a text to generate its summary using a combined T5 and PEGASUS model.",
)

# Launch the interface (kept at module level to preserve the original
# run-on-import behavior of this script).
iface.launch()