Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import T5ForConditionalGeneration, T5Tokenizer, PegasusForConditionalGeneration, PegasusTokenizer | |
| import evaluate | |
| import nltk | |
# Ensure the NLTK sentence-tokenizer data is available.
# Older NLTK releases read the pickled 'punkt' models, while newer releases
# (3.8.2+/3.9) load the 'punkt_tab' resource instead, so fetch both: otherwise
# nltk.sent_tokenize() raises LookupError on a current NLTK install.
# nltk.download() reports failure via its return value rather than raising, so
# an unavailable resource does not abort start-up.
nltk.download('punkt')
nltk.download('punkt_tab')
# Checkpoint identifiers, named once so the model and its tokenizer always
# come from the same checkpoint.
_T5_CHECKPOINT = 't5-small'
_PEGASUS_CHECKPOINT = 'google/pegasus-xsum'

# T5: model first, then its tokenizer.
t5_model = T5ForConditionalGeneration.from_pretrained(_T5_CHECKPOINT)
t5_tokenizer = T5Tokenizer.from_pretrained(_T5_CHECKPOINT)

# PEGASUS: model first, then its tokenizer.
pegasus_model = PegasusForConditionalGeneration.from_pretrained(_PEGASUS_CHECKPOINT)
pegasus_tokenizer = PegasusTokenizer.from_pretrained(_PEGASUS_CHECKPOINT)

# ROUGE metric from the `evaluate` package.
# NOTE(review): not referenced by the functions visible in this file - confirm
# whether it is still needed before removing.
rouge = evaluate.load("rouge")
def generate_t5_summary(text):
    """Summarize *text* with the module-level T5 model using beam search.

    The input is prefixed with the ``"summarize: "`` task prompt and truncated
    to 512 tokens before generation.

    Args:
        text: The document to summarize.

    Returns:
        The decoded summary with special tokens removed, or ``""`` when *text*
        is ``None``, empty or whitespace-only.
    """
    # Nothing to summarize: skip the expensive beam search, which would
    # otherwise be forced by min_length to emit at least 80 invented tokens.
    if not text or not text.strip():
        return ""

    num_beams = 25  # Wide beam: explores more candidates (deterministic, not sampled)
    length_penalty = 1.0  # Library default; no extra bias toward longer or shorter output
    no_repeat_ngram_size = 2  # Forbid any bigram from appearing twice in the summary
    max_length = 150  # Upper bound on generated tokens
    min_length = 80  # Lower bound on generated tokens
    do_sample = False  # Pure beam search, no sampling

    t5_inputs = t5_tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    t5_summary_ids = t5_model.generate(
        t5_inputs,
        max_length=max_length,
        min_length=min_length,
        num_beams=num_beams,
        length_penalty=length_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        do_sample=do_sample,
        early_stopping=True,  # Stop once num_beams finished candidates exist
    )
    # generate() returns a batch; there is exactly one input, so take row 0.
    t5_summary = t5_tokenizer.decode(t5_summary_ids[0], skip_special_tokens=True)
    return t5_summary
def generate_pegasus_summary(text):
    """Summarize *text* with the module-level PEGASUS model using beam search.

    The input is tokenized and truncated to 512 tokens before generation.

    Args:
        text: The document to summarize.

    Returns:
        The decoded summary with special tokens removed, or ``""`` when *text*
        is ``None``, empty or whitespace-only.
    """
    # Nothing to summarize: skip the expensive beam search, which would
    # otherwise be forced by min_length to emit at least 80 invented tokens.
    if not text or not text.strip():
        return ""

    num_beams = 20  # Beam width (deterministic search, not sampled)
    length_penalty = 1.2  # > 1.0 mildly favours longer candidates
    no_repeat_ngram_size = 2  # Forbid any bigram from appearing twice in the summary
    max_length = 150  # Upper bound on generated tokens
    min_length = 80  # Lower bound on generated tokens
    do_sample = False  # Pure beam search, no sampling

    pegasus_inputs = pegasus_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="longest",
        max_length=512,
    )
    pegasus_summary_ids = pegasus_model.generate(
        pegasus_inputs['input_ids'],
        # Pass the mask the tokenizer built instead of discarding it, so any
        # padding positions are explicitly ignored by the encoder.
        attention_mask=pegasus_inputs['attention_mask'],
        max_length=max_length,
        min_length=min_length,
        num_beams=num_beams,
        length_penalty=length_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        do_sample=do_sample,
        early_stopping=True,  # Stop once num_beams finished candidates exist
    )
    # generate() returns a batch; there is exactly one input, so take row 0.
    pegasus_summary = pegasus_tokenizer.decode(pegasus_summary_ids[0], skip_special_tokens=True)
    return pegasus_summary
def _leading_share(sentences, weight):
    """Return the leading ``weight`` fraction of *sentences*, at least one if possible."""
    if not sentences or weight <= 0:
        return []
    # int() truncates toward zero, so short summaries (e.g. 2 sentences * 0.4)
    # would otherwise contribute nothing at all; keep at least one sentence.
    return sentences[:max(1, int(len(sentences) * weight))]


def generate_weighted_combined_summary(text, weight_t5=0.4, weight_pegasus=0.6):
    """Build one summary from the leading sentences of the T5 and PEGASUS summaries.

    Both models summarize *text* independently. Each summary is split into
    sentences, the leading ``weight`` fraction of each is kept (T5 first, then
    PEGASUS), and the kept sentences are joined with single spaces. Sentences
    are not reordered or de-duplicated.

    Args:
        text: The document to summarize.
        weight_t5: Fraction of the T5 summary's sentences to keep.
        weight_pegasus: Fraction of the PEGASUS summary's sentences to keep.
            For both weights, a positive value always keeps at least one
            sentence of a non-empty summary; zero or a negative value keeps
            none.

    Returns:
        The combined summary, or ``""`` when *text* is ``None``, empty or
        whitespace-only.
    """
    # Avoid running two large beam searches when there is nothing to summarize.
    if not text or not text.strip():
        return ""

    t5_summary = generate_t5_summary(text)
    pegasus_summary = generate_pegasus_summary(text)

    # Split each summary into sentences so a fraction of each can be taken.
    t5_sentences = nltk.sent_tokenize(t5_summary)
    pegasus_sentences = nltk.sent_tokenize(pegasus_summary)

    combined_sentences = [
        *_leading_share(t5_sentences, weight_t5),
        *_leading_share(pegasus_sentences, weight_pegasus),
    ]
    return " ".join(combined_sentences)
# User-facing strings for the web UI.
_APP_TITLE = "Text Summarizer with T5 and PEGASUS"
_APP_DESCRIPTION = "Enter a text to generate its summary using a combined T5 and PEGASUS model."

# Single text box in, single text box out. Only the text is wired to the UI,
# so the summarizer's two weight parameters keep their default values.
iface = gr.Interface(
    fn=generate_weighted_combined_summary,
    inputs="textbox",
    outputs="textbox",
    title=_APP_TITLE,
    description=_APP_DESCRIPTION,
)

# Start serving the interface.
iface.launch()