Spaces:

beepeen244586
/

final_project

Sleeping

App Files Files Community

final_project / app.py

beepeen244586

Update app.py

ee7550d verified over 1 year ago

raw

history blame contribute delete

3.7 kB

	import gradio as gr
	from transformers import T5ForConditionalGeneration, T5Tokenizer, PegasusForConditionalGeneration, PegasusTokenizer
	import evaluate
	import nltk

	# Ensure that the NLTK sentence tokenizer data is available.
	# `nltk.sent_tokenize` needs the "punkt" package on older NLTK releases and
	# the "punkt_tab" package on newer ones (NLTK >= 3.8.2), so request both to
	# keep sentence splitting working regardless of the installed version.
	nltk.download('punkt')
	nltk.download('punkt_tab')

	# Load the T5 model and tokenizer (used by generate_t5_summary)
	t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
	t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

	# Load the PEGASUS model and tokenizer (used by generate_pegasus_summary)
	pegasus_model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
	pegasus_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

	# Load the ROUGE metric
	# NOTE(review): `rouge` is not referenced by any function visible in this
	# file; kept so the module-level name stays available - confirm whether it
	# is still needed.
	rouge = evaluate.load("rouge")

	# Function to generate a summary using T5
	def generate_t5_summary(text):
	    """Summarize ``text`` with the module-level T5 model and tokenizer.

	    The input is prefixed with ``"summarize: "``, encoded to PyTorch tensors
	    (truncated to at most 512 tokens) and passed to ``t5_model.generate``
	    with fixed, non-sampling beam-search settings.

	    Args:
	        text: The text to summarize.

	    Returns:
	        The first generated sequence decoded to a string, with special
	        tokens removed.
	    """
	    prompt_ids = t5_tokenizer.encode(
	        "summarize: " + text,
	        return_tensors="pt",
	        max_length=512,
	        truncation=True,
	    )

	    # Fixed generation settings: 25 beams, neutral length penalty, summary
	    # length bounded to 80-150, sampling disabled.
	    generation_settings = {
	        "max_length": 150,
	        "min_length": 80,
	        "num_beams": 25,
	        "length_penalty": 1.0,
	        "no_repeat_ngram_size": 2,
	        "do_sample": False,
	        "early_stopping": True,
	    }
	    output_ids = t5_model.generate(prompt_ids, **generation_settings)

	    # Only the first returned sequence is decoded.
	    return t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)

	# Function to generate a summary using PEGASUS
	def generate_pegasus_summary(text):
	    """Summarize ``text`` with the module-level PEGASUS model and tokenizer.

	    The input is tokenized to PyTorch tensors (truncated to at most 512
	    tokens, padded to the longest sequence) and its ``input_ids`` are passed
	    to ``pegasus_model.generate`` with fixed, non-sampling beam-search
	    settings.

	    Args:
	        text: The text to summarize.

	    Returns:
	        The first generated sequence decoded to a string, with special
	        tokens removed.
	    """
	    encoded = pegasus_tokenizer(
	        text,
	        return_tensors="pt",
	        truncation=True,
	        padding="longest",
	        max_length=512,
	    )

	    # Fixed generation settings: 20 beams, length penalty 1.2, summary
	    # length bounded to 80-150, sampling disabled.
	    generation_settings = {
	        "max_length": 150,
	        "min_length": 80,
	        "num_beams": 20,
	        "length_penalty": 1.2,
	        "no_repeat_ngram_size": 2,
	        "do_sample": False,
	        "early_stopping": True,
	    }
	    output_ids = pegasus_model.generate(encoded['input_ids'], **generation_settings)

	    # Only the first returned sequence is decoded.
	    return pegasus_tokenizer.decode(output_ids[0], skip_special_tokens=True)

	# Function to generate a combined summary from the leading sentences of the T5 and PEGASUS summaries
	def generate_weighted_combined_summary(text, weight_t5=0.4, weight_pegasus=0.6):
	    """Combine the leading sentences of the T5 and PEGASUS summaries.

	    ``text`` is summarized independently by ``generate_t5_summary`` and
	    ``generate_pegasus_summary``. Each summary is split into sentences with
	    ``nltk.sent_tokenize`` and a leading share of its sentences is kept; the
	    kept T5 sentences come first, followed by the kept PEGASUS sentences,
	    joined with single spaces. No reordering or n-gram analysis is done.

	    Args:
	        text: The text to summarize.
	        weight_t5: Fraction of the T5 summary's sentences to keep.
	        weight_pegasus: Fraction of the PEGASUS summary's sentences to keep.

	    Returns:
	        The combined summary string. Blank input yields an empty string
	        without running either model.
	    """
	    # Nothing to summarize: skip two expensive beam-search generations.
	    if not text or not text.strip():
	        return ""

	    t5_sentences = nltk.sent_tokenize(generate_t5_summary(text))
	    pegasus_sentences = nltk.sent_tokenize(generate_pegasus_summary(text))

	    combined_sentences = [
	        *_leading_sentences(t5_sentences, weight_t5),
	        *_leading_sentences(pegasus_sentences, weight_pegasus),
	    ]
	    return " ".join(combined_sentences)


	def _leading_sentences(sentences, weight):
	    """Return the leading ``weight`` share of ``sentences``, at least one."""
	    if not sentences or weight <= 0:
	        return []
	    # Plain truncation drops everything for short summaries (for example
	    # int(1 * 0.6) == 0), which used to produce an empty combined summary;
	    # a positive weight therefore always keeps at least one sentence.
	    keep = max(1, int(len(sentences) * weight))
	    return sentences[:keep]

	# Define the Gradio interface
	# Texts shown at the top of the web page.
	_APP_TITLE = "Text Summarizer with T5 and PEGASUS"
	_APP_DESCRIPTION = "Enter a text to generate its summary using a combined T5 and PEGASUS model."

	# One input textbox feeds generate_weighted_combined_summary (its weights
	# stay at their defaults); the result is shown in one output textbox.
	iface = gr.Interface(
	    fn=generate_weighted_combined_summary,
	    inputs="textbox",
	    outputs="textbox",
	    title=_APP_TITLE,
	    description=_APP_DESCRIPTION,
	)

	# Start serving the interface as soon as the script runs.
	iface.launch()