Spaces:

Fantasticfour
/

Summarizer

Runtime error

App Files Files Community

Summarizer / app.py

NiinaAlavillamo

Update app.py

3362ccd verified about 1 year ago

raw

history blame contribute delete

9.06 kB

	import transformers
	import datasets
	import torch
	import sentencepiece
	import evaluate


	from datasets import load_dataset
	from transformers import MT5ForConditionalGeneration, T5Tokenizer
	import re

	# Load dataset
	# Only the "test" split of the dataset is requested here.
	ds = load_dataset("scillm/scientific_papers-archive", split="test")

	# Select the first 1000 examples
	# `small_ds` is the subset that gets tokenized further down in the script.
	small_ds = ds.select(range(1000))

	# Preprocessing function to remove unwanted references
	def preprocess_text(text):
	    """Strip citation-style placeholders and normalise whitespace.

	    Args:
	        text: Raw article or summary string.

	    Returns:
	        The text with every ``@``-prefixed word (e.g. ``@xcite``) removed,
	        runs of whitespace collapsed to a single space, and leading and
	        trailing whitespace stripped.
	    """
	    # Remove unwanted references like @xcite (anything that starts with @)
	    text = re.sub(r'@\w+', '', text)
	    # Collapse whitespace last so the gaps left by removed references
	    # do not survive as double spaces.
	    text = re.sub(r'\s+', ' ', text).strip()
	    return text

	# Preprocessing function
	def preprocess(examples):
	    """Tokenize a batch of articles and reference summaries for training.

	    Relies on the module-level ``tokenizer`` and on ``preprocess_text``.

	    Args:
	        examples: A batch mapping with an ``"input"`` column (articles) and
	            an ``"output"`` column (reference summaries).

	    Returns:
	        The tokenizer output for the prefixed articles (padded/truncated to
	        1024 tokens) with an added ``"labels"`` entry holding the summary
	        token ids (padded/truncated to 128 tokens).
	    """
	    # Preprocess articles and summaries
	    articles = [preprocess_text(article) for article in examples["input"]]
	    outputs = [preprocess_text(output) for output in examples["output"]]

	    # Add prefix to the articles
	    inputs = ["summarize: " + article for article in articles]

	    # Tokenize articles
	    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

	    # Tokenize summaries
	    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")

	    # Mask label padding with -100 so padded positions are ignored by the
	    # loss; otherwise the model is also trained to emit pad tokens.
	    pad_id = tokenizer.pad_token_id
	    model_inputs["labels"] = [
	        [token_id if token_id != pad_id else -100 for token_id in label_ids]
	        for label_ids in labels["input_ids"]
	    ]

	    return model_inputs

	# Load mT5 model and tokenizer
	# `tokenizer` must be bound before the map() call below, because
	# `preprocess` reads it as a module-level name.
	model_name = "google/mt5-small" # You can also use other mT5 models
	tokenizer = T5Tokenizer.from_pretrained(model_name)
	model = MT5ForConditionalGeneration.from_pretrained(model_name)

	# Tokenize the smaller dataset
	# batched=True means `preprocess` receives columns of many examples at once.
	tokenized_small_ds = small_ds.map(preprocess, batched=True)

	# Verify that the dataset is correctly tokenized
	print(tokenized_small_ds[0])

	# Split the data into train and test set.
	# The split is taken from the 1000-example subset (`small_ds`), not from the
	# full `ds`: splitting `ds` made the map() call below tokenize the entire
	# test split, which contradicts the variable name and is very expensive.
	small_ds = small_ds.train_test_split(test_size=0.2)

	print(small_ds['train'].features)

	print(small_ds.column_names)

	from transformers import T5Tokenizer

	model_name = "google/mt5-small"
	tokenizer = T5Tokenizer.from_pretrained(model_name)

	# Apply preprocessing function to dataset
	tokenized_ds = small_ds.map(preprocess, batched=True)

	from transformers import DataCollatorForSeq2Seq

	# The collator's `model` argument expects the model object, not its name;
	# a string is silently ignored when preparing decoder inputs.
	data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

	import torch
	# Release any cached CUDA memory before training starts.
	torch.cuda.empty_cache()



	#pip install wandb
	import os
	import wandb
	api_key = os.getenv("API_KEY")

	# Authenticate with WandB only when a key is configured. Without a key the
	# login cannot succeed non-interactively, so W&B is switched off instead of
	# letting the script fail later during training.
	if api_key:
	    wandb.login(key=api_key)
	else:
	    os.environ.setdefault("WANDB_MODE", "disabled")
	    print("API_KEY is not set; Weights & Biases logging is disabled.")

	from transformers import MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
	import torch

	# Load the model
	model_name = "google/mt5-small"
	model = MT5ForConditionalGeneration.from_pretrained(model_name)

	# Set the device (training is pinned to the CPU here)
	device = torch.device("cpu")
	model.to(device)

	# Ensure model parameters are contiguous.
	# NOTE(review): presumably needed so the checkpoint can be saved without
	# non-contiguous tensor errors - confirm.
	for name, param in model.named_parameters():
	    if not param.is_contiguous():
	        param.data = param.data.contiguous()  # Make the tensor contiguous
	        print(f"Made {name} contiguous.")

	training_args = Seq2SeqTrainingArguments(
	    output_dir='./results',
	    num_train_epochs=1,
	    per_device_train_batch_size=4,
	    per_device_eval_batch_size=4,
	    evaluation_strategy='epoch',
	    logging_dir='./logs',
	    predict_with_generate=True
	)

	# Shuffle once and slice disjoint ranges. Two independent shuffles (as
	# before) let the same example land in both the training and evaluation
	# subsets, which leaks training data into the evaluation score.
	shuffled_small_ds = tokenized_small_ds.shuffle(seed=42)

	# Create trainer instance
	trainer = Seq2SeqTrainer(
	    model=model,
	    args=training_args,
	    train_dataset=shuffled_small_ds.select(range(80)),  # 80 examples for training
	    eval_dataset=shuffled_small_ds.select(range(80, 100)),  # 20 held-out examples for evaluation
	)

	# train the model
	trainer.train()

	#pip install rouge_score
	import evaluate
	# Module-level ROUGE metric object; `compute_metrics` below reads it as a global.
	rouge = evaluate.load("rouge")

	def compute_metrics(eval_pred):
	    """Compute ROUGE scores for a batch of generated summaries.

	    Relies on the module-level ``tokenizer`` and ``rouge`` objects.

	    Args:
	        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.

	    Returns:
	        A dict with the ``"rouge1"``, ``"rouge2"`` and ``"rougeL"`` scores.
	    """
	    import numpy as np

	    predictions, labels = eval_pred
	    # Some models return extra outputs alongside the generated ids.
	    if isinstance(predictions, tuple):
	        predictions = predictions[0]

	    # -100 is the ignore index and cannot be decoded; map it to the padding
	    # token id in both arrays. np.where builds new arrays, so the caller's
	    # data is left untouched.
	    pad_id = tokenizer.pad_token_id
	    predictions = np.where(np.asarray(predictions) != -100, predictions, pad_id)
	    labels = np.where(np.asarray(labels) != -100, labels, pad_id)

	    # Decode predictions and labels (remove special tokens)
	    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
	    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

	    # Compute ROUGE scores using the `evaluate` library
	    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)

	    return {key: rouge_output[key] for key in ("rouge1", "rouge2", "rougeL")}

	# Update trainer to include custom metrics
	# The metrics function is attached after trainer.train() has run, so it
	# only affects the evaluate() call below.
	trainer.compute_metrics = compute_metrics

	# Evaluate the model
	eval_result = trainer.evaluate()
	print(eval_result)

	# Save the fine-tuned model
	# Model and tokenizer are written to the same directory so both can be
	# reloaded from "fine-tuned-mt5" later in the script.
	trainer.save_model("fine-tuned-mt5")
	tokenizer.save_pretrained("fine-tuned-mt5")

	# Load required libraries
	from transformers import T5Tokenizer, MT5ForConditionalGeneration

	# Load the fine-tuned tokenizer and model
	# "fine-tuned-mt5" is the directory written by the save calls above.
	model_name = "fine-tuned-mt5"
	new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
	new_model = MT5ForConditionalGeneration.from_pretrained(model_name)

	from transformers import pipeline
	import torch


	# Restructured input
	# Hard-coded sample text used to smoke-test the fine-tuned model.
	text = (
	"Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
	"1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
	"2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
	"3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
	"4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
	"5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
	"Please provide a summary."
	)


	# Select the device (GPU or CPU)
	# 0 selects the first CUDA device, -1 selects the CPU.
	device = 0 if torch.cuda.is_available() else -1

	# Load the pipeline
	summarizer = pipeline("summarization", model=new_model, tokenizer=new_tokenizer, device=device)

	# Summarize the text
	# The pipeline returns a list with one dict per input; only the
	# "summary_text" of the first (and only) result is kept.
	summary = summarizer(text,
	max_length=120,
	min_length=30,
	do_sample=False,
	num_beams=10,
	repetition_penalty=5.0,
	no_repeat_ngram_size=2,
	length_penalty=1.0)[0]["summary_text"]

	# Clean the summary by removing leftover sentinel tokens such as <extra_id_0>

	import re

	# Regular expression to match both <extra_id_X> and <id_XX>.
	# The alternation bar must be unescaped: the previous `\|` matched a literal
	# "|" character, so neither kind of token was ever removed.
	pattern = r"<(?:extra_id_\d+|id_\d+)>"

	# Replace all matches with a space
	cleaned_summary = re.sub(pattern, " ", summary).strip()


	print(cleaned_summary)




	import gradio as gr
	from transformers import T5Tokenizer, MT5ForConditionalGeneration
	import fitz # PyMuPDF

	# Load the fine-tuned tokenizer and model
	# NOTE(review): this repeats the load of "fine-tuned-mt5" performed earlier
	# in the file; these are the objects used by the Gradio handler below.
	model_name = "fine-tuned-mt5"
	new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
	new_model = MT5ForConditionalGeneration.from_pretrained(model_name)

	# Function to extract text from PDF using PyMuPDF
	def extract_text_from_pdf(pdf_file):
	    """Return the concatenated text of every page of a PDF.

	    Args:
	        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) or an
	            uploaded-file object exposing the path as ``.name``.
	            NOTE(review): which of the two the Gradio file input passes
	            depends on the installed Gradio version - confirm.

	    Returns:
	        The text of all pages, in page order, as one string.
	    """
	    import os

	    # Paths are used as-is; for wrapper objects the on-disk location is
	    # taken from `.name`. (Path objects also have `.name`, but there it is
	    # only the file name, hence the explicit type check first.)
	    if isinstance(pdf_file, (str, os.PathLike)):
	        pdf_path = pdf_file
	    else:
	        pdf_path = getattr(pdf_file, "name", pdf_file)

	    # Open the PDF file; the context manager closes it again afterwards.
	    with fitz.open(pdf_path) as doc:
	        return "".join(page.get_text() for page in doc)

	# Summarization function
	def summarize_pdf(pdf_file, max_summary_length):
	    """Summarize the text of an uploaded PDF with the fine-tuned mT5 model.

	    Relies on the module-level ``new_tokenizer`` and ``new_model`` and on
	    ``extract_text_from_pdf``.

	    Args:
	        pdf_file: The uploaded PDF, forwarded unchanged to
	            ``extract_text_from_pdf``.
	        max_summary_length: Upper bound on the generated summary length in
	            tokens (the slider value).

	    Returns:
	        The cleaned summary, cut after its last full stop when it contains
	        one. A short message is returned instead when no file was given, no
	        text could be extracted, or the summary is empty; if generation
	        raises, the exception text is returned.
	    """
	    if pdf_file is None:
	        return "Please upload a PDF file."

	    # Extract text from the PDF
	    input_text = extract_text_from_pdf(pdf_file)
	    if not input_text.strip():
	        return "No text could be extracted from the PDF."

	    # The slider may deliver a float, while generate() needs an integer.
	    max_len = int(max_summary_length)

	    # Use the same "summarize: " prefix and 1024-token limit as in training;
	    # without truncation a long PDF would be fed to the model in full.
	    tokenized_input = new_tokenizer.encode(
	        "summarize: " + input_text,
	        return_tensors='pt',
	        max_length=1024,
	        truncation=True,
	    )

	    try:
	        # Generate the summary
	        summary_ids = new_model.generate(
	            tokenized_input,
	            max_length=max_len,
	            min_length=min(30, max_len),  # never ask for more than max_length
	            num_beams=15,
	            repetition_penalty=5.0,
	            no_repeat_ngram_size=2
	        )

	        # Decode the generated summary
	        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

	        # Clean up the summary to remove unwanted tokens
	        cleaned_summary = ' '.join(
	            token for token in summary.split() if not token.startswith('<extra_id_')
	        ).strip()

	        # Ensure the summary ends with a complete sentence: drop anything
	        # after the last full stop, if there is one.
	        last_period_index = cleaned_summary.rfind('.')
	        if last_period_index != -1:
	            cleaned_summary = cleaned_summary[:last_period_index + 1]

	        return cleaned_summary if cleaned_summary else "No valid summary generated."

	    except Exception as e:
	        # UI boundary: show the error text in the output box for debugging.
	        return str(e)

	# Define the Gradio interface
	# The two inputs are passed to summarize_pdf positionally, in this order:
	# the uploaded file, then the slider value.
	interface = gr.Interface(
	fn=summarize_pdf,
	inputs=[
	gr.File(label="Upload PDF"),
	gr.Slider(50, 300, step=10, label="Max summary length")
	],
	outputs="textbox", # A textbox for the output summary
	title="PDF Text Summarizer",
	description="Upload a PDF file to summarize its content."
	)


	# Launch the interface with debug mode enabled
	interface.launch(debug=True)