Spaces:
Runtime error
Runtime error
| import transformers | |
| import datasets | |
| import torch | |
| import sentencepiece | |
| import evaluate | |
| from datasets import load_dataset | |
| from transformers import MT5ForConditionalGeneration, T5Tokenizer | |
| import re | |
# Load the "test" split of the scientific-papers dataset from the Hub.
ds = load_dataset("scillm/scientific_papers-archive", split="test")

# Work on at most the first 1000 examples. The size is clamped so that a
# split with fewer rows does not make `select` fail on out-of-range indices.
small_ds = ds.select(range(min(1000, len(ds))))
def preprocess_text(text):
    """Strip ``@``-prefixed placeholders and normalise whitespace.

    Every ``@`` followed by word characters (e.g. ``@xcite``) is removed,
    then all runs of whitespace are collapsed to a single space and the
    result is trimmed at both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    without_refs = re.sub(r'@\w+', '', text)
    # split() with no argument drops leading/trailing whitespace and splits
    # on whitespace runs, so re-joining yields single-spaced, trimmed text.
    return ' '.join(without_refs.split())
def preprocess(examples):
    """Tokenize a batch of article/summary pairs for seq2seq training.

    Reads the ``"input"`` (article) and ``"output"`` (summary) columns of the
    batch, cleans both with ``preprocess_text``, prefixes each article with
    ``"summarize: "`` and tokenizes with the module-level ``tokenizer``.

    Args:
        examples: A batch mapping with ``"input"`` and ``"output"`` lists of
            strings (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the articles, with an added ``"labels"``
        entry holding the summary token ids. Padding positions in the labels
        are set to -100 so they are ignored by the loss.
    """
    articles = [preprocess_text(article) for article in examples["input"]]
    summaries = [preprocess_text(summary) for summary in examples["output"]]

    # Add the task prefix to the articles
    inputs = ["summarize: " + article for article in articles]

    # Articles: truncated/padded to 1024 tokens
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Summaries: truncated/padded to 128 tokens
    labels = tokenizer(summaries, max_length=128, truncation=True, padding="max_length")

    # Labels are padded to a fixed length, so without masking the pad tokens
    # would be scored as real targets. -100 is the value compute_metrics
    # later maps back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs
from transformers import DataCollatorForSeq2Seq
from transformers import T5Tokenizer
import torch

# Load the mT5 model and tokenizer (other mT5 checkpoints can be used too).
# The tokenizer is loaded once here; preprocess() picks it up as a global.
model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the 1000-example subset
tokenized_small_ds = small_ds.map(preprocess, batched=True)

# Verify that the dataset is correctly tokenized
print(tokenized_small_ds[0])

# Split the subset into train and test sets. The split is taken from
# `small_ds`, not from the full `ds`: splitting `ds` would make the map()
# call below tokenize the entire split instead of the selected subset.
small_ds = small_ds.train_test_split(test_size=0.2)
print(small_ds['train'].features)
print(small_ds.column_names)

# Apply the preprocessing function to both splits
tokenized_ds = small_ds.map(preprocess, batched=True)

# The collator's `model` argument takes the model object, not its name.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

torch.cuda.empty_cache()
#pip install wandb
import os
import wandb

# The Weights & Biases key is read from the API_KEY environment variable.
api_key = os.getenv("API_KEY")

if api_key:
    # Authenticate with WandB
    wandb.login(key=api_key)
elif not os.getenv("WANDB_API_KEY"):
    # No credentials at all: calling wandb.login(key=None) would need an
    # interactive prompt, which is not available in a headless run. Switch
    # wandb to its no-op mode instead, unless the user already chose a mode.
    os.environ.setdefault("WANDB_MODE", "disabled")
from transformers import MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

# Fresh copy of the pretrained checkpoint that will be fine-tuned
model_name = "google/mt5-small"
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Place the model on the CPU
device = torch.device("cpu")
model.to(device)

# Replace the storage of every non-contiguous parameter with a contiguous
# copy and report which parameters were rewritten.
# NOTE(review): presumably a workaround for saving the checkpoint - confirm.
for name, param in model.named_parameters():
    if param.is_contiguous():
        continue
    param.data = param.data.contiguous()
    print(f"Made {name} contiguous.")
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
    logging_dir='./logs',
    predict_with_generate=True,
)

# Shuffle ONCE (with a fixed seed) and carve disjoint index ranges out of the
# same permutation. Shuffling separately for each subset lets the same
# example land in both, which leaks evaluation data into training.
shuffled_small_ds = tokenized_small_ds.shuffle(seed=42)
train_subset = shuffled_small_ds.select(range(80))      # 80 training examples
eval_subset = shuffled_small_ds.select(range(80, 160))  # 80 evaluation examples

# Create trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
)

# Train the model
trainer.train()
#pip install rouge_score
import evaluate

rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE-1/2/L between generated and reference summaries.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays as
            handed over by the trainer during evaluation.

    Returns:
        A dict with the keys ``"rouge1"``, ``"rouge2"`` and ``"rougeL"``.
    """
    predictions, labels = eval_pred

    # Some models return a tuple of outputs; the token ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # must be replaced before decoding - in the predictions as well as in the
    # labels. Work on copies so the caller's arrays are left untouched.
    predictions = predictions.copy()
    predictions[predictions == -100] = pad_id
    labels = labels.copy()
    labels[labels == -100] = pad_id

    # Decode predictions and labels (special tokens removed)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores using the `evaluate` library
    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {
        "rouge1": rouge_output["rouge1"],
        "rouge2": rouge_output["rouge2"],
        "rougeL": rouge_output["rougeL"],
    }
# Attach the custom metric function to the already-built trainer
trainer.compute_metrics = compute_metrics

# Run evaluation and show the resulting metrics
eval_result = trainer.evaluate()
print(eval_result)

# Store the fine-tuned model and its tokenizer in the same directory
trainer.save_model(output_dir="fine-tuned-mt5")
tokenizer.save_pretrained(save_directory="fine-tuned-mt5")
# Load required libraries
from transformers import T5Tokenizer, MT5ForConditionalGeneration
from transformers import pipeline
import torch
import re

# Reload the fine-tuned tokenizer and model from the saved directory
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=True,
)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Sample input used to try out the model
text = (
    "Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
    "1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
    "2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
    "3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
    "4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
    "5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
    "Please provide a summary."
)

# Choose the device (GPU or CPU): index 0 when CUDA is available, else -1
device = -1 if not torch.cuda.is_available() else 0

# Build the summarization pipeline around the reloaded model
summarizer = pipeline(
    task="summarization",
    model=new_model,
    tokenizer=new_tokenizer,
    device=device,
)

# Summarize the text with beam search
summarizer_output = summarizer(
    text,
    max_length=120,
    min_length=30,
    do_sample=False,
    num_beams=10,
    repetition_penalty=5.0,
    no_repeat_ngram_size=2,
    length_penalty=1.0,
)
summary = summarizer_output[0]["summary_text"]

# Strip leftover sentinel markers: matches both <extra_id_X> and <id_XX>
pattern = r"<(extra_id_\d+|id_\d+)>"

# Each marker is replaced with a space, then the ends are trimmed
cleaned_summary = re.compile(pattern).sub(" ", summary).strip()
print(cleaned_summary)
import gradio as gr
from transformers import T5Tokenizer, MT5ForConditionalGeneration
import fitz  # PyMuPDF

# Directory holding the fine-tuned checkpoint used by the web app
model_name = "fine-tuned-mt5"

# Tokenizer and model served by the Gradio interface
new_tokenizer = T5Tokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=True,
)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Whatever ``fitz.open`` accepts to locate the PDF.

    Returns:
        A single string with the text of all pages, with no separator
        inserted between pages.
    """
    # The document is closed automatically when the with-block exits.
    with fitz.open(pdf_file) as doc:
        return "".join(page.get_text() for page in doc)
def _trim_to_last_sentence(text):
    """Cut *text* after its last period so it ends on a complete sentence."""
    last_period_index = text.rfind('.')
    if last_period_index == -1:
        return text
    return text[:last_period_index + 1]


def summarize_pdf(pdf_file, max_summary_length):
    """Summarize the text of an uploaded PDF with the fine-tuned model.

    Args:
        pdf_file: The value delivered by the ``gr.File`` input. Either a
            file path or an object exposing the path as ``.name`` is
            accepted; ``None`` means nothing was uploaded.
        max_summary_length: Upper bound on the generated summary length in
            tokens (the slider value; converted to ``int``).

    Returns:
        The cleaned summary, a short notice when no summary could be
        produced, or the error message if extraction/generation failed.
    """
    if pdf_file is None:
        return "Please upload a PDF file."

    # NOTE(review): depending on the Gradio version the upload arrives as a
    # path string or as a temp-file wrapper with `.name` - confirm.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    try:
        # Extract text from the PDF and collapse whitespace, as is done for
        # the training articles.
        input_text = " ".join(extract_text_from_pdf(pdf_path).split())
        if not input_text:
            return "No valid summary generated."

        max_length = int(max_summary_length)

        # Same task prefix and 1024-token limit as used during fine-tuning;
        # truncation also keeps very long PDFs from exhausting memory.
        encoded = new_tokenizer(
            "summarize: " + input_text,
            max_length=1024,
            truncation=True,
            return_tensors='pt',
        )

        # Generate the summary
        summary_ids = new_model.generate(
            encoded["input_ids"].to(new_model.device),
            attention_mask=encoded["attention_mask"].to(new_model.device),
            max_length=max_length,
            min_length=min(30, max_length),
            num_beams=15,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2,
        )

        # Decode the generated summary
        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Remove sentinel markers wherever they occur (also when glued to a
        # word), then normalise the remaining whitespace.
        without_sentinels = re.sub(r"<extra_id_\d+>", " ", summary)
        cleaned_summary = " ".join(without_sentinels.split())

        # Ensure the summary ends with a complete sentence
        cleaned_summary = _trim_to_last_sentence(cleaned_summary)

        return cleaned_summary if cleaned_summary else "No valid summary generated."
    except Exception as e:
        # UI boundary: surface the error text in the output box for debugging
        return str(e)
# Input widgets: the PDF upload and the summary-length slider (50-300, step 10)
pdf_input = gr.File(label="Upload PDF")
length_input = gr.Slider(50, 300, step=10, label="Max summary length")

# Wire the widgets to summarize_pdf; the summary is shown in a textbox
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=[pdf_input, length_input],
    outputs="textbox",
    title="PDF Text Summarizer",
    description="Upload a PDF file to summarize its content.",
)

# Start the app with debug mode enabled
interface.launch(debug=True)