Spaces:

adithiyyha
/

xrayreport

Runtime error

App Files Files Community

adithiyyha committed on Jan 11, 2025

Commit

0faaa54

verified ·

1 Parent(s): 10b84e8

Upload 8 files

Browse files

Files changed (8) hide show

config.py +36 -0
dataset.py +165 -0
eval.py +62 -0
gui.py +92 -0
inference.py +88 -0
model.py +198 -0
train.py +84 -0
utils.py +111 -0

config.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import albumentations as A
+import torch
+from albumentations.pytorch import ToTensorV2
+# File that utils.save_checkpoint writes and utils.load_checkpoint reads.
+CHECKPOINT_FILE = './checkpoints/x_ray_model.pth.tar'
+# Dataset root; XRayDataset reads the XML reports from its 'reports' sub-directory.
+DATASET_PATH = './dataset'
+# Directory that XRayDataset joins with '<parentImage id>.png' to locate images.
+IMAGES_DATASET = './dataset/images'
+# Device that models and tensors are moved to.
+DEVICE = 'cpu'
+# DataLoader settings used by train.py and the dataset.py smoke test.
+BATCH_SIZE = 16
+PIN_MEMORY = False
+# A token enters the vocabulary once it has been seen this many times.
+VOCAB_THRESHOLD = 2
+# Sizes passed to EncoderDecoderNet by utils.get_model_instance.
+FEATURES_SIZE = 1024
+EMBED_SIZE = 300
+HIDDEN_SIZE = 256
+# Optimizer learning rate and the epoch bound used by train.py.
+LEARNING_RATE = 4e-5
+EPOCHS = 50
+# LOAD_MODEL gates utils.can_load_checkpoint; SAVE_MODEL gates the per-epoch save in train.py.
+LOAD_MODEL = True
+SAVE_MODEL = True
+# Shared preprocessing: resize to 256x256, normalize with the listed mean/std, convert to a tensor.
+basic_transforms = A.Compose([
+    A.Resize(
+        height=256,
+        width=256
+    ),
+    A.Normalize(
+        mean=(0.485, 0.456, 0.406),
+        std=(0.229, 0.224, 0.225),
+    ),
+    ToTensorV2()
+])

dataset.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import os
+import spacy
+import torch
+import config
+import utils
+import numpy as np
+import xml.etree.ElementTree as ET
+from PIL import Image
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import Dataset, DataLoader
+# spaCy English pipeline, loaded once at import time; Vocabulary.tokenizer uses its tokenizer.
+spacy_eng = spacy.load('en_core_web_sm')
+class Vocabulary:
+    def __init__(self, freq_threshold):
+        self.itos = {
+            0: '<PAD>',
+            1: '<SOS>',
+            2: '<EOS>',
+            3: '<UNK>',
+        }
+        self.stoi = {
+            '<PAD>': 0,
+            '<SOS>': 1,
+            '<EOS>': 2,
+            '<UNK>': 3,
+        }
+        self.freq_threshold = freq_threshold
+    @staticmethod
+    def tokenizer(text):
+        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
+    def build_vocabulary(self, sentence_list):
+        frequencies = {}
+        idx = 4
+        for sent in sentence_list:
+            for word in self.tokenizer(sent):
+                if word not in frequencies:
+                    frequencies[word] = 1
+                else:
+                    frequencies[word] += 1
+                if frequencies[word] == self.freq_threshold:
+                    self.stoi[word] = idx
+                    self.itos[idx] = word
+                    idx += 1
+    def numericalize(self, text):
+        tokenized_text = self.tokenizer(text)
+        return [
+            self.stoi[token] if token in self.stoi else self.stoi['<UNK>']
+            for token in tokenized_text
+        ]
+    def __len__(self):
+        return len(self.itos)
+class XRayDataset(Dataset):
+    def __init__(self, root, transform=None, freq_threshold=3, raw_caption=False):
+        self.root = root
+        self.transform = transform
+        self.raw_caption = raw_caption
+        self.vocab = Vocabulary(freq_threshold=freq_threshold)
+        self.captions = []
+        self.imgs = []
+        for file in os.listdir(os.path.join(self.root, 'reports')):
+            if file.endswith('.xml'):
+                tree = ET.parse(os.path.join(self.root, 'reports', file))
+                frontal_img = ''
+                findings = tree.find(".//AbstractText[@Label='FINDINGS']").text
+                if findings is None:
+                    continue
+                for x in tree.findall('parentImage'):
+                    if frontal_img != '':
+                        break
+                    img = x.attrib['id']
+                    img = os.path.join(config.IMAGES_DATASET, f'{img}.png')
+                    frontal_img = img
+                if frontal_img == '':
+                    continue
+                self.captions.append(findings)
+                self.imgs.append(frontal_img)
+        self.vocab.build_vocabulary(self.captions)
+    def __getitem__(self, item):
+        img = self.imgs[item]
+        caption = utils.normalize_text(self.captions[item])
+        img = np.array(Image.open(img).convert('L'))
+        img = np.expand_dims(img, axis=-1)
+        img = img.repeat(3, axis=-1)
+        if self.transform is not None:
+            img = self.transform(image=img)['image']
+        if self.raw_caption:
+            return img, caption
+        numericalized_caption = [self.vocab.stoi['<SOS>']]
+        numericalized_caption += self.vocab.numericalize(caption)
+        numericalized_caption.append(self.vocab.stoi['<EOS>'])
+        return img, torch.as_tensor(numericalized_caption, dtype=torch.long)
+    def __len__(self):
+        return len(self.captions)
+    def get_caption(self, item):
+        return self.captions[item].split(' ')
+class CollateDataset:
+    def __init__(self, pad_idx):
+        self.pad_idx = pad_idx
+    def __call__(self, batch):
+        images, captions = zip(*batch)
+        images = torch.stack(images, 0)
+        targets = [item for item in captions]
+        targets = pad_sequence(targets, batch_first=True, padding_value=self.pad_idx)
+        return images, targets
+# Smoke test: build the dataset and a DataLoader, then print the shapes of the first batch.
+if __name__ == '__main__':
+    all_dataset = XRayDataset(
+        root=config.DATASET_PATH,
+        transform=config.basic_transforms,
+        freq_threshold=config.VOCAB_THRESHOLD,
+    )
+    train_loader = DataLoader(
+        dataset=all_dataset,
+        batch_size=config.BATCH_SIZE,
+        pin_memory=config.PIN_MEMORY,
+        drop_last=True,
+        shuffle=True,
+        # Captions differ in length, so they are padded per batch with the <PAD> index.
+        collate_fn=CollateDataset(pad_idx=all_dataset.vocab.stoi['<PAD>']),
+    )
+    for img, caption in train_loader:
+        print(img.shape, caption.shape)
+        break

eval.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import config
+import utils
+import numpy as np
+from tqdm import tqdm
+from nltk.translate.bleu_score import sentence_bleu
+def check_accuracy(dataset, model):
+    print('=> Testing')
+    model.eval()
+    bleu1_score = []
+    bleu2_score = []
+    bleu3_score = []
+    bleu4_score = []
+    for image, caption in tqdm(dataset):
+        image = image.to(config.DEVICE)
+        generated = model.generate_caption(image.unsqueeze(0), max_length=len(caption.split(' ')))
+        bleu1_score.append(
+            sentence_bleu([caption.split()], generated, weights=(1, 0, 0, 0))
+        )
+        bleu2_score.append(
+            sentence_bleu([caption.split()], generated, weights=(0.5, 0.5, 0, 0))
+        )
+        bleu3_score.append(
+            sentence_bleu([caption.split()], generated, weights=(0.33, 0.33, 0.33, 0))
+        )
+        bleu4_score.append(
+            sentence_bleu([caption.split()], generated, weights=(0.25, 0.25, 0.25, 0.25))
+        )
+    print(f'=> BLEU 1: {np.mean(bleu1_score)}')
+    print(f'=> BLEU 2: {np.mean(bleu2_score)}')
+    print(f'=> BLEU 3: {np.mean(bleu3_score)}')
+    print(f'=> BLEU 4: {np.mean(bleu4_score)}')
+def main():
+    all_dataset = utils.load_dataset(raw_caption=True)
+    model = utils.get_model_instance(all_dataset.vocab)
+    utils.load_checkpoint(model)
+    _, test_dataset = utils.train_test_split(dataset=all_dataset)
+    check_accuracy(
+        test_dataset,
+        model
+    )
+if __name__ == '__main__':
+    main()

gui.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import config
+import utils
+import numpy as np
+from tkinter import *
+from PIL import Image, ImageTk
+from tkinter import filedialog
+# State shared by the Tk callbacks: `label` and `image` are set in choose_image,
+# `model` is set in the __main__ block.
+label = None
+image = None
+model = None
+def choose_image():
+    global label, image
+    path = filedialog.askopenfilename(initialdir='images', title='Select Photo')
+    screen = Toplevel(root)
+    screen.title('Report Generator')
+    ff1 = Frame(screen, bg='grey', borderwidth=6, relief=GROOVE)
+    ff1.pack(side=TOP,fill=X)
+    ff2 = Frame(screen, bg='grey', borderwidth=6, relief=GROOVE)
+    ff2.pack(side=TOP, fill=X)
+    ff4 = Frame(screen, bg='grey', borderwidth=6, relief=GROOVE)
+    ff4.pack(side=TOP, fill=X)
+    ff3 = Frame(screen, bg='grey', borderwidth=6, relief=GROOVE)
+    ff3.pack(side=TOP, fill=X)
+    Label(ff1, text='Select X-Ray', fg='white', bg='grey', font='Helvetica 16 bold').pack()
+    original_img = Image.open(path).convert('L')
+    image = np.array(original_img)
+    image = np.expand_dims(image, axis=-1)
+    image = image.repeat(3, axis=-1)
+    image = config.basic_transforms(image=image)['image']
+    photo = ImageTk.PhotoImage(original_img)
+    Label(ff2, image=photo).pack()
+    label = Label(ff4, text='', fg='blue', bg='gray', font='Helvetica 16 bold')
+    label.pack()
+    Button(ff3, text='Generate Report', bg='violet', command=generate_report, height=2, width=20, font='Helvetica 16 bold').pack(side=LEFT)
+    Button(ff3, text='Quit', bg='red', command=quit_gui, height=2, width=20, font='Helvetica 16 bold').pack()
+    screen.bind('<Configure>', lambda event: label.configure(wraplength=label.winfo_width()))
+    screen.mainloop()
+def generate_report():
+    global label, image, model
+    model.eval()
+    image = image.to(config.DEVICE)
+    report = model.generate_caption(image.unsqueeze(0), max_length=25)
+    label.config(text=report, fg='violet', bg='green', font='Helvetica 16 bold', width=40)
+    label.update_idletasks()
+def quit_gui():
+    """Destroy the module-level root window."""
+    root.destroy()
+# Main window, built at import time; choose_image and quit_gui reference `root`.
+root = Tk()
+root.title('Chest X-Ray Report Generator')
+f1 = Frame(root, bg='grey', borderwidth=6, relief=GROOVE)
+f1.pack(side=TOP, fill=X)
+f2 = Frame(root, bg='grey', borderwidth=6, relief=GROOVE)
+f2.pack(side=TOP, fill=X)
+Label(f1, text='Welcome to Chest X-Ray Report Generator', fg='white', bg='grey', font='Helvetica 16 bold').pack()
+btn1 = Button(root, text='Choose Chest X-Ray', command=choose_image, height=2, width=20, bg='blue', font="Helvetica 16 bold", pady=10)
+btn1.pack()
+Button(root, text='Quit', command=quit_gui, height=2, width=20, bg='violet', font='Helvetica 16 bold', pady=10).pack()
+if __name__ == '__main__':
+    # The dataset is loaded only to obtain the vocabulary the model is built with.
+    model = utils.get_model_instance(utils.load_dataset().vocab)
+    utils.load_checkpoint(model)
+    root.mainloop()

inference.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import os
+import torch
+import config
+from utils import (
+    load_dataset,
+    get_model_instance,
+    load_checkpoint,
+    can_load_checkpoint,
+    normalize_text,
+)
+from PIL import Image
+import torchvision.transforms as transforms
+# Device that preprocess_image moves the input tensor to.
+DEVICE = 'cpu'
+# torchvision preprocessing pipeline: resize, tensor conversion, mean/std normalization.
+# NOTE(review): config.basic_transforms resizes to 256x256 - confirm which input
+# size the checkpoint was trained with.
+TRANSFORMS = transforms.Compose([
+    transforms.Resize((224, 224)),  # Replace with your model's expected input size
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+])
+def load_model():
+    """
+    Loads the model with the vocabulary and checkpoint.
+    """
+    print("Loading dataset and vocabulary...")
+    dataset = load_dataset()  # Load dataset to access vocabulary
+    vocabulary = dataset.vocab  # Assuming 'vocab' is an attribute of the dataset
+    print("Initializing the model...")
+    model = get_model_instance(vocabulary)  # Initialize the model
+    if can_load_checkpoint():
+        print("Loading checkpoint...")
+        load_checkpoint(model)
+    else:
+        print("No checkpoint found, starting with untrained model.")
+    model.eval()  # Set the model to evaluation mode
+    print("Model is ready for inference.")
+    return model
+def preprocess_image(image_path):
+    """
+    Preprocess the input image for the model.
+    """
+    print(f"Preprocessing image: {image_path}")
+    image = Image.open(image_path).convert("RGB")  # Ensure RGB format
+    image = TRANSFORMS(image).unsqueeze(0)  # Add batch dimension
+    return image.to(DEVICE)
+def generate_report(model, image_path):
+    """
+    Generates a report for a given image using the model.
+    """
+    image = preprocess_image(image_path)
+    print("Generating report...")
+    with torch.no_grad():
+        # Assuming the model has a 'generate_caption' method
+        output = model.generate_caption(image, max_length=25)
+        report = " ".join(output)
+    print(f"Generated report: {report}")
+    return report
+if __name__ == "__main__":
+    # Checkpoint location from config; not used below, because
+    # utils.load_checkpoint reads config.CHECKPOINT_FILE itself.
+    CHECKPOINT_PATH = config.CHECKPOINT_FILE  # Ensure config.CHECKPOINT_FILE is correctly set
+    # Path to the input image
+    IMAGE_PATH = "./dataset/images/CXR1178_IM-0121-1001.png"  # Replace with your image path
+    # Build the model (restores the checkpoint when one can be loaded)
+    model = load_model()
+    # Only run inference when the image file exists
+    if os.path.exists(IMAGE_PATH):
+        report = generate_report(model, IMAGE_PATH)
+        print("Final Report:", report)
+    else:
+        print(f"Image not found at path: {IMAGE_PATH}")

model.py ADDED Viewed

	@@ -0,0 +1,198 @@

+import re
+import torch
+import config
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.models as models
+from collections import OrderedDict
+class DenseNet121(nn.Module):
+    def __init__(self, out_size=14, checkpoint=None):
+        super(DenseNet121, self).__init__()
+        self.densenet121 = models.densenet121(weights='DEFAULT')
+        num_classes = self.densenet121.classifier.in_features
+        self.densenet121.classifier = nn.Sequential(
+            nn.Linear(num_classes, out_size),
+            nn.Sigmoid()
+        )
+        if checkpoint is not None:
+            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+            checkpoint = torch.load(checkpoint, map_location=device)
+            state_dict = checkpoint['state_dict']
+            new_state_dict = OrderedDict()
+            for k, v in state_dict.items():
+                if 'module' not in k:
+                    k = f'module.{k}'
+                else:
+                    k = k.replace('module.densenet121.features', 'features')
+                    k = k.replace('module.densenet121.classifier', 'classifier')
+                    k = k.replace('.norm.1', '.norm1')
+                    k = k.replace('.conv.1', '.conv1')
+                    k = k.replace('.norm.2', '.norm2')
+                    k = k.replace('.conv.2', '.conv2')
+                new_state_dict[k] = v
+            self.densenet121.load_state_dict(new_state_dict)
+    def forward(self, x):
+        return self.densenet121(x)
+class EncoderCNN(nn.Module):
+    def __init__(self, checkpoint=None):
+        super(EncoderCNN, self).__init__()
+        self.model = DenseNet121(
+            checkpoint=checkpoint
+        )
+        for param in self.model.densenet121.parameters():
+            param.requires_grad_(False)
+    def forward(self, images):
+        features = self.model.densenet121.features(images)
+        batch, maps, size_1, size_2 = features.size()
+        features = features.permute(0, 2, 3, 1)
+        features = features.view(batch, size_1 * size_2, maps)
+        return features
+class Attention(nn.Module):
+    def __init__(self, features_size, hidden_size, output_size=1):
+        super(Attention, self).__init__()
+        self.W = nn.Linear(features_size, hidden_size)
+        self.U = nn.Linear(hidden_size, hidden_size)
+        self.v = nn.Linear(hidden_size, output_size)
+    def forward(self, features, decoder_output):
+        decoder_output = decoder_output.unsqueeze(1)
+        w = self.W(features)
+        u = self.U(decoder_output)
+        scores = self.v(torch.tanh(w + u))
+        weights = F.softmax(scores, dim=1)
+        context = torch.sum(weights * features, dim=1)
+        weights = weights.squeeze(2)
+        return context, weights
+class DecoderRNN(nn.Module):
+    def __init__(self, features_size, embed_size, hidden_size, vocab_size):
+        super(DecoderRNN, self).__init__()
+        self.vocab_size = vocab_size
+        self.embedding = nn.Embedding(vocab_size, embed_size)
+        self.lstm = nn.LSTMCell(embed_size + features_size, hidden_size)
+        self.fc = nn.Linear(hidden_size, vocab_size)
+        self.attention = Attention(features_size, hidden_size)
+        self.init_h = nn.Linear(features_size, hidden_size)
+        self.init_c = nn.Linear(features_size, hidden_size)
+    def forward(self, features, captions):
+        embeddings = self.embedding(captions)
+        h, c = self.init_hidden(features)
+        seq_len = len(captions[0]) - 1
+        features_size = features.size(1)
+        batch_size = captions.size(0)
+        outputs = torch.zeros(batch_size, seq_len, self.vocab_size).to(config.DEVICE)
+        atten_weights = torch.zeros(batch_size, seq_len, features_size).to(config.DEVICE)
+        for i in range(seq_len):
+            context, attention = self.attention(features, h)
+            inputs = torch.cat((embeddings[:, i, :], context), dim=1)
+            h, c = self.lstm(inputs, (h, c))
+            h = F.dropout(h, p=0.5)
+            output = self.fc(h)
+            outputs[:, i, :] = output
+            atten_weights[:, i, :] = attention
+        return outputs, atten_weights
+    def init_hidden(self, features):
+        features = torch.mean(features, dim=1)
+        h = self.init_h(features)
+        c = self.init_c(features)
+        return h, c
+class EncoderDecoderNet(nn.Module):
+    def __init__(self, features_size, embed_size, hidden_size, vocabulary, encoder_checkpoint=None):
+        super(EncoderDecoderNet, self).__init__()
+        self.vocabulary = vocabulary
+        self.encoder = EncoderCNN(
+            checkpoint=encoder_checkpoint
+        )
+        self.decoder = DecoderRNN(
+            features_size=features_size,
+            embed_size=embed_size,
+            hidden_size=hidden_size,
+            vocab_size=len(self.vocabulary)
+        )
+    def forward(self, images, captions):
+        features = self.encoder(images)
+        outputs, _ = self.decoder(features, captions)
+        return outputs
+    def generate_caption(self, image, max_length=25):
+        caption = []
+        with torch.no_grad():
+            features = self.encoder(image)
+            h, c = self.decoder.init_hidden(features)
+            word = torch.tensor(self.vocabulary.stoi['<SOS>']).view(1, -1).to(config.DEVICE)
+            embeddings = self.decoder.embedding(word).squeeze(0)
+            for _ in range(max_length):
+                context, _ = self.decoder.attention(features, h)
+                inputs = torch.cat((embeddings, context), dim=1)
+                h, c  = self.decoder.lstm(inputs, (h, c))
+                output = self.decoder.fc(F.dropout(h, p=0.5))
+                output = output.view(1, -1)
+                predicted = output.argmax(1)
+                if self.vocabulary.itos[predicted.item()] == '<EOS>':
+                    break
+                caption.append(predicted.item())
+                embeddings = self.decoder.embedding(predicted)
+        return [self.vocabulary.itos[idx] for idx in caption]

train.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import config
+import utils
+import torch.nn as nn
+import torch.optim as optim
+import numpy as np
+from tqdm import tqdm
+from torch.utils.data import DataLoader
+from dataset import CollateDataset
+def train_epoch(loader, model, optimizer, loss_fn, epoch):
+    model.train()
+    losses = []
+    loader = tqdm(loader)
+    for img, captions in loader:
+        img = img.to(config.DEVICE)
+        captions = captions.to(config.DEVICE)
+        output = model(img, captions)
+        loss = loss_fn(
+            output.reshape(-1, output.shape[2]),
+            captions[:, 1:].reshape(-1)
+        )
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        loader.set_postfix(loss=loss.item())
+        losses.append(loss.item())
+    if config.SAVE_MODEL:
+        utils.save_checkpoint({
+            'state_dict': model.state_dict(),
+            'optimizer': optimizer.state_dict(),
+            'epoch': epoch,
+            'loss': np.mean(losses)
+        })
+    print(f'Epoch[{epoch}]: Loss {np.mean(losses)}')
+def main():
+    all_dataset = utils.load_dataset()
+    train_dataset, _ = utils.train_test_split(dataset=all_dataset)
+    train_loader = DataLoader(
+        dataset=train_dataset,
+        batch_size=config.BATCH_SIZE,
+        pin_memory=config.PIN_MEMORY,
+        drop_last=False,
+        shuffle=True,
+        collate_fn=CollateDataset(pad_idx=all_dataset.vocab.stoi['<PAD>']),
+    )
+    model = utils.get_model_instance(all_dataset.vocab)
+    optimizer = optim.Adam(model.parameters(), lr=config.LEARNING_RATE)
+    loss_fn = nn.CrossEntropyLoss(ignore_index=all_dataset.vocab.stoi['<PAD>'])
+    starting_epoch = 1
+    if utils.can_load_checkpoint():
+        starting_epoch = utils.load_checkpoint(model, optimizer)
+    for epoch in range(starting_epoch, config.EPOCHS):
+        train_epoch(
+            train_loader,
+            model,
+            optimizer,
+            loss_fn,
+            epoch
+        )
+if __name__ == '__main__':
+    main()

utils.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import os
+import re
+import html
+import string
+import torch
+import config
+import unicodedata
+from nltk.tokenize import word_tokenize
+from dataset import XRayDataset
+from model import EncoderDecoderNet
+from torch.utils.data import Subset
+from sklearn.model_selection import train_test_split as sklearn_train_test_split
+def load_dataset(raw_caption=False):
+    return XRayDataset(
+        root=config.DATASET_PATH,
+        transform=config.basic_transforms,
+        freq_threshold=config.VOCAB_THRESHOLD,
+        raw_caption=raw_caption
+    )
+def get_model_instance(vocabulary):
+    model = EncoderDecoderNet(
+        features_size=config.FEATURES_SIZE,
+        embed_size=config.EMBED_SIZE,
+        hidden_size=config.HIDDEN_SIZE,
+        vocabulary=vocabulary,
+        encoder_checkpoint='./weights/chexnet.pth.tar'
+    )
+    model = model.to(config.DEVICE)
+    return model
+def train_test_split(dataset, test_size=0.25, random_state=44):
+    train_idx, test_idx = sklearn_train_test_split(
+        list(range(len(dataset))),
+        test_size=test_size,
+        random_state=random_state
+    )
+    return Subset(dataset, train_idx), Subset(dataset, test_idx)
+def save_checkpoint(checkpoint):
+    print('=> Saving checkpoint')
+    torch.save(checkpoint, config.CHECKPOINT_FILE)
+def load_checkpoint(model, optimizer=None):
+    print('=> Loading checkpoint')
+    checkpoint = torch.load(config.CHECKPOINT_FILE, map_location=torch.device('cpu'))
+    model.load_state_dict(checkpoint['state_dict'])
+    if optimizer is not None:
+        optimizer.load_state_dict(checkpoint['optimizer'])
+    return checkpoint['epoch']
+def can_load_checkpoint():
+    return os.path.exists(config.CHECKPOINT_FILE) and config.LOAD_MODEL
+def remove_special_chars(text):
+    re1 = re.compile(r'  +')
+    x1 = text.lower().replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
+        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
+        '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
+        ' @-@ ', '-').replace('\\', ' \\ ')
+    return re1.sub(' ', html.unescape(x1))
+def remove_non_ascii(text):
+    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
+def to_lowercase(text):
+    """Return text converted to lower case."""
+    return text.lower()
+def remove_punctuation(text):
+    translator = str.maketrans('', '', string.punctuation)
+    return text.translate(translator)
+def replace_numbers(text):
+    return re.sub(r'\d+', '', text)
+def text2words(text):
+    """Tokenize text with nltk's word_tokenize."""
+    return word_tokenize(text)
+def normalize_text( text):
+    text = remove_special_chars(text)
+    text = remove_non_ascii(text)
+    text = remove_punctuation(text)
+    text = to_lowercase(text)
+    text = replace_numbers(text)
+    return text
+def normalize_corpus(corpus):
+    return [normalize_text(t) for t in corpus]