Spaces:

EveSa
/

SummaryProject

Runtime error

App Files Files Community

EveSa committed on Mar 13, 2023

Commit

89725f4

unverified ·

2 Parent(s): 5acd184 be067ba

Merge pull request #10 from EveSa/revert-9-Ling

Browse files

Files changed (3) hide show

requirements.txt +4 -82
src/fine_tune_T5.py +0 -230
src/inference_t5.py +15 -20

requirements.txt CHANGED Viewed

@@ -1,56 +1,15 @@
-absl-py==1.4.0
-aiohttp==3.8.4
-aiosignal==1.3.1
-alembic==1.9.4
-anyascii==0.3.1
 anyio==3.6.2
-async-timeout==4.0.2
-attrs==22.2.0
-banal==1.0.6
-blis==0.7.9
-catalogue==2.0.8
-certifi==2022.12.7
-charset-normalizer==3.0.1
-click==8.1.3
-confection==0.0.4
-contourpy==1.0.7
-contractions==0.1.73
-cycler==0.11.0
-cymem==2.0.7
-dataloader==2.0
-dataset==1.6.0
-datasets==2.10.1
-dill==0.3.6
-en-core-web-lg==3.5.0
-evaluate==0.4.0
-fastapi==0.91.0
-filelock==3.9.0
-flake8==6.0.0
-fonttools==4.38.0
-frozenlist==1.3.3
-fsspec==2023.3.0
-greenlet==2.0.2
-h11==0.14.0
-huggingface-hub==0.12.1
 certifi==2022.12.7
 charset-normalizer==3.1.0
 click==8.1.3
 fastapi==0.92.0
 filelock==3.9.0
 idna==3.4
-importlib-metadata==6.0.0
-importlib-resources==5.12.0
 Jinja2==3.1.2
 joblib==1.2.0
-kiwisolver==1.4.4
-langcodes==3.3.0
-Mako==1.2.4
 MarkupSafe==2.1.2
-matplotlib==3.7.0
-mccabe==0.7.0
-multidict==6.0.4
-multiprocess==0.70.14
-murmurhash==1.0.9
 numpy==1.24.2
 nvidia-cublas-cu11==11.10.3.66
 nvidia-cuda-nvrtc-cu11==11.7.99
@@ -58,48 +17,15 @@ nvidia-cuda-runtime-cu11==11.7.99
 nvidia-cudnn-cu11==8.5.0.96
 packaging==23.0
 pandas==1.5.3
-pathy==0.10.1
-Pillow==9.4.0
-preshed==3.0.8
-protobuf==3.20.0
-pyahocorasick==2.0.0
-pyarrow==11.0.0
-pycodestyle==2.10.0
-pydantic==1.10.4
-pyflakes==3.0.1
-pyparsing==3.0.9
 python-dateutil==2.8.2
-python-multipart==0.0.5
 pytz==2022.7.1
 PyYAML==6.0
 regex==2022.10.31
 requests==2.28.2
-responses==0.18.0
-rouge-score==0.1.2
-scikit-learn==1.2.1
-scipy==1.10.0
-sentencepiece==0.1.97
 six==1.16.0
-smart-open==6.3.0
 sniffio==1.3.0
-spacy==3.5.0
-spacy-legacy==3.0.12
-spacy-loggers==1.0.4
-SQLAlchemy==1.4.46
-srsly==2.4.5
-starlette==0.24.0
-summarizer==0.0.7
-textsearch==0.0.24
-thinc==8.1.7
-threadpoolctl==3.1.0
-tokenizers==0.13.2
-tomli==2.0.1
-torch==1.13.1
-tqdm==4.64.1
-transformers==4.26.1
-typer==0.7.0
-typing-extensions==4.4.0
-urllib3==1.26.14
 starlette==0.25.0
 tokenizers==0.13.2
 torch==1.13.1
@@ -107,7 +33,3 @@ tqdm==4.65.0
 typing_extensions==4.5.0
 urllib3==1.26.15
 uvicorn==0.20.0
-wasabi==1.1.1
-xxhash==3.2.0
-yarl==1.8.2
-zipp==3.14.0

 anyio==3.6.2
 certifi==2022.12.7
 charset-normalizer==3.1.0
 click==8.1.3
 fastapi==0.92.0
 filelock==3.9.0
+h11==0.14.0
+huggingface-hub==0.13.1
 idna==3.4
 Jinja2==3.1.2
 joblib==1.2.0
 MarkupSafe==2.1.2
 numpy==1.24.2
 nvidia-cublas-cu11==11.10.3.66
 nvidia-cuda-nvrtc-cu11==11.7.99
 nvidia-cudnn-cu11==8.5.0.96
 packaging==23.0
 pandas==1.5.3
+pydantic==1.10.5
 python-dateutil==2.8.2
+python-multipart==0.0.6
 pytz==2022.7.1
 PyYAML==6.0
 regex==2022.10.31
 requests==2.28.2
 six==1.16.0
 sniffio==1.3.0
 starlette==0.25.0
 tokenizers==0.13.2
 torch==1.13.1
 typing_extensions==4.5.0
 urllib3==1.26.15
 uvicorn==0.20.0

src/fine_tune_T5.py DELETED Viewed

@@ -1,230 +0,0 @@
-import re
-import os
-import string
-import contractions
-import torch
-import datasets
-from datasets import Dataset
-import pandas as pd
-from tqdm import tqdm
-import evaluate
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig
-from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
-from transformers import DataCollatorForSeq2Seq
-def clean_text(texts):
-    '''This function cleans the text for future use'''
-    texts = texts.lower()
-    texts = contractions.fix(texts)
-    texts = texts.translate(str.maketrans("", "", string.punctuation))
-    texts = re.sub(r'\n', ' ', texts)
-    return texts
-def datasetmaker(path=str):
-    '''This function takes the jsonl file, reads it into a dataframe,
-     removes the columns not needed for the task and turns it into a Dataset object
-    '''
-    data = pd.read_json(path, lines=True)
-    df = data.drop(['url',
-                    'archive',
-                    'title',
-                    'date',
-                    'compression',
-                    'coverage',
-                    'density',
-                    'compression_bin',
-                    'coverage_bin',
-                    'density_bin'],
-                   axis=1)
-    tqdm.pandas()
-    df['text'] = df.text.apply(lambda texts: clean_text(texts))
-    df['summary'] = df.summary.apply(lambda summary: clean_text(summary))
-    dataset = Dataset.from_dict(df)
-    return dataset
-# voir si le modèle par hasard est déjà bien
-# test_text = dataset['text'][0]
-# pipe = pipeline('summarization', model = model_ckpt)
-# pipe_out = pipe(test_text)
-# print(pipe_out[0]['summary_text'].replace('.<n>', '.\n'))
-# print(dataset['summary'][0])
-def generate_batch_sized_chunks(list_elements, batch_size):
-    """Split the dataset into smaller batches that we can process simultaneously.
-    Yield successive batch-sized chunks from list_elements."""
-    for i in range(0, len(list_elements), batch_size):
-        yield list_elements[i: i + batch_size]
-def calculate_metric(dataset, metric, model, tokenizer,
-                     batch_size, device,
-                     column_text='text',
-                     column_summary='summary'):
-    article_batches = list(
-        str(generate_batch_sized_chunks(dataset[column_text], batch_size)))
-    target_batches = list(
-        str(generate_batch_sized_chunks(dataset[column_summary], batch_size)))
-    for article_batch, target_batch in tqdm(
-            zip(article_batches, target_batches), total=len(article_batches)):
-        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
-                           padding="max_length", return_tensors="pt")
-        # parameter for length penalty ensures that the model does not
-        # generate sequences that are too long.
-        summaries = model.generate(
-            input_ids=inputs["input_ids"].to(device),
-            attention_mask=inputs["attention_mask"].to(device),
-            length_penalty=0.8,
-            num_beams=8,
-            max_length=128)
-        # Décode les textes
-        # remplacer les tokens, ajouter des textes décodés avec les références
-        # vers la métrique.
-        decoded_summaries = [
-            tokenizer.decode(
-                s,
-                skip_special_tokens=True,
-                clean_up_tokenization_spaces=True) for s in summaries]
-        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
-        metric.add_batch(
-            predictions=decoded_summaries,
-            references=target_batch)
-    # compute et return les ROUGE scores.
-    results = metric.compute()
-    rouge_names = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
-    rouge_dict = dict((rn, results[rn]) for rn in rouge_names)
-    return pd.DataFrame(rouge_dict, index=['T5'])
-def convert_ex_to_features(example_batch):
-    input_encodings = tokenizer(example_batch['text'],
-                                max_length=1024, truncation=True)
-    labels = tokenizer(
-        example_batch['summary'],
-        max_length=128,
-        truncation=True)
-    return {
-        'input_ids': input_encodings['input_ids'],
-        'attention_mask': input_encodings['attention_mask'],
-        'labels': labels['input_ids']
-    }
-if __name__ == '__main__':
-    train_dataset = datasetmaker('data/train_extract.jsonl')
-    dev_dataset = datasetmaker('data/dev_extract.jsonl')
-    test_dataset = datasetmaker('data/test_extract.jsonl')
-    dataset = datasets.DatasetDict({'train': train_dataset,
-                                    'dev': dev_dataset, 'test': test_dataset})
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    tokenizer = AutoTokenizer.from_pretrained('google/mt5-small')
-    mt5_config = AutoConfig.from_pretrained(
-        'google/mt5-small',
-        max_length=128,
-        length_penalty=0.6,
-        no_repeat_ngram_size=2,
-        num_beams=15,
-    )
-    model = (AutoModelForSeq2SeqLM
-             .from_pretrained('google/mt5-small', config=mt5_config)
-             .to(device))
-    dataset_pt = dataset.map(
-        convert_ex_to_features,
-        remove_columns=[
-            "summary",
-            "text"],
-        batched=True,
-        batch_size=128)
-    data_collator = DataCollatorForSeq2Seq(
-        tokenizer, model=model, return_tensors="pt")
-    training_args = Seq2SeqTrainingArguments(
-        output_dir="t5_summary",
-        log_level="error",
-        num_train_epochs=10,
-        learning_rate=5e-4,
-        warmup_steps=0,
-        optim="adafactor",
-        weight_decay=0.01,
-        per_device_train_batch_size=2,
-        per_device_eval_batch_size=1,
-        gradient_accumulation_steps=16,
-        evaluation_strategy="steps",
-        eval_steps=100,
-        predict_with_generate=True,
-        generation_max_length=128,
-        save_steps=500,
-        logging_steps=10,
-        # push_to_hub = True
-    )
-    trainer = Seq2SeqTrainer(
-        model=model,
-        args=training_args,
-        data_collator=data_collator,
-        # compute_metrics = calculate_metric,
-        train_dataset=dataset_pt['train'],
-        eval_dataset=dataset_pt['dev'].select(range(10)),
-        tokenizer=tokenizer,
-    )
-    trainer.train()
-    rouge_metric = evaluate.load("rouge")
-    score = calculate_metric(
-        test_dataset,
-        rouge_metric,
-        trainer.model,
-        tokenizer,
-        batch_size=2,
-        device=device,
-        column_text='text',
-        column_summary='summary')
-    print(score)
-    # Fine tuning terminé et à sauvegarder
-    # save fine-tuned model in local
-    os.makedirs("t5_summary", exist_ok=True)
-    if hasattr(trainer.model, "module"):
-        trainer.model.module.save_pretrained("t5_summary")
-    else:
-        trainer.model.save_pretrained("t5_summary")
-    tokenizer.save_pretrained("t5_summary")
-    # load local model
-    model = (AutoModelForSeq2SeqLM
-             .from_pretrained("t5_summary")
-             .to(device))
-    # mettre en usage : TEST
-    # gen_kwargs = {"length_penalty" : 0.8, "num_beams" : 8, "max_length" : 128}
-    # sample_text = dataset["test"][0]["text"]
-    # reference = dataset["test"][0]["summary"]
-    # pipe = pipeline("summarization", model='./summarization_t5')
-    # print("Text :")
-    # print(sample_text)
-    # print("\nReference Summary :")
-    # print(reference)
-    # print("\nModel Summary :")
-    # print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

src/inference_t5.py CHANGED Viewed

@@ -7,16 +7,14 @@ import re
 import string
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-def clean_text(texts: str) -> str:
     texts = texts.lower()
     texts = contractions.fix(texts)
     texts = texts.translate(str.maketrans("", "", string.punctuation))
-    texts = re.sub(r'\n', ' ', texts)
     return texts
-def inferenceAPI(text: str) -> str:
     """
     Predict the summary for an input text
     --------
@@ -27,16 +25,14 @@ def inferenceAPI(text: str) -> str:
         str
             The summary for the input text
     """
-    # On définit les paramètres d'entrée pour le modèle
-    text = clean_text(text)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    tokenizer = (AutoTokenizer.from_pretrained("Linggg/t5_summary"))
-    # load local model
     model = (AutoModelForSeq2SeqLM
-             .from_pretrained("Linggg/t5_summary")
-             .to(device))
     text_encoding = tokenizer(
         text,
         max_length=1024,
@@ -56,12 +52,11 @@ def inferenceAPI(text: str) -> str:
     )
     preds = [
-        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-        for gen_id in generated_ids
     ]
     return "".join(preds)
-# if __name__ == "__main__":
-#     text = input('Entrez votre phrase à résumer : ')
-#     print('summary:', inferenceAPI(text))

 import string
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+def clean_data(texts):
     texts = texts.lower()
     texts = contractions.fix(texts)
     texts = texts.translate(str.maketrans("", "", string.punctuation))
+    texts = re.sub(r'\n',' ',texts)
     return texts
+def inferenceAPI_t5(text: str) -> str:
     """
     Predict the summary for an input text
     --------
         str
             The summary for the input text
     """
+    # définition des paramètres d'entrée pour le modèle
+    text = clean_data(text)
+    device = torch.device("cpu" if torch.cuda.is_available() else "cpu")
+    tokenizer= (AutoTokenizer.from_pretrained("./summarization_t5"))
+    # chargement du modèle local
     model = (AutoModelForSeq2SeqLM
+            .from_pretrained("./summarization_t5")
+            .to(device))
     text_encoding = tokenizer(
         text,
         max_length=1024,
     )
     preds = [
+            tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+            for gen_id in generated_ids
     ]
     return "".join(preds)
+if __name__ == "__main__":
+     text = input('Entrez votre phrase à résumer : ')
+     print('summary:',inferenceAPI(text))