Upload folder using huggingface_hub
Files changed:
- .gitattributes +2 -35
- .gitignore +162 -0
- LICENSE +28 -0
- README.md +114 -12
- accuracy_and_loss.PNG +0 -0
- api.py +78 -0
- bert_classification.py +92 -0
- inshort_news_data.csv +0 -0
- main.py +31 -0
- models/trained_model.pth +3 -0
- models/trained_model1.pth +3 -0
- news_dataset.py +41 -0
- requirements.txt +10 -0
- utils.py +35 -0
.gitattributes
CHANGED
@@ -1,35 +1,2 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+models/trained_model.pth filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
LICENSE
ADDED
BSD 3-Clause License

Copyright (c) 2024, Lauriane MBAGDJE DORENAN

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md
CHANGED
The previous README held only an empty front-matter stub ("---", a bare "title:", and blank lines). The new contents:

---
title: News_article_classification_bert
app_file: main.py
sdk: gradio
sdk_version: 4.37.2
---
# news_classification
News Article Classification: Combining Headlines and Articles to Categorize News

# **News Classification Using BERT**
This project uses BERT (Bidirectional Encoder Representations from Transformers) to classify news articles into predefined categories. The model achieves an accuracy of 96% and a loss of 0.1 on the test dataset.

## **Dataset**
The dataset used in this project is inshort_news_data.csv, containing short news articles categorized into various topics.

## **Model Architecture**
The model is a custom BERT network fine-tuned for sequence classification:

- BERT model: bert-base-uncased
- Batch size: 8
- Optimizer: Adam with learning rate 2e-5
- Loss function: CrossEntropyLoss

## **Training**
The model is trained for 3 epochs with the following steps:

**Data preparation:** The dataset is tokenized using the BERT tokenizer and prepared as PyTorch DataLoader objects.

**Training:** The model is trained with mini-batch backpropagation using the Adam optimizer; the loss is minimized and the weights are updated iteratively.

**Evaluation:** After each epoch, the model is evaluated on a held-out validation set to measure accuracy and loss.

## **Results**
- Accuracy: 96%
- Loss: 0.1

## **Usage**
To use the trained model for inference:

1. Ensure all dependencies are installed (transformers, torch, fastapi, pydantic, etc.).
2. Load the model using torch.load() and the appropriate tokenizer.
3. Send POST requests to the /predict/ endpoint with a JSON body containing headline and article fields to classify news articles.

## **How to Run**
To run the FastAPI application:

    uvicorn api:app --host localhost --port 8080

Navigate to http://localhost:8080/docs to interact with the API using the Swagger UI.

---------------------------------------------------------------------------------------------------
# News Category Classification with BERT

This project automatically classifies news categories from article headlines and content using a pre-trained BERT model.

## Project Contents

- `bert_classification.py`: Defines the `CustomBert` model used for classification.
- `news_dataset.py`: Implements the `NewsDataset` class for loading and preprocessing the news dataset.
- `utils.py`: Provides utility functions for loading the trained model and making predictions.
- `main.py`: Loads a trained model for news category classification, creates a web user interface with Gradio so users can submit headlines and articles, and displays the predicted category.
- `api.py`: Implements a web API using FastAPI for real-time news category prediction.

## Installing Dependencies

Make sure you have Python 3.7+ installed along with the required packages:

    pip install -r requirements.txt

## Training the Model
To train the model, run bert_classification.py (it contains the training loop and saves the weights). Make sure you have a CSV file inshort_news_data.csv with the columns news_headline and news_article.

    python bert_classification.py

## Training Details

- Batch size: 8 (default)
- Epochs: 3 (default)
- Accuracy: 96%, loss: 0.1 after training.
- Model saved to ./models/trained_model1.pth.

## Using the Web API
To use the web API for news category prediction:

1. Launch the API with FastAPI by running api.py:

       uvicorn api:app --host localhost --port 8080

2. Go to http://localhost:8080 in your browser to check that the API is online.
3. Send POST requests to [http://localhost:8080/predict/](http://localhost:8080/docs#/default/prediction_predict__post) with the required input data to get news category predictions.

Example JSON request for a prediction:

```json
{
  "headline": "50-year-old problem of biology solved by Artificial Intelligence",
  "article": "DeepMind's AI system 'AlphaFold' has been recognised as a solution to \"protein folding\", a grand challenge in biology for over 50 years. DeepMind showed it can predict how proteins fold into 3D shapes, a complex process that is fundamental to understanding the biological machinery of life. AlphaFold can predict the shape of proteins within the width of an atom."
}
```

Expected response (the score is the softmax probability of the predicted class, rounded to two decimals):

```json
{
  "category": "Science",
  "score": 0.94
}
```

Make sure you have an active Internet connection when running the API so that the BERT tokenizer can be downloaded.
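For direct, in-process inference without the HTTP layer, here is a minimal sketch built only from the helpers this repo ships (`NewsDataset` in news_dataset.py, `load_model` and `predict_category` in utils.py); the paths are the repo defaults and the printed values are illustrative:

```python
from news_dataset import NewsDataset
from utils import load_model, predict_category

# Rebuild the label mapping from the training CSV (repo default path)
dataset = NewsDataset(csv_file="./inshort_news_data.csv", max_length=100)
model = load_model("./models/trained_model1.pth", num_classes=len(dataset.labels_dict))

category, score = predict_category(
    "50-year-old problem of biology solved by Artificial Intelligence",
    "DeepMind's AI system 'AlphaFold' has been recognised as a solution to protein folding.",
    model,
    dataset.labels_dict,
)
print(category, score)  # e.g. ('Science', 0.94)
```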
accuracy_and_loss.PNG
ADDED
api.py
ADDED
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer

from news_dataset import NewsDataset
from utils import load_model, predict_category

# Initialize FastAPI app
app = FastAPI()

# Load dataset (for the label mapping) and the trained model
dataset = NewsDataset(csv_file="./inshort_news_data.csv", max_length=100)
num_classes = len(dataset.labels_dict)
model_path = './models/trained_model1.pth'  # Path to your trained model
# An earlier, commented-out revision of this file was identical except that it
# loaded './models/trained_model.pth' instead.
model = load_model(model_path, num_classes)
labels_dict = dataset.labels_dict

# Tokenizer initialization (predict_category uses the tokenizer defined in utils.py)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Define Pydantic model for input data
class RequestPost(BaseModel):
    headline: str
    article: str

@app.get("/")
def read_root():
    return {"Hello": "World"}

# Define endpoint for prediction
@app.post("/predict/")
def prediction(request: RequestPost):
    try:
        category, score = predict_category(request.headline, request.article, model, labels_dict)
        return {"category": category, "score": score}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
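Once `uvicorn api:app --host localhost --port 8080` is running, the endpoint can be exercised from Python. A small sketch, assuming the `requests` package is installed (it is not pinned in requirements.txt):

```python
import requests  # assumed installed; not listed in requirements.txt

payload = {
    "headline": "50-year-old problem of biology solved by Artificial Intelligence",
    "article": "DeepMind's AI system 'AlphaFold' has been recognised as a solution to protein folding.",
}
resp = requests.post("http://localhost:8080/predict/", json=payload, timeout=30)
resp.raise_for_status()
print(resp.json())  # e.g. {"category": "Science", "score": 0.94}
```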
bert_classification.py
ADDED
import os

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np

from news_dataset import NewsDataset

class CustomBert(nn.Module):
    def __init__(self, model_name_or_path="bert-base-uncased", n_classes=2):
        super(CustomBert, self).__init__()
        self.bert_pretrained = BertModel.from_pretrained(model_name_or_path)
        self.classifier = nn.Linear(self.bert_pretrained.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        x = self.bert_pretrained(input_ids=input_ids, attention_mask=attention_mask)
        # Classify from the pooled [CLS] representation
        x = self.classifier(x.pooler_output)
        return x

# Training function
def training_step(model, data_loader, loss_fn, optimizer):
    model.train()
    total_loss = 0

    for data in tqdm(data_loader, total=len(data_loader)):
        input_ids = data['input_ids']
        attention_mask = data['attention_mask']
        labels = data['labels']

        output = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(output, labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    # Average loss per batch
    return total_loss / len(data_loader)

# Evaluation
def evaluation(model, test_dataloader, loss_fn):
    model.eval()
    correct_predictions = 0
    losses = []

    with torch.no_grad():  # no gradient tracking needed during evaluation
        for data in tqdm(test_dataloader, total=len(test_dataloader)):
            input_ids = data['input_ids']
            attention_mask = data['attention_mask']
            labels = data['labels']

            output = model(input_ids=input_ids, attention_mask=attention_mask)
            _, pred = output.max(1)
            correct_predictions += torch.sum(pred == labels)

            loss = loss_fn(output, labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(test_dataloader.dataset), np.mean(losses)

# Main: train, evaluate, and save the model
if __name__ == "__main__":
    dataset = NewsDataset(csv_file="./inshort_news_data.csv", max_length=100)
    num_classes = len(dataset.labels_dict)

    train_data, test_data = train_test_split(dataset, test_size=0.2)

    train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True)
    test_dataloader = DataLoader(test_data, batch_size=8, shuffle=False)

    model = CustomBert(n_classes=num_classes)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

    num_epochs = 3
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train_loss = training_step(model, train_dataloader, loss_fn, optimizer)
        print(f"Train Loss: {train_loss:.4f}")

        val_acc, val_loss = evaluation(model, test_dataloader, loss_fn)
        print(f"Validation Accuracy: {val_acc:.4f}, Validation Loss: {val_loss:.4f}")

    # Save the trained weights
    os.makedirs('./models', exist_ok=True)
    torch.save(model.state_dict(), './models/trained_model1.pth')
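The training script above runs everything on CPU. If a CUDA GPU is available, the usual pattern is to move the model and each batch onto the device; a minimal sketch of that change, not part of the original file:

```python
import torch

def to_device(batch, device):
    """Move a tokenized batch (a dict of tensors) onto the given device."""
    return {name: tensor.to(device) for name, tensor in batch.items()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# After constructing the model in the script above:
#   model = model.to(device)
# And inside the loops of training_step / evaluation, before the forward pass:
#   data = to_device(data, device)
```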
inshort_news_data.csv
ADDED
The diff for this file is too large to render; see the raw diff.
main.py
ADDED
# main.py

import gradio as gr
from utils import load_model, predict_category
from news_dataset import NewsDataset  # Import NewsDataset from news_dataset.py

def launch_app():
    dataset = NewsDataset(csv_file="./inshort_news_data.csv", max_length=100)
    num_classes = len(dataset.labels_dict)
    model_path = './models/trained_model1.pth'  # Path to the trained model
    model = load_model(model_path, num_classes)  # Load the trained model with the right number of classes

    labels_dict = dataset.labels_dict

    def predict_function(headline, article):
        return predict_category(headline, article, model, labels_dict)

    iface = gr.Interface(
        fn=predict_function,
        inputs=["text", "text"],
        outputs="text",
        title="News Category Classification",
        description="Enter a headline and an article to classify its category."
    )

    iface.launch(share=True)  # share=True also exposes a public link

if __name__ == "__main__":
    launch_app()
models/trained_model.pth
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:7949b4c99b6c2a8021bfd95d80a1fcf6567f71b7dd84a0984b80e58d94d75c36
size 438039157
models/trained_model1.pth
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:3f06afe32b4012087998ccf1edbb475dc2f84c43600f61d1b4d1f9c5af1b690d
size 438039361
news_dataset.py
ADDED
# news_dataset.py

import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class NewsDataset(Dataset):
    def __init__(self, csv_file, max_length):
        self.df = pd.read_csv(csv_file)
        self.labels = self.df['news_category'].unique()
        self.labels_dict = {label: index for index, label in enumerate(self.labels)}

        self.df['news_category'] = self.df['news_category'].map(self.labels_dict)
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        headline_text = self.df.news_headline[index]
        article_text = self.df.news_article[index]
        combined_text = headline_text + " " + article_text
        label = self.df.news_category[index]

        inputs = self.tokenizer(
            combined_text,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        )

        labels = torch.tensor(label)

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels,
        }
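To see what the dataset yields, a quick check; the label mapping depends on the category order in the CSV, so the values in the comments are illustrative:

```python
from news_dataset import NewsDataset

dataset = NewsDataset(csv_file="./inshort_news_data.csv", max_length=100)
print(len(dataset))          # number of rows in the CSV
print(dataset.labels_dict)   # e.g. {'technology': 0, 'sports': 1, ...} (illustrative)

sample = dataset[0]
print(sample["input_ids"].shape)       # torch.Size([100]), padded/truncated to max_length
print(sample["attention_mask"].shape)  # torch.Size([100])
print(sample["labels"])                # tensor holding the integer category index
```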
requirements.txt
ADDED
torch==2.0.0
transformers==4.30.0
scikit-learn==1.2.2
pandas==1.5.3
tqdm==4.65.0
numpy==1.23.5
gradio==3.4.1
fastapi
uvicorn[standard]
pydantic
utils.py
ADDED
import torch
from transformers import AutoTokenizer
import torch.nn as nn
from bert_classification import CustomBert  # Import the model from bert_classification.py

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def load_model(model_path, num_classes):
    model = CustomBert(n_classes=num_classes)  # Use the same number of classes as at training time
    model.load_state_dict(torch.load(model_path, map_location="cpu"))  # load on CPU; works without a GPU
    model.eval()
    return model

def predict_category(headline, article, model, labels_dict, max_length=100):
    text = headline + " " + article
    inputs = tokenizer(
        text,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt"
    )

    with torch.no_grad():
        output = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        probabilities = nn.Softmax(dim=1)(output)
        _, pred = torch.max(probabilities, dim=1)
        score = probabilities[0][pred].item()

    # Invert the label mapping to recover the category name
    inv_labels_dict = {v: k for k, v in labels_dict.items()}
    category = inv_labels_dict[pred.item()]

    # Softmax probability of the predicted class, rounded to two decimals
    score = round(score, 2)

    return category, score