Draft app
Browse files- .gitignore +2 -1
- README.md +63 -2
- app_gradio.py +54 -0
- requirements.txt +5 -0
.gitignore
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
__pycache__/
|
| 2 |
*.pyc
|
| 3 |
.DS_Store
|
| 4 |
-
Sandbox_Text_Summarization.ipynb
|
|
|
|
|
|
| 1 |
__pycache__/
|
| 2 |
*.pyc
|
| 3 |
.DS_Store
|
| 4 |
+
Sandbox_Text_Summarization.ipynb
|
| 5 |
+
.gradio
|
README.md
CHANGED
|
@@ -1,2 +1,63 @@
|
|
| 1 |
-
#
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🧠 Multilingual Text Summarizer with Transformers
|
| 2 |
+
|
| 3 |
+
This project is a web-based application that summarizes English or French text using LLMs. It accepts text typed or pasted directly, as well as uploaded `.txt` and `.pdf` files, and detects the language automatically.
|
| 4 |
+
|
| 5 |
+
The project uses **Large Language Models (LLMs)** such as **BART** or **T5**, deployed via a simple, interactive **Gradio** interface.
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
## 📌 Objectives
|
| 9 |
+
|
| 10 |
+
- Automate the **synthesis of long texts** (e-mails, reports, news...)
|
| 11 |
+
- Apply **automatic summarization techniques with LLMs**.
|
| 12 |
+
- Provide a **simple and responsive user interface**.
|
| 13 |
+
- Demonstrate a **real-life case of NLP model industrialization**.
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
## 🧠 Technical stack
|
| 17 |
+
|
| 18 |
+
- [Transformers](https://huggingface.co/docs/transformers/index) - Pre-trained models (BART, T5...)
|
| 19 |
+
- [Streamlit](https://streamlit.io) - Alternative web interface (optional; the bundled app uses Gradio)
|
| 20 |
+
- [Gradio](https://www.gradio.app/) - Web interface
|
| 21 |
+
- [Python](https://www.python.org) - Processing & pipeline
|
| 22 |
+
- (Bonus) Docker, FastAPI, GitHub Actions - MLOps
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
## ✨ Features
|
| 26 |
+
- Automatic language detection (English or French)
|
| 27 |
+
- Summarization using state-of-the-art models
|
| 28 |
+
- Gradio-based web interface
|
| 29 |
+
- Supports text, .txt and .pdf inputs
|
| 30 |
+
|
| 31 |
+
## 🚀 Run the App
|
| 32 |
+
|
| 33 |
+
```bash
|
| 34 |
+
git clone https://github.com/issa-kabore/SmartSummarizer.git
|
| 35 |
+
cd SmartSummarizer
|
| 36 |
+
pip install -r requirements.txt
|
| 37 |
+
python app_gradio.py
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
## 🚀 Demo
|
| 42 |
+
👉 [Link to deployed app](https://...)
|
| 43 |
+
📸 See screenshots below
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
## 📂 Project structure
|
| 47 |
+
```text
|
| 48 |
+
SmartSummarizer/
|
| 49 |
+
│
|
| 50 |
+
├── app_gradio.py # Gradio main script (user interface)
|
| 51 |
+
├── summarizer/
|
| 52 |
+
│ ├── __init__.py
|
| 53 |
+
│ ├── models.py # Loading models and pipelines
|
| 54 |
+
│ ├── utils.py # Helpers for loading .txt/.pdf files and detecting the language
|
| 55 |
+
│ └── summarize.py # Main summary function
|
| 56 |
+
│
|
| 57 |
+
├── assets/ # (Optional) static files: images, logos, etc.
|
| 58 |
+
│
|
| 59 |
+
├── requirements.txt # Dependencies to install
|
| 60 |
+
├── README.md # Project presentation
|
| 61 |
+
└── .gitignore # Files to be ignored by Git
|
| 62 |
+
|
| 63 |
+
```
|
app_gradio.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from summarizer.summarize import generate_summary
|
| 3 |
+
|
| 4 |
+
iface = gr.Interface(
|
| 5 |
+
fn=generate_summary,
|
| 6 |
+
inputs=[
|
| 7 |
+
gr.Textbox(label="Enter text manually", lines=8, placeholder="Write or paste text here..."),
|
| 8 |
+
gr.File(label="Or upload a .txt or .pdf file", file_types=[".txt", ".pdf"]),
|
| 9 |
+
gr.Slider(10, 200, value=30, step=10, label="Min Summary Length"),
|
| 10 |
+
gr.Slider(30, 300, value=100, step=10, label="Max Summary Length"),
|
| 11 |
+
gr.Checkbox(label="Use sampling (do_sample)", value=False),
|
| 12 |
+
],
|
| 13 |
+
outputs=gr.Textbox(label="Generated Summary", elem_id="output-summary"),
|
| 14 |
+
title="📝 Multilingual Text Summarizer with LLMs",
|
| 15 |
+
description="Summarize English or French text using transformers. Supports text, PDF and TXT.",
|
| 16 |
+
theme=gr.themes.Monochrome(),
|
| 17 |
+
# gr.themes.Default()
|
| 18 |
+
# gr.themes.Base()
|
| 19 |
+
# gr.themes.Soft()
|
| 20 |
+
# gr.themes.Monochrome()
|
| 21 |
+
# gr.themes.Glass()
|
| 22 |
+
live=True, # Allow real-time interaction with the summarizer
|
| 23 |
+
examples=[
|
| 24 |
+
[
|
| 25 |
+
"""Bonjour, ceci est un exemple d'email professionnel très long. Nous avons plusieurs documents importants à examiner. Le premier document concerne la gestion des ressources humaines, et le second porte sur l'optimisation des processus logistiques pour améliorer l'efficacité des opérations de transport. Les deux documents contiennent des informations clés sur les changements organisationnels que nous avons mis en place récemment. La réunion d'aujourd'hui permettra de discuter de ces points et de prendre des décisions éclairées sur la direction future de notre entreprise. Nous avons besoin d'un résumé clair et précis des principaux changements et recommandations. Merci de prêter attention aux détails les plus importants.""",
|
| 26 |
+
],
|
| 27 |
+
[
|
| 28 |
+
"""This is a long English article that explains how machine learning models are trained using large datasets. Machine learning involves the development of algorithms that can process and analyze data to make predictions or decisions without being explicitly programmed. In this article, we will explore the different stages involved in training a machine learning model, starting with data collection, followed by data preprocessing, feature engineering, model selection, and finally, model training and evaluation. Each of these steps is crucial for building a high-performance machine learning model. We will also discuss some of the challenges faced during the training process, such as overfitting and underfitting, and how to mitigate them using techniques like cross-validation, hyperparameter tuning, and regularization. The goal is to help readers understand the entire process of model training and to provide insights into the best practices used in the industry.""",
|
| 29 |
+
]
|
| 30 |
+
],
|
| 31 |
+
css="""
|
| 32 |
+
#text-input, #file-input {
|
| 33 |
+
font-size: 16px;
|
| 34 |
+
border-radius: 8px;
|
| 35 |
+
}
|
| 36 |
+
#min-length-slider, #max-length-slider {
|
| 37 |
+
background-color: #e0e0e0;
|
| 38 |
+
}
|
| 39 |
+
#output-summary {
|
| 40 |
+
font-size: 14px;
|
| 41 |
+
font-family: Arial, sans-serif;
|
| 42 |
+
color: #333;
|
| 43 |
+
border: 1px solid #ddd;
|
| 44 |
+
border-radius: 8px;
|
| 45 |
+
padding: 12px;
|
| 46 |
+
}
|
| 47 |
+
.footer {
|
| 48 |
+
display: none; /* Remove footer */
|
| 49 |
+
}
|
| 50 |
+
"""
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
iface.launch()
|
requirements.txt
CHANGED
|
@@ -1,4 +1,9 @@
|
|
| 1 |
transformers
|
|
|
|
|
|
|
|
|
|
| 2 |
torch
|
| 3 |
langdetect
|
| 4 |
gradio # or streamlit
|
|
|
|
|
|
|
|
|
| 1 |
transformers
|
| 2 |
+
tiktoken
|
| 3 |
+
blobfile
|
| 4 |
+
sentencepiece
|
| 5 |
torch
|
| 6 |
langdetect
|
| 7 |
gradio # or streamlit
|
| 8 |
+
fpdf2 # for writing PDF files
|
| 9 |
+
pypdf2 # for reading PDF files
|