Commit
·
3ce1088
1
Parent(s):
222cf81
Upgrade gradio
Browse files
README.md
CHANGED
|
@@ -4,9 +4,99 @@ emoji: 🏢
|
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.33.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# Sentence Transformers Demo
|
| 13 |
+
|
| 14 |
+
Interactive web application for semantic text similarity analysis using Sentence Transformers models.
|
| 15 |
+
|
| 16 |
+
## Features
|
| 17 |
+
|
| 18 |
+
### 1. Paraphrase Mining
|
| 19 |
+
- Find sentences with similar meaning in a text corpus
|
| 20 |
+
- Support for multiple language models
|
| 21 |
+
- Adjustable similarity threshold
|
| 22 |
+
- Export results in CSV format
|
| 23 |
+
|
| 24 |
+
### 2. Semantic Textual Similarity (STS)
|
| 25 |
+
- Calculate semantic similarity between two sets of sentences
|
| 26 |
+
- Uses advanced sentence transformation models
|
| 27 |
+
- Compare sentences in different languages
|
| 28 |
+
- Export results in CSV format
|
| 29 |
+
|
| 30 |
+
## Available Models
|
| 31 |
+
|
| 32 |
+
- [`Lajavaness/bilingual-embedding-large`](https://huggingface.co/Lajavaness/bilingual-embedding-large): Multilingual model optimized for multiple languages
|
| 33 |
+
- [`sentence-transformers/all-mpnet-base-v2`](https://huggingface.co/sentence-transformers/all-mpnet-base-v2): High-quality general-purpose model
|
| 34 |
+
- [`intfloat/multilingual-e5-large-instruct`](https://huggingface.co/intfloat/multilingual-e5-large-instruct): Multilingual model with instructions
|
| 35 |
+
|
| 36 |
+
## Requirements
|
| 37 |
+
|
| 38 |
+
- Python 3.8+
|
| 39 |
+
- Dependencies listed in `requirements.txt`
|
| 40 |
+
|
| 41 |
+
## Installation
|
| 42 |
+
|
| 43 |
+
1. Clone the repository:
|
| 44 |
+
```bash
|
| 45 |
+
git clone https://github.com/yourusername/sentence-transformers.git
|
| 46 |
+
cd sentence-transformers
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
2. Create and activate a virtual environment:
|
| 50 |
+
```bash
|
| 51 |
+
python -m venv venv
|
| 52 |
+
source venv/bin/activate # Linux/Mac
|
| 53 |
+
# or
|
| 54 |
+
.\venv\Scripts\activate # Windows
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
3. Install dependencies:
|
| 58 |
+
```bash
|
| 59 |
+
pip install -r requirements.txt
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
## Usage
|
| 63 |
+
|
| 64 |
+
1. Start the application:
|
| 65 |
+
```bash
|
| 66 |
+
python app.py
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
2. Open your browser at `http://localhost:7860`
|
| 70 |
+
|
| 71 |
+
3. Select the desired functionality:
|
| 72 |
+
- Paraphrase Mining: Upload a CSV file with sentences to analyze
|
| 73 |
+
- STS: Upload two CSV files with sentences to compare
|
| 74 |
+
|
| 75 |
+
4. Select the model and adjust the similarity threshold
|
| 76 |
+
|
| 77 |
+
5. Click "Process" to start the analysis
|
| 78 |
+
|
| 79 |
+
6. Download results in CSV format
|
| 80 |
+
|
| 81 |
+
## CSV File Format
|
| 82 |
+
|
| 83 |
+
CSV files must contain a column named "text" with the sentences to analyze:
|
| 84 |
+
|
| 85 |
+
```csv
|
| 86 |
+
text
|
| 87 |
+
"First sentence to analyze"
|
| 88 |
+
"Second sentence to analyze"
|
| 89 |
+
...
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
## Notes
|
| 93 |
+
|
| 94 |
+
- Temporary files are automatically cleaned up every 30 minutes
|
| 95 |
+
- Using complete sentences is recommended for better results
|
| 96 |
+
- Models may take time to load on first use
|
| 97 |
+
|
| 98 |
+
## License
|
| 99 |
+
|
| 100 |
+
MIT
|
| 101 |
+
|
| 102 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
|
@@ -4,112 +4,260 @@ import gradio as gr
|
|
| 4 |
from mining import mining
|
| 5 |
from sts import sts
|
| 6 |
from utils import getDataFrame, save_to_csv, delete_folder_periodically
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
"### Paraphrase mining is the task of finding paraphrases (texts with identical / similar meaning) in a large corpus of sentences")
|
| 15 |
-
with gr.Row():
|
| 16 |
-
with gr.Column():
|
| 17 |
-
gr.Markdown("#### sentences")
|
| 18 |
-
|
| 19 |
-
upload_button_sentences = gr.UploadButton(label="upload sentences csv", file_types=['.csv'],
|
| 20 |
-
file_count="single")
|
| 21 |
-
output_data_sentences = gr.Dataframe(headers=["text"], col_count=1, label="sentences data")
|
| 22 |
-
|
| 23 |
-
upload_button_sentences.upload(fn=getDataFrame, inputs=upload_button_sentences,
|
| 24 |
-
outputs=output_data_sentences, concurrency_limit=CONCURRENCY_LIMIT)
|
| 25 |
-
|
| 26 |
-
with gr.Row():
|
| 27 |
-
with gr.Column():
|
| 28 |
-
model = gr.Dropdown(
|
| 29 |
-
["Lajavaness/bilingual-embedding-large", "sentence-transformers/all-mpnet-base-v2",
|
| 30 |
-
"intfloat/multilingual-e5-large-instruct"], label="model", interactive=True)
|
| 31 |
-
score_mining = gr.Number(label="score", value=0.96, interactive=True)
|
| 32 |
-
submit_button_mining = gr.Button("Submit", variant="primary")
|
| 33 |
-
|
| 34 |
-
with gr.Row():
|
| 35 |
-
with gr.Column():
|
| 36 |
-
output_mining = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
|
| 37 |
-
label="Mining")
|
| 38 |
-
|
| 39 |
-
submit_button_mining.click(
|
| 40 |
-
fn=mining,
|
| 41 |
-
inputs=[model, upload_button_sentences, score_mining],
|
| 42 |
-
outputs=output_mining
|
| 43 |
-
)
|
| 44 |
-
|
| 45 |
-
download_button = gr.Button("Download Results as CSV", variant="huggingface")
|
| 46 |
-
download_file = gr.File(label="Downloadable File")
|
| 47 |
-
|
| 48 |
-
download_button.click(
|
| 49 |
-
fn=save_to_csv,
|
| 50 |
-
inputs=output_mining,
|
| 51 |
-
outputs=download_file
|
| 52 |
-
)
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
gr.Markdown("#### sentences 1")
|
| 61 |
-
upload_button_sentences1 = gr.UploadButton(label="upload sentences 1 csv", file_types=['.csv'],
|
| 62 |
-
file_count="single")
|
| 63 |
-
output_data_sentences1 = gr.Dataframe(headers=["text"], col_count=1, label="sentences 1 data")
|
| 64 |
-
|
| 65 |
-
upload_button_sentences1.upload(fn=getDataFrame, inputs=upload_button_sentences1,
|
| 66 |
-
outputs=output_data_sentences1, concurrency_limit=CONCURRENCY_LIMIT)
|
| 67 |
-
|
| 68 |
-
with gr.Column():
|
| 69 |
-
gr.Markdown("#### sentences 2")
|
| 70 |
-
upload_button_sentences2 = gr.UploadButton(label="upload sentences 2 csv", file_types=['.csv'],
|
| 71 |
-
file_count="single")
|
| 72 |
-
output_data_sentences2 = gr.Dataframe(headers=["text"], col_count=1, label="sentences 2 data")
|
| 73 |
-
|
| 74 |
-
upload_button_sentences2.upload(fn=getDataFrame, inputs=upload_button_sentences2,
|
| 75 |
-
outputs=output_data_sentences2, concurrency_limit=CONCURRENCY_LIMIT)
|
| 76 |
-
|
| 77 |
-
with gr.Row():
|
| 78 |
-
with gr.Column():
|
| 79 |
-
model = gr.Dropdown(
|
| 80 |
-
["Lajavaness/bilingual-embedding-large", "sentence-transformers/all-mpnet-base-v2",
|
| 81 |
-
"intfloat/multilingual-e5-large-instruct"], label="model", interactive=True)
|
| 82 |
-
score_sts = gr.Number(label="score", value=0.96, interactive=True)
|
| 83 |
-
submit_button_sts = gr.Button("Submit", variant="primary")
|
| 84 |
-
|
| 85 |
-
with gr.Row():
|
| 86 |
-
with gr.Column():
|
| 87 |
-
gr.Markdown("#### STS Results")
|
| 88 |
-
|
| 89 |
-
output_sts = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
|
| 90 |
-
label="Semantic Textual Similarit")
|
| 91 |
-
|
| 92 |
-
submit_button_sts.click(
|
| 93 |
-
fn=sts,
|
| 94 |
-
inputs=[model, upload_button_sentences1, upload_button_sentences2, score_sts],
|
| 95 |
-
outputs=output_sts
|
| 96 |
-
)
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
if __name__ == "__main__":
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
from mining import mining
|
| 5 |
from sts import sts
|
| 6 |
from utils import getDataFrame, save_to_csv, delete_folder_periodically
|
| 7 |
+
import logging
|
| 8 |
|
| 9 |
+
# Configure logging
|
| 10 |
+
logging.basicConfig(
|
| 11 |
+
level=logging.INFO,
|
| 12 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 13 |
+
)
|
| 14 |
+
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
+
CONCURRENCY_LIMIT = 5
|
| 17 |
+
AVAILABLE_MODELS = [
|
| 18 |
+
"Lajavaness/bilingual-embedding-large",
|
| 19 |
+
"sentence-transformers/all-mpnet-base-v2",
|
| 20 |
+
"intfloat/multilingual-e5-large-instruct"
|
| 21 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
+
MODEL_DESCRIPTIONS = {
|
| 24 |
+
"Lajavaness/bilingual-embedding-large": "Multilingual model optimized for multiple languages. [More info](https://huggingface.co/Lajavaness/bilingual-embedding-large)",
|
| 25 |
+
"sentence-transformers/all-mpnet-base-v2": "High-quality general-purpose model. [More info](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)",
|
| 26 |
+
"intfloat/multilingual-e5-large-instruct": "Multilingual model with instructions. [More info](https://huggingface.co/intfloat/multilingual-e5-large-instruct)"
|
| 27 |
+
}
|
| 28 |
|
| 29 |
+
def create_interface():
|
| 30 |
+
with gr.Blocks(title="Sentence Transformers Demo") as demo:
|
| 31 |
+
gr.Markdown("# Sentence Transformers Demo")
|
| 32 |
+
gr.Markdown("This application provides two main functionalities: Paraphrase Mining and Semantic Textual Similarity (STS).")
|
| 33 |
+
|
| 34 |
+
with gr.Tab("Paraphrase Mining"):
|
| 35 |
+
with gr.Row():
|
| 36 |
+
with gr.Column():
|
| 37 |
+
gr.Markdown(
|
| 38 |
+
"### Paraphrase Mining\n"
|
| 39 |
+
"Find paraphrases (texts with identical/similar meaning) in a large corpus of sentences.\n"
|
| 40 |
+
"Upload a CSV file containing your sentences and select a model to begin."
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
with gr.Row():
|
| 44 |
+
with gr.Column():
|
| 45 |
+
gr.Markdown("#### Input Sentences")
|
| 46 |
+
upload_button_sentences = gr.UploadButton(
|
| 47 |
+
label="Upload Sentences CSV",
|
| 48 |
+
file_types=['.csv'],
|
| 49 |
+
file_count="single",
|
| 50 |
+
variant="primary"
|
| 51 |
+
)
|
| 52 |
+
output_data_sentences = gr.Dataframe(
|
| 53 |
+
headers=["_id", "text"],
|
| 54 |
+
col_count=2,
|
| 55 |
+
label="Sentences Data",
|
| 56 |
+
interactive=False
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
upload_button_sentences.upload(
|
| 60 |
+
fn=getDataFrame,
|
| 61 |
+
inputs=upload_button_sentences,
|
| 62 |
+
outputs=output_data_sentences,
|
| 63 |
+
concurrency_limit=CONCURRENCY_LIMIT
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
with gr.Row():
|
| 67 |
+
with gr.Column():
|
| 68 |
+
model = gr.Dropdown(
|
| 69 |
+
choices=AVAILABLE_MODELS,
|
| 70 |
+
label="Select Model",
|
| 71 |
+
value=AVAILABLE_MODELS[0],
|
| 72 |
+
interactive=True
|
| 73 |
+
)
|
| 74 |
+
model_description = gr.Markdown(MODEL_DESCRIPTIONS[AVAILABLE_MODELS[0]])
|
| 75 |
+
|
| 76 |
+
def update_model_description(model_name):
|
| 77 |
+
return MODEL_DESCRIPTIONS[model_name]
|
| 78 |
+
|
| 79 |
+
model.change(
|
| 80 |
+
fn=update_model_description,
|
| 81 |
+
inputs=model,
|
| 82 |
+
outputs=model_description
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
score_mining = gr.Slider(
|
| 86 |
+
minimum=0.0,
|
| 87 |
+
maximum=1.0,
|
| 88 |
+
value=0.96,
|
| 89 |
+
step=0.01,
|
| 90 |
+
label="Similarity Threshold",
|
| 91 |
+
interactive=True
|
| 92 |
+
)
|
| 93 |
+
submit_button_mining = gr.Button("Process", variant="primary")
|
| 94 |
+
|
| 95 |
+
with gr.Row():
|
| 96 |
+
with gr.Column():
|
| 97 |
+
output_mining = gr.Dataframe(
|
| 98 |
+
headers=["score", "sentence_1", "sentence_2"],
|
| 99 |
+
type="polars",
|
| 100 |
+
label="Mining Results"
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
submit_button_mining.click(
|
| 104 |
+
fn=mining,
|
| 105 |
+
inputs=[model, upload_button_sentences, score_mining],
|
| 106 |
+
outputs=output_mining
|
| 107 |
+
).then(
|
| 108 |
+
fn=lambda x: gr.Info("Processing completed successfully!") if x is not None else gr.Error("Error processing data. Please check the logs for details."),
|
| 109 |
+
inputs=[output_mining],
|
| 110 |
+
outputs=[]
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
download_button = gr.Button("Download Results as CSV", variant="secondary")
|
| 114 |
+
download_file = gr.File(label="Downloadable File")
|
| 115 |
+
|
| 116 |
+
download_button.click(
|
| 117 |
+
fn=save_to_csv,
|
| 118 |
+
inputs=output_mining,
|
| 119 |
+
outputs=download_file
|
| 120 |
+
).then(
|
| 121 |
+
fn=lambda x: gr.Info("Results saved successfully!") if x is not None else gr.Error("Error saving results. Please check the logs for details."),
|
| 122 |
+
inputs=[download_file],
|
| 123 |
+
outputs=[]
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
with gr.Tab("Semantic Textual Similarity"):
|
| 127 |
+
with gr.Row():
|
| 128 |
+
with gr.Column():
|
| 129 |
+
gr.Markdown(
|
| 130 |
+
"### Semantic Textual Similarity (STS)\n"
|
| 131 |
+
"Calculate semantic similarity between two sets of sentences.\n"
|
| 132 |
+
"Upload two CSV files containing your sentences and select a model to begin."
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
with gr.Row():
|
| 136 |
+
with gr.Column():
|
| 137 |
+
gr.Markdown("#### First Set of Sentences")
|
| 138 |
+
upload_button_sentences1 = gr.UploadButton(
|
| 139 |
+
label="Upload First Set CSV",
|
| 140 |
+
file_types=['.csv'],
|
| 141 |
+
file_count="single",
|
| 142 |
+
variant="primary"
|
| 143 |
+
)
|
| 144 |
+
output_data_sentences1 = gr.Dataframe(
|
| 145 |
+
headers=["_id", "text"],
|
| 146 |
+
col_count=2,
|
| 147 |
+
label="First Set Data",
|
| 148 |
+
interactive=False
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
upload_button_sentences1.upload(
|
| 152 |
+
fn=getDataFrame,
|
| 153 |
+
inputs=upload_button_sentences1,
|
| 154 |
+
outputs=output_data_sentences1,
|
| 155 |
+
concurrency_limit=CONCURRENCY_LIMIT
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
with gr.Column():
|
| 159 |
+
gr.Markdown("#### Second Set of Sentences")
|
| 160 |
+
upload_button_sentences2 = gr.UploadButton(
|
| 161 |
+
label="Upload Second Set CSV",
|
| 162 |
+
file_types=['.csv'],
|
| 163 |
+
file_count="single",
|
| 164 |
+
variant="primary"
|
| 165 |
+
)
|
| 166 |
+
output_data_sentences2 = gr.Dataframe(
|
| 167 |
+
headers=["_id", "text"],
|
| 168 |
+
col_count=2,
|
| 169 |
+
label="Second Set Data",
|
| 170 |
+
interactive=False
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
upload_button_sentences2.upload(
|
| 174 |
+
fn=getDataFrame,
|
| 175 |
+
inputs=upload_button_sentences2,
|
| 176 |
+
outputs=output_data_sentences2,
|
| 177 |
+
concurrency_limit=CONCURRENCY_LIMIT
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
with gr.Row():
|
| 181 |
+
with gr.Column():
|
| 182 |
+
model = gr.Dropdown(
|
| 183 |
+
choices=AVAILABLE_MODELS,
|
| 184 |
+
label="Select Model",
|
| 185 |
+
value=AVAILABLE_MODELS[0],
|
| 186 |
+
interactive=True
|
| 187 |
+
)
|
| 188 |
+
model_description = gr.Markdown(MODEL_DESCRIPTIONS[AVAILABLE_MODELS[0]])
|
| 189 |
+
|
| 190 |
+
model.change(
|
| 191 |
+
fn=update_model_description,
|
| 192 |
+
inputs=model,
|
| 193 |
+
outputs=model_description
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
score_sts = gr.Slider(
|
| 197 |
+
minimum=0.0,
|
| 198 |
+
maximum=1.0,
|
| 199 |
+
value=0.96,
|
| 200 |
+
step=0.01,
|
| 201 |
+
label="Similarity Threshold",
|
| 202 |
+
interactive=True
|
| 203 |
+
)
|
| 204 |
+
submit_button_sts = gr.Button("Process", variant="primary")
|
| 205 |
+
|
| 206 |
+
with gr.Row():
|
| 207 |
+
with gr.Column():
|
| 208 |
+
output_sts = gr.Dataframe(
|
| 209 |
+
headers=["score", "sentences1", "sentences2"],
|
| 210 |
+
type="polars",
|
| 211 |
+
label="Similarity Results"
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
submit_button_sts.click(
|
| 215 |
+
fn=sts,
|
| 216 |
+
inputs=[model, upload_button_sentences1, upload_button_sentences2, score_sts],
|
| 217 |
+
outputs=output_sts
|
| 218 |
+
).then(
|
| 219 |
+
fn=lambda x: gr.Info("Processing completed successfully!") if x is not None else gr.Error("Error processing data. Please check the logs for details."),
|
| 220 |
+
inputs=[output_sts],
|
| 221 |
+
outputs=[]
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
download_button = gr.Button("Download Results as CSV", variant="secondary")
|
| 225 |
+
download_file = gr.File(label="Downloadable File")
|
| 226 |
+
|
| 227 |
+
download_button.click(
|
| 228 |
+
fn=save_to_csv,
|
| 229 |
+
inputs=output_sts,
|
| 230 |
+
outputs=download_file
|
| 231 |
+
).then(
|
| 232 |
+
fn=lambda x: gr.Info("Results saved successfully!") if x is not None else gr.Error("Error saving results. Please check the logs for details."),
|
| 233 |
+
inputs=[download_file],
|
| 234 |
+
outputs=[]
|
| 235 |
+
)
|
| 236 |
+
|
| 237 |
+
return demo
|
| 238 |
|
| 239 |
if __name__ == "__main__":
|
| 240 |
+
try:
|
| 241 |
+
multiprocessing.set_start_method("spawn")
|
| 242 |
+
|
| 243 |
+
# Start cleanup thread
|
| 244 |
+
folder_path = "data"
|
| 245 |
+
thread = threading.Thread(
|
| 246 |
+
target=delete_folder_periodically,
|
| 247 |
+
args=(folder_path, 1800),
|
| 248 |
+
daemon=True
|
| 249 |
+
)
|
| 250 |
+
thread.start()
|
| 251 |
|
| 252 |
+
# Create and launch interface
|
| 253 |
+
demo = create_interface()
|
| 254 |
+
demo.launch(
|
| 255 |
+
share=False,
|
| 256 |
+
server_name="0.0.0.0",
|
| 257 |
+
server_port=7860,
|
| 258 |
+
show_error=True,
|
| 259 |
+
show_api=False
|
| 260 |
+
)
|
| 261 |
+
except Exception as e:
|
| 262 |
+
logger.error(f"Error starting application: {str(e)}")
|
| 263 |
+
raise
|
mining.py
CHANGED
|
@@ -2,56 +2,89 @@ import time
|
|
| 2 |
import pandas as pd
|
| 3 |
import polars as pl
|
| 4 |
import torch
|
|
|
|
| 5 |
from datasets import Dataset
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
from sentence_transformers.util import paraphrase_mining
|
|
|
|
| 8 |
|
|
|
|
| 9 |
|
| 10 |
-
def mining(modelname, path, score):
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import polars as pl
|
| 4 |
import torch
|
| 5 |
+
import logging
|
| 6 |
from datasets import Dataset
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
from sentence_transformers.util import paraphrase_mining
|
| 9 |
+
from typing import Optional
|
| 10 |
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
|
| 13 |
+
def mining(modelname: str, path: str, score: float) -> Optional[pl.DataFrame]:
|
| 14 |
+
"""
|
| 15 |
+
Perform paraphrase mining on the input data.
|
| 16 |
+
|
| 17 |
+
Args:
|
| 18 |
+
modelname: Name of the model to use
|
| 19 |
+
path: Path to the input CSV file
|
| 20 |
+
score: Minimum similarity score threshold
|
| 21 |
+
|
| 22 |
+
Returns:
|
| 23 |
+
Optional[pl.DataFrame]: DataFrame with mining results or None if error occurs
|
| 24 |
+
"""
|
| 25 |
+
try:
|
| 26 |
+
st = time.time()
|
| 27 |
+
|
| 28 |
+
# Read and validate input data
|
| 29 |
+
data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t"))
|
| 30 |
+
original_df = pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t")
|
| 31 |
+
|
| 32 |
+
if data.num_rows == 0:
|
| 33 |
+
logger.error("No data found in input file")
|
| 34 |
+
return None
|
| 35 |
+
|
| 36 |
+
# Initialize model
|
| 37 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 38 |
+
logger.info(f"Using device: {device}")
|
| 39 |
+
|
| 40 |
+
model = SentenceTransformer(
|
| 41 |
+
modelname,
|
| 42 |
+
device=device,
|
| 43 |
+
trust_remote_code=True,
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
# Perform paraphrase mining
|
| 47 |
+
logger.info("Starting paraphrase mining...")
|
| 48 |
+
paraphrases = paraphrase_mining(
|
| 49 |
+
model,
|
| 50 |
+
data["text"],
|
| 51 |
+
corpus_chunk_size=len(data),
|
| 52 |
+
show_progress_bar=True,
|
| 53 |
+
batch_size=1024,
|
| 54 |
+
max_pairs=len(data) ** 2,
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
# Process results
|
| 58 |
+
df_pd = pd.DataFrame(paraphrases)
|
| 59 |
+
df = pl.from_pandas(df_pd)
|
| 60 |
+
df = df.rename({"0": "score", "1": "sentence_1", "2": "sentence_2"})
|
| 61 |
+
|
| 62 |
+
union_df = pl.DataFrame(data.to_pandas())
|
| 63 |
+
original_columns = original_df.columns.tolist()
|
| 64 |
+
|
| 65 |
+
# Add additional columns if present
|
| 66 |
+
additional_cols = []
|
| 67 |
+
for col in original_columns:
|
| 68 |
+
if col != "text":
|
| 69 |
+
additional_cols.extend([
|
| 70 |
+
union_df.select(pl.col(col)).to_series()[df["sentence_1"].cast(pl.Int32)].alias(f"{col}_1"),
|
| 71 |
+
union_df.select(pl.col(col)).to_series()[df["sentence_2"].cast(pl.Int32)].alias(f"{col}_2")
|
| 72 |
+
])
|
| 73 |
+
|
| 74 |
+
# Process final results
|
| 75 |
+
df = df.with_columns([
|
| 76 |
+
pl.col("score").round(3).cast(pl.Float32),
|
| 77 |
+
union_df.select(pl.col("text")).to_series()[df["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
|
| 78 |
+
union_df.select(pl.col("text")).to_series()[df["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
|
| 79 |
+
*additional_cols
|
| 80 |
+
]).filter(pl.col("score") > score).sort(["score"], descending=True)
|
| 81 |
+
|
| 82 |
+
elapsed_time = time.time() - st
|
| 83 |
+
logger.info(f'Execution time: {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))}')
|
| 84 |
+
logger.info(f'Found {len(df)} paraphrases above score threshold {score}')
|
| 85 |
+
|
| 86 |
+
return df
|
| 87 |
+
|
| 88 |
+
except Exception as e:
|
| 89 |
+
logger.error(f"Error in mining process: {str(e)}")
|
| 90 |
+
return None
|
requirements.txt
CHANGED
|
@@ -1,8 +1,10 @@
|
|
| 1 |
-
transformers
|
| 2 |
-
torch
|
| 3 |
-
pandas
|
| 4 |
-
polars
|
| 5 |
-
datasets
|
| 6 |
-
sentence-transformers[openvino,onnx-gpu,onnx]
|
| 7 |
-
multiprocess
|
| 8 |
-
gradio
|
|
|
|
|
|
|
|
|
| 1 |
+
transformers>=4.36.0
|
| 2 |
+
torch>=2.1.0
|
| 3 |
+
pandas>=2.1.0
|
| 4 |
+
polars>=0.20.0
|
| 5 |
+
datasets>=2.14.0
|
| 6 |
+
sentence-transformers[openvino,onnx-gpu,onnx]>=2.2.0
|
| 7 |
+
multiprocess>=0.70.15
|
| 8 |
+
gradio>=4.12.0
|
| 9 |
+
numpy>=1.24.0
|
| 10 |
+
tqdm>=4.66.0
|
sts.py
CHANGED
|
@@ -2,54 +2,111 @@ import time
|
|
| 2 |
import pandas as pd
|
| 3 |
import polars as pl
|
| 4 |
import torch
|
|
|
|
| 5 |
from datasets import Dataset
|
| 6 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 7 |
|
|
|
|
| 8 |
|
| 9 |
-
def sts(modelname, data1, data2, score):
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
modelname
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import polars as pl
|
| 4 |
import torch
|
| 5 |
+
import logging
|
| 6 |
from datasets import Dataset
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
+
from typing import Optional
|
| 9 |
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
|
| 12 |
+
def sts(modelname: str, data1: str, data2: str, score: float) -> Optional[pl.DataFrame]:
|
| 13 |
+
"""
|
| 14 |
+
Calculate semantic textual similarity between two sets of sentences.
|
| 15 |
+
|
| 16 |
+
Args:
|
| 17 |
+
modelname: Name of the model to use
|
| 18 |
+
data1: Path to first input CSV file
|
| 19 |
+
data2: Path to second input CSV file
|
| 20 |
+
score: Minimum similarity score threshold
|
| 21 |
+
|
| 22 |
+
Returns:
|
| 23 |
+
Optional[pl.DataFrame]: DataFrame with similarity results or None if error occurs
|
| 24 |
+
"""
|
| 25 |
+
try:
|
| 26 |
+
st = time.time()
|
| 27 |
+
|
| 28 |
+
# Initialize model
|
| 29 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 30 |
+
logger.info(f"Using device: {device}")
|
| 31 |
+
|
| 32 |
+
model = SentenceTransformer(
|
| 33 |
+
modelname,
|
| 34 |
+
device=device,
|
| 35 |
+
trust_remote_code=True,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
# Read and validate input data
|
| 39 |
+
sentences1 = Dataset.from_pandas(pd.read_csv(data1, on_bad_lines='skip', header=0, sep="\t"))
|
| 40 |
+
sentences2 = Dataset.from_pandas(pd.read_csv(data2, on_bad_lines='skip', header=0, sep="\t"))
|
| 41 |
+
|
| 42 |
+
if sentences1.num_rows == 0 or sentences2.num_rows == 0:
|
| 43 |
+
logger.error("Empty input data found")
|
| 44 |
+
return None
|
| 45 |
+
|
| 46 |
+
# Generate embeddings
|
| 47 |
+
logger.info("Generating embeddings for first set...")
|
| 48 |
+
embeddings1 = model.encode(
|
| 49 |
+
sentences1["text"],
|
| 50 |
+
normalize_embeddings=True,
|
| 51 |
+
batch_size=1024,
|
| 52 |
+
show_progress_bar=True
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
logger.info("Generating embeddings for second set...")
|
| 56 |
+
embeddings2 = model.encode(
|
| 57 |
+
sentences2["text"],
|
| 58 |
+
normalize_embeddings=True,
|
| 59 |
+
batch_size=1024,
|
| 60 |
+
show_progress_bar=True
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
# Calculate similarity matrix
|
| 64 |
+
logger.info("Calculating similarity matrix...")
|
| 65 |
+
similarity_matrix = model.similarity(embeddings1, embeddings2)
|
| 66 |
+
|
| 67 |
+
# Process results
|
| 68 |
+
df_pd = pd.DataFrame(similarity_matrix)
|
| 69 |
+
dfi = df_pd.__dataframe__()
|
| 70 |
+
df = pl.from_dataframe(dfi)
|
| 71 |
+
|
| 72 |
+
# Transform matrix to long format
|
| 73 |
+
df_matrix_with_index = df.with_row_index(name="row_index").with_columns(
|
| 74 |
+
pl.col("row_index").cast(pl.UInt64)
|
| 75 |
+
)
|
| 76 |
+
df_long = df_matrix_with_index.unpivot(
|
| 77 |
+
index="row_index",
|
| 78 |
+
variable_name="column_index",
|
| 79 |
+
value_name="score"
|
| 80 |
+
).with_columns(pl.col("column_index").cast(pl.UInt64))
|
| 81 |
+
|
| 82 |
+
# Join with original text
|
| 83 |
+
df_sentences1 = pl.DataFrame(sentences1.to_pandas()).with_row_index(name="row_index").with_columns(
|
| 84 |
+
pl.col("row_index").cast(pl.UInt64)
|
| 85 |
+
)
|
| 86 |
+
df_sentences2 = pl.DataFrame(sentences2.to_pandas()).with_row_index(name="column_index").with_columns(
|
| 87 |
+
pl.col("column_index").cast(pl.UInt64)
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
# Process final results
|
| 91 |
+
df_long = (df_long
|
| 92 |
+
.with_columns([pl.col("score").round(4).cast(pl.Float32)])
|
| 93 |
+
.join(df_sentences1, on="row_index")
|
| 94 |
+
.join(df_sentences2, on="column_index"))
|
| 95 |
+
|
| 96 |
+
df_long = df_long.rename({
|
| 97 |
+
"text": "sentences1",
|
| 98 |
+
"text_right": "sentences2",
|
| 99 |
+
}).drop(["row_index", "column_index"])
|
| 100 |
+
|
| 101 |
+
# Filter and sort results
|
| 102 |
+
result_df = df_long.filter(pl.col("score") > score).sort(["score"], descending=True)
|
| 103 |
+
|
| 104 |
+
elapsed_time = time.time() - st
|
| 105 |
+
logger.info(f'Execution time: {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))}')
|
| 106 |
+
logger.info(f'Found {len(result_df)} pairs above score threshold {score}')
|
| 107 |
+
|
| 108 |
+
return result_df
|
| 109 |
+
|
| 110 |
+
except Exception as e:
|
| 111 |
+
logger.error(f"Error in STS process: {str(e)}")
|
| 112 |
+
return None
|
utils.py
CHANGED
|
@@ -4,22 +4,159 @@ import shutil
|
|
| 4 |
import pandas as pd
|
| 5 |
import polars as pl
|
| 6 |
import time
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
def
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
os.makedirs(folder_path, exist_ok=True)
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
dataframe.write_csv(csv_path, separator="\t")
|
|
|
|
|
|
|
| 18 |
return csv_path
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
def delete_folder_periodically(path, interval=3600):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
while True:
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
import polars as pl
|
| 6 |
import time
|
| 7 |
+
import logging
|
| 8 |
+
from typing import Optional, Tuple
|
| 9 |
|
| 10 |
+
# Configure logging
# NOTE: basicConfig is a no-op if the root logger already has handlers,
# so an application that imports this module keeps its own configuration.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by all helpers in this file.
logger = logging.getLogger(__name__)
|
| 16 |
|
| 17 |
+
def validate_csv_structure(df: pd.DataFrame) -> Tuple[bool, str]:
    """
    Check that *df* has the shape expected of an uploaded corpus CSV.

    A valid frame is non-empty, contains the ``_id`` and ``text`` columns,
    has no missing values in either column, and has unique ``_id`` values.

    Args:
        df: DataFrame to validate

    Returns:
        Tuple[bool, str]: (is_valid, error_message); the message is ""
        when validation succeeds.
    """
    # Guard clauses: bail out with a specific message at the first failure.
    if df.empty:
        return False, "CSV file is empty"

    absent = [name for name in ('_id', 'text') if name not in df.columns]
    if absent:
        return False, f"Missing required columns: {', '.join(absent)}"

    if df['_id'].isna().any():
        return False, "Found empty _id values"

    if df['text'].isna().any():
        return False, "Found empty text values"

    if df['_id'].duplicated().any():
        return False, "Found duplicate _id values"

    return True, ""
|
| 50 |
+
|
| 51 |
+
def getDataFrame(path: str) -> Optional[pl.DataFrame]:
    """
    Load a tab-separated CSV file and return it as a validated Polars DataFrame.

    The file is parsed with pandas, checked by ``validate_csv_structure``,
    stripped of blank ``text`` rows, and finally converted to Polars.

    Args:
        path: Path to the CSV file

    Returns:
        Optional[pl.DataFrame]: The validated DataFrame, or None when the
        file is missing, malformed, or fails validation.
    """
    try:
        # Parse as tab-separated text; malformed rows are warned about,
        # not fatal.
        frame = pd.read_csv(
            path,
            sep="\t",
            header=0,
            on_bad_lines='warn',
            encoding='utf-8'
        )

        ok, why = validate_csv_structure(frame)
        if not ok:
            logger.error(why)
            return None

        # Normalize the text column and drop rows that became empty
        # after stripping whitespace.
        frame['text'] = frame['text'].astype(str).str.strip()
        frame = frame[frame['text'].str.len() > 0]

        if frame.empty:
            logger.error("No valid text data found after cleaning")
            return None

        result = pl.from_pandas(frame)
        logger.info(f"Successfully loaded {len(result)} rows from CSV")
        return result

    except pd.errors.EmptyDataError:
        logger.error("CSV file is empty")
        return None
    except pd.errors.ParserError as e:
        logger.error(f"Error parsing CSV file: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error reading CSV: {str(e)}")
        return None
|
| 100 |
+
|
| 101 |
+
def save_to_csv(dataframe: pl.DataFrame) -> Optional[str]:
    """
    Save DataFrame to a tab-separated CSV file under the ``data`` directory.

    Args:
        dataframe: Polars DataFrame to save

    Returns:
        Optional[str]: Path to saved file, or None if there is nothing to
        save or the write fails.
    """
    try:
        if dataframe is None or dataframe.is_empty():
            logger.warning("No data to save")
            return None

        # Create data directory if it doesn't exist
        folder_path = "data"
        os.makedirs(folder_path, exist_ok=True)

        # Generate unique filename. Nanosecond resolution avoids the
        # collision (and silent overwrite) that the previous
        # int(time.time()) naming caused when two exports happened
        # within the same second.
        timestamp = time.time_ns()
        csv_path = f"{folder_path}/results_{timestamp}.csv"

        # Save to CSV with tab separator
        dataframe.write_csv(csv_path, separator="\t")
        logger.info(f"Results saved to {csv_path}")

        return csv_path

    except Exception as e:
        logger.error(f"Error saving results: {str(e)}")
        return None
|
| 133 |
|
| 134 |
+
def delete_folder_periodically(path: str, interval: int = 3600) -> None:
    """
    Run forever, removing stale result files from *path*.

    Every ``interval`` seconds each regular file in the folder is checked,
    and any file whose modification time is more than ``interval`` seconds
    old is removed. Errors are logged and the loop keeps running.

    Args:
        path: Path to folder to clean
        interval: Interval between cleanups in seconds
    """
    while True:
        try:
            if os.path.exists(path):
                now = time.time()

                for entry in os.listdir(path):
                    candidate = os.path.join(path, entry)
                    if not os.path.isfile(candidate):
                        continue

                    # Age is measured from the last modification time.
                    if now - os.path.getmtime(candidate) > interval:
                        os.remove(candidate)
                        logger.info(f"Deleted old file: {candidate}")

            time.sleep(interval)

        except Exception as e:
            logger.error(f"Error in cleanup task: {str(e)}")
            time.sleep(interval)
|