Commit
·
3ce1088
1
Parent(s):
222cf81
Upgrade gradio
Browse files
README.md
CHANGED
|
@@ -4,9 +4,99 @@ emoji: 🏢
|
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.33.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# Sentence Transformers Demo
|
| 13 |
+
|
| 14 |
+
Interactive web application for semantic text similarity analysis using Sentence Transformers models.
|
| 15 |
+
|
| 16 |
+
## Features
|
| 17 |
+
|
| 18 |
+
### 1. Paraphrase Mining
|
| 19 |
+
- Find sentences with similar meaning in a text corpus
|
| 20 |
+
- Support for multiple language models
|
| 21 |
+
- Adjustable similarity threshold
|
| 22 |
+
- Export results in CSV format
|
| 23 |
+
|
| 24 |
+
### 2. Semantic Textual Similarity (STS)
|
| 25 |
+
- Calculate semantic similarity between two sets of sentences
|
| 26 |
+
- Uses advanced sentence transformation models
|
| 27 |
+
- Compare sentences in different languages
|
| 28 |
+
- Export results in CSV format
|
| 29 |
+
|
| 30 |
+
## Available Models
|
| 31 |
+
|
| 32 |
+
- [`Lajavaness/bilingual-embedding-large`](https://huggingface.co/Lajavaness/bilingual-embedding-large): Multilingual model optimized for multiple languages
|
| 33 |
+
- [`sentence-transformers/all-mpnet-base-v2`](https://huggingface.co/sentence-transformers/all-mpnet-base-v2): High-quality general-purpose model
|
| 34 |
+
- [`intfloat/multilingual-e5-large-instruct`](https://huggingface.co/intfloat/multilingual-e5-large-instruct): Multilingual model with instructions
|
| 35 |
+
|
| 36 |
+
## Requirements
|
| 37 |
+
|
| 38 |
+
- Python 3.8+
|
| 39 |
+
- Dependencies listed in `requirements.txt`
|
| 40 |
+
|
| 41 |
+
## Installation
|
| 42 |
+
|
| 43 |
+
1. Clone the repository:
|
| 44 |
+
```bash
|
| 45 |
+
git clone https://github.com/yourusername/sentence-transformers.git
|
| 46 |
+
cd sentence-transformers
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
2. Create and activate a virtual environment:
|
| 50 |
+
```bash
|
| 51 |
+
python -m venv venv
|
| 52 |
+
source venv/bin/activate # Linux/Mac
|
| 53 |
+
# or
|
| 54 |
+
.\venv\Scripts\activate # Windows
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
3. Install dependencies:
|
| 58 |
+
```bash
|
| 59 |
+
pip install -r requirements.txt
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
## Usage
|
| 63 |
+
|
| 64 |
+
1. Start the application:
|
| 65 |
+
```bash
|
| 66 |
+
python app.py
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
2. Open your browser at `http://localhost:7860`
|
| 70 |
+
|
| 71 |
+
3. Select the desired functionality:
|
| 72 |
+
- Paraphrase Mining: Upload a CSV file with sentences to analyze
|
| 73 |
+
- STS: Upload two CSV files with sentences to compare
|
| 74 |
+
|
| 75 |
+
4. Select the model and adjust the similarity threshold
|
| 76 |
+
|
| 77 |
+
5. Click "Process" to start the analysis
|
| 78 |
+
|
| 79 |
+
6. Download results in CSV format
|
| 80 |
+
|
| 81 |
+
## CSV File Format
|
| 82 |
+
|
| 83 |
+
CSV files must contain a column named "text" with the sentences to analyze:
|
| 84 |
+
|
| 85 |
+
```csv
|
| 86 |
+
text
|
| 87 |
+
"First sentence to analyze"
|
| 88 |
+
"Second sentence to analyze"
|
| 89 |
+
...
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
## Notes
|
| 93 |
+
|
| 94 |
+
- Temporary files are automatically cleaned up every 30 minutes
|
| 95 |
+
- Using complete sentences is recommended for better results
|
| 96 |
+
- Models may take time to load on first use
|
| 97 |
+
|
| 98 |
+
## License
|
| 99 |
+
|
| 100 |
+
MIT
|
| 101 |
+
|
| 102 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
|
@@ -4,112 +4,260 @@ import gradio as gr
|
|
| 4 |
from mining import mining
|
| 5 |
from sts import sts
|
| 6 |
from utils import getDataFrame, save_to_csv, delete_folder_periodically
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
"### Paraphrase mining is the task of finding paraphrases (texts with identical / similar meaning) in a large corpus of sentences")
|
| 15 |
-
with gr.Row():
|
| 16 |
-
with gr.Column():
|
| 17 |
-
gr.Markdown("#### sentences")
|
| 18 |
-
|
| 19 |
-
upload_button_sentences = gr.UploadButton(label="upload sentences csv", file_types=['.csv'],
|
| 20 |
-
file_count="single")
|
| 21 |
-
output_data_sentences = gr.Dataframe(headers=["text"], col_count=1, label="sentences data")
|
| 22 |
-
|
| 23 |
-
upload_button_sentences.upload(fn=getDataFrame, inputs=upload_button_sentences,
|
| 24 |
-
outputs=output_data_sentences, concurrency_limit=CONCURRENCY_LIMIT)
|
| 25 |
-
|
| 26 |
-
with gr.Row():
|
| 27 |
-
with gr.Column():
|
| 28 |
-
model = gr.Dropdown(
|
| 29 |
-
["Lajavaness/bilingual-embedding-large", "sentence-transformers/all-mpnet-base-v2",
|
| 30 |
-
"intfloat/multilingual-e5-large-instruct"], label="model", interactive=True)
|
| 31 |
-
score_mining = gr.Number(label="score", value=0.96, interactive=True)
|
| 32 |
-
submit_button_mining = gr.Button("Submit", variant="primary")
|
| 33 |
-
|
| 34 |
-
with gr.Row():
|
| 35 |
-
with gr.Column():
|
| 36 |
-
output_mining = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
|
| 37 |
-
label="Mining")
|
| 38 |
-
|
| 39 |
-
submit_button_mining.click(
|
| 40 |
-
fn=mining,
|
| 41 |
-
inputs=[model, upload_button_sentences, score_mining],
|
| 42 |
-
outputs=output_mining
|
| 43 |
-
)
|
| 44 |
-
|
| 45 |
-
download_button = gr.Button("Download Results as CSV", variant="huggingface")
|
| 46 |
-
download_file = gr.File(label="Downloadable File")
|
| 47 |
-
|
| 48 |
-
download_button.click(
|
| 49 |
-
fn=save_to_csv,
|
| 50 |
-
inputs=output_mining,
|
| 51 |
-
outputs=download_file
|
| 52 |
-
)
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
gr.Markdown("#### sentences 1")
|
| 61 |
-
upload_button_sentences1 = gr.UploadButton(label="upload sentences 1 csv", file_types=['.csv'],
|
| 62 |
-
file_count="single")
|
| 63 |
-
output_data_sentences1 = gr.Dataframe(headers=["text"], col_count=1, label="sentences 1 data")
|
| 64 |
-
|
| 65 |
-
upload_button_sentences1.upload(fn=getDataFrame, inputs=upload_button_sentences1,
|
| 66 |
-
outputs=output_data_sentences1, concurrency_limit=CONCURRENCY_LIMIT)
|
| 67 |
-
|
| 68 |
-
with gr.Column():
|
| 69 |
-
gr.Markdown("#### sentences 2")
|
| 70 |
-
upload_button_sentences2 = gr.UploadButton(label="upload sentences 2 csv", file_types=['.csv'],
|
| 71 |
-
file_count="single")
|
| 72 |
-
output_data_sentences2 = gr.Dataframe(headers=["text"], col_count=1, label="sentences 2 data")
|
| 73 |
-
|
| 74 |
-
upload_button_sentences2.upload(fn=getDataFrame, inputs=upload_button_sentences2,
|
| 75 |
-
outputs=output_data_sentences2, concurrency_limit=CONCURRENCY_LIMIT)
|
| 76 |
-
|
| 77 |
-
with gr.Row():
|
| 78 |
-
with gr.Column():
|
| 79 |
-
model = gr.Dropdown(
|
| 80 |
-
["Lajavaness/bilingual-embedding-large", "sentence-transformers/all-mpnet-base-v2",
|
| 81 |
-
"intfloat/multilingual-e5-large-instruct"], label="model", interactive=True)
|
| 82 |
-
score_sts = gr.Number(label="score", value=0.96, interactive=True)
|
| 83 |
-
submit_button_sts = gr.Button("Submit", variant="primary")
|
| 84 |
-
|
| 85 |
-
with gr.Row():
|
| 86 |
-
with gr.Column():
|
| 87 |
-
gr.Markdown("#### STS Results")
|
| 88 |
-
|
| 89 |
-
output_sts = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
|
| 90 |
-
label="Semantic Textual Similarit")
|
| 91 |
-
|
| 92 |
-
submit_button_sts.click(
|
| 93 |
-
fn=sts,
|
| 94 |
-
inputs=[model, upload_button_sentences1, upload_button_sentences2, score_sts],
|
| 95 |
-
outputs=output_sts
|
| 96 |
-
)
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
if __name__ == "__main__":
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
from mining import mining
|
| 5 |
from sts import sts
|
| 6 |
from utils import getDataFrame, save_to_csv, delete_folder_periodically
|
| 7 |
+
import logging
|
| 8 |
|
| 9 |
+
# Configure logging
|
| 10 |
+
logging.basicConfig(
|
| 11 |
+
level=logging.INFO,
|
| 12 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 13 |
+
)
|
| 14 |
+
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
+
CONCURRENCY_LIMIT = 5
|
| 17 |
+
AVAILABLE_MODELS = [
|
| 18 |
+
"Lajavaness/bilingual-embedding-large",
|
| 19 |
+
"sentence-transformers/all-mpnet-base-v2",
|
| 20 |
+
"intfloat/multilingual-e5-large-instruct"
|
| 21 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
+
MODEL_DESCRIPTIONS = {
|
| 24 |
+
"Lajavaness/bilingual-embedding-large": "Multilingual model optimized for multiple languages. [More info](https://huggingface.co/Lajavaness/bilingual-embedding-large)",
|
| 25 |
+
"sentence-transformers/all-mpnet-base-v2": "High-quality general-purpose model. [More info](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)",
|
| 26 |
+
"intfloat/multilingual-e5-large-instruct": "Multilingual model with instructions. [More info](https://huggingface.co/intfloat/multilingual-e5-large-instruct)"
|
| 27 |
+
}
|
| 28 |
|
| 29 |
+
def create_interface():
|
| 30 |
+
with gr.Blocks(title="Sentence Transformers Demo") as demo:
|
| 31 |
+
gr.Markdown("# Sentence Transformers Demo")
|
| 32 |
+
gr.Markdown("This application provides two main functionalities: Paraphrase Mining and Semantic Textual Similarity (STS).")
|
| 33 |
+
|
| 34 |
+
with gr.Tab("Paraphrase Mining"):
|
| 35 |
+
with gr.Row():
|
| 36 |
+
with gr.Column():
|
| 37 |
+
gr.Markdown(
|
| 38 |
+
"### Paraphrase Mining\n"
|
| 39 |
+
"Find paraphrases (texts with identical/similar meaning) in a large corpus of sentences.\n"
|
| 40 |
+
"Upload a CSV file containing your sentences and select a model to begin."
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
with gr.Row():
|
| 44 |
+
with gr.Column():
|
| 45 |
+
gr.Markdown("#### Input Sentences")
|
| 46 |
+
upload_button_sentences = gr.UploadButton(
|
| 47 |
+
label="Upload Sentences CSV",
|
| 48 |
+
file_types=['.csv'],
|
| 49 |
+
file_count="single",
|
| 50 |
+
variant="primary"
|
| 51 |
+
)
|
| 52 |
+
output_data_sentences = gr.Dataframe(
|
| 53 |
+
headers=["_id", "text"],
|
| 54 |
+
col_count=2,
|
| 55 |
+
label="Sentences Data",
|
| 56 |
+
interactive=False
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
upload_button_sentences.upload(
|
| 60 |
+
fn=getDataFrame,
|
| 61 |
+
inputs=upload_button_sentences,
|
| 62 |
+
outputs=output_data_sentences,
|
| 63 |
+
concurrency_limit=CONCURRENCY_LIMIT
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
with gr.Row():
|
| 67 |
+
with gr.Column():
|
| 68 |
+
model = gr.Dropdown(
|
| 69 |
+
choices=AVAILABLE_MODELS,
|
| 70 |
+
label="Select Model",
|
| 71 |
+
value=AVAILABLE_MODELS[0],
|
| 72 |
+
interactive=True
|
| 73 |
+
)
|
| 74 |
+
model_description = gr.Markdown(MODEL_DESCRIPTIONS[AVAILABLE_MODELS[0]])
|
| 75 |
+
|
| 76 |
+
def update_model_description(model_name):
|
| 77 |
+
return MODEL_DESCRIPTIONS[model_name]
|
| 78 |
+
|
| 79 |
+
model.change(
|
| 80 |
+
fn=update_model_description,
|
| 81 |
+
inputs=model,
|
| 82 |
+
outputs=model_description
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
score_mining = gr.Slider(
|
| 86 |
+
minimum=0.0,
|
| 87 |
+
maximum=1.0,
|
| 88 |
+
value=0.96,
|
| 89 |
+
step=0.01,
|
| 90 |
+
label="Similarity Threshold",
|
| 91 |
+
interactive=True
|
| 92 |
+
)
|
| 93 |
+
submit_button_mining = gr.Button("Process", variant="primary")
|
| 94 |
+
|
| 95 |
+
with gr.Row():
|
| 96 |
+
with gr.Column():
|
| 97 |
+
output_mining = gr.Dataframe(
|
| 98 |
+
headers=["score", "sentence_1", "sentence_2"],
|
| 99 |
+
type="polars",
|
| 100 |
+
label="Mining Results"
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
submit_button_mining.click(
|
| 104 |
+
fn=mining,
|
| 105 |
+
inputs=[model, upload_button_sentences, score_mining],
|
| 106 |
+
outputs=output_mining
|
| 107 |
+
).then(
|
| 108 |
+
fn=lambda x: gr.Info("Processing completed successfully!") if x is not None else gr.Error("Error processing data. Please check the logs for details."),
|
| 109 |
+
inputs=[output_mining],
|
| 110 |
+
outputs=[]
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
download_button = gr.Button("Download Results as CSV", variant="secondary")
|
| 114 |
+
download_file = gr.File(label="Downloadable File")
|
| 115 |
+
|
| 116 |
+
download_button.click(
|
| 117 |
+
fn=save_to_csv,
|
| 118 |
+
inputs=output_mining,
|
| 119 |
+
outputs=download_file
|
| 120 |
+
).then(
|
| 121 |
+
fn=lambda x: gr.Info("Results saved successfully!") if x is not None else gr.Error("Error saving results. Please check the logs for details."),
|
| 122 |
+
inputs=[download_file],
|
| 123 |
+
outputs=[]
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
with gr.Tab("Semantic Textual Similarity"):
|
| 127 |
+
with gr.Row():
|
| 128 |
+
with gr.Column():
|
| 129 |
+
gr.Markdown(
|
| 130 |
+
"### Semantic Textual Similarity (STS)\n"
|
| 131 |
+
"Calculate semantic similarity between two sets of sentences.\n"
|
| 132 |
+
"Upload two CSV files containing your sentences and select a model to begin."
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
with gr.Row():
|
| 136 |
+
with gr.Column():
|
| 137 |
+
gr.Markdown("#### First Set of Sentences")
|
| 138 |
+
upload_button_sentences1 = gr.UploadButton(
|
| 139 |
+
label="Upload First Set CSV",
|
| 140 |
+
file_types=['.csv'],
|
| 141 |
+
file_count="single",
|
| 142 |
+
variant="primary"
|
| 143 |
+
)
|
| 144 |
+
output_data_sentences1 = gr.Dataframe(
|
| 145 |
+
headers=["_id", "text"],
|
| 146 |
+
col_count=2,
|
| 147 |
+
label="First Set Data",
|
| 148 |
+
interactive=False
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
upload_button_sentences1.upload(
|
| 152 |
+
fn=getDataFrame,
|
| 153 |
+
inputs=upload_button_sentences1,
|
| 154 |
+
outputs=output_data_sentences1,
|
| 155 |
+
concurrency_limit=CONCURRENCY_LIMIT
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
with gr.Column():
|
| 159 |
+
gr.Markdown("#### Second Set of Sentences")
|
| 160 |
+
upload_button_sentences2 = gr.UploadButton(
|
| 161 |
+
label="Upload Second Set CSV",
|
| 162 |
+
file_types=['.csv'],
|
| 163 |
+
file_count="single",
|
| 164 |
+
variant="primary"
|
| 165 |
+
)
|
| 166 |
+
output_data_sentences2 = gr.Dataframe(
|
| 167 |
+
headers=["_id", "text"],
|
| 168 |
+
col_count=2,
|
| 169 |
+
label="Second Set Data",
|
| 170 |
+
interactive=False
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
upload_button_sentences2.upload(
|
| 174 |
+
fn=getDataFrame,
|
| 175 |
+
inputs=upload_button_sentences2,
|
| 176 |
+
outputs=output_data_sentences2,
|
| 177 |
+
concurrency_limit=CONCURRENCY_LIMIT
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
with gr.Row():
|
| 181 |
+
with gr.Column():
|
| 182 |
+
model = gr.Dropdown(
|
| 183 |
+
choices=AVAILABLE_MODELS,
|
| 184 |
+
label="Select Model",
|
| 185 |
+
value=AVAILABLE_MODELS[0],
|
| 186 |
+
interactive=True
|
| 187 |
+
)
|
| 188 |
+
model_description = gr.Markdown(MODEL_DESCRIPTIONS[AVAILABLE_MODELS[0]])
|
| 189 |
+
|
| 190 |
+
model.change(
|
| 191 |
+
fn=update_model_description,
|
| 192 |
+
inputs=model,
|
| 193 |
+
outputs=model_description
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
score_sts = gr.Slider(
|
| 197 |
+
minimum=0.0,
|
| 198 |
+
maximum=1.0,
|
| 199 |
+
value=0.96,
|
| 200 |
+
step=0.01,
|
| 201 |
+
label="Similarity Threshold",
|
| 202 |
+
interactive=True
|
| 203 |
+
)
|
| 204 |
+
submit_button_sts = gr.Button("Process", variant="primary")
|
| 205 |
+
|
| 206 |
+
with gr.Row():
|
| 207 |
+
with gr.Column():
|
| 208 |
+
output_sts = gr.Dataframe(
|
| 209 |
+
headers=["score", "sentences1", "sentences2"],
|
| 210 |
+
type="polars",
|
| 211 |
+
label="Similarity Results"
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
submit_button_sts.click(
|
| 215 |
+
fn=sts,
|
| 216 |
+
inputs=[model, upload_button_sentences1, upload_button_sentences2, score_sts],
|
| 217 |
+
outputs=output_sts
|
| 218 |
+
).then(
|
| 219 |
+
fn=lambda x: gr.Info("Processing completed successfully!") if x is not None else gr.Error("Error processing data. Please check the logs for details."),
|
| 220 |
+
inputs=[output_sts],
|
| 221 |
+
outputs=[]
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
download_button = gr.Button("Download Results as CSV", variant="secondary")
|
| 225 |
+
download_file = gr.File(label="Downloadable File")
|
| 226 |
+
|
| 227 |
+
download_button.click(
|
| 228 |
+
fn=save_to_csv,
|
| 229 |
+
inputs=output_sts,
|
| 230 |
+
outputs=download_file
|
| 231 |
+
).then(
|
| 232 |
+
fn=lambda x: gr.Info("Results saved successfully!") if x is not None else gr.Error("Error saving results. Please check the logs for details."),
|
| 233 |
+
inputs=[download_file],
|
| 234 |
+
outputs=[]
|
| 235 |
+
)
|
| 236 |
+
|
| 237 |
+
return demo
|
| 238 |
|
| 239 |
if __name__ == "__main__":
|
| 240 |
+
try:
|
| 241 |
+
multiprocessing.set_start_method("spawn")
|
| 242 |
+
|
| 243 |
+
# Start cleanup thread
|
| 244 |
+
folder_path = "data"
|
| 245 |
+
thread = threading.Thread(
|
| 246 |
+
target=delete_folder_periodically,
|
| 247 |
+
args=(folder_path, 1800),
|
| 248 |
+
daemon=True
|
| 249 |
+
)
|
| 250 |
+
thread.start()
|
| 251 |
|
| 252 |
+
# Create and launch interface
|
| 253 |
+
demo = create_interface()
|
| 254 |
+
demo.launch(
|
| 255 |
+
share=False,
|
| 256 |
+
server_name="0.0.0.0",
|
| 257 |
+
server_port=7860,
|
| 258 |
+
show_error=True,
|
| 259 |
+
show_api=False
|
| 260 |
+
)
|
| 261 |
+
except Exception as e:
|
| 262 |
+
logger.error(f"Error starting application: {str(e)}")
|
| 263 |
+
raise
|
mining.py
CHANGED
|
@@ -2,56 +2,89 @@ import time
|
|
| 2 |
import pandas as pd
|
| 3 |
import polars as pl
|
| 4 |
import torch
|
|
|
|
| 5 |
from datasets import Dataset
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
from sentence_transformers.util import paraphrase_mining
|
|
|
|
| 8 |
|
|
|
|
| 9 |
|
| 10 |
-
def mining(modelname, path, score):
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import polars as pl
|
| 4 |
import torch
|
| 5 |
+
import logging
|
| 6 |
from datasets import Dataset
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
from sentence_transformers.util import paraphrase_mining
|
| 9 |
+
from typing import Optional
|
| 10 |
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
|
| 13 |
+
def mining(modelname: str, path: str, score: float) -> Optional[pl.DataFrame]:
|
| 14 |
+
"""
|
| 15 |
+
Perform paraphrase mining on the input data.
|
| 16 |
+
|
| 17 |
+
Args:
|
| 18 |
+
modelname: Name of the model to use
|
| 19 |
+
path: Path to the input CSV file
|
| 20 |
+
score: Minimum similarity score threshold
|
| 21 |
+
|
| 22 |
+
Returns:
|
| 23 |
+
Optional[pl.DataFrame]: DataFrame with mining results or None if error occurs
|
| 24 |
+
"""
|
| 25 |
+
try:
|
| 26 |
+
st = time.time()
|
| 27 |
+
|
| 28 |
+
# Read and validate input data
|
| 29 |
+
data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t"))
|
| 30 |
+
original_df = pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t")
|
| 31 |
+
|
| 32 |
+
if data.num_rows == 0:
|
| 33 |
+
logger.error("No data found in input file")
|
| 34 |
+
return None
|
| 35 |
+
|
| 36 |
+
# Initialize model
|
| 37 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 38 |
+
logger.info(f"Using device: {device}")
|
| 39 |
+
|
| 40 |
+
model = SentenceTransformer(
|
| 41 |
+
modelname,
|
| 42 |
+
device=device,
|
| 43 |
+
trust_remote_code=True,
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
# Perform paraphrase mining
|
| 47 |
+
logger.info("Starting paraphrase mining...")
|
| 48 |
+
paraphrases = paraphrase_mining(
|
| 49 |
+
model,
|
| 50 |
+
data["text"],
|
| 51 |
+
corpus_chunk_size=len(data),
|
| 52 |
+
show_progress_bar=True,
|
| 53 |
+
batch_size=1024,
|
| 54 |
+
max_pairs=len(data) ** 2,
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
# Process results
|
| 58 |
+
df_pd = pd.DataFrame(paraphrases)
|
| 59 |
+
df = pl.from_pandas(df_pd)
|
| 60 |
+
df = df.rename({"0": "score", "1": "sentence_1", "2": "sentence_2"})
|
| 61 |
+
|
| 62 |
+
union_df = pl.DataFrame(data.to_pandas())
|
| 63 |
+
original_columns = original_df.columns.tolist()
|
| 64 |
+
|
| 65 |
+
# Add additional columns if present
|
| 66 |
+
additional_cols = []
|
| 67 |
+
for col in original_columns:
|
| 68 |
+
if col != "text":
|
| 69 |
+
additional_cols.extend([
|
| 70 |
+
union_df.select(pl.col(col)).to_series()[df["sentence_1"].cast(pl.Int32)].alias(f"{col}_1"),
|
| 71 |
+
union_df.select(pl.col(col)).to_series()[df["sentence_2"].cast(pl.Int32)].alias(f"{col}_2")
|
| 72 |
+
])
|
| 73 |
+
|
| 74 |
+
# Process final results
|
| 75 |
+
df = df.with_columns([
|
| 76 |
+
pl.col("score").round(3).cast(pl.Float32),
|
| 77 |
+
union_df.select(pl.col("text")).to_series()[df["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
|
| 78 |
+
union_df.select(pl.col("text")).to_series()[df["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
|
| 79 |
+
*additional_cols
|
| 80 |
+
]).filter(pl.col("score") > score).sort(["score"], descending=True)
|
| 81 |
+
|
| 82 |
+
elapsed_time = time.time() - st
|
| 83 |
+
logger.info(f'Execution time: {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))}')
|
| 84 |
+
logger.info(f'Found {len(df)} paraphrases above score threshold {score}')
|
| 85 |
+
|
| 86 |
+
return df
|
| 87 |
+
|
| 88 |
+
except Exception as e:
|
| 89 |
+
logger.error(f"Error in mining process: {str(e)}")
|
| 90 |
+
return None
|
requirements.txt
CHANGED
|
@@ -1,8 +1,10 @@
|
|
| 1 |
-
transformers
|
| 2 |
-
torch
|
| 3 |
-
pandas
|
| 4 |
-
polars
|
| 5 |
-
datasets
|
| 6 |
-
sentence-transformers[openvino,onnx-gpu,onnx]
|
| 7 |
-
multiprocess
|
| 8 |
-
gradio
|
|
|
|
|
|
|
|
|
| 1 |
+
transformers>=4.36.0
|
| 2 |
+
torch>=2.1.0
|
| 3 |
+
pandas>=2.1.0
|
| 4 |
+
polars>=0.20.0
|
| 5 |
+
datasets>=2.14.0
|
| 6 |
+
sentence-transformers[openvino,onnx-gpu,onnx]>=2.2.0
|
| 7 |
+
multiprocess>=0.70.15
|
| 8 |
+
gradio>=4.12.0
|
| 9 |
+
numpy>=1.24.0
|
| 10 |
+
tqdm>=4.66.0
|
sts.py
CHANGED
|
@@ -2,54 +2,111 @@ import time
|
|
| 2 |
import pandas as pd
|
| 3 |
import polars as pl
|
| 4 |
import torch
|
|
|
|
| 5 |
from datasets import Dataset
|
| 6 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 7 |
|
|
|
|
| 8 |
|
| 9 |
-
def sts(modelname, data1, data2, score):
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
modelname
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import polars as pl
|
| 4 |
import torch
|
| 5 |
+
import logging
|
| 6 |
from datasets import Dataset
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
+
from typing import Optional
|
| 9 |
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
|
| 12 |
+
def sts(modelname: str, data1: str, data2: str, score: float) -> Optional[pl.DataFrame]:
|
| 13 |
+
"""
|
| 14 |
+
Calculate semantic textual similarity between two sets of sentences.
|
| 15 |
+
|
| 16 |
+
Args:
|
| 17 |
+
modelname: Name of the model to use
|
| 18 |
+
data1: Path to first input CSV file
|
| 19 |
+
data2: Path to second input CSV file
|
| 20 |
+
score: Minimum similarity score threshold
|
| 21 |
+
|
| 22 |
+
Returns:
|
| 23 |
+
Optional[pl.DataFrame]: DataFrame with similarity results or None if error occurs
|
| 24 |
+
"""
|
| 25 |
+
try:
|
| 26 |
+
st = time.time()
|
| 27 |
+
|
| 28 |
+
# Initialize model
|
| 29 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 30 |
+
logger.info(f"Using device: {device}")
|
| 31 |
+
|
| 32 |
+
model = SentenceTransformer(
|
| 33 |
+
modelname,
|
| 34 |
+
device=device,
|
| 35 |
+
trust_remote_code=True,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
# Read and validate input data
|
| 39 |
+
sentences1 = Dataset.from_pandas(pd.read_csv(data1, on_bad_lines='skip', header=0, sep="\t"))
|
| 40 |
+
sentences2 = Dataset.from_pandas(pd.read_csv(data2, on_bad_lines='skip', header=0, sep="\t"))
|
| 41 |
+
|
| 42 |
+
if sentences1.num_rows == 0 or sentences2.num_rows == 0:
|
| 43 |
+
logger.error("Empty input data found")
|
| 44 |
+
return None
|
| 45 |
+
|
| 46 |
+
# Generate embeddings
|
| 47 |
+
logger.info("Generating embeddings for first set...")
|
| 48 |
+
embeddings1 = model.encode(
|
| 49 |
+
sentences1["text"],
|
| 50 |
+
normalize_embeddings=True,
|
| 51 |
+
batch_size=1024,
|
| 52 |
+
show_progress_bar=True
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
logger.info("Generating embeddings for second set...")
|
| 56 |
+
embeddings2 = model.encode(
|
| 57 |
+
sentences2["text"],
|
| 58 |
+
normalize_embeddings=True,
|
| 59 |
+
batch_size=1024,
|
| 60 |
+
show_progress_bar=True
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
# Calculate similarity matrix
|
| 64 |
+
logger.info("Calculating similarity matrix...")
|
| 65 |
+
similarity_matrix = model.similarity(embeddings1, embeddings2)
|
| 66 |
+
|
| 67 |
+
# Process results
|
| 68 |
+
df_pd = pd.DataFrame(similarity_matrix)
|
| 69 |
+
dfi = df_pd.__dataframe__()
|
| 70 |
+
df = pl.from_dataframe(dfi)
|
| 71 |
+
|
| 72 |
+
# Transform matrix to long format
|
| 73 |
+
df_matrix_with_index = df.with_row_index(name="row_index").with_columns(
|
| 74 |
+
pl.col("row_index").cast(pl.UInt64)
|
| 75 |
+
)
|
| 76 |
+
df_long = df_matrix_with_index.unpivot(
|
| 77 |
+
index="row_index",
|
| 78 |
+
variable_name="column_index",
|
| 79 |
+
value_name="score"
|
| 80 |
+
).with_columns(pl.col("column_index").cast(pl.UInt64))
|
| 81 |
+
|
| 82 |
+
# Join with original text
|
| 83 |
+
df_sentences1 = pl.DataFrame(sentences1.to_pandas()).with_row_index(name="row_index").with_columns(
|
| 84 |
+
pl.col("row_index").cast(pl.UInt64)
|
| 85 |
+
)
|
| 86 |
+
df_sentences2 = pl.DataFrame(sentences2.to_pandas()).with_row_index(name="column_index").with_columns(
|
| 87 |
+
pl.col("column_index").cast(pl.UInt64)
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
# Process final results
|
| 91 |
+
df_long = (df_long
|
| 92 |
+
.with_columns([pl.col("score").round(4).cast(pl.Float32)])
|
| 93 |
+
.join(df_sentences1, on="row_index")
|
| 94 |
+
.join(df_sentences2, on="column_index"))
|
| 95 |
+
|
| 96 |
+
df_long = df_long.rename({
|
| 97 |
+
"text": "sentences1",
|
| 98 |
+
"text_right": "sentences2",
|
| 99 |
+
}).drop(["row_index", "column_index"])
|
| 100 |
+
|
| 101 |
+
# Filter and sort results
|
| 102 |
+
result_df = df_long.filter(pl.col("score") > score).sort(["score"], descending=True)
|
| 103 |
+
|
| 104 |
+
elapsed_time = time.time() - st
|
| 105 |
+
logger.info(f'Execution time: {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))}')
|
| 106 |
+
logger.info(f'Found {len(result_df)} pairs above score threshold {score}')
|
| 107 |
+
|
| 108 |
+
return result_df
|
| 109 |
+
|
| 110 |
+
except Exception as e:
|
| 111 |
+
logger.error(f"Error in STS process: {str(e)}")
|
| 112 |
+
return None
|
utils.py
CHANGED
|
@@ -4,22 +4,159 @@ import shutil
|
|
| 4 |
import pandas as pd
|
| 5 |
import polars as pl
|
| 6 |
import time
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
def
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
os.makedirs(folder_path, exist_ok=True)
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
dataframe.write_csv(csv_path, separator="\t")
|
|
|
|
|
|
|
| 18 |
return csv_path
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
def delete_folder_periodically(path, interval=3600):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
while True:
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
import polars as pl
|
| 6 |
import time
|
| 7 |
+
import logging
|
| 8 |
+
from typing import Optional, Tuple
|
| 9 |
|
| 10 |
+
# Configure logging
# NOTE: basicConfig is a no-op if the root logger already has handlers,
# so an application that imports this module keeps its own configuration.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by all helpers in this file.
logger = logging.getLogger(__name__)
|
| 16 |
|
| 17 |
+
def validate_csv_structure(df: pd.DataFrame) -> Tuple[bool, str]:
    """
    Check that *df* has the shape expected of an uploaded corpus CSV.

    A valid frame is non-empty, contains the ``_id`` and ``text`` columns,
    has no missing values in either column, and has unique ``_id`` values.

    Args:
        df: DataFrame to validate

    Returns:
        Tuple[bool, str]: (is_valid, error_message); the message is ""
        when validation succeeds.
    """
    # Guard clauses: bail out with a specific message at the first failure.
    if df.empty:
        return False, "CSV file is empty"

    absent = [name for name in ('_id', 'text') if name not in df.columns]
    if absent:
        return False, f"Missing required columns: {', '.join(absent)}"

    if df['_id'].isna().any():
        return False, "Found empty _id values"

    if df['text'].isna().any():
        return False, "Found empty text values"

    if df['_id'].duplicated().any():
        return False, "Found duplicate _id values"

    return True, ""
|
| 50 |
+
|
| 51 |
+
def getDataFrame(path: str) -> Optional[pl.DataFrame]:
    """
    Load a tab-separated CSV file and return it as a validated Polars DataFrame.

    The file is parsed with pandas, checked by ``validate_csv_structure``,
    stripped of blank ``text`` rows, and finally converted to Polars.

    Args:
        path: Path to the CSV file

    Returns:
        Optional[pl.DataFrame]: The validated DataFrame, or None when the
        file is missing, malformed, or fails validation.
    """
    try:
        # Parse as tab-separated text; malformed rows are warned about,
        # not fatal.
        frame = pd.read_csv(
            path,
            sep="\t",
            header=0,
            on_bad_lines='warn',
            encoding='utf-8'
        )

        ok, why = validate_csv_structure(frame)
        if not ok:
            logger.error(why)
            return None

        # Normalize the text column and drop rows that became empty
        # after stripping whitespace.
        frame['text'] = frame['text'].astype(str).str.strip()
        frame = frame[frame['text'].str.len() > 0]

        if frame.empty:
            logger.error("No valid text data found after cleaning")
            return None

        result = pl.from_pandas(frame)
        logger.info(f"Successfully loaded {len(result)} rows from CSV")
        return result

    except pd.errors.EmptyDataError:
        logger.error("CSV file is empty")
        return None
    except pd.errors.ParserError as e:
        logger.error(f"Error parsing CSV file: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error reading CSV: {str(e)}")
        return None
|
| 100 |
+
|
| 101 |
+
def save_to_csv(dataframe: pl.DataFrame) -> Optional[str]:
    """
    Save DataFrame to a tab-separated CSV file under the ``data`` directory.

    Args:
        dataframe: Polars DataFrame to save

    Returns:
        Optional[str]: Path to saved file, or None if there is nothing to
        save or the write fails.
    """
    try:
        if dataframe is None or dataframe.is_empty():
            logger.warning("No data to save")
            return None

        # Create data directory if it doesn't exist
        folder_path = "data"
        os.makedirs(folder_path, exist_ok=True)

        # Generate unique filename. Nanosecond resolution avoids the
        # collision (and silent overwrite) that the previous
        # int(time.time()) naming caused when two exports happened
        # within the same second.
        timestamp = time.time_ns()
        csv_path = f"{folder_path}/results_{timestamp}.csv"

        # Save to CSV with tab separator
        dataframe.write_csv(csv_path, separator="\t")
        logger.info(f"Results saved to {csv_path}")

        return csv_path

    except Exception as e:
        logger.error(f"Error saving results: {str(e)}")
        return None
|
| 133 |
|
| 134 |
+
def delete_folder_periodically(path: str, interval: int = 3600) -> None:
    """
    Run forever, removing stale result files from *path*.

    Every ``interval`` seconds each regular file in the folder is checked,
    and any file whose modification time is more than ``interval`` seconds
    old is removed. Errors are logged and the loop keeps running.

    Args:
        path: Path to folder to clean
        interval: Interval between cleanups in seconds
    """
    while True:
        try:
            if os.path.exists(path):
                now = time.time()

                for entry in os.listdir(path):
                    candidate = os.path.join(path, entry)
                    if not os.path.isfile(candidate):
                        continue

                    # Age is measured from the last modification time.
                    if now - os.path.getmtime(candidate) > interval:
                        os.remove(candidate)
                        logger.info(f"Deleted old file: {candidate}")

            time.sleep(interval)

        except Exception as e:
            logger.error(f"Error in cleanup task: {str(e)}")
            time.sleep(interval)
|