File size: 2,273 Bytes
fb3abe1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from functools import lru_cache
from pathlib import Path

import gradio as gr
import pandas as pd
import polars as pl
import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import paraphrase_mining


def upload_file(filepath):
    """Build the component updates that replace the upload button with a download button.

    Args:
        filepath: Path of the uploaded file; only its final component is
            shown in the download button's label.

    Returns:
        A two-element list: an ``UploadButton`` with ``visible=False`` and a
        visible ``DownloadButton`` whose value is ``filepath``.
    """
    file_name = Path(filepath).name
    hidden_upload = gr.UploadButton(visible=False)
    download = gr.DownloadButton(
        label=f"Download {file_name}",
        value=filepath,
        visible=True,
    )
    return [hidden_upload, download]


@lru_cache(maxsize=None)
def _load_model(device):
    """Build the sentence-embedding model once per device and reuse it on later calls."""
    # NOTE(review): the arguments are kept exactly as they were; confirm that
    # trust_remote_code=True is really needed for this checkpoint and that the
    # "openvino" backend accepts device="cuda".
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2",
                               backend="openvino",
                               device=device,
                               trust_remote_code=True)


def _empty_pairs():
    """Return a zero-row results frame with the same columns as a populated one."""
    return pl.DataFrame(
        schema={"score": pl.Float32, "sentence_1": pl.Utf8, "sentence_2": pl.Utf8}
    )


def getData(path, *, min_score=0.96):
    """Read a CSV of sentences and mine it for near-duplicate pairs.

    The file is read without a header row into a single column named
    ``text``; lines pandas cannot parse are skipped.

    Args:
        path: Location of the CSV file to read.
        min_score: Pairs are kept only when their score, rounded to three
            decimals, is strictly greater than this value.

    Returns:
        A two-element list ``[uploaded, pairs]`` of polars DataFrames:
        ``uploaded`` is the data as read from the file, and ``pairs`` has the
        columns ``score``, ``sentence_1`` and ``sentence_2``, sorted by
        ``score`` in descending order. ``pairs`` has zero rows when the file
        holds fewer than two rows or when mining yields no pairs.
    """
    data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', names=["text"]))
    uploaded = pl.from_arrow(data.data.table)

    # A pair needs two rows. Returning early also avoids passing
    # corpus_chunk_size=0 to paraphrase_mining and skips loading the model.
    if len(data) < 2:
        return [uploaded, _empty_pairs()]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = _load_model(device)

    paraphrases = paraphrase_mining(
        model,
        data["text"],
        corpus_chunk_size=len(data),
        show_progress_bar=True,
        batch_size=1024,
        max_pairs=len(data) ** 2
    )
    if not paraphrases:
        return [uploaded, _empty_pairs()]

    # Each mined entry is a (score, index, index) triple; name the columns up
    # front instead of renaming stringified positional labels afterwards.
    pairs = pl.from_pandas(
        pd.DataFrame(paraphrases, columns=["score", "sentence_1", "sentence_2"])
    )

    # Filter before looking up the text so only surviving pairs are gathered.
    pairs = (
        pairs
        .with_columns(pl.col("score").round(3).cast(pl.Float32))
        .filter(pl.col("score") > min_score)
        .sort(["score"], descending=True)
    )

    # Replace the row indices with the sentences they point at.
    texts = uploaded.get_column("text")
    pairs = pairs.with_columns([
        texts[pairs["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
        texts[pairs["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
    ])

    return [uploaded, pairs]


# Page layout: one column holding the upload button and the two result tables.
with gr.Blocks() as demo:
    with gr.Column():
        upload_button = gr.UploadButton(
            label="upload csv",
            file_types=[".csv"],
            file_count="single",
        )
        output_data = gr.Dataframe(
            headers=["text"],
            col_count=1,
            label="Uploaded Data",
        )
        output_paraphrases = gr.Dataframe(
            headers=["score", "sentence_1", "sentence_2"],
            type="polars",
            label="Paraphrase Mining Results",
        )

        # An upload runs getData on the file and fills both tables with its results.
        upload_button.upload(
            fn=getData,
            inputs=upload_button,
            outputs=[output_data, output_paraphrases],
        )

# Start the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()