|
|
from pathlib import Path |
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import polars as pl |
|
|
from datasets import Dataset |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from sentence_transformers.util import paraphrase_mining |
|
|
import torch |
|
|
|
|
|
|
|
|
def upload_file(filepath):
    """Swap the upload control for a download control pointing at *filepath*.

    Args:
        filepath: Path of the uploaded file; only its final component is
            shown in the download button's label.

    Returns:
        A two-item list: a hidden ``gr.UploadButton`` followed by a visible
        ``gr.DownloadButton`` whose value is *filepath*.
    """
    download_label = f"Download {Path(filepath).name}"
    hidden_upload = gr.UploadButton(visible=False)
    visible_download = gr.DownloadButton(
        label=download_label,
        value=filepath,
        visible=True,
    )
    return [hidden_upload, visible_download]
|
|
|
|
|
|
|
|
# Checkpoint used to embed every uploaded sentence.
_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# A pair is reported only when its rounded score is strictly above this value.
_MIN_SCORE = 0.96
# Loaded models keyed by device, so repeated uploads reuse a single instance
# instead of re-initialising the model on every request.
_MODEL_CACHE = {}


def _load_model(device):
    """Return the embedding model for *device*, loading it on first use."""
    if device not in _MODEL_CACHE:
        # NOTE(review): the OpenVINO backend is paired with a possibly-CUDA
        # device exactly as in the original call; confirm this combination
        # is supported by the installed sentence-transformers version.
        _MODEL_CACHE[device] = SentenceTransformer(
            _MODEL_NAME,
            backend="openvino",
            device=device,
            trust_remote_code=True,
        )
    return _MODEL_CACHE[device]


def getData(path):
    """Mine near-duplicate sentence pairs from a single-column CSV file.

    The file is read without a header row (every line is data) into one
    ``text`` column; malformed lines are skipped. All sentences are compared
    with ``paraphrase_mining`` and only the pairs whose score, rounded to
    three decimals, is strictly greater than 0.96 are kept.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A two-item list ``[uploaded, pairs]`` of polars DataFrames:
        ``uploaded`` holds the parsed rows, and ``pairs`` has the columns
        ``score`` (Float32), ``sentence_1`` and ``sentence_2`` (the sentence
        texts), sorted by descending score. ``pairs`` is empty when the file
        contains fewer than two rows.
    """
    data = Dataset.from_pandas(
        pd.read_csv(path, on_bad_lines="skip", names=["text"])
    )
    uploaded = pl.from_arrow(data.data.table)
    row_count = len(data)

    # With fewer than two sentences there is nothing to pair up, and a
    # zero-sized corpus chunk would make the mining call fail outright.
    if row_count < 2:
        no_pairs = pl.DataFrame(
            schema={
                "score": pl.Float32,
                "sentence_1": pl.Utf8,
                "sentence_2": pl.Utf8,
            }
        )
        return [uploaded, no_pairs]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    paraphrases = paraphrase_mining(
        _load_model(device),
        data["text"],
        corpus_chunk_size=row_count,
        show_progress_bar=True,
        batch_size=1024,
        max_pairs=row_count ** 2,
    )

    # Each mined entry is [score, index_1, index_2]; name the columns up
    # front rather than relying on stringified positional column names.
    pairs = pl.DataFrame(
        paraphrases,
        schema={
            "score": pl.Float64,
            "sentence_1": pl.Int64,
            "sentence_2": pl.Int64,
        },
        orient="row",
    )

    # Round first, then filter, so the threshold applies to the displayed
    # (rounded, Float32) score exactly as before.
    pairs = (
        pairs.with_columns(pl.col("score").round(3).cast(pl.Float32))
        .filter(pl.col("score") > _MIN_SCORE)
        .sort("score", descending=True)
    )

    # Resolve row indices to sentence text only for the surviving pairs.
    texts = uploaded["text"]
    pairs = pairs.with_columns(
        texts[pairs["sentence_1"]].alias("sentence_1"),
        texts[pairs["sentence_2"]].alias("sentence_2"),
    )

    return [uploaded, pairs]
|
|
|
|
|
|
|
|
# Build the UI: an upload button stacked above two tables, one showing the
# parsed CSV rows and one showing the mined paraphrase pairs.
with gr.Blocks() as demo:
    with gr.Column():
        upload_button = gr.UploadButton(
            label="upload csv",
            file_types=[".csv"],
            file_count="single",
        )
        output_data = gr.Dataframe(
            headers=["text"],
            col_count=1,
            label="Uploaded Data",
        )
        output_paraphrases = gr.Dataframe(
            headers=["score", "sentence_1", "sentence_2"],
            type="polars",
            label="Paraphrase Mining Results",
        )

    # Run the mining step whenever a file is uploaded and route its two
    # return values to the two tables, in order.
    result_tables = [output_data, output_paraphrases]
    upload_button.upload(
        fn=getData,
        inputs=upload_button,
        outputs=result_tables,
    )


# Start the app only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()
|
|
|