File size: 2,273 Bytes
fb3abe1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
from pathlib import Path
import gradio as gr
import pandas as pd
import polars as pl
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import paraphrase_mining
import torch
def upload_file(filepath):
    """Swap the upload button for a download button pointing at *filepath*.

    Returns a two-element list of component updates: a hidden
    ``gr.UploadButton`` and a visible ``gr.DownloadButton`` whose label
    contains the file's base name and whose value is the given path.
    """
    download_label = f"Download {Path(filepath).name}"
    hidden_upload = gr.UploadButton(visible=False)
    visible_download = gr.DownloadButton(
        label=download_label,
        value=filepath,
        visible=True,
    )
    return [hidden_upload, visible_download]
# Embedding model shared by all uploads; created lazily by _get_model().
_MODEL = None


def _get_model():
    """Return the shared SentenceTransformer, loading it on first use."""
    global _MODEL
    if _MODEL is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # NOTE(review): trust_remote_code=True lets the model repository run
        # its own code at load time -- confirm this checkpoint needs it.
        # NOTE(review): confirm the openvino backend accepts device="cuda".
        _MODEL = SentenceTransformer(
            "sentence-transformers/all-MiniLM-L6-v2",
            backend="openvino",
            device=device,
            trust_remote_code=True,
        )
    return _MODEL


def getData(path):
    """Mine near-duplicate sentences from a one-column CSV file.

    The file at *path* is read with a single column named ``text``
    (malformed lines are skipped). Every pair of rows is scored with
    ``paraphrase_mining``; pairs whose rounded score exceeds 0.96 are kept
    and sorted by descending score.

    Args:
        path: Location of the uploaded CSV file.

    Returns:
        A two-element list ``[uploaded, results]`` of polars DataFrames:
        the uploaded rows, and the matching pairs with columns ``score``
        (Float32), ``sentence_1`` and ``sentence_2`` (the sentence texts).
    """
    data = Dataset.from_pandas(
        pd.read_csv(path, on_bad_lines="skip", names=["text"])
    )
    uploaded = pl.from_arrow(data.data.table)
    n_rows = len(data)

    # With fewer than two rows there is nothing to compare; skip the model
    # entirely (a zero chunk size / empty pair list would otherwise fail).
    if n_rows < 2:
        no_pairs = pl.DataFrame(
            schema={
                "score": pl.Float32,
                "sentence_1": pl.Utf8,
                "sentence_2": pl.Utf8,
            }
        )
        return [uploaded, no_pairs]

    paraphrases = paraphrase_mining(
        _get_model(),
        data["text"],
        corpus_chunk_size=n_rows,  # compare against the whole corpus at once
        show_progress_bar=True,
        batch_size=1024,
        max_pairs=n_rows ** 2,  # upper bound: never truncate the pair list
    )

    # Each mined entry is [score, index_1, index_2]; naming the columns here
    # keeps the frame well-formed even if no pairs were produced.
    pairs = pl.DataFrame(
        paraphrases,
        schema={
            "score": pl.Float64,
            "sentence_1": pl.Int64,
            "sentence_2": pl.Int64,
        },
        orient="row",
    )

    # Replace the row indices with the sentences they refer to.
    texts = uploaded.get_column("text")
    first_idx = pairs["sentence_1"].cast(pl.Int32)
    second_idx = pairs["sentence_2"].cast(pl.Int32)
    results = (
        pairs.with_columns([
            pl.col("score").round(3).cast(pl.Float32),
            texts[first_idx].alias("sentence_1"),
            texts[second_idx].alias("sentence_2"),
        ])
        .filter(pl.col("score") > 0.96)
        .sort(["score"], descending=True)
    )
    return [uploaded, results]
# UI: one column holding the upload button, a table of the uploaded rows,
# and a table of the mined paraphrase pairs.
with gr.Blocks() as demo:
    with gr.Column():
        upload_button = gr.UploadButton(
            label="upload csv",
            file_types=[".csv"],
            file_count="single",
        )
        output_data = gr.Dataframe(
            headers=["text"],
            col_count=1,
            label="Uploaded Data",
        )
        output_paraphrases = gr.Dataframe(
            headers=["score", "sentence_1", "sentence_2"],
            type="polars",
            label="Paraphrase Mining Results",
        )

    # Uploading a file runs getData and fills both tables with its results.
    result_tables = [output_data, output_paraphrases]
    upload_button.upload(
        fn=getData,
        inputs=upload_button,
        outputs=result_tables,
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|