# albertmartinez's picture
# initial commit
# fb3abe1
# raw
# history blame
# 2.27 kB
from pathlib import Path
import gradio as gr
import pandas as pd
import polars as pl
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import paraphrase_mining
import torch
def upload_file(filepath):
    """Build the pair of button components shown after a file is uploaded.

    Args:
        filepath: Path of the uploaded file.

    Returns:
        A two-item list: an ``UploadButton`` with ``visible=False`` and a
        ``DownloadButton`` that is visible, points at ``filepath`` and is
        labelled with the file's base name.
    """
    filename = Path(filepath).name
    hidden_upload = gr.UploadButton(visible=False)
    download = gr.DownloadButton(
        label=f"Download {filename}",
        value=filepath,
        visible=True,
    )
    return [hidden_upload, download]
# SentenceTransformer instances keyed by device string, so the model is built
# once per process instead of once per uploaded file.
_MODEL_CACHE = {}


def _get_model(device):
    """Return the embedding model for *device*, loading it on first use."""
    if device not in _MODEL_CACHE:
        # NOTE(review): backend="openvino" is combined with a possible "cuda"
        # device, and trust_remote_code=True is enabled for a stock model --
        # both kept exactly as in the original; confirm they are intended.
        _MODEL_CACHE[device] = SentenceTransformer(
            "sentence-transformers/all-MiniLM-L6-v2",
            backend="openvino",
            device=device,
            trust_remote_code=True,
        )
    return _MODEL_CACHE[device]


def getData(path):
    """Mine near-duplicate sentence pairs from a one-column CSV file.

    The CSV is read with a single column named ``text`` (no header row is
    assumed; malformed lines are skipped). Every sentence is compared with
    the others via ``paraphrase_mining`` and only pairs whose rounded score
    is above 0.96 are kept, best match first.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A two-item list ``[uploaded, pairs]`` of polars DataFrames:
        ``uploaded`` is the parsed input table and ``pairs`` has the columns
        ``score`` (Float32, rounded to 3 decimals), ``sentence_1`` and
        ``sentence_2`` (the paired sentence texts). ``pairs`` is empty, but
        keeps that schema, when no pair can be formed.
    """
    data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', names=["text"]))
    uploaded = pl.from_arrow(data.data.table)

    paraphrases = []
    # Fewer than two sentences cannot form a pair; skipping the call also
    # avoids passing corpus_chunk_size=0 and loading the model for nothing.
    if len(data) >= 2:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        paraphrases = paraphrase_mining(
            _get_model(device),
            data["text"],
            corpus_chunk_size=len(data),
            show_progress_bar=True,
            batch_size=1024,
            max_pairs=len(data) ** 2,
        )

    if not paraphrases:
        # An empty result has no columns to rename or index, so return a
        # correctly-typed empty frame instead of failing further down.
        no_pairs = pl.DataFrame(
            schema={
                "score": pl.Float32,
                "sentence_1": pl.Utf8,
                "sentence_2": pl.Utf8,
            }
        )
        return [uploaded, no_pairs]

    # Each mined entry is [score, index_1, index_2]; name the columns up front
    # rather than relying on integer column labels being stringified later.
    pairs = pl.from_pandas(
        pd.DataFrame(paraphrases, columns=["score", "sentence_1", "sentence_2"])
    )

    # Lookup series used to turn the row indices back into sentence text.
    texts = pl.DataFrame(data.to_pandas())["text"]

    df = (
        pairs.with_columns([
            pl.col("score").round(3).cast(pl.Float32),
            texts[pairs["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
            texts[pairs["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
        ])
        .filter(pl.col("score") > 0.96)
        .sort(["score"], descending=True)
    )
    return [uploaded, df]
# UI definition: one column holding the CSV upload button, a table echoing the
# uploaded rows, and a table listing the mined paraphrase pairs.
with gr.Blocks() as demo:
    with gr.Column():
        upload_button = gr.UploadButton(
            label="upload csv",
            file_types=[".csv"],
            file_count="single",
        )
        output_data = gr.Dataframe(
            headers=["text"],
            col_count=1,
            label="Uploaded Data",
        )
        output_paraphrases = gr.Dataframe(
            headers=["score", "sentence_1", "sentence_2"],
            type="polars",
            label="Paraphrase Mining Results",
        )

    # Completing an upload runs getData on the file and fills both tables.
    upload_button.upload(
        fn=getData,
        inputs=upload_button,
        outputs=[output_data, output_paraphrases],
    )

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()