# Source: PoemForSmallFThings / NAIA / Beta / parquet_token_update_tool.py
# Uploaded by baqu2213 ("Upload 2 files", commit ad7e448, verified), 3.18 kB.
import os
import pandas as pd
from transformers import CLIPTokenizer
import tkinter as tk
from tkinter import filedialog, scrolledtext
import threading
# Initialize the CLIPTokenizer once at module load; rcs() uses it to encode text.
s_token = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
def rcs(text):
    """Return the number of token ids the CLIP tokenizer produces for *text*.

    Missing values yield ``None`` instead of being handed to the tokenizer:
    ``None``, ``pandas.NA`` and float NaN are all treated as "no text", since
    any of them can appear as the null marker in a DataFrame column.

    Args:
        text: The text to tokenize, or a missing-value marker.

    Returns:
        ``len(s_token.encode(text))``, or ``None`` when *text* is missing.
    """
    if text is None or text is pd.NA:
        return None
    # NaN is the only float that compares unequal to itself.
    if isinstance(text, float) and text != text:
        return None
    return len(s_token.encode(text))
def process_files(dflist, text_box, select_button, process_button):
    """Add a ``tokens`` column to each parquet file and save it to ``processed/``.

    Each path in *dflist* is read with pandas, ``rcs`` is applied to every
    value of its ``general`` column, the results are stored in a new
    ``tokens`` column, and the frame is written to
    ``processed/<basename of the input file>``. Progress lines are appended
    to *text_box* every 100 rows and on the last row of each file.

    Args:
        dflist: List of parquet file paths. It is emptied in place when the
            run ends so the next run starts from a fresh selection.
        text_box: Text widget receiving progress messages via
            ``insert``/``see``.
        select_button: Button re-enabled when the run ends.
        process_button: Button re-enabled when the run ends.

    Raises:
        Exception: Any error from reading, tokenizing or writing is reported
            in *text_box* and then re-raised.
    """
    # NOTE(review): start_processing runs this on a worker thread while it
    # updates Tk widgets directly - confirm this is safe on the target Tk build.
    output_dir = "processed"
    try:
        # Create the output folder for the processed files.
        os.makedirs(output_dir, exist_ok=True)

        for path in dflist:
            # Read the parquet file.
            df = pd.read_parquet(path, engine="pyarrow")

            # Build the values for the new 'tokens' column.
            tokens = []
            total = len(df)
            for idx, text in enumerate(df['general']):
                tokens.append(rcs(text) if text is not None else None)

                # Report progress for every row position, including rows
                # whose text is missing, so the final line is always shown.
                done = idx + 1
                if done % 100 == 0 or done == total:
                    progress = f"Processing file {path}: {done}/{total} ({done / total * 100:.2f}%)\n"
                    text_box.insert(tk.END, progress)
                    text_box.see(tk.END)
            df['tokens'] = tokens

            # Save under the output folder, keeping the original file name.
            output_path = os.path.join(output_dir, os.path.basename(path))
            df.to_parquet(output_path, engine="pyarrow")

            text_box.insert(tk.END, f"Finished processing {path}\n")
            text_box.see(tk.END)

        text_box.insert(tk.END, "모든 파일이 성공적으로 처리되었습니다.\n")
        text_box.see(tk.END)

        # Open the output folder when everything is done. os.startfile only
        # exists on Windows, so skip it elsewhere instead of crashing.
        if hasattr(os, "startfile"):
            os.startfile(output_dir)
    except Exception as exc:
        # Surface the failure in the GUI; the traceback still goes to stderr.
        text_box.insert(tk.END, f"Error: {exc}\n")
        text_box.see(tk.END)
        raise
    finally:
        # Reset the caller's selection in place (rebinding the parameter
        # would leave the caller's list untouched).
        if isinstance(dflist, list):
            dflist.clear()
        # Re-enable the buttons even if processing failed.
        select_button.config(state=tk.NORMAL)
        process_button.config(state=tk.NORMAL)
def select_files():
    """Ask the user for parquet files and queue the chosen paths.

    The chosen paths are appended to the module-level ``dflist`` and echoed
    into ``text_box``. Nothing happens when the dialog returns no paths.
    """
    chosen = filedialog.askopenfilenames(filetypes=[("Parquet files", "*.parquet")])
    if not chosen:
        return
    dflist.extend(chosen)
    text_box.insert(tk.END, f"Selected files:\n{chosen}\n")
    text_box.see(tk.END)
def start_processing():
    """Disable both buttons and run ``process_files`` on a background thread.

    Does nothing when no files have been selected (``dflist`` is empty).
    """
    if dflist:
        for button in (select_button, process_button):
            button.config(state=tk.DISABLED)
        # Start the file-processing thread.
        worker = threading.Thread(
            target=process_files,
            args=(dflist, text_box, select_button, process_button),
        )
        worker.start()
# Tkinter UI setup. The user-facing strings below are restored from
# mis-encoded text (UTF-8 bytes that had been decoded with the wrong codec).
root = tk.Tk()
# Window title: "Parquet file token update tool for the prompt stacker".
root.title("프롬프트 스태커용 parquet 파일 토큰 업데이트 도구")

# Row holding the two action buttons.
frame = tk.Frame(root)
frame.pack(padx=10, pady=10)

# Button label: "Select Parquet files".
select_button = tk.Button(frame, text="Parquet 파일 선택", command=select_files)
select_button.pack(side=tk.LEFT, padx=5, pady=5)

# Button label: "Start token calculation".
process_button = tk.Button(frame, text="토큰 계산 시작", command=start_processing)
process_button.pack(side=tk.LEFT, padx=5, pady=5)

# Scrolling log area that receives selection and progress messages.
text_box = scrolledtext.ScrolledText(root, width=80, height=20)
text_box.pack(padx=10, pady=10)

# Paths selected so far; filled by select_files and passed to process_files.
dflist = []

root.mainloop()