Upload 2 files
Browse files
.gitattributes
CHANGED
|
@@ -152,3 +152,4 @@ NAIA/Beta/NAIA[[:space:]]v1.12[[:space:]](testv3).exe filter=lfs diff=lfs merge=
|
|
| 152 |
NAIA/Beta/NAIA[[:space:]]v1.12.exe filter=lfs diff=lfs merge=lfs -text
|
| 153 |
NAIA/Beta/NAIA[[:space:]]v1.13[[:space:]]testv1.exe filter=lfs diff=lfs merge=lfs -text
|
| 154 |
NAIA/Beta/NAIA[[:space:]]v1.13[[:space:]]testv2.exe filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 152 |
NAIA/Beta/NAIA[[:space:]]v1.12.exe filter=lfs diff=lfs merge=lfs -text
|
| 153 |
NAIA/Beta/NAIA[[:space:]]v1.13[[:space:]]testv1.exe filter=lfs diff=lfs merge=lfs -text
|
| 154 |
NAIA/Beta/NAIA[[:space:]]v1.13[[:space:]]testv2.exe filter=lfs diff=lfs merge=lfs -text
|
| 155 |
+
NAIA/Beta/parquet_token_update_tool.exe filter=lfs diff=lfs merge=lfs -text
|
NAIA/Beta/parquet_token_update_tool.exe
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:957e920ef0ceafd91952f3b761585da854be3265878e82f01952a2f5c4b3484c
|
| 3 |
+
size 259772306
|
NAIA/Beta/parquet_token_update_tool.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from transformers import CLIPTokenizer
|
| 4 |
+
import tkinter as tk
|
| 5 |
+
from tkinter import filedialog, scrolledtext
|
| 6 |
+
import threading
|
| 7 |
+
|
| 8 |
+
# Initialize the CLIPTokenizer
|
| 9 |
+
# Shared tokenizer instance; rcs() uses it to encode each text.
s_token = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
|
| 10 |
+
|
| 11 |
+
def rcs(text):
    """Return how many token ids the shared tokenizer produces for ``text``.

    ``None`` is passed through unchanged so that rows without text keep a
    missing value instead of a count.
    """
    return None if text is None else len(s_token.encode(text))
|
| 16 |
+
|
| 17 |
+
def process_files(dflist, text_box, select_button, process_button):
    """Add a ``tokens`` column to each parquet file and save it under ``processed/``.

    Every path in ``dflist`` is read with pandas, ``rcs`` is applied to each
    value of its ``general`` column, the results are stored in a new
    ``tokens`` column, and the frame is written to
    ``processed/<original file name>``. Progress messages are appended to
    ``text_box``.

    Args:
        dflist: Mutable list of parquet file paths. It is emptied in place
            when processing ends so the next run starts from a clean list.
        text_box: Text widget used as a log (``insert`` / ``see`` are called).
        select_button: Button re-enabled when processing ends.
        process_button: Button re-enabled when processing ends.

    Raises:
        Exception: Any error raised while reading, tokenizing or writing is
            logged to ``text_box`` and then re-raised unchanged.
    """

    def log(message):
        # Append to the log widget and keep the newest entry visible.
        text_box.insert(tk.END, message)
        text_box.see(tk.END)

    # Processed copies go to this sub-folder of the current working directory.
    output_dir = "processed"

    try:
        os.makedirs(output_dir, exist_ok=True)

        for path in dflist:
            # Read the parquet file.
            df = pd.read_parquet(path, engine="pyarrow")

            # Build the 'tokens' column, one entry per value of 'general'.
            tokens = []
            total = len(df)
            for idx, text in enumerate(df['general']):
                tokens.append(None if text is None else rcs(text))

                # Report progress every 100 rows and on the last row,
                # regardless of whether this particular row had text.
                done = idx + 1
                if done % 100 == 0 or done == total:
                    log(f"Processing file {path}: {done}/{total} ({done / total * 100:.2f}%)\n")

            df['tokens'] = tokens

            # NOTE(review): only the base name is kept, so two inputs with the
            # same file name from different folders overwrite each other.
            output_path = os.path.join(output_dir, os.path.basename(path))

            # Save the augmented frame as parquet.
            df.to_parquet(output_path, engine="pyarrow")
            log(f"Finished processing {path}\n")

        # NOTE(review): this literal looks mis-encoded (probably Korean text
        # decoded with the wrong charset) - confirm against the original file.
        log("๋ชจ๋ ํ์ผ์ด ์ฑ๊ณต์ ์ผ๋ก ์ฒ๋ฆฌ๋์์ต๋๋ค.\n")

        # Open the output folder once everything is done. os.startfile only
        # exists on Windows, so skip it elsewhere instead of crashing.
        if hasattr(os, "startfile"):
            os.startfile(output_dir)
    except Exception as exc:
        # Surface the failure in the GUI log, then let it propagate.
        log(f"Error: {exc}\n")
        raise
    finally:
        # Reset the caller's list in place; rebinding the parameter would
        # leave the shared list untouched.
        dflist.clear()

        # Always re-enable the buttons, even after a failure.
        select_button.config(state=tk.NORMAL)
        process_button.config(state=tk.NORMAL)
|
| 63 |
+
|
| 64 |
+
def select_files():
    """Ask the user for parquet files and queue the chosen paths.

    The selected paths are appended to the module-level ``dflist`` and echoed
    to ``text_box``. Nothing happens when the dialog returns no selection.
    """
    chosen = filedialog.askopenfilenames(filetypes=[("Parquet files", "*.parquet")])
    if not chosen:
        return

    dflist.extend(chosen)
    text_box.insert(tk.END, f"Selected files:\n{chosen}\n")
    text_box.see(tk.END)
|
| 70 |
+
|
| 71 |
+
def start_processing():
    """Start processing the queued files on a background thread.

    Does nothing when ``dflist`` is empty. Otherwise both buttons are disabled
    and ``process_files`` is started in a new thread with the shared list and
    widgets as arguments.
    """
    if dflist:
        # Disable both buttons before handing the work to the thread.
        for button in (select_button, process_button):
            button.config(state=tk.DISABLED)

        # Start the file-processing thread.
        worker = threading.Thread(
            target=process_files,
            args=(dflist, text_box, select_button, process_button),
        )
        worker.start()
|
| 79 |
+
|
| 80 |
+
# Tkinter UI setup. These names stay at module level because select_files()
# and start_processing() read dflist, text_box and the buttons as globals.
root = tk.Tk()
# NOTE(review): this title looks mis-encoded (probably Korean text decoded with
# the wrong charset) - confirm against the original file. The literal was
# split across two physical lines and has been rejoined so that it parses.
root.title("ํ๋กฌํํธ ์คํ์ปค์ฉ parquet ํ์ผ ํ ํฐ ์๋ฐ์ดํธ ๋๊ตฌ")

# Row holding the two action buttons.
frame = tk.Frame(root)
frame.pack(padx=10, pady=10)

# Button that opens the file picker.
select_button = tk.Button(frame, text="Parquet ํ์ผ ์ ํ", command=select_files)
select_button.pack(side=tk.LEFT, padx=5, pady=5)

# Button that starts the token counting.
process_button = tk.Button(frame, text="ํ ํฐ ๊ณ์ฐ ์์", command=start_processing)
process_button.pack(side=tk.LEFT, padx=5, pady=5)

# Scrollable log area below the buttons.
text_box = scrolledtext.ScrolledText(root, width=80, height=20)
text_box.pack(padx=10, pady=10)

# Initialize dflist, the shared queue of selected parquet paths.
dflist = []

root.mainloop()
|