Spaces:
Runtime error
Runtime error
| import os | |
| import glob | |
| import time | |
| import numpy as np | |
| from os import path | |
| import pandas as pd | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| from sklearn.cluster import DBSCAN | |
| from sklearn.preprocessing import StandardScaler | |
| import arrange as DbscanArrange | |
| import directories as Dir | |
| """ | |
| Previous process: | |
| Word images were extracted from the handwritten text image | |
| through image detection, | |
| but only the individual words were identified, without their order, | |
| so the meaning of the original text was lost. | |
| However, the coordinates of each word could be included at extraction time, | |
| so each word image file is created with its x, y coordinates | |
| and its w, h information in the file name. | |
| Current process: | |
| The images cropped by the YOLO model carry, in their file names, | |
| the coordinates they had in the raw image before cropping: | |
| x89y147w199h184.jpg | |
| x10y148w157h184.jpg | |
| x28y149w108h180.jpg | |
| Words on the same line of the raw image file | |
| should have y values close to each other, | |
| so a density-based clustering algorithm is used and | |
| the words expected to be on the same line | |
| end up in the same cluster. | |
| The mean of the y values belonging to the same cluster is computed | |
| and used as the new y value (label) of each word in it. | |
| When the word image files are renamed, the y value comes first: | |
| yyyy_xxxx.jpg | |
| The files are then ordered by y value | |
| and, within the same y value, in ascending order of x value: | |
| 0148_0010.jpg | |
| 0148_0028.jpg | |
| 0148_0089.jpg | |
| The order of the word files thus reflects the reading order of the original text. | |
| Code overview: | |
| extract_text_from_filename() and | |
| get_folder_contents_with_text() are used to | |
| extract the x and y values from the file names. | |
| Standardize with StandardScaler. | |
| Get clustered y values using DBSCAN. | |
| Use rename_file() to rename image files with y-clustered values. | |
| """ | |
# Usage example
# Collect the cropped word images from the YOLO crop folder.
# folder_path = "C:/Users/ban/TEXTAI/yolov5/runs/detect/yujin_paper/crops/word" -> cropped word folder
x_texts, y_texts, name_jpg = DbscanArrange.get_folder_contents_with_text(
    "/home/user/app" + Dir.yolo_dir + Dir.folder_path
)
# presumably x_texts / y_texts are the coordinates parsed from the file
# names and name_jpg the file names themselves -- confirm against
# arrange.get_folder_contents_with_text.

# One-column frame holding the file names.
file_name = pd.DataFrame(name_jpg)
file_name.columns = ['file_name']

# Column vectors (shape (-1, 1)): the y values and a same-length dummy
# column of zeros used as a placeholder second column.
zero_list = np.array([[0] * len(y_texts)]).reshape(-1, 1)
y_text = np.array(y_texts).reshape(-1, 1)

#####################################
# Standardization
scalerX = StandardScaler()
# NOTE(review): the scaler is handed `y_text.data` (the array's buffer
# object) rather than the array itself; kept exactly as before.
std_y_text = scalerX.fit_transform(y_text.data).reshape(-1, 1)

# The standardized y values as a one-column frame; this is the clustering input.
feature = pd.DataFrame(std_y_text, columns=['feature'])

# Standardized values paired with the dummy zero column.
data_list = [std_y_text, zero_list]
data = pd.DataFrame(data_list[0], columns=['y'])
labels = pd.DataFrame(data_list[1], columns=['labels'])

# Merge the two columns side by side.
datadf = pd.concat([data, labels], axis=1)
###################################
# Create the DBSCAN model and predict a cluster label for every word from
# its standardized y value.
model = DBSCAN(eps=0.04, min_samples=2)
predict = pd.DataFrame({'predict': model.fit_predict(feature)})

# Merge file_name, feature and predict column-wise, then order the rows by
# cluster label.
# r.to_csv('C:\\Users\\ban\\Desktop\\predict_final.csv')  # optional debug dump
###########################################################
###########################################################
r = pd.concat([file_name, feature, predict], axis=1).sort_values(by=['predict'])

# Distinct cluster labels present in the prediction column.
predict_list = list(set(r['predict']))
# Build `whole_word_map_df`: one row per cropped word image, with the columns
#   x_value   - x coordinate parsed from the file name (int)
#   file_name - current file name of the cropped image
#   y_mean    - y value to use in the new file name (int)
#
# Labels >= 0 are real clusters (words on the same text line): every word in
# the cluster gets the truncated mean of the cluster's y values, and the rows
# are ordered by x. A negative label is handled as "no line found": each such
# word keeps its own y value (scikit-learn's DBSCAN uses -1 for noise).
#
# Rows are collected per label in a fresh list, so the rows of one label can
# never leak into the next one. Previously the negative-label branch appended
# to the frame left over from the preceding cluster, which added that
# cluster's rows to the result a second time.
word_rows = []
for line in predict_list:
    same_line = r[r['predict'] == line]  # rows of `r` carrying this label

    line_rows = []
    for filename in same_line['file_name']:
        x_data, y_data = DbscanArrange.extract_text_from_filename(filename)
        line_rows.append({
            'x_value': int(x_data),
            'file_name': filename,
            'y_mean': int(y_data),  # own y for now; replaced below for clusters
        })

    if line >= 0 and line_rows:
        # Shared y for the whole text line: truncated mean of the members' y.
        y_mean = int(np.mean([row['y_mean'] for row in line_rows]))
        for row in line_rows:
            row['y_mean'] = y_mean
        line_rows.sort(key=lambda row: row['x_value'])

    word_rows.extend(line_rows)

# Explicit columns keep the frame usable even when no rows were collected.
whole_word_map_df = pd.DataFrame(
    word_rows, columns=['x_value', 'file_name', 'y_mean']
)
# Rename every cropped word image to "<yyyy>_<xxxx>.jpg", where both numbers
# are left-padded with zeros to four digits and the y value comes first.
word_dir = "/home/user/app" + str(Dir.yolo_dir) + str(Dir.folder_path)
for old_name, x_val, y_val in zip(
    whole_word_map_df['file_name'],
    whole_word_map_df['x_value'],
    whole_word_map_df['y_mean'],
):
    old_path = word_dir + "/" + str(old_name)
    new_path = (
        word_dir + "/" + str(y_val).zfill(4) + "_" + str(x_val).zfill(4) + ".jpg"
    )
    DbscanArrange.rename_file(old_path, new_path)

# Report what is in the folder afterwards. os.listdir() returns a list, so the
# former `== True` comparison could never succeed; test for a non-empty
# listing instead, and list the directory only once.
# NOTE(review): this lists Dir.folder_path as-is, whereas the renames above
# use "/home/user/app" + Dir.yolo_dir + Dir.folder_path -- confirm which
# directory is intended here.
folder_contents = os.listdir(Dir.folder_path)
if folder_contents:
    print('UNKNOWN WORDS: \n', folder_contents)