| # import math | |
| # from collections import defaultdict | |
| # from config import config | |
| # def calculate_duration(semt): | |
| # return math.ceil(((len(semt.split()) + 1) / 50) * 2) / 2 | |
| # def get_weights(weights,expected_data,languages): | |
| # new_weights = [] | |
| # expected_weights = config.weights_percentage_duration | |
| # total_files = sum([expected_data['total'][i] for i in expected_data['total']]) | |
| # duration_multiplier = {i:config.weights_percentage_duration[i]/(expected_data['total'][i]/total_files) for i in config.weights_percentage_duration} | |
| # print(expected_data['total'],duration_multiplier) | |
| # for i in weights: | |
| # new_weights.append(duration_multiplier[i[1]]) | |
| # return new_weights | |
| # def process_file(file_path): | |
| # weights = [] | |
| # expected_data = defaultdict(lambda: {i:0 for i in ["single_word","5s","10s","15s","20s","20_sentence",">20"]}) | |
| # languages = defaultdict(int) | |
| # count = 0 | |
| # with open(file_path, 'r') as file: | |
| # for line in file: | |
| # count+=1 | |
| # lang, path, text, semt, ref_files = line.split('|') | |
| # languages[lang]+=1 | |
| # dur = calculate_duration(semt) | |
| # # weights.append([lang,,1.0]) | |
| # # duration_files['total'][dur] += 1 | |
| # if len(text.strip().split(' '))==1: | |
| # expected_data[lang]["single_word"]+=1 | |
| # expected_data['total']["single_word"]+=1 | |
| # weights.append([lang,"single_word",1.0]) | |
| # continue | |
| # if dur >19.5 and dur<=20: | |
| # expected_data[lang]["20_sentence"]+=1 | |
| # expected_data['total']["20_sentence"]+=1 | |
| # weights.append([lang,"20_sentence",1.0]) | |
| # continue | |
| # if dur<=5: | |
| # expected_data[lang]["5s"]+=1 | |
| # expected_data['total']["5s"]+=1 | |
| # weights.append([lang,"5s",1.0]) | |
| # continue | |
| # elif dur<=10: | |
| # expected_data[lang]["10s"]+=1 | |
| # expected_data['total']["10s"]+=1 | |
| # weights.append([lang,"10s",1.0]) | |
| # continue | |
| # elif dur<=15: | |
| # expected_data[lang]["15s"]+=1 | |
| # expected_data['total']["15s"]+=1 | |
| # weights.append([lang,"15s",1.0]) | |
| # continue | |
| # elif dur<=20: | |
| # expected_data[lang]["20s"]+=1 | |
| # expected_data['total']["20s"]+=1 | |
| # weights.append([lang,"20s",1.0]) | |
| # continue | |
| # else: | |
| # # expected_data[lang][">20"]+=1 | |
| # # expected_data['total'][">20"]+=1 | |
| # # weights.append([lang,">20",1.0]) | |
| # continue | |
| # final_weights = get_weights(weights,expected_data,languages) | |
| # return final_weights,count | |
| # def process_file_for_heads(file_path,total_processes,process_id): | |
| # weights = [] | |
| # # heads = defaultdict(lambda: {i:[] for i in ["single_word","5s","10s","15s","20s","20_sentence",">20"]}) # to include langauges | |
| # heads = {i:[] for i in ["single_word","5s","10s","15s","20s","20_sentence",">20"]} | |
| # expected_data = defaultdict(lambda: {i:0 for i in ["single_word","5s","10s","15s","20s","20_sentence",">20"]}) | |
| # languages = defaultdict(int) | |
| # count = 0 | |
| # line_number = -1 | |
| # with open(file_path, 'r') as file: | |
| # for line in file: | |
| # count+=1 | |
| # line_number+=1 | |
| # lang, path, text, semt, ref_files = line.split('|') | |
| # languages[lang]+=1 | |
| # dur = calculate_duration(semt) | |
| # # weights.append([lang,,1.0]) | |
| # # duration_files['total'][dur] += 1 | |
| # if len(text.strip().split(' '))==1: | |
| # expected_data[lang]["single_word"]+=1 | |
| # expected_data['total']["single_word"]+=1 | |
| # weights.append([lang,"single_word",1.0]) | |
| # heads["single_word"].append(line_number) | |
| # continue | |
| # if dur >19.5 and dur<=20: | |
| # expected_data[lang]["20_sentence"]+=1 | |
| # expected_data['total']["20_sentence"]+=1 | |
| # weights.append([lang,"20_sentence",1.0]) | |
| # heads["20_sentence"].append(line_number) | |
| # continue | |
| # if dur<=5: | |
| # expected_data[lang]["5s"]+=1 | |
| # expected_data['total']["5s"]+=1 | |
| # weights.append([lang,"5s",1.0]) | |
| # heads["5s"].append(line_number) | |
| # continue | |
| # elif dur<=10: | |
| # expected_data[lang]["10s"]+=1 | |
| # expected_data['total']["10s"]+=1 | |
| # weights.append([lang,"10s",1.0]) | |
| # heads["10s"].append(line_number) | |
| # continue | |
| # elif dur<=15: | |
| # expected_data[lang]["15s"]+=1 | |
| # expected_data['total']["15s"]+=1 | |
| # weights.append([lang,"15s",1.0]) | |
| # heads["15s"].append(line_number) | |
| # continue | |
| # elif dur<=20: | |
| # expected_data[lang]["20s"]+=1 | |
| # expected_data['total']["20s"]+=1 | |
| # weights.append([lang,"20s",1.0]) | |
| # heads["20s"].append(line_number) | |
| # continue | |
| # else: | |
| # # expected_data[lang][">20"]+=1 | |
| # # expected_data['total'][">20"]+=1 | |
| # # weights.append([lang,">20",1.0]) | |
| # continue | |
| # line_number+=1 | |
| # # final_weights = get_weights(weights,expected_data,languages) | |
| # # final_weights = [1]*len(heads) # same weightage | |
| # if config.ts_gradient_accumulation_steps>1: | |
| # batch = config.ts_batch_size*total_processes*config.ts_gradient_accumulation_steps//config.ts_num_workers | |
| # else: | |
| # batch = config.ts_batch_size*total_processes*config.ts_gradient_accumulation_steps | |
| # # batch = config.ts_batch_size*total_processes*config.ts_gradient_accumulation_steps | |
| # # heads = heads[:-1] | |
| # heads = {i:heads[i] for i in heads if len(heads[i])!=0} | |
| # total_size = sum([len(heads[i]) for i in heads if len(heads[i])!=0]) | |
| # norm_nums = [len(heads[i])/total_size for i in heads if len(heads[i])!=0] | |
| # final_weights = [] | |
| # for i in norm_nums: | |
| # final_weights.append(max(1,math.ceil(i*batch))) | |
| # rem_elem = sum(final_weights)-batch | |
| # final_weights[final_weights.index(max(final_weights))]-=rem_elem | |
| # # heads,final_weights = sorted(zip(heads,final_weights),key=lambda x:x[1]) | |
| # # process_head = [] | |
| # # proc = 0 | |
| # # sm=0 | |
| # # for i in final_weights: | |
| # # # sm+=i | |
| # # if sm+i > | |
| # # process_batch_size = config.ts_batch_size*config.ts_gradient_accumulation_steps | |
| # # proc = 0 | |
| # # lens = {i:len(heads[i]) for i in heads} | |
| # # while proc <= process_id: | |
| # # new_heads ={} | |
| # # new_weights =[] | |
| # # sm=0 | |
| # # for i,j in zip(heads,range(len(final_weights))): | |
| # # if sm + final_weights[j] > process_batch_size: | |
| # # if sm+final_weights[j] == process_batch_size: | |
| # # new_heads[i] = heads[i] | |
| # # new_weights.append(final_weights[j]) | |
| # # heads.pop(i) | |
| # # else: | |
| # # new_heads[i] = heads[i][:1+(lens[i]*(process_batch_size-sm)//process_batch_size)] | |
| # # heads[i] = heads[i][1+(lens[i]*(process_batch_size-sm)//process_batch_size):] | |
| # # if len(heads[i]) == 0: | |
| # # heads.pop(i) | |
| # # new_weights.append(process_batch_size-sm) | |
| # # final_weights[j]-= process_batch_size-sm | |
| # # sm = 0 | |
| # # proc+=1 | |
| # # final_weights=final_weights[j:] | |
| # # break | |
| # # else: | |
| # # new_heads[i] = heads[i] | |
| # # new_weights.append(final_weights[j]) | |
| # # heads.pop(i) | |
| # # if len(heads) == 0: | |
| # # break | |
| # # print("weights",new_weights,[(i,len(heads[i])) for i in new_heads]) | |
| # # return new_heads,new_weights,count | |
| # # # make it more effective as to real_batch_size instead of worker_batch_size | |
| # # # #[867, 31444, 35458, 6764, 1561, 96, 0] per gpu for iitm | |
| # # # [10,400,400,60,20,1] | |
| # # # | |
| # print("weights",final_weights,[(i,len(heads[i])) for i in heads]) | |
| # print(process_id) | |
| # new_heads, new_weights = process_batches(heads,final_weights,process_id+1) | |
| # assert len(new_heads) != 0 and len(new_weights) == len(new_heads), print(new_heads) | |
| # print("process id",process_id,new_weights,[(i,len(new_heads[i])) for i in new_heads]) | |
| # return new_heads, new_weights, count | |
| # def process_batches(heads, final_weights, process_id=0): | |
| # if config.ts_gradient_accumulation_steps>1: | |
| # process_batch_size = config.ts_batch_size * config.ts_gradient_accumulation_steps//config.ts_num_workers | |
| # else: | |
| # process_batch_size = config.ts_batch_size * config.ts_gradient_accumulation_steps | |
| # proc = 0 | |
| # # Create a copy of the original dictionaries to avoid modifying them during iteration | |
| # remaining_heads = heads.copy() | |
| # remaining_weights = final_weights.copy() | |
| # lens = {i: len(heads[i]) for i in heads} | |
| # while proc <= process_id and remaining_heads: | |
| # new_heads = {} | |
| # new_weights = [] | |
| # current_sum = 0 | |
| # # Convert items to list to avoid dictionary size change during iteration | |
| # items = list(remaining_heads.items()) | |
| # for key, head_list in items: | |
| # weight = remaining_weights[0] # Get the corresponding weight | |
| # if current_sum + weight > process_batch_size: | |
| # # Calculate how much of this head we can include | |
| # remaining_space = process_batch_size - current_sum | |
| # if current_sum + weight == process_batch_size: | |
| # # If it fits exactly | |
| # new_heads[key] = head_list | |
| # new_weights.append(weight) | |
| # del remaining_heads[key] | |
| # remaining_weights.pop(0) | |
| # # print("inside first") | |
| # else: | |
| # # If we need to split the head | |
| # split_point = 1 + (lens[key] * remaining_space) // process_batch_size | |
| # new_heads[key] = head_list[:split_point] | |
| # remaining_heads[key] = head_list[split_point:] | |
| # # print(process_id,"inside >",remaining_heads) | |
| # if not remaining_heads[key]: # If the remaining list is empty | |
| # del remaining_heads[key] | |
| # new_weights.append(remaining_space) | |
| # remaining_weights[0] -= remaining_space | |
| # # print(process_id,"inside >",remaining_heads) | |
| # # print("inside seciond") | |
| # proc += 1 | |
| # break | |
| # else: | |
| # # If the current head fits completely | |
| # new_heads[key] = head_list | |
| # new_weights.append(weight) | |
| # del remaining_heads[key] | |
| # remaining_weights.pop(0) | |
| # current_sum += weight | |
| # # print("inside third") | |
| # if len(remaining_heads)==0: | |
| # proc+=1 | |
| # if proc == process_id: | |
| # # print("process id",process_id,proc,new_weights,[(i,len(new_heads[i])) for i in new_heads]) | |
| # return new_heads, new_weights | |
| # return {}, [] # Return empty structures if no valid batch is found | |