Spaces:
Sleeping
Sleeping
| import logging | |
| import random | |
| import math | |
def _select_int_docs(instance, passage_num, pos_num):
    """Pick documents for the '_int' filename case.

    Here ``instance['positive']`` is a list of document groups (a list of
    lists). One randomly chosen document is taken from every group; if that
    yields fewer than ``pos_num`` documents, further documents are taken
    round-robin from the groups until ``pos_num`` is reached or the groups
    are exhausted. Remaining slots up to ``passage_num`` are filled from the
    front of ``instance['negative']``.
    """
    # Shuffle copies: shuffling the groups in place would silently reorder
    # the caller's data on every call.
    groups = []
    for group in instance['positive']:
        shuffled = list(group)
        random.shuffle(shuffled)
        groups.append(shuffled)

    # Empty groups contribute nothing instead of raising IndexError.
    docs = [group[0] for group in groups if group]

    if len(docs) < pos_num:
        # default=0 keeps this safe when there are no positive groups at all.
        deepest = max((len(group) for group in groups), default=0)
        for depth in range(1, deepest):
            for group in groups:
                if len(group) > depth:
                    docs.append(group[depth])
                    if len(docs) == pos_num:
                        break
            if len(docs) == pos_num:
                break

    # Whatever room is left is filled with noise documents.
    neg_num = passage_num - len(docs)
    if neg_num > 0:
        docs += instance['negative'][:neg_num]
    return docs


def _select_fact_docs(instance, passage_num, neg_num, correct_rate):
    """Pick documents for the '_fact' filename case.

    Combines a random sample of ``instance['positive']``, the first
    ``neg_num`` entries of ``instance['negative']`` (when that key exists)
    and, for ``correct_rate < 1.0``, a random sample of
    ``instance['positive_wrong']``. The result is trimmed to ``passage_num``
    documents, or topped up with not-yet-used negatives when it falls short.
    """
    correct_num = math.ceil(passage_num * correct_rate)
    if correct_rate == 1.0:
        # Only positive and negative documents are used: cap the positives
        # so the negatives still fit, and take no positive_wrong documents.
        correct_num = min(correct_num, passage_num - neg_num)
        wrong_num = 0
    else:
        # Slots left after negatives and positives go to positive_wrong.
        wrong_num = max(passage_num - neg_num - correct_num, 0)

    positives = instance['positive']
    docs = random.sample(positives, min(len(positives), correct_num))

    has_negative = 'negative' in instance
    used_negatives = 0
    if neg_num > 0 and has_negative:
        chosen = instance['negative'][:neg_num]
        used_negatives = len(chosen)
        docs += chosen

    if wrong_num > 0 and correct_rate < 1.0:
        wrong = instance['positive_wrong']
        docs += random.sample(wrong, min(len(wrong), wrong_num))

    if len(docs) > passage_num:
        # Shuffle before truncating so the cut does not favour one source.
        random.shuffle(docs)
        docs = docs[:passage_num]
    elif len(docs) < passage_num and has_negative:
        # Top up starting at the first negative that was not taken above;
        # slicing from index 0 again would duplicate documents.
        remaining = passage_num - len(docs)
        docs += instance['negative'][used_negatives:used_negatives + remaining]
    return docs


def _select_default_docs(instance, noise_rate, passage_num, neg_num, pos_num):
    """Pick documents when the filename matches neither '_int' nor '_fact'.

    Takes the first ``pos_num`` entries of ``instance['positive']`` followed
    by the first ``neg_num`` entries of ``instance['negative']`` and logs how
    many of each were used.
    """
    if noise_rate == 1:
        neg_num = passage_num
        pos_num = 0

    # Slicing already clamps to the available documents, so no explicit
    # bounds adjustment of pos_num / neg_num is needed.
    positive = instance['positive'][:pos_num]
    negative = instance['negative'][:neg_num]

    logging.info(f"Using {len(positive)} positive and {len(negative)} negative documents as context")
    return positive + negative


def process_data(instance, noise_rate, passage_num, filename, correct_rate=0):
    """Process the data for generating a noisy document set.

    Args:
        instance: Mapping with the keys 'query' and 'answer' plus the
            document pools 'positive' and 'negative' (and 'positive_wrong'
            for the '_fact' case). In the '_int' case 'positive' is a list
            of document groups rather than a flat list.
        noise_rate: Fraction of ``passage_num`` reserved for negative
            documents; the count is rounded up.
        passage_num: Target number of documents to return.
        filename: Selects the strategy: a name containing '_int' or '_fact'
            uses the matching strategy ('_int' is checked first), anything
            else uses the default positive/negative split.
        correct_rate: Fraction of ``passage_num`` drawn from 'positive' in
            the '_fact' case; unused otherwise.

    Returns:
        A ``(query, ans, docs)`` tuple where ``docs`` is the selected
        document list in random order. ``docs`` may hold fewer than
        ``passage_num`` entries when the pools are too small.
    """
    query = instance['query']
    ans = instance['answer']
    logging.info(f"Query: {query}")
    logging.info(f"Answer: {ans}")

    neg_num = math.ceil(passage_num * noise_rate)
    pos_num = passage_num - neg_num

    if '_int' in filename:
        docs = _select_int_docs(instance, passage_num, pos_num)
    elif '_fact' in filename:
        docs = _select_fact_docs(instance, passage_num, neg_num, correct_rate)
    else:
        docs = _select_default_docs(instance, noise_rate, passage_num, neg_num, pos_num)

    # Shuffle the final document list
    random.shuffle(docs)
    return query, ans, docs