Spaces:
Sleeping
Sleeping
| # sanitize_text is defined further down in this file | |
| import re | |
def merge_documents(main_dict, additional_json, limit=1000):
    """
    Copy up to ``limit`` new documents from ``additional_json`` into ``main_dict``.

    Documents whose ``wikipedia_id`` is already a key of ``main_dict`` are
    skipped and do not count toward the limit.

    Args:
        main_dict (dict): Mapping of document ID to sanitized text; updated in place.
        additional_json (list): Iterable of document dicts, read via
            ``doc.get("wikipedia_id")`` and ``doc.get("text", [])``.
        limit (int): Maximum number of documents to add.

    Returns:
        dict: The same ``main_dict`` object, after the additions.
    """
    count = 0
    for entry in additional_json:
        # Stop scanning as soon as the quota of new documents is reached.
        if count >= limit:
            break

        doc_id = entry.get("wikipedia_id")
        pieces = entry.get("text", [])

        # Never overwrite a document that is already present.
        if doc_id in main_dict:
            continue

        # Join the text pieces with single spaces, then sanitize.
        main_dict[doc_id] = sanitize_text(" ".join(pieces))
        count += 1

    print(f"{count} documents added to the main dictionary.")
    return main_dict
def sanitize_text(text):
    """
    Strip everything except ASCII letters, digits and whitespace from a string,
    then normalize whitespace.

    Args:
        text (str): Text to sanitize. Non-string values are returned unchanged.

    Returns:
        str: The text with disallowed characters removed, each whitespace run
            collapsed to one space, and leading/trailing whitespace trimmed.
    """
    # Anything that is not a string passes through untouched.
    if not isinstance(text, str):
        return text

    # Drop every character that is not a-z, A-Z, 0-9 or whitespace.
    kept = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse whitespace runs to a single space and trim the ends.
    return re.sub(r'\s+', ' ', kept).strip()
def process_json_data(json_data):
    """
    Build a dictionary of sanitized document text keyed by ``wikipedia_id``.

    Args:
        json_data (list): Iterable of document dicts, read via
            ``doc.get("wikipedia_id")`` and ``doc.get("text", [])``.

    Returns:
        dict: Maps each document's ``wikipedia_id`` to its text pieces joined
            with single spaces and passed through ``sanitize_text``. If an ID
            occurs more than once, the last document with that ID wins.
    """
    return {
        entry.get("wikipedia_id"): sanitize_text(" ".join(entry.get("text", [])))
        for entry in json_data
    }
def process_queries(json_data):
    """
    Map each query ID to its query text.

    Args:
        json_data (dict): Mapping of query ID to a dict of query details; the
            text is read from each entry's ``"input"`` key.

    Returns:
        dict: Query ID -> query text, with an empty string for entries that
            have no ``"input"`` key.
    """
    return {
        query_id: query_info.get("input", "")
        for query_id, query_info in json_data.items()
    }
| # Example usage | |
| # Assuming `query_json_file` contains your JSON data | |
| # processed_queries = process_queries(query_json_file) | |