Spaces:
Running
Running
| import pandas as pd | |
| import numpy as np | |
| import ast | |
| import logging | |
| import re | |
| logger = logging.getLogger(__name__) | |
def parse_r_vector(s):
    """
    Parse R vector format strings like c("word1", "word2") into Python lists.

    Args:
        s: String in R vector format. A value that is already a list is
            returned unchanged. Anything else (NaN, None, numbers, strings
            without the c(...) wrapper) yields an empty list.

    Returns:
        List of strings: the double-quoted items found inside c(...),
        stripped of surrounding whitespace, with empty items and the
        literal "NA" (any case) removed.
    """
    # Type checks come first on purpose: pd.isna() applied to a list returns
    # an array, and using that array in an `if` raises ValueError for any
    # list with more than one element.
    if isinstance(s, list):
        return s
    if not isinstance(s, str):
        # Covers NaN / None / pd.NA as well as any other non-string scalar.
        return []
    if not (s.startswith('c(') and s.endswith(')')):
        return []

    # Extract content between c( and )
    content = s[2:-1].strip()
    # Pull out every double-quoted item; unquoted tokens (e.g. a bare NA)
    # are ignored by the pattern.
    matches = re.findall(r'"([^"]*)"', content)
    # Strip before filtering so that padded values such as " NA " are
    # recognised as NA too.
    cleaned = (item.strip() for item in matches)
    return [item for item in cleaned if item and item.lower() != 'na']
def preprocess_data(df):
    """
    Normalise the recipe dataframe column by column and return it.

    The dataframe is modified in place; the same object is returned.

    Steps, in order:
      1. Dietary flag columns are mapped to integers (1 for 'TRUE'/True,
         0 for 'FALSE'/False); any other value becomes 0.
      2. Numeric columns are coerced with pd.to_numeric (unparseable values
         become NaN) and NaNs are filled with the column median.
      3. R-vector columns are converted to lists with parse_r_vector.
      4. List-like string columns are converted with parse_list_string.

    Every listed column must exist in df; a missing one raises KeyError.
    """
    flag_lookup = {'TRUE': 1, 'FALSE': 0, True: 1, False: 0}
    flag_names = (
        'is_vegetarian', 'is_vegan', 'is_gluten free', 'is_dairy free',
        'is_low carb', 'is_keto', 'is_paleo',
    )
    for flag in flag_names:
        encoded = df[flag].map(flag_lookup)
        # Values absent from the lookup map to NaN; treat them as "false".
        df[flag] = encoded.fillna(0).astype(int)

    metric_names = ('Calories', 'TotalTime_minutes', 'AggregatedRating', 'ReviewCount')
    for metric in metric_names:
        df[metric] = pd.to_numeric(df[metric], errors='coerce')
        df[metric] = df[metric].fillna(df[metric].median())

    # Columns stored as R vectors: c("a", "b", ...)
    for name in ('RecipeIngredientParts', 'RecipeInstructions', 'RecipeIngredientQuantities'):
        df[name] = df[name].apply(parse_r_vector)

    # Columns stored as list-like strings
    for name in ('Keywords', 'keywords_name'):
        df[name] = df[name].apply(parse_list_string)

    return df
def parse_list_string(s):
    """
    Safely parse list-like strings.

    Args:
        s: Value to parse. Expected to be a string holding a Python list
            literal, an actual list, or a missing value.

    Returns:
        - s itself when it is already a list;
        - the parsed list when s is a string containing a list literal;
        - [s] when s is a non-empty string that is not a list literal
          (either it fails to parse or it parses to a non-list value);
        - [] for an empty string and for any non-string, non-list input
          (NaN, None, numbers, ...).
    """
    # Type checks come first on purpose: pd.isna() applied to a list returns
    # an array, and using that array in an `if` raises ValueError for any
    # list with more than one element.
    if isinstance(s, list):
        return s
    if not isinstance(s, str):
        # Covers NaN / None / pd.NA as well as any other non-string scalar.
        return []
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # ast.literal_eval documents all of these for malformed input
        # (e.g. '{[1]: 2}' raises TypeError). Fall back to treating the
        # raw string as a single item.
        return [s] if s else []
    return parsed if isinstance(parsed, list) else [s]
def parse_recipe_ingredients(ingredient_parts):
    """
    Convert a RecipeIngredientParts value into a list of ingredients.

    Thin wrapper: the value is passed straight to parse_r_vector and its
    result is returned unchanged.
    """
    ingredients = parse_r_vector(ingredient_parts)
    return ingredients
def parse_list_field(field):
    """
    Parse a list field, handling various input types including R vectors.

    Args:
        field: An actual list, a string (R vector such as c("a", "b") or a
            Python list literal), or a missing / other value.

    Returns:
        - field itself when it is already a list;
        - the result of parse_r_vector when field is a string starting
          with 'c(';
        - the parsed list when field is a string containing a list literal;
        - [] in every other case (unparseable string, string that parses to
          a non-list value, NaN, None, numbers, ...).
    """
    # Type checks come first on purpose: pd.isna() applied to a list returns
    # an array, and using that array in an `if` raises ValueError for any
    # list with more than one element.
    if isinstance(field, list):
        return field
    if not isinstance(field, str):
        # Covers NaN / None / pd.NA as well as any other non-string scalar.
        return []
    if field.startswith('c('):
        return parse_r_vector(field)
    try:
        parsed = ast.literal_eval(field)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # ast.literal_eval documents all of these for malformed input.
        return []
    return parsed if isinstance(parsed, list) else []