File size: 3,333 Bytes
c30b4ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import pandas as pd
import numpy as np
import ast
import logging
import re

logger = logging.getLogger(__name__)

def parse_r_vector(s):
    """
    Parse R vector format strings like c("word1", "word2") into Python lists.

    Args:
        s: Value to parse. Normally a string in R vector format; a value
           that is already a list is passed through unchanged.

    Returns:
        List of the double-quoted items found inside ``c(...)``, with
        surrounding whitespace stripped and with empty items and literal
        "NA" items (any letter case) removed. An empty list is returned for
        missing values and for any input that is neither a list nor a
        string wrapped in ``c(`` ... ``)``.
    """
    # Dispatch on type first. Calling pd.isna() on a list yields an
    # element-wise array, and using that array in an `if` raises ValueError
    # for lists with more than one element, so lists must never reach it.
    if isinstance(s, list):
        return s

    # None, NaN, numbers and every other non-string value have no R vector
    # representation to parse.
    if not isinstance(s, str):
        return []

    if not (s.startswith('c(') and s.endswith(')')):
        return []

    # Extract content between c( and )
    content = s[2:-1].strip()

    # Pull out every double-quoted item; splitting on commas would break
    # items that themselves contain commas.
    stripped_items = (item.strip() for item in re.findall(r'"([^"]*)"', content))

    # Filter on the stripped value so padded entries such as " NA " are
    # dropped as well.
    return [item for item in stripped_items if item and item.lower() != 'na']

def preprocess_data(df):
    """
    Normalise the boolean, numeric and list-like columns of a recipe dataframe.

    The dataframe is modified in place and the same object is returned.

    Steps, in order:
      1. Dietary flag columns are mapped to 0/1 integers; any value that is
         not 'TRUE'/'FALSE' or a Python bool becomes 0.
      2. Numeric columns are coerced with ``pd.to_numeric`` (unparseable
         values become NaN) and NaN is replaced by the column median.
      3. R vector columns are converted to lists with ``parse_r_vector``.
      4. Keyword columns are converted to lists with ``parse_list_string``.

    Args:
        df: DataFrame containing every column named below; a missing column
            raises KeyError.

    Returns:
        The same dataframe with the columns converted.
    """
    # 1. Dietary flags -> 0/1 integers
    truth_values = {'TRUE': 1, 'FALSE': 0, True: 1, False: 0}
    flag_columns = (
        'is_vegetarian', 'is_vegan', 'is_gluten free', 'is_dairy free',
        'is_low carb', 'is_keto', 'is_paleo',
    )
    for flag in flag_columns:
        encoded = df[flag].map(truth_values)
        df[flag] = encoded.fillna(0).astype(int)

    # 2. Numeric columns -> numbers, gaps filled with the column median
    for name in ('Calories', 'TotalTime_minutes', 'AggregatedRating', 'ReviewCount'):
        df[name] = pd.to_numeric(df[name], errors='coerce')
        df[name] = df[name].fillna(df[name].median())

    # 3. Columns stored in R vector format -> Python lists
    for name in ('RecipeIngredientParts', 'RecipeInstructions', 'RecipeIngredientQuantities'):
        df[name] = df[name].apply(parse_r_vector)

    # 4. Columns stored as list-like strings -> Python lists
    for name in ('Keywords', 'keywords_name'):
        df[name] = df[name].apply(parse_list_string)

    return df

def parse_list_string(s):
    """
    Safely parse list-like strings.

    Args:
        s: Value to parse. Normally a string holding a Python list literal
           such as "['a', 'b']"; a value that is already a list is passed
           through unchanged.

    Returns:
        - the list itself when ``s`` is a list;
        - the parsed list when ``s`` is a string containing a list literal;
        - ``[s]`` when ``s`` is a non-empty string that is not a list
          literal (including text that cannot be parsed at all);
        - an empty list for an empty string, missing values and every other
          type.
    """
    # Dispatch on type first. Calling pd.isna() on a list yields an
    # element-wise array, and using that array in an `if` raises ValueError
    # for lists with more than one element, so lists must never reach it.
    if isinstance(s, list):
        return s

    # None, NaN, numbers and any other non-string value carry no list text.
    if not isinstance(s, str):
        return []

    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed
        # input; treat the raw text as a single entry instead of crashing.
        return [s] if s else []

    return parsed if isinstance(parsed, list) else [s]

def parse_recipe_ingredients(ingredient_parts):
    """
    Convert a RecipeIngredientParts value into a list.

    This is a thin wrapper: the value is handed to parse_r_vector and the
    result is returned unchanged.
    """
    ingredients = parse_r_vector(ingredient_parts)
    return ingredients

def parse_list_field(field):
    """
    Parse a list field, handling various input types including R vectors.

    Args:
        field: Value to parse. May already be a list, a string in R vector
               format (starting with ``c(``), or a string holding a Python
               list literal.

    Returns:
        - the list itself when ``field`` is a list;
        - the result of ``parse_r_vector`` when ``field`` is a string
          starting with ``c(``;
        - the parsed list when ``field`` is a string containing a Python
          list literal;
        - an empty list in every other case (missing values, other types,
          unparseable text, literals that are not lists).
    """
    # Dispatch on type first. Calling pd.isna() on a list yields an
    # element-wise array, and using that array in an `if` raises ValueError
    # for lists with more than one element, so lists must never reach it.
    if isinstance(field, list):
        return field

    # None, NaN, numbers and any other non-string value carry no list text.
    if not isinstance(field, str):
        return []

    if field.startswith('c('):
        return parse_r_vector(field)

    try:
        parsed = ast.literal_eval(field)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed
        # input; an unparseable field is treated as empty.
        return []

    return parsed if isinstance(parsed, list) else []