Spaces:

garvitcpp
/

recipe-rover-api

Running

App Files Files Community

recipe-rover-api / app /utils /data_preprocessing.py

garvitcpp

Upload 48 files

c30b4ba verified over 1 year ago

raw

history blame contribute delete

3.33 kB

	import pandas as pd
	import numpy as np
	import ast
	import logging
	import re

	logger = logging.getLogger(__name__)

	def parse_r_vector(s):
	    """
	    Parse R vector format strings like c("word1", "word2") into Python lists.

	    Args:
	        s: Value to parse. A string in R vector format is parsed; a list is
	            returned unchanged; anything else (None, NaN, numbers, strings
	            without the c(...) wrapper) yields an empty list.

	    Returns:
	        List of strings. Quoted items are stripped of surrounding
	        whitespace; empty items and the literal "NA" (any case) are dropped.
	    """
	    # Dispatch on type BEFORE any NA check: pd.isna() applied to a list
	    # returns a boolean array, and testing that array in an `if` raises
	    # ValueError for lists with more than one element.
	    if isinstance(s, list):
	        return s

	    # Missing values (None, float NaN, pd.NA, ...) are never str instances,
	    # so they are covered here together with every other unsupported type.
	    if not isinstance(s, str):
	        return []

	    # Only the c(...) wrapper form is recognised.
	    if not (s.startswith('c(') and s.endswith(')')):
	        return []

	    try:
	        # Extract content between c( and )
	        content = s[2:-1].strip()

	        # Pull out every double-quoted item; unquoted tokens (e.g. a bare
	        # NA) are not matched and therefore never reach the result.
	        matches = re.findall(r'"([^"]*)"', content)

	        # Filter out empty strings and NA values
	        return [item.strip() for item in matches if item.strip() and item.lower() != 'na']
	    except Exception as e:
	        # Best-effort parser: log and fall back to an empty list rather
	        # than aborting a whole-column apply().
	        logger.warning("Error parsing R vector: %s, Error: %s", s, e)
	        return []

	def preprocess_data(df):
	    """
	    Clean the recipe dataframe column by column and hand it back.

	    The dataframe passed in is modified in place; the same object is
	    returned. Three groups of columns are rewritten:

	    * dietary flag columns become integer 0/1 ('TRUE'/True -> 1,
	      'FALSE'/False -> 0, anything not in the lookup -> 0);
	    * numeric columns are coerced with pd.to_numeric and gaps are filled
	      with the column median;
	    * list-like columns are parsed into Python lists, using
	      parse_r_vector for the R vector columns and parse_list_string for
	      the keyword columns.
	    """
	    flag_lookup = {'TRUE': 1, 'FALSE': 0, True: 1, False: 0}
	    dietary_flags = (
	        'is_vegetarian', 'is_vegan', 'is_gluten free', 'is_dairy free',
	        'is_low carb', 'is_keto', 'is_paleo',
	    )
	    for flag in dietary_flags:
	        encoded = df[flag].map(flag_lookup)
	        # Values missing from the lookup come back as NaN and count as 0.
	        df[flag] = encoded.fillna(0).astype(int)

	    for metric in ('Calories', 'TotalTime_minutes', 'AggregatedRating', 'ReviewCount'):
	        numeric = pd.to_numeric(df[metric], errors='coerce')
	        # Unparseable entries were coerced to NaN; replace them with the median.
	        df[metric] = numeric.fillna(numeric.median())

	    # Column name -> parser that turns each raw cell into a list.
	    column_parsers = {
	        'RecipeIngredientParts': parse_r_vector,
	        'RecipeInstructions': parse_r_vector,
	        'RecipeIngredientQuantities': parse_r_vector,
	        'Keywords': parse_list_string,
	        'keywords_name': parse_list_string,
	    }
	    for name, parser in column_parsers.items():
	        df[name] = df[name].apply(parser)

	    return df

	def parse_list_string(s):
	    """
	    Safely parse list-like strings.

	    Args:
	        s: Value to parse. A list is returned unchanged. A string is
	            evaluated as a Python literal with ast.literal_eval.
	            Anything else (None, NaN, numbers, ...) yields an empty list.

	    Returns:
	        The parsed list when the string is a list literal. Otherwise a
	        one-element list holding the original string, or an empty list
	        when the string is empty.
	    """
	    # Check for list first: pd.isna() on a list returns a boolean array,
	    # and using that array as an `if` condition raises ValueError for
	    # lists with more than one element.
	    if isinstance(s, list):
	        return s

	    # Missing values are never str instances, so they are handled here
	    # along with every other unsupported type.
	    if not isinstance(s, str):
	        return []

	    try:
	        parsed = ast.literal_eval(s)
	    except (ValueError, SyntaxError, TypeError):
	        # TypeError covers literals such as "{[1]: 2}" (unhashable key),
	        # which literal_eval rejects with TypeError rather than ValueError.
	        return [s] if s else []

	    # A valid literal that is not a list (number, tuple, dict, ...) is
	    # kept as the raw string in a single-item list.
	    return parsed if isinstance(parsed, list) else [s]

	def parse_recipe_ingredients(ingredient_parts):
	    """
	    Turn a RecipeIngredientParts value into a list of ingredients.

	    The value is passed unchanged to parse_r_vector, and whatever that
	    parser returns is returned from here.
	    """
	    ingredients = parse_r_vector(ingredient_parts)
	    return ingredients

	def parse_list_field(field):
	    """
	    Parse a list field, handling various input types including R vectors.

	    Args:
	        field: Value to parse. A list is returned unchanged. A string
	            starting with 'c(' is delegated to parse_r_vector. Any other
	            string is evaluated as a Python literal. Everything else
	            (None, NaN, numbers, ...) yields an empty list.

	    Returns:
	        The resulting list, or an empty list when the value is missing,
	        unparseable, or parses to something that is not a list.
	    """
	    # Check for list first: pd.isna() on a list returns a boolean array,
	    # and using that array as an `if` condition raises ValueError for
	    # lists with more than one element.
	    if isinstance(field, list):
	        return field

	    # Missing values are never str instances, so they are handled here
	    # along with every other unsupported type.
	    if not isinstance(field, str):
	        return []

	    if field.startswith('c('):
	        return parse_r_vector(field)

	    try:
	        parsed = ast.literal_eval(field)
	    except (ValueError, SyntaxError, TypeError):
	        # TypeError covers literals such as "{[1]: 2}" (unhashable key),
	        # which literal_eval rejects with TypeError rather than ValueError.
	        return []

	    return parsed if isinstance(parsed, list) else []