Spaces:
Sleeping
Sleeping
Commit
·
4fdc102
1
Parent(s):
c8f19c2
optimization
Browse files
app.py
CHANGED
|
@@ -35,6 +35,7 @@ def download_data():
|
|
| 35 |
csv_filename = 'XWines_Full_100K_wines.csv'
|
| 36 |
|
| 37 |
if os.path.exists(csv_filename):
|
|
|
|
| 38 |
return csv_filename
|
| 39 |
|
| 40 |
# Convert Google Drive share link to direct download link
|
|
@@ -62,18 +63,28 @@ def load_and_preprocess_data():
|
|
| 62 |
csv_filename = download_data()
|
| 63 |
|
| 64 |
try:
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
| 66 |
except FileNotFoundError:
|
| 67 |
raise FileNotFoundError(f"CSV file '{csv_filename}' not found.")
|
| 68 |
|
| 69 |
def parse_list_string(s):
|
|
|
|
|
|
|
| 70 |
try:
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
except (ValueError, SyntaxError):
|
| 73 |
return []
|
| 74 |
|
| 75 |
-
|
| 76 |
-
df['
|
|
|
|
| 77 |
df['main_grape'] = df['grapes_list'].apply(lambda x: x[0] if x else 'Unknown')
|
| 78 |
df['num_grapes'] = df['grapes_list'].apply(len)
|
| 79 |
df['body_numeric'] = df['Body'].map(BODY_MAPPING)
|
|
@@ -81,9 +92,14 @@ def load_and_preprocess_data():
|
|
| 81 |
|
| 82 |
|
| 83 |
# --- OPTIMIZATION 2: Vectorized Data Aggregation ---
|
| 84 |
-
def get_top_food_pairings(
|
| 85 |
-
"""Get top N food pairings with emojis and names."""
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
if not all_pairings:
|
| 88 |
return {'emojis': '🍽️', 'names': 'General'}
|
| 89 |
|
|
@@ -123,14 +139,24 @@ def aggregate_wine_data(df, wine_types, max_grape_count, min_samples_choice, reg
|
|
| 123 |
if agg_df.empty:
|
| 124 |
return agg_df
|
| 125 |
|
| 126 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
agg_df['body_dist'] = agg_df['body_list'].apply(
|
| 128 |
-
lambda x: (
|
| 129 |
agg_df['acid_dist'] = agg_df['acidity_list'].apply(
|
| 130 |
-
lambda x: (
|
| 131 |
-
# --- END OF FIX ---
|
| 132 |
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
agg_df['pairing_emoji'] = agg_df['pairing_data'].apply(lambda x: x['emojis'])
|
| 135 |
agg_df['pairing_names'] = agg_df['pairing_data'].apply(lambda x: x['names'])
|
| 136 |
agg_df['wine_type_order'] = agg_df['Type'].map(WINE_TYPE_ORDER)
|
|
|
|
| 35 |
csv_filename = 'XWines_Full_100K_wines.csv'
|
| 36 |
|
| 37 |
if os.path.exists(csv_filename):
|
| 38 |
+
print(f"Using existing dataset: {csv_filename}")
|
| 39 |
return csv_filename
|
| 40 |
|
| 41 |
# Convert Google Drive share link to direct download link
|
|
|
|
| 63 |
csv_filename = download_data()
|
| 64 |
|
| 65 |
try:
|
| 66 |
+
print("Loading CSV data...")
|
| 67 |
+
# low_memory=False parses the file in one pass (no chunked, mixed-dtype inference); no usecols/dtype narrowing is applied yet
|
| 68 |
+
df = pd.read_csv(csv_filename, low_memory=False)
|
| 69 |
+
print(f"Loaded {len(df):,} wine records")
|
| 70 |
except FileNotFoundError:
|
| 71 |
raise FileNotFoundError(f"CSV file '{csv_filename}' not found.")
|
| 72 |
|
| 73 |
def parse_list_string(s):
    """Parse a stringified Python list (e.g. "['Merlot', 'Syrah']").

    Args:
        s: Raw cell value. Anything that is not a string (NaN, None,
            numbers) or is blank yields an empty list.

    Returns:
        The evaluated literal when the stripped text is wrapped in square
        brackets and is a valid Python literal; otherwise an empty list.
    """
    if not isinstance(s, str):
        return []

    text = s.strip()
    # Only bracketed text can be a list literal; skip the parser otherwise.
    # (An empty string fails this check too, so no separate blank test.)
    if not (text.startswith('[') and text.endswith(']')):
        return []

    try:
        # literal_eval only evaluates literals, never arbitrary expressions.
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval can raise more than ValueError/SyntaxError, e.g.
        # TypeError for "[{[]: 1}]" (unhashable key). A malformed cell
        # should become an empty list rather than abort the whole load.
        return []
|
| 84 |
|
| 85 |
+
# Parse the stringified list columns row by row; missing values become empty lists
|
| 86 |
+
df['grapes_list'] = df['Grapes'].fillna('[]').apply(parse_list_string)
|
| 87 |
+
df['harmonize_list'] = df['Harmonize'].fillna('[]').apply(parse_list_string)
|
| 88 |
df['main_grape'] = df['grapes_list'].apply(lambda x: x[0] if x else 'Unknown')
|
| 89 |
df['num_grapes'] = df['grapes_list'].apply(len)
|
| 90 |
df['body_numeric'] = df['Body'].map(BODY_MAPPING)
|
|
|
|
| 92 |
|
| 93 |
|
| 94 |
# --- OPTIMIZATION 2: Vectorized Data Aggregation ---
|
| 95 |
+
def get_top_food_pairings(harmonize_list, top_n=3):
|
| 96 |
+
"""Get top N food pairings with emojis and names - optimized version."""
|
| 97 |
+
# Flatten list more efficiently
|
| 98 |
+
all_pairings = []
|
| 99 |
+
for sublist in harmonize_list:
|
| 100 |
+
if isinstance(sublist, list):
|
| 101 |
+
all_pairings.extend(sublist)
|
| 102 |
+
|
| 103 |
if not all_pairings:
|
| 104 |
return {'emojis': '🍽️', 'names': 'General'}
|
| 105 |
|
|
|
|
| 139 |
if agg_df.empty:
|
| 140 |
return agg_df
|
| 141 |
|
| 142 |
+
# Optimized distribution calculation
|
| 143 |
+
def calc_distribution(values_list, categories):
    """Return the percentage share of each category among the given values.

    Args:
        values_list: Observed labels for one group (list, tuple, array or
            Series). Missing entries (None/NaN) are dropped by
            value_counts, so shares are relative to non-missing values.
        categories: Labels to report; the result has one key per label,
            in this order.

    Returns:
        Dict mapping every label in categories to a plain Python float in
        the range 0-100. Labels that never occur, and empty or all-missing
        input, map to 0.0.
    """
    # Wrap first and test .empty instead of `not values_list`: truthiness
    # of a multi-element array/Series raises ValueError. dtype=object keeps
    # the labels as-is and avoids dtype inference on empty input.
    observed = pd.Series(values_list, dtype=object)
    if observed.empty:
        return {cat: 0.0 for cat in categories}

    shares = observed.value_counts(normalize=True) * 100
    # float() so every value has the same type as the 0.0 default
    # (value_counts yields numpy scalars).
    return {cat: float(shares.get(cat, 0.0)) for cat in categories}
|
| 149 |
+
|
| 150 |
agg_df['body_dist'] = agg_df['body_list'].apply(
|
| 151 |
+
lambda x: calc_distribution(x, BODY_ORDER))
|
| 152 |
agg_df['acid_dist'] = agg_df['acidity_list'].apply(
|
| 153 |
+
lambda x: calc_distribution(x, ACIDITY_ORDER))
|
|
|
|
| 154 |
|
| 155 |
+
# Pre-compute food pairings more efficiently
|
| 156 |
+
pairing_data = []
|
| 157 |
+
for harmonize_list in agg_df['harmonize_list']:
|
| 158 |
+
pairing_data.append(get_top_food_pairings(harmonize_list))
|
| 159 |
+
agg_df['pairing_data'] = pairing_data
|
| 160 |
agg_df['pairing_emoji'] = agg_df['pairing_data'].apply(lambda x: x['emojis'])
|
| 161 |
agg_df['pairing_names'] = agg_df['pairing_data'].apply(lambda x: x['names'])
|
| 162 |
agg_df['wine_type_order'] = agg_df['Type'].map(WINE_TYPE_ORDER)
|