yusenthebot commited on
Commit
81e637f
·
1 Parent(s): d78eb74

Initial deployment

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gradio/certificate.pem +31 -0
  2. README.md +152 -12
  3. app.py +592 -0
  4. frige_detect/__pycache__/detect.cpython-313.pyc +0 -0
  5. frige_detect/annotated_image.jpg +0 -0
  6. frige_detect/demo/t1.jpg +0 -0
  7. frige_detect/demo/t2.jpg +0 -0
  8. frige_detect/demo/t3.jpg +0 -0
  9. frige_detect/demo/t4.jpg +0 -0
  10. frige_detect/detect.py +208 -0
  11. frige_detect/recipe_input.json +86 -0
  12. frige_detect/roboflow_credentials.txt +4 -0
  13. recipe_recommendation/__init__.py +0 -0
  14. recipe_recommendation/__pycache__/__init__.cpython-313.pyc +0 -0
  15. recipe_recommendation/__pycache__/main.cpython-313.pyc +0 -0
  16. recipe_recommendation/data/ingredient_map.data +0 -0
  17. recipe_recommendation/main.py +652 -0
  18. recipe_recommendation/readme.txt +142 -0
  19. recipe_recommendation/readme_cn.txt +92 -0
  20. recipe_recommendation/src/__init__.py +0 -0
  21. recipe_recommendation/src/__pycache__/__init__.cpython-313.pyc +0 -0
  22. recipe_recommendation/src/__pycache__/candidate.cpython-313.pyc +0 -0
  23. recipe_recommendation/src/__pycache__/coldstart.cpython-313.pyc +0 -0
  24. recipe_recommendation/src/__pycache__/embedding.cpython-313.pyc +0 -0
  25. recipe_recommendation/src/__pycache__/feature.cpython-313.pyc +0 -0
  26. recipe_recommendation/src/__pycache__/highlight.cpython-313.pyc +0 -0
  27. recipe_recommendation/src/__pycache__/io.cpython-313.pyc +0 -0
  28. recipe_recommendation/src/__pycache__/trainmodel.cpython-313.pyc +0 -0
  29. recipe_recommendation/src/candidate.py +365 -0
  30. recipe_recommendation/src/coldstart.py +279 -0
  31. recipe_recommendation/src/embedding.py +100 -0
  32. recipe_recommendation/src/feature.py +176 -0
  33. recipe_recommendation/src/highlight.py +91 -0
  34. recipe_recommendation/src/io.py +37 -0
  35. recipe_recommendation/src/trainmodel.py +237 -0
  36. recipe_recommendation/user_data/demo_user_1/user_profile.json +28 -0
  37. recipe_recommendation/user_data/user_0/feature_order.json +22 -0
  38. recipe_recommendation/user_data/user_0/feedback.csv +2 -0
  39. recipe_recommendation/user_data/user_0/qid.txt +1 -0
  40. recipe_recommendation/user_data/user_0/ranker.pkl +3 -0
  41. recipe_recommendation/user_data/user_0/user_features_rank.csv +0 -0
  42. recipe_recommendation/user_data/user_0/user_profile.json +26 -0
  43. recipe_recommendation/user_data/user_1/feature_order.json +22 -0
  44. recipe_recommendation/user_data/user_1/feedback.csv +3 -0
  45. recipe_recommendation/user_data/user_1/qid.txt +1 -0
  46. recipe_recommendation/user_data/user_1/ranker.pkl +3 -0
  47. recipe_recommendation/user_data/user_1/user_features_rank.csv +0 -0
  48. recipe_recommendation/user_data/user_1/user_profile.json +26 -0
  49. recipe_recommendation/user_data/user_2/feature_order.json +22 -0
  50. recipe_recommendation/user_data/user_2/feedback.csv +2 -0
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,12 +1,152 @@
1
- ---
2
- title: SmartFridge
3
- emoji: 🐨
4
- colorFrom: purple
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.49.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Smart Fridge Recipe Assistant
2
+
3
+ The Smart Fridge Recipe Assistant combines Roboflow-powered ingredient detection with a multi-stage recipe recommendation engine. Upload a photo of your fridge and instantly receive recipe ideas that respect your dietary preferences, nutritional goals, and ingredient availability.
4
+
5
+ ![Smart Fridge workflow](frige_detect/annotated_image.jpg)
6
+
7
+ ## Features
8
+
9
+ - **Visual ingredient detection** – Uses a Roboflow YOLO model to detect fridge items, annotate the photo, and build a structured ingredient payload.
10
+ - **Robust recipe ranking pipeline** – Performs coarse ranking, ML reranking, and clustering-based diversification using pretrained user profiles.
11
+ - **Personalized dietary controls** – Configure vegetarian style, allergies, preferred cuisines, macro ranges, and cooking time caps directly in the UI.
12
+ - **Interactive feedback loop** – Record positive feedback for recommended recipes to continuously refine personal models.
13
+ - **One-click examples** – Try the demo instantly with bundled sample fridge photos.
14
+
15
+ ## Project structure
16
+
17
+ ```
18
+ smartFridge/
19
+ ├── app.py # Gradio user interface
20
+ ├── frige_detect/ # Roboflow detector & demo assets
21
+ │ ├── detect.py
22
+ │ ├── demo/
23
+ │ └── roboflow_credentials.txt
24
+ ├── recipe_recommendation/ # Recommendation engine
25
+ │ ├── main.py
26
+ │ ├── src/
27
+ │ └── user_data/
28
+ ├── requirements.txt
29
+ └── README.md
30
+ ```
31
+
32
+ ## Installation
33
+
34
+ 1. Create a new Python environment (recommended).
35
+ 2. Install dependencies:
36
+
37
+ ```bash
38
+ pip install -r requirements.txt
39
+ ```
40
+
41
+ The Roboflow API key and project information used by the detector are stored in `frige_detect/roboflow_credentials.txt` and loaded automatically; no manual input is required.
42
+
43
+ ```markdown
44
+ ## Running the app locally
45
+
46
+ ```bash
47
+ python app.py
48
+ ```
49
+
50
+ This command launches a Gradio web interface with share link enabled. In the browser you can:
51
+
52
+ ### Core Features
53
+
54
+ **1. Quick Start with Examples**
55
+ - Select from predefined user profiles (user_1, user_2, user_3) with different dietary preferences
56
+ - Choose from example fridge images (t1.jpg, t2.jpg, t3.jpg)
57
+ - Mix and match any profile with any image for testing
58
+
59
+ **2. Custom User Profiles**
60
+ - Create new user profiles by entering a custom User ID
61
+ - Configure comprehensive dietary preferences:
62
+ - **Vegetarian type**: flexible, flexible_vegetarian, ovo_vegetarian, lacto_vegetarian, vegan, non_vegetarian
63
+ - **Allergies**: comma-separated list (e.g., "peanut, shrimp")
64
+ - **Region preferences**: comma-separated (e.g., "Asia, Europe")
65
+ - **Nutritional goals**:
66
+ - Calorie range (min/max sliders from 0-4000)
67
+ - Protein range (min/max sliders from 0-250g)
68
+ - **Ingredient preferences**:
69
+ - Preferred main ingredients (e.g., "chicken, tofu")
70
+ - Disliked main ingredients (e.g., "lamb, beef")
71
+ - **Cooking time limit**: maximum cooking time in minutes (0-180)
72
+
73
+ **3. Smart Fridge Detection & Recipe Recommendation**
74
+ - Upload your own fridge photo or use example images
75
+ - Click **"Analyze fridge & recommend recipes"**
76
+ - The system will:
77
+ - Detect ingredients using the Roboflow computer vision model
78
+ - Map detected items to parent ingredient categories
79
+ - Filter recipes based on your dietary restrictions, nutrition goals, and disliked ingredients
80
+ - Score and rank recipes using ML-based personalization
81
+ - Apply region preference boosting and ingredient matching
82
+ - Diversify results using KMeans clustering to ensure variety
83
+
84
+ **4. Automatic Profile Management**
85
+ - User profiles are **automatically saved/updated** every time you click "Analyze"
86
+ - No manual save required - just modify preferences and run
87
+ - Feedback count is preserved when updating existing profiles
88
+ - All profiles stored under `recipe_recommendation/user_data/<user_id>/`
89
+
90
+ **5. Feedback System**
91
+ - Review the top 5 recommended recipes with detailed information:
92
+ - Recipe name and match score
93
+ - Region and cuisine type
94
+ - Nutritional information (calories, protein)
95
+ - Main, staple, and other ingredients used
96
+ - Select your favorite recipe from the dropdown
97
+ - Press **"Save feedback"** to log positive feedback
98
+ - Feedback is used to retrain personalized ranking models (every 20 feedback entries)
99
+
100
+ ### How the Recommendation Pipeline Works
101
+
102
+ 1. **Detection**: Roboflow model identifies ingredients in your fridge photo
103
+ 2. **Mapping**: Detected items are mapped to parent categories (e.g., "chicken breast" → "chicken")
104
+ 3. **Hard Filtering**:
105
+ - Removes recipes violating dietary restrictions (vegan/vegetarian)
106
+ - Filters out recipes outside your calorie/protein ranges
107
+ - Eliminates recipes containing disliked main ingredients
108
+ 4. **Coarse Ranking**: Fast ingredient matching across 20,000+ candidates
109
+ 5. **ML Reranking**: Personalized ranking using your trained model (or similar user's model)
110
+ 6. **Diversification**: KMeans clustering ensures variety in final recommendations
111
+ 7. **Top-K Selection**: Returns the best 5 recipes tailored to your preferences
112
+
113
+ All user profiles, feedback files, trained models, and feature rankings are stored under `recipe_recommendation/user_data/<user_id>/`.
114
+
115
+ ## Dataset & Models
116
+
117
+ ### Computer Vision Model
118
+ - **Fridge ingredient detection**: [Roboflow Nutrition Object Detection](https://universe.roboflow.com/ie-wqegj/nutrition-object-detection)
119
+ - Pre-trained model for detecting common food items in refrigerator images
120
+ - Provides bounding boxes and confidence scores for detected ingredients
121
+ - Credentials stored in `frige_detect/roboflow_credentials.txt`
122
+
123
+ ### Recipe Dataset
124
+ - **Recipe database**: Fetched from Hugging Face dataset [`Iris314/recipe-cleaned`](https://huggingface.co/datasets/Iris314/recipe-cleaned)
125
+ - **Ingredient mappings**: Hierarchical mapping from specific items to parent categories
126
+ - Both are automatically downloaded on first run and cached locally
127
+
128
+ ### Ranking Models
129
+ - User-specific ranking models are automatically:
130
+ - Bootstrapped using cold-start features for new users
131
+ - Copied from similar users (based on profile embedding similarity)
132
+ - Retrained every 20 feedback entries to improve personalization
133
+ - Models stored per user at `recipe_recommendation/user_data/<user_id>/ranker.pkl`
134
+
135
+ ## Deploying to Hugging Face Spaces
136
+
137
+ To deploy this application to Hugging Face Spaces:
138
+
139
+ 1. Create a new Space on Hugging Face with Gradio SDK
140
+ 2. Upload this repository to the Space
141
+ 3. Ensure `app.py` is set as the main application file
142
+ 4. The Space will automatically run `python app.py` on startup
143
+ 5. No additional environment variables or secrets required (Roboflow credentials are bundled)
144
+
145
+ The deployed app will have the same functionality as the local version, including persistent user profiles and feedback storage.
146
+
147
+ ## License
148
+
149
+ This project bundles third-party datasets and models subject to their respective licenses:
150
+ - Roboflow Nutrition Object Detection model: Subject to [Roboflow Terms of Service](https://roboflow.com/terms)
151
+ - Recipe dataset from Hugging Face: Check the [`Iris314/recipe-cleaned`](https://huggingface.co/datasets/Iris314/recipe-cleaned) dataset page for licensing details
152
+
app.py ADDED
@@ -0,0 +1,592 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio application for the smart fridge detector + recipe recommendation pipeline."""
2
+
3
+ import json
4
+ import tempfile
5
+ from pathlib import Path
6
+ from typing import List, Tuple, Dict, Any
7
+
8
+ import cv2
9
+ import gradio as gr
10
+ import numpy as np
11
+ from PIL import Image
12
+
13
+ from frige_detect.detect import (
14
+ detect_and_generate,
15
+ load_roboflow_credentials,
16
+ RoboflowCredentials,
17
+ )
18
+ from recipe_recommendation.main import (
19
+ load_recipes,
20
+ recommend_recipes,
21
+ save_user_profile,
22
+ get_feedback,
23
+ USER_DATA_DIR,
24
+ )
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Global resources
28
+ # ---------------------------------------------------------------------------
29
+ # Path to the bundled key=value credential file read once at import time.
+ CREDENTIALS_PATH = Path("frige_detect/roboflow_credentials.txt")
30
+ # Parsed Roboflow credentials shared by every detection request.
+ ROBOFLOW_CREDENTIALS: RoboflowCredentials = load_roboflow_credentials(str(CREDENTIALS_PATH))
31
+ # Full recipe catalogue loaded once at startup. NOTE(review): importing this
+ # module therefore performs file I/O (and whatever load_recipes does) as a
+ # side effect — confirm startup cost is acceptable for the Space.
+ RECIPES_DF = load_recipes()
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Predefined user profiles for examples
35
+ # ---------------------------------------------------------------------------
36
# Canned demo personas selectable from the "Quick Start" dropdown. Each key
# maps 1:1 onto the preference widgets in the UI (see load_example_profile).
EXAMPLE_PROFILES = {
    # Omnivore, North-American leaning, no restrictions.
    "user_1": {
        "vegetarian_type": "flexible",
        "allergies": "",
        "regions": "North America",
        "calorie_min": 250,
        "calorie_max": 2000,
        "protein_min": 50,
        "protein_max": 160,
        "preferred_main": "",
        "disliked_main": "",
        "cooking_time": 45,
    },
    # Mostly-vegetarian, Asian cuisine, shrimp allergy.
    "user_2": {
        "vegetarian_type": "flexible_vegetarian",
        "allergies": "shrimp",
        "regions": "Asia",
        "calorie_min": 400,
        "calorie_max": 1500,
        "protein_min": 40,
        "protein_max": 120,
        "preferred_main": "tofu",
        "disliked_main": "beef",
        "cooking_time": 60,
    },
    # High-protein meat eater, European cuisine.
    "user_3": {
        "vegetarian_type": "non_vegetarian",
        "allergies": "",
        "regions": "Europe",
        "calorie_min": 500,
        "calorie_max": 2000,
        "protein_min": 80,
        "protein_max": 160,
        "preferred_main": "beef, chicken",
        "disliked_main": "",
        "cooking_time": 45,
    },
}

# Bundled sample fridge photos offered in the example-image dropdown.
EXAMPLE_IMAGES = [
    "frige_detect/demo/t1.jpg",
    "frige_detect/demo/t2.jpg",
    "frige_detect/demo/t3.jpg",
]
81
+
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # Helper utilities
85
+ # ---------------------------------------------------------------------------
86
def parse_csv_list(text: str) -> List[str]:
    """Split a comma-separated string into trimmed, non-empty tokens."""
    if not text:
        return []
    tokens = []
    for raw in text.split(","):
        token = raw.strip()
        if token:
            tokens.append(token)
    return tokens
91
+
92
+
93
def ensure_numpy_image(image: Any) -> np.ndarray:
    """Normalize an uploaded image (PIL image or ndarray) to an RGB numpy array.

    Raises:
        ValueError: if no image was provided or the type is unsupported.
    """
    if image is None:
        raise ValueError("Please upload a fridge photo before running detection.")
    if isinstance(image, np.ndarray):
        # Already numeric — returned untouched (channel order is the caller's).
        return image
    if isinstance(image, Image.Image):
        rgb = image.convert("RGB")
        return np.array(rgb)
    raise ValueError("Unsupported image format provided.")
102
+
103
+
104
def write_temp_image(image: np.ndarray) -> str:
    """Persist an RGB numpy image as a JPEG in a fresh temp dir; return its path."""
    target = Path(tempfile.mkdtemp(prefix="fridge_upload_")) / "upload.jpg"
    # OpenCV encodes images in BGR channel order, so convert before writing.
    cv2.imwrite(str(target), cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
    return str(target)
111
+
112
+
113
def build_user_profile(
    user_id: str,
    vegetarian_type: str,
    allergies: str,
    regions: str,
    calorie_range: Tuple[float, float],
    protein_range: Tuple[float, float],
    preferred_main: str,
    disliked_main: str,
    cooking_time: float,
) -> Dict[str, Any]:
    """Assemble the user-profile dict from the current UI values and persist it.

    The profile is ALWAYS written (created or overwritten), so edits made in
    the form take effect on the very next run. The accumulated feedback
    counter is carried over from any existing profile on disk.

    Raises:
        ValueError: if ``user_id`` is blank after stripping whitespace.
    """
    user_id = user_id.strip()
    if not user_id:
        raise ValueError("User ID cannot be empty.")

    profile_path = USER_DATA_DIR / user_id / "user_profile.json"

    # Carry the feedback counter forward; unreadable/corrupt files reset it.
    num_feedback = 0
    if profile_path.exists():
        try:
            previous = json.loads(profile_path.read_text(encoding="utf-8"))
            num_feedback = previous.get("num_feedback", 0)
        except Exception:
            pass

    cal_min, cal_max = calorie_range
    prot_min, prot_max = protein_range
    profile = {
        "user_id": user_id,
        "num_feedback": num_feedback,
        "diet": {"vegetarian_type": vegetarian_type},
        "allergies": parse_csv_list(allergies),
        "region_preference": parse_csv_list(regions),
        "nutritional_goals": {
            "calories": {"min": int(cal_min), "max": int(cal_max)},
            "protein": {"min": int(prot_min), "max": int(prot_max)},
        },
        "other_preferences": {
            "preferred_main": parse_csv_list(preferred_main),
            "disliked_main": parse_csv_list(disliked_main),
            # A 0/None cooking time means "no limit".
            "cooking_time_max": int(cooking_time) if cooking_time else None,
        },
    }

    # Create a new profile or overwrite the existing one.
    save_user_profile(user_id, profile)
    print(f"[app] Profile saved/updated for user '{user_id}'")

    return profile
166
+
167
+
168
def summarize_ingredients(
    user_parents: List[str],
    high_conf: List[str],
    low_conf: List[str],
) -> str:
    """Render the detection mapping as a small Markdown bullet list."""
    mapped = ", ".join(sorted(user_parents)) if user_parents else "none"
    parts = [
        "### Ingredient Mapping",
        "- **Mapped parent ingredients:** " + mapped,
    ]
    if high_conf:
        parts.append("- **High confidence detections:** " + ", ".join(sorted(high_conf)))
    if low_conf:
        # Low-confidence labels may repeat; de-duplicate before sorting.
        parts.append("- **Low confidence detections:** " + ", ".join(sorted(set(low_conf))))
    return "\n".join(parts)
183
+
184
+
185
+ def _ensure_iterable(value: Any) -> List[str]:
186
+ if value is None:
187
+ return []
188
+ if isinstance(value, set):
189
+ return sorted(value)
190
+ if isinstance(value, list):
191
+ return value
192
+ if isinstance(value, str):
193
+ return [value]
194
+ return list(value)
195
+
196
+
197
def render_recommendations(df) -> Tuple[str, List[Dict[str, Any]]]:
    """Render the top-5 rows of a recommendation DataFrame as Markdown.

    Args:
        df: DataFrame of scored recipes (or None/empty when nothing matched).

    Returns:
        (markdown, feedback_rows): the Markdown text for the UI, and one dict
        per rendered recipe with list/CSV fields normalized to sets, in the
        exact order the recipes are displayed (used by the feedback handler).
    """

    def _split_csv(text: str) -> List[str]:
        # Same contract as parse_csv_list: trimmed, non-empty tokens.
        return [item.strip() for item in text.split(",") if item.strip()]

    def _as_list(value: Any) -> List[str]:
        # Same contract as _ensure_iterable: normalize scalars/collections.
        if value is None:
            return []
        if isinstance(value, set):
            return sorted(value)
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            return [value]
        return list(value)

    if df is None or df.empty:
        return "No recipes matched the current constraints.", []

    lines = ["### Recommended Recipes"]
    feedback_rows: List[Dict[str, Any]] = []

    # BUGFIX: enumerate positionally instead of using the DataFrame index.
    # After filtering/reranking the index is not guaranteed to be 0..4, and
    # the "<rank>." prefix emitted here is parsed back by record_feedback,
    # so the rank must be 1-based and contiguous.
    for rank, (_, row) in enumerate(df.head(5).iterrows(), start=1):
        score = row.get("match_score")
        if score is None or (isinstance(score, float) and np.isnan(score)):
            # Fall back to the ML score only when match_score is truly absent;
            # the old `a or b` idiom also discarded a legitimate 0.0 score.
            score = row.get("ml_score", 0)
        scaled = (score if score is not None else 0) * 100
        name = row.get("name", f"Recipe {rank}")
        lines.append(f"{rank}. **{name}** — score {scaled:.1f}%")

        region = row.get("region")
        if region and not (isinstance(region, float) and np.isnan(region)):
            if isinstance(region, (set, list)):
                region_str = ", ".join(sorted(region))
            else:
                region_str = str(region)
            lines.append(f" - Region: {region_str}")

        cuisine_items = _as_list(row.get("cuisine_attr"))
        if cuisine_items:
            lines.append(f" - Cuisine: {', '.join(cuisine_items)}")

        calories = row.get("calories")
        protein = row.get("protein")
        if calories is not None:
            lines.append(f" - Calories: {calories}")
        if protein is not None:
            lines.append(f" - Protein: {protein}")

        for key in ["main_parent", "staple_parent", "other_parent"]:
            parents = _as_list(row.get(key))
            if parents:
                pretty_key = key.replace("_", " ").title()
                lines.append(f" - {pretty_key}: {', '.join(parents)}")

        ingredients = row.get("ingredients")
        if ingredients:
            if isinstance(ingredients, str):
                ingredients_list = _split_csv(ingredients)
            else:
                ingredients_list = list(ingredients)
            if ingredients_list:
                # Cap the list so one recipe doesn't dominate the panel.
                lines.append(f" - Ingredients: {', '.join(ingredients_list[:10])}")
        lines.append("")

        # The downstream feedback pipeline expects set-valued fields.
        feedback_row = row.to_dict()
        for key in ["main_parent", "staple_parent", "other_parent", "seasoning_parent", "cuisine_attr", "ingredients"]:
            value = feedback_row.get(key)
            if isinstance(value, list):
                feedback_row[key] = set(value)
            elif isinstance(value, str):
                feedback_row[key] = set(_split_csv(value))
        feedback_rows.append(feedback_row)

    return "\n".join(lines).strip(), feedback_rows
256
+
257
+
258
def load_example_profile(profile_name: str):
    """Map a predefined profile name onto the tuple of UI field values."""
    config = EXAMPLE_PROFILES.get(profile_name)
    if config is None:
        # Unknown selection: reset the form to neutral defaults.
        return ("user_custom", "flexible", "", "", 400, 2000, 50, 160, "", "", 45)

    # Order must match the `outputs` list wired to profile_selector.change.
    field_order = (
        "vegetarian_type",
        "allergies",
        "regions",
        "calorie_min",
        "calorie_max",
        "protein_min",
        "protein_max",
        "preferred_main",
        "disliked_main",
        "cooking_time",
    )
    return (profile_name,) + tuple(config[key] for key in field_order)
277
+
278
+
279
def load_example_image(image_path: str):
    """Identity passthrough — the dropdown value is already a usable file path."""
    return image_path
282
+
283
+
284
def run_pipeline(
    image,
    user_id,
    vegetarian_type,
    allergies,
    regions,
    calorie_min,
    calorie_max,
    protein_min,
    protein_max,
    preferred_main,
    disliked_main,
    cooking_time,
):
    """End-to-end handler for the "Analyze" button.

    Always saves/updates the user profile from the current form values, runs
    Roboflow detection on the uploaded photo, then asks the recommender for
    the top recipes. Returns the 7-tuple of Gradio outputs; on failure every
    slot carries None/empty or the error text instead.
    """
    try:
        # --- Detection ------------------------------------------------------
        rgb_image = ensure_numpy_image(image)
        upload_path = write_temp_image(rgb_image)
        out_dir = Path(tempfile.mkdtemp(prefix="fridge_outputs_"))
        output_json = out_dir / "recipe_input.json"
        output_image = out_dir / "annotated_image.jpg"

        detection_result = detect_and_generate(
            image_path=upload_path,
            credentials=ROBOFLOW_CREDENTIALS,
            conf_threshold=0.4,
            overlap_threshold=0.3,
            conf_split=0.7,
            output_json=str(output_json),
            output_image=str(output_image),
        )
        # The uploaded copy is no longer needed once detection has run.
        Path(upload_path).unlink(missing_ok=True)

        # --- Profile: always created/updated from the current UI values -----
        profile = build_user_profile(
            user_id,
            vegetarian_type,
            allergies,
            regions,
            (calorie_min, calorie_max),
            (protein_min, protein_max),
            preferred_main,
            disliked_main,
            cooking_time,
        )

        # NOTE(review): purpose of this pause is not documented — presumably
        # it lets the profile write settle before the recommender reads it
        # back from disk; confirm whether it is still needed.
        import time
        time.sleep(0.2)

        # --- Recommendation --------------------------------------------------
        detection_payload = detection_result["recipe_json"]
        ml_top, user_parents, high_conf, low_conf = recommend_recipes(
            detection_payload,
            user_id,
            RECIPES_DF,
            topk=5,
        )

        ingredient_summary = summarize_ingredients(user_parents, high_conf, low_conf)
        recommendation_md, feedback_rows = render_recommendations(ml_top)

        dropdown_choices = [
            f"{idx + 1}. {row.get('name', 'Recipe')}" for idx, row in enumerate(feedback_rows)
        ]
        status = "" if feedback_rows else "No recipes available for feedback yet."
        profile_status = f"✓ Profile '{user_id}' has been saved/updated with your current preferences."

        return (
            str(output_image),
            detection_payload,
            ingredient_summary,
            recommendation_md,
            gr.Dropdown(choices=dropdown_choices, value=None),
            feedback_rows,
            profile_status,
        )
    except Exception as exc:
        # Surface the full traceback in the UI so Space users can report it.
        import traceback
        error_detail = traceback.format_exc()
        return (
            None,
            None,
            "",
            f"⚠️ Error: {exc}\n\nDetails:\n{error_detail}",
            gr.Dropdown(choices=[], value=None),
            [],
            f"⚠️ Error: {exc}",
        )
379
+
380
+
381
def record_feedback(selected_recipe: str, user_id: str, feedback_rows: List[Dict[str, Any]]):
    """Log positive feedback for the chosen recipe and bump the user's counter.

    Returns a human-readable status string for the UI in every case.
    """
    # Guard clauses for incomplete UI state.
    if not selected_recipe:
        return "Please select a recipe before submitting feedback."
    if not user_id:
        return "Please provide a valid user ID."
    if not feedback_rows:
        return "No recommendation data available. Run the pipeline first."

    # Dropdown labels look like "3. Recipe name" — recover the 1-based rank.
    try:
        index = int(selected_recipe.split(".")[0]) - 1
    except (ValueError, IndexError):
        return "Unable to parse the selected recipe."
    if not 0 <= index < len(feedback_rows):
        return "Selected recipe is out of range."

    chosen = feedback_rows[index]
    get_feedback(user_id, chosen)

    # Increment the persisted feedback counter when a profile exists on disk.
    profile_path = USER_DATA_DIR / user_id / "user_profile.json"
    if profile_path.exists():
        profile = json.loads(profile_path.read_text(encoding="utf-8"))
        profile["num_feedback"] = profile.get("num_feedback", 0) + 1
        save_user_profile(user_id, profile)

    return f"✓ Feedback recorded for {chosen.get('name', 'selected recipe')}!"
407
+
408
+
409
+ # ---------------------------------------------------------------------------
410
+ # Gradio UI definition
411
+ # ---------------------------------------------------------------------------
412
with gr.Blocks(title="Smart Fridge Recipe Assistant", theme=gr.themes.Soft()) as demo:
    # Header / usage instructions.
    gr.Markdown(
        """
        # Smart Fridge Recipe Assistant
        **How to use:**
        1. (Optional) Select an example profile and/or image from dropdowns
        2. Modify any preferences in the form - your profile will be saved automatically when you click Analyze
        3. Upload or select a fridge image
        4. Click "Analyze fridge & recommend recipes"
        """
    )

    with gr.Row():
        # Left column: quick-start examples, photo upload, detection outputs.
        with gr.Column(scale=1):
            gr.Markdown("### Quick Start Examples")
            profile_selector = gr.Dropdown(
                label="Choose a predefined user profile",
                choices=list(EXAMPLE_PROFILES.keys()),
                value=None,
            )
            image_selector = gr.Dropdown(
                label="Choose an example fridge image",
                choices=[f"Image {i+1}: {img}" for i, img in enumerate(EXAMPLE_IMAGES)],
                value=None,
            )
            image_input = gr.Image(
                label="Fridge photo (upload or use example)",
                type="pil",
                height=350,
            )
            detection_json = gr.JSON(label="Detection payload")
            annotated_output = gr.Image(label="Annotated detection", height=350)

        # Right column: preference form, run button, results, and feedback.
        with gr.Column(scale=1):
            gr.Markdown("### User Preferences (auto-saved on each run)")
            user_id_box = gr.Textbox(
                label="User ID (will create new profile if doesn't exist)",
                value="user_custom",
                placeholder="e.g. my_new_profile",
            )
            vegetarian_radio = gr.Radio(
                [
                    "flexible",
                    "flexible_vegetarian",
                    "ovo_vegetarian",
                    "lacto_vegetarian",
                    "vegan",
                    "non_vegetarian",
                ],
                label="Vegetarian preference",
                value="flexible",
            )
            allergies_box = gr.Textbox(
                label="Allergies (comma separated)",
                placeholder="peanut, shrimp",
            )
            regions_box = gr.Textbox(
                label="Preferred regions (comma separated)",
                placeholder="Asia, Europe",
            )
            calorie_min = gr.Slider(minimum=0, maximum=4000, value=400, label="Minimum Calories", step=50)
            calorie_max = gr.Slider(minimum=0, maximum=4000, value=2000, label="Maximum Calories", step=50)
            protein_min = gr.Slider(minimum=0, maximum=250, value=50, label="Minimum Protein (g)", step=5)
            protein_max = gr.Slider(minimum=0, maximum=250, value=160, label="Maximum Protein (g)", step=5)
            preferred_box = gr.Textbox(
                label="Preferred main ingredients",
                placeholder="chicken, tofu",
            )
            disliked_box = gr.Textbox(
                label="Disliked main ingredients",
                placeholder="lamb",
            )
            cooking_slider = gr.Slider(
                minimum=0,
                maximum=180,
                value=45,
                step=5,
                label="Max cooking time (minutes)",
            )
            run_button = gr.Button("Analyze fridge & recommend recipes", variant="primary")
            ingredient_md = gr.Markdown()
            recommendation_md = gr.Markdown()
            feedback_dropdown = gr.Dropdown(label="Select a recipe for positive feedback", choices=[])
            feedback_button = gr.Button("Save feedback")
            feedback_status = gr.Markdown()
            feedback_state = gr.State([])

    # The profile widgets, in the order load_example_profile returns them and
    # run_pipeline consumes them.
    _profile_fields = [
        user_id_box,
        vegetarian_radio,
        allergies_box,
        regions_box,
        calorie_min,
        calorie_max,
        protein_min,
        protein_max,
        preferred_box,
        disliked_box,
        cooking_slider,
    ]

    # Populate the form when an example profile is picked.
    profile_selector.change(
        fn=load_example_profile,
        inputs=[profile_selector],
        outputs=_profile_fields,
    )

    def select_image(choice):
        """Resolve an 'Image N: path' dropdown label back to its file path."""
        if choice:
            idx = int(choice.split(":")[0].replace("Image ", "")) - 1
            return EXAMPLE_IMAGES[idx]
        return None

    image_selector.change(
        fn=select_image,
        inputs=[image_selector],
        outputs=[image_input],
    )

    run_button.click(
        fn=run_pipeline,
        inputs=[image_input] + _profile_fields,
        outputs=[
            annotated_output,
            detection_json,
            ingredient_md,
            recommendation_md,
            feedback_dropdown,
            feedback_state,
            feedback_status,
        ],
    )

    feedback_button.click(
        fn=record_feedback,
        inputs=[feedback_dropdown, user_id_box, feedback_state],
        outputs=feedback_status,
    )

if __name__ == "__main__":
    # share=True exposes a public Gradio link when run locally.
    demo.launch(share=True)
frige_detect/__pycache__/detect.cpython-313.pyc ADDED
Binary file (8.28 kB). View file
 
frige_detect/annotated_image.jpg ADDED
frige_detect/demo/t1.jpg ADDED
frige_detect/demo/t2.jpg ADDED
frige_detect/demo/t3.jpg ADDED
frige_detect/demo/t4.jpg ADDED
frige_detect/detect.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Detect ingredients using a Roboflow model with preprocessing:
4
+ - Resize images to 640x640 if needed.
5
+ - Perform detection.
6
+ - Classify object sizes via K-Means.
7
+ - Generate JSON and annotated image outputs.
8
+ """
9
+
10
+ import json
11
+ import os
12
+ import tempfile
13
+ from dataclasses import dataclass
14
+
15
+ import cv2
16
+ import numpy as np
17
+ from roboflow import Roboflow
18
+ from sklearn.cluster import KMeans
19
+ import supervision as sv
20
+
21
+
22
@dataclass
class RoboflowCredentials:
    """Connection settings for a Roboflow-hosted detection model.

    Attributes:
        api_key: Roboflow account API key.
        project_name: Roboflow project slug.
        version: Model version number (defaults to the first version).
    """

    api_key: str
    project_name: str
    version: int = 1
27
+
28
+
29
def load_roboflow_credentials(path: str) -> RoboflowCredentials:
    """Load Roboflow API credentials from a simple key=value text file.

    Blank lines, `#` comments, and lines without `=` are ignored.

    Raises:
        FileNotFoundError: If the credential file does not exist.
        ValueError: If api_key/project_name are missing or version is not an int.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Roboflow credential file not found: {path}."
        )

    # Collected values; api_key and project_name are mandatory, version optional.
    fields = {"api_key": None, "project_name": None, "version": 1}

    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            entry = raw.strip()
            # Skip blanks, comments, and anything without a key=value shape.
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            key, _, value = entry.partition("=")
            key = key.strip().lower()
            value = value.strip()
            if key == "version":
                try:
                    fields["version"] = int(value)
                except ValueError:
                    raise ValueError("Version in credential file must be an integer") from None
            elif key in ("api_key", "project_name"):
                fields[key] = value

    if not fields["api_key"] or not fields["project_name"]:
        raise ValueError(
            "Credential file must contain api_key and project_name entries."
        )

    return RoboflowCredentials(
        api_key=fields["api_key"],
        project_name=fields["project_name"],
        version=fields["version"],
    )
66
+
67
def compute_area_ratios(predictions, img_shape):
    """Compute the bbox-area / image-area ratio for each detection.

    Args:
        predictions: Roboflow prediction dicts with "width"/"height" keys.
        img_shape: Image shape tuple; only the first two dims (H, W) are used.

    Returns:
        numpy array of shape (n, 1), suitable as K-Means input.
    """
    image_area = float(img_shape[0] * img_shape[1])
    ratios = [p["width"] * p["height"] / image_area for p in predictions]
    return np.array(ratios).reshape(-1, 1)
75
+
76
def cluster_sizes(area_ratios):
    """Cluster area ratios into "large"/"small" groups via 2-means.

    Bug fix: KMeans with n_clusters=2 raises when there are fewer than two
    samples, so images with zero or one detection crashed the pipeline.
    Zero detections now yield an empty list and a single detection is
    labelled "large" (there is no relative basis for a split).

    Args:
        area_ratios: (n, 1) array of bbox-area / image-area ratios.

    Returns:
        list[str]: "large" or "small" for each detection, in input order.
    """
    n_samples = len(area_ratios)
    if n_samples == 0:
        return []
    if n_samples == 1:
        # No basis for a relative size split; treat the lone item as "large".
        return ["large"]

    kmeans = KMeans(n_clusters=2, init="k-means++", random_state=0)
    labels = kmeans.fit_predict(area_ratios)
    centroids = kmeans.cluster_centers_.flatten()
    # The cluster whose centroid has the larger mean ratio is "large".
    large_cluster = np.argmax(centroids)
    return ["large" if lbl == large_cluster else "small" for lbl in labels]
83
+
84
def detect_and_generate(
    image_path: str,
    credentials: RoboflowCredentials,
    conf_threshold: float = 0.4,
    overlap_threshold: float = 0.3,
    conf_split: float = 0.7,
    output_json: str = "recipe_input.json",
    output_image: str = "annotated_image.jpg"
):
    """
    Resize the image if necessary, run detection, classify sizes via K-Means,
    and create both a JSON output and an annotated image.

    Fixes: the docstring previously documented api_key/project_name/version
    parameters that no longer exist (replaced by `credentials`); the temp
    resized file leaked if detection raised (now cleaned in a finally block);
    annotation is skipped when there are no predictions.

    Args:
        image_path (str): Path to the original image.
        credentials (RoboflowCredentials): API key, project name, and model
            version used to initialize the Roboflow model.
        conf_threshold (float): Minimum confidence threshold (0-1).
        overlap_threshold (float): NMS overlap threshold (0-1).
        conf_split (float): Threshold separating high/low confidence lists.
        output_json (str): Output JSON filename.
        output_image (str): Output annotated image filename.

    Returns:
        dict: {"recipe_json": ..., "output_json_path": ..., "annotated_image_path": ...}

    Raises:
        FileNotFoundError: If image_path cannot be read.
    """
    # Load original image
    original_img = cv2.imread(image_path)
    if original_img is None:
        raise FileNotFoundError(f"Image not found: {image_path}")

    height, width = original_img.shape[:2]

    # Preprocess: the model expects 640x640 input; resize via a temp file.
    tmp_path = None
    if (height, width) != (640, 640):
        resized_img = cv2.resize(original_img, (640, 640))
        # mkstemp + close keeps Windows from locking the file while cv2 writes.
        fd, tmp_path = tempfile.mkstemp(suffix=".jpg")
        os.close(fd)
        cv2.imwrite(tmp_path, resized_img)
        detection_path = tmp_path
        img_for_annotation = resized_img
    else:
        detection_path = image_path
        img_for_annotation = original_img

    try:
        # Initialize Roboflow model
        rf = Roboflow(api_key=credentials.api_key)
        model = rf.workspace().project(credentials.project_name).version(credentials.version).model

        # Roboflow's API expects percentages for confidence/overlap.
        response = model.predict(
            detection_path,
            confidence=int(conf_threshold * 100),
            overlap=int(overlap_threshold * 100)
        ).json()
        predictions = response["predictions"]

        # Classify sizes via K-Means (cluster_sizes handles <2 detections).
        area_ratios = compute_area_ratios(predictions, img_for_annotation.shape)
        size_labels = cluster_sizes(area_ratios)

        # Build JSON structure split by detection confidence.
        ingredients = []
        high_conf = []
        low_conf = []
        for pred, size_label in zip(predictions, size_labels):
            name = pred["class"]
            conf = pred["confidence"]
            ingredients.append({
                "name": name,
                "quantity": size_label,
                "confidence": round(conf, 2)
            })
            (high_conf if conf >= conf_split else low_conf).append(name)

        recipe_json = {
            "ingredients": ingredients,
            "high_confidence_ingredients": high_conf,
            "low_confidence_ingredients": low_conf
        }

        # Write JSON to file
        with open(output_json, "w", encoding="utf-8") as jf:
            json.dump(recipe_json, jf, indent=4)

        # Annotate image with bounding boxes and confidence labels.
        annotated_img = img_for_annotation.copy()
        if predictions:
            detections = sv.Detections.from_inference(response)
            labels_for_annotation = [
                f"{pred['class']} ({pred['confidence']:.2f})" for pred in predictions
            ]
            annotated_img = sv.BoxAnnotator().annotate(
                scene=annotated_img,
                detections=detections
            )
            annotated_img = sv.LabelAnnotator().annotate(
                scene=annotated_img,
                detections=detections,
                labels=labels_for_annotation
            )

        cv2.imwrite(output_image, annotated_img)
    finally:
        # Always remove the temp resized file, even if detection raised.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # On Windows the file may still be locked; leave it for the OS.
                pass

    return {
        "recipe_json": recipe_json,
        "output_json_path": output_json,
        "annotated_image_path": output_image,
    }
frige_detect/recipe_input.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ingredients": [
3
+ {
4
+ "name": "sugar",
5
+ "quantity": "large",
6
+ "confidence": 0.91
7
+ },
8
+ {
9
+ "name": "chicken",
10
+ "quantity": "large",
11
+ "confidence": 0.91
12
+ },
13
+ {
14
+ "name": "milk",
15
+ "quantity": "large",
16
+ "confidence": 0.89
17
+ },
18
+ {
19
+ "name": "flour",
20
+ "quantity": "large",
21
+ "confidence": 0.88
22
+ },
23
+ {
24
+ "name": "eggs",
25
+ "quantity": "small",
26
+ "confidence": 0.88
27
+ },
28
+ {
29
+ "name": "apple",
30
+ "quantity": "large",
31
+ "confidence": 0.86
32
+ },
33
+ {
34
+ "name": "corn",
35
+ "quantity": "small",
36
+ "confidence": 0.85
37
+ },
38
+ {
39
+ "name": "blueberries",
40
+ "quantity": "small",
41
+ "confidence": 0.83
42
+ },
43
+ {
44
+ "name": "chicken_breast",
45
+ "quantity": "large",
46
+ "confidence": 0.82
47
+ },
48
+ {
49
+ "name": "ground_beef",
50
+ "quantity": "large",
51
+ "confidence": 0.81
52
+ },
53
+ {
54
+ "name": "beef",
55
+ "quantity": "large",
56
+ "confidence": 0.77
57
+ },
58
+ {
59
+ "name": "carrot",
60
+ "quantity": "large",
61
+ "confidence": 0.75
62
+ },
63
+ {
64
+ "name": "bread",
65
+ "quantity": "large",
66
+ "confidence": 0.51
67
+ }
68
+ ],
69
+ "high_confidence_ingredients": [
70
+ "sugar",
71
+ "chicken",
72
+ "milk",
73
+ "flour",
74
+ "eggs",
75
+ "apple",
76
+ "corn",
77
+ "blueberries",
78
+ "chicken_breast",
79
+ "ground_beef",
80
+ "beef",
81
+ "carrot"
82
+ ],
83
+ "low_confidence_ingredients": [
84
+ "bread"
85
+ ]
86
+ }
frige_detect/roboflow_credentials.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Roboflow credentials used by the app and detector
2
+ # SECURITY: a real API key was previously committed here and is now public in
+ # git history — rotate it in the Roboflow dashboard. Never commit real secrets;
+ # prefer loading the key from an environment variable or a gitignored file.
+ api_key=REPLACE_WITH_ROTATED_ROBOFLOW_API_KEY
3
+ project_name=nutrition-object-detection
4
+ version=1
recipe_recommendation/__init__.py ADDED
File without changes
recipe_recommendation/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (191 Bytes). View file
 
recipe_recommendation/__pycache__/main.cpython-313.pyc ADDED
Binary file (26.8 kB). View file
 
recipe_recommendation/data/ingredient_map.data ADDED
The diff for this file is too large to render. See raw diff
 
recipe_recommendation/main.py ADDED
@@ -0,0 +1,652 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Entry point for the new pipeline:
5
+ 1) I/O init & parsing
6
+ 2) Load user parents from recipe_input.json via ingredient_map (children -> parent)
7
+ 3) Ensure cold-start features & trained ranker exist
8
+ 4) Step 2: Coarse ranking
9
+ 5) Step 3: ML reranking
10
+ 6) Pretty print top results
11
+ """
12
+
13
+ import os
14
+ import json
15
+ import ast
16
+ import pandas as pd
17
+ from pathlib import Path
18
+ import shutil
19
+
20
+ from recipe_recommendation.src.io import load_recipes_csv, load_ingredient_map, download_file
21
+ from recipe_recommendation.src.coldstart import cold_start_ranker
22
+ from recipe_recommendation.src.trainmodel import train_model_ranker
23
+ from recipe_recommendation.src.candidate import (
24
+ coarse_rank_candidates,
25
+ ml_generate_candidates,
26
+ hard_filter,
27
+ )
28
+ from recipe_recommendation.src.highlight import (
29
+ print_candidates,
30
+ diversify_topk_with_min_clusters,
31
+ )
32
+ from recipe_recommendation.src.feature import build_features, build_cluster_features
33
+ from recipe_recommendation.src.embedding import find_most_similar_user
34
+
35
+
36
+ BASE_DIR = Path(__file__).resolve().parent
37
+ USER_DATA_DIR = BASE_DIR / "user_data"
38
+
39
+
40
+
41
def load_recipes() -> pd.DataFrame:
    """
    Download recipes.csv (if not already cached) and return it as a DataFrame
    with a unique ``recipe_id`` column derived from the row index.
    Keeps io.py focused on downloading only.
    """
    csv_path = download_file("recipes.csv")
    recipes = pd.read_csv(csv_path).reset_index(drop=True)
    recipes["recipe_id"] = recipes.index
    return recipes
51
+
52
+ # ---------------------------
53
+ # Helpers: parsing utilities
54
+ # ---------------------------
55
def parse_list(x):
    """Parse a cell into a Python list; tolerant of str/NaN/set/tuple input.

    Handles values as they come back from CSV round-trips: real lists pass
    through, None/NaN become [], sets and tuples are cast to lists, and
    strings are parsed with ast.literal_eval, falling back to comma-splitting.

    Fix: tuples — both tuple values and string literals like "(a, b)" —
    previously fell through to the comma fallback and produced broken tokens
    such as ["(a", "b)"]; they are now converted to proper lists.
    """
    if isinstance(x, list):
        return x
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return []
    if isinstance(x, (set, tuple)):
        return list(x)
    s = str(x).strip()
    if not s:
        return []
    # Try to interpret the string as a Python literal first.
    try:
        v = ast.literal_eval(s)
        if isinstance(v, (list, set, tuple)):
            return list(v)
    except Exception:
        pass
    # Fallback: treat as a comma-separated list, tolerating [] brackets.
    return [tok.strip() for tok in s.strip("[]").split(",") if tok.strip()]
79
+
80
+
81
def parse_set(x):
    """Parse a cell into a Python set, using the same tolerant rules as parse_list."""
    return set(parse_list(x))
84
+
85
+
86
+ # -------------------------------------
87
+ # Map user CV result -> parent set
88
+ # -------------------------------------
89
def load_user_parents_from_json(json_path, ingredient_map, conf_th=0.8):
    """
    Map raw detected ingredient names to parent categories.

    Uses ingredient_map["children"] (child -> parent, with an optional
    "fallback"); a name that is itself a key of ingredient_map["parents"] is
    kept as-is. Unknown or low-confidence terms are skipped and reported.

    Fix: an explicit ``"confidence": null`` in the JSON previously raised
    TypeError via ``float(None)``; it is now treated as 0.0.

    Args:
        json_path: Path to recipe_input.json produced by the detector.
        ingredient_map: Dict with "parents" and "children" sub-dicts.
        conf_th: Minimum detection confidence for a name to be kept.

    Returns:
        Sorted, de-duplicated list of parent ingredient names.

    Raises:
        FileNotFoundError: If json_path does not exist.
    """
    parents_map = ingredient_map.get("parents", {}) or {}
    children_map = ingredient_map.get("children", {}) or {}

    if not os.path.exists(json_path):
        raise FileNotFoundError(f"recipe_input.json not found at: {json_path}")

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    out = []
    hi, lo = [], []
    for ing in data.get("ingredients", []):
        # Detector emits snake_case names; the map uses space-separated keys.
        name = (ing.get("name") or "").strip().lower().replace("_", " ")
        # `or 0.0` guards against an explicit null confidence in the JSON.
        conf = float(ing.get("confidence") or 0.0)
        parent = None
        if name in children_map:
            # Prefer "parent" field; fall back to "fallback" if present
            parent = children_map[name].get("parent") or children_map[name].get("fallback")
        elif name in parents_map:
            parent = name

        if parent and conf >= conf_th:
            out.append(parent)
            hi.append((name, parent))
        else:
            lo.append(name)

    if hi:
        print("High-confidence ingredients mapped to parents:")
        for child, p in hi:
            print(f" - {child} → {p}")
    if lo:
        print(f"Ignored (low confidence or no parent found): {sorted(set(lo))}")

    return sorted(set(out))
130
+
131
+
132
def normalize_user_profile(profile):
    """Fill missing keys with safe defaults so downstream code never sees None.

    Mutates and returns the same profile dict.
    """
    # Diet: collapse to the single key used downstream.
    profile["diet"] = {
        "vegetarian_type": profile.get("diet", {}).get("vegetarian_type", "flexible")
    }

    # Simple list-valued fields default to empty lists.
    for key in ("allergies", "region_preference"):
        if profile.get(key) is None:
            profile[key] = []

    # Nutritional goals: wide-open ranges when unspecified.
    goals = profile.get("nutritional_goals")
    if goals is None:
        goals = {}
    goals.setdefault("calories", {"min": 0, "max": 9999})
    goals.setdefault("protein", {"min": 0, "max": 999})
    profile["nutritional_goals"] = goals

    # Other preferences: ensure all three keys exist.
    other = profile.get("other_preferences") or {}
    other["preferred_main"] = other.get("preferred_main", [])
    other["disliked_main"] = other.get("disliked_main", [])
    other["cooking_time_max"] = other.get("cooking_time_max", None)
    profile["other_preferences"] = other

    return profile
167
+
168
def is_profile_empty(profile):
    """Return True if the profile carries almost no meaningful preferences."""
    veg = profile.get("diet", {}).get("vegetarian_type")
    if veg not in (None, "", "flexible"):
        return False
    if profile.get("allergies") or profile.get("region_preference"):
        return False

    goals = profile.get("nutritional_goals", {})
    if goals.get("calories") or goals.get("protein"):
        calories = goals.get("calories", {})
        protein = goals.get("protein", {})
        # Anything narrower than the wide-open defaults counts as a preference.
        if calories.get("min", 0) > 0 or calories.get("max", 0) < 9999:
            return False
        if protein.get("min", 0) > 0 or protein.get("max", 0) < 999:
            return False

    other = profile.get("other_preferences", {})
    has_other = (
        other.get("preferred_main")
        or other.get("disliked_main")
        or other.get("cooking_time_max")
    )
    return not has_other
191
+
192
def fill_default_preferences(profile):
    """
    Apply lightweight, neutral defaults so hard_filter and cold start work
    efficiently for brand-new users with no explicit preferences.

    Assumes the profile has already been normalized (nested keys exist).
    Mutates and returns the same dict.
    """
    profile["diet"]["vegetarian_type"] = "flexible"
    profile["region_preference"] = ["North America", "Europe"]
    profile["nutritional_goals"]["protein"] = {"min": 50, "max": 150}
    profile["nutritional_goals"]["calories"] = {"min": 400, "max": 2000}
    profile["other_preferences"]["cooking_time_max"] = 45
    return profile
203
+
204
def ensure_user_profile(user_id):
    """
    Load a user's profile JSON, normalize its structure, and fill neutral
    defaults when the profile is (near-)empty. Guarantees downstream code
    never breaks on missing keys and avoids an extremely slow cold start
    for users with no preferences.

    Fix: removed the redundant function-level ``import os, json`` that
    shadowed the module-level imports; switched to Path.exists().

    Raises:
        FileNotFoundError: If user_data/<user_id>/user_profile.json is missing.
    """
    profile_file = USER_DATA_DIR / user_id / "user_profile.json"
    if not profile_file.exists():
        raise FileNotFoundError(
            f"Missing profile: {profile_file}. Please create one first."
        )

    # Load profile
    with open(profile_file, "r", encoding="utf-8") as f:
        profile = json.load(f)

    # Normalize structure, then fill defaults if almost empty
    profile = normalize_user_profile(profile)
    if is_profile_empty(profile):
        print(f"[profile] User {user_id} has an empty or near-empty profile. Filling defaults...")
        profile = fill_default_preferences(profile)

    return profile
229
+
230
+
231
def save_user_profile(user_id, profile):
    """Persist a user's profile JSON, creating the user directory if needed."""
    profile_path = USER_DATA_DIR / user_id / "user_profile.json"
    profile_path.parent.mkdir(parents=True, exist_ok=True)
    profile_path.write_text(json.dumps(profile, indent=2), encoding="utf-8")
236
+
237
def collect_user_feedback(user_id: str, selected_recipe_row: dict, user_profile: dict, qid: int):
    """
    Append a single positive-feedback sample to the user's feedback.csv.

    - Uses build_features() so feedback features stay aligned with training.
    - Maintains a stable feature column order via feature_order.json,
      appending any newly-seen feature names at the end.
    - Older feedback rows are padded with 0 for newly added columns.
    """
    user_dir = USER_DATA_DIR / user_id
    user_dir.mkdir(parents=True, exist_ok=True)
    feedback_path = user_dir / "feedback.csv"
    feature_order_path = user_dir / "feature_order.json"

    user_parents = set(user_profile.get("user_parents", []))

    def _parents(key):
        # Parent-category set of the selected recipe (may be absent).
        return selected_recipe_row.get(key, set())

    recipe_dict = {
        "main": _parents("main_parent"),
        "staple": _parents("staple_parent"),
        "other": _parents("other_parent"),
        "seasoning": _parents("seasoning_parent"),
        "matched_main": len(_parents("main_parent") & user_parents),
        "matched_staple": len(_parents("staple_parent") & user_parents),
        "matched_other": len(_parents("other_parent") & user_parents),
        "calories": selected_recipe_row.get("calories", 0),
        "protein": selected_recipe_row.get("protein", 0),
        "fat": selected_recipe_row.get("fat", 0),
        "region": selected_recipe_row.get("region", ""),
        "cuisine_attr": selected_recipe_row.get("cuisine_attr", []),
        "ingredients": selected_recipe_row.get("ingredients", []),
        "minutes": selected_recipe_row.get("minutes", None),
    }
    features = build_features(recipe_dict, user_profile)

    # Load (or initialize) the canonical feature ordering.
    if os.path.exists(feature_order_path):
        with open(feature_order_path, "r", encoding="utf-8") as f:
            feature_order = json.load(f)
    else:
        feature_order = list(features.keys())
        with open(feature_order_path, "w", encoding="utf-8") as f:
            json.dump(feature_order, f, indent=2)

    # Append any brand-new feature names and persist the updated order.
    for feat in features.keys():
        if feat not in feature_order:
            feature_order.append(feat)
    with open(feature_order_path, "w", encoding="utf-8") as f:
        json.dump(feature_order, f, indent=2)

    row_data = {feat: features.get(feat, 0) for feat in feature_order}
    row_data["recipe_id"] = selected_recipe_row["recipe_id"]
    row_data["qid"] = qid
    row_data["relevance"] = 5  # positive feedback uses a fixed top relevance grade

    new_row_df = pd.DataFrame([row_data])

    if os.path.exists(feedback_path):
        old_df = pd.read_csv(feedback_path)
        # Align columns in both directions, padding missing ones with 0.
        for col in new_row_df.columns:
            if col not in old_df.columns:
                old_df[col] = 0
        for col in old_df.columns:
            if col not in new_row_df.columns:
                new_row_df[col] = 0
        df = pd.concat([old_df, new_row_df], ignore_index=True)
    else:
        df = new_row_df
    df.to_csv(feedback_path, index=False)
    print(f"[feedback] Saved user feedback to {feedback_path} ({len(df)} rows total)")
300
+
301
+
302
+ # def ensure_model(user_id):
303
+ # base_dir = USER_DATA_DIR / user_id
304
+ # base_dir.mkdir(parents=True, exist_ok=True)
305
+ # features_rank = base_dir / "user_features_rank.csv"
306
+ # model_file = base_dir / "ranker.pkl"
307
+
308
+ # if not os.path.exists(features_rank):
309
+ # print("[main] No cold-start features found; running cold_start_ranker() ...")
310
+ # cold_start_ranker(user_id=user_id)
311
+
312
+ # if not os.path.exists(model_file):
313
+ # print("[main] No model found; training ranker with train_model_ranker() ...")
314
+ # train_model_ranker(user_id=user_id)
315
+
316
+ # return model_file
317
+
318
def ensure_model(user_id):
    """
    Ensure cold-start features and a trained ranker exist for the user,
    generating whichever artifact is missing.

    Returns:
        Path to the user's ranker.pkl.
    """
    base_dir = USER_DATA_DIR / user_id
    base_dir.mkdir(parents=True, exist_ok=True)
    features_rank = base_dir / "user_features_rank.csv"
    model_file = base_dir / "ranker.pkl"

    if not features_rank.exists():
        print("[main] No cold-start features found; running cold_start_ranker() ...")
        # pass user_data_dir so cold start writes into the right per-user folder
        cold_start_ranker(user_id=user_id, user_data_dir=str(USER_DATA_DIR))

    if not model_file.exists():
        print("[main] No model found; training ranker with train_model_ranker() ...")
        train_model_ranker(user_id=user_id)

    return model_file
334
+
335
+
336
def prepare_recipes_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize key columns into the list/set shapes expected by the
    candidate/feature modules. Works on a copy; the input is untouched.
    """
    out = df.copy()

    list_cols = ("staple", "main", "seasoning", "other", "ingredients")
    set_cols = ("staple_parent", "main_parent", "seasoning_parent", "other_parent", "cuisine_attr")

    for col in list_cols:
        if col in out.columns:
            out[col] = out[col].apply(parse_list)
    for col in set_cols:
        if col in out.columns:
            out[col] = out[col].apply(parse_set)

    # region may be a plain string or a serialized list/set: normalize to a
    # set when it is collection-like, otherwise keep it as a string.
    if "region" in out.columns:
        def _region_norm(x):
            if isinstance(x, (set, list)):
                return set(x)
            try:
                v = ast.literal_eval(str(x))
                if isinstance(v, (set, list)):
                    return set(v)
            except Exception:
                pass
            return str(x) if pd.notna(x) else ""
        out["region"] = out["region"].apply(_region_norm)

    return out
367
+
368
+
369
def maybe_retrain_model(user_id):
    """Retrain the ranker from scratch once every 20 pieces of feedback."""
    profile_path = USER_DATA_DIR / user_id / "user_profile.json"
    if not profile_path.exists():
        return

    num_feedback = json.loads(profile_path.read_text()).get("num_feedback", 0)
    if num_feedback <= 0 or num_feedback % 20 != 0:
        return

    print(f"[main] {num_feedback} feedback reached, retraining ranker...")

    model_path = USER_DATA_DIR / user_id / "ranker.pkl"
    if model_path.exists():
        model_path.unlink()  # remove the stale model so training rebuilds it

    train_model_ranker(user_id)
385
+
386
def get_next_qid(user_id: str) -> int:
    """Return a monotonically increasing query id for the user, persisted in qid.txt."""
    user_dir = USER_DATA_DIR / user_id
    user_dir.mkdir(parents=True, exist_ok=True)
    qid_path = user_dir / "qid.txt"

    # First ever query gets qid 0; afterwards increment the stored value.
    qid = int(qid_path.read_text()) + 1 if qid_path.exists() else 0
    qid_path.write_text(str(qid))
    return qid
397
+
398
def maybe_reuse_model(user_id, threshold=0.85):
    """Return the id of a sufficiently similar user whose model can be reused, else None."""
    match_uid, similarity = find_most_similar_user(user_id, threshold=threshold)
    if not match_uid:
        return None
    print(f"[model reuse] Reusing {match_uid}'s model for {user_id} (sim={similarity:.3f})")
    return match_uid
404
+
405
def main(user_id="user_1",
         recipe_input_json=None,
         topk=5,
         topn_coarse=20000):
    """
    CLI entry point: detection JSON -> profile -> coarse rank -> ML rerank
    -> diversification -> pretty print -> interactive feedback.

    Fixes: coarse_rank_candidates was given the UNFILTERED record list even
    though filtered_records had just been computed, so the hard dietary
    filter (allergies / vegetarian) was silently bypassed; the interactive
    selection is now validated instead of crashing on bad input.

    Args:
        user_id: Per-user directory name under user_data/.
        recipe_input_json: Path to the CV detection JSON; auto-discovered if None.
        topk: Number of recipes to show the user.
        topn_coarse: Candidate pool size kept after coarse ranking.
    """
    # 1) I/O init (may retrain the ranker if enough feedback accumulated)
    maybe_retrain_model(user_id)

    recipes_df = load_recipes()
    ingredient_map = load_ingredient_map()

    # 2) Load user_parents from recipe_input.json (fall back to /data if needed)
    if recipe_input_json is None:
        # prefer project root; then /data
        default_candidates = [
            os.path.join("data", "recipe_input.json"),
            "recipe_input.json",
            "/data/recipe_input.json",
        ]
        recipe_input_json = next((p for p in default_candidates if os.path.exists(p)), default_candidates[-1])

    user_parents = load_user_parents_from_json(recipe_input_json, ingredient_map, conf_th=0.8)

    # 3) Load user profile
    user_profile = ensure_user_profile(user_id)

    # Embedding similarity fallback: bootstrap from the closest existing user
    match_uid, sim = find_most_similar_user(user_id, threshold=0.85)
    if match_uid is not None:
        print(f"[main] Using model of similar user '{match_uid}' for '{user_id}' (sim={sim:.3f})")

        src_dir = USER_DATA_DIR / match_uid
        dst_dir = USER_DATA_DIR / user_id
        dst_dir.mkdir(parents=True, exist_ok=True)

        for fname in ["ranker.pkl", "user_features_rank.csv"]:
            src = src_dir / fname
            dst = dst_dir / fname
            if os.path.exists(src) and not os.path.exists(dst):
                shutil.copyfile(src, dst)
                print(f"[embedding] Copied {fname} from {match_uid} to {user_id}")

    # 4) Ensure cold-start features & model
    model_path = ensure_model(user_id)

    # 5) Prepare recipes & coarse rank (Step 2)
    df = prepare_recipes_df(recipes_df)
    recipes_records = df.to_dict(orient="records")

    filtered_records = [r for r in recipes_records if hard_filter(r, user_profile)]
    if not filtered_records:
        print("[main] No recipes after hard dietary filtering.")
        return

    # BUGFIX: rank only records that survived the hard dietary filter.
    coarse = coarse_rank_candidates(
        recipes=filtered_records,
        user_parents=user_parents,
        user_profile=user_profile,
        top_n=topn_coarse
    )

    if not coarse:
        print("[main] No coarse candidates. Please check user_parents or dataset.")
        return

    # 6) ML reranking (Step 3)
    ml_top = ml_generate_candidates(
        coarse_candidates=coarse,
        user_parents=user_parents,
        user_profile=user_profile,
        model_path=model_path,
        topk=200
    )

    if ml_top is None or len(ml_top) == 0:
        print("[main] No ML candidates returned.")
        return

    # 6.5) KMeans diversification of the top list
    candidates_list = ml_top.to_dict(orient="records")
    X_cluster = build_cluster_features(candidates_list)
    diversified = diversify_topk_with_min_clusters(
        ranked_candidates=candidates_list,
        feature_matrix=X_cluster,
        top_k=topk,
        n_clusters=10,
        min_clusters=3
    )

    ml_top = pd.DataFrame(diversified)

    # 7) Pretty print (print_candidates expects a 'match_score' column)
    ml_top = ml_top.copy()
    if "match_score" not in ml_top.columns and "ml_score" in ml_top.columns:
        ml_top["match_score"] = ml_top["ml_score"]

    print(f"\nFound {len(ml_top)} candidate recipes:\n")
    print_candidates(ml_top, user_parents, topk=topk)

    # 8) Collect user feedback, validating the interactive selection
    qid = get_next_qid(user_id)
    try:
        selected_idx = int(input(f"Select a recipe from 1-{len(ml_top)}: ")) - 1
    except ValueError:
        print("[main] Invalid selection; no feedback recorded.")
        return
    if not 0 <= selected_idx < len(ml_top):
        print("[main] Selection out of range; no feedback recorded.")
        return
    selected_row = ml_top.iloc[selected_idx].to_dict()
    collect_user_feedback(user_id, selected_row, user_profile, qid)
508
+
509
+
510
def recommend_recipes(detection_payload, user_id, recipes_df, topk=5):
    """
    Unified recommendation entry for the app.

    Handles ingredient->parent mapping, user profile loading, embedding-based
    model reuse, cold start, coarse ranking, ML reranking, and KMeans
    diversification internally.

    Fix: coarse_rank_candidates was given the UNFILTERED record list even
    though filtered_records had just been computed, silently bypassing the
    hard dietary filter; dead commented-out mapping code removed.

    Args:
        detection_payload: Detector dict with "ingredients" plus
            high/low-confidence ingredient lists.
        user_id: Per-user directory name under user_data/.
        recipes_df: Raw recipes DataFrame from load_recipes().
        topk: Number of diversified recipes to return.

    Returns:
        (ml_top, user_parents, high_conf, low_conf); ml_top is an empty
        DataFrame when no candidate survives any stage.
    """
    # 0) Check if retraining is needed (new feedback, updated features)
    maybe_retrain_model(user_id)

    # 1) Ingredient mapping - use existing high/low confidence fields
    ingredient_map = load_ingredient_map()
    ingredients = detection_payload.get("ingredients", [])
    high_conf = sorted(set(detection_payload.get("high_confidence_ingredients", [])))
    low_conf = sorted(set(detection_payload.get("low_confidence_ingredients", [])))

    parents_map = ingredient_map.get("parents", {}) or {}
    children_map = ingredient_map.get("children", {}) or {}

    user_parents = set()
    for item in ingredients:
        # Detector emits snake_case names; the map uses space-separated keys.
        name = (item.get("name") or "").strip().lower().replace("_", " ")
        if not name:
            continue
        if name in children_map:
            parent = children_map[name].get("parent") or children_map[name].get("fallback")
        elif name in parents_map:
            parent = name
        else:
            parent = None
        if parent:
            user_parents.add(parent)
    user_parents = sorted(user_parents)

    # 2) Load user profile internally
    user_profile = ensure_user_profile(user_id)

    # 3) Embedding fallback: copy artifacts from the most similar user
    match_uid, sim = find_most_similar_user(user_id, threshold=0.85)
    if match_uid is not None:
        print(f"[embedding] Using model of similar user '{match_uid}' for '{user_id}' (sim={sim:.3f})")
        src_dir = USER_DATA_DIR / match_uid
        dst_dir = USER_DATA_DIR / user_id
        dst_dir.mkdir(parents=True, exist_ok=True)
        for fname in ["ranker.pkl", "user_features_rank.csv"]:
            src = src_dir / fname
            dst = dst_dir / fname
            if os.path.exists(src) and not os.path.exists(dst):
                shutil.copyfile(src, dst)
                print(f"[embedding] Copied {fname} from {match_uid} to {user_id}")

    # 4) Coldstart / model ensure
    model_path = ensure_model(user_id)

    # 5) Coarse rank
    df = prepare_recipes_df(recipes_df)
    recipes_records = df.to_dict(orient="records")
    filtered_records = [r for r in recipes_records if hard_filter(r, user_profile)]
    if not filtered_records:
        return pd.DataFrame(), user_parents, high_conf, low_conf

    # BUGFIX: rank only records that survived the hard dietary filter.
    coarse = coarse_rank_candidates(
        recipes=filtered_records,
        user_parents=user_parents,
        user_profile=user_profile,
        top_n=20000
    )
    if not coarse:
        return pd.DataFrame(), user_parents, high_conf, low_conf

    # 6) ML rerank
    ml_top = ml_generate_candidates(
        coarse_candidates=coarse,
        user_parents=user_parents,
        user_profile=user_profile,
        model_path=model_path,
        topk=200
    )
    if ml_top is None or len(ml_top) == 0:
        return pd.DataFrame(), user_parents, high_conf, low_conf

    # 7) KMeans diversification
    candidates_list = ml_top.to_dict(orient="records")
    X_cluster = build_cluster_features(candidates_list)
    diversified = diversify_topk_with_min_clusters(
        ranked_candidates=candidates_list,
        feature_matrix=X_cluster,
        top_k=topk,
        n_clusters=10,
        min_clusters=3
    )

    return pd.DataFrame(diversified), user_parents, high_conf, low_conf
620
+
621
+
622
def get_feedback(user_id: str, recipe_row: dict, qid: int = None):
    """
    App-friendly wrapper around collect_user_feedback().

    Parameters
    ----------
    user_id : str
        The ID of the user submitting feedback.
    recipe_row : dict
        Recipe information dict (e.g. one row from ml_top.to_dict()).
    qid : int, optional
        Query id for ranking context; auto-generated (or 0 on failure)
        when omitted.
    """
    # Profile must exist and be normalized before features can be built.
    user_profile = ensure_user_profile(user_id)

    if qid is None:
        try:
            qid = get_next_qid(user_id)
        except Exception:
            # Fall back to a fixed qid rather than failing the feedback save.
            qid = 0

    collect_user_feedback(user_id, recipe_row, user_profile, qid)
    print(f"[app] Feedback collected for user '{user_id}', qid={qid}, recipe_id={recipe_row.get('id')}")
649
+
650
+
651
+ if __name__ == "__main__":
652
+ main("user_3")
recipe_recommendation/readme.txt ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ readme_text = """\
2
+ ===========================
3
+ Recipe Recommendation System
4
+ ===========================
5
+
6
+ This project implements a complete recipe recommendation system, including cold start ranking, ML-based reranking, KMeans-based diversification, and user feedback collection.
7
+ All functions are fully encapsulated and can be easily called from external applications.
8
+
9
+ -------------------------------------
10
+ 1. Main Entry Functions for External Use
11
+ -------------------------------------
12
+
13
+ The three main functions for external usage are:
14
+
15
+ 1) recommend_recipes(detection_payload, user_id, recipes_df, topk=5)
16
+ - Input:
17
+ • detection_payload: dict or JSON object containing detected ingredients.
18
+ • user_id: str, unique user identifier.
19
+ • recipes_df: pandas.DataFrame loaded by `load_recipes()`.
20
+ • topk: int, number of final recipes to return (default = 5).
21
+ - Output:
22
+ • ml_top: pandas.DataFrame of top recommended recipes (with ml_score & metadata).
23
+ • user_parents: list of mapped parent ingredients.
24
+ • high_conf: list of high-confidence ingredient matches.
25
+ • low_conf: list of low-confidence or unmapped ingredients.
26
+
27
+ Internally, this function performs:
28
+ - Ingredient mapping from detection payload
29
+ - Embedding fallback (copy model/features from similar user)
30
+ - Cold start feature generation if needed
31
+ - Coarse ranking → ML reranking → KMeans diversification
32
+ - Returns the final diversified top-k recommendations.
33
+
34
+ 2) load_recipes()
35
+ - Input: None
36
+ - Output: pandas.DataFrame of all recipes (automatically downloaded from Hugging Face if not present).
37
+ - This function loads the full recipe dataset into memory.
38
+ If the dataset is not found locally, it will automatically download and cache it under `data/`.
39
+
40
+ 3) get_feedback(user_id, recipe_row, qid=None)
41
+ - Input:
42
+ • user_id: str, unique user identifier.
43
+ • recipe_row: dict, a single recipe row (e.g. one of the top-k recommendations).
44
+ • qid: int, optional query ID. Defaults to auto-generated or 0.
45
+ - Output: None
46
+ - Function:
47
+ • Loads user profile internally
48
+ • Appends the feedback (recipe metadata, user choice) into `user_data/{user_id}/feedback.csv`
49
+ • Does not retrain the model automatically (use `maybe_retrain_model` if needed)
50
+
51
+ ----------------------------------------
52
+ 2. User Profiles and Pretrained Models
53
+ ----------------------------------------
54
+
55
+ The `user_data` folder contains four example users:
56
+
57
+ - user_0 : Empty profile for testing the system’s ability to bootstrap from zero information.
58
+ - user_1 : A user with specific dietary habits.
59
+ - user_2 : A user with different dietary preferences.
60
+ - user_3 : Similar to user_2, used to test simple embedding-based model reuse.
61
+
62
+ For each user:
63
+ - Cold start features and ML models (`user_features_rank.csv` and `ranker.pkl`) have already been generated.
64
+ - You can add new users by creating a new folder under `user_data/` with a profile file `user_profile.json` in the following format:
65
+
66
+ {
67
+ "user_id": "user_001",
68
+ "num_feedback": 0,
69
+ "diet": {
70
+ "vegetarian_type": "flexible_vegetarian"
71
+ },
72
+ "allergies": ["peanut", "shrimp"],
73
+ "region_preference": ["Asia", "Europe"],
74
+ "nutritional_goals": {
75
+ "calories": { "min": 400, "max": 3000 },
76
+ "protein": { "min": 100, "max": 160 }
77
+ },
78
+ "other_preferences": {
79
+ "preferred_main": ["chicken", "tofu"],
80
+ "disliked_main": ["lamb"],
81
+ "cooking_time_max": 40
82
+ }
83
+ }
84
+
85
+ The cold start process will typically take **15–25 minutes**, depending on your system performance.
86
+
87
+ ----------------------------------------
88
+ 3. Dataset Download
89
+ ----------------------------------------
90
+
91
+ Large recipe and ingredient mapping files are stored on Hugging Face under the account:
92
+ → iris314
93
+
94
+ These files will be automatically downloaded the first time `load_recipes()` or related functions are called.
95
+ No manual setup is required.
96
+
97
+ ----------------------------------------
98
+ 4. Feedback Loop & Retraining
99
+ ----------------------------------------
100
+
101
+ User feedback is saved in `feedback.csv` files under each user's directory.
102
+ To trigger retraining after feedback collection, call:
103
+
104
+ from trainmodel import maybe_retrain_model
105
+ maybe_retrain_model(user_id)
106
+
107
+ This checks timestamps between `user_features_rank.csv` and `ranker.pkl` to decide if retraining is needed.
108
+
109
+ ----------------------------------------
110
+ 5. Cold Start & Embedding Fallback
111
+ ----------------------------------------
112
+
113
+ - If a user has no model or features, the system runs a cold start procedure to generate ranking features.
114
+ - If a similar user exists (cosine similarity > 0.85), the system copies their model and features to skip retraining.
115
+
116
+ ----------------------------------------
117
+ 6. Quick Start Example
118
+ ----------------------------------------
119
+
120
+ from main import recommend_recipes, load_recipes, get_feedback
121
+
122
+ # 1. Load dataset
123
+ recipes_df = load_recipes()
124
+
125
+ # 2. Prepare a fake detection payload
126
+ payload = {"detected_ingredients": ["chicken", "milk", "flour"]}
127
+
128
+ # 3. Recommend
129
+ top_recipes, user_parents, high_conf, low_conf = recommend_recipes(payload, "user_1", recipes_df, topk=5)
130
+
131
+ # 4. Feedback
132
+ get_feedback("user_1", top_recipes.iloc[0].to_dict())
133
+
134
+ ----------------------------------------
135
+ End of README
136
+ ----------------------------------------
137
+ """
138
+
139
+ with open("README.txt", "w", encoding="utf-8") as f:
140
+ f.write(readme_text)
141
+
142
+ "README.txt file created successfully."
recipe_recommendation/readme_cn.txt ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ =============================
2
+ 菜谱推荐系统(Recipe Recommendation)
3
+ =============================
4
+
5
+ 本项目实现了一个完整的菜谱推荐系统,包括:
6
+ - 冷启动(Cold Start)排序
7
+ - 机器学习模型(ML)重排序
8
+ - KMeans 聚类多样化
9
+ - 用户反馈收集与自动重训
10
+
11
+ 所有功能都已封装好,外部调用只需要几个简单的接口。
12
+
13
+ ----------------------------------------
14
+ 1. 外部主要调用函数
15
+ ----------------------------------------
16
+
17
+ 1) recommend_recipes(detection_payload, user_id, recipes_df, topk=5)
18
+ - 输入:
19
+ • detection_payload:dict 或 JSON,表示检测到的食材
20
+ • user_id:str,用户 ID
21
+ • recipes_df:通过 `load_recipes()` 加载的菜谱 DataFrame
22
+ • topk:返回的推荐菜谱数量(默认 5)
23
+ - 输出:
24
+ • ml_top:推荐结果(DataFrame)
25
+ • user_parents:映射后的父食材列表
26
+ • high_conf:高置信度匹配
27
+ • low_conf:低置信度/未匹配食材
28
+
29
+ 功能包括:食材映射 → 相似用户模型复制 → 冷启动 → 粗排 → ML 重排 → KMeans 多样化。
30
+
31
+ 2) load_recipes()
32
+ - 自动从 Hugging Face(iris314)下载菜谱数据到 `data/`,并返回 DataFrame。
33
+
34
+ 3) get_feedback(user_id, recipe_row, qid=None)
35
+ - 收集用户反馈并写入 `user_data/{user_id}/feedback.csv`
36
+ - user_profile 自动加载,qid 缺省自动分配
37
+
38
+ ----------------------------------------
39
+ 2. 用户数据
40
+ ----------------------------------------
41
+
42
+ `user_data` 里包含四个示例用户:
43
+ - user_0:空 profile,用于测试零信息自启
44
+ - user_1 / user_2:有不同饮食偏好的真实用户
45
+ - user_3:与 user_2 类似,用于测试 embedding 复制功能
46
+
47
+ 每个用户目录下都有 `user_profile.json`、`user_features_rank.csv`、`ranker.pkl`。
48
+ 你可以新增用户,只需遵循以下 JSON 格式:
49
+
50
+ {
51
+ "user_id": "user_001",
52
+ "num_feedback": 0,
53
+ "diet": {"vegetarian_type": "flexible_vegetarian"},
54
+ "allergies": ["peanut", "shrimp"],
55
+ "region_preference": ["Asia", "Europe"],
56
+ "nutritional_goals": {
57
+ "calories": {"min": 400, "max": 3000},
58
+ "protein": {"min": 100, "max": 160}
59
+ },
60
+ "other_preferences": {
61
+ "preferred_main": ["chicken", "tofu"],
62
+ "disliked_main": ["lamb"],
63
+ "cooking_time_max": 40
64
+ }
65
+ }
66
+
67
+ 冷启动过程通常需要 15~25 分钟(视机器性能而定)。
68
+
69
+ ----------------------------------------
70
+ 3. 数据下载
71
+ ----------------------------------------
72
+
73
+ 菜谱和食材映射等大文件会自动从 Hugging Face(iris314)下载并缓存到 `data/`,无需手动设置。
74
+
75
+ ----------------------------------------
76
+ 4. 快速上手示例
77
+ ----------------------------------------
78
+
79
+ ```python
80
+ from main import recommend_recipes, load_recipes, get_feedback
81
+
82
+ # 加载菜谱
83
+ recipes_df = load_recipes()
84
+
85
+ # 准备模拟检测输入
86
+ payload = {"detected_ingredients": ["chicken", "milk", "flour"]}
87
+
88
+ # 获取推荐结果
89
+ top_recipes, user_parents, high_conf, low_conf = recommend_recipes(payload, "user_1", recipes_df, topk=5)
90
+
91
+ # 提交反馈
92
+ get_feedback("user_1", top_recipes.iloc[0].to_dict())
recipe_recommendation/src/__init__.py ADDED
File without changes
recipe_recommendation/src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (195 Bytes). View file
 
recipe_recommendation/src/__pycache__/candidate.cpython-313.pyc ADDED
Binary file (15 kB). View file
 
recipe_recommendation/src/__pycache__/coldstart.cpython-313.pyc ADDED
Binary file (13.7 kB). View file
 
recipe_recommendation/src/__pycache__/embedding.cpython-313.pyc ADDED
Binary file (5.94 kB). View file
 
recipe_recommendation/src/__pycache__/feature.cpython-313.pyc ADDED
Binary file (8.57 kB). View file
 
recipe_recommendation/src/__pycache__/highlight.cpython-313.pyc ADDED
Binary file (4.47 kB). View file
 
recipe_recommendation/src/__pycache__/io.cpython-313.pyc ADDED
Binary file (2.02 kB). View file
 
recipe_recommendation/src/__pycache__/trainmodel.cpython-313.pyc ADDED
Binary file (10.4 kB). View file
 
recipe_recommendation/src/candidate.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from .feature import build_features
4
+ from .io import load_ingredient_map
5
+ import joblib
6
+
7
+ # Load ingredient map globally to avoid repeated I/O
8
+ INGREDIENT_MAP = load_ingredient_map()
9
+ PARENTS = INGREDIENT_MAP["parents"]
10
+ CHILDREN = INGREDIENT_MAP["children"]
11
+
12
def extract_user_parents(user_ingredients):
    """
    Map raw user ingredient names to their parent categories.

    Each name is lower-cased and stripped, then looked up first as a child
    (mapped to its parent) and otherwise as a parent itself; unknown names
    are silently dropped. Returns a set of parent category names.
    """
    parents = set()
    for raw in user_ingredients:
        name = raw.lower().strip()
        if name in CHILDREN:
            # Child ingredient: record its parent category.
            parents.add(CHILDREN[name]["parent"])
        elif name in PARENTS:
            # Already a parent-level name.
            parents.add(name)
    return parents
23
+
24
+
25
+ # def hard_filter(recipe, user_profile):
26
+ # diet = user_profile.get("diet", {}).get("vegetarian_type", "").lower()
27
+ # if diet == "vegan" and not recipe.get("is_vegan_safe", True):
28
+ # return False
29
+ # if diet in ["vegetarian", "flexible_vegetarian"] and not recipe.get("is_vegetarian_safe", True):
30
+ # return False
31
+ # return True
32
+
33
def hard_filter(recipe: dict, user_profile: dict) -> bool:
    """
    Decide whether a recipe is admissible for a user under hard constraints.

    Checks, in order: dietary safety flags (vegan / vegetarian), calorie and
    protein windows from the user's nutritional goals, and disliked main
    ingredients. The first failing check rejects the recipe.

    Args:
        recipe (dict): Recipe attributes such as 'calories', 'protein',
            'is_vegan_safe', 'is_vegetarian_safe' and 'main_parent'.
        user_profile (dict): User preferences including diet type,
            nutritional goals, and disliked main ingredients.

    Returns:
        bool: True if the recipe passes all hard filters, False otherwise.
    """
    # --- Dietary safety flags (missing flags are treated as safe) ---
    diet_type = user_profile.get("diet", {}).get("vegetarian_type", "").lower()
    if diet_type == "vegan":
        if not recipe.get("is_vegan_safe", True):
            return False
    if diet_type in ("vegetarian", "flexible_vegetarian"):
        if not recipe.get("is_vegetarian_safe", True):
            return False

    goals = user_profile.get("nutritional_goals", {})

    # --- Calorie window (missing recipe value counts as 0) ---
    cal_goal = goals.get("calories", {})
    calories = recipe.get("calories", 0)
    if calories < cal_goal.get("min", 0) or calories > cal_goal.get("max", 9999):
        return False

    # --- Protein window ---
    protein_goal = goals.get("protein", {})
    protein = recipe.get("protein", 0)
    if protein < protein_goal.get("min", 0) or protein > protein_goal.get("max", 999):
        return False

    # --- Disliked main ingredients ---
    disliked = set(user_profile.get("other_preferences", {}).get("disliked_main", []))
    if disliked:
        mains = recipe.get("main_parent", set())
        if isinstance(mains, list):
            mains = set(mains)
        elif not isinstance(mains, set):
            # Unexpected types (e.g. a bare string) are ignored, not matched.
            mains = set()
        # Reject if any main ingredient appears in the disliked list.
        if mains & disliked:
            return False

    return True
87
+
88
+
89
+
90
# Linear weights for each coarse-ranking signal; keys match the feature
# names produced by the coarse ranker.
COARSE_WEIGHTS = {
    "main_match_ratio": 1.0,
    "staple_match_ratio": 0.3,
    "other_match_ratio": 0.6,
    "low_calorie_penalty": 0.2,
    "preferred_course_overlap": 0.1,
    "region_match": 0.8
}


def coarse_score(features, weights=COARSE_WEIGHTS):
    """Return the weighted sum of the coarse features present in *features*."""
    return sum(w * features[key] for key, w in weights.items() if key in features)
106
+
107
+
108
def coarse_rank_candidates(recipes, user_parents, user_profile, top_n=30000, weights=COARSE_WEIGHTS):
    """
    Stage 2: Coarse Ranking (NumPy vectorized implementation)
    ---------------------------------------------------------
    Quickly retrieves a subset of candidate recipes by computing
    ingredient coverage ratios (main / staple / other) between
    the user's pantry and the recipes using vectorized operations.

    This function replaces the original Python loop version
    for significant speedup during cold start and real-time ranking.

    Parameters
    ----------
    recipes : list[dict]
        Recipe records; each may carry main_parent / staple_parent /
        other_parent collections plus calories, cuisine_attr and region.
    user_parents : iterable[str]
        Parent-level ingredient names the user has on hand.
    user_profile : dict
        Reads calorie_threshold, preferred_course_types, region_preference.
    top_n : int
        Maximum number of candidates to return.
    weights : dict
        Linear weight per signal (see COARSE_WEIGHTS).

    Returns
    -------
    list[dict]
        Top-scoring recipe dicts, best first; [] when nothing scores > 0.
    """
    if not recipes:
        return []

    # === 1. Build parent vocabulary ===
    # Extract all unique parent ingredients across main/staple/other fields.
    all_parents = sorted({
        p for r in recipes
        for k in ["main_parent", "staple_parent", "other_parent"]
        for p in (r.get(k) or [])
    })
    parent_index = {p: i for i, p in enumerate(all_parents)}
    num_recipes = len(recipes)
    num_parents = len(all_parents)

    # === 2. Construct multi-hot matrices for main, staple, other ===
    # Each row corresponds to a recipe; each column to a parent ingredient.
    main_mat = np.zeros((num_recipes, num_parents), dtype=np.uint8)
    staple_mat = np.zeros((num_recipes, num_parents), dtype=np.uint8)
    other_mat = np.zeros((num_recipes, num_parents), dtype=np.uint8)

    for i, r in enumerate(recipes):
        for p in r.get("main_parent", []):
            if p in parent_index:
                main_mat[i, parent_index[p]] = 1
        for p in r.get("staple_parent", []):
            if p in parent_index:
                staple_mat[i, parent_index[p]] = 1
        for p in r.get("other_parent", []):
            if p in parent_index:
                other_mat[i, parent_index[p]] = 1

    # === 3. Encode user pantry as a binary mask ===
    user_mask = np.zeros(num_parents, dtype=np.uint8)
    for p in user_parents:
        if p in parent_index:
            user_mask[parent_index[p]] = 1

    # === 4. Compute ingredient match ratios in batch ===
    # main_ratio = (# of matched main ingredients) / (# of total main ingredients)
    main_total = main_mat.sum(axis=1)
    staple_total = staple_mat.sum(axis=1)
    other_total = other_mat.sum(axis=1)

    main_match = (main_mat @ user_mask)
    staple_match = (staple_mat @ user_mask)
    other_match = (other_mat @ user_mask)

    # np.maximum(..., 1) guards the division for recipes with no entries.
    main_ratio = main_match / np.maximum(main_total, 1)
    staple_ratio = staple_match / np.maximum(staple_total, 1)
    other_ratio = other_match / np.maximum(other_total, 1)

    # === 5. Additional coarse ranking signals ===
    # Low-calorie preference & preferred cuisine overlap
    # NOTE(review): despite its name, low_calorie_penalty is a *bonus*
    # (1.0 when calories are at or below the threshold), not a penalty.
    calories = np.array([r.get("calories", 0) for r in recipes], dtype=float)
    calorie_threshold = user_profile.get("calorie_threshold", 9999)
    low_calorie_penalty = (calories <= calorie_threshold).astype(float)

    preferred_course_types = set(user_profile.get("preferred_course_types", []))
    preferred_overlap = np.array([
        len(set(r.get("cuisine_attr", [])) & preferred_course_types)
        for r in recipes
    ], dtype=float)

    # Region preference matching (recipe region may be scalar or list/set)
    preferred_regions = set(user_profile.get("region_preference", []))
    region_match = np.array([
        1.0 if any(region in preferred_regions for region in
                   (r.get("region", []) if isinstance(r.get("region"), (list, set))
                    else [r.get("region", "")]))
        else 0.0
        for r in recipes
    ], dtype=float)

    # === 6. Compute coarse ranking scores ===
    scores = (
        weights["main_match_ratio"] * main_ratio +
        weights["staple_match_ratio"] * staple_ratio +
        weights["other_match_ratio"] * other_ratio +
        weights["low_calorie_penalty"] * low_calorie_penalty +
        weights["preferred_course_overlap"] * preferred_overlap +
        weights.get("region_match", 0) * region_match
    )

    # === 7. Select top-N candidates ===
    valid_idx = np.where(scores > 0)[0]
    if valid_idx.size == 0:
        return []

    scores_valid = scores[valid_idx]
    # topk is bounded by the count of positive-score candidates, computed
    # before the dynamic threshold below is applied.
    topk = min(top_n, valid_idx.size)

    # Optional dynamic thresholding: keep candidates with score >= 50% of max
    max_score = scores_valid.max()
    keep_mask = scores_valid >= max_score * 0.5
    keep_idx = valid_idx[keep_mask]

    if keep_idx.size == 0:
        return []

    order = np.argsort(scores[keep_idx])[::-1]
    top_idx = keep_idx[order[:topk]]

    # Return the original recipe dicts corresponding to the top candidates
    return [recipes[i] for i in top_idx]
223
+
224
+
225
def rule_generate_candidates(df, user_parents, user_profile):
    """
    Step 3: Rule-based reranking of coarse candidates.
    Uses all available features (except vegan/vegetarian filters, which were applied in Step 1)
    to compute a weighted rule-based score for each recipe.

    Parameters
    ----------
    df : pandas.DataFrame
        Coarse candidates; rows carry *_parent collections and nutrition fields.
    user_parents : iterable[str]
        Parent-level ingredients the user has on hand.
    user_profile : dict
        Preference flags read here: low_calorie, high_protein, low_fat.

    Returns
    -------
    pandas.DataFrame
        Candidates with a positive 'match_score', sorted descending;
        an empty frame when nothing scores above zero.
    """

    # NOTE(review): the inner accumulator variable shadows the function
    # name 'score'; harmless here, but worth renaming in a later pass.
    def score(row):
        # Build recipe_dict for feature extraction
        recipe_dict = {
            "main": row.get("main_parent", set()),
            "staple": row.get("staple_parent", set()),
            "other": row.get("other_parent", set()),
            "seasoning": row.get("seasoning_parent", set()),
            "matched_main": len(row.get("main_parent", set()) & set(user_parents)),
            "matched_staple": len(row.get("staple_parent", set()) & set(user_parents)),
            "matched_other": len(row.get("other_parent", set()) & set(user_parents)),
            "calories": row.get("calories", 0),
            "protein": row.get("protein", 0),
            "fat": row.get("fat", 0),
            "region": row.get("region", ""),
            "cuisine_attr": row.get("cuisine_attr", []),
            "ingredients": row.get("ingredients", []),
            "minutes": row.get("minutes", None),
        }

        # Extract rule features (keys below are produced by build_features —
        # presumed contract; verify against src/feature.py)
        feats = build_features(recipe_dict, user_profile)

        # Compute rule-based score
        score = 0.0

        # Ingredient match ratios
        # Main ingredients are weighted most heavily
        score += 2.0 * feats["main_match_ratio"]
        score += 1.0 * feats["staple_match_ratio"]
        score += 1.0 * feats["other_match_ratio"]

        # Nutrition preferences
        # Low calorie preference
        if user_profile.get("low_calorie", False):
            if feats["low_calorie_penalty"]:
                score += 0.5

        # High protein preference
        if user_profile.get("high_protein", False) and feats["protein_ratio"] > 0.25:
            score += 0.3

        # Low fat preference (penalty if fat ratio is too high)
        if user_profile.get("low_fat", False) and feats["fat_ratio"] > 0.35:
            score -= 0.3

        # Region / cuisine / main-type preferences
        score += 0.5 * feats["region_match"]
        score += 0.4 * feats["preferred_course_overlap"]
        score += 0.3 * feats["preferred_main_overlap"]

        # Cooking time preference
        score += 0.3 * feats["within_cooking_time"]

        # Missing ingredients penalty
        # Minor penalty for missing main ingredients (after coarse filtering this is usually small)
        score -= 0.2 * feats["missing_main_count"]

        # Negative totals are clamped so downstream filtering on > 0 is clean.
        return max(score, 0.0)

    # Apply scoring over the coarse candidate DataFrame
    df = df.copy()
    df["match_score"] = df.apply(score, axis=1)
    df = df[df["match_score"] > 0]
    if df.empty:
        return df
    df = df.sort_values("match_score", ascending=False).reset_index(drop=True)

    return df
300
+
301
+
302
def ml_generate_candidates(coarse_candidates, user_parents, user_profile, model_path, topk=5):
    """
    Step 3: ML-based reranking (directly after Step 2).
    Instead of rule-based prefiltering, use the coarse-ranked candidates (Step 2 output),
    build features in the same format as training, and apply the trained ML model to rerank.

    Parameters
    ----------
    coarse_candidates : list[dict] or pandas.DataFrame
        Output of the coarse ranking stage.
    user_parents : iterable[str]
        Parent-level ingredients the user has on hand.
    user_profile : dict
        User profile passed through to build_features.
    model_path : str
        Path to the joblib-serialized ranking model.
    topk : int
        Number of top candidates to return.

    Returns
    -------
    pandas.DataFrame
        Top-k candidates sorted by 'ml_score' (min-max normalized to [0, 1]
        when scores differ); empty frame when there is no input.
    """

    # Handle empty input
    if coarse_candidates is None or len(coarse_candidates) == 0:
        print("No candidates provided for ML reranking.")
        return pd.DataFrame()

    # If input is a list of dicts (from coarse_rank_candidates), convert to DataFrame
    if isinstance(coarse_candidates, list):
        df = pd.DataFrame(coarse_candidates)
    else:
        df = coarse_candidates.copy()

    if df.empty:
        print("Coarse candidates DataFrame is empty.")
        return df

    # Load trained model (joblib deserializes arbitrary objects — only load
    # model files produced by this project)
    model = joblib.load(model_path)

    # Build feature DataFrame mirroring the training-time layout
    feature_rows = []
    for _, row in df.iterrows():
        recipe_dict = {
            "main": row.get("main_parent", set()),
            "staple": row.get("staple_parent", set()),
            "other": row.get("other_parent", set()),
            "seasoning": row.get("seasoning_parent", set()),
            "matched_main": len(row.get("main_parent", set()) & set(user_parents)),
            "matched_staple": len(row.get("staple_parent", set()) & set(user_parents)),
            "matched_other": len(row.get("other_parent", set()) & set(user_parents)),
            "calories": row.get("calories", 0),
            "protein": row.get("protein", 0),
            "fat": row.get("fat", 0),
            "region": row.get("region", ""),
            "cuisine_attr": row.get("cuisine_attr", []),
            "ingredients": row.get("ingredients", []),
            "minutes": row.get("minutes", None),
        }
        feats = build_features(recipe_dict, user_profile)
        feature_rows.append(feats)

    feature_df = pd.DataFrame(feature_rows)

    # Predict ML scores (classifiers use P(class 1); rankers use raw output)
    if hasattr(model, "predict_proba"):
        df["ml_score"] = model.predict_proba(feature_df)[:, 1]
    else:
        df["ml_score"] = model.predict(feature_df)

    # normalize to 0-1 (skipped when all scores are equal — raw values remain)
    if len(df) > 0 and df["ml_score"].max() > df["ml_score"].min():
        df["ml_score"] = (df["ml_score"] - df["ml_score"].min()) / (df["ml_score"].max() - df["ml_score"].min())

    # Sort by ML score and return top-k candidates
    return df.sort_values("ml_score", ascending=False).head(topk).reset_index(drop=True)
363
+
364
+
365
+
recipe_recommendation/src/coldstart.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import ast
3
+ import json
4
+ import random
5
+ import pandas as pd
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ import warnings
9
+
10
+ from .candidate import coarse_rank_candidates, hard_filter, rule_generate_candidates
11
+ from .feature import build_features
12
+ from .io import load_recipes_csv, load_ingredient_map
13
+
14
+ RECIPES_PATH = load_recipes_csv()
15
+ INGREDIENT_MAP = load_ingredient_map()
16
+ PARENTS = INGREDIENT_MAP["parents"]
17
+ CHILDREN = INGREDIENT_MAP["children"]
18
+
19
def parse_list(x):
    """
    Convert a stringified list into a Python list safely.

    Args:
        x: A list (returned as-is), a string such as "['a', 'b']", or a
           missing value (None / NaN / empty string).

    Returns:
        list: The parsed list, or [] when the value is missing or unparsable.
    """
    # Check for an actual list BEFORE pd.isna(): pd.isna on a list evaluates
    # element-wise and returns an ndarray, which raises ValueError inside
    # the boolean `or` below (the original ordering crashed on list input).
    if isinstance(x, list):
        return x
    if pd.isna(x) or x == "":
        return []
    try:
        return ast.literal_eval(x)
    except Exception:
        # Unparsable text is treated as "no data" rather than raising.
        return []
29
+
30
def parse_set(x):
    """
    Convert a stringified collection into a Python set safely.

    Args:
        x: A set (returned as-is), a list/tuple (converted), a string such
           as "['a', 'b']" or "a", a missing value (None / NaN / empty
           string), or any other scalar (wrapped in a singleton set).

    Returns:
        set: The parsed set; empty when the value is missing.
    """
    # Container checks must come BEFORE pd.isna(): pd.isna on a list/tuple
    # evaluates element-wise and returns an ndarray, which raises ValueError
    # inside the boolean `or` below (the original ordering crashed on them).
    if isinstance(x, set):
        return x
    if isinstance(x, (list, tuple)):
        return set(x)
    if pd.isna(x) or x == "":
        return set()
    if isinstance(x, str):
        try:
            v = ast.literal_eval(x)
            if isinstance(v, (list, tuple, set)):
                return set(v)
            return {v}
        except Exception:
            # Plain, unquoted text: treat the whole string as one element.
            return {x.strip()}
    # Any other scalar becomes a singleton set.
    return {x}
47
+
48
+ def _parents_pool_from_df(df: pd.DataFrame):
49
+ cols = ["main_parent", "staple_parent", "other_parent", "seasoning_parent"]
50
+ pool = set()
51
+ for c in cols:
52
+ if c in df.columns:
53
+ for s in df[c]:
54
+ pool |= set(s) if isinstance(s, (set, list, tuple)) else set()
55
+ return sorted(pool)
56
+
57
+
58
def sample_user_parents(parents_pool,
                        user_profile=None,
                        prev_inventory=None,
                        min_items=3, max_items=10,
                        keep_ratio=0.6, reset_interval=20, round_idx=0):
    """
    Sample a synthetic pantry (list of parent ingredients) for one cold-start
    round. Preferred mains are drawn with 3x weight, disliked/forbidden
    parents are excluded, and a fraction of the previous round's inventory is
    carried over unless this round is a periodic reset.
    """
    prefs = (user_profile or {}).get("other_preferences", {})
    liked = set(prefs.get("preferred_main", []))
    disliked = set(prefs.get("disliked_main", []))
    banned = set((user_profile or {}).get("forbidden_parents", [])) | disliked

    # Build the weighted sampling pool, up-weighting preferred mains.
    pool = []
    weights = []
    for parent in parents_pool:
        if parent in banned:
            continue
        pool.append(parent)
        weights.append(3.0 if parent in liked else 1.0)
    if not pool:
        # Everything was excluded: fall back to the full pool, unweighted.
        pool = parents_pool[:]
        weights = [1.0] * len(parents_pool)

    inventory = set()
    # Carry over part of the previous pantry except on periodic reset rounds.
    if prev_inventory and round_idx % reset_interval != 0:
        carried = list(prev_inventory)
        random.shuffle(carried)
        keep_count = max(0, int(len(carried) * keep_ratio))
        inventory.update(carried[:keep_count])

    # Top up the inventory to a random target size with weighted draws.
    target_size = random.randint(min_items, max_items)
    missing = max(0, target_size - len(inventory))
    for _ in range(min(missing, len(pool))):
        pick = random.choices(range(len(pool)), weights=weights, k=1)[0]
        inventory.add(pool[pick])
    return list(inventory)
89
+
90
+
91
+ def _weighted_pick3(indexes, scores, temperature=1.0):
92
+ idxs = list(indexes)
93
+ scs = np.array(scores, dtype=float)
94
+ if np.any(scs < 0):
95
+ scs = scs - scs.min()
96
+ if scs.sum() == 0:
97
+ scs = np.ones_like(scs)
98
+ picks = []
99
+ for _ in range(min(3, len(idxs))):
100
+ probs = np.exp(scs / max(temperature, 1e-6))
101
+ probs = probs / probs.sum()
102
+ choice = np.random.choice(len(idxs), p=probs)
103
+ picks.append(idxs[choice])
104
+ idxs.pop(choice)
105
+ scs = np.delete(scs, choice)
106
+ if len(idxs) == 0:
107
+ break
108
+ return picks
109
+
110
+
111
+ # ---------- Main cold-start ----------
112
+ # ---------- Main cold-start ----------
113
def cold_start_ranker(user_id: str,
                      n_rounds: int = 10000,
                      topn_coarse: int = 5000,
                      topk_rule: int = 5,
                      batch_size: int = 5000,
                      switch_interval: int = 100):
    """
    Cold-start data generation for learning-to-rank.
    Top-5 selection prioritizes user pantry coverage deterministically:
      1. Fully covered recipes first (missing_count == 0)
      2. Then few missing (esp. staple/other)
      3. Heavy penalty for missing main ingredients.

    Parameters
    ----------
    user_id : str
        User folder name under user_data/ (must contain user_profile.json).
    n_rounds : int
        Number of simulated pantry-sampling rounds (one qid each).
    topn_coarse : int
        Max candidates retained by the coarse ranking per round.
    topk_rule : int
        Candidates kept from rule reranking per round (rounds with fewer
        are skipped).
    batch_size : int
        Approximate recipes per chunk; chunks rotate every switch_interval
        rounds to vary the candidate pool.
    switch_interval : int
        Rounds between chunk switches.

    Returns
    -------
    str
        Path of the written user_features_rank.csv (returned early, without
        recomputation, if it already exists).
    """
    base_dir = os.path.join("user_data", user_id)
    os.makedirs(base_dir, exist_ok=True)
    profile_path = os.path.join(base_dir, "user_profile.json")
    features_path = os.path.join(base_dir, "user_features_rank.csv")

    # Idempotence: never regenerate an existing feature file.
    if os.path.exists(features_path):
        print(f"[cold_start] Features already exist at {features_path}")
        return features_path

    with open(profile_path, "r", encoding="utf-8") as f:
        user_profile = json.load(f)

    # Load and parse recipes (CSV stores collections as stringified literals)
    df_all = pd.read_csv(RECIPES_PATH)
    to_set = ["main_parent", "staple_parent", "other_parent", "seasoning_parent", "cuisine_attr"]
    to_list = ["ingredients"]
    for c in to_set:
        if c in df_all.columns:
            df_all[c] = df_all[c].apply(parse_set)
    for c in to_list:
        if c in df_all.columns:
            df_all[c] = df_all[c].apply(parse_list)

    # Step 1 hard filter (best-effort: a failure is logged and skipped,
    # leaving the unfiltered frame in place)
    if hard_filter is not None:
        try:
            before = len(df_all)
            mask = df_all.apply(lambda r: hard_filter(r.to_dict(), user_profile), axis=1)
            df_all = df_all[mask]
            after = len(df_all)
            print(f"[cold_start] Step1 hard filter applied: {before} -> {after}")
        except Exception as e:
            warnings.warn(f"[cold_start] hard_filter failed, skip. err={e}")

    # Split the catalog into rotating chunks to diversify candidates per round.
    n_chunks = (len(df_all) // batch_size) + 1
    chunks = np.array_split(df_all, n_chunks)
    parents_pool = _parents_pool_from_df(df_all)
    rows = []
    prev_inventory = None

    for i in tqdm(range(n_rounds), desc="Cold-start rounds"):
        chunk_id = (i // switch_interval) % n_chunks
        df_chunk = chunks[chunk_id].copy()

        # pantry sampling (carries over part of the previous round's pantry)
        user_parents = sample_user_parents(
            parents_pool,
            user_profile=user_profile,
            prev_inventory=prev_inventory,
            round_idx=i
        )
        prev_inventory = user_parents

        # Step 2: coarse recall
        coarse_list = coarse_rank_candidates(
            recipes=df_chunk.to_dict(orient="records"),
            user_parents=user_parents,
            user_profile=user_profile,
            top_n=min(topn_coarse, len(df_chunk))
        )
        if not coarse_list:
            continue

        coarse_df = pd.DataFrame(coarse_list)

        # Step 3: rule rerank → Top-5 candidates (just for selecting the 5)
        rule_df = rule_generate_candidates(
            coarse_df,
            user_parents=user_parents,
            user_profile=user_profile
        )
        if rule_df.empty or len(rule_df) < topk_rule:
            continue

        top5 = rule_df.head(topk_rule).copy()

        # ===== New deterministic scoring with main priority =====
        # Rank the top-5 by weighted missing-ingredient count: a missing
        # main costs 10, staple 2, other 1; ties break on total missing.
        user_set = set(user_parents)
        weighted_scores = []
        for idx, row in top5.iterrows():
            main_set = set(row.get("main_parent", set()))
            staple_set = set(row.get("staple_parent", set()))
            other_set = set(row.get("other_parent", set()))

            main_missing = len(main_set - user_set)
            staple_missing = len(staple_set - user_set)
            other_missing = len(other_set - user_set)

            weighted_missing = 10 * main_missing + 2 * staple_missing + 1 * other_missing
            total_missing = main_missing + staple_missing + other_missing

            weighted_scores.append((idx, weighted_missing, total_missing))

        sorted_pairs = sorted(weighted_scores, key=lambda x: (x[1], x[2]))
        picked_idxs = [idx for idx, _, _ in sorted_pairs[:3]]

        # relevance 3 / 2 / 1 for the best three; the rest stay 0
        labels = {idx: 0 for idx in top5.index}
        if len(picked_idxs) > 0:
            labels[picked_idxs[0]] = 3
        if len(picked_idxs) > 1:
            labels[picked_idxs[1]] = 2
        if len(picked_idxs) > 2:
            labels[picked_idxs[2]] = 1

        # build features for all 5 candidates (one training row each)
        for idx, row in top5.iterrows():
            up = set(user_parents)
            main_set = set(row.get("main_parent", set()))
            staple_set = set(row.get("staple_parent", set()))
            other_set = set(row.get("other_parent", set()))

            recipe_dict = {
                "main": main_set,
                "staple": staple_set,
                "other": other_set,
                "seasoning": set(row.get("seasoning_parent", set())),
                "matched_main": len(main_set & up),
                "matched_staple": len(staple_set & up),
                "matched_other": len(other_set & up),
                "calories": row.get("calories", 0),
                "protein": row.get("protein", 0),
                "fat": row.get("fat", 0),
                "region": row.get("region", ""),
                "cuisine_attr": row.get("cuisine_attr", []),
                "ingredients": row.get("ingredients", []),
                "minutes": row.get("minutes", None),
            }

            feats = build_features(recipe_dict, user_profile)
            feats["relevance"] = float(labels[idx])
            feats["qid"] = int(i)
            rows.append(feats)

    out = pd.DataFrame(rows)
    # Drop degenerate query groups (a single row cannot be ranked).
    valid_qids = out.groupby("qid").size()
    keep_qids = valid_qids[valid_qids > 1].index
    out = out[out["qid"].isin(keep_qids)].reset_index(drop=True)

    out_path = os.path.join("user_data", user_id, "user_features_rank.csv")
    out.to_csv(out_path, index=False)
    print(f"[cold_start] Saved {len(out)} rows to {out_path}")
    return out_path
269
+
270
+
271
if __name__ == "__main__":
    # Manual entry point: generate cold-start ranking features for user_1.
    # Fixed: the previous invocation passed coverage_penalty= and
    # temperature=, which are not parameters of cold_start_ranker() and
    # raised a TypeError before any work was done.
    cold_start_ranker(
        user_id="user_1",
        n_rounds=10000,
        topn_coarse=20000,
        topk_rule=5,
    )
recipe_recommendation/src/embedding.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import numpy as np
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+
6
def profile_to_embedding(profile):
    """
    Encode a normalized user profile as a fixed-length numeric vector.

    Layout (28 dims total):
        [diet one-hot (3)] + [allergies multi-hot (6)] + [regions multi-hot (6)]
        + [nutrition goals (4)] + [preferred mains multi-hot (8)] + [cooking time (1)]
    """
    diet_types = ["vegetarian", "flexible", "non_vegetarian"]
    allergy_vocab = ["milk", "gluten", "peanut", "shrimp", "egg", "soy"]
    region_vocab = ["North America", "Latin America", "Europe", "Asia", "Middle East", "Africa"]
    main_vocab = ["chicken", "tofu", "beef", "salmon", "eggs", "pork", "beans", "mushroom"]

    # 1. Diet one-hot ("flexible" is the fallback; unknown values map to all zeros).
    diet_value = profile.get("diet", {}).get("vegetarian_type", "flexible")
    diet_vec = np.zeros(len(diet_types))
    if diet_value in diet_types:
        diet_vec[diet_types.index(diet_value)] = 1

    # 2–3. Membership multi-hot vectors for allergies and preferred regions.
    allergies = set(profile.get("allergies", []))
    regions = set(profile.get("region_preference", []))
    allergy_vec = np.array([1 if item in allergies else 0 for item in allergy_vocab])
    region_vec = np.array([1 if item in regions else 0 for item in region_vocab])

    # 4. Calorie/protein goals squashed to [0, 1] by fixed caps (4000 kcal, 300 g).
    goals = profile.get("nutritional_goals", {})
    cal_goal = goals.get("calories", {})
    pro_goal = goals.get("protein", {})
    goal_vec = np.array([
        cal_goal.get("min", 0) / 4000,
        min(cal_goal.get("max", 9999), 4000) / 4000,
        pro_goal.get("min", 0) / 300,
        min(pro_goal.get("max", 999), 300) / 300,
    ])

    # 5. Preferred main ingredients multi-hot.
    mains = set(profile.get("other_preferences", {}).get("preferred_main", []))
    main_vec = np.array([1 if item in mains else 0 for item in main_vocab])

    # 6. Max cooking time, normalized against a 120-minute ceiling (0 when unset).
    limit = profile.get("other_preferences", {}).get("cooking_time_max")
    time_vec = np.array([min(limit / 120, 1)]) if limit is not None else np.array([0])

    return np.concatenate([diet_vec, allergy_vec, region_vec, goal_vec, main_vec, time_vec])
60
+
61
+
62
def profile_similarity(profile_a, profile_b):
    """Return the cosine similarity between the embeddings of two user profiles."""
    vec_a = profile_to_embedding(profile_a)
    vec_b = profile_to_embedding(profile_b)
    return cosine_similarity(vec_a.reshape(1, -1), vec_b.reshape(1, -1))[0, 0]
67
+
68
def find_most_similar_user(target_user_id, user_data_dir="recipe_recommendation/user_data", threshold=0.85):
    """
    Scan stored user profiles and return the closest match to the target user.

    Returns:
        (best_match_user_id, similarity_score) when the best cosine similarity
        meets `threshold`, otherwise (None, -1).

    Raises:
        FileNotFoundError: if the target user has no saved profile.
    """
    target_path = os.path.join(user_data_dir, target_user_id, "user_profile.json")
    if not os.path.exists(target_path):
        raise FileNotFoundError(f"[embedding] No profile found for user {target_user_id}")

    with open(target_path, "r", encoding="utf-8") as fh:
        target_emb = profile_to_embedding(json.load(fh)).reshape(1, -1)

    best_uid, best_sim = None, -1

    # Compare against every other user directory that has a saved profile.
    for candidate_uid in os.listdir(user_data_dir):
        if candidate_uid == target_user_id:
            continue
        candidate_path = os.path.join(user_data_dir, candidate_uid, "user_profile.json")
        if not os.path.exists(candidate_path):
            continue
        with open(candidate_path, "r", encoding="utf-8") as fh:
            candidate_profile = json.load(fh)
        candidate_emb = profile_to_embedding(candidate_profile).reshape(1, -1)
        score = cosine_similarity(target_emb, candidate_emb)[0, 0]
        if score > best_sim:
            best_uid, best_sim = candidate_uid, score

    if best_uid and best_sim >= threshold:
        print(f"[embedding] Found similar user: {best_uid} (similarity={best_sim:.3f})")
        return best_uid, best_sim

    return None, -1
recipe_recommendation/src/feature.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from .io import load_ingredient_map
3
+ import numpy as np
4
+
5
+ # Load ingredient map globally to avoid repeated I/O
6
+ INGREDIENT_MAP = load_ingredient_map()
7
+ PARENTS = INGREDIENT_MAP["parents"]
8
+ CHILDREN = INGREDIENT_MAP["children"]
9
+
10
+
11
def is_recipe_vegetarian_safe(ingredients: list[str], veg_type: str) -> bool:
    """
    Return True when every mapped ingredient is allowed for the given diet.

    Supported veg_type values: "vegan", "vegetarian", "flexible_vegetarian",
    or "" (no restriction). Ingredients absent from the ingredient map are
    treated as safe by default.
    """
    for raw in ingredients:
        name = raw.strip().lower()
        # Prefer the specific (child) entry; fall back to the parent entry.
        info = CHILDREN.get(name)
        if info is None:
            info = PARENTS.get(name)
        if info is None:
            # Unknown ingredient: assume safe.
            continue

        if veg_type == "vegan":
            if not info.get("vegan_safe", True):
                return False
        elif veg_type in ("vegetarian", "flexible_vegetarian"):
            # Flexible vegetarians use vegetarian_safe as a proxy: anything
            # flagged non-vegetarian (explicit meat) is rejected.
            if not info.get("vegetarian_safe", True):
                return False
    return True
36
+
37
+
38
def build_features(recipe: dict, user_profile: dict) -> dict:
    """
    Derive numeric features for one (recipe, user) pair.

    Used by both the ML ranker and the rule-based scorer; every value is a
    scalar (ratio, count, or 0/1 flag). Key insertion order is preserved so
    downstream consumers relying on column order are unaffected.
    """
    other_prefs = user_profile.get("other_preferences", {})

    # Ingredient coverage: how much of each group the user's pantry matches.
    n_main = len(recipe.get("main", []))
    n_other = len(recipe.get("other", []))
    n_staple = len(recipe.get("staple", []))
    hit_main = recipe.get("matched_main", 0)
    hit_other = recipe.get("matched_other", 0)
    hit_staple = recipe.get("matched_staple", 0)

    calories = recipe.get("calories", 0)
    protein = recipe.get("protein", 0)
    fat = recipe.get("fat", 0)

    features = {
        "main_match_ratio": hit_main / max(n_main, 1),
        "other_match_ratio": hit_other / max(n_other, 1),
        "staple_match_ratio": hit_staple / max(n_staple, 1),
        "missing_main_count": n_main - hit_main,
        "missing_other_count": n_other - hit_other,
        "missing_staple_count": n_staple - hit_staple,
        # Raw nutrition plus per-calorie densities.
        "calories": calories,
        "protein": protein,
        "fat": fat,
        "protein_ratio": protein / max(calories, 1),
        "fat_ratio": fat / max(calories, 1),
    }

    # Regional preference: recipe region may be a single string or a set of tags.
    recipe_region = recipe.get("region", "")
    preferred_regions = user_profile.get("preferred_regions", [])
    if isinstance(recipe_region, set):
        features["region_match"] = int(any(r in preferred_regions for r in recipe_region))
    else:
        features["region_match"] = int(recipe_region in preferred_regions)

    # Diet-safety flags: three absolute checks plus one tied to this user's diet.
    ingredients_all = recipe.get("ingredients", [])
    features["is_vegan_safe"] = int(is_recipe_vegetarian_safe(ingredients_all, "vegan"))
    features["is_vegetarian_safe_absolute"] = int(
        is_recipe_vegetarian_safe(ingredients_all, "vegetarian")
    )
    features["is_flexible_safe_absolute"] = int(
        is_recipe_vegetarian_safe(ingredients_all, "flexible_vegetarian")
    )
    veg_type = (user_profile.get("diet", {}).get("vegetarian_type", "") or "").lower()
    features["is_user_diet_safe"] = int(is_recipe_vegetarian_safe(ingredients_all, veg_type))

    # 1 when the recipe stays at or below the user's calorie threshold.
    features["low_calorie_penalty"] = int(calories <= user_profile.get("calorie_threshold", 9999))

    # Overlap with the user's preferred main ingredients and course types.
    features["preferred_main_overlap"] = len(
        set(recipe.get("main", [])) & set(other_prefs.get("preferred_main", []))
    )
    features["preferred_course_overlap"] = len(
        set(recipe.get("cuisine_attr", [])) & set(user_profile.get("preferred_course_types", []))
    )

    # Cooking-time fit; a falsy limit (None/0) disables the check.
    time_limit = other_prefs.get("cooking_time_max", None)
    if time_limit:
        features["within_cooking_time"] = int(recipe.get("minutes", 9999) <= time_limit)
    else:
        features["within_cooking_time"] = 1

    return features
122
+
123
def build_cluster_features(candidates):
    """
    Build a binary (multi-hot) feature matrix for KMeans clustering.

    Columns are the union of main/staple/other parent ingredients plus
    cuisine attributes observed across `candidates`, concatenated in that
    order with each vocabulary sorted alphabetically. This is independent
    of the features used for model training.

    Args:
        candidates (list[dict]): recipe dicts.

    Returns:
        np.ndarray: uint8 matrix of shape (len(candidates), num_features).
    """
    field_names = ("main_parent", "staple_parent", "other_parent", "cuisine_attr")

    # 1. Collect a sorted vocabulary per field.
    vocabs = []
    for field in field_names:
        values = set()
        for recipe in candidates:
            values.update(recipe.get(field, []) or [])
        vocabs.append(sorted(values))

    # 2. Assign each (field, value) pair a distinct column index.
    index_maps = []
    offset = 0
    for vocab in vocabs:
        index_maps.append({value: offset + i for i, value in enumerate(vocab)})
        offset += len(vocab)

    # 3. Fill the multi-hot matrix.
    X = np.zeros((len(candidates), offset), dtype=np.uint8)
    for row, recipe in enumerate(candidates):
        for field, col_of in zip(field_names, index_maps):
            for value in recipe.get(field, []) or []:
                if value in col_of:
                    X[row, col_of[value]] = 1

    return X
recipe_recommendation/src/highlight.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.cluster import KMeans
3
+ from sklearn.preprocessing import StandardScaler
4
+ import numpy as np
5
+
6
+
7
def print_candidates(candidates, user_parents, topk=10):
    """
    Pretty-print the top-k candidate recipes with pantry-match annotations.

    Each recipe shows its score (scaled so the best candidate reads ~100%),
    region/cuisine when available, calories, and its ingredient lists marked
    ✅/❌ depending on whether the parent ingredient is in `user_parents`.

    Args:
        candidates (pd.DataFrame): ranked recipes; must contain 'match_score',
            'name', and the ingredient/metadata columns referenced below.
        user_parents: collection of parent-ingredient names the user has.
        topk (int): number of rows to print.
    """
    # Fix: removed the unused `min_score` and `shown` locals (dead code).
    max_score = candidates['match_score'].max()

    for _, row in candidates.head(topk).iterrows():
        # Scale against the best score; epsilon guards a zero max.
        scaled_score = 100 * row['match_score'] / (max_score + 1e-9)
        print(f"{row['name']} (score {scaled_score:.1f}%)")

        # ----- Region (skip missing/placeholder values) -----
        region = row.get("region", None)
        if pd.notna(region) and isinstance(region, str) and region.strip() and region.lower() != "unavailable":
            print(f"  region: {region}")

        # ----- Cuisine attributes (may arrive as set, list, or single string) -----
        cuisine = row.get("cuisine_attr", None)
        if cuisine is not None and not (isinstance(cuisine, float) and pd.isna(cuisine)):
            if isinstance(cuisine, set):
                cuisine = list(cuisine)
            elif isinstance(cuisine, str):
                cuisine = [cuisine]

            if isinstance(cuisine, list) and len(cuisine) > 0:
                print(f"  cuisine: {', '.join(cuisine)}")

        # ----- Nutrition -----
        print(f"  calories: {row.get('calories', 'N/A')}")

        # ----- Ingredients, marked by pantry availability -----
        def mark_list(lst):
            return [("✅ " + ing) if ing in user_parents else ("❌ " + ing) for ing in lst]

        print(f"  staple: {mark_list(row.get('staple_parent', []))}")
        print(f"  main: {mark_list(row.get('main_parent', []))}")
        print(f"  seasoning: {row.get('seasoning_parent', [])}")
        print(f"  other: {mark_list(row.get('other_parent', []))}")
        print("-" * 40)
47
+
48
def diversify_topk_with_min_clusters(
    ranked_candidates,
    feature_matrix,
    top_k=5,
    n_clusters=20,
    min_clusters=3,
    random_state=42
):
    """
    Select up to `top_k` recipes while guaranteeing cluster diversity.

    Candidates are clustered with KMeans on `feature_matrix` (rows aligned
    with `ranked_candidates`). A first pass walks the ranked list keeping one
    representative per newly-seen cluster until `min_clusters` distinct
    clusters (or `top_k` picks) are reached; the remainder is then filled
    purely by rank order.
    """
    if len(ranked_candidates) == 0:
        return []

    effective_clusters = min(n_clusters, len(ranked_candidates))
    standardized = StandardScaler().fit_transform(feature_matrix)
    labels = KMeans(
        n_clusters=effective_clusters, n_init='auto', random_state=random_state
    ).fit_predict(standardized)

    # Pass 1: one pick per distinct cluster, in rank order.
    selection = []
    seen_clusters = set()
    for pos, label in enumerate(labels):
        if label in seen_clusters:
            continue
        selection.append(ranked_candidates[pos])
        seen_clusters.add(label)
        if len(seen_clusters) >= min_clusters or len(selection) >= top_k:
            break

    # Pass 2: top up with the highest-ranked candidates not yet chosen.
    if len(selection) < top_k:
        for pos in range(len(labels)):
            candidate = ranked_candidates[pos]
            if candidate not in selection:
                selection.append(candidate)
                if len(selection) >= top_k:
                    break

    return selection
90
+
91
+
recipe_recommendation/src/io.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from huggingface_hub import hf_hub_download
4
+
5
+ # Hugging Face ID
6
+ REPO_ID = "Iris314/recipe-cleaned"
7
+
8
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
9
+ DATA_DIR = os.path.join(ROOT_DIR, "data")
10
+ os.makedirs(DATA_DIR, exist_ok=True)
11
+
12
+
13
def download_file(filename: str) -> str:
    """
    Ensure `filename` from the Hugging Face dataset repo exists locally.

    Downloads the file into DATA_DIR on first use; later calls reuse the
    cached copy.

    Args:
        filename: file name inside the dataset repo (e.g. "recipes.csv").

    Returns:
        The local path of the file under DATA_DIR.
    """
    local_path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(local_path):
        # Bug fix: these were constant f-strings that printed the literal
        # text "(unknown)" — interpolate the actual file name instead.
        print(f"Downloading {filename} from Hugging Face Hub...")
        hf_hub_download(
            repo_id=REPO_ID,
            filename=filename,
            repo_type="dataset",
            local_dir=DATA_DIR,
            local_dir_use_symlinks=False
        )
    else:
        print(f"{filename} already exists locally.")
    return local_path
28
+
29
+
30
def load_recipes_csv() -> str:
    """Return the local path of "recipes.csv", downloading it on first use."""
    return download_file("recipes.csv")
32
+
33
+
34
def load_ingredient_map() -> dict:
    """Download "ingredient_map.data" if needed and parse it as JSON."""
    map_path = download_file("ingredient_map.data")
    with open(map_path, "r", encoding="utf-8") as handle:
        data = json.load(handle)
    return data
recipe_recommendation/src/trainmodel.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import joblib
3
+ import warnings
4
+ import numpy as np
5
+ import pandas as pd
6
+ from typing import List, Tuple, Sequence, Optional
7
+ from xgboost import XGBRanker
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.metrics import ndcg_score
10
+ from pandas.api.types import is_numeric_dtype
11
+
12
+
13
+ # ----------------------------- Helpers -----------------------------
14
+ def _pick_feature_cols(df: pd.DataFrame, drop_cols: Sequence[str]) -> List[str]:
15
+ """
16
+ Pick numeric feature columns robustly, excluding drop_cols.
17
+ Uses pandas is_numeric_dtype to correctly include nullable ints/floats/bools.
18
+ """
19
+ cols = []
20
+ for c in df.columns:
21
+ if c in drop_cols:
22
+ continue
23
+ if is_numeric_dtype(df[c]):
24
+ cols.append(c)
25
+ return cols
26
+
27
+
28
+ def _sort_and_pack_by_qid(
29
+ X: pd.DataFrame, y: pd.Series, qid: pd.Series, feature_cols: List[str]
30
+ ) -> Tuple[pd.DataFrame, np.ndarray, List[int], np.ndarray]:
31
+ """
32
+ Sort rows by qid so that group sizes match the sample order.
33
+ Returns:
34
+ X_sorted, y_sorted, groups, qid_sorted (aligned with X_sorted/y_sorted)
35
+ """
36
+ packed = X.copy()
37
+ packed["_label"] = y.values
38
+ packed["_qid"] = qid.values
39
+ packed = packed.sort_values("_qid").reset_index(drop=True)
40
+
41
+ groups = packed.groupby("_qid").size().tolist()
42
+ X_sorted = packed[feature_cols].copy()
43
+ y_sorted = packed["_label"].astype(float).values
44
+ qid_sorted = packed["_qid"].values
45
+ return X_sorted, y_sorted, groups, qid_sorted
46
+
47
+
48
def _eval_mean_ndcg(
    model: XGBRanker,
    X_val: pd.DataFrame,
    y_val,  # np.ndarray or pd.Series
    qid_val,  # aligned with X_val/y_val
    ks: Sequence[int] = (5, 10),
) -> dict:
    """
    Compute the mean NDCG@k over validation queries for each k in `ks`.

    Queries with fewer than two rows are skipped (NDCG is undefined there).
    Accepts numpy arrays or pandas Series for labels/qids.
    """
    try:
        # xgboost >= 2.0: respect the early-stopping best iteration if present.
        scores = model.predict(X_val, iteration_range=(0, model.best_iteration + 1))
    except Exception:
        scores = model.predict(X_val)

    labels = np.asarray(y_val)
    queries = np.asarray(qid_val)

    metrics = {}
    for k in ks:
        per_query = []
        for query in np.unique(queries):
            sel = (queries == query)
            if sel.sum() >= 2:
                per_query.append(ndcg_score([labels[sel]], [scores[sel]], k=k))
        metrics[f"NDCG@{k}"] = float(np.mean(per_query)) if per_query else 0.0
    return metrics
78
+
79
+
80
+
81
+ # ----------------------------- Main Trainer -----------------------------
82
def train_model_ranker(
    user_id: str = "user_1",
    features_path: Optional[str] = None,
    save_model: bool = True,
    model_params: Optional[dict] = None,
    val_ratio: float = 0.2,
    random_state: int = 42,
    max_rows: Optional[int] = None,
):
    """
    Train an XGBoost Learning-to-Rank model (XGBRanker) on cold-start generated data.

    Expected input CSV (from cold_start.py):
        - qid: query id (one round of pantry sampling = one query)
        - relevance: graded relevance label (e.g., 3/2/1/0)
        - features: numeric columns produced by build_features (and any extra numeric signals)

    The function:
        1) Reads the CSV
        2) Selects numeric feature columns robustly
        3) Splits train/val by qid to avoid leakage
        4) Sorts each split by qid and builds group sizes aligned to sample order
        5) Trains XGBRanker and reports NDCG@5/10
        6) Saves model to user_data/<user_id>/ranker.pkl

    Args:
        user_id: sub-directory under user_data/ holding this user's files.
        features_path: explicit CSV path; defaults to
            user_data/<user_id>/user_features_rank.csv.
        save_model: persist the fitted ranker to ranker.pkl when True.
        model_params: overrides merged into the default XGBRanker params.
        val_ratio: fraction of unique qids held out for validation.
        random_state: seed for the qid split, sub-sampling, and the model.
        max_rows: optional row cap (random sample) for quick iterations.

    Returns:
        (model, metrics, feature_cols): the fitted XGBRanker, the NDCG
        metrics dict from _eval_mean_ndcg, and the feature column names used.

    Raises:
        FileNotFoundError: when the cold-start feature CSV is missing.
        ValueError: when required columns or numeric features are absent.
    """
    base_dir = os.path.join("user_data", user_id)
    os.makedirs(base_dir, exist_ok=True)

    # Resolve features path
    if features_path is None:
        features_path = os.path.join(base_dir, "user_features_rank.csv")
    if not os.path.exists(features_path):
        raise FileNotFoundError(
            f"[train_model_ranker] Cold-start features not found at: {features_path}\n"
            f"Please run cold_start_ranker(user_id='{user_id}') first."
        )

    # Load data
    df = pd.read_csv(features_path)
    if max_rows is not None and len(df) > max_rows:
        df = df.sample(max_rows, random_state=random_state).reset_index(drop=True)

    # Basic validation
    if "qid" not in df.columns or "relevance" not in df.columns:
        raise ValueError("Input CSV must contain 'qid' and 'relevance' columns.")

    # Fill NaNs in label/qid (should not happen, but defensive)
    df["qid"] = pd.to_numeric(df["qid"], errors="coerce").fillna(-1).astype(int)
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(float)

    # Pick numeric feature columns robustly
    drop_cols = {"qid", "relevance"}
    feature_cols = _pick_feature_cols(df, drop_cols)
    if not feature_cols:
        raise ValueError("No numeric feature columns found in dataset.")

    # Ensure numeric + finite values only (replace inf/nan with 0)
    df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors="coerce")
    df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)

    # Split by qid to avoid leakage across queries: a query never straddles
    # the train/val boundary, so its rows are always ranked together.
    unique_qids = df["qid"].unique()
    if len(unique_qids) < 2:
        warnings.warn("Only one unique qid found — ranking training may be ineffective.")
    train_qids, val_qids = train_test_split(
        unique_qids, test_size=val_ratio, random_state=random_state
    )
    train_mask = df["qid"].isin(train_qids)
    val_mask = df["qid"].isin(val_qids)

    # Split dataframes
    X_train_raw = df.loc[train_mask, feature_cols]
    y_train_raw = df.loc[train_mask, "relevance"]
    qid_train = df.loc[train_mask, "qid"]

    X_val_raw = df.loc[val_mask, feature_cols]
    y_val_raw = df.loc[val_mask, "relevance"]
    qid_val = df.loc[val_mask, "qid"]

    # Sort by qid and build group sizes aligned with sample order (CRITICAL for XGBRanker)
    X_train, y_train, group_train, _ = _sort_and_pack_by_qid(
        X_train_raw, y_train_raw, qid_train, feature_cols
    )
    X_val, y_val, group_val, qid_val_sorted = _sort_and_pack_by_qid(
        X_val_raw, y_val_raw, qid_val, feature_cols
    )

    print(f"[ranker] #Train groups: {len(group_train)} | #Val groups: {len(group_val)}")
    print(f"[ranker] Train rows: {len(X_train)} | Val rows: {len(X_val)} | #Features: {len(feature_cols)}")

    # Default model params
    default_params = dict(
        objective="rank:ndcg",
        eval_metric="ndcg",
        n_estimators=400,
        learning_rate=0.08,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=random_state,
        tree_method="hist",
        reg_lambda=1.0,
        reg_alpha=0.0,
    )
    if model_params:
        default_params.update(model_params)

    model = XGBRanker(**default_params)

    # Fit model (XGBRanker requires group/group for eval_set as well)
    fit_kwargs = dict(
        X=X_train,
        y=y_train,
        group=group_train,
        eval_set=[(X_val, y_val)],
        eval_group=[group_val],
        verbose=False,
    )

    # Early stopping support differs across xgboost versions, so try the
    # direct kwarg first, then the callback API, then train without it.
    try:
        # Newer xgboost versions (some builds) support early_stopping_rounds on Ranker
        model.fit(early_stopping_rounds=50, **fit_kwargs)  # maximize=True is inferred by 'ndcg'
    except TypeError:
        # Fallback to callback API (older versions)
        try:
            from xgboost.callback import EarlyStopping
            model.fit(callbacks=[EarlyStopping(rounds=50, save_best=True, maximize=True)], **fit_kwargs)
        except Exception:
            # Last resort: train without early stopping
            model.fit(**fit_kwargs)

    # Evaluate mean NDCG@5/10
    metrics = _eval_mean_ndcg(model, X_val, y_val, qid_val_sorted, ks=(5, 10))

    print("[ranker] Validation metrics:", " ".join(f"{k}={v:.4f}" for k, v in metrics.items()))

    # Save model
    if save_model:
        model_path = os.path.join(base_dir, "ranker.pkl")
        joblib.dump(model, model_path)
        print(f"[ranker] Model saved to {model_path}")

    return model, metrics, feature_cols
226
+
227
+
228
if __name__ == "__main__":
    # Example run: trains a ranker from user_data/user_1/user_features_rank.csv
    # (the default resolved inside train_model_ranker).
    train_model_ranker(
        user_id="user_1",
        save_model=True,
        val_ratio=0.2,
        random_state=42,
        max_rows=None,  # or set an upper bound for quick iterations, e.g., 200_000
        model_params=None,  # override defaults if desired
    )
recipe_recommendation/user_data/demo_user_1/user_profile.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "user_id": "demo_user_1",
3
+ "num_feedback": 0,
4
+ "diet": {
5
+ "vegetarian_type": "flexible"
6
+ },
7
+ "allergies": [],
8
+ "region_preference": [
9
+ "North America"
10
+ ],
11
+ "nutritional_goals": {
12
+ "calories": {
13
+ "min": 200,
14
+ "max": 800
15
+ },
16
+ "protein": {
17
+ "min": 20,
18
+ "max": 100
19
+ }
20
+ },
21
+ "other_preferences": {
22
+ "preferred_main": [
23
+ "chicken"
24
+ ],
25
+ "disliked_main": [],
26
+ "cooking_time_max": 30
27
+ }
28
+ }
recipe_recommendation/user_data/user_0/feature_order.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "main_match_ratio",
3
+ "other_match_ratio",
4
+ "staple_match_ratio",
5
+ "missing_main_count",
6
+ "missing_other_count",
7
+ "missing_staple_count",
8
+ "calories",
9
+ "protein",
10
+ "fat",
11
+ "protein_ratio",
12
+ "fat_ratio",
13
+ "region_match",
14
+ "is_vegan_safe",
15
+ "is_vegetarian_safe_absolute",
16
+ "is_flexible_safe_absolute",
17
+ "is_user_diet_safe",
18
+ "low_calorie_penalty",
19
+ "preferred_main_overlap",
20
+ "preferred_course_overlap",
21
+ "within_cooking_time"
22
+ ]
recipe_recommendation/user_data/user_0/feedback.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ main_match_ratio,other_match_ratio,staple_match_ratio,missing_main_count,missing_other_count,missing_staple_count,calories,protein,fat,protein_ratio,fat_ratio,region_match,is_vegan_safe,is_vegetarian_safe_absolute,is_flexible_safe_absolute,is_user_diet_safe,low_calorie_penalty,preferred_main_overlap,preferred_course_overlap,within_cooking_time,recipe_id,qid,relevance
2
+ 0.0,0.0,0.0,1,3,1,123.9,0,0,0.0,0.0,0,0,0,0,1,1,0,0,1,73148,0,5
recipe_recommendation/user_data/user_0/qid.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 0
recipe_recommendation/user_data/user_0/ranker.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72a3361c05b69d3627a69983ee1460730b304b1a4c562be6fc75001ef9bd887f
3
+ size 1598006
recipe_recommendation/user_data/user_0/user_features_rank.csv ADDED
The diff for this file is too large to render. See raw diff
 
recipe_recommendation/user_data/user_0/user_profile.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "user_id": "user_0",
3
+ "num_feedback": 0,
4
+ "diet": {
5
+ "vegetarian_type": "non_vegetarian"
6
+ },
7
+ "allergies": [],
8
+ "region_preference": [
9
+ "Asia"
10
+ ],
11
+ "nutritional_goals": {
12
+ "calories": {
13
+ "min": 250,
14
+ "max": 4000
15
+ },
16
+ "protein": {
17
+ "min": 20,
18
+ "max": 160
19
+ }
20
+ },
21
+ "other_preferences": {
22
+ "preferred_main": [],
23
+ "disliked_main": [],
24
+ "cooking_time_max": 180
25
+ }
26
+ }
recipe_recommendation/user_data/user_1/feature_order.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "main_match_ratio",
3
+ "other_match_ratio",
4
+ "staple_match_ratio",
5
+ "missing_main_count",
6
+ "missing_other_count",
7
+ "missing_staple_count",
8
+ "calories",
9
+ "protein",
10
+ "fat",
11
+ "protein_ratio",
12
+ "fat_ratio",
13
+ "region_match",
14
+ "is_vegan_safe",
15
+ "is_vegetarian_safe_absolute",
16
+ "is_flexible_safe_absolute",
17
+ "is_user_diet_safe",
18
+ "low_calorie_penalty",
19
+ "preferred_main_overlap",
20
+ "preferred_course_overlap",
21
+ "within_cooking_time"
22
+ ]
recipe_recommendation/user_data/user_1/feedback.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ main_match_ratio,other_match_ratio,staple_match_ratio,missing_main_count,missing_other_count,missing_staple_count,calories,protein,fat,protein_ratio,fat_ratio,region_match,is_vegan_safe,is_vegetarian_safe_absolute,is_flexible_safe_absolute,is_user_diet_safe,low_calorie_penalty,preferred_main_overlap,preferred_course_overlap,within_cooking_time,recipe_id,qid,relevance
2
+ 0.0,0.0,0.0,1,3,1,320.2,0,0,0.0,0.0,0,0,0,0,0,1,1,0,1,44939,0,5
3
+ 0.0,0.0,0.0,1,3,1,123.9,0,0,0.0,0.0,0,0,0,0,0,1,0,0,1,73148,1,5
recipe_recommendation/user_data/user_1/qid.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 2
recipe_recommendation/user_data/user_1/ranker.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8f305ad668b45ca0bb8d6f6cb1b87ca68d26a5c495622d2df4ac38e546b2787
3
+ size 1638981
recipe_recommendation/user_data/user_1/user_features_rank.csv ADDED
The diff for this file is too large to render. See raw diff
 
recipe_recommendation/user_data/user_1/user_profile.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "user_id": "user_1",
3
+ "num_feedback": 0,
4
+ "diet": {
5
+ "vegetarian_type": "flexible"
6
+ },
7
+ "allergies": [],
8
+ "region_preference": [
9
+ "North America"
10
+ ],
11
+ "nutritional_goals": {
12
+ "calories": {
13
+ "min": 250,
14
+ "max": 2000
15
+ },
16
+ "protein": {
17
+ "min": 50,
18
+ "max": 160
19
+ }
20
+ },
21
+ "other_preferences": {
22
+ "preferred_main": [],
23
+ "disliked_main": [],
24
+ "cooking_time_max": 45
25
+ }
26
+ }
recipe_recommendation/user_data/user_2/feature_order.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "main_match_ratio",
3
+ "other_match_ratio",
4
+ "staple_match_ratio",
5
+ "missing_main_count",
6
+ "missing_other_count",
7
+ "missing_staple_count",
8
+ "calories",
9
+ "protein",
10
+ "fat",
11
+ "protein_ratio",
12
+ "fat_ratio",
13
+ "region_match",
14
+ "is_vegan_safe",
15
+ "is_vegetarian_safe_absolute",
16
+ "is_flexible_safe_absolute",
17
+ "is_user_diet_safe",
18
+ "low_calorie_penalty",
19
+ "preferred_main_overlap",
20
+ "preferred_course_overlap",
21
+ "within_cooking_time"
22
+ ]
recipe_recommendation/user_data/user_2/feedback.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ main_match_ratio,other_match_ratio,staple_match_ratio,missing_main_count,missing_other_count,missing_staple_count,calories,protein,fat,protein_ratio,fat_ratio,region_match,is_vegan_safe,is_vegetarian_safe_absolute,is_flexible_safe_absolute,is_user_diet_safe,low_calorie_penalty,preferred_main_overlap,preferred_course_overlap,within_cooking_time,recipe_id,qid,relevance
2
+ 0.0,0.0,0.0,1,2,1,1640.1,0,0,0.0,0.0,0,0,0,0,1,1,0,0,1,106901,0,5