Jesse Johnson committed on
Commit
c59d808
·
0 Parent(s):

New commit for backend deployment: 2025-09-25_13-24-03

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +96 -0
  2. Dockerfile +16 -0
  3. README.md +54 -0
  4. app.py +1 -0
  5. backend/.env.example +105 -0
  6. backend/.gitignore +141 -0
  7. backend/Dockerfile +16 -0
  8. backend/README.md +462 -0
  9. backend/app.py +193 -0
  10. backend/config/__init__.py +0 -0
  11. backend/config/database.py +61 -0
  12. backend/config/logging_config.py +82 -0
  13. backend/config/settings.py +176 -0
  14. backend/core/__init__.py +0 -0
  15. backend/core/exceptions.py +15 -0
  16. backend/data/__init__.py +0 -0
  17. backend/data/sample_recipes.json +138 -0
  18. backend/data_minning/__init__.py +1 -0
  19. backend/data_minning/all_nigerian_recipe_scraper.py +193 -0
  20. backend/data_minning/base_scrapper.py +348 -0
  21. backend/data_minning/dto/__init__.py +1 -0
  22. backend/data_minning/dto/recipe_doc.py +45 -0
  23. backend/data_minning/dto/stream_opts.py +12 -0
  24. backend/data_minning/soup_client.py +43 -0
  25. backend/data_minning/yummy_medley_scraper.py +209 -0
  26. backend/docs/README.md +85 -0
  27. backend/docs/chromadb_refresh.md +228 -0
  28. backend/docs/embedding-compatibility-guide.md +249 -0
  29. backend/docs/embedding-troubleshooting.md +132 -0
  30. backend/docs/logging_guide.md +56 -0
  31. backend/docs/model-configuration-guide.md +542 -0
  32. backend/docs/model-selection-guide.md +502 -0
  33. backend/docs/opensource-llm-configuration.md +394 -0
  34. backend/docs/optimal_recipes_structure.md +160 -0
  35. backend/docs/sanitization_guide.md +147 -0
  36. backend/docs/scraper.md +372 -0
  37. backend/docs/unified-provider-configuration.md +108 -0
  38. backend/requirements.txt +49 -0
  39. backend/services/__init__.py +1 -0
  40. backend/services/custom_mongo_vector.py +154 -0
  41. backend/services/llm_service.py +354 -0
  42. backend/services/vector_store.py +386 -0
  43. backend/tests/__init__.py +0 -0
  44. backend/tests/test_db_settings.py +53 -0
  45. backend/tests/test_llm_provider_settings.py +39 -0
  46. backend/tests/test_llm_service.py +26 -0
  47. backend/utils/__init__.py +10 -0
  48. backend/utils/helpers.py +2 -0
  49. backend/utils/request_dto/chat_response.py +4 -0
  50. backend/utils/request_dto/scrape_request.py +7 -0
.gitignore ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib64/
14
+ parts/
15
+ sdist/
16
+ var/
17
+ wheels/
18
+ *.egg-info/
19
+ .installed.cfg
20
+ *.egg
21
+ MANIFEST
22
+
23
+ # PyInstaller
24
+ *.manifest
25
+ *.spec
26
+
27
+ # Unit test / coverage reports
28
+ htmlcov/
29
+ .tox/
30
+ .coverage
31
+ .coverage.*
32
+ .cache
33
+ nosetests.xml
34
+ coverage.xml
35
+ *.cover
36
+ .hypothesis/
37
+ .pytest_cache/
38
+
39
+ # Virtual environments
40
+ .env
41
+ .venv
42
+ env/
43
+ venv/
44
+ ENV/
45
+ env.bak/
46
+ venv.bak/
47
+ venv/
48
+
49
+ # IDEs
50
+ .vscode/
51
+ .idea/
52
+ *.swp
53
+ *.swo
54
+ *~
55
+
56
+ # Jupyter Notebook
57
+ .ipynb_checkpoints
58
+
59
+ # Environment variables
60
+ .env
61
+ .env.local
62
+ .env.development.local
63
+ .env.test.local
64
+ .env.production.local
65
+
66
+ # Data files
67
+ *.csv
68
+ *.xlsx
69
+ *.pickle
70
+ *.pkl
71
+
72
+ # API keys and secrets
73
+ secrets/
74
+ *.key
75
+ *.pem
76
+
77
+ # OS
78
+ .DS_Store
79
+ Thumbs.db
80
+
81
+ # Logs
82
+ *.log
83
+ logs/
84
+
85
+ # Model files
86
+ models/
87
+ *.model
88
+ *.h5
89
+ *.pkl
90
+
91
+ # Vector databases
92
+ chroma_db/
93
+ faiss_index/
94
+
95
+ frontend/.next/
96
+ frontend/node_modules/
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
+
4
+ FROM python:3.9
5
+
6
+ RUN useradd -m -u 1000 user
7
+ USER user
8
+ ENV PATH="/home/user/.local/bin:$PATH"
9
+
10
+ WORKDIR /app
11
+
12
+ COPY --chown=user ./requirements.txt requirements.txt
13
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
+
15
+ COPY --chown=user . /app
16
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Recipe Recommendation Chatbot API
3
+ emoji: 🥘
4
+ colorFrom: indigo
5
+ colorTo: pink
6
+ sdk: docker
7
+ pinned: false
8
+ license: mit
9
+ ---
10
+
11
+ # Recipe Recommendation Chatbot
12
+
13
+ A GenAI-powered chatbot that recommends recipes based on available ingredients using RAG (Retrieval-Augmented Generation).
14
+
15
+ ## 🚀 Quick Start
16
+ ```bash
17
+ # Clone repository
18
+ git clone https://github.com/A3copilotprogram/PLG4-Recipe-Recommendation-Chatbot.git
19
+ cd PLG4-Recipe-Recommendation-Chatbot
20
+
21
+ # Install dependencies
22
+ pip install -r requirements.txt
23
+
24
+ # Run the chatbot
25
+ python src/main.py
26
+ ```
27
+
28
+ ## 📁 Project Structure
29
+ - `backend/` - FastAPI backend with RAG pipeline
30
+ - `frontend/` - React frontend interface
31
+ - `data/` - Recipe datasets and embeddings
32
+ - `docs/` - Project documentation
33
+ - `notebooks/` - Jupyter notebooks for exploration
34
+ - `tests/` - Unit and integration tests
35
+
36
+ ## 📚 Documentation
37
+
38
+ ### Quick Start Guides
39
+ - **[Backend Setup](./backend/README.md)** - FastAPI server setup and configuration
40
+ - **[Frontend Setup](./frontend/README.md)** - React app development
41
+
42
+ ### Troubleshooting
43
+ - **[Embedding Issues](./backend/docs/embedding-troubleshooting.md)** - Fix common dimension mismatch errors
44
+ - **[Documentation Index](./backend/docs/README.md)** - Complete documentation overview
45
+
46
+ ### Architecture
47
+ - **[System Architecture](./docs/architecture.md)** - High-level system design
48
+ - **[API Documentation](./backend/docs/api-documentation.md)** - Detailed API reference
49
+
50
+ ## 🤝 Contributing
51
+ See [CONTRIBUTING.md](docs/CONTRIBUTING.md) for ways of working and contribution guidelines.
52
+
53
+ ## 👥 Team
54
+ GenAI PLG 4 - Andela Community Program
app.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from backend.app import app
backend/.env.example ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===========================================
2
+ # Recipe Recommendation Bot - Environment Configuration
3
+ # ===========================================
4
+
5
+ # Server Configuration
6
+ PORT=8080
7
+ HOST=0.0.0.0
8
+ ENVIRONMENT=development
9
+ DEBUG=true
10
+ LANGCHAIN_DEBUG=true
11
+
12
+ # CORS Configuration
13
+ CORS_ORIGINS=["http://localhost:3000","http://localhost:5173","http://localhost:8000"]
14
+ CORS_ALLOW_CREDENTIALS=true
15
+ CORS_ALLOW_METHODS=["GET","POST","PUT","DELETE","OPTIONS"]
16
+ CORS_ALLOW_HEADERS=["*"]
17
+
18
+ # ===========================================
19
+ # LLM & Embedding Provider Configuration
20
+ # ===========================================
21
+ # Supported providers: openai, google, huggingface, ollama
22
+ # This provider will be used for both LLM and embeddings
23
+
24
+ LLM_PROVIDER=google
25
+ EMBEDDING_PROVIDER=google
26
+
27
+ # OpenAI Configuration
28
+ # Use only if LLM_PROVIDER or EMBEDDING_PROVIDER is set to 'openai'
29
+ OPENAI_API_KEY=YOUR_OPENAI_API_KEY_HERE
30
+ OPENAI_MODEL=gpt-5-nano
31
+ OPENAI_TEMPERATURE=0.7
32
+ OPENAI_MAX_TOKENS=1000
33
+
34
+ # Google AI Configuration (Gemini)
35
+ # Use only if LLM_PROVIDER or EMBEDDING_PROVIDER is set to 'google'
36
+ GOOGLE_API_KEY=YOUR_GOOGLE_API_KEY_HERE
37
+ GOOGLE_MODEL=gemini-2.0-flash
38
+ GOOGLE_TEMPERATURE=0.7
39
+ GOOGLE_MAX_TOKENS=1000
40
+
41
+ # Hugging Face Configuration
42
+ # Use only if LLM_PROVIDER or EMBEDDING_PROVIDER is set to 'huggingface'
43
+ HUGGINGFACE_API_TOKEN=YOUR_HUGGINGFACE_API_TOKEN_HERE
44
+ HUGGINGFACE_MODEL=deepseek-ai/DeepSeek-V3.1
45
+ HUGGINGFACE_API_URL=https://api-inference.huggingface.co/models/
46
+ HUGGINGFACE_USE_API=true
47
+ HUGGINGFACE_USE_GPU=false
48
+
49
+ # Ollama Configuration (local inference)
50
+ # Use only if LLM_PROVIDER or EMBEDDING_PROVIDER is set to 'ollama'
51
+ OLLAMA_BASE_URL=http://localhost:11434
52
+ OLLAMA_MODEL=llama3.1:8b
53
+ OLLAMA_TEMPERATURE=0.7
54
+
55
+
56
+ # ===========================================
57
+ # Vector Store Configuration
58
+ # ===========================================
59
+ # Supported stores: chromadb, mongodb
60
+ VECTOR_STORE_PROVIDER=mongodb
61
+
62
+ # ChromaDB Configuration
63
+ DB_PATH=./data/chromadb
64
+ DB_COLLECTION_NAME=recipes
65
+ DB_PERSIST_DIRECTORY=./data/chromadb_persist
66
+ # Set to true to delete and recreate DB on startup (useful for adding new recipes)
67
+ DB_REFRESH_ON_START=false
68
+
69
+ # MongoDB Atlas Configuration (for vector search)
70
+ # Provide your connection string and collection settings when using MongoDB
71
+ MONGODB_URI=mongodb+srv://<username>:<password>@<cluster>.mongodb.net/?retryWrites=true&w=majority&appName=<AppName>
72
+ MONGODB_DATABASE=food_recommendation
73
+ MONGODB_COLLECTION=AI_DB
74
+ MONGODB_INDEX_NAME=foodInstructionIndex
75
+ MONGODB_VECTOR_FIELD=ingredients_emb
76
+ MONGODB_TEXT_FIELD=title
77
+ MONGODB_SIMILARITY_METRIC=dotProduct
78
+ MONGODB_NUM_CANDIDATES=100
79
+
80
+
81
+ # ===========================================
82
+ # Model Configuration
83
+ # ===========================================
84
+ # The LLM_PROVIDER setting above controls both LLM and embedding models
85
+
86
+ # OpenAI Models
87
+ OPENAI_EMBEDDING_MODEL=text-embedding-3-small
88
+
89
+ # Google Models
90
+ GOOGLE_EMBEDDING_MODEL=models/embedding-001
91
+
92
+ # HuggingFace Models
93
+ HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
94
+
95
+ # Ollama Models
96
+ OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
97
+
98
+
99
+ # ===========================================
100
+ # Logging Configuration
101
+ # ===========================================
102
+ LOG_LEVEL=INFO
103
+ LOG_FORMAT=%(asctime)s - %(name)s - %(levelname)s - %(message)s
104
+ LOG_FILE=./logs/app.log
105
+
backend/.gitignore ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ # Application specific
132
+ vector_store/
133
+ logs/
134
+ *.log
135
+ .DS_Store
136
+ node_modules/
137
+
138
+ # Data folder - ignore everything except sample recipe
139
+ data/*
140
+ !data/sample_recipes.json
141
+ !data/recipes
backend/Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
+
4
+ FROM python:3.9
5
+
6
+ RUN useradd -m -u 1000 user
7
+ USER user
8
+ ENV PATH="/home/user/.local/bin:$PATH"
9
+
10
+ WORKDIR /app
11
+
12
+ COPY --chown=user ./requirements.txt requirements.txt
13
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
+
15
+ COPY --chown=user . /app
16
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
backend/README.md ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Recipe Recommendation Chatbot - Backend API
2
+
3
+ Backend for AI-powered recipe recommendation system built with FastAPI, featuring RAG (Retrieval-Augmented Generation) capabilities, conversational memory, and multi-provider LLM support.
4
+
5
+ ## 🚀 Quick Start
6
+
7
+ ### Prerequisites
8
+ - Python 3.9+
9
+ - pip or poetry
10
+ - API keys for your chosen LLM provider (OpenAI, Google, or HuggingFace)
11
+
12
+ ### Installation
13
+
14
+ 1. **Clone and navigate to backend**
15
+ ```bash
16
+ git clone <repository-url>
17
+ cd PLG4-Recipe-Recommendation-Chatbot/backend
18
+ ```
19
+
20
+ 2. **Install dependencies**
21
+ ```bash
22
+ pip install -r requirements.txt
23
+ ```
24
+ > 💡 **Note**: Some packages are commented out by default to keep the installation lightweight:
25
+ > - **HuggingFace dependencies** (`transformers`, `accelerate`, `sentence-transformers`) - Uncomment if using HuggingFace models
26
+ > - **sentence-transformers** (~800MB) - Uncomment for HuggingFace embeddings
27
+
28
+ 3. **Configure environment**
29
+ ```bash
30
+ cp .env.example .env
31
+ # Edit .env with your API keys and configuration
32
+ ```
33
+
34
+ 4. **Run the server**
35
+ ```bash
36
+ # Development mode with auto-reload
37
+ uvicorn app:app --reload --host 127.0.0.1 --port 8080
38
+
39
+ # Or production mode
40
+ uvicorn app:app --host 127.0.0.1 --port 8080
41
+ ```
42
+
43
+ 5. **Test the API**
44
+ ```bash
45
+ curl http://localhost:8080/health
46
+ ```
47
+
48
+ 6. **HuggingFace Spaces deployment**
49
+ ```
50
+ sh deploy-to-hf.sh <remote>
51
+ ```
52
+ where `<remote>` points to the HuggingFace Spaces repository
53
+
54
+ ## 📁 Project Structure
55
+
56
+ ```
57
+ backend/
58
+ ├── app.py # FastAPI application entry point
59
+ ├── requirements.txt # Python dependencies
60
+ ├── .env.example # Environment configuration template
61
+ ├── .gitignore # Git ignore rules
62
+
63
+ ├── config/ # Configuration modules
64
+ │ ├── __init__.py
65
+ │ ├── settings.py # Application settings
66
+ │ ├── database.py # Database configuration
67
+ │ └── logging_config.py # Logging setup
68
+
69
+ ├── services/ # Core business logic
70
+ │ ├── __init__.py
71
+ │ ├── llm_service.py # LLM and RAG pipeline
72
+ │ └── vector_store.py # Vector database management
73
+
74
+ ├── data/ # Data storage
75
+ │ ├── recipes/ # Recipe JSON files
76
+ │ │ └── recipe.json # Sample recipe data
77
+ │ └── chromadb_persist/ # ChromaDB persistence
78
+
79
+ ├── logs/ # Application logs
80
+ │ └── recipe_bot.log # Main log file
81
+
82
+ ├── docs/ # Documentation
83
+ │ ├── model-selection-guide.md # 🎯 Complete model selection & comparison guide
84
+ │ ├── model-quick-reference.md # ⚡ Quick model switching commands
85
+ │ ├── chromadb_refresh.md # ChromaDB refresh guide
86
+ │ ├── opensource-llm-configuration.md # Open source LLM setup guide
87
+ │ ├── logging_guide.md # Logging documentation
88
+ │ ├── optimal_recipes_structure.md # Recipe data structure guide
89
+ │ ├── sanitization_guide.md # Input sanitization guide
90
+ │ └── unified-provider-configuration.md # Unified provider approach guide
91
+
92
+ └── utils/ # Utility functions
93
+ └── __init__.py
94
+ ```
95
+
96
+ ## ⚙️ Configuration
97
+
98
+ ### Environment Variables
99
+
100
+ Copy `.env.example` to `.env` and configure the following:
101
+
102
+ > 🎯 **Unified Provider Approach**: The `LLM_PROVIDER` setting controls both LLM and embedding models, preventing configuration mismatches. See [`docs/unified-provider-configuration.md`](docs/unified-provider-configuration.md) for details.
103
+
104
+ #### **Server Configuration**
105
+ ```bash
106
+ PORT=8000 # Server port
107
+ HOST=0.0.0.0 # Server host
108
+ ENVIRONMENT=development # Environment mode
109
+ DEBUG=true # Debug mode
110
+ ```
111
+
112
+ #### **Provider Configuration**
113
+ Choose one provider for both LLM and embeddings (unified approach):
114
+
115
+ > 🎯 **NEW: Complete Model Selection Guide**: For detailed comparisons of all models (OpenAI, Google, Anthropic, Ollama, HuggingFace) including latest 2025 models, performance metrics, costs, and scenario-based recommendations, see [`docs/model-selection-guide.md`](docs/model-selection-guide.md)
116
+
117
+ > ⚡ **Quick Reference**: For one-command model switching, see [`docs/model-quick-reference.md`](docs/model-quick-reference.md)
118
+
119
+ **OpenAI (Best Value & Latest Models)**
120
+ ```bash
121
+ LLM_PROVIDER=openai
122
+ OPENAI_API_KEY=your_openai_api_key_here
123
+ OPENAI_MODEL=gpt-5-nano # 🎯 BEST VALUE: $1/month for 30K queries - Modern GPT-5 at nano price
124
+ # Alternatives:
125
+ # - gpt-4o-mini # Proven choice: $4/month for 30K queries
126
+ # - gpt-5 # Premium: $20/month unlimited (Plus plan)
127
+ OPENAI_EMBEDDING_MODEL=text-embedding-3-small # Used automatically
128
+ ```
129
+
130
+ **Google Gemini (Best Free Tier)**
131
+ ```bash
132
+ LLM_PROVIDER=google
133
+ GOOGLE_API_KEY=your_google_api_key_here
134
+ GOOGLE_MODEL=gemini-2.5-flash # 🎯 RECOMMENDED: Excellent free tier, then $2/month
135
+ # Alternatives:
136
+ # - gemini-2.0-flash-lite # Ultra budget: $0.90/month for 30K queries
137
+ # - gemini-2.5-pro # Premium: $25/month for 30K queries
138
+ GOOGLE_EMBEDDING_MODEL=models/embedding-001 # Used automatically
139
+ ```
140
+
141
+ **Anthropic Claude (Best Quality-to-Cost)**
142
+ ```bash
143
+ LLM_PROVIDER=anthropic
144
+ ANTHROPIC_API_KEY=your_anthropic_api_key_here
145
+ ANTHROPIC_MODEL=claude-3-5-haiku-20241022 # 🎯 BUDGET WINNER: $4/month for 30K queries
146
+ # Alternatives:
147
+ # - claude-3-5-sonnet-20241022 # Production standard: $45/month for 30K queries
148
+ # - claude-3-opus-20240229 # Premium quality: $225/month for 30K queries
149
+ ANTHROPIC_EMBEDDING_MODEL=voyage-large-2 # Used automatically
150
+ ```
151
+
152
+ **Ollama (Best for Privacy/Self-Hosting)**
153
+ ```bash
154
+ LLM_PROVIDER=ollama
155
+ OLLAMA_BASE_URL=http://localhost:11434
156
+ OLLAMA_MODEL=llama3.1:8b # 🎯 YOUR CURRENT: 4.7GB download, 8GB RAM, excellent balance
157
+ # New alternatives:
158
+ # - deepseek-r1:7b # Breakthrough reasoning: 4.7GB download, O1-level performance
159
+ # - codeqwen:7b # Structured data expert: 4.2GB download, excellent for recipes
160
+ # - gemma3:4b # Resource-efficient: 3.3GB download, 6GB RAM
161
+ # - mistral-nemo:12b # Balanced performance: 7GB download, 12GB RAM
162
+ OLLAMA_EMBEDDING_MODEL=nomic-embed-text # Used automatically
163
+ ```
164
+
165
+ **HuggingFace (Downloadable Models Only - APIs Unreliable)**
166
+ ```bash
167
+ LLM_PROVIDER=ollama # Use Ollama to run HuggingFace models locally
168
+ OLLAMA_MODEL=codeqwen:7b # 🎯 RECOMMENDED: Download HF models via Ollama for reliability
169
+ # Other downloadable options:
170
+ # - mistral-nemo:12b # Mistral's balanced model
171
+ # - nous-hermes2:10.7b # Fine-tuned for instruction following
172
+ # - openhermes2.5-mistral:7b # Community favorite
173
+ OLLAMA_EMBEDDING_MODEL=nomic-embed-text # Used automatically
174
+ ```
175
+ > ⚠️ **Important Change**: HuggingFace APIs have proven unreliable for production. We now recommend downloading HuggingFace models locally via Ollama for consistent performance.
176
+ > ⚠️ **HuggingFace Update**: HuggingFace dependencies are no longer required as we recommend using downloadable models via Ollama instead of unreliable APIs. For local HuggingFace models, use Ollama which provides better reliability and performance.
177
+
178
+ > 📖 **Local Model Setup**: See [`docs/opensource-llm-configuration.md`](docs/opensource-llm-configuration.md) for GPU setup, model selection, and performance optimization with Ollama.
179
+
180
+ > 💡 **Unified Provider**: The `LLM_PROVIDER` setting automatically configures both the LLM and embedding models, ensuring consistency and preventing mismatched configurations.
181
+
182
+ #### **Vector Store Configuration**
183
+ Choose between ChromaDB (local) or MongoDB Atlas:
184
+
185
+ **ChromaDB (Default)**
186
+ ```bash
187
+ VECTOR_STORE_PROVIDER=chromadb
188
+ DB_COLLECTION_NAME=recipes
189
+ DB_PERSIST_DIRECTORY=./data/chromadb_persist
190
+ # Set to true to delete and recreate DB on startup (useful for adding new recipes)
191
+ DB_REFRESH_ON_START=false
192
+ ```
193
+
194
+ **MongoDB Atlas**
195
+ ```bash
196
+ VECTOR_STORE_PROVIDER=mongodb
197
+ MONGODB_URI=mongodb+srv://username:password@cluster.mongodb.net/
198
+ MONGODB_DATABASE=recipe_bot
199
+ MONGODB_COLLECTION=recipes
200
+ ```
201
+
202
+ #### **Embedding Configuration**
203
+ ```bash
204
+ # Embedding provider automatically matches LLM_PROVIDER (unified approach)
205
+ # No separate configuration needed - handled automatically based on LLM_PROVIDER setting
206
+ ```
207
+
208
+ > 💡 **Unified Provider**: The `LLM_PROVIDER` setting automatically configures both the LLM and embedding models, ensuring consistency and preventing mismatched configurations. See [`docs/model-selection-guide.md`](docs/model-selection-guide.md) for all available options.
209
+
210
+ ## 🛠️ API Endpoints
211
+
212
+ ### Core Endpoints
213
+
214
+ #### **Health Check**
215
+ ```bash
216
+ GET /health
217
+ ```
218
+ Returns service health and configuration status.
219
+
220
+ #### **Chat with RAG**
221
+ ```bash
222
+ POST /chat
223
+ Content-Type: application/json
224
+
225
+ {
226
+ "message": "What chicken recipes do you have?"
227
+ }
228
+ ```
229
+ Full conversational RAG pipeline with memory and vector retrieval.
230
+
231
+ #### **Simple Demo**
232
+ ```bash
233
+ GET /demo?prompt=Tell me about Italian cuisine
234
+ ```
235
+ Simple LLM completion without RAG for testing.
236
+
237
+ #### **Clear Memory**
238
+ ```bash
239
+ POST /clear-memory
240
+ ```
241
+ Clears conversation memory for fresh start.
242
+
243
+ ### Example Requests
244
+
245
+ **Chat Request:**
246
+ ```bash
247
+ curl -X POST "http://localhost:8080/chat" \
248
+ -H "Content-Type: application/json"
249
+ -d '{"message": "What are some quick breakfast recipes?"}'
250
+ ```
251
+
252
+ **Demo Request:**
253
+ ```bash
254
+ curl "http://localhost:8080/demo?prompt=What%20is%20your%20favorite%20pasta%20dish?"
255
+ ```
256
+
257
+ ## 🏗️ Architecture
258
+
259
+ ### Core Components
260
+
261
+ #### **LLM Service** (`services/llm_service.py`)
262
+ - **ConversationalRetrievalChain**: Main RAG pipeline with memory
263
+ - **Simple Chat Completion**: Direct LLM responses without RAG
264
+ - **Multi-provider Support**: OpenAI, Google, HuggingFace
265
+ - **Conversation Memory**: Persistent chat history
266
+
267
+ #### **Vector Store Service** (`services/vector_store.py`)
268
+ - **ChromaDB Integration**: Local vector database
269
+ - **MongoDB Atlas Support**: Cloud vector search
270
+ - **Document Loading**: Automatic recipe data ingestion
271
+ - **Embedding Management**: Multi-provider embedding support
272
+
273
+ #### **Configuration System** (`config/`)
274
+ - **Settings Management**: Environment-based configuration
275
+ - **Database Configuration**: Vector store setup
276
+ - **Logging Configuration**: Structured logging with rotation
277
+
278
+ ### Data Flow
279
+
280
+ 1. **User Query** → FastAPI endpoint
281
+ 2. **RAG Pipeline** → Vector similarity search
282
+ 3. **Context Retrieval** → Top-k relevant recipes
283
+ 4. **LLM Generation** → Context-aware response
284
+ 5. **Memory Storage** → Conversation persistence
285
+ 6. **Response** → JSON formatted reply
286
+
287
+ ## 📊 Logging
288
+
289
+ Comprehensive logging system with:
290
+
291
+ - **File Rotation**: 10MB max size, 5 backups
292
+ - **Structured Format**: Timestamps, levels, source location
293
+ - **Emoji Indicators**: Visual status indicators
294
+ - **Error Tracking**: Full stack traces for debugging
295
+
296
+ **Log Levels:**
297
+ - 🚀 **INFO**: Normal operations
298
+ - ⚠️ **WARNING**: Non-critical issues
299
+ - ❌ **ERROR**: Failures with stack traces
300
+ - 🔧 **DEBUG**: Detailed operation steps
301
+
302
+ **Log Location:** `./logs/recipe_bot.log`
303
+
304
+ ## 📁 Data Management
305
+
306
+ ### Recipe Data
307
+ - **Location**: `./data/recipes/`
308
+ - **Format**: JSON files with structured recipe data
309
+ - **Schema**: title, ingredients, directions, tags
310
+ - **Auto-loading**: Automatic chunking and vectorization
311
+
312
+ ### Vector Storage
313
+ - **ChromaDB**: Local persistence in `./data/chromadb_persist/`
314
+ - **MongoDB**: Cloud-based vector search
315
+ - **Embeddings**: Configurable embedding models
316
+ - **Retrieval**: Top-k similarity search (k=25)
317
+
318
+ ## 🔧 Development
319
+
320
+ ### Running in Development
321
+ ```bash
322
+ # Install dependencies
323
+ pip install -r requirements.txt
324
+
325
+ # Set up environment
326
+ cp .env.example .env
327
+ # Configure your API keys
328
+
329
+ # Run with auto-reload
330
+ uvicorn app:app --reload --host 127.0.0.1 --port 8080
331
+ ```
332
+
333
+ ### Testing Individual Components
334
+ ```bash
335
+ # Test vector store
336
+ python -c "from services.vector_store import vector_store_service; print('Vector store initialized')"
337
+
338
+ # Test LLM service
339
+ python -c "from services.llm_service import llm_service; print('LLM service initialized')"
340
+ ```
341
+
342
+ ### Adding New Recipes
343
+ 1. Add JSON files to `./data/recipes/`
344
+ 2. Set `DB_REFRESH_ON_START=true` in `.env` file
345
+ 3. Restart the application (ChromaDB will be recreated)
346
+ 4. Set `DB_REFRESH_ON_START=false` to prevent repeated deletion
347
+ 5. New recipes are now available for search
348
+
349
+ **Quick refresh:**
350
+ ```bash
351
+ # Enable refresh, restart, then disable
352
+ echo "DB_REFRESH_ON_START=true" >> .env
353
+ uvicorn app:app --reload --host 127.0.0.1 --port 8080
354
+ # After startup completes:
355
+ sed -i 's/DB_REFRESH_ON_START=true/DB_REFRESH_ON_START=false/' .env
356
+ ```
357
+
358
+ ## 🚀 Production Deployment
359
+
360
+ ### Environment Setup
361
+ ```bash
362
+ ENVIRONMENT=production
363
+ DEBUG=false
364
+ LOG_LEVEL=INFO
365
+ ```
366
+
367
+ ### Docker Deployment
368
+ The backend is containerized and ready for deployment on platforms like Hugging Face Spaces.
369
+
370
+ ### Security Features
371
+ - **Environment Variables**: Secure API key management
372
+ - **CORS Configuration**: Frontend integration protection
373
+ - **Input Sanitization**: Context-appropriate validation for recipe queries
374
+ - XSS protection through HTML encoding
375
+ - Length validation (1-1000 characters)
376
+ - Basic harmful pattern removal
377
+ - Whitespace normalization
378
+ - **Pydantic Validation**: Type safety and automatic sanitization
379
+ - **Structured Error Handling**: Safe error responses without data leaks
380
+
381
+ ## 🛠️ Troubleshooting
382
+
383
+ ### Common Issues
384
+
385
+ **Vector store initialization fails**
386
+ - Check API keys for embedding provider
387
+ - Verify data folder contains recipe files
388
+ - Check ChromaDB permissions
389
+
390
+ **LLM service fails**
391
+ - Verify API key configuration
392
+ - Check provider-specific requirements
393
+ - Review logs for detailed error messages
394
+
395
+ **HuggingFace model import errors**
396
+ - HuggingFace APIs have proven unreliable for production use
397
+ - **Recommended**: Use Ollama to run HuggingFace models locally instead:
398
+ ```bash
399
+ # Install and run HuggingFace models via Ollama
400
+ ollama pull codeqwen:7b
401
+ ollama pull mistral-nemo:12b
402
+ # Set LLM_PROVIDER=ollama in .env
403
+ ```
404
+ - For legacy HuggingFace API setup, uncomment dependencies in `requirements.txt` (not recommended)
405
+ - For detailed model comparisons, see [`docs/model-selection-guide.md`](docs/model-selection-guide.md)
406
+
407
+ **Memory issues**
408
+ ```bash
409
+ # Clear conversation memory
410
+ curl -X POST http://localhost:8080/clear-memory
411
+ ```
412
+
413
+ ### Debug Mode
414
+ Set `DEBUG=true` in `.env` for detailed logging and error traces.
415
+
416
+ ### Log Analysis
417
+ Check `./logs/recipe_bot.log` for detailed operation logs with emoji indicators for quick status identification.
418
+
419
+ ## 📚 Documentation
420
+
421
+ ### Troubleshooting Guides
422
+ - **[Embedding Troubleshooting](./docs/embedding-troubleshooting.md)** - Quick fixes for common embedding dimension errors
423
+ - **[Embedding Compatibility Guide](./docs/embedding-compatibility-guide.md)** - Comprehensive guide to embedding models and dimensions
424
+ - **[Logging Guide](./docs/logging_guide.md)** - Understanding the logging system
425
+
426
+ ### Technical Guides
427
+ - **[Architecture Documentation](./docs/architecture.md)** - System architecture overview
428
+ - **[API Documentation](./docs/api-documentation.md)** - Detailed API reference
429
+ - **[Deployment Guide](./docs/deployment.md)** - Production deployment instructions
430
+
431
+ ### Common Issues
432
+ - **Dimension mismatch errors**: See [Embedding Troubleshooting](./docs/embedding-troubleshooting.md)
433
+ - **Model loading issues**: Check provider configuration in `.env`
434
+ - **Database connection problems**: Verify MongoDB/ChromaDB settings
435
+
436
+ ## 📚 Dependencies
437
+
438
+ ### Core Dependencies
439
+ - **FastAPI**: Modern web framework
440
+ - **uvicorn**: ASGI server
441
+ - **pydantic**: Data validation
442
+ - **python-dotenv**: Environment management
443
+
444
+ ### AI/ML Dependencies
445
+ - **langchain**: LLM framework and chains
446
+ - **langchain-openai**: OpenAI integration
447
+ - **langchain-google-genai**: Google AI integration
448
+ - **sentence-transformers**: Embedding models
449
+ - **chromadb**: Vector database
450
+ - **pymongo**: MongoDB integration
451
+
452
+ ### Optional Dependencies
453
+ - **langchain-huggingface**: HuggingFace integration
454
+ - **torch**: PyTorch for local models
455
+
456
+ ## 📄 License
457
+
458
+ This project is part of the PLG4 Recipe Recommendation Chatbot system.
459
+
460
+ ---
461
+
462
+ For more detailed documentation, check the `docs/` folder or visit the API documentation at `http://localhost:8080/docs` when running the server.
backend/app.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# DTO and framework imports
from backend.utils.request_dto.chat_response import ChatResponse
from backend.utils.request_dto.scrape_request import ScrapeRequest
from backend.utils.types import ChatMessage
from fastapi import FastAPI, HTTPException, BackgroundTasks, Header
from fastapi.middleware.cors import CORSMiddleware  # BUGFIX: was imported twice
import os
from typing import Type
from data_minning.dto.stream_opts import StreamOptions
from data_minning.base_scrapper import BaseRecipeScraper, JsonArraySink, MongoSink
from data_minning.all_nigerian_recipe_scraper import AllNigerianRecipesScraper
from data_minning.yummy_medley_scraper import YummyMedleyScraper
from backend.config.settings import settings
from backend.config.logging_config import setup_default_logging, get_logger
from backend.utils.sanitization import sanitize_user_input
from backend.services.vector_store import vector_store_service

# Setup logging first, before importing services
setup_default_logging()
logger = get_logger("app")

# Import services after logging is configured
from backend.services.llm_service import llm_service

# Registry of available scrapers, keyed by the short site code used by /scrape.
SCRAPERS: dict[str, Type[BaseRecipeScraper]] = {
    "yummy": YummyMedleyScraper,
    "anr": AllNigerianRecipesScraper,
}

app = FastAPI(
    title="Recipe Recommendation Bot API",
    description="AI-powered recipe recommendation system with RAG capabilities",
    version="1.0.0"
)

logger.info("🚀 Starting Recipe Recommendation Bot API")
logger.info(f"Environment: {settings.ENVIRONMENT}")
logger.info(f"Provider: {settings.get_llm_config()['provider']} (LLM + Embeddings)")

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.CORS_ORIGINS or ["*"],
    # BUGFIX: was `settings.CORS_ALLOW_CREDENTIALS or True`, which evaluated to
    # True even when the setting was explicitly False, so credentials could
    # never be disabled. The setting already has its own default.
    allow_credentials=settings.CORS_ALLOW_CREDENTIALS,
    allow_methods=settings.CORS_ALLOW_METHODS or ["*"],
    allow_headers=settings.CORS_ALLOW_HEADERS or ["*"],
)
47
+
48
+ # Remove OpenAI direct setup - now handled by LLM service
49
+ # if settings.OPENAI_API_KEY:
50
+ # openai.api_key = settings.OPENAI_API_KEY
51
+
52
+ @app.get("/")
53
+ def index():
54
+ logger.info("📡 Root endpoint accessed")
55
+ return {
56
+ "message": "Recipe Recommendation Bot API",
57
+ "version": "1.0.0",
58
+ "status": "running"
59
+ }
60
+
61
+ @app.get("/health")
62
+ def health_check():
63
+ logger.info("🏥 Health check endpoint accessed")
64
+ return {
65
+ "status": "healthy",
66
+ "environment": settings.ENVIRONMENT,
67
+ "llm_service_initialized": llm_service is not None
68
+ }
69
+
70
+ @app.post("/chat", response_model=ChatResponse)
71
+ async def chat(chat_message: ChatMessage):
72
+ """Main chatbot endpoint - Recipe recommendation with ConversationalRetrievalChain"""
73
+ try:
74
+ # Message is already sanitized by the Pydantic validator
75
+ # Find the last user message in the messages list
76
+ last_user_message = chat_message.get_latest_message()
77
+ if not last_user_message:
78
+ raise ValueError("No valid user message found")
79
+ user_text = last_user_message.parts[0].text
80
+
81
+ response_text = llm_service.ask_question(user_text)
82
+ return ChatResponse(response=response_text)
83
+
84
+ except ValueError as e:
85
+ # Handle validation/sanitization errors
86
+ logger.warning(f"⚠️ Invalid input received: {str(e)}")
87
+ raise HTTPException(status_code=400, detail=f"Invalid input: {str(e)}")
88
+
89
+ except Exception as e:
90
+ logger.error(f"❌ Chat service error: {str(e)}", exc_info=True)
91
+ raise HTTPException(status_code=500, detail=f"Chat service error: {str(e)}")
92
+
93
+ @app.get("/demo")
94
+ def demo(prompt: str = "What recipes do you have?"):
95
+ """Demo endpoint - uses simple chat completion without RAG"""
96
+ logger.info(f"🎯 Demo request: '{prompt[:50]}...'")
97
+
98
+ try:
99
+ # Sanitize the demo prompt using the same sanitization method
100
+ sanitized_prompt = sanitize_user_input(prompt)
101
+ response_text = llm_service.simple_chat_completion(sanitized_prompt)
102
+ return {"prompt": sanitized_prompt, "reply": response_text}
103
+
104
+ except ValueError as e:
105
+ # Handle validation/sanitization errors
106
+ logger.warning(f"⚠️ Invalid demo prompt: {str(e)}")
107
+ return {"error": f"Invalid prompt: {str(e)}", "prompt": prompt}
108
+
109
+ except Exception as e:
110
+ logger.error(f"❌ Demo endpoint error: {str(e)}", exc_info=True)
111
+ return {"error": f"Failed to get response: {str(e)}"}
112
+
113
+ @app.post("/clear-memory")
114
+ def clear_conversation_memory():
115
+ """Clear conversation memory"""
116
+ logger.info("🧹 Memory clear request received")
117
+
118
+ try:
119
+ success = llm_service.clear_memory()
120
+
121
+ if success:
122
+ logger.info("✅ Conversation memory cleared successfully")
123
+ return {"status": "success", "message": "Conversation memory cleared"}
124
+ else:
125
+ logger.warning("⚠️ Memory clear operation failed")
126
+ return {"status": "failed", "message": "Failed to clear conversation memory"}
127
+
128
+ except Exception as e:
129
+ logger.error(f"❌ Memory clear error: {str(e)}", exc_info=True)
130
+ return {"status": "error", "message": str(e)}
131
+
132
+
133
+
134
def run_job(job_id: str, site: str, limit: int, output_type: str):
    """Background job that runs one scraper to completion.

    Args:
        job_id: Key under which status/progress is recorded in the global JOBS dict.
        site: Short site code; must be a key of SCRAPERS (validated by /scrape).
        limit: Maximum number of recipes to stream for this job.
        output_type: "json" writes a local JSON array; "mongo" writes to MongoDB.

    Exceptions are captured into JOBS rather than propagated, since this runs
    as a FastAPI background task with no caller to receive them.
    """
    scraper = SCRAPERS[site]()
    scraper.embedder = vector_store_service._create_sentence_transformer_wrapper("sentence-transformers/all-MiniLM-L6-v2")
    scraper.embedding_fields = [(("title", "ingredients", "instructions"), "recipe_emb")]

    sink = None
    if output_type == "json":
        sink = JsonArraySink("./data/recipes_unified.json")
    elif output_type == "mongo":
        # NOTE(review): when MONGODB_URI is unset the job still runs but
        # persists nothing — confirm whether this should be a hard error.
        sink = MongoSink() if os.getenv("MONGODB_URI") else None

    stream_opts = StreamOptions(
        delay=0.3,
        # BUGFIX: the caller-supplied limit was previously ignored (the stream
        # cap was hard-coded to 500 while `limit` was only fed to batch_size).
        limit=limit,
        batch_size=limit,  # NOTE(review): batch size mirrored the limit before; confirm intended batching
        resume_file="recipes.resume",
        progress_callback=make_progress_cb(job_id),
    )
    try:
        JOBS[job_id] = {"status": "running", "count": 0}
        scraper.stream(sink=sink, options=stream_opts)
        JOBS[job_id]["status"] = "done"
    except Exception as e:
        JOBS[job_id] = {"status": "error", "error": str(e)}
162
+
163
def make_progress_cb(job_id: str):
    """Build a progress callback that records the running item count in JOBS."""
    def _update(count: int):
        JOBS[job_id]["count"] = count
    return _update
169
+
170
+
171
+
172
+
173
+
174
# Super-lightweight in-memory job store (reset on restart).
# Values are per-job status dicts, e.g. {"status": "running", "count": 0}.
# BUGFIX: the annotation previously used the builtin function `any` as a type.
JOBS: dict[str, dict] = {}
176
+
177
+ @app.post("/scrape")
178
+ def scrape(body: ScrapeRequest, background: BackgroundTasks, x_api_key: str = Header(None)):
179
+ if body.site not in SCRAPERS:
180
+ raise HTTPException(status_code=400, detail="Unknown site")
181
+
182
+ job_id = f"{body.site}-{os.urandom(4).hex()}"
183
+ # use thread via BackgroundTasks to avoid blocking the request
184
+ background.add_task(run_job, job_id, body.site, body.limit, body.output_type)
185
+ return {"job_id": job_id, "status": "queued"}
186
+
187
+ @app.get("/jobs/{job_id}")
188
+ def job_status(job_id: str):
189
+ return JOBS.get(job_id, {"status": "unknown"})
190
+
191
+ @app.get("/jobs")
192
+ def list_jobs():
193
+ return JOBS
backend/config/__init__.py ADDED
File without changes
backend/config/database.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Database and vector store configuration
2
+ import os
3
+ from dotenv import load_dotenv
4
+
5
+ # Load environment variables from .env file
6
+ load_dotenv()
7
+
8
class DatabaseSettings:
    """Reads vector-store/database configuration straight from environment variables."""

    def __init__(self):
        # Which backing store to use for embeddings: "chromadb" (local) or "mongodb" (Atlas).
        self.VECTOR_STORE_PROVIDER = os.getenv("VECTOR_STORE_PROVIDER", "chromadb")

        # --- ChromaDB ---
        self.DB_PATH = os.getenv("DB_PATH", "./data/chromadb")
        self.DB_COLLECTION_NAME = os.getenv("DB_COLLECTION_NAME", "recipes")
        self.DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "./data/chromadb_persist")
        # Opt-in flag: any value other than "true" (case-insensitive) disables refresh.
        self.DB_REFRESH_ON_START = os.getenv("DB_REFRESH_ON_START", "false").lower() == "true"

        # --- MongoDB Atlas ---
        self.MONGODB_URI = os.getenv("MONGODB_URI")  # no default: only required for the mongodb provider
        self.MONGODB_DATABASE = os.getenv("MONGODB_DATABASE", "recipe_bot")
        self.MONGODB_COLLECTION = os.getenv("MONGODB_COLLECTION", "recipes")
        self.MONGODB_INDEX_NAME = os.getenv("MONGODB_INDEX_NAME", "vector_index")
        self.MONGODB_VECTOR_FIELD = os.getenv("MONGODB_VECTOR_FIELD", "embedding")
        self.MONGODB_TEXT_FIELD = os.getenv("MONGODB_TEXT_FIELD", "text")
        self.MONGODB_SIMILARITY_METRIC = os.getenv("MONGODB_SIMILARITY_METRIC", "cosine")
        self.MONGODB_NUM_CANDIDATES = int(os.getenv("MONGODB_NUM_CANDIDATES", "50"))

    def _chromadb_config(self):
        """Assemble the ChromaDB configuration dict."""
        return {
            "provider": "chromadb",
            "path": self.DB_PATH,
            "collection_name": self.DB_COLLECTION_NAME,
            "persist_directory": self.DB_PERSIST_DIRECTORY,
            "refresh_on_start": self.DB_REFRESH_ON_START,
        }

    def _mongodb_config(self):
        """Assemble the MongoDB Atlas configuration dict; the URI is mandatory."""
        if not self.MONGODB_URI:
            raise ValueError("MongoDB URI is required when using MongoDB Atlas as vector store")
        return {
            "provider": "mongodb",
            "uri": self.MONGODB_URI,
            "database": self.MONGODB_DATABASE,
            "collection_name": self.MONGODB_COLLECTION,
            "index_name": self.MONGODB_INDEX_NAME,
            "vector_field": self.MONGODB_VECTOR_FIELD,
            "text_field": self.MONGODB_TEXT_FIELD,
            "similarity_metric": self.MONGODB_SIMILARITY_METRIC,
            "num_candidates": self.MONGODB_NUM_CANDIDATES,
        }

    def get_vector_store_config(self):
        """Get vector store configuration based on the selected provider.

        Raises:
            ValueError: For an unknown provider, or mongodb without MONGODB_URI.
        """
        if self.VECTOR_STORE_PROVIDER == "chromadb":
            return self._chromadb_config()
        if self.VECTOR_STORE_PROVIDER == "mongodb":
            return self._mongodb_config()
        raise ValueError(f"Unsupported vector store provider: {self.VECTOR_STORE_PROVIDER}")
59
+
60
+ # Create global database settings instance
61
+ db_settings = DatabaseSettings()
backend/config/logging_config.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Logging Configuration
2
+ import logging
3
+ import logging.handlers
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
def setup_logging(
    log_level: str = "INFO",
    log_file: Optional[str] = None,
    enable_console: bool = True,
    max_file_size: int = 10 * 1024 * 1024,  # 10MB
    backup_count: int = 5
) -> logging.Logger:
    """
    Configure and return the application's root "recipe_bot" logger.

    Args:
        log_level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        log_file: Path to log file (optional)
        enable_console: Whether to attach a stdout handler
        max_file_size: Rotate the log file once it exceeds this many bytes
        backup_count: Number of rotated backup files to keep

    Returns:
        Configured logger instance
    """
    level = getattr(logging, log_level.upper())

    # Make sure the directory for the log file exists before attaching a handler.
    if log_file:
        Path(log_file).parent.mkdir(parents=True, exist_ok=True)

    logger = logging.getLogger("recipe_bot")
    logger.setLevel(level)
    logger.handlers.clear()  # idempotent setup: drop handlers from earlier calls

    formatter = logging.Formatter(
        fmt="%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Build the requested handlers: console first, then the rotating file.
    handlers = []
    if enable_console:
        handlers.append(logging.StreamHandler(sys.stdout))
    if log_file:
        handlers.append(logging.handlers.RotatingFileHandler(
            filename=log_file,
            maxBytes=max_file_size,
            backupCount=backup_count,
            encoding='utf-8'
        ))
    for handler in handlers:
        handler.setLevel(level)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    # Keep records out of the root logger to prevent duplicate output.
    logger.propagate = False

    return logger
70
+
71
def get_logger(name: str) -> logging.Logger:
    """Return a named child of the application's "recipe_bot" logger."""
    child_name = f"recipe_bot.{name}"
    return logging.getLogger(child_name)
74
+
75
+ # Default logger setup
76
def setup_default_logging():
    """Apply the app-wide defaults: INFO level, console plus ./logs/recipe_bot.log."""
    default_log_file = "./logs/recipe_bot.log"
    return setup_logging(
        log_level="INFO",
        log_file=default_log_file,
        enable_console=True,
    )
backend/config/settings.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration settings for the Recipe Recommendation Bot
2
+ import os
3
+ from typing import Optional, List
4
+ from dotenv import load_dotenv
5
+
6
+ # Load environment variables from .env file
7
+ load_dotenv()
8
+
9
class Settings:
    """Simple settings class that reads environment variables directly.

    All values are resolved once at construction time from the process
    environment (a .env file is loaded at module import). Attribute names
    mirror the corresponding environment variables.
    """

    def __init__(self):
        # ===========================================
        # Server Configuration
        # ===========================================
        self.PORT = int(os.getenv("PORT", 8000))
        self.HOST = os.getenv("HOST", "0.0.0.0")
        self.ENVIRONMENT = os.getenv("ENVIRONMENT", "development")
        self.DEBUG = os.getenv("DEBUG", "true").lower() == "true"

        # ===========================================
        # CORS Configuration
        # ===========================================
        cors_origins = os.getenv("CORS_ORIGINS", '["http://localhost:3000","http://localhost:5173","http://localhost:8080"]')
        self.CORS_ORIGINS = self._parse_list(cors_origins)
        self.CORS_ALLOW_CREDENTIALS = os.getenv("CORS_ALLOW_CREDENTIALS", "true").lower() == "true"

        cors_methods = os.getenv("CORS_ALLOW_METHODS", '["GET","POST","PUT","DELETE","OPTIONS"]')
        self.CORS_ALLOW_METHODS = self._parse_list(cors_methods)

        cors_headers = os.getenv("CORS_ALLOW_HEADERS", '["*"]')
        self.CORS_ALLOW_HEADERS = self._parse_list(cors_headers)

        # ===========================================
        # LLM & Embedding Provider Configuration
        # ===========================================
        self.LLM_PROVIDER = os.getenv("LLM_PROVIDER", "google")
        self.EMBEDDING_PROVIDER = os.getenv("EMBEDDING_PROVIDER", self.LLM_PROVIDER)  # Default to same as LLM

        # OpenAI Configuration
        self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        self.OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5-nano")
        self.OPENAI_TEMPERATURE = float(os.getenv("OPENAI_TEMPERATURE", "0.7"))
        self.OPENAI_MAX_TOKENS = int(os.getenv("OPENAI_MAX_TOKENS", "1000"))

        # Google AI Configuration
        self.GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
        self.GOOGLE_MODEL = os.getenv("GOOGLE_MODEL", "gemini-2.5-flash")
        self.GOOGLE_TEMPERATURE = float(os.getenv("GOOGLE_TEMPERATURE", "0.7"))
        self.GOOGLE_MAX_TOKENS = int(os.getenv("GOOGLE_MAX_TOKENS", "1000"))

        # Hugging Face Configuration
        self.HUGGINGFACE_API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
        self.HUGGINGFACE_MODEL = os.getenv("HUGGINGFACE_MODEL", "microsoft/DialoGPT-medium")
        self.HUGGINGFACE_API_URL = os.getenv("HUGGINGFACE_API_URL", "https://api-inference.huggingface.co/models/")
        self.HUGGINGFACE_USE_GPU = os.getenv("HUGGINGFACE_USE_GPU", "false").lower() == "true"
        self.HUGGINGFACE_USE_API = os.getenv("HUGGINGFACE_USE_API", "false").lower() == "true"

        # Ollama Configuration
        self.OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
        self.OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.1:8b")
        self.OLLAMA_TEMPERATURE = float(os.getenv("OLLAMA_TEMPERATURE", "0.7"))

        # ===========================================
        # Embedding Model Configuration
        # ===========================================
        # Note: which of these is used is determined by EMBEDDING_PROVIDER above.

        # OpenAI Embeddings
        self.OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002")

        # Google Embeddings
        self.GOOGLE_EMBEDDING_MODEL = os.getenv("GOOGLE_EMBEDDING_MODEL", "models/embedding-001")

        # Hugging Face Embeddings
        self.HUGGINGFACE_EMBEDDING_MODEL = os.getenv("HUGGINGFACE_EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

        # Ollama Embeddings
        self.OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")

        # ===========================================
        # Logging Configuration
        # ===========================================
        self.LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
        self.LOG_FORMAT = os.getenv("LOG_FORMAT", "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        self.LOG_FILE = os.getenv("LOG_FILE", "./logs/app.log")

        # ===========================================
        # Langchain Debugging Configuration
        # ===========================================
        # Note: set to "true" to enable detailed Langchain logs
        self.LANGCHAIN_DEBUG = os.getenv("LANGCHAIN_DEBUG", "false").lower() == "true"

    def _parse_list(self, value: str) -> List[str]:
        """Parse a JSON-ish list string (e.g. '["a","b"]' or 'a,b') into a list.

        Falls back to ["*"] (allow all) when the value cannot be parsed.
        """
        try:
            # Remove brackets and quotes, split by comma
            if value.startswith('[') and value.endswith(']'):
                value = value[1:-1]
            items = [item.strip().strip('"').strip("'") for item in value.split(',')]
            return [item for item in items if item]  # Remove empty items
        except Exception:  # BUGFIX: was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt
            return ["*"]  # Fallback to allow all

    def get_llm_config(self):
        """Get LLM configuration based on selected provider.

        Raises:
            ValueError: If LLM_PROVIDER is not one of openai/google/huggingface/ollama.
        """
        if self.LLM_PROVIDER == "openai":
            return {
                "provider": "openai",
                "api_key": self.OPENAI_API_KEY,
                "model": self.OPENAI_MODEL,
                "temperature": self.OPENAI_TEMPERATURE,
                "max_tokens": self.OPENAI_MAX_TOKENS
            }
        elif self.LLM_PROVIDER == "google":
            return {
                "provider": "google",
                "api_key": self.GOOGLE_API_KEY,
                "model": self.GOOGLE_MODEL,
                "temperature": self.GOOGLE_TEMPERATURE,
                "max_tokens": self.GOOGLE_MAX_TOKENS
            }
        elif self.LLM_PROVIDER == "huggingface":
            return {
                "provider": "huggingface",
                "api_token": self.HUGGINGFACE_API_TOKEN,
                "model": self.HUGGINGFACE_MODEL,
                "api_url": self.HUGGINGFACE_API_URL,
                "use_gpu": self.HUGGINGFACE_USE_GPU,
                "use_api": self.HUGGINGFACE_USE_API
            }
        elif self.LLM_PROVIDER == "ollama":
            return {
                "provider": "ollama",
                "base_url": self.OLLAMA_BASE_URL,
                "model": self.OLLAMA_MODEL,
                "temperature": self.OLLAMA_TEMPERATURE
            }
        else:
            raise ValueError(f"Unsupported LLM provider: {self.LLM_PROVIDER}")

    def get_embedding_config(self):
        """Get embedding configuration based on EMBEDDING_PROVIDER setting.

        Raises:
            ValueError: If EMBEDDING_PROVIDER is not a supported provider.
        """
        provider = self.EMBEDDING_PROVIDER

        if provider == "openai":
            return {
                "provider": "openai",
                "api_key": self.OPENAI_API_KEY,
                "model": self.OPENAI_EMBEDDING_MODEL
            }
        elif provider == "google":
            return {
                "provider": "google",
                "api_key": self.GOOGLE_API_KEY,
                "model": self.GOOGLE_EMBEDDING_MODEL
            }
        elif provider == "huggingface":
            return {
                "provider": "huggingface",
                "model": self.HUGGINGFACE_EMBEDDING_MODEL
            }
        elif provider == "ollama":
            return {
                "provider": "ollama",
                "base_url": self.OLLAMA_BASE_URL,
                "model": self.OLLAMA_EMBEDDING_MODEL
            }
        else:
            raise ValueError(f"Unsupported provider: {provider}. Supported providers: openai, google, huggingface, ollama")
+ raise ValueError(f"Unsupported provider: {provider}. Supported providers: openai, google, huggingface, ollama")
171
+
172
+ # Create global settings instance
173
+ settings = Settings()
174
+
175
+ # Note: Vector store and database configuration is in database.py
176
+ # from config.database import db_settings
backend/core/__init__.py ADDED
File without changes
backend/core/exceptions.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Custom exception handlers for the Recipe Recommendation Bot

class RecipeBotError(Exception):
    """Base class for all Recipe Recommendation Bot errors.

    Catching this single type covers every domain-specific failure below,
    while existing handlers that catch the concrete subclasses keep working.
    """
    pass

class RecipeNotFoundError(RecipeBotError):
    """Raised when a recipe is not found"""
    pass

class LLMServiceError(RecipeBotError):
    """Raised when LLM service encounters an error"""
    pass

class VectorStoreError(RecipeBotError):
    """Raised when vector store operations fail"""
    pass
14
+
15
+ # TODO: Add more specific exception classes and error handlers
backend/data/__init__.py ADDED
File without changes
backend/data/sample_recipes.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "title": "Mixed Seafood Coconut Fried Rice",
4
+ "ingredients": [
5
+ "jasmine rice",
6
+ "cooked shrimp",
7
+ "prawns",
8
+ "scallops",
9
+ "coconut milk",
10
+ "fish sauce",
11
+ "soy sauce",
12
+ "garlic",
13
+ "onion",
14
+ "ginger",
15
+ "green onions",
16
+ "cilantro",
17
+ "lime",
18
+ "vegetable oil",
19
+ "salt",
20
+ "pepper"
21
+ ],
22
+ "instructions": "1. Heat vegetable oil in large pan over medium-high heat. 2. Add garlic, onion, and ginger, stir-fry until fragrant (about 1 minute). 3. Add cooked jasmine rice and mix well, breaking up any clumps. 4. Add shrimp, prawns, and scallops, cook until heated through (3-4 minutes). 5. Pour in coconut milk and season with fish sauce and soy sauce. 6. Stir everything together and cook for 2-3 minutes until well combined. 7. Garnish with chopped green onions and fresh cilantro. 8. Serve immediately with lime wedges on the side.",
23
+ "metadata": {
24
+ "cook_time": "25 minutes",
25
+ "difficulty": "medium",
26
+ "servings": "4",
27
+ "category": "seafood",
28
+ "image_url": "https://example.com/images/mixed-seafood-coconut-fried-rice.jpg"
29
+ }
30
+ },
31
+ {
32
+ "title": "Classic Chicken Alfredo Pasta",
33
+ "ingredients": [
34
+ "fettuccine pasta",
35
+ "chicken breast",
36
+ "heavy cream",
37
+ "parmesan cheese",
38
+ "butter",
39
+ "garlic",
40
+ "italian seasoning",
41
+ "salt",
42
+ "pepper",
43
+ "olive oil",
44
+ "parsley"
45
+ ],
46
+ "instructions": "1. Cook fettuccine pasta according to package directions, drain and set aside. 2. Season chicken breast with salt, pepper, and Italian seasoning. 3. Heat olive oil in large skillet over medium-high heat. 4. Cook chicken breast until golden brown and cooked through (6-7 minutes per side). 5. Remove chicken and slice into strips. 6. In same skillet, melt butter and add minced garlic, cook for 1 minute. 7. Add heavy cream and bring to gentle simmer. 8. Stir in grated Parmesan cheese until melted and smooth. 9. Add cooked pasta and chicken strips to sauce. 10. Toss everything together and garnish with fresh parsley.",
47
+ "metadata": {
48
+ "cook_time": "30 minutes",
49
+ "difficulty": "easy",
50
+ "servings": "4",
51
+ "category": "pasta",
52
+ "image_url": "https://example.com/images/chicken-alfredo-pasta.jpg"
53
+ }
54
+ },
55
+ {
56
+ "title": "Vegetarian Black Bean Tacos",
57
+ "ingredients": [
58
+ "black beans",
59
+ "canned corn",
60
+ "tortillas",
61
+ "avocado",
62
+ "lime",
63
+ "red onion",
64
+ "cilantro",
65
+ "cumin",
66
+ "chili powder",
67
+ "garlic powder",
68
+ "salt",
69
+ "pepper",
70
+ "olive oil",
71
+ "lettuce",
72
+ "tomato",
73
+ "mexican cheese blend"
74
+ ],
75
+ "instructions": "1. Drain and rinse black beans, drain corn. 2. Heat olive oil in skillet over medium heat. 3. Add black beans, corn, cumin, chili powder, garlic powder, salt, and pepper. 4. Cook for 5-7 minutes until heated through and flavors meld. 5. Warm tortillas in dry skillet or microwave. 6. Dice avocado, red onion, and tomato. 7. Squeeze lime juice over diced avocado to prevent browning. 8. Assemble tacos with bean mixture, lettuce, tomato, avocado, onion, and cheese. 9. Garnish with fresh cilantro and serve with lime wedges.",
76
+ "metadata": {
77
+ "cook_time": "15 minutes",
78
+ "difficulty": "easy",
79
+ "servings": "3",
80
+ "category": "vegetarian",
81
+ "image_url": "https://example.com/images/black-bean-tacos.jpg"
82
+ }
83
+ },
84
+ {
85
+ "title": "Beef and Vegetable Stir Fry",
86
+ "ingredients": [
87
+ "beef sirloin",
88
+ "soy sauce",
89
+ "sesame oil",
90
+ "cornstarch",
91
+ "broccoli",
92
+ "bell peppers",
93
+ "carrots",
94
+ "snap peas",
95
+ "garlic",
96
+ "ginger",
97
+ "vegetable oil",
98
+ "rice vinegar",
99
+ "brown sugar",
100
+ "green onions",
101
+ "sesame seeds"
102
+ ],
103
+ "instructions": "1. Slice beef sirloin into thin strips and marinate with soy sauce, sesame oil, and cornstarch for 15 minutes. 2. Cut broccoli into florets, slice bell peppers and carrots. 3. Heat vegetable oil in large wok or skillet over high heat. 4. Add marinated beef and stir-fry until browned (3-4 minutes). 5. Remove beef and set aside. 6. Add more oil if needed, then add garlic and ginger, stir for 30 seconds. 7. Add hard vegetables (carrots, broccoli) first, stir-fry for 2 minutes. 8. Add bell peppers and snap peas, stir-fry for another 2 minutes. 9. Return beef to pan, add rice vinegar and brown sugar. 10. Stir everything together for 1-2 minutes. 11. Garnish with green onions and sesame seeds.",
104
+ "metadata": {
105
+ "cook_time": "20 minutes",
106
+ "difficulty": "medium",
107
+ "servings": "4",
108
+ "category": "beef",
109
+ "image_url": "https://example.com/images/beef-vegetable-stir-fry.jpg"
110
+ }
111
+ },
112
+ {
113
+ "title": "Mediterranean Quinoa Salad",
114
+ "ingredients": [
115
+ "quinoa",
116
+ "cucumber",
117
+ "cherry tomatoes",
118
+ "red onion",
119
+ "kalamata olives",
120
+ "feta cheese",
121
+ "olive oil",
122
+ "lemon juice",
123
+ "oregano",
124
+ "salt",
125
+ "pepper",
126
+ "fresh mint",
127
+ "parsley"
128
+ ],
129
+ "instructions": "1. Rinse quinoa thoroughly and cook according to package directions (usually 1:2 ratio with water). 2. Let cooked quinoa cool completely. 3. Dice cucumber, halve cherry tomatoes, and thinly slice red onion. 4. Pit and halve kalamata olives, crumble feta cheese. 5. In large bowl, combine cooled quinoa with prepared vegetables and olives. 6. Make dressing by whisking together olive oil, lemon juice, oregano, salt, and pepper. 7. Pour dressing over salad and toss well. 8. Add crumbled feta cheese and chopped fresh mint and parsley. 9. Toss gently and let sit for 15 minutes to allow flavors to meld. 10. Serve chilled or at room temperature.",
130
+ "metadata": {
131
+ "cook_time": "25 minutes",
132
+ "difficulty": "easy",
133
+ "servings": "6",
134
+ "category": "salad",
135
+ "image_url": "https://example.com/images/mediterranean-quinoa-salad.jpg"
136
+ }
137
+ }
138
+ ]
backend/data_minning/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Services package initialization
backend/data_minning/all_nigerian_recipe_scraper.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Iterable, List, Optional
3
+ from urllib.parse import urljoin, urlparse
4
+
5
+ from bs4 import BeautifulSoup, NavigableString, Tag
6
+ from .dto.recipe_doc import RecipeDoc
7
+ from .base_scrapper import BaseRecipeScraper
8
+ from backend.utils.sanitization import clean
9
+
10
+
11
+ class AllNigerianRecipesScraper(BaseRecipeScraper):
12
+ # two-level paths like /beans/beans-porridge/
13
+ ALLOWED_CATS = {
14
+ "beans","soups","stews","salad","breakfast","rice",
15
+ "yam","plantain","swallow","drinks","desserts","meat","fish", "chicken"
16
+ }
17
+ PAT_ING = re.compile(r"\bingredients?\b|\bwhat you need\b|\bingredients? for\b", re.I)
18
+ PAT_NOTEI = re.compile(r"\bnotes?\b.*ingredients?\b", re.I)
19
+ PAT_BEFORE= re.compile(r"\bbefore you (cook|grill|fry|bake|roast|steam|boil|prepare)\b", re.I)
20
+ PAT_INSTR = re.compile(r"\b(preparation|directions|method|instructions|cooking|making)\b", re.I)
21
+ # Map first path segment (category) -> course
22
+ COURSE_BY_CATEGORY = {
23
+ "soups": "Soup",
24
+ "stews": "Stew",
25
+ "snacks": "Snack",
26
+ "drinks": "Drink",
27
+ "desserts": "Dessert",
28
+ "breakfast": "Breakfast",
29
+ "salad": "Salad",
30
+ "rice": "Main Course",
31
+ "beans": "Main Course",
32
+ "yam": "Main Course",
33
+ "plantain": "Main Course",
34
+ "meat": "Main Course",
35
+ "fish": "Main Course",
36
+ "chicken": "Main Course", # <-- add chicken
37
+ "swallow": "Side Dish",
38
+ }
39
+ # Fallback keyword hints from URL/title if category isn’t enough
40
+ COURSE_BY_KEYWORD = [
41
+ (r"\bsoup(s)?\b", "Soup"),
42
+ (r"\bstew(s)?\b", "Stew"),
43
+ (r"\bsalad(s)?\b", "Salad"),
44
+ (r"\b(snack|small[-\s]?chops)\b", "Snack"),
45
+ (r"\b(drink|juice|smoothie)\b", "Drink"),
46
+ (r"\bbreakfast\b", "Breakfast"),
47
+ (r"\bdessert(s)?\b", "Dessert"),
48
+ ]
49
+
50
    def __init__(self, base_domain="www.allnigerianrecipes.com", index_url="https://www.allnigerianrecipes.com/other/sitemap/"):
        """Initialize the scraper for allnigerianrecipes.com.

        Args:
            base_domain: Host used by same_domain() to filter discovered links.
            index_url: Sitemap page whose anchors seed URL discovery.
        """
        super().__init__(base_domain)
        self.index_url = index_url
53
+
54
+ def _is_two_level_recipe(self, url: str) -> bool:
55
+ sp = urlparse(url); segs = [s for s in sp.path.strip("/").split("/") if s]
56
+ if len(segs) != 2: return False
57
+ if segs[0].lower() not in self.ALLOWED_CATS: return False
58
+ bad = {"page","tag","tags","category","categories","author","search"}
59
+ if any(s in bad for s in segs): return False
60
+ if sp.path.lower().endswith((".xml",".pdf",".zip",".jpg",".jpeg",".png",".webp",".mp4",".mov")):
61
+ return False
62
+ return True
63
+
64
+ def discover_urls(self) -> Iterable[str]:
65
+ soup = self.fetch_soup(self.index_url)
66
+ seen = set()
67
+ for a in soup.select("a[href]"):
68
+ u = urljoin(self.index_url, a["href"])
69
+ if self.same_domain(u) and self._is_two_level_recipe(u) and u not in seen:
70
+ seen.add(u); yield u
71
+
72
+ def _li_text_only(self, li: Tag) -> str:
73
+ parts=[]
74
+ for ch in li.contents:
75
+ if isinstance(ch, NavigableString): parts.append(str(ch))
76
+ elif isinstance(ch, Tag) and ch.name not in ("ul","ol","iframe"):
77
+ parts.append(ch.get_text(" ", strip=True))
78
+ return clean(" ".join(parts)) or ""
79
+
80
+ def _collect_after(self, h: Tag, categories = None) -> List[str]:
81
+ lvl = int(h.name[1])
82
+ out = []
83
+ for sib in h.next_siblings:
84
+ if isinstance(sib, Tag) and sib.name in self.HEADING_TAGS and int(sib.name[1]) <= lvl:
85
+ break
86
+ if isinstance(sib, Tag):
87
+ if categories == 'ingredients':
88
+ if sib.name in ("ul", "ol"):
89
+ lis = sib.find_all("li", recursive=False) or sib.find_all("li")
90
+ for li in lis:
91
+ t = self._li_text_only(li)
92
+ if t:
93
+ out.append(t)
94
+ # Skip other tags for ingredients
95
+ else:
96
+ if sib.name in ("ul", "ol"):
97
+ lis = sib.find_all("li", recursive=False) or sib.find_all("li")
98
+ for li in lis:
99
+ t = self._li_text_only(li)
100
+ if t:
101
+ out.append(t)
102
+ elif sib.name in ("p", "div", "blockquote"):
103
+ t = clean(sib.get_text(" ", strip=True))
104
+ if t:
105
+ out.append(t)
106
+ elif isinstance(sib, NavigableString):
107
+ if categories != 'ingredients': # Only add NavigableString if not ingredients
108
+ t = clean(str(sib))
109
+ if t:
110
+ out.append(t)
111
+ return [x for x in out if not x.lower().startswith("video of ")]
112
+
113
    def extract_recipe(self, soup: BeautifulSoup, url: str, category: Optional[str] = None) -> RecipeDoc:
        """Parse one allnigerianrecipes.com page into a RecipeDoc.

        Strategy: JSON-LD first, then heading-driven DOM scraping for
        ingredients/instructions/notes, then text flattening, then
        category/course inference from the URL path.

        Args:
            soup: Parsed page.
            url: Page URL (used for doc identity and category inference).
            category: Unused here; kept for interface parity with the base class.

        Returns:
            A populated RecipeDoc whose ingredients/instructions are stored
            as single text blocks (team requirement).
        """
        doc = RecipeDoc.make(url)
        # JSON-LD first
        j = self.extract_jsonld(soup)
        if j:
            # Copy every JSON-LD field that RecipeDoc knows about, including
            # empty values (DOM scraping below only fills gaps for some fields).
            for k,v in j.items():
                if hasattr(doc, k): setattr(doc, k, v)
        # Title fallback
        if not doc.title:
            title = soup.find("h1") or soup.find("title")
            doc.title = clean(title.get_text()) if title else None

        # Constrain DOM scraping to the post body when the theme provides it.
        root = soup.select_one(".entry-content") or soup
        sections_ing, sections_instr, notes = [], [], []

        # Classify each heading via the class-level PAT_* regexes and collect
        # the content that follows it until the next same-level heading.
        for h in root.find_all(self.HEADING_TAGS):
            ht = h.get_text(" ", strip=True) or ""
            if self.PAT_NOTEI.search(ht):
                it = self._collect_after(h);
                if it: notes.append({"title": ht, "items": it})
            elif self.PAT_ING.search(ht):
                # Ingredients mode + heuristic filter to drop prose lines.
                it = [x for x in self._collect_after(h, 'ingredients') if self._looks_like_ingredient(x)]
                if it: sections_ing.append({"title": ht, "items": it})
            elif self.PAT_BEFORE.search(ht) or self.PAT_INSTR.search(ht):
                st = self._collect_after(h);
                if st: sections_instr.append({"title": ht, "steps": st})

        # Flatten
        # NOTE(review): note items are merged into the ingredient list here
        # AND also concatenated into doc.notes below — confirm the duplication
        # is intended (e.g. substitution notes doubling as ingredients).
        for s in sections_ing + notes:
            doc.ingredients.extend(s["items"])
        for s in sections_instr:
            doc.instructions.extend(s["steps"])

        ingredients_text = self._to_ingredients_text(doc.ingredients)
        instructions_text = self._to_instructions_text(doc.instructions)

        # Store as text (team requirement)
        # After this point the list-typed fields hold strings; downstream
        # code (RecipeDoc.finalize) only relies on truthiness, which holds.
        doc.ingredients = ingredients_text
        doc.instructions = instructions_text
        # Notes & image
        if notes:
            # concatenate notes into a single field
            doc.notes = " ".join(["; ".join(n.get("items", [])) for n in notes if n.get("items")])
        if not doc.image_url:
            doc.image_url = clean(self.get_meta_image(soup))

        # Infer category (first path segment) and map to a course
        if not getattr(doc, "category", None) or not getattr(doc, "course", None):
            cat, crs = self._infer_category_and_course(url, doc.title)
            if not doc.category:
                doc.category = cat
            if not doc.course:
                doc.course = crs

        return doc
168
+
169
+ def _looks_like_ingredient(self, text: str) -> bool:
170
+ if len(text.split()) > 15: return False
171
+ # Match numbers followed by units (with or without space), or common ingredient keywords
172
+ return bool(re.search(r"\b(\d+\s?(cup|cups|tsp|tbsp|teaspoon|tablespoon|g|kg|ml|l|gram)?|cup|cups|tsp|tbsp|teaspoon|tablespoon|g|kg|ml|l|gram|salt|pepper|onion|oil|water|rice|groundnuts|tamarind)\b", text, re.I))
173
+
174
+ @staticmethod
175
+ def _slug_to_title(slug: Optional[str]) -> Optional[str]:
176
+ return slug.replace("-", " ").title() if slug else None
177
+
178
+ def _infer_category_and_course(self, url: str, title: Optional[str]) -> tuple[Optional[str], Optional[str]]:
179
+ sp = urlparse(url)
180
+ segs = [s for s in sp.path.strip("/").split("/") if s]
181
+ cat_slug = segs[0] if segs else None
182
+ category = self._slug_to_title(cat_slug) if cat_slug else None
183
+
184
+ course = self.COURSE_BY_CATEGORY.get(cat_slug)
185
+ if not course:
186
+ hay = f"{url} {(title or '')}".lower()
187
+ for pat, crs in self.COURSE_BY_KEYWORD:
188
+ if re.search(pat, hay):
189
+ course = crs
190
+ break
191
+ if not course and category:
192
+ course = "Main Course"
193
+ return category, course
backend/data_minning/base_scrapper.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import asdict
2
+ from datetime import datetime
3
+ import io
4
+ import json
5
+ import os
6
+ import time
7
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
8
+ from pathlib import Path
9
+ from click import Tuple
10
+ from pymongo import MongoClient, UpdateOne, errors
11
+
12
+ from .dto.stream_opts import StreamOptions
13
+
14
+ from .dto.recipe_doc import RecipeDoc
15
+
16
+ from .soup_client import SoupClient
17
+ from backend.utils.sanitization import clean
18
+ from bs4 import BeautifulSoup
19
+ from backend.config.database import db_settings
20
+
21
class JsonArraySink:
    """
    Append-safe JSON array writer.
    - Creates file with `[` ... `]`
    - If file exists, removes trailing `]`, appends items, and re-closes.

    Bug fixed: earlier versions stripped the closing ``]`` only on the first
    ``write_many`` call per instance; every later call appended items *after*
    the bracket written by the previous call, corrupting the file. The sink
    now closes after each batch so the next batch re-prepares the file.
    """
    def __init__(self, path: str):
        self.path = path          # target JSON file
        self._opened = False      # True while self.f is a prepared handle
        self._first = True        # True until the array has at least one item
        self.f = None             # open "r+" handle, or None

    def _prepare(self):
        """Open the file and position/truncate it just before the final ']'."""
        if self._opened:
            return

        # Ensure file exists with an empty array
        if not os.path.exists(self.path):
            with open(self.path, "w", encoding="utf-8") as f:
                f.write("[\n]")

        self.f = open(self.path, "r+", encoding="utf-8")

        # Find the position of the final ']' scanning backwards in chunks.
        # NOTE(review): seek/read here treat text offsets as byte offsets,
        # which is only safe while the file tail is ASCII — confirm inputs.
        self.f.seek(0, io.SEEK_END)
        end = self.f.tell()
        step = min(4096, end)
        pos = end
        last_bracket = -1
        while pos > 0:
            pos = max(0, pos - step)
            self.f.seek(pos)
            chunk = self.f.read(step)
            j = chunk.rfind("]")
            if j != -1:
                last_bracket = pos + j
                break

        if last_bracket == -1:
            # Corrupt file: reset to empty array
            self.f.seek(0); self.f.truncate(0); self.f.write("[\n]"); self.f.flush()
            last_bracket = 2  # index of ']' in "[\n]"

        # Decide "is first item?" by inspecting the content BEFORE the ']'
        self.f.seek(0)
        prefix = self.f.read(last_bracket).strip()  # content up to (not incl.) ']'
        # Empty array has only '[' (possibly with whitespace/newline)
        self._first = (prefix == "[")

        # Now remove the closing ']' so we can append
        self.f.seek(last_bracket)
        self.f.truncate()

        self._opened = True

    def write_many(self, docs: List[Dict[str, Any]]):
        """Append *docs* to the array, leaving the file valid JSON at rest."""
        if not docs:
            return
        self._prepare()

        for d in docs:
            if not self._first:
                self.f.write(",\n")
            else:
                # First item: no leading comma
                self._first = False
            self.f.write(json.dumps(d, ensure_ascii=False, indent=2, default=str))

        # Restore the closing bracket
        self.f.write("\n]")
        self.f.flush()
        # Close so the next batch re-runs _prepare() and strips the ']' we
        # just wrote; without this, a second write_many on the same instance
        # would append after the bracket and corrupt the file.
        self.close()

    def close(self):
        if self.f:
            self.f.close()
        self._opened = False
+
98
class MongoSink:
    """Upserts recipe documents into the configured MongoDB collection,
    keyed by URL."""

    def __init__(self, ):
        cfg = db_settings.get_vector_store_config()
        self.client = MongoClient(cfg["uri"], retryWrites=True, serverSelectionTimeoutMS=10000)
        self.col = self.client[cfg["database"]][cfg["collection_name"]]
        self._ensure_indexes()

    def _ensure_indexes(self):
        # `url` is the upsert key, so it must be unique; the others speed up
        # common lookups. create_index is idempotent.
        self.col.create_index("url", unique=True)
        for field_name in ("title", "category", "scraped_at"):
            self.col.create_index(field_name)

    def write_many(self, docs: List[Dict[str, Any]]):
        """Bulk-upsert *docs* by URL, stamping scraped_at / created_at."""
        if not docs:
            return
        now = datetime.utcnow()
        ops = []
        for doc in docs:
            payload = doc.copy()
            payload.setdefault("scraped_at", now)
            ops.append(
                UpdateOne(
                    {"url": payload["url"]},
                    {"$set": payload, "$setOnInsert": {"created_at": now}},
                    upsert=True,
                )
            )
        try:
            self.col.bulk_write(ops, ordered=False)
        except errors.BulkWriteError:
            # duplicates or minor issues won't halt unordered bulk
            pass

    def close(self):
        self.client.close()
+
128
+ # we need to create a search function to fetch recipes by title or ingredients from the embeddings given the embedding fields, db and collection
129
+
130
class DualSink:
    """Fans writes out to an optional JSON file sink and an optional Mongo sink.

    Bug fixed: the Mongo branch previously called ``upsert_batch``, a method
    that does not exist on ``MongoSink`` (its batch writer is ``write_many``),
    so any run with a Mongo sink attached crashed on the first flush.
    """
    def __init__(self, json_sink: Optional["JsonArraySink"], mongo_sink: Optional["MongoSink"]):
        self.json = json_sink
        self.mongo = mongo_sink

    def write_many(self, docs: List[Dict[str, Any]]):
        """Forward *docs* to every configured sink."""
        if self.json:
            self.json.write_many(docs)
        if self.mongo:
            # was: self.mongo.upsert_batch(docs) — no such method on MongoSink
            self.mongo.write_many(docs)

    def close(self):
        if self.json:
            self.json.close()
        if self.mongo:
            self.mongo.close()
+
141
+
142
class BaseRecipeScraper(SoupClient):
    """Shared base for site-specific recipe scrapers.

    Provides JSON-LD extraction, ingredient/instruction text normalization,
    optional per-batch embedding generation, and a resumable, batched
    streaming crawl loop. Subclasses implement ``discover_urls`` and
    ``extract_recipe``.
    """

    # Heading tags used to delimit page sections when walking the DOM.
    HEADING_TAGS = ("h1","h2","h3","h4","h5","h6")

    def __init__(
        self,
        *args,
        embedder= None,
        embedding_fields= None,
        **kwargs
    ):
        """
        embedder: HFEmbedder(), optional
        embedding_fields: list of (source_field, target_field) like:
            [("title", "title_emb"), ("instructions_text", "instr_emb")]
        """
        super().__init__(*args, **kwargs)
        self.embedder = embedder
        self.embedding_fields = embedding_fields or []
        # Alias: SoupClient provides `self.log`; some methods use `self.logger`.
        self.logger = self.log

    def extract_jsonld(self, soup: BeautifulSoup) -> Optional[Dict[str, Any]]:
        """Return the first schema.org Recipe node as a RecipeDoc dict, or None.

        Scans every ``<script type="application/ld+json">`` block, tolerating
        malformed JSON, ``@graph`` wrappers, and list-typed ``@type`` values.
        """
        def to_list(x): return x if isinstance(x, list) else [x]
        for tag in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(tag.string or "{}")
            except Exception:
                # Malformed JSON-LD is common in the wild; skip the block.
                continue
            # Pages may wrap nodes in "@graph", be a bare dict, or a list.
            nodes = (data.get("@graph", [data]) if isinstance(data, dict)
                     else (data if isinstance(data, list) else []))
            for n in nodes:
                if not isinstance(n, dict): continue
                t = n.get("@type")
                if t == "Recipe" or (isinstance(t, list) and "Recipe" in t):
                    doc = RecipeDoc()
                    doc.title = clean(n.get("name"))
                    # ingredients: entries may be plain strings or objects
                    ings = []
                    for ing in to_list(n.get("recipeIngredient") or []):
                        if isinstance(ing, dict):
                            ings.append(clean(ing.get("name") or ing.get("text")))
                        else:
                            ings.append(clean(str(ing)))
                    doc.ingredients = [x for x in ings if x]
                    # instructions: HowToStep objects or plain strings
                    steps = []
                    for st in to_list(n.get("recipeInstructions") or []):
                        if isinstance(st, dict):
                            steps.append(clean(st.get("text") or st.get("name")))
                        else:
                            steps.append(clean(str(st)))
                    doc.instructions = [x for x in steps if x]

                    doc.servings = n.get("recipeYield")
                    # "image" may be a dict with a url, a list, or a bare string.
                    doc.image_url = clean((n.get("image") or {}).get("url") if isinstance(n.get("image"), dict) else (n.get("image")[0] if isinstance(n.get("image"), list) else n.get("image")))
                    doc.course = clean(n.get("recipeCategory")) if isinstance(n.get("recipeCategory"), str) else None
                    doc.cuisine = clean(n.get("recipeCuisine")) if isinstance(n.get("recipeCuisine"), str) else None
                    return asdict(doc)
        return None

    @staticmethod
    def _dedupe_preserve_order(items: List[str]) -> List[str]:
        """Clean each entry and drop duplicates, keeping first-seen order."""
        seen = set()
        out = []
        for x in items:
            x = clean(x)
            if not x or x in seen:
                continue
            seen.add(x); out.append(x)
        return out

    @staticmethod
    def _to_ingredients_text(items: List[str]) -> str:
        """
        Turn ingredient bullets into a single text block.
        Using one-per-line is great for embeddings and human readability.
        """
        items = [clean(x) for x in items if x]
        items = BaseRecipeScraper._dedupe_preserve_order(items)
        return "\n".join(f"- {x}" for x in items)

    @staticmethod
    def _to_instructions_text(steps: List[str]) -> str:
        """
        Turn ordered steps into a single text block.
        Numbered paragraphs help embeddings keep sequence context.
        """
        steps = [clean(x) for x in steps if x]
        steps = BaseRecipeScraper._dedupe_preserve_order(steps)
        return "\n\n".join(f"{i}. {s}" for i, s in enumerate(steps, 1))
    # site-specific scrapers override these two:
    def discover_urls(self) -> Iterable[str]:
        """Yield candidate recipe URLs for this site (subclass hook)."""
        raise NotImplementedError
    def extract_recipe(self, soup: BeautifulSoup, url: str, category: Optional[str] = None) -> RecipeDoc:
        """Parse one recipe page into a RecipeDoc (subclass hook)."""
        raise NotImplementedError

    # shared streaming loop
    def stream(self, sink: DualSink, options: Optional[StreamOptions] = None) -> int:
        """Crawl discover_urls(), extract each recipe, and flush batches to *sink*.

        Supports an optional resume file (one URL per line, under ./data) and
        per-batch embedding generation. The sink is always closed on exit.

        Returns:
            The number of documents saved.
        """
        opts = options or StreamOptions()
        self.log.info(
            f"Starting stream: limit={opts.limit} batch_size={opts.batch_size} "
            f"resume_file={opts.resume_file} sink={type(sink).__name__}"
        )

        processed = set()
        if opts.resume_file:
            resume_path = Path("data") / opts.resume_file  # <-- not ../data
            print(resume_path, 'resume_path')
            if resume_path.exists():  # <-- open only if it exists
                with resume_path.open("r", encoding="utf-8") as f:
                    processed = {line.strip() for line in f if line.strip()}
            else:
                processed = set()
            self.log.info(f"[resume] {len(processed)} URLs already done")

        batch, saved = [], 0
        try:
            for i, url in enumerate(self.discover_urls(), 1):
                if opts.limit and i > opts.limit: break
                if not self.same_domain(url): continue
                if url in processed: continue

                try:
                    soup = self.fetch_soup(url)
                    doc = self.extract_recipe(soup, url)
                    doc.finalize()
                    batch.append(asdict(doc))
                except Exception as e:
                    self.log.warning(f"[skip] {url} -> {e}")

                # NOTE(review): the URL is appended to the resume file even
                # when extraction failed above, so failures are never retried
                # on a later run — confirm this is intended.
                if opts.resume_file:
                    resume_path = Path("data") / opts.resume_file
                    with open(resume_path, "a", encoding="utf-8") as rf:
                        rf.write(url + "\n")

                # Flush a full batch: embed, write, report progress.
                if len(batch) >= opts.batch_size:
                    self._apply_embeddings(batch)
                    sink.write_many(batch); saved += len(batch); batch = []
                    if opts.progress_callback: opts.progress_callback(saved)
                    self.log.info(f"[resume] {saved} URLs already done 1")

                if i % 25 == 0:
                    self.log.info(f"…processed {i}, saved {saved}")

                # Politeness delay between page fetches.
                time.sleep(opts.delay)

            # Flush any remainder smaller than a full batch.
            if batch:
                self._apply_embeddings(batch)
                sink.write_many(batch); saved += len(batch)
                if opts.progress_callback: opts.progress_callback(saved)
                self.log.info(f"[resume] {saved} URLs already done2 ")
        finally:
            sink.close()

        self.log.info(f"[done] saved {saved}")
        return saved

    @staticmethod
    def _field_to_text(val: Any) -> str:
        """Coerce a document field to plain text; lists join with newlines."""
        if isinstance(val, list):
            return "\n".join(str(x) for x in val)
        if val is None:
            return ""
        return str(val)

    def _gather_text(self, doc: Dict[str, Any], src: Any) -> str:
        """Build embedding input text from one field name or a tuple of names."""
        if isinstance(src, tuple):
            parts: List[str] = []
            for f in src:
                t = self._field_to_text(doc.get(f))
                if t:
                    # Optional: label sections to help the embedder
                    label = "Ingredients" if f == "ingredients" else ("Instructions" if f == "instructions" else f)
                    parts.append(f"{label}:\n{t}")
            return "\n\n".join(parts)
        else:
            return self._field_to_text(doc.get(src))

    def _apply_embeddings(self, batch: List[Dict[str, Any]]) -> None:
        """
        Applies embeddings to specified fields in a batch of documents.

        For each (source_field, destination_field) pair in `self.embedding_fields`, this method:
        - Extracts the value from `source_field` in each document of the batch.
        - Converts the value to a string. If the value is a list, joins its elements with newlines.
        - Handles `None` values by converting them to empty strings.
        - Uses `self.embedder.encode` to generate embeddings for the processed texts.
        - Stores the resulting embedding vector in `destination_field` of each document.

        If `self.embedder`, `self.embedding_fields`, or `batch` is not set or empty, the method returns immediately.

        Args:
            batch (List[Dict[str, Any]]): A list of documents to process, where each document is a dictionary.

        Returns:
            None
        """
        if not self.embedder or not self.embedding_fields or not batch:
            return
        try:
            for src_spec, dst_field in self.embedding_fields:
                texts = [ self._gather_text(doc, src_spec) for doc in batch ]
                embs = self.embedder.encode(texts)
                for document, vec in zip(batch, embs):
                    document[dst_field] = vec
        except Exception as e:
            # Best-effort: embedding failures are logged, not fatal to the crawl.
            self.logger.warning(f"[stream error]: {e}")
+
backend/data_minning/dto/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Services package initialization
backend/data_minning/dto/recipe_doc.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List, Dict, Any
2
+ from dataclasses import dataclass, asdict, field
3
+ from datetime import datetime
4
+ from urllib.parse import urlparse
5
+
6
@dataclass
class RecipeDoc:
    """Normalized recipe record shared by all site scrapers."""
    title: Optional[str] = None
    url: Optional[str] = None
    source: Optional[str] = None
    category: Optional[str] = None
    ingredients: List[str] = field(default_factory=list)
    instructions: List[str] = field(default_factory=list)
    prep_time: Optional[str] = None
    cook_time: Optional[str] = None
    total_time: Optional[str] = None
    servings: Optional[Any] = None
    calories: Optional[float] = None
    rating: Optional[float] = None
    rating_count: Optional[int] = None
    course: Optional[str] = None
    cuisine: Optional[str] = None
    notes: Optional[str] = None
    image_url: Optional[str] = None
    needs_review: Optional[bool] = None
    section: List[Dict[str, Any]] = field(default_factory=list)
    # internal bookkeeping
    scraped_at: Optional[datetime] = None

    @staticmethod
    def make(url: str) -> "RecipeDoc":
        """Build a doc pre-populated with url, source host and scrape timestamp."""
        return RecipeDoc(
            url=url,
            source=urlparse(url).netloc,
            scraped_at=datetime.utcnow(),
        )

    def finalize(self):
        """Flag incomplete docs for review and normalize empty values."""
        # A recipe with no ingredients or no steps needs a human look.
        self.needs_review = bool(not self.ingredients or not self.instructions)
        # Empty strings become None; empty lists become None except for the
        # fields that are list-typed by contract.
        list_fields = ("ingredients", "instructions", "section")
        for name, value in list(asdict(self).items()):
            if isinstance(value, list) and not value:
                setattr(self, name, [] if name in list_fields else None)
            elif value == "":
                setattr(self, name, None)
45
+
backend/data_minning/dto/stream_opts.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Callable, Optional
3
+
4
+
5
@dataclass
class StreamOptions:
    """Tuning knobs for BaseRecipeScraper.stream()."""
    # Seconds slept between page fetches (politeness delay).
    delay: float = 0.3
    # Stop after this many discovered URLs; None means no cap.
    limit: Optional[int] = None
    # Number of documents buffered before flushing to the sink.
    batch_size: int = 50
    # Name of a file (under ./data) listing already-processed URLs.
    resume_file: Optional[str] = None
    # Invoked with the running total of saved documents after each flush.
    progress_callback: Optional[Callable[[int], None]] = None
backend/data_minning/soup_client.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional
3
+ from urllib.parse import urlparse
4
+
5
+ import requests
6
+ from bs4 import BeautifulSoup
7
+
8
class SoupClient:
    """Thin requests.Session wrapper that fetches pages as BeautifulSoup."""

    def __init__(self, base_domain: str, user_agent: str = None):
        self.base_domain = base_domain
        self.base_url = f"https://{base_domain}"
        self.session = requests.Session()
        # A browser-like UA avoids trivial bot blocks.
        default_ua = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
        )
        self.session.headers.update({"User-Agent": user_agent or default_ua})
        self.log = logging.getLogger(self.__class__.__name__)

    def same_domain(self, url: str) -> bool:
        """True if *url*'s host matches base_domain, with or without 'www.'."""
        host = urlparse(url).netloc.lower()
        base = self.base_domain.lower()
        return host in (base, base.replace("www.", ""))

    def fetch_soup(self, url: str, timeout: int = 30) -> BeautifulSoup:
        """GET *url* and parse with lxml; raises for HTTP error statuses."""
        response = self.session.get(url, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.content, "lxml")

    def close(self):
        """Release the underlying HTTP session."""
        self.session.close()

    def get_meta_image(self, soup: BeautifulSoup) -> Optional[str]:
        """Best-effort hero image: og/twitter meta tags, else first <img>."""
        meta_selectors = (
            "meta[property='og:image']",
            "meta[name='og:image']",
            "meta[name='twitter:image']",
            "meta[property='twitter:image']",
        )
        for selector in meta_selectors:
            tag = soup.select_one(selector)
            if tag and tag.get("content"):
                return tag["content"]
        img = soup.find("img")
        return (img.get("src") or img.get("data-src")) if img else None
backend/data_minning/yummy_medley_scraper.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Iterable, Optional
3
+ from urllib.parse import urljoin, urlparse
4
+ from bs4 import BeautifulSoup
5
+ from .base_scrapper import BaseRecipeScraper, RecipeDoc
6
+ from backend.utils.sanitization import clean
7
+
8
class YummyMedleyScraper(BaseRecipeScraper):
    """Specifically targets WPRM (WP Recipe Maker) blocks on yummymedley.com.

    Discovery walks the home-page tag cloud, then paginates each tag archive.

    Fixed: the numeric-parsing fallbacks (rating, rating count, calories)
    used bare ``except:`` clauses, which also swallow KeyboardInterrupt and
    SystemExit; they now catch only ``(TypeError, ValueError)``.
    """

    # Only collect tags from the home page tag-cloud widget
    TAG_CLOUD_SELECTORS = [
        "#tag_cloud-4 .tagcloud a[href*='/tag/']",
        "div.widget_tag_cloud .tagcloud a[href*='/tag/']",
    ]

    # How we find post links on a tag page (grid cards & headers)
    POST_LINK_SELECTORS = [
        "#main ul.sp-grid li article .post-header a[href]",  # header link
        "#main ul.sp-grid li article .post-img a[href]",      # image link
        "#main article .post-header a[href]",                 # fallback
    ]

    TAG_RE = re.compile(r"/tag/[^/]+/?$")

    def __init__(self, base_domain="www.yummymedley.com"):
        super().__init__(base_domain)

    def _discover_tag_urls_from_home(self) -> list[str]:
        """Return absolute tag-archive URLs from the home-page tag cloud."""
        soup = self.fetch_soup(self.base_url)
        tags = set()

        # prefer strict selector; gracefully fall back
        anchors = []
        for sel in self.TAG_CLOUD_SELECTORS:
            anchors = soup.select(sel)
            if anchors:
                break

        if not anchors:
            # final fallback: any /tag/... link on home page
            anchors = soup.find_all("a", href=self.TAG_RE)

        for a in anchors:
            href = a.get("href")
            if href:
                tags.add(urljoin(self.base_url, href))
        return sorted(tags)

    def _extract_post_links_from_tag_page(self, soup: BeautifulSoup) -> set[str]:
        """Collect absolute article links from one tag-archive page."""
        links = set()
        for sel in self.POST_LINK_SELECTORS:
            for a in soup.select(sel):
                href = a.get("href")
                if href:
                    links.add(urljoin(self.base_url, href))
            if links:
                break  # got some via this selector
        return links

    def discover_urls(self) -> Iterable[str]:
        """Yield unique article URLs: home tag cloud -> each tag's pages."""
        tags = self._discover_tag_urls_from_home()
        if not tags:
            # Safety: if tag cloud not found, bail early (or fallback to /recipes/)
            self.logger.warning("No tags discovered from home page tag cloud; falling back to /recipes/")
            tags = [urljoin(self.base_url, "/recipes/")]

        seen = set()
        for tag_url in tags:
            page = 1
            while page <= 20:  # hard cap to avoid runaway pagination
                url = tag_url if page == 1 else f"{tag_url.rstrip('/')}/page/{page}/"
                try:
                    soup = self.fetch_soup(url)
                except Exception as e:
                    self.logger.warning(f"[tag] fetch failed {url}: {e}")
                    break

                post_links = self._extract_post_links_from_tag_page(soup)
                if not post_links:
                    # no posts found -> stop paginating this tag
                    break

                for u in sorted(post_links):
                    if u not in seen and self._looks_like_article(u):
                        seen.add(u)
                        yield u

                # pagination: look for 'next' or 'older'
                next_link = (
                    soup.find("a", string=re.compile(r"next|older", re.I)) or
                    soup.find("a", rel="next")
                )
                if not next_link:
                    break
                page += 1

    def _looks_like_article(self, u: str) -> bool:
        """Filter out taxonomy/feed/asset URLs; keep shallow article paths."""
        sp = urlparse(u)
        if not self.same_domain(u): return False
        if re.search(r"/(tag|category|author|page|feed)/", sp.path, re.I): return False
        if sp.path.endswith((".xml",".jpg",".png",".pdf",".webp",".zip")): return False
        segs = [s for s in sp.path.strip("/").split("/") if s]
        return 1 <= len(segs) <= 3

    def extract_recipe(self, soup: BeautifulSoup, url: str, category: Optional[str] = None) -> RecipeDoc:
        """Parse one yummymedley.com page: JSON-LD first, then the WPRM block."""
        doc = RecipeDoc.make(url)
        # JSON-LD first (many WPRM pages also embed it)
        j = self.extract_jsonld(soup)
        if j:
            for k, v in j.items():
                if not hasattr(doc, k):
                    continue
                # skip empty values from JSON-LD
                if v in (None, "", [], {}):
                    continue
                # never overwrite an already-set url/source
                if k in ("url", "source") and getattr(doc, k):
                    continue
                setattr(doc, k, v)
        # WPRM block
        w = soup.find("div", class_="wprm-recipe-container")
        if w:
            if not doc.title:
                t = w.find(class_="wprm-recipe-name")
                doc.title = clean(t.get_text()) if t else doc.title
            # image
            if not doc.image_url:
                img = w.find("img")
                doc.image_url = clean(img.get("src") or img.get("data-src")) if img else clean(self.get_meta_image(soup))
            # rating (text may be missing or non-numeric)
            r = w.find(class_="wprm-recipe-rating-average")
            if r:
                try:
                    doc.rating = float(r.get_text().strip())
                except (TypeError, ValueError):
                    pass
            rc = w.find(class_="wprm-recipe-rating-count")
            if rc:
                try:
                    doc.rating_count = int(rc.get_text().strip())
                except (TypeError, ValueError):
                    pass
            # times
            def pick(c):
                x = w.find(class_=c)
                return clean(x.get_text()) if x else None
            doc.prep_time = pick("wprm-recipe-prep_time") or doc.prep_time
            doc.cook_time = pick("wprm-recipe-cook_time") or doc.cook_time
            doc.total_time= pick("wprm-recipe-total_time") or doc.total_time
            # servings: keep as int when purely numeric, text otherwise
            s = w.find(class_="wprm-recipe-servings")
            if s:
                txt = s.get_text().strip()
                doc.servings = int(txt) if txt.isdigit() else clean(txt)
            # calories
            cal = w.find(class_="wprm-recipe-calories")
            if cal:
                try:
                    doc.calories = float(cal.get_text().strip())
                except (TypeError, ValueError):
                    pass
            # course/cuisine
            cse = w.find(class_="wprm-recipe-course"); doc.course = clean(cse.get_text()) if cse else doc.course
            cui = w.find(class_="wprm-recipe-cuisine"); doc.cuisine = clean(cui.get_text()) if cui else doc.cuisine
            # ingredients: amount + unit + name (+ parenthesised notes)
            ings = []
            ic = w.find(class_="wprm-recipe-ingredients-container")
            if ic:
                for ing in ic.find_all(class_="wprm-recipe-ingredient"):
                    parts=[]
                    for cls in ("wprm-recipe-ingredient-amount","wprm-recipe-ingredient-unit","wprm-recipe-ingredient-name","wprm-recipe-ingredient-notes"):
                        el = ing.find(class_=cls)
                        if el:
                            t = el.get_text().strip()
                            if t:
                                parts.append(t if "notes" not in cls else f"({t})")
                    txt = clean(" ".join(parts))
                    if txt: ings.append(txt)
            if ings: doc.ingredients = ings
            # instructions
            steps=[]
            ic2 = w.find(class_="wprm-recipe-instructions-container")
            if ic2:
                for ins in ic2.find_all(class_="wprm-recipe-instruction"):
                    t = ins.find(class_="wprm-recipe-instruction-text") or ins
                    txt = clean(t.get_text().strip())
                    if txt: steps.append(txt)
            if steps: doc.instructions = steps

        ingredients_text = self._to_ingredients_text(doc.ingredients)
        instructions_text = self._to_instructions_text(doc.instructions)

        # Store as text (team requirement)
        doc.ingredients = ingredients_text
        doc.instructions = instructions_text

        # Fallback title & image
        if not doc.title:
            h1 = soup.find("h1") or soup.find("title")
            doc.title = clean(h1.get_text()) if h1 else None
        if not doc.image_url:
            doc.image_url = clean(self.get_meta_image(soup))

        # Optional category (yours wants "Afro-tropical Recipes" etc. — grab from breadcrumbs or page tags if available)
        cat = soup.find("a", href=re.compile(r"/category/|/tag/"))
        doc.category = clean(cat.get_text()) if cat else doc.category

        return doc
209
+
backend/docs/README.md ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Documentation Index
2
+
3
+ ## 🚨 Troubleshooting (Start Here)
4
+
5
+ ### Quick Fixes
6
+ - **[Embedding Troubleshooting](./embedding-troubleshooting.md)** - Fix dimension mismatch errors in 5 minutes
7
+ - **[Logging Guide](./logging_guide.md)** - Understanding error messages and logs
8
+
9
+ ### Common Error Messages
10
+ | Error | Quick Fix |
11
+ |-------|-----------|
12
+ | `shapes (768,) and (384,) not aligned` | [Embedding dimension mismatch](./embedding-troubleshooting.md#shapes-768-and-384-not-aligned) |
13
+ | `MongoDB connection failed` | Check `MONGODB_URI` in `.env` |
14
+ | `ChromaDB not available` | `pip install langchain_chroma` |
15
+ | `OpenAI API key invalid` | Update `OPENAI_API_KEY` in `.env` |
16
+
17
+ ## 📖 Comprehensive Guides
18
+
19
+ ### Setup & Configuration
20
+ - **[Embedding Compatibility Guide](./embedding-compatibility-guide.md)** - Complete guide to embedding models and dimensions
21
+ - **[Model Configuration Guide](./model-configuration-guide.md)** - Provider-specific settings, temperature limitations, and best practices
22
+ - **[Architecture Documentation](../docs/architecture.md)** - System overview and design patterns
23
+ - **[Deployment Guide](./deployment.md)** - Production deployment instructions
24
+
25
+ ### API & Development
26
+ - **[API Documentation](./api-documentation.md)** - Detailed API reference
27
+ - **[ChromaDB Refresh Guide](./chromadb_refresh.md)** - Database management and refresh procedures
28
+
29
+ ## 🔧 Developer Resources
30
+
31
+ ### Configuration Examples
32
+ ```bash
33
+ # Most reliable setup (384D embeddings)
34
+ EMBEDDING_PROVIDER=huggingface
35
+ HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
36
+
37
+ # Local inference setup (768D embeddings)
38
+ EMBEDDING_PROVIDER=ollama
39
+ OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
40
+
41
+ # Premium quality setup (1536D embeddings)
42
+ EMBEDDING_PROVIDER=openai
43
+ OPENAI_EMBEDDING_MODEL=text-embedding-3-small
44
+ ```
45
+
46
+ ### Quick Commands
47
+ ```bash
48
+ # Check current embedding configuration
49
+ grep EMBEDDING .env
50
+
51
+ # Test API health
52
+ curl http://localhost:8080/health
53
+
54
+ # Clear conversation memory
55
+ curl -X POST http://localhost:8080/clear-memory
56
+
57
+ # View recent logs
58
+ tail -f ./logs/recipe_bot.log
59
+ ```
60
+
61
+ ## 📋 File Organization
62
+
63
+ ```
64
+ docs/
65
+ ├── README.md # This file
66
+ ├── embedding-troubleshooting.md # 🚨 Quick fixes
67
+ ├── embedding-compatibility-guide.md # 📖 Complete embedding guide
68
+ ├── model-configuration-guide.md # ⚙️ LLM provider configurations
69
+ ├── logging_guide.md # 🔍 Log analysis
70
+ ├── chromadb_refresh.md # 🔄 Database management
71
+ ├── api-documentation.md # 📡 API reference
72
+ ├── architecture.md # 🏗️ System design
73
+ └── deployment.md # 🚀 Production setup
74
+ ```
75
+
76
+ ## 🆘 Getting Help
77
+
78
+ 1. **Check error logs**: `tail -f ./logs/recipe_bot.log`
79
+ 2. **Common issues**: Start with [Embedding Troubleshooting](./embedding-troubleshooting.md)
80
+ 3. **API problems**: See [API Documentation](./api-documentation.md)
81
+ 4. **Setup issues**: Review [Embedding Compatibility Guide](./embedding-compatibility-guide.md)
82
+
83
+ ---
84
+
85
+ 💡 **Tip**: Bookmark the [Embedding Troubleshooting](./embedding-troubleshooting.md) page - it solves 80% of common issues!
backend/docs/chromadb_refresh.md ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ChromaDB Refresh Feature Documentation
2
+
3
+ ## Overview
4
+
5
+ The ChromaDB refresh feature allows you to automatically delete and recreate your local vector database on application startup. This is useful when you add new recipe files or update existing content that needs to be re-indexed.
6
+
7
+ ## Configuration
8
+
9
+ ### Environment Variables
10
+
11
+ Add the following to your `.env` file:
12
+
13
+ ```bash
14
+ # Set to true to delete and recreate DB on startup (useful for adding new recipes)
15
+ DB_REFRESH_ON_START=false
16
+ ```
17
+
18
+ **Default:** `false` (disabled)
19
+
20
+ ### Environment Files Updated
21
+
22
+ - ✅ `.env` - Your local configuration
23
+ - ✅ `.env.example` - Template for new deployments
24
+ - ✅ `config/database.py` - Configuration class updated
25
+ - ✅ `services/vector_store.py` - Implementation added
26
+
27
+ ## How It Works
28
+
29
+ ### Normal Operation (DB_REFRESH_ON_START=false)
30
+ 1. Check if `DB_PERSIST_DIRECTORY` exists
31
+ 2. If exists with data → Load existing ChromaDB
32
+ 3. If empty/missing → Create new ChromaDB from recipe files
33
+
34
+ ### Refresh Mode (DB_REFRESH_ON_START=true)
35
+ 1. Check if `DB_PERSIST_DIRECTORY` exists
36
+ 2. If exists → **Delete entire directory** 🚨
37
+ 3. Create new ChromaDB from recipe files in `./data/recipes/`
38
+ 4. All data is re-indexed with current embedding model
39
+
40
+ ## Usage Examples
41
+
42
+ ### Adding New Recipes
43
+
44
+ ```bash
45
+ # 1. Add new recipe files to ./data/recipes/
46
+ cp new_recipes.json ./data/recipes/
47
+
48
+ # 2. Enable refresh in .env
49
+ DB_REFRESH_ON_START=true
50
+
51
+ # 3. Start application (will recreate database)
52
+ uvicorn app:app --reload
53
+
54
+ # 4. Disable refresh (IMPORTANT!)
55
+ DB_REFRESH_ON_START=false
56
+ ```
57
+
58
+ ### Changing Embedding Models
59
+
60
+ ```bash
61
+ # 1. Change embedding provider in .env
62
+ EMBEDDING_PROVIDER=openai
63
+ OPENAI_EMBEDDING_MODEL=text-embedding-3-large
64
+
65
+ # 2. Enable refresh to rebuild vectors
66
+ DB_REFRESH_ON_START=true
67
+
68
+ # 3. Start application
69
+ uvicorn app:app --reload
70
+
71
+ # 4. Disable refresh
72
+ DB_REFRESH_ON_START=false
73
+ ```
74
+
75
+ ### Troubleshooting Vector Issues
76
+
77
+ ```bash
78
+ # If ChromaDB is corrupted or having issues
79
+ DB_REFRESH_ON_START=true
80
+ # Restart app to rebuild from scratch
81
+ ```
82
+
83
+ ## Important Warnings ⚠️
84
+
85
+ ### Data Loss Warning
86
+ - **Refresh DELETES ALL existing vector data**
87
+ - **This operation CANNOT be undone**
88
+ - Always backup important data before refresh
89
+
90
+ ### Performance Impact
91
+ - Re-indexing takes time (depends on recipe count)
92
+ - Embedding API calls cost money (OpenAI, Google)
93
+ - Application startup will be slower during refresh
94
+
95
+ ### Memory Usage
96
+ - Large recipe datasets require more memory during indexing
97
+ - Monitor system resources during refresh
98
+
99
+ ## Best Practices
100
+
101
+ ### ✅ DO
102
+ - Set `DB_REFRESH_ON_START=false` after refresh completes
103
+ - Test refresh in development before production
104
+ - Monitor logs during refresh process
105
+ - Add new recipes in batches if possible
106
+
107
+ ### ❌ DON'T
108
+ - Leave refresh enabled in production
109
+ - Refresh unnecessarily (wastes resources)
110
+ - Interrupt refresh process (may corrupt data)
111
+ - Forget to disable after refresh
112
+
113
+ ## Monitoring and Logs
114
+
115
+ The refresh process is fully logged:
116
+
117
+ ```
118
+ 🔄 DB_REFRESH_ON_START=true - Deleting existing ChromaDB at ./data/chromadb_persist
119
+ ✅ Existing ChromaDB deleted successfully
120
+ 🆕 Creating new ChromaDB at ./data/chromadb_persist
121
+ ✅ Created ChromaDB with 150 document chunks
122
+ ```
123
+
124
+ ## Configuration Reference
125
+
126
+ ### Complete Environment Setup
127
+
128
+ ```bash
129
+ # Vector Store Configuration
130
+ VECTOR_STORE_PROVIDER=chromadb
131
+ DB_PATH=./data/chromadb
132
+ DB_COLLECTION_NAME=recipes
133
+ DB_PERSIST_DIRECTORY=./data/chromadb_persist
134
+
135
+ # Refresh Control
136
+ DB_REFRESH_ON_START=false # Set to true only when needed
137
+
138
+ # Embedding Configuration
139
+ EMBEDDING_PROVIDER=huggingface
140
+ HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
141
+ ```
142
+
143
+ ### Database Configuration Object
144
+
145
+ ```python
146
+ from config.database import DatabaseSettings
147
+
148
+ db_settings = DatabaseSettings()
149
+ config = db_settings.get_vector_store_config()
150
+
151
+ # Access refresh setting
152
+ refresh_enabled = config['refresh_on_start'] # boolean
153
+ ```
154
+
155
+ ## Troubleshooting
156
+
157
+ ### Common Issues
158
+
159
+ **Refresh not working:**
160
+ - Check `.env` file has `DB_REFRESH_ON_START=true`
161
+ - Verify environment is loaded correctly
162
+ - Check file permissions on persist directory
163
+
164
+ **Application won't start after refresh:**
165
+ - Check recipe files exist in `./data/recipes/`
166
+ - Verify embedding provider credentials
167
+ - Review application logs for specific errors
168
+
169
+ **Partial refresh/corruption:**
170
+ - Delete persist directory manually
171
+ - Set refresh=true and restart
172
+ - Check disk space availability
173
+
174
+ ### Emergency Recovery
175
+
176
+ If refresh fails or corrupts data:
177
+
178
+ ```bash
179
+ # Manual cleanup
180
+ rm -rf ./data/chromadb_persist
181
+
182
+ # Reset configuration
183
+ DB_REFRESH_ON_START=true  # temporary — set back to false once the rebuild completes
184
+
185
+ # Restart application
186
+ uvicorn app:app --reload
187
+ ```
188
+
189
+ ## Testing
190
+
191
+ Test the refresh functionality:
192
+
193
+ ```bash
194
+ # Run refresh tests
195
+ python3 test_refresh.py
196
+
197
+ # Demo the feature
198
+ python3 demo_refresh.py
199
+ ```
200
+
201
+ ## Implementation Details
202
+
203
+ ### Files Modified
204
+
205
+ 1. **`config/database.py`**
206
+ - Added `DB_REFRESH_ON_START` environment variable
207
+ - Updated `get_vector_store_config()` method
208
+
209
+ 2. **`services/vector_store.py`**
210
+ - Added `shutil` import for directory deletion
211
+ - Implemented refresh logic in `_get_or_create_vector_store()`
212
+ - Added comprehensive logging
213
+
214
+ 3. **Environment Files**
215
+ - Updated `.env` and `.env.example` with new variable
216
+ - Added documentation comments
217
+
218
+ ### Code Changes
219
+
220
+ ```python
221
+ # In vector_store.py
222
+ if refresh_on_start and persist_dir.exists():
223
+ logger.info(f"🔄 DB_REFRESH_ON_START=true - Deleting existing ChromaDB at {persist_dir}")
224
+ shutil.rmtree(persist_dir)
225
+ logger.info(f"✅ Existing ChromaDB deleted successfully")
226
+ ```
227
+
228
+ This feature provides a simple but powerful way to manage vector database content lifecycle while maintaining data integrity and providing clear user control.
backend/docs/embedding-compatibility-guide.md ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Embedding Compatibility Guide
2
+
3
+ ## 🔍 Understanding Embedding Dimensions
4
+
5
+ When working with vector databases and embeddings, **dimension compatibility** is crucial for successful similarity searches. This guide helps you understand and troubleshoot embedding dimension issues.
6
+
7
+ ## 📊 Common Embedding Models & Their Dimensions
8
+
9
+ | Provider | Model | Dimensions | Use Case |
10
+ |----------|-------|------------|----------|
11
+ | **HuggingFace** | `sentence-transformers/all-MiniLM-L6-v2` | **384** | Fast, lightweight, good for most tasks |
12
+ | **HuggingFace** | `sentence-transformers/all-mpnet-base-v2` | **768** | Higher quality, larger model |
13
+ | **Ollama** | `nomic-embed-text:v1.5` | **768** | Local inference, privacy-focused |
14
+ | **Ollama** | `mxbai-embed-large` | **1024** | High-quality local embeddings |
15
+ | **OpenAI** | `text-embedding-3-small` | **1536** | Commercial API, good performance |
16
+ | **OpenAI** | `text-embedding-3-large` | **3072** | Highest quality, expensive |
17
+ | **Google** | `models/embedding-001` | **768** | Google AI integration |
18
+
19
+ ## ⚠️ Common Error: Dimension Mismatch
20
+
21
+ ### Symptoms
22
+ ```
23
+ WARNING - [custom_mongo_vector.py:103] - ⚠️ Error processing document: shapes (768,) and (384,) not aligned
24
+ ```
25
+
26
+ ### Root Cause
27
+ Your **query embeddings** and **stored embeddings** have different dimensions:
28
+ - Query: Generated with Model A (e.g., 768 dimensions)
29
+ - Stored: Created with Model B (e.g., 384 dimensions)
30
+
31
+ ### Why This Happens
32
+ 1. You changed embedding models after creating your database
33
+ 2. Your database was created with a different embedding provider
34
+ 3. Environment configuration doesn't match the original setup
35
+
36
+ ## 🔧 Solution Strategies
37
+
38
+ ### Strategy 1: Match Your Current Database (Recommended)
39
+
40
+ **Step 1: Identify stored embedding dimensions**
41
+ ```bash
42
+ # Check your MongoDB collection to see stored embedding dimensions
43
+ # Look at the 'ingredients_emb' field length
44
+ ```
45
+
46
+ **Step 2: Update .env to match**
47
+ ```bash
48
+ # If stored embeddings are 384-dimensional (common with all-MiniLM-L6-v2)
49
+ EMBEDDING_PROVIDER=huggingface
50
+ HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
51
+
52
+ # If stored embeddings are 768-dimensional
53
+ EMBEDDING_PROVIDER=ollama
54
+ OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
55
+ ```
56
+
57
+ ### Strategy 2: Regenerate Database with New Model
58
+
59
+ **Step 1: Choose your preferred embedding model**
60
+ ```bash
61
+ # Example: Use Ollama for local inference
62
+ EMBEDDING_PROVIDER=ollama
63
+ OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
64
+ ```
65
+
66
+ **Step 2: Enable database refresh**
67
+ ```bash
68
+ DB_REFRESH_ON_START=true
69
+ ```
70
+
71
+ **Step 3: Restart application**
72
+ ```bash
73
+ uvicorn app:app --reload
74
+ ```
75
+
76
+ **Step 4: Disable refresh (Important!)**
77
+ ```bash
78
+ DB_REFRESH_ON_START=false
79
+ ```
80
+
81
+ ## 🔍 Debugging Embedding Issues
82
+
83
+ ### Check Current Configuration
84
+ ```bash
85
+ # View your current embedding setup
86
+ grep -E "EMBEDDING_PROVIDER|_EMBEDDING_MODEL" .env
87
+ ```
88
+
89
+ ### Monitor Embedding Dimensions
90
+ The custom MongoDB vector store now logs dimension information:
91
+ ```
92
+ 🔢 Query embedding dimensions: 768
93
+ ⚠️ Dimension mismatch: query=768D, stored=384D
94
+ 💡 Consider changing EMBEDDING_PROVIDER to match stored embeddings
95
+ ```
96
+
97
+ ### Verify Database Content
98
+ ```python
99
+ # Check stored embedding dimensions in MongoDB
100
+ collection.find_one({"ingredients_emb": {"$exists": True}})["ingredients_emb"]
101
+ # Count the array length to get dimensions
102
+ ```
103
+
104
+ ## 📋 Environment Configuration Examples
105
+
106
+ ### Example 1: HuggingFace (384D) - Most Common
107
+ ```bash
108
+ # .env configuration for 384-dimensional embeddings
109
+ EMBEDDING_PROVIDER=huggingface
110
+ HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
111
+ HUGGINGFACE_API_TOKEN=your_token_here
112
+ ```
113
+
114
+ ### Example 2: Ollama (768D) - Local Inference
115
+ ```bash
116
+ # .env configuration for 768-dimensional embeddings
117
+ EMBEDDING_PROVIDER=ollama
118
+ OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
119
+ OLLAMA_BASE_URL=http://localhost:11434
120
+ ```
121
+
122
+ ### Example 3: OpenAI (1536D) - Premium Quality
123
+ ```bash
124
+ # .env configuration for 1536-dimensional embeddings
125
+ EMBEDDING_PROVIDER=openai
126
+ OPENAI_EMBEDDING_MODEL=text-embedding-3-small
127
+ OPENAI_API_KEY=your_api_key_here
128
+ ```
129
+
130
+ ## 🚨 Common Pitfalls
131
+
132
+ ### 1. Mixed Providers
133
+ ❌ **Don't do this:**
134
+ ```bash
135
+ # Database created with HuggingFace
136
+ EMBEDDING_PROVIDER=huggingface # Original
137
+
138
+ # Later changed to Ollama without refreshing DB
139
+ EMBEDDING_PROVIDER=ollama # New - causes dimension mismatch!
140
+ ```
141
+
142
+ ### 2. Forgetting to Disable Refresh
143
+ ✅ **Always remember:**
144
+ ```bash
145
+ # After refreshing database, always disable refresh
146
+ DB_REFRESH_ON_START=false # SET THIS BACK TO FALSE!
147
+ ```
148
+
149
+ ### 3. Model Name Typos
150
+ ❌ **Watch out for:**
151
+ ```bash
152
+ # Typo in model name will cause failures
153
+ OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5 ✅
154
+ OLLAMA_EMBEDDING_MODEL=nomic-embed-text ❌ (missing version)
155
+ ```
156
+
157
+ ## 📊 Performance Comparison
158
+
159
+ | Model | Speed | Quality | Dimensions | Local/API | Cost |
160
+ |-------|-------|---------|------------|-----------|------|
161
+ | `all-MiniLM-L6-v2` | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | 384 | Both | Free |
162
+ | `nomic-embed-text:v1.5` | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | 768 | Local | Free |
163
+ | `text-embedding-3-small` | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | 1536 | API | $$$ |
164
+
165
+ ## 🔧 Troubleshooting Steps
166
+
167
+ ### Step 1: Check Current Setup
168
+ ```bash
169
+ # 1. Check your environment configuration
170
+ cat .env | grep EMBEDDING
171
+
172
+ # 2. Check vector store provider
173
+ cat .env | grep VECTOR_STORE_PROVIDER
174
+ ```
175
+
176
+ ### Step 2: Test Embedding Generation
177
+ ```python
178
+ # Test script to check embedding dimensions
179
+ from services.vector_store import vector_store_service
180
+
181
+ # Generate a test embedding
182
+ test_embedding = vector_store_service.embeddings.embed_query("test")
183
+ print(f"Current embedding dimensions: {len(test_embedding)}")
184
+ ```
185
+
186
+ ### Step 3: Check Database Content
187
+ For MongoDB users:
188
+ ```javascript
189
+ // MongoDB shell command to check stored embedding dimensions
190
+ db.your_collection.findOne({"ingredients_emb": {"$exists": true}})
191
+ ```
192
+
193
+ ### Step 4: Apply Fix
194
+ Choose one of the strategies above based on your needs.
195
+
196
+ ## 📝 Best Practices
197
+
198
+ ### 1. Document Your Embedding Model
199
+ Keep a record of which embedding model you used:
200
+ ```bash
201
+ # Add comments to your .env file
202
+ # Database created on 2025-08-27 with all-MiniLM-L6-v2 (384D)
203
+ EMBEDDING_PROVIDER=huggingface
204
+ HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
205
+ ```
206
+
207
+ ### 2. Version Control Your Configuration
208
+ ```bash
209
+ # Commit your .env changes with descriptive messages
210
+ git add .env
211
+ git commit -m "Update embedding model to match database (384D)"
212
+ ```
213
+
214
+ ### 3. Test After Changes
215
+ ```bash
216
+ # After changing embedding configuration, test a query
217
+ curl -X POST "http://localhost:8080/chat" \
218
+ -H "Content-Type: application/json" \
219
+ -d '{"message": "test query"}'
220
+ ```
221
+
222
+ ## 🆘 Quick Reference
223
+
224
+ ### Error Pattern Recognition
225
+ ```
226
+ shapes (768,) and (384,) not aligned → Query=768D, Stored=384D
227
+ shapes (384,) and (768,) not aligned → Query=384D, Stored=768D
228
+ shapes (1536,) and (384,) not aligned → Query=1536D, Stored=384D
229
+ ```
230
+
231
+ ### Quick Fixes
232
+ | Stored Dimensions | Set EMBEDDING_PROVIDER to |
233
+ |-------------------|-------------------------|
234
+ | 384 | `huggingface` with `all-MiniLM-L6-v2` |
235
+ | 768 | `ollama` with `nomic-embed-text:v1.5` |
236
+ | 1536 | `openai` with `text-embedding-3-small` |
237
+
238
+ ---
239
+
240
+ ## 📞 Need Help?
241
+
242
+ If you're still experiencing issues:
243
+
244
+ 1. Check the application logs for detailed error messages
245
+ 2. Verify your embedding model is properly installed/accessible
246
+ 3. Ensure your database connection is working
247
+ 4. Consider regenerating your vector database if switching models permanently
248
+
249
+ Remember: **Consistency is key** - your query embeddings and stored embeddings must use the same model and dimensions!
backend/docs/embedding-troubleshooting.md ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚨 Embedding Troubleshooting Quick Start
2
+
3
+ ## Common Error Messages & Instant Fixes
4
+
5
+ ### ⚠️ "shapes (768,) and (384,) not aligned"
6
+
7
+ **What it means:** Your query embeddings (768D) don't match stored embeddings (384D)
8
+
9
+ **Instant fix:**
10
+ ```bash
11
+ # Open .env file and change:
12
+ EMBEDDING_PROVIDER=huggingface
13
+ HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
14
+
15
+ # Restart your application
16
+ ```
17
+
18
+ ### ⚠️ "shapes (384,) and (768,) not aligned"
19
+
20
+ **What it means:** Your query embeddings (384D) don't match stored embeddings (768D)
21
+
22
+ **Instant fix:**
23
+ ```bash
24
+ # Open .env file and change:
25
+ EMBEDDING_PROVIDER=ollama
26
+ OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
27
+
28
+ # Make sure Ollama is running: ollama serve
29
+ # Pull the model: ollama pull nomic-embed-text:v1.5
30
+ # Restart your application
31
+ ```
32
+
33
+ ### ⚠️ "shapes (1536,) and (384,) not aligned"
34
+
35
+ **What it means:** Your query embeddings (1536D) don't match stored embeddings (384D)
36
+
37
+ **Instant fix:**
38
+ ```bash
39
+ # Open .env file and change:
40
+ EMBEDDING_PROVIDER=huggingface
41
+ HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
42
+
43
+ # Restart your application
44
+ ```
45
+
46
+ ## 🔧 5-Minute Fix Guide
47
+
48
+ ### Step 1: Identify Your Error (30 seconds)
49
+ Look at your error message and find the dimension numbers:
50
+ - `shapes (X,) and (Y,)` → X = query dimensions, Y = stored dimensions
51
+
52
+ ### Step 2: Choose Matching Model (1 minute)
53
+ | Stored Dimensions (Y) | Set in .env |
54
+ |---------------------|-------------|
55
+ | 384 | `EMBEDDING_PROVIDER=huggingface`<br/>`HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2` |
56
+ | 768 | `EMBEDDING_PROVIDER=ollama`<br/>`OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5` |
57
+ | 1024 | `EMBEDDING_PROVIDER=ollama`<br/>`OLLAMA_EMBEDDING_MODEL=mxbai-embed-large` |
58
+ | 1536 | `EMBEDDING_PROVIDER=openai`<br/>`OPENAI_EMBEDDING_MODEL=text-embedding-3-small` |
59
+
60
+ ### Step 3: Update Configuration (2 minutes)
61
+ ```bash
62
+ # Edit your .env file
63
+ nano .env # or use your preferred editor
64
+
65
+ # Find the EMBEDDING_PROVIDER lines and update them
66
+ # Save the file
67
+ ```
68
+
69
+ ### Step 4: Restart Application (1 minute)
70
+ ```bash
71
+ # Kill current process (Ctrl+C)
72
+ # Restart
73
+ uvicorn app:app --reload
74
+ ```
75
+
76
+ ### Step 5: Test (30 seconds)
77
+ ```bash
78
+ # Test with a simple query
79
+ curl -X POST "http://localhost:8080/chat" \
80
+ -H "Content-Type: application/json" \
81
+ -d '{"message": "chicken recipe"}'
82
+ ```
83
+
84
+ ## 🔍 Alternative: Start Fresh
85
+
86
+ If you prefer to use a different embedding model permanently:
87
+
88
+ ### Option A: Regenerate Database (5 minutes)
89
+ ```bash
90
+ # 1. Choose your preferred model in .env
91
+ EMBEDDING_PROVIDER=ollama
92
+ OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
93
+
94
+ # 2. Enable database refresh
95
+ DB_REFRESH_ON_START=true
96
+
97
+ # 3. Restart application (this will rebuild everything)
98
+ uvicorn app:app --reload
99
+
100
+ # 4. IMPORTANT: Disable refresh after startup
101
+ DB_REFRESH_ON_START=false
102
+ ```
103
+
104
+ ### Option B: Switch Vector Store (2 minutes)
105
+ ```bash
106
+ # Switch to ChromaDB (will create fresh database)
107
+ VECTOR_STORE_PROVIDER=chromadb
108
+
109
+ # Restart application
110
+ uvicorn app:app --reload
111
+ ```
112
+
113
+ ## ⚡ Prevention Tips
114
+
115
+ ### Document Your Choice
116
+ Add a comment to your .env file:
117
+ ```bash
118
+ # Created 2025-08-27 with all-MiniLM-L6-v2 (384 dimensions)
119
+ EMBEDDING_PROVIDER=huggingface
120
+ HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
121
+ ```
122
+
123
+ ### Consistent Development
124
+ If working in a team, ensure everyone uses the same configuration:
125
+ ```bash
126
+ # Share this in your team chat:
127
+ # "Use EMBEDDING_PROVIDER=huggingface with all-MiniLM-L6-v2"
128
+ ```
129
+
130
+ ---
131
+
132
+ **Still stuck?** Check the full [Embedding Compatibility Guide](./embedding-compatibility-guide.md) for detailed explanations.
backend/docs/logging_guide.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Logging Example and Configuration
2
+
3
+ ## Logging Features Implemented
4
+
5
+ ### 1. Centralized Logging Configuration
6
+ - **File**: `config/logging_config.py`
7
+ - **Features**:
8
+ - Rotating file logs (10MB max, 5 backups)
9
+ - Console and file output
10
+ - Structured format with timestamps, levels, and source location
11
+ - Environment-based configuration
12
+
13
+ ### 2. Service-Level Logging
14
+ - **Vector Store Service**: Logs initialization, document loading, provider setup
15
+ - **LLM Service**: Logs model setup, question processing, memory operations
16
+ - **API Endpoints**: Logs requests, responses, and errors
17
+
18
+ ### 3. Log Levels Used
19
+ - **INFO**: Normal operations, successful completions
20
+ - **DEBUG**: Detailed operation steps, memory operations
21
+ - **WARNING**: Non-critical issues, fallbacks
22
+ - **ERROR**: Failures, exceptions (with stack traces)
23
+
24
+ ### 4. Emoji-Coded Log Messages
25
+ - 🚀 Startup/Initialization
26
+ - ✅ Success operations
27
+ - ❌ Error conditions
28
+ - ⚠️ Warning conditions
29
+ - 🔧 Configuration/Setup
30
+ - 💬 Chat operations
31
+ - 🧠 Memory operations
32
+ - 📊 Database operations
33
+ - 🔍 Search/Retrieval
34
+ - 💾 Storage operations
35
+
36
+ ## Usage Examples
37
+
38
+ ```python
39
+ from config.logging_config import get_logger
40
+
41
+ # Get a logger for your module
42
+ logger = get_logger("my_module")
43
+
44
+ # Log at different levels
45
+ logger.info("✅ Operation completed successfully")
46
+ logger.warning("⚠️ Using fallback configuration")
47
+ logger.error("❌ Operation failed", exc_info=True) # Includes stack trace
48
+ ```
49
+
50
+ ## Log File Location
51
+ - **Path**: `./logs/recipe_bot.log`
52
+ - **Rotation**: Automatic when file exceeds 10MB
53
+ - **Backups**: Keeps 5 backup files
54
+
55
+ ## Console Output
56
+ All logs are also displayed in the console with colored formatting for easy debugging during development.
backend/docs/model-configuration-guide.md ADDED
@@ -0,0 +1,542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Configuration Guide
2
+
3
+ This guide focuses on the technical configuration, settings management, parameter handling, and troubleshooting for LLM providers in the Recipe Chatbot project.
4
+
5
+ > 📚 **Looking for model recommendations?** See [Model Selection Guide](./model-selection-guide.md) for detailed model comparisons and use case recommendations.
6
+
7
+ ## 🔧 Configuration System Overview
8
+
9
+ ### Settings Architecture
10
+ The project uses a centralized configuration system in `config/settings.py` with environment variable overrides:
11
+
12
+ ```python
13
+ # Configuration loading flow
14
+ Environment Variables (.env) → settings.py → LLM Service → Provider APIs
15
+ ```
16
+
17
+ ### Temperature Management
18
+ Each provider has different temperature constraints that are automatically handled:
19
+
20
+ | Provider | Range | Auto-Handling | Special Cases |
21
+ |----------|-------|---------------|---------------|
22
+ | **OpenAI** | 0.0 - 2.0 | ✅ GPT-5-nano → 1.0 | Nano models fixed |
23
+ | **Google** | 0.0 - 1.0 | ✅ Clamp to range | Strict validation |
24
+ | **Ollama** | 0.0 - 2.0 | ⚠️ Model dependent | Local processing |
25
+ | **HuggingFace** | Fixed ~0.7 | ❌ API ignores setting | Read-only |
26
+
27
+ ## 🛠️ Provider Configuration Details
28
+
29
+ ### OpenAI Configuration
30
+
31
+ #### Environment Variables
32
+ ```bash
33
+ # Core settings
34
+ OPENAI_API_KEY=sk-proj-xxxxx
35
+ OPENAI_MODEL=gpt-4o-mini
36
+ OPENAI_TEMPERATURE=0.7
37
+ OPENAI_MAX_TOKENS=1000
38
+
39
+ # Advanced parameters (optional)
40
+ OPENAI_TOP_P=1.0
41
+ OPENAI_FREQUENCY_PENALTY=0.0
42
+ OPENAI_PRESENCE_PENALTY=0.0
43
+ ```
44
+
45
+ #### Automatic Temperature Override
46
+ ```python
47
+ # Implemented in services/llm_service.py
48
+ if "gpt-5-nano" in model_name.lower():
49
+ temperature = 1.0 # Only supported value
50
+ logger.info(f"Auto-adjusting temperature to 1.0 for {model_name}")
51
+ ```
52
+
53
+ #### Parameter Validation
54
+ - **Temperature**: `0.0 - 2.0` (except nano models: fixed `1.0`)
55
+ - **Max Tokens**: `1 - 4096` (model-dependent)
56
+ - **Top P**: `0.0 - 1.0`
57
+
58
+ ### Google (Gemini) Configuration
59
+
60
+ #### Environment Variables
61
+ ```bash
62
+ # Core settings
63
+ GOOGLE_API_KEY=AIzaSyxxxxx
64
+ GOOGLE_MODEL=gemini-2.5-flash
65
+ GOOGLE_TEMPERATURE=0.7
66
+ GOOGLE_MAX_TOKENS=1000
67
+
68
+ # Advanced parameters (optional)
69
+ GOOGLE_TOP_P=0.95
70
+ GOOGLE_TOP_K=40
71
+ ```
72
+
73
+ #### Temperature Clamping
74
+ ```python
75
+ # Auto-clamping to Google's range
76
+ google_temp = max(0.0, min(1.0, configured_temperature))
77
+ if google_temp != configured_temperature:
78
+ logger.info(f"Clamping temperature from {configured_temperature} to {google_temp}")
79
+ ```
80
+
81
+ #### Parameter Constraints
82
+ - **Temperature**: `0.0 - 1.0` (strictly enforced)
83
+ - **Max Tokens**: `1 - 8192`
84
+ - **Top K**: `1 - 40`
85
+
86
+ ### Ollama Configuration
87
+
88
+ #### Environment Variables
89
+ ```bash
90
+ # Core settings
91
+ OLLAMA_BASE_URL=http://localhost:11434
92
+ OLLAMA_MODEL=llama3.1:8b
93
+ OLLAMA_TEMPERATURE=0.7
94
+ OLLAMA_MAX_TOKENS=1000
95
+
96
+ # Connection settings
97
+ OLLAMA_TIMEOUT=30
98
+ OLLAMA_KEEP_ALIVE=5m
99
+ ```
100
+
101
+ #### Service Management
102
+ ```bash
103
+ # Start Ollama service
104
+ ollama serve &
105
+
106
+ # Verify service status
107
+ curl http://localhost:11434/api/version
108
+
109
+ # Model management
110
+ ollama pull llama3.1:8b
111
+ ollama list
112
+ ollama rm unused_model
113
+ ```
114
+
115
+ #### Parameter Flexibility
116
+ - **Temperature**: `0.0 - 2.0` (widest range)
117
+ - **Context Length**: Model-dependent (2K - 128K)
118
+ - **Custom Parameters**: Model-specific options available
119
+
120
+ ### HuggingFace Configuration
121
+
122
+ #### Environment Variables
123
+ ```bash
124
+ # Core settings
125
+ HUGGINGFACE_API_KEY=hf_xxxxx
126
+ HUGGINGFACE_MODEL=microsoft/DialoGPT-medium
127
+ HUGGINGFACE_TEMPERATURE=0.7 # Often ignored
128
+ HUGGINGFACE_MAX_TOKENS=500
129
+
130
+ # API settings
131
+ HUGGINGFACE_WAIT_FOR_MODEL=true
132
+ HUGGINGFACE_USE_CACHE=true
133
+ ```
134
+
135
+ #### API Limitations
136
+ ```python
137
+ # Note: Temperature is often ignored by Inference API
138
+ logger.warning(f"HuggingFace model {model_name} may ignore temperature setting")
139
+ return 0.7 # API typically uses this default
140
+ ```
141
+
142
+ ## ⚙️ Advanced Configuration
143
+
144
+ ### Dynamic Provider Switching
145
+ ```python
146
+ # config/settings.py implementation
147
+ def get_llm_config():
148
+ provider = os.getenv("LLM_PROVIDER", "openai").lower()
149
+ fallback = os.getenv("LLM_FALLBACK_PROVIDER", "google").lower()
150
+
151
+ return {
152
+ "provider": provider,
153
+ "fallback_provider": fallback,
154
+ **get_provider_config(provider)
155
+ }
156
+
157
+ def get_provider_config(provider):
158
+ """Get provider-specific configuration."""
159
+ configs = {
160
+ "openai": {
161
+ "api_key": os.getenv("OPENAI_API_KEY"),
162
+ "model": os.getenv("OPENAI_MODEL", "gpt-4o-mini"),
163
+ "temperature": float(os.getenv("OPENAI_TEMPERATURE", "0.7")),
164
+ "max_tokens": int(os.getenv("OPENAI_MAX_TOKENS", "1000")),
165
+ },
166
+ "google": {
167
+ "api_key": os.getenv("GOOGLE_API_KEY"),
168
+ "model": os.getenv("GOOGLE_MODEL", "gemini-2.5-flash"),
169
+ "temperature": float(os.getenv("GOOGLE_TEMPERATURE", "0.7")),
170
+ "max_tokens": int(os.getenv("GOOGLE_MAX_TOKENS", "1000")),
171
+ },
172
+ # ... other providers
173
+ }
174
+ return configs.get(provider, {})
175
+ ```
176
+
177
+ ### Fallback Configuration
178
+ ```python
179
+ # Automatic fallback on provider failure
180
+ def get_llm_response(message):
181
+ try:
182
+ return primary_provider.chat_completion(message)
183
+ except Exception as e:
184
+ logger.warning(f"Primary provider failed: {e}")
185
+ return fallback_provider.chat_completion(message)
186
+ ```
187
+
188
+ ### Environment-Specific Configs
189
+
190
+ #### Development (.env.development)
191
+ ```bash
192
+ # Fast, free/cheap for testing
193
+ LLM_PROVIDER=google
194
+ GOOGLE_MODEL=gemini-2.5-flash
195
+ GOOGLE_TEMPERATURE=0.8 # More creative for testing
196
+ LLM_FALLBACK_PROVIDER=ollama
197
+ ```
198
+
199
+ #### Production (.env.production)
200
+ ```bash
201
+ # Reliable, consistent for production
202
+ LLM_PROVIDER=openai
203
+ OPENAI_MODEL=gpt-4o-mini
204
+ OPENAI_TEMPERATURE=0.7 # Consistent responses
205
+ LLM_FALLBACK_PROVIDER=google
206
+ ```
207
+
208
+ #### Local Development (.env.local)
209
+ ```bash
210
+ # Self-hosted for offline development
211
+ LLM_PROVIDER=ollama
212
+ OLLAMA_MODEL=llama3.1:8b
213
+ OLLAMA_TEMPERATURE=0.7
214
+ # No fallback - fully local
215
+ ```
216
+
217
+ ## 🚨 Configuration Troubleshooting
218
+
219
+ ### Issue: GPT-5-nano Temperature Error
220
+ **Error**: `Temperature must be 1.0 for gpt-5-nano`
221
+ **Status**: ✅ Auto-fixed in `services/llm_service.py`
222
+ **Verification**:
223
+ ```bash
224
+ python -c "
225
+ import os
226
+ os.environ['OPENAI_MODEL'] = 'gpt-5-nano'
227
+ os.environ['OPENAI_TEMPERATURE'] = '0.5'
228
+ from services.llm_service import LLMService
229
+ LLMService() # Should log temperature override
230
+ "
231
+ ```
232
+
233
+ ### Issue: Google Temperature Out of Range
234
+ **Error**: `Temperature must be between 0.0 and 1.0`
235
+ **Solution**: Automatic clamping implemented
236
+ **Test**:
237
+ ```bash
238
+ python -c "
239
+ import os
240
+ os.environ['LLM_PROVIDER'] = 'google'
241
+ os.environ['GOOGLE_TEMPERATURE'] = '1.5'
242
+ from services.llm_service import LLMService
243
+ LLMService() # Should clamp to 1.0
244
+ "
245
+ ```
246
+
247
+ ### Issue: Ollama Connection Failed
248
+ **Error**: `ConnectionError: Could not connect to Ollama`
249
+ **Diagnosis**:
250
+ ```bash
251
+ # Check if Ollama is running
252
+ curl -f http://localhost:11434/api/version || echo "Ollama not running"
253
+
254
+ # Check if model exists
255
+ ollama list | grep "llama3.1:8b" || echo "Model not found"
256
+
257
+ # Check system resources
258
+ free -h # RAM usage
259
+ df -h # Disk space
260
+ ```
261
+
262
+ **Fix**:
263
+ ```bash
264
+ # Start Ollama service
265
+ ollama serve &
266
+
267
+ # Pull required model
268
+ ollama pull llama3.1:8b
269
+
270
+ # Test connection
271
+ curl -d '{"model":"llama3.1:8b","prompt":"test","stream":false}' \
272
+ http://localhost:11434/api/generate
273
+ ```
274
+
275
+ ### Issue: HuggingFace Temperature Ignored
218
+ **Symptom**: Temperature settings have no visible effect on responses
277
+ **Explanation**: This is expected behavior — the HuggingFace Inference API typically ignores the temperature parameter
278
+ **Workaround**: Use different models or providers for temperature control
279
+
280
+ ### Issue: Missing API Keys
281
+ **Error**: `AuthenticationError: Invalid API key`
282
+ **Diagnosis**:
283
+ ```bash
284
+ # Check environment variables
285
+ echo "OpenAI: ${OPENAI_API_KEY:0:10}..."
286
+ echo "Google: ${GOOGLE_API_KEY:0:10}..."
287
+ echo "HuggingFace: ${HUGGINGFACE_API_KEY:0:10}..."
288
+
289
+ # Test API key validity
290
+ curl -H "Authorization: Bearer $OPENAI_API_KEY" \
291
+ https://api.openai.com/v1/models | jq '.data[0].id' || echo "Invalid OpenAI key"
292
+ ```
293
+
294
+ ## 🔍 Configuration Validation
295
+
296
+ ### Automated Configuration Check
297
+ ```bash
298
+ # Run comprehensive configuration validation
299
+ python -c "
300
+ from config.settings import get_llm_config
301
+ from services.llm_service import LLMService
302
+ import json
303
+
304
+ print('🔧 Configuration Validation')
305
+ print('=' * 40)
306
+
307
+ # Load configuration
308
+ try:
309
+ config = get_llm_config()
310
+ print('✅ Configuration loaded successfully')
311
+ print(f'Provider: {config.get(\"provider\")}')
312
+ print(f'Model: {config.get(\"model\")}')
313
+ print(f'Temperature: {config.get(\"temperature\")}')
314
+ except Exception as e:
315
+ print(f'❌ Configuration error: {e}')
316
+ exit(1)
317
+
318
+ # Test service initialization
319
+ try:
320
+ service = LLMService()
321
+ print('✅ LLM Service initialized')
322
+ except Exception as e:
323
+ print(f'❌ Service initialization failed: {e}')
324
+ exit(1)
325
+
326
+ # Test simple completion
327
+ try:
328
+ response = service.simple_chat_completion('Test message')
329
+ print('✅ Chat completion successful')
330
+ print(f'Response length: {len(response)} characters')
331
+ except Exception as e:
332
+ print(f'❌ Chat completion failed: {e}')
333
+ exit(1)
334
+
335
+ print('🎉 All configuration checks passed!')
336
+ "
337
+ ```
338
+
339
+ ### Provider-Specific Health Checks
340
+ ```bash
341
+ # OpenAI health check
342
+ curl -H "Authorization: Bearer $OPENAI_API_KEY" \
343
+ https://api.openai.com/v1/models | jq '.data | length'
344
+
345
+ # Google health check
346
+ curl "https://generativelanguage.googleapis.com/v1beta/models?key=$GOOGLE_API_KEY" | jq '.models | length'
347
+
348
+ # Ollama health check
349
+ curl http://localhost:11434/api/tags | jq '.models | length'
350
+
351
+ # HuggingFace health check
352
+ curl -H "Authorization: Bearer $HUGGINGFACE_API_KEY" \
353
+ https://huggingface.co/api/whoami | jq '.name'
354
+ ```
355
+
356
+ ### Configuration Diff Tool
357
+ ```bash
358
+ # Compare current config with defaults
359
+ python -c "
360
+ import os
361
+ from config.settings import get_llm_config
362
+
363
+ defaults = {
364
+ 'openai': {'temperature': 0.7, 'max_tokens': 1000},
365
+ 'google': {'temperature': 0.7, 'max_tokens': 1000},
366
+ 'ollama': {'temperature': 0.7, 'max_tokens': 1000},
367
+ }
368
+
369
+ current = get_llm_config()
370
+ provider = current.get('provider')
371
+ default = defaults.get(provider, {})
372
+
373
+ print(f'Configuration for {provider}:')
374
+ for key, default_val in default.items():
375
+ current_val = current.get(key)
376
+ status = '✅' if current_val == default_val else '⚠️'
377
+ print(f'{status} {key}: {current_val} (default: {default_val})')
378
+ "
379
+ ```
380
+
381
+ ## 📋 Configuration Templates
382
+
383
+ ### Minimal Setup (Single Provider)
384
+ ```bash
385
+ # .env.minimal
386
+ LLM_PROVIDER=google
387
+ GOOGLE_API_KEY=your_api_key
388
+ GOOGLE_MODEL=gemini-2.5-flash
389
+ ```
390
+
391
+ ### Robust Setup (Primary + Fallback)
392
+ ```bash
393
+ # .env.robust
394
+ LLM_PROVIDER=openai
395
+ OPENAI_API_KEY=your_primary_key
396
+ OPENAI_MODEL=gpt-4o-mini
397
+ LLM_FALLBACK_PROVIDER=google
398
+ GOOGLE_API_KEY=your_fallback_key
399
+ GOOGLE_MODEL=gemini-2.5-flash
400
+ ```
401
+
402
+ ### Local-First Setup
403
+ ```bash
404
+ # .env.local-first
405
+ LLM_PROVIDER=ollama
406
+ OLLAMA_MODEL=llama3.1:8b
407
+ LLM_FALLBACK_PROVIDER=google
408
+ GOOGLE_API_KEY=your_cloud_backup_key
409
+ ```
410
+
411
+ ### Budget-Conscious Setup
412
+ ```bash
413
+ # .env.budget
414
+ LLM_PROVIDER=openai
415
+ OPENAI_MODEL=gpt-5-nano
416
+ OPENAI_TEMPERATURE=1.0 # Fixed for nano
417
+ OPENAI_MAX_TOKENS=500 # Reduce costs
418
+ ```
419
+
420
+ ## 🔐 Security Best Practices
421
+
422
+ ### API Key Management
423
+ ```bash
424
+ # Use environment variables
425
+ export OPENAI_API_KEY="sk-..."
426
+
427
+ # Never commit keys to git
428
+ echo "*.env*" >> .gitignore
429
+ echo ".env" >> .gitignore
430
+
431
+ # Use different keys for different environments
432
+ cp .env.example .env.development
433
+ cp .env.example .env.production
434
+ ```
435
+
436
+ ### Rate Limiting Configuration
437
+ ```python
438
+ # Add to config/settings.py
439
+ RATE_LIMITS = {
440
+ "openai": {"rpm": 500, "tpm": 40000},
441
+ "google": {"rpm": 60, "tpm": 32000},
442
+ "ollama": {"rpm": None, "tpm": None}, # Local = unlimited
443
+ }
444
+ ```
445
+
446
+ ### Error Handling Strategy
447
+ ```python
448
+ # Graceful degradation configuration
449
+ FALLBACK_CHAIN = [
450
+ "primary_provider",
451
+ "fallback_provider",
452
+ "local_provider",
453
+ "cached_response"
454
+ ]
455
+ ```
456
+
457
+ ## 🧪 Testing Configuration Changes
458
+
459
+ ### Unit Tests for Configuration
460
+ ```bash
461
+ # Test temperature overrides
462
+ python -m pytest tests/test_llm_temperature.py -v
463
+
464
+ # Test provider fallbacks
465
+ python -m pytest tests/test_llm_fallback.py -v
466
+
467
+ # Test API key validation
468
+ python -m pytest tests/test_api_keys.py -v
469
+ ```
470
+
471
+ ### Integration Tests
472
+ ```bash
473
+ # Test each provider individually
474
+ python -c "
475
+ import os
476
+ providers = ['openai', 'google', 'ollama']
477
+
478
+ for provider in providers:
479
+ os.environ['LLM_PROVIDER'] = provider
480
+ try:
481
+ from services.llm_service import LLMService
482
+ service = LLMService()
483
+ response = service.simple_chat_completion('Test')
484
+ print(f'✅ {provider}: {len(response)} chars')
485
+ except Exception as e:
486
+ print(f'❌ {provider}: {e}')
487
+ "
488
+ ```
489
+
490
+ ### Performance Benchmarks
491
+ ```bash
492
+ # Measure response times
493
+ python -c "
494
+ import time
495
+ from services.llm_service import LLMService
496
+
497
+ service = LLMService()
498
+ start = time.time()
499
+ response = service.simple_chat_completion('Quick recipe suggestion')
500
+ elapsed = time.time() - start
501
+
502
+ print(f'Response time: {elapsed:.2f}s')
503
+ print(f'Response length: {len(response)} characters')
504
+ print(f'Words per second: {len(response.split()) / elapsed:.1f}')
505
+ "
506
+ ```
507
+
508
+ ## 🔄 Configuration Migration
509
+
510
+ ### Upgrading from Old Configuration
511
+ ```bash
512
+ # Migrate old environment variables
513
+ # Old format → New format
514
+ mv .env .env.backup
515
+
516
+ # Update variable names
517
+ sed 's/LLM_MODEL=/OPENAI_MODEL=/' .env.backup > .env
518
+ sed -i 's/LLM_TEMPERATURE=/OPENAI_TEMPERATURE=/' .env
519
+ sed -i 's/LLM_MAX_TOKENS=/OPENAI_MAX_TOKENS=/' .env
520
+
521
+ echo "LLM_PROVIDER=openai" >> .env
522
+ ```
523
+
524
+ ### Version Compatibility Check
525
+ ```python
526
+ # Check if configuration is compatible
527
+ def check_config_version():
528
+ required_vars = ["LLM_PROVIDER"]
529
+ legacy_vars = ["LLM_MODEL", "LLM_TEMPERATURE"]
530
+
531
+ has_new = all(os.getenv(var) for var in required_vars)
532
+ has_legacy = any(os.getenv(var) for var in legacy_vars)
533
+
534
+ if has_legacy and not has_new:
535
+ raise ValueError("Legacy configuration detected. Please migrate to new format.")
536
+
537
+ return has_new
538
+ ```
539
+
540
+ ---
541
+
542
+ 💡 **Next Steps**: After configuring your providers, see the [Model Selection Guide](./model-selection-guide.md) for choosing the best models for your use case.
backend/docs/model-selection-guide.md ADDED
@@ -0,0 +1,502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Selection Guide
2
+
3
+ ## 🎯 At-a-Glance Recommendations
4
+
5
+ | Priority | Best Choice | Provider | Monthly Cost* | Setup Time | Quality Score | Why Choose This |
6
+ |----------|-------------|----------|---------------|------------|---------------|-----------------|
7
+ | **Ease of Use** | Gemini 2.5 Flash | Google | Free - $2 | 2 min | 90% | Excellent free tier |
8
+ | **Best Value** | GPT-5-nano | OpenAI | $1.00 | 2 min | 88% | Modern GPT-5 at nano price |
9
+ | **Premium Quality** | Claude 3 Opus | Anthropic | $225 | 2 min | 95% | Highest reasoning quality |
10
+ | **Self-Hosted** | Llama 3.1:8b | Ollama | Free | 10 min | 82% | Perfect balance |
11
+ | **High-End Local** | DeepSeek-R1:7b | Ollama | Free | 15 min | 88% | Best reasoning model |
12
+ | **Budget Cloud** | Claude 3.5 Haiku | Anthropic | $4 | 2 min | 87% | Fast and affordable |
13
+ | **Alternative Local** | CodeQwen1.5:7b | Ollama | Free | 10 min | 85% | Excellent for structured data |
14
+
15
+ *Based on 30,000 queries/month
16
+
17
+ ---
18
+
19
+ ## 🏢 Cloud Models (Closed Source)
20
+
21
+ ### OpenAI Models
22
+
23
+ #### GPT-5 (Latest Flagship) ⭐ **NEW**
24
+ ```bash
25
+ OPENAI_MODEL=gpt-5
26
+ ```
27
+ - **Pricing**: $20/month (Plus plan) - Unlimited with guardrails
28
+ - **Capabilities**: Advanced reasoning, thinking, code execution
29
+ - **Best For**: Premium applications requiring cutting-edge AI
30
+ - **Recipe Quality**: Outstanding (96%) - Best culinary understanding
31
+ - **Context**: 196K tokens (reasoning mode)
32
+
33
+
34
+ #### GPT-5-nano (Ultra Budget) ⭐ **MISSED GEM**
35
+ ```bash
36
+ OPENAI_MODEL=gpt-5-nano
37
+ ```
38
+ - **Pricing**: $0.05/1M input, $0.40/1M output tokens
39
+ - **Monthly Cost**: ~$1.00 for 30K queries
40
+ - **Best For**: Budget-conscious deployments with modern capabilities
41
+ - **Recipe Quality**: Very Good (88%)
42
+ - **Speed**: Very Fast
43
+ - **Features**: GPT-5 architecture at nano pricing
44
+
45
+
46
+ #### GPT-4o-mini (Proven Budget Choice)
47
+ ```bash
48
+ OPENAI_MODEL=gpt-4o-mini
49
+ ```
50
+ - **Pricing**: $0.15/1M input, $0.60/1M output tokens
51
+ - **Monthly Cost**: ~$4 for 30K queries
52
+ - **Best For**: Cost-effective production deployments
53
+ - **Recipe Quality**: Very Good (86%)
54
+ - **Speed**: Very Fast
55
+
56
+
57
+ ### Google AI (Gemini) Models
58
+
59
+ #### Gemini 2.5 Flash ⭐ **RECOMMENDED**
60
+ ```bash
61
+ GOOGLE_MODEL=gemini-2.5-flash
62
+ ```
63
+ - **Pricing**: Free tier, then $0.30/1M input, $2.50/1M output
64
+ - **Monthly Cost**: Free - $2 for most usage patterns
65
+ - **Best For**: Development and cost-conscious production
66
+ - **Recipe Quality**: Excellent (90%)
67
+ - **Features**: Thinking budgets, 1M context window
68
+
69
+ #### Gemini 2.5 Pro (High-End)
70
+ ```bash
71
+ GOOGLE_MODEL=gemini-2.5-pro
72
+ ```
73
+ - **Pricing**: $1.25/1M input, $10/1M output (≤200K context)
74
+ - **Monthly Cost**: ~$25 for 30K queries
75
+ - **Best For**: Premium applications requiring best Google AI
76
+ - **Recipe Quality**: Excellent (92%)
77
+
78
+ #### Gemini 2.0 Flash-Lite (Ultra Budget)
79
+ ```bash
80
+ GOOGLE_MODEL=gemini-2.0-flash-lite
81
+ ```
82
+ - **Pricing**: $0.075/1M input, $0.30/1M output
83
+ - **Monthly Cost**: ~$0.90 for 30K queries
84
+ - **Best For**: High-volume, cost-sensitive applications
85
+ - **Recipe Quality**: Good (85%)
86
+
87
+
88
+ ## 🔓 Open Source Models (Self-Hosted)
89
+
90
+ ### Ollama Models (Latest Releases)
91
+
92
+ #### DeepSeek-R1:7b ⭐ **BREAKTHROUGH MODEL**
93
+ ```bash
94
+ OLLAMA_MODEL=deepseek-r1:7b
95
+ ```
96
+ - **Parameters**: 7B
97
+ - **Download**: ~4.7GB
98
+ - **RAM Required**: 8GB
99
+ - **Best For**: Advanced reasoning tasks, O1-level performance
100
+ - **Recipe Quality**: Outstanding (88%)
101
+ - **Special**: Chain-of-thought reasoning, approaching GPT-4 performance
102
+
103
+ #### Gemma 3:27b ⭐ **NEW FLAGSHIP**
104
+ ```bash
105
+ OLLAMA_MODEL=gemma3:27b
106
+ ```
107
+ - **Parameters**: 27B
108
+ - **Download**: ~17GB
109
+ - **RAM Required**: 32GB
110
+ - **Best For**: Highest quality open source experience
111
+ - **Recipe Quality**: Outstanding (89%)
112
+ - **Features**: Vision capabilities, state-of-the-art performance
113
+
114
+ #### Llama 3.1:8b (Proven Choice)
115
+ ```bash
116
+ OLLAMA_MODEL=llama3.1:8b
117
+ ```
118
+ - **Parameters**: 8B
119
+ - **Download**: ~4.7GB
120
+ - **RAM Required**: 8GB
121
+ - **Best For**: Balanced production deployment
122
+ - **Recipe Quality**: Very Good (82%)
123
+ - **Status**: Your current choice - excellent balance!
124
+
125
+ #### Qwen 3:8b ⭐ **NEW RELEASE**
126
+ ```bash
127
+ OLLAMA_MODEL=qwen3:8b
128
+ ```
129
+ - **Parameters**: 8B
130
+ - **Download**: ~4.4GB
131
+ - **RAM Required**: 8GB
132
+ - **Best For**: Multilingual support, latest technology
133
+ - **Recipe Quality**: Very Good (84%)
134
+ - **Features**: Tool use, thinking capabilities
135
+
136
+ #### Phi 4:14b ⭐ **MICROSOFT'S LATEST**
137
+ ```bash
138
+ OLLAMA_MODEL=phi4:14b
139
+ ```
140
+ - **Parameters**: 14B
141
+ - **Download**: ~9.1GB
142
+ - **RAM Required**: 16GB
143
+ - **Best For**: Reasoning and math tasks
144
+ - **Recipe Quality**: Very Good (85%)
145
+ - **Features**: State-of-the-art efficiency
146
+
147
+ #### Gemma 3:4b (Efficient Choice)
148
+ ```bash
149
+ OLLAMA_MODEL=gemma3:4b
150
+ ```
151
+ - **Parameters**: 4B
152
+ - **Download**: ~3.3GB
153
+ - **RAM Required**: 6GB
154
+ - **Best For**: Resource-constrained deployments
155
+ - **Recipe Quality**: Good (78%)
156
+ - **Features**: Excellent for size, runs on modest hardware
157
+
158
+ ### HuggingFace Models (Downloadable for Local Use)
159
+
160
+ #### CodeQwen1.5:7b ⭐ **ALIBABA'S CODE MODEL**
161
+ ```bash
162
+ OLLAMA_MODEL=codeqwen:7b
163
+ ```
164
+ - **Parameters**: 7B
165
+ - **Download**: ~4.2GB
166
+ - **RAM Required**: 8GB
167
+ - **Best For**: Recipe parsing, ingredient analysis, structured data
168
+ - **Recipe Quality**: Very Good (85%)
169
+ - **Features**: Excellent at understanding structured recipe formats
170
+
171
+ #### Mistral-Nemo:12b ⭐ **BALANCED CHOICE**
172
+ ```bash
173
+ OLLAMA_MODEL=mistral-nemo:12b
174
+ ```
175
+ - **Parameters**: 12B
176
+ - **Download**: ~7GB
177
+ - **RAM Required**: 12GB
178
+ - **Best For**: General conversation with good reasoning
179
+ - **Recipe Quality**: Very Good (84%)
180
+ - **Features**: Multilingual, efficient, well-balanced
181
+
182
+ #### Nous-Hermes2:10.7b ⭐ **FINE-TUNED EXCELLENCE**
183
+ ```bash
184
+ OLLAMA_MODEL=nous-hermes2:10.7b
185
+ ```
186
+ - **Parameters**: 10.7B
187
+ - **Download**: ~6.4GB
188
+ - **RAM Required**: 12GB
189
+ - **Best For**: Instruction following, detailed responses
190
+ - **Recipe Quality**: Very Good (83%)
191
+ - **Features**: Excellent instruction following, helpful responses
192
+
193
+ #### OpenHermes2.5-Mistral:7b ⭐ **COMMUNITY FAVORITE**
194
+ ```bash
195
+ OLLAMA_MODEL=openhermes2.5-mistral:7b
196
+ ```
197
+ - **Parameters**: 7B
198
+ - **Download**: ~4.1GB
199
+ - **RAM Required**: 8GB
200
+ - **Best For**: Creative recipe suggestions, conversational AI
201
+ - **Recipe Quality**: Good (81%)
202
+ - **Features**: Creative, conversational, reliable
203
+
204
+ #### Solar:10.7b ⭐ **UPSTAGE'S MODEL**
205
+ ```bash
206
+ OLLAMA_MODEL=solar:10.7b
207
+ ```
208
+ - **Parameters**: 10.7B
209
+ - **Download**: ~6.1GB
210
+ - **RAM Required**: 12GB
211
+ - **Best For**: Analytical tasks, recipe modifications
212
+ - **Recipe Quality**: Very Good (83%)
213
+ - **Features**: Strong analytical capabilities, detailed explanations
214
+
215
+
216
+ ### Anthropic Claude Models
217
+
218
+ #### Claude 3.5 Sonnet (Production Standard)
219
+ ```bash
220
+ ANTHROPIC_MODEL=claude-3-5-sonnet-20241022
221
+ ```
222
+ - **Pricing**: $3/1M input, $15/1M output tokens
223
+ - **Monthly Cost**: ~$45 for 30K queries
224
+ - **Best For**: Balanced performance and reasoning
225
+ - **Recipe Quality**: Outstanding (94%)
226
+ - **Features**: Advanced analysis, code understanding
227
+
228
+ #### Claude 3.5 Haiku (Speed Focused)
229
+ ```bash
230
+ ANTHROPIC_MODEL=claude-3-5-haiku-20241022
231
+ ```
232
+ - **Pricing**: $0.25/1M input, $1.25/1M output tokens
233
+ - **Monthly Cost**: ~$4 for 30K queries
234
+ - **Best For**: Fast, cost-effective responses
235
+ - **Recipe Quality**: Very Good (87%)
236
+ - **Features**: Lightning fast, good quality
237
+
238
+ #### Claude 3 Opus (Premium Reasoning)
239
+ ```bash
240
+ ANTHROPIC_MODEL=claude-3-opus-20240229
241
+ ```
242
+ - **Pricing**: $15/1M input, $75/1M output tokens
243
+ - **Monthly Cost**: ~$225 for 30K queries
244
+ - **Best For**: Complex reasoning, highest quality
245
+ - **Recipe Quality**: Outstanding (95%)
246
+ - **Features**: Top-tier reasoning, complex tasks
247
+
248
+ ---
249
+
250
+
251
+ ## 🎯 Scenario-Based Recommendations
252
+
253
+ ### 👨‍💻 **Development & Testing**
254
+ **Choice**: Gemini 2.5 Flash
255
+ ```bash
256
+ LLM_PROVIDER=google
257
+ GOOGLE_MODEL=gemini-2.5-flash
258
+ ```
259
+ - Free tier covers most development
260
+ - Excellent quality for testing
261
+ - Easy setup and integration
262
+
263
+ ### 🚀 **Small to Medium Production**
264
+ **Choice**: Gemini 2.5 Flash or GPT-4o-mini
265
+ ```bash
266
+ # Cost-focused
267
+ LLM_PROVIDER=google
268
+ GOOGLE_MODEL=gemini-2.5-flash
269
+
270
+ # Quality-focused
271
+ LLM_PROVIDER=openai
272
+ OPENAI_MODEL=gpt-4o-mini
273
+ ```
274
+
275
+ ### 🏠 **Self-Hosted**
276
+ **Choice**: Llama 3.1:8b or upgrade to DeepSeek-R1:7b
277
+ ```bash
278
+ # Your current (excellent choice)
279
+ LLM_PROVIDER=ollama
280
+ OLLAMA_MODEL=llama3.1:8b
281
+
282
+ # Upgrade option (better reasoning)
283
+ LLM_PROVIDER=ollama
284
+ OLLAMA_MODEL=deepseek-r1:7b
285
+ ```
286
+
287
+ ### 💰 **Budget/Free**
288
+ **Choice**: Local models or GPT-5-nano
289
+ ```bash
290
+ # Best local alternative
291
+ LLM_PROVIDER=ollama
292
+ OLLAMA_MODEL=codeqwen:7b
293
+
294
+ # Best budget paid option
295
+ LLM_PROVIDER=openai
296
+ OPENAI_MODEL=gpt-5-nano
297
+
298
+ # Quality budget cloud
299
+ LLM_PROVIDER=anthropic
300
+ ANTHROPIC_MODEL=claude-3-5-haiku-20241022
301
+ ```
302
+
303
+ ### 🔒 **Privacy/Offline**
304
+ **Choice**: DeepSeek-R1:7b or Gemma 3:4b
305
+ ```bash
306
+ # Best reasoning
307
+ LLM_PROVIDER=ollama
308
+ OLLAMA_MODEL=deepseek-r1:7b
309
+
310
+ # Resource-efficient
311
+ LLM_PROVIDER=ollama
312
+ OLLAMA_MODEL=gemma3:4b
313
+ ```
314
+
315
+ ---
316
+
317
+ ## ⚡ Quick Setup Commands
318
+
319
+ ### Cloud Models (Instant Setup)
320
+
321
+ #### Gemini 2.5 Flash (Recommended)
322
+ ```bash
323
+ # Update .env
324
+ LLM_PROVIDER=google
325
+ GOOGLE_MODEL=gemini-2.5-flash
326
+ GOOGLE_TEMPERATURE=0.7
327
+ GOOGLE_MAX_TOKENS=1000
328
+
329
+ # Test
330
+ python -c "
331
+ from services.llm_service import LLMService
332
+ service = LLMService()
333
+ print('✅ Gemini 2.5 Flash ready!')
334
+ response = service.simple_chat_completion('Suggest a quick pasta recipe')
335
+ print(f'Response: {response[:100]}...')
336
+ "
337
+ ```
338
+
339
+ #### CodeQwen1.5:7b (Structured Data Expert)
340
+ ```bash
341
+ # Pull model
342
+ ollama pull codeqwen:7b
343
+
344
+ # Update .env
345
+ LLM_PROVIDER=ollama
346
+ OLLAMA_MODEL=codeqwen:7b
347
+ OLLAMA_TEMPERATURE=0.7
348
+
349
+ # Test
350
+ python -c "
351
+ from services.llm_service import LLMService
352
+ service = LLMService()
353
+ print('✅ CodeQwen 1.5:7b ready!')
354
+ response = service.simple_chat_completion('Parse this recipe: 2 cups flour, 1 egg, 1 cup milk')
355
+ print(f'Response: {response[:100]}...')
356
+ "
357
+ ```
358
+
359
+ #### Mistral-Nemo:12b (Balanced Performance)
360
+ ```bash
361
+ # Pull model
362
+ ollama pull mistral-nemo:12b
363
+
364
+ # Update .env
365
+ LLM_PROVIDER=ollama
366
+ OLLAMA_MODEL=mistral-nemo:12b
367
+ OLLAMA_TEMPERATURE=0.7
368
+
369
+ # Test
370
+ python -c "
371
+ from services.llm_service import LLMService
372
+ service = LLMService()
373
+ print('✅ Mistral-Nemo ready!')
374
+ response = service.simple_chat_completion('Suggest a Mediterranean dinner menu')
375
+ print(f'Response: {response[:100]}...')
376
+ "
377
+ ```
378
+
379
+ #### Claude 3.5 Haiku (Speed + Quality)
380
+ ```bash
381
+ # Update .env
382
+ LLM_PROVIDER=anthropic
383
+ ANTHROPIC_MODEL=claude-3-5-haiku-20241022
384
+ ANTHROPIC_TEMPERATURE=0.7
385
+ ANTHROPIC_MAX_TOKENS=1000
386
+
387
+ # Test
388
+ python -c "
389
+ from services.llm_service import LLMService
390
+ service = LLMService()
391
+ print('✅ Claude 3.5 Haiku ready!')
392
+ response = service.simple_chat_completion('Quick dinner ideas with vegetables')
393
+ print(f'Response: {response[:100]}...')
394
+ "
395
+ ```
396
+
397
+ #### GPT-5-nano (Budget Winner)
398
+ ```bash
399
+ # Update .env
400
+ LLM_PROVIDER=openai
401
+ OPENAI_MODEL=gpt-5-nano
402
+ OPENAI_TEMPERATURE=0.7
403
+ OPENAI_MAX_TOKENS=1000
404
+
405
+ # Test
406
+ python -c "
407
+ from services.llm_service import LLMService
408
+ service = LLMService()
409
+ print('✅ GPT-5-nano ready!')
410
+ response = service.simple_chat_completion('Quick healthy breakfast ideas')
411
+ print(f'Response: {response[:100]}...')
412
+ "
413
+ ```
414
+
415
+ #### GPT-5 (Premium)
416
+ ```bash
417
+ # Update .env
418
+ LLM_PROVIDER=openai
419
+ OPENAI_MODEL=gpt-5
420
+ OPENAI_TEMPERATURE=0.7
421
+ OPENAI_MAX_TOKENS=1000
422
+
423
+ # Test
424
+ python -c "
425
+ from services.llm_service import LLMService
426
+ service = LLMService()
427
+ print('✅ GPT-5 ready!')
428
+ response = service.simple_chat_completion('Create a healthy meal plan')
429
+ print(f'Response: {response[:100]}...')
430
+ "
431
+ ```
432
+
433
+ ### Self-Hosted Models
434
+
435
+ #### DeepSeek-R1:7b (Latest Breakthrough)
436
+ ```bash
437
+ # Pull model
438
+ ollama pull deepseek-r1:7b
439
+
440
+ # Update .env
441
+ LLM_PROVIDER=ollama
442
+ OLLAMA_MODEL=deepseek-r1:7b
443
+ OLLAMA_TEMPERATURE=0.7
444
+
445
+ # Start Ollama
446
+ ollama serve &
447
+
448
+ # Test
449
+ python -c "
450
+ from services.llm_service import LLMService
451
+ service = LLMService()
452
+ print('✅ DeepSeek-R1 ready!')
453
+ response = service.simple_chat_completion('Explain the science behind sourdough fermentation')
454
+ print(f'Response: {response[:100]}...')
455
+ "
456
+ ```
457
+
458
+ #### Gemma 3:4b (Efficient)
459
+ ```bash
460
+ # Pull model
461
+ ollama pull gemma3:4b
462
+
463
+ # Update .env
464
+ LLM_PROVIDER=ollama
465
+ OLLAMA_MODEL=gemma3:4b
466
+ OLLAMA_TEMPERATURE=0.7
467
+
468
+ # Test
469
+ python -c "
470
+ from services.llm_service import LLMService
471
+ service = LLMService()
472
+ print('✅ Gemma 3:4b ready!')
473
+ response = service.simple_chat_completion('Quick chicken recipes for weeknight dinners')
474
+ print(f'Response: {response[:100]}...')
475
+ "
476
+ ```
477
+
478
+ ---
479
+
480
+ ## 🔧 Hardware Requirements
481
+
482
+ ### Cloud Models
483
+ - **Requirements**: Internet connection, API key
484
+ - **RAM**: Any (processing done remotely)
485
+ - **Storage**: Minimal
486
+ - **Best For**: Instant setup, no hardware constraints
487
+
488
+ ### Self-Hosted Requirements
489
+
490
+ | Model | Parameters | RAM Needed | Storage | GPU Beneficial | Best For |
491
+ |-------|------------|------------|---------|----------------|----------|
492
+ | `gemma3:4b` | 4B | 6GB | 3.3GB | Optional | Laptops, modest hardware |
493
+ | `codeqwen:7b` | 7B | 8GB | 4.2GB | Yes | Structured data, parsing |
494
+ | `llama3.1:8b` | 8B | 8GB | 4.7GB | Yes | Standard workstations |
495
+ | `deepseek-r1:7b` | 7B | 8GB | 4.7GB | Yes | Reasoning tasks |
496
+ | `openhermes2.5-mistral:7b` | 7B | 8GB | 4.1GB | Yes | Conversational AI |
497
+ | `nous-hermes2:10.7b` | 10.7B | 12GB | 6.4GB | Recommended | Instruction following |
498
+ | `mistral-nemo:12b` | 12B | 12GB | 7GB | Recommended | Balanced performance |
499
+ | `phi4:14b` | 14B | 16GB | 9.1GB | Recommended | High-end workstations |
500
+ | `gemma3:27b` | 27B | 32GB | 17GB | Required | Powerful servers |
501
+
502
+ ---
backend/docs/opensource-llm-configuration.md ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Open Source LLM Configuration Guide (HuggingFace & Ollama)
2
+
3
+ ## Overview
4
+ The Recipe Recommendation Bot supports open source models through both HuggingFace and Ollama. This guide explains how to configure these providers for optimal performance, with recommended models under 20B parameters.
5
+
6
+ > 📚 **For comprehensive model comparisons including closed source options (OpenAI, Google), see [Comprehensive Model Guide](./comprehensive-model-guide.md)**
7
+
8
+ ## Quick Model Recommendations
9
+
10
+ | Use Case | Model | Download Size | RAM Required | Quality |
11
+ |----------|-------|---------------|--------------|---------|
12
+ | **Development** | `gemma2:2b` | 1.6GB | 4GB | Good |
13
+ | **Production** | `llama3.1:8b` | 4.7GB | 8GB | Excellent |
14
+ | **High Quality** | `llama3.1:13b` | 7.4GB | 16GB | Outstanding |
15
+ | **API (Free)** | `deepseek-ai/DeepSeek-V3.1` | 0GB | N/A | Very Good |
16
+
17
+ ## 🤗 HuggingFace Configuration
18
+
19
+ ### Environment Variables
20
+
21
+ Add these variables to your `.env` file:
22
+
23
+ ```bash
24
+ # LLM Provider Configuration
25
+ LLM_PROVIDER=huggingface
26
+
27
+ # HuggingFace Configuration
28
+ HUGGINGFACE_API_TOKEN=your_hf_token_here # Optional for public models
29
+ HUGGINGFACE_MODEL=deepseek-ai/DeepSeek-V3.1 # Current recommended model
30
+ HUGGINGFACE_API_URL=https://api-inference.huggingface.co/models/
31
+ HUGGINGFACE_USE_API=true # Use API vs local inference
32
+ HUGGINGFACE_USE_GPU=false # Set to true for local GPU inference
33
+
34
+ # Embedding Configuration
35
+ HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
36
+ ```
37
+
38
+ ### Deployment Options
39
+
40
+ #### Option 1: API Inference (Recommended)
41
+ ```bash
42
+ HUGGINGFACE_USE_API=true
43
+ ```
44
+ - **Pros**: No local downloads, fast startup, always latest models
45
+ - **Cons**: Requires internet connection, API rate limits
46
+ - **Download Size**: 0 bytes (no local storage needed)
47
+ - **Best for**: Development, testing, quick prototyping
48
+
49
+ #### Option 2: Local Inference
50
+ ```bash
51
+ HUGGINGFACE_USE_API=false
52
+ HUGGINGFACE_USE_GPU=false # CPU-only
53
+ ```
54
+ - **Pros**: No internet required, no rate limits, private
55
+ - **Cons**: Large model downloads, slower inference on CPU
56
+ - **Best for**: Production, offline deployments
57
+
58
+ #### Option 3: Local GPU Inference
59
+ ```bash
60
+ HUGGINGFACE_USE_API=false
61
+ HUGGINGFACE_USE_GPU=true # Requires CUDA GPU
62
+ ```
63
+ - **Pros**: Fast inference, no internet required, no rate limits
64
+ - **Cons**: Large downloads, requires GPU with sufficient VRAM
65
+ - **Best for**: Production with GPU resources
66
+
67
+ ### Recommended HuggingFace Models
68
+
69
+ #### Lightweight Models (Good for CPU)
70
+ ```bash
71
+ HUGGINGFACE_MODEL=microsoft/DialoGPT-small # ~117MB download
72
+ HUGGINGFACE_MODEL=distilgpt2 # ~319MB download
73
+ HUGGINGFACE_MODEL=google/flan-t5-small # ~242MB download
74
+ ```
75
+
76
+ #### Balanced Performance Models
77
+ ```bash
78
+ HUGGINGFACE_MODEL=microsoft/DialoGPT-medium # ~863MB download
79
+ HUGGINGFACE_MODEL=google/flan-t5-base # ~990MB download
80
+ HUGGINGFACE_MODEL=microsoft/CodeGPT-small-py # ~510MB download
81
+ ```
82
+
83
+ #### High Quality Models (GPU Recommended)
84
+ ```bash
85
+ HUGGINGFACE_MODEL=deepseek-ai/DeepSeek-V3.1 # ~4.2GB download (7B params)
86
+ HUGGINGFACE_MODEL=microsoft/DialoGPT-large # ~3.2GB download
87
+ HUGGINGFACE_MODEL=google/flan-t5-large # ~2.8GB download (770M params)
88
+ HUGGINGFACE_MODEL=huggingface/CodeBERTa-small-v1 # ~1.1GB download
89
+ ```
90
+
91
+ #### Specialized Recipe/Cooking Models
92
+ ```bash
93
+ HUGGINGFACE_MODEL=recipe-nlg/recipe-nlg-base # ~450MB download
94
+ HUGGINGFACE_MODEL=cooking-assistant/chef-gpt # ~2.1GB download (if available)
95
+ ```
96
+
97
+ ## 🦙 Ollama Configuration
98
+
99
+ ### Installation
100
+
101
+ First, install Ollama on your system:
102
+
103
+ ```bash
104
+ # Linux/macOS
105
+ curl -fsSL https://ollama.ai/install.sh | sh
106
+
107
+ # Windows
108
+ # Download installer from https://ollama.ai/download
109
+ ```
110
+
111
+ ### Environment Variables
112
+
113
+ ```bash
114
+ # LLM Provider Configuration
115
+ LLM_PROVIDER=ollama
116
+
117
+ # Ollama Configuration
118
+ OLLAMA_BASE_URL=http://localhost:11434
119
+ OLLAMA_MODEL=llama3.1:8b
120
+ OLLAMA_TEMPERATURE=0.7
121
+
122
+ # Embedding Configuration
123
+ OLLAMA_EMBEDDING_MODEL=nomic-embed-text
124
+ ```
125
+
126
+ ### Starting Ollama Service
127
+
128
+ ```bash
129
+ # Start Ollama server
130
+ ollama serve
131
+
132
+ # In another terminal, pull your desired model
133
+ ollama pull llama3.1:8b
134
+ ```
135
+
136
+ ### Recommended Ollama Models
137
+
138
+ #### Lightweight Models (4GB RAM or less)
139
+ ```bash
140
+ OLLAMA_MODEL=phi3:mini # ~2.3GB download (3.8B params)
141
+ OLLAMA_MODEL=gemma2:2b # ~1.6GB download (2B params)
142
+ OLLAMA_MODEL=qwen2:1.5b # ~934MB download (1.5B params)
143
+ ```
144
+
145
+ #### Balanced Performance Models (8GB RAM)
146
+ ```bash
147
+ OLLAMA_MODEL=llama3.1:8b # ~4.7GB download (8B params)
148
+ OLLAMA_MODEL=gemma2:9b # ~5.4GB download (9B params)
149
+ OLLAMA_MODEL=mistral:7b # ~4.1GB download (7B params)
150
+ OLLAMA_MODEL=qwen2:7b # ~4.4GB download (7B params)
151
+ ```
152
+
153
+ #### High Quality Models (16GB+ RAM)
154
+ ```bash
155
+ OLLAMA_MODEL=llama2:13b # ~7.4GB download (13B params)
156
+ OLLAMA_MODEL=mixtral:8x7b # ~26GB download (47B params - sparse)
157
+ OLLAMA_MODEL=qwen2:14b # ~8.2GB download (14B params)
158
+ ```
159
+
160
+ #### Code/Instruction Following Models
161
+ ```bash
162
+ OLLAMA_MODEL=codellama:7b # ~3.8GB download (7B params)
163
+ OLLAMA_MODEL=deepseek-coder:6.7b # ~3.8GB download (6.7B params)
164
+ OLLAMA_MODEL=wizard-coder:7b # ~4.1GB download (7B params)
165
+ ```
166
+
167
+ ### Ollama Model Management
168
+
169
+ ```bash
170
+ # List available models
171
+ ollama list
172
+
173
+ # Pull a specific model
174
+ ollama pull llama3.1:8b
175
+
176
+ # Remove a model to free space
177
+ ollama rm old-model:tag
178
+
179
+ # Check model information
180
+ ollama show llama3.1:8b
181
+ ```
182
+
183
+ ## Installation Requirements
184
+
185
+ ### HuggingFace Setup
186
+
187
+ #### For API Usage (No Downloads)
188
+ ```bash
189
+ pip install -r requirements.txt
190
+ # No additional setup needed
191
+ ```
192
+
193
+ #### For Local CPU Inference
194
+ ```bash
195
+ pip install -r requirements.txt
196
+ # Models will be downloaded automatically on first use
197
+ ```
198
+
199
+ #### For Local GPU Inference
200
+ ```bash
201
+ # Install CUDA version of PyTorch
202
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
203
+
204
+ # Install other requirements
205
+ pip install -r requirements.txt
206
+
207
+ # Verify GPU availability
208
+ python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
209
+ ```
210
+
211
+ ### Ollama Setup
212
+
213
+ #### Installation
214
+ ```bash
215
+ # Install Ollama
216
+ curl -fsSL https://ollama.ai/install.sh | sh
217
+
218
+ # Start Ollama service
219
+ ollama serve
220
+
221
+ # Pull your first model (in another terminal)
222
+ ollama pull llama3.1:8b
223
+ ```
224
+
225
+ ## Storage Requirements & Download Sizes
226
+
227
+ ### HuggingFace Local Models
228
+ - **Storage Location**: `~/.cache/huggingface/transformers/`
229
+ - **Small Models**: 100MB - 1GB (good for development)
230
+ - **Medium Models**: 1GB - 5GB (balanced performance)
231
+ - **Large Models**: 5GB - 15GB (high quality, under 20B params)
232
+
233
+ ### Ollama Models
234
+ - **Storage Location**: `~/.ollama/models/`
235
+ - **Quantized Storage**: Models use efficient quantization (4-bit, 8-bit)
236
+ - **2B Models**: ~1-2GB download
237
+ - **7-8B Models**: ~4-5GB download
238
+ - **13-14B Models**: ~7-8GB download
239
+
240
+ ### Embedding Models
241
+ ```bash
242
+ # HuggingFace Embeddings (auto-downloaded)
243
+ sentence-transformers/all-MiniLM-L6-v2 # ~80MB
244
+ sentence-transformers/all-mpnet-base-v2 # ~420MB
245
+
246
+ # Ollama Embeddings
247
+ ollama pull nomic-embed-text # ~274MB
248
+ ollama pull mxbai-embed-large # ~669MB
249
+ ```
250
+
251
+ ## Performance & Hardware Recommendations
252
+
253
+ ### System Requirements
254
+
255
+ #### Minimum (API Usage)
256
+ - **RAM**: 2GB
257
+ - **Storage**: 100MB
258
+ - **Internet**: Required for API calls
259
+
260
+ #### CPU Inference
261
+ - **RAM**: 8GB+ (16GB for larger models)
262
+ - **CPU**: 4+ cores recommended
263
+ - **Storage**: 5GB+ for models cache
264
+
265
+ #### GPU Inference
266
+ - **GPU**: 8GB+ VRAM (for 7B models)
267
+ - **RAM**: 16GB+ system RAM
268
+ - **Storage**: 10GB+ for models
269
+
270
+ ### Performance Tips
271
+
272
+ 1. **Start Small**: Begin with lightweight models and upgrade based on quality needs
273
+ 2. **Use API First**: Test with HuggingFace API before committing to local inference
274
+ 3. **Monitor Resources**: Check CPU/GPU/RAM usage during inference
275
+ 4. **Model Caching**: First run downloads models, subsequent runs are faster
276
+
277
+ ## Troubleshooting
278
+
279
+ ### HuggingFace Issues
280
+
281
+ #### "accelerate package required"
282
+ ```bash
283
+ pip install accelerate
284
+ ```
285
+
286
+ #### GPU not detected
287
+ ```bash
288
+ # Check CUDA availability
289
+ python -c "import torch; print(torch.cuda.is_available())"
290
+
291
+ # If false, install CUDA PyTorch
292
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
293
+ ```
294
+
295
+ #### Out of memory errors
296
+ - Switch to a smaller model
297
+ - Set `HUGGINGFACE_USE_GPU=false` for CPU inference
298
+ - Use API instead: `HUGGINGFACE_USE_API=true`
299
+
300
+ ### Ollama Issues
301
+
302
+ #### Ollama service not starting
303
+ ```bash
304
+ # Check if port 11434 is available
305
+ lsof -i :11434
306
+
307
+ # Restart Ollama
308
+ ollama serve
309
+ ```
310
+
311
+ #### Model not found
312
+ ```bash
313
+ # List available models
314
+ ollama list
315
+
316
+ # Pull the model
317
+ ollama pull llama3.1:8b
318
+ ```
319
+
320
+ #### Slow inference
321
+ - Try a smaller model
322
+ - Check available RAM
323
+ - Consider using GPU if available
324
+
325
+ ## Quick Tests
326
+
327
+ ### Test HuggingFace Configuration
328
+ ```bash
329
+ cd backend
330
+ python -c "
331
+ from services.llm_service import LLMService
332
+ import os
333
+ os.environ['LLM_PROVIDER'] = 'huggingface'
334
+ service = LLMService()
335
+ print('✅ HuggingFace LLM working!')
336
+ response = service.simple_chat_completion('Hello')
337
+ print(f'Response: {response}')
338
+ "
339
+ ```
340
+
341
+ ### Test Ollama Configuration
342
+ ```bash
343
+ # First ensure Ollama is running
344
+ ollama serve &
345
+
346
+ # Test the service
347
+ cd backend
348
+ python -c "
349
+ from services.llm_service import LLMService
350
+ import os
351
+ os.environ['LLM_PROVIDER'] = 'ollama'
352
+ service = LLMService()
353
+ print('✅ Ollama LLM working!')
354
+ response = service.simple_chat_completion('Hello')
355
+ print(f'Response: {response}')
356
+ "
357
+ ```
358
+
359
+ ## Configuration Examples
360
+
361
+ ### Development Setup (Fast Start)
362
+ ```bash
363
+ # Use HuggingFace API for quick testing
364
+ LLM_PROVIDER=huggingface
365
+ HUGGINGFACE_USE_API=true
366
+ HUGGINGFACE_MODEL=deepseek-ai/DeepSeek-V3.1
367
+ HUGGINGFACE_API_TOKEN=your_token_here
368
+ ```
369
+
370
+ ### Local CPU Setup
371
+ ```bash
372
+ # Local inference on CPU
373
+ LLM_PROVIDER=ollama
374
+ OLLAMA_MODEL=llama3.1:8b
375
+ OLLAMA_BASE_URL=http://localhost:11434
376
+ ```
377
+
378
+ ### Local GPU Setup
379
+ ```bash
380
+ # Local inference with GPU acceleration
381
+ LLM_PROVIDER=huggingface
382
+ HUGGINGFACE_USE_API=false
383
+ HUGGINGFACE_USE_GPU=true
384
+ HUGGINGFACE_MODEL=deepseek-ai/deepseek-llm-7b-chat
385
+ ```
386
+
387
+ ### Production Setup (High Performance)
388
+ ```bash
389
+ # Ollama with optimized model
390
+ LLM_PROVIDER=ollama
391
+ OLLAMA_MODEL=llama3.1:13b # Higher quality
392
+ OLLAMA_BASE_URL=http://localhost:11434
393
+ # Ensure 16GB+ RAM available
394
+ ```
backend/docs/optimal_recipes_structure.md ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Universal Recipe Data Structure
2
+
3
+ This document defines a simple, universal data structure for recipe storage that works efficiently with both ChromaDB and MongoDB Atlas for ingredient-based recipe recommendations.
4
+
5
+ ## Core Principles
6
+
7
+ 1. **Ingredient-focused**: Primary search is by ingredients
8
+ 2. **Universal compatibility**: Same structure works for ChromaDB and MongoDB
9
+ 3. **Simple and clean**: Easy to understand and maintain
10
+ 4. **Efficient retrieval**: Optimized for RAG performance
11
+
12
+ ## Universal Recipe Structure
13
+
14
+ ### Required Fields
15
+
16
+ ```json
17
+ {
18
+ "title": "String - Recipe name",
19
+ "ingredients": ["Array of strings - Individual ingredients"],
20
+ "instructions": "String - Step-by-step cooking instructions",
21
+ "metadata": {
22
+ "cook_time": "String - Optional cooking time",
23
+ "difficulty": "String - Optional difficulty level",
24
+ "servings": "String - Optional number of servings",
25
+ "category": "String - Optional recipe category",
26
+ "image_url": "String - Optional recipe image URL"
27
+ }
28
+ }
29
+ ```
30
+
31
+ ### Example Document
32
+
33
+ ```json
34
+ {
35
+ "title": "Mixed Seafood Coconut Fried Rice",
36
+ "ingredients": [
37
+ "jasmine rice",
38
+ "cooked shrimp",
39
+ "prawns",
40
+ "scallops",
41
+ "coconut milk",
42
+ "fish sauce",
43
+ "soy sauce",
44
+ "garlic",
45
+ "onion",
46
+ "ginger",
47
+ "green onions",
48
+ "cilantro",
49
+ "lime",
50
+ "vegetable oil",
51
+ "salt",
52
+ "pepper"
53
+ ],
54
+ "instructions": "1. Heat vegetable oil in large pan. 2. Add garlic, onion, ginger and stir-fry until fragrant. 3. Add cooked rice and mix well. 4. Add seafood and cook until heated through. 5. Pour in coconut milk and season with fish sauce and soy sauce. 6. Garnish with green onions and cilantro. 7. Serve with lime wedges.",
55
+ "metadata": {
56
+ "cook_time": "25 minutes",
57
+ "difficulty": "medium",
58
+ "servings": "4",
59
+ "category": "seafood",
60
+ "image_url": "https://example.com/images/mixed-seafood-coconut-fried-rice.jpg"
61
+ }
62
+ }
63
+ ```
64
+
65
+ ## Key Features
66
+
67
+ ### 1. Clean Ingredients Format
68
+ - **Array structure**: Each ingredient as separate string
69
+ - **Individual embedding**: Each ingredient can be embedded separately
70
+ - **Easy matching**: Simple array operations for ingredient search
71
+ - **No duplicates**: Each ingredient appears once in the array
72
+
73
+ ### 2. Universal Compatibility
74
+ - **ChromaDB**: Automatically creates embeddings from full document
75
+ - **MongoDB Atlas**: Can use pre-computed embeddings or text search
76
+ - **Same structure**: No provider-specific modifications needed
77
+
78
+ ### 3. Efficient Search Patterns
79
+
80
+ #### Primary: Ingredient-based Search
81
+ ```
82
+ User: "I have shrimp, rice, and coconut milk"
83
+ Search: ingredients array for ["shrimp", "rice", "coconut"]
84
+ Result: Mixed Seafood Coconut Fried Rice (high relevance)
85
+ ```
86
+
87
+ #### Secondary: Title-based Search
88
+ ```
89
+ User: "How to make fried rice"
90
+ Search: title field for "fried rice"
91
+ Result: All fried rice recipes
92
+ ```
93
+
94
+ #### Fallback: Full-text Search
95
+ ```
96
+ User: "Quick dinner recipes"
97
+ Search: Full document for "quick dinner"
98
+ Result: Recipes mentioning quick preparation
99
+ ```
100
+
101
+ ## Implementation Guidelines
102
+
103
+ ### For ChromaDB
104
+ ```python
105
+ # Documents are automatically embedded as full text
106
+ ingredients_text = ", ".join(recipe['ingredients'])
107
+ document = Document(
108
+ page_content=f"Title: {recipe['title']}. Ingredients: {ingredients_text}. Instructions: {recipe['instructions']}",
109
+ metadata=recipe['metadata']
110
+ )
111
+ ```
112
+
113
+ ### For MongoDB Atlas
114
+ ```python
115
+ # Can use array search or vector search on the same structure
116
+ # Array search on ingredients
117
+ {"ingredients": {"$in": user_ingredients_list}}
118
+
119
+ # Or vector search if embeddings are pre-computed
120
+ {"ingredients_vector": {"$near": query_embedding}}
121
+ ```
122
+
123
+ ## Data Preparation
124
+
125
+ ### Ingredient Processing Rules
126
+ 1. **Clean individual items**: "2 cups rice" → "rice"
127
+ 2. **Remove measurements**: "1 lb chicken breast" → "chicken breast"
128
+ 3. **Lowercase**: "Fresh Basil" → "fresh basil"
129
+ 4. **Array format**: ["rice", "chicken breast", "fresh basil"]
130
+ 5. **No duplicates**: Remove duplicate ingredients from array
131
+
132
+ ### Example Transformation
133
+ ```
134
+ Raw: "2 lbs fresh shrimp, 1 cup jasmine rice (cooked), 1/2 cup coconut milk"
135
+ Clean: ["fresh shrimp", "jasmine rice", "coconut milk"]
136
+ ```
137
+
138
+ ## Benefits
139
+
140
+ ### 1. Simplicity
141
+ - Single structure for all providers
142
+ - Easy to understand and maintain
143
+ - No complex transformations needed
144
+
145
+ ### 2. Performance
146
+ - Optimized for ingredient matching
147
+ - Fast text and vector search
148
+ - Minimal processing overhead
149
+
150
+ ### 3. Flexibility
151
+ - Works with existing MongoDB data
152
+ - Compatible with ChromaDB auto-embedding
153
+ - Supports both search types (text/vector)
154
+
155
+ ### 4. Scalability
156
+ - Easy to add new recipes
157
+ - Simple data validation
158
+ - Consistent across providers
159
+
160
+ This universal structure ensures maximum compatibility and efficiency for ingredient-based recipe recommendations across all vector store providers.
backend/docs/sanitization_guide.md ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Simplified Data Sanitization Documentation
2
+
3
+ ## Overview
4
+
5
+ The simplified data sanitization module provides focused input validation and sanitization for the Recipe Recommendation Bot API. It's designed specifically for recipe chatbot context with essential security protection.
6
+
7
+ ## Features
8
+
9
+ ### 🛡️ **Essential Security Protection**
10
+ - **XSS Prevention**: HTML encoding and basic script removal
11
+ - **Input Validation**: Length limits and content validation
12
+ - **Whitespace Normalization**: Clean formatting
13
+
14
+ ### 🔧 **Simple Configuration**
15
+ - **Maximum Message Length**: 1000 characters
16
+ - **Minimum Message Length**: 1 character
17
+ - **Single Method**: One sanitization method for all inputs
18
+
19
+ ## Usage
20
+
21
+ ### Basic Sanitization
22
+
23
+ ```python
24
+ from utils.sanitization import sanitize_user_input
25
+
26
+ # Sanitize any user input (chat messages, demo prompts)
27
+ clean_input = sanitize_user_input("What are some chicken recipes?")
28
+ ```
29
+
30
+ ### Advanced Usage
31
+
32
+ ```python
33
+ from utils.sanitization import DataSanitizer
34
+
35
+ # Direct class usage
36
+ sanitizer = DataSanitizer()
37
+ clean_text = sanitizer.sanitize_input("User input")
38
+ ```
39
+
40
+ ## Security Patterns Handled
41
+
42
+ ### Basic XSS Protection
43
+ - `<script>` tags → Removed
44
+ - `javascript:` URLs → Cleaned
45
+ - Event handlers (`onclick`, `onload`) → Removed
46
+ - HTML entities → Properly encoded
47
+
48
+ ### Input Validation
49
+ - Length limits (1-1000 characters)
50
+ - Empty input detection
51
+ - Whitespace normalization
52
+
53
+ ## Integration
54
+
55
+ The sanitization is automatically applied in FastAPI endpoints:
56
+
57
+ ### Chat Endpoint
58
+ ```python
59
+ class ChatMessage(BaseModel):
60
+ message: str = Field(..., min_length=1, max_length=1000)
61
+
62
+ @validator('message')
63
+ def sanitize_message_field(cls, v):
64
+ return sanitize_user_input(v)
65
+ ```
66
+
67
+ ### Demo Endpoint
68
+ ```python
69
+ @app.get("/demo")
70
+ def demo(prompt: str = "What recipes do you have?"):
71
+ sanitized_prompt = sanitize_user_input(prompt)
72
+ # ... rest of the logic
73
+ ```
74
+
75
+ ## Error Handling
76
+
77
+ The sanitization raises `ValueError` for invalid input:
78
+
79
+ ```python
80
+ try:
81
+ clean_input = sanitize_user_input(user_input)
82
+ except ValueError as e:
83
+ return {"error": f"Invalid input: {str(e)}"}
84
+ ```
85
+
86
+ ## Testing
87
+
88
+ Run the sanitization tests:
89
+
90
+ ```bash
91
+ python3 test_sanitization.py
92
+ ```
93
+
94
+ The test suite covers:
95
+ - Normal recipe-related messages
96
+ - Basic harmful content (scripts, JavaScript)
97
+ - Length validation
98
+ - Whitespace normalization
99
+ - Edge cases
100
+
101
+ ## What's Simplified
102
+
103
+ ### Removed Overly Complex Features:
104
+ - ❌ SQL injection patterns (not relevant for LLM chatbot)
105
+ - ❌ Command injection patterns (not applicable)
106
+ - ❌ Separate strict/relaxed modes (unnecessary complexity)
107
+ - ❌ Multiple sanitization methods (unified approach)
108
+
109
+ ### Kept Essential Features:
110
+ - ✅ Basic XSS protection
111
+ - ✅ Input length validation
112
+ - ✅ HTML encoding
113
+ - ✅ Whitespace normalization
114
+ - ✅ Clear error messages
115
+
116
+ ## Performance
117
+
118
+ - **Lightweight**: Minimal regex patterns
119
+ - **Fast**: Simple operations only
120
+ - **Memory Efficient**: No complex state
121
+ - **Recipe-Focused**: Context-appropriate validation
122
+
123
+ ## Examples
124
+
125
+ ### Valid Inputs (Cleaned):
126
+ ```python
127
+ "What are chicken recipes?" → "What are chicken recipes?"
128
+ "<script>alert('xss')</script>Tell me about pasta" → "Tell me about pasta"
129
+ " How to cook rice? " → "How to cook rice?"
130
+ "What about desserts & sweets?" → "What about desserts &amp; sweets?"
131
+ ```
132
+
133
+ ### Invalid Inputs (Rejected):
134
+ ```python
135
+ "" → ValueError: Input cannot be empty
136
+ "a" * 1001 → ValueError: Input too long (maximum 1000 characters)
137
+ ```
138
+
139
+ ## Best Practices
140
+
141
+ 1. **Keep It Simple**: Focus on actual threats for recipe chatbot
142
+ 2. **Context Appropriate**: Don't over-engineer for non-existent threats
143
+ 3. **User Friendly**: Allow normal recipe-related punctuation
144
+ 4. **Clear Errors**: Provide helpful error messages
145
+ 5. **Test Regularly**: Verify with real recipe queries
146
+
147
+ This simplified approach provides adequate protection while maintaining usability for a recipe recommendation chatbot context.
backend/docs/scraper.md ADDED
@@ -0,0 +1,372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Recipe Scraper – FastAPI demo
2
+
3
+ A tiny FastAPI service + CLI that scrapes recipe sites, normalizes data, and (optionally) embeds combined **ingredients + instructions** into a single vector (`recipe_emb`). Designed as a **test project**—simple to run locally, easy to extend.
4
+
5
+ ---
6
+
7
+ ## Features
8
+
9
+ * 🔧 **Sites**: `yummy` (YummyMedley), `anr` (All Nigerian Recipes)
10
+ * 🧱 **Unified text**: builds `recipe_text` from sections, or embeds `("ingredients","instructions") → recipe_emb`
11
+ * 🧠 **Embeddings**: Hugging Face `sentence-transformers` via your `HFEmbedder` (default: `all-MiniLM-L6-v2`)
12
+ * 🚀 **API trigger**: `POST /scrape` runs scraping in the background
13
+ * 👀 **Progress**: `GET /jobs/{job_id}` (and optional `GET /jobs`) to check status
14
+ * 💾 **Output**: `output_type = "json"` (local file) or `"mongo"` (MongoDB/Atlas)
15
+
16
+ ---
17
+
18
+ ## Project layout (essential bits)
19
+
20
+ ```
21
+ backend/
22
+ app.py
23
+ data_minning/
24
+ base_scrapper.py # BaseRecipeScraper (+ StreamOptions)
25
+ all_nigerian_recipe_scraper.py
26
+ yummy_medley_scraper.py
27
+ dto/recipe_doc.py
28
+ soup_client.py
29
+ utils/sanitization.py
30
+ ```
31
+
32
+ Make sure every package dir has an `__init__.py`.
33
+
34
+ ---
35
+
36
+ ## Requirements
37
+
38
+ * Python 3.9+
39
+ * macOS/Linux (Windows should work too)
40
+ * (Optional) MongoDB/Atlas for `"mongo"` output
41
+
42
+ ### Install
43
+
44
+ ```bash
45
+ python3 -m venv .venv
46
+ source .venv/bin/activate
47
+
48
+ pip install --upgrade pip
49
+ pip install -r requirements.txt
50
+ # If you don’t have a requirements.txt, minimum:
51
+ pip install fastapi "uvicorn[standard]" pydantic==2.* requests beautifulsoup4 \
52
+ sentence-transformers numpy pymongo python-dotenv
53
+ ```
54
+
55
+ > If `uvicorn` isn’t found on your PATH, you can always run with `python3 -m uvicorn ...`.
56
+
57
+ ---
58
+
59
+ ## Environment variables
60
+
61
+ Create `.env` in repo root (or export envs) as needed:
62
+
63
+ ```dotenv
64
+
65
+
66
+ # For Mongo output_type="mongo"
67
+ MONGODB_URI=mongodb+srv://user:pass@cluster/recipes?retryWrites=true&w=majority
68
+ MONGODB_DB=recipes
69
+ MONGODB_COL=items
70
+ ATLAS_INDEX=recipes_vec # your Atlas Search index name
71
+
72
+ # Embeddings (HFEmbedder)
73
+ HF_MODEL=sentence-transformers/all-MiniLM-L6-v2
74
+ HF_DEVICE=cpu # or cuda
75
+ ```
76
+
77
+ ---
78
+
79
+ ## Running the API
80
+
81
+ From the project root (the folder **containing** `backend/`):
82
+
83
+ ```bash
84
+ python3 -m uvicorn app:app --reload --host 127.0.0.1 --port 8080
85
+ ```
86
+
87
+
88
+ ---
89
+
90
+ ## API
91
+
92
+ ### POST `/scrape`
93
+
94
+ Trigger a scrape job (non-blocking). **Body** is a JSON object:
95
+
96
+ ```json
97
+ {
98
+ "site": "yummy",
99
+ "limit": 50, #optional
100
+ "output_type": "json" // or "mongo"
101
+ }
102
+ ```
103
+
104
+ **Headers**
105
+
106
+ * `Content-Type: application/json`
107
+ * If enabled: `X-API-Key: <ADMIN_API_KEY>`
108
+
109
+ **curl example (JSON output):**
110
+
111
+ ```bash
112
+ curl -X POST http://127.0.0.1:8080/scrape \
113
+ -H "Content-Type: application/json" \
114
+ -H "X-API-Key: dev-key" \
115
+ -d '{"site":"yummy","limit":20,"output_type":"json"}'
116
+ ```
117
+
118
+ **Response**
119
+
120
+ ```json
121
+ { "job_id": "yummy-a1b2c3d4", "status": "queued" }
122
+ ```
123
+
124
+ ### GET `/jobs/{job_id}`
125
+
126
+ Check progress:
127
+
128
+ ```bash
129
+ curl http://127.0.0.1:8080/jobs/yummy-a1b2c3d4
130
+ ```
131
+
132
+ **Possible responses**
133
+
134
+ ```json
135
+ { "status": "running", "count": 13 }
136
+ { "status": "done", "count": 50 }
137
+ { "status": "error", "error": "Traceback ..." }
138
+ { "status": "unknown" }
139
+ ```
140
+
141
+ ### (Optional) GET `/jobs`
142
+
143
+ Return the whole in-memory job map (useful for debugging):
144
+
145
+ ```bash
146
+ curl http://127.0.0.1:8080/jobs
147
+ ```
148
+
149
+ > Note: jobs are stored in a process-local dict and clear on server restart.
150
+
151
+ ---
152
+
153
+ ## Output modes
154
+
155
+ ### `"json"`
156
+
157
+ Writes batches to a JSON sink (e.g., newline-delimited file). Check the sink path configured in your `JsonArraySink`/`DualSink`.
158
+
159
+ Typical document shape:
160
+
161
+ ```json
162
+ {
163
+ "title": "...",
164
+ "url": "...",
165
+ "source": "...",
166
+ "category": "...",
167
+ "ingredients": "- 1 cup rice\n- 2 tbsp oil\n...",
168
+ "instructions": "1. Heat oil...\n\n2. Add rice...",
169
+ "image_url": "...",
170
+ "needs_review": false,
171
+ "scraped_at": "2025-09-14 10:03:32.289232",
172
+ "recipe_emb": [0.0123, -0.0456, ...] // when embeddings enabled
173
+ }
174
+ ```
175
+
176
+ ### `"mongo"`
177
+
178
+ Writes to `MONGODB_DB.MONGODB_COL`. Ensure your Atlas Search index is created if you plan to query vectors.
179
+
180
+ **Atlas index mapping (single vector field)**
181
+
182
+ ```json
183
+ {
184
+ "mappings": {
185
+ "dynamic": false,
186
+ "fields": {
187
+ "recipe_emb": { "type": "knnVector", "dims": 384, "similarity": "cosine" }
188
+ }
189
+ }
190
+ }
191
+ ```
192
+
193
+ **Query example:**
194
+
195
+ ```python
196
+ qvec = embedder.encode([query])[0]
197
+ pipeline = [{
198
+ "$vectorSearch": {
199
+ "index": os.getenv("ATLAS_INDEX", "recipes_vec"),
200
+ "path": "recipe_emb",
201
+ "queryVector": qvec,
202
+ "numCandidates": 400,
203
+ "limit": 10,
204
+ "filter": { "needs_review": { "$ne": True } }
205
+ }
206
+ }]
207
+ results = list(col.aggregate(pipeline))
208
+ ```
209
+
210
+ ---
211
+
212
+ ## Embeddings (combined fields → one vector)
213
+
214
+ We embed **ingredients + instructions** into a single `recipe_emb`. Two supported patterns:
215
+
216
+ ### A) Combine at embedding time
217
+
218
+ Configure:
219
+
220
+ ```python
221
+ embedding_fields = [
222
+ (("ingredients", "instructions"), "recipe_emb")
223
+ ]
224
+ ```
225
+
226
+ `_apply_embeddings` concatenates labeled sections:
227
+
228
+ ```
229
+ Ingredients:
230
+ - ...
231
+
232
+ Instructions:
233
+ 1. ...
234
+ ```
235
+
236
+ ### B) Build `recipe_text` in `RecipeDoc.finalize()` and embed once
237
+
238
+ ```python
239
+ self.recipe_text = "\n\n".join(
240
+ [s for s in [
241
+ f"Title:\n{self.title}" if self.title else "",
242
+ f"Ingredients:\n{self.ingredients_text}" if self.ingredients_text else "",
243
+ f"Instructions:\n{self.instructions_text}" if self.instructions_text else ""
244
+ ] if s]
245
+ )
246
+ # embedding_fields = [("recipe_text", "recipe_emb")]
247
+ ```
248
+
249
+ **HFEmbedder config (defaults):**
250
+
251
+ ```python
252
+ HF_MODEL=sentence-transformers/all-MiniLM-L6-v2
253
+ HF_DEVICE=cpu
254
+ ```
255
+
256
+ ---
257
+
258
+ ## CLI (optional but handy)
259
+
260
+ Create `run_scrape.py`:
261
+
262
+ ```python
263
+ from backend.data_minning.yummy_medley_scraper import YummyMedleyScraper
264
+ from backend.data_minning.all_nigerian_recipe_scraper import AllNigerianRecipesScraper
265
+
266
+ SCRAPERS = {
267
+ "yummy": YummyMedleyScraper,
268
+ "anr": AllNigerianRecipesScraper,
269
+ }
270
+
271
+ if __name__ == "__main__":
272
+ import argparse
273
+ from backend.data_minning.dto.stream_opts import StreamOptions
274
+ p = argparse.ArgumentParser()
275
+ p.add_argument("--site", choices=SCRAPERS.keys(), required=True)
276
+ p.add_argument("--limit", type=int, default=50)
277
+ args = p.parse_args()
278
+
279
+ s = SCRAPERS[args.site]()
280
+ saved = s.stream(sink=..., options=StreamOptions(limit=args.limit))
281
+ print(f"Saved {saved}")
282
+ ```
283
+
284
+ Run:
285
+
286
+ ```bash
287
+ python3 run_scrape.py --site yummy --limit 25
288
+ ```
289
+
290
+ ---
291
+
292
+ ## Implementation notes
293
+
294
+ ### `StreamOptions` (clean params)
295
+
296
+ ```python
297
+ from dataclasses import dataclass
298
+ from typing import Optional, Callable
299
+
300
+ @dataclass
301
+ class StreamOptions:
302
+ delay: float = 0.3
303
+ limit: Optional[int] = None
304
+ batch_size: int = 50
305
+ resume_file: Optional[str] = None
306
+ progress_callback: Optional[Callable[[int], None]] = None
307
+ ```
308
+
309
+ ### Progress to `/jobs`
310
+
311
+ We pass a `progress_callback` that updates the job by `job_id`:
312
+
313
+ ```python
314
+ def make_progress_cb(job_id: str):
315
+ def _cb(n: int):
316
+ JOBS[job_id]["count"] = n
317
+ return _cb
318
+ ```
319
+
320
+ Used as:
321
+
322
+ ```python
323
+ saved = s.stream(
324
+ sink=json_or_mongo_sink,
325
+ options=StreamOptions(
326
+ limit=body.limit,
327
+ batch_size=body.limit,
328
+ resume_file="recipes.resume",
329
+ progress_callback=make_progress_cb(job_id),
330
+ ),
331
+ )
332
+ ```
333
+
334
+ ---
335
+
336
+ ## Common pitfalls & fixes
337
+
338
+ * **`ModuleNotFoundError: No module named 'backend'`**
339
+ Run with module path:
340
+ `python3 -m uvicorn backend.app:app --reload`
341
+
342
+ * **Uvicorn not found (`zsh: command not found: uvicorn`)**
343
+ Use: `python3 -m uvicorn ...` or add `~/Library/Python/3.9/bin` to PATH.
344
+
345
+ * **`422 Unprocessable Entity` on `/scrape`**
346
+ In Postman: Body → **raw → JSON** and send:
347
+ `{"site":"yummy","limit":20,"output_type":"json"}`
348
+
349
+ * **Pydantic v2: “non-annotated attribute”**
350
+ Keep globals like `JOBS = {}` **outside** `BaseModel` classes.
351
+
352
+ * **`'int' object is not iterable`**
353
+ Don’t iterate `stream()`—it **returns** an `int`. Use the `progress_callback` if you need live updates.
354
+
355
+ * **`BackgroundTasks` undefined**
356
+ Import from FastAPI:
357
+ `from fastapi import BackgroundTasks`
358
+
359
+ * **Too many commas in ingredients**
360
+ Don’t `.join()` a **string**—only join if it’s a `list[str]`.
361
+
362
+ ---
363
+
364
+ ## Future ideas (nice-to-haves)
365
+
366
+ * Store jobs in Redis for persistence across restarts
367
+ * Add `started_at` / `finished_at` timestamps and durations to jobs
368
+ * Rate-limit per site; cool-down if a scrape ran recently
369
+ * Switch to task queue (Celery/RQ/BullMQ) if you need scale
370
+ * Add `/search` endpoint that calls `$vectorSearch` in MongoDB
371
+
372
+ ---
backend/docs/unified-provider-configuration.md ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Unified Provider Configuration
2
+
3
+ ## Overview
4
+ The Recipe Recommendation Bot now uses a **unified provider approach** where a single `LLM_PROVIDER` setting controls both LLM and embedding models. This eliminates configuration mismatches and simplifies setup.
5
+
6
+ ## Before vs After
7
+
8
+ ### ❌ Previous Approach (Confusing)
9
+ ```bash
10
+ LLM_PROVIDER=huggingface
11
+ EMBEDDING_PROVIDER=openai # 😵 Different providers - causes issues!
12
+ ```
13
+
14
+ ### ✅ New Approach (Simple)
15
+ ```bash
16
+ LLM_PROVIDER=huggingface # 🎯 One setting controls both LLM and embeddings
17
+ ```
18
+
19
+ ## Benefits
20
+
21
+ 1. **Prevents Mismatches**: No more accidentally mixing providers
22
+ 2. **Simplified Configuration**: One setting instead of two
23
+ 3. **Better User Experience**: Less confusion, fewer errors
24
+ 4. **Consistent Performance**: Same provider for both LLM and embeddings
25
+ 5. **Easier Troubleshooting**: Single provider to debug
26
+
27
+ ## Supported Combinations
28
+
29
+ | Provider | LLM Model | Embedding Model |
30
+ |----------|-----------|-----------------|
31
+ | `openai` | `gpt-5-nano` | `text-embedding-3-small` |
32
+ | `google` | `gemini-2.0-flash` | `models/embedding-001` |
33
+ | `huggingface` | `microsoft/DialoGPT-small` | `sentence-transformers/all-MiniLM-L6-v2` |
34
+
35
+ ## Configuration Examples
36
+
37
+ ### OpenAI (Complete Setup)
38
+ ```bash
39
+ LLM_PROVIDER=openai
40
+ OPENAI_API_KEY=your_api_key_here
41
+ OPENAI_MODEL=gpt-5-nano
42
+ OPENAI_EMBEDDING_MODEL=text-embedding-3-small
43
+ ```
44
+
45
+ ### Google (Complete Setup)
46
+ ```bash
47
+ LLM_PROVIDER=google
48
+ GOOGLE_API_KEY=your_api_key_here
49
+ GOOGLE_MODEL=gemini-2.0-flash
50
+ GOOGLE_EMBEDDING_MODEL=models/embedding-001
51
+ ```
52
+
53
+ ### HuggingFace (Complete Setup)
54
+ ```bash
55
+ LLM_PROVIDER=huggingface
56
+ HUGGINGFACE_API_TOKEN=your_token_here
57
+ HUGGINGFACE_MODEL=microsoft/DialoGPT-small
58
+ HUGGINGFACE_USE_GPU=false
59
+ HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
60
+ ```
61
+
62
+ ## Migration Guide
63
+
64
+ If you have an existing `.env` file:
65
+
66
+ 1. **Remove** the `EMBEDDING_PROVIDER` line
67
+ 2. **Keep** the `LLM_PROVIDER` line
68
+ 3. **Ensure** both LLM and embedding model settings are configured for your chosen provider
69
+
70
+ ### Example Migration
71
+ ```bash
72
+ # OLD .env
73
+ LLM_PROVIDER=huggingface
74
+ EMBEDDING_PROVIDER=huggingface # ← Remove this line
75
+
76
+ # NEW .env
77
+ LLM_PROVIDER=huggingface # ← Keep this, it controls both
78
+ ```
79
+
80
+ ## Technical Implementation
81
+
82
+ The configuration system now:
83
+ - Uses `LLM_PROVIDER` for both `get_llm_config()` and `get_embedding_config()`
84
+ - Automatically matches provider types
85
+ - Validates that the provider supports both LLM and embeddings
86
+ - Provides clear error messages for unsupported providers
87
+
88
+ ## Validation
89
+
90
+ You can verify your configuration works:
91
+ ```bash
92
+ cd backend
93
+ python -c "
94
+ from config.settings import settings
95
+ llm = settings.get_llm_config()
96
+ emb = settings.get_embedding_config()
97
+ print(f'LLM: {llm[\"provider\"]}')
98
+ print(f'Embedding: {emb[\"provider\"]}')
99
+ print(f'Match: {llm[\"provider\"] == emb[\"provider\"]}')
100
+ "
101
+ ```
102
+
103
+ Expected output:
104
+ ```
105
+ LLM: huggingface
106
+ Embedding: huggingface
107
+ Match: True
108
+ ```
backend/requirements.txt ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Production Requirements - Core dependencies only
2
+ # Keep this minimal for production deployments
3
+
4
+ # Core API dependencies
5
+ fastapi
6
+ uvicorn[standard]
7
+ python-dotenv
8
+ python-multipart
9
+
10
+ # Data processing
11
+ numpy
12
+ requests
13
+
14
+ # LLM Providers
15
+ openai
16
+ google-generativeai
17
+
18
+ # Vector Store & Embeddings (optional - choose based on needs)
19
+ # Comment out any of these that your deployment does not need
20
+ chromadb
21
+ langchain-mongodb
22
+ pymongo
23
+
24
+ # MongoDB Atlas Vector Store (optional)
25
+ pymongo[srv]
26
+
27
+ # HuggingFace dependencies
28
+ # transformers
29
+ # accelerate
30
+
31
+ # Sentence Transformers - Choose ONE option below:
32
+ # FULL sentence-transformers (easier to use, ~800MB+)
33
+ # sentence-transformers
34
+
35
+ # Note: sentence-transformers will automatically use CPU-only PyTorch if CUDA is not available (easier to use, ~200MB - 300MB )
36
+ # To force CPU-only installation: pip install torch --index-url https://download.pytorch.org/whl/cpu && pip install sentence-transformers
37
+
38
+ # LangChain for RAG
39
+ langchain
40
+ langchain-core
41
+ langchain-text-splitters
42
+ langchain-openai
43
+ langchain-chroma
44
+ langchain_google_genai
45
+ langchain-huggingface
46
+ langchain-community
47
+
48
+ bs4
49
+ lxml
backend/services/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Services package initialization
backend/services/custom_mongo_vector.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Streamlined MongoDB Vector Store with Atlas Vector Search
3
+ """
4
+
5
+ from typing import List, Dict, Any, Optional, NamedTuple
6
+ import numpy as np
7
+ from langchain.schema import Document
8
+ from langchain.vectorstores.base import VectorStore
9
+ from pymongo.collection import Collection
10
+ from backend.config.logging_config import get_logger
11
+
12
+ logger = get_logger("custom_mongo_vector")
13
+
14
+ class VectorSearchOptions(NamedTuple):
15
+ """Configuration options for vector search"""
16
+ index_name: str = "foodInstructionIndex"
17
+ embedding_key: str = "ingredients_emb"
18
+ text_key: str = "title"
19
+ num_candidates: int = 50
20
+ similarity_metric: str = "cosine" # cosine or dotProduct
21
+
22
+ class CustomMongoDBVectorStore(VectorStore):
23
+ """
24
+ Streamlined MongoDB Atlas Vector Store with efficient $vectorSearch aggregation.
25
+ Falls back to Python similarity calculation when Atlas Vector Search is unavailable.
26
+ """
27
+
28
+ def __init__(
29
+ self,
30
+ collection: Collection,
31
+ embedding_function,
32
+ options: Optional[VectorSearchOptions] = None
33
+ ):
34
+ self.collection = collection
35
+ self.embedding_function = embedding_function
36
+ self.options = options or VectorSearchOptions()
37
+
38
+ logger.info(f"🔧 Streamlined MongoDB Vector Store initialized")
39
+ logger.info(f"� Config: {self.options.index_name} index, {self.options.similarity_metric} similarity")
40
+
41
+ def _calculate_similarity(self, vec1: List[float], vec2: List[float]) -> float:
42
+ """Calculate similarity using the most efficient method"""
43
+ a, b = np.array(vec1), np.array(vec2)
44
+
45
+ if self.options.similarity_metric == "dotProduct":
46
+ # Dot product (faster, good for normalized embeddings)
47
+ return float(np.dot(a, b))
48
+ else:
49
+ # Cosine similarity (more robust, handles non-normalized embeddings)
50
+ return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
51
+
52
+ def similarity_search(self, query: str, k: int = 4, **kwargs: Any) -> List[Document]:
53
+ """Streamlined similarity search using Atlas Vector Search with Python fallback"""
54
+ logger.info(f"🔍 Searching: '{query}' (k={k})")
55
+
56
+ qvec = self.embedding_function.embed_query(query)
57
+
58
+ # Primary: Try Atlas Vector Search (efficient, server-side)
59
+ try:
60
+ pipeline = [
61
+ {
62
+ "$vectorSearch": {
63
+ "index": self.options.index_name,
64
+ "path": self.options.embedding_key,
65
+ "queryVector": qvec,
66
+ "numCandidates": self.options.num_candidates,
67
+ "limit": k
68
+ }
69
+ },
70
+ {
71
+ "$match": {
72
+ '$or': [
73
+ { 'needs_review': { '$exists': False } },
74
+ { 'needs_review': False }
75
+ ]
76
+ }
77
+ }
78
+ ]
79
+
80
+ results = list(self.collection.aggregate(pipeline))
81
+ if results:
82
+ logger.info(f"✅ Atlas Vector Search: {len(results)} results")
83
+ return self._create_documents(results)
84
+
85
+ except Exception as e:
86
+ logger.warning(f"⚠️ Atlas Vector Search failed: {e}")
87
+
88
+ # Fallback: Python similarity calculation
89
+ logger.info("🔄 Using Python similarity fallback")
90
+ return self._python_similarity_search(qvec, k)
91
+
92
+ def _python_similarity_search(self, qvec: List[float], k: int) -> List[Document]:
93
+ """Efficient Python-based similarity search fallback"""
94
+ cursor = self.collection.find(
95
+ {'$or': [
96
+ {'needs_review': {'$exists': False}},
97
+ {'needs_review': False}
98
+ ]},
99
+ {self.options.text_key: 1, self.options.embedding_key: 1, "ingredients": 1, "instructions": 1}
100
+ )
101
+
102
+ # Vectorized similarity calculation for efficiency
103
+ similarities = []
104
+ for doc in cursor:
105
+ doc_emb = doc.get(self.options.embedding_key)
106
+ if doc_emb and len(doc_emb) == len(qvec):
107
+ score = self._calculate_similarity(qvec, doc_emb)
108
+ similarities.append((doc, score))
109
+
110
+ # Return top-k results
111
+ similarities.sort(key=lambda x: x[1], reverse=True)
112
+ top_docs = [doc for doc, _ in similarities[:k]]
113
+
114
+ logger.info(f"📊 Python fallback: {len(similarities)} processed, {len(top_docs)} returned")
115
+ return self._create_documents(top_docs)
116
+
117
+ def _create_documents(self, docs: List[Dict]) -> List[Document]:
118
+ """Create LangChain Documents from MongoDB results using clean string content"""
119
+ documents = []
120
+ for doc in docs:
121
+ title = doc.get(self.options.text_key, "Untitled Recipe")
122
+ ingredients = doc.get("ingredients", "")
123
+ instructions = doc.get("instructions", "")
124
+
125
+ # Build clean content without extra formatting
126
+ content_parts = [f"Recipe: {title}"]
127
+
128
+ if ingredients:
129
+ content_parts.append(f"Ingredients: {ingredients}")
130
+
131
+ if instructions:
132
+ content_parts.append(f"Instructions: {instructions}")
133
+
134
+ content = "\n\n".join(content_parts)
135
+
136
+ documents.append(Document(
137
+ page_content=content,
138
+ metadata={"_id": str(doc["_id"]), "title": title}
139
+ ))
140
+
141
+ return documents
142
+
143
+ def similarity_search_with_score(self, query: str, k: int = 4, **kwargs: Any) -> List[tuple]:
144
+ """Return docs with similarity scores (simplified)"""
145
+ docs = self.similarity_search(query, k, **kwargs)
146
+ return [(doc, 1.0) for doc in docs] # Atlas Vector Search doesn't return raw scores
147
+ def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None, **kwargs: Any) -> List[str]:
148
+ """Read-only vector store - adding texts not supported"""
149
+ raise NotImplementedError("This vector store is read-only for pre-existing embeddings")
150
+
151
+ @classmethod
152
+ def from_texts(cls, texts: List[str], embedding_function, metadatas: Optional[List[dict]] = None, **kwargs: Any):
153
+ """Read-only vector store - creating from texts not supported"""
154
+ raise NotImplementedError("This vector store is read-only for pre-existing embeddings")
backend/services/llm_service.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LLM Service - RAG pipeline using ConversationalRetrievalChain
2
+ from typing import List, Dict, Any, Optional
3
+
4
+ # Local imports
5
+ from backend.config.settings import settings
6
+ from backend.config.logging_config import get_logger
7
+ from backend.services.vector_store import vector_store_service
8
+
9
+ # Setup logging
10
+ logger = get_logger("llm_service")
11
+
12
class LLMService:
    """LLM service using ConversationalRetrievalChain for RAG pipeline.

    At construction time this wires together:
      - an LLM chosen by ``settings.get_llm_config()`` (openai / google /
        ollama / huggingface branches below),
      - a retriever from the global ``vector_store_service``,
      - a ``ConversationBufferMemory`` holding chat history,
      - a ``ConversationalRetrievalChain`` combining all three.

    Construction fails fast: any setup error is logged and re-raised.
    """

    def __init__(self):
        logger.info("🤖 Initializing LLM Service...")

        try:
            self.llm = self._setup_llm()
            self.retriever = self._setup_retriever()
            self.memory = self._setup_memory()
            self.qa_chain = self._setup_qa_chain()

            logger.info("🚀 LLM Service initialized successfully")

        except Exception as e:
            # Re-raise so the app never runs with a half-initialized service
            logger.error(f"❌ LLM Service initialization failed: {str(e)}", exc_info=True)
            raise

    def _setup_llm(self):
        """Setup LLM based on configuration with conditional imports.

        Each provider branch imports its LangChain integration lazily so only
        the packages for the configured provider need to be installed.

        Raises:
            ImportError: if the selected provider's package is not installed.
        """
        llm_config = settings.get_llm_config()
        provider = llm_config["provider"]

        logger.info(f"🔧 Setting up LLM provider: {provider}")

        if provider == "openai":
            try:
                from langchain_openai import ChatOpenAI
                logger.info("✅ OpenAI LLM imported successfully")

                # Handle special cases for temperature restrictions
                temperature = llm_config["temperature"]
                model = llm_config["model"]
                max_tokens = llm_config.get("max_tokens", 1000)

                # GPT-5-nano has temperature restrictions (defaults to 1.0)
                if "gpt-5-nano" in model.lower():
                    temperature = 1.0
                    logger.info(f"🔧 Using temperature=1.0 for {model} (model restriction)")

                # Log token configuration
                logger.info(f"🎯 OpenAI config - Model: {model}, Output tokens: {max_tokens}, Temperature: {temperature}")

                return ChatOpenAI(
                    api_key=llm_config["api_key"],
                    model=model,
                    temperature=temperature,
                    max_tokens=max_tokens  # This limits OUTPUT tokens only
                )
            except ImportError as e:
                logger.error(f"❌ OpenAI LLM not available: {e}")
                raise ImportError("OpenAI provider selected but langchain_openai not installed")

        elif provider == "google":
            try:
                from langchain_google_genai import ChatGoogleGenerativeAI
                logger.info("✅ Google LLM imported successfully")

                max_output_tokens = llm_config.get("max_tokens", 1000)
                model = llm_config["model"]
                temperature = llm_config["temperature"]

                # Log token configuration
                logger.info(f"🎯 Google config - Model: {model}, Output tokens: {max_output_tokens}, Temperature: {temperature}")

                return ChatGoogleGenerativeAI(
                    google_api_key=llm_config["api_key"],
                    model=model,
                    temperature=temperature,
                    max_output_tokens=max_output_tokens  # This limits OUTPUT tokens only
                )
            except ImportError as e:
                logger.error(f"❌ Google LLM not available: {e}")
                raise ImportError("Google provider selected but langchain_google_genai not installed")

        elif provider == "ollama":
            try:
                from langchain_community.llms import Ollama
                logger.info("✅ Ollama LLM imported successfully")
                return Ollama(
                    base_url=llm_config["base_url"],
                    model=llm_config["model"],
                    temperature=llm_config["temperature"]
                )
            except ImportError as e:
                logger.error(f"❌ Ollama LLM not available: {e}")
                raise ImportError("Ollama provider selected but langchain_community not installed")

        elif provider == "huggingface":
            try:
                # Check if we should use API or local pipeline
                use_api = llm_config.get("use_api", False)

                if use_api:
                    # Use HuggingFace Inference API with better error handling
                    try:
                        from langchain_huggingface import HuggingFaceEndpoint
                        logger.info("✅ Using HuggingFace API (no local download)")

                        return HuggingFaceEndpoint(
                            repo_id=llm_config["model"],
                            huggingfacehub_api_token=llm_config["api_token"],
                            temperature=0.7,  # HuggingFace API doesn't support dynamic temperature from config
                            max_new_tokens=200,
                            repetition_penalty=1.1,
                            top_p=0.9
                        )
                    except Exception as api_error:
                        logger.warning(f"⚠️ HuggingFace API failed: {api_error}")
                        logger.info("🔄 Falling back to HuggingFace Hub API...")

                        # Fallback to HuggingFaceHub (older but more reliable)
                        try:
                            from langchain_community.llms import HuggingFaceHub

                            return HuggingFaceHub(
                                repo_id=llm_config["model"],
                                huggingfacehub_api_token=llm_config["api_token"],
                                model_kwargs={
                                    "temperature": 0.7,  # HuggingFace Hub API has limited temperature control
                                    "max_new_tokens": 200,
                                    "repetition_penalty": 1.1,
                                    "top_p": 0.9,
                                    "do_sample": True
                                }
                            )
                        except Exception as hub_error:
                            logger.error(f"❌ HuggingFace Hub also failed: {hub_error}")
                            raise ImportError(f"Both HuggingFace API methods failed: {api_error}, {hub_error}")
                else:
                    # Use local pipeline (downloads model)
                    from langchain_huggingface import HuggingFacePipeline
                    from transformers import pipeline

                    logger.info("✅ Using HuggingFace local pipeline")

                    # Create HuggingFace pipeline - avoid device_map for CPU-only setups
                    pipeline_kwargs = {
                        "task": "text-generation",
                        "model": llm_config["model"],
                        "max_length": 512,  # Increase max length
                        "do_sample": True,  # Enable sampling for better responses
                        "temperature": 0.7,  # Local pipeline uses default 0.7 for stability
                        "pad_token_id": 50256,  # Set pad token to avoid warnings
                        "eos_token_id": 50256,  # Set end of sequence token
                    }

                    # Only add device_map if using GPU
                    if llm_config.get("use_gpu", False):
                        pipeline_kwargs["device_map"] = "auto"
                    else:
                        # Explicitly pin the pipeline to CPU
                        pipeline_kwargs["device"] = "cpu"

                    hf_pipeline = pipeline(**pipeline_kwargs)

                    return HuggingFacePipeline(
                        pipeline=hf_pipeline,
                        model_kwargs={
                            "temperature": 0.7,  # Local pipeline temperature (limited configurability)
                            "max_new_tokens": 150,  # Reduced for efficiency
                            "do_sample": True,
                            "top_p": 0.9,
                            "repetition_penalty": 1.1,
                            "early_stopping": True,
                            "num_beams": 4  # Better quality for instruction following
                        }
                    )
            except ImportError as e:
                logger.error(f"❌ HuggingFace LLM not available: {e}")
                raise ImportError("HuggingFace provider selected but required packages not installed")

        else:
            # Unknown provider: best-effort fallback to a default OpenAI client
            logger.warning(f"⚠️ Unknown LLM provider '{provider}', falling back to OpenAI")
            try:
                from langchain_openai import ChatOpenAI
                return ChatOpenAI()
            except ImportError:
                logger.error("❌ No valid LLM provider available")
                raise ImportError("No valid LLM provider available")

    def _setup_retriever(self):
        """Setup retriever from vector store service"""
        return vector_store_service.get_retriever()

    def _setup_memory(self):
        """Setup conversation memory (buffer of full chat history)"""
        try:
            from langchain.memory import ConversationBufferMemory
            return ConversationBufferMemory(memory_key='chat_history', return_messages=True)
        except ImportError as e:
            logger.error(f"❌ ConversationBufferMemory not available: {e}")
            raise ImportError("langchain memory not available")

    def _setup_qa_chain(self):
        """Setup ConversationalRetrievalChain from the LLM, retriever and memory"""
        try:
            from langchain.chains import ConversationalRetrievalChain
            return ConversationalRetrievalChain.from_llm(
                llm=self.llm,
                retriever=self.retriever,
                memory=self.memory,
                verbose=settings.LANGCHAIN_DEBUG  # Chain verbosity follows the debug setting
            )
        except ImportError as e:
            logger.error(f"❌ ConversationalRetrievalChain not available: {e}")
            raise ImportError("langchain chains not available")

    def _preprocess_query(self, question: str) -> str:
        """Preprocess user query to improve vector search accuracy.

        Lowercases, drops common stop words, strips punctuation and collapses
        whitespace. Returns the cleaned query string.
        """
        import re

        # Convert to lowercase for consistency
        processed = question.lower()

        # Remove common stop words that don't help with recipe matching
        stop_words = ['i', 'want', 'a', 'an', 'the', 'for', 'with', 'can', 'you', 'give', 'me', 'please', 'help']
        words = processed.split()
        words = [word for word in words if word not in stop_words]

        # Remove punctuation except spaces
        processed = ' '.join(words)
        processed = re.sub(r'[^\w\s]', '', processed)

        # Normalize multiple spaces
        processed = ' '.join(processed.split())

        logger.debug(f"🔧 Query preprocessing: '{question}' → '{processed}'")
        return processed

    def ask_question(self, user_question: str) -> str:
        """Ask a question using the conversational retrieval chain.

        Retrieves documents for both the raw and the preprocessed query (for
        token accounting), then runs the chain on an enhanced prompt. On any
        failure an apologetic error string is returned instead of raising.
        """
        logger.info(f"❓ Processing: '{user_question[:60]}...'")

        try:
            # Preprocess query for better matching
            processed_query = self._preprocess_query(user_question)

            # Get context for token tracking
            document_retriever = getattr(self.qa_chain, 'retriever', None)
            retrieved_context = ""
            if document_retriever:
                # Use both queries for comprehensive results
                original_docs = document_retriever.invoke(user_question)
                processed_docs = document_retriever.invoke(processed_query)

                # Deduplicate documents by exact page content
                seen_content = set()
                unique_documents = []
                for document in original_docs + processed_docs:
                    if document.page_content not in seen_content:
                        unique_documents.append(document)
                        seen_content.add(document.page_content)

                retrieved_context = "\n".join([doc.page_content for doc in unique_documents[:8]])
                logger.debug(f"📄 Retrieved {len(unique_documents)} unique documents")

            # Enhanced question for natural responses
            enhanced_question = f"""Based on the available recipe information, please answer this cooking question: "{user_question}"

Respond directly and naturally as if you're sharing your own culinary knowledge. If there's a specific recipe that matches the request, share the complete recipe with ingredients and step-by-step instructions in a friendly, conversational way."""

            # NOTE(review): calling the chain directly is deprecated in newer
            # LangChain releases; .invoke({...}) is the modern equivalent —
            # confirm the installed version before changing.
            result = self.qa_chain({"question": enhanced_question})
            generated_answer = result["answer"]

            self._log_token_usage(user_question, retrieved_context, generated_answer)

            logger.info(f"✅ Response generated ({len(generated_answer)} chars)")
            return generated_answer

        except Exception as error:
            logger.error(f"❌ Error in ask_question: {str(error)}")
            return f"Sorry, I encountered an error: {str(error)}"

    def _count_tokens(self, text: str) -> int:
        """Count tokens in text (rough ~4-chars-per-token estimate for debugging)"""
        return len(text) // 4 if text else 0

    def _log_token_usage(self, question: str, context: str, response: str) -> Dict[str, int]:
        """Log estimated token usage for monitoring and return the counts."""
        question_tokens = self._count_tokens(question)
        context_tokens = self._count_tokens(context)
        response_tokens = self._count_tokens(response)
        total_input_tokens = question_tokens + context_tokens

        logger.info(f"📊 Token Usage - Input:{total_input_tokens} (Q:{question_tokens}+C:{context_tokens}), Output:{response_tokens}")

        if context_tokens > 3000:
            logger.warning(f"⚠️ Large context detected: {context_tokens} tokens")

        return {
            "input_tokens": total_input_tokens,
            "output_tokens": response_tokens,
            "total_tokens": total_input_tokens + response_tokens
        }

    def clear_memory(self):
        """Clear conversation memory; returns True on success, False otherwise."""
        try:
            if hasattr(self.memory, 'clear'):
                self.memory.clear()
            logger.info("✅ Memory cleared")
            return True
        except Exception as e:
            logger.warning(f"⚠️ Could not clear memory: {e}")
            return False

    def simple_chat_completion(self, user_message: str) -> str:
        """Simple chat completion without RAG - direct LLM response.

        Handles both chat models (objects with ``.content``) and plain string
        LLMs, validates the answer, and truncates overly long replies.
        """
        logger.info(f"💭 Simple chat: '{user_message[:50]}...'")

        try:
            llm_prompt = f"As a knowledgeable cooking expert, share your insights about {user_message}. Provide helpful culinary advice and recommendations:\n\n"

            llm_response = self.llm.invoke(llm_prompt) if hasattr(self.llm, 'invoke') else self.llm(llm_prompt)

            # Extract content based on response type
            if hasattr(llm_response, 'content'):
                generated_answer = llm_response.content
            elif isinstance(llm_response, str):
                # Some completion LLMs echo the prompt; strip it if present
                generated_answer = llm_response.replace(llm_prompt, "").strip() if llm_prompt in llm_response else llm_response
            else:
                generated_answer = str(llm_response)

            # Validate and clean response
            generated_answer = generated_answer.strip()
            if not generated_answer or len(generated_answer) < 10:
                generated_answer = "I'd be happy to help with recipes! Ask me about specific ingredients or dishes."

            # Limit response length
            if len(generated_answer) > 300:
                answer_sentences = generated_answer.split('. ')
                generated_answer = '. '.join(answer_sentences[:2]) + '.' if len(answer_sentences) > 1 else generated_answer[:300]

            logger.info(f"✅ Response generated ({len(generated_answer)} chars)")
            return generated_answer

        except Exception as error:
            logger.error(f"❌ Simple chat completion error: {str(error)}")
            return f"Sorry, I encountered an error: {str(error)}"

# Create global LLM service instance (module-level singleton; builds on import)
llm_service = LLMService()
backend/services/vector_store.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Vector Store Service - Simple setup for retriever use
2
+ import json
3
+ import os
4
+ import shutil
5
+ from typing import List, Dict, Any, Optional
6
+ from pathlib import Path
7
+
8
+ # Core LangChain imports (always needed)
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.schema import Document
11
+
12
+ # Local imports
13
+ from backend.config.settings import settings
14
+ from backend.config.database import db_settings
15
+ from backend.config.logging_config import get_logger
16
+
17
+ # MongoDB imports
18
+ from pymongo import MongoClient
19
+ from backend.services.custom_mongo_vector import CustomMongoDBVectorStore, VectorSearchOptions
20
+
21
+ # Setup logging
22
+ logger = get_logger("vector_store")
23
+
24
+ class VectorStoreService:
25
+ """Simple vector store service - creates or retrieves vector store for retriever use"""
26
+
27
    def __init__(self):
        """Eagerly build the embeddings provider and the vector store.

        Raises:
            Exception: any setup failure is logged with a full traceback and
            re-raised, so callers never see a partially initialized service.
        """
        logger.info("📚 Initializing Vector Store Service...")

        try:
            self.embeddings = self._get_embeddings()
            logger.info("✅ Embeddings setup completed")

            self.vector_store = self._get_or_create_vector_store()
            logger.info("✅ Vector store setup completed")

            logger.info("🚀 Vector Store Service initialization successful")

        except Exception as e:
            logger.error(f"❌ Vector Store Service initialization failed: {str(e)}", exc_info=True)
            raise
42
+
43
    def _get_embeddings(self):
        """Get embeddings provider based on configuration with conditional imports.

        The provider name comes from ``settings.get_embedding_config()``; each
        branch imports its LangChain integration lazily so only the configured
        provider's packages need to be installed.

        Raises:
            ImportError: if the selected provider's package is not installed.
        """
        embedding_config = settings.get_embedding_config()
        provider = embedding_config["provider"]

        logger.info(f"🔧 Setting up embeddings provider: {provider}")

        if provider == "openai":
            try:
                from langchain_openai import OpenAIEmbeddings
                logger.info("✅ OpenAI embeddings imported successfully")
                return OpenAIEmbeddings(
                    openai_api_key=embedding_config["api_key"],
                    model=embedding_config["model"]
                )
            except ImportError as e:
                logger.error(f"❌ OpenAI embeddings not available: {e}")
                raise ImportError("OpenAI provider selected but langchain_openai not installed")

        elif provider == "google":
            try:
                from langchain_google_genai import GoogleGenerativeAIEmbeddings
                logger.info("✅ Google embeddings imported successfully")
                return GoogleGenerativeAIEmbeddings(
                    google_api_key=embedding_config["api_key"],
                    model=embedding_config["model"]
                )
            except ImportError as e:
                logger.error(f"❌ Google embeddings not available: {e}")
                raise ImportError("Google provider selected but langchain_google_genai not installed")

        elif provider == "huggingface":
            try:
                # Try modern langchain-huggingface first
                from langchain_huggingface import HuggingFaceEmbeddings
                logger.info("✅ HuggingFace embeddings imported successfully")
                return HuggingFaceEmbeddings(
                    model_name=embedding_config["model"]
                )
            except ImportError:
                try:
                    # Fallback to sentence-transformers directly
                    from sentence_transformers import SentenceTransformer
                    logger.warning("⚠️ Using sentence-transformers directly (langchain-huggingface not available)")
                    # Return a wrapper that mimics the embeddings interface
                    return self._create_sentence_transformer_wrapper(embedding_config["model"])
                except ImportError as e:
                    logger.error(f"❌ HuggingFace embeddings not available: {e}")
                    logger.error("💡 To fix this, install sentence-transformers: pip install sentence-transformers")
                    raise ImportError("HuggingFace provider selected but sentence-transformers not installed. Run: pip install sentence-transformers")

        elif provider == "ollama":
            try:
                from langchain_community.embeddings import OllamaEmbeddings
                logger.info("✅ Ollama embeddings imported successfully")
                return OllamaEmbeddings(
                    base_url=embedding_config["base_url"],
                    model=embedding_config["model"]
                )
            except ImportError as e:
                logger.error(f"❌ Ollama embeddings not available: {e}")
                raise ImportError("Ollama provider selected but langchain_community not installed")

        else:
            # Unknown provider: best-effort fallback to a default OpenAI client
            logger.warning(f"⚠️ Unknown embedding provider '{provider}', falling back to OpenAI")
            try:
                from langchain_openai import OpenAIEmbeddings
                return OpenAIEmbeddings()
            except ImportError:
                logger.error("❌ No valid embedding provider available")
                raise ImportError("No valid embedding provider available")
114
+
115
+ def _create_sentence_transformer_wrapper(self, model_name):
116
+ """Create a simple wrapper for sentence-transformers to work with LangChain"""
117
+ from sentence_transformers import SentenceTransformer
118
+
119
+ class SentenceTransformerWrapper:
120
+ def __init__(self, model_name):
121
+ self.model = SentenceTransformer(model_name)
122
+
123
+ def encode(self, texts):
124
+ return self.model.encode(texts).tolist()
125
+
126
+ def embed_query(self, text):
127
+ return self.model.encode([text])[0].tolist()
128
+
129
+ return SentenceTransformerWrapper(model_name)
130
+
131
    def _get_or_create_vector_store(self):
        """Get or create vector store with conditional imports.

        Provider comes from ``db_settings.get_vector_store_config()``:
          - "chromadb": load an existing persisted DB, optionally refresh it
            first, or build a new one from documents on disk;
          - "mongodb": connect to Atlas and wrap an existing collection with
            pre-computed embeddings;
          - anything else: fall back to an empty local ChromaDB.

        Raises:
            ImportError: if the selected provider's packages are missing.
            ConnectionError: if the MongoDB Atlas connection fails.
        """
        db_config = db_settings.get_vector_store_config()
        provider = db_config["provider"]

        if provider == "chromadb":
            try:
                from langchain_chroma import Chroma

                persist_dir = Path(db_config["persist_directory"])
                collection_name = db_config["collection_name"]
                refresh_on_start = db_config.get("refresh_on_start", False)

                # Check if refresh is requested: wipe the on-disk DB so it is
                # rebuilt from source documents below
                if refresh_on_start and persist_dir.exists():
                    logger.info(f"🔄 CHROMADB_REFRESH_ON_START=true - Deleting existing ChromaDB at {persist_dir}")
                    shutil.rmtree(persist_dir)
                    logger.info(f"✅ Existing ChromaDB deleted successfully")

                # Check if persisted database exists
                if persist_dir.exists() and any(persist_dir.iterdir()):
                    logger.info(f"📂 Loading existing ChromaDB from {persist_dir}")
                    return Chroma(
                        collection_name=collection_name,
                        embedding_function=self.embeddings,
                        persist_directory=str(persist_dir)
                    )
                else:
                    # Create new vector store with documents
                    logger.info(f"🆕 Creating new ChromaDB at {persist_dir}")
                    documents = self._load_documents_from_folder()

                    if documents:
                        vector_store = Chroma.from_documents(
                            documents=documents,
                            embedding=self.embeddings,
                            collection_name=collection_name,
                            persist_directory=str(persist_dir)
                        )
                        logger.info(f"✅ Created ChromaDB with {len(documents)} document chunks")
                        return vector_store
                    else:
                        logger.info("📝 No documents found, creating empty ChromaDB")
                        return Chroma(
                            collection_name=collection_name,
                            embedding_function=self.embeddings,
                            persist_directory=str(persist_dir)
                        )
            except ImportError as e:
                logger.error(f"❌ ChromaDB not available: {e}")
                raise ImportError("ChromaDB provider selected but langchain_chroma not installed")

        elif provider == "mongodb":
            try:
                logger.info("🔗 Setting up MongoDB Atlas connection...")
                client = MongoClient(db_config["uri"])
                client.admin.command('ping')  # fail fast if unreachable
                logger.info(f"✅ MongoDB Atlas connection verified")
                # NOTE(review): debug print — consider removing or demoting to logger.debug
                print(client.list_database_names())
                # Get the collection
                database = client[db_config["database"]]
                collection = database[db_config["collection_name"]]
                # Create streamlined vector store with Atlas Vector Search
                options = VectorSearchOptions(
                    index_name=db_config.get("index_name", "vector_index"),
                    embedding_key=db_config.get("vector_field", "ingredients_emb"),
                    text_key="title",
                    num_candidates=db_config.get("num_candidates", 50),
                    similarity_metric=db_config.get("similarity_metric", "cosine")
                )

                vector_store = CustomMongoDBVectorStore(
                    collection=collection,
                    embedding_function=self.embeddings,
                    options=options
                )

                logger.info(f"✅ Custom MongoDB Vector Store created successfully")
                logger.info("🎯 Using pre-existing embeddings without requiring vector search index")
                return vector_store

            except ImportError as e:
                logger.error(f"❌ MongoDB packages not available: {e}")
                raise ImportError("MongoDB provider selected but langchain-mongodb not installed. Run: pip install langchain-mongodb pymongo")
            except Exception as e:
                logger.error(f"❌ MongoDB Atlas connection failed: {e}")
                raise ConnectionError(f"Failed to connect to MongoDB Atlas: {e}")

        else:
            # Unknown provider: keep the service usable with a local fallback DB
            logger.warning(f"⚠️ Unknown vector store provider '{provider}', falling back to ChromaDB")
            try:
                from langchain_chroma import Chroma
                return Chroma(
                    collection_name="fallback_collection",
                    embedding_function=self.embeddings,
                    persist_directory="./vector_store/fallback_chroma"
                )
            except ImportError:
                logger.error("❌ No valid vector store provider available")
                raise ImportError("No valid vector store provider available")
231
+
232
+ def _load_documents_from_folder(self, folder_path: str = "./data/recipes") -> List[Document]:
233
+ """Load and chunk all documents from folder with UTF-8 encoding, fallback to sample data"""
234
+ logger.info(f"📄 Loading documents from: {folder_path}")
235
+
236
+ documents = []
237
+ folder = Path(folder_path)
238
+
239
+ # Check if folder exists and has files
240
+ has_recipe_files = False
241
+ if folder.exists():
242
+ # Check if there are any files in the recipes folder
243
+ recipe_files = list(folder.rglob("*"))
244
+ has_recipe_files = any(f.is_file() and f.stat().st_size > 0 for f in recipe_files)
245
+
246
+ # If no recipe files found, use sample data
247
+ if not has_recipe_files:
248
+ logger.info(f"📭 No recipe files found in {folder_path}, using sample data")
249
+ folder_path = "./data" # Use data folder where sample_recipes.json is located
250
+ folder = Path(folder_path)
251
+
252
+ if not folder.exists():
253
+ logger.error(f"❌ Folder does not exist: {folder.absolute()}")
254
+ return documents
255
+
256
+ # Text splitter for chunking
257
+ text_splitter = RecursiveCharacterTextSplitter(
258
+ chunk_size=1000,
259
+ chunk_overlap=200
260
+ )
261
+
262
+ # Process all text-based files uniformly
263
+ for file_path in folder.rglob("*"):
264
+ if file_path.is_file():
265
+ try:
266
+ # Read file content with UTF-8 encoding
267
+ with open(file_path, 'r', encoding='utf-8') as f:
268
+ content = f.read()
269
+
270
+ # Skip empty files
271
+ if not content.strip():
272
+ continue
273
+
274
+ # Handle JSON files specially to format them properly
275
+ if file_path.suffix.lower() == '.json':
276
+ formatted_content = self._format_json_recipes(content, file_path)
277
+ if formatted_content:
278
+ content = formatted_content
279
+
280
+ # Split content into chunks using text splitter
281
+ chunks = text_splitter.split_text(content)
282
+
283
+ # Create documents for each chunk
284
+ for i, chunk in enumerate(chunks):
285
+ documents.append(Document(
286
+ page_content=chunk,
287
+ metadata={
288
+ "source": str(file_path),
289
+ "filename": file_path.name,
290
+ "chunk_index": i,
291
+ "file_type": file_path.suffix
292
+ }
293
+ ))
294
+
295
+ except Exception as e:
296
+ logger.error(f"❌ Error loading {file_path}: {e}")
297
+ continue
298
+
299
+ logger.info(f"✅ Loaded and chunked {len(documents)} document segments")
300
+ return documents
301
+
302
+ def _format_json_recipes(self, json_content: str, file_path: Path) -> str:
303
+ """Format JSON recipe data into readable text format similar to MongoDB output"""
304
+ try:
305
+ import json
306
+ recipes = json.loads(json_content)
307
+
308
+ # Handle both single recipe object and array of recipes
309
+ if isinstance(recipes, dict):
310
+ recipes = [recipes]
311
+ elif not isinstance(recipes, list):
312
+ logger.warning(f"⚠️ Unexpected JSON structure in {file_path}")
313
+ return None
314
+
315
+ formatted_recipes = []
316
+
317
+ for recipe in recipes:
318
+ if not isinstance(recipe, dict):
319
+ continue
320
+
321
+ # Extract recipe components
322
+ title = recipe.get("title", "Untitled Recipe")
323
+ ingredients = recipe.get("ingredients", [])
324
+ instructions = recipe.get("instructions", "")
325
+
326
+ # Format similar to MongoDB output
327
+ formatted_content = f"Recipe: {title}\n"
328
+
329
+ if ingredients:
330
+ if isinstance(ingredients, list):
331
+ formatted_content += f"Ingredients: {', '.join(ingredients)}\n"
332
+ else:
333
+ formatted_content += f"Ingredients: {ingredients}\n"
334
+
335
+ if instructions:
336
+ # Handle both string and list instructions
337
+ if isinstance(instructions, list):
338
+ formatted_content += f"Instructions: {' '.join(instructions)}"
339
+ else:
340
+ formatted_content += f"Instructions: {instructions}"
341
+
342
+ # Add metadata if available
343
+ metadata = recipe.get("metadata", {})
344
+ if metadata:
345
+ formatted_content += f"\n"
346
+ for key, value in metadata.items():
347
+ if key in ["cook_time", "difficulty", "servings", "category"]:
348
+ formatted_content += f"{key.replace('_', ' ').title()}: {value}\n"
349
+
350
+ formatted_recipes.append(formatted_content)
351
+
352
+ # Join all recipes with double newlines
353
+ result = "\n\n".join(formatted_recipes)
354
+ logger.info(f"✅ Formatted {len(recipes)} JSON recipes from {file_path.name}")
355
+ return result
356
+
357
+ except json.JSONDecodeError as e:
358
+ logger.error(f"❌ Invalid JSON in {file_path}: {e}")
359
+ return None
360
+ except Exception as e:
361
+ logger.error(f"❌ Error formatting JSON recipes from {file_path}: {e}")
362
+ return None
363
+
364
+ def get_retriever(self):
365
+ """Get retriever for use with ConversationalRetrievalChain"""
366
+ logger.info("🔍 Creating retriever from vector store...")
367
+
368
+ # For both ChromaDB and MongoDB Atlas, create standard retriever
369
+ retriever = self.vector_store.as_retriever()
370
+
371
+ # Configure search parameters based on provider
372
+ if hasattr(self.vector_store, '__class__'):
373
+ class_name = self.vector_store.__class__.__name__
374
+ if 'MongoDB' in class_name:
375
+ # MongoDB Atlas configuration
376
+ retriever.search_kwargs = {"k": 5}
377
+ logger.info("🔍 MongoDB Atlas retriever configured with k=5")
378
+ else:
379
+ # ChromaDB configuration
380
+ retriever.search_kwargs = {"k": 5}
381
+ logger.info("🔍 ChromaDB retriever configured with k=5")
382
+
383
+ return retriever
384
+
385
# Create global vector store service instance
# NOTE(review): module-level singleton — constructed at import time; confirm
# VectorStoreService.__init__ has no side effects that are unsafe on import.
vector_store_service = VectorStoreService()
backend/tests/__init__.py ADDED
File without changes
backend/tests/test_db_settings.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ import os
3
+ from unittest.mock import patch
4
+ from backend.config.database import db_settings
5
+
6
class TestMongoAtlasSettings(unittest.TestCase):
    """Vector-store config checks for the MongoDB provider."""

    @patch.dict(os.environ, {
        'VECTOR_STORE_PROVIDER': 'mongodb'
    })
    def test_mongo_settings(self):
        """
        Test mongodb config contains expected fields when VECTOR_STORE_PROVIDER is set
        """
        # reinitialize db_settings instance so the patched env var is re-read
        db_settings.__init__()

        db_config = db_settings.get_vector_store_config()
        # assertIn reports the missing key and the dict on failure,
        # unlike assertTrue(key in dict) which only reports "False".
        self.assertIn('collection_name', db_config)
        self.assertIn('index_name', db_config)
        self.assertIn('text_field', db_config)
23
class TestChromaDBSettings(unittest.TestCase):
    """Vector-store config checks for the ChromaDB provider."""

    @patch.dict(os.environ, {
        'VECTOR_STORE_PROVIDER': 'chromadb'
    })
    def test_chromadb_settings(self):
        """
        Test chromadb config contains expected fields when VECTOR_STORE_PROVIDER is set
        """
        # reinitialize db_settings instance so the patched env var is re-read
        db_settings.__init__()

        db_config = db_settings.get_vector_store_config()
        # Renamed from copy-pasted "test_mongo_settings"; assertIn for
        # diagnosable failure messages.
        self.assertIn('collection_name', db_config)
        self.assertIn('persist_directory', db_config)
        self.assertIn('refresh_on_start', db_config)
40
class TestInvalidDBSettings(unittest.TestCase):
    """Vector-store config checks for an unsupported provider."""

    @patch.dict(os.environ, {
        'VECTOR_STORE_PROVIDER': 'postgres'
    })
    def test_invalid_db_settings(self):
        """
        Test invalid db config raises correct error
        """
        # reinitialize db_settings instance so the patched env var is re-read
        db_settings.__init__()

        # Renamed from copy-pasted "test_mongo_settings".
        with self.assertRaisesRegex(ValueError, "Unsupported"):
            db_settings.get_vector_store_config()
backend/tests/test_llm_provider_settings.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ import os
3
+ from unittest.mock import patch
4
+ from backend.config.settings import settings
5
+
6
class TestValidLLMProviderSettings(unittest.TestCase):
    """LLM config resolution when LLM_PROVIDER selects a supported provider."""

    @patch.dict(os.environ, {
        'GOOGLE_API_KEY': 'ivq',
        'GOOGLE_MAX_TOKENS': '1200',
        'OPENAI_API_KEY': 'xyz',
        'OPENAI_MAX_TOKENS': '3800',
        'LLM_PROVIDER': 'openai'
    })
    def test_valid_llm_provider_settings(self):
        """
        Test settings object provides correct config when LLM_PROVIDER is set
        """
        # Re-run __init__ so the settings singleton re-reads the patched env.
        settings.__init__()

        config = settings.get_llm_config()
        # With LLM_PROVIDER=openai, the OpenAI credentials/limits must win
        # over the Google ones that are also present in the environment.
        self.assertEqual(config['api_key'], 'xyz')
        self.assertEqual(config['max_tokens'], 3800)
26
class TestInvalidLLMProviderSettings(unittest.TestCase):
    """LLM config resolution when LLM_PROVIDER names an unknown provider."""

    @patch.dict(os.environ, {
        'LLM_PROVIDER': 'microsoft'
    })
    def test_invalid_llm_provider_settings(self):
        """
        Test that improper provider config raises the right error
        """
        # Re-run __init__ so the settings singleton re-reads the patched env.
        settings.__init__()

        # An unknown provider must surface as a ValueError mentioning
        # "Unsupported" rather than silently falling back.
        with self.assertRaisesRegex(ValueError, "Unsupported"):
            settings.get_llm_config()
backend/tests/test_llm_service.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ import os
3
+ from unittest.mock import patch
4
+ from backend.services.llm_service import llm_service
5
+ from backend.config.settings import settings
6
+ from backend.config.database import db_settings
7
+
8
class TestLLMService(unittest.TestCase):
    """Smoke test for the LLM service (requires valid config and API keys)."""

    @patch.dict(os.environ, {
        'VECTOR_STORE_PROVIDER': 'chromadb',
        'LLM_PROVIDER': 'google',
    })
    def test_chat_completion(self):
        """
        Test chat completions work, assuming proper config and API keys
        """
        # reinitialize globals instance so they pick up the patched environment
        settings.__init__()
        db_settings.__init__()
        llm_service.__init__()

        response = llm_service.simple_chat_completion("Hello there 👋🏽")
        # Live-LLM smoke test: we cannot pin exact output, but we can assert
        # a non-empty string came back (stronger than the old print + len>0).
        self.assertIsInstance(response, str)
        self.assertGreater(len(response), 0)
backend/utils/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Utils package - Utility functions and helpers
2
+ from .sanitization import (
3
+ DataSanitizer,
4
+ sanitize_user_input
5
+ )
6
+
7
+ __all__ = [
8
+ 'DataSanitizer',
9
+ 'sanitize_user_input'
10
+ ]
backend/utils/helpers.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Utility functions for the Recipe Recommendation Bot
2
+
backend/utils/request_dto/chat_response.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+
3
class ChatResponse(BaseModel):
    """Response body for the chat endpoint: the bot's reply as plain text."""
    response: str = Field(..., description="Bot response to the user message")
backend/utils/request_dto/scrape_request.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+
3
+
4
class ScrapeRequest(BaseModel):
    """Request body for triggering a recipe scrape job."""
    # Target site identifier — NOTE(review): valid values are not visible here;
    # confirm against the scraper registry (e.g. data_minning scrapers).
    site: str
    # Maximum number of recipes to scrape in one run.
    limit: int = 50
    output_type: str = "json" # or "mongo"