Jesse Johnson
committed on
Commit
·
c59d808
0
Parent(s):
New commit for backend deployment: 2025-09-25_13-24-03
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +96 -0
- Dockerfile +16 -0
- README.md +54 -0
- app.py +1 -0
- backend/.env.example +105 -0
- backend/.gitignore +141 -0
- backend/Dockerfile +16 -0
- backend/README.md +462 -0
- backend/app.py +193 -0
- backend/config/__init__.py +0 -0
- backend/config/database.py +61 -0
- backend/config/logging_config.py +82 -0
- backend/config/settings.py +176 -0
- backend/core/__init__.py +0 -0
- backend/core/exceptions.py +15 -0
- backend/data/__init__.py +0 -0
- backend/data/sample_recipes.json +138 -0
- backend/data_minning/__init__.py +1 -0
- backend/data_minning/all_nigerian_recipe_scraper.py +193 -0
- backend/data_minning/base_scrapper.py +348 -0
- backend/data_minning/dto/__init__.py +1 -0
- backend/data_minning/dto/recipe_doc.py +45 -0
- backend/data_minning/dto/stream_opts.py +12 -0
- backend/data_minning/soup_client.py +43 -0
- backend/data_minning/yummy_medley_scraper.py +209 -0
- backend/docs/README.md +85 -0
- backend/docs/chromadb_refresh.md +228 -0
- backend/docs/embedding-compatibility-guide.md +249 -0
- backend/docs/embedding-troubleshooting.md +132 -0
- backend/docs/logging_guide.md +56 -0
- backend/docs/model-configuration-guide.md +542 -0
- backend/docs/model-selection-guide.md +502 -0
- backend/docs/opensource-llm-configuration.md +394 -0
- backend/docs/optimal_recipes_structure.md +160 -0
- backend/docs/sanitization_guide.md +147 -0
- backend/docs/scraper.md +372 -0
- backend/docs/unified-provider-configuration.md +108 -0
- backend/requirements.txt +49 -0
- backend/services/__init__.py +1 -0
- backend/services/custom_mongo_vector.py +154 -0
- backend/services/llm_service.py +354 -0
- backend/services/vector_store.py +386 -0
- backend/tests/__init__.py +0 -0
- backend/tests/test_db_settings.py +53 -0
- backend/tests/test_llm_provider_settings.py +39 -0
- backend/tests/test_llm_service.py +26 -0
- backend/utils/__init__.py +10 -0
- backend/utils/helpers.py +2 -0
- backend/utils/request_dto/chat_response.py +4 -0
- backend/utils/request_dto/scrape_request.py +7 -0
.gitignore
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib64/
|
| 14 |
+
parts/
|
| 15 |
+
sdist/
|
| 16 |
+
var/
|
| 17 |
+
wheels/
|
| 18 |
+
*.egg-info/
|
| 19 |
+
.installed.cfg
|
| 20 |
+
*.egg
|
| 21 |
+
MANIFEST
|
| 22 |
+
|
| 23 |
+
# PyInstaller
|
| 24 |
+
*.manifest
|
| 25 |
+
*.spec
|
| 26 |
+
|
| 27 |
+
# Unit test / coverage reports
|
| 28 |
+
htmlcov/
|
| 29 |
+
.tox/
|
| 30 |
+
.coverage
|
| 31 |
+
.coverage.*
|
| 32 |
+
.cache
|
| 33 |
+
nosetests.xml
|
| 34 |
+
coverage.xml
|
| 35 |
+
*.cover
|
| 36 |
+
.hypothesis/
|
| 37 |
+
.pytest_cache/
|
| 38 |
+
|
| 39 |
+
# Virtual environments
|
| 40 |
+
.env
|
| 41 |
+
.venv
|
| 42 |
+
env/
|
| 43 |
+
venv/
|
| 44 |
+
ENV/
|
| 45 |
+
env.bak/
|
| 46 |
+
venv.bak/
|
| 47 |
+
venv/
|
| 48 |
+
|
| 49 |
+
# IDEs
|
| 50 |
+
.vscode/
|
| 51 |
+
.idea/
|
| 52 |
+
*.swp
|
| 53 |
+
*.swo
|
| 54 |
+
*~
|
| 55 |
+
|
| 56 |
+
# Jupyter Notebook
|
| 57 |
+
.ipynb_checkpoints
|
| 58 |
+
|
| 59 |
+
# Environment variables
|
| 60 |
+
.env
|
| 61 |
+
.env.local
|
| 62 |
+
.env.development.local
|
| 63 |
+
.env.test.local
|
| 64 |
+
.env.production.local
|
| 65 |
+
|
| 66 |
+
# Data files
|
| 67 |
+
*.csv
|
| 68 |
+
*.xlsx
|
| 69 |
+
*.pickle
|
| 70 |
+
*.pkl
|
| 71 |
+
|
| 72 |
+
# API keys and secrets
|
| 73 |
+
secrets/
|
| 74 |
+
*.key
|
| 75 |
+
*.pem
|
| 76 |
+
|
| 77 |
+
# OS
|
| 78 |
+
.DS_Store
|
| 79 |
+
Thumbs.db
|
| 80 |
+
|
| 81 |
+
# Logs
|
| 82 |
+
*.log
|
| 83 |
+
logs/
|
| 84 |
+
|
| 85 |
+
# Model files
|
| 86 |
+
models/
|
| 87 |
+
*.model
|
| 88 |
+
*.h5
|
| 89 |
+
*.pkl
|
| 90 |
+
|
| 91 |
+
# Vector databases
|
| 92 |
+
chroma_db/
|
| 93 |
+
faiss_index/
|
| 94 |
+
|
| 95 |
+
frontend/.next/
|
| 96 |
+
frontend/node_modules/
|
Dockerfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
| 2 |
+
# you will also find guides on how best to write your Dockerfile
|
| 3 |
+
|
| 4 |
+
FROM python:3.9
|
| 5 |
+
|
| 6 |
+
RUN useradd -m -u 1000 user
|
| 7 |
+
USER user
|
| 8 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
| 9 |
+
|
| 10 |
+
WORKDIR /app
|
| 11 |
+
|
| 12 |
+
COPY --chown=user ./requirements.txt requirements.txt
|
| 13 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 14 |
+
|
| 15 |
+
COPY --chown=user . /app
|
| 16 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Recipe Recommendation Chatbot API
|
| 3 |
+
emoji: 🥘
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: pink
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Recipe Recommendation Chatbot
|
| 12 |
+
|
| 13 |
+
A GenAI-powered chatbot that recommends recipes based on available ingredients using RAG (Retrieval Augmented Generation).
|
| 14 |
+
|
| 15 |
+
## 🚀 Quick Start
|
| 16 |
+
```bash
|
| 17 |
+
# Clone repository
|
| 18 |
+
git clone https://github.com/A3copilotprogram/PLG4-Recipe-Recommendation-Chatbot.git
|
| 19 |
+
cd PLG4-Recipe-Recommendation-Chatbot
|
| 20 |
+
|
| 21 |
+
# Install dependencies
|
| 22 |
+
pip install -r requirements.txt
|
| 23 |
+
|
| 24 |
+
# Run the chatbot
|
| 25 |
+
python src/main.py
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## 📁 Project Structure
|
| 29 |
+
- `backend/` - FastAPI backend with RAG pipeline
|
| 30 |
+
- `frontend/` - React frontend interface
|
| 31 |
+
- `data/` - Recipe datasets and embeddings
|
| 32 |
+
- `docs/` - Project documentation
|
| 33 |
+
- `notebooks/` - Jupyter notebooks for exploration
|
| 34 |
+
- `tests/` - Unit and integration tests
|
| 35 |
+
|
| 36 |
+
## 📚 Documentation
|
| 37 |
+
|
| 38 |
+
### Quick Start Guides
|
| 39 |
+
- **[Backend Setup](./backend/README.md)** - FastAPI server setup and configuration
|
| 40 |
+
- **[Frontend Setup](./frontend/README.md)** - React app development
|
| 41 |
+
|
| 42 |
+
### Troubleshooting
|
| 43 |
+
- **[Embedding Issues](./backend/docs/embedding-troubleshooting.md)** - Fix common dimension mismatch errors
|
| 44 |
+
- **[Documentation Index](./backend/docs/README.md)** - Complete documentation overview
|
| 45 |
+
|
| 46 |
+
### Architecture
|
| 47 |
+
- **[System Architecture](./docs/architecture.md)** - High-level system design
|
| 48 |
+
- **[API Documentation](./backend/docs/api-documentation.md)** - Detailed API reference
|
| 49 |
+
|
| 50 |
+
## 🤝 Contributing
|
| 51 |
+
See [CONTRIBUTING.md](docs/CONTRIBUTING.md) for ways of working and contribution guidelines.
|
| 52 |
+
|
| 53 |
+
## 👥 Team
|
| 54 |
+
GenAI PLG 4 - Andela Community Program
|
app.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from backend.app import app
|
backend/.env.example
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ===========================================
|
| 2 |
+
# Recipe Recommendation Bot - Environment Configuration
|
| 3 |
+
# ===========================================
|
| 4 |
+
|
| 5 |
+
# Server Configuration
|
| 6 |
+
PORT=8080
|
| 7 |
+
HOST=0.0.0.0
|
| 8 |
+
ENVIRONMENT=development
|
| 9 |
+
DEBUG=true
|
| 10 |
+
LANGCHAIN_DEBUG=true
|
| 11 |
+
|
| 12 |
+
# CORS Configuration
|
| 13 |
+
CORS_ORIGINS=["http://localhost:3000","http://localhost:5173","http://localhost:8000"]
|
| 14 |
+
CORS_ALLOW_CREDENTIALS=true
|
| 15 |
+
CORS_ALLOW_METHODS=["GET","POST","PUT","DELETE","OPTIONS"]
|
| 16 |
+
CORS_ALLOW_HEADERS=["*"]
|
| 17 |
+
|
| 18 |
+
# ===========================================
|
| 19 |
+
# LLM & Embedding Provider Configuration
|
| 20 |
+
# ===========================================
|
| 21 |
+
# Supported providers: openai, google, huggingface, ollama
|
| 22 |
+
# This provider will be used for both LLM and embeddings
|
| 23 |
+
|
| 24 |
+
LLM_PROVIDER=google
|
| 25 |
+
EMBEDDING_PROVIDER=google
|
| 26 |
+
|
| 27 |
+
# OpenAI Configuration
|
| 28 |
+
# Use only if LLM_PROVIDER or EMBEDDING_PROVIDER is set to 'openai'
|
| 29 |
+
OPENAI_API_KEY=YOUR_OPENAI_API_KEY_HERE
|
| 30 |
+
OPENAI_MODEL=gpt-5-nano
|
| 31 |
+
OPENAI_TEMPERATURE=0.7
|
| 32 |
+
OPENAI_MAX_TOKENS=1000
|
| 33 |
+
|
| 34 |
+
# Google AI Configuration (Gemini)
|
| 35 |
+
# Use only if LLM_PROVIDER or EMBEDDING_PROVIDER is set to 'google'
|
| 36 |
+
GOOGLE_API_KEY=YOUR_GOOGLE_API_KEY_HERE
|
| 37 |
+
GOOGLE_MODEL=gemini-2.0-flash
|
| 38 |
+
GOOGLE_TEMPERATURE=0.7
|
| 39 |
+
GOOGLE_MAX_TOKENS=1000
|
| 40 |
+
|
| 41 |
+
# Hugging Face Configuration
|
| 42 |
+
# Use only if LLM_PROVIDER or EMBEDDING_PROVIDER is set to 'huggingface'
|
| 43 |
+
HUGGINGFACE_API_TOKEN=YOUR_HUGGINGFACE_API_TOKEN_HERE
|
| 44 |
+
HUGGINGFACE_MODEL=deepseek-ai/DeepSeek-V3.1
|
| 45 |
+
HUGGINGFACE_API_URL=https://api-inference.huggingface.co/models/
|
| 46 |
+
HUGGINGFACE_USE_API=true
|
| 47 |
+
HUGGINGFACE_USE_GPU=false
|
| 48 |
+
|
| 49 |
+
# Ollama Configuration (local inference)
|
| 50 |
+
# Use only if LLM_PROVIDER or EMBEDDING_PROVIDER is set to 'ollama'
|
| 51 |
+
OLLAMA_BASE_URL=http://localhost:11434
|
| 52 |
+
OLLAMA_MODEL=llama3.1:8b
|
| 53 |
+
OLLAMA_TEMPERATURE=0.7
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# ===========================================
|
| 57 |
+
# Vector Store Configuration
|
| 58 |
+
# ===========================================
|
| 59 |
+
# Supported stores: chromadb, mongodb
|
| 60 |
+
VECTOR_STORE_PROVIDER=mongodb
|
| 61 |
+
|
| 62 |
+
# ChromaDB Configuration
|
| 63 |
+
DB_PATH=./data/chromadb
|
| 64 |
+
DB_COLLECTION_NAME=recipes
|
| 65 |
+
DB_PERSIST_DIRECTORY=./data/chromadb_persist
|
| 66 |
+
# Set to true to delete and recreate DB on startup (useful for adding new recipes)
|
| 67 |
+
DB_REFRESH_ON_START=false
|
| 68 |
+
|
| 69 |
+
# MongoDB Atlas Configuration (for vector search)
|
| 70 |
+
# Provide your connection string and collection settings when using MongoDB
|
| 71 |
+
MONGODB_URI=mongodb+srv://<username>:<password>@<cluster>.mongodb.net/?retryWrites=true&w=majority&appName=<AppName>
|
| 72 |
+
MONGODB_DATABASE=food_recommendation
|
| 73 |
+
MONGODB_COLLECTION=AI_DB
|
| 74 |
+
MONGODB_INDEX_NAME=foodInstructionIndex
|
| 75 |
+
MONGODB_VECTOR_FIELD=ingredients_emb
|
| 76 |
+
MONGODB_TEXT_FIELD=title
|
| 77 |
+
MONGODB_SIMILARITY_METRIC=dotProduct
|
| 78 |
+
MONGODB_NUM_CANDIDATES=100
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# ===========================================
|
| 82 |
+
# Model Configuration
|
| 83 |
+
# ===========================================
|
| 84 |
+
# The LLM_PROVIDER setting above controls both LLM and embedding models
|
| 85 |
+
|
| 86 |
+
# OpenAI Models
|
| 87 |
+
OPENAI_EMBEDDING_MODEL=text-embedding-3-small
|
| 88 |
+
|
| 89 |
+
# Google Models
|
| 90 |
+
GOOGLE_EMBEDDING_MODEL=models/embedding-001
|
| 91 |
+
|
| 92 |
+
# HuggingFace Models
|
| 93 |
+
HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
| 94 |
+
|
| 95 |
+
# Ollama Models
|
| 96 |
+
OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# ===========================================
|
| 100 |
+
# Logging Configuration
|
| 101 |
+
# ===========================================
|
| 102 |
+
LOG_LEVEL=INFO
|
| 103 |
+
LOG_FORMAT=%(asctime)s - %(name)s - %(levelname)s - %(message)s
|
| 104 |
+
LOG_FILE=./logs/app.log
|
| 105 |
+
|
backend/.gitignore
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
pip-wheel-metadata/
|
| 24 |
+
share/python-wheels/
|
| 25 |
+
*.egg-info/
|
| 26 |
+
.installed.cfg
|
| 27 |
+
*.egg
|
| 28 |
+
MANIFEST
|
| 29 |
+
|
| 30 |
+
# PyInstaller
|
| 31 |
+
# Usually these files are written by a python script from a template
|
| 32 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 33 |
+
*.manifest
|
| 34 |
+
*.spec
|
| 35 |
+
|
| 36 |
+
# Installer logs
|
| 37 |
+
pip-log.txt
|
| 38 |
+
pip-delete-this-directory.txt
|
| 39 |
+
|
| 40 |
+
# Unit test / coverage reports
|
| 41 |
+
htmlcov/
|
| 42 |
+
.tox/
|
| 43 |
+
.nox/
|
| 44 |
+
.coverage
|
| 45 |
+
.coverage.*
|
| 46 |
+
.cache
|
| 47 |
+
nosetests.xml
|
| 48 |
+
coverage.xml
|
| 49 |
+
*.cover
|
| 50 |
+
*.py,cover
|
| 51 |
+
.hypothesis/
|
| 52 |
+
.pytest_cache/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
target/
|
| 76 |
+
|
| 77 |
+
# Jupyter Notebook
|
| 78 |
+
.ipynb_checkpoints
|
| 79 |
+
|
| 80 |
+
# IPython
|
| 81 |
+
profile_default/
|
| 82 |
+
ipython_config.py
|
| 83 |
+
|
| 84 |
+
# pyenv
|
| 85 |
+
.python-version
|
| 86 |
+
|
| 87 |
+
# pipenv
|
| 88 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 89 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 90 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 91 |
+
# install all needed dependencies.
|
| 92 |
+
#Pipfile.lock
|
| 93 |
+
|
| 94 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
| 95 |
+
__pypackages__/
|
| 96 |
+
|
| 97 |
+
# Celery stuff
|
| 98 |
+
celerybeat-schedule
|
| 99 |
+
celerybeat.pid
|
| 100 |
+
|
| 101 |
+
# SageMath parsed files
|
| 102 |
+
*.sage.py
|
| 103 |
+
|
| 104 |
+
# Environments
|
| 105 |
+
.env
|
| 106 |
+
.venv
|
| 107 |
+
env/
|
| 108 |
+
venv/
|
| 109 |
+
ENV/
|
| 110 |
+
env.bak/
|
| 111 |
+
venv.bak/
|
| 112 |
+
|
| 113 |
+
# Spyder project settings
|
| 114 |
+
.spyderproject
|
| 115 |
+
.spyproject
|
| 116 |
+
|
| 117 |
+
# Rope project settings
|
| 118 |
+
.ropeproject
|
| 119 |
+
|
| 120 |
+
# mkdocs documentation
|
| 121 |
+
/site
|
| 122 |
+
|
| 123 |
+
# mypy
|
| 124 |
+
.mypy_cache/
|
| 125 |
+
.dmypy.json
|
| 126 |
+
dmypy.json
|
| 127 |
+
|
| 128 |
+
# Pyre type checker
|
| 129 |
+
.pyre/
|
| 130 |
+
|
| 131 |
+
# Application specific
|
| 132 |
+
vector_store/
|
| 133 |
+
logs/
|
| 134 |
+
*.log
|
| 135 |
+
.DS_Store
|
| 136 |
+
node_modules/
|
| 137 |
+
|
| 138 |
+
# Data folder - ignore everything except sample recipe
|
| 139 |
+
data/*
|
| 140 |
+
!data/sample_recipes.json
|
| 141 |
+
!data/recipes
|
backend/Dockerfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
| 2 |
+
# you will also find guides on how best to write your Dockerfile
|
| 3 |
+
|
| 4 |
+
FROM python:3.9
|
| 5 |
+
|
| 6 |
+
RUN useradd -m -u 1000 user
|
| 7 |
+
USER user
|
| 8 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
| 9 |
+
|
| 10 |
+
WORKDIR /app
|
| 11 |
+
|
| 12 |
+
COPY --chown=user ./requirements.txt requirements.txt
|
| 13 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 14 |
+
|
| 15 |
+
COPY --chown=user . /app
|
| 16 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
backend/README.md
ADDED
|
@@ -0,0 +1,462 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Recipe Recommendation Chatbot - Backend API
|
| 2 |
+
|
| 3 |
+
Backend for AI-powered recipe recommendation system built with FastAPI, featuring RAG (Retrieval-Augmented Generation) capabilities, conversational memory, and multi-provider LLM support.
|
| 4 |
+
|
| 5 |
+
## 🚀 Quick Start
|
| 6 |
+
|
| 7 |
+
### Prerequisites
|
| 8 |
+
- Python 3.9+
|
| 9 |
+
- pip or poetry
|
| 10 |
+
- API keys for your chosen LLM provider (OpenAI, Google, or HuggingFace)
|
| 11 |
+
|
| 12 |
+
### Installation
|
| 13 |
+
|
| 14 |
+
1. **Clone and navigate to backend**
|
| 15 |
+
```bash
|
| 16 |
+
git clone <repository-url>
|
| 17 |
+
cd PLG4-Recipe-Recommendation-Chatbot/backend
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
2. **Install dependencies**
|
| 21 |
+
```bash
|
| 22 |
+
pip install -r requirements.txt
|
| 23 |
+
```
|
| 24 |
+
> 💡 **Note**: Some packages are commented out by default to keep the installation lightweight:
|
| 25 |
+
> - **HuggingFace dependencies** (`transformers`, `accelerate`, `sentence-transformers`) - Uncomment if using HuggingFace models
|
| 26 |
+
> - **sentence-transformers** (~800MB) - Uncomment for HuggingFace embeddings
|
| 27 |
+
|
| 28 |
+
3. **Configure environment**
|
| 29 |
+
```bash
|
| 30 |
+
cp .env.example .env
|
| 31 |
+
# Edit .env with your API keys and configuration
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
4. **Run the server**
|
| 35 |
+
```bash
|
| 36 |
+
# Development mode with auto-reload
|
| 37 |
+
uvicorn app:app --reload --host 127.0.0.1 --port 8080
|
| 38 |
+
|
| 39 |
+
# Or production mode
|
| 40 |
+
uvicorn app:app --host 127.0.0.1 --port 8080
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
5. **Test the API**
|
| 44 |
+
```bash
|
| 45 |
+
curl http://localhost:8080/health
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
6. **HuggingFace Spaces deployment**
|
| 49 |
+
```
|
| 50 |
+
sh deploy-to-hf.sh <remote>
|
| 51 |
+
```
|
| 52 |
+
where <remote> points to the HuggingFace Spaces repository
|
| 53 |
+
|
| 54 |
+
## 📁 Project Structure
|
| 55 |
+
|
| 56 |
+
```
|
| 57 |
+
backend/
|
| 58 |
+
├── app.py # FastAPI application entry point
|
| 59 |
+
├── requirements.txt # Python dependencies
|
| 60 |
+
├── .env.example # Environment configuration template
|
| 61 |
+
├── .gitignore # Git ignore rules
|
| 62 |
+
│
|
| 63 |
+
├── config/ # Configuration modules
|
| 64 |
+
│ ├── __init__.py
|
| 65 |
+
│ ├── settings.py # Application settings
|
| 66 |
+
│ ├── database.py # Database configuration
|
| 67 |
+
│ └── logging_config.py # Logging setup
|
| 68 |
+
│
|
| 69 |
+
├── services/ # Core business logic
|
| 70 |
+
│ ├── __init__.py
|
| 71 |
+
│ ├── llm_service.py # LLM and RAG pipeline
|
| 72 |
+
│ └── vector_store.py # Vector database management
|
| 73 |
+
│
|
| 74 |
+
├── data/ # Data storage
|
| 75 |
+
│ ├── recipes/ # Recipe JSON files
|
| 76 |
+
│ │ └── recipe.json # Sample recipe data
|
| 77 |
+
│ └── chromadb_persist/ # ChromaDB persistence
|
| 78 |
+
│
|
| 79 |
+
├── logs/ # Application logs
|
| 80 |
+
│ └── recipe_bot.log # Main log file
|
| 81 |
+
│
|
| 82 |
+
├── docs/ # Documentation
|
| 83 |
+
│ ├── model-selection-guide.md # 🎯 Complete model selection & comparison guide
|
| 84 |
+
│ ├── model-quick-reference.md # ⚡ Quick model switching commands
|
| 85 |
+
│ ├── chromadb_refresh.md # ChromaDB refresh guide
|
| 86 |
+
│ ├── opensource-llm-configuration.md # Open source LLM setup guide
|
| 87 |
+
│ ├── logging_guide.md # Logging documentation
|
| 88 |
+
│ ├── optimal_recipes_structure.md # Recipe data structure guide
|
| 89 |
+
│ ├── sanitization_guide.md # Input sanitization guide
|
| 90 |
+
│ └── unified-provider-configuration.md # Unified provider approach guide
|
| 91 |
+
│
|
| 92 |
+
└── utils/ # Utility functions
|
| 93 |
+
└── __init__.py
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
## ⚙️ Configuration
|
| 97 |
+
|
| 98 |
+
### Environment Variables
|
| 99 |
+
|
| 100 |
+
Copy `.env.example` to `.env` and configure the following:
|
| 101 |
+
|
| 102 |
+
> 🎯 **Unified Provider Approach**: The `LLM_PROVIDER` setting controls both LLM and embedding models, preventing configuration mismatches. See [`docs/unified-provider-configuration.md`](docs/unified-provider-configuration.md) for details.
|
| 103 |
+
|
| 104 |
+
#### **Server Configuration**
|
| 105 |
+
```bash
|
| 106 |
+
PORT=8000 # Server port
|
| 107 |
+
HOST=0.0.0.0 # Server host
|
| 108 |
+
ENVIRONMENT=development # Environment mode
|
| 109 |
+
DEBUG=true # Debug mode
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
#### **Provider Configuration**
|
| 113 |
+
Choose one provider for both LLM and embeddings (unified approach):
|
| 114 |
+
|
| 115 |
+
> 🎯 **NEW: Complete Model Selection Guide**: For detailed comparisons of all models (OpenAI, Google, Anthropic, Ollama, HuggingFace) including latest 2025 models, performance metrics, costs, and scenario-based recommendations, see [`docs/model-selection-guide.md`](docs/model-selection-guide.md)
|
| 116 |
+
|
| 117 |
+
> ⚡ **Quick Reference**: For one-command model switching, see [`docs/model-quick-reference.md`](docs/model-quick-reference.md)
|
| 118 |
+
|
| 119 |
+
**OpenAI (Best Value & Latest Models)**
|
| 120 |
+
```bash
|
| 121 |
+
LLM_PROVIDER=openai
|
| 122 |
+
OPENAI_API_KEY=your_openai_api_key_here
|
| 123 |
+
OPENAI_MODEL=gpt-5-nano # 🎯 BEST VALUE: $1/month for 30K queries - Modern GPT-5 at nano price
|
| 124 |
+
# Alternatives:
|
| 125 |
+
# - gpt-4o-mini # Proven choice: $4/month for 30K queries
|
| 126 |
+
# - gpt-5 # Premium: $20/month unlimited (Plus plan)
|
| 127 |
+
OPENAI_EMBEDDING_MODEL=text-embedding-3-small # Used automatically
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
**Google Gemini (Best Free Tier)**
|
| 131 |
+
```bash
|
| 132 |
+
LLM_PROVIDER=google
|
| 133 |
+
GOOGLE_API_KEY=your_google_api_key_here
|
| 134 |
+
GOOGLE_MODEL=gemini-2.5-flash # 🎯 RECOMMENDED: Excellent free tier, then $2/month
|
| 135 |
+
# Alternatives:
|
| 136 |
+
# - gemini-2.0-flash-lite # Ultra budget: $0.90/month for 30K queries
|
| 137 |
+
# - gemini-2.5-pro # Premium: $25/month for 30K queries
|
| 138 |
+
GOOGLE_EMBEDDING_MODEL=models/embedding-001 # Used automatically
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
**Anthropic Claude (Best Quality-to-Cost)**
|
| 142 |
+
```bash
|
| 143 |
+
LLM_PROVIDER=anthropic
|
| 144 |
+
ANTHROPIC_API_KEY=your_anthropic_api_key_here
|
| 145 |
+
ANTHROPIC_MODEL=claude-3-5-haiku-20241022 # 🎯 BUDGET WINNER: $4/month for 30K queries
|
| 146 |
+
# Alternatives:
|
| 147 |
+
# - claude-3-5-sonnet-20241022 # Production standard: $45/month for 30K queries
|
| 148 |
+
# - claude-3-opus-20240229 # Premium quality: $225/month for 30K queries
|
| 149 |
+
ANTHROPIC_EMBEDDING_MODEL=voyage-large-2 # Used automatically
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
**Ollama (Best for Privacy/Self-Hosting)**
|
| 153 |
+
```bash
|
| 154 |
+
LLM_PROVIDER=ollama
|
| 155 |
+
OLLAMA_BASE_URL=http://localhost:11434
|
| 156 |
+
OLLAMA_MODEL=llama3.1:8b # 🎯 YOUR CURRENT: 4.7GB download, 8GB RAM, excellent balance
|
| 157 |
+
# New alternatives:
|
| 158 |
+
# - deepseek-r1:7b # Breakthrough reasoning: 4.7GB download, O1-level performance
|
| 159 |
+
# - codeqwen:7b # Structured data expert: 4.2GB download, excellent for recipes
|
| 160 |
+
# - gemma3:4b # Resource-efficient: 3.3GB download, 6GB RAM
|
| 161 |
+
# - mistral-nemo:12b # Balanced performance: 7GB download, 12GB RAM
|
| 162 |
+
OLLAMA_EMBEDDING_MODEL=nomic-embed-text # Used automatically
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
**HuggingFace (Downloadable Models Only - APIs Unreliable)**
|
| 166 |
+
```bash
|
| 167 |
+
LLM_PROVIDER=ollama # Use Ollama to run HuggingFace models locally
|
| 168 |
+
OLLAMA_MODEL=codeqwen:7b # 🎯 RECOMMENDED: Download HF models via Ollama for reliability
|
| 169 |
+
# Other downloadable options:
|
| 170 |
+
# - mistral-nemo:12b # Mistral's balanced model
|
| 171 |
+
# - nous-hermes2:10.7b # Fine-tuned for instruction following
|
| 172 |
+
# - openhermes2.5-mistral:7b # Community favorite
|
| 173 |
+
OLLAMA_EMBEDDING_MODEL=nomic-embed-text # Used automatically
|
| 174 |
+
```
|
| 175 |
+
> ⚠️ **Important Change**: HuggingFace APIs have proven unreliable for production. We now recommend downloading HuggingFace models locally via Ollama for consistent performance.
|
| 176 |
+
> ⚠️ **HuggingFace Update**: HuggingFace dependencies are no longer required as we recommend using downloadable models via Ollama instead of unreliable APIs. For local HuggingFace models, use Ollama which provides better reliability and performance.
|
| 177 |
+
|
| 178 |
+
> 📖 **Local Model Setup**: See [`docs/opensource-llm-configuration.md`](docs/opensource-llm-configuration.md) for GPU setup, model selection, and performance optimization with Ollama.
|
| 179 |
+
|
| 180 |
+
> 💡 **Unified Provider**: The `LLM_PROVIDER` setting automatically configures both the LLM and embedding models, ensuring consistency and preventing mismatched configurations.
|
| 181 |
+
|
| 182 |
+
#### **Vector Store Configuration**
|
| 183 |
+
Choose between ChromaDB (local) or MongoDB Atlas:
|
| 184 |
+
|
| 185 |
+
**ChromaDB (Default)**
|
| 186 |
+
```bash
|
| 187 |
+
VECTOR_STORE_PROVIDER=chromadb
|
| 188 |
+
DB_COLLECTION_NAME=recipes
|
| 189 |
+
DB_PERSIST_DIRECTORY=./data/chromadb_persist
|
| 190 |
+
# Set to true to delete and recreate DB on startup (useful for adding new recipes)
|
| 191 |
+
DB_REFRESH_ON_START=false
|
| 192 |
+
```
|
| 193 |
+
|
| 194 |
+
**MongoDB Atlas**
|
| 195 |
+
```bash
|
| 196 |
+
VECTOR_STORE_PROVIDER=mongodb
|
| 197 |
+
MONGODB_URI=mongodb+srv://username:password@cluster.mongodb.net/
|
| 198 |
+
MONGODB_DATABASE=recipe_bot
|
| 199 |
+
MONGODB_COLLECTION=recipes
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
#### **Embedding Configuration**
|
| 203 |
+
```bash
|
| 204 |
+
# Embedding provider automatically matches LLM_PROVIDER (unified approach)
|
| 205 |
+
# No separate configuration needed - handled automatically based on LLM_PROVIDER setting
|
| 206 |
+
```
|
| 207 |
+
|
| 208 |
+
> 💡 **Unified Provider**: The `LLM_PROVIDER` setting automatically configures both the LLM and embedding models, ensuring consistency and preventing mismatched configurations. See [`docs/model-selection-guide.md`](docs/model-selection-guide.md) for all available options.
|
| 209 |
+
|
| 210 |
+
## 🛠️ API Endpoints
|
| 211 |
+
|
| 212 |
+
### Core Endpoints
|
| 213 |
+
|
| 214 |
+
#### **Health Check**
|
| 215 |
+
```bash
|
| 216 |
+
GET /health
|
| 217 |
+
```
|
| 218 |
+
Returns service health and configuration status.
|
| 219 |
+
|
| 220 |
+
#### **Chat with RAG**
|
| 221 |
+
```bash
|
| 222 |
+
POST /chat
|
| 223 |
+
Content-Type: application/json
|
| 224 |
+
|
| 225 |
+
{
|
| 226 |
+
"message": "What chicken recipes do you have?"
|
| 227 |
+
}
|
| 228 |
+
```
|
| 229 |
+
Full conversational RAG pipeline with memory and vector retrieval.
|
| 230 |
+
|
| 231 |
+
#### **Simple Demo**
|
| 232 |
+
```bash
|
| 233 |
+
GET /demo?prompt=Tell me about Italian cuisine
|
| 234 |
+
```
|
| 235 |
+
Simple LLM completion without RAG for testing.
|
| 236 |
+
|
| 237 |
+
#### **Clear Memory**
|
| 238 |
+
```bash
|
| 239 |
+
POST /clear-memory
|
| 240 |
+
```
|
| 241 |
+
Clears conversation memory for fresh start.
|
| 242 |
+
|
| 243 |
+
### Example Requests
|
| 244 |
+
|
| 245 |
+
**Chat Request:**
|
| 246 |
+
```bash
|
| 247 |
+
curl -X POST "http://localhost:8080/chat"
|
| 248 |
+
-H "Content-Type: application/json"
|
| 249 |
+
-d '{"message": "What are some quick breakfast recipes?"}'
|
| 250 |
+
```
|
| 251 |
+
|
| 252 |
+
**Demo Request:**
|
| 253 |
+
```bash
|
| 254 |
+
curl "http://localhost:8080/demo?prompt=What%20is%20your%20favorite%20pasta%20dish?"
|
| 255 |
+
```
|
| 256 |
+
|
| 257 |
+
## 🏗️ Architecture
|
| 258 |
+
|
| 259 |
+
### Core Components
|
| 260 |
+
|
| 261 |
+
#### **LLM Service** (`services/llm_service.py`)
|
| 262 |
+
- **ConversationalRetrievalChain**: Main RAG pipeline with memory
|
| 263 |
+
- **Simple Chat Completion**: Direct LLM responses without RAG
|
| 264 |
+
- **Multi-provider Support**: OpenAI, Google, HuggingFace
|
| 265 |
+
- **Conversation Memory**: Persistent chat history
|
| 266 |
+
|
| 267 |
+
#### **Vector Store Service** (`services/vector_store.py`)
|
| 268 |
+
- **ChromaDB Integration**: Local vector database
|
| 269 |
+
- **MongoDB Atlas Support**: Cloud vector search
|
| 270 |
+
- **Document Loading**: Automatic recipe data ingestion
|
| 271 |
+
- **Embedding Management**: Multi-provider embedding support
|
| 272 |
+
|
| 273 |
+
#### **Configuration System** (`config/`)
|
| 274 |
+
- **Settings Management**: Environment-based configuration
|
| 275 |
+
- **Database Configuration**: Vector store setup
|
| 276 |
+
- **Logging Configuration**: Structured logging with rotation
|
| 277 |
+
|
| 278 |
+
### Data Flow
|
| 279 |
+
|
| 280 |
+
1. **User Query** → FastAPI endpoint
|
| 281 |
+
2. **RAG Pipeline** → Vector similarity search
|
| 282 |
+
3. **Context Retrieval** → Top-k relevant recipes
|
| 283 |
+
4. **LLM Generation** → Context-aware response
|
| 284 |
+
5. **Memory Storage** → Conversation persistence
|
| 285 |
+
6. **Response** → JSON formatted reply
|
| 286 |
+
|
| 287 |
+
## 📊 Logging
|
| 288 |
+
|
| 289 |
+
Comprehensive logging system with:
|
| 290 |
+
|
| 291 |
+
- **File Rotation**: 10MB max size, 5 backups
|
| 292 |
+
- **Structured Format**: Timestamps, levels, source location
|
| 293 |
+
- **Emoji Indicators**: Visual status indicators
|
| 294 |
+
- **Error Tracking**: Full stack traces for debugging
|
| 295 |
+
|
| 296 |
+
**Log Levels:**
|
| 297 |
+
- 🚀 **INFO**: Normal operations
|
| 298 |
+
- ⚠️ **WARNING**: Non-critical issues
|
| 299 |
+
- ❌ **ERROR**: Failures with stack traces
|
| 300 |
+
- 🔧 **DEBUG**: Detailed operation steps
|
| 301 |
+
|
| 302 |
+
**Log Location:** `./logs/recipe_bot.log`
|
| 303 |
+
|
| 304 |
+
## 📁 Data Management
|
| 305 |
+
|
| 306 |
+
### Recipe Data
|
| 307 |
+
- **Location**: `./data/recipes/`
|
| 308 |
+
- **Format**: JSON files with structured recipe data
|
| 309 |
+
- **Schema**: title, ingredients, directions, tags
|
| 310 |
+
- **Auto-loading**: Automatic chunking and vectorization
|
| 311 |
+
|
| 312 |
+
### Vector Storage
|
| 313 |
+
- **ChromaDB**: Local persistence in `./data/chromadb_persist/`
|
| 314 |
+
- **MongoDB**: Cloud-based vector search
|
| 315 |
+
- **Embeddings**: Configurable embedding models
|
| 316 |
+
- **Retrieval**: Top-k similarity search (k=25)
|
| 317 |
+
|
| 318 |
+
## 🔧 Development
|
| 319 |
+
|
| 320 |
+
### Running in Development
|
| 321 |
+
```bash
|
| 322 |
+
# Install dependencies
|
| 323 |
+
pip install -r requirements.txt
|
| 324 |
+
|
| 325 |
+
# Set up environment
|
| 326 |
+
cp .env.example .env
|
| 327 |
+
# Configure your API keys
|
| 328 |
+
|
| 329 |
+
# Run with auto-reload
|
| 330 |
+
uvicorn app:app --reload --host 127.0.0.1 --port 8080
|
| 331 |
+
```
|
| 332 |
+
|
| 333 |
+
### Testing Individual Components
|
| 334 |
+
```bash
|
| 335 |
+
# Test vector store
|
| 336 |
+
python -c "from services.vector_store import vector_store_service; print('Vector store initialized')"
|
| 337 |
+
|
| 338 |
+
# Test LLM service
|
| 339 |
+
python -c "from services.llm_service import llm_service; print('LLM service initialized')"
|
| 340 |
+
```
|
| 341 |
+
|
| 342 |
+
### Adding New Recipes
|
| 343 |
+
1. Add JSON files to `./data/recipes/`
|
| 344 |
+
2. Set `DB_REFRESH_ON_START=true` in `.env` file
|
| 345 |
+
3. Restart the application (ChromaDB will be recreated)
|
| 346 |
+
4. Set `DB_REFRESH_ON_START=false` to prevent repeated deletion
|
| 347 |
+
5. New recipes are now available for search
|
| 348 |
+
|
| 349 |
+
**Quick refresh:**
|
| 350 |
+
```bash
|
| 351 |
+
# Enable refresh, restart, then disable
|
| 352 |
+
echo "DB_REFRESH_ON_START=true" >> .env
|
| 353 |
+
uvicorn app:app --reload --host 127.0.0.1 --port 8080
|
| 354 |
+
# After startup completes:
|
| 355 |
+
sed -i 's/DB_REFRESH_ON_START=true/DB_REFRESH_ON_START=false/' .env
|
| 356 |
+
```
|
| 357 |
+
|
| 358 |
+
## 🚀 Production Deployment
|
| 359 |
+
|
| 360 |
+
### Environment Setup
|
| 361 |
+
```bash
|
| 362 |
+
ENVIRONMENT=production
|
| 363 |
+
DEBUG=false
|
| 364 |
+
LOG_LEVEL=INFO
|
| 365 |
+
```
|
| 366 |
+
|
| 367 |
+
### Docker Deployment
|
| 368 |
+
The backend is containerized and ready for deployment on platforms like Hugging Face Spaces.
|
| 369 |
+
|
| 370 |
+
### Security Features
|
| 371 |
+
- **Environment Variables**: Secure API key management
|
| 372 |
+
- **CORS Configuration**: Frontend integration protection
|
| 373 |
+
- **Input Sanitization**: Context-appropriate validation for recipe queries
|
| 374 |
+
- XSS protection through HTML encoding
|
| 375 |
+
- Length validation (1-1000 characters)
|
| 376 |
+
- Basic harmful pattern removal
|
| 377 |
+
- Whitespace normalization
|
| 378 |
+
- **Pydantic Validation**: Type safety and automatic sanitization
|
| 379 |
+
- **Structured Error Handling**: Safe error responses without data leaks
|
| 380 |
+
|
| 381 |
+
## 🛠️ Troubleshooting
|
| 382 |
+
|
| 383 |
+
### Common Issues
|
| 384 |
+
|
| 385 |
+
**Vector store initialization fails**
|
| 386 |
+
- Check API keys for embedding provider
|
| 387 |
+
- Verify data folder contains recipe files
|
| 388 |
+
- Check ChromaDB permissions
|
| 389 |
+
|
| 390 |
+
**LLM service fails**
|
| 391 |
+
- Verify API key configuration
|
| 392 |
+
- Check provider-specific requirements
|
| 393 |
+
- Review logs for detailed error messages
|
| 394 |
+
|
| 395 |
+
**HuggingFace model import errors**
|
| 396 |
+
- HuggingFace APIs have proven unreliable for production use
|
| 397 |
+
- **Recommended**: Use Ollama to run HuggingFace models locally instead:
|
| 398 |
+
```bash
|
| 399 |
+
# Install and run HuggingFace models via Ollama
|
| 400 |
+
ollama pull codeqwen:7b
|
| 401 |
+
ollama pull mistral-nemo:12b
|
| 402 |
+
# Set LLM_PROVIDER=ollama in .env
|
| 403 |
+
```
|
| 404 |
+
- For legacy HuggingFace API setup, uncomment dependencies in `requirements.txt` (not recommended)
|
| 405 |
+
- For detailed model comparisons, see [`docs/model-selection-guide.md`](docs/model-selection-guide.md)
|
| 406 |
+
|
| 407 |
+
**Memory issues**
|
| 408 |
+
```bash
|
| 409 |
+
# Clear conversation memory
|
| 410 |
+
curl -X POST http://localhost:8080/clear-memory
|
| 411 |
+
```
|
| 412 |
+
|
| 413 |
+
### Debug Mode
|
| 414 |
+
Set `DEBUG=true` in `.env` for detailed logging and error traces.
|
| 415 |
+
|
| 416 |
+
### Log Analysis
|
| 417 |
+
Check `./logs/recipe_bot.log` for detailed operation logs with emoji indicators for quick status identification.
|
| 418 |
+
|
| 419 |
+
## 📚 Documentation
|
| 420 |
+
|
| 421 |
+
### Troubleshooting Guides
|
| 422 |
+
- **[Embedding Troubleshooting](./docs/embedding-troubleshooting.md)** - Quick fixes for common embedding dimension errors
|
| 423 |
+
- **[Embedding Compatibility Guide](./docs/embedding-compatibility-guide.md)** - Comprehensive guide to embedding models and dimensions
|
| 424 |
+
- **[Logging Guide](./docs/logging_guide.md)** - Understanding the logging system
|
| 425 |
+
|
| 426 |
+
### Technical Guides
|
| 427 |
+
- **[Architecture Documentation](./docs/architecture.md)** - System architecture overview
|
| 428 |
+
- **[API Documentation](./docs/api-documentation.md)** - Detailed API reference
|
| 429 |
+
- **[Deployment Guide](./docs/deployment.md)** - Production deployment instructions
|
| 430 |
+
|
| 431 |
+
### Common Issues
|
| 432 |
+
- **Dimension mismatch errors**: See [Embedding Troubleshooting](./docs/embedding-troubleshooting.md)
|
| 433 |
+
- **Model loading issues**: Check provider configuration in `.env`
|
| 434 |
+
- **Database connection problems**: Verify MongoDB/ChromaDB settings
|
| 435 |
+
|
| 436 |
+
## 📚 Dependencies
|
| 437 |
+
|
| 438 |
+
### Core Dependencies
|
| 439 |
+
- **FastAPI**: Modern web framework
|
| 440 |
+
- **uvicorn**: ASGI server
|
| 441 |
+
- **pydantic**: Data validation
|
| 442 |
+
- **python-dotenv**: Environment management
|
| 443 |
+
|
| 444 |
+
### AI/ML Dependencies
|
| 445 |
+
- **langchain**: LLM framework and chains
|
| 446 |
+
- **langchain-openai**: OpenAI integration
|
| 447 |
+
- **langchain-google-genai**: Google AI integration
|
| 448 |
+
- **sentence-transformers**: Embedding models
|
| 449 |
+
- **chromadb**: Vector database
|
| 450 |
+
- **pymongo**: MongoDB integration
|
| 451 |
+
|
| 452 |
+
### Optional Dependencies
|
| 453 |
+
- **langchain-huggingface**: HuggingFace integration
|
| 454 |
+
- **torch**: PyTorch for local models
|
| 455 |
+
|
| 456 |
+
## 📄 License
|
| 457 |
+
|
| 458 |
+
This project is part of the PLG4 Recipe Recommendation Chatbot system.
|
| 459 |
+
|
| 460 |
+
---
|
| 461 |
+
|
| 462 |
+
For more detailed documentation, check the `docs/` folder or visit the API documentation at `http://localhost:8080/docs` when running the server.
|
backend/app.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from backend.utils.request_dto.chat_response import ChatResponse
|
| 2 |
+
from backend.utils.request_dto.scrape_request import ScrapeRequest
|
| 3 |
+
from backend.utils.types import ChatMessage
|
| 4 |
+
from fastapi import FastAPI, HTTPException, BackgroundTasks, Header
|
| 5 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 6 |
+
import os
|
| 7 |
+
from typing import Type
|
| 8 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 9 |
+
from data_minning.dto.stream_opts import StreamOptions
|
| 10 |
+
from data_minning.base_scrapper import BaseRecipeScraper, JsonArraySink, MongoSink
|
| 11 |
+
from data_minning.all_nigerian_recipe_scraper import AllNigerianRecipesScraper
|
| 12 |
+
from data_minning.yummy_medley_scraper import YummyMedleyScraper
|
| 13 |
+
from backend.config.settings import settings
|
| 14 |
+
from backend.config.logging_config import setup_default_logging, get_logger
|
| 15 |
+
from backend.utils.sanitization import sanitize_user_input
|
| 16 |
+
from backend.services.vector_store import vector_store_service
|
| 17 |
+
# Setup logging first, before importing services, so that every module-level
# logger obtained via get_logger() attaches to the configured "recipe_bot" root.
setup_default_logging()
logger = get_logger("app")
|
| 20 |
+
|
| 21 |
+
# Import services after logging is configured
|
| 22 |
+
from backend.services.llm_service import llm_service
|
| 23 |
+
|
| 24 |
+
# Registry of supported scraper implementations, keyed by the short site code
# accepted in the /scrape request body ("yummy", "anr").
SCRAPERS: dict[str, Type[BaseRecipeScraper]] = {
    "yummy": YummyMedleyScraper,
    "anr": AllNigerianRecipesScraper,
}
|
| 28 |
+
|
| 29 |
+
# FastAPI application instance and CORS setup.
app = FastAPI(
    title="Recipe Recommendation Bot API",
    description="AI-powered recipe recommendation system with RAG capabilities",
    version="1.0.0"
)

logger.info("🚀 Starting Recipe Recommendation Bot API")
logger.info(f"Environment: {settings.ENVIRONMENT}")
logger.info(f"Provider: {settings.get_llm_config()['provider']} (LLM + Embeddings)")

# Add CORS middleware.
# BUG FIX: allow_credentials was `settings.CORS_ALLOW_CREDENTIALS or True`,
# which made it impossible to disable credentials (False or True == True).
# NOTE(review): wildcard origins combined with credentials=True is rejected
# by browsers per the CORS spec — confirm CORS_ORIGINS is an explicit list
# in production deployments.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.CORS_ORIGINS or ["*"],
    allow_credentials=settings.CORS_ALLOW_CREDENTIALS,
    allow_methods=settings.CORS_ALLOW_METHODS or ["*"],
    allow_headers=settings.CORS_ALLOW_HEADERS or ["*"],
)
|
| 47 |
+
|
| 48 |
+
# Remove OpenAI direct setup - now handled by LLM service
|
| 49 |
+
# if settings.OPENAI_API_KEY:
|
| 50 |
+
# openai.api_key = settings.OPENAI_API_KEY
|
| 51 |
+
|
| 52 |
+
@app.get("/")
|
| 53 |
+
def index():
|
| 54 |
+
logger.info("📡 Root endpoint accessed")
|
| 55 |
+
return {
|
| 56 |
+
"message": "Recipe Recommendation Bot API",
|
| 57 |
+
"version": "1.0.0",
|
| 58 |
+
"status": "running"
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
@app.get("/health")
|
| 62 |
+
def health_check():
|
| 63 |
+
logger.info("🏥 Health check endpoint accessed")
|
| 64 |
+
return {
|
| 65 |
+
"status": "healthy",
|
| 66 |
+
"environment": settings.ENVIRONMENT,
|
| 67 |
+
"llm_service_initialized": llm_service is not None
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
@app.post("/chat", response_model=ChatResponse)
|
| 71 |
+
async def chat(chat_message: ChatMessage):
|
| 72 |
+
"""Main chatbot endpoint - Recipe recommendation with ConversationalRetrievalChain"""
|
| 73 |
+
try:
|
| 74 |
+
# Message is already sanitized by the Pydantic validator
|
| 75 |
+
# Find the last user message in the messages list
|
| 76 |
+
last_user_message = chat_message.get_latest_message()
|
| 77 |
+
if not last_user_message:
|
| 78 |
+
raise ValueError("No valid user message found")
|
| 79 |
+
user_text = last_user_message.parts[0].text
|
| 80 |
+
|
| 81 |
+
response_text = llm_service.ask_question(user_text)
|
| 82 |
+
return ChatResponse(response=response_text)
|
| 83 |
+
|
| 84 |
+
except ValueError as e:
|
| 85 |
+
# Handle validation/sanitization errors
|
| 86 |
+
logger.warning(f"⚠️ Invalid input received: {str(e)}")
|
| 87 |
+
raise HTTPException(status_code=400, detail=f"Invalid input: {str(e)}")
|
| 88 |
+
|
| 89 |
+
except Exception as e:
|
| 90 |
+
logger.error(f"❌ Chat service error: {str(e)}", exc_info=True)
|
| 91 |
+
raise HTTPException(status_code=500, detail=f"Chat service error: {str(e)}")
|
| 92 |
+
|
| 93 |
+
@app.get("/demo")
|
| 94 |
+
def demo(prompt: str = "What recipes do you have?"):
|
| 95 |
+
"""Demo endpoint - uses simple chat completion without RAG"""
|
| 96 |
+
logger.info(f"🎯 Demo request: '{prompt[:50]}...'")
|
| 97 |
+
|
| 98 |
+
try:
|
| 99 |
+
# Sanitize the demo prompt using the same sanitization method
|
| 100 |
+
sanitized_prompt = sanitize_user_input(prompt)
|
| 101 |
+
response_text = llm_service.simple_chat_completion(sanitized_prompt)
|
| 102 |
+
return {"prompt": sanitized_prompt, "reply": response_text}
|
| 103 |
+
|
| 104 |
+
except ValueError as e:
|
| 105 |
+
# Handle validation/sanitization errors
|
| 106 |
+
logger.warning(f"⚠️ Invalid demo prompt: {str(e)}")
|
| 107 |
+
return {"error": f"Invalid prompt: {str(e)}", "prompt": prompt}
|
| 108 |
+
|
| 109 |
+
except Exception as e:
|
| 110 |
+
logger.error(f"❌ Demo endpoint error: {str(e)}", exc_info=True)
|
| 111 |
+
return {"error": f"Failed to get response: {str(e)}"}
|
| 112 |
+
|
| 113 |
+
@app.post("/clear-memory")
|
| 114 |
+
def clear_conversation_memory():
|
| 115 |
+
"""Clear conversation memory"""
|
| 116 |
+
logger.info("🧹 Memory clear request received")
|
| 117 |
+
|
| 118 |
+
try:
|
| 119 |
+
success = llm_service.clear_memory()
|
| 120 |
+
|
| 121 |
+
if success:
|
| 122 |
+
logger.info("✅ Conversation memory cleared successfully")
|
| 123 |
+
return {"status": "success", "message": "Conversation memory cleared"}
|
| 124 |
+
else:
|
| 125 |
+
logger.warning("⚠️ Memory clear operation failed")
|
| 126 |
+
return {"status": "failed", "message": "Failed to clear conversation memory"}
|
| 127 |
+
|
| 128 |
+
except Exception as e:
|
| 129 |
+
logger.error(f"❌ Memory clear error: {str(e)}", exc_info=True)
|
| 130 |
+
return {"status": "error", "message": str(e)}
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def run_job(job_id: str, site: str, limit: int, output_type: str):
    '''
    Background job to run the scraper
    Uses global JOBS dict to track status
    Outputs to JSON file or MongoDB based on output_type

    Args:
        job_id: key under which progress/status is recorded in JOBS
        site: scraper key; must exist in SCRAPERS (validated by /scrape)
        limit: maximum number of recipes to scrape (also used as batch size)
        output_type: "json" for a local JSON array file, "mongo" for MongoDB
    '''
    s = SCRAPERS[site]()
    # Attach an embedder so scraped docs carry a "recipe_emb" vector built
    # from title + ingredients + instructions.
    s.embedder = vector_store_service._create_sentence_transformer_wrapper("sentence-transformers/all-MiniLM-L6-v2")
    s.embedding_fields = [(("title", "ingredients", "instructions"), "recipe_emb")]

    sink = None
    if output_type == "json":
        sink = JsonArraySink("./data/recipes_unified.json")
    elif output_type == "mongo":
        sink = MongoSink() if os.getenv("MONGODB_URI") else None
        if sink is None:
            # Fail fast with a clear job status instead of streaming into a
            # missing sink (previously this fell through and crashed opaquely).
            JOBS[job_id] = {"status": "error", "error": "MONGODB_URI not configured"}
            return

    stream_opts = StreamOptions(
        delay=0.3,
        # BUG FIX: the caller-supplied `limit` was previously passed as
        # batch_size while `limit` itself was hard-coded to 500, so the
        # /scrape request's limit was silently ignored. Honor it here.
        limit=limit,
        batch_size=limit,
        resume_file="recipes.resume",
        progress_callback=make_progress_cb(job_id),
    )
    try:
        JOBS[job_id] = {"status": "running", "count": 0}
        s.stream(sink=sink, options=stream_opts)
        JOBS[job_id]["status"] = "done"
    except Exception as e:
        JOBS[job_id] = {"status": "error", "error": str(e)}
|
| 162 |
+
|
| 163 |
+
def make_progress_cb(job_id: str):
    ''' Create a progress callback to update JOBS dict

    The returned callable receives the cumulative item count and writes it
    into JOBS[job_id]["count"].
    '''
    def _cb(n: int):
        # NOTE(review): assumes JOBS[job_id] was initialised by run_job before
        # the first callback fires — confirm the scraper's callback ordering.
        JOBS[job_id]["count"] = n
    return _cb
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
# super-lightweight in-memory job store (reset on restart)
# Annotation fixed: builtin `any` is a function, not a type; each value is a
# status dict such as {"status": "running", "count": 0}.
JOBS: dict[str, dict] = {}
|
| 176 |
+
|
| 177 |
+
@app.post("/scrape")
|
| 178 |
+
def scrape(body: ScrapeRequest, background: BackgroundTasks, x_api_key: str = Header(None)):
|
| 179 |
+
if body.site not in SCRAPERS:
|
| 180 |
+
raise HTTPException(status_code=400, detail="Unknown site")
|
| 181 |
+
|
| 182 |
+
job_id = f"{body.site}-{os.urandom(4).hex()}"
|
| 183 |
+
# use thread via BackgroundTasks to avoid blocking the request
|
| 184 |
+
background.add_task(run_job, job_id, body.site, body.limit, body.output_type)
|
| 185 |
+
return {"job_id": job_id, "status": "queued"}
|
| 186 |
+
|
| 187 |
+
@app.get("/jobs/{job_id}")
|
| 188 |
+
def job_status(job_id: str):
|
| 189 |
+
return JOBS.get(job_id, {"status": "unknown"})
|
| 190 |
+
|
| 191 |
+
@app.get("/jobs")
|
| 192 |
+
def list_jobs():
|
| 193 |
+
return JOBS
|
backend/config/__init__.py
ADDED
|
File without changes
|
backend/config/database.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Database and vector store configuration
|
| 2 |
+
import os
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
# Load environment variables from .env file
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
class DatabaseSettings:
    """Simple database settings class that reads environment variables directly.

    All values are resolved once at construction time from the process
    environment, falling back to the defaults shown below.
    """

    def __init__(self):
        env = os.getenv

        # ===========================================
        # Vector Store Configuration
        # ===========================================
        self.VECTOR_STORE_PROVIDER = env("VECTOR_STORE_PROVIDER", "chromadb")

        # ChromaDB Configuration
        self.DB_PATH = env("DB_PATH", "./data/chromadb")
        self.DB_COLLECTION_NAME = env("DB_COLLECTION_NAME", "recipes")
        self.DB_PERSIST_DIRECTORY = env("DB_PERSIST_DIRECTORY", "./data/chromadb_persist")
        self.DB_REFRESH_ON_START = env("DB_REFRESH_ON_START", "false").lower() == "true"

        # MongoDB Atlas Configuration
        self.MONGODB_URI = env("MONGODB_URI")
        self.MONGODB_DATABASE = env("MONGODB_DATABASE", "recipe_bot")
        self.MONGODB_COLLECTION = env("MONGODB_COLLECTION", "recipes")
        self.MONGODB_INDEX_NAME = env("MONGODB_INDEX_NAME", "vector_index")
        self.MONGODB_VECTOR_FIELD = env("MONGODB_VECTOR_FIELD", "embedding")
        self.MONGODB_TEXT_FIELD = env("MONGODB_TEXT_FIELD", "text")
        self.MONGODB_SIMILARITY_METRIC = env("MONGODB_SIMILARITY_METRIC", "cosine")
        self.MONGODB_NUM_CANDIDATES = int(env("MONGODB_NUM_CANDIDATES", "50"))

    def get_vector_store_config(self):
        """Get vector store configuration based on selected provider.

        Returns a provider-specific dict; raises ValueError for an unknown
        provider or when MongoDB is selected without a URI.
        """
        provider = self.VECTOR_STORE_PROVIDER

        if provider == "chromadb":
            return {
                "provider": "chromadb",
                "path": self.DB_PATH,
                "collection_name": self.DB_COLLECTION_NAME,
                "persist_directory": self.DB_PERSIST_DIRECTORY,
                "refresh_on_start": self.DB_REFRESH_ON_START,
            }

        if provider == "mongodb":
            if not self.MONGODB_URI:
                raise ValueError("MongoDB URI is required when using MongoDB Atlas as vector store")
            return {
                "provider": "mongodb",
                "uri": self.MONGODB_URI,
                "database": self.MONGODB_DATABASE,
                "collection_name": self.MONGODB_COLLECTION,
                "index_name": self.MONGODB_INDEX_NAME,
                "vector_field": self.MONGODB_VECTOR_FIELD,
                "text_field": self.MONGODB_TEXT_FIELD,
                "similarity_metric": self.MONGODB_SIMILARITY_METRIC,
                "num_candidates": self.MONGODB_NUM_CANDIDATES,
            }

        raise ValueError(f"Unsupported vector store provider: {provider}")
|
| 59 |
+
|
| 60 |
+
# Create global database settings instance (module-level singleton imported
# by the vector store service; values are frozen at import time)
db_settings = DatabaseSettings()
|
backend/config/logging_config.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Logging Configuration
|
| 2 |
+
import logging
|
| 3 |
+
import logging.handlers
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
def setup_logging(
    log_level: str = "INFO",
    log_file: Optional[str] = None,
    enable_console: bool = True,
    max_file_size: int = 10 * 1024 * 1024,  # 10MB
    backup_count: int = 5
) -> logging.Logger:
    """
    Setup centralized logging configuration

    Args:
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        log_file: Path to log file (optional)
        enable_console: Whether to enable console logging
        max_file_size: Maximum log file size in bytes
        backup_count: Number of backup files to keep

    Returns:
        Configured logger instance
    """
    level = getattr(logging, log_level.upper())

    # Ensure the log directory exists before the file handler opens it
    if log_file:
        Path(log_file).parent.mkdir(parents=True, exist_ok=True)

    # Configure the "recipe_bot" root logger, replacing any prior handlers
    logger = logging.getLogger("recipe_bot")
    logger.setLevel(level)
    logger.handlers.clear()

    formatter = logging.Formatter(
        fmt="%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Build the requested handlers, then configure them uniformly
    handlers: list[logging.Handler] = []
    if enable_console:
        handlers.append(logging.StreamHandler(sys.stdout))
    if log_file:
        handlers.append(
            logging.handlers.RotatingFileHandler(
                filename=log_file,
                maxBytes=max_file_size,
                backupCount=backup_count,
                encoding='utf-8'
            )
        )
    for handler in handlers:
        handler.setLevel(level)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    # Prevent duplicate logs from propagating to the real root logger
    logger.propagate = False

    return logger
|
| 70 |
+
|
| 71 |
+
def get_logger(name: str) -> logging.Logger:
    """Return a child logger of the "recipe_bot" root for the given name."""
    child_name = "recipe_bot." + name
    return logging.getLogger(child_name)
|
| 74 |
+
|
| 75 |
+
# Default logger setup
def setup_default_logging():
    """Configure the project-wide defaults: INFO level, console output, and a
    rotating file at ./logs/recipe_bot.log. Returns the configured logger."""
    return setup_logging(
        enable_console=True,
        log_file="./logs/recipe_bot.log",
        log_level="INFO",
    )
|
backend/config/settings.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration settings for the Recipe Recommendation Bot
|
| 2 |
+
import os
|
| 3 |
+
from typing import Optional, List
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
# Load environment variables from .env file
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
class Settings:
|
| 10 |
+
"""Simple settings class that reads environment variables directly"""
|
| 11 |
+
|
| 12 |
+
def __init__(self):
|
| 13 |
+
# ===========================================
|
| 14 |
+
# Server Configuration
|
| 15 |
+
# ===========================================
|
| 16 |
+
self.PORT = int(os.getenv("PORT", 8000))
|
| 17 |
+
self.HOST = os.getenv("HOST", "0.0.0.0")
|
| 18 |
+
self.ENVIRONMENT = os.getenv("ENVIRONMENT", "development")
|
| 19 |
+
self.DEBUG = os.getenv("DEBUG", "true").lower() == "true"
|
| 20 |
+
|
| 21 |
+
# ===========================================
|
| 22 |
+
# CORS Configuration
|
| 23 |
+
# ===========================================
|
| 24 |
+
cors_origins = os.getenv("CORS_ORIGINS", '["http://localhost:3000","http://localhost:5173","http://localhost:8080"]')
|
| 25 |
+
self.CORS_ORIGINS = self._parse_list(cors_origins)
|
| 26 |
+
self.CORS_ALLOW_CREDENTIALS = os.getenv("CORS_ALLOW_CREDENTIALS", "true").lower() == "true"
|
| 27 |
+
|
| 28 |
+
cors_methods = os.getenv("CORS_ALLOW_METHODS", '["GET","POST","PUT","DELETE","OPTIONS"]')
|
| 29 |
+
self.CORS_ALLOW_METHODS = self._parse_list(cors_methods)
|
| 30 |
+
|
| 31 |
+
cors_headers = os.getenv("CORS_ALLOW_HEADERS", '["*"]')
|
| 32 |
+
self.CORS_ALLOW_HEADERS = self._parse_list(cors_headers)
|
| 33 |
+
|
| 34 |
+
# ===========================================
|
| 35 |
+
# LLM & Embedding Provider Configuration
|
| 36 |
+
# ===========================================
|
| 37 |
+
self.LLM_PROVIDER = os.getenv("LLM_PROVIDER", "google")
|
| 38 |
+
self.EMBEDDING_PROVIDER = os.getenv("EMBEDDING_PROVIDER", self.LLM_PROVIDER) # Default to same as LLM
|
| 39 |
+
|
| 40 |
+
# OpenAI Configuration
|
| 41 |
+
self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 42 |
+
self.OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5-nano")
|
| 43 |
+
self.OPENAI_TEMPERATURE = float(os.getenv("OPENAI_TEMPERATURE", "0.7"))
|
| 44 |
+
self.OPENAI_MAX_TOKENS = int(os.getenv("OPENAI_MAX_TOKENS", "1000"))
|
| 45 |
+
|
| 46 |
+
# Google AI Configuration
|
| 47 |
+
self.GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
| 48 |
+
self.GOOGLE_MODEL = os.getenv("GOOGLE_MODEL", "gemini-2.5-flash")
|
| 49 |
+
self.GOOGLE_TEMPERATURE = float(os.getenv("GOOGLE_TEMPERATURE", "0.7"))
|
| 50 |
+
self.GOOGLE_MAX_TOKENS = int(os.getenv("GOOGLE_MAX_TOKENS", "1000"))
|
| 51 |
+
|
| 52 |
+
# Hugging Face Configuration
|
| 53 |
+
self.HUGGINGFACE_API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
|
| 54 |
+
self.HUGGINGFACE_MODEL = os.getenv("HUGGINGFACE_MODEL", "microsoft/DialoGPT-medium")
|
| 55 |
+
self.HUGGINGFACE_API_URL = os.getenv("HUGGINGFACE_API_URL", "https://api-inference.huggingface.co/models/")
|
| 56 |
+
self.HUGGINGFACE_USE_GPU = os.getenv("HUGGINGFACE_USE_GPU", "false").lower() == "true"
|
| 57 |
+
self.HUGGINGFACE_USE_API = os.getenv("HUGGINGFACE_USE_API", "false").lower() == "true"
|
| 58 |
+
|
| 59 |
+
# Ollama Configuration
|
| 60 |
+
self.OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
|
| 61 |
+
self.OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.1:8b")
|
| 62 |
+
self.OLLAMA_TEMPERATURE = float(os.getenv("OLLAMA_TEMPERATURE", "0.7"))
|
| 63 |
+
|
| 64 |
+
# ===========================================
|
| 65 |
+
# Embedding Model Configuration
|
| 66 |
+
# ===========================================
|
| 67 |
+
# Note: Embedding provider is determined by LLM_PROVIDER setting above
|
| 68 |
+
|
| 69 |
+
# OpenAI Embeddings
|
| 70 |
+
self.OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002")
|
| 71 |
+
|
| 72 |
+
# Google Embeddings
|
| 73 |
+
self.GOOGLE_EMBEDDING_MODEL = os.getenv("GOOGLE_EMBEDDING_MODEL", "models/embedding-001")
|
| 74 |
+
|
| 75 |
+
# Hugging Face Embeddings
|
| 76 |
+
self.HUGGINGFACE_EMBEDDING_MODEL = os.getenv("HUGGINGFACE_EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
| 77 |
+
|
| 78 |
+
# Ollama Embeddings
|
| 79 |
+
self.OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
|
| 80 |
+
|
| 81 |
+
# ===========================================
|
| 82 |
+
# Logging Configuration
|
| 83 |
+
# ===========================================
|
| 84 |
+
self.LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
|
| 85 |
+
self.LOG_FORMAT = os.getenv("LOG_FORMAT", "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
| 86 |
+
self.LOG_FILE = os.getenv("LOG_FILE", "./logs/app.log")
|
| 87 |
+
|
| 88 |
+
# ===========================================
|
| 89 |
+
# Langchain Debugging Configuration
|
| 90 |
+
# ===========================================
|
| 91 |
+
# Note: set to "true" to enable detailed Langchain logs
|
| 92 |
+
self.LANGCHAIN_DEBUG = os.getenv("LANGCHAIN_DEBUG", "false").lower() == "true"
|
| 93 |
+
|
| 94 |
+
def _parse_list(self, value: str) -> List[str]:
|
| 95 |
+
"""Parse a string representation of a list into an actual list"""
|
| 96 |
+
try:
|
| 97 |
+
# Remove brackets and quotes, split by comma
|
| 98 |
+
if value.startswith('[') and value.endswith(']'):
|
| 99 |
+
value = value[1:-1]
|
| 100 |
+
items = [item.strip().strip('"').strip("'") for item in value.split(',')]
|
| 101 |
+
return [item for item in items if item] # Remove empty items
|
| 102 |
+
except:
|
| 103 |
+
return ["*"] # Fallback to allow all
|
| 104 |
+
|
| 105 |
+
def get_llm_config(self):
|
| 106 |
+
"""Get LLM configuration based on selected provider"""
|
| 107 |
+
if self.LLM_PROVIDER == "openai":
|
| 108 |
+
return {
|
| 109 |
+
"provider": "openai",
|
| 110 |
+
"api_key": self.OPENAI_API_KEY,
|
| 111 |
+
"model": self.OPENAI_MODEL,
|
| 112 |
+
"temperature": self.OPENAI_TEMPERATURE,
|
| 113 |
+
"max_tokens": self.OPENAI_MAX_TOKENS
|
| 114 |
+
}
|
| 115 |
+
elif self.LLM_PROVIDER == "google":
|
| 116 |
+
return {
|
| 117 |
+
"provider": "google",
|
| 118 |
+
"api_key": self.GOOGLE_API_KEY,
|
| 119 |
+
"model": self.GOOGLE_MODEL,
|
| 120 |
+
"temperature": self.GOOGLE_TEMPERATURE,
|
| 121 |
+
"max_tokens": self.GOOGLE_MAX_TOKENS
|
| 122 |
+
}
|
| 123 |
+
elif self.LLM_PROVIDER == "huggingface":
|
| 124 |
+
return {
|
| 125 |
+
"provider": "huggingface",
|
| 126 |
+
"api_token": self.HUGGINGFACE_API_TOKEN,
|
| 127 |
+
"model": self.HUGGINGFACE_MODEL,
|
| 128 |
+
"api_url": self.HUGGINGFACE_API_URL,
|
| 129 |
+
"use_gpu": self.HUGGINGFACE_USE_GPU,
|
| 130 |
+
"use_api": self.HUGGINGFACE_USE_API
|
| 131 |
+
}
|
| 132 |
+
elif self.LLM_PROVIDER == "ollama":
|
| 133 |
+
return {
|
| 134 |
+
"provider": "ollama",
|
| 135 |
+
"base_url": self.OLLAMA_BASE_URL,
|
| 136 |
+
"model": self.OLLAMA_MODEL,
|
| 137 |
+
"temperature": self.OLLAMA_TEMPERATURE
|
| 138 |
+
}
|
| 139 |
+
else:
|
| 140 |
+
raise ValueError(f"Unsupported LLM provider: {self.LLM_PROVIDER}")
|
| 141 |
+
|
| 142 |
+
def get_embedding_config(self):
|
| 143 |
+
"""Get embedding configuration based on EMBEDDING_PROVIDER setting"""
|
| 144 |
+
provider = self.EMBEDDING_PROVIDER
|
| 145 |
+
|
| 146 |
+
if provider == "openai":
|
| 147 |
+
return {
|
| 148 |
+
"provider": "openai",
|
| 149 |
+
"api_key": self.OPENAI_API_KEY,
|
| 150 |
+
"model": self.OPENAI_EMBEDDING_MODEL
|
| 151 |
+
}
|
| 152 |
+
elif provider == "google":
|
| 153 |
+
return {
|
| 154 |
+
"provider": "google",
|
| 155 |
+
"api_key": self.GOOGLE_API_KEY,
|
| 156 |
+
"model": self.GOOGLE_EMBEDDING_MODEL
|
| 157 |
+
}
|
| 158 |
+
elif provider == "huggingface":
|
| 159 |
+
return {
|
| 160 |
+
"provider": "huggingface",
|
| 161 |
+
"model": self.HUGGINGFACE_EMBEDDING_MODEL
|
| 162 |
+
}
|
| 163 |
+
elif provider == "ollama":
|
| 164 |
+
return {
|
| 165 |
+
"provider": "ollama",
|
| 166 |
+
"base_url": self.OLLAMA_BASE_URL,
|
| 167 |
+
"model": self.OLLAMA_EMBEDDING_MODEL
|
| 168 |
+
}
|
| 169 |
+
else:
|
| 170 |
+
raise ValueError(f"Unsupported provider: {provider}. Supported providers: openai, google, huggingface, ollama")
|
| 171 |
+
|
| 172 |
+
# Create global settings instance.
# Module-level singleton: every importer of this module shares this one
# Settings object, so environment variables are read exactly once at import.
settings = Settings()

# Note: Vector store and database configuration is in database.py
# from config.database import db_settings
|
backend/core/__init__.py
ADDED
|
File without changes
|
backend/core/exceptions.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Custom exception handlers for the Recipe Recommendation Bot


class RecipeBotError(Exception):
    """Common base for all application-specific errors.

    Lets callers catch every bot failure with a single ``except
    RecipeBotError`` while still distinguishing the specific subclasses.
    Backward-compatible: existing ``except <Subclass>`` / ``except
    Exception`` handlers continue to work unchanged.
    """
    pass


class RecipeNotFoundError(RecipeBotError):
    """Raised when a recipe is not found"""
    pass


class LLMServiceError(RecipeBotError):
    """Raised when LLM service encounters an error"""
    pass


class VectorStoreError(RecipeBotError):
    """Raised when vector store operations fail"""
    pass

# TODO: Add more specific exception classes and error handlers
|
backend/data/__init__.py
ADDED
|
File without changes
|
backend/data/sample_recipes.json
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"title": "Mixed Seafood Coconut Fried Rice",
|
| 4 |
+
"ingredients": [
|
| 5 |
+
"jasmine rice",
|
| 6 |
+
"cooked shrimp",
|
| 7 |
+
"prawns",
|
| 8 |
+
"scallops",
|
| 9 |
+
"coconut milk",
|
| 10 |
+
"fish sauce",
|
| 11 |
+
"soy sauce",
|
| 12 |
+
"garlic",
|
| 13 |
+
"onion",
|
| 14 |
+
"ginger",
|
| 15 |
+
"green onions",
|
| 16 |
+
"cilantro",
|
| 17 |
+
"lime",
|
| 18 |
+
"vegetable oil",
|
| 19 |
+
"salt",
|
| 20 |
+
"pepper"
|
| 21 |
+
],
|
| 22 |
+
"instructions": "1. Heat vegetable oil in large pan over medium-high heat. 2. Add garlic, onion, and ginger, stir-fry until fragrant (about 1 minute). 3. Add cooked jasmine rice and mix well, breaking up any clumps. 4. Add shrimp, prawns, and scallops, cook until heated through (3-4 minutes). 5. Pour in coconut milk and season with fish sauce and soy sauce. 6. Stir everything together and cook for 2-3 minutes until well combined. 7. Garnish with chopped green onions and fresh cilantro. 8. Serve immediately with lime wedges on the side.",
|
| 23 |
+
"metadata": {
|
| 24 |
+
"cook_time": "25 minutes",
|
| 25 |
+
"difficulty": "medium",
|
| 26 |
+
"servings": "4",
|
| 27 |
+
"category": "seafood",
|
| 28 |
+
"image_url": "https://example.com/images/mixed-seafood-coconut-fried-rice.jpg"
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"title": "Classic Chicken Alfredo Pasta",
|
| 33 |
+
"ingredients": [
|
| 34 |
+
"fettuccine pasta",
|
| 35 |
+
"chicken breast",
|
| 36 |
+
"heavy cream",
|
| 37 |
+
"parmesan cheese",
|
| 38 |
+
"butter",
|
| 39 |
+
"garlic",
|
| 40 |
+
"italian seasoning",
|
| 41 |
+
"salt",
|
| 42 |
+
"pepper",
|
| 43 |
+
"olive oil",
|
| 44 |
+
"parsley"
|
| 45 |
+
],
|
| 46 |
+
"instructions": "1. Cook fettuccine pasta according to package directions, drain and set aside. 2. Season chicken breast with salt, pepper, and Italian seasoning. 3. Heat olive oil in large skillet over medium-high heat. 4. Cook chicken breast until golden brown and cooked through (6-7 minutes per side). 5. Remove chicken and slice into strips. 6. In same skillet, melt butter and add minced garlic, cook for 1 minute. 7. Add heavy cream and bring to gentle simmer. 8. Stir in grated Parmesan cheese until melted and smooth. 9. Add cooked pasta and chicken strips to sauce. 10. Toss everything together and garnish with fresh parsley.",
|
| 47 |
+
"metadata": {
|
| 48 |
+
"cook_time": "30 minutes",
|
| 49 |
+
"difficulty": "easy",
|
| 50 |
+
"servings": "4",
|
| 51 |
+
"category": "pasta",
|
| 52 |
+
"image_url": "https://example.com/images/chicken-alfredo-pasta.jpg"
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"title": "Vegetarian Black Bean Tacos",
|
| 57 |
+
"ingredients": [
|
| 58 |
+
"black beans",
|
| 59 |
+
"canned corn",
|
| 60 |
+
"tortillas",
|
| 61 |
+
"avocado",
|
| 62 |
+
"lime",
|
| 63 |
+
"red onion",
|
| 64 |
+
"cilantro",
|
| 65 |
+
"cumin",
|
| 66 |
+
"chili powder",
|
| 67 |
+
"garlic powder",
|
| 68 |
+
"salt",
|
| 69 |
+
"pepper",
|
| 70 |
+
"olive oil",
|
| 71 |
+
"lettuce",
|
| 72 |
+
"tomato",
|
| 73 |
+
"mexican cheese blend"
|
| 74 |
+
],
|
| 75 |
+
"instructions": "1. Drain and rinse black beans, drain corn. 2. Heat olive oil in skillet over medium heat. 3. Add black beans, corn, cumin, chili powder, garlic powder, salt, and pepper. 4. Cook for 5-7 minutes until heated through and flavors meld. 5. Warm tortillas in dry skillet or microwave. 6. Dice avocado, red onion, and tomato. 7. Squeeze lime juice over diced avocado to prevent browning. 8. Assemble tacos with bean mixture, lettuce, tomato, avocado, onion, and cheese. 9. Garnish with fresh cilantro and serve with lime wedges.",
|
| 76 |
+
"metadata": {
|
| 77 |
+
"cook_time": "15 minutes",
|
| 78 |
+
"difficulty": "easy",
|
| 79 |
+
"servings": "3",
|
| 80 |
+
"category": "vegetarian",
|
| 81 |
+
"image_url": "https://example.com/images/black-bean-tacos.jpg"
|
| 82 |
+
}
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"title": "Beef and Vegetable Stir Fry",
|
| 86 |
+
"ingredients": [
|
| 87 |
+
"beef sirloin",
|
| 88 |
+
"soy sauce",
|
| 89 |
+
"sesame oil",
|
| 90 |
+
"cornstarch",
|
| 91 |
+
"broccoli",
|
| 92 |
+
"bell peppers",
|
| 93 |
+
"carrots",
|
| 94 |
+
"snap peas",
|
| 95 |
+
"garlic",
|
| 96 |
+
"ginger",
|
| 97 |
+
"vegetable oil",
|
| 98 |
+
"rice vinegar",
|
| 99 |
+
"brown sugar",
|
| 100 |
+
"green onions",
|
| 101 |
+
"sesame seeds"
|
| 102 |
+
],
|
| 103 |
+
"instructions": "1. Slice beef sirloin into thin strips and marinate with soy sauce, sesame oil, and cornstarch for 15 minutes. 2. Cut broccoli into florets, slice bell peppers and carrots. 3. Heat vegetable oil in large wok or skillet over high heat. 4. Add marinated beef and stir-fry until browned (3-4 minutes). 5. Remove beef and set aside. 6. Add more oil if needed, then add garlic and ginger, stir for 30 seconds. 7. Add hard vegetables (carrots, broccoli) first, stir-fry for 2 minutes. 8. Add bell peppers and snap peas, stir-fry for another 2 minutes. 9. Return beef to pan, add rice vinegar and brown sugar. 10. Stir everything together for 1-2 minutes. 11. Garnish with green onions and sesame seeds.",
|
| 104 |
+
"metadata": {
|
| 105 |
+
"cook_time": "20 minutes",
|
| 106 |
+
"difficulty": "medium",
|
| 107 |
+
"servings": "4",
|
| 108 |
+
"category": "beef",
|
| 109 |
+
"image_url": "https://example.com/images/beef-vegetable-stir-fry.jpg"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"title": "Mediterranean Quinoa Salad",
|
| 114 |
+
"ingredients": [
|
| 115 |
+
"quinoa",
|
| 116 |
+
"cucumber",
|
| 117 |
+
"cherry tomatoes",
|
| 118 |
+
"red onion",
|
| 119 |
+
"kalamata olives",
|
| 120 |
+
"feta cheese",
|
| 121 |
+
"olive oil",
|
| 122 |
+
"lemon juice",
|
| 123 |
+
"oregano",
|
| 124 |
+
"salt",
|
| 125 |
+
"pepper",
|
| 126 |
+
"fresh mint",
|
| 127 |
+
"parsley"
|
| 128 |
+
],
|
| 129 |
+
"instructions": "1. Rinse quinoa thoroughly and cook according to package directions (usually 1:2 ratio with water). 2. Let cooked quinoa cool completely. 3. Dice cucumber, halve cherry tomatoes, and thinly slice red onion. 4. Pit and halve kalamata olives, crumble feta cheese. 5. In large bowl, combine cooled quinoa with prepared vegetables and olives. 6. Make dressing by whisking together olive oil, lemon juice, oregano, salt, and pepper. 7. Pour dressing over salad and toss well. 8. Add crumbled feta cheese and chopped fresh mint and parsley. 9. Toss gently and let sit for 15 minutes to allow flavors to meld. 10. Serve chilled or at room temperature.",
|
| 130 |
+
"metadata": {
|
| 131 |
+
"cook_time": "25 minutes",
|
| 132 |
+
"difficulty": "easy",
|
| 133 |
+
"servings": "6",
|
| 134 |
+
"category": "salad",
|
| 135 |
+
"image_url": "https://example.com/images/mediterranean-quinoa-salad.jpg"
|
| 136 |
+
}
|
| 137 |
+
}
|
| 138 |
+
]
|
backend/data_minning/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Data mining package initialization
|
backend/data_minning/all_nigerian_recipe_scraper.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Iterable, List, Optional
|
| 3 |
+
from urllib.parse import urljoin, urlparse
|
| 4 |
+
|
| 5 |
+
from bs4 import BeautifulSoup, NavigableString, Tag
|
| 6 |
+
from .dto.recipe_doc import RecipeDoc
|
| 7 |
+
from .base_scrapper import BaseRecipeScraper
|
| 8 |
+
from backend.utils.sanitization import clean
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AllNigerianRecipesScraper(BaseRecipeScraper):
    """Site-specific scraper for www.allnigerianrecipes.com.

    Discovers recipe pages from the site's HTML sitemap, then extracts a
    RecipeDoc per page: JSON-LD structured data first, with a fallback to
    heading-driven HTML parsing for ingredients, instructions, and notes.
    """

    # Recipe pages live at two-level paths like /beans/beans-porridge/;
    # the first path segment must be one of these known category slugs.
    ALLOWED_CATS = {
        "beans","soups","stews","salad","breakfast","rice",
        "yam","plantain","swallow","drinks","desserts","meat","fish", "chicken"
    }
    # Case-insensitive heading classifiers used by extract_recipe():
    PAT_ING = re.compile(r"\bingredients?\b|\bwhat you need\b|\bingredients? for\b", re.I)  # ingredient sections
    PAT_NOTEI = re.compile(r"\bnotes?\b.*ingredients?\b", re.I)  # "notes on ... ingredients" sections
    PAT_BEFORE= re.compile(r"\bbefore you (cook|grill|fry|bake|roast|steam|boil|prepare)\b", re.I)  # pre-cooking prep sections
    PAT_INSTR = re.compile(r"\b(preparation|directions|method|instructions|cooking|making)\b", re.I)  # instruction sections
    # Map first path segment (category) -> course
    COURSE_BY_CATEGORY = {
        "soups": "Soup",
        "stews": "Stew",
        "snacks": "Snack",
        "drinks": "Drink",
        "desserts": "Dessert",
        "breakfast": "Breakfast",
        "salad": "Salad",
        "rice": "Main Course",
        "beans": "Main Course",
        "yam": "Main Course",
        "plantain": "Main Course",
        "meat": "Main Course",
        "fish": "Main Course",
        "chicken": "Main Course",  # <-- add chicken
        "swallow": "Side Dish",
    }
    # Fallback keyword hints from URL/title if category isn't enough
    COURSE_BY_KEYWORD = [
        (r"\bsoup(s)?\b", "Soup"),
        (r"\bstew(s)?\b", "Stew"),
        (r"\bsalad(s)?\b", "Salad"),
        (r"\b(snack|small[-\s]?chops)\b", "Snack"),
        (r"\b(drink|juice|smoothie)\b", "Drink"),
        (r"\bbreakfast\b", "Breakfast"),
        (r"\bdessert(s)?\b", "Dessert"),
    ]

    def __init__(self, base_domain="www.allnigerianrecipes.com", index_url="https://www.allnigerianrecipes.com/other/sitemap/"):
        """index_url: the HTML sitemap page used as the URL-discovery seed."""
        super().__init__(base_domain)
        self.index_url = index_url

    def _is_two_level_recipe(self, url: str) -> bool:
        """Return True if `url` looks like a recipe page: exactly two path
        segments, the first an allowed category, and not a listing/asset URL."""
        sp = urlparse(url); segs = [s for s in sp.path.strip("/").split("/") if s]
        if len(segs) != 2: return False
        if segs[0].lower() not in self.ALLOWED_CATS: return False
        # Listing/meta pages that also have two-level paths.
        bad = {"page","tag","tags","category","categories","author","search"}
        if any(s in bad for s in segs): return False
        # Reject direct links to feeds/media assets.
        if sp.path.lower().endswith((".xml",".pdf",".zip",".jpg",".jpeg",".png",".webp",".mp4",".mov")):
            return False
        return True

    def discover_urls(self) -> Iterable[str]:
        """Yield unique same-domain recipe URLs found on the sitemap page."""
        soup = self.fetch_soup(self.index_url)
        seen = set()
        for a in soup.select("a[href]"):
            # Resolve relative hrefs against the sitemap URL.
            u = urljoin(self.index_url, a["href"])
            if self.same_domain(u) and self._is_two_level_recipe(u) and u not in seen:
                seen.add(u); yield u

    def _li_text_only(self, li: Tag) -> str:
        """Extract the text of a <li> excluding nested lists and iframes,
        so sub-lists are not flattened into their parent item."""
        parts=[]
        for ch in li.contents:
            if isinstance(ch, NavigableString): parts.append(str(ch))
            elif isinstance(ch, Tag) and ch.name not in ("ul","ol","iframe"):
                parts.append(ch.get_text(" ", strip=True))
        return clean(" ".join(parts)) or ""

    def _collect_after(self, h: Tag, categories = None) -> List[str]:
        """Collect text content of the siblings following heading `h`, stopping
        at the next heading of the same or higher level.

        When categories == 'ingredients', only <ul>/<ol> list items are kept
        (prose between headings is ignored); otherwise paragraphs, divs,
        blockquotes, and bare text nodes are collected as well.
        """
        lvl = int(h.name[1])  # heading level, e.g. 2 for "h2"
        out = []
        for sib in h.next_siblings:
            # A same-or-higher-level heading ends this section.
            if isinstance(sib, Tag) and sib.name in self.HEADING_TAGS and int(sib.name[1]) <= lvl:
                break
            if isinstance(sib, Tag):
                if categories == 'ingredients':
                    if sib.name in ("ul", "ol"):
                        lis = sib.find_all("li", recursive=False) or sib.find_all("li")
                        for li in lis:
                            t = self._li_text_only(li)
                            if t:
                                out.append(t)
                    # Skip other tags for ingredients
                else:
                    if sib.name in ("ul", "ol"):
                        lis = sib.find_all("li", recursive=False) or sib.find_all("li")
                        for li in lis:
                            t = self._li_text_only(li)
                            if t:
                                out.append(t)
                    elif sib.name in ("p", "div", "blockquote"):
                        t = clean(sib.get_text(" ", strip=True))
                        if t:
                            out.append(t)
            elif isinstance(sib, NavigableString):
                if categories != 'ingredients':  # Only add NavigableString if not ingredients
                    t = clean(str(sib))
                    if t:
                        out.append(t)
        # Drop video-embed captions like "Video of ...".
        return [x for x in out if not x.lower().startswith("video of ")]

    def extract_recipe(self, soup: BeautifulSoup, url: str, category: Optional[str] = None) -> RecipeDoc:
        """Build a RecipeDoc from a fetched page.

        Strategy: seed from JSON-LD when available, then scan headings in the
        page body to gather ingredient/instruction/note sections, flatten them
        into text blocks, and finally infer category/course from the URL.
        NOTE(review): the `category` parameter is accepted but unused here —
        confirm whether callers rely on it.
        """
        doc = RecipeDoc.make(url)
        # JSON-LD first
        j = self.extract_jsonld(soup)
        if j:
            # Copy only fields RecipeDoc actually declares.
            for k,v in j.items():
                if hasattr(doc, k): setattr(doc, k, v)
        # Title fallback
        if not doc.title:
            title = soup.find("h1") or soup.find("title")
            doc.title = clean(title.get_text()) if title else None

        # WordPress-style content container; fall back to the whole document.
        root = soup.select_one(".entry-content") or soup
        sections_ing, sections_instr, notes = [], [], []

        for h in root.find_all(self.HEADING_TAGS):
            ht = h.get_text(" ", strip=True) or ""
            # PAT_NOTEI is checked first because "notes about ingredients"
            # headings would otherwise also match PAT_ING.
            if self.PAT_NOTEI.search(ht):
                it = self._collect_after(h);
                if it: notes.append({"title": ht, "items": it})
            elif self.PAT_ING.search(ht):
                it = [x for x in self._collect_after(h, 'ingredients') if self._looks_like_ingredient(x)]
                if it: sections_ing.append({"title": ht, "items": it})
            elif self.PAT_BEFORE.search(ht) or self.PAT_INSTR.search(ht):
                st = self._collect_after(h);
                if st: sections_instr.append({"title": ht, "steps": st})

        # Flatten
        for s in sections_ing + notes:
            doc.ingredients.extend(s["items"])
        for s in sections_instr:
            doc.instructions.extend(s["steps"])

        ingredients_text = self._to_ingredients_text(doc.ingredients)
        instructions_text = self._to_instructions_text(doc.instructions)

        # Store as text (team requirement)
        doc.ingredients = ingredients_text
        doc.instructions = instructions_text
        # Notes & image
        if notes:
            # concatenate notes into a single field
            doc.notes = " ".join(["; ".join(n.get("items", [])) for n in notes if n.get("items")])
        if not doc.image_url:
            doc.image_url = clean(self.get_meta_image(soup))

        # Infer category (first path segment) and map to a course
        if not getattr(doc, "category", None) or not getattr(doc, "course", None):
            cat, crs = self._infer_category_and_course(url, doc.title)
            if not doc.category:
                doc.category = cat
            if not doc.course:
                doc.course = crs

        return doc

    def _looks_like_ingredient(self, text: str) -> bool:
        """Heuristic filter: short lines containing quantities/units or common
        ingredient words are treated as ingredient entries."""
        if len(text.split()) > 15: return False
        # Match numbers followed by units (with or without space), or common ingredient keywords
        return bool(re.search(r"\b(\d+\s?(cup|cups|tsp|tbsp|teaspoon|tablespoon|g|kg|ml|l|gram)?|cup|cups|tsp|tbsp|teaspoon|tablespoon|g|kg|ml|l|gram|salt|pepper|onion|oil|water|rice|groundnuts|tamarind)\b", text, re.I))

    @staticmethod
    def _slug_to_title(slug: Optional[str]) -> Optional[str]:
        """Convert a URL slug such as 'jollof-rice' to 'Jollof Rice'."""
        return slug.replace("-", " ").title() if slug else None

    def _infer_category_and_course(self, url: str, title: Optional[str]) -> tuple[Optional[str], Optional[str]]:
        """Derive (category, course) from the URL path's first segment, then
        fall back to keyword matching against the URL + title, and finally to
        'Main Course' when a category exists but no course could be mapped."""
        sp = urlparse(url)
        segs = [s for s in sp.path.strip("/").split("/") if s]
        cat_slug = segs[0] if segs else None
        category = self._slug_to_title(cat_slug) if cat_slug else None

        course = self.COURSE_BY_CATEGORY.get(cat_slug)
        if not course:
            hay = f"{url} {(title or '')}".lower()
            for pat, crs in self.COURSE_BY_KEYWORD:
                if re.search(pat, hay):
                    course = crs
                    break
        if not course and category:
            course = "Main Course"
        return category, course
|
backend/data_minning/base_scrapper.py
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import asdict
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
import io
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
import time
|
| 7 |
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from click import Tuple
|
| 10 |
+
from pymongo import MongoClient, UpdateOne, errors
|
| 11 |
+
|
| 12 |
+
from .dto.stream_opts import StreamOptions
|
| 13 |
+
|
| 14 |
+
from .dto.recipe_doc import RecipeDoc
|
| 15 |
+
|
| 16 |
+
from .soup_client import SoupClient
|
| 17 |
+
from backend.utils.sanitization import clean
|
| 18 |
+
from bs4 import BeautifulSoup
|
| 19 |
+
from backend.config.database import db_settings
|
| 20 |
+
|
| 21 |
+
class JsonArraySink:
    """
    Append-safe JSON array writer.
    - Creates file with `[` ... `]`
    - If file exists, removes trailing `]`, appends items, and re-closes.

    BUG FIX vs. original: the closing `]` was stripped only once (guarded by
    ``self._opened``), so a second ``write_many()`` on the same sink appended
    items *after* the bracket written by the first batch, corrupting the file.
    The bracket-stripping now runs before every batch.
    """
    def __init__(self, path: str):
        self.path = path
        self._opened = False   # True once the file handle is open
        self._first = True     # True until the first element is written
        self.f = None          # lazily-opened r+ text handle

    def _ensure_file(self):
        """Open self.path for read/write, creating it as an empty array first."""
        if self._opened:
            return
        if not os.path.exists(self.path):
            with open(self.path, "w", encoding="utf-8") as f:
                f.write("[\n]")
        self.f = open(self.path, "r+", encoding="utf-8")
        self._opened = True

    def _strip_closing_bracket(self):
        """Locate and truncate the final ']' so items can be appended, and
        recompute whether the array is currently empty (sets self._first).

        NOTE(review): offsets are computed by reading characters from a
        text-mode handle; safe for ASCII JSON but suspect for non-ASCII
        content (json.dumps here uses ensure_ascii=False) — confirm.
        """
        # Scan backwards in chunks for the last ']' in the file.
        self.f.seek(0, io.SEEK_END)
        end = self.f.tell()
        step = min(4096, end)
        pos = end
        last_bracket = -1
        while pos > 0:
            pos = max(0, pos - step)
            self.f.seek(pos)
            chunk = self.f.read(step)
            j = chunk.rfind("]")
            if j != -1:
                last_bracket = pos + j
                break

        if last_bracket == -1:
            # Corrupt file (no closing bracket): reset to an empty array.
            self.f.seek(0); self.f.truncate(0); self.f.write("[\n]"); self.f.flush()
            last_bracket = 2  # index of ']' in "[\n]"

        # Decide "is first item?" by inspecting the content BEFORE the ']':
        # an empty array has only '[' (plus whitespace) ahead of it.
        self.f.seek(0)
        prefix = self.f.read(last_bracket).strip()
        self._first = (prefix == "[")

        # Remove the closing ']' so we can append in place.
        self.f.seek(last_bracket)
        self.f.truncate()

    def _prepare(self):
        # Kept for backward compatibility with the original private API.
        self._ensure_file()
        self._strip_closing_bracket()

    def write_many(self, docs: List[Dict[str, Any]]):
        """Append `docs` as elements of the on-disk JSON array and re-close it."""
        if not docs:
            return
        self._ensure_file()
        # Strip the trailing ']' before EVERY batch (see class docstring).
        self._strip_closing_bracket()

        for d in docs:
            if not self._first:
                self.f.write(",\n")
            else:
                # First item: no leading comma.
                self._first = False
            self.f.write(json.dumps(d, ensure_ascii=False, indent=2, default=str))

        # Restore the closing bracket so the file is valid JSON after each batch.
        self.f.write("\n]")
        self.f.flush()

    def close(self):
        """Close the underlying handle; the file is already valid JSON."""
        if self.f:
            self.f.close()
            self._opened = False
|
| 97 |
+
|
| 98 |
+
class MongoSink:
    """Batch sink that upserts scraped recipe documents into MongoDB.

    Connection details come from ``db_settings.get_vector_store_config()``;
    documents are keyed (and deduplicated) by their ``url`` field.
    """
    def __init__(self, ):
        db_config = db_settings.get_vector_store_config()
        # 10s server-selection timeout: fail fast when the DB is unreachable.
        self.client = MongoClient(db_config["uri"], retryWrites=True, serverSelectionTimeoutMS=10000)
        self.col = self.client[db_config["database"]][db_config["collection_name"]]
        self._ensure_indexes()

    def _ensure_indexes(self):
        # `url` is the natural key for upserts; the rest support lookups.
        # create_index is idempotent, so this is safe on every startup.
        self.col.create_index("url", unique=True)
        self.col.create_index("title")
        self.col.create_index("category")
        self.col.create_index("scraped_at")

    def write_many(self, docs: List[Dict[str, Any]]):
        """Upsert a batch of docs by URL.

        Sets ``scraped_at`` (if missing) on every write and ``created_at``
        only on first insert. Bulk-write errors are deliberately ignored:
        the bulk is unordered, so isolated failures don't block the batch.
        """
        if not docs: return
        ops = []
        # NOTE(review): naive UTC timestamp (datetime.utcnow() is deprecated);
        # datetime.now(timezone.utc) preferred — confirm downstream comparisons.
        now = datetime.utcnow()
        for d in docs:
            d = d.copy()  # avoid mutating the caller's dicts
            d.setdefault("scraped_at", now)
            ops.append(UpdateOne({"url": d["url"]}, {"$set": d, "$setOnInsert": {"created_at": now}}, upsert=True))
        try:
            self.col.bulk_write(ops, ordered=False)
        except errors.BulkWriteError as e:
            # duplicates or minor issues won't halt unordered bulk
            pass

    def close(self):
        # Release the connection pool.
        self.client.close()
|
| 127 |
+
|
| 128 |
+
# TODO: add a search function that fetches recipes by title or ingredients from
# the stored embeddings, given the embedding fields, database, and collection.
|
| 129 |
+
|
| 130 |
+
class DualSink:
    """Fan-out sink that forwards each batch to an optional JSON-file sink
    and an optional Mongo sink; either may be None."""
    def __init__(self, json_sink: Optional["JsonArraySink"], mongo_sink: Optional["MongoSink"]):
        self.json = json_sink
        self.mongo = mongo_sink

    def write_many(self, docs: List[Dict[str, Any]]):
        """Forward `docs` to every configured sink."""
        if self.json: self.json.write_many(docs)
        # BUG FIX: was `self.mongo.upsert_batch(docs)` — MongoSink defines no
        # `upsert_batch` method (its batch writer is `write_many`), so any run
        # with a Mongo sink raised AttributeError on the first batch.
        if self.mongo: self.mongo.write_many(docs)

    def close(self):
        """Close every configured sink."""
        if self.json: self.json.close()
        if self.mongo: self.mongo.close()
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
class BaseRecipeScraper(SoupClient):
|
| 143 |
+
HEADING_TAGS = ("h1","h2","h3","h4","h5","h6")
|
| 144 |
+
|
| 145 |
+
    def __init__(
        self,
        *args,
        embedder=None,
        embedding_fields=None,
        **kwargs
    ):
        """Initialize the scraper base.

        Positional/keyword args are forwarded unchanged to SoupClient.

        embedder: HFEmbedder(), optional
        embedding_fields: list of (source_field, target_field) like:
            [("title", "title_emb"), ("instructions_text", "instr_emb")]
        """
        super().__init__(*args, **kwargs)
        self.embedder = embedder
        # Default to an empty list so iteration is always safe.
        self.embedding_fields = embedding_fields or []
        # Alias for the inherited logger (presumably set by SoupClient — confirm).
        self.logger = self.log
|
| 161 |
+
|
| 162 |
+
    def extract_jsonld(self, soup: BeautifulSoup) -> Optional[Dict[str, Any]]:
        """Extract the first schema.org Recipe node from the page's JSON-LD.

        Walks every <script type="application/ld+json"> tag, tolerating both
        top-level lists and `@graph` containers, and returns a RecipeDoc (as
        a dict) built from the first node whose @type includes "Recipe".
        Returns None when no usable Recipe node is found.
        """
        # JSON-LD fields may be a scalar or a list; normalize to list.
        def to_list(x): return x if isinstance(x, list) else [x]
        for tag in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(tag.string or "{}")
            except Exception:
                # Malformed JSON-LD block: skip it and keep scanning.
                continue
            # Candidate nodes: @graph entries, a bare dict, or a top-level list.
            nodes = (data.get("@graph", [data]) if isinstance(data, dict)
                     else (data if isinstance(data, list) else []))
            for n in nodes:
                if not isinstance(n, dict): continue
                t = n.get("@type")
                # @type may be a string or a list of type names.
                if t == "Recipe" or (isinstance(t, list) and "Recipe" in t):
                    doc = RecipeDoc()
                    doc.title = clean(n.get("name"))
                    # ingredients: entries may be plain strings or objects
                    # with "name"/"text" keys.
                    ings = []
                    for ing in to_list(n.get("recipeIngredient") or []):
                        if isinstance(ing, dict):
                            ings.append(clean(ing.get("name") or ing.get("text")))
                        else:
                            ings.append(clean(str(ing)))
                    doc.ingredients = [x for x in ings if x]
                    # instructions: HowToStep objects carry "text"/"name";
                    # otherwise treat the entry as a plain string.
                    steps = []
                    for st in to_list(n.get("recipeInstructions") or []):
                        if isinstance(st, dict):
                            steps.append(clean(st.get("text") or st.get("name")))
                        else:
                            steps.append(clean(str(st)))
                    doc.instructions = [x for x in steps if x]

                    doc.servings = n.get("recipeYield")
                    # image may be an ImageObject dict, a list of URLs, or a URL string.
                    doc.image_url = clean((n.get("image") or {}).get("url") if isinstance(n.get("image"), dict) else (n.get("image")[0] if isinstance(n.get("image"), list) else n.get("image")))
                    # Only accept string-valued category/cuisine; lists are dropped.
                    doc.course = clean(n.get("recipeCategory")) if isinstance(n.get("recipeCategory"), str) else None
                    doc.cuisine = clean(n.get("recipeCuisine")) if isinstance(n.get("recipeCuisine"), str) else None
                    return asdict(doc)
        return None
|
| 200 |
+
|
| 201 |
+
@staticmethod
|
| 202 |
+
def _dedupe_preserve_order(items: List[str]) -> List[str]:
|
| 203 |
+
seen = set()
|
| 204 |
+
out = []
|
| 205 |
+
for x in items:
|
| 206 |
+
x = clean(x)
|
| 207 |
+
if not x or x in seen:
|
| 208 |
+
continue
|
| 209 |
+
seen.add(x); out.append(x)
|
| 210 |
+
return out
|
| 211 |
+
|
| 212 |
+
@staticmethod
|
| 213 |
+
def _to_ingredients_text(items: List[str]) -> str:
|
| 214 |
+
"""
|
| 215 |
+
Turn ingredient bullets into a single text block.
|
| 216 |
+
Using one-per-line is great for embeddings and human readability.
|
| 217 |
+
"""
|
| 218 |
+
items = [clean(x) for x in items if x]
|
| 219 |
+
items = BaseRecipeScraper._dedupe_preserve_order(items)
|
| 220 |
+
return "\n".join(f"- {x}" for x in items)
|
| 221 |
+
|
| 222 |
+
@staticmethod
|
| 223 |
+
def _to_instructions_text(steps: List[str]) -> str:
|
| 224 |
+
"""
|
| 225 |
+
Turn ordered steps into a single text block.
|
| 226 |
+
Numbered paragraphs help embeddings keep sequence context.
|
| 227 |
+
"""
|
| 228 |
+
steps = [clean(x) for x in steps if x]
|
| 229 |
+
steps = BaseRecipeScraper._dedupe_preserve_order(steps)
|
| 230 |
+
return "\n\n".join(f"{i}. {s}" for i, s in enumerate(steps, 1))
|
| 231 |
+
    # site-specific scrapers override these two:
    def discover_urls(self) -> Iterable[str]:
        """Yield candidate recipe URLs for this site; implemented by subclasses."""
        raise NotImplementedError
|
| 234 |
+
    def extract_recipe(self, soup: BeautifulSoup, url: str, category: Optional[str] = None) -> RecipeDoc:
        """Parse one fetched page into a RecipeDoc; implemented by subclasses."""
        raise NotImplementedError
|
| 236 |
+
|
| 237 |
+
# shared streaming loop
|
| 238 |
+
def stream(self, sink: DualSink, options: Optional[StreamOptions] = None) -> int:
|
| 239 |
+
opts = options or StreamOptions()
|
| 240 |
+
self.log.info(
|
| 241 |
+
f"Starting stream: limit={opts.limit} batch_size={opts.batch_size} "
|
| 242 |
+
f"resume_file={opts.resume_file} sink={type(sink).__name__}"
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
processed = set()
|
| 246 |
+
if opts.resume_file:
|
| 247 |
+
resume_path = Path("data") / opts.resume_file # <-- not ../data
|
| 248 |
+
print(resume_path, 'resume_path')
|
| 249 |
+
if resume_path.exists(): # <-- open only if it exists
|
| 250 |
+
with resume_path.open("r", encoding="utf-8") as f:
|
| 251 |
+
processed = {line.strip() for line in f if line.strip()}
|
| 252 |
+
else:
|
| 253 |
+
processed = set()
|
| 254 |
+
self.log.info(f"[resume] {len(processed)} URLs already done")
|
| 255 |
+
|
| 256 |
+
batch, saved = [], 0
|
| 257 |
+
try:
|
| 258 |
+
for i, url in enumerate(self.discover_urls(), 1):
|
| 259 |
+
if opts.limit and i > opts.limit: break
|
| 260 |
+
if not self.same_domain(url): continue
|
| 261 |
+
if url in processed: continue
|
| 262 |
+
|
| 263 |
+
try:
|
| 264 |
+
soup = self.fetch_soup(url)
|
| 265 |
+
doc = self.extract_recipe(soup, url)
|
| 266 |
+
doc.finalize()
|
| 267 |
+
batch.append(asdict(doc))
|
| 268 |
+
except Exception as e:
|
| 269 |
+
self.log.warning(f"[skip] {url} -> {e}")
|
| 270 |
+
|
| 271 |
+
if opts.resume_file:
|
| 272 |
+
resume_path = Path("data") / opts.resume_file
|
| 273 |
+
with open(resume_path, "a", encoding="utf-8") as rf:
|
| 274 |
+
rf.write(url + "\n")
|
| 275 |
+
|
| 276 |
+
if len(batch) >= opts.batch_size:
|
| 277 |
+
self._apply_embeddings(batch)
|
| 278 |
+
sink.write_many(batch); saved += len(batch); batch = []
|
| 279 |
+
if opts.progress_callback: opts.progress_callback(saved)
|
| 280 |
+
self.log.info(f"[resume] {saved} URLs already done 1")
|
| 281 |
+
|
| 282 |
+
if i % 25 == 0:
|
| 283 |
+
self.log.info(f"…processed {i}, saved {saved}")
|
| 284 |
+
|
| 285 |
+
time.sleep(opts.delay)
|
| 286 |
+
|
| 287 |
+
if batch:
|
| 288 |
+
self._apply_embeddings(batch)
|
| 289 |
+
sink.write_many(batch); saved += len(batch)
|
| 290 |
+
if opts.progress_callback: opts.progress_callback(saved)
|
| 291 |
+
self.log.info(f"[resume] {saved} URLs already done2 ")
|
| 292 |
+
finally:
|
| 293 |
+
sink.close()
|
| 294 |
+
|
| 295 |
+
self.log.info(f"[done] saved {saved}")
|
| 296 |
+
return saved
|
| 297 |
+
|
| 298 |
+
@staticmethod
|
| 299 |
+
def _field_to_text(val: Any) -> str:
|
| 300 |
+
if isinstance(val, list):
|
| 301 |
+
return "\n".join(str(x) for x in val)
|
| 302 |
+
if val is None:
|
| 303 |
+
return ""
|
| 304 |
+
return str(val)
|
| 305 |
+
|
| 306 |
+
def _gather_text(self, doc: Dict[str, Any], src: Any) -> str:
|
| 307 |
+
if isinstance(src, tuple):
|
| 308 |
+
parts: List[str] = []
|
| 309 |
+
for f in src:
|
| 310 |
+
t = self._field_to_text(doc.get(f))
|
| 311 |
+
if t:
|
| 312 |
+
# Optional: label sections to help the embedder
|
| 313 |
+
label = "Ingredients" if f == "ingredients" else ("Instructions" if f == "instructions" else f)
|
| 314 |
+
parts.append(f"{label}:\n{t}")
|
| 315 |
+
return "\n\n".join(parts)
|
| 316 |
+
else:
|
| 317 |
+
return self._field_to_text(doc.get(src))
|
| 318 |
+
|
| 319 |
+
def _apply_embeddings(self, batch: List[Dict[str, Any]]) -> None:
|
| 320 |
+
"""
|
| 321 |
+
Applies embeddings to specified fields in a batch of documents.
|
| 322 |
+
|
| 323 |
+
For each (source_field, destination_field) pair in `self.embedding_fields`, this method:
|
| 324 |
+
- Extracts the value from `source_field` in each document of the batch.
|
| 325 |
+
- Converts the value to a string. If the value is a list, joins its elements with newlines.
|
| 326 |
+
- Handles `None` values by converting them to empty strings.
|
| 327 |
+
- Uses `self.embedder.encode` to generate embeddings for the processed texts.
|
| 328 |
+
- Stores the resulting embedding vector in `destination_field` of each document.
|
| 329 |
+
|
| 330 |
+
If `self.embedder`, `self.embedding_fields`, or `batch` is not set or empty, the method returns immediately.
|
| 331 |
+
|
| 332 |
+
Args:
|
| 333 |
+
batch (List[Dict[str, Any]]): A list of documents to process, where each document is a dictionary.
|
| 334 |
+
|
| 335 |
+
Returns:
|
| 336 |
+
None
|
| 337 |
+
"""
|
| 338 |
+
if not self.embedder or not self.embedding_fields or not batch:
|
| 339 |
+
return
|
| 340 |
+
try:
|
| 341 |
+
for src_spec, dst_field in self.embedding_fields:
|
| 342 |
+
texts = [ self._gather_text(doc, src_spec) for doc in batch ]
|
| 343 |
+
embs = self.embedder.encode(texts)
|
| 344 |
+
for document, vec in zip(batch, embs):
|
| 345 |
+
document[dst_field] = vec
|
| 346 |
+
except Exception as e:
|
| 347 |
+
self.logger.warning(f"[stream error]: {e}")
|
| 348 |
+
|
backend/data_minning/dto/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Services package initialization
|
backend/data_minning/dto/recipe_doc.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, List, Dict, Any
|
| 2 |
+
from dataclasses import dataclass, asdict, field
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from urllib.parse import urlparse
|
| 5 |
+
|
| 6 |
+
@dataclass
class RecipeDoc:
    """A scraped recipe in a storage-ready shape.

    Every field defaults to None/empty so site scrapers can fill in only
    what a page actually provides; ``finalize`` normalizes the result.
    """
    title: Optional[str] = None
    url: Optional[str] = None
    source: Optional[str] = None          # hostname the recipe came from
    category: Optional[str] = None
    ingredients: List[str] = field(default_factory=list)
    instructions: List[str] = field(default_factory=list)
    prep_time: Optional[str] = None
    cook_time: Optional[str] = None
    total_time: Optional[str] = None
    servings: Optional[Any] = None        # sites report ints or free text
    calories: Optional[float] = None
    rating: Optional[float] = None
    rating_count: Optional[int] = None
    course: Optional[str] = None
    cuisine: Optional[str] = None
    notes: Optional[str] = None
    image_url: Optional[str] = None
    needs_review: Optional[bool] = None   # set by finalize() when content is missing
    section: List[Dict[str, Any]] = field(default_factory=list)
    # internal
    scraped_at: Optional[datetime] = None

    @staticmethod
    def make(url: str) -> "RecipeDoc":
        """Create a doc pre-populated with url, source host and scrape time."""
        return RecipeDoc(
            url=url,
            source=urlparse(url).netloc,
            scraped_at=datetime.utcnow(),
        )

    def finalize(self):
        """Flag incomplete docs and normalize empty values.

        Marks ``needs_review`` when ingredients or instructions are missing,
        keeps the known list-typed fields as empty lists, and converts empty
        strings to None.
        """
        # mark needs_review conservatively
        self.needs_review = bool((len(self.ingredients) < 1) or (len(self.instructions) < 1))
        for name, value in list(asdict(self).items()):
            if isinstance(value, list) and not value:
                setattr(self, name, [] if name in ("ingredients", "instructions", "section") else None)
            elif value == "":
                setattr(self, name, None)
|
| 45 |
+
|
backend/data_minning/dto/stream_opts.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import Callable, Optional
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass
class StreamOptions:
    """Tuning knobs for BaseRecipeScraper.stream."""
    delay: float = 0.3                # seconds slept between page fetches (politeness)
    limit: Optional[int] = None       # stop after this many discovered URLs (None = no cap)
    batch_size: int = 50              # documents buffered before each sink write
    resume_file: Optional[str] = None # file (under ./data) listing already-processed URLs
    # progress gets total_saved so far
    progress_callback: Optional[Callable[[int], None]] = None
|
backend/data_minning/soup_client.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Optional
|
| 3 |
+
from urllib.parse import urlparse
|
| 4 |
+
|
| 5 |
+
import requests
|
| 6 |
+
from bs4 import BeautifulSoup
|
| 7 |
+
|
| 8 |
+
class SoupClient:
    """Thin requests + BeautifulSoup wrapper scoped to a single domain."""

    def __init__(self, base_domain: str, user_agent: str = None):
        self.base_domain = base_domain
        self.base_url = f"https://{base_domain}"
        self.session = requests.Session()
        default_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
        )
        self.session.headers.update({"User-Agent": user_agent or default_agent})
        self.log = logging.getLogger(self.__class__.__name__)

    def same_domain(self, url: str) -> bool:
        """True when *url*'s host matches the configured domain (www-insensitive)."""
        host = urlparse(url).netloc.lower()
        base = self.base_domain.lower()
        return host in (base, base.replace("www.", ""))

    def fetch_soup(self, url: str, timeout: int = 30) -> BeautifulSoup:
        """GET *url* with the shared session and parse the body with lxml.

        Raises requests.HTTPError for non-2xx responses.
        """
        response = self.session.get(url, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.content, "lxml")

    def close(self):
        """Release the underlying HTTP session."""
        self.session.close()

    def get_meta_image(self, soup: BeautifulSoup) -> Optional[str]:
        """Best-effort page image: og:/twitter: meta tags, then the first <img>."""
        meta_selectors = (
            "meta[property='og:image']",
            "meta[name='og:image']",
            "meta[name='twitter:image']",
            "meta[property='twitter:image']",
        )
        for selector in meta_selectors:
            tag = soup.select_one(selector)
            if tag and tag.get("content"):
                return tag["content"]
        img = soup.find("img")
        return (img.get("src") or img.get("data-src")) if img else None
|
backend/data_minning/yummy_medley_scraper.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Iterable, Optional
|
| 3 |
+
from urllib.parse import urljoin, urlparse
|
| 4 |
+
from bs4 import BeautifulSoup
|
| 5 |
+
from .base_scrapper import BaseRecipeScraper, RecipeDoc
|
| 6 |
+
from backend.utils.sanitization import clean
|
| 7 |
+
|
| 8 |
+
class YummyMedleyScraper(BaseRecipeScraper):
    """Scraper for yummymedley.com targeting WPRM (WP Recipe Maker) blocks."""

    # Only collect tags from the home page tag-cloud widget
    TAG_CLOUD_SELECTORS = [
        "#tag_cloud-4 .tagcloud a[href*='/tag/']",
        "div.widget_tag_cloud .tagcloud a[href*='/tag/']",
    ]

    # How we find post links on a tag page (grid cards & headers)
    POST_LINK_SELECTORS = [
        "#main ul.sp-grid li article .post-header a[href]",  # header link
        "#main ul.sp-grid li article .post-img a[href]",     # image link
        "#main article .post-header a[href]",                # fallback
    ]

    TAG_RE = re.compile(r"/tag/[^/]+/?$")

    def __init__(self, base_domain="www.yummymedley.com"):
        super().__init__(base_domain)

    def _discover_tag_urls_from_home(self) -> list[str]:
        """Collect tag-archive URLs from the home-page tag cloud widget."""
        soup = self.fetch_soup(self.base_url)
        tags = set()

        # prefer the strict selectors; gracefully fall back
        anchors = []
        for sel in self.TAG_CLOUD_SELECTORS:
            anchors = soup.select(sel)
            if anchors:
                break
        if not anchors:
            # final fallback: any /tag/... link anywhere on the home page
            anchors = soup.find_all("a", href=self.TAG_RE)

        for a in anchors:
            href = a.get("href")
            if href:
                tags.add(urljoin(self.base_url, href))
        return sorted(tags)

    def _extract_post_links_from_tag_page(self, soup: BeautifulSoup) -> set[str]:
        """Pull article links from one tag-archive page, selector by selector."""
        links = set()
        for sel in self.POST_LINK_SELECTORS:
            for a in soup.select(sel):
                href = a.get("href")
                if href:
                    links.add(urljoin(self.base_url, href))
            if links:
                break  # got some via this selector
        return links

    def discover_urls(self) -> Iterable[str]:
        """Yield unique article URLs by walking each tag archive (paginated)."""
        tags = self._discover_tag_urls_from_home()
        if not tags:
            # Safety: if tag cloud not found, fall back to /recipes/
            self.logger.warning("No tags discovered from home page tag cloud; falling back to /recipes/")
            tags = [urljoin(self.base_url, "/recipes/")]

        seen = set()
        for tag_url in tags:
            page = 1
            while page <= 20:  # hard cap to avoid runaway pagination
                url = tag_url if page == 1 else f"{tag_url.rstrip('/')}/page/{page}/"
                try:
                    soup = self.fetch_soup(url)
                except Exception as e:
                    self.logger.warning(f"[tag] fetch failed {url}: {e}")
                    break

                post_links = self._extract_post_links_from_tag_page(soup)
                if not post_links:
                    # no posts found -> stop paginating this tag
                    break

                for u in sorted(post_links):
                    if u not in seen and self._looks_like_article(u):
                        seen.add(u)
                        yield u

                # pagination: look for 'next' or 'older'
                next_link = (
                    soup.find("a", string=re.compile(r"next|older", re.I)) or
                    soup.find("a", rel="next")
                )
                if not next_link:
                    break
                page += 1

    def _looks_like_article(self, u: str) -> bool:
        """Heuristic filter: same-domain post pages, not archives or assets."""
        sp = urlparse(u)
        if not self.same_domain(u):
            return False
        if re.search(r"/(tag|category|author|page|feed)/", sp.path, re.I):
            return False
        if sp.path.endswith((".xml", ".jpg", ".png", ".pdf", ".webp", ".zip")):
            return False
        segs = [s for s in sp.path.strip("/").split("/") if s]
        return 1 <= len(segs) <= 3

    def extract_recipe(self, soup: BeautifulSoup, url: str, category: Optional[str] = None) -> RecipeDoc:
        """Build a RecipeDoc from JSON-LD plus the page's WPRM recipe block.

        NOTE: ingredients/instructions are converted to single text blocks
        before returning (team requirement), even though RecipeDoc declares
        them as lists — downstream consumers expect text here.
        """
        doc = RecipeDoc.make(url)
        # JSON-LD first (many WPRM pages also embed it)
        j = self.extract_jsonld(soup)
        if j:
            for k, v in j.items():
                if not hasattr(doc, k):
                    continue
                # skip empty values from JSON-LD
                if v in (None, "", [], {}):
                    continue
                # never overwrite an already-set url/source
                if k in ("url", "source") and getattr(doc, k):
                    continue
                setattr(doc, k, v)
        # WPRM block
        w = soup.find("div", class_="wprm-recipe-container")
        if w:
            if not doc.title:
                t = w.find(class_="wprm-recipe-name")
                doc.title = clean(t.get_text()) if t else doc.title
            # image
            if not doc.image_url:
                img = w.find("img")
                doc.image_url = clean(img.get("src") or img.get("data-src")) if img else clean(self.get_meta_image(soup))
            # rating — only number-parse failures are expected here, so catch
            # ValueError instead of a bare except (which also swallowed
            # KeyboardInterrupt/SystemExit)
            r = w.find(class_="wprm-recipe-rating-average")
            if r:
                try:
                    doc.rating = float(r.get_text().strip())
                except ValueError:
                    pass
            rc = w.find(class_="wprm-recipe-rating-count")
            if rc:
                try:
                    doc.rating_count = int(rc.get_text().strip())
                except ValueError:
                    pass

            # times
            def pick(c):
                x = w.find(class_=c)
                return clean(x.get_text()) if x else None
            doc.prep_time = pick("wprm-recipe-prep_time") or doc.prep_time
            doc.cook_time = pick("wprm-recipe-cook_time") or doc.cook_time
            doc.total_time = pick("wprm-recipe-total_time") or doc.total_time
            # servings: keep as int when the text is purely numeric
            s = w.find(class_="wprm-recipe-servings")
            if s:
                txt = s.get_text().strip()
                doc.servings = int(txt) if txt.isdigit() else clean(txt)
            # calories
            cal = w.find(class_="wprm-recipe-calories")
            if cal:
                try:
                    doc.calories = float(cal.get_text().strip())
                except ValueError:
                    pass
            # course/cuisine
            cse = w.find(class_="wprm-recipe-course")
            doc.course = clean(cse.get_text()) if cse else doc.course
            cui = w.find(class_="wprm-recipe-cuisine")
            doc.cuisine = clean(cui.get_text()) if cui else doc.cuisine
            # ingredients: amount + unit + name (+ parenthesized notes)
            ings = []
            ic = w.find(class_="wprm-recipe-ingredients-container")
            if ic:
                for ing in ic.find_all(class_="wprm-recipe-ingredient"):
                    parts = []
                    for cls in ("wprm-recipe-ingredient-amount", "wprm-recipe-ingredient-unit",
                                "wprm-recipe-ingredient-name", "wprm-recipe-ingredient-notes"):
                        el = ing.find(class_=cls)
                        if el:
                            t = el.get_text().strip()
                            if t:
                                parts.append(t if "notes" not in cls else f"({t})")
                    txt = clean(" ".join(parts))
                    if txt:
                        ings.append(txt)
            if ings:
                doc.ingredients = ings
            # instructions
            steps = []
            ic2 = w.find(class_="wprm-recipe-instructions-container")
            if ic2:
                for ins in ic2.find_all(class_="wprm-recipe-instruction"):
                    t = ins.find(class_="wprm-recipe-instruction-text") or ins
                    txt = clean(t.get_text().strip())
                    if txt:
                        steps.append(txt)
            if steps:
                doc.instructions = steps

        # Store as text (team requirement)
        doc.ingredients = self._to_ingredients_text(doc.ingredients)
        doc.instructions = self._to_instructions_text(doc.instructions)

        # Fallback title & image
        if not doc.title:
            h1 = soup.find("h1") or soup.find("title")
            doc.title = clean(h1.get_text()) if h1 else None
        if not doc.image_url:
            doc.image_url = clean(self.get_meta_image(soup))

        # Optional category (e.g. "Afro-tropical Recipes") — first category/tag
        # link found on the page.
        cat = soup.find("a", href=re.compile(r"/category/|/tag/"))
        doc.category = clean(cat.get_text()) if cat else doc.category

        return doc
|
| 209 |
+
|
backend/docs/README.md
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Documentation Index
|
| 2 |
+
|
| 3 |
+
## 🚨 Troubleshooting (Start Here)
|
| 4 |
+
|
| 5 |
+
### Quick Fixes
|
| 6 |
+
- **[Embedding Troubleshooting](./embedding-troubleshooting.md)** - Fix dimension mismatch errors in 5 minutes
|
| 7 |
+
- **[Logging Guide](./logging_guide.md)** - Understanding error messages and logs
|
| 8 |
+
|
| 9 |
+
### Common Error Messages
|
| 10 |
+
| Error | Quick Fix |
|
| 11 |
+
|-------|-----------|
|
| 12 |
+
| `shapes (768,) and (384,) not aligned` | [Embedding dimension mismatch](./embedding-troubleshooting.md#shapes-768-and-384-not-aligned) |
|
| 13 |
+
| `MongoDB connection failed` | Check `MONGODB_URI` in `.env` |
|
| 14 |
+
| `ChromaDB not available` | `pip install langchain_chroma` |
|
| 15 |
+
| `OpenAI API key invalid` | Update `OPENAI_API_KEY` in `.env` |
|
| 16 |
+
|
| 17 |
+
## 📖 Comprehensive Guides
|
| 18 |
+
|
| 19 |
+
### Setup & Configuration
|
| 20 |
+
- **[Embedding Compatibility Guide](./embedding-compatibility-guide.md)** - Complete guide to embedding models and dimensions
|
| 21 |
+
- **[Model Configuration Guide](./model-configuration-guide.md)** - Provider-specific settings, temperature limitations, and best practices
|
| 22 |
+
- **[Architecture Documentation](./architecture.md)** - System overview and design patterns
|
| 23 |
+
- **[Deployment Guide](./deployment.md)** - Production deployment instructions
|
| 24 |
+
|
| 25 |
+
### API & Development
|
| 26 |
+
- **[API Documentation](./api-documentation.md)** - Detailed API reference
|
| 27 |
+
- **[ChromaDB Refresh Guide](./chromadb_refresh.md)** - Database management and refresh procedures
|
| 28 |
+
|
| 29 |
+
## 🔧 Developer Resources
|
| 30 |
+
|
| 31 |
+
### Configuration Examples
|
| 32 |
+
```bash
|
| 33 |
+
# Most reliable setup (384D embeddings)
|
| 34 |
+
EMBEDDING_PROVIDER=huggingface
|
| 35 |
+
HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
| 36 |
+
|
| 37 |
+
# Local inference setup (768D embeddings)
|
| 38 |
+
EMBEDDING_PROVIDER=ollama
|
| 39 |
+
OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
|
| 40 |
+
|
| 41 |
+
# Premium quality setup (1536D embeddings)
|
| 42 |
+
EMBEDDING_PROVIDER=openai
|
| 43 |
+
OPENAI_EMBEDDING_MODEL=text-embedding-3-small
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### Quick Commands
|
| 47 |
+
```bash
|
| 48 |
+
# Check current embedding configuration
|
| 49 |
+
grep EMBEDDING .env
|
| 50 |
+
|
| 51 |
+
# Test API health
|
| 52 |
+
curl http://localhost:8080/health
|
| 53 |
+
|
| 54 |
+
# Clear conversation memory
|
| 55 |
+
curl -X POST http://localhost:8080/clear-memory
|
| 56 |
+
|
| 57 |
+
# View recent logs
|
| 58 |
+
tail -f ./logs/recipe_bot.log
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
## 📋 File Organization
|
| 62 |
+
|
| 63 |
+
```
|
| 64 |
+
docs/
|
| 65 |
+
├── README.md # This file
|
| 66 |
+
├── embedding-troubleshooting.md # 🚨 Quick fixes
|
| 67 |
+
├── embedding-compatibility-guide.md # 📖 Complete embedding guide
|
| 68 |
+
├── model-configuration-guide.md # ⚙️ LLM provider configurations
|
| 69 |
+
├── logging_guide.md # 🔍 Log analysis
|
| 70 |
+
├── chromadb_refresh.md # 🔄 Database management
|
| 71 |
+
├── api-documentation.md # 📡 API reference
|
| 72 |
+
├── architecture.md # 🏗️ System design
|
| 73 |
+
└── deployment.md # 🚀 Production setup
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
## 🆘 Getting Help
|
| 77 |
+
|
| 78 |
+
1. **Check error logs**: `tail -f ./logs/recipe_bot.log`
|
| 79 |
+
2. **Common issues**: Start with [Embedding Troubleshooting](./embedding-troubleshooting.md)
|
| 80 |
+
3. **API problems**: See [API Documentation](./api-documentation.md)
|
| 81 |
+
4. **Setup issues**: Review [Embedding Compatibility Guide](./embedding-compatibility-guide.md)
|
| 82 |
+
|
| 83 |
+
---
|
| 84 |
+
|
| 85 |
+
💡 **Tip**: Bookmark the [Embedding Troubleshooting](./embedding-troubleshooting.md) page - it solves 80% of common issues!
|
backend/docs/chromadb_refresh.md
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ChromaDB Refresh Feature Documentation
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
The ChromaDB refresh feature allows you to automatically delete and recreate your local vector database on application startup. This is useful when you add new recipe files or update existing content that needs to be re-indexed.
|
| 6 |
+
|
| 7 |
+
## Configuration
|
| 8 |
+
|
| 9 |
+
### Environment Variables
|
| 10 |
+
|
| 11 |
+
Add the following to your `.env` file:
|
| 12 |
+
|
| 13 |
+
```bash
|
| 14 |
+
# Set to true to delete and recreate DB on startup (useful for adding new recipes)
|
| 15 |
+
DB_REFRESH_ON_START=false
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
**Default:** `false` (disabled)
|
| 19 |
+
|
| 20 |
+
### Environment Files Updated
|
| 21 |
+
|
| 22 |
+
- ✅ `.env` - Your local configuration
|
| 23 |
+
- ✅ `.env.example` - Template for new deployments
|
| 24 |
+
- ✅ `config/database.py` - Configuration class updated
|
| 25 |
+
- ✅ `services/vector_store.py` - Implementation added
|
| 26 |
+
|
| 27 |
+
## How It Works
|
| 28 |
+
|
| 29 |
+
### Normal Operation (DB_REFRESH_ON_START=false)
|
| 30 |
+
1. Check if `DB_PERSIST_DIRECTORY` exists
|
| 31 |
+
2. If exists with data → Load existing ChromaDB
|
| 32 |
+
3. If empty/missing → Create new ChromaDB from recipe files
|
| 33 |
+
|
| 34 |
+
### Refresh Mode (DB_REFRESH_ON_START=true)
|
| 35 |
+
1. Check if `DB_PERSIST_DIRECTORY` exists
|
| 36 |
+
2. If exists → **Delete entire directory** 🚨
|
| 37 |
+
3. Create new ChromaDB from recipe files in `./data/recipes/`
|
| 38 |
+
4. All data is re-indexed with current embedding model
|
| 39 |
+
|
| 40 |
+
## Usage Examples
|
| 41 |
+
|
| 42 |
+
### Adding New Recipes
|
| 43 |
+
|
| 44 |
+
```bash
|
| 45 |
+
# 1. Add new recipe files to ./data/recipes/
|
| 46 |
+
cp new_recipes.json ./data/recipes/
|
| 47 |
+
|
| 48 |
+
# 2. Enable refresh in .env
|
| 49 |
+
DB_REFRESH_ON_START=true
|
| 50 |
+
|
| 51 |
+
# 3. Start application (will recreate database)
|
| 52 |
+
uvicorn app:app --reload
|
| 53 |
+
|
| 54 |
+
# 4. Disable refresh (IMPORTANT!)
|
| 55 |
+
DB_REFRESH_ON_START=false
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
### Changing Embedding Models
|
| 59 |
+
|
| 60 |
+
```bash
|
| 61 |
+
# 1. Change embedding provider in .env
|
| 62 |
+
EMBEDDING_PROVIDER=openai
|
| 63 |
+
OPENAI_EMBEDDING_MODEL=text-embedding-3-large
|
| 64 |
+
|
| 65 |
+
# 2. Enable refresh to rebuild vectors
|
| 66 |
+
DB_REFRESH_ON_START=true
|
| 67 |
+
|
| 68 |
+
# 3. Start application
|
| 69 |
+
uvicorn app:app --reload
|
| 70 |
+
|
| 71 |
+
# 4. Disable refresh
|
| 72 |
+
DB_REFRESH_ON_START=false
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
### Troubleshooting Vector Issues
|
| 76 |
+
|
| 77 |
+
```bash
|
| 78 |
+
# If ChromaDB is corrupted or having issues
|
| 79 |
+
DB_REFRESH_ON_START=true
|
| 80 |
+
# Restart app to rebuild from scratch
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
## Important Warnings ⚠️
|
| 84 |
+
|
| 85 |
+
### Data Loss Warning
|
| 86 |
+
- **Refresh DELETES ALL existing vector data**
|
| 87 |
+
- **This operation CANNOT be undone**
|
| 88 |
+
- Always backup important data before refresh
|
| 89 |
+
|
| 90 |
+
### Performance Impact
|
| 91 |
+
- Re-indexing takes time (depends on recipe count)
|
| 92 |
+
- Embedding API calls cost money (OpenAI, Google)
|
| 93 |
+
- Application startup will be slower during refresh
|
| 94 |
+
|
| 95 |
+
### Memory Usage
|
| 96 |
+
- Large recipe datasets require more memory during indexing
|
| 97 |
+
- Monitor system resources during refresh
|
| 98 |
+
|
| 99 |
+
## Best Practices
|
| 100 |
+
|
| 101 |
+
### ✅ DO
|
| 102 |
+
- Set `DB_REFRESH_ON_START=false` after refresh completes
|
| 103 |
+
- Test refresh in development before production
|
| 104 |
+
- Monitor logs during refresh process
|
| 105 |
+
- Add new recipes in batches if possible
|
| 106 |
+
|
| 107 |
+
### ❌ DON'T
|
| 108 |
+
- Leave refresh enabled in production
|
| 109 |
+
- Refresh unnecessarily (wastes resources)
|
| 110 |
+
- Interrupt refresh process (may corrupt data)
|
| 111 |
+
- Forget to disable after refresh
|
| 112 |
+
|
| 113 |
+
## Monitoring and Logs
|
| 114 |
+
|
| 115 |
+
The refresh process is fully logged:
|
| 116 |
+
|
| 117 |
+
```
|
| 118 |
+
🔄 DB_REFRESH_ON_START=true - Deleting existing ChromaDB at ./data/chromadb_persist
|
| 119 |
+
✅ Existing ChromaDB deleted successfully
|
| 120 |
+
🆕 Creating new ChromaDB at ./data/chromadb_persist
|
| 121 |
+
✅ Created ChromaDB with 150 document chunks
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
## Configuration Reference
|
| 125 |
+
|
| 126 |
+
### Complete Environment Setup
|
| 127 |
+
|
| 128 |
+
```bash
|
| 129 |
+
# Vector Store Configuration
|
| 130 |
+
VECTOR_STORE_PROVIDER=chromadb
|
| 131 |
+
DB_PATH=./data/chromadb
|
| 132 |
+
DB_COLLECTION_NAME=recipes
|
| 133 |
+
DB_PERSIST_DIRECTORY=./data/chromadb_persist
|
| 134 |
+
|
| 135 |
+
# Refresh Control
|
| 136 |
+
DB_REFRESH_ON_START=false # Set to true only when needed
|
| 137 |
+
|
| 138 |
+
# Embedding Configuration
|
| 139 |
+
EMBEDDING_PROVIDER=huggingface
|
| 140 |
+
HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
### Database Configuration Object
|
| 144 |
+
|
| 145 |
+
```python
|
| 146 |
+
from config.database import DatabaseSettings
|
| 147 |
+
|
| 148 |
+
db_settings = DatabaseSettings()
|
| 149 |
+
config = db_settings.get_vector_store_config()
|
| 150 |
+
|
| 151 |
+
# Access refresh setting
|
| 152 |
+
refresh_enabled = config['refresh_on_start'] # boolean
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
## Troubleshooting
|
| 156 |
+
|
| 157 |
+
### Common Issues
|
| 158 |
+
|
| 159 |
+
**Refresh not working:**
|
| 160 |
+
- Check `.env` file has `DB_REFRESH_ON_START=true`
|
| 161 |
+
- Verify environment is loaded correctly
|
| 162 |
+
- Check file permissions on persist directory
|
| 163 |
+
|
| 164 |
+
**Application won't start after refresh:**
|
| 165 |
+
- Check recipe files exist in `./data/recipes/`
|
| 166 |
+
- Verify embedding provider credentials
|
| 167 |
+
- Review application logs for specific errors
|
| 168 |
+
|
| 169 |
+
**Partial refresh/corruption:**
|
| 170 |
+
- Delete persist directory manually
|
| 171 |
+
- Set `DB_REFRESH_ON_START=true` and restart
|
| 172 |
+
- Check disk space availability
|
| 173 |
+
|
| 174 |
+
### Emergency Recovery
|
| 175 |
+
|
| 176 |
+
If refresh fails or corrupts data:
|
| 177 |
+
|
| 178 |
+
```bash
|
| 179 |
+
# Manual cleanup
|
| 180 |
+
rm -rf ./data/chromadb_persist
|
| 181 |
+
|
| 182 |
+
# Reset configuration
|
| 183 |
+
DB_REFRESH_ON_START=true
|
| 184 |
+
|
| 185 |
+
# Restart application
|
| 186 |
+
uvicorn app:app --reload
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
## Testing
|
| 190 |
+
|
| 191 |
+
Test the refresh functionality:
|
| 192 |
+
|
| 193 |
+
```bash
|
| 194 |
+
# Run refresh tests
|
| 195 |
+
python3 test_refresh.py
|
| 196 |
+
|
| 197 |
+
# Demo the feature
|
| 198 |
+
python3 demo_refresh.py
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
## Implementation Details
|
| 202 |
+
|
| 203 |
+
### Files Modified
|
| 204 |
+
|
| 205 |
+
1. **`config/database.py`**
|
| 206 |
+
- Added `DB_REFRESH_ON_START` environment variable
|
| 207 |
+
- Updated `get_vector_store_config()` method
|
| 208 |
+
|
| 209 |
+
2. **`services/vector_store.py`**
|
| 210 |
+
- Added `shutil` import for directory deletion
|
| 211 |
+
- Implemented refresh logic in `_get_or_create_vector_store()`
|
| 212 |
+
- Added comprehensive logging
|
| 213 |
+
|
| 214 |
+
3. **Environment Files**
|
| 215 |
+
- Updated `.env` and `.env.example` with new variable
|
| 216 |
+
- Added documentation comments
|
| 217 |
+
|
| 218 |
+
### Code Changes
|
| 219 |
+
|
| 220 |
+
```python
|
| 221 |
+
# In vector_store.py
|
| 222 |
+
if refresh_on_start and persist_dir.exists():
|
| 223 |
+
logger.info(f"🔄 DB_REFRESH_ON_START=true - Deleting existing ChromaDB at {persist_dir}")
|
| 224 |
+
shutil.rmtree(persist_dir)
|
| 225 |
+
logger.info("✅ Existing ChromaDB deleted successfully")
|
| 226 |
+
```
|
| 227 |
+
|
| 228 |
+
This feature provides a simple but powerful way to manage vector database content lifecycle while maintaining data integrity and providing clear user control.
|
backend/docs/embedding-compatibility-guide.md
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Embedding Compatibility Guide
|
| 2 |
+
|
| 3 |
+
## 🔍 Understanding Embedding Dimensions
|
| 4 |
+
|
| 5 |
+
When working with vector databases and embeddings, **dimension compatibility** is crucial for successful similarity searches. This guide helps you understand and troubleshoot embedding dimension issues.
|
| 6 |
+
|
| 7 |
+
## 📊 Common Embedding Models & Their Dimensions
|
| 8 |
+
|
| 9 |
+
| Provider | Model | Dimensions | Use Case |
|
| 10 |
+
|----------|-------|------------|----------|
|
| 11 |
+
| **HuggingFace** | `sentence-transformers/all-MiniLM-L6-v2` | **384** | Fast, lightweight, good for most tasks |
|
| 12 |
+
| **HuggingFace** | `sentence-transformers/all-mpnet-base-v2` | **768** | Higher quality, larger model |
|
| 13 |
+
| **Ollama** | `nomic-embed-text:v1.5` | **768** | Local inference, privacy-focused |
|
| 14 |
+
| **Ollama** | `mxbai-embed-large` | **1024** | High-quality local embeddings |
|
| 15 |
+
| **OpenAI** | `text-embedding-3-small` | **1536** | Commercial API, good performance |
|
| 16 |
+
| **OpenAI** | `text-embedding-3-large` | **3072** | Highest quality, expensive |
|
| 17 |
+
| **Google** | `models/embedding-001` | **768** | Google AI integration |
|
| 18 |
+
|
| 19 |
+
## ⚠️ Common Error: Dimension Mismatch
|
| 20 |
+
|
| 21 |
+
### Symptoms
|
| 22 |
+
```
|
| 23 |
+
WARNING - [custom_mongo_vector.py:103] - ⚠️ Error processing document: shapes (768,) and (384,) not aligned
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
### Root Cause
|
| 27 |
+
Your **query embeddings** and **stored embeddings** have different dimensions:
|
| 28 |
+
- Query: Generated with Model A (e.g., 768 dimensions)
|
| 29 |
+
- Stored: Created with Model B (e.g., 384 dimensions)
|
| 30 |
+
|
| 31 |
+
### Why This Happens
|
| 32 |
+
1. You changed embedding models after creating your database
|
| 33 |
+
2. Your database was created with a different embedding provider
|
| 34 |
+
3. Environment configuration doesn't match the original setup
|
| 35 |
+
|
| 36 |
+
## 🔧 Solution Strategies
|
| 37 |
+
|
| 38 |
+
### Strategy 1: Match Your Current Database (Recommended)
|
| 39 |
+
|
| 40 |
+
**Step 1: Identify stored embedding dimensions**
|
| 41 |
+
```bash
|
| 42 |
+
# Check your MongoDB collection to see stored embedding dimensions
|
| 43 |
+
# Look at the 'ingredients_emb' field length
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
**Step 2: Update .env to match**
|
| 47 |
+
```bash
|
| 48 |
+
# If stored embeddings are 384-dimensional (common with all-MiniLM-L6-v2)
|
| 49 |
+
EMBEDDING_PROVIDER=huggingface
|
| 50 |
+
HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
| 51 |
+
|
| 52 |
+
# If stored embeddings are 768-dimensional
|
| 53 |
+
EMBEDDING_PROVIDER=ollama
|
| 54 |
+
OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
### Strategy 2: Regenerate Database with New Model
|
| 58 |
+
|
| 59 |
+
**Step 1: Choose your preferred embedding model**
|
| 60 |
+
```bash
|
| 61 |
+
# Example: Use Ollama for local inference
|
| 62 |
+
EMBEDDING_PROVIDER=ollama
|
| 63 |
+
OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
**Step 2: Enable database refresh**
|
| 67 |
+
```bash
|
| 68 |
+
DB_REFRESH_ON_START=true
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
**Step 3: Restart application**
|
| 72 |
+
```bash
|
| 73 |
+
uvicorn app:app --reload
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
**Step 4: Disable refresh (Important!)**
|
| 77 |
+
```bash
|
| 78 |
+
DB_REFRESH_ON_START=false
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
## 🔍 Debugging Embedding Issues
|
| 82 |
+
|
| 83 |
+
### Check Current Configuration
|
| 84 |
+
```bash
|
| 85 |
+
# View your current embedding setup
|
| 86 |
+
grep -E "EMBEDDING_PROVIDER|_EMBEDDING_MODEL" .env
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
### Monitor Embedding Dimensions
|
| 90 |
+
The custom MongoDB vector store now logs dimension information:
|
| 91 |
+
```
|
| 92 |
+
🔢 Query embedding dimensions: 768
|
| 93 |
+
⚠️ Dimension mismatch: query=768D, stored=384D
|
| 94 |
+
💡 Consider changing EMBEDDING_PROVIDER to match stored embeddings
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
### Verify Database Content
|
| 98 |
+
```python
|
| 99 |
+
# Check stored embedding dimensions in MongoDB
|
| 100 |
+
collection.find_one({"ingredients_emb": {"$exists": True}})["ingredients_emb"]
|
| 101 |
+
# Count the array length to get dimensions
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
## 📋 Environment Configuration Examples
|
| 105 |
+
|
| 106 |
+
### Example 1: HuggingFace (384D) - Most Common
|
| 107 |
+
```bash
|
| 108 |
+
# .env configuration for 384-dimensional embeddings
|
| 109 |
+
EMBEDDING_PROVIDER=huggingface
|
| 110 |
+
HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
| 111 |
+
HUGGINGFACE_API_TOKEN=your_token_here
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
### Example 2: Ollama (768D) - Local Inference
|
| 115 |
+
```bash
|
| 116 |
+
# .env configuration for 768-dimensional embeddings
|
| 117 |
+
EMBEDDING_PROVIDER=ollama
|
| 118 |
+
OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
|
| 119 |
+
OLLAMA_BASE_URL=http://localhost:11434
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
### Example 3: OpenAI (1536D) - Premium Quality
|
| 123 |
+
```bash
|
| 124 |
+
# .env configuration for 1536-dimensional embeddings
|
| 125 |
+
EMBEDDING_PROVIDER=openai
|
| 126 |
+
OPENAI_EMBEDDING_MODEL=text-embedding-3-small
|
| 127 |
+
OPENAI_API_KEY=your_api_key_here
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
## 🚨 Common Pitfalls
|
| 131 |
+
|
| 132 |
+
### 1. Mixed Providers
|
| 133 |
+
❌ **Don't do this:**
|
| 134 |
+
```bash
|
| 135 |
+
# Database created with HuggingFace
|
| 136 |
+
EMBEDDING_PROVIDER=huggingface # Original
|
| 137 |
+
|
| 138 |
+
# Later changed to Ollama without refreshing DB
|
| 139 |
+
EMBEDDING_PROVIDER=ollama # New - causes dimension mismatch!
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
### 2. Forgetting to Disable Refresh
|
| 143 |
+
❌ **Don't forget:**
|
| 144 |
+
```bash
|
| 145 |
+
# After refreshing database, always disable refresh
|
| 146 |
+
DB_REFRESH_ON_START=false # SET THIS BACK TO FALSE!
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
### 3. Model Name Typos
|
| 150 |
+
❌ **Watch out for:**
|
| 151 |
+
```bash
|
| 152 |
+
# Typo in model name will cause failures
|
| 153 |
+
OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5 ✅
|
| 154 |
+
OLLAMA_EMBEDDING_MODEL=nomic-embed-text ❌ (missing version)
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
## 📊 Performance Comparison
|
| 158 |
+
|
| 159 |
+
| Model | Speed | Quality | Dimensions | Local/API | Cost |
|
| 160 |
+
|-------|-------|---------|------------|-----------|------|
|
| 161 |
+
| `all-MiniLM-L6-v2` | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | 384 | Both | Free |
|
| 162 |
+
| `nomic-embed-text:v1.5` | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | 768 | Local | Free |
|
| 163 |
+
| `text-embedding-3-small` | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | 1536 | API | $$$ |
|
| 164 |
+
|
| 165 |
+
## 🔧 Troubleshooting Steps
|
| 166 |
+
|
| 167 |
+
### Step 1: Check Current Setup
|
| 168 |
+
```bash
|
| 169 |
+
# 1. Check your environment configuration
|
| 170 |
+
cat .env | grep EMBEDDING
|
| 171 |
+
|
| 172 |
+
# 2. Check vector store provider
|
| 173 |
+
cat .env | grep VECTOR_STORE_PROVIDER
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
### Step 2: Test Embedding Generation
|
| 177 |
+
```python
|
| 178 |
+
# Test script to check embedding dimensions
|
| 179 |
+
from services.vector_store import vector_store_service
|
| 180 |
+
|
| 181 |
+
# Generate a test embedding
|
| 182 |
+
test_embedding = vector_store_service.embeddings.embed_query("test")
|
| 183 |
+
print(f"Current embedding dimensions: {len(test_embedding)}")
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
### Step 3: Check Database Content
|
| 187 |
+
For MongoDB users:
|
| 188 |
+
```javascript
|
| 189 |
+
// MongoDB shell command to check stored embedding dimensions
|
| 190 |
+
db.your_collection.findOne({"ingredients_emb": {"$exists": true}})
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
### Step 4: Apply Fix
|
| 194 |
+
Choose one of the strategies above based on your needs.
|
| 195 |
+
|
| 196 |
+
## 📝 Best Practices
|
| 197 |
+
|
| 198 |
+
### 1. Document Your Embedding Model
|
| 199 |
+
Keep a record of which embedding model you used:
|
| 200 |
+
```bash
|
| 201 |
+
# Add comments to your .env file
|
| 202 |
+
# Database created on 2025-08-27 with all-MiniLM-L6-v2 (384D)
|
| 203 |
+
EMBEDDING_PROVIDER=huggingface
|
| 204 |
+
HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
| 205 |
+
```
|
| 206 |
+
|
| 207 |
+
### 2. Version Control Your Configuration
|
| 208 |
+
```bash
|
| 209 |
+
# Commit your .env changes with descriptive messages
|
| 210 |
+
git add .env
|
| 211 |
+
git commit -m "Update embedding model to match database (384D)"
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
### 3. Test After Changes
|
| 215 |
+
```bash
|
| 216 |
+
# After changing embedding configuration, test a query
|
| 217 |
+
curl -X POST "http://localhost:8080/chat" \
|
| 218 |
+
-H "Content-Type: application/json" \
|
| 219 |
+
-d '{"message": "test query"}'
|
| 220 |
+
```
|
| 221 |
+
|
| 222 |
+
## 🆘 Quick Reference
|
| 223 |
+
|
| 224 |
+
### Error Pattern Recognition
|
| 225 |
+
```
|
| 226 |
+
shapes (768,) and (384,) not aligned → Query=768D, Stored=384D
|
| 227 |
+
shapes (384,) and (768,) not aligned → Query=384D, Stored=768D
|
| 228 |
+
shapes (1536,) and (384,) not aligned → Query=1536D, Stored=384D
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
### Quick Fixes
|
| 232 |
+
| Stored Dimensions | Set EMBEDDING_PROVIDER to |
|
| 233 |
+
|-------------------|-------------------------|
|
| 234 |
+
| 384 | `huggingface` with `all-MiniLM-L6-v2` |
|
| 235 |
+
| 768 | `ollama` with `nomic-embed-text:v1.5` |
|
| 236 |
+
| 1536 | `openai` with `text-embedding-3-small` |
|
| 237 |
+
|
| 238 |
+
---
|
| 239 |
+
|
| 240 |
+
## 📞 Need Help?
|
| 241 |
+
|
| 242 |
+
If you're still experiencing issues:
|
| 243 |
+
|
| 244 |
+
1. Check the application logs for detailed error messages
|
| 245 |
+
2. Verify your embedding model is properly installed/accessible
|
| 246 |
+
3. Ensure your database connection is working
|
| 247 |
+
4. Consider regenerating your vector database if switching models permanently
|
| 248 |
+
|
| 249 |
+
Remember: **Consistency is key** - your query embeddings and stored embeddings must use the same model and dimensions!
|
backend/docs/embedding-troubleshooting.md
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚨 Embedding Troubleshooting Quick Start
|
| 2 |
+
|
| 3 |
+
## Common Error Messages & Instant Fixes
|
| 4 |
+
|
| 5 |
+
### ⚠️ "shapes (768,) and (384,) not aligned"
|
| 6 |
+
|
| 7 |
+
**What it means:** Your query embeddings (768D) don't match stored embeddings (384D)
|
| 8 |
+
|
| 9 |
+
**Instant fix:**
|
| 10 |
+
```bash
|
| 11 |
+
# Open .env file and change:
|
| 12 |
+
EMBEDDING_PROVIDER=huggingface
|
| 13 |
+
HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
| 14 |
+
|
| 15 |
+
# Restart your application
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
### ⚠️ "shapes (384,) and (768,) not aligned"
|
| 19 |
+
|
| 20 |
+
**What it means:** Your query embeddings (384D) don't match stored embeddings (768D)
|
| 21 |
+
|
| 22 |
+
**Instant fix:**
|
| 23 |
+
```bash
|
| 24 |
+
# Open .env file and change:
|
| 25 |
+
EMBEDDING_PROVIDER=ollama
|
| 26 |
+
OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
|
| 27 |
+
|
| 28 |
+
# Make sure Ollama is running: ollama serve
|
| 29 |
+
# Pull the model: ollama pull nomic-embed-text:v1.5
|
| 30 |
+
# Restart your application
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
### ⚠️ "shapes (1536,) and (384,) not aligned"
|
| 34 |
+
|
| 35 |
+
**What it means:** Your query embeddings (1536D) don't match stored embeddings (384D)
|
| 36 |
+
|
| 37 |
+
**Instant fix:**
|
| 38 |
+
```bash
|
| 39 |
+
# Open .env file and change:
|
| 40 |
+
EMBEDDING_PROVIDER=huggingface
|
| 41 |
+
HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
| 42 |
+
|
| 43 |
+
# Restart your application
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
## 🔧 5-Minute Fix Guide
|
| 47 |
+
|
| 48 |
+
### Step 1: Identify Your Error (30 seconds)
|
| 49 |
+
Look at your error message and find the dimension numbers:
|
| 50 |
+
- `shapes (X,) and (Y,)` → X = query dimensions, Y = stored dimensions
|
| 51 |
+
|
| 52 |
+
### Step 2: Choose Matching Model (1 minute)
|
| 53 |
+
| Stored Dimensions (Y) | Set in .env |
|
| 54 |
+
|---------------------|-------------|
|
| 55 |
+
| 384 | `EMBEDDING_PROVIDER=huggingface`<br/>`HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2` |
|
| 56 |
+
| 768 | `EMBEDDING_PROVIDER=ollama`<br/>`OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5` |
|
| 57 |
+
| 1024 | `EMBEDDING_PROVIDER=ollama`<br/>`OLLAMA_EMBEDDING_MODEL=mxbai-embed-large` |
|
| 58 |
+
| 1536 | `EMBEDDING_PROVIDER=openai`<br/>`OPENAI_EMBEDDING_MODEL=text-embedding-3-small` |
|
| 59 |
+
|
| 60 |
+
### Step 3: Update Configuration (2 minutes)
|
| 61 |
+
```bash
|
| 62 |
+
# Edit your .env file
|
| 63 |
+
nano .env # or use your preferred editor
|
| 64 |
+
|
| 65 |
+
# Find the EMBEDDING_PROVIDER lines and update them
|
| 66 |
+
# Save the file
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
### Step 4: Restart Application (1 minute)
|
| 70 |
+
```bash
|
| 71 |
+
# Kill current process (Ctrl+C)
|
| 72 |
+
# Restart
|
| 73 |
+
uvicorn app:app --reload
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
### Step 5: Test (30 seconds)
|
| 77 |
+
```bash
|
| 78 |
+
# Test with a simple query
|
| 79 |
+
curl -X POST "http://localhost:8080/chat" \
|
| 80 |
+
-H "Content-Type: application/json" \
|
| 81 |
+
-d '{"message": "chicken recipe"}'
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
## 🔍 Alternative: Start Fresh
|
| 85 |
+
|
| 86 |
+
If you prefer to use a different embedding model permanently:
|
| 87 |
+
|
| 88 |
+
### Option A: Regenerate Database (5 minutes)
|
| 89 |
+
```bash
|
| 90 |
+
# 1. Choose your preferred model in .env
|
| 91 |
+
EMBEDDING_PROVIDER=ollama
|
| 92 |
+
OLLAMA_EMBEDDING_MODEL=nomic-embed-text:v1.5
|
| 93 |
+
|
| 94 |
+
# 2. Enable database refresh
|
| 95 |
+
DB_REFRESH_ON_START=true
|
| 96 |
+
|
| 97 |
+
# 3. Restart application (this will rebuild everything)
|
| 98 |
+
uvicorn app:app --reload
|
| 99 |
+
|
| 100 |
+
# 4. IMPORTANT: Disable refresh after startup
|
| 101 |
+
DB_REFRESH_ON_START=false
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
### Option B: Switch Vector Store (2 minutes)
|
| 105 |
+
```bash
|
| 106 |
+
# Switch to ChromaDB (will create fresh database)
|
| 107 |
+
VECTOR_STORE_PROVIDER=chromadb
|
| 108 |
+
|
| 109 |
+
# Restart application
|
| 110 |
+
uvicorn app:app --reload
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
## ⚡ Prevention Tips
|
| 114 |
+
|
| 115 |
+
### Document Your Choice
|
| 116 |
+
Add a comment to your .env file:
|
| 117 |
+
```bash
|
| 118 |
+
# Created 2025-08-27 with all-MiniLM-L6-v2 (384 dimensions)
|
| 119 |
+
EMBEDDING_PROVIDER=huggingface
|
| 120 |
+
HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
### Consistent Development
|
| 124 |
+
If working in a team, ensure everyone uses the same configuration:
|
| 125 |
+
```bash
|
| 126 |
+
# Share this in your team chat:
|
| 127 |
+
# "Use EMBEDDING_PROVIDER=huggingface with all-MiniLM-L6-v2"
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
---
|
| 131 |
+
|
| 132 |
+
**Still stuck?** Check the full [Embedding Compatibility Guide](./embedding-compatibility-guide.md) for detailed explanations.
|
backend/docs/logging_guide.md
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Logging Example and Configuration
|
| 2 |
+
|
| 3 |
+
## Logging Features Implemented
|
| 4 |
+
|
| 5 |
+
### 1. Centralized Logging Configuration
|
| 6 |
+
- **File**: `config/logging_config.py`
|
| 7 |
+
- **Features**:
|
| 8 |
+
- Rotating file logs (10MB max, 5 backups)
|
| 9 |
+
- Console and file output
|
| 10 |
+
- Structured format with timestamps, levels, and source location
|
| 11 |
+
- Environment-based configuration
|
| 12 |
+
|
| 13 |
+
### 2. Service-Level Logging
|
| 14 |
+
- **Vector Store Service**: Logs initialization, document loading, provider setup
|
| 15 |
+
- **LLM Service**: Logs model setup, question processing, memory operations
|
| 16 |
+
- **API Endpoints**: Logs requests, responses, and errors
|
| 17 |
+
|
| 18 |
+
### 3. Log Levels Used
|
| 19 |
+
- **INFO**: Normal operations, successful completions
|
| 20 |
+
- **DEBUG**: Detailed operation steps, memory operations
|
| 21 |
+
- **WARNING**: Non-critical issues, fallbacks
|
| 22 |
+
- **ERROR**: Failures, exceptions (with stack traces)
|
| 23 |
+
|
| 24 |
+
### 4. Emoji-Coded Log Messages
|
| 25 |
+
- 🚀 Startup/Initialization
|
| 26 |
+
- ✅ Success operations
|
| 27 |
+
- ❌ Error conditions
|
| 28 |
+
- ⚠️ Warning conditions
|
| 29 |
+
- 🔧 Configuration/Setup
|
| 30 |
+
- 💬 Chat operations
|
| 31 |
+
- 🧠 Memory operations
|
| 32 |
+
- 📊 Database operations
|
| 33 |
+
- 🔍 Search/Retrieval
|
| 34 |
+
- 💾 Storage operations
|
| 35 |
+
|
| 36 |
+
## Usage Examples
|
| 37 |
+
|
| 38 |
+
```python
|
| 39 |
+
from config.logging_config import get_logger
|
| 40 |
+
|
| 41 |
+
# Get a logger for your module
|
| 42 |
+
logger = get_logger("my_module")
|
| 43 |
+
|
| 44 |
+
# Log at different levels
|
| 45 |
+
logger.info("✅ Operation completed successfully")
|
| 46 |
+
logger.warning("⚠️ Using fallback configuration")
|
| 47 |
+
logger.error("❌ Operation failed", exc_info=True) # Includes stack trace
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
## Log File Location
|
| 51 |
+
- **Path**: `./logs/recipe_bot.log`
|
| 52 |
+
- **Rotation**: Automatic when file exceeds 10MB
|
| 53 |
+
- **Backups**: Keeps 5 backup files
|
| 54 |
+
|
| 55 |
+
## Console Output
|
| 56 |
+
All logs are also displayed in the console with colored formatting for easy debugging during development.
|
backend/docs/model-configuration-guide.md
ADDED
|
@@ -0,0 +1,542 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Model Configuration Guide
|
| 2 |
+
|
| 3 |
+
This guide focuses on the technical configuration, settings management, parameter handling, and troubleshooting for LLM providers in the Recipe Chatbot project.
|
| 4 |
+
|
| 5 |
+
> 📚 **Looking for model recommendations?** See [Model Selection Guide](./model-selection-guide.md) for detailed model comparisons and use case recommendations.
|
| 6 |
+
|
| 7 |
+
## 🔧 Configuration System Overview
|
| 8 |
+
|
| 9 |
+
### Settings Architecture
|
| 10 |
+
The project uses a centralized configuration system in `config/settings.py` with environment variable overrides:
|
| 11 |
+
|
| 12 |
+
```python
|
| 13 |
+
# Configuration loading flow
|
| 14 |
+
Environment Variables (.env) → settings.py → LLM Service → Provider APIs
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
### Temperature Management
|
| 18 |
+
Each provider has different temperature constraints that are automatically handled:
|
| 19 |
+
|
| 20 |
+
| Provider | Range | Auto-Handling | Special Cases |
|
| 21 |
+
|----------|-------|---------------|---------------|
|
| 22 |
+
| **OpenAI** | 0.0 - 2.0 | ✅ GPT-5-nano → 1.0 | Nano models fixed |
|
| 23 |
+
| **Google** | 0.0 - 1.0 | ✅ Clamp to range | Strict validation |
|
| 24 |
+
| **Ollama** | 0.0 - 2.0 | ⚠️ Model dependent | Local processing |
|
| 25 |
+
| **HuggingFace** | Fixed ~0.7 | ❌ API ignores setting | Read-only |
|
| 26 |
+
|
| 27 |
+
## 🛠️ Provider Configuration Details
|
| 28 |
+
|
| 29 |
+
### OpenAI Configuration
|
| 30 |
+
|
| 31 |
+
#### Environment Variables
|
| 32 |
+
```bash
|
| 33 |
+
# Core settings
|
| 34 |
+
OPENAI_API_KEY=sk-proj-xxxxx
|
| 35 |
+
OPENAI_MODEL=gpt-4o-mini
|
| 36 |
+
OPENAI_TEMPERATURE=0.7
|
| 37 |
+
OPENAI_MAX_TOKENS=1000
|
| 38 |
+
|
| 39 |
+
# Advanced parameters (optional)
|
| 40 |
+
OPENAI_TOP_P=1.0
|
| 41 |
+
OPENAI_FREQUENCY_PENALTY=0.0
|
| 42 |
+
OPENAI_PRESENCE_PENALTY=0.0
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
#### Automatic Temperature Override
|
| 46 |
+
```python
|
| 47 |
+
# Implemented in services/llm_service.py
|
| 48 |
+
if "gpt-5-nano" in model_name.lower():
|
| 49 |
+
temperature = 1.0 # Only supported value
|
| 50 |
+
logger.info(f"Auto-adjusting temperature to 1.0 for {model_name}")
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
#### Parameter Validation
|
| 54 |
+
- **Temperature**: `0.0 - 2.0` (except nano models: fixed `1.0`)
|
| 55 |
+
- **Max Tokens**: `1 - 4096` (model-dependent)
|
| 56 |
+
- **Top P**: `0.0 - 1.0`
|
| 57 |
+
|
| 58 |
+
### Google (Gemini) Configuration
|
| 59 |
+
|
| 60 |
+
#### Environment Variables
|
| 61 |
+
```bash
|
| 62 |
+
# Core settings
|
| 63 |
+
GOOGLE_API_KEY=AIzaSyxxxxx
|
| 64 |
+
GOOGLE_MODEL=gemini-2.5-flash
|
| 65 |
+
GOOGLE_TEMPERATURE=0.7
|
| 66 |
+
GOOGLE_MAX_TOKENS=1000
|
| 67 |
+
|
| 68 |
+
# Advanced parameters (optional)
|
| 69 |
+
GOOGLE_TOP_P=0.95
|
| 70 |
+
GOOGLE_TOP_K=40
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
#### Temperature Clamping
|
| 74 |
+
```python
|
| 75 |
+
# Auto-clamping to Google's range
|
| 76 |
+
google_temp = max(0.0, min(1.0, configured_temperature))
|
| 77 |
+
if google_temp != configured_temperature:
|
| 78 |
+
logger.info(f"Clamping temperature from {configured_temperature} to {google_temp}")
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
#### Parameter Constraints
|
| 82 |
+
- **Temperature**: `0.0 - 1.0` (strictly enforced)
|
| 83 |
+
- **Max Tokens**: `1 - 8192`
|
| 84 |
+
- **Top K**: `1 - 40`
|
| 85 |
+
|
| 86 |
+
### Ollama Configuration
|
| 87 |
+
|
| 88 |
+
#### Environment Variables
|
| 89 |
+
```bash
|
| 90 |
+
# Core settings
|
| 91 |
+
OLLAMA_BASE_URL=http://localhost:11434
|
| 92 |
+
OLLAMA_MODEL=llama3.1:8b
|
| 93 |
+
OLLAMA_TEMPERATURE=0.7
|
| 94 |
+
OLLAMA_MAX_TOKENS=1000
|
| 95 |
+
|
| 96 |
+
# Connection settings
|
| 97 |
+
OLLAMA_TIMEOUT=30
|
| 98 |
+
OLLAMA_KEEP_ALIVE=5m
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
#### Service Management
|
| 102 |
+
```bash
|
| 103 |
+
# Start Ollama service
|
| 104 |
+
ollama serve &
|
| 105 |
+
|
| 106 |
+
# Verify service status
|
| 107 |
+
curl http://localhost:11434/api/version
|
| 108 |
+
|
| 109 |
+
# Model management
|
| 110 |
+
ollama pull llama3.1:8b
|
| 111 |
+
ollama list
|
| 112 |
+
ollama rm unused_model
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
#### Parameter Flexibility
|
| 116 |
+
- **Temperature**: `0.0 - 2.0` (widest range)
|
| 117 |
+
- **Context Length**: Model-dependent (2K - 128K)
|
| 118 |
+
- **Custom Parameters**: Model-specific options available
|
| 119 |
+
|
| 120 |
+
### HuggingFace Configuration
|
| 121 |
+
|
| 122 |
+
#### Environment Variables
|
| 123 |
+
```bash
|
| 124 |
+
# Core settings
|
| 125 |
+
HUGGINGFACE_API_KEY=hf_xxxxx
|
| 126 |
+
HUGGINGFACE_MODEL=microsoft/DialoGPT-medium
|
| 127 |
+
HUGGINGFACE_TEMPERATURE=0.7 # Often ignored
|
| 128 |
+
HUGGINGFACE_MAX_TOKENS=500
|
| 129 |
+
|
| 130 |
+
# API settings
|
| 131 |
+
HUGGINGFACE_WAIT_FOR_MODEL=true
|
| 132 |
+
HUGGINGFACE_USE_CACHE=true
|
| 133 |
+
```
|
| 134 |
+
|
| 135 |
+
#### API Limitations
|
| 136 |
+
```python
|
| 137 |
+
# Note: Temperature is often ignored by Inference API
|
| 138 |
+
logger.warning(f"HuggingFace model {model_name} may ignore temperature setting")
|
| 139 |
+
return 0.7 # API typically uses this default
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
## ⚙️ Advanced Configuration
|
| 143 |
+
|
| 144 |
+
### Dynamic Provider Switching
|
| 145 |
+
```python
|
| 146 |
+
# config/settings.py implementation
|
| 147 |
+
def get_llm_config():
|
| 148 |
+
provider = os.getenv("LLM_PROVIDER", "openai").lower()
|
| 149 |
+
fallback = os.getenv("LLM_FALLBACK_PROVIDER", "google").lower()
|
| 150 |
+
|
| 151 |
+
return {
|
| 152 |
+
"provider": provider,
|
| 153 |
+
"fallback_provider": fallback,
|
| 154 |
+
**get_provider_config(provider)
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
def get_provider_config(provider):
|
| 158 |
+
"""Get provider-specific configuration."""
|
| 159 |
+
configs = {
|
| 160 |
+
"openai": {
|
| 161 |
+
"api_key": os.getenv("OPENAI_API_KEY"),
|
| 162 |
+
"model": os.getenv("OPENAI_MODEL", "gpt-4o-mini"),
|
| 163 |
+
"temperature": float(os.getenv("OPENAI_TEMPERATURE", "0.7")),
|
| 164 |
+
"max_tokens": int(os.getenv("OPENAI_MAX_TOKENS", "1000")),
|
| 165 |
+
},
|
| 166 |
+
"google": {
|
| 167 |
+
"api_key": os.getenv("GOOGLE_API_KEY"),
|
| 168 |
+
"model": os.getenv("GOOGLE_MODEL", "gemini-2.5-flash"),
|
| 169 |
+
"temperature": float(os.getenv("GOOGLE_TEMPERATURE", "0.7")),
|
| 170 |
+
"max_tokens": int(os.getenv("GOOGLE_MAX_TOKENS", "1000")),
|
| 171 |
+
},
|
| 172 |
+
# ... other providers
|
| 173 |
+
}
|
| 174 |
+
return configs.get(provider, {})
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
### Fallback Configuration
|
| 178 |
+
```python
|
| 179 |
+
# Automatic fallback on provider failure
|
| 180 |
+
def get_llm_response(message):
|
| 181 |
+
try:
|
| 182 |
+
return primary_provider.chat_completion(message)
|
| 183 |
+
except Exception as e:
|
| 184 |
+
logger.warning(f"Primary provider failed: {e}")
|
| 185 |
+
return fallback_provider.chat_completion(message)
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
### Environment-Specific Configs
|
| 189 |
+
|
| 190 |
+
#### Development (.env.development)
|
| 191 |
+
```bash
|
| 192 |
+
# Fast, free/cheap for testing
|
| 193 |
+
LLM_PROVIDER=google
|
| 194 |
+
GOOGLE_MODEL=gemini-2.5-flash
|
| 195 |
+
GOOGLE_TEMPERATURE=0.8 # More creative for testing
|
| 196 |
+
LLM_FALLBACK_PROVIDER=ollama
|
| 197 |
+
```
|
| 198 |
+
|
| 199 |
+
#### Production (.env.production)
|
| 200 |
+
```bash
|
| 201 |
+
# Reliable, consistent for production
|
| 202 |
+
LLM_PROVIDER=openai
|
| 203 |
+
OPENAI_MODEL=gpt-4o-mini
|
| 204 |
+
OPENAI_TEMPERATURE=0.7 # Consistent responses
|
| 205 |
+
LLM_FALLBACK_PROVIDER=google
|
| 206 |
+
```
|
| 207 |
+
|
| 208 |
+
#### Local Development (.env.local)
|
| 209 |
+
```bash
|
| 210 |
+
# Self-hosted for offline development
|
| 211 |
+
LLM_PROVIDER=ollama
|
| 212 |
+
OLLAMA_MODEL=llama3.1:8b
|
| 213 |
+
OLLAMA_TEMPERATURE=0.7
|
| 214 |
+
# No fallback - fully local
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
## 🚨 Configuration Troubleshooting
|
| 218 |
+
|
| 219 |
+
### Issue: GPT-5-nano Temperature Error
|
| 220 |
+
**Error**: `Temperature must be 1.0 for gpt-5-nano`
|
| 221 |
+
**Status**: ✅ Auto-fixed in `services/llm_service.py`
|
| 222 |
+
**Verification**:
|
| 223 |
+
```bash
|
| 224 |
+
python -c "
|
| 225 |
+
import os
|
| 226 |
+
os.environ['OPENAI_MODEL'] = 'gpt-5-nano'
|
| 227 |
+
os.environ['OPENAI_TEMPERATURE'] = '0.5'
|
| 228 |
+
from services.llm_service import LLMService
|
| 229 |
+
LLMService() # Should log temperature override
|
| 230 |
+
"
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
### Issue: Google Temperature Out of Range
|
| 234 |
+
**Error**: `Temperature must be between 0.0 and 1.0`
|
| 235 |
+
**Solution**: Automatic clamping implemented
|
| 236 |
+
**Test**:
|
| 237 |
+
```bash
|
| 238 |
+
python -c "
|
| 239 |
+
import os
|
| 240 |
+
os.environ['LLM_PROVIDER'] = 'google'
|
| 241 |
+
os.environ['GOOGLE_TEMPERATURE'] = '1.5'
|
| 242 |
+
from services.llm_service import LLMService
|
| 243 |
+
LLMService() # Should clamp to 1.0
|
| 244 |
+
"
|
| 245 |
+
```
|
| 246 |
+
|
| 247 |
+
### Issue: Ollama Connection Failed
|
| 248 |
+
**Error**: `ConnectionError: Could not connect to Ollama`
|
| 249 |
+
**Diagnosis**:
|
| 250 |
+
```bash
|
| 251 |
+
# Check if Ollama is running
|
| 252 |
+
curl -f http://localhost:11434/api/version || echo "Ollama not running"
|
| 253 |
+
|
| 254 |
+
# Check if model exists
|
| 255 |
+
ollama list | grep "llama3.1:8b" || echo "Model not found"
|
| 256 |
+
|
| 257 |
+
# Check system resources
|
| 258 |
+
free -h # RAM usage
|
| 259 |
+
df -h # Disk space
|
| 260 |
+
```
|
| 261 |
+
|
| 262 |
+
**Fix**:
|
| 263 |
+
```bash
|
| 264 |
+
# Start Ollama service
|
| 265 |
+
ollama serve &
|
| 266 |
+
|
| 267 |
+
# Pull required model
|
| 268 |
+
ollama pull llama3.1:8b
|
| 269 |
+
|
| 270 |
+
# Test connection
|
| 271 |
+
curl -d '{"model":"llama3.1:8b","prompt":"test","stream":false}' \
|
| 272 |
+
http://localhost:11434/api/generate
|
| 273 |
+
```
|
| 274 |
+
|
| 275 |
+
### Issue: HuggingFace Temperature Ignored
|
| 276 |
+
**Issue**: Settings have no effect on response
|
| 277 |
+
**Explanation**: This is expected behavior - HuggingFace Inference API typically ignores temperature
|
| 278 |
+
**Workaround**: Use different models or providers for temperature control
|
| 279 |
+
|
| 280 |
+
### Issue: Missing API Keys
|
| 281 |
+
**Error**: `AuthenticationError: Invalid API key`
|
| 282 |
+
**Diagnosis**:
|
| 283 |
+
```bash
|
| 284 |
+
# Check environment variables
|
| 285 |
+
echo "OpenAI: ${OPENAI_API_KEY:0:10}..."
|
| 286 |
+
echo "Google: ${GOOGLE_API_KEY:0:10}..."
|
| 287 |
+
echo "HuggingFace: ${HUGGINGFACE_API_KEY:0:10}..."
|
| 288 |
+
|
| 289 |
+
# Test API key validity
|
| 290 |
+
curl -H "Authorization: Bearer $OPENAI_API_KEY" \
|
| 291 |
+
https://api.openai.com/v1/models | jq '.data[0].id' || echo "Invalid OpenAI key"
|
| 292 |
+
```
|
| 293 |
+
|
| 294 |
+
## 🔍 Configuration Validation
|
| 295 |
+
|
| 296 |
+
### Automated Configuration Check
|
| 297 |
+
```bash
|
| 298 |
+
# Run comprehensive configuration validation
|
| 299 |
+
python -c "
|
| 300 |
+
from config.settings import get_llm_config
|
| 301 |
+
from services.llm_service import LLMService
|
| 302 |
+
import json
|
| 303 |
+
|
| 304 |
+
print('🔧 Configuration Validation')
|
| 305 |
+
print('=' * 40)
|
| 306 |
+
|
| 307 |
+
# Load configuration
|
| 308 |
+
try:
|
| 309 |
+
config = get_llm_config()
|
| 310 |
+
print('✅ Configuration loaded successfully')
|
| 311 |
+
print(f'Provider: {config.get(\"provider\")}')
|
| 312 |
+
print(f'Model: {config.get(\"model\")}')
|
| 313 |
+
print(f'Temperature: {config.get(\"temperature\")}')
|
| 314 |
+
except Exception as e:
|
| 315 |
+
print(f'❌ Configuration error: {e}')
|
| 316 |
+
exit(1)
|
| 317 |
+
|
| 318 |
+
# Test service initialization
|
| 319 |
+
try:
|
| 320 |
+
service = LLMService()
|
| 321 |
+
print('✅ LLM Service initialized')
|
| 322 |
+
except Exception as e:
|
| 323 |
+
print(f'❌ Service initialization failed: {e}')
|
| 324 |
+
exit(1)
|
| 325 |
+
|
| 326 |
+
# Test simple completion
|
| 327 |
+
try:
|
| 328 |
+
response = service.simple_chat_completion('Test message')
|
| 329 |
+
print('✅ Chat completion successful')
|
| 330 |
+
print(f'Response length: {len(response)} characters')
|
| 331 |
+
except Exception as e:
|
| 332 |
+
print(f'❌ Chat completion failed: {e}')
|
| 333 |
+
exit(1)
|
| 334 |
+
|
| 335 |
+
print('🎉 All configuration checks passed!')
|
| 336 |
+
"
|
| 337 |
+
```
|
| 338 |
+
|
| 339 |
+
### Provider-Specific Health Checks
|
| 340 |
+
```bash
|
| 341 |
+
# OpenAI health check
|
| 342 |
+
curl -H "Authorization: Bearer $OPENAI_API_KEY" \
|
| 343 |
+
https://api.openai.com/v1/models | jq '.data | length'
|
| 344 |
+
|
| 345 |
+
# Google health check
|
| 346 |
+
curl "https://generativelanguage.googleapis.com/v1beta/models?key=$GOOGLE_API_KEY" | jq '.models | length'
|
| 347 |
+
|
| 348 |
+
# Ollama health check
|
| 349 |
+
curl http://localhost:11434/api/tags | jq '.models | length'
|
| 350 |
+
|
| 351 |
+
# HuggingFace health check
|
| 352 |
+
curl -H "Authorization: Bearer $HUGGINGFACE_API_KEY" \
|
| 353 |
+
https://huggingface.co/api/whoami | jq '.name'
|
| 354 |
+
```
|
| 355 |
+
|
| 356 |
+
### Configuration Diff Tool
|
| 357 |
+
```bash
|
| 358 |
+
# Compare current config with defaults
|
| 359 |
+
python -c "
|
| 360 |
+
import os
|
| 361 |
+
from config.settings import get_llm_config
|
| 362 |
+
|
| 363 |
+
defaults = {
|
| 364 |
+
'openai': {'temperature': 0.7, 'max_tokens': 1000},
|
| 365 |
+
'google': {'temperature': 0.7, 'max_tokens': 1000},
|
| 366 |
+
'ollama': {'temperature': 0.7, 'max_tokens': 1000},
|
| 367 |
+
}
|
| 368 |
+
|
| 369 |
+
current = get_llm_config()
|
| 370 |
+
provider = current.get('provider')
|
| 371 |
+
default = defaults.get(provider, {})
|
| 372 |
+
|
| 373 |
+
print(f'Configuration for {provider}:')
|
| 374 |
+
for key, default_val in default.items():
|
| 375 |
+
current_val = current.get(key)
|
| 376 |
+
status = '✅' if current_val == default_val else '⚠️'
|
| 377 |
+
print(f'{status} {key}: {current_val} (default: {default_val})')
|
| 378 |
+
"
|
| 379 |
+
```
|
| 380 |
+
|
| 381 |
+
## 📋 Configuration Templates
|
| 382 |
+
|
| 383 |
+
### Minimal Setup (Single Provider)
|
| 384 |
+
```bash
|
| 385 |
+
# .env.minimal
|
| 386 |
+
LLM_PROVIDER=google
|
| 387 |
+
GOOGLE_API_KEY=your_api_key
|
| 388 |
+
GOOGLE_MODEL=gemini-2.5-flash
|
| 389 |
+
```
|
| 390 |
+
|
| 391 |
+
### Robust Setup (Primary + Fallback)
|
| 392 |
+
```bash
|
| 393 |
+
# .env.robust
|
| 394 |
+
LLM_PROVIDER=openai
|
| 395 |
+
OPENAI_API_KEY=your_primary_key
|
| 396 |
+
OPENAI_MODEL=gpt-4o-mini
|
| 397 |
+
LLM_FALLBACK_PROVIDER=google
|
| 398 |
+
GOOGLE_API_KEY=your_fallback_key
|
| 399 |
+
GOOGLE_MODEL=gemini-2.5-flash
|
| 400 |
+
```
|
| 401 |
+
|
| 402 |
+
### Local-First Setup
|
| 403 |
+
```bash
|
| 404 |
+
# .env.local-first
|
| 405 |
+
LLM_PROVIDER=ollama
|
| 406 |
+
OLLAMA_MODEL=llama3.1:8b
|
| 407 |
+
LLM_FALLBACK_PROVIDER=google
|
| 408 |
+
GOOGLE_API_KEY=your_cloud_backup_key
|
| 409 |
+
```
|
| 410 |
+
|
| 411 |
+
### Budget-Conscious Setup
|
| 412 |
+
```bash
|
| 413 |
+
# .env.budget
|
| 414 |
+
LLM_PROVIDER=openai
|
| 415 |
+
OPENAI_MODEL=gpt-5-nano
|
| 416 |
+
OPENAI_TEMPERATURE=1.0 # Fixed for nano
|
| 417 |
+
OPENAI_MAX_TOKENS=500 # Reduce costs
|
| 418 |
+
```
|
| 419 |
+
|
| 420 |
+
## 🔐 Security Best Practices
|
| 421 |
+
|
| 422 |
+
### API Key Management
|
| 423 |
+
```bash
|
| 424 |
+
# Use environment variables
|
| 425 |
+
export OPENAI_API_KEY="sk-..."
|
| 426 |
+
|
| 427 |
+
# Never commit keys to git
|
| 428 |
+
echo "*.env*" >> .gitignore
|
| 429 |
+
echo ".env" >> .gitignore
|
| 430 |
+
|
| 431 |
+
# Use different keys for different environments
|
| 432 |
+
cp .env.example .env.development
|
| 433 |
+
cp .env.example .env.production
|
| 434 |
+
```
|
| 435 |
+
|
| 436 |
+
### Rate Limiting Configuration
|
| 437 |
+
```python
|
| 438 |
+
# Add to config/settings.py
|
| 439 |
+
RATE_LIMITS = {
|
| 440 |
+
"openai": {"rpm": 500, "tpm": 40000},
|
| 441 |
+
"google": {"rpm": 60, "tpm": 32000},
|
| 442 |
+
"ollama": {"rpm": None, "tpm": None}, # Local = unlimited
|
| 443 |
+
}
|
| 444 |
+
```
|
| 445 |
+
|
| 446 |
+
### Error Handling Strategy
|
| 447 |
+
```python
|
| 448 |
+
# Graceful degradation configuration
|
| 449 |
+
FALLBACK_CHAIN = [
|
| 450 |
+
"primary_provider",
|
| 451 |
+
"fallback_provider",
|
| 452 |
+
"local_provider",
|
| 453 |
+
"cached_response"
|
| 454 |
+
]
|
| 455 |
+
```
|
| 456 |
+
|
| 457 |
+
## 🧪 Testing Configuration Changes
|
| 458 |
+
|
| 459 |
+
### Unit Tests for Configuration
|
| 460 |
+
```bash
|
| 461 |
+
# Test temperature overrides
|
| 462 |
+
python -m pytest tests/test_llm_temperature.py -v
|
| 463 |
+
|
| 464 |
+
# Test provider fallbacks
|
| 465 |
+
python -m pytest tests/test_llm_fallback.py -v
|
| 466 |
+
|
| 467 |
+
# Test API key validation
|
| 468 |
+
python -m pytest tests/test_api_keys.py -v
|
| 469 |
+
```
|
| 470 |
+
|
| 471 |
+
### Integration Tests
|
| 472 |
+
```bash
|
| 473 |
+
# Test each provider individually
|
| 474 |
+
python -c "
|
| 475 |
+
import os
|
| 476 |
+
providers = ['openai', 'google', 'ollama']
|
| 477 |
+
|
| 478 |
+
for provider in providers:
|
| 479 |
+
os.environ['LLM_PROVIDER'] = provider
|
| 480 |
+
try:
|
| 481 |
+
from services.llm_service import LLMService
|
| 482 |
+
service = LLMService()
|
| 483 |
+
response = service.simple_chat_completion('Test')
|
| 484 |
+
print(f'✅ {provider}: {len(response)} chars')
|
| 485 |
+
except Exception as e:
|
| 486 |
+
print(f'❌ {provider}: {e}')
|
| 487 |
+
"
|
| 488 |
+
```
|
| 489 |
+
|
| 490 |
+
### Performance Benchmarks
|
| 491 |
+
```bash
|
| 492 |
+
# Measure response times
|
| 493 |
+
python -c "
|
| 494 |
+
import time
|
| 495 |
+
from services.llm_service import LLMService
|
| 496 |
+
|
| 497 |
+
service = LLMService()
|
| 498 |
+
start = time.time()
|
| 499 |
+
response = service.simple_chat_completion('Quick recipe suggestion')
|
| 500 |
+
elapsed = time.time() - start
|
| 501 |
+
|
| 502 |
+
print(f'Response time: {elapsed:.2f}s')
|
| 503 |
+
print(f'Response length: {len(response)} characters')
|
| 504 |
+
print(f'Words per second: {len(response.split()) / elapsed:.1f}')
|
| 505 |
+
"
|
| 506 |
+
```
|
| 507 |
+
|
| 508 |
+
## 🔄 Configuration Migration
|
| 509 |
+
|
| 510 |
+
### Upgrading from Old Configuration
|
| 511 |
+
```bash
|
| 512 |
+
# Migrate old environment variables
|
| 513 |
+
# Old format → New format
|
| 514 |
+
mv .env .env.backup
|
| 515 |
+
|
| 516 |
+
# Update variable names
|
| 517 |
+
sed 's/^LLM_MODEL=/OPENAI_MODEL=/' .env.backup > .env
|
| 518 |
+
sed -i 's/^LLM_TEMPERATURE=/OPENAI_TEMPERATURE=/' .env
|
| 519 |
+
sed -i 's/^LLM_MAX_TOKENS=/OPENAI_MAX_TOKENS=/' .env
|
| 520 |
+
|
| 521 |
+
echo "LLM_PROVIDER=openai" >> .env
|
| 522 |
+
```
|
| 523 |
+
|
| 524 |
+
### Version Compatibility Check
|
| 525 |
+
```python
|
| 526 |
+
# Check if configuration is compatible
|
| 527 |
+
def check_config_version():
|
| 528 |
+
required_vars = ["LLM_PROVIDER"]
|
| 529 |
+
legacy_vars = ["LLM_MODEL", "LLM_TEMPERATURE"]
|
| 530 |
+
|
| 531 |
+
has_new = all(os.getenv(var) for var in required_vars)
|
| 532 |
+
has_legacy = any(os.getenv(var) for var in legacy_vars)
|
| 533 |
+
|
| 534 |
+
if has_legacy and not has_new:
|
| 535 |
+
raise ValueError("Legacy configuration detected. Please migrate to new format.")
|
| 536 |
+
|
| 537 |
+
return has_new
|
| 538 |
+
```
|
| 539 |
+
|
| 540 |
+
---
|
| 541 |
+
|
| 542 |
+
💡 **Next Steps**: After configuring your providers, see the [Model Selection Guide](./model-selection-guide.md) for choosing the best models for your use case.
|
backend/docs/model-selection-guide.md
ADDED
|
@@ -0,0 +1,502 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Model Selection Guide
|
| 2 |
+
|
| 3 |
+
## 🎯 At-a-Glance Recommendations
|
| 4 |
+
|
| 5 |
+
| Priority | Best Choice | Provider | Monthly Cost* | Setup Time | Quality Score | Why Choose This |
|
| 6 |
+
|----------|-------------|----------|---------------|------------|---------------|-----------------|
|
| 7 |
+
| **Ease of Use** | Gemini 2.5 Flash | Google | Free - $2 | 2 min | 90% | Excellent free tier |
|
| 8 |
+
| **Best Value** | GPT-5-nano | OpenAI | $1.00 | 2 min | 88% | Modern GPT-5 at nano price |
|
| 9 |
+
| **Premium Quality** | Claude 3 Opus | Anthropic | $225 | 2 min | 95% | Highest reasoning quality |
|
| 10 |
+
| **Self-Hosted** | Llama 3.1:8b | Ollama | Free | 10 min | 82% | Perfect balance |
|
| 11 |
+
| **High-End Local** | DeepSeek-R1:7b | Ollama | Free | 15 min | 88% | Best reasoning model |
|
| 12 |
+
| **Budget Cloud** | Claude 3.5 Haiku | Anthropic | $4 | 2 min | 87% | Fast and affordable |
|
| 13 |
+
| **Alternative Local** | CodeQwen1.5:7b | Ollama | Free | 10 min | 85% | Excellent for structured data |
|
| 14 |
+
|
| 15 |
+
*Based on 30,000 queries/month
|
| 16 |
+
|
| 17 |
+
---
|
| 18 |
+
|
| 19 |
+
## 🏢 Cloud Models (Closed Source)
|
| 20 |
+
|
| 21 |
+
### OpenAI Models
|
| 22 |
+
|
| 23 |
+
#### GPT-5 (Latest Flagship) ⭐ **NEW**
|
| 24 |
+
```bash
|
| 25 |
+
OPENAI_MODEL=gpt-5
|
| 26 |
+
```
|
| 27 |
+
- **Pricing**: $20/month (Plus plan) - Unlimited with guardrails
|
| 28 |
+
- **Capabilities**: Advanced reasoning, thinking, code execution
|
| 29 |
+
- **Best For**: Premium applications requiring cutting-edge AI
|
| 30 |
+
- **Recipe Quality**: Outstanding (96%) - Best culinary understanding
|
| 31 |
+
- **Context**: 196K tokens (reasoning mode)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
#### GPT-5-nano (Ultra Budget) ⭐ **HIDDEN GEM**
|
| 35 |
+
```bash
|
| 36 |
+
OPENAI_MODEL=gpt-5-nano
|
| 37 |
+
```
|
| 38 |
+
- **Pricing**: $0.05/1M input, $0.40/1M output tokens
|
| 39 |
+
- **Monthly Cost**: ~$1.00 for 30K queries
|
| 40 |
+
- **Best For**: Budget-conscious deployments with modern capabilities
|
| 41 |
+
- **Recipe Quality**: Very Good (88%)
|
| 42 |
+
- **Speed**: Very Fast
|
| 43 |
+
- **Features**: GPT-5 architecture at nano pricing
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
#### GPT-4o-mini (Proven Budget Choice)
|
| 47 |
+
```bash
|
| 48 |
+
OPENAI_MODEL=gpt-4o-mini
|
| 49 |
+
```
|
| 50 |
+
- **Pricing**: $0.15/1M input, $0.60/1M output tokens
|
| 51 |
+
- **Monthly Cost**: ~$4 for 30K queries
|
| 52 |
+
- **Best For**: Cost-effective production deployments
|
| 53 |
+
- **Recipe Quality**: Very Good (86%)
|
| 54 |
+
- **Speed**: Very Fast
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
### Google AI (Gemini) Models
|
| 58 |
+
|
| 59 |
+
#### Gemini 2.5 Flash ⭐ **RECOMMENDED**
|
| 60 |
+
```bash
|
| 61 |
+
GOOGLE_MODEL=gemini-2.5-flash
|
| 62 |
+
```
|
| 63 |
+
- **Pricing**: Free tier, then $0.30/1M input, $2.50/1M output
|
| 64 |
+
- **Monthly Cost**: Free - $2 for most usage patterns
|
| 65 |
+
- **Best For**: Development and cost-conscious production
|
| 66 |
+
- **Recipe Quality**: Excellent (90%)
|
| 67 |
+
- **Features**: Thinking budgets, 1M context window
|
| 68 |
+
|
| 69 |
+
#### Gemini 2.5 Pro (High-End)
|
| 70 |
+
```bash
|
| 71 |
+
GOOGLE_MODEL=gemini-2.5-pro
|
| 72 |
+
```
|
| 73 |
+
- **Pricing**: $1.25/1M input, $10/1M output (≤200K context)
|
| 74 |
+
- **Monthly Cost**: ~$25 for 30K queries
|
| 75 |
+
- **Best For**: Premium applications requiring best Google AI
|
| 76 |
+
- **Recipe Quality**: Excellent (92%)
|
| 77 |
+
|
| 78 |
+
#### Gemini 2.0 Flash-Lite (Ultra Budget)
|
| 79 |
+
```bash
|
| 80 |
+
GOOGLE_MODEL=gemini-2.0-flash-lite
|
| 81 |
+
```
|
| 82 |
+
- **Pricing**: $0.075/1M input, $0.30/1M output
|
| 83 |
+
- **Monthly Cost**: ~$0.90 for 30K queries
|
| 84 |
+
- **Best For**: High-volume, cost-sensitive applications
|
| 85 |
+
- **Recipe Quality**: Good (85%)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
## 🔓 Open Source Models (Self-Hosted)
|
| 89 |
+
|
| 90 |
+
### Ollama Models (Latest Releases)
|
| 91 |
+
|
| 92 |
+
#### DeepSeek-R1:7b ⭐ **BREAKTHROUGH MODEL**
|
| 93 |
+
```bash
|
| 94 |
+
OLLAMA_MODEL=deepseek-r1:7b
|
| 95 |
+
```
|
| 96 |
+
- **Parameters**: 7B
|
| 97 |
+
- **Download**: ~4.7GB
|
| 98 |
+
- **RAM Required**: 8GB
|
| 99 |
+
- **Best For**: Advanced reasoning tasks, O1-level performance
|
| 100 |
+
- **Recipe Quality**: Outstanding (88%)
|
| 101 |
+
- **Special**: Chain-of-thought reasoning, approaching GPT-4 performance
|
| 102 |
+
|
| 103 |
+
#### Gemma 3:27b ⭐ **NEW FLAGSHIP**
|
| 104 |
+
```bash
|
| 105 |
+
OLLAMA_MODEL=gemma3:27b
|
| 106 |
+
```
|
| 107 |
+
- **Parameters**: 27B
|
| 108 |
+
- **Download**: ~17GB
|
| 109 |
+
- **RAM Required**: 32GB
|
| 110 |
+
- **Best For**: Highest quality open source experience
|
| 111 |
+
- **Recipe Quality**: Outstanding (89%)
|
| 112 |
+
- **Features**: Vision capabilities, state-of-the-art performance
|
| 113 |
+
|
| 114 |
+
#### Llama 3.1:8b (Proven Choice)
|
| 115 |
+
```bash
|
| 116 |
+
OLLAMA_MODEL=llama3.1:8b
|
| 117 |
+
```
|
| 118 |
+
- **Parameters**: 8B
|
| 119 |
+
- **Download**: ~4.7GB
|
| 120 |
+
- **RAM Required**: 8GB
|
| 121 |
+
- **Best For**: Balanced production deployment
|
| 122 |
+
- **Recipe Quality**: Very Good (82%)
|
| 123 |
+
- **Status**: Your current choice - excellent balance!
|
| 124 |
+
|
| 125 |
+
#### Qwen 3:8b ⭐ **NEW RELEASE**
|
| 126 |
+
```bash
|
| 127 |
+
OLLAMA_MODEL=qwen3:8b
|
| 128 |
+
```
|
| 129 |
+
- **Parameters**: 8B
|
| 130 |
+
- **Download**: ~4.4GB
|
| 131 |
+
- **RAM Required**: 8GB
|
| 132 |
+
- **Best For**: Multilingual support, latest technology
|
| 133 |
+
- **Recipe Quality**: Very Good (84%)
|
| 134 |
+
- **Features**: Tool use, thinking capabilities
|
| 135 |
+
|
| 136 |
+
#### Phi 4:14b ⭐ **MICROSOFT'S LATEST**
|
| 137 |
+
```bash
|
| 138 |
+
OLLAMA_MODEL=phi4:14b
|
| 139 |
+
```
|
| 140 |
+
- **Parameters**: 14B
|
| 141 |
+
- **Download**: ~9.1GB
|
| 142 |
+
- **RAM Required**: 16GB
|
| 143 |
+
- **Best For**: Reasoning and math tasks
|
| 144 |
+
- **Recipe Quality**: Very Good (85%)
|
| 145 |
+
- **Features**: State-of-the-art efficiency
|
| 146 |
+
|
| 147 |
+
#### Gemma 3:4b (Efficient Choice)
|
| 148 |
+
```bash
|
| 149 |
+
OLLAMA_MODEL=gemma3:4b
|
| 150 |
+
```
|
| 151 |
+
- **Parameters**: 4B
|
| 152 |
+
- **Download**: ~3.3GB
|
| 153 |
+
- **RAM Required**: 6GB
|
| 154 |
+
- **Best For**: Resource-constrained deployments
|
| 155 |
+
- **Recipe Quality**: Good (78%)
|
| 156 |
+
- **Features**: Excellent for size, runs on modest hardware
|
| 157 |
+
|
| 158 |
+
### Community Models via Ollama (originally published on HuggingFace)
|
| 159 |
+
|
| 160 |
+
#### CodeQwen1.5:7b ⭐ **ALIBABA'S CODE MODEL**
|
| 161 |
+
```bash
|
| 162 |
+
OLLAMA_MODEL=codeqwen:7b
|
| 163 |
+
```
|
| 164 |
+
- **Parameters**: 7B
|
| 165 |
+
- **Download**: ~4.2GB
|
| 166 |
+
- **RAM Required**: 8GB
|
| 167 |
+
- **Best For**: Recipe parsing, ingredient analysis, structured data
|
| 168 |
+
- **Recipe Quality**: Very Good (85%)
|
| 169 |
+
- **Features**: Excellent at understanding structured recipe formats
|
| 170 |
+
|
| 171 |
+
#### Mistral-Nemo:12b ⭐ **BALANCED CHOICE**
|
| 172 |
+
```bash
|
| 173 |
+
OLLAMA_MODEL=mistral-nemo:12b
|
| 174 |
+
```
|
| 175 |
+
- **Parameters**: 12B
|
| 176 |
+
- **Download**: ~7GB
|
| 177 |
+
- **RAM Required**: 12GB
|
| 178 |
+
- **Best For**: General conversation with good reasoning
|
| 179 |
+
- **Recipe Quality**: Very Good (84%)
|
| 180 |
+
- **Features**: Multilingual, efficient, well-balanced
|
| 181 |
+
|
| 182 |
+
#### Nous-Hermes2:10.7b ⭐ **FINE-TUNED EXCELLENCE**
|
| 183 |
+
```bash
|
| 184 |
+
OLLAMA_MODEL=nous-hermes2:10.7b
|
| 185 |
+
```
|
| 186 |
+
- **Parameters**: 10.7B
|
| 187 |
+
- **Download**: ~6.4GB
|
| 188 |
+
- **RAM Required**: 12GB
|
| 189 |
+
- **Best For**: Instruction following, detailed responses
|
| 190 |
+
- **Recipe Quality**: Very Good (83%)
|
| 191 |
+
- **Features**: Excellent instruction following, helpful responses
|
| 192 |
+
|
| 193 |
+
#### OpenHermes2.5-Mistral:7b ⭐ **COMMUNITY FAVORITE**
|
| 194 |
+
```bash
|
| 195 |
+
OLLAMA_MODEL=openhermes2.5-mistral:7b
|
| 196 |
+
```
|
| 197 |
+
- **Parameters**: 7B
|
| 198 |
+
- **Download**: ~4.1GB
|
| 199 |
+
- **RAM Required**: 8GB
|
| 200 |
+
- **Best For**: Creative recipe suggestions, conversational AI
|
| 201 |
+
- **Recipe Quality**: Good (81%)
|
| 202 |
+
- **Features**: Creative, conversational, reliable
|
| 203 |
+
|
| 204 |
+
#### Solar:10.7b ⭐ **UPSTAGE'S MODEL**
|
| 205 |
+
```bash
|
| 206 |
+
OLLAMA_MODEL=solar:10.7b
|
| 207 |
+
```
|
| 208 |
+
- **Parameters**: 10.7B
|
| 209 |
+
- **Download**: ~6.1GB
|
| 210 |
+
- **RAM Required**: 12GB
|
| 211 |
+
- **Best For**: Analytical tasks, recipe modifications
|
| 212 |
+
- **Recipe Quality**: Very Good (83%)
|
| 213 |
+
- **Features**: Strong analytical capabilities, detailed explanations
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
### Anthropic Claude Models (Cloud API — closed source, not self-hosted)
|
| 217 |
+
|
| 218 |
+
#### Claude 3.5 Sonnet (Production Standard)
|
| 219 |
+
```bash
|
| 220 |
+
ANTHROPIC_MODEL=claude-3-5-sonnet-20241022
|
| 221 |
+
```
|
| 222 |
+
- **Pricing**: $3/1M input, $15/1M output tokens
|
| 223 |
+
- **Monthly Cost**: ~$45 for 30K queries
|
| 224 |
+
- **Best For**: Balanced performance and reasoning
|
| 225 |
+
- **Recipe Quality**: Outstanding (94%)
|
| 226 |
+
- **Features**: Advanced analysis, code understanding
|
| 227 |
+
|
| 228 |
+
#### Claude 3.5 Haiku (Speed Focused)
|
| 229 |
+
```bash
|
| 230 |
+
ANTHROPIC_MODEL=claude-3-5-haiku-20241022
|
| 231 |
+
```
|
| 232 |
+
- **Pricing**: $0.25/1M input, $1.25/1M output tokens
|
| 233 |
+
- **Monthly Cost**: ~$4 for 30K queries
|
| 234 |
+
- **Best For**: Fast, cost-effective responses
|
| 235 |
+
- **Recipe Quality**: Very Good (87%)
|
| 236 |
+
- **Features**: Lightning fast, good quality
|
| 237 |
+
|
| 238 |
+
#### Claude 3 Opus (Premium Reasoning)
|
| 239 |
+
```bash
|
| 240 |
+
ANTHROPIC_MODEL=claude-3-opus-20240229
|
| 241 |
+
```
|
| 242 |
+
- **Pricing**: $15/1M input, $75/1M output tokens
|
| 243 |
+
- **Monthly Cost**: ~$225 for 30K queries
|
| 244 |
+
- **Best For**: Complex reasoning, highest quality
|
| 245 |
+
- **Recipe Quality**: Outstanding (95%)
|
| 246 |
+
- **Features**: Top-tier reasoning, complex tasks
|
| 247 |
+
|
| 248 |
+
---
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
## 🎯 Scenario-Based Recommendations
|
| 252 |
+
|
| 253 |
+
### 👨💻 **Development & Testing**
|
| 254 |
+
**Choice**: Gemini 2.5 Flash
|
| 255 |
+
```bash
|
| 256 |
+
LLM_PROVIDER=google
|
| 257 |
+
GOOGLE_MODEL=gemini-2.5-flash
|
| 258 |
+
```
|
| 259 |
+
- Free tier covers most development
|
| 260 |
+
- Excellent quality for testing
|
| 261 |
+
- Easy setup and integration
|
| 262 |
+
|
| 263 |
+
### 🚀 **Small to Medium Production**
|
| 264 |
+
**Choice**: Gemini 2.5 Flash or GPT-4o-mini
|
| 265 |
+
```bash
|
| 266 |
+
# Cost-focused
|
| 267 |
+
LLM_PROVIDER=google
|
| 268 |
+
GOOGLE_MODEL=gemini-2.5-flash
|
| 269 |
+
|
| 270 |
+
# Quality-focused
|
| 271 |
+
LLM_PROVIDER=openai
|
| 272 |
+
OPENAI_MODEL=gpt-4o-mini
|
| 273 |
+
```
|
| 274 |
+
|
| 275 |
+
### 🏠 **Self-Hosted**
|
| 276 |
+
**Choice**: Llama 3.1:8b or upgrade to DeepSeek-R1:7b
|
| 277 |
+
```bash
|
| 278 |
+
# Your current (excellent choice)
|
| 279 |
+
LLM_PROVIDER=ollama
|
| 280 |
+
OLLAMA_MODEL=llama3.1:8b
|
| 281 |
+
|
| 282 |
+
# Upgrade option (better reasoning)
|
| 283 |
+
LLM_PROVIDER=ollama
|
| 284 |
+
OLLAMA_MODEL=deepseek-r1:7b
|
| 285 |
+
```
|
| 286 |
+
|
| 287 |
+
### 💰 **Budget/Free**
|
| 288 |
+
**Choice**: Local models or GPT-5-nano
|
| 289 |
+
```bash
|
| 290 |
+
# Best local alternative
|
| 291 |
+
LLM_PROVIDER=ollama
|
| 292 |
+
OLLAMA_MODEL=codeqwen:7b
|
| 293 |
+
|
| 294 |
+
# Best budget paid option
|
| 295 |
+
LLM_PROVIDER=openai
|
| 296 |
+
OPENAI_MODEL=gpt-5-nano
|
| 297 |
+
|
| 298 |
+
# Quality budget cloud
|
| 299 |
+
LLM_PROVIDER=anthropic
|
| 300 |
+
ANTHROPIC_MODEL=claude-3-5-haiku-20241022
|
| 301 |
+
```
|
| 302 |
+
|
| 303 |
+
### 🔒 **Privacy/Offline**
|
| 304 |
+
**Choice**: DeepSeek-R1:7b or Gemma 3:4b
|
| 305 |
+
```bash
|
| 306 |
+
# Best reasoning
|
| 307 |
+
LLM_PROVIDER=ollama
|
| 308 |
+
OLLAMA_MODEL=deepseek-r1:7b
|
| 309 |
+
|
| 310 |
+
# Resource-efficient
|
| 311 |
+
LLM_PROVIDER=ollama
|
| 312 |
+
OLLAMA_MODEL=gemma3:4b
|
| 313 |
+
```
|
| 314 |
+
|
| 315 |
+
---
|
| 316 |
+
|
| 317 |
+
## ⚡ Quick Setup Commands
|
| 318 |
+
|
| 319 |
+
### Quick Setup Examples (Cloud and Local)
|
| 320 |
+
|
| 321 |
+
#### Gemini 2.5 Flash (Recommended)
|
| 322 |
+
```bash
|
| 323 |
+
# Update .env
|
| 324 |
+
LLM_PROVIDER=google
|
| 325 |
+
GOOGLE_MODEL=gemini-2.5-flash
|
| 326 |
+
GOOGLE_TEMPERATURE=0.7
|
| 327 |
+
GOOGLE_MAX_TOKENS=1000
|
| 328 |
+
|
| 329 |
+
# Test
|
| 330 |
+
python -c "
|
| 331 |
+
from services.llm_service import LLMService
|
| 332 |
+
service = LLMService()
|
| 333 |
+
print('✅ Gemini 2.5 Flash ready!')
|
| 334 |
+
response = service.simple_chat_completion('Suggest a quick pasta recipe')
|
| 335 |
+
print(f'Response: {response[:100]}...')
|
| 336 |
+
"
|
| 337 |
+
```
|
| 338 |
+
|
| 339 |
+
#### CodeQwen1.5:7b (Structured Data Expert)
|
| 340 |
+
```bash
|
| 341 |
+
# Pull model
|
| 342 |
+
ollama pull codeqwen:7b
|
| 343 |
+
|
| 344 |
+
# Update .env
|
| 345 |
+
LLM_PROVIDER=ollama
|
| 346 |
+
OLLAMA_MODEL=codeqwen:7b
|
| 347 |
+
OLLAMA_TEMPERATURE=0.7
|
| 348 |
+
|
| 349 |
+
# Test
|
| 350 |
+
python -c "
|
| 351 |
+
from services.llm_service import LLMService
|
| 352 |
+
service = LLMService()
|
| 353 |
+
print('✅ CodeQwen 1.5:7b ready!')
|
| 354 |
+
response = service.simple_chat_completion('Parse this recipe: 2 cups flour, 1 egg, 1 cup milk')
|
| 355 |
+
print(f'Response: {response[:100]}...')
|
| 356 |
+
"
|
| 357 |
+
```
|
| 358 |
+
|
| 359 |
+
#### Mistral-Nemo:12b (Balanced Performance)
|
| 360 |
+
```bash
|
| 361 |
+
# Pull model
|
| 362 |
+
ollama pull mistral-nemo:12b
|
| 363 |
+
|
| 364 |
+
# Update .env
|
| 365 |
+
LLM_PROVIDER=ollama
|
| 366 |
+
OLLAMA_MODEL=mistral-nemo:12b
|
| 367 |
+
OLLAMA_TEMPERATURE=0.7
|
| 368 |
+
|
| 369 |
+
# Test
|
| 370 |
+
python -c "
|
| 371 |
+
from services.llm_service import LLMService
|
| 372 |
+
service = LLMService()
|
| 373 |
+
print('✅ Mistral-Nemo ready!')
|
| 374 |
+
response = service.simple_chat_completion('Suggest a Mediterranean dinner menu')
|
| 375 |
+
print(f'Response: {response[:100]}...')
|
| 376 |
+
"
|
| 377 |
+
```
|
| 378 |
+
|
| 379 |
+
#### Claude 3.5 Haiku (Speed + Quality)
|
| 380 |
+
```bash
|
| 381 |
+
# Update .env
|
| 382 |
+
LLM_PROVIDER=anthropic
|
| 383 |
+
ANTHROPIC_MODEL=claude-3-5-haiku-20241022
|
| 384 |
+
ANTHROPIC_TEMPERATURE=0.7
|
| 385 |
+
ANTHROPIC_MAX_TOKENS=1000
|
| 386 |
+
|
| 387 |
+
# Test
|
| 388 |
+
python -c "
|
| 389 |
+
from services.llm_service import LLMService
|
| 390 |
+
service = LLMService()
|
| 391 |
+
print('✅ Claude 3.5 Haiku ready!')
|
| 392 |
+
response = service.simple_chat_completion('Quick dinner ideas with vegetables')
|
| 393 |
+
print(f'Response: {response[:100]}...')
|
| 394 |
+
"
|
| 395 |
+
```
|
| 396 |
+
|
| 397 |
+
#### GPT-5-nano (Budget Winner)
|
| 398 |
+
```bash
|
| 399 |
+
# Update .env
|
| 400 |
+
LLM_PROVIDER=openai
|
| 401 |
+
OPENAI_MODEL=gpt-5-nano
|
| 402 |
+
OPENAI_TEMPERATURE=0.7
|
| 403 |
+
OPENAI_MAX_TOKENS=1000
|
| 404 |
+
|
| 405 |
+
# Test
|
| 406 |
+
python -c "
|
| 407 |
+
from services.llm_service import LLMService
|
| 408 |
+
service = LLMService()
|
| 409 |
+
print('✅ GPT-5-nano ready!')
|
| 410 |
+
response = service.simple_chat_completion('Quick healthy breakfast ideas')
|
| 411 |
+
print(f'Response: {response[:100]}...')
|
| 412 |
+
"
|
| 413 |
+
```
|
| 414 |
+
|
| 415 |
+
#### GPT-5 (Premium)
|
| 416 |
+
```bash
|
| 417 |
+
# Update .env
|
| 418 |
+
LLM_PROVIDER=openai
|
| 419 |
+
OPENAI_MODEL=gpt-5
|
| 420 |
+
OPENAI_TEMPERATURE=0.7
|
| 421 |
+
OPENAI_MAX_TOKENS=1000
|
| 422 |
+
|
| 423 |
+
# Test
|
| 424 |
+
python -c "
|
| 425 |
+
from services.llm_service import LLMService
|
| 426 |
+
service = LLMService()
|
| 427 |
+
print('✅ GPT-5 ready!')
|
| 428 |
+
response = service.simple_chat_completion('Create a healthy meal plan')
|
| 429 |
+
print(f'Response: {response[:100]}...')
|
| 430 |
+
"
|
| 431 |
+
```
|
| 432 |
+
|
| 433 |
+
### Self-Hosted Models
|
| 434 |
+
|
| 435 |
+
#### DeepSeek-R1:7b (Latest Breakthrough)
|
| 436 |
+
```bash
|
| 437 |
+
# Pull model
|
| 438 |
+
ollama pull deepseek-r1:7b
|
| 439 |
+
|
| 440 |
+
# Update .env
|
| 441 |
+
LLM_PROVIDER=ollama
|
| 442 |
+
OLLAMA_MODEL=deepseek-r1:7b
|
| 443 |
+
OLLAMA_TEMPERATURE=0.7
|
| 444 |
+
|
| 445 |
+
# Start Ollama
|
| 446 |
+
ollama serve &
|
| 447 |
+
|
| 448 |
+
# Test
|
| 449 |
+
python -c "
|
| 450 |
+
from services.llm_service import LLMService
|
| 451 |
+
service = LLMService()
|
| 452 |
+
print('✅ DeepSeek-R1 ready!')
|
| 453 |
+
response = service.simple_chat_completion('Explain the science behind sourdough fermentation')
|
| 454 |
+
print(f'Response: {response[:100]}...')
|
| 455 |
+
"
|
| 456 |
+
```
|
| 457 |
+
|
| 458 |
+
#### Gemma 3:4b (Efficient)
|
| 459 |
+
```bash
|
| 460 |
+
# Pull model
|
| 461 |
+
ollama pull gemma3:4b
|
| 462 |
+
|
| 463 |
+
# Update .env
|
| 464 |
+
LLM_PROVIDER=ollama
|
| 465 |
+
OLLAMA_MODEL=gemma3:4b
|
| 466 |
+
OLLAMA_TEMPERATURE=0.7
|
| 467 |
+
|
| 468 |
+
# Test
|
| 469 |
+
python -c "
|
| 470 |
+
from services.llm_service import LLMService
|
| 471 |
+
service = LLMService()
|
| 472 |
+
print('✅ Gemma 3:4b ready!')
|
| 473 |
+
response = service.simple_chat_completion('Quick chicken recipes for weeknight dinners')
|
| 474 |
+
print(f'Response: {response[:100]}...')
|
| 475 |
+
"
|
| 476 |
+
```
|
| 477 |
+
|
| 478 |
+
---
|
| 479 |
+
|
| 480 |
+
## 🔧 Hardware Requirements
|
| 481 |
+
|
| 482 |
+
### Cloud Models
|
| 483 |
+
- **Requirements**: Internet connection, API key
|
| 484 |
+
- **RAM**: Any (processing done remotely)
|
| 485 |
+
- **Storage**: Minimal
|
| 486 |
+
- **Best For**: Instant setup, no hardware constraints
|
| 487 |
+
|
| 488 |
+
### Self-Hosted Requirements
|
| 489 |
+
|
| 490 |
+
| Model | Parameters | RAM Needed | Storage | GPU Beneficial | Best For |
|
| 491 |
+
|-------|------------|------------|---------|----------------|----------|
|
| 492 |
+
| `gemma3:4b` | 4B | 6GB | 3.3GB | Optional | Laptops, modest hardware |
|
| 493 |
+
| `codeqwen:7b` | 7B | 8GB | 4.2GB | Yes | Structured data, parsing |
|
| 494 |
+
| `llama3.1:8b` | 8B | 8GB | 4.7GB | Yes | Standard workstations |
|
| 495 |
+
| `deepseek-r1:7b` | 7B | 8GB | 4.7GB | Yes | Reasoning tasks |
|
| 496 |
+
| `openhermes2.5-mistral:7b` | 7B | 8GB | 4.1GB | Yes | Conversational AI |
|
| 497 |
+
| `nous-hermes2:10.7b` | 10.7B | 12GB | 6.4GB | Recommended | Instruction following |
|
| 498 |
+
| `mistral-nemo:12b` | 12B | 12GB | 7GB | Recommended | Balanced performance |
|
| 499 |
+
| `phi4:14b` | 14B | 16GB | 9.1GB | Recommended | High-end workstations |
|
| 500 |
+
| `gemma3:27b` | 27B | 32GB | 17GB | Required | Powerful servers |
|
| 501 |
+
|
| 502 |
+
---
|
backend/docs/opensource-llm-configuration.md
ADDED
|
@@ -0,0 +1,394 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Open Source LLM Configuration Guide (HuggingFace & Ollama)
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
The Recipe Recommendation Bot supports open source models through both HuggingFace and Ollama. This guide explains how to configure these providers for optimal performance, with recommended models under 20B parameters.
|
| 5 |
+
|
| 6 |
+
> 📚 **For comprehensive model comparisons including closed source options (OpenAI, Google), see [Comprehensive Model Guide](./comprehensive-model-guide.md)**
|
| 7 |
+
|
| 8 |
+
## Quick Model Recommendations
|
| 9 |
+
|
| 10 |
+
| Use Case | Model | Download Size | RAM Required | Quality |
|
| 11 |
+
|----------|-------|---------------|--------------|---------|
|
| 12 |
+
| **Development** | `gemma2:2b` | 1.6GB | 4GB | Good |
|
| 13 |
+
| **Production** | `llama3.1:8b` | 4.7GB | 8GB | Excellent |
|
| 14 |
+
| **High Quality** | `llama3.1:13b` | 7.4GB | 16GB | Outstanding |
|
| 15 |
+
| **API (Free)** | `deepseek-ai/DeepSeek-V3.1` | 0GB | N/A | Very Good |
|
| 16 |
+
|
| 17 |
+
## 🤗 HuggingFace Configuration
|
| 18 |
+
|
| 19 |
+
### Environment Variables
|
| 20 |
+
|
| 21 |
+
Add these variables to your `.env` file:
|
| 22 |
+
|
| 23 |
+
```bash
|
| 24 |
+
# LLM Provider Configuration
|
| 25 |
+
LLM_PROVIDER=huggingface
|
| 26 |
+
|
| 27 |
+
# HuggingFace Configuration
|
| 28 |
+
HUGGINGFACE_API_TOKEN=your_hf_token_here # Optional for public models
|
| 29 |
+
HUGGINGFACE_MODEL=deepseek-ai/DeepSeek-V3.1 # Current recommended model
|
| 30 |
+
HUGGINGFACE_API_URL=https://api-inference.huggingface.co/models/
|
| 31 |
+
HUGGINGFACE_USE_API=true # Use API vs local inference
|
| 32 |
+
HUGGINGFACE_USE_GPU=false # Set to true for local GPU inference
|
| 33 |
+
|
| 34 |
+
# Embedding Configuration
|
| 35 |
+
HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
### Deployment Options
|
| 39 |
+
|
| 40 |
+
#### Option 1: API Inference (Recommended)
|
| 41 |
+
```bash
|
| 42 |
+
HUGGINGFACE_USE_API=true
|
| 43 |
+
```
|
| 44 |
+
- **Pros**: No local downloads, fast startup, always latest models
|
| 45 |
+
- **Cons**: Requires internet connection, API rate limits
|
| 46 |
+
- **Download Size**: 0 bytes (no local storage needed)
|
| 47 |
+
- **Best for**: Development, testing, quick prototyping
|
| 48 |
+
|
| 49 |
+
#### Option 2: Local Inference
|
| 50 |
+
```bash
|
| 51 |
+
HUGGINGFACE_USE_API=false
|
| 52 |
+
HUGGINGFACE_USE_GPU=false # CPU-only
|
| 53 |
+
```
|
| 54 |
+
- **Pros**: No internet required, no rate limits, private
|
| 55 |
+
- **Cons**: Large model downloads, slower inference on CPU
|
| 56 |
+
- **Best for**: Production, offline deployments
|
| 57 |
+
|
| 58 |
+
#### Option 3: Local GPU Inference
|
| 59 |
+
```bash
|
| 60 |
+
HUGGINGFACE_USE_API=false
|
| 61 |
+
HUGGINGFACE_USE_GPU=true # Requires CUDA GPU
|
| 62 |
+
```
|
| 63 |
+
- **Pros**: Fast inference, no internet required, no rate limits
|
| 64 |
+
- **Cons**: Large downloads, requires GPU with sufficient VRAM
|
| 65 |
+
- **Best for**: Production with GPU resources
|
| 66 |
+
|
| 67 |
+
### Recommended HuggingFace Models
|
| 68 |
+
|
| 69 |
+
#### Lightweight Models (Good for CPU)
|
| 70 |
+
```bash
|
| 71 |
+
HUGGINGFACE_MODEL=microsoft/DialoGPT-small # ~117MB download
|
| 72 |
+
HUGGINGFACE_MODEL=distilgpt2 # ~319MB download
|
| 73 |
+
HUGGINGFACE_MODEL=google/flan-t5-small # ~242MB download
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
#### Balanced Performance Models
|
| 77 |
+
```bash
|
| 78 |
+
HUGGINGFACE_MODEL=microsoft/DialoGPT-medium # ~863MB download
|
| 79 |
+
HUGGINGFACE_MODEL=google/flan-t5-base # ~990MB download
|
| 80 |
+
HUGGINGFACE_MODEL=microsoft/CodeGPT-small-py # ~510MB download
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
#### High Quality Models (GPU Recommended)
|
| 84 |
+
```bash
|
| 85 |
+
HUGGINGFACE_MODEL=deepseek-ai/DeepSeek-V3.1 # ~4.2GB download (7B params)
|
| 86 |
+
HUGGINGFACE_MODEL=microsoft/DialoGPT-large # ~3.2GB download
|
| 87 |
+
HUGGINGFACE_MODEL=google/flan-t5-large # ~2.8GB download (770M params)
|
| 88 |
+
HUGGINGFACE_MODEL=huggingface/CodeBERTa-small-v1 # ~1.1GB download
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
#### Specialized Recipe/Cooking Models
|
| 92 |
+
```bash
|
| 93 |
+
HUGGINGFACE_MODEL=recipe-nlg/recipe-nlg-base # ~450MB download
|
| 94 |
+
HUGGINGFACE_MODEL=cooking-assistant/chef-gpt # ~2.1GB download (if available)
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
## 🦙 Ollama Configuration
|
| 98 |
+
|
| 99 |
+
### Installation
|
| 100 |
+
|
| 101 |
+
First, install Ollama on your system:
|
| 102 |
+
|
| 103 |
+
```bash
|
| 104 |
+
# Linux/macOS
|
| 105 |
+
curl -fsSL https://ollama.ai/install.sh | sh
|
| 106 |
+
|
| 107 |
+
# Windows
|
| 108 |
+
# Download installer from https://ollama.ai/download
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
### Environment Variables
|
| 112 |
+
|
| 113 |
+
```bash
|
| 114 |
+
# LLM Provider Configuration
|
| 115 |
+
LLM_PROVIDER=ollama
|
| 116 |
+
|
| 117 |
+
# Ollama Configuration
|
| 118 |
+
OLLAMA_BASE_URL=http://localhost:11434
|
| 119 |
+
OLLAMA_MODEL=llama3.1:8b
|
| 120 |
+
OLLAMA_TEMPERATURE=0.7
|
| 121 |
+
|
| 122 |
+
# Embedding Configuration
|
| 123 |
+
OLLAMA_EMBEDDING_MODEL=nomic-embed-text
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
### Starting Ollama Service
|
| 127 |
+
|
| 128 |
+
```bash
|
| 129 |
+
# Start Ollama server
|
| 130 |
+
ollama serve
|
| 131 |
+
|
| 132 |
+
# In another terminal, pull your desired model
|
| 133 |
+
ollama pull llama3.1:8b
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
### Recommended Ollama Models
|
| 137 |
+
|
| 138 |
+
#### Lightweight Models (4GB RAM or less)
|
| 139 |
+
```bash
|
| 140 |
+
OLLAMA_MODEL=phi3:mini # ~2.3GB download (3.8B params)
|
| 141 |
+
OLLAMA_MODEL=gemma2:2b # ~1.6GB download (2B params)
|
| 142 |
+
OLLAMA_MODEL=qwen2:1.5b # ~934MB download (1.5B params)
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
#### Balanced Performance Models (8GB RAM)
|
| 146 |
+
```bash
|
| 147 |
+
OLLAMA_MODEL=llama3.1:8b # ~4.7GB download (8B params)
|
| 148 |
+
OLLAMA_MODEL=gemma2:9b # ~5.4GB download (9B params)
|
| 149 |
+
OLLAMA_MODEL=mistral:7b # ~4.1GB download (7B params)
|
| 150 |
+
OLLAMA_MODEL=qwen2:7b # ~4.4GB download (7B params)
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
#### High Quality Models (16GB+ RAM)
|
| 154 |
+
```bash
|
| 155 |
+
OLLAMA_MODEL=llama3.1:13b               # ~7.4GB download (13B params)  # NOTE(review): verify tag — Llama 3.1 ships in 8B/70B/405B; the 13B/7.4GB figures match llama2:13b
|
| 156 |
+
OLLAMA_MODEL=mixtral:8x7b # ~26GB download (47B params - sparse)
|
| 157 |
+
OLLAMA_MODEL=qwen2:14b # ~8.2GB download (14B params)
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
#### Code/Instruction Following Models
|
| 161 |
+
```bash
|
| 162 |
+
OLLAMA_MODEL=codellama:7b # ~3.8GB download (7B params)
|
| 163 |
+
OLLAMA_MODEL=deepseek-coder:6.7b # ~3.8GB download (6.7B params)
|
| 164 |
+
OLLAMA_MODEL=wizard-coder:7b # ~4.1GB download (7B params)
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
### Ollama Model Management
|
| 168 |
+
|
| 169 |
+
```bash
|
| 170 |
+
# List available models
|
| 171 |
+
ollama list
|
| 172 |
+
|
| 173 |
+
# Pull a specific model
|
| 174 |
+
ollama pull llama3.1:8b
|
| 175 |
+
|
| 176 |
+
# Remove a model to free space
|
| 177 |
+
ollama rm old-model:tag
|
| 178 |
+
|
| 179 |
+
# Check model information
|
| 180 |
+
ollama show llama3.1:8b
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
## Installation Requirements
|
| 184 |
+
|
| 185 |
+
### HuggingFace Setup
|
| 186 |
+
|
| 187 |
+
#### For API Usage (No Downloads)
|
| 188 |
+
```bash
|
| 189 |
+
pip install -r requirements.txt
|
| 190 |
+
# No additional setup needed
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
#### For Local CPU Inference
|
| 194 |
+
```bash
|
| 195 |
+
pip install -r requirements.txt
|
| 196 |
+
# Models will be downloaded automatically on first use
|
| 197 |
+
```
|
| 198 |
+
|
| 199 |
+
#### For Local GPU Inference
|
| 200 |
+
```bash
|
| 201 |
+
# Install CUDA version of PyTorch
|
| 202 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
| 203 |
+
|
| 204 |
+
# Install other requirements
|
| 205 |
+
pip install -r requirements.txt
|
| 206 |
+
|
| 207 |
+
# Verify GPU availability
|
| 208 |
+
python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
### Ollama Setup
|
| 212 |
+
|
| 213 |
+
#### Installation
|
| 214 |
+
```bash
|
| 215 |
+
# Install Ollama
|
| 216 |
+
curl -fsSL https://ollama.ai/install.sh | sh
|
| 217 |
+
|
| 218 |
+
# Start Ollama service
|
| 219 |
+
ollama serve
|
| 220 |
+
|
| 221 |
+
# Pull your first model (in another terminal)
|
| 222 |
+
ollama pull llama3.1:8b
|
| 223 |
+
```
|
| 224 |
+
|
| 225 |
+
## Storage Requirements & Download Sizes
|
| 226 |
+
|
| 227 |
+
### HuggingFace Local Models
|
| 228 |
+
- **Storage Location**: `~/.cache/huggingface/transformers/`
|
| 229 |
+
- **Small Models**: 100MB - 1GB (good for development)
|
| 230 |
+
- **Medium Models**: 1GB - 5GB (balanced performance)
|
| 231 |
+
- **Large Models**: 5GB - 15GB (high quality, under 20B params)
|
| 232 |
+
|
| 233 |
+
### Ollama Models
|
| 234 |
+
- **Storage Location**: `~/.ollama/models/`
|
| 235 |
+
- **Quantized Storage**: Models use efficient quantization (4-bit, 8-bit)
|
| 236 |
+
- **2B Models**: ~1-2GB download
|
| 237 |
+
- **7-8B Models**: ~4-5GB download
|
| 238 |
+
- **13-14B Models**: ~7-8GB download
|
| 239 |
+
|
| 240 |
+
### Embedding Models
|
| 241 |
+
```bash
|
| 242 |
+
# HuggingFace Embeddings (auto-downloaded)
|
| 243 |
+
sentence-transformers/all-MiniLM-L6-v2 # ~80MB
|
| 244 |
+
sentence-transformers/all-mpnet-base-v2 # ~420MB
|
| 245 |
+
|
| 246 |
+
# Ollama Embeddings
|
| 247 |
+
ollama pull nomic-embed-text # ~274MB
|
| 248 |
+
ollama pull mxbai-embed-large # ~669MB
|
| 249 |
+
```
|
| 250 |
+
|
| 251 |
+
## Performance & Hardware Recommendations
|
| 252 |
+
|
| 253 |
+
### System Requirements
|
| 254 |
+
|
| 255 |
+
#### Minimum (API Usage)
|
| 256 |
+
- **RAM**: 2GB
|
| 257 |
+
- **Storage**: 100MB
|
| 258 |
+
- **Internet**: Required for API calls
|
| 259 |
+
|
| 260 |
+
#### CPU Inference
|
| 261 |
+
- **RAM**: 8GB+ (16GB for larger models)
|
| 262 |
+
- **CPU**: 4+ cores recommended
|
| 263 |
+
- **Storage**: 5GB+ for models cache
|
| 264 |
+
|
| 265 |
+
#### GPU Inference
|
| 266 |
+
- **GPU**: 8GB+ VRAM (for 7B models)
|
| 267 |
+
- **RAM**: 16GB+ system RAM
|
| 268 |
+
- **Storage**: 10GB+ for models
|
| 269 |
+
|
| 270 |
+
### Performance Tips
|
| 271 |
+
|
| 272 |
+
1. **Start Small**: Begin with lightweight models and upgrade based on quality needs
|
| 273 |
+
2. **Use API First**: Test with the HuggingFace API before committing to local inference
|
| 274 |
+
3. **Monitor Resources**: Check CPU/GPU/RAM usage during inference
|
| 275 |
+
4. **Model Caching**: First run downloads models, subsequent runs are faster
|
| 276 |
+
|
| 277 |
+
## Troubleshooting
|
| 278 |
+
|
| 279 |
+
### HuggingFace Issues
|
| 280 |
+
|
| 281 |
+
#### "accelerate package required"
|
| 282 |
+
```bash
|
| 283 |
+
pip install accelerate
|
| 284 |
+
```
|
| 285 |
+
|
| 286 |
+
#### GPU not detected
|
| 287 |
+
```bash
|
| 288 |
+
# Check CUDA availability
|
| 289 |
+
python -c "import torch; print(torch.cuda.is_available())"
|
| 290 |
+
|
| 291 |
+
# If false, install CUDA PyTorch
|
| 292 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
| 293 |
+
```
|
| 294 |
+
|
| 295 |
+
#### Out of memory errors
|
| 296 |
+
- Switch to a smaller model
|
| 297 |
+
- Set `HUGGINGFACE_USE_GPU=false` for CPU inference
|
| 298 |
+
- Use API instead: `HUGGINGFACE_USE_API=true`
|
| 299 |
+
|
| 300 |
+
### Ollama Issues
|
| 301 |
+
|
| 302 |
+
#### Ollama service not starting
|
| 303 |
+
```bash
|
| 304 |
+
# Check if port 11434 is available
|
| 305 |
+
lsof -i :11434
|
| 306 |
+
|
| 307 |
+
# Restart Ollama
|
| 308 |
+
ollama serve
|
| 309 |
+
```
|
| 310 |
+
|
| 311 |
+
#### Model not found
|
| 312 |
+
```bash
|
| 313 |
+
# List available models
|
| 314 |
+
ollama list
|
| 315 |
+
|
| 316 |
+
# Pull the model
|
| 317 |
+
ollama pull llama3.1:8b
|
| 318 |
+
```
|
| 319 |
+
|
| 320 |
+
#### Slow inference
|
| 321 |
+
- Try a smaller model
|
| 322 |
+
- Check available RAM
|
| 323 |
+
- Consider using GPU if available
|
| 324 |
+
|
| 325 |
+
## Quick Tests
|
| 326 |
+
|
| 327 |
+
### Test HuggingFace Configuration
|
| 328 |
+
```bash
|
| 329 |
+
cd backend
|
| 330 |
+
python -c "
|
| 331 |
+
from services.llm_service import LLMService
|
| 332 |
+
import os
|
| 333 |
+
os.environ['LLM_PROVIDER'] = 'huggingface'
|
| 334 |
+
service = LLMService()
|
| 335 |
+
print('✅ HuggingFace LLM working!')
|
| 336 |
+
response = service.simple_chat_completion('Hello')
|
| 337 |
+
print(f'Response: {response}')
|
| 338 |
+
"
|
| 339 |
+
```
|
| 340 |
+
|
| 341 |
+
### Test Ollama Configuration
|
| 342 |
+
```bash
|
| 343 |
+
# First ensure Ollama is running
|
| 344 |
+
ollama serve &
|
| 345 |
+
|
| 346 |
+
# Test the service
|
| 347 |
+
cd backend
|
| 348 |
+
python -c "
|
| 349 |
+
from services.llm_service import LLMService
|
| 350 |
+
import os
|
| 351 |
+
os.environ['LLM_PROVIDER'] = 'ollama'
|
| 352 |
+
service = LLMService()
|
| 353 |
+
print('✅ Ollama LLM working!')
|
| 354 |
+
response = service.simple_chat_completion('Hello')
|
| 355 |
+
print(f'Response: {response}')
|
| 356 |
+
"
|
| 357 |
+
```
|
| 358 |
+
|
| 359 |
+
## Configuration Examples
|
| 360 |
+
|
| 361 |
+
### Development Setup (Fast Start)
|
| 362 |
+
```bash
|
| 363 |
+
# Use HuggingFace API for quick testing
|
| 364 |
+
LLM_PROVIDER=huggingface
|
| 365 |
+
HUGGINGFACE_USE_API=true
|
| 366 |
+
HUGGINGFACE_MODEL=deepseek-ai/DeepSeek-V3.1
|
| 367 |
+
HUGGINGFACE_API_TOKEN=your_token_here
|
| 368 |
+
```
|
| 369 |
+
|
| 370 |
+
### Local CPU Setup
|
| 371 |
+
```bash
|
| 372 |
+
# Local inference on CPU
|
| 373 |
+
LLM_PROVIDER=ollama
|
| 374 |
+
OLLAMA_MODEL=llama3.1:8b
|
| 375 |
+
OLLAMA_BASE_URL=http://localhost:11434
|
| 376 |
+
```
|
| 377 |
+
|
| 378 |
+
### Local GPU Setup
|
| 379 |
+
```bash
|
| 380 |
+
# Local inference with GPU acceleration
|
| 381 |
+
LLM_PROVIDER=huggingface
|
| 382 |
+
HUGGINGFACE_USE_API=false
|
| 383 |
+
HUGGINGFACE_USE_GPU=true
|
| 384 |
+
HUGGINGFACE_MODEL=deepseek-ai/DeepSeek-V3.1
|
| 385 |
+
```
|
| 386 |
+
|
| 387 |
+
### Production Setup (High Performance)
|
| 388 |
+
```bash
|
| 389 |
+
# Ollama with optimized model
|
| 390 |
+
LLM_PROVIDER=ollama
|
| 391 |
+
OLLAMA_MODEL=llama3.1:13b               # Higher quality  # NOTE(review): verify tag — Llama 3.1 has no 13b variant; confirm intended model
|
| 392 |
+
OLLAMA_BASE_URL=http://localhost:11434
|
| 393 |
+
# Ensure 16GB+ RAM available
|
| 394 |
+
```
|
backend/docs/optimal_recipes_structure.md
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Universal Recipe Data Structure
|
| 2 |
+
|
| 3 |
+
This document defines a simple, universal data structure for recipe storage that works efficiently with both ChromaDB and MongoDB Atlas for ingredient-based recipe recommendations.
|
| 4 |
+
|
| 5 |
+
## Core Principles
|
| 6 |
+
|
| 7 |
+
1. **Ingredient-focused**: Primary search is by ingredients
|
| 8 |
+
2. **Universal compatibility**: Same structure works for ChromaDB and MongoDB
|
| 9 |
+
3. **Simple and clean**: Easy to understand and maintain
|
| 10 |
+
4. **Efficient retrieval**: Optimized for RAG performance
|
| 11 |
+
|
| 12 |
+
## Universal Recipe Structure
|
| 13 |
+
|
| 14 |
+
### Required Fields
|
| 15 |
+
|
| 16 |
+
```json
|
| 17 |
+
{
|
| 18 |
+
"title": "String - Recipe name",
|
| 19 |
+
"ingredients": ["Array of strings - Individual ingredients"],
|
| 20 |
+
"instructions": "String - Step-by-step cooking instructions",
|
| 21 |
+
"metadata": {
|
| 22 |
+
"cook_time": "String - Optional cooking time",
|
| 23 |
+
"difficulty": "String - Optional difficulty level",
|
| 24 |
+
"servings": "String - Optional number of servings",
|
| 25 |
+
"category": "String - Optional recipe category",
|
| 26 |
+
"image_url": "String - Optional recipe image URL"
|
| 27 |
+
}
|
| 28 |
+
}
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
### Example Document
|
| 32 |
+
|
| 33 |
+
```json
|
| 34 |
+
{
|
| 35 |
+
"title": "Mixed Seafood Coconut Fried Rice",
|
| 36 |
+
"ingredients": [
|
| 37 |
+
"jasmine rice",
|
| 38 |
+
"cooked shrimp",
|
| 39 |
+
"prawns",
|
| 40 |
+
"scallops",
|
| 41 |
+
"coconut milk",
|
| 42 |
+
"fish sauce",
|
| 43 |
+
"soy sauce",
|
| 44 |
+
"garlic",
|
| 45 |
+
"onion",
|
| 46 |
+
"ginger",
|
| 47 |
+
"green onions",
|
| 48 |
+
"cilantro",
|
| 49 |
+
"lime",
|
| 50 |
+
"vegetable oil",
|
| 51 |
+
"salt",
|
| 52 |
+
"pepper"
|
| 53 |
+
],
|
| 54 |
+
"instructions": "1. Heat vegetable oil in large pan. 2. Add garlic, onion, ginger and stir-fry until fragrant. 3. Add cooked rice and mix well. 4. Add seafood and cook until heated through. 5. Pour in coconut milk and season with fish sauce and soy sauce. 6. Garnish with green onions and cilantro. 7. Serve with lime wedges.",
|
| 55 |
+
"metadata": {
|
| 56 |
+
"cook_time": "25 minutes",
|
| 57 |
+
"difficulty": "medium",
|
| 58 |
+
"servings": "4",
|
| 59 |
+
"category": "seafood",
|
| 60 |
+
"image_url": "https://example.com/images/mixed-seafood-coconut-fried-rice.jpg"
|
| 61 |
+
}
|
| 62 |
+
}
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
## Key Features
|
| 66 |
+
|
| 67 |
+
### 1. Clean Ingredients Format
|
| 68 |
+
- **Array structure**: Each ingredient as separate string
|
| 69 |
+
- **Individual embedding**: Each ingredient can be embedded separately
|
| 70 |
+
- **Easy matching**: Simple array operations for ingredient search
|
| 71 |
+
- **No duplicates**: Each ingredient appears once in the array
|
| 72 |
+
|
| 73 |
+
### 2. Universal Compatibility
|
| 74 |
+
- **ChromaDB**: Automatically creates embeddings from the full document
|
| 75 |
+
- **MongoDB Atlas**: Can use pre-computed embeddings or text search
|
| 76 |
+
- **Same structure**: No provider-specific modifications needed
|
| 77 |
+
|
| 78 |
+
### 3. Efficient Search Patterns
|
| 79 |
+
|
| 80 |
+
#### Primary: Ingredient-based Search
|
| 81 |
+
```
|
| 82 |
+
User: "I have shrimp, rice, and coconut milk"
|
| 83 |
+
Search: ingredients array for ["shrimp", "rice", "coconut"]
|
| 84 |
+
Result: Mixed Seafood Coconut Fried Rice (high relevance)
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
#### Secondary: Title-based Search
|
| 88 |
+
```
|
| 89 |
+
User: "How to make fried rice"
|
| 90 |
+
Search: title field for "fried rice"
|
| 91 |
+
Result: All fried rice recipes
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
#### Fallback: Full-text Search
|
| 95 |
+
```
|
| 96 |
+
User: "Quick dinner recipes"
|
| 97 |
+
Search: Full document for "quick dinner"
|
| 98 |
+
Result: Recipes mentioning quick preparation
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
## Implementation Guidelines
|
| 102 |
+
|
| 103 |
+
### For ChromaDB
|
| 104 |
+
```python
|
| 105 |
+
# Documents are automatically embedded as full text
|
| 106 |
+
ingredients_text = ", ".join(recipe['ingredients'])
|
| 107 |
+
document = Document(
|
| 108 |
+
page_content=f"Title: {recipe['title']}. Ingredients: {ingredients_text}. Instructions: {recipe['instructions']}",
|
| 109 |
+
metadata=recipe['metadata']
|
| 110 |
+
)
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
### For MongoDB Atlas
|
| 114 |
+
```python
|
| 115 |
+
# Can use array search or vector search on the same structure
|
| 116 |
+
# Array search on ingredients
|
| 117 |
+
{"ingredients": {"$in": user_ingredients_list}}
|
| 118 |
+
|
| 119 |
+
# Or vector search if embeddings are pre-computed
|
| 120 |
+
{"ingredients_vector": {"$near": query_embedding}}
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
## Data Preparation
|
| 124 |
+
|
| 125 |
+
### Ingredient Processing Rules
|
| 126 |
+
1. **Clean individual items**: "2 cups rice" → "rice"
|
| 127 |
+
2. **Remove measurements**: "1 lb chicken breast" → "chicken breast"
|
| 128 |
+
3. **Lowercase**: "Fresh Basil" → "fresh basil"
|
| 129 |
+
4. **Array format**: ["rice", "chicken breast", "fresh basil"]
|
| 130 |
+
5. **No duplicates**: Remove duplicate ingredients from array
|
| 131 |
+
|
| 132 |
+
### Example Transformation
|
| 133 |
+
```
|
| 134 |
+
Raw: "2 lbs fresh shrimp, 1 cup jasmine rice (cooked), 1/2 cup coconut milk"
|
| 135 |
+
Clean: ["fresh shrimp", "jasmine rice", "coconut milk"]
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
## Benefits
|
| 139 |
+
|
| 140 |
+
### 1. Simplicity
|
| 141 |
+
- Single structure for all providers
|
| 142 |
+
- Easy to understand and maintain
|
| 143 |
+
- No complex transformations needed
|
| 144 |
+
|
| 145 |
+
### 2. Performance
|
| 146 |
+
- Optimized for ingredient matching
|
| 147 |
+
- Fast text and vector search
|
| 148 |
+
- Minimal processing overhead
|
| 149 |
+
|
| 150 |
+
### 3. Flexibility
|
| 151 |
+
- Works with existing MongoDB data
|
| 152 |
+
- Compatible with ChromaDB auto-embedding
|
| 153 |
+
- Supports both search types (text/vector)
|
| 154 |
+
|
| 155 |
+
### 4. Scalability
|
| 156 |
+
- Easy to add new recipes
|
| 157 |
+
- Simple data validation
|
| 158 |
+
- Consistent across providers
|
| 159 |
+
|
| 160 |
+
This universal structure ensures maximum compatibility and efficiency for ingredient-based recipe recommendations across all vector store providers.
|
backend/docs/sanitization_guide.md
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Simplified Data Sanitization Documentation
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
The simplified data sanitization module provides focused input validation and sanitization for the Recipe Recommendation Bot API. It's designed specifically for the recipe-chatbot context with essential security protection.
|
| 6 |
+
|
| 7 |
+
## Features
|
| 8 |
+
|
| 9 |
+
### 🛡️ **Essential Security Protection**
|
| 10 |
+
- **XSS Prevention**: HTML encoding and basic script removal
|
| 11 |
+
- **Input Validation**: Length limits and content validation
|
| 12 |
+
- **Whitespace Normalization**: Clean formatting
|
| 13 |
+
|
| 14 |
+
### 🔧 **Simple Configuration**
|
| 15 |
+
- **Maximum Message Length**: 1000 characters
|
| 16 |
+
- **Minimum Message Length**: 1 character
|
| 17 |
+
- **Single Method**: One sanitization method for all inputs
|
| 18 |
+
|
| 19 |
+
## Usage
|
| 20 |
+
|
| 21 |
+
### Basic Sanitization
|
| 22 |
+
|
| 23 |
+
```python
|
| 24 |
+
from utils.sanitization import sanitize_user_input
|
| 25 |
+
|
| 26 |
+
# Sanitize any user input (chat messages, demo prompts)
|
| 27 |
+
clean_input = sanitize_user_input("What are some chicken recipes?")
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
### Advanced Usage
|
| 31 |
+
|
| 32 |
+
```python
|
| 33 |
+
from utils.sanitization import DataSanitizer
|
| 34 |
+
|
| 35 |
+
# Direct class usage
|
| 36 |
+
sanitizer = DataSanitizer()
|
| 37 |
+
clean_text = sanitizer.sanitize_input("User input")
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
## Security Patterns Handled
|
| 41 |
+
|
| 42 |
+
### Basic XSS Protection
|
| 43 |
+
- `<script>` tags → Removed
|
| 44 |
+
- `javascript:` URLs → Cleaned
|
| 45 |
+
- Event handlers (`onclick`, `onload`) → Removed
|
| 46 |
+
- HTML entities → Properly encoded
|
| 47 |
+
|
| 48 |
+
### Input Validation
|
| 49 |
+
- Length limits (1-1000 characters)
|
| 50 |
+
- Empty input detection
|
| 51 |
+
- Whitespace normalization
|
| 52 |
+
|
| 53 |
+
## Integration
|
| 54 |
+
|
| 55 |
+
The sanitization is automatically applied in FastAPI endpoints:
|
| 56 |
+
|
| 57 |
+
### Chat Endpoint
|
| 58 |
+
```python
|
| 59 |
+
class ChatMessage(BaseModel):
|
| 60 |
+
message: str = Field(..., min_length=1, max_length=1000)
|
| 61 |
+
|
| 62 |
+
@validator('message')
|
| 63 |
+
def sanitize_message_field(cls, v):
|
| 64 |
+
return sanitize_user_input(v)
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
### Demo Endpoint
|
| 68 |
+
```python
|
| 69 |
+
@app.get("/demo")
|
| 70 |
+
def demo(prompt: str = "What recipes do you have?"):
|
| 71 |
+
sanitized_prompt = sanitize_user_input(prompt)
|
| 72 |
+
# ... rest of the logic
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
## Error Handling
|
| 76 |
+
|
| 77 |
+
The sanitization raises `ValueError` for invalid input:
|
| 78 |
+
|
| 79 |
+
```python
|
| 80 |
+
try:
|
| 81 |
+
clean_input = sanitize_user_input(user_input)
|
| 82 |
+
except ValueError as e:
|
| 83 |
+
return {"error": f"Invalid input: {str(e)}"}
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
## Testing
|
| 87 |
+
|
| 88 |
+
Run the sanitization tests:
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
python3 test_sanitization.py
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
The test suite covers:
|
| 95 |
+
- Normal recipe-related messages
|
| 96 |
+
- Basic harmful content (scripts, JavaScript)
|
| 97 |
+
- Length validation
|
| 98 |
+
- Whitespace normalization
|
| 99 |
+
- Edge cases
|
| 100 |
+
|
| 101 |
+
## What's Simplified
|
| 102 |
+
|
| 103 |
+
### Removed Overly Complex Features:
|
| 104 |
+
- ❌ SQL injection patterns (not relevant for LLM chatbot)
|
| 105 |
+
- ❌ Command injection patterns (not applicable)
|
| 106 |
+
- ❌ Separate strict/relaxed modes (unnecessary complexity)
|
| 107 |
+
- ❌ Multiple sanitization methods (unified approach)
|
| 108 |
+
|
| 109 |
+
### Kept Essential Features:
|
| 110 |
+
- ✅ Basic XSS protection
|
| 111 |
+
- ✅ Input length validation
|
| 112 |
+
- ✅ HTML encoding
|
| 113 |
+
- ✅ Whitespace normalization
|
| 114 |
+
- ✅ Clear error messages
|
| 115 |
+
|
| 116 |
+
## Performance
|
| 117 |
+
|
| 118 |
+
- **Lightweight**: Minimal regex patterns
|
| 119 |
+
- **Fast**: Simple operations only
|
| 120 |
+
- **Memory Efficient**: No complex state
|
| 121 |
+
- **Recipe-Focused**: Context-appropriate validation
|
| 122 |
+
|
| 123 |
+
## Examples
|
| 124 |
+
|
| 125 |
+
### Valid Inputs (Cleaned):
|
| 126 |
+
```python
|
| 127 |
+
"What are chicken recipes?" → "What are chicken recipes?"
|
| 128 |
+
"<script>alert('xss')</script>Tell me about pasta" → "Tell me about pasta"
|
| 129 |
+
" How to cook rice? " → "How to cook rice?"
|
| 130 |
+
"What about desserts & sweets?" → "What about desserts & sweets?"
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
### Invalid Inputs (Rejected):
|
| 134 |
+
```python
|
| 135 |
+
"" → ValueError: Input cannot be empty
|
| 136 |
+
"a" * 1001 → ValueError: Input too long (maximum 1000 characters)
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
## Best Practices
|
| 140 |
+
|
| 141 |
+
1. **Keep It Simple**: Focus on actual threats for recipe chatbot
|
| 142 |
+
2. **Context Appropriate**: Don't over-engineer for non-existent threats
|
| 143 |
+
3. **User Friendly**: Allow normal recipe-related punctuation
|
| 144 |
+
4. **Clear Errors**: Provide helpful error messages
|
| 145 |
+
5. **Test Regularly**: Verify with real recipe queries
|
| 146 |
+
|
| 147 |
+
This simplified approach provides adequate protection while maintaining usability for a recipe recommendation chatbot context.
|
backend/docs/scraper.md
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Recipe Scraper – FastAPI demo
|
| 2 |
+
|
| 3 |
+
A tiny FastAPI service + CLI that scrapes recipe sites, normalizes data, and (optionally) embeds combined **ingredients + instructions** into a single vector (`recipe_emb`). Designed as a **test project**—simple to run locally, easy to extend.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Features
|
| 8 |
+
|
| 9 |
+
* 🔧 **Sites**: `yummy` (YummyMedley), `anr` (All Nigerian Recipes)
|
| 10 |
+
* 🧱 **Unified text**: builds `recipe_text` from sections, or embeds `("ingredients","instructions") → recipe_emb`
|
| 11 |
+
* 🧠 **Embeddings**: Hugging Face `sentence-transformers` via your `HFEmbedder` (default: `all-MiniLM-L6-v2`)
|
| 12 |
+
* 🚀 **API trigger**: `POST /scrape` runs scraping in the background
|
| 13 |
+
* 👀 **Progress**: `GET /jobs/{job_id}` (and optional `GET /jobs`) to check status
|
| 14 |
+
* 💾 **Output**: `output_type = "json"` (local file) or `"mongo"` (MongoDB/Atlas)
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## Project layout (essential bits)
|
| 19 |
+
|
| 20 |
+
```
|
| 21 |
+
backend/
|
| 22 |
+
app.py
|
| 23 |
+
data_minning/
|
| 24 |
+
    base_scrapper.py # BaseRecipeScraper (+ StreamOptions)
|
| 25 |
+
all_nigerian_recipe_scraper.py
|
| 26 |
+
yummy_medley_scraper.py
|
| 27 |
+
dto/recipe_doc.py
|
| 28 |
+
soup_client.py
|
| 29 |
+
utils/sanitization.py
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
Make sure every package dir has an `__init__.py`.
|
| 33 |
+
|
| 34 |
+
---
|
| 35 |
+
|
| 36 |
+
## Requirements
|
| 37 |
+
|
| 38 |
+
* Python 3.9+
|
| 39 |
+
* macOS/Linux (Windows should work too)
|
| 40 |
+
* (Optional) MongoDB/Atlas for `"mongo"` output
|
| 41 |
+
|
| 42 |
+
### Install
|
| 43 |
+
|
| 44 |
+
```bash
|
| 45 |
+
python3 -m venv .venv
|
| 46 |
+
source .venv/bin/activate
|
| 47 |
+
|
| 48 |
+
pip install --upgrade pip
|
| 49 |
+
pip install -r requirements.txt
|
| 50 |
+
# If you don’t have a requirements.txt, minimum:
|
| 51 |
+
pip install fastapi "uvicorn[standard]" pydantic==2.* requests beautifulsoup4 \
|
| 52 |
+
sentence-transformers numpy pymongo python-dotenv
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
> If `uvicorn` isn’t found on your PATH, you can always run with `python3 -m uvicorn ...`.
|
| 56 |
+
|
| 57 |
+
---
|
| 58 |
+
|
| 59 |
+
## Environment variables
|
| 60 |
+
|
| 61 |
+
Create `.env` in repo root (or export envs) as needed:
|
| 62 |
+
|
| 63 |
+
```dotenv
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# For Mongo output_type="mongo"
|
| 67 |
+
MONGODB_URI=mongodb+srv://user:pass@cluster/recipes?retryWrites=true&w=majority
|
| 68 |
+
MONGODB_DB=recipes
|
| 69 |
+
MONGODB_COL=items
|
| 70 |
+
ATLAS_INDEX=recipes_vec # your Atlas Search index name
|
| 71 |
+
|
| 72 |
+
# Embeddings (HFEmbedder)
|
| 73 |
+
HF_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
| 74 |
+
HF_DEVICE=cpu # or cuda
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
---
|
| 78 |
+
|
| 79 |
+
## Running the API
|
| 80 |
+
|
| 81 |
+
From the project root (the folder **containing** `backend/`):
|
| 82 |
+
|
| 83 |
+
```bash
|
| 84 |
+
python3 -m uvicorn app:app --reload --host 127.0.0.1 --port 8080
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
---
|
| 89 |
+
|
| 90 |
+
## API
|
| 91 |
+
|
| 92 |
+
### POST `/scrape`
|
| 93 |
+
|
| 94 |
+
Trigger a scrape job (non-blocking). **Body** is a JSON object:
|
| 95 |
+
|
| 96 |
+
```json
|
| 97 |
+
{
|
| 98 |
+
"site": "yummy",
|
| 99 |
+
"limit": 50, #optional
|
| 100 |
+
"output_type": "json" // or "mongo"
|
| 101 |
+
}
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
**Headers**
|
| 105 |
+
|
| 106 |
+
* `Content-Type: application/json`
|
| 107 |
+
* If enabled: `X-API-Key: <ADMIN_API_KEY>`
|
| 108 |
+
|
| 109 |
+
**curl example (JSON output):**
|
| 110 |
+
|
| 111 |
+
```bash
|
| 112 |
+
curl -X POST http://127.0.0.1:8080/scrape \
|
| 113 |
+
-H "Content-Type: application/json" \
|
| 114 |
+
-H "X-API-Key: dev-key" \
|
| 115 |
+
-d '{"site":"yummy","limit":20,"output_type":"json"}'
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
**Response**
|
| 119 |
+
|
| 120 |
+
```json
|
| 121 |
+
{ "job_id": "yummy-a1b2c3d4", "status": "queued" }
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
### GET `/jobs/{job_id}`
|
| 125 |
+
|
| 126 |
+
Check progress:
|
| 127 |
+
|
| 128 |
+
```bash
|
| 129 |
+
curl http://127.0.0.1:8080/jobs/yummy-a1b2c3d4
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
**Possible responses**
|
| 133 |
+
|
| 134 |
+
```json
|
| 135 |
+
{ "status": "running", "count": 13 }
|
| 136 |
+
{ "status": "done", "count": 50 }
|
| 137 |
+
{ "status": "error", "error": "Traceback ..." }
|
| 138 |
+
{ "status": "unknown" }
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
### (Optional) GET `/jobs`
|
| 142 |
+
|
| 143 |
+
Return the whole in-memory job map (useful for debugging):
|
| 144 |
+
|
| 145 |
+
```bash
|
| 146 |
+
curl http://127.0.0.1:8080/jobs
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
> Note: jobs are stored in a process-local dict and are cleared on server restart.
|
| 150 |
+
|
| 151 |
+
---
|
| 152 |
+
|
| 153 |
+
## Output modes
|
| 154 |
+
|
| 155 |
+
### `"json"`
|
| 156 |
+
|
| 157 |
+
Writes batches to a JSON sink (e.g., newline-delimited file). Check the sink path configured in your `JsonArraySink`/`DualSink`.
|
| 158 |
+
|
| 159 |
+
Typical document shape:
|
| 160 |
+
|
| 161 |
+
```json
|
| 162 |
+
{
|
| 163 |
+
"title": "...",
|
| 164 |
+
"url": "...",
|
| 165 |
+
"source": "...",
|
| 166 |
+
"category": "...",
|
| 167 |
+
"ingredients": "- 1 cup rice\n- 2 tbsp oil\n...",
|
| 168 |
+
"instructions": "1. Heat oil...\n\n2. Add rice...",
|
| 169 |
+
"image_url": "...",
|
| 170 |
+
"needs_review": false,
|
| 171 |
+
"scraped_at": "2025-09-14 10:03:32.289232",
|
| 172 |
+
"recipe_emb": [0.0123, -0.0456, ...] // when embeddings enabled
|
| 173 |
+
}
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
### `"mongo"`
|
| 177 |
+
|
| 178 |
+
Writes to `MONGODB_DB.MONGODB_COL`. Ensure your Atlas Search index is created if you plan to query vectors.
|
| 179 |
+
|
| 180 |
+
**Atlas index mapping (single vector field)**
|
| 181 |
+
|
| 182 |
+
```json
|
| 183 |
+
{
|
| 184 |
+
"mappings": {
|
| 185 |
+
"dynamic": false,
|
| 186 |
+
"fields": {
|
| 187 |
+
"recipe_emb": { "type": "knnVector", "dims": 384, "similarity": "cosine" }
|
| 188 |
+
}
|
| 189 |
+
}
|
| 190 |
+
}
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
**Query example:**
|
| 194 |
+
|
| 195 |
+
```python
|
| 196 |
+
qvec = embedder.encode([query])[0]
|
| 197 |
+
pipeline = [{
|
| 198 |
+
"$vectorSearch": {
|
| 199 |
+
"index": os.getenv("ATLAS_INDEX", "recipes_vec"),
|
| 200 |
+
"path": "recipe_emb",
|
| 201 |
+
"queryVector": qvec,
|
| 202 |
+
"numCandidates": 400,
|
| 203 |
+
"limit": 10,
|
| 204 |
+
"filter": { "needs_review": { "$ne": True } }
|
| 205 |
+
}
|
| 206 |
+
}]
|
| 207 |
+
results = list(col.aggregate(pipeline))
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
---
|
| 211 |
+
|
| 212 |
+
## Embeddings (combined fields → one vector)
|
| 213 |
+
|
| 214 |
+
We embed **ingredients + instructions** into a single `recipe_emb`. Two supported patterns:
|
| 215 |
+
|
| 216 |
+
### A) Combine at embedding time
|
| 217 |
+
|
| 218 |
+
Configure:
|
| 219 |
+
|
| 220 |
+
```python
|
| 221 |
+
embedding_fields = [
|
| 222 |
+
(("ingredients", "instructions"), "recipe_emb")
|
| 223 |
+
]
|
| 224 |
+
```
|
| 225 |
+
|
| 226 |
+
`_apply_embeddings` concatenates labeled sections:
|
| 227 |
+
|
| 228 |
+
```
|
| 229 |
+
Ingredients:
|
| 230 |
+
- ...
|
| 231 |
+
|
| 232 |
+
Instructions:
|
| 233 |
+
1. ...
|
| 234 |
+
```
|
| 235 |
+
|
| 236 |
+
### B) Build `recipe_text` in `RecipeDoc.finalize()` and embed once
|
| 237 |
+
|
| 238 |
+
```python
|
| 239 |
+
self.recipe_text = "\n\n".join(
|
| 240 |
+
[s for s in [
|
| 241 |
+
f"Title:\n{self.title}" if self.title else "",
|
| 242 |
+
f"Ingredients:\n{self.ingredients_text}" if self.ingredients_text else "",
|
| 243 |
+
f"Instructions:\n{self.instructions_text}" if self.instructions_text else ""
|
| 244 |
+
] if s]
|
| 245 |
+
)
|
| 246 |
+
# embedding_fields = [("recipe_text", "recipe_emb")]
|
| 247 |
+
```
|
| 248 |
+
|
| 249 |
+
**HFEmbedder config (defaults):**
|
| 250 |
+
|
| 251 |
+
```python
|
| 252 |
+
HF_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
| 253 |
+
HF_DEVICE=cpu
|
| 254 |
+
```
|
| 255 |
+
|
| 256 |
+
---
|
| 257 |
+
|
| 258 |
+
## CLI (optional but handy)
|
| 259 |
+
|
| 260 |
+
Create `run_scrape.py`:
|
| 261 |
+
|
| 262 |
+
```python
|
| 263 |
+
from backend.data_minning.yummy_medley_scraper import YummyMedleyScraper
from backend.data_minning.all_nigerian_recipe_scraper import AllNigerianRecipesScraper
from backend.data_minning.dto.stream_opts import StreamOptions
|
| 265 |
+
|
| 266 |
+
SCRAPERS = {
|
| 267 |
+
"yummy": YummyMedleyScraper,
|
| 268 |
+
"anr": AllNigerianRecipesScraper,
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
if __name__ == "__main__":
|
| 272 |
+
import argparse
|
| 273 |
+
from dataclasses import asdict
|
| 274 |
+
p = argparse.ArgumentParser()
|
| 275 |
+
p.add_argument("--site", choices=SCRAPERS.keys(), required=True)
|
| 276 |
+
p.add_argument("--limit", type=int, default=50)
|
| 277 |
+
args = p.parse_args()
|
| 278 |
+
|
| 279 |
+
s = SCRAPERS[args.site]()
|
| 280 |
+
saved = s.stream(sink=..., options=StreamOptions(limit=args.limit))
|
| 281 |
+
print(f"Saved {saved}")
|
| 282 |
+
```
|
| 283 |
+
|
| 284 |
+
Run:
|
| 285 |
+
|
| 286 |
+
```bash
|
| 287 |
+
python3 run_scrape.py --site yummy --limit 25
|
| 288 |
+
```
|
| 289 |
+
|
| 290 |
+
---
|
| 291 |
+
|
| 292 |
+
## Implementation notes
|
| 293 |
+
|
| 294 |
+
### `StreamOptions` (clean params)
|
| 295 |
+
|
| 296 |
+
```python
|
| 297 |
+
from dataclasses import dataclass
|
| 298 |
+
from typing import Optional, Callable
|
| 299 |
+
|
| 300 |
+
@dataclass
|
| 301 |
+
class StreamOptions:
|
| 302 |
+
delay: float = 0.3
|
| 303 |
+
limit: Optional[int] = None
|
| 304 |
+
batch_size: int = 50
|
| 305 |
+
resume_file: Optional[str] = None
|
| 306 |
+
progress_callback: Optional[Callable[[int], None]] = None
|
| 307 |
+
```
|
| 308 |
+
|
| 309 |
+
### Progress to `/jobs`
|
| 310 |
+
|
| 311 |
+
We pass a `progress_callback` that updates the job by `job_id`:
|
| 312 |
+
|
| 313 |
+
```python
|
| 314 |
+
def make_progress_cb(job_id: str):
|
| 315 |
+
def _cb(n: int):
|
| 316 |
+
JOBS[job_id]["count"] = n
|
| 317 |
+
return _cb
|
| 318 |
+
```
|
| 319 |
+
|
| 320 |
+
Used as:
|
| 321 |
+
|
| 322 |
+
```python
|
| 323 |
+
saved = s.stream(
|
| 324 |
+
sink=json_or_mongo_sink,
|
| 325 |
+
options=StreamOptions(
|
| 326 |
+
limit=body.limit,
|
| 327 |
+
batch_size=body.limit,
|
| 328 |
+
resume_file="recipes.resume",
|
| 329 |
+
progress_callback=make_progress_cb(job_id),
|
| 330 |
+
),
|
| 331 |
+
)
|
| 332 |
+
```
|
| 333 |
+
|
| 334 |
+
---
|
| 335 |
+
|
| 336 |
+
## Common pitfalls & fixes
|
| 337 |
+
|
| 338 |
+
* **`ModuleNotFoundError: No module named 'backend'`**
|
| 339 |
+
Run with module path:
|
| 340 |
+
`python3 -m uvicorn backend.app:app --reload`
|
| 341 |
+
|
| 342 |
+
* **Uvicorn not found (`zsh: command not found: uvicorn`)**
|
| 343 |
+
Use: `python3 -m uvicorn ...` or add `~/Library/Python/3.9/bin` to PATH.
|
| 344 |
+
|
| 345 |
+
* **`422 Unprocessable Entity` on `/scrape`**
|
| 346 |
+
In Postman: Body → **raw → JSON** and send:
|
| 347 |
+
`{"site":"yummy","limit":20,"output_type":"json"}`
|
| 348 |
+
|
| 349 |
+
* **Pydantic v2: “non-annotated attribute”**
|
| 350 |
+
Keep globals like `JOBS = {}` **outside** `BaseModel` classes.
|
| 351 |
+
|
| 352 |
+
* **`'int' object is not iterable`**
|
| 353 |
+
Don’t iterate `stream()`—it **returns** an `int`. Use the `progress_callback` if you need live updates.
|
| 354 |
+
|
| 355 |
+
* **`BackgroundTasks` undefined**
|
| 356 |
+
Import from FastAPI:
|
| 357 |
+
`from fastapi import BackgroundTasks`
|
| 358 |
+
|
| 359 |
+
* **Too many commas in ingredients**
|
| 360 |
+
Don’t `.join()` a **string**—only join if it’s a `list[str]`.
|
| 361 |
+
|
| 362 |
+
---
|
| 363 |
+
|
| 364 |
+
## Future ideas (nice-to-haves)
|
| 365 |
+
|
| 366 |
+
* Store jobs in Redis for persistence across restarts
|
| 367 |
+
* Add `started_at` / `finished_at` timestamps and durations to jobs
|
| 368 |
+
* Rate-limit per site; cool-down if a scrape ran recently
|
| 369 |
+
* Switch to task queue (Celery/RQ/BullMQ) if you need scale
|
| 370 |
+
* Add `/search` endpoint that calls `$vectorSearch` in MongoDB
|
| 371 |
+
|
| 372 |
+
---
|
backend/docs/unified-provider-configuration.md
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Unified Provider Configuration
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
The Recipe Recommendation Bot now uses a **unified provider approach** where a single `LLM_PROVIDER` setting controls both LLM and embedding models. This eliminates configuration mismatches and simplifies setup.
|
| 5 |
+
|
| 6 |
+
## Before vs After
|
| 7 |
+
|
| 8 |
+
### ❌ Previous Approach (Confusing)
|
| 9 |
+
```bash
|
| 10 |
+
LLM_PROVIDER=huggingface
|
| 11 |
+
EMBEDDING_PROVIDER=openai # 😵 Different providers - causes issues!
|
| 12 |
+
```
|
| 13 |
+
|
| 14 |
+
### ✅ New Approach (Simple)
|
| 15 |
+
```bash
|
| 16 |
+
LLM_PROVIDER=huggingface # 🎯 One setting controls both LLM and embeddings
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
## Benefits
|
| 20 |
+
|
| 21 |
+
1. **Prevents Mismatches**: No more accidentally mixing providers
|
| 22 |
+
2. **Simplified Configuration**: One setting instead of two
|
| 23 |
+
3. **Better User Experience**: Less confusion, fewer errors
|
| 24 |
+
4. **Consistent Performance**: Same provider for both LLM and embeddings
|
| 25 |
+
5. **Easier Troubleshooting**: Single provider to debug
|
| 26 |
+
|
| 27 |
+
## Supported Combinations
|
| 28 |
+
|
| 29 |
+
| Provider | LLM Model | Embedding Model |
|
| 30 |
+
|----------|-----------|-----------------|
|
| 31 |
+
| `openai` | `gpt-5-nano` | `text-embedding-3-small` |
|
| 32 |
+
| `google` | `gemini-2.0-flash` | `models/embedding-001` |
|
| 33 |
+
| `huggingface` | `microsoft/DialoGPT-small` | `sentence-transformers/all-MiniLM-L6-v2` |
|
| 34 |
+
|
| 35 |
+
## Configuration Examples
|
| 36 |
+
|
| 37 |
+
### OpenAI (Complete Setup)
|
| 38 |
+
```bash
|
| 39 |
+
LLM_PROVIDER=openai
|
| 40 |
+
OPENAI_API_KEY=your_api_key_here
|
| 41 |
+
OPENAI_MODEL=gpt-5-nano
|
| 42 |
+
OPENAI_EMBEDDING_MODEL=text-embedding-3-small
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
### Google (Complete Setup)
|
| 46 |
+
```bash
|
| 47 |
+
LLM_PROVIDER=google
|
| 48 |
+
GOOGLE_API_KEY=your_api_key_here
|
| 49 |
+
GOOGLE_MODEL=gemini-2.0-flash
|
| 50 |
+
GOOGLE_EMBEDDING_MODEL=models/embedding-001
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
### HuggingFace (Complete Setup)
|
| 54 |
+
```bash
|
| 55 |
+
LLM_PROVIDER=huggingface
|
| 56 |
+
HUGGINGFACE_API_TOKEN=your_token_here
|
| 57 |
+
HUGGINGFACE_MODEL=microsoft/DialoGPT-small
|
| 58 |
+
HUGGINGFACE_USE_GPU=false
|
| 59 |
+
HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
## Migration Guide
|
| 63 |
+
|
| 64 |
+
If you have an existing `.env` file:
|
| 65 |
+
|
| 66 |
+
1. **Remove** the `EMBEDDING_PROVIDER` line
|
| 67 |
+
2. **Keep** the `LLM_PROVIDER` line
|
| 68 |
+
3. **Ensure** both LLM and embedding model settings are configured for your chosen provider
|
| 69 |
+
|
| 70 |
+
### Example Migration
|
| 71 |
+
```bash
|
| 72 |
+
# OLD .env
|
| 73 |
+
LLM_PROVIDER=huggingface
|
| 74 |
+
EMBEDDING_PROVIDER=huggingface # ← Remove this line
|
| 75 |
+
|
| 76 |
+
# NEW .env
|
| 77 |
+
LLM_PROVIDER=huggingface # ← Keep this, it controls both
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
## Technical Implementation
|
| 81 |
+
|
| 82 |
+
The configuration system now:
|
| 83 |
+
- Uses `LLM_PROVIDER` for both `get_llm_config()` and `get_embedding_config()`
|
| 84 |
+
- Automatically matches provider types
|
| 85 |
+
- Validates that the provider supports both LLM and embeddings
|
| 86 |
+
- Provides clear error messages for unsupported providers
|
| 87 |
+
|
| 88 |
+
## Validation
|
| 89 |
+
|
| 90 |
+
You can verify your configuration works:
|
| 91 |
+
```bash
|
| 92 |
+
cd backend
|
| 93 |
+
python -c "
|
| 94 |
+
from config.settings import settings
|
| 95 |
+
llm = settings.get_llm_config()
|
| 96 |
+
emb = settings.get_embedding_config()
|
| 97 |
+
print(f'LLM: {llm[\"provider\"]}')
|
| 98 |
+
print(f'Embedding: {emb[\"provider\"]}')
|
| 99 |
+
print(f'Match: {llm[\"provider\"] == emb[\"provider\"]}')
|
| 100 |
+
"
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
Expected output:
|
| 104 |
+
```
|
| 105 |
+
LLM: huggingface
|
| 106 |
+
Embedding: huggingface
|
| 107 |
+
Match: True
|
| 108 |
+
```
|
backend/requirements.txt
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Production Requirements - Core dependencies only
|
| 2 |
+
# Keep this minimal for production deployments
|
| 3 |
+
|
| 4 |
+
# Core API dependencies
|
| 5 |
+
fastapi
|
| 6 |
+
uvicorn[standard]
|
| 7 |
+
python-dotenv
|
| 8 |
+
python-multipart
|
| 9 |
+
|
| 10 |
+
# Data processing
|
| 11 |
+
numpy
|
| 12 |
+
requests
|
| 13 |
+
|
| 14 |
+
# LLM Providers
|
| 15 |
+
openai
|
| 16 |
+
google-generativeai
|
| 17 |
+
|
| 18 |
+
# Vector Store & Embeddings (optional - choose based on needs)
|
| 19 |
+
# Uncomment if needed in production
|
| 20 |
+
chromadb
|
| 21 |
+
langchain-mongodb
|
| 22 |
+
pymongo
|
| 23 |
+
|
| 24 |
+
# MongoDB Atlas Vector Store (optional)
|
| 25 |
+
pymongo[srv]
|
| 26 |
+
|
| 27 |
+
# HuggingFace dependencies
|
| 28 |
+
# transformers
|
| 29 |
+
# accelerate
|
| 30 |
+
|
| 31 |
+
# Sentence Transformers - Choose ONE option below:
|
| 32 |
+
# FULL sentence-transformers (easier to use, ~800MB+ )
|
| 33 |
+
# sentence-transformers
|
| 34 |
+
|
| 35 |
+
# Note: sentence-transformers will automatically use CPU-only PyTorch if CUDA is not available (easier to use, ~200MB - 300MB )
|
| 36 |
+
# To force CPU-only installation: pip install torch --index-url https://download.pytorch.org/whl/cpu && pip install sentence-transformers
|
| 37 |
+
|
| 38 |
+
# LangChain for RAG
|
| 39 |
+
langchain
|
| 40 |
+
langchain-core
|
| 41 |
+
langchain-text-splitters
|
| 42 |
+
langchain-openai
|
| 43 |
+
langchain-chroma
|
| 44 |
+
langchain_google_genai
|
| 45 |
+
langchain-huggingface
|
| 46 |
+
langchain-community
|
| 47 |
+
|
| 48 |
+
bs4
|
| 49 |
+
lxml
|
backend/services/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Services package initialization
|
backend/services/custom_mongo_vector.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Streamlined MongoDB Vector Store with Atlas Vector Search
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import List, Dict, Any, Optional, NamedTuple
|
| 6 |
+
import numpy as np
|
| 7 |
+
from langchain.schema import Document
|
| 8 |
+
from langchain.vectorstores.base import VectorStore
|
| 9 |
+
from pymongo.collection import Collection
|
| 10 |
+
from backend.config.logging_config import get_logger
|
| 11 |
+
|
| 12 |
+
logger = get_logger("custom_mongo_vector")
|
| 13 |
+
|
| 14 |
+
class VectorSearchOptions(NamedTuple):
    """Configuration options for vector search"""
    # Name of the Atlas Search index queried by $vectorSearch.
    index_name: str = "foodInstructionIndex"
    # Document field that stores the precomputed embedding vector.
    embedding_key: str = "ingredients_emb"
    # Document field used as the human-readable title of a result.
    text_key: str = "title"
    # How many candidates Atlas considers before returning the top-k.
    num_candidates: int = 50
    similarity_metric: str = "cosine"  # cosine or dotProduct
|
| 21 |
+
|
| 22 |
+
class CustomMongoDBVectorStore(VectorStore):
    """
    Streamlined MongoDB Atlas Vector Store with efficient $vectorSearch aggregation.
    Falls back to Python similarity calculation when Atlas Vector Search is unavailable.

    Read-only: documents and their embeddings must already exist in the
    collection; ``add_texts`` and ``from_texts`` raise ``NotImplementedError``.
    """

    def __init__(
        self,
        collection: Collection,
        embedding_function,
        options: Optional[VectorSearchOptions] = None
    ):
        """
        Args:
            collection: MongoDB collection holding documents with precomputed
                embeddings under ``options.embedding_key``.
            embedding_function: Object exposing ``embed_query(text) -> List[float]``
                (e.g. a LangChain embeddings instance).
            options: Search configuration; defaults to ``VectorSearchOptions()``.
        """
        self.collection = collection
        self.embedding_function = embedding_function
        self.options = options or VectorSearchOptions()

        # Fixed: first message needed no f-string; second had a mojibake
        # character (was "�") from an encoding corruption.
        logger.info("🔧 Streamlined MongoDB Vector Store initialized")
        logger.info(f"📊 Config: {self.options.index_name} index, {self.options.similarity_metric} similarity")

    def _calculate_similarity(self, vec1: List[float], vec2: List[float]) -> float:
        """Calculate similarity between two vectors using the configured metric."""
        a, b = np.array(vec1), np.array(vec2)

        if self.options.similarity_metric == "dotProduct":
            # Dot product (faster, good for normalized embeddings)
            return float(np.dot(a, b))
        # Cosine similarity (more robust, handles non-normalized embeddings).
        # Guard against zero-length vectors to avoid division by zero / NaN.
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0.0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def similarity_search(self, query: str, k: int = 4, **kwargs: Any) -> List[Document]:
        """Streamlined similarity search using Atlas Vector Search with Python fallback.

        Args:
            query: Free-text query; embedded via ``embedding_function.embed_query``.
            k: Maximum number of documents to return.

        Returns:
            Up to ``k`` LangChain ``Document`` objects. May return fewer when
            the post-search ``$match`` filters out candidates.
        """
        logger.info(f"🔍 Searching: '{query}' (k={k})")

        qvec = self.embedding_function.embed_query(query)

        # Primary: Try Atlas Vector Search (efficient, server-side)
        try:
            pipeline = [
                {
                    "$vectorSearch": {
                        "index": self.options.index_name,
                        "path": self.options.embedding_key,
                        "queryVector": qvec,
                        "numCandidates": self.options.num_candidates,
                        "limit": k
                    }
                },
                {
                    # Exclude documents flagged for review. NOTE(review): a
                    # $match after $vectorSearch can shrink results below k;
                    # an index-level "filter" clause would avoid that, but
                    # requires the field to be indexed as a filter field.
                    "$match": {
                        '$or': [
                            {'needs_review': {'$exists': False}},
                            {'needs_review': False}
                        ]
                    }
                }
            ]

            results = list(self.collection.aggregate(pipeline))
            if results:
                logger.info(f"✅ Atlas Vector Search: {len(results)} results")
                return self._create_documents(results)

        except Exception as e:
            logger.warning(f"⚠️ Atlas Vector Search failed: {e}")

        # Fallback: Python similarity calculation (also used when Atlas
        # returned zero results, e.g. index missing or not yet built).
        logger.info("🔄 Using Python similarity fallback")
        return self._python_similarity_search(qvec, k)

    def _python_similarity_search(self, qvec: List[float], k: int) -> List[Document]:
        """Efficient Python-based similarity search fallback.

        Scans all non-flagged documents client-side and ranks them by the
        configured similarity metric. O(n) in collection size — acceptable
        only as a fallback for small collections.
        """
        cursor = self.collection.find(
            {'$or': [
                {'needs_review': {'$exists': False}},
                {'needs_review': False}
            ]},
            # Projection: only the fields needed to score and render a result.
            {self.options.text_key: 1, self.options.embedding_key: 1, "ingredients": 1, "instructions": 1}
        )

        similarities = []
        for doc in cursor:
            doc_emb = doc.get(self.options.embedding_key)
            # Skip documents with missing or dimension-mismatched embeddings
            # (e.g. embedded with a different model).
            if doc_emb and len(doc_emb) == len(qvec):
                score = self._calculate_similarity(qvec, doc_emb)
                similarities.append((doc, score))

        # Return top-k results
        similarities.sort(key=lambda x: x[1], reverse=True)
        top_docs = [doc for doc, _ in similarities[:k]]

        logger.info(f"📊 Python fallback: {len(similarities)} processed, {len(top_docs)} returned")
        return self._create_documents(top_docs)

    def _create_documents(self, docs: List[Dict]) -> List[Document]:
        """Create LangChain Documents from MongoDB results using clean string content.

        ``page_content`` concatenates title, ingredients, and instructions;
        metadata carries the stringified ``_id`` and the title.
        """
        documents = []
        for doc in docs:
            title = doc.get(self.options.text_key, "Untitled Recipe")
            ingredients = doc.get("ingredients", "")
            instructions = doc.get("instructions", "")

            # Build clean content without extra formatting
            content_parts = [f"Recipe: {title}"]

            if ingredients:
                content_parts.append(f"Ingredients: {ingredients}")

            if instructions:
                content_parts.append(f"Instructions: {instructions}")

            content = "\n\n".join(content_parts)

            documents.append(Document(
                page_content=content,
                metadata={"_id": str(doc["_id"]), "title": title}
            ))

        return documents

    def similarity_search_with_score(self, query: str, k: int = 4, **kwargs: Any) -> List[tuple]:
        """Return docs with similarity scores (simplified).

        Atlas Vector Search does not expose raw scores through this path, so
        every document is paired with a constant 1.0 placeholder score.
        """
        docs = self.similarity_search(query, k, **kwargs)
        return [(doc, 1.0) for doc in docs]  # Atlas Vector Search doesn't return raw scores

    def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None, **kwargs: Any) -> List[str]:
        """Read-only vector store - adding texts not supported"""
        raise NotImplementedError("This vector store is read-only for pre-existing embeddings")

    @classmethod
    def from_texts(cls, texts: List[str], embedding_function, metadatas: Optional[List[dict]] = None, **kwargs: Any):
        """Read-only vector store - creating from texts not supported"""
        raise NotImplementedError("This vector store is read-only for pre-existing embeddings")
|
backend/services/llm_service.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LLM Service - RAG pipeline using ConversationalRetrievalChain
|
| 2 |
+
from typing import List, Dict, Any, Optional
|
| 3 |
+
|
| 4 |
+
# Local imports
|
| 5 |
+
from backend.config.settings import settings
|
| 6 |
+
from backend.config.logging_config import get_logger
|
| 7 |
+
from backend.services.vector_store import vector_store_service
|
| 8 |
+
|
| 9 |
+
# Setup logging
|
| 10 |
+
logger = get_logger("llm_service")
|
| 11 |
+
|
| 12 |
+
class LLMService:
    """RAG question-answering service built on ConversationalRetrievalChain.

    Wires together a provider-configurable chat model, a retriever from the
    shared vector store service, and buffered conversation memory.

    Public entry points:
        * ``ask_question``           -- retrieval-augmented answer with memory.
        * ``simple_chat_completion`` -- direct LLM call, no retrieval.
        * ``clear_memory``           -- reset the conversation buffer.
    """

    # Filler words stripped from queries before vector search; they carry no
    # recipe-matching signal.  A class-level frozenset is built once and
    # gives O(1) membership tests (the original rebuilt a list per call).
    _STOP_WORDS = frozenset([
        'i', 'want', 'a', 'an', 'the', 'for', 'with', 'can', 'you',
        'give', 'me', 'please', 'help'
    ])

    def __init__(self):
        """Eagerly build the LLM, retriever, memory and QA chain.

        Raises:
            Exception: re-raised from any failed setup step after logging.
        """
        logger.info("🤖 Initializing LLM Service...")

        try:
            self.llm = self._setup_llm()
            self.retriever = self._setup_retriever()
            self.memory = self._setup_memory()
            self.qa_chain = self._setup_qa_chain()

            logger.info("🚀 LLM Service initialized successfully")

        except Exception as e:
            logger.error(f"❌ LLM Service initialization failed: {str(e)}", exc_info=True)
            raise

    def _setup_llm(self):
        """Dispatch to the provider-specific builder named in settings.

        Imports happen inside each builder so only the configured provider's
        packages must be installed.  Unknown providers fall back to a
        default-configured OpenAI model.
        """
        llm_config = settings.get_llm_config()
        provider = llm_config["provider"]

        logger.info(f"🔧 Setting up LLM provider: {provider}")

        if provider == "openai":
            return self._build_openai_llm(llm_config)
        elif provider == "google":
            return self._build_google_llm(llm_config)
        elif provider == "ollama":
            return self._build_ollama_llm(llm_config)
        elif provider == "huggingface":
            return self._build_huggingface_llm(llm_config)
        else:
            logger.warning(f"⚠️ Unknown LLM provider '{provider}', falling back to OpenAI")
            try:
                from langchain_openai import ChatOpenAI
                return ChatOpenAI()
            except ImportError:
                logger.error("❌ No valid LLM provider available")
                raise ImportError("No valid LLM provider available")

    def _build_openai_llm(self, llm_config):
        """Build a ChatOpenAI model, honoring per-model temperature limits."""
        try:
            from langchain_openai import ChatOpenAI
            logger.info("✅ OpenAI LLM imported successfully")

            temperature = llm_config["temperature"]
            model = llm_config["model"]
            max_tokens = llm_config.get("max_tokens", 1000)

            # GPT-5-nano has temperature restrictions (defaults to 1.0)
            if "gpt-5-nano" in model.lower():
                temperature = 1.0
                logger.info(f"🔧 Using temperature=1.0 for {model} (model restriction)")

            logger.info(f"🎯 OpenAI config - Model: {model}, Output tokens: {max_tokens}, Temperature: {temperature}")

            return ChatOpenAI(
                api_key=llm_config["api_key"],
                model=model,
                temperature=temperature,
                max_tokens=max_tokens  # This limits OUTPUT tokens only
            )
        except ImportError as e:
            logger.error(f"❌ OpenAI LLM not available: {e}")
            raise ImportError("OpenAI provider selected but langchain_openai not installed")

    def _build_google_llm(self, llm_config):
        """Build a ChatGoogleGenerativeAI model from config."""
        try:
            from langchain_google_genai import ChatGoogleGenerativeAI
            logger.info("✅ Google LLM imported successfully")

            max_output_tokens = llm_config.get("max_tokens", 1000)
            model = llm_config["model"]
            temperature = llm_config["temperature"]

            logger.info(f"🎯 Google config - Model: {model}, Output tokens: {max_output_tokens}, Temperature: {temperature}")

            return ChatGoogleGenerativeAI(
                google_api_key=llm_config["api_key"],
                model=model,
                temperature=temperature,
                max_output_tokens=max_output_tokens  # This limits OUTPUT tokens only
            )
        except ImportError as e:
            logger.error(f"❌ Google LLM not available: {e}")
            raise ImportError("Google provider selected but langchain_google_genai not installed")

    def _build_ollama_llm(self, llm_config):
        """Build an Ollama model pointed at the configured local server."""
        try:
            from langchain_community.llms import Ollama
            logger.info("✅ Ollama LLM imported successfully")
            return Ollama(
                base_url=llm_config["base_url"],
                model=llm_config["model"],
                temperature=llm_config["temperature"]
            )
        except ImportError as e:
            logger.error(f"❌ Ollama LLM not available: {e}")
            raise ImportError("Ollama provider selected but langchain_community not installed")

    def _build_huggingface_llm(self, llm_config):
        """Build a HuggingFace model.

        When ``use_api`` is set, uses the hosted Inference API (with a
        fallback to the older Hub client); otherwise constructs a local
        transformers pipeline, which downloads the model on first use.
        """
        try:
            use_api = llm_config.get("use_api", False)

            if use_api:
                # Prefer the modern endpoint; fall back to HuggingFaceHub
                # if the endpoint fails for any reason.
                try:
                    from langchain_huggingface import HuggingFaceEndpoint
                    logger.info("✅ Using HuggingFace API (no local download)")

                    return HuggingFaceEndpoint(
                        repo_id=llm_config["model"],
                        huggingfacehub_api_token=llm_config["api_token"],
                        temperature=0.7,  # HuggingFace API doesn't support dynamic temperature from config
                        max_new_tokens=200,
                        repetition_penalty=1.1,
                        top_p=0.9
                    )
                except Exception as api_error:
                    logger.warning(f"⚠️ HuggingFace API failed: {api_error}")
                    logger.info("🔄 Falling back to HuggingFace Hub API...")

                    try:
                        from langchain_community.llms import HuggingFaceHub

                        return HuggingFaceHub(
                            repo_id=llm_config["model"],
                            huggingfacehub_api_token=llm_config["api_token"],
                            model_kwargs={
                                "temperature": 0.7,  # HuggingFace Hub API has limited temperature control
                                "max_new_tokens": 200,
                                "repetition_penalty": 1.1,
                                "top_p": 0.9,
                                "do_sample": True
                            }
                        )
                    except Exception as hub_error:
                        logger.error(f"❌ HuggingFace Hub also failed: {hub_error}")
                        raise ImportError(f"Both HuggingFace API methods failed: {api_error}, {hub_error}")
            else:
                # Local pipeline: downloads the model weights on first use.
                from langchain_huggingface import HuggingFacePipeline
                from transformers import pipeline

                logger.info("✅ Using HuggingFace local pipeline")

                pipeline_kwargs = {
                    "task": "text-generation",
                    "model": llm_config["model"],
                    "max_length": 512,
                    "do_sample": True,
                    "temperature": 0.7,  # Local pipeline uses default 0.7 for stability
                    "pad_token_id": 50256,  # GPT-2-style pad/eos ids to avoid tokenizer warnings
                    "eos_token_id": 50256,
                }

                # device_map="auto" requires accelerate/GPU support; on CPU
                # pass an explicit "cpu" device instead.  (The original
                # comment claimed device=0 maps to CPU — it does not; 0 is
                # the first GPU, "cpu" is the CPU.)
                if llm_config.get("use_gpu", False):
                    pipeline_kwargs["device_map"] = "auto"
                else:
                    pipeline_kwargs["device"] = "cpu"

                hf_pipeline = pipeline(**pipeline_kwargs)

                return HuggingFacePipeline(
                    pipeline=hf_pipeline,
                    model_kwargs={
                        "temperature": 0.7,
                        "max_new_tokens": 150,  # Reduced for efficiency
                        "do_sample": True,
                        "top_p": 0.9,
                        "repetition_penalty": 1.1,
                        "early_stopping": True,
                        "num_beams": 4  # Better quality for instruction following
                    }
                )
        except ImportError as e:
            logger.error(f"❌ HuggingFace LLM not available: {e}")
            raise ImportError("HuggingFace provider selected but required packages not installed")

    def _setup_retriever(self):
        """Setup retriever from the shared vector store service."""
        return vector_store_service.get_retriever()

    def _setup_memory(self):
        """Create the buffered chat memory used by the QA chain."""
        try:
            from langchain.memory import ConversationBufferMemory
            return ConversationBufferMemory(memory_key='chat_history', return_messages=True)
        except ImportError as e:
            logger.error(f"❌ ConversationBufferMemory not available: {e}")
            raise ImportError("langchain memory not available")

    def _setup_qa_chain(self):
        """Assemble the ConversationalRetrievalChain from llm/retriever/memory."""
        try:
            from langchain.chains import ConversationalRetrievalChain
            return ConversationalRetrievalChain.from_llm(
                llm=self.llm,
                retriever=self.retriever,
                memory=self.memory,
                verbose=settings.LANGCHAIN_DEBUG  # Reduce debugging noise
            )
        except ImportError as e:
            logger.error(f"❌ ConversationalRetrievalChain not available: {e}")
            raise ImportError("langchain chains not available")

    def _preprocess_query(self, question: str) -> str:
        """Normalize a user question for vector search.

        Lowercases, strips filler stop words and punctuation, and collapses
        whitespace so the embedding focuses on recipe-relevant terms.
        """
        import re

        processed = question.lower()

        # Drop filler words that don't help with recipe matching.
        words = [word for word in processed.split() if word not in self._STOP_WORDS]

        # Remove punctuation, keeping word characters and spaces.
        processed = re.sub(r'[^\w\s]', '', ' '.join(words))

        # Collapse repeated whitespace.
        processed = ' '.join(processed.split())

        logger.debug(f"🔧 Query preprocessing: '{question}' → '{processed}'")
        return processed

    def ask_question(self, user_question: str) -> str:
        """Answer *user_question* through the RAG chain.

        Retrieves context with both the raw and preprocessed query (for
        token accounting), then runs the conversational chain.

        Returns:
            The generated answer, or an apology string containing the
            error message on failure (never raises).
        """
        logger.info(f"❓ Processing: '{user_question[:60]}...'")

        try:
            # Preprocess query for better matching
            processed_query = self._preprocess_query(user_question)

            # Retrieve context up front for token tracking.
            document_retriever = getattr(self.qa_chain, 'retriever', None)
            retrieved_context = ""
            if document_retriever:
                # Use both queries for comprehensive results
                original_docs = document_retriever.invoke(user_question)
                processed_docs = document_retriever.invoke(processed_query)

                # Deduplicate documents by page content.
                seen_content = set()
                unique_documents = []
                for document in original_docs + processed_docs:
                    if document.page_content not in seen_content:
                        unique_documents.append(document)
                        seen_content.add(document.page_content)

                retrieved_context = "\n".join([doc.page_content for doc in unique_documents[:8]])
                logger.debug(f"📄 Retrieved {len(unique_documents)} unique documents")

            # Enhanced question for natural responses
            enhanced_question = f"""Based on the available recipe information, please answer this cooking question: "{user_question}"

Respond directly and naturally as if you're sharing your own culinary knowledge. If there's a specific recipe that matches the request, share the complete recipe with ingredients and step-by-step instructions in a friendly, conversational way."""

            # Prefer the Runnable .invoke API; calling the chain object
            # directly is deprecated in recent LangChain releases.
            if hasattr(self.qa_chain, "invoke"):
                result = self.qa_chain.invoke({"question": enhanced_question})
            else:
                result = self.qa_chain({"question": enhanced_question})
            generated_answer = result["answer"]

            self._log_token_usage(user_question, retrieved_context, generated_answer)

            logger.info(f"✅ Response generated ({len(generated_answer)} chars)")
            return generated_answer

        except Exception as error:
            logger.error(f"❌ Error in ask_question: {str(error)}")
            return f"Sorry, I encountered an error: {str(error)}"

    def _count_tokens(self, text: str) -> int:
        """Rough token estimate (~4 characters per token); 0 for empty/None."""
        return len(text) // 4 if text else 0

    def _log_token_usage(self, question: str, context: str, response: str):
        """Log estimated input/output token counts and return them as a dict."""
        question_tokens = self._count_tokens(question)
        context_tokens = self._count_tokens(context)
        response_tokens = self._count_tokens(response)
        total_input_tokens = question_tokens + context_tokens

        logger.info(f"📊 Token Usage - Input:{total_input_tokens} (Q:{question_tokens}+C:{context_tokens}), Output:{response_tokens}")

        if context_tokens > 3000:
            logger.warning(f"⚠️ Large context detected: {context_tokens} tokens")

        return {
            "input_tokens": total_input_tokens,
            "output_tokens": response_tokens,
            "total_tokens": total_input_tokens + response_tokens
        }

    def clear_memory(self):
        """Clear conversation memory.

        Returns:
            bool: True when memory was cleared, False otherwise (including
            when the memory object exposes no ``clear`` method).
        """
        try:
            if hasattr(self.memory, 'clear'):
                self.memory.clear()
                logger.info("✅ Memory cleared")
                return True
            # Explicit False instead of falling through and returning None.
            logger.warning("⚠️ Memory object has no clear() method")
            return False
        except Exception as e:
            logger.warning(f"⚠️ Could not clear memory: {e}")
            return False

    def simple_chat_completion(self, user_message: str) -> str:
        """Direct LLM response without RAG retrieval.

        Prompts the model as a cooking expert, extracts text from whatever
        response type the provider returns, and trims overly long answers
        to roughly two sentences / 300 characters.

        Returns:
            The cleaned answer, or an apology string on failure (never raises).
        """
        logger.info(f"💭 Simple chat: '{user_message[:50]}...'")

        try:
            llm_prompt = f"As a knowledgeable cooking expert, share your insights about {user_message}. Provide helpful culinary advice and recommendations:\n\n"

            llm_response = self.llm.invoke(llm_prompt) if hasattr(self.llm, 'invoke') else self.llm(llm_prompt)

            # Chat models return a message object; completion models a str.
            if hasattr(llm_response, 'content'):
                generated_answer = llm_response.content
            elif isinstance(llm_response, str):
                # Some completion models echo the prompt; strip it if present.
                generated_answer = llm_response.replace(llm_prompt, "").strip() if llm_prompt in llm_response else llm_response
            else:
                generated_answer = str(llm_response)

            # Validate and clean response
            generated_answer = generated_answer.strip()
            if not generated_answer or len(generated_answer) < 10:
                generated_answer = "I'd be happy to help with recipes! Ask me about specific ingredients or dishes."

            # Limit response length
            if len(generated_answer) > 300:
                answer_sentences = generated_answer.split('. ')
                generated_answer = '. '.join(answer_sentences[:2]) + '.' if len(answer_sentences) > 1 else generated_answer[:300]

            logger.info(f"✅ Response generated ({len(generated_answer)} chars)")
            return generated_answer

        except Exception as error:
            logger.error(f"❌ Simple chat completion error: {str(error)}")
            return f"Sorry, I encountered an error: {str(error)}"
|
| 352 |
+
|
| 353 |
+
# Create global LLM service instance
# NOTE: instantiated at import time, so importing this module eagerly builds
# the LLM, retriever, memory and QA chain — and raises if any provider or
# configuration is unavailable.
llm_service = LLMService()
|
backend/services/vector_store.py
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Vector Store Service - Simple setup for retriever use
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import shutil
|
| 5 |
+
from typing import List, Dict, Any, Optional
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
# Core LangChain imports (always needed)
|
| 9 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 10 |
+
from langchain.schema import Document
|
| 11 |
+
|
| 12 |
+
# Local imports
|
| 13 |
+
from backend.config.settings import settings
|
| 14 |
+
from backend.config.database import db_settings
|
| 15 |
+
from backend.config.logging_config import get_logger
|
| 16 |
+
|
| 17 |
+
# MongoDB imports
|
| 18 |
+
from pymongo import MongoClient
|
| 19 |
+
from backend.services.custom_mongo_vector import CustomMongoDBVectorStore, VectorSearchOptions
|
| 20 |
+
|
| 21 |
+
# Setup logging
|
| 22 |
+
logger = get_logger("vector_store")
|
| 23 |
+
|
| 24 |
+
class VectorStoreService:
|
| 25 |
+
"""Simple vector store service - creates or retrieves vector store for retriever use"""
|
| 26 |
+
|
| 27 |
+
    def __init__(self):
        """Build embeddings first, then the vector store that depends on them.

        Any failure in either step is logged with a full traceback and
        re-raised unchanged so callers see the original exception.
        """
        logger.info("📚 Initializing Vector Store Service...")

        try:
            # Embeddings must exist before the store: the store constructors
            # below receive self.embeddings as their embedding function.
            self.embeddings = self._get_embeddings()
            logger.info("✅ Embeddings setup completed")

            self.vector_store = self._get_or_create_vector_store()
            logger.info("✅ Vector store setup completed")

            logger.info("🚀 Vector Store Service initialization successful")

        except Exception as e:
            logger.error(f"❌ Vector Store Service initialization failed: {str(e)}", exc_info=True)
            raise
|
| 42 |
+
|
| 43 |
+
    def _get_embeddings(self):
        """Return the embeddings client selected by ``settings.get_embedding_config()``.

        Imports are performed lazily inside each branch so only the
        configured provider's packages must be installed.  The huggingface
        branch has a second-level fallback to raw sentence-transformers;
        unknown providers fall back to a default-configured OpenAI client.

        Raises:
            ImportError: if the selected (or fallback) provider's packages
                are not installed.
        """
        embedding_config = settings.get_embedding_config()
        provider = embedding_config["provider"]

        logger.info(f"🔧 Setting up embeddings provider: {provider}")

        if provider == "openai":
            try:
                from langchain_openai import OpenAIEmbeddings
                logger.info("✅ OpenAI embeddings imported successfully")
                return OpenAIEmbeddings(
                    openai_api_key=embedding_config["api_key"],
                    model=embedding_config["model"]
                )
            except ImportError as e:
                logger.error(f"❌ OpenAI embeddings not available: {e}")
                raise ImportError("OpenAI provider selected but langchain_openai not installed")

        elif provider == "google":
            try:
                from langchain_google_genai import GoogleGenerativeAIEmbeddings
                logger.info("✅ Google embeddings imported successfully")
                return GoogleGenerativeAIEmbeddings(
                    google_api_key=embedding_config["api_key"],
                    model=embedding_config["model"]
                )
            except ImportError as e:
                logger.error(f"❌ Google embeddings not available: {e}")
                raise ImportError("Google provider selected but langchain_google_genai not installed")

        elif provider == "huggingface":
            try:
                # Try modern langchain-huggingface first
                from langchain_huggingface import HuggingFaceEmbeddings
                logger.info("✅ HuggingFace embeddings imported successfully")
                return HuggingFaceEmbeddings(
                    model_name=embedding_config["model"]
                )
            except ImportError:
                try:
                    # Fallback to sentence-transformers directly
                    from sentence_transformers import SentenceTransformer
                    logger.warning("⚠️ Using sentence-transformers directly (langchain-huggingface not available)")
                    # Return a wrapper that mimics the embeddings interface
                    return self._create_sentence_transformer_wrapper(embedding_config["model"])
                except ImportError as e:
                    logger.error(f"❌ HuggingFace embeddings not available: {e}")
                    logger.error("💡 To fix this, install sentence-transformers: pip install sentence-transformers")
                    raise ImportError("HuggingFace provider selected but sentence-transformers not installed. Run: pip install sentence-transformers")

        elif provider == "ollama":
            try:
                from langchain_community.embeddings import OllamaEmbeddings
                logger.info("✅ Ollama embeddings imported successfully")
                return OllamaEmbeddings(
                    base_url=embedding_config["base_url"],
                    model=embedding_config["model"]
                )
            except ImportError as e:
                logger.error(f"❌ Ollama embeddings not available: {e}")
                raise ImportError("Ollama provider selected but langchain_community not installed")

        else:
            # Unknown provider: last-resort fallback to OpenAI defaults
            # (reads OPENAI_API_KEY from the environment).
            logger.warning(f"⚠️ Unknown embedding provider '{provider}', falling back to OpenAI")
            try:
                from langchain_openai import OpenAIEmbeddings
                return OpenAIEmbeddings()
            except ImportError:
                logger.error("❌ No valid embedding provider available")
                raise ImportError("No valid embedding provider available")
|
| 114 |
+
|
| 115 |
+
def _create_sentence_transformer_wrapper(self, model_name):
|
| 116 |
+
"""Create a simple wrapper for sentence-transformers to work with LangChain"""
|
| 117 |
+
from sentence_transformers import SentenceTransformer
|
| 118 |
+
|
| 119 |
+
class SentenceTransformerWrapper:
|
| 120 |
+
def __init__(self, model_name):
|
| 121 |
+
self.model = SentenceTransformer(model_name)
|
| 122 |
+
|
| 123 |
+
def encode(self, texts):
|
| 124 |
+
return self.model.encode(texts).tolist()
|
| 125 |
+
|
| 126 |
+
def embed_query(self, text):
|
| 127 |
+
return self.model.encode([text])[0].tolist()
|
| 128 |
+
|
| 129 |
+
return SentenceTransformerWrapper(model_name)
|
| 130 |
+
|
| 131 |
+
def _get_or_create_vector_store(self):
|
| 132 |
+
"""Get or create vector store with conditional imports"""
|
| 133 |
+
db_config = db_settings.get_vector_store_config()
|
| 134 |
+
provider = db_config["provider"]
|
| 135 |
+
|
| 136 |
+
if provider == "chromadb":
|
| 137 |
+
try:
|
| 138 |
+
from langchain_chroma import Chroma
|
| 139 |
+
|
| 140 |
+
persist_dir = Path(db_config["persist_directory"])
|
| 141 |
+
collection_name = db_config["collection_name"]
|
| 142 |
+
refresh_on_start = db_config.get("refresh_on_start", False)
|
| 143 |
+
|
| 144 |
+
# Check if refresh is requested
|
| 145 |
+
if refresh_on_start and persist_dir.exists():
|
| 146 |
+
logger.info(f"🔄 CHROMADB_REFRESH_ON_START=true - Deleting existing ChromaDB at {persist_dir}")
|
| 147 |
+
shutil.rmtree(persist_dir)
|
| 148 |
+
logger.info(f"✅ Existing ChromaDB deleted successfully")
|
| 149 |
+
|
| 150 |
+
# Check if persisted database exists
|
| 151 |
+
if persist_dir.exists() and any(persist_dir.iterdir()):
|
| 152 |
+
logger.info(f"📂 Loading existing ChromaDB from {persist_dir}")
|
| 153 |
+
return Chroma(
|
| 154 |
+
collection_name=collection_name,
|
| 155 |
+
embedding_function=self.embeddings,
|
| 156 |
+
persist_directory=str(persist_dir)
|
| 157 |
+
)
|
| 158 |
+
else:
|
| 159 |
+
# Create new vector store with documents
|
| 160 |
+
logger.info(f"🆕 Creating new ChromaDB at {persist_dir}")
|
| 161 |
+
documents = self._load_documents_from_folder()
|
| 162 |
+
|
| 163 |
+
if documents:
|
| 164 |
+
vector_store = Chroma.from_documents(
|
| 165 |
+
documents=documents,
|
| 166 |
+
embedding=self.embeddings,
|
| 167 |
+
collection_name=collection_name,
|
| 168 |
+
persist_directory=str(persist_dir)
|
| 169 |
+
)
|
| 170 |
+
logger.info(f"✅ Created ChromaDB with {len(documents)} document chunks")
|
| 171 |
+
return vector_store
|
| 172 |
+
else:
|
| 173 |
+
logger.info("📝 No documents found, creating empty ChromaDB")
|
| 174 |
+
return Chroma(
|
| 175 |
+
collection_name=collection_name,
|
| 176 |
+
embedding_function=self.embeddings,
|
| 177 |
+
persist_directory=str(persist_dir)
|
| 178 |
+
)
|
| 179 |
+
except ImportError as e:
|
| 180 |
+
logger.error(f"❌ ChromaDB not available: {e}")
|
| 181 |
+
raise ImportError("ChromaDB provider selected but langchain_chroma not installed")
|
| 182 |
+
|
| 183 |
+
elif provider == "mongodb":
|
| 184 |
+
try:
|
| 185 |
+
logger.info("🔗 Setting up MongoDB Atlas connection...")
|
| 186 |
+
client = MongoClient(db_config["uri"])
|
| 187 |
+
client.admin.command('ping')
|
| 188 |
+
logger.info(f"✅ MongoDB Atlas connection verified")
|
| 189 |
+
print(client.list_database_names())
|
| 190 |
+
# Get the collection
|
| 191 |
+
database = client[db_config["database"]]
|
| 192 |
+
collection = database[db_config["collection_name"]]
|
| 193 |
+
# Create streamlined vector store with Atlas Vector Search
|
| 194 |
+
options = VectorSearchOptions(
|
| 195 |
+
index_name=db_config.get("index_name", "vector_index"),
|
| 196 |
+
embedding_key=db_config.get("vector_field", "ingredients_emb"),
|
| 197 |
+
text_key="title",
|
| 198 |
+
num_candidates=db_config.get("num_candidates", 50),
|
| 199 |
+
similarity_metric=db_config.get("similarity_metric", "cosine")
|
| 200 |
+
)
|
| 201 |
+
|
| 202 |
+
vector_store = CustomMongoDBVectorStore(
|
| 203 |
+
collection=collection,
|
| 204 |
+
embedding_function=self.embeddings,
|
| 205 |
+
options=options
|
| 206 |
+
)
|
| 207 |
+
|
| 208 |
+
logger.info(f"✅ Custom MongoDB Vector Store created successfully")
|
| 209 |
+
logger.info("🎯 Using pre-existing embeddings without requiring vector search index")
|
| 210 |
+
return vector_store
|
| 211 |
+
|
| 212 |
+
except ImportError as e:
|
| 213 |
+
logger.error(f"❌ MongoDB packages not available: {e}")
|
| 214 |
+
raise ImportError("MongoDB provider selected but langchain-mongodb not installed. Run: pip install langchain-mongodb pymongo")
|
| 215 |
+
except Exception as e:
|
| 216 |
+
logger.error(f"❌ MongoDB Atlas connection failed: {e}")
|
| 217 |
+
raise ConnectionError(f"Failed to connect to MongoDB Atlas: {e}")
|
| 218 |
+
|
| 219 |
+
else:
|
| 220 |
+
logger.warning(f"⚠️ Unknown vector store provider '{provider}', falling back to ChromaDB")
|
| 221 |
+
try:
|
| 222 |
+
from langchain_chroma import Chroma
|
| 223 |
+
return Chroma(
|
| 224 |
+
collection_name="fallback_collection",
|
| 225 |
+
embedding_function=self.embeddings,
|
| 226 |
+
persist_directory="./vector_store/fallback_chroma"
|
| 227 |
+
)
|
| 228 |
+
except ImportError:
|
| 229 |
+
logger.error("❌ No valid vector store provider available")
|
| 230 |
+
raise ImportError("No valid vector store provider available")
|
| 231 |
+
|
| 232 |
+
def _load_documents_from_folder(self, folder_path: str = "./data/recipes") -> List[Document]:
    """Load every readable file under *folder_path* and split it into chunks.

    Each chunk becomes a Document carrying source/filename/chunk metadata.
    When the recipes folder is missing or holds no non-empty files, falls
    back to ./data, where the bundled sample_recipes.json lives.
    """
    logger.info(f"📄 Loading documents from: {folder_path}")

    documents: List[Document] = []
    folder = Path(folder_path)

    # Decide whether the recipes folder actually contains usable files.
    has_recipe_files = folder.exists() and any(
        entry.is_file() and entry.stat().st_size > 0
        for entry in folder.rglob("*")
    )

    if not has_recipe_files:
        logger.info(f"📭 No recipe files found in {folder_path}, using sample data")
        folder_path = "./data"  # Use data folder where sample_recipes.json is located
        folder = Path(folder_path)

    if not folder.exists():
        logger.error(f"❌ Folder does not exist: {folder.absolute()}")
        return documents

    # Shared splitter for chunking every file's content.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    for file_path in folder.rglob("*"):
        if not file_path.is_file():
            continue
        try:
            # Read with explicit UTF-8 so recipe text with accents survives.
            content = file_path.read_text(encoding="utf-8")
            if not content.strip():
                continue  # nothing useful in an empty file

            # JSON recipe files are reformatted into readable text first.
            if file_path.suffix.lower() == ".json":
                formatted = self._format_json_recipes(content, file_path)
                if formatted:
                    content = formatted

            # One Document per chunk, tagged with its origin.
            for index, chunk in enumerate(splitter.split_text(content)):
                documents.append(
                    Document(
                        page_content=chunk,
                        metadata={
                            "source": str(file_path),
                            "filename": file_path.name,
                            "chunk_index": index,
                            "file_type": file_path.suffix,
                        },
                    )
                )
        except Exception as e:
            # Best-effort loading: skip unreadable files, keep going.
            logger.error(f"❌ Error loading {file_path}: {e}")
            continue

    logger.info(f"✅ Loaded and chunked {len(documents)} document segments")
    return documents
|
| 301 |
+
|
| 302 |
+
def _format_json_recipes(self, json_content: str, file_path: Path) -> "Optional[str]":
    """Format JSON recipe data into readable text format similar to MongoDB output.

    Args:
        json_content: Raw JSON text — either a single recipe object or a list.
        file_path: Source path, used only for log messages.

    Returns:
        The formatted recipe text, or None when the JSON is invalid or the
        top-level structure is unexpected.  (Fixed: the original annotated
        the return as plain ``str`` despite the None paths.)
    """
    # Hoisted out of the try block so json.JSONDecodeError below is clearly bound.
    import json

    try:
        recipes = json.loads(json_content)

        # Handle both single recipe object and array of recipes
        if isinstance(recipes, dict):
            recipes = [recipes]
        elif not isinstance(recipes, list):
            logger.warning(f"⚠️ Unexpected JSON structure in {file_path}")
            return None

        formatted_recipes = []

        for recipe in recipes:
            if not isinstance(recipe, dict):
                continue  # silently skip malformed entries

            # Extract recipe components
            title = recipe.get("title", "Untitled Recipe")
            ingredients = recipe.get("ingredients", [])
            instructions = recipe.get("instructions", "")

            # Format similar to MongoDB output
            formatted_content = f"Recipe: {title}\n"

            if ingredients:
                if isinstance(ingredients, list):
                    formatted_content += f"Ingredients: {', '.join(ingredients)}\n"
                else:
                    formatted_content += f"Ingredients: {ingredients}\n"

            if instructions:
                # Handle both string and list instructions
                if isinstance(instructions, list):
                    formatted_content += f"Instructions: {' '.join(instructions)}"
                else:
                    formatted_content += f"Instructions: {instructions}"

            # Add selected metadata fields if available
            metadata = recipe.get("metadata", {})
            if metadata:
                formatted_content += "\n"
                for key, value in metadata.items():
                    if key in ["cook_time", "difficulty", "servings", "category"]:
                        formatted_content += f"{key.replace('_', ' ').title()}: {value}\n"

            formatted_recipes.append(formatted_content)

        # Join all recipes with double newlines
        result = "\n\n".join(formatted_recipes)
        # Fixed: report the number actually formatted, not the raw entry count
        # (non-dict entries are skipped above).
        logger.info(f"✅ Formatted {len(formatted_recipes)} JSON recipes from {file_path.name}")
        return result

    except json.JSONDecodeError as e:
        logger.error(f"❌ Invalid JSON in {file_path}: {e}")
        return None
    except Exception as e:
        logger.error(f"❌ Error formatting JSON recipes from {file_path}: {e}")
        return None
|
| 363 |
+
|
| 364 |
+
def get_retriever(self):
    """Get retriever for use with ConversationalRetrievalChain.

    Returns the vector store's retriever with top-k search fixed at 5,
    which applies identically to both ChromaDB and MongoDB Atlas backends.
    """
    logger.info("🔍 Creating retriever from vector store...")

    # For both ChromaDB and MongoDB Atlas, create a standard retriever.
    retriever = self.vector_store.as_retriever()

    # Fixed: the original guarded on hasattr(..., '__class__') — always true —
    # and duplicated the identical {"k": 5} assignment in both branches.
    # Only the log message is provider-specific.
    retriever.search_kwargs = {"k": 5}
    if "MongoDB" in type(self.vector_store).__name__:
        logger.info("🔍 MongoDB Atlas retriever configured with k=5")
    else:
        logger.info("🔍 ChromaDB retriever configured with k=5")

    return retriever
|
| 384 |
+
|
| 385 |
+
# Create global vector store service instance
|
| 386 |
+
vector_store_service = VectorStoreService()
|
backend/tests/__init__.py
ADDED
|
File without changes
|
backend/tests/test_db_settings.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
import os
|
| 3 |
+
from unittest.mock import patch
|
| 4 |
+
from backend.config.database import db_settings
|
| 5 |
+
|
| 6 |
+
class TestMongoAtlasSettings(unittest.TestCase):
    """Checks the MongoDB vector-store configuration exposes the expected keys."""

    @patch.dict(os.environ, {
        'VECTOR_STORE_PROVIDER': 'mongodb'
    })
    def test_mongo_settings(self):
        """
        Test mongodb config contains expected fields when VECTOR_STORE_PROVIDER is set
        """
        # reinitialize db_settings instance so it re-reads the patched environment
        db_settings.__init__()

        db_config = db_settings.get_vector_store_config()
        # assertIn gives a clearer failure message than assertTrue(key in dict)
        self.assertIn('collection_name', db_config)
        self.assertIn('index_name', db_config)
        self.assertIn('text_field', db_config)
|
| 22 |
+
|
| 23 |
+
class TestChromaDBSettings(unittest.TestCase):
    """Checks the ChromaDB vector-store configuration exposes the expected keys."""

    @patch.dict(os.environ, {
        'VECTOR_STORE_PROVIDER': 'chromadb'
    })
    def test_chromadb_settings(self):  # fixed copy-paste name (was test_mongo_settings)
        """
        Test chromadb config contains expected fields when VECTOR_STORE_PROVIDER is set
        """
        # reinitialize db_settings instance so it re-reads the patched environment
        db_settings.__init__()

        db_config = db_settings.get_vector_store_config()
        # assertIn gives a clearer failure message than assertTrue(key in dict)
        self.assertIn('collection_name', db_config)
        self.assertIn('persist_directory', db_config)
        self.assertIn('refresh_on_start', db_config)
|
| 39 |
+
|
| 40 |
+
class TestInvalidDBSettings(unittest.TestCase):
    """Checks that an unsupported vector-store provider is rejected."""

    @patch.dict(os.environ, {
        'VECTOR_STORE_PROVIDER': 'postgres'
    })
    def test_invalid_provider_raises(self):  # fixed copy-paste name (was test_mongo_settings)
        """
        Test invalid db config raises correct error
        """
        # reinitialize db_settings instance so it re-reads the patched environment
        db_settings.__init__()

        with self.assertRaisesRegex(ValueError, "Unsupported"):
            db_settings.get_vector_store_config()
|
backend/tests/test_llm_provider_settings.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
import os
|
| 3 |
+
from unittest.mock import patch
|
| 4 |
+
from backend.config.settings import settings
|
| 5 |
+
|
| 6 |
+
class TestValidLLMProviderSettings(unittest.TestCase):
    """Verifies the active LLM provider's credentials and limits are surfaced."""

    @patch.dict(os.environ, {
        'GOOGLE_API_KEY': 'ivq',
        'GOOGLE_MAX_TOKENS': '1200',
        'OPENAI_API_KEY': 'xyz',
        'OPENAI_MAX_TOKENS': '3800',
        'LLM_PROVIDER': 'openai'
    })
    def test_valid_llm_provider_settings(self):
        """
        Test settings object provides correct config when LLM_PROVIDER is set
        """
        # Rebuild the settings singleton so it picks up the patched environment.
        settings.__init__()

        config = settings.get_llm_config()
        self.assertEqual(config['api_key'], 'xyz')
        self.assertEqual(config['max_tokens'], 3800)
|
| 25 |
+
|
| 26 |
+
class TestInvalidLLMProviderSettings(unittest.TestCase):
    """Verifies that an unknown LLM provider name is rejected."""

    @patch.dict(os.environ, {'LLM_PROVIDER': 'microsoft'})
    def test_invalid_llm_provider_settings(self):
        """
        Test that improper provider config raises the right error
        """
        # Rebuild the settings singleton so it picks up the patched environment.
        settings.__init__()

        with self.assertRaisesRegex(ValueError, "Unsupported"):
            settings.get_llm_config()
|
backend/tests/test_llm_service.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
import os
|
| 3 |
+
from unittest.mock import patch
|
| 4 |
+
from backend.services.llm_service import llm_service
|
| 5 |
+
from backend.config.settings import settings
|
| 6 |
+
from backend.config.database import db_settings
|
| 7 |
+
|
| 8 |
+
class TestLLMService(unittest.TestCase):
    """Smoke test for LLM chat completion (needs valid config and API keys)."""

    @patch.dict(os.environ, {
        'VECTOR_STORE_PROVIDER': 'chromadb',
        'LLM_PROVIDER': 'google'
    })
    def test_chat_completion(self):
        """
        Test chat completions work, assuming proper config and API keys
        """
        # Rebuild the global singletons so they pick up the patched environment.
        settings.__init__()
        db_settings.__init__()
        llm_service.__init__()

        response = llm_service.simple_chat_completion("Hello there 👋🏽")
        # Fixed: removed leftover debug print(); assertGreater reports the
        # offending length on failure, unlike assertTrue(len(...) > 0).
        self.assertGreater(len(response), 0)
|
backend/utils/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Utils package - Utility functions and helpers
# Re-exports the sanitization helpers so callers can import them directly
# from backend.utils instead of backend.utils.sanitization.
from .sanitization import (
    DataSanitizer,
    sanitize_user_input
)

# Public API of this package.
__all__ = [
    'DataSanitizer',
    'sanitize_user_input'
]
|
backend/utils/helpers.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Utility functions for the Recipe Recommendation Bot
|
| 2 |
+
|
backend/utils/request_dto/chat_response.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
|
| 3 |
+
class ChatResponse(BaseModel):
    """Response payload returned to the client for a chat request."""
    response: str = Field(..., description="Bot response to the user message")
|
backend/utils/request_dto/scrape_request.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class ScrapeRequest(BaseModel):
    """Request body for the recipe-scraping endpoint."""
    site: str  # presumably a scraper/site identifier — confirm against the scrape handler
    limit: int = 50  # maximum number of recipes to scrape
    output_type: str = "json"  # or "mongo"
|