Upload 103 files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
- .env +12 -0
- .gitattributes +6 -0
- .gitignore +124 -0
- Backend.md +65 -0
- Dockerfile +17 -0
- RAG-architecture.md +254 -0
- README.md +310 -11
- VectorDB/user_1_vector_db/chroma.sqlite3 +3 -0
- app.py +90 -0
- clean_repo/.env +12 -0
- clean_repo/.gitattributes +35 -0
- clean_repo/.gitignore +128 -0
- clean_repo/Backend.md +65 -0
- clean_repo/Dockerfile +17 -0
- clean_repo/RAG-architecture.md +254 -0
- clean_repo/README.md +310 -0
- clean_repo/VectorDB/user_1_vector_db/chroma.sqlite3 +3 -0
- clean_repo/app.py +90 -0
- clean_repo/clean_repo/.env +12 -0
- clean_repo/clean_repo/.gitattributes +35 -0
- clean_repo/clean_repo/.gitignore +120 -0
- clean_repo/clean_repo/Backend.md +65 -0
- clean_repo/clean_repo/Dockerfile +17 -0
- clean_repo/clean_repo/RAG-architecture.md +254 -0
- clean_repo/clean_repo/README.md +310 -0
- clean_repo/clean_repo/app.py +90 -0
- clean_repo/clean_repo/env_template.txt +51 -0
- clean_repo/clean_repo/requirements.txt +0 -0
- clean_repo/clean_repo/src/streamlit_app.py +40 -0
- clean_repo/env_template.txt +51 -0
- clean_repo/images/DIAGRAM-RAG-diary.png +0 -0
- clean_repo/notebook/RAG-test.ipynb +0 -0
- clean_repo/notebook/exploration.ipynb +0 -0
- clean_repo/requirements.txt +0 -0
- clean_repo/src/Indexingstep/Datasplitting.py +44 -0
- clean_repo/src/Indexingstep/database_utils.py +140 -0
- clean_repo/src/Indexingstep/dataloading.py +603 -0
- clean_repo/src/Indexingstep/diary_text_splitter.py +241 -0
- clean_repo/src/Indexingstep/embedding_and_storing.py +499 -0
- clean_repo/src/Indexingstep/indexing_pipeline.py +110 -0
- clean_repo/src/Indexingstep/pipeline.py +459 -0
- clean_repo/src/Retrivel_And_Generation/Retrieval_And_Generator.py +739 -0
- clean_repo/src/Retrivel_And_Generation/__pycache__/Retrieval_And_Generator.cpython-311.pyc +0 -0
- clean_repo/src/rag_service/main.py +721 -0
- clean_repo/src/simple_diary_chatbot.py +274 -0
- clean_repo/src/streamlit_app.py +40 -0
- clean_repo/src/streamlit_app/__pycache__/auth_ui.cpython-311.pyc +0 -0
- clean_repo/src/streamlit_app/__pycache__/auto_sync.cpython-311.pyc +0 -0
- clean_repo/src/streamlit_app/__pycache__/rag_client.cpython-311.pyc +0 -0
- clean_repo/src/streamlit_app/__pycache__/user_auth.cpython-311.pyc +0 -0
.env
ADDED
@@ -0,0 +1,12 @@
+# Google API Configuration for RAG System
+GOOGLE_API_KEY=AIzaSyAZQN21CjLySEybT6vOYDCz4V_e85gD42k
+
+# Database Configuration
+DATABASE_PATH=./src/streamlit_app/backend/diary.db
+
+# Vector Database Configuration
+VECTOR_DB_PATH=./src/Indexingstep/diary_vector_db_enhanced
+COLLECTION_NAME=diary_entries
+
+# RAG Configuration
+EMBEDDING_MODEL=models/embedding-001
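These settings are typically loaded at startup with python-dotenv; the following is a minimal sketch of the consuming side, assuming the python-dotenv package is used (the fallback defaults simply mirror the values above):

```python
# Minimal sketch: load the RAG settings defined in .env.
# Assumes the python-dotenv package; defaults mirror the committed .env.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
DATABASE_PATH = os.getenv("DATABASE_PATH", "./src/streamlit_app/backend/diary.db")
VECTOR_DB_PATH = os.getenv("VECTOR_DB_PATH", "./src/Indexingstep/diary_vector_db_enhanced")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "diary_entries")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "models/embedding-001")

if not GOOGLE_API_KEY:
    raise RuntimeError("GOOGLE_API_KEY is not set; see env_template.txt")
```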
.gitattributes
CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+clean_repo/src/streamlit_app/src/Indexingstep/user_3_vector_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+clean_repo/src/streamlit_app/temp/recorded_audio.wav filter=lfs diff=lfs merge=lfs -text
+clean_repo/VectorDB/user_1_vector_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+src/streamlit_app/src/Indexingstep/user_3_vector_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+src/streamlit_app/temp/recorded_audio.wav filter=lfs diff=lfs merge=lfs -text
+VectorDB/user_1_vector_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,124 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+./venv/
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+# Vector databases & generated files
+/src/Indexingstep/src/Indexingstep/diary_vector_db_enhanced/
+*.db
+*.sqlite3
+*.bin
+# OS
+.DS_Store
+Thumbs.db
+
+# Project specific
+data/
+models/
+logs/
+*.pkl
+*.model
Backend.md
ADDED
@@ -0,0 +1,65 @@
+# Backend Architecture - Personal Diary Chatbot
+
+## Backend Overview
+
+The project's backend is built on FastAPI and provides a RESTful API for diary processing, search, and interaction with the RAG chatbot. The system follows a microservices-style architecture designed for high scalability.
+
+## 🏛️ Overall Architecture
+
+```
+┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+│    Frontend     │    │   API Gateway   │    │  Core Services  │
+│   (Streamlit)   │◄──►│    (FastAPI)    │◄──►│   (RAG Engine)  │
+└─────────────────┘    └─────────────────┘    └─────────────────┘
+                                │
+                                ▼
+                       ┌─────────────────┐
+                       │   Data Layer    │
+                       │   (Vector DB)   │
+                       └─────────────────┘
+```
+
+## 🔧 Backend Directory Structure
+
+```
+src/
+├── rag_service/                   # FastAPI service
+│   ├── main.py                    # Main application entry point
+│   ├── __init__.py
+│   └── __pycache__/
+├── Indexingstep/                  # Data processing pipeline
+│   ├── pipeline.py                # Main indexing pipeline
+│   ├── dataloading.py             # Document loading utilities
+│   ├── diary_text_splitter.py     # Text chunking logic
+│   ├── embedding_and_storing.py   # Vector embedding & storage
+│   ├── database_utils.py          # Database operations
+│   └── indexing_pipeline.py       # Pipeline orchestration
+├── Retrivel_And_Generation/       # RAG core engine
+│   ├── Retrieval_And_Generator.py # Main RAG system
+│   └── __init__.py
+├── VectorDB/                      # Vector database storage
+└── streamlit_app/                 # Frontend application
+    ├── backend/                   # Backend utilities for UI
+    ├── user_auth.py               # Authentication system
+    ├── rag_client.py              # RAG service client
+    └── interface.py               # Main UI interface
+```
+## 🔮 Future Enhancements
+
+### 1. Microservices Architecture
+- **User Service**: Dedicated user management
+- **Document Service**: Document processing pipeline
+- **Search Service**: Vector search optimization
+- **Chat Service**: Conversation management
+
+### 2. Advanced Features
+- **Real-time synchronization**: WebSocket support
+- **Multi-language support**: Internationalization
+- **Advanced analytics**: User behavior tracking
+- **Machine learning**: Continuous model improvement
+
+### 3. Infrastructure Improvements
+- **Kubernetes deployment**: Container orchestration
+- **Service mesh**: Istio integration
+- **Observability**: Distributed tracing
+- **Auto-scaling**: Dynamic resource allocation
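This diff describes src/rag_service/main.py only at the directory level; for orientation, a FastAPI service of that shape could look like the following minimal sketch. The /ask endpoint and request model here are illustrative assumptions, not the file's actual contents:

```python
# Minimal FastAPI sketch of the rag_service shape described above.
# The /ask endpoint and QueryRequest model are illustrative assumptions.
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI(title="RAG Personal Diary Chatbot")

class QueryRequest(BaseModel):
    user_id: int
    question: str

@app.post("/ask")
def ask(req: QueryRequest) -> dict:
    # In the real service this would call the retrieval + generation engine
    # in src/Retrivel_And_Generation/Retrieval_And_Generator.py.
    return {"user_id": req.user_id, "answer": f"(stub) You asked: {req.question}"}
```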
Dockerfile
ADDED
@@ -0,0 +1,17 @@
+# Use the official Python image
+FROM python:3.10-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy requirements.txt into the container (if present)
+COPY requirements.txt .
+
+# Install the dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the entire source code into the container
+COPY . .
+
+# Run the application (replace app.py with your main entry file)
+CMD ["python", "app.py"]
RAG-architecture.md
ADDED
@@ -0,0 +1,254 @@
+# RAG Architecture - Personal Diary Chatbot
+
+## 🏗️ RAG Architecture Overview
+
+The RAG (Retrieval-Augmented Generation) architecture in this project is designed to provide intelligent search and question answering over users' personal diary data.
+
+## 🔄 RAG Processing Flow
+
+```
+┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+│   Input Query   │───►│     Query       │───►│     Vector      │
+│ (User Question) │    │   Processing    │    │     Search      │
+└─────────────────┘    └─────────────────┘    └─────────────────┘
+                                                       │
+                                                       ▼
+┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+│  Final Answer   │◄───│     Answer      │◄───│    Context      │
+│   (Response)    │    │   Generation    │    │   Retrieval     │
+└─────────────────┘    └─────────────────┘    └─────────────────┘
+```
+
+## 📊 Component Details
+
+### 1. Data Ingestion & Indexing
+
+#### 1.1 Document Loading
+- **Input formats**: PDF, DOCX, TXT
+- **Processing**: Text extraction, cleaning, normalization
+- **Output**: Structured text data
+
+#### 1.2 Text Chunking
+```python
+# Chunking strategy
+chunk_size = 1000  # characters
+chunk_overlap = 200  # characters
+chunking_method = "recursive_character_splitter"
+```
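The parameters in 1.2 correspond to LangChain's RecursiveCharacterTextSplitter; a minimal sketch follows, assuming the langchain-text-splitters package is the splitter actually used (the diary.txt file name is illustrative):

```python
# Minimal sketch of the chunking step, assuming langchain-text-splitters.
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # characters per chunk, as configured above
    chunk_overlap=200,  # 20% overlap between consecutive chunks
)

with open("diary.txt", encoding="utf-8") as f:  # illustrative input file
    chunks = splitter.split_text(f.read())
print(f"{len(chunks)} chunks produced")
```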
+
+#### 1.3 Embedding Generation
+- **Model**: Google Universal Sentence Encoder (USE)
+- **Vector dimension**: 512
+- **Normalization**: L2 normalization
+- **Storage**: ChromaDB vector database
+
+### 2. Vector Database Architecture
+
+#### 2.1 ChromaDB Configuration
+```python
+# Database settings
+collection_name = f"user_{user_id}_diary"
+metadata = {
+    "user_id": user_id,
+    "source": "diary_entry",
+    "date": entry_date,
+    "chunk_id": chunk_id
+}
+```
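Wiring the configuration from 2.1 into ChromaDB might look like the sketch below (assuming the chromadb package; the user_id, date, and document values are illustrative):

```python
# Sketch: per-user ChromaDB collection using the metadata fields from 2.1.
import chromadb

client = chromadb.PersistentClient(path="./VectorDB/user_1_vector_db")
user_id, entry_date, chunk_id = 1, "2024-01-15", "entry_42_chunk_0"  # illustrative

collection = client.get_or_create_collection(name=f"user_{user_id}_diary")
collection.add(
    ids=[chunk_id],
    documents=["Today I started building the diary chatbot..."],  # illustrative
    metadatas=[{
        "user_id": user_id,
        "source": "diary_entry",
        "date": entry_date,
        "chunk_id": chunk_id,
    }],
)
```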
+
+#### 2.2 Index Structure
+- **Primary key**: `user_id + chunk_id`
+- **Vector index**: HNSW (Hierarchical Navigable Small World)
+- **Distance metric**: Cosine similarity
+- **Sharding**: Per-user collections
+
+### 3. Retrieval Engine
+
+#### 3.1 Query Processing
+```python
+# Query preprocessing
+def process_query(query: str) -> str:
+    # 1. Text cleaning
+    # 2. Stop word removal
+    # 3. Lemmatization
+    # 4. Query expansion
+    processed_query = query.strip()  # placeholder for the steps listed above
+    return processed_query
+```
+
+#### 3.2 Vector Search
+- **Search algorithm**: K-Nearest Neighbors (KNN)
+- **Top-k results**: 5-10 most relevant chunks
+- **Similarity threshold**: 0.7 (cosine similarity)
+- **Reranking**: Semantic relevance scoring
+
+#### 3.3 Context Assembly
+```python
+# Context building
+def build_context(retrieved_chunks, query):
+    # 1. Sort by relevance score
+    # 2. Remove duplicates
+    # 3. Truncate to token limit
+    # 4. Add metadata context
+    final_context = "\n\n".join(retrieved_chunks)  # placeholder for the steps above
+    return final_context
+```
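Continuing the ChromaDB sketch from 2.1, the top-k retrieval described in 3.2 reduces to a single query call. Applying the 0.7 cosine-similarity threshold to ChromaDB's returned distances is an assumption about how the threshold is enforced:

```python
# Sketch: top-k retrieval against a per-user collection (cosine space assumed).
import chromadb

client = chromadb.PersistentClient(path="./VectorDB/user_1_vector_db")
collection = client.get_or_create_collection(name="user_1_diary")

results = collection.query(
    query_texts=["What did I write about work last week?"],  # illustrative
    n_results=5,  # top-k as described in 3.2
)
# ChromaDB returns distances; with cosine distance, similarity = 1 - distance,
# so a 0.7 similarity threshold keeps hits with distance <= 0.3 (assumption).
docs = [
    doc
    for doc, dist in zip(results["documents"][0], results["distances"][0])
    if 1.0 - dist >= 0.7
]
```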
+
+### 4. Generation Engine
+
+#### 4.1 LLM Integration
+- **Primary model**: OpenAI GPT-3.5/4
+- **Fallback model**: Local model (if needed)
+- **Temperature**: 0.7 (balanced creativity)
+- **Max tokens**: 500 (response length)
+
+#### 4.2 Prompt Engineering
+```python
+# System prompt template
+SYSTEM_PROMPT = """
+You are a helpful AI assistant that answers questions about personal diary entries.
+Use only the provided context to answer questions.
+If the information is not in the context, say so.
+Be conversational but professional.
+"""
+
+#### 4.3 Response Generation
+```python
+# Generation pipeline
+def generate_response(query, context, chat_history):
+    # 1. Build prompt with context
+    # 2. Call LLM API
+    # 3. Post-process response
+    # 4. Validate against context
+    # 5. Return final answer
+    ...
+```
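The .env files in this same commit set CHAT_MODEL=gemini-2.5-flash, so the generation step presumably targets Gemini rather than the GPT models listed in 4.1; the following is a minimal sketch with the google-generativeai package, under that assumption:

```python
# Sketch: generation step against Gemini, as configured in this commit's .env.
# Assumes the google-generativeai package and GOOGLE_API_KEY in the environment.
import os
import google.generativeai as genai

SYSTEM_PROMPT = "You are a helpful AI assistant..."  # abridged, from section 4.2

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel("gemini-2.5-flash", system_instruction=SYSTEM_PROMPT)

def generate_response(query: str, context: str) -> str:
    prompt = f"Context from the diary:\n{context}\n\nQuestion: {query}"
    return model.generate_content(prompt).text
```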
+
+## 🔧 Technical Configuration
+
+### Performance Tuning
+
+#### 1. Chunking Optimization
+- **Optimal chunk size**: 1000 characters
+- **Overlap ratio**: 20%
+- **Chunking strategy**: Recursive character splitter
+
+#### 2. Vector Search Optimization
+- **Index type**: HNSW
+- **Search parameters**:
+  - `ef_construction`: 200
+  - `ef_search`: 100
+  - `m`: 16
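In ChromaDB, HNSW knobs like these are commonly passed as collection metadata; whether this project does so is an assumption, but the mapping would look roughly like:

```python
# Sketch: applying the HNSW parameters above at collection creation time.
# The "hnsw:*" metadata keys are a ChromaDB convention; assumption that this
# project configures its index this way.
import chromadb

client = chromadb.PersistentClient(path="./VectorDB/user_1_vector_db")
collection = client.get_or_create_collection(
    name="user_1_diary",
    metadata={
        "hnsw:space": "cosine",        # distance metric from section 2.2
        "hnsw:construction_ef": 200,   # ef_construction
        "hnsw:search_ef": 100,         # ef_search
        "hnsw:M": 16,                  # m
    },
)
```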
+
+#### 3. Caching Strategy
+- **Query cache**: Redis (in-memory)
+- **Embedding cache**: Local file cache
+- **Response cache**: TTL-based expiration
+
+### Scalability Features
+
+#### 1. Multi-User Support
+- **User isolation**: Separate vector collections
+- **Resource management**: Per-user memory limits
+- **Concurrent access**: Async processing
+
+#### 2. Horizontal Scaling
+- **Load balancing**: Multiple RAG instances
+- **Database sharding**: User-based distribution
+- **Microservices**: Modular architecture
+
+## 📈 Monitoring & Analytics
+
+### 1. Performance Metrics
+- **Query latency**: < 2 seconds
+- **Retrieval accuracy**: > 85%
+- **Generation quality**: User satisfaction score
+- **System throughput**: Queries per second
+
+### 2. Quality Assurance
+- **Context relevance**: Similarity score tracking
+- **Answer accuracy**: Human evaluation
+- **User feedback**: Rating system
+- **A/B testing**: Model comparison
+
+## 🚀 Deployment Architecture
+
+### 1. Development Environment
+```
+┌─────────────────┐    ┌─────────────────┐
+│  Local Python   │    │     Local       │
+│  Environment    │◄──►│    ChromaDB     │
+└─────────────────┘    └─────────────────┘
+```
+
+### 2. Production Environment
+```
+┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+│  Load Balancer  │    │   RAG Service   │    │   Vector DB     │
+│     (Nginx)     │◄──►│    (FastAPI)    │◄──►│   (ChromaDB)    │
+└─────────────────┘    └─────────────────┘    └─────────────────┘
+                                │
+                                ▼
+                       ┌─────────────────┐
+                       │   Redis Cache   │
+                       └─────────────────┘
+```
+
+## 🔒 Security & Privacy
+
+### 1. Data Protection
+- **User isolation**: Strict separation of data
+- **Encryption**: At-rest and in-transit
+- **Access control**: Role-based permissions
+- **Audit logging**: Complete access history
+
+### 2. Privacy Compliance
+- **GDPR compliance**: Data portability
+- **Data retention**: Configurable policies
+- **User consent**: Explicit permission management
+- **Data anonymization**: Optional features
+
+## 🧪 Testing Strategy
+
+### 1. Unit Testing
+- **Component testing**: Individual modules
+- **Mock testing**: External API simulation
+- **Coverage target**: > 90%
+
+### 2. Integration Testing
+- **End-to-end testing**: Complete RAG pipeline
+- **Performance testing**: Load and stress tests
+- **Security testing**: Vulnerability assessment
+
+
+## 📚 Best Practices
+
+### 1. Model Selection
+- **Embedding models**: Domain-specific fine-tuning
+- **LLM selection**: Cost-performance balance
+- **Fallback strategies**: Graceful degradation
+
+### 2. Data Quality
+- **Input validation**: Strict data checking
+- **Cleaning pipeline**: Automated preprocessing
+- **Quality metrics**: Continuous monitoring
+
+### 3. Error Handling
+- **Graceful failures**: User-friendly error messages
+- **Retry mechanisms**: Automatic recovery
+- **Logging**: Comprehensive error tracking
+
+## 🔮 Future Enhancements
+
+### 1. Advanced Features
+- **Multi-modal RAG**: Image and text processing
+- **Temporal reasoning**: Time-based queries
+- **Emotional analysis**: Sentiment-aware responses
+
+### 2. Performance Improvements
+- **Vector quantization**: Reduced memory usage
+- **Approximate search**: Faster retrieval
+- **Model distillation**: Smaller, faster models
+
+### 3. Integration Capabilities
+- **API ecosystem**: Third-party integrations
+- **Mobile applications**: Native mobile support
+- **Voice interface**: Speech-to-text integration
README.md
CHANGED
@@ -1,11 +1,310 @@
-
-
-
-
-
-
-
-
-
-
-
+# RAG Personal Diary Chatbot
+
+## 📖 Project Description
+
+RAG Personal Diary Chatbot is an intelligent chatbot application that uses the RAG (Retrieval-Augmented Generation) architecture to interact with users' personal diaries. The application allows users to ask questions about diary content and receive accurate answers based on actual data.
+
+## ✨ Key Features
+
+
+## 🏗️ System Architecture
+
+```
+┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+│  Streamlit UI   │    │     FastAPI     │    │     Vector      │
+│   (Frontend)    │◄──►│     Backend     │◄──►│    Database     │
+└─────────────────┘    └─────────────────┘    └─────────────────┘
+                                │
+                                ▼
+                       ┌─────────────────┐
+                       │   RAG Engine    │
+                       │     (LLM +      │
+                       │   Retrieval)    │
+                       └─────────────────┘
+```
+
+## 🚀 Installation and Setup
+
+### System Requirements
+
+### Install Dependencies
+```bash
+# Create virtual environment
+python -m venv .venv
+
+# Activate virtual environment
+# Windows
+.venv\Scripts\activate
+# Linux/Mac
+source .venv/bin/activate
+
+# Install packages
+pip install -r requirements.txt
+```
+
+### Environment Configuration
+
+Create a `.env` file in the project root directory with the following structure:
+
+```env
+# API Keys
+OPENAI_API_KEY=your_openai_api_key_here
+GOOGLE_API_KEY=your_google_api_key_here
+ANTHROPIC_API_KEY=your_anthropic_api_key_here
+
+# Database Configuration
+DATABASE_URL=sqlite:///./user_database/auth.db
+VECTOR_DB_PATH=./VectorDB
+
+# Model Configuration
+EMBEDDING_MODEL=google-universal-sentence-encoder
+LLM_MODEL=gpt-3.5-turbo
+CHUNK_SIZE=1000
+CHUNK_OVERLAP=200
+
+# Server Configuration
+RAG_SERVICE_PORT=8001
+STREAMLIT_PORT=8501
+FASTAPI_PORT=8000
+
+# Security
+SECRET_KEY=your_secret_key_here
+JWT_SECRET=your_jwt_secret_here
+
+# Logging
+LOG_LEVEL=INFO
+LOG_FILE=./logs/app.log
+
+# Vector Database
+CHROMA_DB_PATH=./VectorDB
+PERSIST_DIRECTORY=./VectorDB
+
+# File Processing
+SUPPORTED_FORMATS=pdf,docx,txt,md
+MAX_FILE_SIZE=10485760
+TEMP_DIR=./temp
+
+# RAG Configuration
+TOP_K_RESULTS=5
+SIMILARITY_THRESHOLD=0.7
+MAX_TOKENS=4096
+TEMPERATURE=0.7
+```
+
+**Important Notes:**
+
+### Run the Application
+
+#### 1. Start RAG Service
+```bash
+python start_rag_service.py
+```
+The service runs at: http://127.0.0.1:8001
+
+#### 2. Start Streamlit UI
+```bash
+cd src/streamlit_app
+streamlit run interface.py
+```
+The UI runs at: http://localhost:8501
+
+## 📁 Directory Structure
+
+```
+RAG-Personal-Diary-Chatbot/
+├── src/
+│   ├── Indexingstep/              # Data indexing pipeline
+│   ├── Retrivel_And_Generation/   # RAG engine
+│   ├── rag_service/               # FastAPI backend
+│   ├── streamlit_app/             # User interface
+│   └── VectorDB/                  # Vector database
+├── notebook/                      # Jupyter notebooks
+├── tests/                         # Unit tests
+├── images/                        # Documentation images
+├── start_rag_service.py           # Service startup script
+├── .env                           # Environment variables (create from template)
+├── env_template.txt               # Environment variables template
+└── README.md
+```
+
+## 🔧 Configuration
+
+### Vector Database
+
+### AI Models
+
+## 📊 Performance
+
+
+## 🧪 Testing
+
+```bash
+# Run all tests
+python -m pytest tests/
+
+# Run specific test
+python -m pytest tests/test_rag_system.py
+```
+
+## 🤝 Contributing
+
+1. Fork the project
+2. Create a feature branch (`git checkout -b feature/AmazingFeature`)
+3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
+4. Push to the branch (`git push origin feature/AmazingFeature`)
+5. Open a Pull Request
+
+## 📝 License
+
+This project is distributed under the MIT License. See the `LICENSE` file for more details.
+
+## 📞 Contact
+
+
+## 🙏 Acknowledgments
+
+
+## 📖 Project Description
+
+RAG Personal Diary Chatbot is an intelligent chatbot application that leverages Retrieval-Augmented Generation (RAG) architecture to interact with users' personal diaries. Users can ask questions about their diary content and receive accurate, context-based answers.
+
+## ✨ Key Features
+
+- **Diary Indexing**: Automatically processes and indexes diary files (PDF, DOCX, TXT)
+- **Semantic Search**: Uses a vector database for semantic search
+- **AI Chatbot**: Natural interaction with diary data
+- **User Isolation**: Each user has a separate vector database
+- **Web Interface**: Easy-to-use Streamlit UI
+- **REST API**: FastAPI backend for integration
+
+## 🏗️ System Architecture
+
+```
+┌───────────────┐    ┌───────────────┐    ┌───────────────┐
+│ Streamlit UI  │◄──►│    FastAPI    │◄──►│   Vector DB   │
+│  (Frontend)   │    │    Backend    │    │   (ChromaDB)  │
+└───────────────┘    └───────────────┘    └───────────────┘
+                             │
+                             ▼
+                    ┌───────────────┐
+                    │  RAG Engine   │
+                    │    (LLM +     │
+                    │   Retrieval)  │
+                    └───────────────┘
+```
+
+## 🚀 Installation and Setup
+
+### System Requirements
+
+- Python 3.8+
+
+### Install Dependencies
+
+```bash
+# Create virtual environment
+python -m venv .venv
+
+# Activate virtual environment
+# Windows
+.venv\Scripts\activate
+# Linux/Mac
+source .venv/bin/activate
+
+# Install packages
+pip install -r requirements.txt
+```
+
+### Environment Configuration
+
+Create a `.env` file in the project root directory with the following structure:

+```env
+# Google API Configuration for RAG System
+GOOGLE_API_KEY=[Google API key]
+
+# Database Configuration
+DATABASE_PATH=./src/streamlit_app/backend/diary.db
+
+# Vector Database Configuration
+VECTOR_DB_PATH=./src/Indexingstep/diary_vector_db_enhanced
+COLLECTION_NAME=diary_entries
+
+# RAG Configuration
+EMBEDDING_MODEL=models/embedding-001
+CHAT_MODEL=gemini-2.5-flash
+```
+
+**Important Notes:**
+- Replace all placeholder values with your actual API keys and configuration
+- Keep your `.env` file secure and never commit it to version control
+- The `.env` file is already included in `.gitignore`
+- Use `env_template.txt` as a reference to create your `.env` file
+
+### Run the Application
+
+```bash
+# Start the RAG backend service
+python start_rag_service.py
+
+# Start the Streamlit UI
+streamlit run src/streamlit_app/interface.py
+```
+
+## 📁 Directory Structure
+
+```
+RAG-Personal-Diary-Chatbot/
+├── src/
+│   ├── Indexingstep/              # Data indexing pipeline
+│   ├── Retrivel_And_Generation/   # RAG engine
+│   ├── rag_service/               # FastAPI backend
+│   ├── streamlit_app/             # User interface
+│   └── VectorDB/                  # Vector database
+├── notebook/                      # Jupyter notebooks
+├── tests/                         # Unit tests
+├── images/                        # Documentation images
+├── start_rag_service.py           # Service startup script
+├── .env                           # Environment variables (create from template)
+├── env_template.txt               # Environment variables template
+└── README.md
+```
+
+## 🔧 Configuration
+
+### Vector Database
+- **ChromaDB**: Main database for vector embeddings
+- **Chunk size**: 1000 characters (customizable)
+- **Overlap**: 200 characters between chunks
+
+### AI Models
+- **Embedding**: Google's Universal Sentence Encoder
+- **LLM**: Google Gemini (can be replaced with other models)
+
+## 📊 Performance
+
+- **Processing time**: ~2-5 seconds per question
+- **Accuracy**: 85-95% depending on data quality
+- **Scalability**: Supports thousands of diaries
+
+
+## 🤝 Contributing
+
+1. Fork the project
+2. Create a feature branch (`git checkout -b feature/AmazingFeature`)
+3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
+4. Push to the branch (`git push origin feature/AmazingFeature`)
+5. Open a Pull Request
+## 📞 Contact
+
+- **Author**: [DongAnh]
+- **Email**: [donganhng098@gmail.com]
+- **GitHub**: [github.com/DongAnh]
+
+## 🙏 Acknowledgments
+
+- Google for Gemini models
+- Google for Universal Sentence Encoder
+- ChromaDB team for vector database
+- FastAPI and Streamlit communities
+- RAG architecture
VectorDB/user_1_vector_db/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e895be71c0752f44743806cd7439624649d5e41500865e79c77e07c6d1dca87
+size 163840
app.py
ADDED
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+"""
+Start RAG Service for Personal Diary Chatbot
+"""
+import subprocess
+import sys
+import os
+from pathlib import Path
+import time
+
+def check_requirements():
+    """Check if required packages are installed."""
+    required_packages = ['fastapi', 'uvicorn']
+    missing_packages = []
+
+    for package in required_packages:
+        try:
+            __import__(package)
+        except ImportError:
+            missing_packages.append(package)
+
+    if missing_packages:
+        print(f"❌ Missing packages: {', '.join(missing_packages)}")
+        print(f"Install with: pip install {' '.join(missing_packages)}")
+        return False
+
+    return True
+
+def setup_environment():
+    """Setup environment and directories."""
+    # Ensure VectorDB directory exists
+    vector_db_dir = Path("src/VectorDB")
+    vector_db_dir.mkdir(parents=True, exist_ok=True)
+    print(f"📁 Vector DB directory: {vector_db_dir.absolute()}")
+
+    # Check for .env file
+    env_file = Path("src/Indexingstep/.env")
+    if env_file.exists():
+        print(f"✅ Environment file found: {env_file}")
+    else:
+        print(f"⚠️ Environment file not found: {env_file}")
+        print("Make sure GOOGLE_API_KEY is set in environment")
+
+def start_service():
+    """Start the RAG FastAPI service."""
+    if not check_requirements():
+        return
+
+    setup_environment()
+
+    service_file = Path("src/rag_service/main.py")
+
+    if not service_file.exists():
+        print(f"❌ Service file not found: {service_file}")
+        print("Please create the RAG service file first")
+        return
+
+    print("🚀 Starting RAG Service...")
+    print("📍 Service URL: http://0.0.0.0:8001")
+    print("📖 API Docs: http://0.0.0.0:8001/docs")
+    print("💾 Vector databases will be stored in: src/VectorDB/")
+    print("\nPress Ctrl+C to stop the service")
+    print("-" * 50)
+
+    try:
+        # Change to project root directory
+        os.chdir(Path(__file__).parent)
+
+        # Start the service in the background
+        process = subprocess.Popen([
+            sys.executable, "-m", "uvicorn",
+            "src.rag_service.main:app",
+            "--host", "0.0.0.0",
+            "--port", "8001",
+            "--reload"
+        ])
+        print(f"🔄 RAG Service running in background (PID: {process.pid})")
+        return process
+    except Exception as e:
+        print(f"❌ Error starting service: {e}")
+        return None
+
+def start_streamlit():
+    # Start Streamlit UI on port 7860 (default for Spaces)
+    os.system("streamlit run src/streamlit_app/interface.py --server.port 7860")
+
+if __name__ == "__main__":
+    start_service()
+    time.sleep(3)
+    start_streamlit()
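One caveat in app.py as committed: the background uvicorn process started by start_service() is never terminated when the Streamlit UI exits. A hypothetical hardening of the entry point (not part of this commit) could clean up the child process:

```python
# Hypothetical variant of app.py's entry point that terminates the
# background uvicorn process once the Streamlit UI exits.
if __name__ == "__main__":
    process = start_service()
    time.sleep(3)
    try:
        start_streamlit()          # blocks until Streamlit exits
    finally:
        if process is not None:
            process.terminate()    # stop the background RAG service
            process.wait(timeout=10)
```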
clean_repo/.env
ADDED
@@ -0,0 +1,12 @@
+# Google API Configuration for RAG System
+GOOGLE_API_KEY=AIzaSyAZQN21CjLySEybT6vOYDCz4V_e85gD42k
+
+# Database Configuration
+DATABASE_PATH=./src/streamlit_app/backend/diary.db
+
+# Vector Database Configuration
+VECTOR_DB_PATH=./src/Indexingstep/diary_vector_db_enhanced
+COLLECTION_NAME=diary_entries
+
+# RAG Configuration
+EMBEDDING_MODEL=models/embedding-001
clean_repo/.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
clean_repo/.gitignore
ADDED
@@ -0,0 +1,128 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+./venv/
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Project specific
+data/
+models/
+logs/
+*.pkl
+*.model
+
+# Vector databases & generated files
+/VectorDB/
+/src/VectorDB/
+/src/Indexingstep/src/Indexingstep/diary_vector_db_enhanced/
+*.db
+*.sqlite3
+*.bin
clean_repo/Backend.md
ADDED
@@ -0,0 +1,65 @@
+# Backend Architecture - Personal Diary Chatbot
+
+## Backend Overview
+
+The project's backend is built on FastAPI and provides a RESTful API for diary processing, search, and interaction with the RAG chatbot. The system follows a microservices-style architecture designed for high scalability.
+
+## 🏛️ Overall Architecture
+
+```
+┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+│    Frontend     │    │   API Gateway   │    │  Core Services  │
+│   (Streamlit)   │◄──►│    (FastAPI)    │◄──►│   (RAG Engine)  │
+└─────────────────┘    └─────────────────┘    └─────────────────┘
+                                │
+                                ▼
+                       ┌─────────────────┐
+                       │   Data Layer    │
+                       │   (Vector DB)   │
+                       └─────────────────┘
+```
+
+## 🔧 Backend Directory Structure
+
+```
+src/
+├── rag_service/                   # FastAPI service
+│   ├── main.py                    # Main application entry point
+│   ├── __init__.py
+│   └── __pycache__/
+├── Indexingstep/                  # Data processing pipeline
+│   ├── pipeline.py                # Main indexing pipeline
+│   ├── dataloading.py             # Document loading utilities
+│   ├── diary_text_splitter.py     # Text chunking logic
+│   ├── embedding_and_storing.py   # Vector embedding & storage
+│   ├── database_utils.py          # Database operations
+│   └── indexing_pipeline.py       # Pipeline orchestration
+├── Retrivel_And_Generation/       # RAG core engine
+│   ├── Retrieval_And_Generator.py # Main RAG system
+│   └── __init__.py
+├── VectorDB/                      # Vector database storage
+└── streamlit_app/                 # Frontend application
+    ├── backend/                   # Backend utilities for UI
+    ├── user_auth.py               # Authentication system
+    ├── rag_client.py              # RAG service client
+    └── interface.py               # Main UI interface
+```
+## 🔮 Future Enhancements
+
+### 1. Microservices Architecture
+- **User Service**: Dedicated user management
+- **Document Service**: Document processing pipeline
+- **Search Service**: Vector search optimization
+- **Chat Service**: Conversation management
+
+### 2. Advanced Features
+- **Real-time synchronization**: WebSocket support
+- **Multi-language support**: Internationalization
+- **Advanced analytics**: User behavior tracking
+- **Machine learning**: Continuous model improvement
+
+### 3. Infrastructure Improvements
+- **Kubernetes deployment**: Container orchestration
+- **Service mesh**: Istio integration
+- **Observability**: Distributed tracing
+- **Auto-scaling**: Dynamic resource allocation
clean_repo/Dockerfile
ADDED
@@ -0,0 +1,17 @@
+# Use the official Python image
+FROM python:3.10-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy requirements.txt into the container (if present)
+COPY requirements.txt .
+
+# Install the dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the entire source code into the container
+COPY . .
+
+# Run the application (replace app.py with your main entry file)
+CMD ["python", "app.py"]
clean_repo/RAG-architecture.md
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# RAG Architecture - Personal Diary Chatbot
|
| 2 |
+
|
| 3 |
+
## 🏗️ Tổng quan kiến trúc RAG
|
| 4 |
+
|
| 5 |
+
Kiến trúc RAG (Retrieval-Augmented Generation) trong dự án này được thiết kế để cung cấp khả năng tìm kiếm và trả lời thông minh dựa trên dữ liệu nhật ký cá nhân của người dùng.
|
| 6 |
+
|
| 7 |
+
## 🔄 Luồng xử lý RAG
|
| 8 |
+
|
| 9 |
+
```
|
| 10 |
+
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
| 11 |
+
│ Input Query │───►│ Query │───►│ Vector │
|
| 12 |
+
│ (User Question)│ │ Processing │ │ Search │
|
| 13 |
+
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
| 14 |
+
│
|
| 15 |
+
▼
|
| 16 |
+
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
| 17 |
+
│ Final Answer │◄───│ Answer │◄───│ Context │
|
| 18 |
+
│ (Response) │ │ Generation │ │ Retrieval │
|
| 19 |
+
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
## 📊 Chi tiết các thành phần
|
| 23 |
+
|
| 24 |
+
### 1. Data Ingestion & Indexing
|
| 25 |
+
|
| 26 |
+
#### 1.1 Document Loading
|
| 27 |
+
- **Input formats**: PDF, DOCX, TXT
|
| 28 |
+
- **Processing**: Text extraction, cleaning, normalization
|
| 29 |
+
- **Output**: Structured text data
|
| 30 |
+
|
| 31 |
+
#### 1.2 Text Chunking
|
| 32 |
+
```python
|
| 33 |
+
# Chunking strategy
|
| 34 |
+
chunk_size = 1000 # characters
|
| 35 |
+
chunk_overlap = 200 # characters
|
| 36 |
+
chunking_method = "recursive_character_splitter"
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
#### 1.3 Embedding Generation
|
| 40 |
+
- **Model**: Google Universal Sentence Encoder (USE)
|
| 41 |
+
- **Vector dimension**: 512
|
| 42 |
+
- **Normalization**: L2 normalization
|
| 43 |
+
- **Storage**: ChromaDB vector database
|
| 44 |
+
|
| 45 |
+
### 2. Vector Database Architecture
|
| 46 |
+
|
| 47 |
+
#### 2.1 ChromaDB Configuration
|
| 48 |
+
```python
|
| 49 |
+
# Database settings
|
| 50 |
+
collection_name = f"user_{user_id}_diary"
|
| 51 |
+
metadata = {
|
| 52 |
+
"user_id": user_id,
|
| 53 |
+
"source": "diary_entry",
|
| 54 |
+
"date": entry_date,
|
| 55 |
+
"chunk_id": chunk_id
|
| 56 |
+
}
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
#### 2.2 Index Structure
|
| 60 |
+
- **Primary key**: `user_id + chunk_id`
|
| 61 |
+
- **Vector index**: HNSW (Hierarchical Navigable Small World)
|
| 62 |
+
- **Distance metric**: Cosine similarity
|
| 63 |
+
- **Sharding**: Per-user collections
|
| 64 |
+
|
| 65 |
+
### 3. Retrieval Engine
|
| 66 |
+
|
| 67 |
+
#### 3.1 Query Processing
|
| 68 |
+
```python
|
| 69 |
+
# Query preprocessing
|
| 70 |
+
def process_query(query: str):
|
| 71 |
+
# 1. Text cleaning
|
| 72 |
+
# 2. Stop word removal
|
| 73 |
+
# 3. Lemmatization
|
| 74 |
+
# 4. Query expansion
|
| 75 |
+
return processed_query
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
#### 3.2 Vector Search
|
| 79 |
+
- **Search algorithm**: K-Nearest Neighbors (KNN)
|
| 80 |
+
- **Top-k results**: 5-10 most relevant chunks
|
| 81 |
+
- **Similarity threshold**: 0.7 (cosine similarity)
|
| 82 |
+
- **Reranking**: Semantic relevance scoring
|
| 83 |
+
|
| 84 |
+
#### 3.3 Context Assembly
|
| 85 |
+
```python
|
| 86 |
+
# Context building
|
| 87 |
+
def build_context(retrieved_chunks, query):
|
| 88 |
+
# 1. Sort by relevance score
|
| 89 |
+
# 2. Remove duplicates
|
| 90 |
+
# 3. Truncate to token limit
|
| 91 |
+
# 4. Add metadata context
|
| 92 |
+
return final_context
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
### 4. Generation Engine
|
| 96 |
+
|
| 97 |
+
#### 4.1 LLM Integration
|
| 98 |
+
- **Primary model**: OpenAI GPT-3.5/4
|
| 99 |
+
- **Fallback model**: Local model (nếu cần)
|
| 100 |
+
- **Temperature**: 0.7 (balanced creativity)
|
| 101 |
+
- **Max tokens**: 500 (response length)
|
| 102 |
+
|
| 103 |
+
#### 4.2 Prompt Engineering
|
| 104 |
+
```python
|
| 105 |
+
# System prompt template
|
| 106 |
+
SYSTEM_PROMPT = """
|
| 107 |
+
You are a helpful AI assistant that answers questions about personal diary entries.
|
| 108 |
+
Use only the provided context to answer questions.
|
| 109 |
+
If the information is not in the context, say so.
|
| 110 |
+
Be conversational but professional.
|
| 111 |
+
"""
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
#### 4.3 Response Generation
|
| 115 |
+
```python
|
| 116 |
+
# Generation pipeline
|
| 117 |
+
def generate_response(query, context, chat_history):
|
| 118 |
+
# 1. Build prompt with context
|
| 119 |
+
# 2. Call LLM API
|
| 120 |
+
# 3. Post-process response
|
| 121 |
+
# 4. Validate against context
|
| 122 |
+
# 5. Return final answer
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
## 🔧 Cấu hình kỹ thuật
|
| 126 |
+
|
| 127 |
+
### Performance Tuning
|
| 128 |
+
|
| 129 |
+
#### 1. Chunking Optimization
|
| 130 |
+
- **Optimal chunk size**: 1000 characters
|
| 131 |
+
- **Overlap ratio**: 20%
|
| 132 |
+
- **Chunking strategy**: Recursive character splitter
|
| 133 |
+
|
| 134 |
+
#### 2. Vector Search Optimization
|
| 135 |
+
- **Index type**: HNSW
|
| 136 |
+
- **Search parameters**:
|
| 137 |
+
- `ef_construction`: 200
|
| 138 |
+
- `ef_search`: 100
|
| 139 |
+
- `m`: 16
|
| 140 |
+
|
| 141 |
+
#### 3. Caching Strategy
|
| 142 |
+
- **Query cache**: Redis (in-memory)
|
| 143 |
+
- **Embedding cache**: Local file cache
|
| 144 |
+
- **Response cache**: TTL-based expiration
|
| 145 |
+

### Scalability Features

#### 1. Multi-User Support
- **User isolation**: Separate vector collections
- **Resource management**: Per-user memory limits
- **Concurrent access**: Async processing

#### 2. Horizontal Scaling
- **Load balancing**: Multiple RAG instances
- **Database sharding**: User-based distribution
- **Microservices**: Modular architecture

## 📈 Monitoring & Analytics

### 1. Performance Metrics
- **Query latency**: < 2 seconds
- **Retrieval accuracy**: > 85%
- **Generation quality**: User satisfaction score
- **System throughput**: Queries per second

### 2. Quality Assurance
- **Context relevance**: Similarity score tracking
- **Answer accuracy**: Human evaluation
- **User feedback**: Rating system
- **A/B testing**: Model comparison

## 🚀 Deployment Architecture

### 1. Development Environment
```
┌─────────────────┐    ┌─────────────────┐
│  Local Python   │    │     Local       │
│  Environment    │◄──►│    ChromaDB     │
└─────────────────┘    └─────────────────┘
```

### 2. Production Environment
```
┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
│   Load Balancer │    │   RAG Service   │    │    Vector DB    │
│     (Nginx)     │◄──►│    (FastAPI)    │◄──►│   (ChromaDB)    │
└─────────────────┘    └─────────────────┘    └─────────────────┘
                                │
                                ▼
                       ┌─────────────────┐
                       │   Redis Cache   │
                       └─────────────────┘
```

## 🔒 Security & Privacy

### 1. Data Protection
- **User isolation**: Strict separation of data
- **Encryption**: At-rest and in-transit
- **Access control**: Role-based permissions
- **Audit logging**: Complete access history

### 2. Privacy Compliance
- **GDPR compliance**: Data portability
- **Data retention**: Configurable policies
- **User consent**: Explicit permission management
- **Data anonymization**: Optional features

## 🧪 Testing Strategy

### 1. Unit Testing
- **Component testing**: Individual modules
- **Mock testing**: External API simulation
- **Coverage target**: > 90%

### 2. Integration Testing
- **End-to-end testing**: Complete RAG pipeline
- **Performance testing**: Load and stress tests
- **Security testing**: Vulnerability assessment

## 📚 Best Practices

### 1. Model Selection
- **Embedding models**: Domain-specific fine-tuning
- **LLM selection**: Cost-performance balance
- **Fallback strategies**: Graceful degradation

### 2. Data Quality
- **Input validation**: Strict data checking
- **Cleaning pipeline**: Automated preprocessing
- **Quality metrics**: Continuous monitoring

### 3. Error Handling
- **Graceful failures**: User-friendly error messages
- **Retry mechanisms**: Automatic recovery
- **Logging**: Comprehensive error tracking

## 🔮 Future Enhancements

### 1. Advanced Features
- **Multi-modal RAG**: Image and text processing
- **Temporal reasoning**: Time-based queries
- **Emotional analysis**: Sentiment-aware responses

### 2. Performance Improvements
- **Vector quantization**: Reduced memory usage
- **Approximate search**: Faster retrieval
- **Model distillation**: Smaller, faster models

### 3. Integration Capabilities
- **API ecosystem**: Third-party integrations
- **Mobile applications**: Native mobile support
- **Voice interface**: Speech-to-text integration
clean_repo/README.md
ADDED
@@ -0,0 +1,310 @@
# RAG Personal Diary Chatbot

## 📖 Project Description

RAG Personal Diary Chatbot is an intelligent chatbot application that uses RAG (Retrieval-Augmented Generation) architecture to interact with users' personal diaries. The application allows users to ask questions about diary content and receive accurate answers based on actual data.

## ✨ Key Features

## 🏗️ System Architecture

```
┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
│   Streamlit UI  │    │     FastAPI     │    │     Vector      │
│   (Frontend)    │◄──►│     Backend     │◄──►│    Database     │
└─────────────────┘    └─────────────────┘    └─────────────────┘
                                │
                                ▼
                       ┌─────────────────┐
                       │   RAG Engine    │
                       │   (LLM +        │
                       │   Retrieval)    │
                       └─────────────────┘
```

## 🚀 Installation and Setup

### System Requirements

### Install Dependencies
```bash
# Create virtual environment
python -m venv .venv

# Activate virtual environment
# Windows
.venv\Scripts\activate
# Linux/Mac
source .venv/bin/activate

# Install packages
pip install -r requirements.txt
```

### Environment Configuration

Create a `.env` file in the project root directory with the following structure:

```env
# API Keys
OPENAI_API_KEY=your_openai_api_key_here
GOOGLE_API_KEY=your_google_api_key_here
ANTHROPIC_API_KEY=your_anthropic_api_key_here

# Database Configuration
DATABASE_URL=sqlite:///./user_database/auth.db
VECTOR_DB_PATH=./VectorDB

# Model Configuration
EMBEDDING_MODEL=google-universal-sentence-encoder
LLM_MODEL=gpt-3.5-turbo
CHUNK_SIZE=1000
CHUNK_OVERLAP=200

# Server Configuration
RAG_SERVICE_PORT=8001
STREAMLIT_PORT=8501
FASTAPI_PORT=8000

# Security
SECRET_KEY=your_secret_key_here
JWT_SECRET=your_jwt_secret_here

# Logging
LOG_LEVEL=INFO
LOG_FILE=./logs/app.log

# Vector Database
CHROMA_DB_PATH=./VectorDB
PERSIST_DIRECTORY=./VectorDB

# File Processing
SUPPORTED_FORMATS=pdf,docx,txt,md
MAX_FILE_SIZE=10485760
TEMP_DIR=./temp

# RAG Configuration
TOP_K_RESULTS=5
SIMILARITY_THRESHOLD=0.7
MAX_TOKENS=4096
TEMPERATURE=0.7
```
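
As a minimal illustration (assuming the `python-dotenv` package, which this template pairs with naturally), the application side could load these values like so:

```python
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the project root
openai_key = os.getenv("OPENAI_API_KEY")
chunk_size = int(os.getenv("CHUNK_SIZE", "1000"))
```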

**Important Notes:**

### Run the Application

#### 1. Start RAG Service
```bash
python start_rag_service.py
```
Service will run at: http://127.0.0.1:8001

#### 2. Start Streamlit UI
```bash
cd src/streamlit_app
streamlit run interface.py
```
UI will run at: http://localhost:8501

## 📁 Directory Structure

```
RAG-Personal-Diary-Chatbot/
├── src/
│   ├── Indexingstep/              # Data indexing pipeline
│   ├── Retrivel_And_Generation/   # RAG engine
│   ├── rag_service/               # FastAPI backend
│   ├── streamlit_app/             # User interface
│   └── VectorDB/                  # Vector database
├── notebook/                      # Jupyter notebooks
├── tests/                         # Unit tests
├── images/                        # Documentation images
├── start_rag_service.py           # Service startup script
├── .env                           # Environment variables (create from template)
├── env_template.txt               # Environment variables template
└── README.md
```

## 🔧 Configuration

### Vector Database

### AI Models

## 📊 Performance

## 🧪 Testing

```bash
# Run all tests
python -m pytest tests/

# Run specific test
python -m pytest tests/test_rag_system.py
```

## 🤝 Contributing

1. Fork the project
2. Create a feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request

## 📝 License

This project is distributed under the MIT License. See the `LICENSE` file for more details.

## 📞 Contact

## 🙏 Acknowledgments

## 📖 Project Description

RAG Personal Diary Chatbot is an intelligent chatbot application that leverages Retrieval-Augmented Generation (RAG) architecture to interact with users' personal diaries. Users can ask questions about their diary content and receive accurate, context-based answers.

## ✨ Key Features

- **Diary Indexing**: Automatically processes and indexes diary files (PDF, DOCX, TXT)
- **Semantic Search**: Uses a vector database for semantic search
- **AI Chatbot**: Natural interaction with diary data
- **User Isolation**: Each user has a separate vector database
- **Web Interface**: Easy-to-use Streamlit UI
- **REST API**: FastAPI backend for integration

## 🏗️ System Architecture

```
┌───────────────┐     ┌───────────────┐     ┌───────────────┐
│ Streamlit UI  │◄──►│    FastAPI     │◄──►│   Vector DB    │
│  (Frontend)   │     │    Backend    │     │   (ChromaDB)  │
└───────────────┘     └───────────────┘     └───────────────┘
                              │
                              ▼
                      ┌───────────────┐
                      │  RAG Engine   │
                      │  (LLM +       │
                      │  Retrieval)   │
                      └───────────────┘
```

## 🚀 Installation and Setup

### System Requirements

- Python 3.8+

### Install Dependencies

```bash
# Create virtual environment
python -m venv .venv

# Activate virtual environment
# Windows
.venv\Scripts\activate
# Linux/Mac
source .venv/bin/activate

# Install packages
pip install -r requirements.txt
```

### Environment Configuration

Create a `.env` file in the project root directory with the following structure:

```env
# Google API Configuration for RAG System
GOOGLE_API_KEY=[Google API key]

# Database Configuration
DATABASE_PATH=./src/streamlit_app/backend/diary.db

# Vector Database Configuration
VECTOR_DB_PATH=./src/Indexingstep/diary_vector_db_enhanced
COLLECTION_NAME=diary_entries

# RAG Configuration
EMBEDDING_MODEL=models/embedding-001
CHAT_MODEL=gemini-2.5-flash
```

**Important Notes:**
- Replace all placeholder values with your actual API keys and configuration
- Keep your `.env` file secure and never commit it to version control
- The `.env` file is already included in `.gitignore`
- Use `env_template.txt` as a reference to create your `.env` file

### Run the Application

```bash
# Start the RAG backend service
python start_rag_service.py

# Start the Streamlit UI
streamlit run src/streamlit_app/interface.py
```

## 📁 Directory Structure

```
RAG-Personal-Diary-Chatbot/
├── src/
│   ├── Indexingstep/              # Data indexing pipeline
│   ├── Retrivel_And_Generation/   # RAG engine
│   ├── rag_service/               # FastAPI backend
│   ├── streamlit_app/             # User interface
│   └── VectorDB/                  # Vector database
├── notebook/                      # Jupyter notebooks
├── tests/                         # Unit tests
├── images/                        # Documentation images
├── start_rag_service.py           # Service startup script
├── .env                           # Environment variables (create from template)
├── env_template.txt               # Environment variables template
└── README.md
```

## 🔧 Configuration

### Vector Database
- **ChromaDB**: Main database for vector embeddings
- **Chunk size**: 1000 characters (customizable)
- **Overlap**: 200 characters between chunks

### AI Models
- **Embedding**: Google's Universal Sentence Encoder
- **LLM**: Google Gemini (can be replaced with other models)

## 📊 Performance

- **Processing time**: ~2-5 seconds per question
- **Accuracy**: 85-95% depending on data quality
- **Scalability**: Supports thousands of diaries

## 🤝 Contributing

1. Fork the project
2. Create a feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request

## 📞 Contact

- **Author**: [huytrao]
- **Email**: [traohuy098@gmail.com]
- **GitHub**: [github.com/huytrao]

## 🙏 Acknowledgments

- Google for Gemini models
- Google for Universal Sentence Encoder
- ChromaDB team for vector database
- FastAPI and Streamlit communities
- RAG architecture
clean_repo/VectorDB/user_1_vector_db/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6e895be71c0752f44743806cd7439624649d5e41500865e79c77e07c6d1dca87
size 163840
clean_repo/app.py
ADDED
@@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""
Start RAG Service for Personal Diary Chatbot
"""
import subprocess
import sys
import os
from pathlib import Path
import time

def check_requirements():
    """Check if required packages are installed."""
    required_packages = ['fastapi', 'uvicorn']
    missing_packages = []

    for package in required_packages:
        try:
            __import__(package)
        except ImportError:
            missing_packages.append(package)

    if missing_packages:
        print(f"❌ Missing packages: {', '.join(missing_packages)}")
        print(f"Install with: pip install {' '.join(missing_packages)}")
        return False

    return True

def setup_environment():
    """Set up the environment and directories."""
    # Ensure the VectorDB directory exists
    vector_db_dir = Path("src/VectorDB")
    vector_db_dir.mkdir(parents=True, exist_ok=True)
    print(f"📁 Vector DB directory: {vector_db_dir.absolute()}")

    # Check for the .env file
    env_file = Path("src/Indexingstep/.env")
    if env_file.exists():
        print(f"✅ Environment file found: {env_file}")
    else:
        print(f"⚠️ Environment file not found: {env_file}")
        print("Make sure GOOGLE_API_KEY is set in the environment")

def start_service():
    """Start the RAG FastAPI service in the background."""
    if not check_requirements():
        return

    setup_environment()

    service_file = Path("src/rag_service/main.py")

    if not service_file.exists():
        print(f"❌ Service file not found: {service_file}")
        print("Please create the RAG service file first")
        return

    print("🚀 Starting RAG Service...")
    print("📍 Service URL: http://0.0.0.0:8001")
    print("📖 API Docs: http://0.0.0.0:8001/docs")
    print("💾 Vector databases will be stored in: src/VectorDB/")
    print("\nPress Ctrl+C to stop the service")
    print("-" * 50)

    try:
        # Change to the project root directory
        os.chdir(Path(__file__).parent)

        # Start the service in the background
        process = subprocess.Popen([
            sys.executable, "-m", "uvicorn",
            "src.rag_service.main:app",
            "--host", "0.0.0.0",
            "--port", "8001",
            "--reload"
        ])
        print(f"🔄 RAG Service running in background (PID: {process.pid})")
        return process
    except Exception as e:
        print(f"❌ Error starting service: {e}")
        return None

def start_streamlit():
    # Start the Streamlit UI on port 7860 (the default for Spaces)
    os.system("streamlit run src/streamlit_app/interface.py --server.port 7860")

if __name__ == "__main__":
    start_service()
    time.sleep(3)  # give the backend a moment to boot before the UI starts
    start_streamlit()
clean_repo/clean_repo/.env
ADDED
@@ -0,0 +1,12 @@
# Google API Configuration for RAG System
GOOGLE_API_KEY=AIzaSyAZQN21CjLySEybT6vOYDCz4V_e85gD42k

# Database Configuration
DATABASE_PATH=./src/streamlit_app/backend/diary.db

# Vector Database Configuration
VECTOR_DB_PATH=./src/Indexingstep/diary_vector_db_enhanced
COLLECTION_NAME=diary_entries

# RAG Configuration
EMBEDDING_MODEL=models/embedding-001
clean_repo/clean_repo/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
clean_repo/clean_repo/.gitignore
ADDED
@@ -0,0 +1,120 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
./venv/
# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# IDE
.vscode/
.idea/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

# Project specific
data/
models/
logs/
*.pkl
*.model
clean_repo/clean_repo/Backend.md
ADDED
@@ -0,0 +1,65 @@
# Backend Architecture - Personal Diary Chatbot

## Backend Overview

The project's backend is built on FastAPI and provides a RESTful API for diary processing, search, and interaction with the RAG chatbot. The system follows a microservices-style architecture designed for high scalability.

## 🏛️ Overall Architecture

```
┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
│    Frontend     │    │   API Gateway   │    │  Core Services  │
│   (Streamlit)   │◄──►│    (FastAPI)    │◄──►│   (RAG Engine)  │
└─────────────────┘    └─────────────────┘    └─────────────────┘
                                │
                                ▼
                       ┌─────────────────┐
                       │   Data Layer    │
                       │   (Vector DB)   │
                       └─────────────────┘
```

## 🔧 Backend Directory Structure

```
src/
├── rag_service/                    # FastAPI service
│   ├── main.py                     # Main application entry point
│   ├── __init__.py
│   └── __pycache__/
├── Indexingstep/                   # Data processing pipeline
│   ├── pipeline.py                 # Main indexing pipeline
│   ├── dataloading.py              # Document loading utilities
│   ├── diary_text_splitter.py      # Text chunking logic
│   ├── embedding_and_storing.py    # Vector embedding & storage
│   ├── database_utils.py           # Database operations
│   └── indexing_pipeline.py        # Pipeline orchestration
├── Retrivel_And_Generation/        # RAG core engine
│   ├── Retrieval_And_Generator.py  # Main RAG system
│   └── __init__.py
├── VectorDB/                       # Vector database storage
└── streamlit_app/                  # Frontend application
    ├── backend/                    # Backend utilities for UI
    ├── user_auth.py                # Authentication system
    ├── rag_client.py               # RAG service client
    └── interface.py                # Main UI interface
```
## 🔮 Future Enhancements

### 1. Microservices Architecture
- **User Service**: Dedicated user management
- **Document Service**: Document processing pipeline
- **Search Service**: Vector search optimization
- **Chat Service**: Conversation management

### 2. Advanced Features
- **Real-time synchronization**: WebSocket support
- **Multi-language support**: Internationalization
- **Advanced analytics**: User behavior tracking
- **Machine learning**: Continuous model improvement

### 3. Infrastructure Improvements
- **Kubernetes deployment**: Container orchestration
- **Service mesh**: Istio integration
- **Observability**: Distributed tracing
- **Auto-scaling**: Dynamic resource allocation
clean_repo/clean_repo/Dockerfile
ADDED
@@ -0,0 +1,17 @@
# Use the official Python image
FROM python:3.10-slim

# Set the working directory inside the container
WORKDIR /app

# Copy requirements.txt into the container (if present)
COPY requirements.txt .

# Install the dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the full source tree into the container
COPY . .

# Run the application (app.py is the entry point; change it if yours differs)
CMD ["python", "app.py"]
clean_repo/clean_repo/RAG-architecture.md
ADDED
@@ -0,0 +1,254 @@
# RAG Architecture - Personal Diary Chatbot

## 🏗️ RAG Architecture Overview

The RAG (Retrieval-Augmented Generation) architecture in this project is designed to provide intelligent search and question answering over each user's personal diary data.

## 🔄 RAG Processing Flow

```
┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
│   Input Query   │───►│     Query       │───►│     Vector      │
│ (User Question) │    │   Processing    │    │     Search      │
└─────────────────┘    └─────────────────┘    └─────────────────┘
                                                       │
                                                       ▼
┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
│  Final Answer   │◄───│     Answer      │◄───│     Context     │
│   (Response)    │    │   Generation    │    │    Retrieval    │
└─────────────────┘    └─────────────────┘    └─────────────────┘
```

## 📊 Component Details

### 1. Data Ingestion & Indexing

#### 1.1 Document Loading
- **Input formats**: PDF, DOCX, TXT
- **Processing**: Text extraction, cleaning, normalization
- **Output**: Structured text data

#### 1.2 Text Chunking
```python
# Chunking strategy
chunk_size = 1000        # characters
chunk_overlap = 200      # characters
chunking_method = "recursive_character_splitter"
```

#### 1.3 Embedding Generation
- **Model**: Google Universal Sentence Encoder (USE)
- **Vector dimension**: 512
- **Normalization**: L2 normalization
- **Storage**: ChromaDB vector database

### 2. Vector Database Architecture

#### 2.1 ChromaDB Configuration
```python
# Database settings
collection_name = f"user_{user_id}_diary"
metadata = {
    "user_id": user_id,
    "source": "diary_entry",
    "date": entry_date,
    "chunk_id": chunk_id
}
```

#### 2.2 Index Structure
- **Primary key**: `user_id + chunk_id`
- **Vector index**: HNSW (Hierarchical Navigable Small World)
- **Distance metric**: Cosine similarity
- **Sharding**: Per-user collections

### 3. Retrieval Engine

#### 3.1 Query Processing
```python
# Query preprocessing (sketch; steps 2-4 would plug in an NLP library)
def process_query(query: str) -> str:
    # 1. Text cleaning: lowercase and collapse whitespace
    processed_query = " ".join(query.lower().split())
    # 2. Stop word removal, 3. lemmatization, and 4. query expansion
    #    happen here before the query is embedded.
    return processed_query
```

#### 3.2 Vector Search
- **Search algorithm**: K-Nearest Neighbors (KNN)
- **Top-k results**: 5-10 most relevant chunks
- **Similarity threshold**: 0.7 (cosine similarity)
- **Reranking**: Semantic relevance scoring

#### 3.3 Context Assembly
```python
# Context building (sketch)
def build_context(retrieved_chunks, query, char_limit: int = 3000):
    # 1. Sort by relevance score (highest first)
    ranked = sorted(retrieved_chunks, key=lambda c: c["score"], reverse=True)
    # 2. Remove duplicate texts while preserving order
    unique = list(dict.fromkeys(c["text"] for c in ranked))
    # 3. Truncate to the context limit; 4. metadata context is added upstream
    final_context = "\n".join(unique)[:char_limit]
    return final_context
```

### 4. Generation Engine

#### 4.1 LLM Integration
- **Primary model**: OpenAI GPT-3.5/4
- **Fallback model**: Local model (if needed)
- **Temperature**: 0.7 (balanced creativity)
- **Max tokens**: 500 (response length)

#### 4.2 Prompt Engineering
```python
# System prompt template
SYSTEM_PROMPT = """
You are a helpful AI assistant that answers questions about personal diary entries.
Use only the provided context to answer questions.
If the information is not in the context, say so.
Be conversational but professional.
"""
```

#### 4.3 Response Generation
```python
# Generation pipeline (sketch; call_llm stands in for the LLM client)
def generate_response(query, context, chat_history):
    # 1. Build prompt with context
    prompt = f"{SYSTEM_PROMPT}\nContext:\n{context}\nQuestion: {query}"
    # 2. Call LLM API; 3. post-process; 4. validate against context
    final_answer = call_llm(prompt, history=chat_history)
    # 5. Return final answer
    return final_answer
```

## 🔧 Technical Configuration

### Performance Tuning

#### 1. Chunking Optimization
- **Optimal chunk size**: 1000 characters
- **Overlap ratio**: 20%
- **Chunking strategy**: Recursive character splitter

#### 2. Vector Search Optimization
- **Index type**: HNSW
- **Search parameters**:
  - `ef_construction`: 200
  - `ef_search`: 100
  - `m`: 16

#### 3. Caching Strategy
- **Query cache**: Redis (in-memory)
- **Embedding cache**: Local file cache
- **Response cache**: TTL-based expiration

### Scalability Features

#### 1. Multi-User Support
- **User isolation**: Separate vector collections
- **Resource management**: Per-user memory limits
- **Concurrent access**: Async processing

#### 2. Horizontal Scaling
- **Load balancing**: Multiple RAG instances
- **Database sharding**: User-based distribution
- **Microservices**: Modular architecture

## 📈 Monitoring & Analytics

### 1. Performance Metrics
- **Query latency**: < 2 seconds
- **Retrieval accuracy**: > 85%
- **Generation quality**: User satisfaction score
- **System throughput**: Queries per second

### 2. Quality Assurance
- **Context relevance**: Similarity score tracking
- **Answer accuracy**: Human evaluation
- **User feedback**: Rating system
- **A/B testing**: Model comparison

## 🚀 Deployment Architecture

### 1. Development Environment
```
┌─────────────────┐    ┌─────────────────┐
│  Local Python   │    │     Local       │
│  Environment    │◄──►│    ChromaDB     │
└─────────────────┘    └─────────────────┘
```

### 2. Production Environment
```
┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
│   Load Balancer │    │   RAG Service   │    │    Vector DB    │
│     (Nginx)     │◄──►│    (FastAPI)    │◄──►│   (ChromaDB)    │
└─────────────────┘    └─────────────────┘    └─────────────────┘
                                │
                                ▼
                       ┌─────────────────┐
                       │   Redis Cache   │
                       └─────────────────┘
```

## 🔒 Security & Privacy

### 1. Data Protection
- **User isolation**: Strict separation of data
- **Encryption**: At-rest and in-transit
- **Access control**: Role-based permissions
- **Audit logging**: Complete access history

### 2. Privacy Compliance
- **GDPR compliance**: Data portability
- **Data retention**: Configurable policies
- **User consent**: Explicit permission management
- **Data anonymization**: Optional features

## 🧪 Testing Strategy

### 1. Unit Testing
- **Component testing**: Individual modules
- **Mock testing**: External API simulation
- **Coverage target**: > 90%

### 2. Integration Testing
- **End-to-end testing**: Complete RAG pipeline
- **Performance testing**: Load and stress tests
- **Security testing**: Vulnerability assessment

## 📚 Best Practices

### 1. Model Selection
- **Embedding models**: Domain-specific fine-tuning
- **LLM selection**: Cost-performance balance
- **Fallback strategies**: Graceful degradation

### 2. Data Quality
- **Input validation**: Strict data checking
- **Cleaning pipeline**: Automated preprocessing
- **Quality metrics**: Continuous monitoring

### 3. Error Handling
- **Graceful failures**: User-friendly error messages
- **Retry mechanisms**: Automatic recovery
- **Logging**: Comprehensive error tracking

## 🔮 Future Enhancements

### 1. Advanced Features
- **Multi-modal RAG**: Image and text processing
- **Temporal reasoning**: Time-based queries
- **Emotional analysis**: Sentiment-aware responses

### 2. Performance Improvements
- **Vector quantization**: Reduced memory usage
- **Approximate search**: Faster retrieval
- **Model distillation**: Smaller, faster models

### 3. Integration Capabilities
- **API ecosystem**: Third-party integrations
- **Mobile applications**: Native mobile support
- **Voice interface**: Speech-to-text integration
clean_repo/clean_repo/README.md
ADDED
@@ -0,0 +1,310 @@
# RAG Personal Diary Chatbot

## 📖 Project Description

RAG Personal Diary Chatbot is an intelligent chatbot application that uses RAG (Retrieval-Augmented Generation) architecture to interact with users' personal diaries. The application allows users to ask questions about diary content and receive accurate answers based on actual data.

## ✨ Key Features

## 🏗️ System Architecture

```
┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
│   Streamlit UI  │    │     FastAPI     │    │     Vector      │
│   (Frontend)    │◄──►│     Backend     │◄──►│    Database     │
└─────────────────┘    └─────────────────┘    └─────────────────┘
                                │
                                ▼
                       ┌─────────────────┐
                       │   RAG Engine    │
                       │   (LLM +        │
                       │   Retrieval)    │
                       └─────────────────┘
```

## 🚀 Installation and Setup

### System Requirements

### Install Dependencies
```bash
# Create virtual environment
python -m venv .venv

# Activate virtual environment
# Windows
.venv\Scripts\activate
# Linux/Mac
source .venv/bin/activate

# Install packages
pip install -r requirements.txt
```

### Environment Configuration

Create a `.env` file in the project root directory with the following structure:

```env
# API Keys
OPENAI_API_KEY=your_openai_api_key_here
GOOGLE_API_KEY=your_google_api_key_here
ANTHROPIC_API_KEY=your_anthropic_api_key_here

# Database Configuration
DATABASE_URL=sqlite:///./user_database/auth.db
VECTOR_DB_PATH=./VectorDB

# Model Configuration
EMBEDDING_MODEL=google-universal-sentence-encoder
LLM_MODEL=gpt-3.5-turbo
CHUNK_SIZE=1000
CHUNK_OVERLAP=200

# Server Configuration
RAG_SERVICE_PORT=8001
STREAMLIT_PORT=8501
FASTAPI_PORT=8000

# Security
SECRET_KEY=your_secret_key_here
JWT_SECRET=your_jwt_secret_here

# Logging
LOG_LEVEL=INFO
LOG_FILE=./logs/app.log

# Vector Database
CHROMA_DB_PATH=./VectorDB
PERSIST_DIRECTORY=./VectorDB

# File Processing
SUPPORTED_FORMATS=pdf,docx,txt,md
MAX_FILE_SIZE=10485760
TEMP_DIR=./temp

# RAG Configuration
TOP_K_RESULTS=5
SIMILARITY_THRESHOLD=0.7
MAX_TOKENS=4096
TEMPERATURE=0.7
```

**Important Notes:**

### Run the Application

#### 1. Start RAG Service
```bash
python start_rag_service.py
```
Service will run at: http://127.0.0.1:8001

#### 2. Start Streamlit UI
```bash
cd src/streamlit_app
streamlit run interface.py
```
UI will run at: http://localhost:8501

## 📁 Directory Structure

```
RAG-Personal-Diary-Chatbot/
├── src/
│   ├── Indexingstep/              # Data indexing pipeline
│   ├── Retrivel_And_Generation/   # RAG engine
│   ├── rag_service/               # FastAPI backend
│   ├── streamlit_app/             # User interface
│   └── VectorDB/                  # Vector database
├── notebook/                      # Jupyter notebooks
├── tests/                         # Unit tests
├── images/                        # Documentation images
├── start_rag_service.py           # Service startup script
├── .env                           # Environment variables (create from template)
├── env_template.txt               # Environment variables template
└── README.md
```

## 🔧 Configuration

### Vector Database

### AI Models

## 📊 Performance

## 🧪 Testing

```bash
# Run all tests
python -m pytest tests/

# Run specific test
python -m pytest tests/test_rag_system.py
```

## 🤝 Contributing

1. Fork the project
2. Create a feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request

## 📝 License

This project is distributed under the MIT License. See the `LICENSE` file for more details.

## 📞 Contact

## 🙏 Acknowledgments

## 📖 Project Description

RAG Personal Diary Chatbot is an intelligent chatbot application that leverages Retrieval-Augmented Generation (RAG) architecture to interact with users' personal diaries. Users can ask questions about their diary content and receive accurate, context-based answers.

## ✨ Key Features

- **Diary Indexing**: Automatically processes and indexes diary files (PDF, DOCX, TXT)
- **Semantic Search**: Uses a vector database for semantic search
- **AI Chatbot**: Natural interaction with diary data
- **User Isolation**: Each user has a separate vector database
- **Web Interface**: Easy-to-use Streamlit UI
- **REST API**: FastAPI backend for integration

## 🏗️ System Architecture

```
┌───────────────┐     ┌───────────────┐     ┌───────────────┐
│ Streamlit UI  │◄──►│    FastAPI     │◄──►│   Vector DB    │
│  (Frontend)   │     │    Backend    │     │   (ChromaDB)  │
└───────────────┘     └───────────────┘     └───────────────┘
                              │
                              ▼
                      ┌───────────────┐
                      │  RAG Engine   │
                      │  (LLM +       │
                      │  Retrieval)   │
                      └───────────────┘
```

## 🚀 Installation and Setup

### System Requirements

- Python 3.8+

### Install Dependencies

```bash
# Create virtual environment
python -m venv .venv

# Activate virtual environment
# Windows
.venv\Scripts\activate
# Linux/Mac
source .venv/bin/activate

# Install packages
pip install -r requirements.txt
```

### Environment Configuration

Create a `.env` file in the project root directory with the following structure:

```env
# Google API Configuration for RAG System
GOOGLE_API_KEY=[Google API key]

# Database Configuration
DATABASE_PATH=./src/streamlit_app/backend/diary.db

# Vector Database Configuration
VECTOR_DB_PATH=./src/Indexingstep/diary_vector_db_enhanced
COLLECTION_NAME=diary_entries

# RAG Configuration
EMBEDDING_MODEL=models/embedding-001
CHAT_MODEL=gemini-2.5-flash
```

**Important Notes:**
- Replace all placeholder values with your actual API keys and configuration
- Keep your `.env` file secure and never commit it to version control
- The `.env` file is already included in `.gitignore`
- Use `env_template.txt` as a reference to create your `.env` file

### Run the Application

```bash
# Start the RAG backend service
python start_rag_service.py

# Start the Streamlit UI
streamlit run src/streamlit_app/interface.py
```

## 📁 Directory Structure

```
RAG-Personal-Diary-Chatbot/
├── src/
│   ├── Indexingstep/              # Data indexing pipeline
│   ├── Retrivel_And_Generation/   # RAG engine
│   ├── rag_service/               # FastAPI backend
│   ├── streamlit_app/             # User interface
│   └── VectorDB/                  # Vector database
├── notebook/                      # Jupyter notebooks
├── tests/                         # Unit tests
├── images/                        # Documentation images
├── start_rag_service.py           # Service startup script
├── .env                           # Environment variables (create from template)
├── env_template.txt               # Environment variables template
└── README.md
```

## 🔧 Configuration

### Vector Database
- **ChromaDB**: Main database for vector embeddings
- **Chunk size**: 1000 characters (customizable)
- **Overlap**: 200 characters between chunks

### AI Models
- **Embedding**: Google's Universal Sentence Encoder
- **LLM**: Google Gemini (can be replaced with other models)

## 📊 Performance

- **Processing time**: ~2-5 seconds per question
- **Accuracy**: 85-95% depending on data quality
- **Scalability**: Supports thousands of diaries

## 🤝 Contributing

1. Fork the project
2. Create a feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request

## 📞 Contact

- **Author**: [huytrao]
- **Email**: [traohuy098@gmail.com]
- **GitHub**: [github.com/huytrao]

## 🙏 Acknowledgments

- Google for Gemini models
- Google for Universal Sentence Encoder
- ChromaDB team for vector database
- FastAPI and Streamlit communities
- RAG architecture
clean_repo/clean_repo/app.py
ADDED
@@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""
Start RAG Service for Personal Diary Chatbot
"""
import subprocess
import sys
import os
from pathlib import Path
import time

def check_requirements():
    """Check if required packages are installed."""
    required_packages = ['fastapi', 'uvicorn']
    missing_packages = []

    for package in required_packages:
        try:
            __import__(package)
        except ImportError:
            missing_packages.append(package)

    if missing_packages:
        print(f"❌ Missing packages: {', '.join(missing_packages)}")
        print(f"Install with: pip install {' '.join(missing_packages)}")
        return False

    return True

def setup_environment():
    """Set up the environment and directories."""
    # Ensure the VectorDB directory exists
    vector_db_dir = Path("src/VectorDB")
    vector_db_dir.mkdir(parents=True, exist_ok=True)
    print(f"📁 Vector DB directory: {vector_db_dir.absolute()}")

    # Check for the .env file
    env_file = Path("src/Indexingstep/.env")
    if env_file.exists():
        print(f"✅ Environment file found: {env_file}")
    else:
        print(f"⚠️ Environment file not found: {env_file}")
        print("Make sure GOOGLE_API_KEY is set in the environment")

def start_service():
    """Start the RAG FastAPI service in the background."""
    if not check_requirements():
        return

    setup_environment()

    service_file = Path("src/rag_service/main.py")

    if not service_file.exists():
        print(f"❌ Service file not found: {service_file}")
        print("Please create the RAG service file first")
        return

    print("🚀 Starting RAG Service...")
    print("📍 Service URL: http://0.0.0.0:8001")
    print("📖 API Docs: http://0.0.0.0:8001/docs")
    print("💾 Vector databases will be stored in: src/VectorDB/")
    print("\nPress Ctrl+C to stop the service")
    print("-" * 50)

    try:
        # Change to the project root directory
        os.chdir(Path(__file__).parent)

        # Start the service in the background
        process = subprocess.Popen([
            sys.executable, "-m", "uvicorn",
            "src.rag_service.main:app",
            "--host", "0.0.0.0",
            "--port", "8001",
            "--reload"
        ])
        print(f"🔄 RAG Service running in background (PID: {process.pid})")
        return process
    except Exception as e:
        print(f"❌ Error starting service: {e}")
        return None

def start_streamlit():
    # Start the Streamlit UI on port 7860 (the default for Spaces)
    os.system("streamlit run src/streamlit_app/interface.py --server.port 7860")

if __name__ == "__main__":
    start_service()
    time.sleep(3)  # give the backend a moment to boot before the UI starts
    start_streamlit()
clean_repo/clean_repo/env_template.txt
ADDED
@@ -0,0 +1,51 @@
# Environment Variables Template for RAG Personal Diary Chatbot
# Copy this file to .env and fill in your actual values

# API Keys
OPENAI_API_KEY=your_openai_api_key_here
GOOGLE_API_KEY=your_google_api_key_here
ANTHROPIC_API_KEY=your_anthropic_api_key_here

# Database Configuration
DATABASE_URL=sqlite:///./user_database/auth.db
VECTOR_DB_PATH=./VectorDB

# Model Configuration
EMBEDDING_MODEL=google-universal-sentence-encoder
LLM_MODEL=gpt-3.5-turbo
CHUNK_SIZE=1000
CHUNK_OVERLAP=200

# Server Configuration
RAG_SERVICE_PORT=8001
STREAMLIT_PORT=8501
FASTAPI_PORT=8000

# Security
SECRET_KEY=your_secret_key_here
JWT_SECRET=your_jwt_secret_here

# Logging
LOG_LEVEL=INFO
LOG_FILE=./logs/app.log

# Vector Database
CHROMA_DB_PATH=./VectorDB
PERSIST_DIRECTORY=./VectorDB

# File Processing
SUPPORTED_FORMATS=pdf,docx,txt,md
MAX_FILE_SIZE=10485760
TEMP_DIR=./temp

# RAG Configuration
TOP_K_RESULTS=5
SIMILARITY_THRESHOLD=0.7
MAX_TOKENS=4096
TEMPERATURE=0.7

# Instructions:
# 1. Copy this file to .env
# 2. Replace all placeholder values with your actual configuration
# 3. Never commit .env to version control
# 4. Keep your API keys secure
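A minimal sketch of consuming a `.env` created from this template (assumes the `python-dotenv` package; variable names match the template above):

```python
# Minimal sketch: read variables from the .env created from this template.
# Assumes python-dotenv is installed; names match the template above.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory
google_api_key = os.getenv("GOOGLE_API_KEY")
chunk_size = int(os.getenv("CHUNK_SIZE", "1000"))
top_k = int(os.getenv("TOP_K_RESULTS", "5"))
```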
clean_repo/clean_repo/requirements.txt
ADDED
Binary file (5.86 kB).
clean_repo/clean_repo/src/streamlit_app.py
ADDED
@@ -0,0 +1,40 @@
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st

"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)

indices = np.linspace(0, 1, num_points)
theta = 2 * np.pi * num_turns * indices
radius = indices

x = radius * np.cos(theta)
y = radius * np.sin(theta)

df = pd.DataFrame({
    "x": x,
    "y": y,
    "idx": indices,
    "rand": np.random.randn(num_points),
})

st.altair_chart(alt.Chart(df, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    ))
clean_repo/env_template.txt
ADDED
@@ -0,0 +1,51 @@
# Environment Variables Template for RAG Personal Diary Chatbot
# Copy this file to .env and fill in your actual values

# API Keys
OPENAI_API_KEY=your_openai_api_key_here
GOOGLE_API_KEY=your_google_api_key_here
ANTHROPIC_API_KEY=your_anthropic_api_key_here

# Database Configuration
DATABASE_URL=sqlite:///./user_database/auth.db
VECTOR_DB_PATH=./VectorDB

# Model Configuration
EMBEDDING_MODEL=google-universal-sentence-encoder
LLM_MODEL=gpt-3.5-turbo
CHUNK_SIZE=1000
CHUNK_OVERLAP=200

# Server Configuration
RAG_SERVICE_PORT=8001
STREAMLIT_PORT=8501
FASTAPI_PORT=8000

# Security
SECRET_KEY=your_secret_key_here
JWT_SECRET=your_jwt_secret_here

# Logging
LOG_LEVEL=INFO
LOG_FILE=./logs/app.log

# Vector Database
CHROMA_DB_PATH=./VectorDB
PERSIST_DIRECTORY=./VectorDB

# File Processing
SUPPORTED_FORMATS=pdf,docx,txt,md
MAX_FILE_SIZE=10485760
TEMP_DIR=./temp

# RAG Configuration
TOP_K_RESULTS=5
SIMILARITY_THRESHOLD=0.7
MAX_TOKENS=4096
TEMPERATURE=0.7

# Instructions:
# 1. Copy this file to .env
# 2. Replace all placeholder values with your actual configuration
# 3. Never commit .env to version control
# 4. Keep your API keys secure
clean_repo/images/DIAGRAM-RAG-diary.png
ADDED

clean_repo/notebook/RAG-test.ipynb
ADDED
The diff for this file is too large to render.

clean_repo/notebook/exploration.ipynb
ADDED
File without changes

clean_repo/requirements.txt
ADDED
Binary file (5.86 kB).
clean_repo/src/Indexingstep/Datasplitting.py
ADDED
@@ -0,0 +1,44 @@
from langchain_text_splitters import CharacterTextSplitter

class DataSplitting:
    def __init__(self, chunk_size=1000, chunk_overlap=200, separator="\n\n"):
        """
        Initialize the DataSplitting class.

        Args:
            chunk_size (int): Maximum size of each chunk
            chunk_overlap (int): Number of characters to overlap between chunks
            separator (str): Character(s) to split on
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separator = separator
        self.text_splitter = CharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separator=self.separator
        )

    def split_text(self, text):
        """
        Split the input text into chunks.

        Args:
            text (str): The text to be split

        Returns:
            list: List of text chunks
        """
        return self.text_splitter.split_text(text)

    def split_documents(self, documents):
        """
        Split documents into chunks.

        Args:
            documents (list): List of documents to be split

        Returns:
            list: List of document chunks
        """
        return self.text_splitter.split_documents(documents)
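A minimal usage sketch for this class (assumes it runs from `src/Indexingstep/` so the module imports directly; the sample text is illustrative):

```python
# Minimal usage sketch for DataSplitting; the sample text is illustrative.
from Datasplitting import DataSplitting

splitter = DataSplitting(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_text("First paragraph.\n\nSecond paragraph.\n\nThird paragraph.")
print(len(chunks), "chunks")
```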
clean_repo/src/Indexingstep/database_utils.py
ADDED
@@ -0,0 +1,140 @@
"""
Database utilities and context managers.
"""

import sqlite3
import os
from contextlib import contextmanager
from typing import Generator
import logging

logger = logging.getLogger(__name__)


@contextmanager
def open_db(db_path: str) -> Generator[sqlite3.Connection, None, None]:
    """
    Context manager for database connections.

    Args:
        db_path: Path to the SQLite database

    Yields:
        Database connection
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row
        yield conn
    except Exception as e:
        if conn:
            conn.rollback()
        logger.error(f"Database error with {db_path}: {e}")
        raise
    finally:
        if conn:
            conn.close()


def ensure_database_exists(db_path: str, user_id: int) -> None:
    """
    Ensure user-specific database exists with proper schema.

    Args:
        db_path: Path to the database file
        user_id: User ID for default value
    """
    if os.path.exists(db_path):
        return

    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(db_path), exist_ok=True)

    with open_db(db_path) as conn:
        cursor = conn.cursor()

        # Create table schema
        cursor.execute(f"""
            CREATE TABLE IF NOT EXISTS diary_entries (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                user_id INTEGER NOT NULL DEFAULT {user_id},
                date TEXT NOT NULL,
                content TEXT NOT NULL,
                tags TEXT DEFAULT '',
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        # Create index
        cursor.execute("""
            CREATE INDEX IF NOT EXISTS idx_user_date ON diary_entries(user_id, date)
        """)

        conn.commit()

    logger.info(f"Created user database: {db_path}")


def migrate_user_data(source_db_path: str, target_db_path: str, user_id: int) -> int:
    """
    Migrate user data from shared database to user-specific database.

    Args:
        source_db_path: Path to source database
        target_db_path: Path to target database
        user_id: User ID to migrate

    Returns:
        Number of entries migrated
    """
    if not os.path.exists(source_db_path):
        return 0

    migrated_count = 0

    try:
        with open_db(source_db_path) as source_conn:
            with open_db(target_db_path) as target_conn:
                source_cursor = source_conn.cursor()
                target_cursor = target_conn.cursor()

                # Check if shared DB has user_id column
                source_cursor.execute("PRAGMA table_info(diary_entries)")
                columns = [col[1] for col in source_cursor.fetchall()]

                if 'user_id' in columns:
                    # Migrate specific user data
                    source_cursor.execute("""
                        SELECT date, content, tags, created_at
                        FROM diary_entries
                        WHERE user_id = ?
                    """, (user_id,))
                else:
                    # If no user_id column, migrate all data to user 1 only
                    if user_id == 1:
                        source_cursor.execute("""
                            SELECT date, content, COALESCE(tags, ''), created_at
                            FROM diary_entries
                        """)
                    else:
                        return 0

                rows = source_cursor.fetchall()

                for row in rows:
                    target_cursor.execute("""
                        INSERT OR IGNORE INTO diary_entries (user_id, date, content, tags, created_at)
                        VALUES (?, ?, ?, ?, ?)
                    """, (user_id, row[0], row[1], row[2] if len(row) > 2 else '', row[3] if len(row) > 3 else None))

                target_conn.commit()
                migrated_count = len(rows)

        if migrated_count > 0:
            logger.info(f"Migrated {migrated_count} entries for user {user_id}")

    except Exception as e:
        logger.warning(f"Could not migrate data for user {user_id}: {e}")

    return migrated_count
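A minimal usage sketch of these helpers (same-directory import assumed; the database paths are illustrative):

```python
# Minimal usage sketch; the database paths are illustrative.
from database_utils import open_db, ensure_database_exists, migrate_user_data

ensure_database_exists("./user_database/user_7_diary.db", user_id=7)
moved = migrate_user_data("./backend/diary.db", "./user_database/user_7_diary.db", user_id=7)

with open_db("./user_database/user_7_diary.db") as conn:
    count = conn.execute("SELECT COUNT(*) FROM diary_entries").fetchone()[0]
print(f"Migrated {moved} entries; table now holds {count}")
```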
clean_repo/src/Indexingstep/dataloading.py
ADDED
@@ -0,0 +1,603 @@
import sqlite3
from typing import List, Optional, Dict, Any
from langchain.schema import Document
from langchain.document_loaders.base import BaseLoader
import logging
import re
from datetime import datetime

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DiaryDataLoader(BaseLoader):
    """
    Custom LangChain document loader for diary entries from SQLite database.
    Enhanced with detailed metadata extraction for better indexing.
    """

    def __init__(
        self,
        db_path: str,
        table_name: str = "diary_entries",
        content_column: str = "content",
        date_column: str = "date",
        tags_column: str = "tags",
        id_column: str = "id",
        user_id: int = 1
    ):
        """
        Initialize the DiaryDataLoader.

        Args:
            db_path (str): Path to the SQLite database file
            table_name (str): Name of the table containing diary entries
            content_column (str): Name of the column containing diary content
            date_column (str): Name of the column containing entry dates
            tags_column (str): Name of the column containing entry tags
            id_column (str): Name of the column containing entry IDs
            user_id (int): ID of the user for filtering diary entries
        """
        self.db_path = db_path
        self.table_name = table_name
        self.content_column = content_column
        self.date_column = date_column
        self.tags_column = tags_column
        self.id_column = id_column
        self.user_id = user_id

    def _extract_tags_from_content(self, content: str) -> List[str]:
        """
        Extract #tags from content string.

        Args:
            content: The diary content string

        Returns:
            List of tags found (without # symbol)
        """
        if not content:
            return []

        # Find all #tags in content
        tag_pattern = r'#(\w+(?:[_-]\w+)*)'
        matches = re.findall(tag_pattern, content, re.IGNORECASE)

        # Remove duplicates and return lowercase tags
        return list(set([tag.lower() for tag in matches]))

    def _extract_location_from_content(self, content: str) -> Optional[str]:
        """
        Extract location information from content using common patterns.

        Args:
            content: The diary content string

        Returns:
            Location string if found, None otherwise
        """
        if not content:
            return None

        # Common location patterns
        location_patterns = [
            r'at\s+([A-Z][a-zA-Z\s]+(?:Park|Beach|Mall|Store|Restaurant|Cafe|Office|Home|School|University))',
            r'in\s+([A-Z][a-zA-Z\s]+(?:City|District|Area|Street|Road))',
            r'went\s+to\s+([A-Z][a-zA-Z\s]+)',
            r'visited\s+([A-Z][a-zA-Z\s]+)',
            r'location:\s*([A-Za-z\s]+)',
            r'place:\s*([A-Za-z\s]+)'
        ]

        for pattern in location_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            if matches:
                return matches[0].strip()

        return None

    def _extract_people_from_content(self, content: str) -> List[str]:
        """
        Extract people/relationships mentioned in content.

        Args:
            content: The diary content string

        Returns:
            List of people/relationships mentioned
        """
        if not content:
            return []

        # Common relationship patterns
        people_patterns = [
            r'with\s+(my\s+)?(\w+(?:\s+\w+)?)',
            r'(mom|dad|mother|father|sister|brother|friend|colleague|boss|teacher)',
            r'(family|friends|team|colleagues)',
            r'met\s+([\w\s]+)',
            r'talked\s+to\s+([\w\s]+)'
        ]

        people = set()
        for pattern in people_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            for match in matches:
                if isinstance(match, tuple):
                    for part in match:
                        if part.strip():
                            people.add(part.strip().lower())
                else:
                    people.add(match.strip().lower())

        # Filter out common words that are not people
        exclude_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
        people = [p for p in people if p not in exclude_words and len(p) > 2]

        return list(people)

    def _get_day_of_week(self, date_str: str) -> str:
        """
        Get day of week from date string.

        Args:
            date_str: Date string in YYYY-MM-DD format

        Returns:
            Day of week (e.g., 'Monday', 'Tuesday', etc.)
        """
        try:
            date_obj = datetime.strptime(date_str, '%Y-%m-%d')
            return date_obj.strftime('%A')
        except (ValueError, TypeError):
            return 'Unknown'

    def _extract_content_from_structured_format(self, raw_content: str) -> tuple:
        """
        Extract actual content from structured format like:
        Title: xxxx
        Type: Text
        Content: actual content here

        Returns:
            tuple: (title, actual_content)
        """
        lines = raw_content.strip().split('\n')
        title = ""
        content = ""

        for line in lines:
            if line.startswith("Title: "):
                title = line.replace("Title: ", "").strip()
            elif line.startswith("Content: "):
                content = line.replace("Content: ", "").strip()

        # If no structured format found, return original content
        if not content:
            content = raw_content

        return title, content

    def load(self) -> List[Document]:
        """
        Load diary entries from the database and convert them to LangChain Documents.

        Returns:
            List[Document]: List of LangChain Document objects
        """
        documents = []

        try:
            # Connect to the SQLite database
            conn = sqlite3.connect(self.db_path)
            conn.row_factory = sqlite3.Row  # Enable accessing columns by name
            cursor = conn.cursor()

            # Build the SQL query with all required columns
            columns = [self.id_column, self.date_column, self.content_column, self.tags_column]

            query = f"SELECT {', '.join(columns)} FROM {self.table_name} WHERE user_id = ? ORDER BY {self.date_column} DESC"

            # Execute the query
            cursor.execute(query, (self.user_id,))
            rows = cursor.fetchall()

            logger.info(f"Loaded {len(rows)} diary entries from database")

            # Convert each row to a LangChain Document with enhanced metadata
            for row in rows:
                row_dict = dict(row) if hasattr(row, 'keys') else {
                    self.id_column: row[0],
                    self.date_column: row[1],
                    self.content_column: row[2],
                    self.tags_column: row[3] if len(row) > 3 else ""
                }

                raw_content = row_dict[self.content_column]
                date = row_dict[self.date_column]
                entry_id = row_dict.get(self.id_column, "unknown")
                db_tags = row_dict.get(self.tags_column, "")

                # Extract structured content
                title, actual_content = self._extract_content_from_structured_format(raw_content)

                # Extract comprehensive metadata
                content_tags = self._extract_tags_from_content(actual_content)
                db_tag_list = [tag.strip() for tag in db_tags.split(',') if tag.strip()] if db_tags else []
                all_tags = list(set(content_tags + db_tag_list))  # Combine and deduplicate

                location = self._extract_location_from_content(actual_content)
                people = self._extract_people_from_content(actual_content)
                day_of_week = self._get_day_of_week(date)

                # Create comprehensive metadata for the document
                metadata = {
                    "source": self.db_path,
                    "entry_id": str(entry_id),
                    "date": date,
                    "day_of_week": day_of_week,
                    "type": "diary_entry",
                    "tags": all_tags,
                    "tag_count": len(all_tags),
                    "content_length": len(actual_content),
                    "word_count": len(actual_content.split())
                }

                # Add optional metadata if available
                if title:
                    metadata["title"] = title
                if location:
                    metadata["location"] = location
                if people:
                    metadata["people"] = people
                    metadata["people_count"] = len(people)

                # Add mood/sentiment tags if present
                mood_tags = [tag for tag in all_tags if tag in ['happy', 'sad', 'excited', 'tired', 'angry', 'peaceful', 'stressed', 'grateful', 'frustrated', 'motivated']]
                if mood_tags:
                    metadata["mood_tags"] = mood_tags

                # Create Document object with actual content
                document = Document(
                    page_content=actual_content,
                    metadata=metadata
                )

                documents.append(document)

            conn.close()
            logger.info(f"Successfully converted {len(documents)} entries to Documents")

        except sqlite3.Error as e:
            logger.error(f"Database error: {e}")
            raise
        except Exception as e:
            logger.error(f"Error loading diary data: {e}")
            raise

        return documents

    def load_by_date_range(self, start_date: str, end_date: str) -> List[Document]:
        """
        Load diary entries within a specific date range.

        Args:
            start_date (str): Start date in YYYY-MM-DD format
            end_date (str): End date in YYYY-MM-DD format

        Returns:
            List[Document]: Filtered list of Document objects
        """
        documents = []

        try:
            conn = sqlite3.connect(self.db_path)
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            columns = [self.content_column, self.date_column]
            # if self.title_column:
            #     columns.append(self.title_column)

            query = f"""
                SELECT {', '.join(columns)}
                FROM {self.table_name}
                WHERE user_id = ? AND {self.date_column} BETWEEN ? AND ?
                ORDER BY {self.date_column}
            """

            cursor.execute(query, (self.user_id, start_date, end_date))
            rows = cursor.fetchall()

            logger.info(f"Loaded {len(rows)} diary entries from {start_date} to {end_date}")

            for row in rows:
                raw_content = row[self.content_column]
                date = row[self.date_column]

                # Extract structured content
                title, actual_content = self._extract_content_from_structured_format(raw_content)

                metadata = {
                    "source": self.db_path,
                    "date": date,
                    "type": "diary_entry",
                    "date_range": f"{start_date}_to_{end_date}"
                }

                # Add title to metadata if available
                if title:
                    metadata["title"] = title

                document = Document(
                    page_content=actual_content,
                    metadata=metadata
                )

                documents.append(document)

            conn.close()

        except sqlite3.Error as e:
            logger.error(f"Database error: {e}")
            raise
        except Exception as e:
            logger.error(f"Error loading diary data by date range: {e}")
            raise

        return documents

    def get_table_info(self) -> dict:
        """
        Get information about the database table structure.

        Returns:
            dict: Table information including columns and row count
        """
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()

            # Get table schema
            cursor.execute(f"PRAGMA table_info({self.table_name})")
            columns = cursor.fetchall()

            # Get row count
            cursor.execute(f"SELECT COUNT(*) FROM {self.table_name}")
            row_count = cursor.fetchone()[0]

            conn.close()

            return {
                "table_name": self.table_name,
                "columns": [{"name": col[1], "type": col[2]} for col in columns],
                "row_count": row_count
            }

        except sqlite3.Error as e:
            logger.error(f"Database error: {e}")
            raise

    def load_all_entries(self, user_id: int = None) -> List[Dict[str, Any]]:
        """
        Load all diary entries for a specific user.

        Args:
            user_id: User ID to filter entries

        Returns:
            List of diary entry dictionaries
        """
        if user_id is None:
            user_id = self.user_id

        entries = []

        try:
            conn = sqlite3.connect(self.db_path)
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            query = f"""
                SELECT id, user_id, date, content, tags, created_at
                FROM {self.table_name}
                WHERE user_id = ?
                ORDER BY date DESC, created_at DESC
            """

            cursor.execute(query, (user_id,))
            rows = cursor.fetchall()

            for row in rows:
                entries.append({
                    'id': row['id'],
                    'user_id': row['user_id'],
                    'date': row['date'],
                    'content': row['content'],
                    'tags': row['tags'] or '',
                    'created_at': row['created_at']
                })

            conn.close()
            logger.info(f"Loaded {len(entries)} entries for user {user_id}")

        except sqlite3.Error as e:
            logger.error(f"Database error loading entries: {e}")

        return entries

    def load_entries_since(self, since_date, user_id: int = None) -> List[Dict[str, Any]]:
        """
        Load diary entries since a specific date.

        Args:
            since_date: datetime object or ISO string
            user_id: User ID to filter entries

        Returns:
            List of diary entry dictionaries
        """
        if user_id is None:
            user_id = self.user_id

        entries = []

        try:
            # Convert datetime to string if needed
            if hasattr(since_date, 'isoformat'):
                since_str = since_date.isoformat()
            else:
                since_str = str(since_date)

            conn = sqlite3.connect(self.db_path)
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            query = f"""
                SELECT id, user_id, date, content, tags, created_at
                FROM {self.table_name}
                WHERE user_id = ? AND created_at > ?
                ORDER BY date DESC, created_at DESC
            """

            cursor.execute(query, (user_id, since_str))
            rows = cursor.fetchall()

            for row in rows:
                entries.append({
                    'id': row['id'],
                    'user_id': row['user_id'],
                    'date': row['date'],
                    'content': row['content'],
                    'tags': row['tags'] or '',
                    'created_at': row['created_at']
                })

            conn.close()
            logger.info(f"Loaded {len(entries)} entries since {since_str} for user {user_id}")

        except sqlite3.Error as e:
            logger.error(f"Database error loading entries since {since_date}: {e}")

        return entries

class DiaryContentPreprocessor:
    """
    Preprocessor for diary content to clean and standardize text before indexing.
    """

    def __init__(
        self,
        remove_extra_whitespace: bool = True,
        normalize_line_breaks: bool = True,
        min_content_length: int = 10,
        max_content_length: Optional[int] = None
    ):
        """
        Initialize the content preprocessor.

        Args:
            remove_extra_whitespace (bool): Remove extra spaces and tabs
            normalize_line_breaks (bool): Normalize line breaks to single newlines
            min_content_length (int): Minimum content length to keep
            max_content_length (int, optional): Maximum content length to keep
        """
        self.remove_extra_whitespace = remove_extra_whitespace
        self.normalize_line_breaks = normalize_line_breaks
        self.min_content_length = min_content_length
        self.max_content_length = max_content_length

    def preprocess_content(self, content: str) -> str:
        """
        Preprocess diary content text.

        Args:
            content (str): Raw diary content

        Returns:
            str: Preprocessed content
        """
        if not content or not isinstance(content, str):
            return ""

        processed_content = content

        # Remove extra whitespace
        if self.remove_extra_whitespace:
            processed_content = ' '.join(processed_content.split())

        # Normalize line breaks
        if self.normalize_line_breaks:
            processed_content = processed_content.replace('\r\n', '\n').replace('\r', '\n')
            # Remove multiple consecutive newlines
            processed_content = re.sub(r'\n+', '\n', processed_content)

        # Strip leading/trailing whitespace
        processed_content = processed_content.strip()

        # Check length constraints
        if len(processed_content) < self.min_content_length:
            logger.warning(f"Content too short ({len(processed_content)} chars), skipping")
            return ""

        if self.max_content_length and len(processed_content) > self.max_content_length:
            logger.warning(f"Content too long ({len(processed_content)} chars), truncating")
            processed_content = processed_content[:self.max_content_length]

        return processed_content

    def preprocess_documents(self, documents: List[Document]) -> List[Document]:
        """
        Preprocess a list of Document objects.

        Args:
            documents (List[Document]): List of documents to preprocess

        Returns:
            List[Document]: List of preprocessed documents
        """
        preprocessed_docs = []

        for doc in documents:
            processed_content = self.preprocess_content(doc.page_content)

            # Skip empty content after preprocessing
            if not processed_content:
                continue

            # Create new document with processed content
            preprocessed_doc = Document(
                page_content=processed_content,
                metadata=doc.metadata.copy()
            )

            preprocessed_docs.append(preprocessed_doc)

        logger.info(f"Preprocessed {len(documents)} documents, kept {len(preprocessed_docs)}")
        return preprocessed_docs

# Example usage
if __name__ == "__main__":
    # Initialize the loader
    loader = DiaryDataLoader(
        db_path="../streamlit_app/backend/diary.db",
        table_name="diary_entries",
        content_column="content",
        date_column="date"  # ,
        # title_column="title"
    )

    # Load all documents
    documents = loader.load()
    print(f"Loaded {len(documents)} diary entries")

    # Load documents by date range
    filtered_docs = loader.load_by_date_range("2024-01-01", "2026-12-31")
    print(f"Loaded {len(filtered_docs)} entries in range")

    # Get table information
    table_info = loader.get_table_info()
    print(f"Table info: {table_info}")

    # View document contents
    for doc in documents:
        print(f"Document content: {doc.page_content}")
clean_repo/src/Indexingstep/diary_text_splitter.py
ADDED
@@ -0,0 +1,241 @@
"""
Custom text splitter optimized for diary entries.
Handles entry-based chunking with smart splitting for long entries.
"""

from typing import List, Optional, Any, Dict
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import logging

logger = logging.getLogger(__name__)

class DiaryTextSplitter:
    """
    Custom text splitter optimized for diary entries.

    Strategy:
    1. Each diary entry = 1 chunk (for short entries)
    2. Long entries → split into 200-300 tokens with 50-token sliding window
    3. Preserve metadata across all chunks
    """

    def __init__(
        self,
        chunk_size: int = 300,    # ~200-300 tokens
        chunk_overlap: int = 50,  # ~50 tokens overlap
        length_function: callable = len,
        separators: Optional[List[str]] = None
    ):
        """
        Initialize the DiaryTextSplitter.

        Args:
            chunk_size: Maximum chunk size in characters (~300 chars ≈ 200-300 tokens)
            chunk_overlap: Overlap between chunks to preserve context
            length_function: Function to calculate text length
            separators: List of separators for splitting (sentence-aware)
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.length_function = length_function

        # Diary-optimized separators (sentence and paragraph aware)
        self.separators = separators or [
            "\n\n",  # Paragraph breaks
            "\n",    # Line breaks
            ". ",    # Sentence endings
            "! ",    # Exclamation sentences
            "? ",    # Question sentences
            "; ",    # Semicolon breaks
            ", ",    # Comma breaks
            " ",     # Word breaks
            ""       # Character breaks (last resort)
        ]

        # Initialize recursive character splitter for long entries
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=self.length_function,
            separators=self.separators
        )

    def _estimate_tokens(self, text: str) -> int:
        """
        Estimate token count from character count.
        Rule of thumb: ~4 characters per token for English text.

        Args:
            text: Input text

        Returns:
            Estimated token count
        """
        return len(text) // 4

    def _should_split_entry(self, content: str) -> bool:
        """
        Determine if a diary entry should be split into multiple chunks.

        Args:
            content: Diary entry content

        Returns:
            True if entry should be split, False otherwise
        """
        estimated_tokens = self._estimate_tokens(content)
        # Split if entry is longer than ~250 tokens (considering our 200-300 target)
        return estimated_tokens > 250

    def _create_chunk_metadata(self, original_doc: Document, chunk_index: int, total_chunks: int) -> Dict[str, Any]:
        """
        Create metadata for a chunk, preserving original metadata.

        Args:
            original_doc: Original document
            chunk_index: Index of current chunk (0-based)
            total_chunks: Total number of chunks for this entry

        Returns:
            Metadata dictionary for the chunk
        """
        chunk_metadata = original_doc.metadata.copy()

        # Add chunk-specific metadata
        chunk_metadata.update({
            "chunk_index": chunk_index,
            "total_chunks": total_chunks,
            "is_chunked": total_chunks > 1,
            "chunk_id": f"{chunk_metadata.get('entry_id', 'unknown')}_{chunk_index}"
        })

        return chunk_metadata

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Split diary documents into optimized chunks.

        Args:
            documents: List of diary entry documents

        Returns:
            List of chunked documents with preserved metadata
        """
        chunked_documents = []

        for doc in documents:
            content = doc.page_content

            # Check if entry needs splitting
            if not self._should_split_entry(content):
                # Keep as single chunk for short entries
                chunk_metadata = self._create_chunk_metadata(doc, 0, 1)

                chunked_doc = Document(
                    page_content=content,
                    metadata=chunk_metadata
                )
                chunked_documents.append(chunked_doc)

                logger.debug(f"Entry {doc.metadata.get('entry_id', 'unknown')} kept as single chunk")

            else:
                # Split long entry into multiple chunks
                text_chunks = self.text_splitter.split_text(content)
                total_chunks = len(text_chunks)

                logger.info(f"Entry {doc.metadata.get('entry_id', 'unknown')} split into {total_chunks} chunks")

                for i, chunk_text in enumerate(text_chunks):
                    chunk_metadata = self._create_chunk_metadata(doc, i, total_chunks)

                    # Add chunk position information
                    chunk_metadata["chunk_position"] = "start" if i == 0 else "end" if i == total_chunks - 1 else "middle"

                    chunked_doc = Document(
                        page_content=chunk_text,
                        metadata=chunk_metadata
                    )
                    chunked_documents.append(chunked_doc)

        logger.info(f"Split {len(documents)} entries into {len(chunked_documents)} chunks")
        return chunked_documents

    def get_chunk_stats(self, documents: List[Document]) -> Dict[str, Any]:
        """
        Get statistics about chunking results.

        Args:
            documents: List of chunked documents

        Returns:
            Dictionary with chunking statistics
        """
        total_chunks = len(documents)
        single_chunks = sum(1 for doc in documents if doc.metadata.get("total_chunks", 1) == 1)
        multi_chunks = total_chunks - single_chunks

        unique_entries = len(set(doc.metadata.get("entry_id", "unknown") for doc in documents))

        avg_chunk_size = sum(len(doc.page_content) for doc in documents) / total_chunks if total_chunks > 0 else 0
        avg_tokens = sum(self._estimate_tokens(doc.page_content) for doc in documents) / total_chunks if total_chunks > 0 else 0

        return {
            "total_chunks": total_chunks,
            "unique_entries": unique_entries,
            "single_chunk_entries": single_chunks,
            "multi_chunk_entries": multi_chunks,
            "avg_chunk_size_chars": round(avg_chunk_size, 2),
            "avg_chunk_size_tokens": round(avg_tokens, 2),
            "chunking_ratio": round(total_chunks / unique_entries, 2) if unique_entries > 0 else 0
        }

    def split_diary_entry(self, entry: Dict[str, Any]) -> List[Document]:
        """
        Split a single diary entry into document chunks.

        Args:
            entry: Dictionary containing diary entry data

        Returns:
            List of Document objects
        """
        # Create Document from entry
        content = entry.get('content', '')

        # Extract title from content if it's in structured format
        title = ""
        actual_content = content

        if content.startswith("Title: "):
            lines = content.split('\n')
            for line in lines:
                if line.startswith("Title: "):
                    title = line.replace("Title: ", "").strip()
                elif line.startswith("Content: "):
                    actual_content = line.replace("Content: ", "").strip()

        # Create metadata
        metadata = {
            "entry_id": str(entry.get('id', 'unknown')),
            "user_id": entry.get('user_id', 1),
            "date": entry.get('date', ''),
            "tags": entry.get('tags', ''),
            "created_at": entry.get('created_at', ''),
            "type": "diary_entry",
            "content_length": len(actual_content),
            "word_count": len(actual_content.split())
        }

        if title:
            metadata["title"] = title

        # Create Document
        doc = Document(
            page_content=actual_content,
            metadata=metadata
        )

        # Split using the existing split_documents method
        return self.split_documents([doc])
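A minimal usage sketch of this splitter (same-directory import assumed; the entry dict is illustrative):

```python
# Minimal usage sketch; the entry dict is illustrative.
from diary_text_splitter import DiaryTextSplitter

splitter = DiaryTextSplitter(chunk_size=300, chunk_overlap=50)
entry = {"id": 1, "user_id": 1, "date": "2024-05-01",
         "content": "Title: Long day\nContent: " + "A busy sentence. " * 60}
chunks = splitter.split_diary_entry(entry)  # long entry → several chunks
print(splitter.get_chunk_stats(chunks))
```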
clean_repo/src/Indexingstep/embedding_and_storing.py
ADDED
|
@@ -0,0 +1,499 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from langchain_chroma import Chroma
from langchain.schema import Document
from typing import List, Optional, Dict, Any, Union
import os
import logging
from pathlib import Path
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DiaryEmbeddingAndStorage:
    """
    Class for embedding diary documents and storing them in Chroma vector database.
    Enhanced with metadata filtering for ChromaDB compatibility.
    """

    def _filter_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Union[str, int, float, bool]]:
        """
        Filter metadata to only include types supported by ChromaDB.

        Args:
            metadata: Original metadata dictionary

        Returns:
            Filtered metadata with only supported types
        """
        filtered = {}

        for key, value in metadata.items():
            if value is None:
                # ChromaDB only accepts str/int/float/bool metadata values, so drop None
                continue
            elif isinstance(value, (str, int, float, bool)):
                filtered[key] = value
            elif isinstance(value, list):
                # Convert lists to comma-separated strings
                if value:  # Only if list is not empty
                    filtered[f"{key}_list"] = ", ".join(str(item) for item in value)
                    filtered[f"{key}_count"] = len(value)
            elif isinstance(value, dict):
                # Skip complex nested objects
                logger.debug(f"Skipping complex metadata field: {key}")
                continue
            else:
                # Convert other types to string
                filtered[key] = str(value)

        return filtered

    def __init__(
        self,
        user_id: int = 1,
        api_key: Optional[str] = None,
        base_persist_directory: str = "./",
        embedding_model: str = "models/embedding-001",
        chunk_size: int = 1000,
        chunk_overlap: int = 200
    ):
        """
        Initialize the embedding and storage system with user-specific database.

        Args:
            user_id (int): User ID for user-specific vector database
            api_key (str, optional): Google API key for embeddings
            base_persist_directory (str): Base directory for vector databases
            embedding_model (str): Google embedding model to use
            chunk_size (int): Size of text chunks for embedding
            chunk_overlap (int): Overlap between chunks
        """
        # Set up Google API key
        if api_key:
            os.environ["GOOGLE_API_KEY"] = api_key
        elif "GOOGLE_API_KEY" not in os.environ:
            raise ValueError("Google API key must be provided either as parameter or environment variable")

        self.user_id = user_id
        self.base_persist_directory = base_persist_directory

        # Create user-specific paths
        self.persist_directory = os.path.join(base_persist_directory, f"user_{user_id}_vector_db")
        self.collection_name = f"user_{user_id}_diary_entries"

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Initialize embedding model
        try:
            self.embeddings = GoogleGenerativeAIEmbeddings(
                model=embedding_model
            )
            # logger.info(f"Initialized Google embeddings with model: {embedding_model}")
        except Exception as e:
            logger.error(f"Failed to initialize embeddings: {e}")
            raise

        # Initialize or load existing vector store
        self.vector_store = None
        self._setup_vector_store()

    def _setup_vector_store(self):
        """Set up the Chroma vector store."""
        try:
            # Create persist directory if it doesn't exist
            Path(self.persist_directory).mkdir(parents=True, exist_ok=True)

            # Initialize Chroma vector store
            self.vector_store = Chroma(
                collection_name=self.collection_name,
                embedding_function=self.embeddings,
                persist_directory=self.persist_directory
            )

            # logger.info(f"Vector store initialized with persist directory: {self.persist_directory}")

        except Exception as e:
            logger.error(f"Failed to setup vector store: {e}")
            raise

    def embed_and_store_documents(self, documents: List[Document]) -> List[str]:
        """
        Embed and store documents in the vector database.

        Args:
            documents (List[Document]): List of LangChain Document objects

        Returns:
            List[str]: List of document IDs
        """
        if not documents:
            logger.warning("No documents provided for embedding")
            return []

        try:
            # Filter metadata for each document
            filtered_documents = []
            for doc in documents:
                filtered_metadata = self._filter_metadata(doc.metadata)
                filtered_doc = Document(
                    page_content=doc.page_content,
                    metadata=filtered_metadata
                )
                filtered_documents.append(filtered_doc)

                # Log metadata transformation for debugging
                logger.debug(f"Original metadata keys: {list(doc.metadata.keys())}")
                logger.debug(f"Filtered metadata keys: {list(filtered_metadata.keys())}")

            # Add documents to vector store
            document_ids = self.vector_store.add_documents(filtered_documents)

            # Persist the vector store (auto-persisted in new langchain-chroma)
            # self.vector_store.persist()  # Not needed in langchain-chroma 0.2+

            logger.info(f"Successfully embedded and stored {len(filtered_documents)} documents")
            return document_ids

        except Exception as e:
            logger.error(f"Failed to embed and store documents: {e}")
            raise

    def embed_and_store_texts(
        self,
        texts: List[str],
        metadatas: Optional[List[Dict[str, Any]]] = None
    ) -> List[str]:
        """
        Embed and store raw texts in the vector database.

        Args:
            texts (List[str]): List of text strings
            metadatas (List[Dict], optional): List of metadata dictionaries

        Returns:
            List[str]: List of document IDs
        """
        if not texts:
            logger.warning("No texts provided for embedding")
            return []

        try:
            # Filter metadata if provided
            filtered_metadatas = None
            if metadatas:
                filtered_metadatas = []
                for metadata in metadatas:
                    filtered_metadata = self._filter_metadata(metadata)
                    filtered_metadatas.append(filtered_metadata)

                    # Log metadata transformation for debugging
                    logger.debug(f"Original metadata keys: {list(metadata.keys())}")
                    logger.debug(f"Filtered metadata keys: {list(filtered_metadata.keys())}")

            # Add texts to vector store
            document_ids = self.vector_store.add_texts(
                texts=texts,
                metadatas=filtered_metadatas
            )

            # ChromaDB auto-persists in newer versions
            logger.info(f"Successfully embedded and stored {len(texts)} text documents")
            return document_ids

        except Exception as e:
            print(f"DEBUG: Error in embed_and_store_texts: {e}")
            print(f"DEBUG: Error type: {type(e)}")
            import traceback
            traceback.print_exc()
            logger.error(f"Failed to embed and store texts: {e}")
            raise

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        filter: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        Perform similarity search on stored documents.

        Args:
            query (str): Search query
            k (int): Number of results to return
            filter (Dict, optional): Metadata filter

        Returns:
            List[Document]: List of similar documents
        """
        try:
            results = self.vector_store.similarity_search(
                query=query,
                k=k,
                filter=filter
            )

            logger.info(f"Found {len(results)} similar documents for query: '{query[:50]}...'")
            return results

        except Exception as e:
            logger.error(f"Failed to perform similarity search: {e}")
            raise

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[Dict[str, Any]] = None
    ) -> List[tuple]:
        """
        Perform similarity search with relevance scores.

        Args:
            query (str): Search query
            k (int): Number of results to return
            filter (Dict, optional): Metadata filter

        Returns:
            List[tuple]: List of (Document, score) tuples
        """
        try:
            results = self.vector_store.similarity_search_with_score(
                query=query,
                k=k,
                filter=filter
            )

            logger.info(f"Found {len(results)} similar documents with scores for query: '{query[:50]}...'")
            return results

        except Exception as e:
            logger.error(f"Failed to perform similarity search with scores: {e}")
            raise

    def get_collection_info(self) -> Dict[str, Any]:
        """
        Get information about the vector store collection.

        Returns:
            Dict: Collection information
        """
        try:
            collection = self.vector_store._collection
            count = collection.count()

            return {
                "collection_name": self.collection_name,
                "document_count": count,
                "persist_directory": self.persist_directory
            }

        except Exception as e:
            logger.error(f"Failed to get collection info: {e}")
            return {}

    def delete_documents(self, ids: List[str]) -> bool:
        """
        Delete documents by their IDs.

        Args:
            ids (List[str]): List of document IDs to delete

        Returns:
            bool: Success status
        """
        try:
            self.vector_store.delete(ids=ids)
            # ChromaDB auto-persists in newer versions

            logger.info(f"Successfully deleted {len(ids)} documents")
            return True

        except Exception as e:
            logger.error(f"Failed to delete documents: {e}")
            return False

    def delete_documents_by_metadata(self, filter_criteria: Dict[str, Any]) -> bool:
        """
        Delete documents based on metadata criteria.

        Args:
            filter_criteria (Dict): Metadata criteria to filter documents for deletion

        Returns:
            bool: Success status
        """
        try:
            collection = self.vector_store._collection

            # Get all documents with their metadata
            all_data = collection.get(include=['metadatas'])
            ids_to_delete = []

            # Find documents that match the criteria
            for i, metadata in enumerate(all_data['metadatas']):
                match = True
                for key, value in filter_criteria.items():
                    if metadata.get(key) != value:
                        match = False
                        break

                if match:
                    ids_to_delete.append(all_data['ids'][i])

            if ids_to_delete:
                self.vector_store.delete(ids=ids_to_delete)
                # ChromaDB auto-persists in newer versions
                logger.info(f"Successfully deleted {len(ids_to_delete)} documents matching criteria: {filter_criteria}")
                return True
            else:
                logger.info(f"No documents found matching criteria: {filter_criteria}")
                return True

        except Exception as e:
            logger.error(f"Failed to delete documents by metadata: {e}")
            return False

    def clear_collection(self) -> bool:
        """
        Clear all documents from the collection.

        Returns:
            bool: Success status
        """
        try:
            # Get all document IDs and delete them
            collection = self.vector_store._collection
            all_ids = collection.get()['ids']

            if all_ids:
                self.vector_store.delete(ids=all_ids)
                # ChromaDB auto-persists in newer versions
                logger.info(f"Cleared {len(all_ids)} documents from collection")
            else:
                logger.info("Collection is already empty")

            return True

        except Exception as e:
            logger.error(f"Failed to clear collection: {e}")
            return False

    def batch_process_documents(
        self,
        documents: List[Document],
        batch_size: int = 100
    ) -> List[str]:
        """
        Process documents in batches for large datasets.

        Args:
            documents (List[Document]): List of documents to process
            batch_size (int): Size of each batch

        Returns:
            List[str]: List of all document IDs
        """
        all_ids = []

        for i in range(0, len(documents), batch_size):
            batch = documents[i:i + batch_size]
            logger.info(f"Processing batch {i//batch_size + 1}/{(len(documents)-1)//batch_size + 1}")

            try:
                batch_ids = self.embed_and_store_documents(batch)
                all_ids.extend(batch_ids)
            except Exception as e:
                logger.error(f"Failed to process batch {i//batch_size + 1}: {e}")
                continue

        logger.info(f"Completed batch processing. Total documents processed: {len(all_ids)}")
        return all_ids

class EmbeddingDemo:
    def __init__(self, api_key=None):
        """Initialize the embedding model with Google API key."""
        if api_key:
            os.environ["GOOGLE_API_KEY"] = api_key

        self.embeddings = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001"
        )

    def embed_text(self, text):
        """Generate embedding for a single text."""
        return self.embeddings.embed_query(text)

    def embed_documents(self, documents):
        """Generate embeddings for multiple documents."""
        return self.embeddings.embed_documents(documents)

    def demonstrate(self):
        """Show basic embedding functionality."""
        sample_text = "This is a sample text for embedding."
        sample_docs = ["First document", "Second document", "Third document"]

        # Single text embedding
        text_embedding = self.embed_text(sample_text)
        print(f"Text embedding dimension: {len(text_embedding)}")

        # Multiple documents embedding
        doc_embeddings = self.embed_documents(sample_docs)
        print(f"Number of document embeddings: {len(doc_embeddings)}")
        print(f"Each embedding dimension: {len(doc_embeddings[0])}")

# Usage example
if __name__ == "__main__":
    # Initialize the embedding and storage system
    try:
        # You need to set your Google API key.
        # The persist path and collection name are derived from user_id
        # (user_1_vector_db / user_1_diary_entries), so only the base directory is passed.
        embedding_storage = DiaryEmbeddingAndStorage(
            user_id=1,
            api_key="your_google_api_key_here",  # Replace with your actual API key
            base_persist_directory="./diary_vector_db"
        )

        # Example documents
        sample_documents = [
            Document(
                page_content="Today was a wonderful day. I went to the park and enjoyed the sunshine.",
                metadata={"date": "2024-01-15", "mood": "happy"}
            ),
            Document(
                page_content="Had a challenging day at work but learned a lot of new things.",
                metadata={"date": "2024-01-16", "mood": "productive"}
            ),
            Document(
                page_content="Spent time with family and friends. Made some great memories.",
                metadata={"date": "2024-01-17", "mood": "grateful"}
            )
        ]

        # Embed and store documents
        doc_ids = embedding_storage.embed_and_store_documents(sample_documents)
        print(f"Stored documents with IDs: {doc_ids}")

        # Get collection info
        info = embedding_storage.get_collection_info()
        print(f"Collection info: {info}")

        # Perform similarity search
        query = "happy day at the park"
        results = embedding_storage.similarity_search(query, k=2)

        print(f"\nSimilarity search results for '{query}':")
        for i, doc in enumerate(results):
            print(f"Result {i+1}: {doc.page_content[:100]}...")
            print(f"Metadata: {doc.metadata}")

        # Search with scores
        scored_results = embedding_storage.similarity_search_with_score(query, k=2)

        print(f"\nSimilarity search with scores:")
        for doc, score in scored_results:
            print(f"Score: {score:.4f} - {doc.page_content[:50]}...")

    except Exception as e:
        print(f"Error in example: {e}")

    # Original demo
    # demo = EmbeddingDemo(api_key="your_google_api_key_here")
    # demo.demonstrate()
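To make the metadata contract concrete, here is a minimal sketch (not part of the commit) of what _filter_metadata produces for a typical entry before it reaches Chroma; the sample values are invented for illustration:

storage = DiaryEmbeddingAndStorage(user_id=1, api_key="your_google_api_key_here")
raw = {
    "date": "2024-01-15",             # scalar: kept as-is
    "tags": ["park", "sunshine"],     # list: flattened to a string plus a count
    "details": {"weather": "sunny"},  # nested dict: dropped
}
print(storage._filter_metadata(raw))
# {'date': '2024-01-15', 'tags_list': 'park, sunshine', 'tags_count': 2}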
clean_repo/src/Indexingstep/indexing_pipeline.py
ADDED
@@ -0,0 +1,110 @@
import os
import sys
from typing import List, Dict, Any
from datetime import datetime

# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

def create_user_vector_database(user_id: int, diary_entries: List[Dict[str, Any]]) -> bool:
    """
    Create vector database for a specific user from their diary entries.

    Args:
        user_id: User ID
        diary_entries: List of diary entries from database

    Returns:
        True if successful, False otherwise
    """
    try:
        # Setup paths
        base_vector_path = os.path.dirname(os.path.abspath(__file__))
        vector_db_path = os.path.join(base_vector_path, f"user_{user_id}_vector_db")
        collection_name = f"user_{user_id}_diary_entries"

        # Create directory
        os.makedirs(vector_db_path, exist_ok=True)

        # Initialize embeddings
        google_api_key = os.getenv("GOOGLE_API_KEY")
        if not google_api_key:
            raise ValueError("Google API key not found")

        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

        # Process diary entries into documents
        documents = []
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )

        for entry in diary_entries:
            # Extract content
            content = entry.get('content', '')
            if not content:
                continue

            # Extract title and content
            lines = content.split('\n')
            title = "Untitled"
            actual_content = content

            for line in lines:
                if line.startswith('Title: '):
                    title = line.replace('Title: ', '').strip()
                elif line.startswith('Content: '):
                    actual_content = line.replace('Content: ', '').strip()
                    break

            # Create metadata (Chroma metadata values must be scalars,
            # so tags are stored as a comma-separated string)
            metadata = {
                'user_id': user_id,
                'entry_id': entry.get('id'),
                'date': entry.get('date', ''),
                'title': title,
                'tags': entry.get('tags', ''),
                'tags_list': ", ".join(tag.strip() for tag in entry.get('tags', '').split(',') if tag.strip()),
                'source': f"diary_entry_{entry.get('id')}"
            }

            # Split content if too long
            if len(actual_content) > 1000:
                chunks = text_splitter.split_text(actual_content)
                for i, chunk in enumerate(chunks):
                    chunk_metadata = metadata.copy()
                    chunk_metadata['chunk_id'] = i
                    documents.append(Document(page_content=chunk, metadata=chunk_metadata))
            else:
                documents.append(Document(page_content=actual_content, metadata=metadata))

        if not documents:
            print(f"No documents to index for user {user_id}")
            return False

        # Create vector store
        vector_store = Chroma(
            persist_directory=vector_db_path,
            embedding_function=embeddings,
            collection_name=collection_name
        )

        # Add documents to vector store; langchain-chroma 0.2+ persists
        # automatically, so no explicit persist() call is needed
        vector_store.add_documents(documents)

        print(f"Successfully created vector database for user {user_id} with {len(documents)} documents")
        return True

    except Exception as e:
        print(f"Error creating vector database for user {user_id}: {e}")
        return False
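A minimal driver sketch for the helper above (the values are assumptions for illustration; GOOGLE_API_KEY must already be set in the environment). Each entry dict only needs the fields the function reads: id, date, content, and tags:

entries = [
    {
        "id": 1,
        "date": "2024-01-15",
        "content": "Title: Park day\nContent: Walked in the park and enjoyed the sunshine.",
        "tags": "outdoors, mood",
    },
]
if create_user_vector_database(user_id=3, diary_entries=entries):
    print("Vector database ready for user 3")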
clean_repo/src/Indexingstep/pipeline.py
ADDED
@@ -0,0 +1,459 @@
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dataloading import DiaryDataLoader, DiaryContentPreprocessor
from diary_text_splitter import DiaryTextSplitter
from embedding_and_storing import DiaryEmbeddingAndStorage
from langchain.schema import Document
from typing import List, Dict, Any, Optional
import logging
from pathlib import Path

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class DiaryIndexingPipeline:
    """
    Enhanced pipeline for indexing diary entries with optimized chunking and metadata.
    Integrates data loading, preprocessing, diary-specific splitting, embedding, and storage.
    """

    def __init__(
        self,
        db_path: str = "./diary.db",
        persist_directory: str = "./chroma_db",
        collection_name: str = "diary_collection",
        google_api_key: Optional[str] = None,
        chunk_size: int = 300,  # Optimized for diary entries (200-300 tokens)
        chunk_overlap: int = 50,  # 50-token sliding window
        embedding_model: str = "models/embedding-001",
        batch_size: int = 50,
        user_id: int = 1
    ):
        """
        Initialize the enhanced diary indexing pipeline.

        Args:
            db_path (str): Path to SQLite database
            persist_directory (str): Directory for vector database
            collection_name (str): Name of the collection
            google_api_key (str, optional): Google API key for embeddings
            chunk_size (int): Size of text chunks (optimized for diary entries)
            chunk_overlap (int): Overlap between chunks (sliding window)
            embedding_model (str): Google embedding model name
            batch_size (int): Batch size for processing
            user_id (int): ID of the user for user-specific isolation
        """
        self.db_path = db_path
        self.persist_directory = persist_directory
        self.collection_name = collection_name
        self.batch_size = batch_size
        self.user_id = user_id

        # Validate database exists
        if not os.path.exists(db_path):
            raise FileNotFoundError(f"Database file not found: {db_path}")

        # Initialize components
        self._initialize_components(
            google_api_key, chunk_size, chunk_overlap, embedding_model
        )

        logger.info("Diary Indexing Pipeline initialized successfully")

    def _initialize_components(
        self,
        google_api_key: Optional[str],
        chunk_size: int,
        chunk_overlap: int,
        embedding_model: str
    ):
        """Initialize all pipeline components."""

        # 1. Data Loader
        self.data_loader = DiaryDataLoader(
            db_path=self.db_path,
            table_name="diary_entries",
            content_column="content",
            date_column="date",
            user_id=self.user_id
        )

        # 2. Content Preprocessor
        self.preprocessor = DiaryContentPreprocessor(
            remove_extra_whitespace=True,
            normalize_line_breaks=True,
            min_content_length=3,  # Keep short entries
            max_content_length=10000
        )

        # 3. Diary-optimized Text Splitter
        self.text_splitter = DiaryTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

        # 4. Embedding and Storage
        self.embedding_storage = DiaryEmbeddingAndStorage(
            user_id=self.user_id,
            api_key=google_api_key,
            base_persist_directory=self.persist_directory,
            embedding_model=embedding_model,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

        logger.info("All pipeline components initialized")

    def load_diary_data(self, start_date: Optional[str] = None, end_date: Optional[str] = None) -> List[Document]:
        """
        Load diary entries from database.

        Args:
            start_date (str, optional): Start date filter (YYYY-MM-DD)
            end_date (str, optional): End date filter (YYYY-MM-DD)

        Returns:
            List[Document]: Loaded diary documents
        """
        try:
            logger.info("Loading diary entries from database...")

            if start_date and end_date:
                documents = self.data_loader.load_by_date_range(start_date, end_date)
                logger.info(f"Loaded {len(documents)} entries from {start_date} to {end_date}")
            else:
                documents = self.data_loader.load()
                logger.info(f"Loaded {len(documents)} total diary entries")

            if not documents:
                logger.warning("No diary entries found in database")
                return []

            return documents

        except Exception as e:
            logger.error(f"Error loading diary data: {str(e)}")
            raise

    def preprocess_documents(self, documents: List[Document]) -> List[Document]:
        """
        Preprocess diary documents.

        Args:
            documents (List[Document]): Raw documents

        Returns:
            List[Document]: Preprocessed documents
        """
        try:
            logger.info(f"Preprocessing {len(documents)} documents...")

            preprocessed_docs = self.preprocessor.preprocess_documents(documents)

            logger.info(f"Preprocessing complete: {len(preprocessed_docs)} documents kept")
            return preprocessed_docs

        except Exception as e:
            logger.error(f"Error preprocessing documents: {str(e)}")
            raise

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Split documents into optimized chunks using diary-specific splitter.

        Args:
            documents (List[Document]): Documents to split

        Returns:
            List[Document]: Split document chunks with enhanced metadata
        """
        try:
            logger.info(f"Splitting {len(documents)} diary entries into optimized chunks...")

            split_docs = self.text_splitter.split_documents(documents)

            # Get and log chunking statistics
            stats = self.text_splitter.get_chunk_stats(split_docs)
            logger.info(f"Document splitting complete: {stats}")

            return split_docs

        except Exception as e:
            logger.error(f"Error splitting documents: {str(e)}")
            raise

    def embed_and_store(self, documents: List[Document]) -> List[str]:
        """
        Generate embeddings and store documents.

        Args:
            documents (List[Document]): Documents to embed and store

        Returns:
            List[str]: Document IDs
        """
        try:
            logger.info(f"Generating embeddings and storing {len(documents)} document chunks...")

            # Process in batches for large datasets
            if len(documents) > self.batch_size:
                document_ids = self.embedding_storage.batch_process_documents(
                    documents, self.batch_size
                )
            else:
                document_ids = self.embedding_storage.embed_and_store_documents(documents)

            logger.info(f"Successfully embedded and stored {len(document_ids)} documents")
            return document_ids

        except Exception as e:
            logger.error(f"Error embedding and storing documents: {str(e)}")
            raise

    def run_full_pipeline(
        self,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        clear_existing: bool = False
    ) -> Dict[str, Any]:
        """
        Run the complete indexing pipeline.

        Args:
            start_date (str, optional): Start date filter
            end_date (str, optional): End date filter
            clear_existing (bool): Whether to clear existing data

        Returns:
            Dict: Pipeline execution results
        """
        try:
            logger.info("="*60)
            logger.info("STARTING DIARY INDEXING PIPELINE")
            logger.info("="*60)

            pipeline_stats = {
                "status": "running",
                "steps_completed": 0,
                "total_steps": 5,
                "documents_loaded": 0,
                "documents_preprocessed": 0,
                "chunks_created": 0,
                "documents_stored": 0,
                "errors": []
            }

            # Step 1: Clear existing data if requested
            if clear_existing:
                logger.info("Step 1: Clearing existing vector store...")
                self.embedding_storage.clear_collection()
                pipeline_stats["steps_completed"] += 1

            # Step 2: Load diary data
            logger.info("Step 2: Loading diary entries...")
            documents = self.load_diary_data(start_date, end_date)
            pipeline_stats["documents_loaded"] = len(documents)
            pipeline_stats["steps_completed"] += 1

            if not documents:
                pipeline_stats["status"] = "completed_with_warnings"
                pipeline_stats["errors"].append("No documents found to process")
                return pipeline_stats

            # Step 3: Preprocess documents
            logger.info("Step 3: Preprocessing documents...")
            preprocessed_docs = self.preprocess_documents(documents)
            pipeline_stats["documents_preprocessed"] = len(preprocessed_docs)
            pipeline_stats["steps_completed"] += 1

            if not preprocessed_docs:
                pipeline_stats["status"] = "failed"
                pipeline_stats["errors"].append("No documents survived preprocessing")
                return pipeline_stats

            # Step 4: Split documents into chunks
            logger.info("Step 4: Splitting documents into chunks...")
            split_docs = self.split_documents(preprocessed_docs)
            pipeline_stats["chunks_created"] = len(split_docs)
            pipeline_stats["steps_completed"] += 1

            # Step 5: Generate embeddings and store
            logger.info("Step 5: Generating embeddings and storing...")
            document_ids = self.embed_and_store(split_docs)
            pipeline_stats["documents_stored"] = len(document_ids)
            pipeline_stats["steps_completed"] += 1

            # Update final status
            pipeline_stats["status"] = "completed_successfully"

            logger.info("="*60)
            logger.info("PIPELINE COMPLETED SUCCESSFULLY!")
            logger.info("="*60)
            logger.info(f"Documents loaded: {pipeline_stats['documents_loaded']}")
            logger.info(f"Documents preprocessed: {pipeline_stats['documents_preprocessed']}")
            logger.info(f"Chunks created: {pipeline_stats['chunks_created']}")
            logger.info(f"Documents stored: {pipeline_stats['documents_stored']}")
            logger.info("="*60)

            return pipeline_stats

        except Exception as e:
            logger.error(f"Pipeline failed with error: {str(e)}")
            pipeline_stats["status"] = "failed"
            pipeline_stats["errors"].append(str(e))
            return pipeline_stats

    def incremental_update(self, start_date: str, end_date: Optional[str] = None) -> Dict[str, Any]:
        """
        Perform incremental update for new diary entries.

        Args:
            start_date (str): Start date for incremental update
            end_date (str, optional): End date for incremental update

        Returns:
            Dict: Update results
        """
        try:
            logger.info(f"Starting incremental update from {start_date}")

            # load_diary_data only applies the date filter when both bounds are set,
            # so default the end date to today for open-ended incremental updates
            if end_date is None:
                from datetime import datetime
                end_date = datetime.now().strftime("%Y-%m-%d")

            # Load only new entries
            new_documents = self.load_diary_data(start_date, end_date)

            if not new_documents:
                logger.info("No new documents found for incremental update")
                return {"status": "no_updates", "documents_added": 0}

            # Process new documents
            preprocessed_docs = self.preprocess_documents(new_documents)
            split_docs = self.split_documents(preprocessed_docs)
            document_ids = self.embed_and_store(split_docs)

            logger.info(f"Incremental update completed: {len(document_ids)} new documents added")

            return {
                "status": "success",
                "documents_loaded": len(new_documents),
                "documents_added": len(document_ids)
            }

        except Exception as e:
            logger.error(f"Incremental update failed: {str(e)}")
            return {"status": "failed", "error": str(e)}

    def search_similar_entries(
        self,
        query: str,
        k: int = 5,
        filter_metadata: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        Search for similar diary entries.

        Args:
            query (str): Search query
            k (int): Number of results to return
            filter_metadata (Dict, optional): Metadata filter

        Returns:
            List[Document]: Similar documents
        """
        try:
            return self.embedding_storage.similarity_search(
                query=query,
                k=k,
                filter=filter_metadata
            )
        except Exception as e:
            logger.error(f"Error searching similar entries: {str(e)}")
            return []

    def get_pipeline_stats(self) -> Dict[str, Any]:
        """
        Get comprehensive pipeline statistics.

        Returns:
            Dict: Pipeline and database statistics
        """
        try:
            # Database stats
            db_info = self.data_loader.get_table_info()

            # Vector store stats
            vector_info = self.embedding_storage.get_collection_info()

            return {
                "database": db_info,
                "vector_store": vector_info,
                "pipeline_config": {
                    "chunk_size": self.text_splitter.chunk_size,
                    "chunk_overlap": self.text_splitter.chunk_overlap,
                    "batch_size": self.batch_size,
                    "collection_name": self.collection_name
                }
            }

        except Exception as e:
            logger.error(f"Error getting pipeline stats: {str(e)}")
            return {}

def main():
    """Main function to demonstrate pipeline usage."""

    # Configuration
    config = {
        "db_path": "../streamlit_app/backend/diary.db",  # Adjust path as needed
        "persist_directory": "./diary_vector_db",
        "collection_name": "diary_entries",
        "google_api_key": None,  # Set your API key or use environment variable
        "chunk_size": 800,
        "chunk_overlap": 100,
        "batch_size": 50
    }

    try:
        # Initialize pipeline
        logger.info("Initializing Diary Indexing Pipeline...")
        pipeline = DiaryIndexingPipeline(**config)

        # Run full pipeline
        results = pipeline.run_full_pipeline(clear_existing=True)

        # Print results
        print("\n" + "="*60)
        print("PIPELINE EXECUTION RESULTS")
        print("="*60)
        print(f"Status: {results['status']}")
        print(f"Steps completed: {results['steps_completed']}/{results['total_steps']}")
        print(f"Documents loaded: {results['documents_loaded']}")
        print(f"Documents preprocessed: {results['documents_preprocessed']}")
        print(f"Chunks created: {results['chunks_created']}")
        print(f"Documents stored: {results['documents_stored']}")

        if results['errors']:
            print(f"Errors: {results['errors']}")

        # Get and display stats
        stats = pipeline.get_pipeline_stats()
        print("\nPIPELINE STATISTICS:")
        print(f"Database entries: {stats.get('database', {}).get('row_count', 'N/A')}")
        print(f"Vector store documents: {stats.get('vector_store', {}).get('document_count', 'N/A')}")
        print("="*60)

        # Example search
        if results['status'] == 'completed_successfully':
            print("\nTesting similarity search...")
            search_results = pipeline.search_similar_entries("happy day", k=3)
            print(f"Found {len(search_results)} similar entries")
            for i, doc in enumerate(search_results[:2]):
                print(f"Result {i+1}: {doc.page_content[:100]}...")

    except Exception as e:
        logger.error(f"Main execution failed: {str(e)}")
        print(f"Error: {str(e)}")

if __name__ == "__main__":
    main()
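For the incremental path, here is a sketch of how a periodic re-index job could drive the class above; the paths and the 7-day window are illustrative assumptions, not part of the module:

from datetime import date, timedelta

pipeline = DiaryIndexingPipeline(
    db_path="../streamlit_app/backend/diary.db",
    persist_directory="./diary_vector_db",
    user_id=1,
)
# Index only entries from the last week without touching older vectors
since = (date.today() - timedelta(days=7)).isoformat()
result = pipeline.incremental_update(start_date=since)
print(result)  # e.g. {"status": "success", "documents_loaded": 3, "documents_added": 5}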
clean_repo/src/Retrivel_And_Generation/Retrieval_And_Generator.py
ADDED
@@ -0,0 +1,739 @@
#!/usr/bin/env python3
"""
Retrieval and Generation System for Personal Diary Chatbot

This module implements the RAG (Retrieval-Augmented Generation) pipeline for the diary chatbot.
It handles document retrieval from the vector database and generates contextual responses
using Google's Generative AI.

Components:
- Document Retrieval: Query vector database for relevant diary entries
- Context Processing: Format retrieved documents for LLM consumption
- Response Generation: Generate contextual responses using retrieved diary content
- Conversation Management: Handle chat history and context preservation
"""

import os
import sys
import logging
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime
from functools import lru_cache
import hashlib

# Add parent directory to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# LangChain imports
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import ChatPromptTemplate, PromptTemplate

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DiaryRAGSystem:
    """
    Retrieval-Augmented Generation system for personal diary chatbot.

    This class handles the complete RAG pipeline:
    1. Retrieve relevant diary entries from vector database
    2. Format context for LLM consumption
    3. Generate contextual responses using Google's Generative AI
    """

    def __init__(
        self,
        user_id: int = 1,
        base_vector_path: str = "./src/VectorDB",
        google_api_key: Optional[str] = None,
        embedding_model: str = "models/embedding-001",
        chat_model: str = "gemini-2.5-flash-lite",
        max_retrieval_docs: int = 5
    ):
        """
        Initialize the RAG system with user-specific vector database.

        Args:
            user_id: User ID for user-specific vector database
            base_vector_path: Base path for vector databases
            google_api_key: Google API key for embeddings and chat
            embedding_model: Model for text embeddings
            chat_model: Model for chat completion
            max_retrieval_docs: Maximum number of documents to retrieve
        """
        self.user_id = user_id
        self.base_vector_path = base_vector_path

        # Create user-specific paths
        self.vector_db_path = os.path.join(base_vector_path, f"user_{user_id}_vector_db")
        self.collection_name = f"user_{user_id}_diary_entries"
        self.max_retrieval_docs = max_retrieval_docs

        # Ensure user vector database directory exists
        os.makedirs(self.vector_db_path, exist_ok=True)

        # Set up Google API key
        if google_api_key:
            os.environ["GOOGLE_API_KEY"] = google_api_key
        elif not os.getenv("GOOGLE_API_KEY"):
            raise ValueError("Google API key must be provided either as parameter or environment variable")

        # Initialize embedding and chat models
        try:
            # Fix for Streamlit event loop issue
            import asyncio
            import nest_asyncio

            # Allow nested event loops for Streamlit compatibility
            try:
                nest_asyncio.apply()
            except Exception:
                # nest_asyncio may already be applied or unavailable in this context
                pass

            # Set event loop for thread if not exists
            try:
                loop = asyncio.get_event_loop()
                if loop.is_closed():
                    raise RuntimeError("Event loop is closed")
            except RuntimeError:
                # Create new event loop for this thread
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)

            self.embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model)
            self.chat_model = ChatGoogleGenerativeAI(
                model=chat_model,
                temperature=0.3,  # Lower temperature for faster, more focused responses
                max_tokens=800,   # Shorter responses for speed
                top_k=20,         # Limit token choices for speed
                top_p=0.8         # Nucleus sampling for faster generation
            )
            logger.info(f"Initialized embeddings with model: {embedding_model}")
            logger.info(f"Initialized chat model: {chat_model}")
        except Exception as e:
            logger.error(f"Failed to initialize models: {str(e)}")
            raise

        # Initialize vector store
        self.vector_store = None
        self._setup_vector_store()

        # Set up prompt templates
        self._setup_prompts()

        # Initialize conversation chain
        self._setup_conversation_chain()

    def _setup_vector_store(self):
        """Set up connection to the vector database."""
        try:
            if os.path.exists(self.vector_db_path):
                self.vector_store = Chroma(
                    persist_directory=self.vector_db_path,
                    embedding_function=self.embeddings,
                    collection_name=self.collection_name
                )
                collection_info = self.vector_store._collection.count()
                logger.info(f"Connected to vector database (primary) with {collection_info} documents")
                # Fallback: legacy nested path if empty
                if collection_info == 0:
                    nested_path = os.path.join(self.vector_db_path, os.path.basename(self.vector_db_path))
                    if os.path.isdir(nested_path):
                        try:
                            nested_vs = Chroma(
                                persist_directory=nested_path,
                                embedding_function=self.embeddings,
                                collection_name=self.collection_name
                            )
                            nested_count = nested_vs._collection.count()
                            if nested_count > 0:
                                logger.warning(
                                    f"Primary path empty. Switching to legacy nested path {nested_path} with {nested_count} docs"
                                )
                                self.vector_store = nested_vs
                                self.vector_db_path = nested_path
                        except Exception as ne:
                            logger.debug(f"Failed to read nested path: {ne}")
            else:
                logger.warning(f"Vector database not found at {self.vector_db_path}")
                logger.info("Run indexing pipeline first.")
        except Exception as e:
            logger.error(f"Failed to setup vector store: {str(e)}")
            self.vector_store = None

    def reload_vector_store(self) -> int:
        """Reload vector store from disk. Returns new document count or 0."""
        try:
            self._setup_vector_store()
            if self.vector_store:
                return self.vector_store._collection.count()
        except Exception as e:
            logger.warning(f"reload_vector_store failed: {e}")
        return 0

    def get_document_count(self) -> int:
        try:
            if self.vector_store:
                return self.vector_store._collection.count()
        except Exception:
            pass
        return 0

    def _setup_prompts(self):
        """Set up prompt templates for different scenarios."""

        # Main RAG prompt template (Vietnamese; the chatbot answers in Vietnamese).
        # English gist: "You are an empathetic AI assistant specialised in analysing
        # personal diary content. Based on the retrieved entries and the user's
        # question, reply empathetically, ground the answer in the diary, surface
        # insights and connections between entries, offer suggestions or reflections
        # where appropriate, and be honest if nothing relevant was found."
        self.rag_prompt = ChatPromptTemplate.from_template("""
Bạn là một trợ lý AI thông minh và thấu hiểu, chuyên về việc phân tích và thảo luận nội dung về nhật ký cá nhân.

Dựa trên các mục nhật ký sau đây được tìm kiếm từ cơ sở dữ liệu:

{context}

Người dùng hỏi: {question}

Hãy trả lời một cách:
- Thấu hiểu và empathetic (đồng cảm)
- Dựa trên nội dung nhật ký được cung cấp
- Cung cấp insights và connections giữa các entries
- Đưa ra suggestions hoặc reflections nếu phù hợp
- Sử dụng tiếng Việt tự nhiên và ấm áp

Nếu không tìm thấy thông tin liên quan trong nhật ký, hãy thành thật nói và đề xuất các cách khác để giúp đỡ.

Trả lời:
""")

        # Fallback prompt when no relevant documents found (Vietnamese).
        # English gist: "Give a short, friendly, helpful answer; suggest how the
        # user could journal about this topic; encourage reflection and
        # self-discovery; offer general guidance where appropriate."
        self.fallback_prompt = ChatPromptTemplate.from_template("""
Bạn là một trợ lý AI thân thiện và hữu ích cho việc quản lý nhật ký cá nhân.

Người dùng hỏi: {question}

Vì không tìm thấy thông tin liên quan trong nhật ký hiện tại, hãy:
- Trả lời một cách thân thiện ngắn gọn và hữu ích
- Đề xuất cách người dùng có thể ghi nhật ký về chủ đề này
- Khuyến khích reflection và self-discovery
- Cung cấp general guidance nếu phù hợp

Sử dụng tiếng Việt tự nhiên và ấm áp.

Trả lời:
""")

        # Summary prompt for multiple diary entries (Vietnamese).
        # English gist: "Summarise the main topics, overall mood, notable
        # patterns or themes, and personal-growth insights from the entries."
        self.summary_prompt = ChatPromptTemplate.from_template("""
Dựa trên các mục nhật ký sau đây:

{context}

Hãy tạo một summary ngắn gọn về:
- Chủ đề chính được đề cập
- Cảm xúc và mood tổng thể
- Patterns hoặc themes đáng chú ý
- Insights về personal growth

Sử dụng tiếng Việt và giữ tính cách empathetic.

Summary:
""")

| 246 |
+
def _setup_conversation_chain(self):
|
| 247 |
+
"""Set up the conversation chain for RAG processing."""
|
| 248 |
+
try:
|
| 249 |
+
# Create retriever from vector store
|
| 250 |
+
if self.vector_store:
|
| 251 |
+
self.retriever = self.vector_store.as_retriever(
|
| 252 |
+
search_kwargs={"k": self.max_retrieval_docs}
|
| 253 |
+
)
|
| 254 |
+
|
| 255 |
+
# Set up the main RAG chain
|
| 256 |
+
self.rag_chain = (
|
| 257 |
+
{
|
| 258 |
+
"context": self.retriever | self._format_docs,
|
| 259 |
+
"question": RunnablePassthrough()
|
| 260 |
+
}
|
| 261 |
+
| self.rag_prompt
|
| 262 |
+
| self.chat_model
|
| 263 |
+
| StrOutputParser()
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
+
# Set up fallback chain
|
| 267 |
+
self.fallback_chain = (
|
| 268 |
+
{"question": RunnablePassthrough()}
|
| 269 |
+
| self.fallback_prompt
|
| 270 |
+
| self.chat_model
|
| 271 |
+
| StrOutputParser()
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
+
logger.info("Conversation chain setup complete")
|
| 275 |
+
else:
|
| 276 |
+
logger.warning("Cannot setup conversation chain without vector store")
|
| 277 |
+
|
| 278 |
+
except Exception as e:
|
| 279 |
+
logger.error(f"Failed to setup conversation chain: {str(e)}")
|
| 280 |
+
raise
|
| 281 |
+
|
| 282 |
+
def _format_docs(self, docs: List[Document]) -> str:
|
| 283 |
+
"""
|
| 284 |
+
Format retrieved documents for LLM consumption.
|
| 285 |
+
|
| 286 |
+
Args:
|
| 287 |
+
docs: List of retrieved documents
|
| 288 |
+
|
| 289 |
+
Returns:
|
| 290 |
+
Formatted string with document content and metadata
|
| 291 |
+
"""
|
| 292 |
+
if not docs:
|
| 293 |
+
return "Không tìm thấy mục nhật ký liên quan."
|
| 294 |
+
|
| 295 |
+
formatted_docs = []
|
| 296 |
+
for i, doc in enumerate(docs, 1):
|
| 297 |
+
# Extract metadata
|
| 298 |
+
metadata = doc.metadata
|
| 299 |
+
date = metadata.get('date', 'Unknown date')
|
| 300 |
+
title = metadata.get('title', 'Untitled')
|
| 301 |
+
tags = metadata.get('tags_list', metadata.get('tags', ''))
|
| 302 |
+
|
| 303 |
+
# Format document
|
| 304 |
+
doc_text = f"""
|
| 305 |
+
Mục {i}:
|
| 306 |
+
Ngày: {date}
|
| 307 |
+
Tiêu đề: {title}
|
| 308 |
+
Tags: {tags if tags else 'Không có tags'}
|
| 309 |
+
Nội dung: {doc.page_content.strip()}
|
| 310 |
+
---
|
| 311 |
+
"""
|
| 312 |
+
formatted_docs.append(doc_text)
|
| 313 |
+
|
| 314 |
+
return "\n".join(formatted_docs)
|
| 315 |
+
|
| 316 |
+
def retrieve_relevant_entries(
|
| 317 |
+
self,
|
| 318 |
+
query: str,
|
| 319 |
+
filters: Optional[Dict[str, Any]] = None,
|
| 320 |
+
k: Optional[int] = None
|
| 321 |
+
) -> List[Document]:
|
| 322 |
+
"""
|
| 323 |
+
Retrieve relevant diary entries based on query with optimized performance.
|
| 324 |
+
|
| 325 |
+
Args:
|
| 326 |
+
query: Search query
|
| 327 |
+
filters: Optional metadata filters
|
| 328 |
+
k: Number of documents to retrieve (overrides default)
|
| 329 |
+
|
| 330 |
+
Returns:
|
| 331 |
+
List of relevant documents
|
| 332 |
+
"""
|
| 333 |
+
if not self.vector_store:
|
| 334 |
+
logger.warning("Vector store not available for retrieval")
|
| 335 |
+
return []
|
| 336 |
+
|
| 337 |
+
try:
|
| 338 |
+
# Use smaller k for faster response
|
| 339 |
+
k = k or min(self.max_retrieval_docs, 3) # Limit to 3 docs for speed
|
| 340 |
+
|
| 341 |
+
if filters:
|
| 342 |
+
docs = self.vector_store.similarity_search(
|
| 343 |
+
query=query,
|
| 344 |
+
k=k,
|
| 345 |
+
filter=filters
|
| 346 |
+
)
|
| 347 |
+
else:
|
| 348 |
+
docs = self.vector_store.similarity_search(
|
| 349 |
+
query=query,
|
| 350 |
+
k=k
|
| 351 |
+
)
|
| 352 |
+
|
| 353 |
+
logger.info(f"Retrieved {len(docs)} documents for query: '{query[:50]}...'")
|
| 354 |
+
return docs
|
| 355 |
+
|
| 356 |
+
except Exception as e:
|
| 357 |
+
logger.error(f"Error during retrieval: {str(e)}")
|
| 358 |
+
return []
|
| 359 |
+
|
| 360 |
+
def format_documents_for_context(self, docs: List[Document]) -> str:
|
| 361 |
+
"""
|
| 362 |
+
Format retrieved documents into context string for the prompt.
|
| 363 |
+
|
| 364 |
+
Args:
|
| 365 |
+
docs: List of retrieved documents
|
| 366 |
+
|
| 367 |
+
Returns:
|
| 368 |
+
Formatted context string
|
| 369 |
+
"""
|
| 370 |
+
if not docs:
|
| 371 |
+
return "Không có thông tin nhật ký liên quan."
|
| 372 |
+
|
| 373 |
+
formatted_docs = []
|
| 374 |
+
for i, doc in enumerate(docs, 1):
|
| 375 |
+
# Extract metadata
|
| 376 |
+
metadata = doc.metadata
|
| 377 |
+
date = metadata.get('date', 'Không có ngày')
|
| 378 |
+
source = metadata.get('source', 'Không rõ nguồn')
|
| 379 |
+
|
| 380 |
+
# Format document
|
| 381 |
+
doc_text = f"Nhật ký {i} (Ngày: {date}):\n{doc.page_content}"
|
| 382 |
+
formatted_docs.append(doc_text)
|
| 383 |
+
|
| 384 |
+
return "\n\n".join(formatted_docs)
|
| 385 |
+
|
| 386 |
+
def generate_fast_response(
|
| 387 |
+
self,
|
| 388 |
+
query: str,
|
| 389 |
+
filters: Optional[Dict[str, Any]] = None
|
| 390 |
+
) -> str:
|
| 391 |
+
"""
|
| 392 |
+
Generate fast response with optimized settings for speed.
|
| 393 |
+
|
| 394 |
+
Args:
|
| 395 |
+
query: User question
|
| 396 |
+
filters: Optional metadata filters
|
| 397 |
+
|
| 398 |
+
Returns:
|
| 399 |
+
AI response string (optimized for speed)
|
| 400 |
+
"""
|
| 401 |
+
try:
|
| 402 |
+
# Fast retrieval with only 1 most relevant doc for maximum speed
|
| 403 |
+
relevant_docs = self.retrieve_relevant_entries(
|
| 404 |
+
query=query,
|
| 405 |
+
filters=filters,
|
| 406 |
+
k=1 # Only 1 doc for maximum speed
|
| 407 |
+
)
|
| 408 |
+
|
| 409 |
+
if not relevant_docs:
|
| 410 |
+
# Use simple fallback without chain to avoid timeout
|
| 411 |
+
return "Xin lỗi, tôi không tìm thấy thông tin liên quan trong nhật ký của bạn."
|
| 412 |
+
|
| 413 |
+
# Create very concise context (limit content length)
|
| 414 |
+
context = self._format_docs(relevant_docs[:1])
|
| 415 |
+
if len(context) > 500: # Limit context length
|
| 416 |
+
context = context[:500] + "..."
|
| 417 |
+
|
| 418 |
+
# Fast prompt template with timeout optimization
|
| 419 |
+
fast_prompt = ChatPromptTemplate.from_template(
|
| 420 |
+
"""Dựa vào nhật ký: {context}
|
| 421 |
+
|
| 422 |
+
Câu hỏi: {question}
|
| 423 |
+
|
| 424 |
+
Trả lời ngắn (1 câu):"""
|
| 425 |
+
)
|
| 426 |
+
|
| 427 |
+
# Create optimized chain with pre-computed context
|
| 428 |
+
chain = (
|
| 429 |
+
{"context": lambda x: context, "question": RunnablePassthrough()}
|
| 430 |
+
| fast_prompt
|
| 431 |
+
| self.chat_model
|
| 432 |
+
| StrOutputParser()
|
| 433 |
+
)
|
| 434 |
+
|
| 435 |
+
# Generate response with timeout handling
|
| 436 |
+
response = chain.invoke(query)
|
| 437 |
+
logger.info("Generated fast response successfully")
|
| 438 |
+
return response.strip()
|
| 439 |
+
|
| 440 |
+
except Exception as e:
|
| 441 |
+
logger.error(f"Error in fast response generation: {str(e)}")
|
| 442 |
+
# Direct fallback without chain to avoid timeout
|
| 443 |
+
return "Xin lỗi, tôi gặp lỗi khi xử lý câu hỏi của bạn."
|
| 444 |
+
|
| 445 |
+
def generate_response(
|
| 446 |
+
self,
|
| 447 |
+
query: str,
|
| 448 |
+
filters: Optional[Dict[str, Any]] = None,
|
| 449 |
+
use_fallback: bool = False
|
| 450 |
+
) -> str:
|
| 451 |
+
"""
|
| 452 |
+
Generate a response to user query using RAG.
|
| 453 |
+
|
| 454 |
+
Args:
|
| 455 |
+
query: User's question or message
|
| 456 |
+
filters: Optional metadata filters for retrieval
|
| 457 |
+
use_fallback: Whether to use fallback response (no retrieval)
|
| 458 |
+
|
| 459 |
+
Returns:
|
| 460 |
+
Generated response
|
| 461 |
+
"""
|
| 462 |
+
try:
|
| 463 |
+
if use_fallback or not self.vector_store:
|
| 464 |
+
# Use fallback chain without retrieval
|
| 465 |
+
response = self.fallback_chain.invoke(query)
|
| 466 |
+
logger.info("Generated fallback response")
|
| 467 |
+
return response
|
| 468 |
+
|
| 469 |
+
# Retrieve relevant documents first
|
| 470 |
+
relevant_docs = self.retrieve_relevant_entries(query, filters)
|
| 471 |
+
|
| 472 |
+
if not relevant_docs:
|
| 473 |
+
# No relevant documents found, use fallback
|
| 474 |
+
response = self.fallback_chain.invoke(query)
|
| 475 |
+
logger.info("No relevant docs found, used fallback response")
|
| 476 |
+
return response
|
| 477 |
+
|
| 478 |
+
# Use RAG chain with retrieved context
|
| 479 |
+
response = self.rag_chain.invoke(query)
|
| 480 |
+
logger.info("Generated RAG response with context")
|
| 481 |
+
return response
|
| 482 |
+
|
| 483 |
+
except Exception as e:
|
| 484 |
+
logger.error(f"Error generating response: {str(e)}")
|
| 485 |
+
return f"Xin lỗi, tôi gặp lỗi khi xử lý câu hỏi của bạn: {str(e)}"
|
| 486 |
+
|
| 487 |
+
def generate_summary(self, date_range: Optional[Tuple[str, str]] = None) -> str:
|
| 488 |
+
"""
|
| 489 |
+
Generate a summary of diary entries.
|
| 490 |
+
|
| 491 |
+
Args:
|
| 492 |
+
date_range: Optional tuple of (start_date, end_date) in YYYY-MM-DD format
|
| 493 |
+
|
| 494 |
+
Returns:
|
| 495 |
+
Generated summary
|
| 496 |
+
"""
|
| 497 |
+
try:
|
| 498 |
+
if not self.vector_store:
|
| 499 |
+
return "Không thể tạo summary: vector database không khả dụng."
|
| 500 |
+
|
| 501 |
+
# Build filter for date range if provided
|
| 502 |
+
filters = {}
|
| 503 |
+
if date_range:
|
| 504 |
+
start_date, end_date = date_range
|
| 505 |
+
# Note: This depends on how dates are stored in metadata
|
| 506 |
+
# May need adjustment based on actual metadata structure
|
| 507 |
+
pass
|
| 508 |
+
|
| 509 |
+
# Retrieve documents for summary (more documents for better overview)
|
| 510 |
+
docs = self.vector_store.similarity_search(
|
| 511 |
+
query="nhật ký cảm xúc thoughts feelings", # General query
|
| 512 |
+
k=min(10, self.max_retrieval_docs * 2) # More docs for summary
|
| 513 |
+
)
|
| 514 |
+
|
| 515 |
+
if not docs:
|
| 516 |
+
return "Không tìm thấy nh��t ký để tạo summary."
|
| 517 |
+
|
| 518 |
+
# Format context for summary
|
| 519 |
+
context = self._format_docs(docs)
|
| 520 |
+
|
| 521 |
+
# Generate summary
|
| 522 |
+
summary_chain = (
|
| 523 |
+
{"context": lambda x: context}
|
| 524 |
+
| self.summary_prompt
|
| 525 |
+
| self.chat_model
|
| 526 |
+
| StrOutputParser()
|
| 527 |
+
)
|
| 528 |
+
|
| 529 |
+
summary = summary_chain.invoke({})
|
| 530 |
+
logger.info("Generated diary summary")
|
| 531 |
+
return summary
|
| 532 |
+
|
| 533 |
+
except Exception as e:
|
| 534 |
+
logger.error(f"Error generating summary: {str(e)}")
|
| 535 |
+
return f"Lỗi khi tạo summary: {str(e)}"
|
| 536 |
+
|
| 537 |
+
def search_by_tags(self, tags: List[str], k: int = 5) -> List[Document]:
|
| 538 |
+
"""
|
| 539 |
+
Search diary entries by specific tags.
|
| 540 |
+
|
| 541 |
+
Args:
|
| 542 |
+
tags: List of tags to search for
|
| 543 |
+
k: Number of documents to return
|
| 544 |
+
|
| 545 |
+
Returns:
|
| 546 |
+
List of documents matching the tags
|
| 547 |
+
"""
|
| 548 |
+
if not self.vector_store or not tags:
|
| 549 |
+
return []
|
| 550 |
+
|
| 551 |
+
try:
|
| 552 |
+
# Build tag query
|
| 553 |
+
tag_query = " ".join([f"#{tag}" for tag in tags])
|
| 554 |
+
|
| 555 |
+
# Search with tag-based query
|
| 556 |
+
docs = self.vector_store.similarity_search(
|
| 557 |
+
query=tag_query,
|
| 558 |
+
k=k
|
| 559 |
+
)
|
| 560 |
+
|
| 561 |
+
# Filter by tags in metadata if available
|
| 562 |
+
filtered_docs = []
|
| 563 |
+
for doc in docs:
|
| 564 |
+
doc_tags = doc.metadata.get('tags_list', '')
|
| 565 |
+
if any(tag.lower() in doc_tags.lower() for tag in tags):
|
| 566 |
+
filtered_docs.append(doc)
|
| 567 |
+
|
| 568 |
+
logger.info(f"Found {len(filtered_docs)} documents with tags: {tags}")
|
| 569 |
+
return filtered_docs
|
| 570 |
+
|
| 571 |
+
except Exception as e:
|
| 572 |
+
logger.error(f"Error searching by tags: {str(e)}")
|
| 573 |
+
return []
|
| 574 |
+
|
| 575 |
+
def get_conversation_context(self, chat_history: List[Dict[str, str]]) -> str:
|
| 576 |
+
"""
|
| 577 |
+
Process chat history to maintain conversation context.
|
| 578 |
+
|
| 579 |
+
Args:
|
| 580 |
+
chat_history: List of chat messages with 'role' and 'content'
|
| 581 |
+
|
| 582 |
+
Returns:
|
| 583 |
+
Formatted conversation context
|
| 584 |
+
"""
|
| 585 |
+
if not chat_history:
|
| 586 |
+
return ""
|
| 587 |
+
|
| 588 |
+
# Take last few messages for context
|
| 589 |
+
recent_messages = chat_history[-5:] # Last 5 messages
|
| 590 |
+
|
| 591 |
+
context_parts = []
|
| 592 |
+
for msg in recent_messages:
|
| 593 |
+
role = "Người dùng" if msg['role'] == 'user' else "Trợ lý"
|
| 594 |
+
context_parts.append(f"{role}: {msg['content']}")
|
| 595 |
+
|
| 596 |
+
return "\n".join(context_parts)
|
| 597 |
+
|
| 598 |
+
def generate_contextual_response(
|
| 599 |
+
self,
|
| 600 |
+
query: str,
|
| 601 |
+
chat_history: List[Dict[str, str]] = None,
|
| 602 |
+
filters: Optional[Dict[str, Any]] = None
|
| 603 |
+
) -> str:
|
| 604 |
+
"""
|
| 605 |
+
Generate response with conversation context.
|
| 606 |
+
|
| 607 |
+
Args:
|
| 608 |
+
query: Current user query
|
| 609 |
+
chat_history: Previous conversation messages
|
| 610 |
+
filters: Optional metadata filters
|
| 611 |
+
|
| 612 |
+
Returns:
|
| 613 |
+
Contextual response
|
| 614 |
+
"""
|
| 615 |
+
# Get conversation context
|
| 616 |
+
conv_context = self.get_conversation_context(chat_history or [])
|
| 617 |
+
|
| 618 |
+
# Enhance query with conversation context
|
| 619 |
+
if conv_context:
|
| 620 |
+
enhanced_query = f"Bối cảnh cuộc trò chuyện:\n{conv_context}\n\nCâu hỏi hiện tại: {query}"
|
| 621 |
+
else:
|
| 622 |
+
enhanced_query = query
|
| 623 |
+
|
| 624 |
+
# Generate response
|
| 625 |
+
return self.generate_response(enhanced_query, filters)
|
| 626 |
+
|
| 627 |
+
def health_check(self) -> Dict[str, Any]:
|
| 628 |
+
"""
|
| 629 |
+
Check the health status of the RAG system.
|
| 630 |
+
|
| 631 |
+
Returns:
|
| 632 |
+
Dictionary with system status information
|
| 633 |
+
"""
|
| 634 |
+
status = {
|
| 635 |
+
"vector_store_available": self.vector_store is not None,
|
| 636 |
+
"vector_db_path": self.vector_db_path,
|
| 637 |
+
"models_initialized": True,
|
| 638 |
+
"embedding_model": "models/embedding-001",
|
| 639 |
+
"chat_model": "gemini-1.5-flash"
|
| 640 |
+
}
|
| 641 |
+
|
| 642 |
+
if self.vector_store:
|
| 643 |
+
try:
|
| 644 |
+
doc_count = self.vector_store._collection.count()
|
| 645 |
+
status["document_count"] = doc_count
|
| 646 |
+
status["vector_store_healthy"] = True
|
| 647 |
+
except Exception as e:
|
| 648 |
+
status["vector_store_healthy"] = False
|
| 649 |
+
status["vector_store_error"] = str(e)
|
| 650 |
+
else:
|
| 651 |
+
status["document_count"] = 0
|
| 652 |
+
status["vector_store_healthy"] = False
|
| 653 |
+
|
| 654 |
+
return status
|
| 655 |
+
|
| 656 |
+
# ========================================
|
| 657 |
+
# CONVENIENCE FUNCTIONS
|
| 658 |
+
# ========================================
|
| 659 |
+
|
| 660 |
+
def create_rag_system(
|
| 661 |
+
user_id: int = 1,
|
| 662 |
+
base_vector_path: str = "./src/Indexingstep",
|
| 663 |
+
google_api_key: Optional[str] = None
|
| 664 |
+
) -> DiaryRAGSystem:
|
| 665 |
+
"""
|
| 666 |
+
Create and initialize a user-specific DiaryRAGSystem instance.
|
| 667 |
+
|
| 668 |
+
Args:
|
| 669 |
+
user_id: User ID for user-specific vector database
|
| 670 |
+
base_vector_path: Base path for vector databases
|
| 671 |
+
google_api_key: Google API key
|
| 672 |
+
|
| 673 |
+
Returns:
|
| 674 |
+
Initialized DiaryRAGSystem for the specific user
|
| 675 |
+
"""
|
| 676 |
+
return DiaryRAGSystem(
|
| 677 |
+
user_id=user_id,
|
| 678 |
+
base_vector_path=base_vector_path,
|
| 679 |
+
google_api_key=google_api_key
|
| 680 |
+
)
|
| 681 |
+
|
| 682 |
+
def quick_query(
|
| 683 |
+
query: str,
|
| 684 |
+
user_id: int = 1,
|
| 685 |
+
base_vector_path: str = "./src/VectorDB"
|
| 686 |
+
) -> str:
|
| 687 |
+
"""
|
| 688 |
+
Quick query function for testing with user-specific database.
|
| 689 |
+
|
| 690 |
+
Args:
|
| 691 |
+
query: Question to ask
|
| 692 |
+
user_id: User ID for user-specific vector database
|
| 693 |
+
base_vector_path: Base path for vector databases
|
| 694 |
+
|
| 695 |
+
Returns:
|
| 696 |
+
Response string
|
| 697 |
+
"""
|
| 698 |
+
try:
|
| 699 |
+
rag = create_rag_system(user_id, base_vector_path)
|
| 700 |
+
return rag.generate_response(query)
|
| 701 |
+
except Exception as e:
|
| 702 |
+
return f"Error: {str(e)}"
|
| 703 |
+
|
| 704 |
+
if __name__ == "__main__":
|
| 705 |
+
# Example usage
|
| 706 |
+
print("🤖 Diary RAG System - Example Usage")
|
| 707 |
+
print("=" * 50)
|
| 708 |
+
|
| 709 |
+
try:
|
| 710 |
+
# Initialize system
|
| 711 |
+
rag = create_rag_system()
|
| 712 |
+
|
| 713 |
+
# Health check
|
| 714 |
+
status = rag.health_check()
|
| 715 |
+
print("System Status:")
|
| 716 |
+
for key, value in status.items():
|
| 717 |
+
print(f" {key}: {value}")
|
| 718 |
+
|
| 719 |
+
# Example queries
|
| 720 |
+
if status.get("vector_store_healthy"):
|
| 721 |
+
print("\n📝 Example Queries:")
|
| 722 |
+
|
| 723 |
+
queries = [
|
| 724 |
+
"Tôi cảm thấy như thế nào trong tuần này?",
|
| 725 |
+
"Có những hoạt động nào tôi đã làm gần đây?",
|
| 726 |
+
"Tâm trạng của tôi đã thay đổi như thế nào?"
|
| 727 |
+
]
|
| 728 |
+
|
| 729 |
+
for query in queries:
|
| 730 |
+
print(f"\n❓ Query: {query}")
|
| 731 |
+
response = rag.generate_response(query)
|
| 732 |
+
print(f"🤖 Response: {response[:200]}...")
|
| 733 |
+
|
| 734 |
+
except Exception as e:
|
| 735 |
+
print(f"❌ Error: {str(e)}")
|
| 736 |
+
print("Make sure to:")
|
| 737 |
+
print("1. Set GOOGLE_API_KEY environment variable")
|
| 738 |
+
print("2. Run the indexing pipeline first")
|
| 739 |
+
print("3. Check vector database path")
|
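A minimal driver sketch for the class above (hypothetical, not part of the upload: it assumes `src` is on `sys.path`, that the indexing pipeline has already populated `./src/VectorDB/user_1_vector_db`, and that `GOOGLE_API_KEY` is set). It exercises `generate_contextual_response()` with the `{'role', 'content'}` history format that `get_conversation_context()` expects:

# Hypothetical driver script, not shipped with the Space.
from Retrivel_And_Generation.Retrieval_And_Generator import create_rag_system

rag = create_rag_system(user_id=1, base_vector_path="./src/VectorDB")
history = [
    {"role": "user", "content": "Tuần này tôi có viết gì về công việc không?"},
    {"role": "assistant", "content": "Bạn có nhắc đến deadline hôm thứ Ba."},
]
# The history is folded into the query as conversation context before retrieval.
print(rag.generate_contextual_response("Tâm trạng của tôi lúc đó ra sao?", chat_history=history))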
clean_repo/src/Retrivel_And_Generation/__pycache__/Retrieval_And_Generator.cpython-311.pyc
ADDED
Binary file (31.8 kB).
clean_repo/src/rag_service/main.py
ADDED
@@ -0,0 +1,721 @@
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Dict, Any, Optional
import os
import sys
import uvicorn
from datetime import datetime
import json
import logging
from fastapi import Query

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Add paths for imports
current_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.dirname(current_dir)
sys.path.append(src_dir)
sys.path.append(os.path.join(src_dir, "Indexingstep"))
sys.path.append(os.path.join(src_dir, "Retrivel_And_Generation"))

# Import project modules
try:
    from Indexingstep.pipeline import DiaryIndexingPipeline
    from Retrivel_And_Generation.Retrieval_And_Generator import create_rag_system, DiaryRAGSystem
    RAG_MODULES_AVAILABLE = True
except ImportError as e:
    print(f"Warning: RAG modules not available: {e}")
    RAG_MODULES_AVAILABLE = False

# Configure logging
logging.basicConfig(filename="logs/service.log", level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="Personal Diary RAG Service",
    description="RAG service for personal diary chatbot with user isolation",
    version="1.0.0"
)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# In-memory cache for RAG systems
rag_systems_cache: Dict[int, DiaryRAGSystem] = {}

# ========================================
# PYDANTIC MODELS
# ========================================

class DiaryEntry(BaseModel):
    date: str
    content: str
    tags: str = ""

class IndexRequest(BaseModel):
    user_id: int
    clear_existing: bool = False
    start_date: Optional[str] = None
    end_date: Optional[str] = None

class QueryRequest(BaseModel):
    user_id: int
    query: str
    fast_mode: bool = False
    chat_history: List[Dict[str, str]] = []

class UserStatusResponse(BaseModel):
    user_id: int
    status: str
    document_count: int
    vector_db_path: str
    last_updated: Optional[str] = None
    error: Optional[str] = None

class QueryResponse(BaseModel):
    user_id: int
    response: str
    processing_time: float
    documents_used: int
    fast_mode: bool

class IndexResponse(BaseModel):
    user_id: int
    status: str
    documents_processed: int
    chunks_created: int
    vector_db_path: str
    processing_time: float
    error: Optional[str] = None

# ========================================
# HELPER FUNCTIONS
# ========================================

def format_error_message(errors) -> str:
    """Convert an error list to a string for the API response."""
    if isinstance(errors, list):
        return '; '.join(str(e) for e in errors)
    return str(errors) if errors else 'Unknown error'

def get_user_paths(user_id: int) -> Dict[str, str]:
    """Get all paths for a user."""
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    return {
        "vector_db_path": os.path.join(base_dir, "VectorDB", f"user_{user_id}_vector_db"),
        "diary_db_path": os.path.join(base_dir, "streamlit_app", "backend", f"user_{user_id}_diary.db"),
        "base_vector_path": os.path.join(base_dir, "VectorDB")
    }

def get_pipeline_config(user_id: int) -> Dict[str, Any]:
    """Get the configuration for DiaryIndexingPipeline."""
    paths = get_user_paths(user_id)

    return {
        "db_path": paths["diary_db_path"],
        "persist_directory": paths["vector_db_path"],
        "collection_name": f"user_{user_id}_diary_entries",
        "google_api_key": os.getenv("GOOGLE_API_KEY"),
        "chunk_size": 800,
        "chunk_overlap": 100,
        "batch_size": 50,
        "user_id": user_id
    }

def check_vector_db_exists(user_id: int) -> bool:
    """Check whether a vector database exists for the user."""
    paths = get_user_paths(user_id)
    return os.path.exists(paths["vector_db_path"])

def get_document_count(user_id: int) -> int:
    """Get the document count from the vector database."""
    try:
        if user_id in rag_systems_cache:
            return rag_systems_cache[user_id].get_document_count()

        if not check_vector_db_exists(user_id):
            return 0

        # Create a temporary RAG system to check the count
        paths = get_user_paths(user_id)
        temp_rag = create_rag_system(
            user_id=user_id,
            base_vector_path=paths["base_vector_path"],
            google_api_key=os.getenv("GOOGLE_API_KEY")
        )

        if temp_rag:
            return temp_rag.get_document_count()
        return 0

    except Exception as e:
        logger.error(f"Error getting document count for user {user_id}: {e}")
        return 0

def get_or_create_rag_system(user_id: int) -> DiaryRAGSystem:
    """Get the existing RAG system for a user or create a new one."""
    if user_id not in rag_systems_cache:
        if not check_vector_db_exists(user_id):
            raise HTTPException(
                status_code=404,
                detail=f"Vector database not found for user {user_id}. Please run indexing first."
            )

        paths = get_user_paths(user_id)
        rag_system = create_rag_system(
            user_id=user_id,
            base_vector_path=paths["base_vector_path"],
            google_api_key=os.getenv("GOOGLE_API_KEY")
        )

        if not rag_system:
            raise HTTPException(
                status_code=500,
                detail=f"Failed to create RAG system for user {user_id}"
            )

        rag_systems_cache[user_id] = rag_system
        logger.info(f"Created RAG system for user {user_id}")

    return rag_systems_cache[user_id]

# ========================================
# API ENDPOINTS
# ========================================

@app.get("/")
async def root():
    """Health check endpoint."""
    return {
        "message": "Personal Diary RAG Service is running",
        "version": "1.0.0",
        "cached_users": list(rag_systems_cache.keys()),
        "vector_db_base": os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "VectorDB")
    }

@app.get("/health")
async def health_check():
    """Detailed health check."""
    try:
        google_api_key = os.getenv("GOOGLE_API_KEY")
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        vector_db_base = os.path.join(base_dir, "VectorDB")

        return {
            "status": "healthy",
            "google_api_configured": bool(google_api_key),
            "vector_db_base_exists": os.path.exists(vector_db_base),
            "cached_users": list(rag_systems_cache.keys()),
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Health check failed: {str(e)}")

@app.get("/users/{user_id}/ai-availability")
async def check_ai_availability(user_id: int):
    """Check AI availability and provide detailed status for troubleshooting."""
    try:
        # Check all prerequisites for AI availability
        availability_info = {
            "user_id": user_id,
            "overall_status": "checking",
            "checks": {
                "rag_modules": {
                    "available": RAG_MODULES_AVAILABLE,
                    "status": "✅ Available" if RAG_MODULES_AVAILABLE else "❌ Not Available",
                    "details": "Required modules: DiaryIndexingPipeline, DiaryRAGSystem"
                },
                "google_api_key": {
                    "configured": bool(os.getenv("GOOGLE_API_KEY")),
                    "status": "✅ Configured" if os.getenv("GOOGLE_API_KEY") else "❌ Not Configured",
                    "details": "Required for embeddings and LLM responses"
                },
                "vector_database": {
                    "exists": check_vector_db_exists(user_id),
                    "status": "✅ Exists" if check_vector_db_exists(user_id) else "⚠️ Not Found",
                    "path": get_user_paths(user_id)["vector_db_path"]
                },
                "document_count": {
                    "count": get_document_count(user_id),
                    "status": "✅ Has Documents" if get_document_count(user_id) > 0 else "⚠️ Empty",
                    "details": f"{get_document_count(user_id)} documents indexed"
                }
            },
            "recommendations": [],
            "actions": []
        }

        # Determine the overall status and recommendations
        if not RAG_MODULES_AVAILABLE:
            availability_info["overall_status"] = "unavailable"
            availability_info["recommendations"].append("Install missing RAG modules")
            availability_info["actions"].append({
                "action": "check_imports",
                "description": "Verify DiaryIndexingPipeline and DiaryRAGSystem imports"
            })
        elif not os.getenv("GOOGLE_API_KEY"):
            availability_info["overall_status"] = "not_configured"
            availability_info["recommendations"].append("Configure Google API key")
            availability_info["actions"].append({
                "action": "set_api_key",
                "description": "Add GOOGLE_API_KEY to environment variables"
            })
        elif not check_vector_db_exists(user_id):
            availability_info["overall_status"] = "needs_indexing"
            availability_info["recommendations"].append("Create vector database for user")
            availability_info["actions"].append({
                "action": "initial_index",
                "endpoint": f"/users/{user_id}/auto-index-new-entry",
                "description": "Run initial indexing to create vector database"
            })
        elif get_document_count(user_id) == 0:
            availability_info["overall_status"] = "empty_database"
            availability_info["recommendations"].append("Add diary entries or rebuild index")
            availability_info["actions"].append({
                "action": "check_diary_entries",
                "description": "Verify user has diary entries in database"
            })
            availability_info["actions"].append({
                "action": "rebuild_index",
                "endpoint": f"/users/{user_id}/auto-index-new-entry",
                "description": "Rebuild vector database from existing entries"
            })
        else:
            availability_info["overall_status"] = "available"
            availability_info["recommendations"].append("AI is ready for use")
            availability_info["actions"].append({
                "action": "query_ready",
                "endpoint": f"/users/{user_id}/query",
                "description": "AI is ready to answer questions"
            })

        # Add cache status
        availability_info["cache_status"] = {
            "user_cached": user_id in rag_systems_cache,
            "total_cached_users": len(rag_systems_cache),
            "cached_users": list(rag_systems_cache.keys())
        }

        return availability_info

    except Exception as e:
        logger.error(f"Error checking AI availability for user {user_id}: {e}")
        return {
            "user_id": user_id,
            "overall_status": "error",
            "error": str(e),
            "recommendations": ["Check service logs for detailed error information"],
            "actions": [{
                "action": "check_logs",
                "description": "Review service logs for error details"
            }]
        }

@app.post("/users/{user_id}/fix-ai-availability")
async def fix_ai_availability(user_id: int):
    """Attempt to automatically fix AI availability issues."""
    try:
        if not RAG_MODULES_AVAILABLE:
            return {
                "status": "cannot_fix",
                "reason": "RAG modules not available - requires code/environment fix",
                "action_needed": "Install missing Python modules"
            }

        if not os.getenv("GOOGLE_API_KEY"):
            return {
                "status": "cannot_fix",
                "reason": "Google API key not configured",
                "action_needed": "Set GOOGLE_API_KEY environment variable"
            }

        # Try to fix vector database issues
        if not check_vector_db_exists(user_id) or get_document_count(user_id) == 0:
            logger.info(f"Attempting to fix AI availability for user {user_id}")

            # Clear the cache first
            if user_id in rag_systems_cache:
                del rag_systems_cache[user_id]

            # Create/rebuild the vector database
            config = get_pipeline_config(user_id)
            paths = get_user_paths(user_id)
            os.makedirs(os.path.dirname(paths["vector_db_path"]), exist_ok=True)

            pipeline = DiaryIndexingPipeline(**config)
            results = pipeline.run_full_pipeline(clear_existing=True)

            if results.get('status') == 'completed_successfully':
                doc_count = get_document_count(user_id)
                return {
                    "status": "fixed",
                    "action_taken": "Created/rebuilt vector database",
                    "documents_processed": results.get('documents_loaded', 0),
                    "chunks_created": results.get('chunks_created', 0),
                    "final_document_count": doc_count,
                    "ai_status": "ready" if doc_count > 0 else "empty"
                }
            else:
                return {
                    "status": "fix_failed",
                    "reason": "Failed to create vector database",
                    "error": format_error_message(results.get('errors', 'Unknown error'))
                }
        else:
            return {
                "status": "already_available",
                "message": "AI is already available for this user",
                "document_count": get_document_count(user_id)
            }

    except Exception as e:
        logger.error(f"Error fixing AI availability for user {user_id}: {e}")
        return {
            "status": "error",
            "error": str(e),
            "action_needed": "Check service logs and try manual troubleshooting"
        }

@app.get("/users/{user_id}/status", response_model=UserStatusResponse)
async def get_user_status(user_id: int):
    """Get the RAG system status for a user."""
    try:
        paths = get_user_paths(user_id)

        if not check_vector_db_exists(user_id):
            return UserStatusResponse(
                user_id=user_id,
                status="not_indexed",
                document_count=0,
                vector_db_path=paths["vector_db_path"]
            )

        doc_count = get_document_count(user_id)

        return UserStatusResponse(
            user_id=user_id,
            status="ready" if doc_count > 0 else "empty",
            document_count=doc_count,
            vector_db_path=paths["vector_db_path"],
            last_updated=datetime.now().isoformat()
        )

    except Exception as e:
        logger.error(f"Error getting status for user {user_id}: {e}")
        return UserStatusResponse(
            user_id=user_id,
            status="error",
            document_count=0,
            vector_db_path="",
            error=str(e)
        )

@app.post("/users/{user_id}/index", response_model=IndexResponse)
async def index_user_data(user_id: int, request: IndexRequest, background_tasks: BackgroundTasks):
    """Index diary entries for a user."""
    start_time = datetime.now()

    try:
        # Ensure the VectorDB directory exists
        paths = get_user_paths(user_id)
        os.makedirs(os.path.dirname(paths["vector_db_path"]), exist_ok=True)

        # Get the pipeline configuration
        config = get_pipeline_config(user_id)

        logger.info(f"Starting indexing for user {user_id} with config: {config}")

        # Create and run the pipeline
        pipeline = DiaryIndexingPipeline(**config)

        if request.start_date and request.end_date:
            # Date-range indexing
            results = pipeline.run_full_pipeline(
                start_date=request.start_date,
                end_date=request.end_date,
                clear_existing=request.clear_existing
            )
        else:
            # Full indexing
            results = pipeline.run_full_pipeline(clear_existing=request.clear_existing)

        processing_time = (datetime.now() - start_time).total_seconds()

        if results.get('status') == 'completed_successfully':
            # Clear the cache to force a reload
            if user_id in rag_systems_cache:
                del rag_systems_cache[user_id]

            return IndexResponse(
                user_id=user_id,
                status="success",
                documents_processed=results.get('documents_loaded', 0),
                chunks_created=results.get('chunks_created', 0),
                vector_db_path=paths["vector_db_path"],
                processing_time=processing_time
            )
        else:
            return IndexResponse(
                user_id=user_id,
                status="failed",
                documents_processed=0,
                chunks_created=0,
                vector_db_path=paths["vector_db_path"],
                processing_time=processing_time,
                error=format_error_message(results.get('errors', 'Unknown error'))
            )

    except Exception as e:
        processing_time = (datetime.now() - start_time).total_seconds()
        logger.error(f"Indexing error for user {user_id}: {e}")

        return IndexResponse(
            user_id=user_id,
            status="error",
            documents_processed=0,
            chunks_created=0,
            vector_db_path="",
            processing_time=processing_time,
            error=str(e)
        )

@app.post("/users/{user_id}/incremental-index")
async def incremental_index(user_id: int, start_date: Optional[str] = None):
    """Run incremental indexing for a user."""
    try:
        config = get_pipeline_config(user_id)
        pipeline = DiaryIndexingPipeline(**config)

        if start_date:
            results = pipeline.incremental_update(start_date)
        else:
            # Default to the last 7 days
            from datetime import timedelta
            default_start = (datetime.now() - timedelta(days=7)).strftime("%Y-%m-%d")
            results = pipeline.incremental_update(default_start)

        if results.get('status') == 'success':
            # Clear the cache to force a reload
            if user_id in rag_systems_cache:
                del rag_systems_cache[user_id]

            return {
                "user_id": user_id,
                "status": "success",
                "documents_added": results.get('documents_added', 0),
                "start_date": start_date or default_start
            }
        else:
            raise HTTPException(
                status_code=500,
                detail=f"Incremental indexing failed: {results.get('error', 'Unknown error')}"
            )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Incremental indexing error for user {user_id}: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/users/{user_id}/query", response_model=QueryResponse)
async def query_user_rag(
    user_id: int,
    query: str = Query(...),
    fast_mode: bool = Query(False),
    chat_history: str = Query("[]")
):
    """Query the RAG system for a user."""
    start_time = datetime.now()

    try:
        rag_system = get_or_create_rag_system(user_id)
        chat_history_list = json.loads(chat_history)
        if fast_mode:
            response = rag_system.generate_fast_response(query=query)
        else:
            response = rag_system.generate_contextual_response(
                query=query,
                chat_history=chat_history_list
            )
        processing_time = (datetime.now() - start_time).total_seconds()
        return QueryResponse(
            user_id=user_id,
            response=response,
            processing_time=processing_time,
            documents_used=5,
            fast_mode=fast_mode
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Query error for user {user_id}: {e}")
        raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")

@app.post("/users/{user_id}/auto-index-new-entry")
async def auto_index_new_entry(user_id: int):
    """Auto-index after saving a new diary entry. Creates the initial index if it does not exist."""
    try:
        if not RAG_MODULES_AVAILABLE:
            return {"status": "skipped", "reason": "RAG modules not available"}

        # Check whether the vector DB exists
        if not check_vector_db_exists(user_id):
            # First time - create a full index
            logger.info(f"Creating initial vector database for user {user_id}")

            config = get_pipeline_config(user_id)
            paths = get_user_paths(user_id)
            os.makedirs(os.path.dirname(paths["vector_db_path"]), exist_ok=True)

            pipeline = DiaryIndexingPipeline(**config)
            results = pipeline.run_full_pipeline(clear_existing=True)

            if results.get('status') == 'completed_successfully':
                # Clear the cache to force a reload
                if user_id in rag_systems_cache:
                    del rag_systems_cache[user_id]

                return {
                    "status": "initial_index_created",
                    "message": f"Created initial vector database for user {user_id}",
                    "documents_processed": results.get('documents_loaded', 0),
                    "chunks_created": results.get('chunks_created', 0)
                }
            else:
                return {
                    "status": "failed",
                    "error": format_error_message(results.get('errors', 'Unknown error'))
                }
        else:
            # Incremental update for an existing DB
            config = get_pipeline_config(user_id)
            pipeline = DiaryIndexingPipeline(**config)

            # Get recent entries (last 3 days, to catch the new ones)
            from datetime import timedelta
            start_date = (datetime.now() - timedelta(days=3)).strftime("%Y-%m-%d")
            results = pipeline.incremental_update(start_date)

            if results.get('status') == 'success':
                # Clear the cache to force a reload
                if user_id in rag_systems_cache:
                    del rag_systems_cache[user_id]

                documents_added = results.get('documents_added', 0)
                return {
                    "status": "incremental_update_success",
                    "message": f"Updated vector database for user {user_id}",
                    "documents_added": documents_added
                }
            else:
                # If the incremental update fails, try a full rebuild
                logger.warning(f"Incremental update failed for user {user_id}, trying full rebuild")
                results = pipeline.run_full_pipeline(clear_existing=True)

                if results.get('status') == 'completed_successfully':
                    if user_id in rag_systems_cache:
                        del rag_systems_cache[user_id]

                    return {
                        "status": "full_rebuild_success",
                        "message": f"Rebuilt vector database for user {user_id}",
                        "documents_processed": results.get('documents_loaded', 0)
                    }
                else:
                    return {
                        "status": "failed",
                        "error": f"Both incremental and full rebuild failed: {format_error_message(results.get('errors', 'Unknown error'))}"
                    }

    except Exception as e:
        logger.error(f"Auto-index error for user {user_id}: {e}")
        return {"status": "error", "error": str(e)}

@app.delete("/users/{user_id}/cache")
async def clear_user_cache(user_id: int):
    """Clear the RAG system cache for a user."""
    if user_id in rag_systems_cache:
        del rag_systems_cache[user_id]
        logger.info(f"Cleared cache for user {user_id}")
        return {"message": f"Cache cleared for user {user_id}"}
    else:
        return {"message": f"No cache found for user {user_id}"}

@app.delete("/users/{user_id}/vector-db")
async def delete_user_vector_db(user_id: int):
    """Delete the vector database for a user."""
    try:
        paths = get_user_paths(user_id)

        # Clear the cache first
        if user_id in rag_systems_cache:
            del rag_systems_cache[user_id]

        # Delete the vector database directory
        if os.path.exists(paths["vector_db_path"]):
            import shutil
            shutil.rmtree(paths["vector_db_path"])
            logger.info(f"Deleted vector database for user {user_id}")
            return {"message": f"Vector database deleted for user {user_id}"}
        else:
            return {"message": f"No vector database found for user {user_id}"}

    except Exception as e:
        logger.error(f"Error deleting vector database for user {user_id}: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/stats")
async def get_service_stats():
    """Get service statistics."""
    try:
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        vector_db_base = os.path.join(base_dir, "VectorDB")

        # Get the list of existing vector databases
        existing_dbs = []
        if os.path.exists(vector_db_base):
            for item in os.listdir(vector_db_base):
                if item.startswith("user_") and item.endswith("_vector_db"):
                    user_id = int(item.replace("user_", "").replace("_vector_db", ""))
                    doc_count = get_document_count(user_id)
                    existing_dbs.append({
                        "user_id": user_id,
                        "path": os.path.join(vector_db_base, item),
                        "document_count": doc_count
                    })

        return {
            "cached_users": list(rag_systems_cache.keys()),
            "total_cached_systems": len(rag_systems_cache),
            "existing_vector_databases": existing_dbs,
            "vector_db_base_path": vector_db_base,
            "service_status": "running"
        }

    except Exception as e:
        logger.error(f"Error getting stats: {e}")
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    # Ensure the VectorDB directory exists
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    vector_db_dir = os.path.join(base_dir, "VectorDB")
    os.makedirs(vector_db_dir, exist_ok=True)

    print("🚀 Starting RAG Service...")
    print(f"📁 Vector DB base path: {vector_db_dir}")
    print(f"🔑 Google API Key configured: {bool(os.getenv('GOOGLE_API_KEY'))}")

    uvicorn.run(app, host="127.0.0.1", port=8001, reload=False)
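For reference, a minimal client sketch for the service above (hypothetical, not part of the upload: it assumes the service is running on 127.0.0.1:8001 as in the `__main__` block and that the `requests` package is installed). Note that `chat_history` travels as a JSON-encoded query string, matching the `Query("[]")` parameter:

# Hypothetical client script, not shipped with the Space.
import json
import requests

resp = requests.get(
    "http://127.0.0.1:8001/users/1/query",
    params={
        "query": "Tôi đã viết gì về công việc tuần này?",
        "fast_mode": "false",
        "chat_history": json.dumps([]),  # no prior conversation
    },
)
# The response body matches the QueryResponse model defined above.
print(resp.json()["response"])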
clean_repo/src/simple_diary_chatbot.py
ADDED
@@ -0,0 +1,274 @@
+"""
+Simple Diary Chatbot (RAG Minimal Core)
+
+Required features (kept deliberately simple, but still a real RAG loop):
+1. add    -> Save the entry to SQLite + mandatory CHUNK + EMBEDDING + store vectors
+2. delete -> Remove the entry (DB + vectors by entry_id)
+3. chat   -> similarity search (k); generate an answer when an API key is available, otherwise return the raw context
+
+Embedding is MANDATORY (this is a RAG product). A clear error is raised if GOOGLE_API_KEY is missing.
+
+Minimal chunking: fixed-length slices (default 800 characters) with no overlap, to keep complexity down.
+
+This file replaces the earlier, more complex pipelines when all you need is basic RAG CRUD.
+"""
+
+from __future__ import annotations
+
+import os
+import sqlite3
+from dataclasses import dataclass
+from typing import List, Optional, Dict, Any
+from datetime import datetime
+import logging
+import asyncio
+
+# Fix event loop issue for Streamlit
+try:
+    import nest_asyncio
+    nest_asyncio.apply()
+except ImportError:
+    pass
+
+# Reuse the existing embedding layer (Chroma + Google Embedding)
+from Indexingstep.embedding_and_storing import DiaryEmbeddingAndStorage
+
+try:
+    import google.generativeai as genai  # type: ignore
+except Exception:  # pragma: no cover
+    genai = None
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("simple_diary")
+
+DB_PATH = os.path.join(os.getcwd(), "diary.db")  # A single DB file
+CHUNK_SIZE = 800  # Adjustable if needed
+
+
+def get_conn():
+    return sqlite3.connect(DB_PATH)
+
+
+def ensure_db():
+    conn = get_conn()
+    cur = conn.cursor()
+    cur.execute(
+        """
+        CREATE TABLE IF NOT EXISTS diary_entries (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            date TEXT NOT NULL,
+            content TEXT NOT NULL,
+            tags TEXT DEFAULT '',
+            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+        )
+        """
+    )
+    conn.commit()
+    conn.close()
+
+
+@dataclass
+class DiaryEntry:
+    id: int
+    date: str
+    content: str
+    tags: str
+    created_at: str
+
+
+class SimpleDiaryChatbot:
+    """Minimal RAG core – always requires a working embedding backend."""
+
+    def __init__(self, api_key: Optional[str] = None, user_id: int = 1, chunk_size: int = CHUNK_SIZE):
+        if api_key:
+            os.environ["GOOGLE_API_KEY"] = api_key
+        ensure_db()
+        self.user_id = user_id
+        self.chunk_size = chunk_size
+
+        key = os.getenv("GOOGLE_API_KEY")
+        if not key:
+            raise RuntimeError(
+                "GOOGLE_API_KEY is not set. In PowerShell: $env:GOOGLE_API_KEY='YOUR_KEY'"
+            )
+
+        # Fix event loop for Streamlit
+        try:
+            loop = asyncio.get_event_loop()
+        except RuntimeError:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+        # Initialize embedding + vector store (mandatory)
+        self.embedding_store = DiaryEmbeddingAndStorage(user_id=user_id, api_key=key)
+
+        # (Optional) LLM for natural-language answers – on failure, fall back to raw context
+        self._model = None
+        if genai:
+            try:
+                genai.configure(api_key=key)
+                self._model = genai.GenerativeModel("gemini-1.5-flash")
+            except Exception as e:
+                logger.warning(f"Could not initialize LLM (continuing retrieval-only): {e}")
+
+    # ------------- CRUD -------------
+    def _chunk(self, text: str) -> List[str]:
+        """Simple fixed-length chunking; backs off to the nearest whitespace when possible."""
+        if len(text) <= self.chunk_size:
+            return [text]
+        chunks: List[str] = []
+        start = 0
+        while start < len(text):
+            end = min(start + self.chunk_size, len(text))
+            # Back off to whitespace so words are not split mid-way
+            if end < len(text):
+                last_space = text.rfind(" ", start, end)
+                if last_space != -1 and last_space - start > self.chunk_size * 0.5:
+                    end = last_space
+            chunks.append(text[start:end].strip())
+            start = end
+        return [c for c in chunks if c]
+
+    def add_entry(self, date: str, content: str, tags: str = "") -> int:
+        conn = get_conn()
+        cur = conn.cursor()
+        cur.execute(
+            "INSERT INTO diary_entries(date, content, tags) VALUES (?, ?, ?)",
+            (date, content, tags),
+        )
+        entry_id = cur.lastrowid
+        conn.commit()
+        conn.close()
+
+        # Chunk + embed each chunk (shared entry-level metadata)
+        chunks = self._chunk(content)
+        metadatas = [
+            {"entry_id": entry_id, "date": date, "tags": tags, "chunk_index": i, "total_chunks": len(chunks)}
+            for i, _ in enumerate(chunks)
+        ]
+        self.embedding_store.embed_and_store_texts(chunks, metadatas)
+        # logger.info(f"Added entry {entry_id} with {len(chunks)} chunks")
+        return entry_id
+
+    def delete_entry(self, entry_id: int) -> bool:
+        # Delete vectors by metadata
+        try:
+            self.embedding_store.delete_documents_by_metadata({"entry_id": entry_id})
+        except Exception as e:
+            logger.warning(f"Failed to delete vectors for entry {entry_id}: {e}")
+
+        conn = get_conn()
+        cur = conn.cursor()
+        cur.execute("DELETE FROM diary_entries WHERE id = ?", (entry_id,))
+        deleted = cur.rowcount
+        conn.commit()
+        conn.close()
+        if deleted:
+            logger.info(f"Deleted entry {entry_id}")
+            return True
+        logger.warning(f"Entry {entry_id} not found")
+        return False
+
+    def list_entries(self, limit: int = 10) -> List[DiaryEntry]:
+        conn = get_conn()
+        cur = conn.cursor()
+        cur.execute(
+            "SELECT id, date, content, tags, created_at FROM diary_entries ORDER BY created_at DESC LIMIT ?",
+            (limit,),
+        )
+        rows = [DiaryEntry(*r) for r in cur.fetchall()]
+        conn.close()
+        return rows
+
+    # ------------- Chat -------------
+    def chat(self, question: str, k: int = 4) -> Dict[str, Any]:
+        """
+        Returns:
+        {
+            'answer': str,
+            'contexts': [ { 'snippet': ..., 'date': ..., 'entry_id': ... } ]
+        }
+        """
+        try:
+            results = self.embedding_store.similarity_search(question, k=k)
+        except Exception as e:
+            logger.warning(f"Similarity search failed: {e}")
+            results = []
+        contexts = []
+        for doc in results:
+            contexts.append(
+                {
+                    "snippet": doc.page_content[:300],
+                    "date": doc.metadata.get("date"),
+                    "entry_id": doc.metadata.get("entry_id"),
+                    "tags": doc.metadata.get("tags"),
+                }
+            )
+
+        if self._model and contexts:
+            context_text = "\n".join(
+                [f"[Entry {c['entry_id']} - {c['date']}] {c['snippet']}" for c in contexts]
+            )
+            prompt = (
+                "You are a helpful diary assistant. Use only the context below to answer.\n\n"
+                f"CONTEXT:\n{context_text}\n\nQUESTION: {question}\n\nAnswer in the same language as the question."
+            )
+            try:
+                resp = self._model.generate_content(prompt)
+                answer = resp.text.strip()
+            except Exception as e:
+                answer = f"(LLM error, showing raw context) -> {e}\n" + " | ".join(
+                    c["snippet"] for c in contexts
+                )
+        else:
+            answer = " | ".join(c["snippet"] for c in contexts) if contexts else "No relevant content found."
+
+        return {"answer": answer, "contexts": contexts}
+
+
+def _cli():  # Simple command line interface
+    import argparse
+    parser = argparse.ArgumentParser(description="Simple Diary Chatbot")
+    sub = parser.add_subparsers(dest="cmd")
+
+    p_add = sub.add_parser("add", help="Add a diary entry")
+    p_add.add_argument("--date", default=datetime.now().strftime("%Y-%m-%d"))
+    p_add.add_argument("--content", required=True)
+    p_add.add_argument("--tags", default="")
+
+    p_del = sub.add_parser("delete", help="Delete an entry by id")
+    p_del.add_argument("--id", type=int, required=True)
+
+    p_chat = sub.add_parser("chat", help="Ask a question")
+    p_chat.add_argument("--q", required=True, help="Question")
+    p_chat.add_argument("--k", type=int, default=4)
+
+    p_list = sub.add_parser("list", help="List recent entries")
+    p_list.add_argument("--limit", type=int, default=5)
+
+    args = parser.parse_args()
+    bot = SimpleDiaryChatbot(api_key=os.getenv("GOOGLE_API_KEY"))
+
+    if args.cmd == "add":
+        eid = bot.add_entry(args.date, args.content, args.tags)
+        print(f"Added entry id={eid}")
+    elif args.cmd == "delete":
+        ok = bot.delete_entry(args.id)
+        print("Deleted" if ok else "Not found")
+    elif args.cmd == "chat":
+        resp = bot.chat(args.q, k=args.k)
+        print("Answer:\n", resp["answer"])
+        print("\nContexts:")
+        for c in resp["contexts"]:
+            print(f"- ({c['entry_id']}) {c['date']} :: {c['snippet'][:80]}...")
+    elif args.cmd == "list":
+        entries = bot.list_entries(limit=args.limit)
+        for e in entries:
+            print(f"{e.id} | {e.date} | {e.tags} | {e.content[:60]}...")
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    _cli()
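As a quick sanity check of the add/chat/delete loop above, a minimal sketch (not part of this commit) — it assumes GOOGLE_API_KEY is set and that it runs from clean_repo/src so the Indexingstep import resolves:

import os
from simple_diary_chatbot import SimpleDiaryChatbot

# Constructor raises if no key is present: embedding is mandatory.
bot = SimpleDiaryChatbot(api_key=os.getenv("GOOGLE_API_KEY"))

# add: one SQLite row, then one embedded chunk per ~800-character slice
eid = bot.add_entry("2024-05-01", "Went hiking today. The weather was perfect.", tags="outdoors")

# chat: top-k similarity search over stored chunks; LLM answer if gemini-1.5-flash initialized
resp = bot.chat("What did I do on my last hike?", k=4)
print(resp["answer"])
for c in resp["contexts"]:
    print(c["entry_id"], c["date"], c["snippet"][:60])

# delete: drops the SQLite row and the vectors whose metadata carries this entry_id
bot.delete_entry(eid)

The same loop is exposed by the CLI defined at the bottom of the file, e.g. python simple_diary_chatbot.py chat --q "..." --k 4.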
clean_repo/src/streamlit_app.py
ADDED
@@ -0,0 +1,40 @@
+import altair as alt
+import numpy as np
+import pandas as pd
+import streamlit as st
+
+"""
+# Welcome to Streamlit!
+
+Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
+If you have any questions, check out our [documentation](https://docs.streamlit.io) and [community
+forums](https://discuss.streamlit.io).
+
+In the meantime, below is an example of what you can do with just a few lines of code:
+"""
+
+num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
+num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
+
+indices = np.linspace(0, 1, num_points)
+theta = 2 * np.pi * num_turns * indices
+radius = indices
+
+x = radius * np.cos(theta)
+y = radius * np.sin(theta)
+
+df = pd.DataFrame({
+    "x": x,
+    "y": y,
+    "idx": indices,
+    "rand": np.random.randn(num_points),
+})
+
+st.altair_chart(alt.Chart(df, height=700, width=700)
+    .mark_point(filled=True)
+    .encode(
+        x=alt.X("x", axis=None),
+        y=alt.Y("y", axis=None),
+        color=alt.Color("idx", legend=None, scale=alt.Scale()),
+        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
+    ))
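One non-obvious detail in this stock template: the bare triple-quoted string near the top is rendered by Streamlit's "magic" feature, which passes any bare expression on its own line to st.write and displays it as Markdown. A minimal explicit equivalent, for reference only:

import streamlit as st

# Explicit form of what the bare docstring does via Streamlit "magic".
st.markdown(
    "# Welcome to Streamlit!\n\n"
    "Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:."
)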
clean_repo/src/streamlit_app/__pycache__/auth_ui.cpython-311.pyc
ADDED
Binary file (17.9 kB)

clean_repo/src/streamlit_app/__pycache__/auto_sync.cpython-311.pyc
ADDED
Binary file (18.5 kB)

clean_repo/src/streamlit_app/__pycache__/rag_client.cpython-311.pyc
ADDED
Binary file (10.5 kB)

clean_repo/src/streamlit_app/__pycache__/user_auth.cpython-311.pyc
ADDED
Binary file (12.9 kB)