huytrao123 committed (verified)
Commit ced61cd · 1 parent: d780792

Upload 103 files

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .env +12 -0
  2. .gitattributes +6 -0
  3. .gitignore +124 -0
  4. Backend.md +65 -0
  5. Dockerfile +17 -0
  6. RAG-architecture.md +254 -0
  7. README.md +310 -11
  8. VectorDB/user_1_vector_db/chroma.sqlite3 +3 -0
  9. app.py +90 -0
  10. clean_repo/.env +12 -0
  11. clean_repo/.gitattributes +35 -0
  12. clean_repo/.gitignore +128 -0
  13. clean_repo/Backend.md +65 -0
  14. clean_repo/Dockerfile +17 -0
  15. clean_repo/RAG-architecture.md +254 -0
  16. clean_repo/README.md +310 -0
  17. clean_repo/VectorDB/user_1_vector_db/chroma.sqlite3 +3 -0
  18. clean_repo/app.py +90 -0
  19. clean_repo/clean_repo/.env +12 -0
  20. clean_repo/clean_repo/.gitattributes +35 -0
  21. clean_repo/clean_repo/.gitignore +120 -0
  22. clean_repo/clean_repo/Backend.md +65 -0
  23. clean_repo/clean_repo/Dockerfile +17 -0
  24. clean_repo/clean_repo/RAG-architecture.md +254 -0
  25. clean_repo/clean_repo/README.md +310 -0
  26. clean_repo/clean_repo/app.py +90 -0
  27. clean_repo/clean_repo/env_template.txt +51 -0
  28. clean_repo/clean_repo/requirements.txt +0 -0
  29. clean_repo/clean_repo/src/streamlit_app.py +40 -0
  30. clean_repo/env_template.txt +51 -0
  31. clean_repo/images/DIAGRAM-RAG-diary.png +0 -0
  32. clean_repo/notebook/RAG-test.ipynb +0 -0
  33. clean_repo/notebook/exploration.ipynb +0 -0
  34. clean_repo/requirements.txt +0 -0
  35. clean_repo/src/Indexingstep/Datasplitting.py +44 -0
  36. clean_repo/src/Indexingstep/database_utils.py +140 -0
  37. clean_repo/src/Indexingstep/dataloading.py +603 -0
  38. clean_repo/src/Indexingstep/diary_text_splitter.py +241 -0
  39. clean_repo/src/Indexingstep/embedding_and_storing.py +499 -0
  40. clean_repo/src/Indexingstep/indexing_pipeline.py +110 -0
  41. clean_repo/src/Indexingstep/pipeline.py +459 -0
  42. clean_repo/src/Retrivel_And_Generation/Retrieval_And_Generator.py +739 -0
  43. clean_repo/src/Retrivel_And_Generation/__pycache__/Retrieval_And_Generator.cpython-311.pyc +0 -0
  44. clean_repo/src/rag_service/main.py +721 -0
  45. clean_repo/src/simple_diary_chatbot.py +274 -0
  46. clean_repo/src/streamlit_app.py +40 -0
  47. clean_repo/src/streamlit_app/__pycache__/auth_ui.cpython-311.pyc +0 -0
  48. clean_repo/src/streamlit_app/__pycache__/auto_sync.cpython-311.pyc +0 -0
  49. clean_repo/src/streamlit_app/__pycache__/rag_client.cpython-311.pyc +0 -0
  50. clean_repo/src/streamlit_app/__pycache__/user_auth.cpython-311.pyc +0 -0
.env ADDED
@@ -0,0 +1,12 @@
+ # Google API Configuration for RAG System
+ GOOGLE_API_KEY=AIzaSyAZQN21CjLySEybT6vOYDCz4V_e85gD42k
+
+ # Database Configuration
+ DATABASE_PATH=./src/streamlit_app/backend/diary.db
+
+ # Vector Database Configuration
+ VECTOR_DB_PATH=./src/Indexingstep/diary_vector_db_enhanced
+ COLLECTION_NAME=diary_entries
+
+ # RAG Configuration
+ EMBEDDING_MODEL=models/embedding-001
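For orientation (not part of the uploaded files), a minimal sketch of how these settings could be loaded and used to open the ChromaDB store referenced by `VECTOR_DB_PATH` and `COLLECTION_NAME`; it assumes `python-dotenv` and `chromadb` are installed, and the helper name is illustrative.

```python
# Illustrative sketch only -- not one of the committed files.
import os

import chromadb
from dotenv import load_dotenv

load_dotenv()  # reads GOOGLE_API_KEY, VECTOR_DB_PATH, COLLECTION_NAME, ... from .env


def open_diary_collection():
    """Open (or create) the diary collection in the configured Chroma directory."""
    client = chromadb.PersistentClient(path=os.getenv("VECTOR_DB_PATH", "./VectorDB"))
    return client.get_or_create_collection(os.getenv("COLLECTION_NAME", "diary_entries"))
```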
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ clean_repo/src/streamlit_app/src/Indexingstep/user_3_vector_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+ clean_repo/src/streamlit_app/temp/recorded_audio.wav filter=lfs diff=lfs merge=lfs -text
+ clean_repo/VectorDB/user_1_vector_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+ src/streamlit_app/src/Indexingstep/user_3_vector_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+ src/streamlit_app/temp/recorded_audio.wav filter=lfs diff=lfs merge=lfs -text
+ VectorDB/user_1_vector_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,124 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ ./venv/
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # PyInstaller
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ # Vector databases & generated files
+ /src/Indexingstep/src/Indexingstep/diary_vector_db_enhanced/
+ *.db
+ *.sqlite3
+ *.bin
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Project specific
+ data/
+ models/
+ logs/
+ *.pkl
+ *.model
Backend.md ADDED
@@ -0,0 +1,65 @@
+ # Backend Architecture - Personal Diary Chatbot
+
+ ## Backend Overview
+
+ The project's backend is built on FastAPI and exposes a RESTful API for diary processing, search, and interaction with the RAG chatbot. The system follows a microservices-style architecture designed to scale.
+
+ ## 🏛️ Overall Architecture
+
+ ```
+ ┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+ │    Frontend     │    │   API Gateway   │    │  Core Services  │
+ │   (Streamlit)   │◄──►│    (FastAPI)    │◄──►│   (RAG Engine)  │
+ └─────────────────┘    └─────────────────┘    └─────────────────┘
+
+
+                        ┌─────────────────┐
+                        │   Data Layer    │
+                        │   (Vector DB)   │
+                        └─────────────────┘
+ ```
+
+ ## 🔧 Backend Directory Structure
+
+ ```
+ src/
+ ├── rag_service/                 # FastAPI service
+ │   ├── main.py                  # Main application entry point
+ │   ├── __init__.py
+ │   └── __pycache__/
+ ├── Indexingstep/                # Data processing pipeline
+ │   ├── pipeline.py              # Main indexing pipeline
+ │   ├── dataloading.py           # Document loading utilities
+ │   ├── diary_text_splitter.py   # Text chunking logic
+ │   ├── embedding_and_storing.py # Vector embedding & storage
+ │   ├── database_utils.py        # Database operations
+ │   └── indexing_pipeline.py     # Pipeline orchestration
+ ├── Retrivel_And_Generation/     # RAG core engine
+ │   ├── Retrieval_And_Generator.py # Main RAG system
+ │   └── __init__.py
+ ├── VectorDB/                    # Vector database storage
+ └── streamlit_app/               # Frontend application
+     ├── backend/                 # Backend utilities for UI
+     ├── user_auth.py             # Authentication system
+     ├── rag_client.py            # RAG service client
+     └── interface.py             # Main UI interface
+ ```
+ ## 🔮 Future Enhancements
+
+ ### 1. Microservices Architecture
+ - **User Service**: Dedicated user management
+ - **Document Service**: Document processing pipeline
+ - **Search Service**: Vector search optimization
+ - **Chat Service**: Conversation management
+
+ ### 2. Advanced Features
+ - **Real-time synchronization**: WebSocket support
+ - **Multi-language support**: Internationalization
+ - **Advanced analytics**: User behavior tracking
+ - **Machine learning**: Continuous model improvement
+
+ ### 3. Infrastructure Improvements
+ - **Kubernetes deployment**: Container orchestration
+ - **Service mesh**: Istio integration
+ - **Observability**: Distributed tracing
+ - **Auto-scaling**: Dynamic resource allocation
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ # Use the official Python image
+ FROM python:3.10-slim
+
+ # Set the working directory inside the container
+ WORKDIR /app
+
+ # Copy requirements.txt into the container (if present)
+ COPY requirements.txt .
+
+ # Install dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the entire source tree into the container
+ COPY . .
+
+ # Run the application (replace app.py with your main entry point if needed)
+ CMD ["python", "app.py"]
RAG-architecture.md ADDED
@@ -0,0 +1,254 @@
+ # RAG Architecture - Personal Diary Chatbot
+
+ ## 🏗️ RAG Architecture Overview
+
+ The RAG (Retrieval-Augmented Generation) architecture in this project is designed to provide intelligent search and question answering over a user's personal diary data.
+
+ ## 🔄 RAG Processing Flow
+
+ ```
+ ┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+ │   Input Query   │───►│     Query       │───►│     Vector      │
+ │ (User Question) │    │   Processing    │    │     Search      │
+ └─────────────────┘    └─────────────────┘    └─────────────────┘
+
+
+ ┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+ │  Final Answer   │◄───│     Answer      │◄───│    Context      │
+ │   (Response)    │    │   Generation    │    │   Retrieval     │
+ └─────────────────┘    └─────────────────┘    └─────────────────┘
+ ```
+
+ ## 📊 Component Details
+
+ ### 1. Data Ingestion & Indexing
+
+ #### 1.1 Document Loading
+ - **Input formats**: PDF, DOCX, TXT
+ - **Processing**: Text extraction, cleaning, normalization
+ - **Output**: Structured text data
+
+ #### 1.2 Text Chunking
+ ```python
+ # Chunking strategy
+ chunk_size = 1000      # characters
+ chunk_overlap = 200    # characters
+ chunking_method = "recursive_character_splitter"
+ ```
+
+ #### 1.3 Embedding Generation
+ - **Model**: Google Universal Sentence Encoder (USE)
+ - **Vector dimension**: 512
+ - **Normalization**: L2 normalization
+ - **Storage**: ChromaDB vector database
+
+ ### 2. Vector Database Architecture
+
+ #### 2.1 ChromaDB Configuration
+ ```python
+ # Database settings
+ collection_name = f"user_{user_id}_diary"
+ metadata = {
+     "user_id": user_id,
+     "source": "diary_entry",
+     "date": entry_date,
+     "chunk_id": chunk_id
+ }
+ ```
+
+ #### 2.2 Index Structure
+ - **Primary key**: `user_id + chunk_id`
+ - **Vector index**: HNSW (Hierarchical Navigable Small World)
+ - **Distance metric**: Cosine similarity
+ - **Sharding**: Per-user collections
+
+ ### 3. Retrieval Engine
+
+ #### 3.1 Query Processing
+ ```python
+ # Query preprocessing
+ def process_query(query: str):
+     # 1. Text cleaning
+     # 2. Stop word removal
+     # 3. Lemmatization
+     # 4. Query expansion
+     return processed_query
+ ```
+
+ #### 3.2 Vector Search
+ - **Search algorithm**: K-Nearest Neighbors (KNN)
+ - **Top-k results**: 5-10 most relevant chunks
+ - **Similarity threshold**: 0.7 (cosine similarity)
+ - **Reranking**: Semantic relevance scoring
+
+ #### 3.3 Context Assembly
+ ```python
+ # Context building
+ def build_context(retrieved_chunks, query):
+     # 1. Sort by relevance score
+     # 2. Remove duplicates
+     # 3. Truncate to token limit
+     # 4. Add metadata context
+     return final_context
+ ```
+
+ ### 4. Generation Engine
+
+ #### 4.1 LLM Integration
+ - **Primary model**: OpenAI GPT-3.5/4
+ - **Fallback model**: Local model (if needed)
+ - **Temperature**: 0.7 (balanced creativity)
+ - **Max tokens**: 500 (response length)
+
+ #### 4.2 Prompt Engineering
+ ```python
+ # System prompt template
+ SYSTEM_PROMPT = """
+ You are a helpful AI assistant that answers questions about personal diary entries.
+ Use only the provided context to answer questions.
+ If the information is not in the context, say so.
+ Be conversational but professional.
+ """
+ ```
+
+ #### 4.3 Response Generation
+ ```python
+ # Generation pipeline
+ def generate_response(query, context, chat_history):
+     # 1. Build prompt with context
+     # 2. Call LLM API
+     # 3. Post-process response
+     # 4. Validate against context
+     # 5. Return final answer
+ ```
+
+ ## 🔧 Technical Configuration
+
+ ### Performance Tuning
+
+ #### 1. Chunking Optimization
+ - **Optimal chunk size**: 1000 characters
+ - **Overlap ratio**: 20%
+ - **Chunking strategy**: Recursive character splitter
+
+ #### 2. Vector Search Optimization
+ - **Index type**: HNSW
+ - **Search parameters**:
+   - `ef_construction`: 200
+   - `ef_search`: 100
+   - `m`: 16
+
+ #### 3. Caching Strategy
+ - **Query cache**: Redis (in-memory)
+ - **Embedding cache**: Local file cache
+ - **Response cache**: TTL-based expiration
+
+ ### Scalability Features
+
+ #### 1. Multi-User Support
+ - **User isolation**: Separate vector collections
+ - **Resource management**: Per-user memory limits
+ - **Concurrent access**: Async processing
+
+ #### 2. Horizontal Scaling
+ - **Load balancing**: Multiple RAG instances
+ - **Database sharding**: User-based distribution
+ - **Microservices**: Modular architecture
+
+ ## 📈 Monitoring & Analytics
+
+ ### 1. Performance Metrics
+ - **Query latency**: < 2 seconds
+ - **Retrieval accuracy**: > 85%
+ - **Generation quality**: User satisfaction score
+ - **System throughput**: Queries per second
+
+ ### 2. Quality Assurance
+ - **Context relevance**: Similarity score tracking
+ - **Answer accuracy**: Human evaluation
+ - **User feedback**: Rating system
+ - **A/B testing**: Model comparison
+
+ ## 🚀 Deployment Architecture
+
+ ### 1. Development Environment
+ ```
+ ┌─────────────────┐    ┌─────────────────┐
+ │  Local Python   │    │     Local       │
+ │  Environment    │◄──►│    ChromaDB     │
+ └─────────────────┘    └─────────────────┘
+ ```
+
+ ### 2. Production Environment
+ ```
+ ┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+ │  Load Balancer  │    │   RAG Service   │    │    Vector DB    │
+ │     (Nginx)     │◄──►│    (FastAPI)    │◄──►│    (ChromaDB)   │
+ └─────────────────┘    └─────────────────┘    └─────────────────┘
+
+
+                        ┌─────────────────┐
+                        │   Redis Cache   │
+                        └─────────────────┘
+ ```
+
+ ## 🔒 Security & Privacy
+
+ ### 1. Data Protection
+ - **User isolation**: Strict separation of data
+ - **Encryption**: At-rest and in-transit
+ - **Access control**: Role-based permissions
+ - **Audit logging**: Complete access history
+
+ ### 2. Privacy Compliance
+ - **GDPR compliance**: Data portability
+ - **Data retention**: Configurable policies
+ - **User consent**: Explicit permission management
+ - **Data anonymization**: Optional features
+
+ ## 🧪 Testing Strategy
+
+ ### 1. Unit Testing
+ - **Component testing**: Individual modules
+ - **Mock testing**: External API simulation
+ - **Coverage target**: > 90%
+
+ ### 2. Integration Testing
+ - **End-to-end testing**: Complete RAG pipeline
+ - **Performance testing**: Load and stress tests
+ - **Security testing**: Vulnerability assessment
+
+
+ ## 📚 Best Practices
+
+ ### 1. Model Selection
+ - **Embedding models**: Domain-specific fine-tuning
+ - **LLM selection**: Cost-performance balance
+ - **Fallback strategies**: Graceful degradation
+
+ ### 2. Data Quality
+ - **Input validation**: Strict data checking
+ - **Cleaning pipeline**: Automated preprocessing
+ - **Quality metrics**: Continuous monitoring
+
+ ### 3. Error Handling
+ - **Graceful failures**: User-friendly error messages
+ - **Retry mechanisms**: Automatic recovery
+ - **Logging**: Comprehensive error tracking
+
+ ## 🔮 Future Enhancements
+
+ ### 1. Advanced Features
+ - **Multi-modal RAG**: Image and text processing
+ - **Temporal reasoning**: Time-based queries
+ - **Emotional analysis**: Sentiment-aware responses
+
+ ### 2. Performance Improvements
+ - **Vector quantization**: Reduced memory usage
+ - **Approximate search**: Faster retrieval
+ - **Model distillation**: Smaller, faster models
+
+ ### 3. Integration Capabilities
+ - **API ecosystem**: Third-party integrations
+ - **Mobile applications**: Native mobile support
+ - **Voice interface**: Speech-to-text integration
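To make the indexing flow described in RAG-architecture.md concrete, here is a brief, hypothetical sketch (not part of the commit) of the chunk-and-embed step, assuming `langchain-text-splitters`, `langchain-google-genai`, and `chromadb`; the function name and storage path are illustrative, while the parameters follow the document's stated settings (1000-character chunks, 200 overlap, `models/embedding-001`).

```python
# Hypothetical illustration of the indexing step described above -- not an uploaded file.
import chromadb
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


def index_diary_entry(text: str, user_id: int, entry_date: str) -> None:
    # Split the entry using the chunk size / overlap quoted in RAG-architecture.md.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(text)

    # Embed with the Google embedding model configured in .env (models/embedding-001).
    embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vectors = embedder.embed_documents(chunks)

    # Store per-user, per-chunk records in a ChromaDB collection, mirroring the
    # metadata fields listed in section 2.1.
    client = chromadb.PersistentClient(path="./VectorDB")
    collection = client.get_or_create_collection(f"user_{user_id}_diary")
    collection.add(
        ids=[f"{user_id}_{entry_date}_{i}" for i in range(len(chunks))],
        documents=chunks,
        embeddings=vectors,
        metadatas=[
            {"user_id": user_id, "source": "diary_entry", "date": entry_date, "chunk_id": i}
            for i in range(len(chunks))
        ],
    )
```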
README.md CHANGED
@@ -1,11 +1,310 @@
- ---
- title: Diary Chatbot
- emoji: 🐠
- colorFrom: green
- colorTo: blue
- sdk: docker
- pinned: false
- short_description: An Diary chat bot with AI power using RAG technique.
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # RAG Personal Diary Chatbot
+
+ ## 📖 Project Description
+
+ RAG Personal Diary Chatbot is an intelligent chatbot application that uses RAG (Retrieval-Augmented Generation) architecture to interact with users' personal diaries. The application allows users to ask questions about diary content and receive accurate answers based on actual data.
+
+ ## ✨ Key Features
+
+
+ ## 🏗️ System Architecture
+
+ ```
+ ┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+ │  Streamlit UI   │    │    FastAPI      │    │     Vector      │
+ │   (Frontend)    │◄──►│    Backend      │◄──►│    Database     │
+ └─────────────────┘    └─────────────────┘    └─────────────────┘
+
+
+                        ┌─────────────────┐
+                        │   RAG Engine    │
+                        │    (LLM +       │
+                        │   Retrieval)    │
+                        └─────────────────┘
+ ```
+
+ ## 🚀 Installation and Setup
+
+ ### System Requirements
+
+ ### Install Dependencies
+ ```bash
+ # Create virtual environment
+ python -m venv .venv
+
+ # Activate virtual environment
+ # Windows
+ .venv\Scripts\activate
+ # Linux/Mac
+ source .venv/bin/activate
+
+ # Install packages
+ pip install -r requirements.txt
+ ```
+
+ ### Environment Configuration
+
+ Create a `.env` file in the project root directory with the following structure:
+
+ ```env
+ # API Keys
+ OPENAI_API_KEY=your_openai_api_key_here
+ GOOGLE_API_KEY=your_google_api_key_here
+ ANTHROPIC_API_KEY=your_anthropic_api_key_here
+
+ # Database Configuration
+ DATABASE_URL=sqlite:///./user_database/auth.db
+ VECTOR_DB_PATH=./VectorDB
+
+ # Model Configuration
+ EMBEDDING_MODEL=google-universal-sentence-encoder
+ LLM_MODEL=gpt-3.5-turbo
+ CHUNK_SIZE=1000
+ CHUNK_OVERLAP=200
+
+ # Server Configuration
+ RAG_SERVICE_PORT=8001
+ STREAMLIT_PORT=8501
+ FASTAPI_PORT=8000
+
+ # Security
+ SECRET_KEY=your_secret_key_here
+ JWT_SECRET=your_jwt_secret_here
+
+ # Logging
+ LOG_LEVEL=INFO
+ LOG_FILE=./logs/app.log
+
+ # Vector Database
+ CHROMA_DB_PATH=./VectorDB
+ PERSIST_DIRECTORY=./VectorDB
+
+ # File Processing
+ SUPPORTED_FORMATS=pdf,docx,txt,md
+ MAX_FILE_SIZE=10485760
+ TEMP_DIR=./temp
+
+ # RAG Configuration
+ TOP_K_RESULTS=5
+ SIMILARITY_THRESHOLD=0.7
+ MAX_TOKENS=4096
+ TEMPERATURE=0.7
+ ```
+
+ **Important Notes:**
+
+ ### Run the Application
+
+ #### 1. Start RAG Service
+ ```bash
+ python start_rag_service.py
+ ```
+ Service will run at: http://127.0.0.1:8001
+
+ #### 2. Start Streamlit UI
+ ```bash
+ cd src/streamlit_app
+ streamlit run interface.py
+ ```
+ UI will run at: http://localhost:8501
+
+ ## 📁 Directory Structure
+
+ ```
+ RAG-Personal-Diary-Chatbot/
+ ├── src/
+ │   ├── Indexingstep/              # Data indexing pipeline
+ │   ├── Retrivel_And_Generation/   # RAG engine
+ │   ├── rag_service/               # FastAPI backend
+ │   ├── streamlit_app/             # User interface
+ │   └── VectorDB/                  # Vector database
+ ├── notebook/                      # Jupyter notebooks
+ ├── tests/                         # Unit tests
+ ├── images/                        # Documentation images
+ ├── start_rag_service.py           # Service startup script
+ ├── .env                           # Environment variables (create from template)
+ ├── env_template.txt               # Environment variables template
+ └── README.md
+ ```
+
+ ## 🔧 Configuration
+
+ ### Vector Database
+
+ ### AI Models
+
+ ## 📊 Performance
+
+
+ ## 🧪 Testing
+
+ ```bash
+ # Run all tests
+ python -m pytest tests/
+
+ # Run specific test
+ python -m pytest tests/test_rag_system.py
+ ```
+
+ ## 🤝 Contributing
+
+ 1. Fork the project
+ 2. Create feature branch (`git checkout -b feature/AmazingFeature`)
+ 3. Commit changes (`git commit -m 'Add some AmazingFeature'`)
+ 4. Push to branch (`git push origin feature/AmazingFeature`)
+ 5. Open Pull Request
+
+ ## 📝 License
+
+ This project is distributed under the MIT License. See the `LICENSE` file for more details.
+
+ ## 📞 Contact
+
+
+ ## 🙏 Acknowledgments
+
+
+ ## 📖 Project Description
+
+ RAG Personal Diary Chatbot is an intelligent chatbot application that leverages Retrieval-Augmented Generation (RAG) architecture to interact with users' personal diaries. Users can ask questions about their diary content and receive accurate, context-based answers.
+
+ ## ✨ Key Features
+
+ - **Diary Indexing**: Automatically processes and indexes diary files (PDF, DOCX, TXT)
+ - **Semantic Search**: Uses a vector database for semantic search
+ - **AI Chatbot**: Natural interaction with diary data
+ - **User Isolation**: Each user has a separate vector database
+ - **Web Interface**: Easy-to-use Streamlit UI
+ - **REST API**: FastAPI backend for integration
+
+ ## 🏗️ System Architecture
+
+ ```
+ ┌───────────────┐    ┌───────────────┐    ┌───────────────┐
+ │  Streamlit UI │◄──►│    FastAPI    │◄──►│   Vector DB   │
+ │   (Frontend)  │    │    Backend    │    │   (ChromaDB)  │
+ └───────────────┘    └───────────────┘    └───────────────┘
+
+
+                      ┌───────────────┐
+                      │   RAG Engine  │
+                      │    (LLM +     │
+                      │   Retrieval)  │
+                      └───────────────┘
+ ```
+
+ ## 🚀 Installation and Setup
+
+ ### System Requirements
+
+ - Python 3.8+
+
+ ### Install Dependencies
+
+ ```bash
+ # Create virtual environment
+ python -m venv .venv
+
+ # Activate virtual environment
+ # Windows
+ .venv\Scripts\activate
+ # Linux/Mac
+ source .venv/bin/activate
+
+ # Install packages
+ pip install -r requirements.txt
+ ```
+
+ ### Environment Configuration
+
+ Create a `.env` file in the project root directory with the following structure:
+
+ ```env
+ # Google API Configuration for RAG System
+ GOOGLE_API_KEY=[Google API key]
+
+ # Database Configuration
+ DATABASE_PATH=./src/streamlit_app/backend/diary.db
+
+ # Vector Database Configuration
+ VECTOR_DB_PATH=./src/Indexingstep/diary_vector_db_enhanced
+ COLLECTION_NAME=diary_entries
+
+ # RAG Configuration
+ EMBEDDING_MODEL=models/embedding-001
+ CHAT_MODEL=gemini-2.5-flash
+ ```
+
+ **Important Notes:**
+ - Replace all placeholder values with your actual API keys and configuration
+ - Keep your `.env` file secure and never commit it to version control
+ - The `.env` file is already included in `.gitignore`
+ - Use `env_template.txt` as a reference to create your `.env` file
+
+ ### Run the Application
+
+ ```bash
+ # Start the RAG backend service
+ python start_rag_service.py
+
+ # Start the Streamlit UI
+ streamlit run src/streamlit_app/interface.py
+ ```
+
+ ## 📁 Directory Structure
+
+ ```
+ RAG-Personal-Diary-Chatbot/
+ ├── src/
+ │   ├── Indexingstep/              # Data indexing pipeline
+ │   ├── Retrivel_And_Generation/   # RAG engine
+ │   ├── rag_service/               # FastAPI backend
+ │   ├── streamlit_app/             # User interface
+ │   └── VectorDB/                  # Vector database
+ ├── notebook/                      # Jupyter notebooks
+ ├── tests/                         # Unit tests
+ ├── images/                        # Documentation images
+ ├── start_rag_service.py           # Service startup script
+ ├── .env                           # Environment variables (create from template)
+ ├── env_template.txt               # Environment variables template
+ └── README.md
+ ```
+
+ ## 🔧 Configuration
+
+ ### Vector Database
+ - **ChromaDB**: Main database for vector embeddings
+ - **Chunk size**: 1000 characters (customizable)
+ - **Overlap**: 200 characters between chunks
+
+ ### AI Models
+ - **Embedding**: Google's Universal Sentence Encoder
+ - **LLM**: Google Gemini (can be replaced with other models)
+
+ ## 📊 Performance
+
+ - **Processing time**: ~2-5 seconds per question
+ - **Accuracy**: 85-95% depending on data quality
+ - **Scalability**: Supports thousands of diaries
+
+
+ ## 🤝 Contributing
+
+ 1. Fork the project
+ 2. Create a feature branch (`git checkout -b feature/AmazingFeature`)
+ 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
+ 4. Push to the branch (`git push origin feature/AmazingFeature`)
+ 5. Open a Pull Request
+ ## 📞 Contact
+
+ - **Author**: [DongAnh]
+ - **Email**: [donganhng098@gmail.com]
+ - **GitHub**: [github.com/DongAnh]
+
+ ## 🙏 Acknowledgments
+
+ - Gemini for GPT models
+ - Google for Universal Sentence Encoder
+ - ChromaDB team for vector database
+ - FastAPI and Streamlit communities
+ - RAG architecture
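As a usage illustration (not one of the uploaded files), a client could call the FastAPI backend this README describes roughly as follows; the `/query` endpoint name and payload fields are assumptions, since the actual routes live in `src/rag_service/main.py`, which is not shown in this 50-file view.

```python
# Hypothetical client call -- endpoint name and fields are assumptions.
import requests


def ask_diary(question: str, user_id: int, base_url: str = "http://127.0.0.1:8001") -> str:
    """Send a question to the (assumed) RAG query endpoint and return the answer text."""
    response = requests.post(
        f"{base_url}/query",
        json={"user_id": user_id, "question": question, "top_k": 5},
        timeout=30,
    )
    response.raise_for_status()
    return response.json().get("answer", "")


if __name__ == "__main__":
    print(ask_diary("What did I write about last weekend?", user_id=1))
```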
VectorDB/user_1_vector_db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e895be71c0752f44743806cd7439624649d5e41500865e79c77e07c6d1dca87
+ size 163840
app.py ADDED
@@ -0,0 +1,90 @@
+ #!/usr/bin/env python3
+ """
+ Start RAG Service for Personal Diary Chatbot
+ """
+ import subprocess
+ import sys
+ import os
+ from pathlib import Path
+ import time
+
+ def check_requirements():
+     """Check if required packages are installed."""
+     required_packages = ['fastapi', 'uvicorn']
+     missing_packages = []
+
+     for package in required_packages:
+         try:
+             __import__(package)
+         except ImportError:
+             missing_packages.append(package)
+
+     if missing_packages:
+         print(f"❌ Missing packages: {', '.join(missing_packages)}")
+         print(f"Install with: pip install {' '.join(missing_packages)}")
+         return False
+
+     return True
+
+ def setup_environment():
+     """Setup environment and directories."""
+     # Ensure VectorDB directory exists
+     vector_db_dir = Path("src/VectorDB")
+     vector_db_dir.mkdir(parents=True, exist_ok=True)
+     print(f"📁 Vector DB directory: {vector_db_dir.absolute()}")
+
+     # Check for .env file
+     env_file = Path("src/Indexingstep/.env")
+     if env_file.exists():
+         print(f"✅ Environment file found: {env_file}")
+     else:
+         print(f"⚠️ Environment file not found: {env_file}")
+         print("Make sure GOOGLE_API_KEY is set in environment")
+
+ def start_service():
+     """Start the RAG FastAPI service."""
+     if not check_requirements():
+         return
+
+     setup_environment()
+
+     service_file = Path("src/rag_service/main.py")
+
+     if not service_file.exists():
+         print(f"❌ Service file not found: {service_file}")
+         print("Please create the RAG service file first")
+         return
+
+     print("🚀 Starting RAG Service...")
+     print("📍 Service URL: http://0.0.0.0:8001")
+     print("📖 API Docs: http://0.0.0.0:8001/docs")
+     print("💾 Vector databases will be stored in: src/VectorDB/")
+     print("\nPress Ctrl+C to stop the service")
+     print("-" * 50)
+
+     try:
+         # Change to project root directory
+         os.chdir(Path(__file__).parent)
+
+         # Start the service in the background
+         process = subprocess.Popen([
+             sys.executable, "-m", "uvicorn",
+             "src.rag_service.main:app",
+             "--host", "0.0.0.0",
+             "--port", "8001",
+             "--reload"
+         ])
+         print(f"🔄 RAG Service running in background (PID: {process.pid})")
+         return process
+     except Exception as e:
+         print(f"❌ Error starting service: {e}")
+         return None
+
+ def start_streamlit():
+     # Start Streamlit UI on port 7860 (default for Spaces)
+     os.system("streamlit run src/streamlit_app/interface.py --server.port 7860")
+
+ if __name__ == "__main__":
+     start_service()
+     time.sleep(3)
+     start_streamlit()
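A note on the launch flow in app.py: `start_service()` launches uvicorn as a background subprocess and returns the `Popen` handle, but the `__main__` block discards it, so the FastAPI child keeps running if the Streamlit UI exits. A small, hypothetical variant that keeps the handle and cleans up could look like this (sketch only, reusing the file's existing helpers, not part of the commit):

```python
# Hypothetical cleanup wrapper around the existing helpers -- illustrative only.
import time

if __name__ == "__main__":
    rag_process = start_service()   # returns subprocess.Popen or None
    time.sleep(3)                   # give uvicorn a moment to bind port 8001
    try:
        start_streamlit()           # blocks until the Streamlit UI exits
    finally:
        if rag_process is not None:
            rag_process.terminate() # stop the background RAG service
            rag_process.wait(timeout=10)
```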
clean_repo/.env ADDED
@@ -0,0 +1,12 @@
(content identical to the root-level .env above)
clean_repo/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
clean_repo/.gitignore ADDED
@@ -0,0 +1,128 @@
(same rules as the root-level .gitignore above, except the "# Vector databases & generated files" block appears at the end and additionally ignores /VectorDB/ and /src/VectorDB/)
clean_repo/Backend.md ADDED
@@ -0,0 +1,65 @@
(content identical to the root-level Backend.md above)
clean_repo/Dockerfile ADDED
@@ -0,0 +1,17 @@
(content identical to the root-level Dockerfile above)
clean_repo/RAG-architecture.md ADDED
@@ -0,0 +1,254 @@
(content identical to the root-level RAG-architecture.md above)
clean_repo/README.md ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG Personal Diary Chatbot
2
+
3
+ ## 📖 Project Description
4
+
5
+ RAG Personal Diary Chatbot is an intelligent chatbot application that uses RAG (Retrieval-Augmented Generation) architecture to interact with users' personal diaries. The application allows users to ask questions about diary content and receive accurate answers based on actual data.
6
+
7
+ ## ✨ Key Features
8
+
9
+
10
+ ## 🏗️ System Architecture
11
+
12
+ ```
13
+ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
14
+ │ Streamlit UI │ │ FastAPI │ │ Vector │
15
+ │ (Frontend) │◄──►│ Backend │◄──►│ Database │
16
+ └─────────────────┘ └─────────────────┘ └─────────────────┘
17
+
18
+
19
+ ┌─────────────────┐
20
+ │ RAG Engine │
21
+ │ (LLM + │
22
+ │ Retrieval) │
23
+ └─────────────────┘
24
+ ```
25
+
26
+ ## 🚀 Installation and Setup
27
+
28
+ ### System Requirements
29
+
30
+ ### Install Dependencies
31
+ ```bash
32
+ # Create virtual environment
33
+ python -m venv .venv
34
+
35
+ # Activate virtual environment
36
+ # Windows
37
+ .venv\Scripts\activate
38
+ # Linux/Mac
39
+ source .venv/bin/activate
40
+
41
+ # Install packages
42
+ pip install -r requirements.txt
43
+ ```
44
+
45
+ ### Environment Configuration
46
+
47
+ Create a `.env` file in the project root directory with the following structure:
48
+
49
+ ```env
50
+ # API Keys
51
+ OPENAI_API_KEY=your_openai_api_key_here
52
+ GOOGLE_API_KEY=your_google_api_key_here
53
+ ANTHROPIC_API_KEY=your_anthropic_api_key_here
54
+
55
+ # Database Configuration
56
+ DATABASE_URL=sqlite:///./user_database/auth.db
57
+ VECTOR_DB_PATH=./VectorDB
58
+
59
+ # Model Configuration
60
+ EMBEDDING_MODEL=google-universal-sentence-encoder
61
+ LLM_MODEL=gpt-3.5-turbo
62
+ CHUNK_SIZE=1000
63
+ CHUNK_OVERLAP=200
64
+
65
+ # Server Configuration
66
+ RAG_SERVICE_PORT=8001
67
+ STREAMLIT_PORT=8501
68
+ FASTAPI_PORT=8000
69
+
70
+ # Security
71
+ SECRET_KEY=your_secret_key_here
72
+ JWT_SECRET=your_jwt_secret_here
73
+
74
+ # Logging
75
+ LOG_LEVEL=INFO
76
+ LOG_FILE=./logs/app.log
77
+
78
+ # Vector Database
79
+ CHROMA_DB_PATH=./VectorDB
80
+ PERSIST_DIRECTORY=./VectorDB
81
+
82
+ # File Processing
83
+ SUPPORTED_FORMATS=pdf,docx,txt,md
84
+ MAX_FILE_SIZE=10485760
85
+ TEMP_DIR=./temp
86
+
87
+ # RAG Configuration
88
+ TOP_K_RESULTS=5
89
+ SIMILARITY_THRESHOLD=0.7
90
+ MAX_TOKENS=4096
91
+ TEMPERATURE=0.7
92
+ ```
93
+
94
+ **Important Notes:**
95
+
96
+ ### Run the Application
97
+
98
+ #### 1. Start RAG Service
99
+ ```bash
100
+ python start_rag_service.py
101
+ ```
102
+ Service will run at: http://127.0.0.1:8001
103
+
104
+ #### 2. Start Streamlit UI
105
+ ```bash
106
+ cd src/streamlit_app
107
+ streamlit run interface.py
108
+ ```
109
+ UI will run at: http://localhost:8501
110
+
111
+ ## 📁 Directory Structure
112
+
113
+ ```
114
+ RAG-Personal-Diary-Chatbot/
115
+ ├── src/
116
+ │ ├── Indexingstep/ # Data indexing pipeline
117
+ │ ├── Retrivel_And_Generation/ # RAG engine
118
+ │ ├── rag_service/ # FastAPI backend
119
+ │ ├── streamlit_app/ # User interface
120
+ │ └── VectorDB/ # Vector database
121
+ ├── notebook/ # Jupyter notebooks
122
+ ├── tests/ # Unit tests
123
+ ├── images/ # Documentation images
124
+ ├── start_rag_service.py # Service startup script
125
+ ├── .env # Environment variables (create from template)
126
+ ├── env_template.txt # Environment variables template
127
+ └── README.md
128
+ ```
129
+
130
+ ## 🔧 Configuration
131
+
132
+ ### Vector Database
133
+
134
+ ### AI Models
135
+
136
+ ## 📊 Performance
137
+
138
+
139
+ ## 🧪 Testing
140
+
141
+ ```bash
142
+ # Run all tests
143
+ python -m pytest tests/
144
+
145
+ # Run specific test
146
+ python -m pytest tests/test_rag_system.py
147
+ ```
148
+
149
+ ## 🤝 Contributing
150
+
151
+ 1. Fork the project
152
+ 2. Create feature branch (`git checkout -b feature/AmazingFeature`)
153
+ 3. Commit changes (`git commit -m 'Add some AmazingFeature'`)
154
+ 4. Push to branch (`git push origin feature/AmazingFeature`)
155
+ 5. Open Pull Request
156
+
157
+ ## 📝 License
158
+
159
+ This project is distributed under the MIT License. See the `LICENSE` file for more details.
160
+
161
+ ## 📞 Contact
162
+
163
+
164
+ ## 🙏 Acknowledgments
165
+
166
+
167
+ ## 📖 Project Description
168
+
169
+ RAG Personal Diary Chatbot is an intelligent chatbot application that leverages Retrieval-Augmented Generation (RAG) architecture to interact with users' personal diaries. Users can ask questions about their diary content and receive accurate, context-based answers.
170
+
171
+ ## ✨ Key Features
172
+
173
+ - **Diary Indexing**: Automatically processes and indexes diary files (PDF, DOCX, TXT)
174
+ - **Semantic Search**: Uses a vector database for semantic search
175
+ - **AI Chatbot**: Natural interaction with diary data
176
+ - **User Isolation**: Each user has a separate vector database
177
+ - **Web Interface**: Easy-to-use Streamlit UI
178
+ - **REST API**: FastAPI backend for integration
179
+
180
+ ## 🏗️ System Architecture
181
+
182
+ ```
183
+ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐
184
+ │ Streamlit UI │◄──►│ FastAPI │◄──►│ Vector DB │
185
+ │ (Frontend) │ │ Backend │ │ (ChromaDB) │
186
+ └───────────────┘ └───────────────┘ └───────────────┘
187
+
188
+
189
+ ┌───────────────┐
190
+ │ RAG Engine │
191
+ │ (LLM + │
192
+ │ Retrieval) │
193
+ └───────────────┘
194
+ ```
195
+
196
+ ## 🚀 Installation and Setup
197
+
198
+ ### System Requirements
199
+
200
+ - Python 3.8+
201
+
202
+ ### Install Dependencies
203
+
204
+ ```bash
205
+ # Create virtual environment
206
+ python -m venv .venv
207
+
208
+ # Activate virtual environment
209
+ # Windows
210
+ .venv\Scripts\activate
211
+ # Linux/Mac
212
+ source .venv/bin/activate
213
+
214
+ # Install packages
215
+ pip install -r requirements.txt
216
+ ```
217
+
218
+ ### Environment Configuration
219
+
220
+ Create a `.env` file in the project root directory with the following structure:
221
+
222
+ ```env
223
+ # Google API Configuration for RAG System
224
+ GOOGLE_API_KEY=[Google API key]
225
+
226
+ # Database Configuration
227
+ DATABASE_PATH=./src/streamlit_app/backend/diary.db
228
+
229
+ # Vector Database Configuration
230
+ VECTOR_DB_PATH=./src/Indexingstep/diary_vector_db_enhanced
231
+ COLLECTION_NAME=diary_entries
232
+
233
+ # RAG Configuration
234
+ EMBEDDING_MODEL=models/embedding-001
235
+ CHAT_MODEL=gemini-2.5-flash
236
+ ```
237
+
238
+ **Important Notes:**
239
+ - Replace all placeholder values with your actual API keys and configuration
240
+ - Keep your `.env` file secure and never commit it to version control
241
+ - The `.env` file is already included in `.gitignore`
242
+ - Use `env_template.txt` as a reference to create your `.env` file
243
+
244
+ ### Run the Application
245
+
246
+ ```bash
247
+ # Start the RAG backend service
248
+ python start_rag_service.py
249
+
250
+ # Start the Streamlit UI
251
+ streamlit run src/streamlit_app/interface.py
252
+ ```
253
+
254
+ ## 📁 Directory Structure
255
+
256
+ ```
257
+ RAG-Personal-Diary-Chatbot/
258
+ ├── src/
259
+ │ ├── Indexingstep/ # Data indexing pipeline
260
+ │ ├── Retrivel_And_Generation/ # RAG engine
261
+ │ ├── rag_service/ # FastAPI backend
262
+ │ ├── streamlit_app/ # User interface
263
+ │ └── VectorDB/ # Vector database
264
+ ├── notebook/ # Jupyter notebooks
265
+ ├── tests/ # Unit tests
266
+ ├── images/ # Documentation images
267
+ ├── start_rag_service.py # Service startup script
268
+ ├── .env # Environment variables (create from template)
269
+ ├── env_template.txt # Environment variables template
270
+ └── README.md
271
+ ```
272
+
273
+ ## 🔧 Configuration
274
+
275
+ ### Vector Database
276
+ - **ChromaDB**: Main database for vector embeddings
277
+ - **Chunk size**: 1000 characters (customizable)
278
+ - **Overlap**: 200 characters between chunks
279
+
280
+ ### AI Models
281
+ - **Embedding**: Google's Universal Sentence Encoder
282
+ - **LLM**: Google Gemini (can be replaced with other models)
283
+
284
+ ## 📊 Performance
285
+
286
+ - **Processing time**: ~2-5 seconds per question
287
+ - **Accuracy**: 85-95% depending on data quality
288
+ - **Scalability**: Supports thousands of diaries
289
+
290
+
291
+ ## 🤝 Contributing
292
+
293
+ 1. Fork the project
294
+ 2. Create a feature branch (`git checkout -b feature/AmazingFeature`)
295
+ 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
296
+ 4. Push to the branch (`git push origin feature/AmazingFeature`)
297
+ 5. Open a Pull Request
298
+ ## 📞 Contact
299
+
300
+ - **Author**: [huytrao]
301
+ - **Email**: [traohuy098@gmail.com]
302
+ - **GitHub**: [github.com/huytrao]
303
+
304
+ ## 🙏 Acknowledgments
305
+
306
+ - Google for Gemini models
307
+ - Google for Universal Sentence Encoder
308
+ - ChromaDB team for vector database
309
+ - FastAPI and Streamlit communities
310
+ - RAG architecture
clean_repo/VectorDB/user_1_vector_db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e895be71c0752f44743806cd7439624649d5e41500865e79c77e07c6d1dca87
3
+ size 163840
clean_repo/app.py ADDED
@@ -0,0 +1,90 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Start RAG Service for Personal Diary Chatbot
4
+ """
5
+ import subprocess
6
+ import sys
7
+ import os
8
+ from pathlib import Path
9
+ import time
10
+
11
+ def check_requirements():
12
+ """Check if required packages are installed."""
13
+ required_packages = ['fastapi', 'uvicorn']
14
+ missing_packages = []
15
+
16
+ for package in required_packages:
17
+ try:
18
+ __import__(package)
19
+ except ImportError:
20
+ missing_packages.append(package)
21
+
22
+ if missing_packages:
23
+ print(f"❌ Missing packages: {', '.join(missing_packages)}")
24
+ print(f"Install with: pip install {' '.join(missing_packages)}")
25
+ return False
26
+
27
+ return True
28
+
29
+ def setup_environment():
30
+ """Setup environment and directories."""
31
+ # Ensure VectorDB directory exists
32
+ vector_db_dir = Path("src/VectorDB")
33
+ vector_db_dir.mkdir(parents=True, exist_ok=True)
34
+ print(f"📁 Vector DB directory: {vector_db_dir.absolute()}")
35
+
36
+ # Check for .env file
37
+ env_file = Path("src/Indexingstep/.env")
38
+ if env_file.exists():
39
+ print(f"✅ Environment file found: {env_file}")
40
+ else:
41
+ print(f"⚠️ Environment file not found: {env_file}")
42
+ print("Make sure GOOGLE_API_KEY is set in environment")
43
+
44
+ def start_service():
45
+ """Start the RAG FastAPI service."""
46
+ if not check_requirements():
47
+ return
48
+
49
+ setup_environment()
50
+
51
+ service_file = Path("src/rag_service/main.py")
52
+
53
+ if not service_file.exists():
54
+ print(f"❌ Service file not found: {service_file}")
55
+ print("Please create the RAG service file first")
56
+ return
57
+
58
+ print("🚀 Starting RAG Service...")
59
+ print("📍 Service URL: http://0.0.0.0:8001")
60
+ print("📖 API Docs: http://0.0.0.0:8001/docs")
61
+ print("💾 Vector databases will be stored in: src/VectorDB/")
62
+ print("\nPress Ctrl+C to stop the service")
63
+ print("-" * 50)
64
+
65
+ try:
66
+ # Change to project root directory
67
+ os.chdir(Path(__file__).parent)
68
+
69
+ # Start the service in the background
70
+ process = subprocess.Popen([
71
+ sys.executable, "-m", "uvicorn",
72
+ "src.rag_service.main:app",
73
+ "--host", "0.0.0.0",
74
+ "--port", "8001",
75
+ "--reload"
76
+ ])
77
+ print(f"🔄 RAG Service running in background (PID: {process.pid})")
78
+ return process
79
+ except Exception as e:
80
+ print(f"❌ Error starting service: {e}")
81
+ return None
82
+
83
+ def start_streamlit():
84
+ # Start Streamlit UI on port 7860 (default for Spaces)
85
+ os.system("streamlit run src/streamlit_app/interface.py --server.port 7860")
86
+
87
+ if __name__ == "__main__":
88
+ start_service()
89
+ time.sleep(3)
90
+ start_streamlit()
clean_repo/clean_repo/.env ADDED
@@ -0,0 +1,12 @@
1
+ # Google API Configuration for RAG System
2
+ GOOGLE_API_KEY=[Google API key]
3
+
4
+ # Database Configuration
5
+ DATABASE_PATH=./src/streamlit_app/backend/diary.db
6
+
7
+ # Vector Database Configuration
8
+ VECTOR_DB_PATH=./src/Indexingstep/diary_vector_db_enhanced
9
+ COLLECTION_NAME=diary_entries
10
+
11
+ # RAG Configuration
12
+ EMBEDDING_MODEL=models/embedding-001
clean_repo/clean_repo/.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
clean_repo/clean_repo/.gitignore ADDED
@@ -0,0 +1,120 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ ./venv/
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # PyInstaller
28
+ *.manifest
29
+ *.spec
30
+
31
+ # Installer logs
32
+ pip-log.txt
33
+ pip-delete-this-directory.txt
34
+
35
+ # Unit test / coverage reports
36
+ htmlcov/
37
+ .tox/
38
+ .coverage
39
+ .coverage.*
40
+ .cache
41
+ nosetests.xml
42
+ coverage.xml
43
+ *.cover
44
+ .hypothesis/
45
+ .pytest_cache/
46
+
47
+ # Translations
48
+ *.mo
49
+ *.pot
50
+
51
+ # Django stuff:
52
+ *.log
53
+ local_settings.py
54
+ db.sqlite3
55
+
56
+ # Flask stuff:
57
+ instance/
58
+ .webassets-cache
59
+
60
+ # Scrapy stuff:
61
+ .scrapy
62
+
63
+ # Sphinx documentation
64
+ docs/_build/
65
+
66
+ # PyBuilder
67
+ target/
68
+
69
+ # Jupyter Notebook
70
+ .ipynb_checkpoints
71
+
72
+ # pyenv
73
+ .python-version
74
+
75
+ # celery beat schedule file
76
+ celerybeat-schedule
77
+
78
+ # SageMath parsed files
79
+ *.sage.py
80
+
81
+ # Environments
82
+ .env
83
+ .venv
84
+ env/
85
+ venv/
86
+ ENV/
87
+ env.bak/
88
+ venv.bak/
89
+
90
+ # Spyder project settings
91
+ .spyderproject
92
+ .spyproject
93
+
94
+ # Rope project settings
95
+ .ropeproject
96
+
97
+ # mkdocs documentation
98
+ /site
99
+
100
+ # mypy
101
+ .mypy_cache/
102
+ .dmypy.json
103
+ dmypy.json
104
+
105
+ # IDE
106
+ .vscode/
107
+ .idea/
108
+ *.swp
109
+ *.swo
110
+
111
+ # OS
112
+ .DS_Store
113
+ Thumbs.db
114
+
115
+ # Project specific
116
+ data/
117
+ models/
118
+ logs/
119
+ *.pkl
120
+ *.model
clean_repo/clean_repo/Backend.md ADDED
@@ -0,0 +1,65 @@
1
+ # Backend Architecture - Personal Diary Chatbot
2
+
3
+ ## Tổng quan Backend
4
+
5
+ Backend của dự án được xây dựng trên nền tảng FastAPI, cung cấp API RESTful cho việc xử lý nhật ký, tìm kiếm và tương tác với chatbot RAG. Hệ thống được thiết kế theo kiến trúc microservices với khả năng mở rộng cao.
6
+
7
+ ## 🏛️ Kiến trúc tổng thể
8
+
9
+ ```
10
+ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
11
+ │ Frontend │ │ API Gateway │ │ Core Services │
12
+ │ (Streamlit) │◄──►│ (FastAPI) │◄──►│ (RAG Engine) │
13
+ └─────────────────┘ └─────────────────┘ └─────────────────┘
14
+
15
+
16
+ ┌─────────────────┐
17
+ │ Data Layer │
18
+ │ (Vector DB) │
19
+ └─────────────────┘
20
+ ```
21
+
22
+ ## 🔧 Cấu trúc thư mục Backend
23
+
24
+ ```
25
+ src/
26
+ ├── rag_service/ # FastAPI service
27
+ │ ├── main.py # Main application entry point
28
+ │ ├── __init__.py
29
+ │ └── __pycache__/
30
+ ├── Indexingstep/ # Data processing pipeline
31
+ │ ├── pipeline.py # Main indexing pipeline
32
+ │ ├── dataloading.py # Document loading utilities
33
+ │ ├── diary_text_splitter.py # Text chunking logic
34
+ │ ├── embedding_and_storing.py # Vector embedding & storage
35
+ │ ├── database_utils.py # Database operations
36
+ │ └── indexing_pipeline.py # Pipeline orchestration
37
+ ├── Retrivel_And_Generation/ # RAG core engine
38
+ │ ├── Retrieval_And_Generator.py # Main RAG system
39
+ │ └── __init__.py
40
+ ├── VectorDB/ # Vector database storage
41
+ └── streamlit_app/ # Frontend application
42
+ ├── backend/ # Backend utilities for UI
43
+ ├── user_auth.py # Authentication system
44
+ ├── rag_client.py # RAG service client
45
+ └── interface.py # Main UI interface
46
+ ```
47
+ ## 🔮 Future Enhancements
48
+
49
+ ### 1. Microservices Architecture
50
+ - **User Service**: Dedicated user management
51
+ - **Document Service**: Document processing pipeline
52
+ - **Search Service**: Vector search optimization
53
+ - **Chat Service**: Conversation management
54
+
55
+ ### 2. Advanced Features
56
+ - **Real-time synchronization**: WebSocket support
57
+ - **Multi-language support**: Internationalization
58
+ - **Advanced analytics**: User behavior tracking
59
+ - **Machine learning**: Continuous model improvement
60
+
61
+ ### 3. Infrastructure Improvements
62
+ - **Kubernetes deployment**: Container orchestration
63
+ - **Service mesh**: Istio integration
64
+ - **Observability**: Distributed tracing
65
+ - **Auto-scaling**: Dynamic resource allocation
clean_repo/clean_repo/Dockerfile ADDED
@@ -0,0 +1,17 @@
1
+ # Use the official Python image
2
+ FROM python:3.10-slim
3
+
4
+ # Set the working directory inside the container
5
+ WORKDIR /app
6
+
7
+ # Copy requirements.txt into the container (if present)
8
+ COPY requirements.txt .
9
+
10
+ # Install the dependencies
11
+ RUN pip install --no-cache-dir -r requirements.txt
12
+
13
+ # Copy the full source code into the container
14
+ COPY . .
15
+
16
+ # Run the application (replace app.py with your main entry file if it differs)
17
+ CMD ["python", "app.py"]
clean_repo/clean_repo/RAG-architecture.md ADDED
@@ -0,0 +1,254 @@
1
+ # RAG Architecture - Personal Diary Chatbot
2
+
3
+ ## 🏗️ RAG Architecture Overview
4
+
5
+ The RAG (Retrieval-Augmented Generation) architecture in this project is designed to provide intelligent search and question answering over each user's personal diary data.
6
+
7
+ ## 🔄 RAG Processing Flow
8
+
9
+ ```
10
+ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
11
+ │ Input Query │───►│ Query │───►│ Vector │
12
+ │ (User Question)│ │ Processing │ │ Search │
13
+ └─────────────────┘ └─────────────────┘ └─────────────────┘
14
+
15
+
16
+ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
17
+ │ Final Answer │◄───│ Answer │◄───│ Context │
18
+ │ (Response) │ │ Generation │ │ Retrieval │
19
+ └─────────────────┘ └─────────────────┘ └─────────────────┘
20
+ ```
21
+
22
+ ## 📊 Component Details
23
+
24
+ ### 1. Data Ingestion & Indexing
25
+
26
+ #### 1.1 Document Loading
27
+ - **Input formats**: PDF, DOCX, TXT
28
+ - **Processing**: Text extraction, cleaning, normalization
29
+ - **Output**: Structured text data
30
+
31
+ #### 1.2 Text Chunking
32
+ ```python
33
+ # Chunking strategy
34
+ chunk_size = 1000 # characters
35
+ chunk_overlap = 200 # characters
36
+ chunking_method = "recursive_character_splitter"
37
+ ```
38
+
39
+ #### 1.3 Embedding Generation
40
+ - **Model**: Google Universal Sentence Encoder (USE)
41
+ - **Vector dimension**: 512
42
+ - **Normalization**: L2 normalization
43
+ - **Storage**: ChromaDB vector database (see the sketch below)
44
+
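+ A hedged sketch of this step, assuming the public TF-Hub release of USE (the repo's actual wiring may differ):
+
+ ```python
+ # Sketch: 512-dim USE embeddings with L2 normalization (assumes tensorflow_hub is installed)
+ import numpy as np
+ import tensorflow_hub as hub
+
+ use = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
+ vecs = use(["Today I went hiking with my sister."]).numpy()  # shape (1, 512)
+ vecs = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)    # L2 normalization
+ ```
+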
45
+ ### 2. Vector Database Architecture
46
+
47
+ #### 2.1 ChromaDB Configuration
48
+ ```python
49
+ # Database settings
50
+ collection_name = f"user_{user_id}_diary"
51
+ metadata = {
52
+ "user_id": user_id,
53
+ "source": "diary_entry",
54
+ "date": entry_date,
55
+ "chunk_id": chunk_id
56
+ }
57
+ ```
58
+
59
+ #### 2.2 Index Structure
60
+ - **Primary key**: `user_id + chunk_id`
61
+ - **Vector index**: HNSW (Hierarchical Navigable Small World)
62
+ - **Distance metric**: Cosine similarity
63
+ - **Sharding**: Per-user collections (see the sketch below)
64
+
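+ A minimal sketch of this layout with the `chromadb` client; the collection name follows the convention above and the user id is hypothetical:
+
+ ```python
+ # Sketch: one cosine-similarity HNSW collection per user
+ import chromadb
+
+ client = chromadb.PersistentClient(path="./VectorDB")
+ user_id = 1  # hypothetical user
+ collection = client.get_or_create_collection(
+     name=f"user_{user_id}_diary",
+     metadata={"hnsw:space": "cosine"},  # cosine distance for similarity search
+ )
+ ```
+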
65
+ ### 3. Retrieval Engine
66
+
67
+ #### 3.1 Query Processing
68
+ ```python
69
+ # Query preprocessing (lemmatization and query expansion are omitted in this sketch)
70
+ def process_query(query: str) -> str:
71
+     # 1. Text cleaning: lowercase and collapse whitespace
72
+     cleaned = " ".join(query.strip().lower().split())
73
+     # 2. Stop word removal with a tiny illustrative stop list
74
+     stop_words = {"the", "a", "an", "is", "are", "of", "to"}
75
+     return " ".join(t for t in cleaned.split() if t not in stop_words)
76
+ ```
77
+
78
+ #### 3.2 Vector Search
79
+ - **Search algorithm**: K-Nearest Neighbors (KNN)
80
+ - **Top-k results**: 5-10 most relevant chunks
81
+ - **Similarity threshold**: 0.7 (cosine similarity)
82
+ - **Reranking**: Semantic relevance scoring (see the query sketch below)
83
+
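+ Reusing the `collection` from the sketch in section 2.2, a top-k query with threshold filtering; converting Chroma's cosine distance to similarity via `1 - distance` is an assumption about how the threshold is applied:
+
+ ```python
+ # Sketch: retrieve top-5 chunks and keep those above the similarity threshold
+ results = collection.query(query_texts=["What did I do last weekend?"], n_results=5)
+ hits = [
+     (doc, 1 - dist)  # cosine similarity = 1 - cosine distance
+     for doc, dist in zip(results["documents"][0], results["distances"][0])
+     if 1 - dist >= 0.7
+ ]
+ ```
+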
84
+ #### 3.3 Context Assembly
85
+ ```python
86
+ # Context building; step 4 (metadata enrichment) is omitted in this sketch
87
+ def build_context(retrieved_chunks, query, max_chars=4000):
88
+     # 1. Sort (text, score) pairs by relevance score
89
+     ordered = sorted(retrieved_chunks, key=lambda c: c[1], reverse=True)
90
+     # 2. Remove duplicates, then 3. truncate to a character budget (token-limit stand-in)
91
+     seen = set()
+     texts = [t for t, _ in ordered if not (t in seen or seen.add(t))]
92
+     return "\n\n".join(texts)[:max_chars]
93
+ ```
94
+
95
+ ### 4. Generation Engine
96
+
97
+ #### 4.1 LLM Integration
98
+ - **Primary model**: OpenAI GPT-3.5/4
99
+ - **Fallback model**: Local model (if needed)
100
+ - **Temperature**: 0.7 (balanced creativity)
101
+ - **Max tokens**: 500 (response length)
102
+
103
+ #### 4.2 Prompt Engineering
104
+ ```python
105
+ # System prompt template
106
+ SYSTEM_PROMPT = """
107
+ You are a helpful AI assistant that answers questions about personal diary entries.
108
+ Use only the provided context to answer questions.
109
+ If the information is not in the context, say so.
110
+ Be conversational but professional.
111
+ """
112
+ ```
113
+
114
+ #### 4.3 Response Generation
115
+ ```python
116
+ # Generation pipeline (skeleton; a runnable sketch follows below)
117
+ def generate_response(query, context, chat_history):
118
+     # 1. Build prompt with context
119
+     # 2. Call LLM API
120
+     # 3. Post-process response
121
+     # 4. Validate against context
122
+     # 5. Return final answer
+     ...
123
+ ```
124
+
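+ Filling in that skeleton, a hedged sketch using the `google-generativeai` SDK and the Gemini model configured elsewhere in this repo; the actual wiring in `Retrieval_And_Generator.py` may differ, `SYSTEM_PROMPT` is the template from section 4.2, and chat history handling is omitted:
+
+ ```python
+ # Sketch: grounded answer generation with the google-generativeai SDK
+ import google.generativeai as genai
+
+ genai.configure(api_key="YOUR_GOOGLE_API_KEY")  # placeholder key
+ model = genai.GenerativeModel("gemini-2.5-flash")
+
+ def generate_response(query: str, context: str) -> str:
+     prompt = f"{SYSTEM_PROMPT}\n\nContext:\n{context}\n\nQuestion: {query}"
+     return model.generate_content(prompt).text
+ ```
+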
125
+ ## 🔧 Technical Configuration
126
+
127
+ ### Performance Tuning
128
+
129
+ #### 1. Chunking Optimization
130
+ - **Optimal chunk size**: 1000 characters
131
+ - **Overlap ratio**: 20%
132
+ - **Chunking strategy**: Recursive character splitter
133
+
134
+ #### 2. Vector Search Optimization
135
+ - **Index type**: HNSW
136
+ - **Search parameters**:
137
+ - `ef_construction`: 200
138
+ - `ef_search`: 100
139
+ - `m`: 16
140
+
141
+ #### 3. Caching Strategy
142
+ - **Query cache**: Redis (in-memory)
143
+ - **Embedding cache**: Local file cache
144
+ - **Response cache**: TTL-based expiration (see the sketch below)
145
+
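+ A hedged sketch of the TTL-based response cache with `redis-py` (a local Redis instance and the key scheme are assumptions):
+
+ ```python
+ # Sketch: cache generated answers for one hour, keyed by a hash of the query
+ import hashlib, json, redis
+
+ cache = redis.Redis(host="localhost", port=6379, db=0)
+
+ def cache_key(query: str) -> str:
+     return "rag:resp:" + hashlib.sha256(query.encode()).hexdigest()
+
+ def get_cached_answer(query: str):
+     hit = cache.get(cache_key(query))
+     return json.loads(hit) if hit is not None else None
+
+ def store_answer(query: str, answer: str, ttl_seconds: int = 3600):
+     cache.setex(cache_key(query), ttl_seconds, json.dumps(answer))  # TTL expiration
+ ```
+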
146
+ ### Scalability Features
147
+
148
+ #### 1. Multi-User Support
149
+ - **User isolation**: Separate vector collections
150
+ - **Resource management**: Per-user memory limits
151
+ - **Concurrent access**: Async processing
152
+
153
+ #### 2. Horizontal Scaling
154
+ - **Load balancing**: Multiple RAG instances
155
+ - **Database sharding**: User-based distribution
156
+ - **Microservices**: Modular architecture
157
+
158
+ ## 📈 Monitoring & Analytics
159
+
160
+ ### 1. Performance Metrics
161
+ - **Query latency**: < 2 seconds
162
+ - **Retrieval accuracy**: > 85%
163
+ - **Generation quality**: User satisfaction score
164
+ - **System throughput**: Queries per second
165
+
166
+ ### 2. Quality Assurance
167
+ - **Context relevance**: Similarity score tracking
168
+ - **Answer accuracy**: Human evaluation
169
+ - **User feedback**: Rating system
170
+ - **A/B testing**: Model comparison
171
+
172
+ ## 🚀 Deployment Architecture
173
+
174
+ ### 1. Development Environment
175
+ ```
176
+ ┌─────────────────┐ ┌─────────────────┐
177
+ │ Local Python │ │ Local │
178
+ │ Environment │◄──►│ ChromaDB │
179
+ └─────────────────┘ └─────────────────┘
180
+ ```
181
+
182
+ ### 2. Production Environment
183
+ ```
184
+ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
185
+ │ Load Balancer│ │ RAG Service │ │ Vector DB │
186
+ │ (Nginx) │◄──►│ (FastAPI) │◄──►│ (ChromaDB) │
187
+ └─────────────────┘ └─────────────────┘ └─────────────────┘
188
+
189
+
190
+ ┌─────────────────┐
191
+ │ Redis Cache │
192
+ └─────────────────┘
193
+ ```
194
+
195
+ ## 🔒 Security & Privacy
196
+
197
+ ### 1. Data Protection
198
+ - **User isolation**: Strict separation of data
199
+ - **Encryption**: At-rest and in-transit
200
+ - **Access control**: Role-based permissions
201
+ - **Audit logging**: Complete access history
202
+
203
+ ### 2. Privacy Compliance
204
+ - **GDPR compliance**: Data portability
205
+ - **Data retention**: Configurable policies
206
+ - **User consent**: Explicit permission management
207
+ - **Data anonymization**: Optional features
208
+
209
+ ## 🧪 Testing Strategy
210
+
211
+ ### 1. Unit Testing
212
+ - **Component testing**: Individual modules
213
+ - **Mock testing**: External API simulation
214
+ - **Coverage target**: > 90%
215
+
216
+ ### 2. Integration Testing
217
+ - **End-to-end testing**: Complete RAG pipeline
218
+ - **Performance testing**: Load and stress tests
219
+ - **Security testing**: Vulnerability assessment
220
+
221
+
222
+ ## 📚 Best Practices
223
+
224
+ ### 1. Model Selection
225
+ - **Embedding models**: Domain-specific fine-tuning
226
+ - **LLM selection**: Cost-performance balance
227
+ - **Fallback strategies**: Graceful degradation
228
+
229
+ ### 2. Data Quality
230
+ - **Input validation**: Strict data checking
231
+ - **Cleaning pipeline**: Automated preprocessing
232
+ - **Quality metrics**: Continuous monitoring
233
+
234
+ ### 3. Error Handling
235
+ - **Graceful failures**: User-friendly error messages
236
+ - **Retry mechanisms**: Automatic recovery
237
+ - **Logging**: Comprehensive error tracking
238
+
239
+ ## 🔮 Future Enhancements
240
+
241
+ ### 1. Advanced Features
242
+ - **Multi-modal RAG**: Image and text processing
243
+ - **Temporal reasoning**: Time-based queries
244
+ - **Emotional analysis**: Sentiment-aware responses
245
+
246
+ ### 2. Performance Improvements
247
+ - **Vector quantization**: Reduced memory usage
248
+ - **Approximate search**: Faster retrieval
249
+ - **Model distillation**: Smaller, faster models
250
+
251
+ ### 3. Integration Capabilities
252
+ - **API ecosystem**: Third-party integrations
253
+ - **Mobile applications**: Native mobile support
254
+ - **Voice interface**: Speech-to-text integration
clean_repo/clean_repo/README.md ADDED
@@ -0,0 +1,310 @@
1
+ # RAG Personal Diary Chatbot
2
+
3
+ ## 📖 Project Description
4
+
5
+ RAG Personal Diary Chatbot is an intelligent chatbot application that uses RAG (Retrieval-Augmented Generation) architecture to interact with users' personal diaries. The application allows users to ask questions about diary content and receive accurate answers based on actual data.
6
+
7
+ ## ✨ Key Features
8
+
9
+
10
+ ## 🏗️ System Architecture
11
+
12
+ ```
13
+ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
14
+ │ Streamlit UI │ │ FastAPI │ │ Vector │
15
+ │ (Frontend) │◄──►│ Backend │◄──►│ Database │
16
+ └─────────────────┘ └─────────────────┘ └─────────────────┘
17
+
18
+
19
+ ┌─────────────────┐
20
+ │ RAG Engine │
21
+ │ (LLM + │
22
+ │ Retrieval) │
23
+ └─────────────────┘
24
+ ```
25
+
26
+ ## 🚀 Installation and Setup
27
+
28
+ ### System Requirements
29
+
+ - Python 3.8+
+
30
+ ### Install Dependencies
31
+ ```bash
32
+ # Create virtual environment
33
+ python -m venv .venv
34
+
35
+ # Activate virtual environment
36
+ # Windows
37
+ .venv\Scripts\activate
38
+ # Linux/Mac
39
+ source .venv/bin/activate
40
+
41
+ # Install packages
42
+ pip install -r requirements.txt
43
+ ```
44
+
45
+ ### Environment Configuration
46
+
47
+ Create a `.env` file in the project root directory with the following structure:
48
+
49
+ ```env
50
+ # API Keys
51
+ OPENAI_API_KEY=your_openai_api_key_here
52
+ GOOGLE_API_KEY=your_google_api_key_here
53
+ ANTHROPIC_API_KEY=your_anthropic_api_key_here
54
+
55
+ # Database Configuration
56
+ DATABASE_URL=sqlite:///./user_database/auth.db
57
+ VECTOR_DB_PATH=./VectorDB
58
+
59
+ # Model Configuration
60
+ EMBEDDING_MODEL=google-universal-sentence-encoder
61
+ LLM_MODEL=gpt-3.5-turbo
62
+ CHUNK_SIZE=1000
63
+ CHUNK_OVERLAP=200
64
+
65
+ # Server Configuration
66
+ RAG_SERVICE_PORT=8001
67
+ STREAMLIT_PORT=8501
68
+ FASTAPI_PORT=8000
69
+
70
+ # Security
71
+ SECRET_KEY=your_secret_key_here
72
+ JWT_SECRET=your_jwt_secret_here
73
+
74
+ # Logging
75
+ LOG_LEVEL=INFO
76
+ LOG_FILE=./logs/app.log
77
+
78
+ # Vector Database
79
+ CHROMA_DB_PATH=./VectorDB
80
+ PERSIST_DIRECTORY=./VectorDB
81
+
82
+ # File Processing
83
+ SUPPORTED_FORMATS=pdf,docx,txt,md
84
+ MAX_FILE_SIZE=10485760
85
+ TEMP_DIR=./temp
86
+
87
+ # RAG Configuration
88
+ TOP_K_RESULTS=5
89
+ SIMILARITY_THRESHOLD=0.7
90
+ MAX_TOKENS=4096
91
+ TEMPERATURE=0.7
92
+ ```
93
+
94
+ **Important Notes:**
95
+
96
+ ### Run the Application
97
+
98
+ #### 1. Start RAG Service
99
+ ```bash
100
+ python start_rag_service.py
101
+ ```
102
+ Service will run at: http://127.0.0.1:8001
103
+
104
+ #### 2. Start Streamlit UI
105
+ ```bash
106
+ cd src/streamlit_app
107
+ streamlit run interface.py
108
+ ```
109
+ UI will run at: http://localhost:8501
110
+
111
+ ## 📁 Directory Structure
112
+
113
+ ```
114
+ RAG-Personal-Diary-Chatbot/
115
+ ├── src/
116
+ │ ├── Indexingstep/ # Data indexing pipeline
117
+ │ ├── Retrivel_And_Generation/ # RAG engine
118
+ │ ├── rag_service/ # FastAPI backend
119
+ │ ├── streamlit_app/ # User interface
120
+ │ └── VectorDB/ # Vector database
121
+ ├── notebook/ # Jupyter notebooks
122
+ ├── tests/ # Unit tests
123
+ ├── images/ # Documentation images
124
+ ├── start_rag_service.py # Service startup script
125
+ ├── .env # Environment variables (create from template)
126
+ ├── env_template.txt # Environment variables template
127
+ └── README.md
128
+ ```
129
+
130
+ ## 🔧 Configuration
131
+
132
+ ### Vector Database
133
+
134
+ ### AI Models
135
+
136
+ ## 📊 Performance
137
+
138
+
139
+ ## 🧪 Testing
140
+
141
+ ```bash
142
+ # Run all tests
143
+ python -m pytest tests/
144
+
145
+ # Run specific test
146
+ python -m pytest tests/test_rag_system.py
147
+ ```
148
+
149
+ ## 🤝 Contributing
150
+
151
+ 1. Fork the project
152
+ 2. Create feature branch (`git checkout -b feature/AmazingFeature`)
153
+ 3. Commit changes (`git commit -m 'Add some AmazingFeature'`)
154
+ 4. Push to branch (`git push origin feature/AmazingFeature`)
155
+ 5. Open Pull Request
156
+
157
+ ## 📝 License
158
+
159
+ This project is distributed under the MIT License. See the `LICENSE` file for more details.
160
+
161
+ ## 📞 Contact
162
+
163
+
164
+ ## 🙏 Acknowledgments
165
+
166
+
167
+ ## 📖 Project Description
168
+
169
+ RAG Personal Diary Chatbot is an intelligent chatbot application that leverages Retrieval-Augmented Generation (RAG) architecture to interact with users' personal diaries. Users can ask questions about their diary content and receive accurate, context-based answers.
170
+
171
+ ## ✨ Key Features
172
+
173
+ - **Diary Indexing**: Automatically processes and indexes diary files (PDF, DOCX, TXT)
174
+ - **Semantic Search**: Uses a vector database for semantic search
175
+ - **AI Chatbot**: Natural interaction with diary data
176
+ - **User Isolation**: Each user has a separate vector database
177
+ - **Web Interface**: Easy-to-use Streamlit UI
178
+ - **REST API**: FastAPI backend for integration
179
+
180
+ ## 🏗️ System Architecture
181
+
182
+ ```
183
+ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐
184
+ │ Streamlit UI │◄──►│ FastAPI │◄──►│ Vector DB │
185
+ │ (Frontend) │ │ Backend │ │ (ChromaDB) │
186
+ └───────────────┘ └───────────────┘ └───────────────┘
187
+
188
+
189
+ ┌───────────────┐
190
+ │ RAG Engine │
191
+ │ (LLM + │
192
+ │ Retrieval) │
193
+ └───────────────┘
194
+ ```
195
+
196
+ ## 🚀 Installation and Setup
197
+
198
+ ### System Requirements
199
+
200
+ - Python 3.8+
201
+
202
+ ### Install Dependencies
203
+
204
+ ```bash
205
+ # Create virtual environment
206
+ python -m venv .venv
207
+
208
+ # Activate virtual environment
209
+ # Windows
210
+ .venv\Scripts\activate
211
+ # Linux/Mac
212
+ source .venv/bin/activate
213
+
214
+ # Install packages
215
+ pip install -r requirements.txt
216
+ ```
217
+
218
+ ### Environment Configuration
219
+
220
+ Create a `.env` file in the project root directory with the following structure:
221
+
222
+ ```env
223
+ # Google API Configuration for RAG System
224
+ GOOGLE_API_KEY=[Google API key]
225
+
226
+ # Database Configuration
227
+ DATABASE_PATH=./src/streamlit_app/backend/diary.db
228
+
229
+ # Vector Database Configuration
230
+ VECTOR_DB_PATH=./src/Indexingstep/diary_vector_db_enhanced
231
+ COLLECTION_NAME=diary_entries
232
+
233
+ # RAG Configuration
234
+ EMBEDDING_MODEL=models/embedding-001
235
+ CHAT_MODEL=gemini-2.5-flash
236
+ ```
237
+
238
+ **Important Notes:**
239
+ - Replace all placeholder values with your actual API keys and configuration
240
+ - Keep your `.env` file secure and never commit it to version control
241
+ - The `.env` file is already included in `.gitignore`
242
+ - Use `env_template.txt` as a reference to create your `.env` file
243
+
244
+ ### Run the Application
245
+
246
+ ```bash
247
+ # Start the RAG backend service
248
+ python start_rag_service.py
249
+
250
+ # Start the Streamlit UI
251
+ streamlit run src/streamlit_app/interface.py
252
+ ```
253
+
254
+ ## 📁 Directory Structure
255
+
256
+ ```
257
+ RAG-Personal-Diary-Chatbot/
258
+ ├── src/
259
+ │ ├── Indexingstep/ # Data indexing pipeline
260
+ │ ├── Retrivel_And_Generation/ # RAG engine
261
+ │ ├── rag_service/ # FastAPI backend
262
+ │ ├── streamlit_app/ # User interface
263
+ │ └── VectorDB/ # Vector database
264
+ ├── notebook/ # Jupyter notebooks
265
+ ├── tests/ # Unit tests
266
+ ├── images/ # Documentation images
267
+ ├── start_rag_service.py # Service startup script
268
+ ├── .env # Environment variables (create from template)
269
+ ├── env_template.txt # Environment variables template
270
+ └── README.md
271
+ ```
272
+
273
+ ## 🔧 Configuration
274
+
275
+ ### Vector Database
276
+ - **ChromaDB**: Main database for vector embeddings
277
+ - **Chunk size**: 1000 characters (customizable)
278
+ - **Overlap**: 200 characters between chunks
279
+
280
+ ### AI Models
281
+ - **Embedding**: Google's Universal Sentence Encoder
282
+ - **LLM**: Google Gemini (can be replaced with other models)
283
+
284
+ ## 📊 Performance
285
+
286
+ - **Processing time**: ~2-5 seconds per question
287
+ - **Accuracy**: 85-95% depending on data quality
288
+ - **Scalability**: Supports thousands of diaries
289
+
290
+
291
+ ## 🤝 Contributing
292
+
293
+ 1. Fork the project
294
+ 2. Create a feature branch (`git checkout -b feature/AmazingFeature`)
295
+ 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
296
+ 4. Push to the branch (`git push origin feature/AmazingFeature`)
297
+ 5. Open a Pull Request
298
+ ## 📞 Contact
299
+
300
+ - **Author**: [huytrao]
301
+ - **Email**: [traohuy098@gmail.com]
302
+ - **GitHub**: [github.com/huytrao]
303
+
304
+ ## 🙏 Acknowledgments
305
+
306
+ - Gemini for GPT models
307
+ - Google for Universal Sentence Encoder
308
+ - ChromaDB team for vector database
309
+ - FastAPI and Streamlit communities
310
+ - RAG architecture
clean_repo/clean_repo/app.py ADDED
@@ -0,0 +1,90 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Start RAG Service for Personal Diary Chatbot
4
+ """
5
+ import subprocess
6
+ import sys
7
+ import os
8
+ from pathlib import Path
9
+ import time
10
+
11
+ def check_requirements():
12
+ """Check if required packages are installed."""
13
+ required_packages = ['fastapi', 'uvicorn']
14
+ missing_packages = []
15
+
16
+ for package in required_packages:
17
+ try:
18
+ __import__(package)
19
+ except ImportError:
20
+ missing_packages.append(package)
21
+
22
+ if missing_packages:
23
+ print(f"❌ Missing packages: {', '.join(missing_packages)}")
24
+ print(f"Install with: pip install {' '.join(missing_packages)}")
25
+ return False
26
+
27
+ return True
28
+
29
+ def setup_environment():
30
+ """Setup environment and directories."""
31
+ # Ensure VectorDB directory exists
32
+ vector_db_dir = Path("src/VectorDB")
33
+ vector_db_dir.mkdir(parents=True, exist_ok=True)
34
+ print(f"📁 Vector DB directory: {vector_db_dir.absolute()}")
35
+
36
+ # Check for .env file
37
+ env_file = Path("src/Indexingstep/.env")
38
+ if env_file.exists():
39
+ print(f"✅ Environment file found: {env_file}")
40
+ else:
41
+ print(f"⚠️ Environment file not found: {env_file}")
42
+ print("Make sure GOOGLE_API_KEY is set in environment")
43
+
44
+ def start_service():
45
+ """Start the RAG FastAPI service."""
46
+ if not check_requirements():
47
+ return
48
+
49
+ setup_environment()
50
+
51
+ service_file = Path("src/rag_service/main.py")
52
+
53
+ if not service_file.exists():
54
+ print(f"❌ Service file not found: {service_file}")
55
+ print("Please create the RAG service file first")
56
+ return
57
+
58
+ print("🚀 Starting RAG Service...")
59
+ print("📍 Service URL: http://0.0.0.0:8001")
60
+ print("📖 API Docs: http://0.0.0.0:8001/docs")
61
+ print("💾 Vector databases will be stored in: src/VectorDB/")
62
+ print("\nPress Ctrl+C to stop the service")
63
+ print("-" * 50)
64
+
65
+ try:
66
+ # Change to project root directory
67
+ os.chdir(Path(__file__).parent)
68
+
69
+ # Start the service in the background
70
+ process = subprocess.Popen([
71
+ sys.executable, "-m", "uvicorn",
72
+ "src.rag_service.main:app",
73
+ "--host", "0.0.0.0",
74
+ "--port", "8001",
75
+ "--reload"
76
+ ])
77
+ print(f"🔄 RAG Service running in background (PID: {process.pid})")
78
+ return process
79
+ except Exception as e:
80
+ print(f"❌ Error starting service: {e}")
81
+ return None
82
+
83
+ def start_streamlit():
84
+ # Start Streamlit UI on port 7860 (default for Spaces)
85
+ os.system("streamlit run src/streamlit_app/interface.py --server.port 7860")
86
+
87
+ if __name__ == "__main__":
88
+ start_service()
89
+ time.sleep(3)
90
+ start_streamlit()
clean_repo/clean_repo/env_template.txt ADDED
@@ -0,0 +1,51 @@
1
+ # Environment Variables Template for RAG Personal Diary Chatbot
2
+ # Copy this file to .env and fill in your actual values
3
+
4
+ # API Keys
5
+ OPENAI_API_KEY=your_openai_api_key_here
6
+ GOOGLE_API_KEY=your_google_api_key_here
7
+ ANTHROPIC_API_KEY=your_anthropic_api_key_here
8
+
9
+ # Database Configuration
10
+ DATABASE_URL=sqlite:///./user_database/auth.db
11
+ VECTOR_DB_PATH=./VectorDB
12
+
13
+ # Model Configuration
14
+ EMBEDDING_MODEL=google-universal-sentence-encoder
15
+ LLM_MODEL=gpt-3.5-turbo
16
+ CHUNK_SIZE=1000
17
+ CHUNK_OVERLAP=200
18
+
19
+ # Server Configuration
20
+ RAG_SERVICE_PORT=8001
21
+ STREAMLIT_PORT=8501
22
+ FASTAPI_PORT=8000
23
+
24
+ # Security
25
+ SECRET_KEY=your_secret_key_here
26
+ JWT_SECRET=your_jwt_secret_here
27
+
28
+ # Logging
29
+ LOG_LEVEL=INFO
30
+ LOG_FILE=./logs/app.log
31
+
32
+ # Vector Database
33
+ CHROMA_DB_PATH=./VectorDB
34
+ PERSIST_DIRECTORY=./VectorDB
35
+
36
+ # File Processing
37
+ SUPPORTED_FORMATS=pdf,docx,txt,md
38
+ MAX_FILE_SIZE=10485760
39
+ TEMP_DIR=./temp
40
+
41
+ # RAG Configuration
42
+ TOP_K_RESULTS=5
43
+ SIMILARITY_THRESHOLD=0.7
44
+ MAX_TOKENS=4096
45
+ TEMPERATURE=0.7
46
+
47
+ # Instructions:
48
+ # 1. Copy this file to .env
49
+ # 2. Replace all placeholder values with your actual configuration
50
+ # 3. Never commit .env to version control
51
+ # 4. Keep your API keys secure
clean_repo/clean_repo/requirements.txt ADDED
Binary file (5.86 kB). View file
 
clean_repo/clean_repo/src/streamlit_app.py ADDED
@@ -0,0 +1,40 @@
 
1
+ import altair as alt
2
+ import numpy as np
3
+ import pandas as pd
4
+ import streamlit as st
5
+
6
+ """
7
+ # Welcome to Streamlit!
8
+
9
+ Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
+ If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
+ forums](https://discuss.streamlit.io).
12
+
13
+ In the meantime, below is an example of what you can do with just a few lines of code:
14
+ """
15
+
16
+ num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
+ num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
+
19
+ indices = np.linspace(0, 1, num_points)
20
+ theta = 2 * np.pi * num_turns * indices
21
+ radius = indices
22
+
23
+ x = radius * np.cos(theta)
24
+ y = radius * np.sin(theta)
25
+
26
+ df = pd.DataFrame({
27
+ "x": x,
28
+ "y": y,
29
+ "idx": indices,
30
+ "rand": np.random.randn(num_points),
31
+ })
32
+
33
+ st.altair_chart(alt.Chart(df, height=700, width=700)
34
+ .mark_point(filled=True)
35
+ .encode(
36
+ x=alt.X("x", axis=None),
37
+ y=alt.Y("y", axis=None),
38
+ color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
+ size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
+ ))
clean_repo/env_template.txt ADDED
@@ -0,0 +1,51 @@
1
+ # Environment Variables Template for RAG Personal Diary Chatbot
2
+ # Copy this file to .env and fill in your actual values
3
+
4
+ # API Keys
5
+ OPENAI_API_KEY=your_openai_api_key_here
6
+ GOOGLE_API_KEY=your_google_api_key_here
7
+ ANTHROPIC_API_KEY=your_anthropic_api_key_here
8
+
9
+ # Database Configuration
10
+ DATABASE_URL=sqlite:///./user_database/auth.db
11
+ VECTOR_DB_PATH=./VectorDB
12
+
13
+ # Model Configuration
14
+ EMBEDDING_MODEL=google-universal-sentence-encoder
15
+ LLM_MODEL=gpt-3.5-turbo
16
+ CHUNK_SIZE=1000
17
+ CHUNK_OVERLAP=200
18
+
19
+ # Server Configuration
20
+ RAG_SERVICE_PORT=8001
21
+ STREAMLIT_PORT=8501
22
+ FASTAPI_PORT=8000
23
+
24
+ # Security
25
+ SECRET_KEY=your_secret_key_here
26
+ JWT_SECRET=your_jwt_secret_here
27
+
28
+ # Logging
29
+ LOG_LEVEL=INFO
30
+ LOG_FILE=./logs/app.log
31
+
32
+ # Vector Database
33
+ CHROMA_DB_PATH=./VectorDB
34
+ PERSIST_DIRECTORY=./VectorDB
35
+
36
+ # File Processing
37
+ SUPPORTED_FORMATS=pdf,docx,txt,md
38
+ MAX_FILE_SIZE=10485760
39
+ TEMP_DIR=./temp
40
+
41
+ # RAG Configuration
42
+ TOP_K_RESULTS=5
43
+ SIMILARITY_THRESHOLD=0.7
44
+ MAX_TOKENS=4096
45
+ TEMPERATURE=0.7
46
+
47
+ # Instructions:
48
+ # 1. Copy this file to .env
49
+ # 2. Replace all placeholder values with your actual configuration
50
+ # 3. Never commit .env to version control
51
+ # 4. Keep your API keys secure
clean_repo/images/DIAGRAM-RAG-diary.png ADDED
clean_repo/notebook/RAG-test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
clean_repo/notebook/exploration.ipynb ADDED
File without changes
clean_repo/requirements.txt ADDED
Binary file (5.86 kB). View file
 
clean_repo/src/Indexingstep/Datasplitting.py ADDED
@@ -0,0 +1,44 @@
1
+ from langchain_text_splitters import CharacterTextSplitter
2
+
3
+ class DataSplitting:
4
+ def __init__(self, chunk_size=1000, chunk_overlap=200, separator="\n\n"):
5
+ """
6
+ Initialize the DataSplitting class.
7
+
8
+ Args:
9
+ chunk_size (int): Maximum size of each chunk
10
+ chunk_overlap (int): Number of characters to overlap between chunks
11
+ separator (str): Character(s) to split on
12
+ """
13
+ self.chunk_size = chunk_size
14
+ self.chunk_overlap = chunk_overlap
15
+ self.separator = separator
16
+ self.text_splitter = CharacterTextSplitter(
17
+ chunk_size=self.chunk_size,
18
+ chunk_overlap=self.chunk_overlap,
19
+ separator=self.separator
20
+ )
21
+
22
+ def split_text(self, text):
23
+ """
24
+ Split the input text into chunks.
25
+
26
+ Args:
27
+ text (str): The text to be split
28
+
29
+ Returns:
30
+ list: List of text chunks
31
+ """
32
+ return self.text_splitter.split_text(text)
33
+
34
+ def split_documents(self, documents):
35
+ """
36
+ Split documents into chunks.
37
+
38
+ Args:
39
+ documents (list): List of documents to be split
40
+
41
+ Returns:
42
+ list: List of document chunks
43
+ """
44
+ return self.text_splitter.split_documents(documents)
clean_repo/src/Indexingstep/database_utils.py ADDED
@@ -0,0 +1,140 @@
1
+ """
2
+ Database utilities and context managers.
3
+ """
4
+
5
+ import sqlite3
6
+ import os
7
+ from contextlib import contextmanager
8
+ from typing import Generator
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ @contextmanager
15
+ def open_db(db_path: str) -> Generator[sqlite3.Connection, None, None]:
16
+ """
17
+ Context manager for database connections.
18
+
19
+ Args:
20
+ db_path: Path to the SQLite database
21
+
22
+ Yields:
23
+ Database connection
24
+ """
25
+ conn = None
26
+ try:
27
+ conn = sqlite3.connect(db_path)
28
+ conn.row_factory = sqlite3.Row
29
+ yield conn
30
+ except Exception as e:
31
+ if conn:
32
+ conn.rollback()
33
+ logger.error(f"Database error with {db_path}: {e}")
34
+ raise
35
+ finally:
36
+ if conn:
37
+ conn.close()
38
+
39
+
40
+ def ensure_database_exists(db_path: str, user_id: int) -> None:
41
+ """
42
+ Ensure user-specific database exists with proper schema.
43
+
44
+ Args:
45
+ db_path: Path to the database file
46
+ user_id: User ID for default value
47
+ """
48
+ if os.path.exists(db_path):
49
+ return
50
+
51
+ # Create directory if it doesn't exist
52
+ os.makedirs(os.path.dirname(db_path), exist_ok=True)
53
+
54
+ with open_db(db_path) as conn:
55
+ cursor = conn.cursor()
56
+
57
+ # Create table schema
58
+ cursor.execute(f"""
59
+ CREATE TABLE IF NOT EXISTS diary_entries (
60
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
61
+ user_id INTEGER NOT NULL DEFAULT {user_id},
62
+ date TEXT NOT NULL,
63
+ content TEXT NOT NULL,
64
+ tags TEXT DEFAULT '',
65
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
66
+ )
67
+ """)
68
+
69
+ # Create index
70
+ cursor.execute("""
71
+ CREATE INDEX IF NOT EXISTS idx_user_date ON diary_entries(user_id, date)
72
+ """)
73
+
74
+ conn.commit()
75
+
76
+ logger.info(f"Created user database: {db_path}")
77
+
78
+
79
+ def migrate_user_data(source_db_path: str, target_db_path: str, user_id: int) -> int:
80
+ """
81
+ Migrate user data from shared database to user-specific database.
82
+
83
+ Args:
84
+ source_db_path: Path to source database
85
+ target_db_path: Path to target database
86
+ user_id: User ID to migrate
87
+
88
+ Returns:
89
+ Number of entries migrated
90
+ """
91
+ if not os.path.exists(source_db_path):
92
+ return 0
93
+
94
+ migrated_count = 0
95
+
96
+ try:
97
+ with open_db(source_db_path) as source_conn:
98
+ with open_db(target_db_path) as target_conn:
99
+ source_cursor = source_conn.cursor()
100
+ target_cursor = target_conn.cursor()
101
+
102
+ # Check if shared DB has user_id column
103
+ source_cursor.execute("PRAGMA table_info(diary_entries)")
104
+ columns = [col[1] for col in source_cursor.fetchall()]
105
+
106
+ if 'user_id' in columns:
107
+ # Migrate specific user data
108
+ source_cursor.execute("""
109
+ SELECT date, content, tags, created_at
110
+ FROM diary_entries
111
+ WHERE user_id = ?
112
+ """, (user_id,))
113
+ else:
114
+ # If no user_id column, migrate all data to user 1 only
115
+ if user_id == 1:
116
+ source_cursor.execute("""
117
+ SELECT date, content, COALESCE(tags, ''), created_at
118
+ FROM diary_entries
119
+ """)
120
+ else:
121
+ return 0
122
+
123
+ rows = source_cursor.fetchall()
124
+
125
+ for row in rows:
126
+ target_cursor.execute("""
127
+ INSERT OR IGNORE INTO diary_entries (user_id, date, content, tags, created_at)
128
+ VALUES (?, ?, ?, ?, ?)
129
+ """, (user_id, row[0], row[1], row[2] if len(row) > 2 else '', row[3] if len(row) > 3 else None))
130
+
131
+ target_conn.commit()
132
+ migrated_count = len(rows)
133
+
134
+ if migrated_count > 0:
135
+ logger.info(f"Migrated {migrated_count} entries for user {user_id}")
136
+
137
+ except Exception as e:
138
+ logger.warning(f"Could not migrate data for user {user_id}: {e}")
139
+
140
+ return migrated_count
clean_repo/src/Indexingstep/dataloading.py ADDED
@@ -0,0 +1,603 @@
1
+ import sqlite3
2
+ from typing import List, Optional, Dict, Any
3
+ from langchain.schema import Document
4
+ from langchain.document_loaders.base import BaseLoader
5
+ import logging
6
+ import re
7
+ from datetime import datetime
8
+
9
+ # Set up logging
10
+ logging.basicConfig(level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+
13
+ class DiaryDataLoader(BaseLoader):
14
+ """
15
+ Custom LangChain document loader for diary entries from SQLite database.
16
+ Enhanced with detailed metadata extraction for better indexing.
17
+ """
18
+
19
+ def __init__(
20
+ self,
21
+ db_path: str,
22
+ table_name: str = "diary_entries",
23
+ content_column: str = "content",
24
+ date_column: str = "date",
25
+ tags_column: str = "tags",
26
+ id_column: str = "id",
27
+ user_id: int = 1
28
+ ):
29
+ """
30
+ Initialize the DiaryDataLoader.
31
+
32
+ Args:
33
+ db_path (str): Path to the SQLite database file
34
+ table_name (str): Name of the table containing diary entries
35
+ content_column (str): Name of the column containing diary content
36
+ date_column (str): Name of the column containing entry dates
37
+ tags_column (str): Name of the column containing entry tags
38
+ id_column (str): Name of the column containing entry IDs
39
+ user_id (int): ID of the user for filtering diary entries
40
+ """
41
+ self.db_path = db_path
42
+ self.table_name = table_name
43
+ self.content_column = content_column
44
+ self.date_column = date_column
45
+ self.tags_column = tags_column
46
+ self.id_column = id_column
47
+ self.user_id = user_id
48
+
49
+ def _extract_tags_from_content(self, content: str) -> List[str]:
50
+ """
51
+ Extract #tags from content string.
52
+
53
+ Args:
54
+ content: The diary content string
55
+
56
+ Returns:
57
+ List of tags found (without # symbol)
58
+ """
59
+ if not content:
60
+ return []
61
+
62
+ # Find all #tags in content
63
+ tag_pattern = r'#(\w+(?:[_-]\w+)*)'
64
+ matches = re.findall(tag_pattern, content, re.IGNORECASE)
65
+
66
+ # Remove duplicates and return lowercase tags
67
+ return list(set([tag.lower() for tag in matches]))
68
+
69
+ def _extract_location_from_content(self, content: str) -> Optional[str]:
70
+ """
71
+ Extract location information from content using common patterns.
72
+
73
+ Args:
74
+ content: The diary content string
75
+
76
+ Returns:
77
+ Location string if found, None otherwise
78
+ """
79
+ if not content:
80
+ return None
81
+
82
+ # Common location patterns
83
+ location_patterns = [
84
+ r'at\s+([A-Z][a-zA-Z\s]+(?:Park|Beach|Mall|Store|Restaurant|Cafe|Office|Home|School|University))',
85
+ r'in\s+([A-Z][a-zA-Z\s]+(?:City|District|Area|Street|Road))',
86
+ r'went\s+to\s+([A-Z][a-zA-Z\s]+)',
87
+ r'visited\s+([A-Z][a-zA-Z\s]+)',
88
+ r'location:\s*([A-Za-z\s]+)',
89
+ r'place:\s*([A-Za-z\s]+)'
90
+ ]
91
+
92
+ for pattern in location_patterns:
93
+ matches = re.findall(pattern, content, re.IGNORECASE)
94
+ if matches:
95
+ return matches[0].strip()
96
+
97
+ return None
98
+
99
+ def _extract_people_from_content(self, content: str) -> List[str]:
100
+ """
101
+ Extract people/relationships mentioned in content.
102
+
103
+ Args:
104
+ content: The diary content string
105
+
106
+ Returns:
107
+ List of people/relationships mentioned
108
+ """
109
+ if not content:
110
+ return []
111
+
112
+ # Common relationship patterns
113
+ people_patterns = [
114
+ r'with\s+(my\s+)?(\w+(?:\s+\w+)?)',
115
+ r'(mom|dad|mother|father|sister|brother|friend|colleague|boss|teacher)',
116
+ r'(family|friends|team|colleagues)',
117
+ r'met\s+([\w\s]+)',
118
+ r'talked\s+to\s+([\w\s]+)'
119
+ ]
120
+
121
+ people = set()
122
+ for pattern in people_patterns:
123
+ matches = re.findall(pattern, content, re.IGNORECASE)
124
+ for match in matches:
125
+ if isinstance(match, tuple):
126
+ for part in match:
127
+ if part.strip():
128
+ people.add(part.strip().lower())
129
+ else:
130
+ people.add(match.strip().lower())
131
+
132
+ # Filter out common words that are not people
133
+ exclude_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
134
+ people = [p for p in people if p not in exclude_words and len(p) > 2]
135
+
136
+ return list(people)
137
+
138
+ def _get_day_of_week(self, date_str: str) -> str:
139
+ """
140
+ Get day of week from date string.
141
+
142
+ Args:
143
+ date_str: Date string in YYYY-MM-DD format
144
+
145
+ Returns:
146
+ Day of week (e.g., 'Monday', 'Tuesday', etc.)
147
+ """
148
+ try:
149
+ date_obj = datetime.strptime(date_str, '%Y-%m-%d')
150
+ return date_obj.strftime('%A')
151
+ except:
152
+ return 'Unknown'
153
+
154
+ def _extract_content_from_structured_format(self, raw_content: str) -> tuple:
155
+ """
156
+ Extract actual content from structured format like:
157
+ Title: xxxx
158
+ Type: Text
159
+ Content: actual content here
160
+
161
+ Returns:
162
+ tuple: (title, actual_content)
163
+ """
164
+ lines = raw_content.strip().split('\n')
165
+ title = ""
166
+ content = ""
167
+
168
+ for line in lines:
169
+ if line.startswith("Title: "):
170
+ title = line.replace("Title: ", "").strip()
171
+ elif line.startswith("Content: "):
172
+ content = line.replace("Content: ", "").strip()
173
+
174
+ # If no structured format found, return original content
175
+ if not content:
176
+ content = raw_content
177
+
178
+ return title, content
179
+
180
+ def load(self) -> List[Document]:
181
+ """
182
+ Load diary entries from the database and convert them to LangChain Documents.
183
+
184
+ Returns:
185
+ List[Document]: List of LangChain Document objects
186
+ """
187
+ documents = []
188
+
189
+ try:
190
+ # Connect to the SQLite database
191
+ conn = sqlite3.connect(self.db_path)
192
+ conn.row_factory = sqlite3.Row # Enable accessing columns by name
193
+ cursor = conn.cursor()
194
+
195
+ # Build the SQL query with all required columns
196
+ columns = [self.id_column, self.date_column, self.content_column, self.tags_column]
197
+
198
+ query = f"SELECT {', '.join(columns)} FROM {self.table_name} WHERE user_id = ? ORDER BY {self.date_column} DESC"
199
+
200
+ # Execute the query
201
+ cursor.execute(query, (self.user_id,))
202
+ rows = cursor.fetchall()
203
+
204
+ logger.info(f"Loaded {len(rows)} diary entries from database")
205
+
206
+ # Convert each row to a LangChain Document with enhanced metadata
207
+ for row in rows:
208
+ row_dict = dict(row) if hasattr(row, 'keys') else {
209
+ self.id_column: row[0],
210
+ self.date_column: row[1],
211
+ self.content_column: row[2],
212
+ self.tags_column: row[3] if len(row) > 3 else ""
213
+ }
214
+
215
+ raw_content = row_dict[self.content_column]
216
+ date = row_dict[self.date_column]
217
+ entry_id = row_dict.get(self.id_column, "unknown")
218
+ db_tags = row_dict.get(self.tags_column, "")
219
+
220
+ # Extract structured content
221
+ title, actual_content = self._extract_content_from_structured_format(raw_content)
222
+
223
+ # Extract comprehensive metadata
224
+ content_tags = self._extract_tags_from_content(actual_content)
225
+ db_tag_list = [tag.strip() for tag in db_tags.split(',') if tag.strip()] if db_tags else []
226
+ all_tags = list(set(content_tags + db_tag_list)) # Combine and deduplicate
227
+
228
+ location = self._extract_location_from_content(actual_content)
229
+ people = self._extract_people_from_content(actual_content)
230
+ day_of_week = self._get_day_of_week(date)
231
+
232
+ # Create comprehensive metadata for the document
233
+ metadata = {
234
+ "source": self.db_path,
235
+ "entry_id": str(entry_id),
236
+ "date": date,
237
+ "day_of_week": day_of_week,
238
+ "type": "diary_entry",
239
+ "tags": all_tags,
240
+ "tag_count": len(all_tags),
241
+ "content_length": len(actual_content),
242
+ "word_count": len(actual_content.split())
243
+ }
244
+
245
+ # Add optional metadata if available
246
+ if title:
247
+ metadata["title"] = title
248
+ if location:
249
+ metadata["location"] = location
250
+ if people:
251
+ metadata["people"] = people
252
+ metadata["people_count"] = len(people)
253
+
254
+ # Add mood/sentiment tags if present
255
+ mood_tags = [tag for tag in all_tags if tag in ['happy', 'sad', 'excited', 'tired', 'angry', 'peaceful', 'stressed', 'grateful', 'frustrated', 'motivated']]
256
+ if mood_tags:
257
+ metadata["mood_tags"] = mood_tags
258
+
259
+ # Create Document object with actual content
260
+ document = Document(
261
+ page_content=actual_content,
262
+ metadata=metadata
263
+ )
264
+
265
+ documents.append(document)
266
+
267
+ conn.close()
268
+ logger.info(f"Successfully converted {len(documents)} entries to Documents")
269
+
270
+ except sqlite3.Error as e:
271
+ logger.error(f"Database error: {e}")
272
+ raise
273
+ except Exception as e:
274
+ logger.error(f"Error loading diary data: {e}")
275
+ raise
276
+
277
+ return documents
278
+
279
+ def load_by_date_range(self, start_date: str, end_date: str) -> List[Document]:
280
+ """
281
+ Load diary entries within a specific date range.
282
+
283
+ Args:
284
+ start_date (str): Start date in YYYY-MM-DD format
285
+ end_date (str): End date in YYYY-MM-DD format
286
+
287
+ Returns:
288
+ List[Document]: Filtered list of Document objects
289
+ """
290
+ documents = []
291
+
292
+ try:
293
+ conn = sqlite3.connect(self.db_path)
294
+ conn.row_factory = sqlite3.Row
295
+ cursor = conn.cursor()
296
+
297
+ columns = [self.content_column, self.date_column]
298
+ # if self.title_column:
299
+ # columns.append(self.title_column)
300
+
301
+ query = f"""
302
+ SELECT {', '.join(columns)}
303
+ FROM {self.table_name}
304
+ WHERE user_id = ? AND {self.date_column} BETWEEN ? AND ?
305
+ ORDER BY {self.date_column}
306
+ """
307
+
308
+ cursor.execute(query, (self.user_id, start_date, end_date))
309
+ rows = cursor.fetchall()
310
+
311
+ logger.info(f"Loaded {len(rows)} diary entries from {start_date} to {end_date}")
312
+
313
+ for row in rows:
314
+ raw_content = row[self.content_column]
315
+ date = row[self.date_column]
316
+
317
+ # Extract structured content
318
+ title, actual_content = self._extract_content_from_structured_format(raw_content)
319
+
320
+ metadata = {
321
+ "source": self.db_path,
322
+ "date": date,
323
+ "type": "diary_entry",
324
+ "date_range": f"{start_date}_to_{end_date}"
325
+ }
326
+
327
+ # Add title to metadata if available
328
+ if title:
329
+ metadata["title"] = title
330
+
331
+ document = Document(
332
+ page_content=actual_content,
333
+ metadata=metadata
334
+ )
335
+
336
+ documents.append(document)
337
+
338
+ conn.close()
339
+
340
+ except sqlite3.Error as e:
341
+ logger.error(f"Database error: {e}")
342
+ raise
343
+ except Exception as e:
344
+ logger.error(f"Error loading diary data by date range: {e}")
345
+ raise
346
+
347
+ return documents
348
+
349
+ def get_table_info(self) -> dict:
350
+ """
351
+ Get information about the database table structure.
352
+
353
+ Returns:
354
+ dict: Table information including columns and row count
355
+ """
356
+ try:
357
+ conn = sqlite3.connect(self.db_path)
358
+ cursor = conn.cursor()
359
+
360
+ # Get table schema
361
+ cursor.execute(f"PRAGMA table_info({self.table_name})")
362
+ columns = cursor.fetchall()
363
+
364
+ # Get row count
365
+ cursor.execute(f"SELECT COUNT(*) FROM {self.table_name}")
366
+ row_count = cursor.fetchone()[0]
367
+
368
+ conn.close()
369
+
370
+ return {
371
+ "table_name": self.table_name,
372
+ "columns": [{"name": col[1], "type": col[2]} for col in columns],
373
+ "row_count": row_count
374
+ }
375
+
376
+ except sqlite3.Error as e:
377
+ logger.error(f"Database error: {e}")
378
+ raise
379
+
380
+ class DiaryContentPreprocessor:
381
+ """
382
+ Preprocessor for diary content to clean and standardize text before indexing.
383
+ """
384
+
385
+ def __init__(
386
+ self,
387
+ remove_extra_whitespace: bool = True,
388
+ normalize_line_breaks: bool = True,
389
+ min_content_length: int = 10,
390
+ max_content_length: Optional[int] = None
391
+ ):
392
+ """
393
+ Initialize the content preprocessor.
394
+
395
+ Args:
396
+ remove_extra_whitespace (bool): Remove extra spaces and tabs
397
+ normalize_line_breaks (bool): Normalize line breaks to single newlines
398
+ min_content_length (int): Minimum content length to keep
399
+ max_content_length (int, optional): Maximum content length to keep
400
+ """
401
+ self.remove_extra_whitespace = remove_extra_whitespace
402
+ self.normalize_line_breaks = normalize_line_breaks
403
+ self.min_content_length = min_content_length
404
+ self.max_content_length = max_content_length
405
+
406
+ def preprocess_content(self, content: str) -> str:
407
+ """
408
+ Preprocess diary content text.
409
+
410
+ Args:
411
+ content (str): Raw diary content
412
+
413
+ Returns:
414
+ str: Preprocessed content
415
+ """
416
+ if not content or not isinstance(content, str):
417
+ return ""
418
+
419
+ processed_content = content
420
+
421
+ # Remove extra whitespace
422
+ if self.remove_extra_whitespace:
423
+ processed_content = ' '.join(processed_content.split())
424
+
425
+ # Normalize line breaks
426
+ if self.normalize_line_breaks:
427
+ processed_content = processed_content.replace('\r\n', '\n').replace('\r', '\n')
428
+ # Remove multiple consecutive newlines
429
+ processed_content = re.sub(r'\n+', '\n', processed_content)
430
+
431
+ # Strip leading/trailing whitespace
432
+ processed_content = processed_content.strip()
433
+
434
+ # Check length constraints
435
+ if len(processed_content) < self.min_content_length:
436
+ logger.warning(f"Content too short ({len(processed_content)} chars), skipping")
437
+ return ""
438
+
439
+ if self.max_content_length and len(processed_content) > self.max_content_length:
440
+ logger.warning(f"Content too long ({len(processed_content)} chars), truncating")
441
+ processed_content = processed_content[:self.max_content_length]
442
+
443
+ return processed_content
444
+
445
+ def preprocess_documents(self, documents: List[Document]) -> List[Document]:
446
+ """
447
+ Preprocess a list of Document objects.
448
+
449
+ Args:
450
+ documents (List[Document]): List of documents to preprocess
451
+
452
+ Returns:
453
+ List[Document]: List of preprocessed documents
454
+ """
455
+ preprocessed_docs = []
456
+
457
+ for doc in documents:
458
+ processed_content = self.preprocess_content(doc.page_content)
459
+
460
+ # Skip empty content after preprocessing
461
+ if not processed_content:
462
+ continue
463
+
464
+ # Create new document with processed content
465
+ preprocessed_doc = Document(
466
+ page_content=processed_content,
467
+ metadata=doc.metadata.copy()
468
+ )
469
+
470
+ preprocessed_docs.append(preprocessed_doc)
471
+
472
+ logger.info(f"Preprocessed {len(documents)} documents, kept {len(preprocessed_docs)}")
473
+ return preprocessed_docs
474
+
475
+ def load_all_entries(self, user_id: int = None) -> List[Dict[str, Any]]:
476
+ """
477
+ Load all diary entries for a specific user.
478
+
479
+ Args:
480
+ user_id: User ID to filter entries
481
+
482
+ Returns:
483
+ List of diary entry dictionaries
484
+ """
485
+ if user_id is None:
486
+ user_id = self.user_id
487
+
488
+ entries = []
489
+
490
+ try:
491
+ conn = sqlite3.connect(self.db_path)
492
+ conn.row_factory = sqlite3.Row
493
+ cursor = conn.cursor()
494
+
495
+ query = f"""
496
+ SELECT id, user_id, date, content, tags, created_at
497
+ FROM {self.table_name}
498
+ WHERE user_id = ?
499
+ ORDER BY date DESC, created_at DESC
500
+ """
501
+
502
+ cursor.execute(query, (user_id,))
503
+ rows = cursor.fetchall()
504
+
505
+ for row in rows:
506
+ entries.append({
507
+ 'id': row['id'],
508
+ 'user_id': row['user_id'],
509
+ 'date': row['date'],
510
+ 'content': row['content'],
511
+ 'tags': row['tags'] or '',
512
+ 'created_at': row['created_at']
513
+ })
514
+
515
+ conn.close()
516
+ logger.info(f"Loaded {len(entries)} entries for user {user_id}")
517
+
518
+ except sqlite3.Error as e:
519
+ logger.error(f"Database error loading entries: {e}")
520
+
521
+ return entries
522
+
523
+ def load_entries_since(self, since_date, user_id: int = None) -> List[Dict[str, Any]]:
524
+ """
525
+ Load diary entries since a specific date.
526
+
527
+ Args:
528
+ since_date: datetime object or ISO string
529
+ user_id: User ID to filter entries
530
+
531
+ Returns:
532
+ List of diary entry dictionaries
533
+ """
534
+ if user_id is None:
535
+ user_id = self.user_id
536
+
537
+ entries = []
538
+
539
+ try:
540
+ # Convert datetime to string if needed
541
+ if hasattr(since_date, 'isoformat'):
542
+ since_str = since_date.isoformat()
543
+ else:
544
+ since_str = str(since_date)
545
+
546
+ conn = sqlite3.connect(self.db_path)
547
+ conn.row_factory = sqlite3.Row
548
+ cursor = conn.cursor()
549
+
550
+ query = f"""
551
+ SELECT id, user_id, date, content, tags, created_at
552
+ FROM {self.table_name}
553
+ WHERE user_id = ? AND created_at > ?
554
+ ORDER BY date DESC, created_at DESC
555
+ """
556
+
557
+ cursor.execute(query, (user_id, since_str))
558
+ rows = cursor.fetchall()
559
+
560
+ for row in rows:
561
+ entries.append({
562
+ 'id': row['id'],
563
+ 'user_id': row['user_id'],
564
+ 'date': row['date'],
565
+ 'content': row['content'],
566
+ 'tags': row['tags'] or '',
567
+ 'created_at': row['created_at']
568
+ })
569
+
570
+ conn.close()
571
+ logger.info(f"Loaded {len(entries)} entries since {since_str} for user {user_id}")
572
+
573
+ except sqlite3.Error as e:
574
+ logger.error(f"Database error loading entries since {since_date}: {e}")
575
+
576
+ return entries
577
+
578
+ # Example usage
579
+ if __name__ == "__main__":
580
+ # Initialize the loader
581
+ loader = DiaryDataLoader(
582
+ db_path="../streamlit_app/backend/diary.db",
583
+ table_name="diary_entries",
584
+ content_column="content",
585
+ date_column="date" #,
586
+ # title_column="title"
587
+ )
588
+
589
+ # Load all documents
590
+ documents = loader.load()
591
+ print(f"Loaded {len(documents)} diary entries")
592
+
593
+ # Load documents by date range
594
+ filtered_docs = loader.load_by_date_range("2024-01-01", "2026-12-31")
595
+ print(f"Loaded {len(filtered_docs)} entries from 2024")
596
+
597
+ # Get table information
598
+ table_info = loader.get_table_info()
599
+ print(f"Table info: {table_info}")
600
+
601
+ # view document contents
602
+ for doc in documents:
603
+ print(f"Document content: {doc.page_content}")
clean_repo/src/Indexingstep/diary_text_splitter.py ADDED
@@ -0,0 +1,241 @@
1
+ """
2
+ Custom text splitter optimized for diary entries.
3
+ Handles entry-based chunking with smart splitting for long entries.
4
+ """
5
+
6
+ from typing import List, Optional, Any, Dict
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain.schema import Document
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ class DiaryTextSplitter:
14
+ """
15
+ Custom text splitter optimized for diary entries.
16
+
17
+ Strategy:
18
+ 1. Each diary entry = 1 chunk (for short entries)
19
+ 2. Long entries → split into 200-300 tokens with 50-token sliding window
20
+ 3. Preserve metadata across all chunks
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ chunk_size: int = 300, # ~200-300 tokens
26
+ chunk_overlap: int = 50, # ~50 tokens overlap
27
+ length_function: callable = len,
28
+ separators: Optional[List[str]] = None
29
+ ):
30
+ """
31
+ Initialize the DiaryTextSplitter.
32
+
33
+ Args:
34
+ chunk_size: Maximum chunk size in characters (~300 chars ≈ 200-300 tokens)
35
+ chunk_overlap: Overlap between chunks to preserve context
36
+ length_function: Function to calculate text length
37
+ separators: List of separators for splitting (sentence-aware)
38
+ """
39
+ self.chunk_size = chunk_size
40
+ self.chunk_overlap = chunk_overlap
41
+ self.length_function = length_function
42
+
43
+ # Diary-optimized separators (sentence and paragraph aware)
44
+ self.separators = separators or [
45
+ "\n\n", # Paragraph breaks
46
+ "\n", # Line breaks
47
+ ". ", # Sentence endings
48
+ "! ", # Exclamation sentences
49
+ "? ", # Question sentences
50
+ "; ", # Semicolon breaks
51
+ ", ", # Comma breaks
52
+ " ", # Word breaks
53
+ "" # Character breaks (last resort)
54
+ ]
55
+
56
+ # Initialize recursive character splitter for long entries
57
+ self.text_splitter = RecursiveCharacterTextSplitter(
58
+ chunk_size=self.chunk_size,
59
+ chunk_overlap=self.chunk_overlap,
60
+ length_function=self.length_function,
61
+ separators=self.separators
62
+ )
63
+
64
+ def _estimate_tokens(self, text: str) -> int:
65
+ """
66
+ Estimate token count from character count.
67
+ Rule of thumb: ~4 characters per token for English text.
68
+
69
+ Args:
70
+ text: Input text
71
+
72
+ Returns:
73
+ Estimated token count
74
+ """
75
+ return len(text) // 4
76
+
77
+ def _should_split_entry(self, content: str) -> bool:
78
+ """
79
+ Determine if a diary entry should be split into multiple chunks.
80
+
81
+ Args:
82
+ content: Diary entry content
83
+
84
+ Returns:
85
+ True if entry should be split, False otherwise
86
+ """
87
+ estimated_tokens = self._estimate_tokens(content)
88
+ # Split if entry is longer than ~250 tokens (considering our 200-300 target)
89
+ return estimated_tokens > 250
90
+
91
+ def _create_chunk_metadata(self, original_doc: Document, chunk_index: int, total_chunks: int) -> Dict[str, Any]:
92
+ """
93
+ Create metadata for a chunk, preserving original metadata.
94
+
95
+ Args:
96
+ original_doc: Original document
97
+ chunk_index: Index of current chunk (0-based)
98
+ total_chunks: Total number of chunks for this entry
99
+
100
+ Returns:
101
+ Metadata dictionary for the chunk
102
+ """
103
+ chunk_metadata = original_doc.metadata.copy()
104
+
105
+ # Add chunk-specific metadata
106
+ chunk_metadata.update({
107
+ "chunk_index": chunk_index,
108
+ "total_chunks": total_chunks,
109
+ "is_chunked": total_chunks > 1,
110
+ "chunk_id": f"{chunk_metadata.get('entry_id', 'unknown')}_{chunk_index}"
111
+ })
112
+
113
+ return chunk_metadata
114
+
115
+ def split_documents(self, documents: List[Document]) -> List[Document]:
116
+ """
117
+ Split diary documents into optimized chunks.
118
+
119
+ Args:
120
+ documents: List of diary entry documents
121
+
122
+ Returns:
123
+ List of chunked documents with preserved metadata
124
+ """
125
+ chunked_documents = []
126
+
127
+ for doc in documents:
128
+ content = doc.page_content
129
+
130
+ # Check if entry needs splitting
131
+ if not self._should_split_entry(content):
132
+ # Keep as single chunk for short entries
133
+ chunk_metadata = self._create_chunk_metadata(doc, 0, 1)
134
+
135
+ chunked_doc = Document(
136
+ page_content=content,
137
+ metadata=chunk_metadata
138
+ )
139
+ chunked_documents.append(chunked_doc)
140
+
141
+ logger.debug(f"Entry {doc.metadata.get('entry_id', 'unknown')} kept as single chunk")
142
+
143
+ else:
144
+ # Split long entry into multiple chunks
145
+ text_chunks = self.text_splitter.split_text(content)
146
+ total_chunks = len(text_chunks)
147
+
148
+ logger.info(f"Entry {doc.metadata.get('entry_id', 'unknown')} split into {total_chunks} chunks")
149
+
150
+ for i, chunk_text in enumerate(text_chunks):
151
+ chunk_metadata = self._create_chunk_metadata(doc, i, total_chunks)
152
+
153
+ # Add chunk position information
154
+ chunk_metadata["chunk_position"] = "start" if i == 0 else "end" if i == total_chunks - 1 else "middle"
155
+
156
+ chunked_doc = Document(
157
+ page_content=chunk_text,
158
+ metadata=chunk_metadata
159
+ )
160
+ chunked_documents.append(chunked_doc)
161
+
162
+ logger.info(f"Split {len(documents)} entries into {len(chunked_documents)} chunks")
163
+ return chunked_documents
164
+
165
+ def get_chunk_stats(self, documents: List[Document]) -> Dict[str, Any]:
166
+ """
167
+ Get statistics about chunking results.
168
+
169
+ Args:
170
+ documents: List of chunked documents
171
+
172
+ Returns:
173
+ Dictionary with chunking statistics
174
+ """
175
+ total_chunks = len(documents)
176
+ single_chunks = sum(1 for doc in documents if doc.metadata.get("total_chunks", 1) == 1)
177
+ multi_chunks = total_chunks - single_chunks
178
+
179
+ unique_entries = len(set(doc.metadata.get("entry_id", "unknown") for doc in documents))
180
+
181
+ avg_chunk_size = sum(len(doc.page_content) for doc in documents) / total_chunks if total_chunks > 0 else 0
182
+ avg_tokens = sum(self._estimate_tokens(doc.page_content) for doc in documents) / total_chunks if total_chunks > 0 else 0
183
+
184
+ return {
185
+ "total_chunks": total_chunks,
186
+ "unique_entries": unique_entries,
187
+ "single_chunk_entries": single_chunks,
188
+ "multi_chunk_entries": multi_chunks,
189
+ "avg_chunk_size_chars": round(avg_chunk_size, 2),
190
+ "avg_chunk_size_tokens": round(avg_tokens, 2),
191
+ "chunking_ratio": round(total_chunks / unique_entries, 2) if unique_entries > 0 else 0
192
+ }
193
+
194
+ def split_diary_entry(self, entry: Dict[str, Any]) -> List[Document]:
195
+ """
196
+ Split a single diary entry into document chunks.
197
+
198
+ Args:
199
+ entry: Dictionary containing diary entry data
200
+
201
+ Returns:
202
+ List of Document objects
203
+ """
204
+ # Create Document from entry
205
+ content = entry.get('content', '')
206
+
207
+ # Extract title from content if it's in structured format
208
+ title = ""
209
+ actual_content = content
210
+
211
+ if content.startswith("Title: "):
212
+ lines = content.split('\n')
213
+ for line in lines:
214
+ if line.startswith("Title: "):
215
+ title = line.replace("Title: ", "").strip()
216
+ elif line.startswith("Content: "):
217
+ actual_content = line.replace("Content: ", "").strip()
218
+
219
+ # Create metadata
220
+ metadata = {
221
+ "entry_id": str(entry.get('id', 'unknown')),
222
+ "user_id": entry.get('user_id', 1),
223
+ "date": entry.get('date', ''),
224
+ "tags": entry.get('tags', ''),
225
+ "created_at": entry.get('created_at', ''),
226
+ "type": "diary_entry",
227
+ "content_length": len(actual_content),
228
+ "word_count": len(actual_content.split())
229
+ }
230
+
231
+ if title:
232
+ metadata["title"] = title
233
+
234
+ # Create Document
235
+ doc = Document(
236
+ page_content=actual_content,
237
+ metadata=metadata
238
+ )
239
+
240
+ # Split using the existing split_documents method
241
+ return self.split_documents([doc])
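
To make the entry-based strategy concrete, here is a hedged sketch of feeding a single synthetic entry through `split_diary_entry` and inspecting the chunk statistics. The entry dict mirrors the fields the method reads (`id`, `user_id`, `date`, `content`, `tags`, `created_at`), and the long `Content:` line deliberately forces the multi-chunk path:

```python
# Sketch: a long synthetic entry is split into overlapping chunks;
# a short one would be kept as a single chunk with total_chunks == 1.
splitter = DiaryTextSplitter(chunk_size=300, chunk_overlap=50)

entry = {
    "id": 42,
    "user_id": 1,
    "date": "2024-03-01",
    "content": "Title: Long walk\nContent: " + "A long reflective entry about the park and the weather. " * 30,
    "tags": "walk,reflection",
    "created_at": "2024-03-01T21:00:00",
}

chunks = splitter.split_diary_entry(entry)
for c in chunks[:2]:
    print(c.metadata["chunk_id"], c.metadata["chunk_position"], len(c.page_content))
print(splitter.get_chunk_stats(chunks))
```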
clean_repo/src/Indexingstep/embedding_and_storing.py ADDED
@@ -0,0 +1,499 @@
1
+ from langchain_chroma import Chroma
2
+ from langchain.schema import Document
3
+ from typing import List, Optional, Dict, Any, Union
4
+ import os
5
+ import logging
6
+ from pathlib import Path
7
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
8
+
9
+ # Set up logging
10
+ logging.basicConfig(level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+
13
+ class DiaryEmbeddingAndStorage:
14
+ """
15
+ Class for embedding diary documents and storing them in Chroma vector database.
16
+ Enhanced with metadata filtering for ChromaDB compatibility.
17
+ """
18
+
19
+ def _filter_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Union[str, int, float, bool]]:
20
+ """
21
+ Filter metadata to only include types supported by ChromaDB.
22
+
23
+ Args:
24
+ metadata: Original metadata dictionary
25
+
26
+ Returns:
27
+ Filtered metadata with only supported types
28
+ """
29
+ filtered = {}
30
+
31
+ for key, value in metadata.items():
32
+ if isinstance(value, (str, int, float, bool)) or value is None:
33
+ filtered[key] = value
34
+ elif isinstance(value, list):
35
+ # Convert lists to comma-separated strings
36
+ if value: # Only if list is not empty
37
+ filtered[f"{key}_list"] = ", ".join(str(item) for item in value)
38
+ filtered[f"{key}_count"] = len(value)
39
+ elif isinstance(value, dict):
40
+ # Skip complex nested objects
41
+ logger.debug(f"Skipping complex metadata field: {key}")
42
+ continue
43
+ else:
44
+ # Convert other types to string
45
+ filtered[key] = str(value)
46
+
47
+ return filtered
48
+
49
+ def __init__(
50
+ self,
51
+ user_id: int = 1,
52
+ api_key: Optional[str] = None,
53
+ base_persist_directory: str = "./",
54
+ embedding_model: str = "models/embedding-001",
55
+ chunk_size: int = 1000,
56
+ chunk_overlap: int = 200
57
+ ):
58
+ """
59
+ Initialize the embedding and storage system with user-specific database.
60
+
61
+ Args:
62
+ user_id (int): User ID for user-specific vector database
63
+ api_key (str, optional): Google API key for embeddings
64
+ base_persist_directory (str): Base directory for vector databases
65
+ embedding_model (str): Google embedding model to use
66
+ chunk_size (int): Size of text chunks for embedding
67
+ chunk_overlap (int): Overlap between chunks
68
+ """
69
+ # Set up Google API key
70
+ if api_key:
71
+ os.environ["GOOGLE_API_KEY"] = api_key
72
+ elif "GOOGLE_API_KEY" not in os.environ:
73
+ raise ValueError("Google API key must be provided either as parameter or environment variable")
74
+
75
+ self.user_id = user_id
76
+ self.base_persist_directory = base_persist_directory
77
+
78
+ # Create user-specific paths
79
+ self.persist_directory = os.path.join(base_persist_directory, f"user_{user_id}_vector_db")
80
+ self.collection_name = f"user_{user_id}_diary_entries"
81
+
82
+ self.chunk_size = chunk_size
83
+ self.chunk_overlap = chunk_overlap
84
+
85
+ # Initialize embedding model
86
+ try:
87
+ self.embeddings = GoogleGenerativeAIEmbeddings(
88
+ model=embedding_model
89
+ )
90
+ # logger.info(f"Initialized Google embeddings with model: {embedding_model}")
91
+ except Exception as e:
92
+ logger.error(f"Failed to initialize embeddings: {e}")
93
+ raise
94
+
95
+ # Initialize or load existing vector store
96
+ self.vector_store = None
97
+ self._setup_vector_store()
98
+
99
+ def _setup_vector_store(self):
100
+ """Set up the Chroma vector store."""
101
+ try:
102
+ # Create persist directory if it doesn't exist
103
+ Path(self.persist_directory).mkdir(parents=True, exist_ok=True)
104
+
105
+ # Initialize Chroma vector store
106
+ self.vector_store = Chroma(
107
+ collection_name=self.collection_name,
108
+ embedding_function=self.embeddings,
109
+ persist_directory=self.persist_directory
110
+ )
111
+
112
+ # logger.info(f"Vector store initialized with persist directory: {self.persist_directory}")
113
+
114
+ except Exception as e:
115
+ logger.error(f"Failed to setup vector store: {e}")
116
+ raise
117
+
118
+ def embed_and_store_documents(self, documents: List[Document]) -> List[str]:
119
+ """
120
+ Embed and store documents in the vector database.
121
+
122
+ Args:
123
+ documents (List[Document]): List of LangChain Document objects
124
+
125
+ Returns:
126
+ List[str]: List of document IDs
127
+ """
128
+ if not documents:
129
+ logger.warning("No documents provided for embedding")
130
+ return []
131
+
132
+ try:
133
+ # Filter metadata for each document
134
+ filtered_documents = []
135
+ for doc in documents:
136
+ filtered_metadata = self._filter_metadata(doc.metadata)
137
+ filtered_doc = Document(
138
+ page_content=doc.page_content,
139
+ metadata=filtered_metadata
140
+ )
141
+ filtered_documents.append(filtered_doc)
142
+
143
+ # Log metadata transformation for debugging
144
+ logger.debug(f"Original metadata keys: {list(doc.metadata.keys())}")
145
+ logger.debug(f"Filtered metadata keys: {list(filtered_metadata.keys())}")
146
+
147
+ # Add documents to vector store
148
+ document_ids = self.vector_store.add_documents(filtered_documents)
149
+
150
+ # Persist the vector store (auto-persisted in new langchain-chroma)
151
+ # self.vector_store.persist() # Not needed in langchain-chroma 0.2+
152
+
153
+ logger.info(f"Successfully embedded and stored {len(filtered_documents)} documents")
154
+ return document_ids
155
+
156
+ except Exception as e:
157
+ logger.error(f"Failed to embed and store documents: {e}")
158
+ raise
159
+
160
+ def embed_and_store_texts(
161
+ self,
162
+ texts: List[str],
163
+ metadatas: Optional[List[Dict[str, Any]]] = None
164
+ ) -> List[str]:
165
+ """
166
+ Embed and store raw texts in the vector database.
167
+
168
+ Args:
169
+ texts (List[str]): List of text strings
170
+ metadatas (List[Dict], optional): List of metadata dictionaries
171
+
172
+ Returns:
173
+ List[str]: List of document IDs
174
+ """
175
+ if not texts:
176
+ logger.warning("No texts provided for embedding")
177
+ return []
178
+
179
+ try:
180
+ # Filter metadata if provided
181
+ filtered_metadatas = None
182
+ if metadatas:
183
+ filtered_metadatas = []
184
+ for metadata in metadatas:
185
+ filtered_metadata = self._filter_metadata(metadata)
186
+ filtered_metadatas.append(filtered_metadata)
187
+
188
+ # Log metadata transformation for debugging
189
+ logger.debug(f"Original metadata keys: {list(metadata.keys())}")
190
+ logger.debug(f"Filtered metadata keys: {list(filtered_metadata.keys())}")
191
+
192
+ # Add texts to vector store
193
+ document_ids = self.vector_store.add_texts(
194
+ texts=texts,
195
+ metadatas=filtered_metadatas
196
+ )
197
+
198
+ # ChromaDB auto-persists in newer versions
199
+ logger.info(f"Successfully embedded and stored {len(texts)} text documents")
200
+ return document_ids
201
+
202
+ except Exception as e:
203
+ print(f"DEBUG: Error in embed_and_store_texts: {e}")
204
+ print(f"DEBUG: Error type: {type(e)}")
205
+ import traceback
206
+ traceback.print_exc()
207
+ logger.error(f"Failed to embed and store texts: {e}")
208
+ raise
209
+
210
+ def similarity_search(
211
+ self,
212
+ query: str,
213
+ k: int = 4,
214
+ filter: Optional[Dict[str, Any]] = None
215
+ ) -> List[Document]:
216
+ """
217
+ Perform similarity search on stored documents.
218
+
219
+ Args:
220
+ query (str): Search query
221
+ k (int): Number of results to return
222
+ filter (Dict, optional): Metadata filter
223
+
224
+ Returns:
225
+ List[Document]: List of similar documents
226
+ """
227
+ try:
228
+ results = self.vector_store.similarity_search(
229
+ query=query,
230
+ k=k,
231
+ filter=filter
232
+ )
233
+
234
+ logger.info(f"Found {len(results)} similar documents for query: '{query[:50]}...'")
235
+ return results
236
+
237
+ except Exception as e:
238
+ logger.error(f"Failed to perform similarity search: {e}")
239
+ raise
240
+
241
+ def similarity_search_with_score(
242
+ self,
243
+ query: str,
244
+ k: int = 4,
245
+ filter: Optional[Dict[str, Any]] = None
246
+ ) -> List[tuple]:
247
+ """
248
+ Perform similarity search with relevance scores.
249
+
250
+ Args:
251
+ query (str): Search query
252
+ k (int): Number of results to return
253
+ filter (Dict, optional): Metadata filter
254
+
255
+ Returns:
256
+ List[tuple]: List of (Document, score) tuples
257
+ """
258
+ try:
259
+ results = self.vector_store.similarity_search_with_score(
260
+ query=query,
261
+ k=k,
262
+ filter=filter
263
+ )
264
+
265
+ logger.info(f"Found {len(results)} similar documents with scores for query: '{query[:50]}...'")
266
+ return results
267
+
268
+ except Exception as e:
269
+ logger.error(f"Failed to perform similarity search with scores: {e}")
270
+ raise
271
+
272
+ def get_collection_info(self) -> Dict[str, Any]:
273
+ """
274
+ Get information about the vector store collection.
275
+
276
+ Returns:
277
+ Dict: Collection information
278
+ """
279
+ try:
280
+ collection = self.vector_store._collection
281
+ count = collection.count()
282
+
283
+ return {
284
+ "collection_name": self.collection_name,
285
+ "document_count": count,
286
+ "persist_directory": self.persist_directory
287
+ }
288
+
289
+ except Exception as e:
290
+ logger.error(f"Failed to get collection info: {e}")
291
+ return {}
292
+
293
+ def delete_documents(self, ids: List[str]) -> bool:
294
+ """
295
+ Delete documents by their IDs.
296
+
297
+ Args:
298
+ ids (List[str]): List of document IDs to delete
299
+
300
+ Returns:
301
+ bool: Success status
302
+ """
303
+ try:
304
+ self.vector_store.delete(ids=ids)
305
+ # ChromaDB auto-persists in newer versions
306
+
307
+ logger.info(f"Successfully deleted {len(ids)} documents")
308
+ return True
309
+
310
+ except Exception as e:
311
+ logger.error(f"Failed to delete documents: {e}")
312
+ return False
313
+
314
+ def delete_documents_by_metadata(self, filter_criteria: Dict[str, Any]) -> bool:
315
+ """
316
+ Delete documents based on metadata criteria.
317
+
318
+ Args:
319
+ filter_criteria (Dict): Metadata criteria to filter documents for deletion
320
+
321
+ Returns:
322
+ bool: Success status
323
+ """
324
+ try:
325
+ collection = self.vector_store._collection
326
+
327
+ # Get all documents with their metadata
328
+ all_data = collection.get(include=['metadatas'])
329
+ ids_to_delete = []
330
+
331
+ # Find documents that match the criteria
332
+ for i, metadata in enumerate(all_data['metadatas']):
333
+ match = True
334
+ for key, value in filter_criteria.items():
335
+ if metadata.get(key) != value:
336
+ match = False
337
+ break
338
+
339
+ if match:
340
+ ids_to_delete.append(all_data['ids'][i])
341
+
342
+ if ids_to_delete:
343
+ self.vector_store.delete(ids=ids_to_delete)
344
+ # ChromaDB auto-persists in newer versions
345
+ logger.info(f"Successfully deleted {len(ids_to_delete)} documents matching criteria: {filter_criteria}")
346
+ return True
347
+ else:
348
+ logger.info(f"No documents found matching criteria: {filter_criteria}")
349
+ return True
350
+
351
+ except Exception as e:
352
+ logger.error(f"Failed to delete documents by metadata: {e}")
353
+ return False
354
+
355
+ def clear_collection(self) -> bool:
356
+ """
357
+ Clear all documents from the collection.
358
+
359
+ Returns:
360
+ bool: Success status
361
+ """
362
+ try:
363
+ # Get all document IDs and delete them
364
+ collection = self.vector_store._collection
365
+ all_ids = collection.get()['ids']
366
+
367
+ if all_ids:
368
+ self.vector_store.delete(ids=all_ids)
369
+ # ChromaDB auto-persists in newer versions
370
+ logger.info(f"Cleared {len(all_ids)} documents from collection")
371
+ else:
372
+ logger.info("Collection is already empty")
373
+
374
+ return True
375
+
376
+ except Exception as e:
377
+ logger.error(f"Failed to clear collection: {e}")
378
+ return False
379
+
380
+ def batch_process_documents(
381
+ self,
382
+ documents: List[Document],
383
+ batch_size: int = 100
384
+ ) -> List[str]:
385
+ """
386
+ Process documents in batches for large datasets.
387
+
388
+ Args:
389
+ documents (List[Document]): List of documents to process
390
+ batch_size (int): Size of each batch
391
+
392
+ Returns:
393
+ List[str]: List of all document IDs
394
+ """
395
+ all_ids = []
396
+
397
+ for i in range(0, len(documents), batch_size):
398
+ batch = documents[i:i + batch_size]
399
+ logger.info(f"Processing batch {i//batch_size + 1}/{(len(documents)-1)//batch_size + 1}")
400
+
401
+ try:
402
+ batch_ids = self.embed_and_store_documents(batch)
403
+ all_ids.extend(batch_ids)
404
+ except Exception as e:
405
+ logger.error(f"Failed to process batch {i//batch_size + 1}: {e}")
406
+ continue
407
+
408
+ logger.info(f"Completed batch processing. Total documents processed: {len(all_ids)}")
409
+ return all_ids
410
+
411
+ class EmbeddingDemo:
412
+ def __init__(self, api_key=None):
413
+ """Initialize the embedding model with Google API key."""
414
+ if api_key:
415
+ os.environ["GOOGLE_API_KEY"] = api_key
416
+
417
+ self.embeddings = GoogleGenerativeAIEmbeddings(
418
+ model="models/embedding-001"
419
+ )
420
+
421
+ def embed_text(self, text):
422
+ """Generate embedding for a single text."""
423
+ return self.embeddings.embed_query(text)
424
+
425
+ def embed_documents(self, documents):
426
+ """Generate embeddings for multiple documents."""
427
+ return self.embeddings.embed_documents(documents)
428
+
429
+ def demonstrate(self):
430
+ """Show basic embedding functionality."""
431
+ sample_text = "This is a sample text for embedding."
432
+ sample_docs = ["First document", "Second document", "Third document"]
433
+
434
+ # Single text embedding
435
+ text_embedding = self.embed_text(sample_text)
436
+ print(f"Text embedding dimension: {len(text_embedding)}")
437
+
438
+ # Multiple documents embedding
439
+ doc_embeddings = self.embed_documents(sample_docs)
440
+ print(f"Number of document embeddings: {len(doc_embeddings)}")
441
+ print(f"Each embedding dimension: {len(doc_embeddings[0])}")
442
+
443
+ # Usage example
444
+ if __name__ == "__main__":
445
+ # Initialize the embedding and storage system
446
+ try:
447
+ # You need to set your Google API key
448
+ embedding_storage = DiaryEmbeddingAndStorage(
449
+ api_key="your_google_api_key_here", # Replace with your actual API key
450
+ user_id=1,  # the vector DB path and collection name are derived from the user ID
451
+ base_persist_directory="./diary_vector_db"
452
+ )
453
+
454
+ # Example documents
455
+ sample_documents = [
456
+ Document(
457
+ page_content="Today was a wonderful day. I went to the park and enjoyed the sunshine.",
458
+ metadata={"date": "2024-01-15", "mood": "happy"}
459
+ ),
460
+ Document(
461
+ page_content="Had a challenging day at work but learned a lot of new things.",
462
+ metadata={"date": "2024-01-16", "mood": "productive"}
463
+ ),
464
+ Document(
465
+ page_content="Spent time with family and friends. Made some great memories.",
466
+ metadata={"date": "2024-01-17", "mood": "grateful"}
467
+ )
468
+ ]
469
+
470
+ # Embed and store documents
471
+ doc_ids = embedding_storage.embed_and_store_documents(sample_documents)
472
+ print(f"Stored documents with IDs: {doc_ids}")
473
+
474
+ # Get collection info
475
+ info = embedding_storage.get_collection_info()
476
+ print(f"Collection info: {info}")
477
+
478
+ # Perform similarity search
479
+ query = "happy day at the park"
480
+ results = embedding_storage.similarity_search(query, k=2)
481
+
482
+ print(f"\nSimilarity search results for '{query}':")
483
+ for i, doc in enumerate(results):
484
+ print(f"Result {i+1}: {doc.page_content[:100]}...")
485
+ print(f"Metadata: {doc.metadata}")
486
+
487
+ # Search with scores
488
+ scored_results = embedding_storage.similarity_search_with_score(query, k=2)
489
+
490
+ print(f"\nSimilarity search with scores:")
491
+ for doc, score in scored_results:
492
+ print(f"Score: {score:.4f} - {doc.page_content[:50]}...")
493
+
494
+ except Exception as e:
495
+ print(f"Error in example: {e}")
496
+
497
+ # Original demo
498
+ # demo = EmbeddingDemo(api_key="your_google_api_key_here")
499
+ # demo.demonstrate()
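
Because ChromaDB only accepts `str`/`int`/`float`/`bool` metadata values, `_filter_metadata` is the piece that keeps arbitrary entry metadata storable. A small illustrative sketch (the class still needs a valid Google API key to construct, since it initializes the embedding model and a local Chroma store; values below are illustrative):

```python
# Sketch: lists are flattened to "<key>_list"/"<key>_count", nested dicts are dropped,
# and primitives pass through unchanged.
storage = DiaryEmbeddingAndStorage(user_id=1, api_key="your_google_api_key_here")

raw_metadata = {
    "date": "2024-01-15",
    "entry_id": 42,
    "tags": ["work", "gym"],              # becomes "tags_list" and "tags_count"
    "nested": {"mood_scores": [3, 4]},    # skipped: nested objects are not supported
}
print(storage._filter_metadata(raw_metadata))
# e.g. {'date': '2024-01-15', 'entry_id': 42, 'tags_list': 'work, gym', 'tags_count': 2}
```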
clean_repo/src/Indexingstep/indexing_pipeline.py ADDED
@@ -0,0 +1,110 @@
1
+ import os
2
+ import sys
3
+ from typing import List, Dict, Any
4
+ from datetime import datetime
5
+
6
+ # Add parent directory to path
7
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
8
+
9
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
10
+ from langchain_chroma import Chroma
11
+ from langchain.schema import Document
12
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
13
+
14
+ def create_user_vector_database(user_id: int, diary_entries: List[Dict[str, Any]]) -> bool:
15
+ """
16
+ Create vector database for a specific user from their diary entries.
17
+
18
+ Args:
19
+ user_id: User ID
20
+ diary_entries: List of diary entries from database
21
+
22
+ Returns:
23
+ True if successful, False otherwise
24
+ """
25
+ try:
26
+ # Setup paths
27
+ base_vector_path = os.path.dirname(os.path.abspath(__file__))
28
+ vector_db_path = os.path.join(base_vector_path, f"user_{user_id}_vector_db")
29
+ collection_name = f"user_{user_id}_diary_entries"
30
+
31
+ # Create directory
32
+ os.makedirs(vector_db_path, exist_ok=True)
33
+
34
+ # Initialize embeddings
35
+ google_api_key = os.getenv("GOOGLE_API_KEY")
36
+ if not google_api_key:
37
+ raise ValueError("Google API key not found")
38
+
39
+ embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
40
+
41
+ # Process diary entries into documents
42
+ documents = []
43
+ text_splitter = RecursiveCharacterTextSplitter(
44
+ chunk_size=1000,
45
+ chunk_overlap=200,
46
+ length_function=len,
47
+ )
48
+
49
+ for entry in diary_entries:
50
+ # Extract content
51
+ content = entry.get('content', '')
52
+ if not content:
53
+ continue
54
+
55
+ # Extract title and content
56
+ lines = content.split('\n')
57
+ title = "Untitled"
58
+ actual_content = content
59
+
60
+ for line in lines:
61
+ if line.startswith('Title: '):
62
+ title = line.replace('Title: ', '').strip()
63
+ elif line.startswith('Content: '):
64
+ actual_content = line.replace('Content: ', '').strip()
65
+ break
66
+
67
+ # Create metadata
68
+ metadata = {
69
+ 'user_id': user_id,
70
+ 'entry_id': entry.get('id'),
71
+ 'date': entry.get('date', ''),
72
+ 'title': title,
73
+ 'tags': entry.get('tags', ''),
74
+ 'tags_list': [tag.strip() for tag in entry.get('tags', '').split(',') if tag.strip()],
75
+ 'source': f"diary_entry_{entry.get('id')}"
76
+ }
77
+
78
+ # Split content if too long
79
+ if len(actual_content) > 1000:
80
+ chunks = text_splitter.split_text(actual_content)
81
+ for i, chunk in enumerate(chunks):
82
+ chunk_metadata = metadata.copy()
83
+ chunk_metadata['chunk_id'] = i
84
+ documents.append(Document(page_content=chunk, metadata=chunk_metadata))
85
+ else:
86
+ documents.append(Document(page_content=actual_content, metadata=metadata))
87
+
88
+ if not documents:
89
+ print(f"No documents to index for user {user_id}")
90
+ return False
91
+
92
+ # Create vector store
93
+ vector_store = Chroma(
94
+ persist_directory=vector_db_path,
95
+ embedding_function=embeddings,
96
+ collection_name=collection_name
97
+ )
98
+
99
+ # Add documents to vector store
100
+ vector_store.add_documents(documents)
101
+
102
+ # Persist the database
103
+ vector_store.persist()
104
+
105
+ print(f"Successfully created vector database for user {user_id} with {len(documents)} documents")
106
+ return True
107
+
108
+ except Exception as e:
109
+ print(f"Error creating vector database for user {user_id}: {e}")
110
+ return False
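
A minimal call sketch for this helper, assuming `GOOGLE_API_KEY` is set in the environment and the entries follow the `Title:`/`Content:` structured format the function parses (values are illustrative):

```python
# Sketch: index a hand-built entry list for user 1.
# In the real app, diary_entries comes from the SQLite backend.
import os
os.environ.setdefault("GOOGLE_API_KEY", "your_google_api_key_here")  # placeholder

entries = [
    {
        "id": 1,
        "date": "2024-02-10",
        "tags": "family, weekend",
        "content": "Title: Sunday lunch\nContent: Cooked with my parents and talked for hours.",
    },
]

ok = create_user_vector_database(user_id=1, diary_entries=entries)
print("indexed" if ok else "failed")
```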
clean_repo/src/Indexingstep/pipeline.py ADDED
@@ -0,0 +1,459 @@
1
+ import sys
2
+ import os
3
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
4
+
5
+ from dataloading import DiaryDataLoader, DiaryContentPreprocessor
6
+ from diary_text_splitter import DiaryTextSplitter
7
+ from embedding_and_storing import DiaryEmbeddingAndStorage
8
+ from langchain.schema import Document
9
+ from typing import List, Dict, Any, Optional
10
+ import logging
11
+ from pathlib import Path
12
+
13
+ # Configure logging
14
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
15
+ logger = logging.getLogger(__name__)
16
+
17
+ class DiaryIndexingPipeline:
18
+ """
19
+ Enhanced pipeline for indexing diary entries with optimized chunking and metadata.
20
+ Integrates data loading, preprocessing, diary-specific splitting, embedding, and storage.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ db_path: str = "./diary.db",
26
+ persist_directory: str = "./chroma_db",
27
+ collection_name: str = "diary_collection",
28
+ google_api_key: Optional[str] = None,
29
+ chunk_size: int = 300, # Optimized for diary entries (200-300 tokens)
30
+ chunk_overlap: int = 50, # 50-token sliding window
31
+ embedding_model: str = "models/embedding-001",
32
+ batch_size: int = 50,
33
+ user_id: int = 1
34
+ ):
35
+ """
36
+ Initialize the enhanced diary indexing pipeline.
37
+
38
+ Args:
39
+ db_path (str): Path to SQLite database
40
+ persist_directory (str): Directory for vector database
41
+ collection_name (str): Name of the collection
42
+ google_api_key (str, optional): Google API key for embeddings
43
+ chunk_size (int): Size of text chunks (optimized for diary entries)
44
+ chunk_overlap (int): Overlap between chunks (sliding window)
45
+ embedding_model (str): Google embedding model name
46
+ batch_size (int): Batch size for processing
47
+ user_id (int): ID of the user for user-specific isolation
48
+ """
49
+ self.db_path = db_path
50
+ self.persist_directory = persist_directory
51
+ self.collection_name = collection_name
52
+ self.batch_size = batch_size
53
+ self.user_id = user_id
54
+
55
+ # Validate database exists
56
+ if not os.path.exists(db_path):
57
+ raise FileNotFoundError(f"Database file not found: {db_path}")
58
+
59
+ # Initialize components
60
+ self._initialize_components(
61
+ google_api_key, chunk_size, chunk_overlap, embedding_model
62
+ )
63
+
64
+ logger.info("Diary Indexing Pipeline initialized successfully")
65
+
66
+ def _initialize_components(
67
+ self,
68
+ google_api_key: Optional[str],
69
+ chunk_size: int,
70
+ chunk_overlap: int,
71
+ embedding_model: str
72
+ ):
73
+ """Initialize all pipeline components."""
74
+
75
+ # 1. Data Loader
76
+ self.data_loader = DiaryDataLoader(
77
+ db_path=self.db_path,
78
+ table_name="diary_entries",
79
+ content_column="content",
80
+ date_column="date",
81
+ user_id=self.user_id
82
+ )
83
+
84
+ # 2. Content Preprocessor
85
+ self.preprocessor = DiaryContentPreprocessor(
86
+ remove_extra_whitespace=True,
87
+ normalize_line_breaks=True,
88
+ min_content_length=3, # Keep short entries
89
+ max_content_length=10000
90
+ )
91
+
92
+ # 3. Diary-optimized Text Splitter
93
+ self.text_splitter = DiaryTextSplitter(
94
+ chunk_size=chunk_size,
95
+ chunk_overlap=chunk_overlap
96
+ )
97
+
98
+ # 4. Embedding and Storage
99
+ self.embedding_storage = DiaryEmbeddingAndStorage(
100
+ user_id=self.user_id,
101
+ api_key=google_api_key,
102
+ base_persist_directory=self.persist_directory,
103
+ embedding_model=embedding_model,
104
+ chunk_size=chunk_size,
105
+ chunk_overlap=chunk_overlap
106
+ )
107
+
108
+ logger.info("All pipeline components initialized")
109
+
110
+ def load_diary_data(self, start_date: Optional[str] = None, end_date: Optional[str] = None) -> List[Document]:
111
+ """
112
+ Load diary entries from database.
113
+
114
+ Args:
115
+ start_date (str, optional): Start date filter (YYYY-MM-DD)
116
+ end_date (str, optional): End date filter (YYYY-MM-DD)
117
+
118
+ Returns:
119
+ List[Document]: Loaded diary documents
120
+ """
121
+ try:
122
+ logger.info("Loading diary entries from database...")
123
+
124
+ if start_date and end_date:
125
+ documents = self.data_loader.load_by_date_range(start_date, end_date)
126
+ logger.info(f"Loaded {len(documents)} entries from {start_date} to {end_date}")
127
+ else:
128
+ documents = self.data_loader.load()
129
+ logger.info(f"Loaded {len(documents)} total diary entries")
130
+
131
+ if not documents:
132
+ logger.warning("No diary entries found in database")
133
+ return []
134
+
135
+ return documents
136
+
137
+ except Exception as e:
138
+ logger.error(f"Error loading diary data: {str(e)}")
139
+ raise
140
+
141
+ def preprocess_documents(self, documents: List[Document]) -> List[Document]:
142
+ """
143
+ Preprocess diary documents.
144
+
145
+ Args:
146
+ documents (List[Document]): Raw documents
147
+
148
+ Returns:
149
+ List[Document]: Preprocessed documents
150
+ """
151
+ try:
152
+ logger.info(f"Preprocessing {len(documents)} documents...")
153
+
154
+ preprocessed_docs = self.preprocessor.preprocess_documents(documents)
155
+
156
+ logger.info(f"Preprocessing complete: {len(preprocessed_docs)} documents kept")
157
+ return preprocessed_docs
158
+
159
+ except Exception as e:
160
+ logger.error(f"Error preprocessing documents: {str(e)}")
161
+ raise
162
+
163
+ def split_documents(self, documents: List[Document]) -> List[Document]:
164
+ """
165
+ Split documents into optimized chunks using diary-specific splitter.
166
+
167
+ Args:
168
+ documents (List[Document]): Documents to split
169
+
170
+ Returns:
171
+ List[Document]: Split document chunks with enhanced metadata
172
+ """
173
+ try:
174
+ logger.info(f"Splitting {len(documents)} diary entries into optimized chunks...")
175
+
176
+ split_docs = self.text_splitter.split_documents(documents)
177
+
178
+ # Get and log chunking statistics
179
+ stats = self.text_splitter.get_chunk_stats(split_docs)
180
+ logger.info(f"Document splitting complete: {stats}")
181
+
182
+ return split_docs
183
+
184
+ except Exception as e:
185
+ logger.error(f"Error splitting documents: {str(e)}")
186
+ raise
187
+
188
+ def embed_and_store(self, documents: List[Document]) -> List[str]:
189
+ """
190
+ Generate embeddings and store documents.
191
+
192
+ Args:
193
+ documents (List[Document]): Documents to embed and store
194
+
195
+ Returns:
196
+ List[str]: Document IDs
197
+ """
198
+ try:
199
+ logger.info(f"Generating embeddings and storing {len(documents)} document chunks...")
200
+
201
+ # Process in batches for large datasets
202
+ if len(documents) > self.batch_size:
203
+ document_ids = self.embedding_storage.batch_process_documents(
204
+ documents, self.batch_size
205
+ )
206
+ else:
207
+ document_ids = self.embedding_storage.embed_and_store_documents(documents)
208
+
209
+ logger.info(f"Successfully embedded and stored {len(document_ids)} documents")
210
+ return document_ids
211
+
212
+ except Exception as e:
213
+ logger.error(f"Error embedding and storing documents: {str(e)}")
214
+ raise
215
+
216
+ def run_full_pipeline(
217
+ self,
218
+ start_date: Optional[str] = None,
219
+ end_date: Optional[str] = None,
220
+ clear_existing: bool = False
221
+ ) -> Dict[str, Any]:
222
+ """
223
+ Run the complete indexing pipeline.
224
+
225
+ Args:
226
+ start_date (str, optional): Start date filter
227
+ end_date (str, optional): End date filter
228
+ clear_existing (bool): Whether to clear existing data
229
+
230
+ Returns:
231
+ Dict: Pipeline execution results
232
+ """
233
+ try:
234
+ logger.info("="*60)
235
+ logger.info("STARTING DIARY INDEXING PIPELINE")
236
+ logger.info("="*60)
237
+
238
+ pipeline_stats = {
239
+ "status": "running",
240
+ "steps_completed": 0,
241
+ "total_steps": 5,
242
+ "documents_loaded": 0,
243
+ "documents_preprocessed": 0,
244
+ "chunks_created": 0,
245
+ "documents_stored": 0,
246
+ "errors": []
247
+ }
248
+
249
+ # Step 1: Clear existing data if requested
250
+ if clear_existing:
251
+ logger.info("Step 1: Clearing existing vector store...")
252
+ self.embedding_storage.clear_collection()
253
+ pipeline_stats["steps_completed"] += 1
254
+
255
+ # Step 2: Load diary data
256
+ logger.info("Step 2: Loading diary entries...")
257
+ documents = self.load_diary_data(start_date, end_date)
258
+ pipeline_stats["documents_loaded"] = len(documents)
259
+ pipeline_stats["steps_completed"] += 1
260
+
261
+ if not documents:
262
+ pipeline_stats["status"] = "completed_with_warnings"
263
+ pipeline_stats["errors"].append("No documents found to process")
264
+ return pipeline_stats
265
+
266
+ # Step 3: Preprocess documents
267
+ logger.info("Step 3: Preprocessing documents...")
268
+ preprocessed_docs = self.preprocess_documents(documents)
269
+ pipeline_stats["documents_preprocessed"] = len(preprocessed_docs)
270
+ pipeline_stats["steps_completed"] += 1
271
+
272
+ if not preprocessed_docs:
273
+ pipeline_stats["status"] = "failed"
274
+ pipeline_stats["errors"].append("No documents survived preprocessing")
275
+ return pipeline_stats
276
+
277
+ # Step 4: Split documents into chunks
278
+ logger.info("Step 4: Splitting documents into chunks...")
279
+ split_docs = self.split_documents(preprocessed_docs)
280
+ pipeline_stats["chunks_created"] = len(split_docs)
281
+ pipeline_stats["steps_completed"] += 1
282
+
283
+ # Step 5: Generate embeddings and store
284
+ logger.info("Step 5: Generating embeddings and storing...")
285
+ document_ids = self.embed_and_store(split_docs)
286
+ pipeline_stats["documents_stored"] = len(document_ids)
287
+ pipeline_stats["steps_completed"] += 1
288
+
289
+ # Update final status
290
+ pipeline_stats["status"] = "completed_successfully"
291
+
292
+ logger.info("="*60)
293
+ logger.info("PIPELINE COMPLETED SUCCESSFULLY!")
294
+ logger.info("="*60)
295
+ logger.info(f"Documents loaded: {pipeline_stats['documents_loaded']}")
296
+ logger.info(f"Documents preprocessed: {pipeline_stats['documents_preprocessed']}")
297
+ logger.info(f"Chunks created: {pipeline_stats['chunks_created']}")
298
+ logger.info(f"Documents stored: {pipeline_stats['documents_stored']}")
299
+ logger.info("="*60)
300
+
301
+ return pipeline_stats
302
+
303
+ except Exception as e:
304
+ logger.error(f"Pipeline failed with error: {str(e)}")
305
+ pipeline_stats["status"] = "failed"
306
+ pipeline_stats["errors"].append(str(e))
307
+ return pipeline_stats
308
+
309
+ def incremental_update(self, start_date: str, end_date: Optional[str] = None) -> Dict[str, Any]:
310
+ """
311
+ Perform incremental update for new diary entries.
312
+
313
+ Args:
314
+ start_date (str): Start date for incremental update
315
+ end_date (str, optional): End date for incremental update
316
+
317
+ Returns:
318
+ Dict: Update results
319
+ """
320
+ try:
321
+ logger.info(f"Starting incremental update from {start_date}")
322
+
323
+ # Load only new entries
324
+ new_documents = self.load_diary_data(start_date, end_date)
325
+
326
+ if not new_documents:
327
+ logger.info("No new documents found for incremental update")
328
+ return {"status": "no_updates", "documents_added": 0}
329
+
330
+ # Process new documents
331
+ preprocessed_docs = self.preprocess_documents(new_documents)
332
+ split_docs = self.split_documents(preprocessed_docs)
333
+ document_ids = self.embed_and_store(split_docs)
334
+
335
+ logger.info(f"Incremental update completed: {len(document_ids)} new documents added")
336
+
337
+ return {
338
+ "status": "success",
339
+ "documents_loaded": len(new_documents),
340
+ "documents_added": len(document_ids)
341
+ }
342
+
343
+ except Exception as e:
344
+ logger.error(f"Incremental update failed: {str(e)}")
345
+ return {"status": "failed", "error": str(e)}
346
+
347
+ def search_similar_entries(
348
+ self,
349
+ query: str,
350
+ k: int = 5,
351
+ filter_metadata: Optional[Dict[str, Any]] = None
352
+ ) -> List[Document]:
353
+ """
354
+ Search for similar diary entries.
355
+
356
+ Args:
357
+ query (str): Search query
358
+ k (int): Number of results to return
359
+ filter_metadata (Dict, optional): Metadata filter
360
+
361
+ Returns:
362
+ List[Document]: Similar documents
363
+ """
364
+ try:
365
+ return self.embedding_storage.similarity_search(
366
+ query=query,
367
+ k=k,
368
+ filter=filter_metadata
369
+ )
370
+ except Exception as e:
371
+ logger.error(f"Error searching similar entries: {str(e)}")
372
+ return []
373
+
374
+ def get_pipeline_stats(self) -> Dict[str, Any]:
375
+ """
376
+ Get comprehensive pipeline statistics.
377
+
378
+ Returns:
379
+ Dict: Pipeline and database statistics
380
+ """
381
+ try:
382
+ # Database stats
383
+ db_info = self.data_loader.get_table_info()
384
+
385
+ # Vector store stats
386
+ vector_info = self.embedding_storage.get_collection_info()
387
+
388
+ return {
389
+ "database": db_info,
390
+ "vector_store": vector_info,
391
+ "pipeline_config": {
392
+ "chunk_size": self.text_splitter.chunk_size,
393
+ "chunk_overlap": self.text_splitter.chunk_overlap,
394
+ "batch_size": self.batch_size,
395
+ "collection_name": self.collection_name
396
+ }
397
+ }
398
+
399
+ except Exception as e:
400
+ logger.error(f"Error getting pipeline stats: {str(e)}")
401
+ return {}
402
+
403
+ def main():
404
+ """Main function to demonstrate pipeline usage."""
405
+
406
+ # Configuration
407
+ config = {
408
+ "db_path": "../streamlit_app/backend/diary.db", # Adjust path as needed
409
+ "persist_directory": "./diary_vector_db",
410
+ "collection_name": "diary_entries",
411
+ "google_api_key": None, # Set your API key or use environment variable
412
+ "chunk_size": 800,
413
+ "chunk_overlap": 100,
414
+ "batch_size": 50
415
+ }
416
+
417
+ try:
418
+ # Initialize pipeline
419
+ logger.info("Initializing Diary Indexing Pipeline...")
420
+ pipeline = DiaryIndexingPipeline(**config)
421
+
422
+ # Run full pipeline
423
+ results = pipeline.run_full_pipeline(clear_existing=True)
424
+
425
+ # Print results
426
+ print("\n" + "="*60)
427
+ print("PIPELINE EXECUTION RESULTS")
428
+ print("="*60)
429
+ print(f"Status: {results['status']}")
430
+ print(f"Steps completed: {results['steps_completed']}/{results['total_steps']}")
431
+ print(f"Documents loaded: {results['documents_loaded']}")
432
+ print(f"Documents preprocessed: {results['documents_preprocessed']}")
433
+ print(f"Chunks created: {results['chunks_created']}")
434
+ print(f"Documents stored: {results['documents_stored']}")
435
+
436
+ if results['errors']:
437
+ print(f"Errors: {results['errors']}")
438
+
439
+ # Get and display stats
440
+ stats = pipeline.get_pipeline_stats()
441
+ print("\nPIPELINE STATISTICS:")
442
+ print(f"Database entries: {stats.get('database', {}).get('row_count', 'N/A')}")
443
+ print(f"Vector store documents: {stats.get('vector_store', {}).get('document_count', 'N/A')}")
444
+ print("="*60)
445
+
446
+ # Example search
447
+ if results['status'] == 'completed_successfully':
448
+ print("\nTesting similarity search...")
449
+ search_results = pipeline.search_similar_entries("happy day", k=3)
450
+ print(f"Found {len(search_results)} similar entries")
451
+ for i, doc in enumerate(search_results[:2]):
452
+ print(f"Result {i+1}: {doc.page_content[:100]}...")
453
+
454
+ except Exception as e:
455
+ logger.error(f"Main execution failed: {str(e)}")
456
+ print(f"Error: {str(e)}")
457
+
458
+ if __name__ == "__main__":
459
+ main()
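
Beyond the full run in `main()`, the pipeline also supports incremental updates and similarity search. A hedged sketch, reusing the same paths as the `main()` config (the database file must already exist, otherwise the constructor raises `FileNotFoundError`):

```python
# Sketch: re-index only entries added since a given date, then query the store.
pipeline = DiaryIndexingPipeline(
    db_path="../streamlit_app/backend/diary.db",
    persist_directory="./diary_vector_db",
    collection_name="diary_entries",
    user_id=1,
)

result = pipeline.incremental_update(start_date="2024-06-01")
print(result)  # e.g. {"status": "success", "documents_loaded": ..., "documents_added": ...}

for doc in pipeline.search_similar_entries("trips with friends", k=3):
    print(doc.metadata.get("date"), doc.page_content[:80])
```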
clean_repo/src/Retrivel_And_Generation/Retrieval_And_Generator.py ADDED
@@ -0,0 +1,739 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Retrieval and Generation System for Personal Diary Chatbot
4
+
5
+ This module implements the RAG (Retrieval-Augmented Generation) pipeline for the diary chatbot.
6
+ It handles document retrieval from the vector database and generates contextual responses
7
+ using Google's Generative AI.
8
+
9
+ Components:
10
+ - Document Retrieval: Query vector database for relevant diary entries
11
+ - Context Processing: Format retrieved documents for LLM consumption
12
+ - Response Generation: Generate contextual responses using retrieved diary content
13
+ - Conversation Management: Handle chat history and context preservation
14
+ """
15
+
16
+ import os
17
+ import sys
18
+ import logging
19
+ from typing import List, Dict, Any, Optional, Tuple
20
+ from datetime import datetime
21
+ from functools import lru_cache
22
+ import hashlib
23
+
24
+ # Add parent directory to path for imports
25
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
26
+
27
+ # LangChain imports
28
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
29
+ from langchain_chroma import Chroma
30
+ from langchain.schema import Document
31
+ from langchain.schema.runnable import RunnablePassthrough
32
+ from langchain.schema.output_parser import StrOutputParser
33
+ from langchain.prompts import ChatPromptTemplate, PromptTemplate
34
+
35
+ # Configure logging
36
+ logging.basicConfig(level=logging.INFO)
37
+ logger = logging.getLogger(__name__)
38
+
39
+ class DiaryRAGSystem:
40
+ """
41
+ Retrieval-Augmented Generation system for personal diary chatbot.
42
+
43
+ This class handles the complete RAG pipeline:
44
+ 1. Retrieve relevant diary entries from vector database
45
+ 2. Format context for LLM consumption
46
+ 3. Generate contextual responses using Google's Generative AI
47
+ """
48
+
49
+ def __init__(
50
+ self,
51
+ user_id: int = 1,
52
+ base_vector_path: str = "./src/VectorDB",
53
+ google_api_key: Optional[str] = None,
54
+ embedding_model: str = "models/embedding-001",
55
+ chat_model: str = "gemini-2.5-flash-lite",
56
+ max_retrieval_docs: int = 5
57
+ ):
58
+ """
59
+ Initialize the RAG system with user-specific vector database.
60
+
61
+ Args:
62
+ user_id: User ID for user-specific vector database
63
+ base_vector_path: Base path for vector databases
64
+ google_api_key: Google API key for embeddings and chat
65
+ embedding_model: Model for text embeddings
66
+ chat_model: Model for chat completion
67
+ max_retrieval_docs: Maximum number of documents to retrieve
68
+ """
69
+ self.user_id = user_id
70
+ self.base_vector_path = base_vector_path
71
+
72
+ # Create user-specific paths
73
+ self.vector_db_path = os.path.join(base_vector_path, f"user_{user_id}_vector_db")
74
+ self.collection_name = f"user_{user_id}_diary_entries"
75
+ self.max_retrieval_docs = max_retrieval_docs
76
+
77
+ # Ensure user vector database directory exists
78
+ os.makedirs(self.vector_db_path, exist_ok=True)
79
+
80
+ # Set up Google API key
81
+ if google_api_key:
82
+ os.environ["GOOGLE_API_KEY"] = google_api_key
83
+ elif not os.getenv("GOOGLE_API_KEY"):
84
+ raise ValueError("Google API key must be provided either as parameter or environment variable")
85
+
86
+ # Initialize embedding and chat models
87
+ try:
88
+ # Fix for Streamlit event loop issue
89
+ import asyncio
90
+ import nest_asyncio
91
+
92
+ # Allow nested event loops for Streamlit compatibility
93
+ try:
94
+ nest_asyncio.apply()
95
+ except:
96
+ pass
97
+
98
+ # Set event loop for thread if not exists
99
+ try:
100
+ loop = asyncio.get_event_loop()
101
+ if loop.is_closed():
102
+ raise RuntimeError("Event loop is closed")
103
+ except RuntimeError:
104
+ # Create new event loop for this thread
105
+ loop = asyncio.new_event_loop()
106
+ asyncio.set_event_loop(loop)
107
+
108
+ self.embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model)
109
+ self.chat_model = ChatGoogleGenerativeAI(
110
+ model=chat_model,
111
+ temperature=0.3, # Lower temperature for faster, more focused responses
112
+ max_tokens=800, # Shorter responses for speed
113
+ top_k=20, # Limit token choices for speed
114
+ top_p=0.8 # Nucleus sampling for faster generation
115
+ )
116
+ logger.info(f"Initialized embeddings with model: {embedding_model}")
117
+ logger.info(f"Initialized chat model: {chat_model}")
118
+ except Exception as e:
119
+ logger.error(f"Failed to initialize models: {str(e)}")
120
+ raise
121
+
122
+ # Initialize vector store
123
+ self.vector_store = None
124
+ self._setup_vector_store()
125
+
126
+ # Set up prompt templates
127
+ self._setup_prompts()
128
+
129
+ # Initialize conversation chain
130
+ self._setup_conversation_chain()
131
+
132
+ def _setup_vector_store(self):
133
+ """Set up connection to the vector database."""
134
+ try:
135
+ if os.path.exists(self.vector_db_path):
136
+ self.vector_store = Chroma(
137
+ persist_directory=self.vector_db_path,
138
+ embedding_function=self.embeddings,
139
+ collection_name=self.collection_name
140
+ )
141
+ collection_info = self.vector_store._collection.count()
142
+ logger.info(f"Connected to vector database (primary) with {collection_info} documents")
143
+ # Fallback: legacy nested path if empty
144
+ if collection_info == 0:
145
+ nested_path = os.path.join(self.vector_db_path, os.path.basename(self.vector_db_path))
146
+ if os.path.isdir(nested_path):
147
+ try:
148
+ nested_vs = Chroma(
149
+ persist_directory=nested_path,
150
+ embedding_function=self.embeddings,
151
+ collection_name=self.collection_name
152
+ )
153
+ nested_count = nested_vs._collection.count()
154
+ if nested_count > 0:
155
+ logger.warning(
156
+ f"Primary path empty. Switching to legacy nested path {nested_path} with {nested_count} docs"
157
+ )
158
+ self.vector_store = nested_vs
159
+ self.vector_db_path = nested_path
160
+ except Exception as ne:
161
+ logger.debug(f"Failed to read nested path: {ne}")
162
+ else:
163
+ logger.warning(f"Vector database not found at {self.vector_db_path}")
164
+ logger.info("Run indexing pipeline first.")
165
+ except Exception as e:
166
+ logger.error(f"Failed to setup vector store: {str(e)}")
167
+ self.vector_store = None
168
+
169
+ def reload_vector_store(self) -> int:
170
+ """Reload vector store from disk. Returns new document count or 0."""
171
+ try:
172
+ self._setup_vector_store()
173
+ if self.vector_store:
174
+ return self.vector_store._collection.count()
175
+ except Exception as e:
176
+ logger.warning(f"reload_vector_store failed: {e}")
177
+ return 0
178
+
179
+ def get_document_count(self) -> int:
180
+ try:
181
+ if self.vector_store:
182
+ return self.vector_store._collection.count()
183
+ except Exception:
184
+ pass
185
+ return 0
186
+
187
+ def _setup_prompts(self):
188
+ """Set up prompt templates for different scenarios."""
189
+
190
+ # Main RAG prompt template
191
+ self.rag_prompt = ChatPromptTemplate.from_template("""
192
+ Bạn là một trợ lý AI thông minh và thấu hiểu, chuyên về việc phân tích và thảo luận nội dung về nhật ký cá nhân.
193
+
194
+ Dựa trên các mục nhật ký sau đây được tìm kiếm từ cơ sở dữ liệu:
195
+
196
+ {context}
197
+
198
+ Người dùng hỏi: {question}
199
+
200
+ Hãy trả lời một cách:
201
+ - Thấu hiểu và empathetic (đồng cảm)
202
+ - Dựa trên nội dung nhật ký được cung cấp
203
+ - Cung cấp insights và connections giữa các entries
204
+ - Đưa ra suggestions hoặc reflections nếu phù hợp
205
+ - Sử dụng tiếng Việt tự nhiên và ấm áp
206
+
207
+ Nếu không tìm thấy thông tin liên quan trong nhật ký, hãy thành thật nói và đề xuất các cách khác để giúp đỡ.
208
+
209
+ Trả lời:
210
+ """)
211
+
212
+ # Fallback prompt when no relevant documents found
213
+ self.fallback_prompt = ChatPromptTemplate.from_template("""
214
+ Bạn là một trợ lý AI thân thiện và hữu ích cho việc quản lý nhật ký cá nhân.
215
+
216
+ Người dùng hỏi: {question}
217
+
218
+ Vì không tìm thấy thông tin liên quan trong nhật ký hiện tại, hãy:
219
+ - Trả lời một cách thân thiện ngắn gọn và hữu ích
220
+ - Đề xuất cách người dùng có thể ghi nhật ký về chủ đề này
221
+ - Khuyến khích reflection và self-discovery
222
+ - Cung cấp general guidance nếu phù hợp
223
+
224
+ Sử dụng tiếng Việt tự nhiên và ấm áp.
225
+
226
+ Trả lời:
227
+ """)
228
+
229
+ # Summary prompt for multiple diary entries
230
+ self.summary_prompt = ChatPromptTemplate.from_template("""
231
+ Dựa trên các mục nhật ký sau đây:
232
+
233
+ {context}
234
+
235
+ Hãy tạo một summary ngắn gọn về:
236
+ - Chủ đề chính được đề cập
237
+ - Cảm xúc và mood tổng thể
238
+ - Patterns hoặc themes đáng chú ý
239
+ - Insights về personal growth
240
+
241
+ Sử dụng tiếng Việt và giữ tính cách empathetic.
242
+
243
+ Summary:
244
+ """)
245
+
246
+ def _setup_conversation_chain(self):
247
+ """Set up the conversation chain for RAG processing."""
248
+ try:
249
+ # Create retriever from vector store
250
+ if self.vector_store:
251
+ self.retriever = self.vector_store.as_retriever(
252
+ search_kwargs={"k": self.max_retrieval_docs}
253
+ )
254
+
255
+ # Set up the main RAG chain
256
+ self.rag_chain = (
257
+ {
258
+ "context": self.retriever | self._format_docs,
259
+ "question": RunnablePassthrough()
260
+ }
261
+ | self.rag_prompt
262
+ | self.chat_model
263
+ | StrOutputParser()
264
+ )
265
+
266
+ # Set up fallback chain
267
+ self.fallback_chain = (
268
+ {"question": RunnablePassthrough()}
269
+ | self.fallback_prompt
270
+ | self.chat_model
271
+ | StrOutputParser()
272
+ )
273
+
274
+ logger.info("Conversation chain setup complete")
275
+ else:
276
+ logger.warning("Cannot setup RAG chain without vector store")
+ # Build the fallback chain anyway so generate_response() can still answer without retrieval
+ self.fallback_chain = (
+ {"question": RunnablePassthrough()}
+ | self.fallback_prompt
+ | self.chat_model
+ | StrOutputParser()
+ )
277
+
278
+ except Exception as e:
279
+ logger.error(f"Failed to setup conversation chain: {str(e)}")
280
+ raise
281
+
282
+ def _format_docs(self, docs: List[Document]) -> str:
283
+ """
284
+ Format retrieved documents for LLM consumption.
285
+
286
+ Args:
287
+ docs: List of retrieved documents
288
+
289
+ Returns:
290
+ Formatted string with document content and metadata
291
+ """
292
+ if not docs:
293
+ return "Không tìm thấy mục nhật ký liên quan."
294
+
295
+ formatted_docs = []
296
+ for i, doc in enumerate(docs, 1):
297
+ # Extract metadata
298
+ metadata = doc.metadata
299
+ date = metadata.get('date', 'Unknown date')
300
+ title = metadata.get('title', 'Untitled')
301
+ tags = metadata.get('tags_list', metadata.get('tags', ''))
302
+
303
+ # Format document
304
+ doc_text = f"""
305
+ Mục {i}:
306
+ Ngày: {date}
307
+ Tiêu đề: {title}
308
+ Tags: {tags if tags else 'Không có tags'}
309
+ Nội dung: {doc.page_content.strip()}
310
+ ---
311
+ """
312
+ formatted_docs.append(doc_text)
313
+
314
+ return "\n".join(formatted_docs)
315
+
316
+ def retrieve_relevant_entries(
317
+ self,
318
+ query: str,
319
+ filters: Optional[Dict[str, Any]] = None,
320
+ k: Optional[int] = None
321
+ ) -> List[Document]:
322
+ """
323
+ Retrieve relevant diary entries based on query with optimized performance.
324
+
325
+ Args:
326
+ query: Search query
327
+ filters: Optional metadata filters
328
+ k: Number of documents to retrieve (overrides default)
329
+
330
+ Returns:
331
+ List of relevant documents
332
+ """
333
+ if not self.vector_store:
334
+ logger.warning("Vector store not available for retrieval")
335
+ return []
336
+
337
+ try:
338
+ # Use smaller k for faster response
339
+ k = k or min(self.max_retrieval_docs, 3) # Limit to 3 docs for speed
340
+
341
+ if filters:
342
+ docs = self.vector_store.similarity_search(
343
+ query=query,
344
+ k=k,
345
+ filter=filters
346
+ )
347
+ else:
348
+ docs = self.vector_store.similarity_search(
349
+ query=query,
350
+ k=k
351
+ )
352
+
353
+ logger.info(f"Retrieved {len(docs)} documents for query: '{query[:50]}...'")
354
+ return docs
355
+
356
+ except Exception as e:
357
+ logger.error(f"Error during retrieval: {str(e)}")
358
+ return []
359
+
360
+ def format_documents_for_context(self, docs: List[Document]) -> str:
361
+ """
362
+ Format retrieved documents into context string for the prompt.
363
+
364
+ Args:
365
+ docs: List of retrieved documents
366
+
367
+ Returns:
368
+ Formatted context string
369
+ """
370
+ if not docs:
371
+ return "Không có thông tin nhật ký liên quan."
372
+
373
+ formatted_docs = []
374
+ for i, doc in enumerate(docs, 1):
375
+ # Extract metadata
376
+ metadata = doc.metadata
377
+ date = metadata.get('date', 'Không có ngày')
378
+ source = metadata.get('source', 'Không rõ nguồn')
379
+
380
+ # Format document
381
+ doc_text = f"Nhật ký {i} (Ngày: {date}):\n{doc.page_content}"
382
+ formatted_docs.append(doc_text)
383
+
384
+ return "\n\n".join(formatted_docs)
385
+
386
+ def generate_fast_response(
387
+ self,
388
+ query: str,
389
+ filters: Optional[Dict[str, Any]] = None
390
+ ) -> str:
391
+ """
392
+ Generate fast response with optimized settings for speed.
393
+
394
+ Args:
395
+ query: User question
396
+ filters: Optional metadata filters
397
+
398
+ Returns:
399
+ AI response string (optimized for speed)
400
+ """
401
+ try:
402
+ # Fast retrieval with only 1 most relevant doc for maximum speed
403
+ relevant_docs = self.retrieve_relevant_entries(
404
+ query=query,
405
+ filters=filters,
406
+ k=1 # Only 1 doc for maximum speed
407
+ )
408
+
409
+ if not relevant_docs:
410
+ # Use simple fallback without chain to avoid timeout
411
+ return "Xin lỗi, tôi không tìm thấy thông tin liên quan trong nhật ký của bạn."
412
+
413
+ # Create very concise context (limit content length)
414
+ context = self._format_docs(relevant_docs[:1])
415
+ if len(context) > 500: # Limit context length
416
+ context = context[:500] + "..."
417
+
418
+ # Fast prompt template with timeout optimization
419
+ fast_prompt = ChatPromptTemplate.from_template(
420
+ """Dựa vào nhật ký: {context}
421
+
422
+ Câu hỏi: {question}
423
+
424
+ Trả lời ngắn (1 câu):"""
425
+ )
426
+
427
+ # Create optimized chain with pre-computed context
428
+ chain = (
429
+ {"context": lambda x: context, "question": RunnablePassthrough()}
430
+ | fast_prompt
431
+ | self.chat_model
432
+ | StrOutputParser()
433
+ )
434
+
435
+ # Generate response with timeout handling
436
+ response = chain.invoke(query)
437
+ logger.info("Generated fast response successfully")
438
+ return response.strip()
439
+
440
+ except Exception as e:
441
+ logger.error(f"Error in fast response generation: {str(e)}")
442
+ # Direct fallback without chain to avoid timeout
443
+ return "Xin lỗi, tôi gặp lỗi khi xử lý câu hỏi của bạn."
444
+
445
+ def generate_response(
446
+ self,
447
+ query: str,
448
+ filters: Optional[Dict[str, Any]] = None,
449
+ use_fallback: bool = False
450
+ ) -> str:
451
+ """
452
+ Generate a response to user query using RAG.
453
+
454
+ Args:
455
+ query: User's question or message
456
+ filters: Optional metadata filters for retrieval
457
+ use_fallback: Whether to use fallback response (no retrieval)
458
+
459
+ Returns:
460
+ Generated response
461
+ """
462
+ try:
463
+ if use_fallback or not self.vector_store:
464
+ # Use fallback chain without retrieval
465
+ response = self.fallback_chain.invoke(query)
466
+ logger.info("Generated fallback response")
467
+ return response
468
+
469
+ # Retrieve relevant documents first
470
+ relevant_docs = self.retrieve_relevant_entries(query, filters)
471
+
472
+ if not relevant_docs:
473
+ # No relevant documents found, use fallback
474
+ response = self.fallback_chain.invoke(query)
475
+ logger.info("No relevant docs found, used fallback response")
476
+ return response
477
+
478
+ # Use RAG chain with retrieved context
479
+ response = self.rag_chain.invoke(query)
480
+ logger.info("Generated RAG response with context")
481
+ return response
482
+
483
+ except Exception as e:
484
+ logger.error(f"Error generating response: {str(e)}")
485
+ return f"Xin lỗi, tôi gặp lỗi khi xử lý câu hỏi của bạn: {str(e)}"
486
+
487
+ def generate_summary(self, date_range: Optional[Tuple[str, str]] = None) -> str:
488
+ """
489
+ Generate a summary of diary entries.
490
+
491
+ Args:
492
+ date_range: Optional tuple of (start_date, end_date) in YYYY-MM-DD format
493
+
494
+ Returns:
495
+ Generated summary
496
+ """
497
+ try:
498
+ if not self.vector_store:
499
+ return "Không thể tạo summary: vector database không khả dụng."
500
+
501
+ # Build filter for date range if provided
502
+ filters = {}
503
+ if date_range:
504
+ start_date, end_date = date_range
505
+ # Note: This depends on how dates are stored in metadata
506
+ # May need adjustment based on actual metadata structure
507
+ pass
508
+
509
+ # Retrieve documents for summary (more documents for better overview)
510
+ docs = self.vector_store.similarity_search(
511
+ query="nhật ký cảm xúc thoughts feelings", # General query
512
+ k=min(10, self.max_retrieval_docs * 2) # More docs for summary
513
+ )
514
+
515
+ if not docs:
516
+ return "Không tìm thấy nhật ký để tạo summary."
517
+
518
+ # Format context for summary
519
+ context = self._format_docs(docs)
520
+
521
+ # Generate summary
522
+ summary_chain = (
523
+ {"context": lambda x: context}
524
+ | self.summary_prompt
525
+ | self.chat_model
526
+ | StrOutputParser()
527
+ )
528
+
529
+ summary = summary_chain.invoke({})
530
+ logger.info("Generated diary summary")
531
+ return summary
532
+
533
+ except Exception as e:
534
+ logger.error(f"Error generating summary: {str(e)}")
535
+ return f"Lỗi khi tạo summary: {str(e)}"
536
+
537
+ def search_by_tags(self, tags: List[str], k: int = 5) -> List[Document]:
538
+ """
539
+ Search diary entries by specific tags.
540
+
541
+ Args:
542
+ tags: List of tags to search for
543
+ k: Number of documents to return
544
+
545
+ Returns:
546
+ List of documents matching the tags
547
+ """
548
+ if not self.vector_store or not tags:
549
+ return []
550
+
551
+ try:
552
+ # Build tag query
553
+ tag_query = " ".join([f"#{tag}" for tag in tags])
554
+
555
+ # Search with tag-based query
556
+ docs = self.vector_store.similarity_search(
557
+ query=tag_query,
558
+ k=k
559
+ )
560
+
561
+ # Filter by tags in metadata if available
562
+ filtered_docs = []
563
+ for doc in docs:
564
+ doc_tags = doc.metadata.get('tags_list', '')
565
+ if any(tag.lower() in doc_tags.lower() for tag in tags):
566
+ filtered_docs.append(doc)
567
+
568
+ logger.info(f"Found {len(filtered_docs)} documents with tags: {tags}")
569
+ return filtered_docs
570
+
571
+ except Exception as e:
572
+ logger.error(f"Error searching by tags: {str(e)}")
573
+ return []
574
+
575
+ def get_conversation_context(self, chat_history: List[Dict[str, str]]) -> str:
576
+ """
577
+ Process chat history to maintain conversation context.
578
+
579
+ Args:
580
+ chat_history: List of chat messages with 'role' and 'content'
581
+
582
+ Returns:
583
+ Formatted conversation context
584
+ """
585
+ if not chat_history:
586
+ return ""
587
+
588
+ # Take last few messages for context
589
+ recent_messages = chat_history[-5:] # Last 5 messages
590
+
591
+ context_parts = []
592
+ for msg in recent_messages:
593
+ role = "Người dùng" if msg['role'] == 'user' else "Trợ lý"
594
+ context_parts.append(f"{role}: {msg['content']}")
595
+
596
+ return "\n".join(context_parts)
597
+
598
+ def generate_contextual_response(
599
+ self,
600
+ query: str,
601
+ chat_history: List[Dict[str, str]] = None,
602
+ filters: Optional[Dict[str, Any]] = None
603
+ ) -> str:
604
+ """
605
+ Generate response with conversation context.
606
+
607
+ Args:
608
+ query: Current user query
609
+ chat_history: Previous conversation messages
610
+ filters: Optional metadata filters
611
+
612
+ Returns:
613
+ Contextual response
614
+ """
615
+ # Get conversation context
616
+ conv_context = self.get_conversation_context(chat_history or [])
617
+
618
+ # Enhance query with conversation context
619
+ if conv_context:
620
+ enhanced_query = f"Bối cảnh cuộc trò chuyện:\n{conv_context}\n\nCâu hỏi hiện tại: {query}"
621
+ else:
622
+ enhanced_query = query
623
+
624
+ # Generate response
625
+ return self.generate_response(enhanced_query, filters)
626
+
627
+ def health_check(self) -> Dict[str, Any]:
628
+ """
629
+ Check the health status of the RAG system.
630
+
631
+ Returns:
632
+ Dictionary with system status information
633
+ """
634
+ status = {
635
+ "vector_store_available": self.vector_store is not None,
636
+ "vector_db_path": self.vector_db_path,
637
+ "models_initialized": True,
638
+ "embedding_model": getattr(self.embeddings, "model", "unknown"),  # report the models actually configured on this instance
639
+ "chat_model": getattr(self.chat_model, "model", "unknown")
640
+ }
641
+
642
+ if self.vector_store:
643
+ try:
644
+ doc_count = self.vector_store._collection.count()
645
+ status["document_count"] = doc_count
646
+ status["vector_store_healthy"] = True
647
+ except Exception as e:
648
+ status["vector_store_healthy"] = False
649
+ status["vector_store_error"] = str(e)
650
+ else:
651
+ status["document_count"] = 0
652
+ status["vector_store_healthy"] = False
653
+
654
+ return status
655
+
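# Example of the dict returned by health_check() on a healthy instance (illustrative values only):
# {
#     "vector_store_available": True,
#     "vector_db_path": "./src/VectorDB/user_1_vector_db",
#     "models_initialized": True,
#     "embedding_model": "models/embedding-001",
#     "chat_model": "gemini-2.5-flash-lite",
#     "document_count": 42,
#     "vector_store_healthy": True,
# }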
656
+ # ========================================
657
+ # CONVENIENCE FUNCTIONS
658
+ # ========================================
659
+
660
+ def create_rag_system(
661
+ user_id: int = 1,
662
+ base_vector_path: str = "./src/VectorDB",  # consistent with DiaryRAGSystem's default and quick_query
663
+ google_api_key: Optional[str] = None
664
+ ) -> DiaryRAGSystem:
665
+ """
666
+ Create and initialize a user-specific DiaryRAGSystem instance.
667
+
668
+ Args:
669
+ user_id: User ID for user-specific vector database
670
+ base_vector_path: Base path for vector databases
671
+ google_api_key: Google API key
672
+
673
+ Returns:
674
+ Initialized DiaryRAGSystem for the specific user
675
+ """
676
+ return DiaryRAGSystem(
677
+ user_id=user_id,
678
+ base_vector_path=base_vector_path,
679
+ google_api_key=google_api_key
680
+ )
681
+
682
+ def quick_query(
683
+ query: str,
684
+ user_id: int = 1,
685
+ base_vector_path: str = "./src/VectorDB"
686
+ ) -> str:
687
+ """
688
+ Quick query function for testing with user-specific database.
689
+
690
+ Args:
691
+ query: Question to ask
692
+ user_id: User ID for user-specific vector database
693
+ base_vector_path: Base path for vector databases
694
+
695
+ Returns:
696
+ Response string
697
+ """
698
+ try:
699
+ rag = create_rag_system(user_id, base_vector_path)
700
+ return rag.generate_response(query)
701
+ except Exception as e:
702
+ return f"Error: {str(e)}"
703
+
704
+ if __name__ == "__main__":
705
+ # Example usage
706
+ print("🤖 Diary RAG System - Example Usage")
707
+ print("=" * 50)
708
+
709
+ try:
710
+ # Initialize system
711
+ rag = create_rag_system()
712
+
713
+ # Health check
714
+ status = rag.health_check()
715
+ print("System Status:")
716
+ for key, value in status.items():
717
+ print(f" {key}: {value}")
718
+
719
+ # Example queries
720
+ if status.get("vector_store_healthy"):
721
+ print("\n📝 Example Queries:")
722
+
723
+ queries = [
724
+ "Tôi cảm thấy như thế nào trong tuần này?",
725
+ "Có những hoạt động nào tôi đã làm gần đây?",
726
+ "Tâm trạng của tôi đã thay đổi như thế nào?"
727
+ ]
728
+
729
+ for query in queries:
730
+ print(f"\n❓ Query: {query}")
731
+ response = rag.generate_response(query)
732
+ print(f"🤖 Response: {response[:200]}...")
733
+
734
+ except Exception as e:
735
+ print(f"❌ Error: {str(e)}")
736
+ print("Make sure to:")
737
+ print("1. Set GOOGLE_API_KEY environment variable")
738
+ print("2. Run the indexing pipeline first")
739
+ print("3. Check vector database path")
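A minimal usage sketch for the class above, assuming GOOGLE_API_KEY is set, `src` is on the import path, and the user's vector database has already been built by the indexing pipeline (paths follow the defaults used elsewhere in this repo):

import os
from Retrivel_And_Generation.Retrieval_And_Generator import create_rag_system

# Build a per-user RAG system (user 1) against the default VectorDB layout.
rag = create_rag_system(
    user_id=1,
    base_vector_path="./src/VectorDB",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
)

print(rag.health_check())                                           # vector store status and document count
print(rag.generate_fast_response("Tuần này tôi đã làm gì?"))         # speed-optimised single-document path
print(rag.generate_response("Tâm trạng của tôi dạo này thế nào?"))   # full retrieval + generation chain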
clean_repo/src/Retrivel_And_Generation/__pycache__/Retrieval_And_Generator.cpython-311.pyc ADDED
Binary file (31.8 kB). View file
 
clean_repo/src/rag_service/main.py ADDED
@@ -0,0 +1,721 @@
1
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ from typing import List, Dict, Any, Optional
5
+ import os
6
+ import sys
7
+ import uvicorn
8
+ from datetime import datetime
9
+ import json
10
+ import logging
11
+ from fastapi import Query
12
+
13
+ # Load environment variables
14
+ from dotenv import load_dotenv
15
+ load_dotenv()
16
+
17
+ # Add paths for imports
18
+ current_dir = os.path.dirname(os.path.abspath(__file__))
19
+ src_dir = os.path.dirname(current_dir)
20
+ sys.path.append(src_dir)
21
+ sys.path.append(os.path.join(src_dir, "Indexingstep"))
22
+ sys.path.append(os.path.join(src_dir, "Retrivel_And_Generation"))
23
+
24
+ # Import your modules
25
+ try:
26
+ from Indexingstep.pipeline import DiaryIndexingPipeline
27
+ from Retrivel_And_Generation.Retrieval_And_Generator import create_rag_system, DiaryRAGSystem
28
+ RAG_MODULES_AVAILABLE = True
29
+ except ImportError as e:
30
+ print(f"Warning: RAG modules not available: {e}")
31
+ RAG_MODULES_AVAILABLE = False
32
+
33
+ # Configure logging
34
+ os.makedirs("logs", exist_ok=True)  # the file handler fails if the logs/ directory does not exist
+ logging.basicConfig(filename="logs/service.log",
35
+ level=logging.INFO)
36
+ logger = logging.getLogger(__name__)
37
+
38
+ app = FastAPI(
39
+ title="Personal Diary RAG Service",
40
+ description="RAG service for personal diary chatbot with user isolation",
41
+ version="1.0.0"
42
+ )
43
+
44
+ # CORS middleware
45
+ app.add_middleware(
46
+ CORSMiddleware,
47
+ allow_origins=["*"],
48
+ allow_credentials=True,
49
+ allow_methods=["*"],
50
+ allow_headers=["*"],
51
+ )
52
+
53
+ # In-memory cache for RAG systems
54
+ rag_systems_cache: Dict[int, DiaryRAGSystem] = {}
55
+
56
+ # ========================================
57
+ # PYDANTIC MODELS
58
+ # ========================================
59
+
60
+ class DiaryEntry(BaseModel):
61
+ date: str
62
+ content: str
63
+ tags: str = ""
64
+
65
+ class IndexRequest(BaseModel):
66
+ user_id: int
67
+ clear_existing: bool = False
68
+ start_date: Optional[str] = None
69
+ end_date: Optional[str] = None
70
+
71
+ class QueryRequest(BaseModel):
72
+ user_id: int
73
+ query: str
74
+ fast_mode: bool = False
75
+ chat_history: List[Dict[str, str]] = []
76
+
77
+ class UserStatusResponse(BaseModel):
78
+ user_id: int
79
+ status: str
80
+ document_count: int
81
+ vector_db_path: str
82
+ last_updated: Optional[str] = None
83
+ error: Optional[str] = None
84
+
85
+ class QueryResponse(BaseModel):
86
+ user_id: int
87
+ response: str
88
+ processing_time: float
89
+ documents_used: int
90
+ fast_mode: bool
91
+
92
+ class IndexResponse(BaseModel):
93
+ user_id: int
94
+ status: str
95
+ documents_processed: int
96
+ chunks_created: int
97
+ vector_db_path: str
98
+ processing_time: float
99
+ error: Optional[str] = None
100
+
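# Illustrative request shapes for the models above (example values only; note that the
# /users/{user_id}/query endpoint below reads these fields from query parameters, so
# QueryRequest mainly documents the expected payload):
#   IndexRequest -> {"user_id": 1, "clear_existing": true, "start_date": null, "end_date": null}
#   QueryRequest -> {"user_id": 1, "query": "Tuần này tôi đã làm gì?", "fast_mode": false, "chat_history": []}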
101
+ # ========================================
102
+ # HELPER FUNCTIONS
103
+ # ========================================
104
+
105
+ def format_error_message(errors) -> str:
106
+ """Convert error list to string for API response."""
107
+ if isinstance(errors, list):
108
+ return '; '.join(str(e) for e in errors)
109
+ return str(errors) if errors else 'Unknown error'
110
+
111
+ def get_user_paths(user_id: int) -> Dict[str, str]:
112
+ """Get all paths for a user."""
113
+ base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
114
+
115
+ return {
116
+ "vector_db_path": os.path.join(base_dir, "VectorDB", f"user_{user_id}_vector_db"),
117
+ "diary_db_path": os.path.join(base_dir, "streamlit_app", "backend", f"user_{user_id}_diary.db"),
118
+ "base_vector_path": os.path.join(base_dir, "VectorDB")
119
+ }
120
+
121
+ def get_pipeline_config(user_id: int) -> Dict[str, Any]:
122
+ """Get configuration for DiaryIndexingPipeline."""
123
+ paths = get_user_paths(user_id)
124
+
125
+ return {
126
+ "db_path": paths["diary_db_path"],
127
+ "persist_directory": paths["vector_db_path"],
128
+ "collection_name": f"user_{user_id}_diary_entries",
129
+ "google_api_key": os.getenv("GOOGLE_API_KEY"),
130
+ "chunk_size": 800,
131
+ "chunk_overlap": 100,
132
+ "batch_size": 50,
133
+ "user_id": user_id
134
+ }
135
+
136
+ def check_vector_db_exists(user_id: int) -> bool:
137
+ """Check if vector database exists for user."""
138
+ paths = get_user_paths(user_id)
139
+ return os.path.exists(paths["vector_db_path"])
140
+
141
+ def get_document_count(user_id: int) -> int:
142
+ """Get document count from vector database."""
143
+ try:
144
+ if user_id in rag_systems_cache:
145
+ return rag_systems_cache[user_id].get_document_count()
146
+
147
+ if not check_vector_db_exists(user_id):
148
+ return 0
149
+
150
+ # Create temporary RAG system to check count
151
+ paths = get_user_paths(user_id)
152
+ temp_rag = create_rag_system(
153
+ user_id=user_id,
154
+ base_vector_path=paths["base_vector_path"],
155
+ google_api_key=os.getenv("GOOGLE_API_KEY")
156
+ )
157
+
158
+ if temp_rag:
159
+ return temp_rag.get_document_count()
160
+ return 0
161
+
162
+ except Exception as e:
163
+ logger.error(f"Error getting document count for user {user_id}: {e}")
164
+ return 0
165
+
166
+ def get_or_create_rag_system(user_id: int) -> DiaryRAGSystem:
167
+ """Get existing RAG system or create new one."""
168
+ if user_id not in rag_systems_cache:
169
+ if not check_vector_db_exists(user_id):
170
+ raise HTTPException(
171
+ status_code=404,
172
+ detail=f"Vector database not found for user {user_id}. Please run indexing first."
173
+ )
174
+
175
+ paths = get_user_paths(user_id)
176
+ rag_system = create_rag_system(
177
+ user_id=user_id,
178
+ base_vector_path=paths["base_vector_path"],
179
+ google_api_key=os.getenv("GOOGLE_API_KEY")
180
+ )
181
+
182
+ if not rag_system:
183
+ raise HTTPException(
184
+ status_code=500,
185
+ detail=f"Failed to create RAG system for user {user_id}"
186
+ )
187
+
188
+ rag_systems_cache[user_id] = rag_system
189
+ logger.info(f"Created RAG system for user {user_id}")
190
+
191
+ return rag_systems_cache[user_id]
192
+
193
+ # ========================================
194
+ # API ENDPOINTS
195
+ # ========================================
196
+
197
+ @app.get("/")
198
+ async def root():
199
+ """Health check endpoint."""
200
+ return {
201
+ "message": "Personal Diary RAG Service is running",
202
+ "version": "1.0.0",
203
+ "cached_users": list(rag_systems_cache.keys()),
204
+ "vector_db_base": os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "VectorDB")
205
+ }
206
+
207
+ @app.get("/health")
208
+ async def health_check():
209
+ """Detailed health check."""
210
+ try:
211
+ google_api_key = os.getenv("GOOGLE_API_KEY")
212
+ base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
213
+ vector_db_base = os.path.join(base_dir, "VectorDB")
214
+
215
+ return {
216
+ "status": "healthy",
217
+ "google_api_configured": bool(google_api_key),
218
+ "vector_db_base_exists": os.path.exists(vector_db_base),
219
+ "cached_users": list(rag_systems_cache.keys()),
220
+ "timestamp": datetime.now().isoformat()
221
+ }
222
+ except Exception as e:
223
+ raise HTTPException(status_code=500, detail=f"Health check failed: {str(e)}")
224
+
225
+ @app.get("/users/{user_id}/ai-availability")
226
+ async def check_ai_availability(user_id: int):
227
+ """Check AI availability and provide detailed status for troubleshooting."""
228
+ try:
229
+ # Check all prerequisites for AI availability
230
+ availability_info = {
231
+ "user_id": user_id,
232
+ "overall_status": "checking",
233
+ "checks": {
234
+ "rag_modules": {
235
+ "available": RAG_MODULES_AVAILABLE,
236
+ "status": "✅ Available" if RAG_MODULES_AVAILABLE else "❌ Not Available",
237
+ "details": "Required modules: DiaryIndexingPipeline, DiaryRAGSystem"
238
+ },
239
+ "google_api_key": {
240
+ "configured": bool(os.getenv("GOOGLE_API_KEY")),
241
+ "status": "✅ Configured" if os.getenv("GOOGLE_API_KEY") else "❌ Not Configured",
242
+ "details": "Required for embeddings and LLM responses"
243
+ },
244
+ "vector_database": {
245
+ "exists": check_vector_db_exists(user_id),
246
+ "status": "✅ Exists" if check_vector_db_exists(user_id) else "⚠️ Not Found",
247
+ "path": get_user_paths(user_id)["vector_db_path"]
248
+ },
249
+ "document_count": {
250
+ "count": get_document_count(user_id),
251
+ "status": "✅ Has Documents" if get_document_count(user_id) > 0 else "⚠️ Empty",
252
+ "details": f"{get_document_count(user_id)} documents indexed"
253
+ }
254
+ },
255
+ "recommendations": [],
256
+ "actions": []
257
+ }
258
+
259
+ # Determine overall status and recommendations
260
+ if not RAG_MODULES_AVAILABLE:
261
+ availability_info["overall_status"] = "unavailable"
262
+ availability_info["recommendations"].append("Install missing RAG modules")
263
+ availability_info["actions"].append({
264
+ "action": "check_imports",
265
+ "description": "Verify DiaryIndexingPipeline and DiaryRAGSystem imports"
266
+ })
267
+ elif not os.getenv("GOOGLE_API_KEY"):
268
+ availability_info["overall_status"] = "not_configured"
269
+ availability_info["recommendations"].append("Configure Google API key")
270
+ availability_info["actions"].append({
271
+ "action": "set_api_key",
272
+ "description": "Add GOOGLE_API_KEY to environment variables"
273
+ })
274
+ elif not check_vector_db_exists(user_id):
275
+ availability_info["overall_status"] = "needs_indexing"
276
+ availability_info["recommendations"].append("Create vector database for user")
277
+ availability_info["actions"].append({
278
+ "action": "initial_index",
279
+ "endpoint": f"/users/{user_id}/auto-index-new-entry",
280
+ "description": "Run initial indexing to create vector database"
281
+ })
282
+ elif get_document_count(user_id) == 0:
283
+ availability_info["overall_status"] = "empty_database"
284
+ availability_info["recommendations"].append("Add diary entries or rebuild index")
285
+ availability_info["actions"].append({
286
+ "action": "check_diary_entries",
287
+ "description": "Verify user has diary entries in database"
288
+ })
289
+ availability_info["actions"].append({
290
+ "action": "rebuild_index",
291
+ "endpoint": f"/users/{user_id}/auto-index-new-entry",
292
+ "description": "Rebuild vector database from existing entries"
293
+ })
294
+ else:
295
+ availability_info["overall_status"] = "available"
296
+ availability_info["recommendations"].append("AI is ready for use")
297
+ availability_info["actions"].append({
298
+ "action": "query_ready",
299
+ "endpoint": f"/users/{user_id}/query",
300
+ "description": "AI is ready to answer questions"
301
+ })
302
+
303
+ # Add cache status
304
+ availability_info["cache_status"] = {
305
+ "user_cached": user_id in rag_systems_cache,
306
+ "total_cached_users": len(rag_systems_cache),
307
+ "cached_users": list(rag_systems_cache.keys())
308
+ }
309
+
310
+ return availability_info
311
+
312
+ except Exception as e:
313
+ logger.error(f"Error checking AI availability for user {user_id}: {e}")
314
+ return {
315
+ "user_id": user_id,
316
+ "overall_status": "error",
317
+ "error": str(e),
318
+ "recommendations": ["Check service logs for detailed error information"],
319
+ "actions": [{
320
+ "action": "check_logs",
321
+ "description": "Review service logs for error details"
322
+ }]
323
+ }
324
+
325
+ @app.post("/users/{user_id}/fix-ai-availability")
326
+ async def fix_ai_availability(user_id: int):
327
+ """Attempt to automatically fix AI availability issues."""
328
+ try:
329
+ if not RAG_MODULES_AVAILABLE:
330
+ return {
331
+ "status": "cannot_fix",
332
+ "reason": "RAG modules not available - requires code/environment fix",
333
+ "action_needed": "Install missing Python modules"
334
+ }
335
+
336
+ if not os.getenv("GOOGLE_API_KEY"):
337
+ return {
338
+ "status": "cannot_fix",
339
+ "reason": "Google API key not configured",
340
+ "action_needed": "Set GOOGLE_API_KEY environment variable"
341
+ }
342
+
343
+ # Try to fix vector database issues
344
+ if not check_vector_db_exists(user_id) or get_document_count(user_id) == 0:
345
+ logger.info(f"Attempting to fix AI availability for user {user_id}")
346
+
347
+ # Clear cache first
348
+ if user_id in rag_systems_cache:
349
+ del rag_systems_cache[user_id]
350
+
351
+ # Create/rebuild vector database
352
+ config = get_pipeline_config(user_id)
353
+ paths = get_user_paths(user_id)
354
+ os.makedirs(os.path.dirname(paths["vector_db_path"]), exist_ok=True)
355
+
356
+ pipeline = DiaryIndexingPipeline(**config)
357
+ results = pipeline.run_full_pipeline(clear_existing=True)
358
+
359
+ if results.get('status') == 'completed_successfully':
360
+ doc_count = get_document_count(user_id)
361
+ return {
362
+ "status": "fixed",
363
+ "action_taken": "Created/rebuilt vector database",
364
+ "documents_processed": results.get('documents_loaded', 0),
365
+ "chunks_created": results.get('chunks_created', 0),
366
+ "final_document_count": doc_count,
367
+ "ai_status": "ready" if doc_count > 0 else "empty"
368
+ }
369
+ else:
370
+ return {
371
+ "status": "fix_failed",
372
+ "reason": "Failed to create vector database",
373
+ "error": format_error_message(results.get('errors', 'Unknown error'))
374
+ }
375
+ else:
376
+ return {
377
+ "status": "already_available",
378
+ "message": "AI is already available for this user",
379
+ "document_count": get_document_count(user_id)
380
+ }
381
+
382
+ except Exception as e:
383
+ logger.error(f"Error fixing AI availability for user {user_id}: {e}")
384
+ return {
385
+ "status": "error",
386
+ "error": str(e),
387
+ "action_needed": "Check service logs and try manual troubleshooting"
388
+ }
389
+
390
+ @app.get("/users/{user_id}/status", response_model=UserStatusResponse)
391
+ async def get_user_status(user_id: int):
392
+ """Get RAG system status for a user."""
393
+ try:
394
+ paths = get_user_paths(user_id)
395
+
396
+ if not check_vector_db_exists(user_id):
397
+ return UserStatusResponse(
398
+ user_id=user_id,
399
+ status="not_indexed",
400
+ document_count=0,
401
+ vector_db_path=paths["vector_db_path"]
402
+ )
403
+
404
+ doc_count = get_document_count(user_id)
405
+
406
+ return UserStatusResponse(
407
+ user_id=user_id,
408
+ status="ready" if doc_count > 0 else "empty",
409
+ document_count=doc_count,
410
+ vector_db_path=paths["vector_db_path"],
411
+ last_updated=datetime.now().isoformat()
412
+ )
413
+
414
+ except Exception as e:
415
+ logger.error(f"Error getting status for user {user_id}: {e}")
416
+ return UserStatusResponse(
417
+ user_id=user_id,
418
+ status="error",
419
+ document_count=0,
420
+ vector_db_path="",
421
+ error=str(e)
422
+ )
423
+
424
+ @app.post("/users/{user_id}/index", response_model=IndexResponse)
425
+ async def index_user_data(user_id: int, request: IndexRequest, background_tasks: BackgroundTasks):
426
+ """Index diary entries for a user."""
427
+ start_time = datetime.now()
428
+
429
+ try:
430
+ # Ensure VectorDB directory exists
431
+ paths = get_user_paths(user_id)
432
+ os.makedirs(os.path.dirname(paths["vector_db_path"]), exist_ok=True)
433
+
434
+ # Get pipeline configuration
435
+ config = get_pipeline_config(user_id)
436
+
437
+ logger.info(f"Starting indexing for user {user_id} with config: {config}")
438
+
439
+ # Create and run pipeline
440
+ pipeline = DiaryIndexingPipeline(**config)
441
+
442
+ if request.start_date and request.end_date:
443
+ # Date range indexing
444
+ results = pipeline.run_full_pipeline(
445
+ start_date=request.start_date,
446
+ end_date=request.end_date,
447
+ clear_existing=request.clear_existing
448
+ )
449
+ else:
450
+ # Full indexing
451
+ results = pipeline.run_full_pipeline(clear_existing=request.clear_existing)
452
+
453
+ processing_time = (datetime.now() - start_time).total_seconds()
454
+
455
+ if results.get('status') == 'completed_successfully':
456
+ # Clear cache to force reload
457
+ if user_id in rag_systems_cache:
458
+ del rag_systems_cache[user_id]
459
+
460
+ return IndexResponse(
461
+ user_id=user_id,
462
+ status="success",
463
+ documents_processed=results.get('documents_loaded', 0),
464
+ chunks_created=results.get('chunks_created', 0),
465
+ vector_db_path=paths["vector_db_path"],
466
+ processing_time=processing_time
467
+ )
468
+ else:
469
+ return IndexResponse(
470
+ user_id=user_id,
471
+ status="failed",
472
+ documents_processed=0,
473
+ chunks_created=0,
474
+ vector_db_path=paths["vector_db_path"],
475
+ processing_time=processing_time,
476
+ error=format_error_message(results.get('errors', 'Unknown error'))
477
+ )
478
+
479
+ except Exception as e:
480
+ processing_time = (datetime.now() - start_time).total_seconds()
481
+ logger.error(f"Indexing error for user {user_id}: {e}")
482
+
483
+ return IndexResponse(
484
+ user_id=user_id,
485
+ status="error",
486
+ documents_processed=0,
487
+ chunks_created=0,
488
+ vector_db_path="",
489
+ processing_time=processing_time,
490
+ error=str(e)
491
+ )
492
+
493
+ @app.post("/users/{user_id}/incremental-index")
494
+ async def incremental_index(user_id: int, start_date: Optional[str] = None):
495
+ """Run incremental indexing for user."""
496
+ try:
497
+ config = get_pipeline_config(user_id)
498
+ pipeline = DiaryIndexingPipeline(**config)
499
+
500
+ if start_date:
501
+ results = pipeline.incremental_update(start_date)
502
+ else:
503
+ # Default to last 7 days
504
+ from datetime import timedelta
505
+ default_start = (datetime.now() - timedelta(days=7)).strftime("%Y-%m-%d")
506
+ results = pipeline.incremental_update(default_start)
507
+
508
+ if results.get('status') == 'success':
509
+ # Clear cache to force reload
510
+ if user_id in rag_systems_cache:
511
+ del rag_systems_cache[user_id]
512
+
513
+ return {
514
+ "user_id": user_id,
515
+ "status": "success",
516
+ "documents_added": results.get('documents_added', 0),
517
+ "start_date": start_date or default_start
518
+ }
519
+ else:
520
+ raise HTTPException(
521
+ status_code=500,
522
+ detail=f"Incremental indexing failed: {results.get('error', 'Unknown error')}"
523
+ )
524
+
525
+ except HTTPException:
526
+ raise
527
+ except Exception as e:
528
+ logger.error(f"Incremental indexing error for user {user_id}: {e}")
529
+ raise HTTPException(status_code=500, detail=str(e))
530
+
531
+ @app.get("/users/{user_id}/query", response_model=QueryResponse)
532
+ async def query_user_rag(
533
+ user_id: int,
534
+ query: str = Query(...),
535
+ fast_mode: bool = Query(False),
536
+ chat_history: str = Query("[]")
537
+ ):
538
+ """Query RAG system for a user."""
539
+ start_time = datetime.now()
540
+ import json
541
+
542
+ try:
543
+ rag_system = get_or_create_rag_system(user_id)
544
+ chat_history_list = json.loads(chat_history)
545
+ if fast_mode:
546
+ response = rag_system.generate_fast_response(query=query)
547
+ else:
548
+ response = rag_system.generate_contextual_response(
549
+ query=query,
550
+ chat_history=chat_history_list
551
+ )
552
+ processing_time = (datetime.now() - start_time).total_seconds()
553
+ return QueryResponse(
554
+ user_id=user_id,
555
+ response=response,
556
+ processing_time=processing_time,
557
+ documents_used=5,  # nominal value; the underlying call does not report the exact retrieval count
558
+ fast_mode=fast_mode
559
+ )
560
+ except HTTPException:
561
+ raise
562
+ except Exception as e:
563
+ logger.error(f"Query error for user {user_id}: {e}")
564
+ raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
565
+
566
+ @app.post("/users/{user_id}/auto-index-new-entry")
567
+ async def auto_index_new_entry(user_id: int):
568
+ """Auto-index after saving new diary entry. Creates initial index if not exists."""
569
+ try:
570
+ if not RAG_MODULES_AVAILABLE:
571
+ return {"status": "skipped", "reason": "RAG modules not available"}
572
+
573
+ # Check if vector DB exists
574
+ if not check_vector_db_exists(user_id):
575
+ # First time - create full index
576
+ logger.info(f"Creating initial vector database for user {user_id}")
577
+
578
+ config = get_pipeline_config(user_id)
579
+ paths = get_user_paths(user_id)
580
+ os.makedirs(os.path.dirname(paths["vector_db_path"]), exist_ok=True)
581
+
582
+ pipeline = DiaryIndexingPipeline(**config)
583
+ results = pipeline.run_full_pipeline(clear_existing=True)
584
+
585
+ if results.get('status') == 'completed_successfully':
586
+ # Clear cache to force reload
587
+ if user_id in rag_systems_cache:
588
+ del rag_systems_cache[user_id]
589
+
590
+ return {
591
+ "status": "initial_index_created",
592
+ "message": f"Created initial vector database for user {user_id}",
593
+ "documents_processed": results.get('documents_loaded', 0),
594
+ "chunks_created": results.get('chunks_created', 0)
595
+ }
596
+ else:
597
+ return {
598
+ "status": "failed",
599
+ "error": format_error_message(results.get('errors', 'Unknown error'))
600
+ }
601
+ else:
602
+ # Incremental update for existing DB
603
+ config = get_pipeline_config(user_id)
604
+ pipeline = DiaryIndexingPipeline(**config)
605
+
606
+ # Get recent entries (last 3 days to catch new ones)
607
+ from datetime import timedelta
608
+ start_date = (datetime.now() - timedelta(days=3)).strftime("%Y-%m-%d")
609
+ results = pipeline.incremental_update(start_date)
610
+
611
+ if results.get('status') == 'success':
612
+ # Clear cache to force reload
613
+ if user_id in rag_systems_cache:
614
+ del rag_systems_cache[user_id]
615
+
616
+ documents_added = results.get('documents_added', 0)
617
+ return {
618
+ "status": "incremental_update_success",
619
+ "message": f"Updated vector database for user {user_id}",
620
+ "documents_added": documents_added
621
+ }
622
+ else:
623
+ # If incremental fails, try full rebuild
624
+ logger.warning(f"Incremental update failed for user {user_id}, trying full rebuild")
625
+ results = pipeline.run_full_pipeline(clear_existing=True)
626
+
627
+ if results.get('status') == 'completed_successfully':
628
+ if user_id in rag_systems_cache:
629
+ del rag_systems_cache[user_id]
630
+
631
+ return {
632
+ "status": "full_rebuild_success",
633
+ "message": f"Rebuilt vector database for user {user_id}",
634
+ "documents_processed": results.get('documents_loaded', 0)
635
+ }
636
+ else:
637
+ return {
638
+ "status": "failed",
639
+ "error": f"Both incremental and full rebuild failed: {format_error_message(results.get('errors', 'Unknown error'))}"
640
+ }
641
+
642
+ except Exception as e:
643
+ logger.error(f"Auto-index error for user {user_id}: {e}")
644
+ return {"status": "error", "error": str(e)}
645
+
646
+ @app.delete("/users/{user_id}/cache")
647
+ async def clear_user_cache(user_id: int):
648
+ """Clear RAG system cache for a user."""
649
+ if user_id in rag_systems_cache:
650
+ del rag_systems_cache[user_id]
651
+ logger.info(f"Cleared cache for user {user_id}")
652
+ return {"message": f"Cache cleared for user {user_id}"}
653
+ else:
654
+ return {"message": f"No cache found for user {user_id}"}
655
+
656
+ @app.delete("/users/{user_id}/vector-db")
657
+ async def delete_user_vector_db(user_id: int):
658
+ """Delete vector database for a user."""
659
+ try:
660
+ paths = get_user_paths(user_id)
661
+
662
+ # Clear cache first
663
+ if user_id in rag_systems_cache:
664
+ del rag_systems_cache[user_id]
665
+
666
+ # Delete vector database directory
667
+ if os.path.exists(paths["vector_db_path"]):
668
+ import shutil
669
+ shutil.rmtree(paths["vector_db_path"])
670
+ logger.info(f"Deleted vector database for user {user_id}")
671
+ return {"message": f"Vector database deleted for user {user_id}"}
672
+ else:
673
+ return {"message": f"No vector database found for user {user_id}"}
674
+
675
+ except Exception as e:
676
+ logger.error(f"Error deleting vector database for user {user_id}: {e}")
677
+ raise HTTPException(status_code=500, detail=str(e))
678
+
679
+ @app.get("/stats")
680
+ async def get_service_stats():
681
+ """Get service statistics."""
682
+ try:
683
+ base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
684
+ vector_db_base = os.path.join(base_dir, "VectorDB")
685
+
686
+ # Get list of existing vector databases
687
+ existing_dbs = []
688
+ if os.path.exists(vector_db_base):
689
+ for item in os.listdir(vector_db_base):
690
+ if item.startswith("user_") and item.endswith("_vector_db"):
691
+ user_id = int(item.replace("user_", "").replace("_vector_db", ""))
692
+ doc_count = get_document_count(user_id)
693
+ existing_dbs.append({
694
+ "user_id": user_id,
695
+ "path": os.path.join(vector_db_base, item),
696
+ "document_count": doc_count
697
+ })
698
+
699
+ return {
700
+ "cached_users": list(rag_systems_cache.keys()),
701
+ "total_cached_systems": len(rag_systems_cache),
702
+ "existing_vector_databases": existing_dbs,
703
+ "vector_db_base_path": vector_db_base,
704
+ "service_status": "running"
705
+ }
706
+
707
+ except Exception as e:
708
+ logger.error(f"Error getting stats: {e}")
709
+ raise HTTPException(status_code=500, detail=str(e))
710
+
711
+ if __name__ == "__main__":
712
+ # Ensure VectorDB directory exists
713
+ base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
714
+ vector_db_dir = os.path.join(base_dir, "VectorDB")
715
+ os.makedirs(vector_db_dir, exist_ok=True)
716
+
717
+ print(f"🚀 Starting RAG Service...")
718
+ print(f"📁 Vector DB base path: {vector_db_dir}")
719
+ print(f"🔑 Google API Key configured: {bool(os.getenv('GOOGLE_API_KEY'))}")
720
+
721
+ uvicorn.run(app, host="127.0.0.1", port=8001, reload=False)
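A small client sketch against the service above, assuming it is running locally on 127.0.0.1:8001 as configured in the __main__ block and that user 1 already has diary entries in its SQLite database:

import requests

BASE = "http://127.0.0.1:8001"

# Build (or rebuild) the vector database for user 1, then check its status.
print(requests.post(f"{BASE}/users/1/index", json={"user_id": 1, "clear_existing": True}).json())
print(requests.get(f"{BASE}/users/1/status").json())

# Ask a question; fast_mode=true trades answer depth for latency.
resp = requests.get(
    f"{BASE}/users/1/query",
    params={"query": "Tuần này tôi đã làm gì?", "fast_mode": "false", "chat_history": "[]"},
)
print(resp.json()["response"])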
clean_repo/src/simple_diary_chatbot.py ADDED
@@ -0,0 +1,274 @@
1
+ """
2
+ Simple Diary Chatbot (RAG Minimal Core)
3
+
4
+ Required features (kept deliberately simple, but still a real RAG system):
5
+ 1. add -> save the entry to SQLite + mandatory CHUNKING + EMBEDDING + store the vectors
6
+ 2. delete -> delete an entry (from the DB and from the vector store, keyed by entry_id)
7
+ 3. chat -> similarity search (top k); if an API key is available, generate an answer, otherwise return the raw context
8
+
9
+ Embedding is MANDATORY (this is a RAG product). Without GOOGLE_API_KEY a clear error is raised.
10
+
11
+ Minimal chunking: fixed-length slices (800 characters by default) with no overlap, to keep things simple.
12
+
13
+ This file replaces the earlier, more complex pipelines when you only need basic RAG CRUD.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ import sqlite3
20
+ from dataclasses import dataclass
21
+ from typing import List, Optional, Dict, Any
22
+ from datetime import datetime
23
+ import logging
24
+ from textwrap import wrap
25
+ import asyncio
26
+
27
+ # Fix event loop issue for Streamlit
28
+ try:
29
+ import nest_asyncio
30
+ nest_asyncio.apply()
31
+ except ImportError:
32
+ pass
33
+
34
+ # Reuse the existing embedding layer (Chroma + Google embeddings)
35
+ from Indexingstep.embedding_and_storing import DiaryEmbeddingAndStorage
36
+
37
+ try:
38
+ import google.generativeai as genai # type: ignore
39
+ except Exception: # pragma: no cover
40
+ genai = None
41
+
42
+ logging.basicConfig(level=logging.INFO)
43
+ logger = logging.getLogger("simple_diary")
44
+
45
+ DB_PATH = os.path.join(os.getcwd(), "diary.db")  # a single DB file
46
+ CHUNK_SIZE = 800  # adjustable if needed
47
+
48
+
49
+ def get_conn():
50
+ return sqlite3.connect(DB_PATH)
51
+
52
+
53
+ def ensure_db():
54
+ conn = get_conn()
55
+ cur = conn.cursor()
56
+ cur.execute(
57
+ """
58
+ CREATE TABLE IF NOT EXISTS diary_entries (
59
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
60
+ date TEXT NOT NULL,
61
+ content TEXT NOT NULL,
62
+ tags TEXT DEFAULT '',
63
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
64
+ )
65
+ """
66
+ )
67
+ conn.commit()
68
+ conn.close()
69
+
70
+
71
+ @dataclass
72
+ class DiaryEntry:
73
+ id: int
74
+ date: str
75
+ content: str
76
+ tags: str
77
+ created_at: str
78
+
79
+
80
+ class SimpleDiaryChatbot:
81
+ """Minimal RAG core – a working embedding backend is always required."""
82
+
83
+ def __init__(self, api_key: Optional[str] = None, user_id: int = 1, chunk_size: int = CHUNK_SIZE):
84
+ if api_key:
85
+ os.environ["GOOGLE_API_KEY"] = api_key
86
+ ensure_db()
87
+ self.user_id = user_id
88
+ self.chunk_size = chunk_size
89
+
90
+ key = os.getenv("GOOGLE_API_KEY")
91
+ if not key:
92
+ raise RuntimeError(
93
+ "GOOGLE_API_KEY chưa được thiết lập. Set trong PowerShell: $env:GOOGLE_API_KEY='YOUR_KEY'"
94
+ )
95
+
96
+ # Fix event loop for Streamlit
97
+ try:
98
+ loop = asyncio.get_event_loop()
99
+ except RuntimeError:
100
+ loop = asyncio.new_event_loop()
101
+ asyncio.set_event_loop(loop)
102
+
103
+ # Initialize embeddings + vector store (mandatory)
104
+ self.embedding_store = DiaryEmbeddingAndStorage(user_id=user_id, api_key=key)
105
+
106
+ # (Optional) LLM for natural-language answers – on failure, fall back to raw retrieved context
107
+ self._model = None
108
+ if genai:
109
+ try:
110
+ genai.configure(api_key=key)
111
+ self._model = genai.GenerativeModel("gemini-1.5-flash")
112
+ except Exception as e:
113
+ logger.warning(f"Không khởi tạo được LLM (tiếp tục với retrieval-only): {e}")
114
+
115
+ # ------------- CRUD -------------
116
+ def _chunk(self, text: str) -> List[str]:
117
+ """Simple fixed-length chunking; cut back to the nearest whitespace when possible."""
118
+ if len(text) <= self.chunk_size:
119
+ return [text]
120
+ chunks: List[str] = []
121
+ start = 0
122
+ while start < len(text):
123
+ end = min(start + self.chunk_size, len(text))
124
+ # Back up to the nearest whitespace to avoid splitting a word
125
+ if end < len(text):
126
+ last_space = text.rfind(" ", start, end)
127
+ if last_space != -1 and last_space - start > self.chunk_size * 0.5:
128
+ end = last_space
129
+ chunks.append(text[start:end].strip())
130
+ start = end
131
+ return [c for c in chunks if c]
132
+
133
+ def add_entry(self, date: str, content: str, tags: str = "") -> int:
134
+ conn = get_conn()
135
+ cur = conn.cursor()
136
+ cur.execute(
137
+ "INSERT INTO diary_entries(date, content, tags) VALUES (?, ?, ?)",
138
+ (date, content, tags),
139
+ )
140
+ entry_id = cur.lastrowid
141
+ conn.commit()
142
+ conn.close()
143
+
144
+ # Chunk the content and embed each chunk (shared per-entry metadata)
145
+ chunks = self._chunk(content)
146
+ metadatas = [
147
+ {"entry_id": entry_id, "date": date, "tags": tags, "chunk_index": i, "total_chunks": len(chunks)}
148
+ for i, _ in enumerate(chunks)
149
+ ]
150
+ self.embedding_store.embed_and_store_texts(chunks, metadatas)
151
+ # logger.info(f"Added entry {entry_id} với {len(chunks)} chunk")
152
+ return entry_id
153
+
154
+ def delete_entry(self, entry_id: int) -> bool:
155
+ # Delete vectors by metadata (entry_id)
156
+ try:
157
+ self.embedding_store.delete_documents_by_metadata({"entry_id": entry_id})
158
+ except Exception as e:
159
+ logger.warning(f"Failed to delete vectors for entry {entry_id}: {e}")
160
+
161
+ conn = get_conn()
162
+ cur = conn.cursor()
163
+ cur.execute("DELETE FROM diary_entries WHERE id = ?", (entry_id,))
164
+ deleted = cur.rowcount
165
+ conn.commit()
166
+ conn.close()
167
+ if deleted:
168
+ logger.info(f"Deleted entry {entry_id}")
169
+ return True
170
+ logger.warning(f"Entry {entry_id} not found")
171
+ return False
172
+
173
+ def list_entries(self, limit: int = 10) -> List[DiaryEntry]:
174
+ conn = get_conn()
175
+ cur = conn.cursor()
176
+ cur.execute(
177
+ "SELECT id, date, content, tags, created_at FROM diary_entries ORDER BY created_at DESC LIMIT ?",
178
+ (limit,),
179
+ )
180
+ rows = [DiaryEntry(*r) for r in cur.fetchall()]
181
+ conn.close()
182
+ return rows
183
+
184
+ # ------------- Chat -------------
185
+ def chat(self, question: str, k: int = 4) -> Dict[str, Any]:
186
+ """
187
+ Returns:
188
+ {
189
+ 'answer': str,
190
+ 'contexts': [ { 'snippet': ..., 'date': ..., 'entry_id': ... } ]
191
+ }
192
+ """
193
+ try:
194
+ results = self.embedding_store.similarity_search(question, k=k)
195
+ except Exception as e:
196
+ logger.warning(f"Similarity search failed: {e}")
197
+ results = []
198
+ contexts = []
199
+ for doc in results:
200
+ contexts.append(
201
+ {
202
+ "snippet": doc.page_content[:300],
203
+ "date": doc.metadata.get("date"),
204
+ "entry_id": doc.metadata.get("entry_id"),
205
+ "tags": doc.metadata.get("tags"),
206
+ }
207
+ )
208
+
209
+ if self._model and contexts:
210
+ context_text = "\n".join(
211
+ [f"[Entry {c['entry_id']} - {c['date']}] {c['snippet']}" for c in contexts]
212
+ )
213
+ prompt = (
214
+ "You are a helpful diary assistant. Use only the context below to answer.\n\n"
215
+ f"CONTEXT:\n{context_text}\n\nQUESTION: {question}\n\nAnswer in the same language as the question."
216
+ )
217
+ try:
218
+ resp = self._model.generate_content(prompt)
219
+ answer = resp.text.strip()
220
+ except Exception as e:
221
+ answer = f"(LLM error, showing raw context) -> {e}\n" + " | ".join(
222
+ c["snippet"] for c in contexts
223
+ )
224
+ else:
225
+ answer = " | ".join(c["snippet"] for c in contexts) if contexts else "Không tìm thấy nội dung liên quan."
226
+
227
+ return {"answer": answer, "contexts": contexts}
228
+
229
+
230
+ def _cli(): # Simple command line interface
231
+ import argparse
232
+ parser = argparse.ArgumentParser(description="Simple Diary Chatbot")
233
+ sub = parser.add_subparsers(dest="cmd")
234
+
235
+ p_add = sub.add_parser("add", help="Add a diary entry")
236
+ p_add.add_argument("--date", default=datetime.now().strftime("%Y-%m-%d"))
237
+ p_add.add_argument("--content", required=True)
238
+ p_add.add_argument("--tags", default="")
239
+
240
+ p_del = sub.add_parser("delete", help="Delete an entry by id")
241
+ p_del.add_argument("--id", type=int, required=True)
242
+
243
+ p_chat = sub.add_parser("chat", help="Ask a question")
244
+ p_chat.add_argument("--q", required=True, help="Question")
245
+ p_chat.add_argument("--k", type=int, default=4)
246
+
247
+ p_list = sub.add_parser("list", help="List recent entries")
248
+ p_list.add_argument("--limit", type=int, default=5)
249
+
250
+ args = parser.parse_args()
251
+ bot = SimpleDiaryChatbot(api_key=os.getenv("GOOGLE_API_KEY"))
252
+
253
+ if args.cmd == "add":
254
+ eid = bot.add_entry(args.date, args.content, args.tags)
255
+ print(f"Added entry id={eid}")
256
+ elif args.cmd == "delete":
257
+ ok = bot.delete_entry(args.id)
258
+ print("Deleted" if ok else "Not found")
259
+ elif args.cmd == "chat":
260
+ resp = bot.chat(args.q, k=args.k)
261
+ print("Answer:\n", resp["answer"])
262
+ print("\nContexts:")
263
+ for c in resp["contexts"]:
264
+ print(f"- ({c['entry_id']}) {c['date']} :: {c['snippet'][:80]}...")
265
+ elif args.cmd == "list":
266
+ entries = bot.list_entries(limit=args.limit)
267
+ for e in entries:
268
+ print(f"{e.id} | {e.date} | {e.tags} | {e.content[:60]}...")
269
+ else:
270
+ parser.print_help()
271
+
272
+
273
+ if __name__ == "__main__":
274
+ _cli()
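A programmatic usage sketch for SimpleDiaryChatbot (the CLI above wraps the same calls), assuming GOOGLE_API_KEY is set and the script runs with `src` on the import path:

import os
from simple_diary_chatbot import SimpleDiaryChatbot

bot = SimpleDiaryChatbot(api_key=os.getenv("GOOGLE_API_KEY"), user_id=1)

# add -> chunk + embed + store, returns the SQLite row id
entry_id = bot.add_entry("2024-05-01", "Hôm nay tôi đi dạo và cảm thấy rất thư thái.", tags="walk,mood")

# chat -> similarity search over the stored chunks, optionally answered by the LLM
result = bot.chat("Gần đây tôi cảm thấy thế nào?", k=4)
print(result["answer"])
for ctx in result["contexts"]:
    print(ctx["entry_id"], ctx["date"], ctx["snippet"][:60])

# delete -> removes the SQLite row and its vectors
bot.delete_entry(entry_id)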
clean_repo/src/streamlit_app.py ADDED
@@ -0,0 +1,40 @@
1
+ import altair as alt
2
+ import numpy as np
3
+ import pandas as pd
4
+ import streamlit as st
5
+
6
+ """
7
+ # Welcome to Streamlit!
8
+
9
+ Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
+ If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
+ forums](https://discuss.streamlit.io).
12
+
13
+ In the meantime, below is an example of what you can do with just a few lines of code:
14
+ """
15
+
16
+ num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
+ num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
+
19
+ indices = np.linspace(0, 1, num_points)
20
+ theta = 2 * np.pi * num_turns * indices
21
+ radius = indices
22
+
23
+ x = radius * np.cos(theta)
24
+ y = radius * np.sin(theta)
25
+
26
+ df = pd.DataFrame({
27
+ "x": x,
28
+ "y": y,
29
+ "idx": indices,
30
+ "rand": np.random.randn(num_points),
31
+ })
32
+
33
+ st.altair_chart(alt.Chart(df, height=700, width=700)
34
+ .mark_point(filled=True)
35
+ .encode(
36
+ x=alt.X("x", axis=None),
37
+ y=alt.Y("y", axis=None),
38
+ color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
+ size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
+ ))
clean_repo/src/streamlit_app/__pycache__/auth_ui.cpython-311.pyc ADDED
Binary file (17.9 kB). View file
 
clean_repo/src/streamlit_app/__pycache__/auto_sync.cpython-311.pyc ADDED
Binary file (18.5 kB). View file
 
clean_repo/src/streamlit_app/__pycache__/rag_client.cpython-311.pyc ADDED
Binary file (10.5 kB). View file
 
clean_repo/src/streamlit_app/__pycache__/user_auth.cpython-311.pyc ADDED
Binary file (12.9 kB). View file