pikamomo commited on
Commit
c32cdfb
Β·
0 Parent(s):

initial deploy

Browse files
Files changed (13) hide show
  1. .gitignore +158 -0
  2. ARCHITECTURE.md +555 -0
  3. README.md +27 -0
  4. admin.py +422 -0
  5. app.py +149 -0
  6. chatbot-widget.html +336 -0
  7. requirements.txt +30 -0
  8. src/__init__.py +0 -0
  9. src/chatbot.py +195 -0
  10. src/ingestion.py +102 -0
  11. src/scraper.py +148 -0
  12. src/vector_store.py +134 -0
  13. tests/test_connections.py +82 -0
.gitignore ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ PIPFILE.lock
28
+
29
+ # PyInstaller
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ *.py,cover
48
+ .hypothesis/
49
+ .pytest_cache/
50
+ cover/
51
+
52
+ # Translations
53
+ *.mo
54
+ *.pot
55
+
56
+ # Django stuff:
57
+ *.log
58
+ local_settings.py
59
+ db.sqlite3
60
+ db.sqlite3-journal
61
+
62
+ # Flask stuff:
63
+ instance/
64
+ .webassets-cache
65
+
66
+ # Scrapy stuff:
67
+ .scrapy
68
+
69
+ # Sphinx documentation
70
+ docs/_build/
71
+
72
+ # PyBuilder
73
+ .pybuilder/
74
+ target/
75
+
76
+ # Jupyter Notebook
77
+ .ipynb_checkpoints
78
+
79
+ # IPython
80
+ profile_default/
81
+ ipython_config.py
82
+
83
+ # pyenv
84
+ .python-version
85
+
86
+ # pipenv
87
+ Pipfile.lock
88
+
89
+ # poetry
90
+ poetry.lock
91
+
92
+ # pdm
93
+ .pdm.toml
94
+
95
+ # PEP 582
96
+ __pypackages__/
97
+
98
+ # Celery stuff
99
+ celerybeat-schedule
100
+ celerybeat.pid
101
+
102
+ # SageMath parsed files
103
+ *.sage.py
104
+
105
+ # Environments
106
+ .env
107
+ .venv
108
+ env/
109
+ venv/
110
+ ENV/
111
+ env.bak/
112
+ venv.bak/
113
+
114
+ # Spyder project settings
115
+ .spyderproject
116
+ .spyproject
117
+
118
+ # Rope project settings
119
+ .ropeproject
120
+
121
+ # mkdocs documentation
122
+ /site
123
+
124
+ # mypy
125
+ .mypy_cache/
126
+ .dmypy.json
127
+ dmypy.json
128
+
129
+ # Pyre type checker
130
+ .pyre/
131
+
132
+ # pytype static type analyzer
133
+ .pytype/
134
+
135
+ # Cython debug symbols
136
+ cython_debug/
137
+
138
+ # IDEs
139
+ .vscode/
140
+ .idea/
141
+ *.swp
142
+ *.swo
143
+ *~
144
+ .DS_Store
145
+
146
+ # OS-specific
147
+ Thumbs.db
148
+ Desktop.ini
149
+
150
+ # Project-specific
151
+ data/
152
+ *.db
153
+ *.sqlite
154
+ logs/
155
+ *.log
156
+ temp/
157
+ tmp/
158
+
ARCHITECTURE.md ADDED
@@ -0,0 +1,555 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HR Intervals AI Assistant - Architecture Documentation
2
+
3
+ ## Project Overview
4
+
5
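+ An AI-powered chatbot that gives nonprofit organizations HR support through retrieval-augmented Q&A over a curated knowledge base. Policy generation, compliance checking, and bilingual (English/French) support are planned (see Future Enhancements).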
6
+
7
+ **Tech Stack:**
8
+ - Backend: Python 3.12 + LangChain
9
+ - Vector Database: Qdrant Cloud
10
+ - AI Models: OpenAI (GPT-4o-mini, text-embedding-3-small)
11
+ - UI Framework: Gradio
12
+ - Web Scraping: Firecrawl
13
+ - Monitoring: LangSmith (optional)
14
+ - Deployment: Hugging Face Spaces
15
+
16
+ ---
17
+
18
+ ## System Architecture
19
+
20
+ ### High-Level Architecture
21
+ ```
22
+ ┌─────────────────────────────────────────────────────────────┐
23
+ │                         USER LAYER                          │
24
+ ├──────────────────────────────┬──────────────────────────────┤
25
+ │ app.py                       │ admin.py                     │
26
+ │ (Chat Interface)             │ (Admin Interface)            │
27
+ │ - User Q&A                   │ - Upload documents           │
28
+ │ - Policy generation          │ - Scrape web pages           │
29
+ │ - View sources               │ - Manage content             │
30
+ └──────────────────────────────┴──────────────────────────────┘
31
+                 │                              │
32
+                 ▼                              ▼
33
+ ┌─────────────────────────────────────────────────────────────┐
34
+ │                      APPLICATION LAYER                      │
35
+ ├──────────────────┬─────────────────────┬────────────────────┤
36
+ │ chatbot.py       │ ingestion.py        │ scraper.py         │
37
+ │ - RAG chain      │ - PDF/DOCX          │ - Web scraping     │
38
+ │ - Retrieval      │ - Text chunking     │ - URL processing   │
39
+ │ - QA logic       │ - Metadata          │ - Content storage  │
40
+ └──────────────────┴─────────────────────┴────────────────────┘
41
+                 │                              │
42
+                 ▼                              ▼
43
+ ┌─────────────────────────────────────────────────────────────┐
44
+ │                      EXTERNAL SERVICES                      │
45
+ ├─────────────┬──────────────┬───────────────┬────────────────┤
46
+ │ Qdrant      │ OpenAI       │ Firecrawl     │ LangSmith      │
47
+ │ Cloud       │ API          │ API           │ (optional)     │
48
+ │ - Vectors   │ - Embeddings │ - Scraping    │ - Monitoring   │
49
+ │ - Search    │ - Chat       │ - Markdown    │ - Debugging    │
50
+ └─────────────┴──────────────┴───────────────┴────────────────┘
51
+ ```
52
+
53
+ ---
54
+
55
+ ## Module Relationships
56
+
57
+ ### Core Modules
58
+
59
+ #### 1. `src/ingestion.py` - Document Processing Module
60
+
61
+ **Purpose:** Load, process, and store PDF/DOCX documents into vector database
62
+
63
+ **Key Functions:**
64
+ ```python
65
+ create_vectorstore() -> (vectorstore, embeddings, client)
66
+ load_document(file_path: str) -> List[Document]
67
+ chunk_documents(documents, chunk_size=1000, chunk_overlap=200) -> List[Document]
68
+ add_metadata(chunks, source_name, doc_type="document") -> List[Document]
69
+ ingest_document(file_path: str, doc_type="document") -> int
70
+ ```
71
+
72
+ **Dependencies:**
73
+ - `langchain_community.document_loaders` (PyPDFLoader, Docx2txtLoader)
74
+ - `langchain.text_splitter` (RecursiveCharacterTextSplitter)
75
+ - `langchain_openai` (OpenAIEmbeddings)
76
+ - `langchain_qdrant` (QdrantVectorStore)
77
+ - `qdrant_client` (QdrantClient)
78
+
79
+ **Used By:**
80
+ - `admin.py` (upload functionality)
81
+
82
+ ---
83
+
84
+ #### 2. `src/scraper.py` - Web Scraping Module
85
+
86
+ **Purpose:** Scrape web pages and store content in vector database
87
+
88
+ **Key Functions:**
89
+ ```python
90
+ scrape_url(url: str) -> str
91
+ process_and_store_webpage(url: str) -> int
92
+ ```
93
+
94
+ **Dependencies:**
95
+ - `firecrawl` (FirecrawlApp)
96
+ - `langchain.schema` (Document)
97
+ - `langchain.text_splitter` (RecursiveCharacterTextSplitter)
98
+ - `langchain_openai` (OpenAIEmbeddings)
99
+ - `langchain_qdrant` (QdrantVectorStore)
100
+
101
+ **Used By:**
102
+ - `admin.py` (URL scraping functionality)
103
+
104
+ ---
105
+
106
+ #### 3. `src/chatbot.py` - RAG Question-Answering Module
107
+
108
+ **Purpose:** Handle user questions using Retrieval-Augmented Generation
109
+
110
+ **Key Functions:**
111
+ ```python
112
+ create_rag_chain() -> (rag_chain, retriever)
113
+ ask_question(rag_chain, retriever, question: str, session_id: str) -> (answer: str, sources: List[Document])
114
+ ```
115
+
116
+ **Components:**
117
+ - Vector store retriever (k=5 similar documents)
118
+ - LLM: GPT-4o-mini (temperature=0.3)
119
+ - Conversation memory (ConversationBufferMemory)
120
+ - System prompt with disclaimers
121
+
122
+ **Dependencies:**
123
+ - `langchain_openai` (ChatOpenAI, OpenAIEmbeddings)
124
+ - `langchain_qdrant` (QdrantVectorStore)
125
+ - `langchain.chains` (ConversationalRetrievalChain)
126
+ - `langchain.memory` (ConversationBufferMemory)
127
+ - `qdrant_client` (QdrantClient)
128
+
129
+ **Used By:**
130
+ - `app.py` (chat interface)
131
+
132
+ ---
133
+
134
+ ### User Interface Modules
135
+
136
+ #### 4. `app.py` - Chat Interface (End Users)
137
+
138
+ **Purpose:** Gradio-based chat interface for nonprofit users
139
+
140
+ **Features:**
141
+ - Real-time Q&A
142
+ - PII detection and warnings
143
+ - Source citations
144
+ - Disclaimer display
145
+ - Conversation history
146
+ - Example questions
147
+
148
+ **Calls:**
149
+ - `src/chatbot.py` → `create_rag_chain()`, `ask_question()`
150
+
151
+ **Port:** 7860
152
+
153
+ ---
154
+
155
+ #### 5. `admin.py` - Admin Interface (Content Managers)
156
+
157
+ **Purpose:** Gradio-based management interface for HR Intervals team
158
+
159
+ **Features:**
160
+ - View all documents
161
+ - Upload PDF/DOCX files
162
+ - Scrape single/multiple URLs
163
+ - Delete documents by source
164
+ - Update/replace documents
165
+
166
+ **Calls:**
167
+ - `src/ingestion.py` → `ingest_document()`
168
+ - `src/scraper.py` → `process_and_store_webpage()`
169
+ - `qdrant_client.QdrantClient` → direct CRUD operations
170
+
171
+ **Port:** 7861
172
+
173
+ ---
174
+
175
+ ## Data Flow Diagrams
176
+
177
+ ### Flow 1: Document Upload
178
+ ```
179
+ User (admin.py)
180
+ ↓
181
+ [Select PDF/DOCX file]
182
+ ↓
183
+ admin.py: upload_document()
184
+ ↓
185
+ ingestion.py: ingest_document()
186
+ ↓
187
+ [Load document] β†’ PyPDFLoader / Docx2txtLoader
188
+ ↓
189
+ [Split into chunks] β†’ RecursiveCharacterTextSplitter
190
+ β”‚ - chunk_size: 1000
191
+ β”‚ - chunk_overlap: 200
192
+ ↓
193
+ [Add metadata]
194
+ β”‚ - source: filename
195
+ β”‚ - type: document/policy/guide
196
+ β”‚ - upload_date: YYYY-MM-DD
197
+ ↓
198
+ [Generate embeddings] β†’ OpenAI text-embedding-3-small
199
+ ↓
200
+ [Store vectors + metadata] β†’ Qdrant Cloud
201
+ ↓
202
+ βœ… Success: N chunks uploaded
203
+ ```
204
+
205
+ ---
206
+
207
+ ### Flow 2: Web Scraping
208
+ ```
209
+ User (admin.py)
210
+ ↓
211
+ [Enter URL(s)]
212
+ ↓
213
+ admin.py: scrape_single_url() / scrape_multiple_urls()
214
+ ↓
215
+ scraper.py: process_and_store_webpage()
216
+ ↓
217
+ [Scrape webpage] β†’ Firecrawl API
218
+ β”‚ - Returns: Markdown content
219
+ ↓
220
+ [Create document with metadata]
221
+ β”‚ - source: URL
222
+ β”‚ - type: webpage
223
+ β”‚ - upload_date: YYYY-MM-DD
224
+ ↓
225
+ [Split into chunks] β†’ RecursiveCharacterTextSplitter
226
+ ↓
227
+ [Generate embeddings] β†’ OpenAI text-embedding-3-small
228
+ ↓
229
+ [Store vectors + metadata] β†’ Qdrant Cloud
230
+ ↓
231
+ βœ… Success: N chunks uploaded
232
+ ```
233
+
234
+ ---
235
+
236
+ ### Flow 3: Question Answering (RAG)
237
+ ```
238
+ User (app.py)
239
+ ↓
240
+ [Type question]
241
+ ↓
242
+ app.py: chat()
243
+ ↓
244
+ [Check for PII] β†’ Regex patterns
245
+ β”‚ - Capitalized names: [A-Z][a-z]+ [A-Z][a-z]+
246
+ β”‚ - If detected: Show warning
247
+ ↓
248
+ chatbot.py: ask_question()
249
+ ↓
250
+ ConversationalRetrievalChain
251
+ ↓
252
+ [Convert question to embedding] β†’ OpenAI text-embedding-3-small
253
+ ↓
254
+ [Similarity search] β†’ Qdrant Cloud
255
+ β”‚ - Retrieve top 5 similar chunks
256
+ β”‚ - Return: chunks + metadata
257
+ ↓
258
+ [Combine context + question + chat history]
259
+ ↓
260
+ [Generate answer] β†’ OpenAI GPT-4o-mini
261
+ β”‚ - Temperature: 0.3
262
+ β”‚ - System prompt: HR assistant with disclaimers
263
+ ↓
264
+ [Return answer + source documents]
265
+ ↓
266
+ app.py: Display answer with sources
267
+ ↓
268
+ User sees:
269
+ - Answer
270
+ - ⚠️ PII warning (if applicable)
271
+ - πŸ“š Sources (top 3)
272
+ ```
273
+
274
+ ---
275
+
276
+ ### Flow 4: Document Deletion
277
+ ```
278
+ User (admin.py)
279
+ ↓
280
+ [Enter document name or URL]
281
+ ↓
282
+ admin.py: delete_document()
283
+ ↓
284
+ Qdrant Client: delete()
285
+ ↓
286
+ [Filter by metadata]
287
+ │ - Field: "metadata.source"
288
+ β”‚ - Match: exact document name
289
+ ↓
290
+ [Delete all matching points]
291
+ ↓
292
+ βœ… Success: All chunks from source deleted
293
+ ```
294
+
295
+ ---
296
+
297
+ ### Flow 5: Document Update
298
+ ```
299
+ User (admin.py)
300
+ ↓
301
+ [Specify old document name]
302
+ [Select new file]
303
+ ↓
304
+ admin.py: update_document()
305
+ ↓
306
+ [Step 1: Delete old document]
307
+ β”‚ └─→ delete_document(old_source)
308
+ ↓
309
+ [Step 2: Upload new document]
310
+ β”‚ └─→ upload_document(new_file)
311
+ ↓
312
+ βœ… Success: Document replaced
313
+ ```
314
+
315
+ ---
316
+
317
+ ## Configuration
318
+
319
+ ### Environment Variables (`.env`)
320
+ ```bash
321
+ # OpenAI API
322
+ OPENAI_API_KEY=sk-proj-...
323
+ OPEN_AI_EMBEDDING_MODEL=text-embedding-3-small
324
+ OPEN_AI_CHAT_MODEL=gpt-4o-mini
325
+
326
+ # Qdrant Cloud
327
+ QDRANT_URL=https://xxx.cloud.qdrant.io:6333
328
+ QDRANT_API_KEY=xxx
329
+ QDRANT_COLLECTION=hr-intervals
330
+
331
+ # Firecrawl
332
+ FIRECRAWL_API_KEY=fc-xxx
333
+
334
+ # LangSmith (Optional)
335
+ LANGSMITH_TRACING=false
336
+ LANGSMITH_API_KEY=xxx
337
+ LANGSMITH_PROJECT=hr-intervals-chatbot
338
+ ```
339
+
340
+ ---
341
+
342
+ ## Project Structure
343
+ ```
344
+ hr-intervals-chatbot/
345
+ ├── src/
346
+ │   ├── __init__.py
347
+ │   ├── ingestion.py       # Document processing
348
+ │   ├── chatbot.py         # RAG Q&A logic
349
+ │   └── scraper.py         # Web scraping
350
+ ├── data/
351
+ │   ├── documents/         # Uploaded files
352
+ │   └── scraped/           # Scraped content (cache)
353
+ ├── app.py                 # User chat interface
354
+ ├── admin.py               # Admin management interface
355
+ ├── .env                   # API keys and config
356
+ ├── requirements.txt       # Python dependencies
357
+ ├── ARCHITECTURE.md        # This file
358
+ └── README.md              # Project overview
359
+ ```
360
+
361
+ ---
362
+
363
+ ## Key Technical Decisions
364
+
365
+ ### 1. Vector Database: Qdrant Cloud
366
+ - **Why:** Built-in web UI, easy document management, free tier
367
+ - **Alternative considered:** Pinecone (limited free tier, no document-level UI)
368
+
369
+ ### 2. Embedding Model: text-embedding-3-small
370
+ - **Dimensions:** 1536
371
+ - **Why:** Excellent quality with best cost-performance ratio, multilingual support (English/French)
372
+
373
+ ### 3. LLM: GPT-4o-mini
374
+ - **Why:** Cost-effective, sufficient for HR Q&A, fast response
375
+ - **Alternative:** GPT-4o (more expensive but higher quality)
376
+
377
+ ### 4. Chunking Strategy
378
+ - **Chunk size:** 1000 characters
379
+ - **Overlap:** 200 characters
380
+ - **Separators:** `["\n\n", "\n", ". ", " ", ""]`
381
+ - **Why:** Balances context preservation and retrieval accuracy
382
+
383
+ ### 5. Retrieval: Top-k similarity search
384
+ - **k=5:** Retrieve 5 most similar chunks
385
+ - **Distance metric:** Cosine similarity
386
+ - **Why:** Good balance between context and noise
387
+
388
+ ---
389
+
390
+ ## Metadata Schema
391
+
392
+ Every chunk stored in Qdrant has the following metadata:
393
+ ```python
394
+ {
395
+ "source": str, # Filename or URL
396
+ "type": str, # "document" | "webpage" | "policy" | "guide"
397
+ "upload_date": str, # "YYYY-MM-DD"
398
+ "page": int, # (optional) Page number for PDFs
399
+ "valid_until": str, # (optional) Expiry date for policies
400
+ "version": str, # (optional) Version number
401
+ }
402
+ ```
403
+
404
+ ---
405
+
406
+ ## Document Management Operations
407
+
408
+ ### View Documents
409
+ ```python
410
+ # List all unique documents
411
+ client.scroll(collection_name, limit=1000, offset=offset, with_payload=True)  # repeat with the returned offset until it is None
412
+ # Group by 'source' field
413
+ ```
414
+
415
+ ### Upload Document
416
+ ```python
417
+ # 1. Load: PyPDFLoader / Docx2txtLoader
418
+ # 2. Chunk: RecursiveCharacterTextSplitter
419
+ # 3. Add metadata: source, type, date
420
+ # 4. Embed: OpenAI text-embedding-3-small
421
+ # 5. Store: QdrantVectorStore.from_documents()
422
+ ```
423
+
424
+ ### Delete Document
425
+ ```python
426
+ client.delete(
427
+ collection_name=collection_name,
428
+ points_selector=FilterSelector(
429
+ filter=Filter(
430
+ must=[
431
+ FieldCondition(
432
+ key="metadata.source",
433
+ match=MatchValue(value="filename.pdf")
434
+ )
435
+ ]
436
+ )
437
+ )
438
+ )
439
+ ```
440
+
441
+ ### Update Document
442
+ ```python
443
+ # 1. Delete old version (by source name)
444
+ # 2. Upload new version
445
+ ```
446
+
447
+ ---
448
+
449
+ ## Security Features
450
+
451
+ ### PII Detection
452
+ - Regex pattern for names: `\b[A-Z][a-z]+ [A-Z][a-z]+\b`
453
+ - Warning displayed to user if detected
454
+ - Future: Integrate Microsoft Presidio for advanced PII detection
455
+
456
+ ### Disclaimers
457
+ - Shown on first interaction
458
+ - Embedded in system prompt
459
+ - Reminds users to consult professionals
460
+
461
+ ### API Key Security
462
+ - Stored in `.env` file (not in version control)
463
+ - `.env` added to `.gitignore`
464
+
465
+ ---
466
+
467
+ ## Performance Considerations
468
+
469
+ ### Embedding Cost
470
+ - Model: text-embedding-3-small
471
+ - Cost: ~$0.02 per 1M tokens
472
+ - Typical document: 10 pages ≈ 5,000 tokens ≈ $0.0001
473
+
474
+ ### Chat Cost
475
+ - Model: GPT-4o-mini
476
+ - Cost: ~$0.15 per 1M input tokens, $0.60 per 1M output tokens
477
+ - Typical query: 5 chunks (~5,000 characters ≈ 1,250 tokens) + question (100 tokens) ≈ $0.0002 of input, before chat history and output tokens
478
+
479
+ ### Storage
480
+ - Qdrant free tier: 1 GB
481
+ - Each chunk: ~1 KB metadata + 6 KB vector (1536 dims × 4 bytes)
482
+ - Capacity: ~140,000 chunks (approximately 2,800 documents of 50 chunks each)
483
+
484
+ ---
485
+
486
+ ## Future Enhancements
487
+
488
+ ### Phase 1 (Week 13-16) - Policy Features
489
+ - Policy template library
490
+ - Policy generation from user input
491
+ - Policy compliance checking
492
+ - Risk identification
493
+
494
+ ### Phase 2 (Week 17-18) - Advanced Features
495
+ - Bilingual support (French)
496
+ - Language detection and switching
497
+ - Content recommendation system
498
+ - Feedback collection mechanism
499
+
500
+ ### Phase 3 (Week 19-20) - Production
501
+ - Deployment to Hugging Face Spaces
502
+ - User authentication (if needed)
503
+ - Analytics dashboard
504
+ - Automated expiry detection for policies
505
+
506
+ ---
507
+
508
+ ## Troubleshooting
509
+
510
+ ### Common Issues
511
+
512
+ **1. "Collection not found" error**
513
+ ```bash
514
+ # Solution: Collection is created automatically on first upload
515
+ # Just upload a document and it will be created
516
+ ```
517
+
518
+ **2. "No documents found" when asking questions**
519
+ ```bash
520
+ # Solution: Upload at least one document first via admin.py
521
+ ```
522
+
523
+ **3. "Rate limit exceeded" from OpenAI**
524
+ ```bash
525
+ # Solution: Add delays between requests or upgrade OpenAI plan
526
+ ```
527
+
528
+ **4. "Firecrawl scraping failed"**
529
+ ```bash
530
+ # Solution: Check if URL is accessible, verify Firecrawl API key
531
+ ```
532
+
533
+ ---
534
+
535
+ ## Development Timeline
536
+
537
+ - **Week 1-2:** Infrastructure setup ✅
538
+ - **Week 3-4:** Basic RAG system ✅
539
+ - **Week 5-6:** Web scraping + chat interface
540
+ - **Week 7-8:** Quality improvements
541
+ - **Week 9-10:** Admin interface
542
+ - **Week 11-12:** Demo delivery
543
+ - **Week 13-16:** Policy features
544
+ - **Week 17-18:** Bilingual support
545
+ - **Week 19-20:** Final delivery
546
+
547
+ ---
548
+
549
+ ## References
550
+
551
+ - LangChain Documentation: https://python.langchain.com/docs/
552
+ - Qdrant Documentation: https://qdrant.tech/documentation/
553
+ - OpenAI API Reference: https://platform.openai.com/docs/
554
+ - Gradio Documentation: https://www.gradio.app/docs/
555
+
README.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: HR Intervals Chatbot
3
+ emoji: 💼
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.49.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # HR Intervals AI Assistant
13
+
14
+ A RAG-powered chatbot that provides HR knowledge and policy guidance for non-profit organizations.
15
+
16
+ ## Features
17
+ - 🤖 AI-powered Q&A based on HR knowledge base
18
+ - 📚 Source citations for transparency
19
+ - ⚠️ PII detection and warnings
20
+ - 💬 Interactive chat interface
21
+
22
+ ## Setup
23
+ This Space requires the following environment variables to be set:
24
+ - `OPENAI_API_KEY`: Your OpenAI API key
25
+ - `QDRANT_URL`: Your Qdrant vector database URL
26
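+ - `QDRANT_API_KEY`: Your Qdrant API key
+ - `QDRANT_COLLECTION`: Name of the Qdrant collection that holds the knowledge base (e.g. `hr-intervals`)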
27
+
admin.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio admin interface for content management
3
+ Allows uploading documents, scraping URLs, and managing content
4
+ """
5
+
6
+ import gradio as gr
7
+ import os
8
+ from dotenv import load_dotenv
9
+ from qdrant_client import QdrantClient, models
10
+ from src.ingestion import ingest_document
11
+ from src.scraper import process_and_store_webpage
12
+
13
+ load_dotenv()
14
+
15
# Initialize Qdrant client from environment variables (populated by load_dotenv() above)
client = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY"),
)
collection_name = os.getenv("QDRANT_COLLECTION")

# Create a keyword index on metadata.source so points can be filtered by source
# (delete_document relies on that filter).
try:
    client.create_payload_index(
        collection_name=collection_name,
        field_name="metadata.source",
        field_schema=models.PayloadSchemaType.KEYWORD,
    )
    print("✅ Payload index for metadata.source created successfully")
except Exception as e:
    # Deliberately best-effort: the index might already exist or the collection
    # might not have been created yet, so report the status and keep starting up.
    print(f"ℹ️ Index status: {e}")
33
+
34
+
35
+ # ==================== Functions ====================
36
+
37
def list_all_documents():
    """
    List all uploaded documents

    Scrolls through every point of the collection page by page, groups the
    points by their ``metadata.source`` value and renders one table row per
    source with its type, upload date and chunk count (type and date are
    taken from the first chunk seen for that source).

    All stored values and error messages are HTML-escaped before they are
    placed in the markup: they are free text (file names, URLs, exception
    messages) and must never be interpreted as HTML by the admin page.

    Returns:
        HTML table string with selectable content. A "no documents" notice is
        returned when nothing is stored (or only points without a source),
        and an error notice when reading the collection fails.
    """
    # Function-scope import keeps this block self-contained
    from html import escape

    try:
        docs_by_source = {}
        offset = None

        # Paginate through ALL points: keep scrolling until no next offset is returned
        while True:
            points, offset = client.scroll(
                collection_name=collection_name,
                limit=1000,
                offset=offset,
                with_payload=True,
            )

            for point in points:
                # Metadata is nested inside payload; either level may be missing
                metadata = (point.payload or {}).get("metadata") or {}
                source = metadata.get("source", "Unknown")

                entry = docs_by_source.setdefault(source, {
                    "name": source,
                    "type": metadata.get("type", "Unknown"),
                    "date": metadata.get("upload_date", "Unknown"),
                    "chunks": 0,
                })
                entry["chunks"] += 1

            if offset is None:
                break

        # Nothing stored, or only points that carry no source at all
        if not docs_by_source or (len(docs_by_source) == 1 and "Unknown" in docs_by_source):
            return """
            <div style="padding: 20px; text-align: center; color: #666;">
                <p>📂 No documents yet</p>
            </div>
            """

        table_head = """
        <style>
            .docs-table {
                width: 100%;
                border-collapse: collapse;
                font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Arial, sans-serif;
                user-select: text;
                -webkit-user-select: text;
                -moz-user-select: text;
                -ms-user-select: text;
            }
            .docs-table thead {
                background-color: #f8f9fa;
            }
            .docs-table th {
                padding: 12px;
                text-align: left;
                font-weight: 600;
                border-bottom: 2px solid #dee2e6;
                user-select: text;
            }
            .docs-table td {
                padding: 12px;
                border-bottom: 1px solid #dee2e6;
                user-select: text;
                cursor: text;
            }
            .docs-table tr:hover {
                background-color: #f8f9fa;
            }
            .doc-name {
                color: #0066cc;
                word-break: break-all;
            }
        </style>
        <table class="docs-table">
            <thead>
                <tr>
                    <th>Document Name</th>
                    <th>Type</th>
                    <th>Upload Date</th>
                    <th>Chunks</th>
                </tr>
            </thead>
            <tbody>
        """

        # Build all rows in one pass; every cell value is escaped
        table_rows = "".join(
            f"""
                <tr>
                    <td class="doc-name">{escape(str(doc['name']))}</td>
                    <td>{escape(str(doc['type']))}</td>
                    <td>{escape(str(doc['date']))}</td>
                    <td>{doc['chunks']}</td>
                </tr>
            """
            for doc in docs_by_source.values()
        )

        table_tail = """
            </tbody>
        </table>
        """

        return table_head + table_rows + table_tail

    except Exception as e:
        # Top-level UI boundary: show the failure in the panel instead of crashing it
        return f"""
        <div style="padding: 20px; color: #dc3545;">
            <p>❌ Error: {escape(str(e))}</p>
        </div>
        """
158
+
159
+
160
def upload_document(file, doc_type="document"):
    """
    Upload PDF or DOCX file

    Args:
        file: Uploaded file, either an object exposing its path as ``.name``
            or a plain path string
        doc_type: Type of document, passed through to ``ingest_document``

    Returns:
        Success message with file name, chunk count and type, or an error
        message if no file was given or ingestion failed
    """
    if file is None:
        return "❌ Please select a file"

    try:
        # Accept both file-like objects (path in .name) and bare path strings
        file_path = getattr(file, "name", file)

        # Ingest document
        num_chunks = ingest_document(file_path, doc_type)

        return (
            f"✅ Success!\n\nFile: {os.path.basename(file_path)}\n"
            f"Chunks created: {num_chunks}\nType: {doc_type}"
        )

    except Exception as e:
        # UI boundary: report the failure to the admin instead of raising
        return f"❌ Upload failed:\n{e}"
184
+
185
+
186
def scrape_single_url(url):
    """
    Scrape single URL

    Surrounding whitespace is removed before the URL is used, matching what
    scrape_multiple_urls does for each line of a batch.

    Args:
        url: URL to scrape

    Returns:
        Success message with the chunk count, or an error message if the URL
        is empty or scraping failed
    """
    # A pasted URL often carries stray spaces or a trailing newline
    url = (url or "").strip()
    if not url:
        return "❌ Please enter a URL"

    try:
        num_chunks = process_and_store_webpage(url)
    except Exception as e:
        # UI boundary: report the failure to the admin instead of raising
        return f"❌ Scraping failed:\n{e}"

    return f"✅ Success!\n\nURL: {url}\nChunks created: {num_chunks}"
205
+
206
+
207
def scrape_multiple_urls(urls_text):
    """
    Scrape multiple URLs

    Blank lines are ignored and a URL listed more than once is scraped only
    once, so one batch cannot store the same page twice.

    Args:
        urls_text: URLs separated by newlines

    Returns:
        Summary line followed by one result line per URL, or a prompt if no
        URL was entered
    """
    stripped = (line.strip() for line in (urls_text or "").split('\n'))
    # dict.fromkeys drops repeated URLs while keeping first-seen order
    urls = list(dict.fromkeys(line for line in stripped if line))

    # Covers both empty input and input made only of whitespace/blank lines
    if not urls:
        return "❌ Please enter URLs (one per line)"

    results = []
    success_count = 0

    for url in urls:
        try:
            num_chunks = process_and_store_webpage(url)
        except Exception as e:
            # One failing page must not abort the rest of the batch
            results.append(f"❌ {url}: {e}")
        else:
            results.append(f"✅ {url}: {num_chunks} chunks")
            success_count += 1

    fail_count = len(urls) - success_count
    summary = f"📊 Summary: {success_count} succeeded, {fail_count} failed\n\n"
    return summary + "\n".join(results)
237
+
238
+
239
def delete_document(source_name):
    """
    Delete document by source name

    Counts the points whose ``metadata.source`` equals the given name before
    deleting, so a mistyped name is reported as "not found" instead of being
    confirmed as a successful deletion.

    Args:
        source_name: Name or URL of the source (surrounding whitespace is ignored)

    Returns:
        Success message, a "not found" message when no chunk matches, or an
        error message if the request failed
    """
    # Names are usually copied from the documents table; drop stray whitespace
    source_name = (source_name or "").strip()
    if not source_name:
        return "❌ Please enter document name or URL"

    try:
        source_filter = models.Filter(
            must=[
                models.FieldCondition(
                    key="metadata.source",
                    match=models.MatchValue(value=source_name)
                )
            ]
        )

        # Exact-match filter: check that something matches before claiming success
        matching = client.count(
            collection_name=collection_name,
            count_filter=source_filter,
            exact=True,
        ).count
        if matching == 0:
            return f"❌ No content found for:\n{source_name}"

        client.delete(
            collection_name=collection_name,
            points_selector=models.FilterSelector(filter=source_filter),
        )

        return f"✅ Successfully deleted all content from:\n{source_name}"

    except Exception as e:
        # UI boundary: report the failure to the admin instead of raising
        return f"❌ Deletion failed:\n{e}"
271
+
272
+
273
+ # ==================== Gradio Interface (5.49) ====================
274
+
275
# Admin panel layout: one tab per management task, each wired to the matching
# function defined above. All user-visible labels use proper UTF-8 glyphs.
with gr.Blocks(
    title="HR Intervals - Admin Panel",
    theme=gr.themes.Soft(),
) as demo:

    gr.Markdown("# 📁 HR Intervals - Knowledge Base Management")
    gr.Markdown("Manage documents and web content for the AI assistant")

    with gr.Tabs():

        # Tab 1: View Documents (overview of everything stored)
        with gr.Tab("📋 View Documents"):
            gr.Markdown("### Current documents in knowledge base")
            gr.Markdown("💡 *Tip: You can select and copy any text from the table below*")

            refresh_btn = gr.Button("🔄 Refresh List", variant="primary")

            docs_table = gr.HTML(
                label="Documents"
            )

            # Fill the table on demand and once when the page loads
            refresh_btn.click(list_all_documents, outputs=docs_table)
            demo.load(list_all_documents, outputs=docs_table)

        # Tab 2: Upload Documents
        with gr.Tab("⬆️ Upload Documents"):
            gr.Markdown("### Upload PDF or DOCX files")

            file_input = gr.File(
                label="Select File (PDF or DOCX)",
                file_types=[".pdf", ".docx"]
            )

            doc_type_input = gr.Radio(
                choices=["document", "policy", "guide", "article"],
                value="document",
                label="Document Type"
            )

            upload_btn = gr.Button("📤 Upload", variant="primary", size="lg")
            upload_output = gr.Textbox(label="Upload Result", lines=5)

            upload_btn.click(
                upload_document,
                inputs=[file_input, doc_type_input],
                outputs=upload_output
            )

        # Tab 3: Scrape URLs (single URL on the left, batch on the right)
        with gr.Tab("🌐 Scrape Web Pages"):
            gr.Markdown("### Scrape content from URLs")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("#### Single URL")
                    url_input = gr.Textbox(
                        label="Enter URL",
                        placeholder="https://example.com/article"
                    )
                    scrape_btn = gr.Button("🔍 Scrape", variant="primary")
                    scrape_output = gr.Textbox(label="Result", lines=4)

                    scrape_btn.click(
                        scrape_single_url,
                        inputs=url_input,
                        outputs=scrape_output
                    )

                with gr.Column():
                    gr.Markdown("#### Batch URLs")
                    urls_input = gr.Textbox(
                        label="Enter multiple URLs (one per line)",
                        placeholder="https://example.com/page1\nhttps://example.com/page2",
                        lines=6
                    )
                    batch_btn = gr.Button("🔍 Batch Scrape", variant="primary")
                    batch_output = gr.Textbox(label="Batch Results", lines=8)

                    batch_btn.click(
                        scrape_multiple_urls,
                        inputs=urls_input,
                        outputs=batch_output
                    )

        # Tab 4: Delete Documents
        with gr.Tab("🗑️ Delete Documents"):
            gr.Markdown("### Delete documents or web pages")
            gr.Markdown("⚠️ **Warning**: This operation cannot be undone!")

            delete_input = gr.Textbox(
                label="Document Name or URL",
                placeholder="e.g., hiring_policy.pdf or https://example.com/article"
            )

            delete_btn = gr.Button("🗑️ Delete", variant="stop", size="lg")
            delete_output = gr.Textbox(label="Delete Result", lines=3)

            delete_btn.click(
                delete_document,
                inputs=delete_input,
                outputs=delete_output
            )

        # Tab 5: Help (static usage guide)
        with gr.Tab("ℹ️ Help"):
            gr.Markdown("""
            ### Usage Guide

            #### 📋 View Documents
            - Shows all uploaded documents and web pages
            - Displays document type, upload date, and number of chunks
            - Click "Refresh List" to see the latest status

            #### ⬆️ Upload Documents
            - Supports PDF and DOCX formats
            - Documents are automatically split into chunks (~1000 characters each)
            - You can categorize documents by type

            #### 🌐 Scrape Web Pages
            - Enter full URLs (including https://)
            - Supports single or batch scraping
            - Content is automatically converted to Markdown format

            #### 🗑️ Delete Documents
            - Enter exact filename or URL
            - Deletes all chunks from that source
            - **Warning**: Cannot be undone!
            - **Tip**: To update a document, delete it first then upload the new version

            ---

            ### Advanced Management

            For detailed vector database management, visit:
            [Qdrant Cloud Dashboard](https://cloud.qdrant.io)

            ### Technical Support

            If you encounter issues, please contact the development team.
            """)
415
+
416
+
417
if __name__ == "__main__":
    # This panel can delete knowledge-base content and listens on all network
    # interfaces, so operators can require a login by setting both
    # ADMIN_USERNAME and ADMIN_PASSWORD. If either is unset, the panel starts
    # without authentication, exactly as before.
    admin_user = os.getenv("ADMIN_USERNAME")
    admin_password = os.getenv("ADMIN_PASSWORD")
    credentials = (admin_user, admin_password) if admin_user and admin_password else None

    demo.launch(
        server_name="0.0.0.0",
        server_port=7861,
        share=False,
        auth=credentials,
    )
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio chat interface for end users
3
+ Uses Gradio 5.49 ChatInterface API
4
+ """
5
+
6
+ import gradio as gr
7
+ import os
8
+ from dotenv import load_dotenv
9
+ from src.chatbot import create_rag_chain, ask_question
10
+ import re
11
+ import uuid
12
+
13
+ load_dotenv()
14
+
15
+ # Initialize chatbot
16
+ print("πŸ€– Initializing chatbot...")
17
+ rag_chain, retriever = create_rag_chain()
18
+ print("βœ… Chatbot ready!")
19
+
20
+ # Session ID generated once at import time; NOTE: this single value is shared by every user of this process, it is not unique per user
21
+ session_id = str(uuid.uuid4())
22
+
23
+
24
def check_pii(text: str) -> bool:
    """
    Heuristic PII check - looks for something shaped like a person's name

    Args:
        text: Input text to check

    Returns:
        True if the text contains two adjacent capitalized words
        (e.g. "Jane Doe"), False otherwise
    """
    # Two consecutive words, each one uppercase letter followed by lowercase
    # letters. Any such pair matches, so ordinary title-cased phrases are
    # flagged as well.
    name_pattern = r'\b[A-Z][a-z]+ [A-Z][a-z]+\b'
    return re.search(name_pattern, text) is not None
39
+
40
+
41
def chat_response(message: str, history: list, request: gr.Request = None) -> str:
    """
    Handle chat messages (Gradio 5.x format)

    Args:
        message: User's message
        history: Conversation history supplied by Gradio (not read here; the
            session ID passed to ask_question selects the stored history)
        request: Request object injected by Gradio for the current browser
            session; optional so the function can still be called with just
            (message, history)

    Returns:
        Bot's response, prefixed with a PII warning when one is triggered and
        followed by up to three source names; an error message string if
        answering fails
    """
    # Key the conversation memory on the caller's own Gradio session so that
    # different visitors do not share one chat history. Fall back to the
    # process-wide ID when no request / session hash is available.
    user_session_id = getattr(request, "session_hash", None) or session_id

    # Check for PII
    warning = ""
    if check_pii(message):
        warning = "⚠️ **Warning**: Please avoid sharing personal information about specific individuals.\n\n"

    # Get answer from chatbot
    try:
        answer, sources = ask_question(rag_chain, retriever, message, user_session_id)

        # Format response with sources
        response = warning + answer

        if sources:
            response += "\n\nπŸ“š **Sources:**\n"
            # Only the first three retrieved documents are listed
            for i, doc in enumerate(sources[:3], 1):
                source = doc.metadata.get("source", "Unknown")
                response += f"{i}. {source}\n"

        return response

    except Exception as e:
        # Top-level UI boundary: report the failure in the chat instead of crashing
        return f"❌ Error: {str(e)}\n\nPlease make sure documents have been uploaded to the system."
75
+
76
+
77
+ # Create Gradio interface (Gradio 5.49 API)
78
+ with gr.Blocks(
79
+ title="HR Intervals AI Assistant",
80
+ theme=gr.themes.Soft()
81
+ ) as demo:
82
+
83
+ gr.Markdown("""
84
+ # πŸ’Ό HR Intervals AI Assistant
85
+
86
+ Get instant answers to your HR questions based on our knowledge base.
87
+ """)
88
+
89
+ # Disclaimer
90
+ with gr.Accordion("⚠️ Important Disclaimer - Please Read", open=False):
91
+ gr.Markdown("""
92
+ **This tool is designed to provide general HR-related information and draft policy suggestions.**
93
+
94
+ - This is **NOT** a substitute for professional legal or HR advice
95
+ - For legal compliance and important decisions, consult a qualified attorney or HR professional
96
+ - Do **NOT** share personal information about specific individuals
97
+
98
+ By using this tool, you acknowledge that you understand these limitations.
99
+ """)
100
+
101
+ # Welcome message with disclaimer and example questions
102
+ WELCOME_MESSAGE = """πŸ‘‹ **Welcome to the HR Intervals AI Assistant!**
103
+
104
+ ⚠️ **Important Disclaimer:**
105
+
106
+ This tool is designed to provide general HR-related information and draft policy suggestions. It is not a substitute for professional legal or HR advice. For legal compliance and to ensure the best outcome for your organization, we recommend consulting a qualified attorney or HR professional before implementing any policies or making decisions based on the information provided.
107
+
108
+ ---
109
+
110
+ How can I help you today? **Try asking:**
111
+
112
+ β€’ What should I include in a remote work policy?
113
+ β€’ How do I handle employee terminations properly?
114
+ β€’ What are best practices for hiring in Canada?
115
+ β€’ Tell me about workplace safety requirements"""
116
+
117
+ # Chat interface (Gradio 5.x ChatInterface)
118
+ chat_interface = gr.ChatInterface(
119
+ fn=chat_response,
120
+ chatbot=gr.Chatbot(
121
+ height=500,
122
+ show_label=False,
123
+ type='messages',
124
+ avatar_images=(None, "https://em-content.zobj.net/thumbs/120/apple/354/robot_1f916.png"),
125
+ value=[{"role": "assistant", "content": WELCOME_MESSAGE}]
126
+ ),
127
+ textbox=gr.Textbox(
128
+ placeholder="Ask your HR question here...",
129
+ container=False,
130
+ scale=7
131
+ ),
132
+ title="",
133
+ description="",
134
+ theme=gr.themes.Soft()
135
+ )
136
+
137
+ # Footer
138
+ gr.Markdown("""
139
+ ---
140
+ πŸ’‘ **Tip**: Be specific in your questions for better answers. Remember to consult professionals for legal matters.
141
+ """)
142
+
143
+
144
+ if __name__ == "__main__":
145
+ demo.launch(
146
+ server_name="0.0.0.0",
147
+ server_port=7860,
148
+ share=False
149
+ )
chatbot-widget.html ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>HR Chatbot Widget - Bottom Right Corner</title>
7
+ <style>
8
+ /* Demo page styles */
9
+ * {
10
+ margin: 0;
11
+ padding: 0;
12
+ box-sizing: border-box;
13
+ }
14
+
15
+ body {
16
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
17
+ background: #f8fafc;
18
+ min-height: 100vh;
19
+ padding: 40px;
20
+ }
21
+
22
+ .demo-content {
23
+ max-width: 800px;
24
+ margin: 0 auto;
25
+ }
26
+
27
+ .demo-content h1 {
28
+ font-size: 2rem;
29
+ color: #1e293b;
30
+ margin-bottom: 20px;
31
+ }
32
+
33
+ .demo-content p {
34
+ color: #64748b;
35
+ line-height: 1.8;
36
+ margin-bottom: 15px;
37
+ }
38
+
39
+ .demo-content .note {
40
+ background: #fef3c7;
41
+ border-left: 4px solid #f59e0b;
42
+ padding: 16px 20px;
43
+ border-radius: 0 8px 8px 0;
44
+ margin: 30px 0;
45
+ }
46
+
47
+ .demo-content .note strong {
48
+ color: #92400e;
49
+ }
50
+
51
+ /* ============================================
52
+ CHATBOT WIDGET STYLES - COPY FROM HERE
53
+ ============================================ */
54
+
55
+ /* Chat Toggle Button */
56
+ .chat-widget-button {
57
+ position: fixed;
58
+ bottom: 24px;
59
+ right: 24px;
60
+ width: 64px;
61
+ height: 64px;
62
+ border-radius: 50%;
63
+ background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
64
+ border: none;
65
+ cursor: pointer;
66
+ box-shadow: 0 8px 32px rgba(99, 102, 241, 0.4);
67
+ display: flex;
68
+ align-items: center;
69
+ justify-content: center;
70
+ transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
71
+ z-index: 9998;
72
+ }
73
+
74
+ .chat-widget-button:hover {
75
+ transform: scale(1.1);
76
+ box-shadow: 0 12px 40px rgba(99, 102, 241, 0.5);
77
+ }
78
+
79
+ .chat-widget-button svg {
80
+ width: 28px;
81
+ height: 28px;
82
+ fill: white;
83
+ transition: transform 0.3s ease;
84
+ }
85
+
86
+ .chat-widget-button.active svg {
87
+ transform: rotate(90deg);
88
+ }
89
+
90
+ /* Notification Badge */
91
+ .chat-widget-badge {
92
+ position: absolute;
93
+ top: -4px;
94
+ right: -4px;
95
+ width: 20px;
96
+ height: 20px;
97
+ background: #ef4444;
98
+ border-radius: 50%;
99
+ color: white;
100
+ font-size: 12px;
101
+ font-weight: 600;
102
+ display: flex;
103
+ align-items: center;
104
+ justify-content: center;
105
+ animation: pulse-badge 2s infinite;
106
+ }
107
+
108
+ @keyframes pulse-badge {
109
+ 0%, 100% { transform: scale(1); }
110
+ 50% { transform: scale(1.1); }
111
+ }
112
+
113
+ /* Chat Window */
114
+ .chat-widget-window {
115
+ position: fixed;
116
+ bottom: 100px;
117
+ right: 24px;
118
+ width: 400px;
119
+ height: 600px;
120
+ background: white;
121
+ border-radius: 20px;
122
+ box-shadow: 0 25px 80px rgba(0, 0, 0, 0.2);
123
+ display: flex;
124
+ flex-direction: column;
125
+ overflow: hidden;
126
+ z-index: 9999;
127
+ opacity: 0;
128
+ visibility: hidden;
129
+ transform: translateY(20px) scale(0.95);
130
+ transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
131
+ }
132
+
133
+ .chat-widget-window.open {
134
+ opacity: 1;
135
+ visibility: visible;
136
+ transform: translateY(0) scale(1);
137
+ }
138
+
139
+ /* Chat Header */
140
+ .chat-widget-header {
141
+ background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
142
+ padding: 18px 20px;
143
+ display: flex;
144
+ align-items: center;
145
+ justify-content: space-between;
146
+ }
147
+
148
+ .chat-widget-header-info {
149
+ display: flex;
150
+ align-items: center;
151
+ gap: 12px;
152
+ }
153
+
154
+ .chat-widget-avatar {
155
+ width: 44px;
156
+ height: 44px;
157
+ background: rgba(255, 255, 255, 0.2);
158
+ border-radius: 50%;
159
+ display: flex;
160
+ align-items: center;
161
+ justify-content: center;
162
+ font-size: 24px;
163
+ }
164
+
165
+ .chat-widget-title {
166
+ color: white;
167
+ }
168
+
169
+ .chat-widget-title h3 {
170
+ font-size: 1rem;
171
+ font-weight: 600;
172
+ margin-bottom: 2px;
173
+ }
174
+
175
+ .chat-widget-title span {
176
+ font-size: 0.8rem;
177
+ opacity: 0.9;
178
+ display: flex;
179
+ align-items: center;
180
+ gap: 6px;
181
+ }
182
+
183
+ .chat-widget-title .status-dot {
184
+ width: 8px;
185
+ height: 8px;
186
+ background: #4ade80;
187
+ border-radius: 50%;
188
+ }
189
+
190
+ .chat-widget-close {
191
+ background: rgba(255, 255, 255, 0.2);
192
+ border: none;
193
+ width: 32px;
194
+ height: 32px;
195
+ border-radius: 50%;
196
+ color: white;
197
+ cursor: pointer;
198
+ display: flex;
199
+ align-items: center;
200
+ justify-content: center;
201
+ transition: background 0.2s;
202
+ }
203
+
204
+ .chat-widget-close:hover {
205
+ background: rgba(255, 255, 255, 0.3);
206
+ }
207
+
208
+ /* Chat Body (iframe container) */
209
+ .chat-widget-body {
210
+ flex: 1;
211
+ overflow: hidden;
212
+ }
213
+
214
+ .chat-widget-body iframe {
215
+ width: 100%;
216
+ height: 100%;
217
+ border: none;
218
+ }
219
+
220
+ /* Mobile Responsive */
221
+ @media (max-width: 480px) {
222
+ .chat-widget-window {
223
+ width: calc(100% - 20px);
224
+ height: calc(100% - 120px);
225
+ right: 10px;
226
+ bottom: 90px;
227
+ border-radius: 16px;
228
+ }
229
+
230
+ .chat-widget-button {
231
+ width: 56px;
232
+ height: 56px;
233
+ bottom: 20px;
234
+ right: 20px;
235
+ }
236
+ }
237
+
238
+ /* ============================================
239
+ END OF CHATBOT WIDGET STYLES
240
+ ============================================ */
241
+ </style>
242
+ </head>
243
+ <body>
244
+ <!-- Demo Page Content -->
245
+ <div class="demo-content">
246
+ <h1>Your Website Content Here</h1>
247
+ <p>This is a demo page showing how the HR Chatbot widget appears in the bottom right corner. The chatbot is always accessible to visitors while they browse your website.</p>
248
+ <p>Our HR Assistant is powered by advanced AI technology, providing instant answers to your human resources questions. Whether you need guidance on policies, hiring practices, or workplace regulations, our chatbot is here to help.</p>
249
+ <p>The widget is designed to be non-intrusive while remaining easily accessible. It works seamlessly on both desktop and mobile devices.</p>
250
+
251
+ <div class="note">
252
+ <strong>πŸ‘‰ Click the chat button in the bottom right corner to open the HR Assistant!</strong>
253
+ </div>
254
+
255
+ <p>Get instant support for common HR inquiries including employee onboarding, benefits information, leave policies, and workplace compliance. Our AI assistant is available 24/7 to provide helpful guidance.</p>
256
+ <p>Please note that while our chatbot provides general HR information, it should not replace professional legal or HR advice for important decisions.</p>
257
+ </div>
258
+
259
+ <!-- ============================================
260
+ CHATBOT WIDGET HTML - COPY FROM HERE
261
+ ============================================ -->
262
+
263
+ <!-- Chat Toggle Button -->
264
+ <button class="chat-widget-button" id="chatWidgetButton" onclick="toggleChat()">
265
+ <span class="chat-widget-badge">1</span>
266
+ <svg viewBox="0 0 24 24" id="chatIcon">
267
+ <path d="M20 2H4c-1.1 0-2 .9-2 2v18l4-4h14c1.1 0 2-.9 2-2V4c0-1.1-.9-2-2-2zm0 14H6l-2 2V4h16v12z"/>
268
+ </svg>
269
+ <svg viewBox="0 0 24 24" id="closeIcon" style="display: none;">
270
+ <path d="M19 6.41L17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/>
271
+ </svg>
272
+ </button>
273
+
274
+ <!-- Chat Window -->
275
+ <div class="chat-widget-window" id="chatWidgetWindow">
276
+ <div class="chat-widget-header">
277
+ <div class="chat-widget-header-info">
278
+ <div class="chat-widget-avatar">πŸ€–</div>
279
+ <div class="chat-widget-title">
280
+ <h3>HR Assistant</h3>
281
+ <span><span class="status-dot"></span> Online</span>
282
+ </div>
283
+ </div>
284
+ <button class="chat-widget-close" onclick="toggleChat()">
285
+ <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor">
286
+ <path d="M19 6.41L17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/>
287
+ </svg>
288
+ </button>
289
+ </div>
290
+ <div class="chat-widget-body">
291
+ <iframe
292
+ src="https://pikamomo-hr-intervals-chatbot.hf.space"
293
+ title="HR Chatbot"
294
+ loading="lazy">
295
+ </iframe>
296
+ </div>
297
+ </div>
298
+
299
+ <!-- ============================================
300
+ CHATBOT WIDGET JAVASCRIPT - COPY THIS TOO
301
+ ============================================ -->
302
+ <script>
303
+ let isOpen = false;
304
+ const button = document.getElementById('chatWidgetButton');
305
+ const window_el = document.getElementById('chatWidgetWindow');
306
+ const chatIcon = document.getElementById('chatIcon');
307
+ const closeIcon = document.getElementById('closeIcon');
308
+ const badge = document.querySelector('.chat-widget-badge');
309
+
310
/**
 * Flip the chat widget between open and closed.
 *
 * Updates the window and button CSS classes and swaps the chat/close icons.
 * The notification badge is hidden the first time the widget is opened and
 * is not shown again when the widget is closed.
 */
function toggleChat() {
    isOpen = !isOpen;

    window_el.classList.toggle('open', isOpen);
    button.classList.toggle('active', isOpen);
    chatIcon.style.display = isOpen ? 'none' : 'block';
    closeIcon.style.display = isOpen ? 'block' : 'none';

    if (isOpen) {
        badge.style.display = 'none';
    }
}
326
+
327
+ // Close on escape key
328
+ document.addEventListener('keydown', function(e) {
329
+ if (e.key === 'Escape' && isOpen) {
330
+ toggleChat();
331
+ }
332
+ });
333
+ </script>
334
+ </body>
335
+ </html>
336
+
requirements.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ======================= LangChain Core =======================
2
+ langchain==1.0.2
3
+ langchain-openai==1.0.1
4
+ langchain-qdrant==1.1.0
5
+ langchain-community==0.4.1
6
+ langchain-core==1.0.1
7
+
8
+ # ======================= Vector Database =======================
9
+ qdrant-client==1.15.1
10
+
11
+ # ======================= Document Processing =======================
12
+ pypdf==6.1.1
13
+ python-docx==1.2.0
14
+ unstructured==0.18.15
15
+
16
+ # ======================= Web Scraping =======================
17
+ firecrawl-py==4.5.0
18
+
19
+ # ======================= User Interface =======================
20
+ gradio==5.49.1
21
+
22
+ # ======================= OpenAI =======================
23
+ openai==1.109.1
24
+
25
+ # ======================= Utilities =======================
26
+ python-dotenv==1.1.1
27
+ requests==2.32.5
28
+ numpy==2.2.6
29
+ pandas==2.2.3
30
+ tiktoken==0.11.0
src/__init__.py ADDED
File without changes
src/chatbot.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG chatbot module using latest LangChain with LCEL
3
+ Handles question-answering with conversation memory using modern patterns
4
+ """
5
+
6
+ import os
7
+ from dotenv import load_dotenv
8
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
9
+ from langchain_qdrant import QdrantVectorStore
10
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
11
+ from langchain_core.chat_history import BaseChatMessageHistory
12
+ from langchain_community.chat_message_histories import ChatMessageHistory
13
+ from langchain_core.runnables import RunnablePassthrough, RunnableLambda
14
+ from langchain_core.runnables.history import RunnableWithMessageHistory
15
+ from langchain_core.output_parsers import StrOutputParser
16
+ from langchain_core.documents import Document
17
+ from qdrant_client import QdrantClient
18
+ from typing import Tuple, List, Dict, Any
19
+ from operator import itemgetter
20
+
21
+ load_dotenv()
22
+
23
+ # Store for chat sessions
24
+ session_store = {}
25
+
26
+
27
def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """
    Return the chat history for a session, creating it on first use

    Args:
        session_id: Unique identifier for the session

    Returns:
        Chat message history object stored under this session ID
    """
    try:
        return session_store[session_id]
    except KeyError:
        # First request for this session: start an empty history and keep it
        # in the module-level store so later calls return the same object
        history = session_store[session_id] = ChatMessageHistory()
        return history
40
+
41
+
42
def format_docs(docs: List[Document]) -> str:
    """
    Join the text of retrieved documents into one context string

    Args:
        docs: List of retrieved documents

    Returns:
        The page_content of each document, in order, separated by blank lines
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
53
+
54
+
55
def create_rag_chain():
    """
    Build the conversational RAG chain with LCEL (LangChain Expression Language)

    Connection details and model names are read from environment variables
    (QDRANT_URL, QDRANT_API_KEY, QDRANT_COLLECTION, OPEN_AI_EMBEDDING_MODEL,
    OPEN_AI_CHAT_MODEL).

    Returns:
        Tuple of (conversational RAG chain with message history, retriever)
    """

    # 1. Vector store backed by the Qdrant collection
    qdrant = QdrantClient(
        url=os.getenv("QDRANT_URL"),
        api_key=os.getenv("QDRANT_API_KEY")
    )
    embedder = OpenAIEmbeddings(
        model=os.getenv("OPEN_AI_EMBEDDING_MODEL", "text-embedding-3-small")
    )
    store = QdrantVectorStore(
        client=qdrant,
        collection_name=os.getenv("QDRANT_COLLECTION"),
        embedding=embedder
    )

    # 2. Retriever: similarity search returning 5 documents per query
    retriever = store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5}
    )

    # 3. Chat model
    llm = ChatOpenAI(
        model=os.getenv("OPEN_AI_CHAT_MODEL", "gpt-4o-mini"),
        temperature=0.3
    )

    # 4. Prompt: system instructions + prior turns + the new question
    system_prompt = """You are an HR assistant for nonprofit organizations in Canada.
Use the following context to answer questions accurately and helpfully.

IMPORTANT DISCLAIMERS:
- This tool provides general HR information only
- Not a substitute for professional legal or HR advice
- Consult qualified professionals before implementing policies
- Do NOT share personal information about specific individuals

Context:
{context}

Provide a clear, helpful answer. If you're not certain, say so. Always remind users to consult HR/legal professionals for important decisions."""

    chat_prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}")
    ])

    # 5. LCEL pipeline: the question is sent to the retriever and the results
    #    are flattened into the {context} slot; question and history pass through
    chain_inputs = {
        "context": itemgetter("input") | retriever | format_docs,
        "input": itemgetter("input"),
        "chat_history": itemgetter("chat_history")
    }
    answer_chain = chain_inputs | chat_prompt | llm | StrOutputParser()

    # 6. Wrap with per-session message history
    return (
        RunnableWithMessageHistory(
            answer_chain,
            get_session_history,
            input_messages_key="input",
            history_messages_key="chat_history",
        ),
        retriever,
    )
135
+
136
+
137
def ask_question(
    rag_chain,
    retriever,
    question: str,
    session_id: str = "default"
) -> Tuple[str, List[Document]]:
    """
    Answer a question and return the documents to show as sources

    Args:
        rag_chain: The RAG chain
        retriever: The vector store retriever for getting sources
        question: User's question
        session_id: Session identifier for conversation history

    Returns:
        Tuple of (answer, source_documents)
    """
    # The session ID travels in the run config so the chain can pick the
    # matching conversation history
    run_config = {"configurable": {"session_id": session_id}}
    answer = rag_chain.invoke({"input": question}, config=run_config)

    # Sources come from a separate retriever call made only for display
    return answer, retriever.invoke(question)
166
+
167
+
168
+ # Test function
169
+ if __name__ == "__main__":
170
+ print("πŸ€– Initializing chatbot with latest LangChain (LCEL)...")
171
+ rag_chain, retriever = create_rag_chain()
172
+
173
+ print("\nβœ… Ready! Enter your question (type 'quit' to exit):\n")
174
+
175
+ session_id = "test_session"
176
+
177
+ while True:
178
+ question = input("You: ")
179
+ if question.lower() in ['quit', 'exit', 'q']:
180
+ break
181
+
182
+ try:
183
+ answer, sources = ask_question(rag_chain, retriever, question, session_id)
184
+
185
+ print(f"\nBot: {answer}\n")
186
+
187
+ if sources:
188
+ print("πŸ“š Sources:")
189
+ for i, doc in enumerate(sources[:3], 1):
190
+ source = doc.metadata.get("source", "Unknown")
191
+ print(f" {i}. {source}")
192
+ print()
193
+ except Exception as e:
194
+ print(f"\n❌ Error: {str(e)}")
195
+ print("Make sure you have uploaded some documents first.\n")
src/ingestion.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document ingestion module
3
+ Loads PDF/DOCX files and stores them in Qdrant
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from pathlib import Path
9
+ from dotenv import load_dotenv
10
+ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
11
+ from datetime import datetime
12
+
13
+ # Add parent directory to path for imports
14
+ current_dir = Path(__file__).resolve().parent
15
+ parent_dir = current_dir.parent
16
+ if str(parent_dir) not in sys.path:
17
+ sys.path.insert(0, str(parent_dir))
18
+
19
+ from src.vector_store import process_and_store
20
+
21
+ load_dotenv()
22
+
23
+
24
def load_document(file_path: str):
    """
    Load PDF or DOCX document

    The loader is chosen from the file extension, compared case-insensitively
    so names such as "REPORT.PDF" or "Policy.Docx" are accepted.

    Args:
        file_path: Path to the document file

    Returns:
        List of Document objects

    Raises:
        ValueError: If the file is neither a .pdf nor a .docx
    """
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        loader = PyPDFLoader(file_path)
    elif lowered.endswith('.docx'):
        # NOTE(review): Docx2txtLoader presumably requires the docx2txt
        # package at runtime - confirm it is installed in the deployment
        loader = Docx2txtLoader(file_path)
    else:
        raise ValueError("Only PDF and DOCX files are supported")

    return loader.load()
43
+
44
+
45
def add_metadata(documents, source_name: str, doc_type: str = "document"):
    """
    Add metadata to documents

    Sets the "source", "type" and "upload_date" metadata keys on every
    document in place; existing values for those keys are overwritten.

    Args:
        documents: List of Document objects
        source_name: Source filename
        doc_type: Type of document (document, policy, guide, etc.)

    Returns:
        The same documents object that was passed in, with metadata added
    """
    # Take the timestamp once so every page of one upload carries the same
    # date, even if the loop happens to run across midnight
    stamp = {
        "source": source_name,
        "type": doc_type,
        "upload_date": datetime.now().strftime("%Y-%m-%d"),
    }
    for doc in documents:
        doc.metadata.update(stamp)

    return documents
63
+
64
+
65
def ingest_document(file_path: str, doc_type: str = "document") -> int:
    """
    Run the full ingestion pipeline for one file: load, tag, chunk and store

    Args:
        file_path: Path to the document file
        doc_type: Type of document

    Returns:
        Number of chunks created
    """
    print(f"πŸ“„ Processing: {file_path}")

    # Load the file into Document objects
    documents = load_document(file_path)
    print(f" βœ… Loaded {len(documents)} pages")

    # Tag every document with the bare file name (no directory) and its type
    tagged = add_metadata(documents, os.path.basename(file_path), doc_type)

    # Chunking and storage are delegated to the shared vector-store helper
    return process_and_store(tagged)
90
+
91
+
92
+ # Test function
93
+ if __name__ == "__main__":
94
+ print("πŸ§ͺ Testing document ingestion...")
95
+ print("\nPlease place a test PDF or DOCX file in data/documents/")
96
+ print("Then update the file path below and run again.\n")
97
+
98
+ # Example:
99
+ # test_file = "data/documents/test.pdf"
100
+ # if os.path.exists(test_file):
101
+ # num_chunks = ingest_document(test_file)
102
+ # print(f"\nπŸŽ‰ Success! Processed {num_chunks} chunks")
src/scraper.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Web scraping module
3
+ Scrapes web pages using Firecrawl and stores in Qdrant
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from pathlib import Path
9
+ from dotenv import load_dotenv
10
+ from firecrawl import FirecrawlApp
11
+ from langchain_core.documents import Document
12
+ from datetime import datetime
13
+ from qdrant_client import QdrantClient
14
+
15
+ # Add parent directory to path for imports
16
+ current_dir = Path(__file__).resolve().parent
17
+ parent_dir = current_dir.parent
18
+ if str(parent_dir) not in sys.path:
19
+ sys.path.insert(0, str(parent_dir))
20
+
21
+ from src.vector_store import process_and_store
22
+
23
+ load_dotenv()
24
+
25
+
26
def check_url_exists(url: str) -> int:
    """
    Check if URL already exists in Qdrant

    Counts the points whose "metadata.source" payload field equals the URL.

    Args:
        url: URL to check

    Returns:
        Number of existing chunks for this URL (0 if not found, or if the
        lookup itself failed)
    """
    client = QdrantClient(
        url=os.getenv("QDRANT_URL"),
        api_key=os.getenv("QDRANT_API_KEY")
    )
    collection_name = os.getenv("QDRANT_COLLECTION")

    try:
        # A single count request is enough; no separate scroll is needed
        count_result = client.count(
            collection_name=collection_name,
            count_filter={
                "must": [{"key": "metadata.source", "match": {"value": url}}]
            }
        )
        return count_result.count
    except Exception as e:
        # Best effort: a failed lookup is still reported as "not found", but
        # say so, because the caller will go on to store the page
        print(f" Warning: could not check for existing URL: {str(e)}")
        return 0
62
+
63
+
64
def scrape_url(url: str) -> str:
    """
    Fetch a web page through Firecrawl and return it as Markdown

    Args:
        url: URL to scrape

    Returns:
        Markdown content of the webpage

    Raises:
        ValueError: If the result has no markdown field or the content is empty
    """
    print(f"🌐 Scraping: {url}")

    firecrawl = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
    result = firecrawl.scrape(url, formats=['markdown'])

    # The result may expose markdown as an attribute or as a dict key
    if hasattr(result, 'markdown'):
        markdown_content = result.markdown
    elif isinstance(result, dict) and 'markdown' in result:
        markdown_content = result['markdown']
    else:
        raise ValueError(f"Failed to scrape - unexpected result type: {type(result)}")

    if markdown_content:
        return markdown_content

    raise ValueError("Failed to scrape - no content retrieved")
91
+
92
+
93
def process_and_store_webpage(url: str, force: bool = False) -> int:
    """
    Scrape a web page, wrap it in a Document and store it in the vector database

    Args:
        url: URL to scrape
        force: If True, skip duplicate check and store anyway

    Returns:
        Number of chunks created

    Raises:
        ValueError: If URL already exists and force=False
    """
    # Duplicate guard (bypassed when force=True)
    if not force:
        existing_chunks = check_url_exists(url)
        if existing_chunks > 0:
            raise ValueError(
                f"URL already exists with {existing_chunks} chunks. "
                f"Use 'Delete' to remove it first, or force=True to add anyway."
            )

    # Scrape content
    markdown_content = scrape_url(url)
    print(f" βœ… Scraped {len(markdown_content)} characters")

    # One Document for the whole page; the URL is recorded as its source
    page_metadata = {
        "source": url,
        "type": "webpage",
        "upload_date": datetime.now().strftime("%Y-%m-%d")
    }
    page = Document(page_content=markdown_content, metadata=page_metadata)

    # Chunk and store (using shared function)
    return process_and_store([page])
135
+
136
+
137
+ # Test function
138
+ if __name__ == "__main__":
139
+ print("πŸ§ͺ Testing web scraper...")
140
+
141
+ # Test with a simple webpage
142
+ test_url = "https://hrintervals.ca/resources/sample-policy-inclusive-and-equitable-hiring-practices/"
143
+
144
+ try:
145
+ num_chunks = process_and_store_webpage(test_url)
146
+ print(f"\nπŸŽ‰ Success! Processed {num_chunks} chunks")
147
+ except Exception as e:
148
+ print(f"\n❌ Error: {str(e)}")
src/vector_store.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Shared vector storage utilities
3
+ Handles chunking and storing documents in Qdrant
4
+ """
5
+
6
+ import os
7
+ from dotenv import load_dotenv
8
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
9
+ from langchain_openai import OpenAIEmbeddings
10
+ from langchain_qdrant import QdrantVectorStore
11
+ from qdrant_client import QdrantClient
12
+ from langchain_core.documents import Document
13
+ from typing import List
14
+
15
+ load_dotenv()
16
+
17
+
18
def get_embeddings():
    """Build an OpenAIEmbeddings instance; the model name comes from
    OPEN_AI_EMBEDDING_MODEL (default "text-embedding-3-small")"""
    model_name = os.getenv("OPEN_AI_EMBEDDING_MODEL", "text-embedding-3-small")
    return OpenAIEmbeddings(model=model_name)
23
+
24
+
25
def get_qdrant_client():
    """Build a QdrantClient from the QDRANT_URL and QDRANT_API_KEY environment variables"""
    qdrant_url = os.getenv("QDRANT_URL")
    qdrant_key = os.getenv("QDRANT_API_KEY")
    return QdrantClient(url=qdrant_url, api_key=qdrant_key)
31
+
32
+
33
def chunk_documents(
    documents: List[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> List[Document]:
    """
    Split documents into overlapping chunks

    Args:
        documents: List of LangChain Document objects
        chunk_size: Maximum characters per chunk
        chunk_overlap: Overlapping characters between chunks

    Returns:
        List of chunked Document objects
    """
    # Separators are listed from coarsest (blank line) to finest (empty string)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    return splitter.split_documents(documents)
57
+
58
+
59
def store_documents(documents: List[Document]) -> tuple[int, int]:
    """
    Embed documents and write them to the Qdrant collection

    The number actually stored is estimated as the difference between the
    collection's point count after and before the write.

    Args:
        documents: List of Document objects with content and metadata

    Returns:
        Tuple of (expected_count, actual_stored_count)
    """
    embeddings = get_embeddings()
    client = get_qdrant_client()
    collection_name = os.getenv("QDRANT_COLLECTION")

    # Baseline count; if it cannot be read, the baseline is taken to be zero
    try:
        before_count = client.count(collection_name=collection_name).count
    except Exception:
        before_count = 0

    # Store documents (the returned vector store object is not needed here)
    QdrantVectorStore.from_documents(
        documents=documents,
        embedding=embeddings,
        url=os.getenv("QDRANT_URL"),
        api_key=os.getenv("QDRANT_API_KEY"),
        collection_name=collection_name
    )

    # Verify storage by counting again
    try:
        after_count = client.count(collection_name=collection_name).count
    except Exception as e:
        print(f" ⚠️ Warning: Could not verify storage: {str(e)}")
        # Assume success if the count cannot be verified
        return len(documents), len(documents)

    return len(documents), after_count - before_count
97
+
98
+
99
def process_and_store(
    documents: List[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> int:
    """
    Chunk documents, store the chunks in the vector database and report the outcome

    Args:
        documents: List of Document objects
        chunk_size: Maximum characters per chunk
        chunk_overlap: Overlapping characters between chunks

    Returns:
        Number of chunks stored

    Raises:
        Exception: Any error raised while storing is printed and re-raised
    """
    # 1. Chunk documents
    chunks = chunk_documents(documents, chunk_size, chunk_overlap)
    print(f" βœ… Created {len(chunks)} chunks")

    # 2. Store in Qdrant with verification
    try:
        expected, actual_stored = store_documents(chunks)

        # Pick the status line: full success, partial storage, or nothing stored
        if actual_stored == expected:
            status = f" βœ… Stored {actual_stored} chunks in Qdrant"
        elif actual_stored > 0:
            status = f" ⚠️ Partial storage: expected {expected}, actually stored {actual_stored}"
        else:
            status = f" ❌ Storage failed: 0 chunks stored (expected {expected})"
        print(status)

        return actual_stored

    except Exception as e:
        print(f" ❌ Error storing in Qdrant: {str(e)}")
        raise
tests/test_connections.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Smoke-check the external services and libraries this project depends on.

Every check prints its own success or failure lines. A failing check is
reported and the remaining checks still run.
"""

import os
from dotenv import load_dotenv

load_dotenv()

print("πŸ§ͺ Testing API Connections (October 2025)...\n")


def _check_openai():
    """Request a single embedding from OpenAI and print its dimension."""
    print("1️⃣ Testing OpenAI...")
    try:
        from openai import OpenAI
        openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        # One tiny embeddings call is enough to prove the key works
        embedding_reply = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input="test"
        )
        dimension = len(embedding_reply.data[0].embedding)
        print(" βœ… OpenAI connected successfully!")
        print(f" βœ… Embeddings working (dimension: {dimension})")
    except Exception as err:
        print(f" ❌ OpenAI error: {err}")


def _check_qdrant():
    """Connect to Qdrant and print how many collections it lists."""
    print("\n2️⃣ Testing Qdrant...")
    try:
        from qdrant_client import QdrantClient
        qdrant = QdrantClient(
            url=os.getenv("QDRANT_URL"),
            api_key=os.getenv("QDRANT_API_KEY")
        )
        listing = qdrant.get_collections()
        print(f" βœ… Qdrant connected! Collections: {len(listing.collections)}")
    except Exception as err:
        print(f" ❌ Qdrant error: {err}")


def _check_firecrawl():
    """Construct a Firecrawl client; no request is sent."""
    print("\n3️⃣ Testing Firecrawl...")
    try:
        from firecrawl import FirecrawlApp
        FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
        print(" βœ… Firecrawl initialized successfully!")
    except Exception as err:
        print(f" ❌ Firecrawl error: {err}")


def _check_langchain_imports():
    """Import the LangChain (LCEL) names listed below and print the version."""
    print("\n4️⃣ Testing LangChain with LCEL imports...")
    try:
        # The imports themselves are the test; the names are not used further
        import langchain
        from langchain_openai import ChatOpenAI, OpenAIEmbeddings
        from langchain_qdrant import QdrantVectorStore
        from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
        from langchain_core.runnables.history import RunnableWithMessageHistory
        from langchain_core.output_parsers import StrOutputParser
        from langchain_core.documents import Document
        from operator import itemgetter

        print(f" βœ… LangChain version: {langchain.__version__}")
        print(" βœ… All LangChain LCEL imports successful!")
    except Exception as err:
        print(f" ❌ LangChain import error: {err}")


def _check_gradio():
    """Import Gradio and print its version."""
    print("\n5️⃣ Testing Gradio...")
    try:
        import gradio as gr
        print(f" βœ… Gradio version: {gr.__version__}")
    except Exception as err:
        print(f" ❌ Gradio error: {err}")


# Run the checks in a fixed order at import time, as a plain script would
for _check in (
    _check_openai,
    _check_qdrant,
    _check_firecrawl,
    _check_langchain_imports,
    _check_gradio,
):
    _check()

# Closing summary and follow-up hints
for _line in (
    "\n" + "=" * 50,
    "πŸŽ‰ Connection tests complete!",
    "\nNext steps:",
    "1. Upload a test document: python src/ingestion.py",
    "2. Test the chatbot: python src/chatbot.py",
    "3. Start the user interface: python app.py",
    "4. Start the admin interface: python admin.py",
):
    print(_line)