Spaces:
Sleeping
Sleeping
Commit Β·
c32cdfb
0
Parent(s):
initial deploy
Browse files- .gitignore +158 -0
- ARCHITECTURE.md +555 -0
- README.md +27 -0
- admin.py +422 -0
- app.py +149 -0
- chatbot-widget.html +336 -0
- requirements.txt +30 -0
- src/__init__.py +0 -0
- src/chatbot.py +195 -0
- src/ingestion.py +102 -0
- src/scraper.py +148 -0
- src/vector_store.py +134 -0
- tests/test_connections.py +82 -0
.gitignore
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
Pipfile.lock
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
*.manifest
|
| 31 |
+
*.spec
|
| 32 |
+
|
| 33 |
+
# Installer logs
|
| 34 |
+
pip-log.txt
|
| 35 |
+
pip-delete-this-directory.txt
|
| 36 |
+
|
| 37 |
+
# Unit test / coverage reports
|
| 38 |
+
htmlcov/
|
| 39 |
+
.tox/
|
| 40 |
+
.nox/
|
| 41 |
+
.coverage
|
| 42 |
+
.coverage.*
|
| 43 |
+
.cache
|
| 44 |
+
nosetests.xml
|
| 45 |
+
coverage.xml
|
| 46 |
+
*.cover
|
| 47 |
+
*.py,cover
|
| 48 |
+
.hypothesis/
|
| 49 |
+
.pytest_cache/
|
| 50 |
+
cover/
|
| 51 |
+
|
| 52 |
+
# Translations
|
| 53 |
+
*.mo
|
| 54 |
+
*.pot
|
| 55 |
+
|
| 56 |
+
# Django stuff:
|
| 57 |
+
*.log
|
| 58 |
+
local_settings.py
|
| 59 |
+
db.sqlite3
|
| 60 |
+
db.sqlite3-journal
|
| 61 |
+
|
| 62 |
+
# Flask stuff:
|
| 63 |
+
instance/
|
| 64 |
+
.webassets-cache
|
| 65 |
+
|
| 66 |
+
# Scrapy stuff:
|
| 67 |
+
.scrapy
|
| 68 |
+
|
| 69 |
+
# Sphinx documentation
|
| 70 |
+
docs/_build/
|
| 71 |
+
|
| 72 |
+
# PyBuilder
|
| 73 |
+
.pybuilder/
|
| 74 |
+
target/
|
| 75 |
+
|
| 76 |
+
# Jupyter Notebook
|
| 77 |
+
.ipynb_checkpoints
|
| 78 |
+
|
| 79 |
+
# IPython
|
| 80 |
+
profile_default/
|
| 81 |
+
ipython_config.py
|
| 82 |
+
|
| 83 |
+
# pyenv
|
| 84 |
+
.python-version
|
| 85 |
+
|
| 86 |
+
# pipenv
|
| 87 |
+
Pipfile.lock
|
| 88 |
+
|
| 89 |
+
# poetry
|
| 90 |
+
poetry.lock
|
| 91 |
+
|
| 92 |
+
# pdm
|
| 93 |
+
.pdm.toml
|
| 94 |
+
|
| 95 |
+
# PEP 582
|
| 96 |
+
__pypackages__/
|
| 97 |
+
|
| 98 |
+
# Celery stuff
|
| 99 |
+
celerybeat-schedule
|
| 100 |
+
celerybeat.pid
|
| 101 |
+
|
| 102 |
+
# SageMath parsed files
|
| 103 |
+
*.sage.py
|
| 104 |
+
|
| 105 |
+
# Environments
|
| 106 |
+
.env
|
| 107 |
+
.venv
|
| 108 |
+
env/
|
| 109 |
+
venv/
|
| 110 |
+
ENV/
|
| 111 |
+
env.bak/
|
| 112 |
+
venv.bak/
|
| 113 |
+
|
| 114 |
+
# Spyder project settings
|
| 115 |
+
.spyderproject
|
| 116 |
+
.spyproject
|
| 117 |
+
|
| 118 |
+
# Rope project settings
|
| 119 |
+
.ropeproject
|
| 120 |
+
|
| 121 |
+
# mkdocs documentation
|
| 122 |
+
/site
|
| 123 |
+
|
| 124 |
+
# mypy
|
| 125 |
+
.mypy_cache/
|
| 126 |
+
.dmypy.json
|
| 127 |
+
dmypy.json
|
| 128 |
+
|
| 129 |
+
# Pyre type checker
|
| 130 |
+
.pyre/
|
| 131 |
+
|
| 132 |
+
# pytype static type analyzer
|
| 133 |
+
.pytype/
|
| 134 |
+
|
| 135 |
+
# Cython debug symbols
|
| 136 |
+
cython_debug/
|
| 137 |
+
|
| 138 |
+
# IDEs
|
| 139 |
+
.vscode/
|
| 140 |
+
.idea/
|
| 141 |
+
*.swp
|
| 142 |
+
*.swo
|
| 143 |
+
*~
|
| 144 |
+
.DS_Store
|
| 145 |
+
|
| 146 |
+
# OS-specific
|
| 147 |
+
Thumbs.db
|
| 148 |
+
Desktop.ini
|
| 149 |
+
|
| 150 |
+
# Project-specific
|
| 151 |
+
data/
|
| 152 |
+
*.db
|
| 153 |
+
*.sqlite
|
| 154 |
+
logs/
|
| 155 |
+
*.log
|
| 156 |
+
temp/
|
| 157 |
+
tmp/
|
| 158 |
+
|
ARCHITECTURE.md
ADDED
|
@@ -0,0 +1,555 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HR Intervals AI Assistant - Architecture Documentation
|
| 2 |
+
|
| 3 |
+
## Project Overview
|
| 4 |
+
|
| 5 |
+
An AI-powered bilingual chatbot for nonprofit organizations providing HR support, policy generation, and compliance checking.
|
| 6 |
+
|
| 7 |
+
**Tech Stack:**
|
| 8 |
+
- Backend: Python 3.12 + LangChain
|
| 9 |
+
- Vector Database: Qdrant Cloud
|
| 10 |
+
- AI Models: OpenAI (GPT-4o-mini, text-embedding-3-small)
|
| 11 |
+
- UI Framework: Gradio
|
| 12 |
+
- Web Scraping: Firecrawl
|
| 13 |
+
- Monitoring: LangSmith (optional)
|
| 14 |
+
- Deployment: Hugging Face Spaces
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## System Architecture
|
| 19 |
+
|
| 20 |
+
### High-Level Architecture
|
| 21 |
+
```
|
| 22 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 23 |
+
β USER LAYER β
|
| 24 |
+
ββββββββββββββββββββββββββββ¬βββββββββββββββββββββββββββββββββββ€
|
| 25 |
+
β app.py β admin.py β
|
| 26 |
+
β (Chat Interface) β (Admin Interface) β
|
| 27 |
+
β - User Q&A β - Upload documents β
|
| 28 |
+
β - Policy generation β - Scrape web pages β
|
| 29 |
+
β - View sources β - Manage content β
|
| 30 |
+
ββββββββββββββββββββββββββββ΄βββββββββββββββββββββββββββββββββββ
|
| 31 |
+
β β
|
| 32 |
+
βΌ βΌ
|
| 33 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 34 |
+
β APPLICATION LAYER β
|
| 35 |
+
ββββββββββββββββ¬ββββββββββββββββββ¬βββββββββββββββββββββββββββββ€
|
| 36 |
+
β chatbot.py β ingestion.py β scraper.py β
|
| 37 |
+
β - RAG chain β - PDF/DOCX β - Web scraping β
|
| 38 |
+
β - Retrieval β - Text chunking β - URL processing β
|
| 39 |
+
β - QA logic β - Metadata β - Content storage β
|
| 40 |
+
ββββββββββββββββ΄ββββββββββββββββββ΄βββββββββββββββββββββββββββββ
|
| 41 |
+
β β
|
| 42 |
+
βΌ βΌ
|
| 43 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 44 |
+
β EXTERNAL SERVICES β
|
| 45 |
+
βββββββββββββββ¬ββββββββββββββ¬ββββββββββββββββ¬ββββββββββββββββββ€
|
| 46 |
+
β Qdrant β OpenAI β Firecrawl β LangSmith β
|
| 47 |
+
β Cloud β API β API β (optional) β
|
| 48 |
+
β - Vectors β - Embeddingsβ - Scraping β - Monitoring β
|
| 49 |
+
β - Search β - Chat β - Markdown β - Debugging β
|
| 50 |
+
βββββββββββββββ΄ββββββββββββββ΄ββββββββββββββββ΄ββββββββββββββββββ
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
---
|
| 54 |
+
|
| 55 |
+
## Module Relationships
|
| 56 |
+
|
| 57 |
+
### Core Modules
|
| 58 |
+
|
| 59 |
+
#### 1. `src/ingestion.py` - Document Processing Module
|
| 60 |
+
|
| 61 |
+
**Purpose:** Load, process, and store PDF/DOCX documents into vector database
|
| 62 |
+
|
| 63 |
+
**Key Functions:**
|
| 64 |
+
```python
|
| 65 |
+
create_vectorstore() -> (vectorstore, embeddings, client)
|
| 66 |
+
load_document(file_path: str) -> List[Document]
|
| 67 |
+
chunk_documents(documents, chunk_size=1000, chunk_overlap=200) -> List[Document]
|
| 68 |
+
add_metadata(chunks, source_name, doc_type="document") -> List[Document]
|
| 69 |
+
ingest_document(file_path: str, doc_type="document") -> int
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
**Dependencies:**
|
| 73 |
+
- `langchain_community.document_loaders` (PyPDFLoader, Docx2txtLoader)
|
| 74 |
+
- `langchain.text_splitter` (RecursiveCharacterTextSplitter)
|
| 75 |
+
- `langchain_openai` (OpenAIEmbeddings)
|
| 76 |
+
- `langchain_qdrant` (QdrantVectorStore)
|
| 77 |
+
- `qdrant_client` (QdrantClient)
|
| 78 |
+
|
| 79 |
+
**Used By:**
|
| 80 |
+
- `admin.py` (upload functionality)
|
| 81 |
+
|
| 82 |
+
---
|
| 83 |
+
|
| 84 |
+
#### 2. `src/scraper.py` - Web Scraping Module
|
| 85 |
+
|
| 86 |
+
**Purpose:** Scrape web pages and store content in vector database
|
| 87 |
+
|
| 88 |
+
**Key Functions:**
|
| 89 |
+
```python
|
| 90 |
+
scrape_url(url: str) -> str
|
| 91 |
+
process_and_store_webpage(url: str) -> int
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
**Dependencies:**
|
| 95 |
+
- `firecrawl` (FirecrawlApp)
|
| 96 |
+
- `langchain.schema` (Document)
|
| 97 |
+
- `langchain.text_splitter` (RecursiveCharacterTextSplitter)
|
| 98 |
+
- `langchain_openai` (OpenAIEmbeddings)
|
| 99 |
+
- `langchain_qdrant` (QdrantVectorStore)
|
| 100 |
+
|
| 101 |
+
**Used By:**
|
| 102 |
+
- `admin.py` (URL scraping functionality)
|
| 103 |
+
|
| 104 |
+
---
|
| 105 |
+
|
| 106 |
+
#### 3. `src/chatbot.py` - RAG Question-Answering Module
|
| 107 |
+
|
| 108 |
+
**Purpose:** Handle user questions using Retrieval-Augmented Generation
|
| 109 |
+
|
| 110 |
+
**Key Functions:**
|
| 111 |
+
```python
|
| 112 |
+
create_rag_chain() -> ConversationalRetrievalChain
|
| 113 |
+
ask_question(qa_chain, question: str) -> (answer: str, sources: List[Document])
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
**Components:**
|
| 117 |
+
- Vector store retriever (k=5 similar documents)
|
| 118 |
+
- LLM: GPT-4o-mini (temperature=0.3)
|
| 119 |
+
- Conversation memory (ConversationBufferMemory)
|
| 120 |
+
- System prompt with disclaimers
|
| 121 |
+
|
| 122 |
+
**Dependencies:**
|
| 123 |
+
- `langchain_openai` (ChatOpenAI, OpenAIEmbeddings)
|
| 124 |
+
- `langchain_qdrant` (QdrantVectorStore)
|
| 125 |
+
- `langchain.chains` (ConversationalRetrievalChain)
|
| 126 |
+
- `langchain.memory` (ConversationBufferMemory)
|
| 127 |
+
- `qdrant_client` (QdrantClient)
|
| 128 |
+
|
| 129 |
+
**Used By:**
|
| 130 |
+
- `app.py` (chat interface)
|
| 131 |
+
|
| 132 |
+
---
|
| 133 |
+
|
| 134 |
+
### User Interface Modules
|
| 135 |
+
|
| 136 |
+
#### 4. `app.py` - Chat Interface (End Users)
|
| 137 |
+
|
| 138 |
+
**Purpose:** Gradio-based chat interface for nonprofit users
|
| 139 |
+
|
| 140 |
+
**Features:**
|
| 141 |
+
- Real-time Q&A
|
| 142 |
+
- PII detection and warnings
|
| 143 |
+
- Source citations
|
| 144 |
+
- Disclaimer display
|
| 145 |
+
- Conversation history
|
| 146 |
+
- Example questions
|
| 147 |
+
|
| 148 |
+
**Calls:**
|
| 149 |
+
- `src/chatbot.py` β `create_rag_chain()`, `ask_question()`
|
| 150 |
+
|
| 151 |
+
**Port:** 7860
|
| 152 |
+
|
| 153 |
+
---
|
| 154 |
+
|
| 155 |
+
#### 5. `admin.py` - Admin Interface (Content Managers)
|
| 156 |
+
|
| 157 |
+
**Purpose:** Gradio-based management interface for HR Intervals team
|
| 158 |
+
|
| 159 |
+
**Features:**
|
| 160 |
+
- View all documents
|
| 161 |
+
- Upload PDF/DOCX files
|
| 162 |
+
- Scrape single/multiple URLs
|
| 163 |
+
- Delete documents by source
|
| 164 |
+
- Update/replace documents
|
| 165 |
+
|
| 166 |
+
**Calls:**
|
| 167 |
+
- `src/ingestion.py` β `ingest_document()`
|
| 168 |
+
- `src/scraper.py` β `process_and_store_webpage()`
|
| 169 |
+
- `qdrant_client.QdrantClient` β direct CRUD operations
|
| 170 |
+
|
| 171 |
+
**Port:** 7861
|
| 172 |
+
|
| 173 |
+
---
|
| 174 |
+
|
| 175 |
+
## Data Flow Diagrams
|
| 176 |
+
|
| 177 |
+
### Flow 1: Document Upload
|
| 178 |
+
```
|
| 179 |
+
User (admin.py)
|
| 180 |
+
β
|
| 181 |
+
[Select PDF/DOCX file]
|
| 182 |
+
β
|
| 183 |
+
admin.py: upload_document()
|
| 184 |
+
β
|
| 185 |
+
ingestion.py: ingest_document()
|
| 186 |
+
β
|
| 187 |
+
[Load document] β PyPDFLoader / Docx2txtLoader
|
| 188 |
+
β
|
| 189 |
+
[Split into chunks] β RecursiveCharacterTextSplitter
|
| 190 |
+
β - chunk_size: 1000
|
| 191 |
+
β - chunk_overlap: 200
|
| 192 |
+
β
|
| 193 |
+
[Add metadata]
|
| 194 |
+
β - source: filename
|
| 195 |
+
β - type: document/policy/guide
|
| 196 |
+
β - upload_date: YYYY-MM-DD
|
| 197 |
+
β
|
| 198 |
+
[Generate embeddings] β OpenAI text-embedding-3-small
|
| 199 |
+
β
|
| 200 |
+
[Store vectors + metadata] β Qdrant Cloud
|
| 201 |
+
β
|
| 202 |
+
β
Success: N chunks uploaded
|
| 203 |
+
```
|
| 204 |
+
|
| 205 |
+
---
|
| 206 |
+
|
| 207 |
+
### Flow 2: Web Scraping
|
| 208 |
+
```
|
| 209 |
+
User (admin.py)
|
| 210 |
+
β
|
| 211 |
+
[Enter URL(s)]
|
| 212 |
+
β
|
| 213 |
+
admin.py: scrape_single_url() / scrape_multiple_urls()
|
| 214 |
+
β
|
| 215 |
+
scraper.py: process_and_store_webpage()
|
| 216 |
+
β
|
| 217 |
+
[Scrape webpage] β Firecrawl API
|
| 218 |
+
β - Returns: Markdown content
|
| 219 |
+
β
|
| 220 |
+
[Create document with metadata]
|
| 221 |
+
β - source: URL
|
| 222 |
+
β - type: webpage
|
| 223 |
+
β - upload_date: YYYY-MM-DD
|
| 224 |
+
β
|
| 225 |
+
[Split into chunks] β RecursiveCharacterTextSplitter
|
| 226 |
+
β
|
| 227 |
+
[Generate embeddings] β OpenAI text-embedding-3-small
|
| 228 |
+
β
|
| 229 |
+
[Store vectors + metadata] β Qdrant Cloud
|
| 230 |
+
β
|
| 231 |
+
β
Success: N chunks uploaded
|
| 232 |
+
```
|
| 233 |
+
|
| 234 |
+
---
|
| 235 |
+
|
| 236 |
+
### Flow 3: Question Answering (RAG)
|
| 237 |
+
```
|
| 238 |
+
User (app.py)
|
| 239 |
+
β
|
| 240 |
+
[Type question]
|
| 241 |
+
β
|
| 242 |
+
app.py: chat()
|
| 243 |
+
β
|
| 244 |
+
[Check for PII] β Regex patterns
|
| 245 |
+
β - Capitalized names: [A-Z][a-z]+ [A-Z][a-z]+
|
| 246 |
+
β - If detected: Show warning
|
| 247 |
+
β
|
| 248 |
+
chatbot.py: ask_question()
|
| 249 |
+
β
|
| 250 |
+
ConversationalRetrievalChain
|
| 251 |
+
β
|
| 252 |
+
[Convert question to embedding] β OpenAI text-embedding-3-small
|
| 253 |
+
β
|
| 254 |
+
[Similarity search] β Qdrant Cloud
|
| 255 |
+
β - Retrieve top 5 similar chunks
|
| 256 |
+
β - Return: chunks + metadata
|
| 257 |
+
β
|
| 258 |
+
[Combine context + question + chat history]
|
| 259 |
+
β
|
| 260 |
+
[Generate answer] β OpenAI GPT-4o-mini
|
| 261 |
+
β - Temperature: 0.3
|
| 262 |
+
β - System prompt: HR assistant with disclaimers
|
| 263 |
+
β
|
| 264 |
+
[Return answer + source documents]
|
| 265 |
+
β
|
| 266 |
+
app.py: Display answer with sources
|
| 267 |
+
β
|
| 268 |
+
User sees:
|
| 269 |
+
- Answer
|
| 270 |
+
- β οΈ PII warning (if applicable)
|
| 271 |
+
- π Sources (top 3)
|
| 272 |
+
```
|
| 273 |
+
|
| 274 |
+
---
|
| 275 |
+
|
| 276 |
+
### Flow 4: Document Deletion
|
| 277 |
+
```
|
| 278 |
+
User (admin.py)
|
| 279 |
+
β
|
| 280 |
+
[Enter document name or URL]
|
| 281 |
+
β
|
| 282 |
+
admin.py: delete_document()
|
| 283 |
+
β
|
| 284 |
+
Qdrant Client: delete()
|
| 285 |
+
β
|
| 286 |
+
[Filter by metadata]
|
| 287 |
+
β - Field: "source"
|
| 288 |
+
β - Match: exact document name
|
| 289 |
+
β
|
| 290 |
+
[Delete all matching points]
|
| 291 |
+
β
|
| 292 |
+
β
Success: All chunks from source deleted
|
| 293 |
+
```
|
| 294 |
+
|
| 295 |
+
---
|
| 296 |
+
|
| 297 |
+
### Flow 5: Document Update
|
| 298 |
+
```
|
| 299 |
+
User (admin.py)
|
| 300 |
+
β
|
| 301 |
+
[Specify old document name]
|
| 302 |
+
[Select new file]
|
| 303 |
+
β
|
| 304 |
+
admin.py: update_document()
|
| 305 |
+
β
|
| 306 |
+
[Step 1: Delete old document]
|
| 307 |
+
β βββ delete_document(old_source)
|
| 308 |
+
β
|
| 309 |
+
[Step 2: Upload new document]
|
| 310 |
+
β βββ upload_document(new_file)
|
| 311 |
+
β
|
| 312 |
+
β
Success: Document replaced
|
| 313 |
+
```
|
| 314 |
+
|
| 315 |
+
---
|
| 316 |
+
|
| 317 |
+
## Configuration
|
| 318 |
+
|
| 319 |
+
### Environment Variables (`.env`)
|
| 320 |
+
```bash
|
| 321 |
+
# OpenAI API
|
| 322 |
+
OPENAI_API_KEY=sk-proj-...
|
| 323 |
+
OPEN_AI_EMBEDDING_MODEL=text-embedding-3-small
|
| 324 |
+
OPEN_AI_CHAT_MODEL=gpt-4o-mini
|
| 325 |
+
|
| 326 |
+
# Qdrant Cloud
|
| 327 |
+
QDRANT_URL=https://xxx.cloud.qdrant.io:6333
|
| 328 |
+
QDRANT_API_KEY=xxx
|
| 329 |
+
QDRANT_COLLECTION=hr-intervals
|
| 330 |
+
|
| 331 |
+
# Firecrawl
|
| 332 |
+
FIRECRAWL_API_KEY=fc-xxx
|
| 333 |
+
|
| 334 |
+
# LangSmith (Optional)
|
| 335 |
+
LANGSMITH_TRACING=false
|
| 336 |
+
LANGSMITH_API_KEY=xxx
|
| 337 |
+
LANGSMITH_PROJECT=hr-intervals-chatbot
|
| 338 |
+
```
|
| 339 |
+
|
| 340 |
+
---
|
| 341 |
+
|
| 342 |
+
## Project Structure
|
| 343 |
+
```
|
| 344 |
+
hr-intervals-chatbot/
|
| 345 |
+
βββ src/
|
| 346 |
+
β βββ __init__.py
|
| 347 |
+
β βββ ingestion.py # Document processing
|
| 348 |
+
β βββ chatbot.py # RAG Q&A logic
|
| 349 |
+
β βββ scraper.py # Web scraping
|
| 350 |
+
βββ data/
|
| 351 |
+
β βββ documents/ # Uploaded files
|
| 352 |
+
β βββ scraped/ # Scraped content (cache)
|
| 353 |
+
βββ app.py # User chat interface
|
| 354 |
+
βββ admin.py # Admin management interface
|
| 355 |
+
βββ .env # API keys and config
|
| 356 |
+
βββ requirements.txt # Python dependencies
|
| 357 |
+
βββ ARCHITECTURE.md # This file
|
| 358 |
+
βββ README.md # Project overview
|
| 359 |
+
```
|
| 360 |
+
|
| 361 |
+
---
|
| 362 |
+
|
| 363 |
+
## Key Technical Decisions
|
| 364 |
+
|
| 365 |
+
### 1. Vector Database: Qdrant Cloud
|
| 366 |
+
- **Why:** Built-in web UI, easy document management, free tier
|
| 367 |
+
- **Alternative considered:** Pinecone (limited free tier, no document-level UI)
|
| 368 |
+
|
| 369 |
+
### 2. Embedding Model: text-embedding-3-small
|
| 370 |
+
- **Dimensions:** 1536
|
| 371 |
+
- **Why:** Excellent quality with best cost-performance ratio, multilingual support (English/French)
|
| 372 |
+
|
| 373 |
+
### 3. LLM: GPT-4o-mini
|
| 374 |
+
- **Why:** Cost-effective, sufficient for HR Q&A, fast response
|
| 375 |
+
- **Alternative:** GPT-4o (more expensive but higher quality)
|
| 376 |
+
|
| 377 |
+
### 4. Chunking Strategy
|
| 378 |
+
- **Chunk size:** 1000 characters
|
| 379 |
+
- **Overlap:** 200 characters
|
| 380 |
+
- **Separators:** `["\n\n", "\n", ". ", " ", ""]`
|
| 381 |
+
- **Why:** Balances context preservation and retrieval accuracy
|
| 382 |
+
|
| 383 |
+
### 5. Retrieval: Top-k similarity search
|
| 384 |
+
- **k=5:** Retrieve 5 most similar chunks
|
| 385 |
+
- **Distance metric:** Cosine similarity
|
| 386 |
+
- **Why:** Good balance between context and noise
|
| 387 |
+
|
| 388 |
+
---
|
| 389 |
+
|
| 390 |
+
## Metadata Schema
|
| 391 |
+
|
| 392 |
+
Every chunk stored in Qdrant has the following metadata:
|
| 393 |
+
```python
|
| 394 |
+
{
|
| 395 |
+
"source": str, # Filename or URL
|
| 396 |
+
"type": str, # "document" | "webpage" | "policy" | "guide"
|
| 397 |
+
"upload_date": str, # "YYYY-MM-DD"
|
| 398 |
+
"page": int, # (optional) Page number for PDFs
|
| 399 |
+
"valid_until": str, # (optional) Expiry date for policies
|
| 400 |
+
"version": str, # (optional) Version number
|
| 401 |
+
}
|
| 402 |
+
```
|
| 403 |
+
|
| 404 |
+
---
|
| 405 |
+
|
| 406 |
+
## Document Management Operations
|
| 407 |
+
|
| 408 |
+
### View Documents
|
| 409 |
+
```python
|
| 410 |
+
# List all unique documents
|
| 411 |
+
client.scroll(collection_name, limit=1000, with_payload=True)
|
| 412 |
+
# Group by 'source' field
|
| 413 |
+
```
|
| 414 |
+
|
| 415 |
+
### Upload Document
|
| 416 |
+
```python
|
| 417 |
+
# 1. Load: PyPDFLoader / Docx2txtLoader
|
| 418 |
+
# 2. Chunk: RecursiveCharacterTextSplitter
|
| 419 |
+
# 3. Add metadata: source, type, date
|
| 420 |
+
# 4. Embed: OpenAI text-embedding-3-small
|
| 421 |
+
# 5. Store: QdrantVectorStore.from_documents()
|
| 422 |
+
```
|
| 423 |
+
|
| 424 |
+
### Delete Document
|
| 425 |
+
```python
|
| 426 |
+
client.delete(
|
| 427 |
+
collection_name=collection_name,
|
| 428 |
+
points_selector=FilterSelector(
|
| 429 |
+
filter=Filter(
|
| 430 |
+
must=[
|
| 431 |
+
FieldCondition(
|
| 432 |
+
key="source",
|
| 433 |
+
match=MatchValue(value="filename.pdf")
|
| 434 |
+
)
|
| 435 |
+
]
|
| 436 |
+
)
|
| 437 |
+
)
|
| 438 |
+
)
|
| 439 |
+
```
|
| 440 |
+
|
| 441 |
+
### Update Document
|
| 442 |
+
```python
|
| 443 |
+
# 1. Delete old version (by source name)
|
| 444 |
+
# 2. Upload new version
|
| 445 |
+
```
|
| 446 |
+
|
| 447 |
+
---
|
| 448 |
+
|
| 449 |
+
## Security Features
|
| 450 |
+
|
| 451 |
+
### PII Detection
|
| 452 |
+
- Regex pattern for names: `\b[A-Z][a-z]+ [A-Z][a-z]+\b`
|
| 453 |
+
- Warning displayed to user if detected
|
| 454 |
+
- Future: Integrate Microsoft Presidio for advanced PII detection
|
| 455 |
+
|
| 456 |
+
### Disclaimers
|
| 457 |
+
- Shown on first interaction
|
| 458 |
+
- Embedded in system prompt
|
| 459 |
+
- Reminds users to consult professionals
|
| 460 |
+
|
| 461 |
+
### API Key Security
|
| 462 |
+
- Stored in `.env` file (not in version control)
|
| 463 |
+
- `.env` added to `.gitignore`
|
| 464 |
+
|
| 465 |
+
---
|
| 466 |
+
|
| 467 |
+
## Performance Considerations
|
| 468 |
+
|
| 469 |
+
### Embedding Cost
|
| 470 |
+
- Model: text-embedding-3-small
|
| 471 |
+
- Cost: ~$0.13 per 1M tokens
|
| 472 |
+
- Typical document: 10 pages β 5,000 tokens β $0.0007
|
| 473 |
+
|
| 474 |
+
### Chat Cost
|
| 475 |
+
- Model: GPT-4o-mini
|
| 476 |
+
- Cost: ~$0.15 per 1M input tokens, $0.60 per 1M output tokens
|
| 477 |
+
- Typical query: 5 chunks (5,000 tokens) + question (100 tokens) β $0.0008
|
| 478 |
+
|
| 479 |
+
### Storage
|
| 480 |
+
- Qdrant free tier: 1 GB
|
| 481 |
+
- Each chunk: ~1 KB metadata + 6 KB vector (1536 dims × 4 bytes)
|

| 482 |

+

- Capacity: ~140,000 chunks (approximately 2,800 documents of 50 chunks each)
|
| 483 |
+
|
| 484 |
+
---
|
| 485 |
+
|
| 486 |
+
## Future Enhancements
|
| 487 |
+
|
| 488 |
+
### Phase 1 (Week 9-12) - Policy Features
|
| 489 |
+
- Policy template library
|
| 490 |
+
- Policy generation from user input
|
| 491 |
+
- Policy compliance checking
|
| 492 |
+
- Risk identification
|
| 493 |
+
|
| 494 |
+
### Phase 2 (Week 13-18) - Advanced Features
|
| 495 |
+
- Bilingual support (French)
|
| 496 |
+
- Language detection and switching
|
| 497 |
+
- Content recommendation system
|
| 498 |
+
- Feedback collection mechanism
|
| 499 |
+
|
| 500 |
+
### Phase 3 (Week 19-20) - Production
|
| 501 |
+
- Deployment to Hugging Face Spaces
|
| 502 |
+
- User authentication (if needed)
|
| 503 |
+
- Analytics dashboard
|
| 504 |
+
- Automated expiry detection for policies
|
| 505 |
+
|
| 506 |
+
---
|
| 507 |
+
|
| 508 |
+
## Troubleshooting
|
| 509 |
+
|
| 510 |
+
### Common Issues
|
| 511 |
+
|
| 512 |
+
**1. "Collection not found" error**
|
| 513 |
+
```bash
|
| 514 |
+
# Solution: Collection is created automatically on first upload
|
| 515 |
+
# Just upload a document and it will be created
|
| 516 |
+
```
|
| 517 |
+
|
| 518 |
+
**2. "No documents found" when asking questions**
|
| 519 |
+
```bash
|
| 520 |
+
# Solution: Upload at least one document first via admin.py
|
| 521 |
+
```
|
| 522 |
+
|
| 523 |
+
**3. "Rate limit exceeded" from OpenAI**
|
| 524 |
+
```bash
|
| 525 |
+
# Solution: Add delays between requests or upgrade OpenAI plan
|
| 526 |
+
```
|
| 527 |
+
|
| 528 |
+
**4. "Firecrawl scraping failed"**
|
| 529 |
+
```bash
|
| 530 |
+
# Solution: Check if URL is accessible, verify Firecrawl API key
|
| 531 |
+
```
|
| 532 |
+
|
| 533 |
+
---
|
| 534 |
+
|
| 535 |
+
## Development Timeline
|
| 536 |
+
|
| 537 |
+
- **Week 1-2:** Infrastructure setup β
|
| 538 |
+
- **Week 3-4:** Basic RAG system β
|
| 539 |
+
- **Week 5-6:** Web scraping + chat interface
|
| 540 |
+
- **Week 7-8:** Quality improvements
|
| 541 |
+
- **Week 9-10:** Admin interface
|
| 542 |
+
- **Week 11-12:** Demo delivery
|
| 543 |
+
- **Week 13-16:** Policy features
|
| 544 |
+
- **Week 17-18:** Bilingual support
|
| 545 |
+
- **Week 19-20:** Final delivery
|
| 546 |
+
|
| 547 |
+
---
|
| 548 |
+
|
| 549 |
+
## References
|
| 550 |
+
|
| 551 |
+
- LangChain Documentation: https://python.langchain.com/docs/
|
| 552 |
+
- Qdrant Documentation: https://qdrant.tech/documentation/
|
| 553 |
+
- OpenAI API Reference: https://platform.openai.com/docs/
|
| 554 |
+
- Gradio Documentation: https://www.gradio.app/docs/
|
| 555 |
+
```
|
README.md
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: HR Intervals Chatbot
|
| 3 |
+
emoji: πΌ
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.49.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# HR Intervals AI Assistant
|
| 13 |
+
|
| 14 |
+
A RAG-powered chatbot that provides HR knowledge and policy guidance for non-profit organizations.
|
| 15 |
+
|
| 16 |
+
## Features
|
| 17 |
+
- π€ AI-powered Q&A based on HR knowledge base
|
| 18 |
+
- π Source citations for transparency
|
| 19 |
+
- β οΈ PII detection and warnings
|
| 20 |
+
- π¬ Interactive chat interface
|
| 21 |
+
|
| 22 |
+
## Setup
|
| 23 |
+
This Space requires the following environment variables to be set:
|
| 24 |
+
- `OPENAI_API_KEY`: Your OpenAI API key
|
| 25 |
+
- `QDRANT_URL`: Your Qdrant vector database URL
|
| 26 |
+
- `QDRANT_API_KEY`: Your Qdrant API key
|
| 27 |
+
|
admin.py
ADDED
|
@@ -0,0 +1,422 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Gradio admin interface for content management
|
| 3 |
+
Allows uploading documents, scraping URLs, and managing content
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
import os
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
from qdrant_client import QdrantClient, models
|
| 10 |
+
from src.ingestion import ingest_document
|
| 11 |
+
from src.scraper import process_and_store_webpage
|
| 12 |
+
|
| 13 |
+
load_dotenv()
|
| 14 |
+
|
| 15 |
+
# Initialize Qdrant client
|
| 16 |
+
client = QdrantClient(
|
| 17 |
+
url=os.getenv("QDRANT_URL"),
|
| 18 |
+
api_key=os.getenv("QDRANT_API_KEY")
|
| 19 |
+
)
|
| 20 |
+
collection_name = os.getenv("QDRANT_COLLECTION")
|
| 21 |
+
|
| 22 |
+
# Create index for metadata.source to enable filtering
|
| 23 |
+
try:
|
| 24 |
+
client.create_payload_index(
|
| 25 |
+
collection_name=collection_name,
|
| 26 |
+
field_name="metadata.source",
|
| 27 |
+
field_schema=models.PayloadSchemaType.KEYWORD
|
| 28 |
+
)
|
| 29 |
+
print("β
Payload index for metadata.source created successfully")
|
| 30 |
+
except Exception as e:
|
| 31 |
+
# Index might already exist or collection not found
|
| 32 |
+
print(f"βΉοΈ Index status: {str(e)}")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ==================== Functions ====================
|
| 36 |
+
|
| 37 |
+
def list_all_documents():
    """
    List all uploaded documents grouped by source.

    Scrolls through every point in the Qdrant collection (paginated,
    1000 points per page), groups chunks by their ``metadata.source``
    field, and renders the result as an HTML table with selectable text.

    Returns:
        str: an HTML table of documents, an empty-state message when the
        collection holds no identifiable sources, or an error panel if
        the collection could not be read.
    """
    import html as _html  # stdlib; escape metadata before HTML interpolation

    try:
        # Paginate through ALL points (the collection may hold thousands;
        # a single scroll call returns at most `limit` points).
        all_points = []
        offset = None
        while True:
            points, next_offset = client.scroll(
                collection_name=collection_name,
                limit=1000,
                offset=offset,
                with_payload=True,
            )
            all_points.extend(points)
            if next_offset is None:
                break
            offset = next_offset

        # Group chunks by their source document / URL.
        # Metadata is nested inside the payload under the "metadata" key.
        docs_dict = {}
        for point in all_points:
            metadata = (point.payload or {}).get("metadata", {})
            source = metadata.get("source", "Unknown")
            if source not in docs_dict:
                docs_dict[source] = {
                    "name": source,
                    "type": metadata.get("type", "Unknown"),
                    "date": metadata.get("upload_date", "Unknown"),
                    "chunks": 0,
                }
            docs_dict[source]["chunks"] += 1

        # Empty state: nothing stored, or only untagged ("Unknown") chunks.
        if not docs_dict or (len(docs_dict) == 1 and "Unknown" in docs_dict):
            return """
            <div style="padding: 20px; text-align: center; color: #666;">
                <p>π No documents yet</p>
            </div>
            """

        html_out = """
        <style>
            .docs-table {
                width: 100%;
                border-collapse: collapse;
                font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Arial, sans-serif;
                user-select: text;
                -webkit-user-select: text;
                -moz-user-select: text;
                -ms-user-select: text;
            }
            .docs-table thead {
                background-color: #f8f9fa;
            }
            .docs-table th {
                padding: 12px;
                text-align: left;
                font-weight: 600;
                border-bottom: 2px solid #dee2e6;
                user-select: text;
            }
            .docs-table td {
                padding: 12px;
                border-bottom: 1px solid #dee2e6;
                user-select: text;
                cursor: text;
            }
            .docs-table tr:hover {
                background-color: #f8f9fa;
            }
            .doc-name {
                color: #0066cc;
                word-break: break-all;
            }
        </style>
        <table class="docs-table">
            <thead>
                <tr>
                    <th>Document Name</th>
                    <th>Type</th>
                    <th>Upload Date</th>
                    <th>Chunks</th>
                </tr>
            </thead>
            <tbody>
        """

        for doc in docs_dict.values():
            # Escape metadata before interpolating into HTML: a source
            # name containing <, & or quotes would otherwise break the
            # markup (and is a stored-XSS vector in this admin panel).
            html_out += f"""
                <tr>
                    <td class="doc-name">{_html.escape(str(doc['name']))}</td>
                    <td>{_html.escape(str(doc['type']))}</td>
                    <td>{_html.escape(str(doc['date']))}</td>
                    <td>{doc['chunks']}</td>
                </tr>
            """

        html_out += """
            </tbody>
        </table>
        """

        return html_out

    except Exception as e:
        # Escape the exception text too — it may echo user-supplied data.
        return f"""
        <div style="padding: 20px; color: #dc3545;">
            <p>β Error: {_html.escape(str(e))}</p>
        </div>
        """
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def upload_document(file, doc_type="document"):
    """
    Upload a PDF or DOCX file into the knowledge base.

    Args:
        file: Uploaded file object from the Gradio File component
            (exposes the temp path via ``.name``); may be None.
        doc_type: Category label stored with the document's chunks.

    Returns:
        str: a human-readable success or failure message.
    """
    # Guard clause: nothing selected in the file picker.
    if file is None:
        return "β Please select a file"

    try:
        path = file.name
        # Delegate chunking + embedding + storage to the ingestion module.
        chunk_count = ingest_document(path, doc_type)
    except Exception as exc:
        return f"β Upload failed:\n{str(exc)}"

    return (
        f"β Success!\n\n"
        f"File: {os.path.basename(path)}\n"
        f"Chunks created: {chunk_count}\n"
        f"Type: {doc_type}"
    )
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def scrape_single_url(url):
    """
    Scrape one web page and store its content in the knowledge base.

    Args:
        url: Full URL of the page to scrape.

    Returns:
        str: a human-readable success or failure message.
    """
    # Guard clause: empty/blank input box.
    if not url:
        return "β Please enter a URL"

    try:
        chunk_count = process_and_store_webpage(url)
    except Exception as exc:
        return f"β Scraping failed:\n{str(exc)}"

    return f"β Success!\n\nURL: {url}\nChunks created: {chunk_count}"
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def scrape_multiple_urls(urls_text):
    """
    Scrape a batch of URLs, one per line, and store each page's content.

    Args:
        urls_text: Newline-separated list of URLs; blank lines ignored.

    Returns:
        str: a summary line (success/failure counts) followed by one
        per-URL result line.
    """
    if not urls_text:
        return "β Please enter URLs (one per line)"

    # Normalise input: strip whitespace, drop empty lines.
    urls = [line.strip() for line in urls_text.split('\n') if line.strip()]

    lines = []
    succeeded = 0
    failed = 0

    for target in urls:
        try:
            chunk_count = process_and_store_webpage(target)
        except Exception as exc:
            lines.append(f"β {target}: {str(exc)}")
            failed += 1
        else:
            lines.append(f"β {target}: {chunk_count} chunks")
            succeeded += 1

    header = f"π Summary: {succeeded} succeeded, {failed} failed\n\n"
    return header + "\n".join(lines)
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def delete_document(source_name):
    """
    Delete every chunk belonging to one source document or URL.

    Args:
        source_name: Exact filename or URL stored in ``metadata.source``.

    Returns:
        str: a human-readable success or failure message.
    """
    if not source_name:
        return "β Please enter document name or URL"

    # Match all points whose metadata.source equals the given name.
    # Requires the keyword payload index created at module import time.
    source_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.source",
                match=models.MatchValue(value=source_name),
            )
        ]
    )

    try:
        client.delete(
            collection_name=collection_name,
            points_selector=models.FilterSelector(filter=source_filter),
        )
    except Exception as exc:
        return f"β Deletion failed:\n{str(exc)}"

    return f"β Successfully deleted all content from:\n{source_name}"
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
# ==================== Gradio Interface (5.49) ====================
|
| 274 |
+
|
| 275 |
+
with gr.Blocks(
|
| 276 |
+
title="HR Intervals - Admin Panel",
|
| 277 |
+
theme=gr.themes.Soft()
|
| 278 |
+
) as demo:
|
| 279 |
+
|
| 280 |
+
gr.Markdown("# π HR Intervals - Knowledge Base Management")
|
| 281 |
+
gr.Markdown("Manage documents and web content for the AI assistant")
|
| 282 |
+
|
| 283 |
+
with gr.Tabs():
|
| 284 |
+
|
| 285 |
+
# Tab 1: View Documents
|
| 286 |
+
with gr.Tab("π View Documents"):
|
| 287 |
+
gr.Markdown("### Current documents in knowledge base")
|
| 288 |
+
gr.Markdown("π‘ *Tip: You can select and copy any text from the table below*")
|
| 289 |
+
|
| 290 |
+
refresh_btn = gr.Button("π Refresh List", variant="primary")
|
| 291 |
+
|
| 292 |
+
docs_table = gr.HTML(
|
| 293 |
+
label="Documents"
|
| 294 |
+
)
|
| 295 |
+
|
| 296 |
+
refresh_btn.click(list_all_documents, outputs=docs_table)
|
| 297 |
+
demo.load(list_all_documents, outputs=docs_table)
|
| 298 |
+
|
| 299 |
+
# Tab 2: Upload Documents
|
| 300 |
+
with gr.Tab("β¬οΈ Upload Documents"):
|
| 301 |
+
gr.Markdown("### Upload PDF or DOCX files")
|
| 302 |
+
|
| 303 |
+
file_input = gr.File(
|
| 304 |
+
label="Select File (PDF or DOCX)",
|
| 305 |
+
file_types=[".pdf", ".docx"]
|
| 306 |
+
)
|
| 307 |
+
|
| 308 |
+
doc_type_input = gr.Radio(
|
| 309 |
+
choices=["document", "policy", "guide", "article"],
|
| 310 |
+
value="document",
|
| 311 |
+
label="Document Type"
|
| 312 |
+
)
|
| 313 |
+
|
| 314 |
+
upload_btn = gr.Button("π€ Upload", variant="primary", size="lg")
|
| 315 |
+
upload_output = gr.Textbox(label="Upload Result", lines=5)
|
| 316 |
+
|
| 317 |
+
upload_btn.click(
|
| 318 |
+
upload_document,
|
| 319 |
+
inputs=[file_input, doc_type_input],
|
| 320 |
+
outputs=upload_output
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
# Tab 3: Scrape URLs
|
| 324 |
+
with gr.Tab("π Scrape Web Pages"):
|
| 325 |
+
gr.Markdown("### Scrape content from URLs")
|
| 326 |
+
|
| 327 |
+
with gr.Row():
|
| 328 |
+
with gr.Column():
|
| 329 |
+
gr.Markdown("#### Single URL")
|
| 330 |
+
url_input = gr.Textbox(
|
| 331 |
+
label="Enter URL",
|
| 332 |
+
placeholder="https://example.com/article"
|
| 333 |
+
)
|
| 334 |
+
scrape_btn = gr.Button("π Scrape", variant="primary")
|
| 335 |
+
scrape_output = gr.Textbox(label="Result", lines=4)
|
| 336 |
+
|
| 337 |
+
scrape_btn.click(
|
| 338 |
+
scrape_single_url,
|
| 339 |
+
inputs=url_input,
|
| 340 |
+
outputs=scrape_output
|
| 341 |
+
)
|
| 342 |
+
|
| 343 |
+
with gr.Column():
|
| 344 |
+
gr.Markdown("#### Batch URLs")
|
| 345 |
+
urls_input = gr.Textbox(
|
| 346 |
+
label="Enter multiple URLs (one per line)",
|
| 347 |
+
placeholder="https://example.com/page1\nhttps://example.com/page2",
|
| 348 |
+
lines=6
|
| 349 |
+
)
|
| 350 |
+
batch_btn = gr.Button("π Batch Scrape", variant="primary")
|
| 351 |
+
batch_output = gr.Textbox(label="Batch Results", lines=8)
|
| 352 |
+
|
| 353 |
+
batch_btn.click(
|
| 354 |
+
scrape_multiple_urls,
|
| 355 |
+
inputs=urls_input,
|
| 356 |
+
outputs=batch_output
|
| 357 |
+
)
|
| 358 |
+
|
| 359 |
+
# Tab 4: Delete Documents
|
| 360 |
+
with gr.Tab("ποΈ Delete Documents"):
|
| 361 |
+
gr.Markdown("### Delete documents or web pages")
|
| 362 |
+
gr.Markdown("β οΈ **Warning**: This operation cannot be undone!")
|
| 363 |
+
|
| 364 |
+
delete_input = gr.Textbox(
|
| 365 |
+
label="Document Name or URL",
|
| 366 |
+
placeholder="e.g., hiring_policy.pdf or https://example.com/article"
|
| 367 |
+
)
|
| 368 |
+
|
| 369 |
+
delete_btn = gr.Button("ποΈ Delete", variant="stop", size="lg")
|
| 370 |
+
delete_output = gr.Textbox(label="Delete Result", lines=3)
|
| 371 |
+
|
| 372 |
+
delete_btn.click(
|
| 373 |
+
delete_document,
|
| 374 |
+
inputs=delete_input,
|
| 375 |
+
outputs=delete_output
|
| 376 |
+
)
|
| 377 |
+
|
| 378 |
+
# Tab 5: Help
|
| 379 |
+
with gr.Tab("βΉοΈ Help"):
|
| 380 |
+
gr.Markdown("""
|
| 381 |
+
### Usage Guide
|
| 382 |
+
|
| 383 |
+
#### π View Documents
|
| 384 |
+
- Shows all uploaded documents and web pages
|
| 385 |
+
- Displays document type, upload date, and number of chunks
|
| 386 |
+
- Click "Refresh" to see the latest status
|
| 387 |
+
|
| 388 |
+
#### β¬οΈ Upload Documents
|
| 389 |
+
- Supports PDF and DOCX formats
|
| 390 |
+
- Documents are automatically split into chunks (~1000 characters each)
|
| 391 |
+
- You can categorize documents by type
|
| 392 |
+
|
| 393 |
+
#### π Scrape Web Pages
|
| 394 |
+
- Enter full URLs (including https://)
|
| 395 |
+
- Supports single or batch scraping
|
| 396 |
+
- Content is automatically converted to Markdown format
|
| 397 |
+
|
| 398 |
+
#### ποΈ Delete Documents
|
| 399 |
+
- Enter exact filename or URL
|
| 400 |
+
- Deletes all chunks from that source
|
| 401 |
+
- **Warning**: Cannot be undone!
|
| 402 |
+
- **Tip**: To update a document, delete it first then upload the new version
|
| 403 |
+
|
| 404 |
+
---
|
| 405 |
+
|
| 406 |
+
### Advanced Management
|
| 407 |
+
|
| 408 |
+
For detailed vector database management, visit:
|
| 409 |
+
[Qdrant Cloud Dashboard](https://cloud.qdrant.io)
|
| 410 |
+
|
| 411 |
+
### Technical Support
|
| 412 |
+
|
| 413 |
+
If you encounter issues, please contact the development team.
|
| 414 |
+
""")
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
if __name__ == "__main__":
|
| 418 |
+
demo.launch(
|
| 419 |
+
server_name="0.0.0.0",
|
| 420 |
+
server_port=7861,
|
| 421 |
+
share=False
|
| 422 |
+
)
|
app.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Gradio chat interface for end users
|
| 3 |
+
Uses Gradio 5.49 ChatInterface API
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
import os
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
from src.chatbot import create_rag_chain, ask_question
|
| 10 |
+
import re
|
| 11 |
+
import uuid
|
| 12 |
+
|
| 13 |
+
load_dotenv()
|
| 14 |
+
|
| 15 |
+
# Initialize chatbot
|
| 16 |
+
print("π€ Initializing chatbot...")
|
| 17 |
+
rag_chain, retriever = create_rag_chain()
|
| 18 |
+
print("β
Chatbot ready!")
|
| 19 |
+
|
| 20 |
+
# Generate unique session ID for each user
|
| 21 |
+
session_id = str(uuid.uuid4())
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def check_pii(text: str) -> bool:
    """
    Heuristically detect potential personal names in the input.

    Flags any "Capitalized Capitalized" word pair (e.g. "John Smith").
    NOTE(review): this is a deliberately crude heuristic — it also
    matches non-name phrases like "New York"; kept behavior-identical.

    Args:
        text: Input text to check.

    Returns:
        True if a potential name pattern is found, False otherwise.
    """
    return re.search(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', text) is not None
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def chat_response(message: str, history: list) -> str:
    """
    Produce the assistant's reply for one chat turn (Gradio 5.x format).

    Prepends a PII warning when the message looks like it contains a
    personal name, asks the RAG chain for an answer, and appends up to
    three source references.

    Args:
        message: The user's message.
        history: Conversation history (unused here; the chain tracks
            state via the module-level session_id).

    Returns:
        str: the formatted bot response, or an error message on failure.
    """
    parts = []

    # Nudge users away from sharing personal data about individuals.
    if check_pii(message):
        parts.append(
            "β οΈ **Warning**: Please avoid sharing personal information about specific individuals.\n\n"
        )

    try:
        answer, sources = ask_question(rag_chain, retriever, message, session_id)
    except Exception as exc:
        return f"β Error: {str(exc)}\n\nPlease make sure documents have been uploaded to the system."

    parts.append(answer)

    # Cite at most the first three retrieved documents.
    if sources:
        parts.append("\n\nπ **Sources:**\n")
        for rank, doc in enumerate(sources[:3], 1):
            origin = doc.metadata.get("source", "Unknown")
            parts.append(f"{rank}. {origin}\n")

    return "".join(parts)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# Create Gradio interface (Gradio 5.49 API)
|
| 78 |
+
with gr.Blocks(
|
| 79 |
+
title="HR Intervals AI Assistant",
|
| 80 |
+
theme=gr.themes.Soft()
|
| 81 |
+
) as demo:
|
| 82 |
+
|
| 83 |
+
gr.Markdown("""
|
| 84 |
+
# πΌ HR Intervals AI Assistant
|
| 85 |
+
|
| 86 |
+
Get instant answers to your HR questions based on our knowledge base.
|
| 87 |
+
""")
|
| 88 |
+
|
| 89 |
+
# Disclaimer
|
| 90 |
+
with gr.Accordion("β οΈ Important Disclaimer - Please Read", open=False):
|
| 91 |
+
gr.Markdown("""
|
| 92 |
+
**This tool is designed to provide general HR-related information and draft policy suggestions.**
|
| 93 |
+
|
| 94 |
+
- This is **NOT** a substitute for professional legal or HR advice
|
| 95 |
+
- For legal compliance and important decisions, consult a qualified attorney or HR professional
|
| 96 |
+
- Do **NOT** share personal information about specific individuals
|
| 97 |
+
|
| 98 |
+
By using this tool, you acknowledge that you understand these limitations.
|
| 99 |
+
""")
|
| 100 |
+
|
| 101 |
+
# Welcome message with disclaimer and example questions
|
| 102 |
+
WELCOME_MESSAGE = """π **Welcome to the HR Intervals AI Assistant!**
|
| 103 |
+
|
| 104 |
+
β οΈ **Important Disclaimer:**
|
| 105 |
+
|
| 106 |
+
This tool is designed to provide general HR-related information and draft policy suggestions. It is not a substitute for professional legal or HR advice. For legal compliance and to ensure the best outcome for your organization, we recommend consulting a qualified attorney or HR professional before implementing any policies or making decisions based on the information provided.
|
| 107 |
+
|
| 108 |
+
---
|
| 109 |
+
|
| 110 |
+
How can I help you today? **Try asking:**
|
| 111 |
+
|
| 112 |
+
β’ What should I include in a remote work policy?
|
| 113 |
+
β’ How do I handle employee terminations properly?
|
| 114 |
+
β’ What are best practices for hiring in Canada?
|
| 115 |
+
β’ Tell me about workplace safety requirements"""
|
| 116 |
+
|
| 117 |
+
# Chat interface (Gradio 5.x ChatInterface)
|
| 118 |
+
chat_interface = gr.ChatInterface(
|
| 119 |
+
fn=chat_response,
|
| 120 |
+
chatbot=gr.Chatbot(
|
| 121 |
+
height=500,
|
| 122 |
+
show_label=False,
|
| 123 |
+
type='messages',
|
| 124 |
+
avatar_images=(None, "https://em-content.zobj.net/thumbs/120/apple/354/robot_1f916.png"),
|
| 125 |
+
value=[{"role": "assistant", "content": WELCOME_MESSAGE}]
|
| 126 |
+
),
|
| 127 |
+
textbox=gr.Textbox(
|
| 128 |
+
placeholder="Ask your HR question here...",
|
| 129 |
+
container=False,
|
| 130 |
+
scale=7
|
| 131 |
+
),
|
| 132 |
+
title="",
|
| 133 |
+
description="",
|
| 134 |
+
theme=gr.themes.Soft()
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
# Footer
|
| 138 |
+
gr.Markdown("""
|
| 139 |
+
---
|
| 140 |
+
π‘ **Tip**: Be specific in your questions for better answers. Remember to consult professionals for legal matters.
|
| 141 |
+
""")
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
if __name__ == "__main__":
|
| 145 |
+
demo.launch(
|
| 146 |
+
server_name="0.0.0.0",
|
| 147 |
+
server_port=7860,
|
| 148 |
+
share=False
|
| 149 |
+
)
|
chatbot-widget.html
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>HR Chatbot Widget - Bottom Right Corner</title>
|
| 7 |
+
<style>
|
| 8 |
+
/* Demo page styles */
|
| 9 |
+
* {
|
| 10 |
+
margin: 0;
|
| 11 |
+
padding: 0;
|
| 12 |
+
box-sizing: border-box;
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
body {
|
| 16 |
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
| 17 |
+
background: #f8fafc;
|
| 18 |
+
min-height: 100vh;
|
| 19 |
+
padding: 40px;
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
.demo-content {
|
| 23 |
+
max-width: 800px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
.demo-content h1 {
|
| 28 |
+
font-size: 2rem;
|
| 29 |
+
color: #1e293b;
|
| 30 |
+
margin-bottom: 20px;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
.demo-content p {
|
| 34 |
+
color: #64748b;
|
| 35 |
+
line-height: 1.8;
|
| 36 |
+
margin-bottom: 15px;
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
.demo-content .note {
|
| 40 |
+
background: #fef3c7;
|
| 41 |
+
border-left: 4px solid #f59e0b;
|
| 42 |
+
padding: 16px 20px;
|
| 43 |
+
border-radius: 0 8px 8px 0;
|
| 44 |
+
margin: 30px 0;
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
.demo-content .note strong {
|
| 48 |
+
color: #92400e;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
/* ============================================
|
| 52 |
+
CHATBOT WIDGET STYLES - COPY FROM HERE
|
| 53 |
+
============================================ */
|
| 54 |
+
|
| 55 |
+
/* Chat Toggle Button */
|
| 56 |
+
.chat-widget-button {
|
| 57 |
+
position: fixed;
|
| 58 |
+
bottom: 24px;
|
| 59 |
+
right: 24px;
|
| 60 |
+
width: 64px;
|
| 61 |
+
height: 64px;
|
| 62 |
+
border-radius: 50%;
|
| 63 |
+
background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
|
| 64 |
+
border: none;
|
| 65 |
+
cursor: pointer;
|
| 66 |
+
box-shadow: 0 8px 32px rgba(99, 102, 241, 0.4);
|
| 67 |
+
display: flex;
|
| 68 |
+
align-items: center;
|
| 69 |
+
justify-content: center;
|
| 70 |
+
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
|
| 71 |
+
z-index: 9998;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
.chat-widget-button:hover {
|
| 75 |
+
transform: scale(1.1);
|
| 76 |
+
box-shadow: 0 12px 40px rgba(99, 102, 241, 0.5);
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
.chat-widget-button svg {
|
| 80 |
+
width: 28px;
|
| 81 |
+
height: 28px;
|
| 82 |
+
fill: white;
|
| 83 |
+
transition: transform 0.3s ease;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
.chat-widget-button.active svg {
|
| 87 |
+
transform: rotate(90deg);
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
/* Notification Badge */
|
| 91 |
+
.chat-widget-badge {
|
| 92 |
+
position: absolute;
|
| 93 |
+
top: -4px;
|
| 94 |
+
right: -4px;
|
| 95 |
+
width: 20px;
|
| 96 |
+
height: 20px;
|
| 97 |
+
background: #ef4444;
|
| 98 |
+
border-radius: 50%;
|
| 99 |
+
color: white;
|
| 100 |
+
font-size: 12px;
|
| 101 |
+
font-weight: 600;
|
| 102 |
+
display: flex;
|
| 103 |
+
align-items: center;
|
| 104 |
+
justify-content: center;
|
| 105 |
+
animation: pulse-badge 2s infinite;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
@keyframes pulse-badge {
|
| 109 |
+
0%, 100% { transform: scale(1); }
|
| 110 |
+
50% { transform: scale(1.1); }
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
/* Chat Window */
|
| 114 |
+
.chat-widget-window {
|
| 115 |
+
position: fixed;
|
| 116 |
+
bottom: 100px;
|
| 117 |
+
right: 24px;
|
| 118 |
+
width: 400px;
|
| 119 |
+
height: 600px;
|
| 120 |
+
background: white;
|
| 121 |
+
border-radius: 20px;
|
| 122 |
+
box-shadow: 0 25px 80px rgba(0, 0, 0, 0.2);
|
| 123 |
+
display: flex;
|
| 124 |
+
flex-direction: column;
|
| 125 |
+
overflow: hidden;
|
| 126 |
+
z-index: 9999;
|
| 127 |
+
opacity: 0;
|
| 128 |
+
visibility: hidden;
|
| 129 |
+
transform: translateY(20px) scale(0.95);
|
| 130 |
+
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
.chat-widget-window.open {
|
| 134 |
+
opacity: 1;
|
| 135 |
+
visibility: visible;
|
| 136 |
+
transform: translateY(0) scale(1);
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
/* Chat Header */
|
| 140 |
+
.chat-widget-header {
|
| 141 |
+
background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
|
| 142 |
+
padding: 18px 20px;
|
| 143 |
+
display: flex;
|
| 144 |
+
align-items: center;
|
| 145 |
+
justify-content: space-between;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
.chat-widget-header-info {
|
| 149 |
+
display: flex;
|
| 150 |
+
align-items: center;
|
| 151 |
+
gap: 12px;
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
.chat-widget-avatar {
|
| 155 |
+
width: 44px;
|
| 156 |
+
height: 44px;
|
| 157 |
+
background: rgba(255, 255, 255, 0.2);
|
| 158 |
+
border-radius: 50%;
|
| 159 |
+
display: flex;
|
| 160 |
+
align-items: center;
|
| 161 |
+
justify-content: center;
|
| 162 |
+
font-size: 24px;
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
.chat-widget-title {
|
| 166 |
+
color: white;
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
.chat-widget-title h3 {
|
| 170 |
+
font-size: 1rem;
|
| 171 |
+
font-weight: 600;
|
| 172 |
+
margin-bottom: 2px;
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
.chat-widget-title span {
|
| 176 |
+
font-size: 0.8rem;
|
| 177 |
+
opacity: 0.9;
|
| 178 |
+
display: flex;
|
| 179 |
+
align-items: center;
|
| 180 |
+
gap: 6px;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
.chat-widget-title .status-dot {
|
| 184 |
+
width: 8px;
|
| 185 |
+
height: 8px;
|
| 186 |
+
background: #4ade80;
|
| 187 |
+
border-radius: 50%;
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
.chat-widget-close {
|
| 191 |
+
background: rgba(255, 255, 255, 0.2);
|
| 192 |
+
border: none;
|
| 193 |
+
width: 32px;
|
| 194 |
+
height: 32px;
|
| 195 |
+
border-radius: 50%;
|
| 196 |
+
color: white;
|
| 197 |
+
cursor: pointer;
|
| 198 |
+
display: flex;
|
| 199 |
+
align-items: center;
|
| 200 |
+
justify-content: center;
|
| 201 |
+
transition: background 0.2s;
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
.chat-widget-close:hover {
|
| 205 |
+
background: rgba(255, 255, 255, 0.3);
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
/* Chat Body (iframe container) */
|
| 209 |
+
.chat-widget-body {
|
| 210 |
+
flex: 1;
|
| 211 |
+
overflow: hidden;
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
.chat-widget-body iframe {
|
| 215 |
+
width: 100%;
|
| 216 |
+
height: 100%;
|
| 217 |
+
border: none;
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
/* Mobile Responsive */
|
| 221 |
+
@media (max-width: 480px) {
|
| 222 |
+
.chat-widget-window {
|
| 223 |
+
width: calc(100% - 20px);
|
| 224 |
+
height: calc(100% - 120px);
|
| 225 |
+
right: 10px;
|
| 226 |
+
bottom: 90px;
|
| 227 |
+
border-radius: 16px;
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
.chat-widget-button {
|
| 231 |
+
width: 56px;
|
| 232 |
+
height: 56px;
|
| 233 |
+
bottom: 20px;
|
| 234 |
+
right: 20px;
|
| 235 |
+
}
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
/* ============================================
|
| 239 |
+
END OF CHATBOT WIDGET STYLES
|
| 240 |
+
============================================ */
|
| 241 |
+
</style>
|
| 242 |
+
</head>
|
| 243 |
+
<body>
|
| 244 |
+
<!-- Demo Page Content -->
|
| 245 |
+
<div class="demo-content">
|
| 246 |
+
<h1>Your Website Content Here</h1>
|
| 247 |
+
<p>This is a demo page showing how the HR Chatbot widget appears in the bottom right corner. The chatbot is always accessible to visitors while they browse your website.</p>
|
| 248 |
+
<p>Our HR Assistant is powered by advanced AI technology, providing instant answers to your human resources questions. Whether you need guidance on policies, hiring practices, or workplace regulations, our chatbot is here to help.</p>
|
| 249 |
+
<p>The widget is designed to be non-intrusive while remaining easily accessible. It works seamlessly on both desktop and mobile devices.</p>
|
| 250 |
+
|
| 251 |
+
<div class="note">
|
| 252 |
+
<strong>π Click the chat button in the bottom right corner to open the HR Assistant!</strong>
|
| 253 |
+
</div>
|
| 254 |
+
|
| 255 |
+
<p>Get instant support for common HR inquiries including employee onboarding, benefits information, leave policies, and workplace compliance. Our AI assistant is available 24/7 to provide helpful guidance.</p>
|
| 256 |
+
<p>Please note that while our chatbot provides general HR information, it should not replace professional legal or HR advice for important decisions.</p>
|
| 257 |
+
</div>
|
| 258 |
+
|
| 259 |
+
<!-- ============================================
|
| 260 |
+
CHATBOT WIDGET HTML - COPY FROM HERE
|
| 261 |
+
============================================ -->
|
| 262 |
+
|
| 263 |
+
<!-- Chat Toggle Button -->
|
| 264 |
+
<button class="chat-widget-button" id="chatWidgetButton" onclick="toggleChat()">
|
| 265 |
+
<span class="chat-widget-badge">1</span>
|
| 266 |
+
<svg viewBox="0 0 24 24" id="chatIcon">
|
| 267 |
+
<path d="M20 2H4c-1.1 0-2 .9-2 2v18l4-4h14c1.1 0 2-.9 2-2V4c0-1.1-.9-2-2-2zm0 14H6l-2 2V4h16v12z"/>
|
| 268 |
+
</svg>
|
| 269 |
+
<svg viewBox="0 0 24 24" id="closeIcon" style="display: none;">
|
| 270 |
+
<path d="M19 6.41L17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/>
|
| 271 |
+
</svg>
|
| 272 |
+
</button>
|
| 273 |
+
|
| 274 |
+
<!-- Chat Window -->
|
| 275 |
+
<div class="chat-widget-window" id="chatWidgetWindow">
|
| 276 |
+
<div class="chat-widget-header">
|
| 277 |
+
<div class="chat-widget-header-info">
|
| 278 |
+
<div class="chat-widget-avatar">π€</div>
|
| 279 |
+
<div class="chat-widget-title">
|
| 280 |
+
<h3>HR Assistant</h3>
|
| 281 |
+
<span><span class="status-dot"></span> Online</span>
|
| 282 |
+
</div>
|
| 283 |
+
</div>
|
| 284 |
+
<button class="chat-widget-close" onclick="toggleChat()">
|
| 285 |
+
<svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor">
|
| 286 |
+
<path d="M19 6.41L17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/>
|
| 287 |
+
</svg>
|
| 288 |
+
</button>
|
| 289 |
+
</div>
|
| 290 |
+
<div class="chat-widget-body">
|
| 291 |
+
<iframe
|
| 292 |
+
src="https://pikamomo-hr-intervals-chatbot.hf.space"
|
| 293 |
+
title="HR Chatbot"
|
| 294 |
+
loading="lazy">
|
| 295 |
+
</iframe>
|
| 296 |
+
</div>
|
| 297 |
+
</div>
|
| 298 |
+
|
| 299 |
+
<!-- ============================================
|
| 300 |
+
CHATBOT WIDGET JAVASCRIPT - COPY THIS TOO
|
| 301 |
+
============================================ -->
|
| 302 |
+
<script>
|
| 303 |
+
let isOpen = false;
|
| 304 |
+
const button = document.getElementById('chatWidgetButton');
|
| 305 |
+
const window_el = document.getElementById('chatWidgetWindow');
|
| 306 |
+
const chatIcon = document.getElementById('chatIcon');
|
| 307 |
+
const closeIcon = document.getElementById('closeIcon');
|
| 308 |
+
const badge = document.querySelector('.chat-widget-badge');
|
| 309 |
+
|
| 310 |
+
// Toggle the chat window open/closed and swap the launcher icon.
// Hides the notification badge on first open; the badge is never
// re-shown on close (matches original behavior).
function toggleChat() {
    isOpen = !isOpen;

    const opening = isOpen;
    window_el.classList.toggle('open', opening);
    button.classList.toggle('active', opening);
    chatIcon.style.display = opening ? 'none' : 'block';
    closeIcon.style.display = opening ? 'block' : 'none';

    if (opening) {
        badge.style.display = 'none';
    }
}
|
| 326 |
+
|
| 327 |
+
// Close on escape key
|
| 328 |
+
document.addEventListener('keydown', function(e) {
|
| 329 |
+
if (e.key === 'Escape' && isOpen) {
|
| 330 |
+
toggleChat();
|
| 331 |
+
}
|
| 332 |
+
});
|
| 333 |
+
</script>
|
| 334 |
+
</body>
|
| 335 |
+
</html>
|
| 336 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ======================= LangChain Core =======================
|
| 2 |
+
langchain==1.0.2
|
| 3 |
+
langchain-openai==1.0.1
|
| 4 |
+
langchain-qdrant==1.1.0
|
| 5 |
+
langchain-community==0.4.1
|
| 6 |
+
langchain-core==1.0.1
|
| 7 |
+
|
| 8 |
+
# ======================= Vector Database =======================
|
| 9 |
+
qdrant-client==1.15.1
|
| 10 |
+
|
| 11 |
+
# ======================= Document Processing =======================
|
| 12 |
+
pypdf==6.1.1
|
| 13 |
+
python-docx==1.2.0
|
| 14 |
+
unstructured==0.18.15
|
| 15 |
+
|
| 16 |
+
# ======================= Web Scraping =======================
|
| 17 |
+
firecrawl-py==4.5.0
|
| 18 |
+
|
| 19 |
+
# ======================= User Interface =======================
|
| 20 |
+
gradio==5.49.1
|
| 21 |
+
|
| 22 |
+
# ======================= OpenAI =======================
|
| 23 |
+
openai==1.109.1
|
| 24 |
+
|
| 25 |
+
# ======================= Utilities =======================
|
| 26 |
+
python-dotenv==1.1.1
|
| 27 |
+
requests==2.32.5
|
| 28 |
+
numpy==2.2.6
|
| 29 |
+
pandas==2.2.3
|
| 30 |
+
tiktoken==0.11.0
|
src/__init__.py
ADDED
|
File without changes
|
src/chatbot.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RAG chatbot module using latest LangChain with LCEL
|
| 3 |
+
Handles question-answering with conversation memory using modern patterns
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
| 9 |
+
from langchain_qdrant import QdrantVectorStore
|
| 10 |
+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
| 11 |
+
from langchain_core.chat_history import BaseChatMessageHistory
|
| 12 |
+
from langchain_community.chat_message_histories import ChatMessageHistory
|
| 13 |
+
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
|
| 14 |
+
from langchain_core.runnables.history import RunnableWithMessageHistory
|
| 15 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 16 |
+
from langchain_core.documents import Document
|
| 17 |
+
from qdrant_client import QdrantClient
|
| 18 |
+
from typing import Tuple, List, Dict, Any
|
| 19 |
+
from operator import itemgetter
|
| 20 |
+
|
| 21 |
+
load_dotenv()
|
| 22 |
+
|
| 23 |
+
# Store for chat sessions
|
| 24 |
+
session_store = {}
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """
    Return the chat history for *session_id*, creating it on first use.

    Histories live in the module-level ``session_store`` dict, so they are
    kept in memory for the lifetime of the process.

    Args:
        session_id: Unique identifier for the session

    Returns:
        Chat message history object
    """
    history = session_store.get(session_id)
    if history is None:
        history = ChatMessageHistory()
        session_store[session_id] = history
    return history
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def format_docs(docs: List[Document]) -> str:
    """
    Join retrieved document bodies into one context string.

    Args:
        docs: List of retrieved documents

    Returns:
        The concatenated page contents, separated by blank lines
    """
    parts = [doc.page_content for doc in docs]
    return "\n\n".join(parts)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def create_rag_chain():
    """
    Build the conversational RAG chain using LCEL (LangChain Expression
    Language).

    Connects to the configured Qdrant collection, wires a top-5 similarity
    retriever into a prompt -> LLM -> string pipeline, and wraps the result
    with per-session message history.

    Returns:
        Tuple of (conversational RAG chain with message history, retriever)
    """
    # Vector store backed by the managed Qdrant instance.
    qdrant = QdrantClient(
        url=os.getenv("QDRANT_URL"),
        api_key=os.getenv("QDRANT_API_KEY"),
    )
    embedding_model = OpenAIEmbeddings(
        model=os.getenv("OPEN_AI_EMBEDDING_MODEL", "text-embedding-3-small"),
    )
    store = QdrantVectorStore(
        client=qdrant,
        collection_name=os.getenv("QDRANT_COLLECTION"),
        embedding=embedding_model,
    )

    # Top-5 similarity retriever over the collection.
    retriever = store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5},
    )

    # Low temperature keeps answers grounded in the retrieved context.
    llm = ChatOpenAI(
        model=os.getenv("OPEN_AI_CHAT_MODEL", "gpt-4o-mini"),
        temperature=0.3,
    )

    system_prompt = """You are an HR assistant for nonprofit organizations in Canada.
Use the following context to answer questions accurately and helpfully.

IMPORTANT DISCLAIMERS:
- This tool provides general HR information only
- Not a substitute for professional legal or HR advice
- Consult qualified professionals before implementing policies
- Do NOT share personal information about specific individuals

Context:
{context}

Provide a clear, helpful answer. If you're not certain, say so. Always remind users to consult HR/legal professionals for important decisions."""

    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ])

    # LCEL pipeline: fan the input out into context / input / history, then
    # run prompt -> LLM -> plain-string output.
    rag_chain = (
        {
            "context": itemgetter("input") | retriever | format_docs,
            "input": itemgetter("input"),
            "chat_history": itemgetter("chat_history"),
        }
        | prompt
        | llm
        | StrOutputParser()
    )

    # Attach per-session memory so follow-up questions keep their context.
    conversational_rag_chain = RunnableWithMessageHistory(
        rag_chain,
        get_session_history,
        input_messages_key="input",
        history_messages_key="chat_history",
    )

    return conversational_rag_chain, retriever
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def ask_question(
    rag_chain,
    retriever,
    question: str,
    session_id: str = "default"
) -> Tuple[str, List[Document]]:
    """
    Ask a question and return the answer together with source documents.

    Args:
        rag_chain: The conversational RAG chain
        retriever: The vector store retriever used to fetch sources
        question: User's question
        session_id: Session identifier for conversation history

    Returns:
        Tuple of (answer, source_documents)
    """
    # Run the conversational chain under this session's history.
    answer = rag_chain.invoke(
        {"input": question},
        config={"configurable": {"session_id": session_id}},
    )

    # NOTE(review): this is a second, independent retrieval done purely for
    # display; the documents shown may differ slightly from the ones the
    # chain actually used to compose the answer.
    sources = retriever.invoke(question)

    return answer, sources
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
# Test function
|
| 169 |
+
if __name__ == "__main__":
    # Interactive smoke test: build the chain once, then answer questions
    # typed on stdin until the user quits.
    print("🤖 Initializing chatbot with latest LangChain (LCEL)...")
    rag_chain, retriever = create_rag_chain()

    print("\n✅ Ready! Enter your question (type 'quit' to exit):\n")

    session_id = "test_session"

    while True:
        question = input("You: ")
        if question.lower() in ['quit', 'exit', 'q']:
            break

        try:
            answer, sources = ask_question(rag_chain, retriever, question, session_id)
            print(f"\nBot: {answer}\n")

            if sources:
                # Show up to three source references for transparency.
                print("📚 Sources:")
                for idx, doc in enumerate(sources[:3], 1):
                    print(f"  {idx}. {doc.metadata.get('source', 'Unknown')}")
                print()
        except Exception as e:
            print(f"\n❌ Error: {str(e)}")
            print("Make sure you have uploaded some documents first.\n")
|
src/ingestion.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Document ingestion module
|
| 3 |
+
Loads PDF/DOCX files and stores them in Qdrant
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
|
| 13 |
+
# Add parent directory to path for imports
|
| 14 |
+
current_dir = Path(__file__).resolve().parent
|
| 15 |
+
parent_dir = current_dir.parent
|
| 16 |
+
if str(parent_dir) not in sys.path:
|
| 17 |
+
sys.path.insert(0, str(parent_dir))
|
| 18 |
+
|
| 19 |
+
from src.vector_store import process_and_store
|
| 20 |
+
|
| 21 |
+
load_dotenv()
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def load_document(file_path: str):
    """
    Load a PDF or DOCX document.

    The extension check is case-insensitive, so files such as ``REPORT.PDF``
    or ``Policy.Docx`` are accepted as well (the original check rejected
    upper-case extensions).

    Args:
        file_path: Path to the document file

    Returns:
        List of Document objects (one per page for PDFs)

    Raises:
        ValueError: If the file is neither a PDF nor a DOCX.
    """
    suffix = Path(file_path).suffix.lower()
    if suffix == '.pdf':
        loader = PyPDFLoader(file_path)
    elif suffix == '.docx':
        loader = Docx2txtLoader(file_path)
    else:
        raise ValueError("Only PDF and DOCX files are supported")

    documents = loader.load()
    return documents
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def add_metadata(documents, source_name: str, doc_type: str = "document"):
    """
    Stamp provenance metadata onto every document, in place.

    Args:
        documents: List of Document objects
        source_name: Source filename
        doc_type: Type of document (document, policy, guide, etc.)

    Returns:
        The same documents, with metadata added
    """
    upload_stamp = datetime.now().strftime("%Y-%m-%d")
    for doc in documents:
        doc.metadata.update(
            source=source_name,
            type=doc_type,
            upload_date=upload_stamp,
        )
    return documents
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def ingest_document(file_path: str, doc_type: str = "document") -> int:
    """
    Complete document ingestion pipeline: load, tag, chunk, and store.

    Args:
        file_path: Path to the document file
        doc_type: Type of document

    Returns:
        Number of chunks created
    """
    print(f"📄 Processing: {file_path}")

    # Load the raw pages from disk.
    documents = load_document(file_path)
    print(f"   ✅ Loaded {len(documents)} pages")

    # Tag each page with its originating filename and document type.
    documents = add_metadata(documents, os.path.basename(file_path), doc_type)

    # Chunk and store through the shared pipeline.
    return process_and_store(documents)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# Test function
|
| 93 |
+
if __name__ == "__main__":
    # No automated test here: ingestion needs a real file and live API keys.
    print("🧪 Testing document ingestion...")
    print("\nPlease place a test PDF or DOCX file in data/documents/")
    print("Then update the file path below and run again.\n")

    # Example:
    # test_file = "data/documents/test.pdf"
    # if os.path.exists(test_file):
    #     num_chunks = ingest_document(test_file)
    #     print(f"\n🎉 Success! Processed {num_chunks} chunks")
|
src/scraper.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Web scraping module
|
| 3 |
+
Scrapes web pages using Firecrawl and stores in Qdrant
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
from firecrawl import FirecrawlApp
|
| 11 |
+
from langchain_core.documents import Document
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from qdrant_client import QdrantClient
|
| 14 |
+
|
| 15 |
+
# Add parent directory to path for imports
|
| 16 |
+
current_dir = Path(__file__).resolve().parent
|
| 17 |
+
parent_dir = current_dir.parent
|
| 18 |
+
if str(parent_dir) not in sys.path:
|
| 19 |
+
sys.path.insert(0, str(parent_dir))
|
| 20 |
+
|
| 21 |
+
from src.vector_store import process_and_store
|
| 22 |
+
|
| 23 |
+
load_dotenv()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def check_url_exists(url: str) -> int:
    """
    Check whether a URL has already been ingested into Qdrant.

    Args:
        url: URL to check

    Returns:
        Number of existing chunks for this URL (0 if not found, or if the
        collection is unreachable/missing).
    """
    client = QdrantClient(
        url=os.getenv("QDRANT_URL"),
        api_key=os.getenv("QDRANT_API_KEY")
    )
    collection_name = os.getenv("QDRANT_COLLECTION")

    try:
        # Count chunks whose metadata.source matches this URL exactly.
        # (A previous scroll() call here was dead code — its result was never
        # used — so it was removed to avoid a redundant network round-trip.)
        count_result = client.count(
            collection_name=collection_name,
            count_filter={
                "must": [{"key": "metadata.source", "match": {"value": url}}]
            }
        )
        return count_result.count
    except Exception:
        # Any failure (missing collection, network error) counts as "not found".
        return 0
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def scrape_url(url: str) -> str:
    """
    Fetch a webpage as markdown via Firecrawl.

    Args:
        url: URL to scrape

    Returns:
        Markdown content of the webpage

    Raises:
        ValueError: If Firecrawl returns an unexpected shape or no content.
    """
    print(f"🌐 Scraping: {url}")

    scraper = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
    result = scraper.scrape(url, formats=['markdown'])

    # Firecrawl versions differ: the result may be an object exposing a
    # .markdown attribute or a plain dict — accept either shape.
    if hasattr(result, 'markdown'):
        markdown_content = result.markdown
    elif isinstance(result, dict) and 'markdown' in result:
        markdown_content = result['markdown']
    else:
        raise ValueError(f"Failed to scrape - unexpected result type: {type(result)}")

    if not markdown_content:
        raise ValueError("Failed to scrape - no content retrieved")

    return markdown_content
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def process_and_store_webpage(url: str, force: bool = False) -> int:
    """
    Scrape a webpage and store its content in the vector database.

    Args:
        url: URL to scrape
        force: If True, skip the duplicate check and store anyway

    Returns:
        Number of chunks created

    Raises:
        ValueError: If the URL already exists and force=False
    """
    # Refuse to re-ingest a URL we already hold, unless forced.
    if not force:
        existing_chunks = check_url_exists(url)
        if existing_chunks > 0:
            raise ValueError(
                f"URL already exists with {existing_chunks} chunks. "
                f"Use 'Delete' to remove it first, or force=True to add anyway."
            )

    # Fetch the page as markdown.
    content = scrape_url(url)
    print(f"   ✅ Scraped {len(content)} characters")

    # Wrap the markdown in a single Document tagged with provenance metadata.
    page = Document(
        page_content=content,
        metadata={
            "source": url,
            "type": "webpage",
            "upload_date": datetime.now().strftime("%Y-%m-%d"),
        },
    )

    # Chunk and store through the shared pipeline.
    return process_and_store([page])
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
# Test function
|
| 138 |
+
if __name__ == "__main__":
    # Live smoke test against a real HR Intervals resource page.
    print("🧪 Testing web scraper...")

    test_url = "https://hrintervals.ca/resources/sample-policy-inclusive-and-equitable-hiring-practices/"

    try:
        chunk_total = process_and_store_webpage(test_url)
        print(f"\n🎉 Success! Processed {chunk_total} chunks")
    except Exception as e:
        print(f"\n❌ Error: {str(e)}")
|
src/vector_store.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Shared vector storage utilities
|
| 3 |
+
Handles chunking and storing documents in Qdrant
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 9 |
+
from langchain_openai import OpenAIEmbeddings
|
| 10 |
+
from langchain_qdrant import QdrantVectorStore
|
| 11 |
+
from qdrant_client import QdrantClient
|
| 12 |
+
from langchain_core.documents import Document
|
| 13 |
+
from typing import List
|
| 14 |
+
|
| 15 |
+
load_dotenv()
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_embeddings():
    """Create an OpenAI embeddings client for the configured model."""
    model_name = os.getenv("OPEN_AI_EMBEDDING_MODEL", "text-embedding-3-small")
    return OpenAIEmbeddings(model=model_name)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def get_qdrant_client():
    """Create a Qdrant client from the QDRANT_URL / QDRANT_API_KEY env vars."""
    endpoint = os.getenv("QDRANT_URL")
    token = os.getenv("QDRANT_API_KEY")
    return QdrantClient(url=endpoint, api_key=token)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def chunk_documents(
    documents: List[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> List[Document]:
    """
    Split documents into overlapping chunks.

    Splits preferentially at paragraph, line, and sentence boundaries before
    falling back to word and character boundaries.

    Args:
        documents: List of LangChain Document objects
        chunk_size: Maximum characters per chunk
        chunk_overlap: Overlapping characters between chunks

    Returns:
        List of chunked Document objects
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    return splitter.split_documents(documents)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def store_documents(documents: List[Document]) -> tuple[int, int]:
    """
    Store documents in the Qdrant vector database.

    Embeds and upserts every document into the configured collection, then
    verifies the write by comparing the collection's point count before and
    after.

    Args:
        documents: List of Document objects with content and metadata

    Returns:
        Tuple of (expected_count, actual_stored_count)
    """
    embeddings = get_embeddings()
    client = get_qdrant_client()
    collection_name = os.getenv("QDRANT_COLLECTION")

    # Baseline count; a missing collection is treated as empty.
    try:
        baseline = client.count(collection_name=collection_name).count
    except Exception:
        baseline = 0

    # Embed and upsert all documents.
    QdrantVectorStore.from_documents(
        documents=documents,
        embedding=embeddings,
        url=os.getenv("QDRANT_URL"),
        api_key=os.getenv("QDRANT_API_KEY"),
        collection_name=collection_name
    )

    # Verify by re-counting; if verification itself fails, assume success.
    try:
        stored = client.count(collection_name=collection_name).count - baseline
    except Exception as e:
        print(f"   ⚠️ Warning: Could not verify storage: {str(e)}")
        stored = len(documents)

    return len(documents), stored
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def process_and_store(
    documents: List[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> int:
    """
    Complete pipeline: chunk documents and store them in the vector database.

    Args:
        documents: List of Document objects
        chunk_size: Maximum characters per chunk
        chunk_overlap: Overlapping characters between chunks

    Returns:
        Number of chunks actually stored

    Raises:
        Exception: Re-raised if the Qdrant write fails outright.
    """
    # Split into retrieval-sized chunks.
    chunks = chunk_documents(documents, chunk_size, chunk_overlap)
    print(f"   ✅ Created {len(chunks)} chunks")

    # Store, reporting full / partial / failed outcomes distinctly.
    try:
        expected, stored = store_documents(chunks)
    except Exception as e:
        print(f"   ❌ Error storing in Qdrant: {str(e)}")
        raise

    if stored == expected:
        print(f"   ✅ Stored {stored} chunks in Qdrant")
    elif stored > 0:
        print(f"   ⚠️ Partial storage: expected {expected}, actually stored {stored}")
    else:
        print(f"   ❌ Storage failed: 0 chunks stored (expected {expected})")

    return stored
|
tests/test_connections.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Test all API connections with 2025 October versions.

Runs a quick smoke test against each external service (OpenAI, Qdrant,
Firecrawl) and verifies that the LangChain LCEL and Gradio imports resolve.
Each section prints a pass/fail line and never raises.
"""

import os
from dotenv import load_dotenv

load_dotenv()


def _check_openai():
    """Verify the OpenAI key works by requesting a tiny embedding."""
    print("1️⃣ Testing OpenAI...")
    try:
        from openai import OpenAI
        openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        response = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input="test"
        )
        print("   ✅ OpenAI connected successfully!")
        print(f"   ✅ Embeddings working (dimension: {len(response.data[0].embedding)})")
    except Exception as e:
        print(f"   ❌ OpenAI error: {str(e)}")


def _check_qdrant():
    """Verify the Qdrant credentials by listing collections."""
    print("\n2️⃣ Testing Qdrant...")
    try:
        from qdrant_client import QdrantClient
        qdrant_client = QdrantClient(
            url=os.getenv("QDRANT_URL"),
            api_key=os.getenv("QDRANT_API_KEY")
        )
        collections = qdrant_client.get_collections()
        print(f"   ✅ Qdrant connected! Collections: {len(collections.collections)}")
    except Exception as e:
        print(f"   ❌ Qdrant error: {str(e)}")


def _check_firecrawl():
    """Verify that the Firecrawl client can be constructed."""
    print("\n3️⃣ Testing Firecrawl...")
    try:
        from firecrawl import FirecrawlApp
        FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
        print("   ✅ Firecrawl initialized successfully!")
    except Exception as e:
        print(f"   ❌ Firecrawl error: {str(e)}")


def _check_langchain():
    """Verify the LangChain LCEL imports used by the chatbot all resolve."""
    print("\n4️⃣ Testing LangChain with LCEL imports...")
    try:
        import langchain
        from langchain_openai import ChatOpenAI, OpenAIEmbeddings
        from langchain_qdrant import QdrantVectorStore
        from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
        from langchain_core.runnables.history import RunnableWithMessageHistory
        from langchain_core.output_parsers import StrOutputParser
        from langchain_core.documents import Document
        from operator import itemgetter

        print(f"   ✅ LangChain version: {langchain.__version__}")
        print("   ✅ All LangChain LCEL imports successful!")
    except Exception as e:
        print(f"   ❌ LangChain import error: {str(e)}")


def _check_gradio():
    """Verify Gradio imports and report its version."""
    print("\n5️⃣ Testing Gradio...")
    try:
        import gradio as gr
        print(f"   ✅ Gradio version: {gr.__version__}")
    except Exception as e:
        print(f"   ❌ Gradio error: {str(e)}")


print("🧪 Testing API Connections (October 2025)...\n")
_check_openai()
_check_qdrant()
_check_firecrawl()
_check_langchain()
_check_gradio()

print("\n" + "=" * 50)
print("🎉 Connection tests complete!")
print("\nNext steps:")
print("1. Upload a test document: python src/ingestion.py")
print("2. Test the chatbot: python src/chatbot.py")
print("3. Start the user interface: python app.py")
print("4. Start the admin interface: python admin.py")
|