Upload 13 files
- README.md +337 -0
- admin.py +584 -0
- app.py +910 -0
- config.py +345 -0
- docker_compose.yml +89 -0
- dockerfile.txt +68 -0
- document_processor.py +973 -0
- gitignore.txt +183 -0
- logo.png +0 -0
- requirements.txt +61 -0
- setup_script.py +371 -0
- utils.py +550 -0
- vector_store.py +804 -0
README.md
ADDED
@@ -0,0 +1,337 @@
---
title: RAG-Based-HR-Assistant
emoji: 🎯
colorFrom: blue
colorTo: purple
sdk: streamlit
sdk_version: 1.28.0
app_file: app.py
pinned: false
license: mit
---

# BLUESCARF AI HR Assistant

A sophisticated RAG-based HR Assistant powered by Google Gemini AI, designed specifically for BLUESCARF ARTIFICIAL INTELLIGENCE. This system provides intelligent, context-aware responses to HR-related queries using company documents and policies.

## 🚀 Features

### Core Capabilities
- **RAG-Powered Intelligence**: Advanced retrieval-augmented generation using company documents
- **Google Gemini Integration**: State-of-the-art AI responses with company context
- **Document Learning**: Processes PDF policies, handbooks, and HR documents
- **Semantic Search**: Intelligent document retrieval with ChromaDB vector storage
- **Admin Management**: Secure document upload and knowledge base management

### Key Benefits
- **One-Time Learning**: Documents processed once, knowledge persists
- **Scope-Focused**: Only answers HR-related questions using company documents
- **Enterprise-Ready**: Built for production deployment with security features
- **Minimal Design**: Clean, professional interface optimized for efficiency
- **Real-Time Updates**: Add/remove documents after deployment

## 📋 Prerequisites

### Required
- Python 3.8 or higher
- Google Gemini API key ([Get yours here](https://makersuite.google.com/app/apikey))
- Minimum 2GB RAM for optimal performance
- 500MB storage space for vector database

### Recommended
- 4GB+ RAM for large document processing
- SSD storage for faster vector operations
- Stable internet connection for API calls

## 🛠️ Installation & Setup

### Method 1: Hugging Face Spaces (Recommended)

1. **Clone or Download** this repository
2. **Upload files** to your Hugging Face Space
3. **Add your company logo** as `logo.png` (200x200px recommended)
4. **Deploy** - the app will automatically install dependencies

### Method 2: Local Development

```bash
# Clone the repository
git clone <repository-url>
cd bluescarf-hr-assistant

# Install dependencies
pip install -r requirements.txt

# Run the application
streamlit run app.py
```

### Method 3: Docker Deployment

```dockerfile
FROM python:3.9-slim

WORKDIR /app
COPY . .

RUN pip install -r requirements.txt

EXPOSE 8501

CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
```

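To try the container locally, build and run it with the standard Docker CLI (the image tag below is just an example):

```bash
# Build the image from the Dockerfile above, then serve Streamlit on port 8501
docker build -t bluescarf-hr-assistant .
docker run -p 8501:8501 bluescarf-hr-assistant
```

The repository also includes a `docker_compose.yml` as an alternative way to launch the same container.
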
## ⚙️ Configuration

### Environment Variables

Create a `.env` file for custom configuration:

```env
# Application Settings
COMPANY_NAME="BLUESCARF ARTIFICIAL INTELLIGENCE"
ENVIRONMENT=production

# Document Processing
CHUNK_SIZE=1000
CHUNK_OVERLAP=200
MAX_FILE_SIZE=52428800  # 50MB

# Vector Database
MAX_CONTEXT_CHUNKS=5
SIMILARITY_THRESHOLD=0.5

# API Configuration
GEMINI_MODEL=gemini-pro
TEMPERATURE=0.3
```

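For reference, a minimal sketch of how `config.py` can pick up these values; the defaults mirror the block above, but the exact attribute names in this repo's `Config` class may differ. Note that `os.getenv` reads the process environment, so the `.env` file must be loaded first (for example via `python-dotenv` or your platform's secrets mechanism):

```python
# Sketch only: reading the settings above from the environment.
import os

COMPANY_NAME = os.getenv("COMPANY_NAME", "BLUESCARF ARTIFICIAL INTELLIGENCE")
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "1000"))
CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "200"))
MAX_FILE_SIZE = int(os.getenv("MAX_FILE_SIZE", str(50 * 1024 * 1024)))  # bytes
MAX_CONTEXT_CHUNKS = int(os.getenv("MAX_CONTEXT_CHUNKS", "5"))
SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", "0.5"))
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-pro")
TEMPERATURE = float(os.getenv("TEMPERATURE", "0.3"))
```
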
### Admin Access

**Default Admin Password**: `bluescarf_admin_2024`

⚠️ **IMPORTANT**: Change this password immediately after deployment!

## 📚 Usage Guide

### For End Users

1. **Enter API Key**: Provide your Google Gemini API key
2. **Ask HR Questions**: Query about policies, benefits, procedures
3. **Get Contextual Answers**: Receive responses based on company documents (a sketch of the underlying Gemini call follows the examples below)

**Example Queries:**
- "What is our vacation policy?"
- "How do I apply for health insurance?"
- "What are the performance review procedures?"
- "Tell me about our remote work policy"

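Under the hood, each answer comes from a Gemini call that packs the retrieved document chunks into the prompt. A stripped-down sketch of that call (the prompt wording and variable names here are illustrative, not the exact ones used in `app.py`):

```python
import google.generativeai as genai

genai.configure(api_key="YOUR_GEMINI_API_KEY")
model = genai.GenerativeModel("gemini-pro")

# In the real app these chunks come from the ChromaDB similarity search.
context_chunks = ["Vacation policy: employees accrue 1.5 days per month ..."]
prompt = (
    "Answer using only the company context below.\n\n"
    "Context:\n" + "\n".join(context_chunks) + "\n\n"
    "Question: What is our vacation policy?"
)
response = model.generate_content(prompt)
print(response.text)
```
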
### For Administrators

1. **Access Admin Panel**: Click "Admin Access" and enter password
2. **Upload Documents**: Add PDF policies, handbooks, procedures
3. **Manage Knowledge Base**: View, delete, or update documents
4. **Monitor System**: Check health status and analytics

## 📁 Project Structure

```
bluescarf-hr-assistant/
├── app.py                 # Main Streamlit application
├── document_processor.py  # PDF processing and chunking
├── vector_store.py        # ChromaDB vector operations
├── admin.py               # Administrative interface
├── config.py              # Configuration management
├── utils.py               # Utility functions
├── requirements.txt       # Python dependencies
├── README.md              # This documentation
├── logo.png               # Company logo (add yours)
└── vector_db/             # Vector database storage (auto-created)
    ├── chroma.sqlite3     # ChromaDB database
    └── metadata/          # Document metadata
```

## 🔒 Security Features

### Authentication
- Password-protected admin panel
- API key validation and secure storage
- Session-based access control

### Data Protection
- Local vector storage (no external data sharing)
- Secure document hashing for deduplication (see the sketch below)
- Audit logging for administrative actions

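The deduplication mentioned above rests on content hashing: hash the raw bytes of each upload and skip files whose digest is already stored. A minimal sketch of the idea (the actual helper lives in `document_processor.py`):

```python
import hashlib

def document_hash(pdf_bytes: bytes) -> str:
    # Identical content yields an identical digest, so a re-upload can be
    # detected before any chunking or embedding work is done.
    return hashlib.sha256(pdf_bytes).hexdigest()
```
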
### Access Control
- HR-only query filtering
- Document source validation
- Secure file upload handling

## 🚀 Deployment Guide

### Hugging Face Spaces Deployment

1. **Create Space**: Visit [Hugging Face Spaces](https://huggingface.co/spaces)
2. **Choose Streamlit**: Select Streamlit as the SDK
3. **Upload Files**: Upload all project files
4. **Add Logo**: Replace `logo.png` with your company logo
5. **Configure Secrets**: Set environment variables if needed
6. **Deploy**: Space will build and deploy automatically

### Environment-Specific Optimizations

#### For Hugging Face Spaces:
- Automatic resource optimization
- Reduced memory footprint
- Optimized chunk sizes

#### For Private Servers:
- Full resource utilization
- Enhanced caching
- Advanced logging

## 📊 Performance Optimization

### Document Processing
- Intelligent chunking with semantic awareness (see the sketch below)
- Batch embedding generation
- Efficient vector storage with ChromaDB

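As a baseline illustration of the chunking step, here is fixed-size splitting with overlap, driven by the `CHUNK_SIZE` and `CHUNK_OVERLAP` settings; the real `document_processor.py` layers semantic awareness on top of this idea:

```python
from typing import List

def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split text into overlapping windows; the overlap preserves context across boundaries."""
    step = max(1, chunk_size - overlap)  # guard against a non-positive stride
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]
```
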
### Response Generation
- Context-aware retrieval (see the query sketch below)
- Optimized prompt engineering
- Relevance scoring and ranking

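Retrieval itself is a ChromaDB similarity query. A hedged sketch of what `vector_store.py` does at query time (the collection name is an assumption; the path matches the auto-created `vector_db/` folder above):

```python
import chromadb

client = chromadb.PersistentClient(path="vector_db")
collection = client.get_or_create_collection("hr_documents")  # assumed name

# Fetch the top matches for a user question (MAX_CONTEXT_CHUNKS controls n_results).
results = collection.query(
    query_texts=["What is our vacation policy?"],
    n_results=5,
)
for doc, dist in zip(results["documents"][0], results["distances"][0]):
    print(f"{dist:.3f}  {doc[:80]}")
```
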
### System Resources
- Lazy loading of AI models
- Memory-efficient vector operations
- Automatic garbage collection

## 🔧 Customization

### Branding
- Replace `logo.png` with your company logo
- Update company name in `config.py`
- Customize colors in the CSS section of `app.py`

### Functionality
- Modify HR keywords in `utils.py` (see the sketch below)
- Adjust chunk sizes in `config.py`
- Customize response templates in `app.py`

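A minimal sketch of what HR-keyword scoping can look like; the actual keyword list and matching logic in `utils.py` are likely richer:

```python
# Assumed keyword set; extend it in utils.py to widen or narrow the scope.
HR_KEYWORDS = {"vacation", "leave", "benefits", "payroll", "policy", "insurance"}

def is_hr_query(query: str) -> bool:
    # Accept a query when it shares at least one word with the HR vocabulary.
    return bool(set(query.lower().split()) & HR_KEYWORDS)
```
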
### Integration
- Add SSO authentication
- Integrate with HR systems
- Connect to document management platforms

## 📈 Monitoring & Analytics

### Built-in Analytics
- Query classification and tracking
- Response quality metrics
- Document usage statistics
- Performance monitoring

### Health Checks
- Vector database integrity
- API connectivity status
- Storage availability
- Processing pipeline health

## 🐛 Troubleshooting

### Common Issues

**API Key Invalid**
- Verify key format and permissions
- Check Gemini API quotas
- Ensure internet connectivity

**Document Processing Fails**
- Verify PDF is text-based (not scanned)
- Check file size limits (50MB default)
- Ensure readable content exists

**Vector Search Returns No Results**
- Check document relevance to HR domain
- Verify embedding model availability
- Restart application to refresh cache

**Admin Panel Access Denied**
- Use correct password: `bluescarf_admin_2024`
- Clear browser cache/cookies
- Check for session timeouts

### Performance Issues

**Slow Document Processing**
- Reduce chunk size in configuration
- Process documents in smaller batches
- Increase available memory

**API Response Timeouts**
- Check internet connection stability
- Verify API key rate limits
- Reduce context chunk count

## 📞 Support & Contact

### Technical Support
- **Documentation**: Check this README and inline comments
- **Issues**: Review common troubleshooting steps
- **Performance**: Monitor system health checks

### Business Contact
- **Company**: BLUESCARF ARTIFICIAL INTELLIGENCE
- **Purpose**: HR Assistant Support
- **Access**: Through admin panel for system administrators

## 📄 License & Compliance

### Usage Terms
- Designed specifically for BLUESCARF AI internal use
- Ensure compliance with company data policies
- Maintain confidentiality of uploaded documents

### Data Handling
- All data processed locally
- No external sharing of company documents
- Secure storage and access controls

## 🔄 Version History

### v1.0.0 (Current)
- Initial release with full RAG functionality
- Google Gemini integration
- Admin panel for document management
- ChromaDB vector storage
- Professional UI with company branding

### Roadmap
- Multi-language support
- Advanced analytics dashboard
- Integration with HR systems
- Mobile-responsive enhancements
- Voice query capabilities

---

## 🚀 Quick Start Checklist

- [ ] Upload all project files to deployment platform
- [ ] Add your company logo as `logo.png`
- [ ] Obtain Google Gemini API key
- [ ] Change default admin password
- [ ] Upload initial HR documents via admin panel
- [ ] Test with sample HR queries
- [ ] Configure environment variables if needed
- [ ] Monitor system health and performance

**Ready to deploy!** Your BLUESCARF AI HR Assistant is now configured for production use.

---

*Built with ❤️ for BLUESCARF ARTIFICIAL INTELLIGENCE*
admin.py
ADDED
@@ -0,0 +1,584 @@
import streamlit as st
import time
import hashlib
from typing import List, Dict, Any, Optional
from pathlib import Path
import json
import pandas as pd
from datetime import datetime
from document_processor import DocumentProcessor
from vector_store import VectorStore
from config import Config
import io

class AdminPanel:
    """
    Secure administrative interface for knowledge base management.
    Provides document upload, deletion, and system monitoring capabilities.
    """

    def __init__(self):
        self.config = Config()
        self.document_processor = DocumentProcessor()
        self.vector_store = VectorStore()
        self.admin_password_hash = self._get_admin_password_hash()

    def _get_admin_password_hash(self) -> str:
        """
        Get or create admin password hash.
        Default password: 'bluescarf_admin_2024' (change this in production!)
        """
        password_file = Path(self.config.VECTOR_DB_PATH) / "admin_password.txt"

        if password_file.exists():
            try:
                with open(password_file, 'r') as f:
                    return f.read().strip()
            except Exception:
                pass

        # Default password hash (SHA-256 of 'bluescarf_admin_2024')
        default_password = "bluescarf_admin_2024"
        password_hash = hashlib.sha256(default_password.encode()).hexdigest()

        # Save to file
        try:
            password_file.parent.mkdir(parents=True, exist_ok=True)
            with open(password_file, 'w') as f:
                f.write(password_hash)
        except Exception as e:
            st.warning(f"Could not save admin password: {str(e)}")

        return password_hash

    def _verify_admin_password(self, entered_password: str) -> bool:
        """
        Verify admin password against stored hash.

        Args:
            entered_password: Password entered by user

        Returns:
            True if password is correct, False otherwise
        """
        entered_hash = hashlib.sha256(entered_password.encode()).hexdigest()
        return entered_hash == self.admin_password_hash

    def _change_admin_password(self, current_password: str, new_password: str) -> bool:
        """
        Change admin password with verification.

        Args:
            current_password: Current admin password
            new_password: New password to set

        Returns:
            True if password changed successfully, False otherwise
        """
        if not self._verify_admin_password(current_password):
            st.error("Current password is incorrect")
            return False

        if len(new_password) < 8:
            st.error("New password must be at least 8 characters long")
            return False

        # Update password hash
        new_hash = hashlib.sha256(new_password.encode()).hexdigest()
        password_file = Path(self.config.VECTOR_DB_PATH) / "admin_password.txt"

        try:
            with open(password_file, 'w') as f:
                f.write(new_hash)

            self.admin_password_hash = new_hash
            st.success("✅ Admin password updated successfully")
            return True

        except Exception as e:
            st.error(f"Failed to update password: {str(e)}")
            return False

    def render_authentication(self) -> bool:
        """
        Render admin authentication interface.

        Returns:
            True if authenticated, False otherwise
        """
        if st.session_state.get("admin_authenticated", False):
            return True

        st.markdown("""
<div class="admin-section">
    <h4>🔐 Administrator Authentication</h4>
    <p>Enter admin password to access knowledge base management</p>
</div>
""", unsafe_allow_html=True)

        with st.form("admin_auth_form", clear_on_submit=True):
            password = st.text_input(
                "Admin Password:",
                type="password",
                help="Default: bluescarf_admin_2024 (change in production!)"
            )

            col1, col2 = st.columns([1, 3])

            with col1:
                login_button = st.form_submit_button("Login", type="primary")

            with col2:
                if st.form_submit_button("Show Default Password"):
                    st.info("Default password: `bluescarf_admin_2024`")

            if login_button and password:
                if self._verify_admin_password(password):
                    st.session_state.admin_authenticated = True
                    st.success("✅ Authentication successful!")
                    st.rerun()
                else:
                    st.error("❌ Invalid password")

        return False

    def render_document_upload(self):
        """Render document upload interface with batch processing support."""
        st.markdown("### 📁 Upload Company Documents")

        with st.expander("📋 Upload Guidelines", expanded=False):
            st.markdown("""
**Supported Documents:**
- Company policies and procedures
- Employee handbooks
- Benefits information
- HR guidelines and regulations
- Training materials

**Requirements:**
- PDF format only
- Maximum 50MB per file
- Readable text content (not scanned images)
- Company-related HR content
""")

        # File upload interface
        uploaded_files = st.file_uploader(
            "Choose PDF files",
            type=['pdf'],
            accept_multiple_files=True,
            help="Upload multiple PDF files for batch processing"
        )

        if uploaded_files:
            st.markdown(f"**Selected Files:** {len(uploaded_files)} PDF(s)")

            # Display file details
            file_details = []
            total_size = 0

            for uploaded_file in uploaded_files:
                file_size_mb = uploaded_file.size / (1024 * 1024)
                total_size += file_size_mb

                file_details.append({
                    'Filename': uploaded_file.name,
                    'Size (MB)': f"{file_size_mb:.2f}",
                    'Status': '✅ Ready' if file_size_mb <= 50 else '❌ Too Large'
                })

            df = pd.DataFrame(file_details)
            st.dataframe(df, use_container_width=True)

            # Process uploaded files
            col1, col2, col3 = st.columns([2, 2, 1])

            with col1:
                process_button = st.button(
                    f"🚀 Process {len(uploaded_files)} Files",
                    type="primary",
                    disabled=total_size > 200  # 200MB total limit
                )

            with col2:
                if total_size > 200:
                    st.error(f"Total size ({total_size:.1f}MB) exceeds 200MB limit")

            with col3:
                if st.button("🗑️ Clear"):
                    st.rerun()

            if process_button:
                self._process_uploaded_files(uploaded_files)

    def _process_uploaded_files(self, uploaded_files: List) -> None:
        """
        Process multiple uploaded files with progress tracking and error handling.

        Args:
            uploaded_files: List of uploaded file objects
        """
        success_count = 0
        error_count = 0
        duplicate_count = 0

        # Overall progress tracking
        overall_progress = st.progress(0)
        status_placeholder = st.empty()

        for i, uploaded_file in enumerate(uploaded_files):
            try:
                # Update overall progress
                progress = i / len(uploaded_files)
                overall_progress.progress(progress)
                status_placeholder.info(f"Processing {uploaded_file.name}...")

                # Validate file
                if not self.document_processor.validate_pdf_file(uploaded_file):
                    error_count += 1
                    continue

                # Check for duplicates
                doc_hash = self.document_processor.calculate_document_hash(uploaded_file)
                existing_docs = self.vector_store.get_documents_by_hash(doc_hash)

                if existing_docs:
                    st.warning(f"⚠️ {uploaded_file.name} already exists in knowledge base")
                    duplicate_count += 1
                    continue

                # Process document
                processed_doc = self.document_processor.process_document(
                    uploaded_file,
                    uploaded_file.name
                )

                if processed_doc:
                    # Add to vector store
                    if self.vector_store.add_document(processed_doc):
                        success_count += 1
                    else:
                        error_count += 1
                else:
                    error_count += 1

            except Exception as e:
                st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                error_count += 1

        # Final progress update
        overall_progress.progress(1.0)
        status_placeholder.empty()

        # Display results summary
        st.markdown("### 📊 Processing Results")

        col1, col2, col3 = st.columns(3)

        with col1:
            st.metric("✅ Successful", success_count)

        with col2:
            st.metric("⚠️ Duplicates", duplicate_count)

        with col3:
            st.metric("❌ Errors", error_count)

        if success_count > 0:
            st.success(f"🎉 Successfully processed {success_count} documents!")
            # Refresh knowledge base stats
            time.sleep(1)
            st.rerun()

    def render_knowledge_base_management(self):
        """Render knowledge base overview and management interface."""
        st.markdown("### 📚 Knowledge Base Management")

        # Get current statistics
        stats = self.vector_store.get_collection_stats()
        documents = self.vector_store.get_all_documents()

        # Display overview metrics
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("📄 Documents", stats.get('total_documents', 0))

        with col2:
            st.metric("🧩 Chunks", stats.get('total_chunks', 0))

        with col3:
            avg_chunks = stats.get('avg_chunks_per_doc', 0)
            st.metric("📊 Avg Chunks/Doc", f"{avg_chunks:.1f}")

        with col4:
            last_update = stats.get('latest_update', 0)
            if last_update:
                update_time = datetime.fromtimestamp(last_update).strftime("%m/%d/%Y")
                st.metric("📅 Last Update", update_time)
            else:
                st.metric("📅 Last Update", "None")

        if not documents:
            st.info("📭 No documents in knowledge base. Upload some documents to get started!")
            return

        # Document management table
        st.markdown("#### 📋 Document Library")

        # Prepare document data for display
        doc_data = []
        for doc in documents:
            processed_time = datetime.fromtimestamp(
                doc.get('processed_at', 0)
            ).strftime("%Y-%m-%d %H:%M")

            doc_data.append({
                'Filename': doc.get('filename', 'Unknown'),
                'Type': doc.get('document_type', 'hr_policy').replace('_', ' ').title(),
                'Chunks': doc.get('chunk_count', 0),
                'Processed': processed_time,
                'Hash': doc.get('document_hash', '')[:12] + '...'
            })

        # Display documents table
        df = pd.DataFrame(doc_data)
        st.dataframe(
            df,
            use_container_width=True,
            hide_index=True
        )

        # Document management actions
        if documents:
            st.markdown("#### 🛠️ Management Actions")

            col1, col2, col3 = st.columns([2, 2, 2])

            with col1:
                # Document selection for deletion
                doc_options = [
                    f"{doc['filename']} ({doc.get('chunk_count', 0)} chunks)"
                    for doc in documents
                ]

                selected_doc_idx = st.selectbox(
                    "Select document to delete:",
                    range(len(doc_options)),
                    format_func=lambda x: doc_options[x]
                )

                if st.button("🗑️ Delete Selected", type="secondary"):
                    self._delete_selected_document(documents[selected_doc_idx])

            with col2:
                # Health check
                if st.button("🏥 Health Check", type="secondary"):
                    self._perform_health_check()

            with col3:
                # Danger zone - reset knowledge base
                if st.button("⚠️ Reset All", type="secondary"):
                    self._confirm_reset_knowledge_base()

    def _delete_selected_document(self, document: Dict[str, Any]):
        """
        Delete selected document with confirmation.

        Args:
            document: Document metadata to delete
        """
        doc_hash = document.get('document_hash')
        filename = document.get('filename', 'Unknown')

        if not doc_hash:
            st.error("Invalid document selection")
            return

        # Confirmation dialog
        with st.form(f"delete_confirm_{doc_hash[:8]}"):
            st.warning("⚠️ **Confirm Deletion**")
            st.write(f"Document: **{filename}**")
            st.write(f"Chunks: **{document.get('chunk_count', 0)}**")
            st.write("This action cannot be undone!")

            col1, col2 = st.columns(2)

            with col1:
                confirm_delete = st.form_submit_button("🗑️ Confirm Delete", type="primary")

            with col2:
                cancel_delete = st.form_submit_button("❌ Cancel")

            if confirm_delete:
                if self.vector_store.delete_document(doc_hash):
                    st.success(f"✅ Successfully deleted {filename}")
                    time.sleep(1)
                    st.rerun()
                else:
                    st.error("Failed to delete document")

            if cancel_delete:
                st.info("Deletion cancelled")
                st.rerun()

    def _perform_health_check(self):
        """Perform comprehensive system health check."""
        with st.spinner("Performing health check..."):
            health_status = self.vector_store.health_check()

        st.markdown("#### 🏥 System Health Report")

        if health_status.get('status') == 'healthy':
            st.success("✅ System is healthy!")
        elif health_status.get('status') == 'unhealthy':
            st.warning("⚠️ System issues detected")
        else:
            st.error("❌ System error")

        # Display detailed health metrics
        col1, col2 = st.columns(2)

        with col1:
            st.markdown("**Storage Status:**")
            if health_status.get('storage_accessible'):
                st.success("✅ Storage accessible")
            else:
                st.error("❌ Storage issues")

        with col2:
            st.markdown("**Collection Status:**")
            if health_status.get('collection_healthy'):
                st.success("✅ Collection healthy")
            else:
                st.error("❌ Collection issues")

        # Additional metrics
        st.markdown("**System Metrics:**")
        metrics_data = {
            'Total Documents': health_status.get('total_documents', 0),
            'Total Chunks': health_status.get('total_chunks', 0),
            'Last Check': datetime.fromtimestamp(
                health_status.get('last_check', time.time())
            ).strftime("%Y-%m-%d %H:%M:%S")
        }

        for metric, value in metrics_data.items():
            st.write(f"• **{metric}:** {value}")

    def _confirm_reset_knowledge_base(self):
        """Render knowledge base reset confirmation with safeguards."""
        st.markdown("#### ⚠️ **DANGER ZONE**")
        st.error("**Reset Knowledge Base** - This will delete ALL documents and chunks!")

        with st.form("reset_confirmation"):
            st.write("This action will:")
            st.write("• Delete all processed documents")
            st.write("• Remove all embeddings and chunks")
            st.write("• Clear document metadata")
            st.write("• **Cannot be undone!**")

            confirmation_text = st.text_input(
                "Type 'RESET BLUESCARF KNOWLEDGE BASE' to confirm:",
                placeholder="Type confirmation text here..."
            )

            col1, col2 = st.columns(2)

            with col1:
                reset_button = st.form_submit_button(
                    "🔥 RESET EVERYTHING",
                    type="primary"
                )

            with col2:
                cancel_button = st.form_submit_button("❌ Cancel")

            if reset_button:
                if confirmation_text == "RESET BLUESCARF KNOWLEDGE BASE":
                    with st.spinner("Resetting knowledge base..."):
                        if self.vector_store.reset_collection():
                            st.success("✅ Knowledge base reset successfully!")
                            time.sleep(2)
                            st.rerun()
                        else:
                            st.error("❌ Failed to reset knowledge base")
                else:
                    st.error("❌ Confirmation text doesn't match. Reset cancelled.")

            if cancel_button:
                st.info("Reset cancelled")
                st.rerun()

    def render_admin_settings(self):
        """Render admin settings and configuration options."""
        st.markdown("### ⚙️ Admin Settings")

        # Password management
        with st.expander("🔐 Password Management", expanded=False):
            with st.form("change_password_form"):
                current_password = st.text_input(
                    "Current Password:",
                    type="password"
                )
                new_password = st.text_input(
                    "New Password:",
                    type="password",
                    help="Minimum 8 characters"
                )
                confirm_password = st.text_input(
                    "Confirm New Password:",
                    type="password"
                )

                change_pwd_button = st.form_submit_button("Update Password")

                if change_pwd_button:
                    if new_password != confirm_password:
                        st.error("New passwords don't match")
                    elif len(new_password) < 8:
                        st.error("Password must be at least 8 characters")
                    else:
                        self._change_admin_password(current_password, new_password)

        # System information
        with st.expander("📊 System Information", expanded=False):
            stats = self.vector_store.get_collection_stats()

            st.json({
                'Knowledge Base Stats': stats,
                'Storage Path': str(self.config.VECTOR_DB_PATH),
                'Chunk Size': self.config.CHUNK_SIZE,
                'Max Context Chunks': self.config.MAX_CONTEXT_CHUNKS,
                'Max File Size (MB)': self.config.MAX_FILE_SIZE / (1024*1024)
            })

        # Logout button
        if st.button("🚪 Logout", type="secondary"):
            st.session_state.admin_authenticated = False
            st.session_state.show_admin = False
            st.rerun()

    def render(self):
        """Main admin panel render method."""
        if not self.render_authentication():
            return

        st.markdown("---")
        st.markdown("## 🔧 **Administrator Panel**")

        # Admin navigation tabs
        tab1, tab2, tab3 = st.tabs([
            "📁 Document Management",
            "📚 Knowledge Base",
            "⚙️ Settings"
        ])

        with tab1:
            self.render_document_upload()

        with tab2:
            self.render_knowledge_base_management()

        with tab3:
            self.render_admin_settings()
app.py
ADDED
@@ -0,0 +1,910 @@
| 1 |
+
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import time
|
| 5 |
+
from typing import List, Dict, Any
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
import google.generativeai as genai
|
| 8 |
+
from vector_store import VectorStore
|
| 9 |
+
from admin import AdminPanel
|
| 10 |
+
from config import Config
|
| 11 |
+
from utils import validate_api_key, format_response, log_interaction
|
| 12 |
+
|
| 13 |
+
# Page configuration
|
| 14 |
+
st.set_page_config(
|
| 15 |
+
page_title="BLUESCARF AI - HR Assistant",
|
| 16 |
+
page_icon="🔷",
|
| 17 |
+
layout="wide",
|
| 18 |
+
initial_sidebar_state="collapsed"
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# Custom CSS for enhanced UX and professional styling
|
| 22 |
+
st.markdown("""
|
| 23 |
+
<style>
|
| 24 |
+
/* Modern Color Palette & Typography */
|
| 25 |
+
:root {
|
| 26 |
+
--primary-blue: #1e40af;
|
| 27 |
+
--light-blue: #3b82f6;
|
| 28 |
+
--accent-blue: #60a5fa;
|
| 29 |
+
--surface-light: #f8fafc;
|
| 30 |
+
--surface-white: #ffffff;
|
| 31 |
+
--text-primary: #1f2937;
|
| 32 |
+
--text-secondary: #6b7280;
|
| 33 |
+
--border-light: #e5e7eb;
|
| 34 |
+
--success-green: #10b981;
|
| 35 |
+
--warning-orange: #f59e0b;
|
| 36 |
+
--error-red: #ef4444;
|
| 37 |
+
--shadow-soft: 0 1px 3px rgba(0,0,0,0.1);
|
| 38 |
+
--shadow-medium: 0 4px 6px rgba(0,0,0,0.1);
|
| 39 |
+
--radius-md: 8px;
|
| 40 |
+
--radius-lg: 12px;
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
/* Remove Streamlit Default Padding */
|
| 44 |
+
.main .block-container {
|
| 45 |
+
padding-top: 2rem;
|
| 46 |
+
padding-bottom: 2rem;
|
| 47 |
+
max-width: 1200px;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
/* Enhanced Header Design */
|
| 51 |
+
.main-header {
|
| 52 |
+
background: linear-gradient(135deg, var(--primary-blue) 0%, var(--light-blue) 100%);
|
| 53 |
+
padding: 2.5rem;
|
| 54 |
+
border-radius: var(--radius-lg);
|
| 55 |
+
margin-bottom: 2rem;
|
| 56 |
+
text-align: center;
|
| 57 |
+
box-shadow: var(--shadow-medium);
|
| 58 |
+
position: relative;
|
| 59 |
+
overflow: hidden;
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
.main-header::before {
|
| 63 |
+
content: '';
|
| 64 |
+
position: absolute;
|
| 65 |
+
top: 0;
|
| 66 |
+
left: 0;
|
| 67 |
+
right: 0;
|
| 68 |
+
bottom: 0;
|
| 69 |
+
background: url('data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"><defs><pattern id="grid" width="10" height="10" patternUnits="userSpaceOnUse"><path d="M 10 0 L 0 0 0 10" fill="none" stroke="rgba(255,255,255,0.1)" stroke-width="0.5"/></pattern></defs><rect width="100" height="100" fill="url(%23grid)"/></svg>');
|
| 70 |
+
opacity: 0.3;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
.main-header h1, .main-header h3 {
|
| 74 |
+
position: relative;
|
| 75 |
+
z-index: 1;
|
| 76 |
+
margin: 0;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
.main-header h1 {
|
| 80 |
+
color: white;
|
| 81 |
+
font-size: 2.5rem;
|
| 82 |
+
font-weight: 700;
|
| 83 |
+
letter-spacing: -0.02em;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
.main-header h3 {
|
| 87 |
+
color: #bfdbfe;
|
| 88 |
+
font-size: 1.25rem;
|
| 89 |
+
font-weight: 400;
|
| 90 |
+
margin-top: 0.5rem;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
/* Logo Styling */
|
| 94 |
+
.company-logo {
|
| 95 |
+
max-width: 120px;
|
| 96 |
+
margin: 1rem auto;
|
| 97 |
+
display: block;
|
| 98 |
+
border-radius: var(--radius-md);
|
| 99 |
+
box-shadow: var(--shadow-soft);
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
/* Chat Interface Enhancements */
|
| 103 |
+
.chat-main-container {
|
| 104 |
+
background: var(--surface-white);
|
| 105 |
+
border-radius: var(--radius-lg);
|
| 106 |
+
padding: 1.5rem;
|
| 107 |
+
margin: 1rem 0;
|
| 108 |
+
box-shadow: var(--shadow-medium);
|
| 109 |
+
border: 1px solid var(--border-light);
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
.chat-messages-container {
|
| 113 |
+
min-height: 300px;
|
| 114 |
+
max-height: 500px;
|
| 115 |
+
overflow-y: auto;
|
| 116 |
+
padding: 1rem;
|
| 117 |
+
background: var(--surface-light);
|
| 118 |
+
border-radius: var(--radius-md);
|
| 119 |
+
margin-bottom: 1.5rem;
|
| 120 |
+
border: 1px solid var(--border-light);
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
.chat-messages-container::-webkit-scrollbar {
|
| 124 |
+
width: 6px;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
.chat-messages-container::-webkit-scrollbar-track {
|
| 128 |
+
background: #f1f5f9;
|
| 129 |
+
border-radius: 3px;
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
.chat-messages-container::-webkit-scrollbar-thumb {
|
| 133 |
+
background: #cbd5e1;
|
| 134 |
+
border-radius: 3px;
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
.chat-messages-container::-webkit-scrollbar-thumb:hover {
|
| 138 |
+
background: #94a3b8;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
/* Enhanced Message Bubbles */
|
| 142 |
+
.user-message {
|
| 143 |
+
background: linear-gradient(135deg, var(--light-blue), var(--accent-blue));
|
| 144 |
+
color: white;
|
| 145 |
+
padding: 1rem 1.25rem;
|
| 146 |
+
border-radius: 1.5rem 1.5rem 0.5rem 1.5rem;
|
| 147 |
+
margin: 0.75rem 0 0.75rem auto;
|
| 148 |
+
max-width: 80%;
|
| 149 |
+
box-shadow: var(--shadow-soft);
|
| 150 |
+
animation: slideInRight 0.3s ease-out;
|
| 151 |
+
position: relative;
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
.assistant-message {
|
| 155 |
+
background: var(--surface-white);
|
| 156 |
+
color: var(--text-primary);
|
| 157 |
+
padding: 1rem 1.25rem;
|
| 158 |
+
border-radius: 1.5rem 1.5rem 1.5rem 0.5rem;
|
| 159 |
+
margin: 0.75rem auto 0.75rem 0;
|
| 160 |
+
max-width: 80%;
|
| 161 |
+
box-shadow: var(--shadow-soft);
|
| 162 |
+
border: 1px solid var(--border-light);
|
| 163 |
+
animation: slideInLeft 0.3s ease-out;
|
| 164 |
+
position: relative;
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
@keyframes slideInRight {
|
| 168 |
+
from { opacity: 0; transform: translateX(20px); }
|
| 169 |
+
to { opacity: 1; transform: translateX(0); }
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
@keyframes slideInLeft {
|
| 173 |
+
from { opacity: 0; transform: translateX(-20px); }
|
| 174 |
+
to { opacity: 1; transform: translateX(0); }
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
.message-meta {
|
| 178 |
+
font-size: 0.75rem;
|
| 179 |
+
opacity: 0.7;
|
| 180 |
+
margin-top: 0.5rem;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
/* Perfect Chat Input Layout */
|
| 184 |
+
.chat-input-container {
|
| 185 |
+
display: flex;
|
| 186 |
+
gap: 0.75rem;
|
| 187 |
+
align-items: flex-end;
|
| 188 |
+
padding: 1rem;
|
| 189 |
+
background: var(--surface-light);
|
| 190 |
+
border-radius: var(--radius-md);
|
| 191 |
+
border: 2px solid transparent;
|
| 192 |
+
transition: border-color 0.2s ease;
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
.chat-input-container:focus-within {
|
| 196 |
+
border-color: var(--light-blue);
|
| 197 |
+
box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.1);
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
.chat-input-field {
|
| 201 |
+
flex: 1;
|
| 202 |
+
min-height: 44px;
|
| 203 |
+
max-height: 120px;
|
| 204 |
+
padding: 0.75rem 1rem;
|
| 205 |
+
border: 1px solid var(--border-light);
|
| 206 |
+
border-radius: var(--radius-md);
|
| 207 |
+
font-size: 1rem;
|
| 208 |
+
resize: vertical;
|
| 209 |
+
transition: all 0.2s ease;
|
| 210 |
+
background: var(--surface-white);
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
.chat-input-field:focus {
|
| 214 |
+
outline: none;
|
| 215 |
+
border-color: var(--light-blue);
|
| 216 |
+
box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.1);
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
.chat-send-button {
|
| 220 |
+
min-width: 44px;
|
| 221 |
+
height: 44px;
|
| 222 |
+
background: linear-gradient(135deg, var(--light-blue), var(--primary-blue));
|
| 223 |
+
color: white;
|
| 224 |
+
border: none;
|
| 225 |
+
border-radius: var(--radius-md);
|
| 226 |
+
cursor: pointer;
|
| 227 |
+
transition: all 0.2s ease;
|
| 228 |
+
display: flex;
|
| 229 |
+
align-items: center;
|
| 230 |
+
justify-content: center;
|
| 231 |
+
    font-weight: 600;
    box-shadow: var(--shadow-soft);
}

.chat-send-button:hover:not(:disabled) {
    transform: translateY(-1px);
    box-shadow: 0 4px 12px rgba(59, 130, 246, 0.3);
}

.chat-send-button:disabled {
    opacity: 0.6;
    cursor: not-allowed;
    transform: none;
}

/* Enhanced Button Styles */
.stButton > button {
    background: linear-gradient(135deg, var(--light-blue), var(--primary-blue));
    color: white;
    border: none;
    border-radius: var(--radius-md);
    padding: 0.6rem 1.2rem;
    font-weight: 600;
    transition: all 0.2s ease;
    box-shadow: var(--shadow-soft);
}

.stButton > button:hover {
    transform: translateY(-1px);
    box-shadow: 0 4px 12px rgba(59, 130, 246, 0.3);
}

/* Loading States */
.loading-indicator {
    display: flex;
    align-items: center;
    gap: 0.5rem;
    padding: 1rem;
    background: var(--surface-light);
    border-radius: var(--radius-md);
    margin: 0.5rem 0;
}

.loading-dots {
    display: flex;
    gap: 0.25rem;
}

.loading-dot {
    width: 6px;
    height: 6px;
    background: var(--light-blue);
    border-radius: 50%;
    animation: loadingPulse 1.4s infinite ease-in-out;
}

.loading-dot:nth-child(1) { animation-delay: -0.32s; }
.loading-dot:nth-child(2) { animation-delay: -0.16s; }

@keyframes loadingPulse {
    0%, 80%, 100% { transform: scale(0.8); opacity: 0.5; }
    40% { transform: scale(1); opacity: 1; }
}

/* Admin Section Enhancements */
.admin-section {
    background: linear-gradient(135deg, #fef2f2, #fdf2f8);
    border: 1px solid #fecaca;
    border-radius: var(--radius-lg);
    padding: 1.5rem;
    margin-top: 2rem;
    position: relative;
    overflow: hidden;
}

.admin-section::before {
    content: '🔐';
    position: absolute;
    top: 1rem;
    right: 1rem;
    font-size: 1.5rem;
    opacity: 0.3;
}

/* Status Indicators */
.status-indicator {
    display: inline-flex;
    align-items: center;
    gap: 0.5rem;
    padding: 0.375rem 0.75rem;
    border-radius: 9999px;
    font-size: 0.875rem;
    font-weight: 500;
}

.status-success {
    background: #dcfce7;
    color: #166534;
    border: 1px solid #bbf7d0;
}

.status-warning {
    background: #fef3c7;
    color: #92400e;
    border: 1px solid #fde68a;
}

.status-error {
    background: #fee2e2;
    color: #991b1b;
    border: 1px solid #fecaca;
}

/* Enhanced Metrics */
.metric-card {
    background: var(--surface-white);
    padding: 1.5rem;
    border-radius: var(--radius-md);
    box-shadow: var(--shadow-soft);
    border: 1px solid var(--border-light);
    text-align: center;
    transition: transform 0.2s ease;
}

.metric-card:hover {
    transform: translateY(-2px);
    box-shadow: var(--shadow-medium);
}

.metric-value {
    font-size: 2rem;
    font-weight: 700;
    color: var(--primary-blue);
    margin-bottom: 0.5rem;
}

.metric-label {
    font-size: 0.875rem;
    color: var(--text-secondary);
    font-weight: 500;
}

/* Footer Enhancement */
.footer {
    text-align: center;
    padding: 2rem;
    color: var(--text-secondary);
    border-top: 1px solid var(--border-light);
    margin-top: 3rem;
    background: var(--surface-light);
    border-radius: var(--radius-md);
}

/* Mobile Responsiveness */
@media (max-width: 768px) {
    .main-header {
        padding: 1.5rem;
    }

    .main-header h1 {
        font-size: 1.875rem;
    }

    .chat-input-container {
        flex-direction: column;
        gap: 0.75rem;
    }

    .chat-send-button {
        width: 100%;
        height: 48px;
    }

    .user-message, .assistant-message {
        max-width: 95%;
    }
}

/* Performance Optimization - Reduce Repaints */
.main .block-container {
    will-change: transform;
}

/* Accessibility Enhancements */
.chat-input-field:focus,
.stButton > button:focus {
    outline: 2px solid var(--light-blue);
    outline-offset: 2px;
}

/* High Contrast Mode Support */
@media (prefers-contrast: high) {
    :root {
        --primary-blue: #0056b3;
        --light-blue: #0066cc;
        --border-light: #666666;
    }
}

/* Reduced Motion Support */
@media (prefers-reduced-motion: reduce) {
    * {
        animation-duration: 0.01ms !important;
        animation-iteration-count: 1 !important;
        transition-duration: 0.01ms !important;
    }
}
</style>
    """, unsafe_allow_html=True)

class HRAssistant:
    def __init__(self):
        self.config = Config()
        self.vector_store = VectorStore()
        self.admin_panel = AdminPanel()

    def initialize_session_state(self):
        """Initialize session state variables"""
        if 'messages' not in st.session_state:
            st.session_state.messages = []
        if 'api_key_validated' not in st.session_state:
            st.session_state.api_key_validated = False
        if 'show_admin' not in st.session_state:
            st.session_state.show_admin = False
        if 'admin_authenticated' not in st.session_state:
            st.session_state.admin_authenticated = False

    def render_header(self):
        """Render application header with logo"""
        st.markdown("""
        <div class="main-header">
            <h1 style="color: white; margin: 0;">BLUESCARF ARTIFICIAL INTELLIGENCE</h1>
            <h3 style="color: #bfdbfe; margin: 0.5rem 0 0 0;">HR Assistant</h3>
        </div>
        """, unsafe_allow_html=True)

        # Logo placeholder - replace logo.png with actual company logo
        logo_path = Path("logo.png")
        if logo_path.exists():
            st.image("logo.png", width=200)
        else:
            st.info("📋 Replace 'logo.png' with your company logo")

    def setup_gemini_api(self, api_key: str) -> bool:
        """Configure Gemini API with provided key"""
        try:
            if not validate_api_key(api_key):
                return False

            genai.configure(api_key=api_key)

            # Test API connection with a trivial generation call
            model = genai.GenerativeModel('gemini-1.5-flash')
            test_response = model.generate_content("Hello")

            st.session_state.api_key_validated = True
            st.session_state.model = model
            return True

        except Exception as e:
            st.error(f"API Configuration Error: {str(e)}")
            return False

    def get_relevant_context(self, query: str) -> List[Dict[str, Any]]:
        """Retrieve relevant context from vector store"""
        return self._retrieve_relevant_context(query)

    def generate_response(self, query: str, context: List[Dict[str, Any]]) -> str:
        """Generate response using Gemini API with retrieved context"""
        return self._generate_contextual_response(query, context)

    def is_hr_related_query(self, query: str) -> bool:
        """Check if query is HR-related using enhanced classification"""
        return self._is_hr_related_query(query)

    def render_chat_interface(self):
        """Render the main chat interface with robust state management"""
        st.markdown("### 💬 Chat with HR Assistant")

        # Initialize input state management
        if 'input_processed' not in st.session_state:
            st.session_state.input_processed = False
        if 'last_input' not in st.session_state:
            st.session_state.last_input = ""

        # Chat message container
        self._render_chat_messages()

        # Input interface with intelligent state handling
        self._render_chat_input()

        # Chat controls
        self._render_chat_controls()

    def _render_chat_messages(self):
        """Render chat message history with optimized layout"""
        if not st.session_state.messages:
            st.info("👋 Welcome! Ask me anything about BLUESCARF AI HR policies and procedures.")
            return

        # Create scrollable chat container
        chat_container = st.container()

        with chat_container:
            for idx, message in enumerate(st.session_state.messages):
                message_key = f"msg_{idx}_{message.get('timestamp', time.time())}"

                if message["role"] == "user":
                    st.markdown(f"""
                    <div class="user-message" id="{message_key}">
                        <strong>You:</strong> {message["content"]}
                    </div>
                    """, unsafe_allow_html=True)
                else:
                    st.markdown(f"""
                    <div class="assistant-message" id="{message_key}">
                        <strong>HR Assistant:</strong> {message["content"]}
                    </div>
                    """, unsafe_allow_html=True)

    def _render_chat_input(self):
        """Render chat input with intelligent state management to prevent loops"""
        col1, col2 = st.columns([5, 1])

        with col1:
            # Dynamic input key to prevent state persistence issues
            input_key = f"chat_input_{len(st.session_state.messages)}"

            user_input = st.text_input(
                "Ask me about company policies, benefits, procedures...",
                key=input_key,
                placeholder="Type your HR question here...",
                value=""  # Always start with empty value
            )

        with col2:
            send_button = st.button("Send", type="primary", key=f"send_{len(st.session_state.messages)}")

        # Process input with anti-loop protection
        if send_button and user_input and user_input.strip():
            # Prevent duplicate processing
            if user_input != st.session_state.last_input or not st.session_state.input_processed:
                self._process_user_query(user_input.strip())
                st.session_state.last_input = user_input.strip()
                st.session_state.input_processed = True
                # Trigger rerun to update UI with new messages
                st.rerun()
            else:
                st.warning("⚠️ Query already processed. Please ask a new question.")

        # Reset processing flag when input changes
        if user_input != st.session_state.last_input:
            st.session_state.input_processed = False

    def _render_chat_controls(self):
        """Render chat control buttons with proper state management"""
        if not st.session_state.messages:
            return

        col1, col2, col3 = st.columns([2, 2, 2])

        with col1:
            if st.button("🗑️ Clear Chat", key="clear_chat_btn"):
                self._clear_chat_session()

        with col2:
            if st.button("📥 Export Chat", key="export_chat_btn"):
                self._export_chat_history()

        with col3:
            st.caption(f"💬 {len(st.session_state.messages)} messages")

    def _process_user_query(self, query: str):
        """Process user query with enhanced error handling and state management"""
        if not query or len(query.strip()) < 3:
            st.warning("⚠️ Please enter a meaningful question.")
            return

        # Add user message to chat history
        user_message = {
            "role": "user",
            "content": query,
            "timestamp": time.time(),
            "message_id": self._generate_message_id()
        }
        st.session_state.messages.append(user_message)

        # Process query and generate response
        try:
            with st.spinner("🤔 Thinking..."):
                response = self._generate_intelligent_response(query)

                # Add assistant response to chat history
                assistant_message = {
                    "role": "assistant",
                    "content": response,
                    "timestamp": time.time(),
                    "message_id": self._generate_message_id(),
                    "query_processed": query
                }
                st.session_state.messages.append(assistant_message)

                # Log successful interaction
                self._log_successful_interaction(query, response)

        except Exception as e:
            error_response = f"I apologize, but I encountered an error processing your request: {str(e)}. Please try rephrasing your question."

            assistant_message = {
                "role": "assistant",
                "content": error_response,
                "timestamp": time.time(),
                "message_id": self._generate_message_id(),
                "error": True
            }
            st.session_state.messages.append(assistant_message)

            # Log error for debugging
            self._log_error_interaction(query, str(e))

    def _generate_intelligent_response(self, query: str) -> str:
        """Generate contextually aware response using RAG pipeline"""
        # Validate query scope
        if not self._is_hr_related_query(query):
            return self._get_scope_redirect_message()

        # Retrieve relevant context
        context_chunks = self._retrieve_relevant_context(query)

        if not context_chunks:
            return self._get_no_context_message()

        # Generate response using Gemini API
        return self._generate_contextual_response(query, context_chunks)

    def _retrieve_relevant_context(self, query: str) -> List[Dict[str, Any]]:
        """Retrieve relevant context with enhanced error handling"""
        try:
            return self.vector_store.similarity_search(
                query,
                k=self.config.MAX_CONTEXT_CHUNKS
            )
        except Exception as e:
            st.error(f"Context retrieval error: {str(e)}")
            return []

    def _generate_contextual_response(self, query: str, context: List[Dict[str, Any]]) -> str:
        """Generate response using Gemini API with retrieved context"""
        try:
            # Prepare context for prompt engineering
            context_text = self._format_context_for_prompt(context)

            # Construct optimized prompt
            prompt = self._build_contextual_prompt(query, context_text)

            # Generate response with error handling
            response = st.session_state.model.generate_content(prompt)

            return self._format_and_validate_response(response.text)

        except Exception as e:
            return f"I apologize, but I encountered an error generating a response: {str(e)}. Please try rephrasing your question."

    def _format_context_for_prompt(self, context: List[Dict[str, Any]]) -> str:
        """Format context chunks for optimal prompt engineering"""
        formatted_sections = []

        for idx, chunk in enumerate(context, 1):
            source = chunk['metadata'].get('source', 'Company Document')
            content = chunk['content']

            formatted_sections.append(
                f"[Document {idx}: {source}]\n{content}\n"
            )

        return "\n".join(formatted_sections)
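
    # Illustrative output of _format_context_for_prompt (source names hypothetical):
    #
    #   [Document 1: employee_handbook.pdf]
    #   Employees accrue 1.5 vacation days per month...
    #
    #   [Document 2: leave_policy.pdf]
    #   Requests must be submitted at least two weeks in advance...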

    def _build_contextual_prompt(self, query: str, context_text: str) -> str:
        """Build optimized prompt for Gemini API"""
        system_context = self.config.get_hr_context_prompt()

        return f"""{system_context}

COMPANY DOCUMENT CONTEXT:
{context_text}

USER QUESTION: {query}

RESPONSE GUIDELINES:
- Answer based ONLY on the provided company documents
- Be specific and reference relevant policies
- If information is incomplete, state what's available and suggest contacting HR
- Maintain professional, helpful tone
- Provide actionable guidance when possible

RESPONSE:"""

    def _format_and_validate_response(self, response_text: str) -> str:
        """Format and validate AI response for optimal user experience"""
        if not response_text or len(response_text.strip()) < 10:
            return "I apologize, but I couldn't generate a meaningful response. Please try rephrasing your question."

        # Enhanced text formatting
        formatted_response = self._enhance_response_formatting(response_text.strip())

        # Add contextual footer if response is substantial
        if len(formatted_response) > 150:
            formatted_response += "\n\n*For additional assistance, please contact the HR department.*"

        return formatted_response

    def _enhance_response_formatting(self, text: str) -> str:
        """Apply intelligent formatting enhancements"""
        # Remove AI response artifacts
        cleaned = text.replace("Based on the provided documents,", "")
        cleaned = cleaned.replace("According to the company policies,", "")

        # Ensure proper sentence spacing
        sentences = cleaned.split('. ')
        properly_spaced = '. '.join(sentence.strip() for sentence in sentences if sentence.strip())

        return properly_spaced

    def _is_hr_related_query(self, query: str) -> bool:
        """Keyword-based HR query classification using plain substring matching"""
        hr_indicators = [
            'policy', 'leave', 'vacation', 'sick', 'holiday', 'benefit', 'insurance',
            'salary', 'compensation', 'promotion', 'performance', 'review', 'training',
            'onboarding', 'handbook', 'procedure', 'guideline', 'hr', 'human resources',
            'employee', 'staff', 'team', 'department', 'work', 'job', 'role',
            'resignation', 'termination', 'disciplinary', 'conduct', 'harassment'
        ]

        query_lower = query.lower()
        return any(indicator in query_lower for indicator in hr_indicators)
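
    # Behaviour of the substring heuristic (hypothetical queries):
    #   "How many vacation days do I get?" -> True  ('vacation')
    #   "Tell me a joke"                   -> False (no keyword hit)
    #   "Can you fix my network?"          -> True  ('work' matches inside 'network')
    # Broad keywords like 'work' or 'hr' can therefore produce false positives.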

    def _get_scope_redirect_message(self) -> str:
        """Get polite redirect message for non-HR queries"""
        return ("I'm specifically designed to assist with BLUESCARF AI HR-related questions "
                "using our company policies and documents. Please ask me about company "
                "policies, benefits, leave procedures, or other HR matters.")

    def _get_no_context_message(self) -> str:
        """Get message when no relevant context is found"""
        return ("I couldn't find relevant information in our company documents for your "
                "question. Please contact HR directly for assistance, or try rephrasing "
                "your question using different terms.")

    def _clear_chat_session(self):
        """Clear chat session with proper state reset"""
        st.session_state.messages = []
        st.session_state.input_processed = False
        st.session_state.last_input = ""
        st.success("🗑️ Chat history cleared!")
        st.rerun()

    def _export_chat_history(self):
        """Export chat history for user reference"""
        if not st.session_state.messages:
            st.warning("No chat history to export.")
            return

        # Create exportable format
        export_content = "BLUESCARF AI HR Assistant - Chat Export\n"
        export_content += f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"

        for message in st.session_state.messages:
            role = "You" if message["role"] == "user" else "HR Assistant"
            timestamp = datetime.fromtimestamp(message["timestamp"]).strftime('%H:%M:%S')
            export_content += f"[{timestamp}] {role}: {message['content']}\n\n"

        st.download_button(
            label="📥 Download Chat History",
            data=export_content,
            file_name=f"hr_chat_export_{int(time.time())}.txt",
            mime="text/plain"
        )

    def _generate_message_id(self) -> str:
        """Generate unique message identifier"""
        return f"msg_{int(time.time() * 1000)}_{len(st.session_state.messages)}"

    def _log_successful_interaction(self, query: str, response: str):
        """Log successful interaction for analytics"""
        try:
            log_interaction(query, response, {
                'success': True,
                'response_length': len(response),
                'session_messages': len(st.session_state.messages)
            })
        except Exception:
            pass  # Silent fail for logging

    def _log_error_interaction(self, query: str, error: str):
        """Log error interaction for debugging"""
        try:
            log_interaction(query, f"ERROR: {error}", {
                'success': False,
                'error_type': 'processing_error',
                'session_messages': len(st.session_state.messages)
            })
        except Exception:
            pass  # Silent fail for logging

    def render_admin_section(self):
        """Render admin panel section"""
        st.markdown("---")

        col1, col2 = st.columns([3, 1])

        with col1:
            st.markdown("### 🔧 Administrator Panel")
            st.markdown("*Manage knowledge base and update company documents*")

        with col2:
            if st.button("Admin Access"):
                st.session_state.show_admin = not st.session_state.show_admin

        if st.session_state.show_admin:
            self.admin_panel.render()

    def render_footer(self):
        """Render application footer"""
        st.markdown("""
        <div class="footer">
            <p><strong>BLUESCARF ARTIFICIAL INTELLIGENCE</strong> | HR Assistant v1.0</p>
            <p>Powered by Google Gemini AI | Built with Streamlit</p>
        </div>
        """, unsafe_allow_html=True)

    def run(self):
        """Main application entry point"""
        self.initialize_session_state()
        self.render_header()

        # API Key input
        if not st.session_state.api_key_validated:
            st.markdown("### 🔑 API Configuration")

            with st.form("api_key_form"):
                api_key = st.text_input(
                    "Enter your Google Gemini API Key:",
                    type="password",
                    help="Get your API key from https://makersuite.google.com/app/apikey"
                )

                submitted = st.form_submit_button("Connect", type="primary")

                if submitted and api_key:
                    with st.spinner("Validating API key..."):
                        if self.setup_gemini_api(api_key):
                            st.success("✅ API key validated successfully!")
                            st.rerun()
                        else:
                            st.error("❌ Invalid API key. Please check and try again.")

            # Show knowledge base status
            doc_count = self.vector_store.get_document_count()
            if doc_count > 0:
                st.info(f"📚 Knowledge base contains {doc_count} processed documents")
            else:
                st.warning("⚠️ No documents in knowledge base. Please use admin panel to add company documents.")

        else:
            # Main application interface
            self.render_chat_interface()
            self.render_admin_section()

        self.render_footer()

def main():
    """Application entry point"""
    app = HRAssistant()
    app.run()

if __name__ == "__main__":
    main()
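# To launch locally (assuming the dependencies in requirements.txt are installed):
#   streamlit run app.py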
config.py
ADDED
@@ -0,0 +1,345 @@
import os
from pathlib import Path
from typing import Dict, Any, Optional
import streamlit as st

class Config:
    """
    Centralized configuration management for BLUESCARF AI HR Assistant.
    Provides environment-aware settings with sensible defaults and validation.
    """

    def __init__(self):
        """Initialize configuration with environment-specific optimizations."""
        self._load_environment_config()
        self._validate_configuration()

    def _load_environment_config(self):
        """Load configuration from environment variables with intelligent defaults."""

        # === Core Application Settings ===
        self.APP_NAME = "BLUESCARF AI HR Assistant"
        self.APP_VERSION = "1.0.0"
        self.COMPANY_NAME = "BLUESCARF ARTIFICIAL INTELLIGENCE"

        # === Document Processing Configuration ===
        # Optimal chunk size for semantic coherence (384-512 tokens typical)
        self.CHUNK_SIZE = int(os.getenv('CHUNK_SIZE', 1000))

        # Overlap for context continuity (10-20% of chunk size)
        self.CHUNK_OVERLAP = int(os.getenv('CHUNK_OVERLAP', 200))

        # Minimum viable chunk size to filter noise
        self.MIN_CHUNK_SIZE = int(os.getenv('MIN_CHUNK_SIZE', 100))

        # Maximum file size (50MB default for enterprise documents)
        self.MAX_FILE_SIZE = int(os.getenv('MAX_FILE_SIZE', 50 * 1024 * 1024))

        # === Vector Store Configuration ===
        # Persistent storage path with environment fallback
        default_db_path = Path("vector_db")
        self.VECTOR_DB_PATH = Path(os.getenv('VECTOR_DB_PATH', default_db_path))

        # Maximum context chunks for retrieval (balance between context and noise)
        self.MAX_CONTEXT_CHUNKS = int(os.getenv('MAX_CONTEXT_CHUNKS', 5))

        # Similarity search parameters
        self.SIMILARITY_THRESHOLD = float(os.getenv('SIMILARITY_THRESHOLD', 0.5))
        self.MAX_SEARCH_RESULTS = int(os.getenv('MAX_SEARCH_RESULTS', 10))

        # === API Configuration ===
        # Gemini model selection (optimized for reasoning and context)
        self.GEMINI_MODEL = os.getenv('GEMINI_MODEL', 'gemini-pro')

        # Response generation parameters
        self.MAX_RESPONSE_TOKENS = int(os.getenv('MAX_RESPONSE_TOKENS', 1024))
        self.TEMPERATURE = float(os.getenv('TEMPERATURE', 0.3))  # Conservative for factual responses

        # API rate limiting and retry configuration
        self.API_RETRY_ATTEMPTS = int(os.getenv('API_RETRY_ATTEMPTS', 3))
        self.API_TIMEOUT_SECONDS = int(os.getenv('API_TIMEOUT_SECONDS', 30))

        # === Security Configuration ===
        # Session and authentication settings
        self.SESSION_TIMEOUT_HOURS = int(os.getenv('SESSION_TIMEOUT_HOURS', 8))
        self.ADMIN_SESSION_TIMEOUT_HOURS = int(os.getenv('ADMIN_SESSION_TIMEOUT_HOURS', 2))

        # === Logging and Monitoring ===
        # Application logging configuration
        self.LOG_LEVEL = os.getenv('LOG_LEVEL', 'INFO')
        self.LOG_FILE_PATH = Path(os.getenv('LOG_FILE_PATH', 'logs/hr_assistant.log'))
        self.ENABLE_INTERACTION_LOGGING = os.getenv('ENABLE_INTERACTION_LOGGING', 'true').lower() == 'true'

        # === Performance Optimization ===
        # Embedding model caching and batch processing
        self.EMBEDDING_BATCH_SIZE = int(os.getenv('EMBEDDING_BATCH_SIZE', 32))
        self.ENABLE_MODEL_CACHING = os.getenv('ENABLE_MODEL_CACHING', 'true').lower() == 'true'

        # Streamlit performance settings
        self.STREAMLIT_THEME = os.getenv('STREAMLIT_THEME', 'light')
        self.ENABLE_CACHING = os.getenv('ENABLE_CACHING', 'true').lower() == 'true'

        # === Deployment Configuration ===
        # Environment detection for deployment-specific optimizations
        self.ENVIRONMENT = os.getenv('ENVIRONMENT', 'development')
        self.IS_PRODUCTION = self.ENVIRONMENT.lower() == 'production'
        self.IS_HUGGINGFACE = os.getenv('SPACE_ID') is not None

        # Resource limits for cloud deployment
        if self.IS_HUGGINGFACE:
            self._apply_huggingface_optimizations()

    def _apply_huggingface_optimizations(self):
        """Apply Hugging Face Spaces specific optimizations."""
        # Reduce memory footprint for cloud deployment
        self.CHUNK_SIZE = min(self.CHUNK_SIZE, 800)
        self.MAX_CONTEXT_CHUNKS = min(self.MAX_CONTEXT_CHUNKS, 4)
        self.EMBEDDING_BATCH_SIZE = min(self.EMBEDDING_BATCH_SIZE, 16)
        self.MAX_FILE_SIZE = min(self.MAX_FILE_SIZE, 25 * 1024 * 1024)  # 25MB limit

        # Optimize for limited computational resources
        self.ENABLE_MODEL_CACHING = True
        self.API_TIMEOUT_SECONDS = 60  # More lenient timeout for cloud

    def _validate_configuration(self):
        """Validate configuration parameters and ensure system compatibility."""
        validation_errors = []

        # Validate numeric ranges
        if self.CHUNK_SIZE < 100 or self.CHUNK_SIZE > 2000:
            validation_errors.append("CHUNK_SIZE must be between 100 and 2000")

        if self.CHUNK_OVERLAP >= self.CHUNK_SIZE:
            validation_errors.append("CHUNK_OVERLAP must be less than CHUNK_SIZE")

        if self.SIMILARITY_THRESHOLD < 0 or self.SIMILARITY_THRESHOLD > 1:
            validation_errors.append("SIMILARITY_THRESHOLD must be between 0 and 1")

        if self.TEMPERATURE < 0 or self.TEMPERATURE > 1:
            validation_errors.append("TEMPERATURE must be between 0 and 1")

        # Validate paths and create directories
        try:
            self.VECTOR_DB_PATH.mkdir(parents=True, exist_ok=True)
            self.LOG_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            validation_errors.append(f"Cannot create required directories: {str(e)}")

        # Report validation errors
        if validation_errors:
            error_message = "Configuration validation failed:\n" + "\n".join(validation_errors)
            if 'st' in globals():  # streamlit is imported as 'st' above
                st.error(error_message)
            else:
                print(f"ERROR: {error_message}")
            raise ValueError(error_message)

    def get_hr_context_prompt(self) -> str:
        """
        Generate context-aware system prompt for HR assistant interactions.

        Returns:
            Optimized system prompt for Gemini API
        """
        return f"""
You are an intelligent HR Assistant for {self.COMPANY_NAME}.

CORE IDENTITY:
- Professional, helpful, and knowledgeable about company policies
- Exclusively focused on HR-related matters using provided company documents
- Maintain confidentiality and provide accurate, policy-based guidance

RESPONSE GUIDELINES:
1. SCOPE: Only answer questions related to company HR policies, procedures, and benefits
2. SOURCE: Base responses exclusively on provided company documents
3. CLARITY: Provide clear, actionable guidance with specific policy references
4. BOUNDARIES: Politely redirect non-HR questions to appropriate resources
5. ACCURACY: If information isn't in the documents, state this clearly
6. TONE: Professional yet approachable, maintaining company values

STRUCTURED RESPONSE FORMAT:
- Direct answer to the question
- Relevant policy/document references
- Next steps or additional resources if applicable
- Contact information for complex cases requiring human intervention

Remember: You represent {self.COMPANY_NAME} and should reflect our commitment to supporting employees through clear, accurate HR guidance.
"""

    def get_similarity_search_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for vector similarity search.

        Returns:
            Dictionary with search parameters
        """
        return {
            'k': self.MAX_CONTEXT_CHUNKS,
            'similarity_threshold': self.SIMILARITY_THRESHOLD,
            'max_results': self.MAX_SEARCH_RESULTS,
            'include_metadata': True,
            'score_threshold': 0.3,  # Minimum relevance score
            'diversity_penalty': 0.1  # Encourage diverse results
        }

    def get_gemini_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for Gemini API calls.

        Returns:
            Dictionary with API parameters
        """
        return {
            'model': self.GEMINI_MODEL,
            'temperature': self.TEMPERATURE,
            'max_output_tokens': self.MAX_RESPONSE_TOKENS,
            'top_p': 0.8,  # Nucleus sampling for balanced creativity
            'top_k': 40,  # Limit token consideration for consistency
            'stop_sequences': ["Human:", "Assistant:", "---"],
        }
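
    # Illustrative wiring of get_gemini_config() into the google-generativeai SDK
    # (a sketch; 'model' is split out because GenerationConfig does not accept it):
    #   params = Config().get_gemini_config()
    #   model = genai.GenerativeModel(params.pop('model'))
    #   response = model.generate_content(
    #       prompt,
    #       generation_config=genai.types.GenerationConfig(**params),
    #   )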

    def get_document_processing_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for document processing pipeline.

        Returns:
            Dictionary with processing parameters
        """
        return {
            'chunk_size': self.CHUNK_SIZE,
            'chunk_overlap': self.CHUNK_OVERLAP,
            'min_chunk_size': self.MIN_CHUNK_SIZE,
            'max_file_size': self.MAX_FILE_SIZE,
            'embedding_batch_size': self.EMBEDDING_BATCH_SIZE,
            'enable_caching': self.ENABLE_MODEL_CACHING,
            'supported_formats': ['pdf'],
            'content_filters': {
                'min_word_count': 10,
                'max_word_count': 2000,
                'remove_headers_footers': True,
                'normalize_whitespace': True
            }
        }

    def get_streamlit_config(self) -> Dict[str, str]:
        """
        Get Streamlit-specific configuration for optimal UI performance.

        Returns:
            Dictionary with Streamlit settings
        """
        return {
            'page_title': self.APP_NAME,
            'page_icon': '🔷',
            'layout': 'wide',
            'initial_sidebar_state': 'collapsed',
            'menu_items': {
                'Get Help': f'mailto:support@{self.COMPANY_NAME.lower().replace(" ", "")}.com',
                'Report a bug': None,
                'About': f'{self.APP_NAME} v{self.APP_VERSION} - Powered by Google Gemini AI'
            }
        }

    def get_logging_config(self) -> Dict[str, Any]:
        """
        Get comprehensive logging configuration for monitoring and debugging.

        Returns:
            Dictionary with logging parameters
        """
        return {
            'level': self.LOG_LEVEL,
            'file_path': str(self.LOG_FILE_PATH),
            'enable_interaction_logging': self.ENABLE_INTERACTION_LOGGING,
            'log_format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            'max_file_size': 10 * 1024 * 1024,  # 10MB
            'backup_count': 5,
            'console_output': not self.IS_PRODUCTION
        }

    def get_security_config(self) -> Dict[str, Any]:
        """
        Get security configuration for admin access and session management.

        Returns:
            Dictionary with security parameters
        """
        return {
            'session_timeout_hours': self.SESSION_TIMEOUT_HOURS,
            'admin_session_timeout_hours': self.ADMIN_SESSION_TIMEOUT_HOURS,
            'password_min_length': 8,
            'password_complexity_required': self.IS_PRODUCTION,
            'enable_rate_limiting': self.IS_PRODUCTION,
            'max_failed_attempts': 3,
            'lockout_duration_minutes': 15
        }

    def create_environment_file(self, file_path: Optional[str] = None) -> str:
        """
        Generate .env file template with all configuration options.

        Args:
            file_path: Optional path for .env file

        Returns:
            Path to created .env file
        """
        if not file_path:
            file_path = '.env'

        env_content = f"""# {self.APP_NAME} Configuration
# Generated automatically - modify as needed for your deployment

# === Application Settings ===
APP_NAME="{self.APP_NAME}"
APP_VERSION="{self.APP_VERSION}"
COMPANY_NAME="{self.COMPANY_NAME}"
ENVIRONMENT=production

# === Document Processing ===
CHUNK_SIZE={self.CHUNK_SIZE}
CHUNK_OVERLAP={self.CHUNK_OVERLAP}
MIN_CHUNK_SIZE={self.MIN_CHUNK_SIZE}
MAX_FILE_SIZE={self.MAX_FILE_SIZE}

# === Vector Database ===
VECTOR_DB_PATH=./vector_db
MAX_CONTEXT_CHUNKS={self.MAX_CONTEXT_CHUNKS}
SIMILARITY_THRESHOLD={self.SIMILARITY_THRESHOLD}

# === API Configuration ===
GEMINI_MODEL={self.GEMINI_MODEL}
TEMPERATURE={self.TEMPERATURE}
MAX_RESPONSE_TOKENS={self.MAX_RESPONSE_TOKENS}

# === Security ===
SESSION_TIMEOUT_HOURS={self.SESSION_TIMEOUT_HOURS}
ADMIN_SESSION_TIMEOUT_HOURS={self.ADMIN_SESSION_TIMEOUT_HOURS}

# === Logging ===
LOG_LEVEL={self.LOG_LEVEL}
LOG_FILE_PATH=./logs/hr_assistant.log
ENABLE_INTERACTION_LOGGING=true

# === Performance ===
EMBEDDING_BATCH_SIZE={self.EMBEDDING_BATCH_SIZE}
ENABLE_MODEL_CACHING=true
ENABLE_CACHING=true
"""

        try:
            with open(file_path, 'w') as f:
                f.write(env_content)
            return file_path
        except Exception as e:
            if 'st' in globals():  # streamlit is imported as 'st' above
                st.error(f"Failed to create .env file: {str(e)}")
            return ""

    def __str__(self) -> str:
        """String representation for debugging and logging."""
        return f"{self.APP_NAME} Config (Environment: {self.ENVIRONMENT})"

    def __repr__(self) -> str:
        """Developer-friendly representation."""
        return f"Config(app='{self.APP_NAME}', env='{self.ENVIRONMENT}', version='{self.APP_VERSION}')"
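
# Usage sketch (illustrative; Config reads env vars, validates ranges, and
# creates the vector_db/ and logs/ directories on construction):
#   from config import Config
#   cfg = Config()
#   search_params = cfg.get_similarity_search_config()
#   gemini_params = cfg.get_gemini_config()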
docker_compose.yml
ADDED
@@ -0,0 +1,89 @@
# BLUESCARF AI HR Assistant - Docker Compose Configuration
# For local development and production deployment

version: '3.8'

services:
  hr-assistant:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: bluescarf-hr-assistant
    restart: unless-stopped
    ports:
      - "8501:8501"
    environment:
      # Application Configuration
      - ENVIRONMENT=production
      - COMPANY_NAME=BLUESCARF ARTIFICIAL INTELLIGENCE

      # Performance Optimization
      - CHUNK_SIZE=1000
      - MAX_CONTEXT_CHUNKS=5
      - EMBEDDING_BATCH_SIZE=16

      # Security Settings
      - SESSION_TIMEOUT_HOURS=8
      - ADMIN_SESSION_TIMEOUT_HOURS=2

      # Logging
      - LOG_LEVEL=INFO
      - ENABLE_INTERACTION_LOGGING=true
    volumes:
      # Persistent vector database storage
      - vector_db_data:/app/vector_db

      # Persistent logs
      - logs_data:/app/logs

      # Optional: Custom logo (uncomment and provide path)
      # - ./custom_logo.png:/app/logo.png:ro

      # Optional: Custom configuration (uncomment if using)
      # - ./production.env:/app/.env:ro

    # Resource limits for production
    deploy:
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'

    # Health check configuration
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

    # Networking
    networks:
      - hr_assistant_network

# Named volumes for data persistence
volumes:
  vector_db_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: ./data/vector_db

  logs_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: ./data/logs

# Custom network for isolation
networks:
  hr_assistant_network:
    driver: bridge

# Development override (create docker-compose.dev.yml for development)
# To use: docker-compose -f docker-compose.yml -f docker-compose.dev.yml up
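# Note: the bind-mounted volumes above require the host directories to exist
# before startup (e.g. `mkdir -p data/vector_db data/logs`), and the local
# driver may also require absolute `device:` paths on some platforms.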
dockerfile.txt
ADDED
@@ -0,0 +1,68 @@
# BLUESCARF AI HR Assistant - Docker Configuration
# Optimized for production deployment with security and performance

# Use official Python runtime as base image
FROM python:3.9-slim

# Set metadata
LABEL maintainer="BLUESCARF ARTIFICIAL INTELLIGENCE"
LABEL description="RAG-based HR Assistant with Google Gemini AI"
LABEL version="1.0.0"

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    STREAMLIT_SERVER_PORT=8501 \
    STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
    STREAMLIT_SERVER_HEADLESS=true \
    STREAMLIT_BROWSER_GATHER_USAGE_STATS=false

# Create non-root user for security
RUN groupadd -r appuser && useradd -r -g appuser appuser

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create necessary directories with proper permissions
RUN mkdir -p /app/vector_db /app/logs /app/temp && \
    chown -R appuser:appuser /app

# Switch to non-root user
USER appuser

# Health check to ensure the app is running
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8501/_stcore/health || exit 1

# Expose port
EXPOSE 8501

# Set default command
CMD ["streamlit", "run", "app.py", \
     "--server.port=8501", \
     "--server.address=0.0.0.0", \
     "--server.headless=true", \
     "--browser.gatherUsageStats=false", \
     "--theme.primaryColor=#3b82f6", \
     "--theme.backgroundColor=#ffffff", \
     "--theme.secondaryBackgroundColor=#f8fafc"]

# Alternative command for development (uncomment for dev builds)
# CMD ["streamlit", "run", "app.py", "--server.runOnSave=true", "--server.enableCORS=true"]
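# Note: docker_compose.yml builds with `dockerfile: Dockerfile`, so rename this
# file from dockerfile.txt to Dockerfile (or point compose at it explicitly).
# Standalone build/run (illustrative):
#   docker build -t bluescarf-hr-assistant -f Dockerfile .
#   docker run -p 8501:8501 bluescarf-hr-assistant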
document_processor.py
ADDED
@@ -0,0 +1,973 @@
import os
import io
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple, Union
import hashlib
import time
import streamlit as st
from config import Config

class BulletproofDocumentProcessor:
    """
    Bulletproof PDF processor designed for maximum compatibility and reliability.

    This processor implements a multi-strategy extraction approach with intelligent
    fallbacks, avoiding complex dependencies while ensuring robust text extraction
    from diverse PDF formats commonly found in HR documentation.

    Architecture:
    - Primary: Native text extraction using minimal libraries
    - Secondary: Byte-level pattern matching for encoded content
    - Tertiary: Manual content stream parsing for complex PDFs
    - Fallback: User-guided content input for problematic files
    """

    def __init__(self):
        self.config = Config()
        self.embedding_model = self._initialize_embedding_engine()
        self.extraction_stats = {
            'attempts': 0,
            'successes': 0,
            'method_effectiveness': {}
        }

    def _initialize_embedding_engine(self):
        """
        Initialize embedding engine with enhanced error handling and fallback mechanisms.

        This method implements a graceful degradation strategy, ensuring the system
        remains functional even if specific embedding libraries encounter issues.
        """
        try:
            from sentence_transformers import SentenceTransformer

            # Use a more compatible model that's less likely to trigger torch issues
            model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

            # Suppress torch warnings that don't affect functionality
            import warnings
            warnings.filterwarnings("ignore", message=".*torch.classes.*")

            return model

        except Exception as embedding_error:
            st.warning(f"Embedding model initialization issue: {str(embedding_error)}")
            st.info("📌 System will continue with basic functionality. Some features may be limited.")
            return None
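
    # Illustrative use of the embedding engine (sentence-transformers API):
    #   vectors = self.embedding_model.encode(chunks, batch_size=self.config.EMBEDDING_BATCH_SIZE)
    # For 'all-MiniLM-L6-v2' each vector has 384 dimensions; callers must
    # handle the None case when initialization failed.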

    def extract_text_from_pdf(self, pdf_file) -> Optional[str]:
        """
        Bulletproof PDF text extraction using progressive strategy escalation.

        This method implements a sophisticated extraction pipeline that adapts
        to different PDF types and encoding scenarios, ensuring maximum success
        rate across diverse document formats.

        Args:
            pdf_file: PDF file object or path

        Returns:
            Extracted text content or None if all methods fail
        """
        self.extraction_stats['attempts'] += 1

        # Define extraction strategies in order of preference and reliability
        extraction_strategies = [
            ('PyPDF2_Enhanced', self._extract_pypdf2_enhanced),
            ('ByteLevel_Analysis', self._extract_byte_level),
            ('Pattern_Matching', self._extract_pattern_based),
            ('Manual_Parsing', self._extract_manual_streams)
        ]

        # Execute extraction strategies with comprehensive error handling
        for strategy_name, extraction_method in extraction_strategies:
            try:
                st.info(f"🔄 Executing {strategy_name} extraction...")

                # Reset file pointer for each attempt
                self._reset_file_pointer(pdf_file)

                # Execute extraction with timeout protection
                extracted_text = self._execute_with_timeout(
                    extraction_method,
                    pdf_file,
                    timeout_seconds=30
                )

                # Validate extraction quality
                if self._validate_extraction_quality(extracted_text):
                    self._record_success(strategy_name)
                    st.success(f"✅ {strategy_name} extraction successful!")
                    return self._post_process_extracted_text(extracted_text)
                else:
                    st.warning(f"⚠️ {strategy_name} extracted insufficient content")

            except Exception as strategy_error:
                st.warning(f"⚠️ {strategy_name} failed: {str(strategy_error)}")
                self._record_failure(strategy_name, str(strategy_error))
                continue

        # All automated strategies failed - provide comprehensive guidance
        self._handle_extraction_failure(pdf_file)
        return None

    def _extract_pypdf2_enhanced(self, pdf_file) -> str:
        """
        Enhanced PyPDF2 extraction with robust error handling and encoding management.

        This method implements intelligent PDF parsing that handles various
        encoding scenarios and structural anomalies commonly found in HR documents.
        """
        try:
            import PyPDF2

            # Prepare PDF reader with enhanced configuration
            pdf_data = self._read_pdf_data(pdf_file)

            # Create reader with multiple fallback configurations
            reader_configs = [
                {'strict': False, 'password': None},
                {'strict': True, 'password': None},
                {'strict': False, 'password': ''}  # Some PDFs have empty passwords
            ]

            pdf_reader = None
            for config in reader_configs:
                try:
                    pdf_reader = PyPDF2.PdfReader(
                        io.BytesIO(pdf_data),
                        strict=config['strict']
                    )
                    if pdf_reader.is_encrypted and config['password'] is not None:
                        pdf_reader.decrypt(config['password'])
                    break
                except Exception:
                    continue

            if not pdf_reader:
                raise Exception("Could not initialize PDF reader with any configuration")

            # Extract text with page-level error handling
            text_fragments = []
            successful_pages = 0

            for page_index, page in enumerate(pdf_reader.pages):
                try:
                    # Multi-method text extraction per page
                    page_text = self._extract_page_text_robust(page, page_index)

                    if page_text and len(page_text.strip()) > 10:
                        text_fragments.append(f"\n--- Page {page_index + 1} ---\n{page_text}")
                        successful_pages += 1

                except Exception as page_error:
                    # Log page error but continue with other pages
                    st.warning(f"Page {page_index + 1} extraction failed: {str(page_error)}")
                    continue

            if successful_pages == 0:
                raise Exception("No pages yielded readable content")

            return '\n'.join(text_fragments)

        except ImportError:
            raise Exception("PyPDF2 library not available")
        except Exception as e:
            raise Exception(f"PyPDF2 extraction failed: {str(e)}")

    def _extract_page_text_robust(self, page, page_index: int) -> str:
        """
        Robust page-level text extraction with multiple fallback methods.

        This method implements several text extraction approaches for individual
        pages, ensuring maximum content recovery from diverse PDF structures.
        """
        # Primary extraction method
        try:
            text = page.extract_text()
            if text and len(text.strip()) > 10:
                return text
        except Exception:
            pass

        # Secondary extraction: access text objects directly
+
try:
|
| 195 |
+
if hasattr(page, 'get_contents') and page.get_contents():
|
| 196 |
+
content_stream = page.get_contents()
|
| 197 |
+
if hasattr(content_stream, 'get_data'):
|
| 198 |
+
stream_data = content_stream.get_data()
|
| 199 |
+
decoded_stream = stream_data.decode('latin-1', errors='ignore')
|
| 200 |
+
|
| 201 |
+
# Extract text from stream using safe pattern matching
|
| 202 |
+
text = self._extract_from_content_stream(decoded_stream)
|
| 203 |
+
if text and len(text.strip()) > 10:
|
| 204 |
+
return text
|
| 205 |
+
except Exception:
|
| 206 |
+
pass
|
| 207 |
+
|
| 208 |
+
# Tertiary extraction: character mapping approach
|
| 209 |
+
try:
|
| 210 |
+
return self._extract_via_character_mapping(page)
|
| 211 |
+
except Exception:
|
| 212 |
+
pass
|
| 213 |
+
|
| 214 |
+
return ""
|
| 215 |
+
|
| 216 |
+
def _extract_byte_level(self, pdf_file) -> str:
|
| 217 |
+
"""
|
| 218 |
+
Byte-level PDF analysis for extracting text from structurally complex files.
|
| 219 |
+
|
| 220 |
+
This method performs low-level byte analysis to identify and extract
|
| 221 |
+
text content from PDFs that resist standard parsing methods.
|
| 222 |
+
"""
|
| 223 |
+
pdf_data = self._read_pdf_data(pdf_file)
|
| 224 |
+
|
| 225 |
+
# Multi-encoding text extraction strategy
|
| 226 |
+
text_candidates = []
|
| 227 |
+
|
| 228 |
+
# Strategy 1: Latin-1 decoding with pattern extraction
|
| 229 |
+
try:
|
| 230 |
+
decoded_content = pdf_data.decode('latin-1', errors='ignore')
|
| 231 |
+
latin_text = self._extract_text_patterns(decoded_content)
|
| 232 |
+
if latin_text:
|
| 233 |
+
text_candidates.append(('latin-1', latin_text))
|
| 234 |
+
except Exception:
|
| 235 |
+
pass
|
| 236 |
+
|
| 237 |
+
# Strategy 2: UTF-8 decoding with lenient error handling
|
| 238 |
+
try:
|
| 239 |
+
decoded_content = pdf_data.decode('utf-8', errors='ignore')
|
| 240 |
+
utf8_text = self._extract_text_patterns(decoded_content)
|
| 241 |
+
if utf8_text:
|
| 242 |
+
text_candidates.append(('utf-8', utf8_text))
|
| 243 |
+
except Exception:
|
| 244 |
+
pass
|
| 245 |
+
|
| 246 |
+
# Strategy 3: Windows-1252 encoding (common in office documents)
|
| 247 |
+
try:
|
| 248 |
+
decoded_content = pdf_data.decode('cp1252', errors='ignore')
|
| 249 |
+
cp1252_text = self._extract_text_patterns(decoded_content)
|
| 250 |
+
if cp1252_text:
|
| 251 |
+
text_candidates.append(('cp1252', cp1252_text))
|
| 252 |
+
except Exception:
|
| 253 |
+
pass
|
| 254 |
+
|
| 255 |
+
# Select best candidate based on content quality metrics
|
| 256 |
+
if text_candidates:
|
| 257 |
+
best_candidate = max(
|
| 258 |
+
text_candidates,
|
| 259 |
+
key=lambda x: self._calculate_text_quality_score(x[1])
|
| 260 |
+
)
|
| 261 |
+
return best_candidate[1]
|
| 262 |
+
|
| 263 |
+
raise Exception("Byte-level extraction found no readable content")
|
| 264 |
+
|
| 265 |
+
def _extract_text_patterns(self, decoded_content: str) -> str:
|
| 266 |
+
"""
|
| 267 |
+
Extract text using safe pattern matching without complex regex.
|
| 268 |
+
|
| 269 |
+
This method identifies text content using simple string operations,
|
| 270 |
+
avoiding regex compilation issues while maintaining extraction effectiveness.
|
| 271 |
+
"""
|
| 272 |
+
text_fragments = []
|
| 273 |
+
|
| 274 |
+
# Extract content between parentheses (common PDF text marker)
|
| 275 |
+
content_length = len(decoded_content)
|
| 276 |
+
i = 0
|
| 277 |
+
|
| 278 |
+
while i < content_length - 1:
|
| 279 |
+
if decoded_content[i] == '(':
|
| 280 |
+
# Found potential text start
|
| 281 |
+
j = i + 1
|
| 282 |
+
parenthesis_depth = 1
|
| 283 |
+
extracted_fragment = ""
|
| 284 |
+
|
| 285 |
+
# Extract until matching closing parenthesis
|
| 286 |
+
while j < content_length and parenthesis_depth > 0:
|
| 287 |
+
char = decoded_content[j]
|
| 288 |
+
|
| 289 |
+
if char == '(':
|
| 290 |
+
parenthesis_depth += 1
|
| 291 |
+
elif char == ')':
|
| 292 |
+
parenthesis_depth -= 1
|
| 293 |
+
|
| 294 |
+
if parenthesis_depth > 0:
|
| 295 |
+
# Handle escape sequences
|
| 296 |
+
if char == '\\' and j + 1 < content_length:
|
| 297 |
+
next_char = decoded_content[j + 1]
|
| 298 |
+
if next_char in 'ntr\\()':
|
| 299 |
+
escape_map = {'n': '\n', 't': '\t', 'r': '\r', '\\': '\\', '(': '(', ')': ')'}
|
| 300 |
+
extracted_fragment += escape_map.get(next_char, next_char)
|
| 301 |
+
j += 2
|
| 302 |
+
else:
|
| 303 |
+
extracted_fragment += next_char
|
| 304 |
+
j += 2
|
| 305 |
+
else:
|
| 306 |
+
extracted_fragment += char
|
| 307 |
+
j += 1
|
| 308 |
+
else:
|
| 309 |
+
j += 1
|
| 310 |
+
|
| 311 |
+
# Process extracted fragment
|
| 312 |
+
cleaned_fragment = self._clean_text_fragment(extracted_fragment)
|
| 313 |
+
if self._is_meaningful_text(cleaned_fragment):
|
| 314 |
+
text_fragments.append(cleaned_fragment)
|
| 315 |
+
|
| 316 |
+
i = j
|
| 317 |
+
else:
|
| 318 |
+
i += 1
|
| 319 |
+
|
| 320 |
+
return ' '.join(text_fragments) if text_fragments else ""
|
| 321 |
+
|
| 322 |
+
def _extract_pattern_based(self, pdf_file) -> str:
|
| 323 |
+
"""
|
| 324 |
+
Pattern-based extraction for identifying text in various PDF structures.
|
| 325 |
+
|
| 326 |
+
This method uses content structure analysis to locate and extract
|
| 327 |
+
text from PDFs with non-standard formatting or encoding.
|
| 328 |
+
"""
|
| 329 |
+
pdf_data = self._read_pdf_data(pdf_file)
|
| 330 |
+
decoded_content = pdf_data.decode('latin-1', errors='ignore')
|
| 331 |
+
|
| 332 |
+
# Define text extraction patterns (using simple string operations)
|
| 333 |
+
extraction_patterns = [
|
| 334 |
+
self._extract_bt_et_blocks, # Text objects between BT/ET markers
|
| 335 |
+
self._extract_tj_operations, # Text show operations
|
| 336 |
+
self._extract_font_encoded_text, # Font-encoded text content
|
| 337 |
+
self._extract_stream_objects # Direct stream object analysis
|
| 338 |
+
]
|
| 339 |
+
|
| 340 |
+
best_extraction = ""
|
| 341 |
+
best_quality_score = 0
|
| 342 |
+
|
| 343 |
+
for pattern_extractor in extraction_patterns:
|
| 344 |
+
try:
|
| 345 |
+
extracted_text = pattern_extractor(decoded_content)
|
| 346 |
+
quality_score = self._calculate_text_quality_score(extracted_text)
|
| 347 |
+
|
| 348 |
+
if quality_score > best_quality_score:
|
| 349 |
+
best_extraction = extracted_text
|
| 350 |
+
best_quality_score = quality_score
|
| 351 |
+
|
| 352 |
+
except Exception as pattern_error:
|
| 353 |
+
st.warning(f"Pattern extraction method failed: {str(pattern_error)}")
|
| 354 |
+
continue
|
| 355 |
+
|
| 356 |
+
if best_quality_score > 0.3: # Minimum quality threshold
|
| 357 |
+
return best_extraction
|
| 358 |
+
|
| 359 |
+
raise Exception("Pattern-based extraction found no high-quality content")
|
| 360 |
+
|
| 361 |
+
def _extract_bt_et_blocks(self, content: str) -> str:
|
| 362 |
+
"""Extract text from BT/ET (Begin Text/End Text) blocks."""
|
| 363 |
+
text_blocks = []
|
| 364 |
+
|
| 365 |
+
# Find BT/ET pairs using simple string searching
|
| 366 |
+
bt_positions = []
|
| 367 |
+
et_positions = []
|
| 368 |
+
|
| 369 |
+
search_pos = 0
|
| 370 |
+
while True:
|
| 371 |
+
bt_pos = content.find('BT\n', search_pos)
|
| 372 |
+
if bt_pos == -1:
|
| 373 |
+
bt_pos = content.find('BT ', search_pos)
|
| 374 |
+
if bt_pos == -1:
|
| 375 |
+
break
|
| 376 |
+
bt_positions.append(bt_pos)
|
| 377 |
+
search_pos = bt_pos + 1
|
| 378 |
+
|
| 379 |
+
search_pos = 0
|
| 380 |
+
while True:
|
| 381 |
+
et_pos = content.find('ET\n', search_pos)
|
| 382 |
+
if et_pos == -1:
|
| 383 |
+
et_pos = content.find('ET ', search_pos)
|
| 384 |
+
if et_pos == -1:
|
| 385 |
+
break
|
| 386 |
+
et_positions.append(et_pos)
|
| 387 |
+
search_pos = et_pos + 1
|
| 388 |
+
|
| 389 |
+
# Match BT/ET pairs and extract content
|
| 390 |
+
for bt_pos in bt_positions:
|
| 391 |
+
# Find corresponding ET
|
| 392 |
+
matching_et = None
|
| 393 |
+
for et_pos in et_positions:
|
| 394 |
+
if et_pos > bt_pos:
|
| 395 |
+
matching_et = et_pos
|
| 396 |
+
break
|
| 397 |
+
|
| 398 |
+
if matching_et:
|
| 399 |
+
block_content = content[bt_pos:matching_et]
|
| 400 |
+
block_text = self._extract_text_from_block(block_content)
|
| 401 |
+
if block_text:
|
| 402 |
+
text_blocks.append(block_text)
|
| 403 |
+
|
| 404 |
+
return ' '.join(text_blocks)
|
| 405 |
+
|
| 406 |
+
def _extract_manual_streams(self, pdf_file) -> str:
|
| 407 |
+
"""
|
| 408 |
+
Manual PDF stream parsing for maximum compatibility.
|
| 409 |
+
|
| 410 |
+
This method implements a custom PDF parser that handles edge cases
|
| 411 |
+
and structural variations that standard libraries might miss.
|
| 412 |
+
"""
|
| 413 |
+
pdf_data = self._read_pdf_data(pdf_file)
|
| 414 |
+
|
| 415 |
+
# Identify and extract content streams
|
| 416 |
+
stream_markers = [b'stream\n', b'stream\r\n', b'stream\r']
|
| 417 |
+
endstream_markers = [b'endstream', b'\nendstream', b'\rendstream']
|
| 418 |
+
|
| 419 |
+
extracted_streams = []
|
| 420 |
+
|
| 421 |
+
for stream_marker in stream_markers:
|
| 422 |
+
start_pos = 0
|
| 423 |
+
while True:
|
| 424 |
+
stream_start = pdf_data.find(stream_marker, start_pos)
|
| 425 |
+
if stream_start == -1:
|
| 426 |
+
break
|
| 427 |
+
|
| 428 |
+
# Find corresponding endstream
|
| 429 |
+
content_start = stream_start + len(stream_marker)
|
| 430 |
+
stream_end = pdf_data.find(b'endstream', content_start)
|
| 431 |
+
|
| 432 |
+
if stream_end != -1:
|
| 433 |
+
stream_content = pdf_data[content_start:stream_end]
|
| 434 |
+
|
| 435 |
+
# Attempt to decompress if needed
|
| 436 |
+
decompressed_content = self._attempt_decompression(stream_content)
|
| 437 |
+
|
| 438 |
+
# Extract text from stream
|
| 439 |
+
stream_text = self._extract_text_from_stream(decompressed_content)
|
| 440 |
+
if stream_text:
|
| 441 |
+
extracted_streams.append(stream_text)
|
| 442 |
+
|
| 443 |
+
start_pos = stream_end + 1 if stream_end != -1 else stream_start + 1
|
| 444 |
+
|
| 445 |
+
combined_text = ' '.join(extracted_streams)
|
| 446 |
+
if len(combined_text.strip()) > 50:
|
| 447 |
+
return combined_text
|
| 448 |
+
|
| 449 |
+
raise Exception("Manual stream parsing found insufficient content")
|
| 450 |
+
|
| 451 |
+
def _attempt_decompression(self, stream_content: bytes) -> bytes:
|
| 452 |
+
"""Attempt to decompress PDF stream content if compressed."""
|
| 453 |
+
try:
|
| 454 |
+
import zlib
|
| 455 |
+
return zlib.decompress(stream_content)
|
| 456 |
+
except:
|
| 457 |
+
try:
|
| 458 |
+
import gzip
|
| 459 |
+
return gzip.decompress(stream_content)
|
| 460 |
+
except:
|
| 461 |
+
return stream_content # Return as-is if decompression fails
|
| 462 |
+
|
| 463 |
+
def _extract_text_from_stream(self, stream_content: bytes) -> str:
|
| 464 |
+
"""Extract text content from decompressed PDF stream."""
|
| 465 |
+
try:
|
| 466 |
+
decoded_stream = stream_content.decode('latin-1', errors='ignore')
|
| 467 |
+
return self._extract_text_patterns(decoded_stream)
|
| 468 |
+
except:
|
| 469 |
+
return ""
|
| 470 |
+
|
| 471 |
+
# Utility methods for robust extraction
|
| 472 |
+
|
| 473 |
+
def _read_pdf_data(self, pdf_file) -> bytes:
|
| 474 |
+
"""Safely read PDF data from various input types."""
|
| 475 |
+
if hasattr(pdf_file, 'read'):
|
| 476 |
+
pdf_file.seek(0)
|
| 477 |
+
data = pdf_file.read()
|
| 478 |
+
pdf_file.seek(0)
|
| 479 |
+
return data
|
| 480 |
+
else:
|
| 481 |
+
with open(pdf_file, 'rb') as f:
|
| 482 |
+
return f.read()
|
| 483 |
+
|
| 484 |
+
def _reset_file_pointer(self, pdf_file) -> None:
|
| 485 |
+
"""Reset file pointer if the file object supports it."""
|
| 486 |
+
if hasattr(pdf_file, 'seek'):
|
| 487 |
+
pdf_file.seek(0)
|
| 488 |
+
|
| 489 |
+
def _clean_text_fragment(self, fragment: str) -> str:
|
| 490 |
+
"""Clean individual text fragments for better readability."""
|
| 491 |
+
if not fragment:
|
| 492 |
+
return ""
|
| 493 |
+
|
| 494 |
+
# Remove non-printable characters
|
| 495 |
+
printable_chars = []
|
| 496 |
+
for char in fragment:
|
| 497 |
+
if 32 <= ord(char) <= 126 or char in '\n\r\t':
|
| 498 |
+
printable_chars.append(char)
|
| 499 |
+
elif ord(char) > 126: # Allow extended characters
|
| 500 |
+
printable_chars.append(char)
|
| 501 |
+
else:
|
| 502 |
+
printable_chars.append(' ')
|
| 503 |
+
|
| 504 |
+
cleaned = ''.join(printable_chars)
|
| 505 |
+
|
| 506 |
+
# Normalize whitespace
|
| 507 |
+
words = cleaned.split()
|
| 508 |
+
return ' '.join(words) if words else ""
|
| 509 |
+
|
| 510 |
+
def _is_meaningful_text(self, text: str) -> bool:
|
| 511 |
+
"""Determine if extracted text contains meaningful content."""
|
| 512 |
+
if not text or len(text.strip()) < 3:
|
| 513 |
+
return False
|
| 514 |
+
|
| 515 |
+
# Check for reasonable character distribution
|
| 516 |
+
alphanumeric_count = sum(1 for c in text if c.isalnum())
|
| 517 |
+
total_chars = len(text.replace(' ', ''))
|
| 518 |
+
|
| 519 |
+
if total_chars == 0:
|
| 520 |
+
return False
|
| 521 |
+
|
| 522 |
+
alphanumeric_ratio = alphanumeric_count / total_chars
|
| 523 |
+
return alphanumeric_ratio > 0.3 # At least 30% alphanumeric
|
| 524 |
+
|
| 525 |
+
def _calculate_text_quality_score(self, text: str) -> float:
|
| 526 |
+
"""Calculate quality score for extracted text."""
|
| 527 |
+
if not text:
|
| 528 |
+
return 0.0
|
| 529 |
+
|
| 530 |
+
# Factors contributing to quality score
|
| 531 |
+
length_score = min(len(text) / 1000, 1.0) # Longer text generally better
|
| 532 |
+
word_count = len(text.split())
|
| 533 |
+
word_score = min(word_count / 100, 1.0) # More words generally better
|
| 534 |
+
|
| 535 |
+
# Check for common HR terms (bonus points)
|
| 536 |
+
hr_terms = ['policy', 'employee', 'company', 'benefit', 'leave', 'work', 'staff']
|
| 537 |
+
hr_term_count = sum(1 for term in hr_terms if term.lower() in text.lower())
|
| 538 |
+
hr_bonus = min(hr_term_count * 0.1, 0.3)
|
| 539 |
+
|
| 540 |
+
# Penalty for excessive repetition
|
| 541 |
+
unique_words = len(set(text.lower().split()))
|
| 542 |
+
repetition_penalty = max(0, (word_count - unique_words * 2) / word_count) if word_count > 0 else 0
|
| 543 |
+
|
| 544 |
+
quality_score = (length_score * 0.3 + word_score * 0.4 + hr_bonus) * (1 - repetition_penalty)
|
| 545 |
+
return min(quality_score, 1.0)
|
| 546 |
+
|
| 547 |
+
def _validate_extraction_quality(self, text: str) -> bool:
|
| 548 |
+
"""Validate that extracted text meets minimum quality standards."""
|
| 549 |
+
if not text or len(text.strip()) < 100:
|
| 550 |
+
return False
|
| 551 |
+
|
| 552 |
+
quality_score = self._calculate_text_quality_score(text)
|
| 553 |
+
return quality_score > 0.3
|
| 554 |
+
|
| 555 |
+
def _post_process_extracted_text(self, text: str) -> str:
|
| 556 |
+
"""Post-process extracted text for optimal readability."""
|
| 557 |
+
if not text:
|
| 558 |
+
return ""
|
| 559 |
+
|
| 560 |
+
# Normalize line breaks and spacing
|
| 561 |
+
lines = text.split('\n')
|
| 562 |
+
processed_lines = []
|
| 563 |
+
|
| 564 |
+
for line in lines:
|
| 565 |
+
line = line.strip()
|
| 566 |
+
if line and not line.startswith('---'): # Remove page markers
|
| 567 |
+
processed_lines.append(line)
|
| 568 |
+
|
| 569 |
+
# Join lines with appropriate spacing
|
| 570 |
+
result = '\n'.join(processed_lines)
|
| 571 |
+
|
| 572 |
+
# Final cleanup
|
| 573 |
+
while '\n\n\n' in result:
|
| 574 |
+
result = result.replace('\n\n\n', '\n\n')
|
| 575 |
+
|
| 576 |
+
return result.strip()
|
| 577 |
+
|
| 578 |
+
def _execute_with_timeout(self, func, *args, timeout_seconds: int = 30):
|
| 579 |
+
"""Execute function with timeout protection."""
|
| 580 |
+
# Simplified timeout implementation for basic protection
|
| 581 |
+
start_time = time.time()
|
| 582 |
+
try:
|
| 583 |
+
result = func(*args)
|
| 584 |
+
elapsed = time.time() - start_time
|
| 585 |
+
if elapsed > timeout_seconds:
|
| 586 |
+
st.warning(f"Operation took {elapsed:.1f}s (longer than expected)")
|
| 587 |
+
return result
|
| 588 |
+
except Exception as e:
|
| 589 |
+
elapsed = time.time() - start_time
|
| 590 |
+
if elapsed > timeout_seconds:
|
| 591 |
+
raise Exception(f"Operation timed out after {elapsed:.1f}s")
|
| 592 |
+
raise e
|
| 593 |
+
|
| 594 |
+
def _record_success(self, method: str):
|
| 595 |
+
"""Record successful extraction for analytics."""
|
| 596 |
+
self.extraction_stats['successes'] += 1
|
| 597 |
+
if method not in self.extraction_stats['method_effectiveness']:
|
| 598 |
+
self.extraction_stats['method_effectiveness'][method] = {'success': 0, 'total': 0}
|
| 599 |
+
self.extraction_stats['method_effectiveness'][method]['success'] += 1
|
| 600 |
+
self.extraction_stats['method_effectiveness'][method]['total'] += 1
|
| 601 |
+
|
| 602 |
+
def _record_failure(self, method: str, error: str):
|
| 603 |
+
"""Record failed extraction for analytics."""
|
| 604 |
+
if method not in self.extraction_stats['method_effectiveness']:
|
| 605 |
+
self.extraction_stats['method_effectiveness'][method] = {'success': 0, 'total': 0}
|
| 606 |
+
self.extraction_stats['method_effectiveness'][method]['total'] += 1
|
| 607 |
+
|
| 608 |
+
def _handle_extraction_failure(self, pdf_file):
|
| 609 |
+
"""Provide comprehensive guidance when all extraction methods fail."""
|
| 610 |
+
st.error("❌ All extraction methods failed. Comprehensive PDF analysis:")
|
| 611 |
+
|
| 612 |
+
# Analyze PDF structure for specific guidance
|
| 613 |
+
analysis_results = self._analyze_pdf_structure(pdf_file)
|
| 614 |
+
|
| 615 |
+
col1, col2 = st.columns(2)
|
| 616 |
+
|
| 617 |
+
with col1:
|
| 618 |
+
st.markdown("**📊 PDF Analysis Results:**")
|
| 619 |
+
for key, value in analysis_results.items():
|
| 620 |
+
st.write(f"• **{key}:** {value}")
|
| 621 |
+
|
| 622 |
+
with col2:
|
| 623 |
+
st.markdown("**🛠️ Recommended Solutions:**")
|
| 624 |
+
solutions = self._generate_specific_solutions(analysis_results)
|
| 625 |
+
for solution in solutions:
|
| 626 |
+
st.write(f"• {solution}")
|
| 627 |
+
|
| 628 |
+
# Provide manual input option as last resort
|
| 629 |
+
self._offer_manual_input_option()
|
| 630 |
+
|
| 631 |
+
def _analyze_pdf_structure(self, pdf_file) -> Dict[str, str]:
|
| 632 |
+
"""Analyze PDF structure to provide specific guidance."""
|
| 633 |
+
analysis = {}
|
| 634 |
+
|
| 635 |
+
try:
|
| 636 |
+
pdf_data = self._read_pdf_data(pdf_file)
|
| 637 |
+
|
| 638 |
+
# Basic file analysis
|
| 639 |
+
analysis['File Size'] = f"{len(pdf_data) / 1024:.1f} KB"
|
| 640 |
+
analysis['PDF Version'] = self._detect_pdf_version(pdf_data)
|
| 641 |
+
analysis['Encryption'] = 'Yes' if b'/Encrypt' in pdf_data else 'No'
|
| 642 |
+
analysis['Images Present'] = 'Yes' if b'/Image' in pdf_data else 'No'
|
| 643 |
+
analysis['Fonts Present'] = 'Yes' if b'/Font' in pdf_data else 'No'
|
| 644 |
+
analysis['Text Objects'] = str(pdf_data.count(b'BT'))
|
| 645 |
+
|
| 646 |
+
# Content type detection
|
| 647 |
+
if pdf_data.count(b'BT') == 0 and b'/Image' in pdf_data:
|
| 648 |
+
analysis['Content Type'] = 'Likely scanned/image-based'
|
| 649 |
+
elif pdf_data.count(b'BT') > 0:
|
| 650 |
+
analysis['Content Type'] = 'Text-based'
|
| 651 |
+
else:
|
| 652 |
+
analysis['Content Type'] = 'Unknown/Complex'
|
| 653 |
+
|
| 654 |
+
except Exception as e:
|
| 655 |
+
analysis['Analysis Error'] = str(e)
|
| 656 |
+
|
| 657 |
+
return analysis
|
| 658 |
+
|
| 659 |
+
def _detect_pdf_version(self, pdf_data: bytes) -> str:
|
| 660 |
+
"""Detect PDF version from header."""
|
| 661 |
+
try:
|
| 662 |
+
header = pdf_data[:20].decode('ascii', errors='ignore')
|
| 663 |
+
if '%PDF-' in header:
|
| 664 |
+
version_start = header.find('%PDF-') + 5
|
| 665 |
+
version = header[version_start:version_start + 3]
|
| 666 |
+
return version
|
| 667 |
+
except:
|
| 668 |
+
pass
|
| 669 |
+
return 'Unknown'
|
| 670 |
+
|
| 671 |
+
def _generate_specific_solutions(self, analysis: Dict[str, str]) -> List[str]:
|
| 672 |
+
"""Generate specific solutions based on PDF analysis."""
|
| 673 |
+
solutions = []
|
| 674 |
+
|
| 675 |
+
content_type = analysis.get('Content Type', '')
|
| 676 |
+
encryption = analysis.get('Encryption', '')
|
| 677 |
+
|
| 678 |
+
if 'scanned' in content_type.lower() or 'image' in content_type.lower():
|
| 679 |
+
solutions.extend([
|
| 680 |
+
"PDF appears to be scanned - use OCR software to convert to text",
|
| 681 |
+
"Try Adobe Acrobat's 'Recognize Text' feature",
|
| 682 |
+
"Consider re-creating document from original source"
|
| 683 |
+
])
|
| 684 |
+
|
| 685 |
+
if encryption == 'Yes':
|
| 686 |
+
solutions.append("Remove password protection before uploading")
|
| 687 |
+
|
| 688 |
+
if analysis.get('Text Objects', '0') == '0':
|
| 689 |
+
solutions.extend([
|
| 690 |
+
"No text objects found - likely image-based content",
|
| 691 |
+
"Export from original application (Word, Google Docs) as PDF"
|
| 692 |
+
])
|
| 693 |
+
|
| 694 |
+
# Universal solutions
|
| 695 |
+
solutions.extend([
|
| 696 |
+
"Try 'Print to PDF' from any PDF viewer",
|
| 697 |
+
"Use online PDF converter to optimize format",
|
| 698 |
+
"Contact IT support for complex document conversion"
|
| 699 |
+
])
|
| 700 |
+
|
| 701 |
+
return solutions
|
| 702 |
+
|
| 703 |
+
def _offer_manual_input_option(self):
|
| 704 |
+
"""Offer manual text input as last resort."""
|
| 705 |
+
with st.expander("🖊️ Manual Text Input (Last Resort)", expanded=False):
|
| 706 |
+
st.markdown("""
|
| 707 |
+
If automatic extraction fails, you can manually input key policy content:
|
| 708 |
+
""")
|
| 709 |
+
|
| 710 |
+
manual_text = st.text_area(
|
| 711 |
+
"Paste policy text here:",
|
| 712 |
+
height=200,
|
| 713 |
+
placeholder="Copy and paste the key content from your PDF here..."
|
| 714 |
+
)
|
| 715 |
+
|
| 716 |
+
if st.button("📝 Process Manual Input") and manual_text:
|
| 717 |
+
if len(manual_text.strip()) > 100:
|
| 718 |
+
st.success("✅ Manual input received! Processing...")
|
| 719 |
+
return manual_text.strip()
|
| 720 |
+
else:
|
| 721 |
+
st.warning("Please provide more substantial content (at least 100 characters)")
|
| 722 |
+
|
| 723 |
+
return None
|
| 724 |
+
|
| 725 |
+
# Required interface methods for compatibility
|
| 726 |
+
|
| 727 |
+
def create_intelligent_chunks(self, text: str, metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
|
| 728 |
+
"""Create optimized text chunks for vector storage."""
|
| 729 |
+
if not text or len(text.strip()) < 50:
|
| 730 |
+
return []
|
| 731 |
+
|
| 732 |
+
chunks = []
|
| 733 |
+
chunk_size = self.config.CHUNK_SIZE
|
| 734 |
+
overlap = self.config.CHUNK_OVERLAP
|
| 735 |
+
|
| 736 |
+
# Intelligent sentence-based chunking
|
| 737 |
+
sentences = self._split_into_sentences_robust(text)
|
| 738 |
+
|
| 739 |
+
current_chunk = ""
|
| 740 |
+
chunk_index = 0
|
| 741 |
+
|
| 742 |
+
for sentence in sentences:
|
| 743 |
+
potential_chunk = f"{current_chunk} {sentence}".strip() if current_chunk else sentence
|
| 744 |
+
|
| 745 |
+
if len(potential_chunk) <= chunk_size:
|
| 746 |
+
current_chunk = potential_chunk
|
| 747 |
+
else:
|
| 748 |
+
# Save current chunk if meaningful
|
| 749 |
+
if current_chunk and len(current_chunk.strip()) >= 100:
|
| 750 |
+
chunks.append({
|
| 751 |
+
'content': current_chunk.strip(),
|
| 752 |
+
'metadata': {
|
| 753 |
+
**metadata,
|
| 754 |
+
'chunk_type': 'intelligent_semantic',
|
| 755 |
+
'chunk_index': chunk_index,
|
| 756 |
+
'extraction_method': 'bulletproof_processor'
|
| 757 |
+
}
|
| 758 |
+
})
|
| 759 |
+
chunk_index += 1
|
| 760 |
+
|
| 761 |
+
# Start new chunk with smart overlap
|
| 762 |
+
if overlap > 0 and current_chunk:
|
| 763 |
+
words = current_chunk.split()
|
| 764 |
+
overlap_words = words[-overlap:] if len(words) > overlap else words
|
| 765 |
+
current_chunk = " ".join(overlap_words) + " " + sentence
|
| 766 |
+
else:
|
| 767 |
+
current_chunk = sentence
|
| 768 |
+
|
| 769 |
+
# Process final chunk
|
| 770 |
+
if current_chunk and len(current_chunk.strip()) >= 100:
|
| 771 |
+
chunks.append({
|
| 772 |
+
'content': current_chunk.strip(),
|
| 773 |
+
'metadata': {
|
| 774 |
+
**metadata,
|
| 775 |
+
'chunk_type': 'intelligent_semantic',
|
| 776 |
+
'chunk_index': chunk_index,
|
| 777 |
+
'extraction_method': 'bulletproof_processor'
|
| 778 |
+
}
|
| 779 |
+
})
|
| 780 |
+
|
| 781 |
+
return chunks
|
| 782 |
+
|
| 783 |
+
def _split_into_sentences_robust(self, text: str) -> List[str]:
|
| 784 |
+
"""Robust sentence splitting optimized for HR documents."""
|
| 785 |
+
sentences = []
|
| 786 |
+
current_sentence = ""
|
| 787 |
+
|
| 788 |
+
# Enhanced sentence boundary detection
|
| 789 |
+
sentence_endings = '.!?'
|
| 790 |
+
abbreviations = {'Mr.', 'Mrs.', 'Dr.', 'Inc.', 'Corp.', 'Ltd.', 'Co.', 'etc.', 'vs.'}
|
| 791 |
+
|
| 792 |
+
i = 0
|
| 793 |
+
while i < len(text):
|
| 794 |
+
char = text[i]
|
| 795 |
+
current_sentence += char
|
| 796 |
+
|
| 797 |
+
if char in sentence_endings:
|
| 798 |
+
# Check if this is a real sentence ending
|
| 799 |
+
is_sentence_end = True
|
| 800 |
+
|
| 801 |
+
# Check for abbreviations
|
| 802 |
+
words_before = current_sentence.strip().split()
|
| 803 |
+
if words_before:
|
| 804 |
+
last_word = words_before[-1]
|
| 805 |
+
if last_word in abbreviations:
|
| 806 |
+
is_sentence_end = False
|
| 807 |
+
|
| 808 |
+
# Check if followed by lowercase (likely abbreviation)
|
| 809 |
+
if i + 1 < len(text) and text[i + 1].islower():
|
| 810 |
+
is_sentence_end = False
|
| 811 |
+
|
| 812 |
+
if is_sentence_end and len(current_sentence.strip()) > 10:
|
| 813 |
+
sentences.append(current_sentence.strip())
|
| 814 |
+
current_sentence = ""
|
| 815 |
+
elif char == '\n' and current_sentence.strip():
|
| 816 |
+
# Force sentence break on newlines
|
| 817 |
+
sentences.append(current_sentence.strip())
|
| 818 |
+
current_sentence = ""
|
| 819 |
+
|
| 820 |
+
i += 1
|
| 821 |
+
|
| 822 |
+
# Add final sentence
|
| 823 |
+
if current_sentence.strip() and len(current_sentence.strip()) > 10:
|
| 824 |
+
sentences.append(current_sentence.strip())
|
| 825 |
+
|
| 826 |
+
return sentences
|
| 827 |
+
|
| 828 |
+
def generate_embeddings(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
| 829 |
+
"""Generate embeddings with robust error handling."""
|
| 830 |
+
if not chunks or not self.embedding_model:
|
| 831 |
+
st.warning("⚠️ Embedding generation unavailable. Documents will be stored without embeddings.")
|
| 832 |
+
return chunks
|
| 833 |
+
|
| 834 |
+
enhanced_chunks = []
|
| 835 |
+
progress_bar = st.progress(0)
|
| 836 |
+
status_text = st.empty()
|
| 837 |
+
|
| 838 |
+
for i, chunk in enumerate(chunks):
|
| 839 |
+
try:
|
| 840 |
+
progress = (i + 1) / len(chunks)
|
| 841 |
+
progress_bar.progress(progress)
|
| 842 |
+
status_text.text(f"Generating embeddings... {i + 1}/{len(chunks)}")
|
| 843 |
+
|
| 844 |
+
# Generate embedding with error handling
|
| 845 |
+
embedding = self.embedding_model.encode(
|
| 846 |
+
chunk['content'],
|
| 847 |
+
normalize_embeddings=True,
|
| 848 |
+
show_progress_bar=False
|
| 849 |
+
).tolist()
|
| 850 |
+
|
| 851 |
+
enhanced_chunk = {
|
| 852 |
+
**chunk,
|
| 853 |
+
'embedding': embedding,
|
| 854 |
+
'embedding_model': 'all-MiniLM-L6-v2',
|
| 855 |
+
'processed_at': time.time()
|
| 856 |
+
}
|
| 857 |
+
enhanced_chunks.append(enhanced_chunk)
|
| 858 |
+
|
| 859 |
+
except Exception as e:
|
| 860 |
+
st.warning(f"Embedding generation failed for chunk {i}: {str(e)}")
|
| 861 |
+
# Add chunk without embedding
|
| 862 |
+
enhanced_chunks.append({
|
| 863 |
+
**chunk,
|
| 864 |
+
'embedding': None,
|
| 865 |
+
'embedding_error': str(e),
|
| 866 |
+
'processed_at': time.time()
|
| 867 |
+
})
|
| 868 |
+
|
| 869 |
+
progress_bar.empty()
|
| 870 |
+
status_text.empty()
|
| 871 |
+
return enhanced_chunks
|
| 872 |
+
|
| 873 |
+
def calculate_document_hash(self, pdf_file) -> str:
|
| 874 |
+
"""Calculate document hash for deduplication."""
|
| 875 |
+
hasher = hashlib.sha256()
|
| 876 |
+
pdf_data = self._read_pdf_data(pdf_file)
|
| 877 |
+
hasher.update(pdf_data)
|
| 878 |
+
return hasher.hexdigest()
|
| 879 |
+
|
| 880 |
+
def process_document(self, pdf_file, filename: str) -> Optional[Dict[str, Any]]:
|
| 881 |
+
"""Complete document processing pipeline with comprehensive error handling."""
|
| 882 |
+
try:
|
| 883 |
+
# Calculate document hash
|
| 884 |
+
doc_hash = self.calculate_document_hash(pdf_file)
|
| 885 |
+
|
| 886 |
+
# Extract text with bulletproof methods
|
| 887 |
+
st.info(f"📄 Processing {filename} with bulletproof extraction...")
|
| 888 |
+
text_content = self.extract_text_from_pdf(pdf_file)
|
| 889 |
+
|
| 890 |
+
if not text_content:
|
| 891 |
+
st.error("❌ Could not extract readable content from PDF")
|
| 892 |
+
return None
|
| 893 |
+
|
| 894 |
+
# Create comprehensive metadata
|
| 895 |
+
metadata = {
|
| 896 |
+
'source': filename,
|
| 897 |
+
'document_hash': doc_hash,
|
| 898 |
+
'processed_at': time.time(),
|
| 899 |
+
'content_length': len(text_content),
|
| 900 |
+
'document_type': 'hr_policy',
|
| 901 |
+
'extraction_stats': self.extraction_stats,
|
| 902 |
+
'processor_version': 'bulletproof_v1.0'
|
| 903 |
+
}
|
| 904 |
+
|
| 905 |
+
# Create intelligent chunks
|
| 906 |
+
st.info("🧩 Creating intelligent text chunks...")
|
| 907 |
+
chunks = self.create_intelligent_chunks(text_content, metadata)
|
| 908 |
+
|
| 909 |
+
if not chunks:
|
| 910 |
+
st.error("❌ Failed to create meaningful chunks from document")
|
| 911 |
+
return None
|
| 912 |
+
|
| 913 |
+
# Generate embeddings
|
| 914 |
+
st.info("🧠 Generating semantic embeddings...")
|
| 915 |
+
enhanced_chunks = self.generate_embeddings(chunks)
|
| 916 |
+
|
| 917 |
+
# Prepare final document package
|
| 918 |
+
processed_doc = {
|
| 919 |
+
'filename': filename,
|
| 920 |
+
'document_hash': doc_hash,
|
| 921 |
+
'metadata': metadata,
|
| 922 |
+
'chunks': enhanced_chunks,
|
| 923 |
+
'chunk_count': len(enhanced_chunks),
|
| 924 |
+
'total_tokens': sum(len(chunk['content'].split()) for chunk in enhanced_chunks),
|
| 925 |
+
'processing_time': time.time() - metadata['processed_at']
|
| 926 |
+
}
|
| 927 |
+
|
| 928 |
+
st.success(f"✅ Successfully processed {filename} into {len(enhanced_chunks)} chunks")
|
| 929 |
+
return processed_doc
|
| 930 |
+
|
| 931 |
+
except Exception as e:
|
| 932 |
+
st.error(f"❌ Document processing failed: {str(e)}")
|
| 933 |
+
return None
|
| 934 |
+
|
| 935 |
+
def validate_pdf_file(self, pdf_file) -> bool:
|
| 936 |
+
"""Comprehensive PDF validation with helpful feedback."""
|
| 937 |
+
try:
|
| 938 |
+
# Basic file type validation
|
| 939 |
+
if hasattr(pdf_file, 'type') and pdf_file.type != 'application/pdf':
|
| 940 |
+
st.error("❌ Please upload a valid PDF file")
|
| 941 |
+
return False
|
| 942 |
+
|
| 943 |
+
# Size validation
|
| 944 |
+
if hasattr(pdf_file, 'size'):
|
| 945 |
+
if pdf_file.size > self.config.MAX_FILE_SIZE:
|
| 946 |
+
size_mb = self.config.MAX_FILE_SIZE / (1024*1024)
|
| 947 |
+
st.error(f"❌ File size exceeds {size_mb:.1f}MB limit")
|
| 948 |
+
return False
|
| 949 |
+
|
| 950 |
+
if pdf_file.size < 100:
|
| 951 |
+
st.error("❌ File appears to be too small or corrupted")
|
| 952 |
+
return False
|
| 953 |
+
|
| 954 |
+
# PDF signature validation
|
| 955 |
+
try:
|
| 956 |
+
pdf_data = self._read_pdf_data(pdf_file)
|
| 957 |
+
if not pdf_data.startswith(b'%PDF'):
|
| 958 |
+
st.error("❌ Invalid PDF file format")
|
| 959 |
+
return False
|
| 960 |
+
|
| 961 |
+
st.success("✅ PDF file validation passed")
|
| 962 |
+
return True
|
| 963 |
+
|
| 964 |
+
except Exception as validation_error:
|
| 965 |
+
st.warning(f"⚠️ PDF validation warning: {str(validation_error)}")
|
| 966 |
+
return True # Allow processing to continue
|
| 967 |
+
|
| 968 |
+
except Exception as e:
|
| 969 |
+
st.error(f"❌ File validation failed: {str(e)}")
|
| 970 |
+
return False
|
| 971 |
+
|
| 972 |
+
# Replace the previous DocumentProcessor with our bulletproof version
|
| 973 |
+
DocumentProcessor = BulletproofDocumentProcessor
|
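
A minimal usage sketch (an illustration, not one of the uploaded files): it assumes the module above is importable as document_processor, that Config exposes the CHUNK_SIZE, CHUNK_OVERLAP, and MAX_FILE_SIZE attributes the class references, and a hypothetical policy.pdf in the working directory. Because the processor reports progress through st.* calls, it should run under `streamlit run`:

# A minimal sketch: drive the bulletproof pipeline end to end.
# Run with `streamlit run example_usage.py` (hypothetical file name).
from document_processor import DocumentProcessor  # alias for BulletproofDocumentProcessor

processor = DocumentProcessor()

with open("policy.pdf", "rb") as pdf_file:  # "policy.pdf" is a placeholder
    if processor.validate_pdf_file(pdf_file):
        result = processor.process_document(pdf_file, "policy.pdf")
        if result:
            print(f"{result['chunk_count']} chunks, {result['total_tokens']} tokens")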
gitignore.txt
ADDED
@@ -0,0 +1,183 @@
# BLUESCARF AI HR Assistant - Git Ignore Configuration

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual Environments
venv/
env/
ENV/
env.bak/
venv.bak/
.venv/

# IDE and Editors
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store
Thumbs.db

# Streamlit
.streamlit/
.streamlit/secrets.toml

# Vector Database (contains processed documents - exclude for privacy)
vector_db/
*.db
*.sqlite
*.sqlite3

# Logs and Monitoring
logs/
*.log
log/
*.log.*

# Environment and Configuration
.env
.env.local
.env.production
.env.development
config.local.py
secrets.toml

# API Keys and Sensitive Data
api_keys.txt
keys/
credentials/
*.key
*.pem
*.p12

# Temporary Files
temp/
tmp/
*.tmp
*.temp
.cache/
cache/

# Document Processing Temp Files
*.pdf.processing
*.pdf.temp
upload_temp/

# Backup Files
*.backup
*.bak
*_backup_*
backup/

# System Files
.DS_Store?
ehthumbs.db
Icon?
Thumbs.db

# Archives
*.zip
*.tar.gz
*.rar
*.7z

# Jupyter Notebooks (if used for development)
.ipynb_checkpoints/
*.ipynb

# Model Files (if storing locally)
models/
*.model
*.pkl
*.joblib

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/
.coverage.*
coverage.xml
*.cover
.hypothesis/

# Documentation Build
docs/_build/
site/

# Docker
.dockerignore
docker-compose.override.yml

# Hugging Face Spaces
.gradio/

# Mac
.AppleDouble
.LSOverride

# Windows
[Dd]esktop.ini
$RECYCLE.BIN/
*.cab
*.msi
*.msix
*.msm
*.msp
*.lnk

# Linux
*~
.fuse_hidden*
.directory
.Trash-*
.nfs*

# Project-Specific Exclusions
# (Add any custom files you want to exclude)

# Keep empty directories with this exception
!.gitkeep

# But ignore the contents of data directories
data/
uploads/
processed/

# Ignore local configuration overrides
local_config.py
development_settings.py

# Ignore any personal notes or documentation
NOTES.md
TODO.md
personal_notes.txt

# Ignore error logs and debug files
error.log
debug.log
trace.log
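
Since vector_db/, the log directories, and the .env variants above hold processed documents, credentials, and API keys, it is worth verifying the rules after any edit: `git check-ignore -v <path>` (for example, `git check-ignore -v vector_db/`) prints the pattern that matches a given path, or nothing if the path would be tracked.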
logo.png
ADDED
requirements.txt
ADDED
@@ -0,0 +1,61 @@
# BLUESCARF AI HR Assistant - Production Dependencies
# Optimized for Hugging Face Spaces deployment

# Core Framework
streamlit>=1.28.0

# Google AI Integration
google-generativeai>=0.4.0

# Vector Database and Embeddings
chromadb>=0.4.0
sentence-transformers>=2.2.0

# PDF Processing
PyPDF2
# Optional: enhanced PDF processing (uncomment if needed)
#pdfplumber>=0.7.0
#pymupdf>=1.23.0

# Data Processing and Analysis
pandas>=2.0.0
numpy>=1.24.0

# Utilities
python-dotenv>=1.0.0
regex>=2022.0.0
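
A quick sanity check that the pinned minimums above are met in the active environment can be done with the standard library alone; a minimal sketch (importlib.metadata is available from Python 3.8):

# A minimal sketch: report installed versions of the dependencies listed above.
from importlib.metadata import version, PackageNotFoundError

for dist in ["streamlit", "google-generativeai", "chromadb",
             "sentence-transformers", "PyPDF2", "pandas", "numpy",
             "python-dotenv", "regex"]:
    try:
        print(f"{dist}: {version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")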
setup_script.py
ADDED
@@ -0,0 +1,371 @@
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
BLUESCARF AI HR Assistant - Automated Setup and Validation Script
|
| 4 |
+
Provides comprehensive setup, validation, and deployment assistance.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
import subprocess
|
| 10 |
+
import shutil
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import json
|
| 13 |
+
import time
|
| 14 |
+
from typing import Dict, List, Tuple, Optional
|
| 15 |
+
|
| 16 |
+
class Colors:
|
| 17 |
+
"""ANSI color codes for terminal output."""
|
| 18 |
+
HEADER = '\033[95m'
|
| 19 |
+
OKBLUE = '\033[94m'
|
| 20 |
+
OKCYAN = '\033[96m'
|
| 21 |
+
OKGREEN = '\033[92m'
|
| 22 |
+
WARNING = '\033[93m'
|
| 23 |
+
FAIL = '\033[91m'
|
| 24 |
+
ENDC = '\033[0m'
|
| 25 |
+
BOLD = '\033[1m'
|
| 26 |
+
UNDERLINE = '\033[4m'
|
| 27 |
+
|
| 28 |
+
class SetupManager:
|
| 29 |
+
"""Comprehensive setup and validation manager for BLUESCARF AI HR Assistant."""
|
| 30 |
+
|
| 31 |
+
def __init__(self):
|
| 32 |
+
self.project_root = Path(__file__).parent
|
| 33 |
+
self.requirements_file = self.project_root / "requirements.txt"
|
| 34 |
+
self.config_file = self.project_root / "config.py"
|
| 35 |
+
self.logo_file = self.project_root / "logo.png"
|
| 36 |
+
|
| 37 |
+
def print_header(self):
|
| 38 |
+
"""Print application header with branding."""
|
| 39 |
+
print(f"{Colors.HEADER}{Colors.BOLD}")
|
| 40 |
+
print("=" * 60)
|
| 41 |
+
print(" BLUESCARF ARTIFICIAL INTELLIGENCE")
|
| 42 |
+
print(" HR Assistant Setup & Validation")
|
| 43 |
+
print(" Version 1.0.0")
|
| 44 |
+
print("=" * 60)
|
| 45 |
+
print(f"{Colors.ENDC}")
|
| 46 |
+
|
| 47 |
+
def check_python_version(self) -> bool:
|
| 48 |
+
"""Validate Python version compatibility."""
|
| 49 |
+
print(f"{Colors.OKBLUE}Checking Python version...{Colors.ENDC}")
|
| 50 |
+
|
| 51 |
+
version = sys.version_info
|
| 52 |
+
min_version = (3, 8)
|
| 53 |
+
|
| 54 |
+
if version >= min_version:
|
| 55 |
+
print(f"{Colors.OKGREEN}✓ Python {version.major}.{version.minor}.{version.micro} (Compatible){Colors.ENDC}")
|
| 56 |
+
return True
|
| 57 |
+
else:
|
| 58 |
+
print(f"{Colors.FAIL}✗ Python {version.major}.{version.minor}.{version.micro} (Requires 3.8+){Colors.ENDC}")
|
| 59 |
+
return False
|
| 60 |
+
|
| 61 |
+
def check_dependencies(self) -> Tuple[bool, List[str]]:
|
| 62 |
+
"""Check if all required dependencies are available."""
|
| 63 |
+
print(f"{Colors.OKBLUE}Checking dependencies...{Colors.ENDC}")
|
| 64 |
+
|
| 65 |
+
if not self.requirements_file.exists():
|
| 66 |
+
print(f"{Colors.FAIL}✗ requirements.txt not found{Colors.ENDC}")
|
| 67 |
+
return False, ["requirements.txt missing"]
|
| 68 |
+
|
| 69 |
+
# Read requirements
|
| 70 |
+
with open(self.requirements_file, 'r') as f:
|
| 71 |
+
requirements = [line.strip() for line in f if line.strip() and not line.startswith('#')]
|
| 72 |
+
|
| 73 |
+
missing_packages = []
|
| 74 |
+
|
| 75 |
+
for requirement in requirements:
|
| 76 |
+
package_name = requirement.split('==')[0].split('>=')[0].split('~=')[0]
|
| 77 |
+
try:
|
| 78 |
+
__import__(package_name.replace('-', '_'))
|
| 79 |
+
print(f"{Colors.OKGREEN}✓ {package_name}{Colors.ENDC}")
|
| 80 |
+
except ImportError:
|
| 81 |
+
print(f"{Colors.WARNING}! {package_name} (not installed){Colors.ENDC}")
|
| 82 |
+
missing_packages.append(package_name)
|
| 83 |
+
|
| 84 |
+
if missing_packages:
|
| 85 |
+
return False, missing_packages
|
| 86 |
+
else:
|
| 87 |
+
print(f"{Colors.OKGREEN}✓ All dependencies satisfied{Colors.ENDC}")
|
| 88 |
+
return True, []
|
| 89 |
+
|
| 90 |
+
def install_dependencies(self) -> bool:
|
| 91 |
+
"""Install missing dependencies using pip."""
|
| 92 |
+
print(f"{Colors.OKBLUE}Installing dependencies...{Colors.ENDC}")
|
| 93 |
+
|
| 94 |
+
try:
|
| 95 |
+
subprocess.check_call([
|
| 96 |
+
sys.executable, "-m", "pip", "install", "-r", str(self.requirements_file)
|
| 97 |
+
])
|
| 98 |
+
print(f"{Colors.OKGREEN}✓ Dependencies installed successfully{Colors.ENDC}")
|
| 99 |
+
return True
|
| 100 |
+
except subprocess.CalledProcessError as e:
|
| 101 |
+
print(f"{Colors.FAIL}✗ Failed to install dependencies: {e}{Colors.ENDC}")
|
| 102 |
+
return False
|
| 103 |
+
|
| 104 |
+
def validate_project_structure(self) -> Tuple[bool, List[str]]:
|
| 105 |
+
"""Validate that all required project files exist."""
|
| 106 |
+
print(f"{Colors.OKBLUE}Validating project structure...{Colors.ENDC}")
|
| 107 |
+
|
| 108 |
+
required_files = [
|
| 109 |
+
"app.py",
|
| 110 |
+
"document_processor.py",
|
| 111 |
+
"vector_store.py",
|
| 112 |
+
"admin.py",
|
| 113 |
+
"config.py",
|
| 114 |
+
"utils.py",
|
| 115 |
+
"requirements.txt"
|
| 116 |
+
]
|
| 117 |
+
|
| 118 |
+
missing_files = []
|
| 119 |
+
|
| 120 |
+
for file_name in required_files:
|
| 121 |
+
file_path = self.project_root / file_name
|
| 122 |
+
if file_path.exists():
|
| 123 |
+
print(f"{Colors.OKGREEN}✓ {file_name}{Colors.ENDC}")
|
| 124 |
+
else:
|
| 125 |
+
print(f"{Colors.FAIL}✗ {file_name} (missing){Colors.ENDC}")
|
| 126 |
+
missing_files.append(file_name)
|
| 127 |
+
|
| 128 |
+
# Check for logo
|
| 129 |
+
if self.logo_file.exists():
|
| 130 |
+
print(f"{Colors.OKGREEN}✓ logo.png (company logo found){Colors.ENDC}")
|
| 131 |
+
else:
|
| 132 |
+
print(f"{Colors.WARNING}! logo.png (add your company logo){Colors.ENDC}")
|
| 133 |
+
|
| 134 |
+
if missing_files:
|
| 135 |
+
return False, missing_files
|
| 136 |
+
else:
|
| 137 |
+
print(f"{Colors.OKGREEN}✓ Project structure is valid{Colors.ENDC}")
|
| 138 |
+
return True, []
|
| 139 |
+
|
| 140 |
+
def setup_directories(self) -> bool:
|
| 141 |
+
"""Create necessary directories for the application."""
|
| 142 |
+
print(f"{Colors.OKBLUE}Setting up directories...{Colors.ENDC}")
|
| 143 |
+
|
| 144 |
+
directories = [
|
| 145 |
+
"vector_db",
|
| 146 |
+
"logs",
|
| 147 |
+
"temp",
|
| 148 |
+
"data",
|
| 149 |
+
"data/vector_db",
|
| 150 |
+
"data/logs"
|
| 151 |
+
]
|
| 152 |
+
|
| 153 |
+
try:
|
| 154 |
+
for directory in directories:
|
| 155 |
+
dir_path = self.project_root / directory
|
| 156 |
+
dir_path.mkdir(parents=True, exist_ok=True)
|
| 157 |
+
print(f"{Colors.OKGREEN}✓ Created {directory}/{Colors.ENDC}")
|
| 158 |
+
|
| 159 |
+
return True
|
| 160 |
+
except Exception as e:
|
| 161 |
+
print(f"{Colors.FAIL}✗ Failed to create directories: {e}{Colors.ENDC}")
|
| 162 |
+
return False
|
| 163 |
+
|
| 164 |
+
def create_env_file(self) -> bool:
|
| 165 |
+
"""Create .env file from template if it doesn't exist."""
|
| 166 |
+
print(f"{Colors.OKBLUE}Setting up environment configuration...{Colors.ENDC}")
|
| 167 |
+
|
| 168 |
+
env_file = self.project_root / ".env"
|
| 169 |
+
env_example = self.project_root / ".env.example"
|
| 170 |
+
|
| 171 |
+
if env_file.exists():
|
| 172 |
+
print(f"{Colors.OKGREEN}✓ .env file already exists{Colors.ENDC}")
|
| 173 |
+
return True
|
| 174 |
+
|
| 175 |
+
if env_example.exists():
|
| 176 |
+
try:
|
| 177 |
+
shutil.copy(env_example, env_file)
|
| 178 |
+
print(f"{Colors.OKGREEN}✓ Created .env from .env.example{Colors.ENDC}")
|
| 179 |
+
print(f"{Colors.WARNING}! Please review and customize .env file{Colors.ENDC}")
|
| 180 |
+
return True
|
| 181 |
+
except Exception as e:
|
| 182 |
+
print(f"{Colors.FAIL}✗ Failed to create .env file: {e}{Colors.ENDC}")
|
| 183 |
+
return False
|
| 184 |
+
else:
|
| 185 |
+
print(f"{Colors.WARNING}! .env.example not found, skipping .env creation{Colors.ENDC}")
|
| 186 |
+
return True
|
| 187 |
+
|
| 188 |
+
def validate_streamlit_config(self) -> bool:
|
| 189 |
+
"""Validate Streamlit configuration."""
|
| 190 |
+
print(f"{Colors.OKBLUE}Validating Streamlit configuration...{Colors.ENDC}")
|
| 191 |
+
|
| 192 |
+
try:
|
| 193 |
+
import streamlit as st
|
| 194 |
+
print(f"{Colors.OKGREEN}✓ Streamlit is available{Colors.ENDC}")
|
| 195 |
+
return True
|
| 196 |
+
except ImportError:
|
| 197 |
+
print(f"{Colors.FAIL}✗ Streamlit not available{Colors.ENDC}")
|
| 198 |
+
return False
|
| 199 |
+
|
| 200 |
+
def test_api_imports(self) -> Dict[str, bool]:
|
| 201 |
+
"""Test critical API imports."""
|
| 202 |
+
print(f"{Colors.OKBLUE}Testing critical imports...{Colors.ENDC}")
|
| 203 |
+
|
| 204 |
+
import_tests = {
|
| 205 |
+
"Google AI": ("google.generativeai", "google-generativeai"),
|
| 206 |
+
"ChromaDB": ("chromadb", "chromadb"),
|
| 207 |
+
"Sentence Transformers": ("sentence_transformers", "sentence-transformers"),
|
| 208 |
+
"PyPDF2": ("PyPDF2", "PyPDF2"),
|
| 209 |
+
"Pandas": ("pandas", "pandas"),
|
| 210 |
+
"NumPy": ("numpy", "numpy")
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
results = {}
|
| 214 |
+
|
| 215 |
+
for name, (module, package) in import_tests.items():
|
| 216 |
+
try:
|
| 217 |
+
__import__(module)
|
| 218 |
+
print(f"{Colors.OKGREEN}✓ {name}{Colors.ENDC}")
|
| 219 |
+
results[name] = True
|
| 220 |
+
except ImportError:
|
| 221 |
+
print(f"{Colors.FAIL}✗ {name} (install with: pip install {package}){Colors.ENDC}")
|
| 222 |
+
results[name] = False
|
| 223 |
+
|
| 224 |
+
return results
|
| 225 |
+
|
| 226 |
+
def generate_deployment_summary(self) -> Dict[str, any]:
|
| 227 |
+
"""Generate comprehensive deployment summary."""
|
| 228 |
+
print(f"{Colors.OKBLUE}Generating deployment summary...{Colors.ENDC}")
|
| 229 |
+
|
| 230 |
+
summary = {
|
| 231 |
+
"timestamp": time.time(),
|
| 232 |
+
"python_version": f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
|
| 233 |
+
"project_path": str(self.project_root),
|
| 234 |
+
"files_present": [],
|
| 235 |
+
"directories_created": [],
|
| 236 |
+
"configuration_status": "pending"
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
# Check files
|
| 240 |
+
for file_path in self.project_root.glob("*.py"):
|
| 241 |
+
summary["files_present"].append(file_path.name)
|
| 242 |
+
|
| 243 |
+
# Check directories
|
| 244 |
+
for dir_path in ["vector_db", "logs", "temp"]:
|
| 245 |
+
if (self.project_root / dir_path).exists():
|
| 246 |
+
summary["directories_created"].append(dir_path)
|
| 247 |
+
|
| 248 |
+
return summary
|
| 249 |
+
|
| 250 |
+
def provide_next_steps(self):
|
| 251 |
+
"""Provide clear next steps for deployment."""
|
| 252 |
+
print(f"\n{Colors.HEADER}{Colors.BOLD}NEXT STEPS:{Colors.ENDC}")
|
| 253 |
+
print(f"{Colors.OKBLUE}1. Get Google Gemini API Key:{Colors.ENDC}")
|
| 254 |
+
print(" → Visit: https://makersuite.google.com/app/apikey")
|
| 255 |
+
print(" → Create or use existing API key")
|
| 256 |
+
|
| 257 |
+
print(f"\n{Colors.OKBLUE}2. Add Company Logo:{Colors.ENDC}")
|
| 258 |
+
print(" → Replace 'logo.png' with your company logo")
|
| 259 |
+
print(" → Recommended size: 200x200 pixels")
|
| 260 |
+
|
| 261 |
+
print(f"\n{Colors.OKBLUE}3. Upload Initial Documents:{Colors.ENDC}")
|
| 262 |
+
print(" → Run the application: streamlit run app.py")
|
| 263 |
+
print(" → Access admin panel with password: bluescarf_admin_2024")
|
| 264 |
+
print(" → Upload HR policies, handbooks, procedures")
|
| 265 |
+
|
| 266 |
+
print(f"\n{Colors.OKBLUE}4. Test the System:{Colors.ENDC}")
|
| 267 |
+
print(" → Enter your API key in the application")
|
| 268 |
+
print(" → Ask test questions about uploaded documents")
|
| 269 |
+
print(" → Verify responses are accurate and relevant")
|
| 270 |
+
|
| 271 |
+
print(f"\n{Colors.OKBLUE}5. Deploy to Production:{Colors.ENDC}")
|
| 272 |
+
print(" → For Hugging Face Spaces: Upload all files")
|
| 273 |
+
print(" → For Docker: Use provided Dockerfile")
|
| 274 |
+
print(" → For cloud: Follow platform-specific guides")
|
| 275 |
+
|
| 276 |
+
print(f"\n{Colors.WARNING}IMPORTANT SECURITY NOTES:{Colors.ENDC}")
|
| 277 |
+
print(" → Change default admin password immediately")
|
| 278 |
+
print(" → Keep API keys secure and never commit to git")
|
| 279 |
+
print(" → Review uploaded documents for sensitive information")
|
| 280 |
+
|
| 281 |
+
print(f"\n{Colors.OKGREEN}Ready for deployment! 🚀{Colors.ENDC}")
|
| 282 |
+
|
| 283 |
+
    def run_comprehensive_setup(self) -> bool:
        """Run the complete setup and validation process."""
        self.print_header()

        # 1. Check Python version
        if not self.check_python_version():
            print(f"{Colors.FAIL}Setup failed: Incompatible Python version{Colors.ENDC}")
            return False

        # 2. Validate project structure
        structure_valid, missing_files = self.validate_project_structure()
        if not structure_valid:
            print(f"{Colors.FAIL}Setup failed: Missing files: {missing_files}{Colors.ENDC}")
            return False

        # 3. Check dependencies
        deps_valid, missing_deps = self.check_dependencies()
        if not deps_valid:
            print(f"{Colors.WARNING}Installing missing dependencies...{Colors.ENDC}")
            if not self.install_dependencies():
                print(f"{Colors.FAIL}Setup failed: Could not install dependencies{Colors.ENDC}")
                return False

        # 4. Setup directories
        if not self.setup_directories():
            print(f"{Colors.FAIL}Setup failed: Could not create directories{Colors.ENDC}")
            return False

        # 5. Create environment file
        if not self.create_env_file():
            print(f"{Colors.WARNING}Environment file setup incomplete{Colors.ENDC}")

        # 6. Validate Streamlit
        if not self.validate_streamlit_config():
            print(f"{Colors.FAIL}Setup failed: Streamlit configuration issue{Colors.ENDC}")
            return False

        # 7. Test imports
        import_results = self.test_api_imports()
        if not all(import_results.values()):
            print(f"{Colors.WARNING}Some imports failed, but setup can continue{Colors.ENDC}")

        # 8. Generate summary
        summary = self.generate_deployment_summary()

        print(f"\n{Colors.OKGREEN}{Colors.BOLD}✓ SETUP COMPLETED SUCCESSFULLY!{Colors.ENDC}")

        # 9. Provide next steps
        self.provide_next_steps()

        return True
def main():
    """Main setup function."""
    setup_manager = SetupManager()

    if len(sys.argv) > 1:
        command = sys.argv[1]

        if command == "validate":
            # Quick validation only
            setup_manager.check_python_version()
            setup_manager.validate_project_structure()
            setup_manager.check_dependencies()

        elif command == "install":
            # Install dependencies only
            setup_manager.install_dependencies()

        elif command == "structure":
            # Setup directories only
            setup_manager.setup_directories()

        elif command == "test":
            # Test imports only
            setup_manager.test_api_imports()

        else:
            print(f"Unknown command: {command}")
            print("Available commands: validate, install, structure, test")
    else:
        # Run complete setup
        success = setup_manager.run_comprehensive_setup()
        sys.exit(0 if success else 1)

if __name__ == "__main__":
    main()
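# Illustrative usage sketch, assuming this file is saved as setup_script.py;
# the behaviour of each invocation follows from the dispatch in main() above:
#
#   python setup_script.py            # full setup; exits 0 on success, 1 on failure
#   python setup_script.py validate   # Python version, project structure, dependency checks
#   python setup_script.py install    # install missing dependencies only
#   python setup_script.py structure  # create the project directories only
#   python setup_script.py test       # probe third-party imports only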
utils.py
ADDED
@@ -0,0 +1,550 @@
import re
import time
import json
import logging
from typing import Any, Dict, List, Optional, Union, Tuple
from pathlib import Path
import streamlit as st
from datetime import datetime, timedelta
import hashlib
import uuid
from config import Config
class InteractionLogger:
    """Advanced logging system for user interactions and system monitoring."""

    def __init__(self, config: Config):
        self.config = config
        self.logger = self._setup_logger()
        self.interaction_log_path = config.LOG_FILE_PATH.parent / "interactions.jsonl"

    def _setup_logger(self) -> logging.Logger:
        """Configure professional logging with rotation and formatting."""
        logger = logging.getLogger("hr_assistant")
        logger.setLevel(getattr(logging, self.config.LOG_LEVEL))

        # Prevent duplicate handlers
        if not logger.handlers:
            logging_config = self.config.get_logging_config()

            # Formatter with structured information, shared by all handlers
            formatter = logging.Formatter(logging_config['log_format'])

            # File handler with rotation
            from logging.handlers import RotatingFileHandler
            file_handler = RotatingFileHandler(
                self.config.LOG_FILE_PATH,
                maxBytes=logging_config['max_file_size'],
                backupCount=logging_config['backup_count']
            )
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)

            # Console handler for development
            if logging_config['console_output']:
                console_handler = logging.StreamHandler()
                console_handler.setLevel(logging.INFO)
                console_handler.setFormatter(formatter)
                logger.addHandler(console_handler)

        return logger
    def log_interaction(self, query: str, response: str, metadata: Optional[Dict] = None):
        """Log user interactions for analysis and improvement."""
        if not self.config.ENABLE_INTERACTION_LOGGING:
            return

        interaction_data = {
            'timestamp': time.time(),
            'session_id': self._get_session_id(),
            'query': query,
            'response_length': len(response),
            'query_length': len(query),
            'query_type': self._classify_query(query),
            'metadata': metadata or {}
        }

        try:
            self.interaction_log_path.parent.mkdir(parents=True, exist_ok=True)
            with open(self.interaction_log_path, 'a') as f:
                f.write(json.dumps(interaction_data) + '\n')
        except Exception as e:
            self.logger.warning(f"Failed to log interaction: {str(e)}")

    def _get_session_id(self) -> str:
        """Generate or retrieve session identifier for tracking."""
        if 'session_id' not in st.session_state:
            st.session_state.session_id = str(uuid.uuid4())[:8]
        return st.session_state.session_id
    def _classify_query(self, query: str) -> str:
        """Intelligent query classification for analytics."""
        query_lower = query.lower()

        policy_keywords = ['policy', 'procedure', 'guideline', 'rule']
        benefit_keywords = ['benefit', 'insurance', 'health', 'dental', '401k', 'retirement']
        leave_keywords = ['leave', 'vacation', 'sick', 'pto', 'holiday', 'time off']
        payroll_keywords = ['salary', 'pay', 'payroll', 'compensation', 'bonus']

        if any(keyword in query_lower for keyword in policy_keywords):
            return 'policy_inquiry'
        elif any(keyword in query_lower for keyword in benefit_keywords):
            return 'benefits_inquiry'
        elif any(keyword in query_lower for keyword in leave_keywords):
            return 'leave_inquiry'
        elif any(keyword in query_lower for keyword in payroll_keywords):
            return 'payroll_inquiry'
        else:
            return 'general_inquiry'

# Global logger instance
config = Config()
interaction_logger = InteractionLogger(config)
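# A minimal sketch of how the keyword buckets above resolve; the sample
# queries are fabricated, and the expected labels follow directly from the
# first-match order in _classify_query:
def _demo_query_classification():
    samples = {
        "How many vacation days do I get?": "leave_inquiry",
        "What is the remote work policy?": "policy_inquiry",
        "When is the next bonus paid out?": "payroll_inquiry",
        "Who won the game last night?": "general_inquiry",
    }
    for query, expected in samples.items():
        assert interaction_logger._classify_query(query) == expected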
def validate_api_key(api_key: str) -> bool:
    """
    Validate Google Gemini API key format and basic structure.

    Args:
        api_key: API key string to validate

    Returns:
        True if key appears valid, False otherwise
    """
    if not api_key or not isinstance(api_key, str):
        return False

    # Basic format validation for Google API keys:
    # they typically start with 'AIza' and are 39 characters long
    api_key = api_key.strip()

    if len(api_key) < 30:  # Too short to be valid
        return False

    if len(api_key) > 50:  # Too long to be typical
        return False

    # Check for suspicious patterns
    if api_key.lower() in ['test', 'demo', 'placeholder', 'your_api_key']:
        return False

    # Basic character validation (alphanumeric and common symbols)
    if not re.match(r'^[A-Za-z0-9_-]+$', api_key):
        return False

    return True
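# A minimal sketch of the validator's accept/reject behaviour; the sample
# keys are fabricated placeholders, not real credentials:
def _demo_api_key_validation():
    assert validate_api_key("AIza" + "x" * 35) is True   # plausible 39-char shape
    assert validate_api_key("your_api_key") is False     # known placeholder, too short
    assert validate_api_key("too short") is False        # under 30 characters
    assert validate_api_key("AIza key with spaces 1234567890") is False  # illegal chars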
def format_response(response_text: str) -> str:
    """
    Intelligently format and enhance an AI response for optimal user experience.

    Args:
        response_text: Raw response from AI model

    Returns:
        Formatted and enhanced response text
    """
    if not response_text:
        return "I apologize, but I couldn't generate a response. Please try rephrasing your question."

    # Remove common AI response artifacts
    cleaned_text = response_text.strip()

    # Remove repetitive phrases or AI disclaimers
    artifact_patterns = [
        r'^(As an AI|I am an AI|According to the|Based on the).*?[,.]?\s*',
        r'\b(please note that|it\'s important to note|keep in mind)\b.*?[.!]',
        r'\b(I hope this helps|Hope this helps|Let me know if you need)\b.*?[.!]?$'
    ]

    for pattern in artifact_patterns:
        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)

    # Improve formatting structure
    cleaned_text = _enhance_text_structure(cleaned_text)

    # Add professional closing if response is substantial
    if len(cleaned_text) > 200 and not _has_closing_statement(cleaned_text):
        cleaned_text += "\n\nIf you need additional clarification or have related questions, please don't hesitate to ask."

    return cleaned_text.strip()

def _enhance_text_structure(text: str) -> str:
    """Enhance text structure with better paragraphs and formatting."""
    # Fix paragraph spacing
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Ensure proper spacing after periods
    text = re.sub(r'\.([A-Z])', r'. \1', text)

    # Fix common formatting issues: collapse runs of spaces/tabs only, so the
    # paragraph breaks normalized above are preserved
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'([.!?])\s*\n\s*([a-z])', r'\1 \2', text)  # Fix broken sentences

    # Enhance list formatting
    text = re.sub(r'\n(\d+\.|\*|\-)\s*', r'\n\n\1 ', text)

    return text

def _has_closing_statement(text: str) -> bool:
    """Check if text already has a professional closing statement."""
    closing_patterns = [
        r'please.*?(contact|reach out|ask|let.*know)',
        r'if you.*?(need|have|require)',
        r'feel free to.*?(ask|contact|reach)',
        r'don\'t hesitate to.*?(ask|contact|reach)'
    ]

    text_lower = text.lower()
    return any(re.search(pattern, text_lower) for pattern in closing_patterns)
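# A minimal sketch of the cleanup pipeline above on a fabricated model reply;
# the disclaimer prefix and the closing filler are the parts the artifact
# patterns are designed to strip, while the substantive sentence survives:
def _demo_format_response():
    raw = "As an AI, I can explain. Employees accrue 1.5 PTO days per month. I hope this helps!"
    cleaned = format_response(raw)
    assert "As an AI" not in cleaned
    assert "PTO" in cleaned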
def log_interaction(query: str, response: str, metadata: Optional[Dict] = None):
    """
    Convenience function for logging user interactions.

    Args:
        query: User's question or input
        response: System's response
        metadata: Additional context information
    """
    interaction_logger.log_interaction(query, response, metadata)
def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename for safe storage while preserving readability.

    Args:
        filename: Original filename

    Returns:
        Sanitized filename safe for filesystem operations
    """
    # Remove or replace problematic characters
    sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)

    # Remove multiple underscores
    sanitized = re.sub(r'_{2,}', '_', sanitized)

    # Ensure reasonable length (split the *sanitized* name here, so the
    # character replacements above are not discarded)
    name, ext = Path(sanitized).stem, Path(sanitized).suffix
    if len(name) > 100:
        name = name[:100]

    sanitized = f"{name}{ext}"

    # Ensure not empty or just extension
    if not sanitized or sanitized.startswith('.'):
        sanitized = f"document_{int(time.time())}.pdf"

    return sanitized
def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Calculate lexical similarity between two text strings using Jaccard word overlap.

    Args:
        text1: First text string
        text2: Second text string

    Returns:
        Similarity score between 0 and 1
    """
    # Tokenize and normalize
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())

    # Calculate Jaccard similarity
    intersection = words1.intersection(words2)
    union = words1.union(words2)

    if not union:
        return 0.0

    return len(intersection) / len(union)
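# A worked example of the Jaccard computation above, using two fabricated
# phrases: the word sets share 2 tokens ('leave', 'policy') out of 5 distinct
# tokens in their union, so the score is 2/5 = 0.4:
def _demo_jaccard():
    a = "sick leave policy"          # {sick, leave, policy}
    b = "annual leave policy rules"  # {annual, leave, policy, rules}
    score = calculate_text_similarity(a, b)
    assert abs(score - 2 / 5) < 1e-9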
def extract_key_phrases(text: str, max_phrases: int = 5) -> List[str]:
    """
    Extract key phrases from text for metadata and search optimization.

    Args:
        text: Input text to analyze
        max_phrases: Maximum number of phrases to extract

    Returns:
        List of key phrases
    """
    # Simple extraction based on frequency and HR domain relevance
    hr_relevant_terms = {
        'policy', 'procedure', 'benefit', 'leave', 'vacation', 'sick', 'health',
        'insurance', 'retirement', '401k', 'pto', 'holiday', 'payroll', 'salary',
        'compensation', 'performance', 'review', 'training', 'onboarding',
        'termination', 'resignation', 'discipline', 'harassment', 'diversity'
    }

    # Include digits in tokens so terms like '401k' can actually match
    words = re.findall(r'\b[a-zA-Z0-9]{3,}\b', text.lower())
    word_freq = {}

    for word in words:
        if word in hr_relevant_terms:
            word_freq[word] = word_freq.get(word, 0) + 2  # Boost HR terms
        else:
            word_freq[word] = word_freq.get(word, 0) + 1

    # Extract top phrases
    key_phrases = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    return [phrase[0] for phrase in key_phrases[:max_phrases]]
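# A minimal sketch of the frequency-plus-boost ranking above on a fabricated
# sentence: 'policy' appears once but is an HR term (weight 2), so it outranks
# every non-HR word (weight 1):
def _demo_key_phrases():
    phrases = extract_key_phrases("Our remote policy covers equipment", max_phrases=2)
    assert phrases[0] == 'policy'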
def format_timestamp(timestamp: float, format_type: str = 'readable') -> str:
    """
    Format timestamp for display in various contexts.

    Args:
        timestamp: Unix timestamp
        format_type: Type of formatting ('readable', 'short', 'iso')

    Returns:
        Formatted timestamp string
    """
    dt = datetime.fromtimestamp(timestamp)

    if format_type == 'readable':
        return dt.strftime('%B %d, %Y at %I:%M %p')
    elif format_type == 'short':
        return dt.strftime('%m/%d/%Y %H:%M')
    elif format_type == 'iso':
        return dt.isoformat()
    else:
        return str(dt)
def estimate_reading_time(text: str) -> int:
    """
    Estimate reading time for text content in minutes.

    Args:
        text: Text content to analyze

    Returns:
        Estimated reading time in minutes
    """
    # Average reading speed is roughly 200-250 words per minute; use 225
    word_count = len(text.split())
    reading_time = max(1, round(word_count / 225))
    return reading_time
def create_document_summary(text: str, max_length: int = 200) -> str:
    """
    Create intelligent document summary for preview purposes.

    Args:
        text: Full document text
        max_length: Maximum summary length in characters

    Returns:
        Document summary
    """
    # Extract first meaningful paragraph or section
    paragraphs = [p.strip() for p in text.split('\n\n') if len(p.strip()) > 50]

    if not paragraphs:
        return text[:max_length] + '...' if len(text) > max_length else text

    summary = paragraphs[0]

    # If first paragraph is too long, truncate intelligently
    if len(summary) > max_length:
        # Try to end at a sentence boundary
        sentences = summary.split('. ')
        truncated = sentences[0]

        for sentence in sentences[1:]:
            if len(truncated + '. ' + sentence) <= max_length - 3:
                truncated += '. ' + sentence
            else:
                break

        summary = truncated + '...'

    return summary
def validate_document_content(text: str) -> Tuple[bool, List[str]]:
    """
    Validate document content for HR relevance and quality.

    Args:
        text: Document text to validate

    Returns:
        Tuple of (is_valid, list_of_issues)
    """
    issues = []

    # Check minimum content length
    if len(text.strip()) < 100:
        issues.append("Document content is too short (minimum 100 characters)")

    # Check for readable text vs. scanned images
    word_count = len(text.split())
    if word_count < 20:
        issues.append("Document appears to contain very little readable text")

    # Check for HR-relevant content
    hr_indicators = [
        'policy', 'employee', 'benefit', 'leave', 'vacation', 'sick',
        'insurance', 'company', 'workplace', 'procedure', 'guideline',
        'handbook', 'hr', 'human resources', 'personnel'
    ]

    text_lower = text.lower()
    hr_score = sum(1 for indicator in hr_indicators if indicator in text_lower)

    if hr_score < 2:
        issues.append("Document may not be HR-related (consider adding to appropriate knowledge base)")

    # Check for excessive repetition (common in corrupted PDFs)
    lines = text.split('\n')
    unique_lines = set(line.strip() for line in lines if line.strip())

    if len(lines) > 10 and len(unique_lines) / len(lines) < 0.3:
        issues.append("Document contains excessive repetition (possible extraction error)")

    is_valid = len(issues) == 0
    return is_valid, issues
def create_session_analytics() -> Dict[str, Any]:
    """
    Create analytics data for the current session.

    Returns:
        Dictionary with session analytics
    """
    session_data = {
        'session_id': interaction_logger._get_session_id(),
        'start_time': st.session_state.get('session_start', time.time()),
        'current_time': time.time(),
        'message_count': len(st.session_state.get('messages', [])),
        'api_key_validated': st.session_state.get('api_key_validated', False),
        'admin_accessed': st.session_state.get('admin_authenticated', False)
    }

    # Calculate session duration
    session_data['duration_minutes'] = (
        session_data['current_time'] - session_data['start_time']
    ) / 60

    return session_data
def safe_json_loads(json_string: str, default: Any = None) -> Any:
    """
    Safely parse JSON string with fallback.

    Args:
        json_string: JSON string to parse
        default: Default value if parsing fails

    Returns:
        Parsed JSON or default value
    """
    try:
        return json.loads(json_string)
    except (json.JSONDecodeError, TypeError):
        return default
def hash_document_content(content: str) -> str:
    """
    Create content-based hash for deduplication.

    Args:
        content: Document content

    Returns:
        SHA-256 hash of normalized content
    """
    # Normalize content for consistent hashing
    normalized = re.sub(r'\s+', ' ', content.strip().lower())
    return hashlib.sha256(normalized.encode()).hexdigest()
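# A minimal sketch of why the normalization above matters for deduplication:
# two fabricated uploads that differ only in case and whitespace hash to the
# same digest, so the second would be recognized as a duplicate:
def _demo_content_dedup():
    original = "Annual Leave Policy\n\nEmployees accrue leave monthly."
    reupload = "annual   leave policy  employees accrue leave monthly."
    assert hash_document_content(original) == hash_document_content(reupload)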
def format_file_size(size_bytes: int) -> str:
    """
    Format file size in human-readable format.

    Args:
        size_bytes: File size in bytes

    Returns:
        Formatted size string
    """
    if size_bytes < 1024:
        return f"{size_bytes} B"
    elif size_bytes < 1024**2:
        return f"{size_bytes / 1024:.1f} KB"
    elif size_bytes < 1024**3:
        return f"{size_bytes / (1024**2):.1f} MB"
    else:
        return f"{size_bytes / (1024**3):.1f} GB"
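# A worked example of the binary thresholds above: 1536 bytes is
# 1536 / 1024 = 1.5 KB, and 5 * 1024**2 bytes falls in the MB branch
# as exactly 5.0 MB:
def _demo_file_sizes():
    assert format_file_size(1536) == "1.5 KB"
    assert format_file_size(5 * 1024**2) == "5.0 MB"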
def create_backup_filename(original_filename: str) -> str:
    """
    Create backup filename with timestamp.

    Args:
        original_filename: Original file name

    Returns:
        Backup filename with timestamp
    """
    name, ext = Path(original_filename).stem, Path(original_filename).suffix
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{name}_backup_{timestamp}{ext}"
def performance_monitor(func):
    """
    Decorator for monitoring function performance.

    Args:
        func: Function to monitor

    Returns:
        Wrapped function with performance logging
    """
    import functools

    @functools.wraps(func)  # preserve the wrapped function's name for the log messages
    def wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            result = func(*args, **kwargs)
            execution_time = time.time() - start_time

            if execution_time > 5:  # Log slow operations
                interaction_logger.logger.warning(
                    f"Slow operation: {func.__name__} took {execution_time:.2f}s"
                )

            return result
        except Exception as e:
            execution_time = time.time() - start_time
            interaction_logger.logger.error(
                f"Function {func.__name__} failed after {execution_time:.2f}s: {str(e)}"
            )
            raise

    return wrapper
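# A minimal usage sketch for the decorator above; the decorated function is
# hypothetical, and the 5-second threshold comes from the wrapper's check:
@performance_monitor
def _demo_slow_operation(delay_seconds: float) -> str:
    time.sleep(delay_seconds)  # anything over 5s triggers the warning log
    return "done"

# _demo_slow_operation(0.1) returns "done" silently;
# _demo_slow_operation(6.0) also returns "done" but emits a slow-operation warning.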
# Convenience functions for common operations
def get_current_timestamp() -> float:
    """Get current timestamp for consistent time tracking."""
    return time.time()

def is_valid_email(email: str) -> bool:
    """Basic email validation for contact forms."""
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    return bool(re.match(pattern, email))

def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """Intelligently truncate text at word boundaries."""
    if len(text) <= max_length:
        return text

    truncated = text[:max_length - len(suffix)]
    # Try to break at a word boundary, but only if doing so keeps at least
    # 70% of the allowed length
    last_space = truncated.rfind(' ')
    if last_space > max_length * 0.7:
        truncated = truncated[:last_space]

    return truncated + suffix
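# A minimal sketch of the word-boundary behaviour above on a fabricated
# string: with max_length=20 the hard cut at 17 characters lands mid-word,
# and the last space (index 15, above the 0.7 * 20 = 14 threshold) becomes
# the break point instead:
def _demo_truncate():
    result = truncate_text("welcome onboard new employees today", max_length=20)
    assert result == "welcome onboard..."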
vector_store.py
ADDED
@@ -0,0 +1,804 @@
import chromadb
from chromadb.config import Settings
import numpy as np
from typing import List, Dict, Any, Optional, Tuple
import os
import sys
import json
import time
import streamlit as st
from pathlib import Path
import uuid
from config import Config
class BulletproofVectorStore:
    """
    Ultra-robust vector storage with bulletproof deletion mechanics.

    Engineering philosophy:
    - Atomic operations with rollback capability
    - Deep diagnostic feedback for troubleshooting
    - Multiple deletion strategies with fallback mechanisms
    - State synchronization with UI refresh triggers
    """

    def __init__(self):
        self.config = Config()
        self.client = self._initialize_chromadb_with_diagnostics()
        self.collection_name = "hr_knowledge_base"
        self.collection = self._get_or_create_collection_robust()
        self.deletion_diagnostics = {"operations": [], "performance_metrics": {}}
    def _initialize_chromadb_with_diagnostics(self) -> chromadb.Client:
        """Initialize ChromaDB with comprehensive error diagnosis and recovery."""
        try:
            data_dir = Path(self.config.VECTOR_DB_PATH)
            data_dir.mkdir(parents=True, exist_ok=True)

            client = chromadb.PersistentClient(
                path=str(data_dir),
                settings=Settings(
                    anonymized_telemetry=False,
                    allow_reset=True,
                    # Enhanced settings for deletion reliability
                    chroma_server_authn_credentials_file=None,
                    chroma_server_authn_provider=None
                )
            )

            # Verify client connection with a diagnostic test
            collections = client.list_collections()
            st.info(f"🔍 ChromaDB initialized successfully. Found {len(collections)} existing collections.")

            return client

        except Exception as initialization_error:
            st.error(f"🚨 ChromaDB initialization failed: {str(initialization_error)}")
            raise
    def _get_or_create_collection_robust(self) -> chromadb.Collection:
        """Get or create collection with enhanced error handling and validation."""
        try:
            # Attempt to get existing collection with diagnostic feedback
            try:
                collection = self.client.get_collection(
                    name=self.collection_name,
                    embedding_function=None
                )

                # Validate collection integrity
                collection_count = collection.count()
                st.success(f"✅ Connected to existing collection with {collection_count} items")
                return collection

            except Exception as get_error:
                st.info(f"📋 Creating new collection: {str(get_error)}")

            # Create new collection with enhanced metadata
            collection = self.client.create_collection(
                name=self.collection_name,
                embedding_function=None,
                metadata={
                    "description": "BLUESCARF AI HR Knowledge Base",
                    "created_at": time.time(),
                    "version": "2.0_bulletproof",
                    "deletion_engine": "enhanced"
                }
            )

            st.success("🎉 New collection created successfully")
            return collection

        except Exception as collection_error:
            st.error(f"💥 Collection setup failed: {str(collection_error)}")
            raise
    def delete_document_bulletproof(self, document_hash: str) -> bool:
        """
        Bulletproof document deletion with multiple strategies and deep diagnostics.

        Architecture:
        1. Pre-deletion validation and state capture
        2. Multiple deletion strategies with fallback mechanisms
        3. Post-deletion verification and cleanup
        4. UI state synchronization and user feedback

        Args:
            document_hash: Unique document identifier

        Returns:
            bool: True if deletion successful, False otherwise
        """
        deletion_session_id = str(uuid.uuid4())[:8]
        operation_start = time.time()

        st.info(f"🚀 **Deletion Engine Activated** (Session: {deletion_session_id})")

        # Phase 1: Pre-deletion diagnostics and validation
        validation_result = self._execute_pre_deletion_diagnostics(document_hash)
        if not validation_result["is_valid"]:
            st.error(f"❌ Pre-deletion validation failed: {validation_result['reason']}")
            return False

        st.success(f"✅ Validation passed - {validation_result['chunk_count']} chunks identified")

        # Phase 2: Execute deletion with multiple strategies
        deletion_strategies = [
            ("primary_where_clause", self._delete_via_where_clause),
            ("direct_id_deletion", self._delete_via_direct_ids),
            ("batch_deletion", self._delete_via_batch_operations),
            ("nuclear_reset", self._delete_via_collection_reset)
        ]

        for strategy_name, deletion_method in deletion_strategies:
            try:
                st.info(f"🔧 Executing {strategy_name.replace('_', ' ').title()} strategy...")

                deletion_success = deletion_method(document_hash, validation_result)

                if deletion_success:
                    # Phase 3: Post-deletion verification
                    verification_result = self._execute_post_deletion_verification(document_hash)

                    if verification_result["is_clean"]:
                        # Phase 4: Cleanup and UI synchronization
                        self._execute_comprehensive_cleanup(document_hash)
                        self._trigger_ui_state_refresh()

                        operation_time = time.time() - operation_start
                        st.success(f"🎉 **Deletion Complete!** ({operation_time:.2f}s using {strategy_name})")

                        # Record successful operation
                        self._record_deletion_success(deletion_session_id, strategy_name, operation_time)
                        return True
                    else:
                        st.warning(f"⚠️ {strategy_name} incomplete - trying next strategy")
                else:
                    st.warning(f"⚠️ {strategy_name} failed - trying next strategy")

            except Exception as strategy_error:
                st.error(f"💥 {strategy_name} error: {str(strategy_error)}")
                continue

        # All strategies failed - provide comprehensive diagnostics
        st.error("🚨 **All deletion strategies failed**")
        self._provide_failure_diagnostics(document_hash, deletion_session_id)
        return False
    def _execute_pre_deletion_diagnostics(self, document_hash: str) -> Dict[str, Any]:
        """Comprehensive pre-deletion validation with detailed diagnostics."""
        diagnostic_result = {
            "is_valid": False,
            "chunk_count": 0,
            "chunk_ids": [],
            "reason": "",
            "collection_status": {},
            "metadata_status": {}
        }

        try:
            # Collection integrity check
            collection_count = self.collection.count()
            diagnostic_result["collection_status"] = {
                "total_items": collection_count,
                "is_accessible": True,
                "connection_healthy": True
            }

            # Document existence verification with multiple query approaches
            query_results = self.collection.get(
                where={"document_hash": document_hash},
                include=['documents', 'metadatas']
            )

            if not query_results['ids']:
                # Try alternative query methods
                all_items = self.collection.get(include=['metadatas'])
                matching_items = [
                    item_id for item_id, metadata in zip(all_items['ids'], all_items['metadatas'])
                    if metadata.get('document_hash') == document_hash
                ]

                if matching_items:
                    diagnostic_result["chunk_ids"] = matching_items
                    diagnostic_result["chunk_count"] = len(matching_items)
                    diagnostic_result["is_valid"] = True
                    st.info(f"📋 Found document via alternative query: {len(matching_items)} chunks")
                else:
                    diagnostic_result["reason"] = "Document not found in collection"
                    return diagnostic_result
            else:
                diagnostic_result["chunk_ids"] = query_results['ids']
                diagnostic_result["chunk_count"] = len(query_results['ids'])
                diagnostic_result["is_valid"] = True

            # Metadata file verification
            metadata_file = Path(self.config.VECTOR_DB_PATH) / "metadata" / f"{document_hash}.json"
            diagnostic_result["metadata_status"] = {
                "file_exists": metadata_file.exists(),
                "file_path": str(metadata_file)
            }

            return diagnostic_result

        except Exception as diagnostic_error:
            diagnostic_result["reason"] = f"Diagnostic error: {str(diagnostic_error)}"
            return diagnostic_result
    def _delete_via_where_clause(self, document_hash: str, validation_data: Dict) -> bool:
        """Primary deletion strategy using WHERE clause filtering."""
        try:
            pre_count = self.collection.count()

            # Execute deletion with enhanced where clause
            self.collection.delete(where={"document_hash": document_hash})

            post_count = self.collection.count()
            deleted_count = pre_count - post_count

            st.info(f"📊 Where clause deletion: {deleted_count} items removed")
            return deleted_count > 0

        except Exception as where_error:
            st.error(f"Where clause deletion failed: {str(where_error)}")
            return False

    def _delete_via_direct_ids(self, document_hash: str, validation_data: Dict) -> bool:
        """Secondary deletion strategy using direct ID targeting."""
        try:
            chunk_ids = validation_data.get("chunk_ids", [])
            if not chunk_ids:
                return False

            # Delete by specific IDs in batches for reliability
            batch_size = 10
            deleted_total = 0

            for i in range(0, len(chunk_ids), batch_size):
                batch_ids = chunk_ids[i:i + batch_size]

                try:
                    self.collection.delete(ids=batch_ids)
                    deleted_total += len(batch_ids)
                    st.info(f"🗑️ Batch {i//batch_size + 1}: Deleted {len(batch_ids)} chunks")
                except Exception as batch_error:
                    st.warning(f"Batch deletion failed: {str(batch_error)}")
                    continue

            return deleted_total > 0

        except Exception as id_error:
            st.error(f"Direct ID deletion failed: {str(id_error)}")
            return False
    def _delete_via_batch_operations(self, document_hash: str, validation_data: Dict) -> bool:
        """Tertiary deletion strategy: rebuild the collection without the target document."""
        try:
            # Fetch everything (including embeddings) BEFORE touching the
            # collection, so the rebuild cannot lose data. The original sketch
            # dropped the collection first and then discovered it had no
            # embeddings to restore; pulling them here fixes that.
            all_items = self.collection.get(include=['documents', 'metadatas', 'embeddings'])

            # Identify items to keep (inverse deletion approach)
            items_to_keep = {
                'ids': [],
                'documents': [],
                'metadatas': [],
                'embeddings': []
            }

            for item_id, doc, metadata, embedding in zip(
                all_items['ids'], all_items['documents'],
                all_items['metadatas'], all_items['embeddings']
            ):
                if metadata.get('document_hash') != document_hash:
                    items_to_keep['ids'].append(item_id)
                    items_to_keep['documents'].append(doc)
                    items_to_keep['metadatas'].append(metadata)
                    items_to_keep['embeddings'].append(embedding)

            # Reset collection and add back only the items to keep
            collection_metadata = self.collection.metadata
            self.client.delete_collection(self.collection_name)

            self.collection = self.client.create_collection(
                name=self.collection_name,
                embedding_function=None,
                metadata=collection_metadata
            )

            if items_to_keep['ids']:
                self.collection.add(
                    ids=items_to_keep['ids'],
                    documents=items_to_keep['documents'],
                    metadatas=items_to_keep['metadatas'],
                    embeddings=items_to_keep['embeddings']
                )

            st.info("🔄 Batch operation completed")
            return True

        except Exception as batch_error:
            st.error(f"Batch operation failed: {str(batch_error)}")
            return False
    def _delete_via_collection_reset(self, document_hash: str, validation_data: Dict) -> bool:
        """Nuclear option: reset the collection and rebuild without the target document."""
        try:
            st.warning("⚠️ **NUCLEAR OPTION**: Rebuilding entire collection")

            # This is a last resort and requires careful implementation;
            # return False for now to avoid data loss
            st.error("Nuclear reset not implemented for safety - manual intervention required")
            return False

        except Exception as reset_error:
            st.error(f"Collection reset failed: {str(reset_error)}")
            return False
    def _execute_post_deletion_verification(self, document_hash: str) -> Dict[str, Any]:
        """Verify deletion completion with comprehensive checks."""
        verification_result = {
            "is_clean": False,
            "remaining_chunks": 0,
            "verification_methods": {}
        }

        try:
            # Method 1: WHERE clause verification
            where_results = self.collection.get(where={"document_hash": document_hash})
            remaining_via_where = len(where_results['ids'])
            verification_result["verification_methods"]["where_clause"] = remaining_via_where

            # Method 2: Full scan verification
            all_items = self.collection.get(include=['metadatas'])
            remaining_via_scan = sum(
                1 for metadata in all_items['metadatas']
                if metadata.get('document_hash') == document_hash
            )
            verification_result["verification_methods"]["full_scan"] = remaining_via_scan

            # Determine overall cleanliness
            verification_result["remaining_chunks"] = max(remaining_via_where, remaining_via_scan)
            verification_result["is_clean"] = verification_result["remaining_chunks"] == 0

            if verification_result["is_clean"]:
                st.success("✅ Verification passed - document completely removed")
            else:
                st.warning(f"⚠️ Verification found {verification_result['remaining_chunks']} remaining chunks")

            return verification_result

        except Exception as verification_error:
            st.error(f"Verification failed: {str(verification_error)}")
            verification_result["verification_error"] = str(verification_error)
            return verification_result
    def _execute_comprehensive_cleanup(self, document_hash: str):
        """Execute comprehensive cleanup of metadata and cached data."""
        try:
            # Remove metadata file
            metadata_file = Path(self.config.VECTOR_DB_PATH) / "metadata" / f"{document_hash}.json"
            if metadata_file.exists():
                metadata_file.unlink()
                st.info("🧹 Metadata file removed")

            # Clear any cached data in session state
            cache_keys_to_clear = [
                'admin_documents_cache',
                'document_list_cache',
                'admin_stats_cache'
            ]

            for key in cache_keys_to_clear:
                if key in st.session_state:
                    del st.session_state[key]

            st.info("🔄 Cache cleared")

        except Exception as cleanup_error:
            st.warning(f"Cleanup warning: {str(cleanup_error)}")
    def _trigger_ui_state_refresh(self):
        """Trigger comprehensive UI state refresh to reflect deletion."""
        # Force refresh of admin components
        refresh_triggers = [
            'admin_refresh_counter',
            'document_management_refresh',
            'collection_stats_refresh'
        ]

        for trigger in refresh_triggers:
            if trigger not in st.session_state:
                st.session_state[trigger] = 0
            st.session_state[trigger] += 1

        # Set global refresh flag
        st.session_state.force_admin_refresh = True
        st.info("🔄 UI refresh triggered")
    def _record_deletion_success(self, session_id: str, strategy: str, operation_time: float):
        """Record successful deletion for analytics and optimization."""
        success_record = {
            "session_id": session_id,
            "strategy_used": strategy,
            "operation_time": operation_time,
            "timestamp": time.time(),
            "collection_size_after": self.collection.count()
        }

        self.deletion_diagnostics["operations"].append(success_record)
        st.info(f"📊 Operation recorded: {strategy} in {operation_time:.2f}s")
    def _provide_failure_diagnostics(self, document_hash: str, session_id: str):
        """Provide comprehensive failure diagnostics for troubleshooting."""
        st.error("🚨 **DELETION FAILURE ANALYSIS**")

        diagnostic_data = {
            "session_id": session_id,
            "document_hash": document_hash[:16] + "...",
            "collection_info": {
                "total_items": self.collection.count(),
                "collection_name": self.collection_name
            },
            "attempted_strategies": ["where_clause", "direct_ids", "batch_operations"],
            "system_state": {
                "chromadb_version": chromadb.__version__,
                "python_version": f"{sys.version_info.major}.{sys.version_info.minor}"
            }
        }

        with st.expander("🔍 **Technical Diagnostics**", expanded=True):
            st.json(diagnostic_data)

        st.markdown("**🛠️ Troubleshooting Steps:**")
        st.write("1. **Verify Collection Access**: Check if collection is properly initialized")
        st.write("2. **Manual Verification**: Use admin panel to verify document existence")
        st.write("3. **System Restart**: Try refreshing the application")
        st.write("4. **Alternative Approach**: Use collection reset if data loss is acceptable")

        if st.button("🔄 **Force Collection Refresh**", key=f"force_refresh_{session_id}"):
            try:
                self.collection = self._get_or_create_collection_robust()
                st.success("✅ Collection refreshed - try deletion again")
                st.rerun()
            except Exception as refresh_error:
                st.error(f"Refresh failed: {str(refresh_error)}")
    # Keep all other existing methods from the original VectorStore class;
    # just replace the delete_document method with delete_document_bulletproof.

    def delete_document(self, document_hash: str) -> bool:
        """Wrapper method for backwards compatibility."""
        return self.delete_document_bulletproof(document_hash)
    # Include all other original methods here for completeness
    def add_document(self, processed_doc: Dict[str, Any]) -> bool:
        """Add processed document with chunks and embeddings to vector store."""
        try:
            # Check if document already exists
            existing_docs = self.get_documents_by_hash(processed_doc['document_hash'])
            if existing_docs:
                st.warning(f"Document {processed_doc['filename']} already exists in knowledge base")
                return False

            # Prepare data for ChromaDB
            chunk_ids = []
            embeddings = []
            documents = []
            metadatas = []

            for i, chunk in enumerate(processed_doc['chunks']):
                # Generate unique ID for each chunk
                chunk_id = f"{processed_doc['document_hash']}_{i}"
                chunk_ids.append(chunk_id)

                # Extract embedding
                embeddings.append(chunk['embedding'])

                # Store chunk content
                documents.append(chunk['content'])

                # Prepare metadata (ChromaDB doesn't support nested objects)
                metadata = {
                    'source': processed_doc['filename'],
                    'document_hash': processed_doc['document_hash'],
                    'chunk_index': chunk['metadata']['chunk_index'],
                    'chunk_type': chunk['metadata']['chunk_type'],
                    'processed_at': chunk['metadata'].get('processed_at', time.time()),
                    'content_length': len(chunk['content']),
                    'document_type': chunk['metadata'].get('document_type', 'hr_policy')
                }

                # Add section header if available
                if 'section_header' in chunk['metadata']:
                    metadata['section_header'] = chunk['metadata']['section_header']

                metadatas.append(metadata)

            # Add to collection in batch for efficiency
            self.collection.add(
                ids=chunk_ids,
                embeddings=embeddings,
                documents=documents,
                metadatas=metadatas
            )

            # Store document-level metadata separately
            self._store_document_metadata(processed_doc)

            st.success(f"✅ Added {len(chunk_ids)} chunks from {processed_doc['filename']} to knowledge base")
            return True

        except Exception as e:
            st.error(f"Failed to add document to vector store: {str(e)}")
            return False
    def _store_document_metadata(self, processed_doc: Dict[str, Any]):
        """Store document-level metadata for management and tracking."""
        try:
            metadata_dir = Path(self.config.VECTOR_DB_PATH) / "metadata"
            metadata_dir.mkdir(exist_ok=True)

            metadata_file = metadata_dir / f"{processed_doc['document_hash']}.json"

            doc_metadata = {
                'filename': processed_doc['filename'],
                'document_hash': processed_doc['document_hash'],
                'chunk_count': processed_doc['chunk_count'],
                'total_tokens': processed_doc['total_tokens'],
                'processed_at': time.time(),
                'metadata': processed_doc['metadata']
            }

            with open(metadata_file, 'w') as f:
                json.dump(doc_metadata, f, indent=2)

        except Exception as e:
            st.warning(f"Failed to store document metadata: {str(e)}")

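    # Illustrative sketch: each document gets a JSON sidecar file named
    # <document_hash>.json under VECTOR_DB_PATH/metadata. A hypothetical
    # example of its contents (values are made up for illustration):
    #
    #   {
    #     "filename": "leave_policy.pdf",
    #     "document_hash": "9f2c41ab...",
    #     "chunk_count": 18,
    #     "total_tokens": 5120,
    #     "processed_at": 1716150000.0,
    #     "metadata": {...}
    #   }
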
    def similarity_search(self, query: str, k: int = 5, filter_metadata: Optional[Dict] = None) -> List[Dict[str, Any]]:
        """Perform semantic similarity search with advanced filtering and ranking."""
        try:
            # Import here to avoid loading model at startup
            from sentence_transformers import SentenceTransformer

            # Generate query embedding
            embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
            query_embedding = embedding_model.encode([query], normalize_embeddings=True)[0].tolist()

            # Perform similarity search
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=min(k * 2, 20),  # Get more results for re-ranking
                where=filter_metadata,
                include=['documents', 'metadatas', 'distances']
            )

            if not results['documents'][0]:
                return []

            # Process and rank results
            processed_results = []
            for i, (doc, metadata, distance) in enumerate(zip(
                results['documents'][0],
                results['metadatas'][0],
                results['distances'][0]
            )):
                # Convert distance to similarity score
                similarity_score = 1.0 - distance

                # Apply content-based scoring
                content_score = self._calculate_content_relevance(query, doc)

                # Combine scores with weighting
                final_score = (similarity_score * 0.7) + (content_score * 0.3)

                processed_results.append({
                    'content': doc,
                    'metadata': metadata,
                    'similarity_score': similarity_score,
                    'content_score': content_score,
                    'final_score': final_score,
                    'rank': i + 1
                })

            # Sort by final score and return top k
            processed_results.sort(key=lambda x: x['final_score'], reverse=True)
            return processed_results[:k]

        except Exception as e:
            st.error(f"Similarity search failed: {str(e)}")
            return []

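    # Worked example of the hybrid ranking above (illustrative numbers only):
    # a chunk returned at cosine distance 0.25 gets
    #   similarity_score = 1.0 - 0.25 = 0.75
    # and if its content relevance score is 0.60, the combined score is
    #   final_score = 0.75 * 0.7 + 0.60 * 0.3 = 0.525 + 0.18 = 0.705
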
    def _calculate_content_relevance(self, query: str, content: str) -> float:
        """Calculate content-based relevance score using keyword matching and context analysis."""
        try:
            query_words = set(query.lower().split())
            content_words = set(content.lower().split())

            # Keyword overlap score
            common_words = query_words.intersection(content_words)
            keyword_score = len(common_words) / len(query_words) if query_words else 0

            # Length penalty for very short chunks
            length_score = min(len(content) / 200, 1.0)

            # Section header bonus
            if any(word in content.lower()[:100] for word in ['policy', 'procedure', 'guidelines']):
                header_bonus = 0.1
            else:
                header_bonus = 0

            return min(keyword_score + length_score * 0.3 + header_bonus, 1.0)

        except Exception:
            return 0.5  # Default score if calculation fails

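    # Worked example (hypothetical query and chunk): for the query
    # "annual leave policy" against a 150-character chunk that starts with
    # "Leave policy ..." and contains the words "leave" and "policy":
    #   keyword_score = 2/3 ≈ 0.667
    #   length_score  = min(150/200, 1.0) = 0.75, weighted 0.75 * 0.3 = 0.225
    #   header_bonus  = 0.1 ("policy" appears in the first 100 characters)
    #   score = min(0.667 + 0.225 + 0.1, 1.0) ≈ 0.992
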
    def get_documents_by_hash(self, document_hash: str) -> List[Dict[str, Any]]:
        """Retrieve all chunks for a specific document by hash."""
        try:
            results = self.collection.get(
                where={"document_hash": document_hash},
                include=['documents', 'metadatas']
            )

            chunks = []
            for doc, metadata in zip(results['documents'], results['metadatas']):
                chunks.append({
                    'content': doc,
                    'metadata': metadata
                })

            return chunks

        except Exception as e:
            st.error(f"Failed to retrieve document: {str(e)}")
            return []

    def get_all_documents(self) -> List[Dict[str, Any]]:
        """Get metadata for all documents in the knowledge base."""
        try:
            # Get all chunk metadata from the collection
            results = self.collection.get(include=['metadatas'])

            if not results['metadatas']:
                return []

            # Group chunks by document hash
            documents = {}
            for metadata in results['metadatas']:
                doc_hash = metadata['document_hash']
                if doc_hash not in documents:
                    documents[doc_hash] = {
                        'document_hash': doc_hash,
                        'filename': metadata['source'],
                        'document_type': metadata.get('document_type', 'hr_policy'),
                        'processed_at': metadata.get('processed_at', 0),
                        'chunk_count': 0
                    }
                documents[doc_hash]['chunk_count'] += 1

            # Merge in document-level metadata from the sidecar JSON files
            metadata_dir = Path(self.config.VECTOR_DB_PATH) / "metadata"
            if metadata_dir.exists():
                for metadata_file in metadata_dir.glob("*.json"):
                    try:
                        with open(metadata_file, 'r') as f:
                            file_metadata = json.load(f)
                        doc_hash = file_metadata['document_hash']
                        if doc_hash in documents:
                            documents[doc_hash].update(file_metadata)
                    except Exception:
                        # Skip unreadable or malformed metadata files
                        continue

            return list(documents.values())

        except Exception as e:
            st.error(f"Failed to retrieve documents: {str(e)}")
            return []

    def get_document_count(self) -> int:
        """Get total number of documents in knowledge base."""
        try:
            documents = self.get_all_documents()
            return len(documents)
        except Exception:
            return 0

    def get_total_chunks(self) -> int:
        """Get total number of chunks in knowledge base."""
        try:
            # collection.count() returns the number of stored chunks
            return self.collection.count()
        except Exception:
            return 0

    def get_collection_stats(self) -> Dict[str, Any]:
        """Get comprehensive statistics about the knowledge base."""
        try:
            documents = self.get_all_documents()
            total_chunks = self.get_total_chunks()

            if not documents:
                return {
                    'total_documents': 0,
                    'total_chunks': 0,
                    'avg_chunks_per_doc': 0,
                    'document_types': {},
                    'latest_update': None
                }

            # Calculate statistics
            document_types = {}
            latest_update = 0

            for doc in documents:
                doc_type = doc.get('document_type', 'unknown')
                document_types[doc_type] = document_types.get(doc_type, 0) + 1

                processed_at = doc.get('processed_at', 0)
                if processed_at > latest_update:
                    latest_update = processed_at

            avg_chunks = total_chunks / len(documents) if documents else 0

            return {
                'total_documents': len(documents),
                'total_chunks': total_chunks,
                'avg_chunks_per_doc': round(avg_chunks, 1),
                'document_types': document_types,
                'latest_update': latest_update,
                'storage_path': str(self.config.VECTOR_DB_PATH)
            }

        except Exception as e:
            st.error(f"Failed to get collection stats: {str(e)}")
            return {}

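    # Illustrative sketch of a typical return value, with hypothetical
    # numbers and a hypothetical storage path:
    #
    #   {
    #       'total_documents': 4,
    #       'total_chunks': 112,
    #       'avg_chunks_per_doc': 28.0,
    #       'document_types': {'hr_policy': 3, 'handbook': 1},
    #       'latest_update': 1716150000.0,
    #       'storage_path': '/data/vector_db'
    #   }
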
    def reset_collection(self) -> bool:
        """Reset the entire knowledge base (use with caution)."""
        try:
            # Delete collection
            self.client.delete_collection(self.collection_name)

            # Recreate collection
            self.collection = self._get_or_create_collection_robust()

            # Clean up metadata files
            metadata_dir = Path(self.config.VECTOR_DB_PATH) / "metadata"
            if metadata_dir.exists():
                for metadata_file in metadata_dir.glob("*.json"):
                    metadata_file.unlink()

            st.success("✅ Knowledge base reset successfully")
            return True

        except Exception as e:
            st.error(f"Failed to reset collection: {str(e)}")
            return False

    def health_check(self) -> Dict[str, Any]:
        """Perform health check on vector store system."""
        try:
            # Check collection accessibility
            collection_healthy = True
            try:
                self.collection.count()
            except Exception:
                collection_healthy = False

            # Check storage path
            storage_accessible = Path(self.config.VECTOR_DB_PATH).exists()

            # Get basic stats
            stats = self.get_collection_stats()

            return {
                'collection_healthy': collection_healthy,
                'storage_accessible': storage_accessible,
                'total_documents': stats.get('total_documents', 0),
                'total_chunks': stats.get('total_chunks', 0),
                'last_check': time.time(),
                'status': 'healthy' if (collection_healthy and storage_accessible) else 'unhealthy'
            }

        except Exception as e:
            return {
                'status': 'error',
                'error_message': str(e),
                'last_check': time.time()
            }


# Replace the original VectorStore with our bulletproof version
VectorStore = BulletproofVectorStore
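
# Minimal usage sketch, kept as comments because the constructor signature is
# not shown in this excerpt; it assumes VectorStore takes a Config instance.
# Adjust to the actual signature in vector_store.py before running.
#
#   from config import Config
#
#   store = VectorStore(Config())
#   print(store.health_check()['status'])
#   for hit in store.similarity_search("How many vacation days do I get?", k=3):
#       print(round(hit['final_score'], 3), hit['metadata']['source'])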