Spaces:

softblackhole
/

auto-tagging-rag

Sleeping

App Files Files Community

soft.engineer commited on Nov 17, 2025

Commit

5ee86b8

1 Parent(s): 19167e3

initial project

Browse files

Files changed (31) hide show

.env.example +16 -0
.gitignore +63 -0
E2E_TEST_DESIGN.md +544 -0
EVALUATION_GUIDE.md +323 -0
HUGGINGFACE_DEPLOYMENT.md +325 -0
MODEL_DB_CONFIG.md +451 -0
README.md +605 -4
SETUP_GUIDE.md +512 -0
app.py +1552 -0
core/comparison.py +148 -0
core/eval.py +420 -0
core/index.py +389 -0
core/ingest.py +667 -0
core/report_generator.py +386 -0
core/reranker.py +183 -0
core/retrieval.py +507 -0
core/session_manager.py +353 -0
core/session_rag.py +100 -0
core/tag_generator.py +464 -0
core/utils.py +148 -0
core/visualization.py +291 -0
requirements.txt +55 -0
tests/README.md +283 -0
tests/__init__.py +2 -0
tests/conftest.py +154 -0
tests/test_accuracy.py +231 -0
tests/test_japanese_support.py +87 -0
tests/test_mcp_server.py +263 -0
tests/test_robustness.py +194 -0
tests/test_user_scenarios.py +174 -0
tests/test_ux.py +156 -0

.env.example ADDED Viewed

	@@ -0,0 +1,16 @@

+OPENAI_API_KEY=your_openai_api_key_here
+# =============================================================================
+# OPTIONAL CONFIGURATION VARIABLES
+# (Can be set but have defaults - see README for full documentation)
+# =============================================================================
+# These variables are optional and have sensible defaults.
+# Uncomment and modify only if you need to override defaults:
+# OPENAI_MODEL=gpt-4o-mini
+# OPENAI_EMBED_MODEL=text-embedding-3-small
+# ST_EMBED_MODEL=all-MiniLM-L6-v2
+# CHROMA_PERSIST_DIR=./chroma_data
+# SESSION_TIMEOUT=3600
+# LOG_LEVEL=INFO

.gitignore ADDED Viewed

	@@ -0,0 +1,63 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual Environment
+venv/
+ENV/
+env/
+.venv
+# Environment variables (SECURITY)
+.env
+.env.local
+.env.*.local
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+.DS_Store
+# Project specific
+chroma_data/
+reports/
+*.log
+*.sqlite3
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+.tox/
+# Jupyter Notebook
+.ipynb_checkpoints
+# Model files (large)
+*.bin
+*.safetensors
+# Temporary files
+*.tmp
+*.temp

E2E_TEST_DESIGN.md ADDED Viewed

	@@ -0,0 +1,544 @@

+# E2E Test Design Document
+## Overview
+This document outlines comprehensive End-to-End (E2E) test cases for the Auto Tagging RAG System. The test suite covers accuracy, user experience, and robustness scenarios, including testable cases for non-technical users.
+## Test Categories
+### 1. Accuracy Tests
+#### 1.1 Tag Generation Accuracy
+**Test Case: TAG-001 - English Document Tag Generation**
+- **Objective**: Verify accurate tag generation for English documents
+- **Steps**:
+  1. Upload sample English document (`sample_documents/emergency_procedures.txt`)
+  2. Select language: "en" or "Auto"
+  3. Click "Build RAG Index"
+  4. Verify tags generated match document content
+- **Expected Result**: Tags include relevant keywords like "emergency", "fire", "safety", "procedure"
+- **Success Criteria**: At least 5 relevant tags generated, tags match document topics
+**Test Case: TAG-002 - Japanese Document Tag Generation**
+- **Objective**: Verify accurate tag generation for Japanese documents
+- **Steps**:
+  1. Upload Japanese document (test document)
+  2. Select language: "ja" or "Auto"
+  3. Click "Build RAG Index"
+  4. Verify Japanese tags are generated correctly
+- **Expected Result**: Tags in Japanese characters, relevant to document content
+- **Success Criteria**: Tags are in Japanese, relevant to content
+**Test Case: TAG-003 - Manual Tag Integration**
+- **Objective**: Verify manual tags are added and prioritized
+- **Steps**:
+  1. Upload document
+  2. Add manual tags: "custom-tag-1, custom-tag-2"
+  3. Build RAG Index
+  4. Check tag visualization
+- **Expected Result**: Manual tags appear first in tag list, combined with auto-generated tags
+- **Success Criteria**: Manual tags are prepended, no duplicates
+#### 1.2 Retrieval Accuracy
+**Test Case: RET-001 - Base RAG Retrieval**
+- **Objective**: Verify Base RAG returns relevant documents
+- **Steps**:
+  1. Upload multiple sample documents
+  2. Build RAG Index
+  3. Search for "emergency procedures"
+  4. Verify returned documents are relevant
+- **Expected Result**: Top result contains emergency procedure content
+- **Success Criteria**: Precision@1 > 0.5
+**Test Case: RET-002 - Tag Filter RAG with OR Operator**
+- **Objective**: Verify tag filtering works with OR operator
+- **Steps**:
+  1. Upload documents with known tags
+  2. Search with tags: "fire, emergency" (OR operator)
+  3. Verify results match any tag
+- **Expected Result**: Results contain documents with "fire" OR "emergency" tags
+- **Success Criteria**: All results have at least one matching tag
+**Test Case: RET-003 - Tag Filter RAG with AND Operator**
+- **Objective**: Verify tag filtering works with AND operator
+- **Steps**:
+  1. Search with tags: "fire, emergency" (AND operator)
+  2. Verify results match all tags
+- **Expected Result**: Results contain documents with BOTH "fire" AND "emergency" tags
+- **Success Criteria**: All results have all specified tags
+**Test Case: RET-004 - Hybrid RAG Weight Tuning**
+- **Objective**: Verify hybrid search combines vector and tag scores correctly
+- **Steps**:
+  1. Search with vector_weight=0.7, tag_weight=0.3
+  2. Compare results with vector_weight=0.3, tag_weight=0.7
+  3. Verify different results based on weights
+- **Expected Result**: Results change based on weight configuration
+- **Success Criteria**: Different weight configurations produce different rankings
+**Test Case: RET-005 - Hybrid Rerank RAG**
+- **Objective**: Verify reranking improves result relevance
+- **Steps**:
+  1. Search with Hybrid RAG (baseline)
+  2. Search with Hybrid Rerank RAG
+  3. Compare top results
+- **Expected Result**: Reranked results show higher semantic similarity scores
+- **Success Criteria**: nDCG@3 improves with reranking
+#### 1.3 Evaluation Metrics Accuracy
+**Test Case: MET-001 - Precision@k Calculation**
+- **Objective**: Verify Precision@k is calculated correctly
+- **Steps**:
+  1. Use sample evaluation queries (`sample_evaluation_queries.json`)
+  2. Run evaluation with ground truth
+  3. Verify Precision@1, Precision@3, Precision@5 values
+- **Expected Result**: Precision@k values match expected ranges (0.0-1.0)
+- **Success Criteria**: Precision@3 > 0.3 for sample data
+**Test Case: MET-002 - nDCG@k Calculation**
+- **Objective**: Verify nDCG@k considers ranking order
+- **Steps**:
+  1. Run evaluation
+  2. Compare nDCG@3 and nDCG@5
+- **Expected Result**: nDCG@5 >= nDCG@3 (more results improve ranking)
+- **Success Criteria**: nDCG values increase with k
+**Test Case: MET-003 - MRR Calculation**
+- **Objective**: Verify MRR reflects first relevant result position
+- **Steps**:
+  1. Run evaluation with queries having clear first match
+  2. Verify MRR value
+- **Expected Result**: MRR reflects position of first relevant result
+- **Success Criteria**: MRR > 0.3 for sample queries
+**Test Case: MET-004 - User Satisfaction Integration**
+- **Objective**: Verify user satisfaction scores are recorded
+- **Steps**:
+  1. Provide user satisfaction JSON (`sample_user_satisfaction.json`)
+  2. Run evaluation
+  3. Check results include satisfaction scores
+- **Expected Result**: CSV/JSON reports include user_satisfaction column
+- **Success Criteria**: Satisfaction scores appear in all output formats
+### 2. User Experience Tests
+#### 2.1 Non-Technical User Scenarios
+**Test Case: UX-001 - First-Time User Document Upload**
+- **User Type**: Non-technical user
+- **Objective**: First-time user can upload and process documents
+- **Steps**:
+  1. Open application
+  2. Navigate to "Upload & Tagging" tab
+  3. Drag and drop PDF/TXT files
+  4. Leave language as "Auto"
+  5. Click "Build RAG Index"
+  6. Wait for processing
+- **Expected Result**:
+  - Files are uploaded successfully
+  - Processing completes without errors
+  - Tags are generated and displayed
+  - Document count updates
+- **Success Criteria**: User completes upload without reading documentation
+**Test Case: UX-002 - Simple Search Query**
+- **User Type**: Non-technical user
+- **Objective**: User can search documents without understanding technical details
+- **Steps**:
+  1. Upload documents
+  2. Go to "Search & Compare" tab
+  3. Enter query: "What are emergency procedures?"
+  4. Click "Search All Methods"
+  5. Review results
+- **Expected Result**: Results appear with readable text, no technical jargon visible
+- **Success Criteria**: User finds relevant information without confusion
+**Test Case: UX-003 - Chat Interface Usage**
+- **User Type**: Non-technical user
+- **Objective**: User can chat naturally with the system
+- **Steps**:
+  1. Go to "Chat Interface" tab
+  2. Type: "Tell me about fire safety"
+  3. Click "Send"
+  4. Review answer and sources
+- **Expected Result**: Natural language answer, sources visible in accordion
+- **Success Criteria**: Answer is clear and helpful, sources are accessible
+**Test Case: UX-004 - Evaluation for Non-Technical User**
+- **User Type**: Non-technical user
+- **Objective**: User can run basic evaluation with sample data
+- **Steps**:
+  1. Copy sample queries from `sample_evaluation_queries.json`
+  2. Paste into "Evaluation Queries" field
+  3. Click "Run Evaluation"
+  4. View results and charts
+- **Expected Result**: Charts display, results are understandable
+- **Success Criteria**: User can interpret charts without technical knowledge
+**Test Case: UX-005 - Session Persistence**
+- **User Type**: Non-technical user
+- **Objective**: User's data persists across browser refresh
+- **Steps**:
+  1. Upload documents and build index
+  2. Note document count
+  3. Refresh browser page
+  4. Check session and document count
+- **Expected Result**: Same session ID, same document count, data accessible
+- **Success Criteria**: No data loss after refresh
+#### 2.2 Advanced User Scenarios
+**Test Case: UX-006 - Tag Weight Tuning**
+- **User Type**: Technical user
+- **Objective**: Advanced user can tune hybrid search weights
+- **Steps**:
+  1. Go to "Search & Compare" tab
+  2. Adjust vector weight slider (0.0-1.0)
+  3. Adjust tag weight slider (0.0-1.0)
+  4. Search and compare results
+- **Expected Result**: Results change based on weight configuration
+- **Success Criteria**: Weights affect result ranking visibly
+**Test Case: UX-007 - Custom Tag Input**
+- **User Type**: Technical user
+- **Objective**: User can add custom tags during upload
+- **Steps**:
+  1. Upload document
+  2. Enter custom tags: "project-alpha, confidential"
+  3. Build index
+  4. Verify tags in visualization
+- **Expected Result**: Custom tags appear in tag list, used in filtering
+- **Success Criteria**: Custom tags work in tag-based search
+**Test Case: UX-008 - Export Functionality**
+- **User Type**: Technical user
+- **Objective**: User can export evaluation results
+- **Steps**:
+  1. Run evaluation
+  2. Click "Download CSV"
+  3. Click "Download Charts"
+- **Expected Result**: Files download automatically without extra clicks
+- **Success Criteria**: One-click download works, files are valid
+**Test Case: UX-009 - Multi-Document Processing**
+- **User Type**: Advanced user
+- **Objective**: User can process multiple documents simultaneously
+- **Steps**:
+  1. Upload 5+ documents
+  2. Build index
+  3. Verify all documents indexed
+- **Expected Result**: All documents processed, unique document count correct
+- **Success Criteria**: Document count matches number of uploaded files
+#### 2.3 UI/UX Quality Tests
+**Test Case: UX-010 - Responsive Design**
+- **Objective**: UI works on different screen sizes
+- **Steps**:
+  1. Test on desktop (1920x1080)
+  2. Test on tablet (768x1024)
+  3. Test on mobile (375x667)
+- **Expected Result**: All tabs accessible, forms usable, no horizontal scroll
+- **Success Criteria**: UI adapts to screen size
+**Test Case: UX-011 - Loading States**
+- **Objective**: Users see feedback during processing
+- **Steps**:
+  1. Upload large document (50+ pages)
+  2. Observe UI during processing
+- **Expected Result**: Progress indicators visible, status messages clear
+- **Success Criteria**: User understands system is working
+**Test Case: UX-012 - Error Messages**
+- **Objective**: Errors are user-friendly
+- **Steps**:
+  1. Upload invalid file (corrupted PDF)
+  2. Search with empty query
+  3. Run evaluation with invalid JSON
+- **Expected Result**: Clear error messages, actionable guidance
+- **Success Criteria**: Errors help user fix issues
+### 3. Robustness Tests
+#### 3.1 Error Handling
+**Test Case: ROB-001 - Invalid File Upload**
+- **Objective**: System handles invalid files gracefully
+- **Steps**:
+  1. Upload corrupted PDF file
+  2. Upload non-PDF/TXT file (e.g., .exe)
+  3. Upload empty file
+- **Expected Result**:
+  - Error message displayed
+  - System remains functional
+  - No crashes
+- **Success Criteria**: Graceful error handling, no crashes
+**Test Case: ROB-002 - Invalid JSON in Evaluation**
+- **Objective**: System handles malformed JSON
+- **Steps**:
+  1. Enter invalid JSON in evaluation queries
+  2. Click "Run Evaluation"
+- **Expected Result**: Clear error message about JSON format
+- **Success Criteria**: Error is helpful, system recovers
+**Test Case: ROB-003 - Empty Query Search**
+- **Objective**: System handles empty search queries
+- **Steps**:
+  1. Leave search query empty
+  2. Click "Search All Methods"
+- **Expected Result**: Error message or no results message
+- **Success Criteria**: No crashes, clear feedback
+**Test Case: ROB-004 - Missing Ground Truth in Evaluation**
+- **Objective**: System handles missing ground truth
+- **Steps**:
+  1. Create evaluation query without ground_truth field
+  2. Run evaluation
+- **Expected Result**: Evaluation runs, metrics are 0 or skipped
+- **Success Criteria**: System continues, no errors
+**Test Case: ROB-005 - Large Document Processing**
+- **Objective**: System handles large documents
+- **Steps**:
+  1. Upload 100+ page PDF
+  2. Build index
+  3. Monitor memory usage
+- **Expected Result**: Processing completes, no memory errors
+- **Success Criteria**: Large documents process successfully
+#### 3.2 Edge Cases
+**Test Case: ROB-006 - Very Short Documents**
+- **Objective**: System handles minimal content
+- **Steps**:
+  1. Upload document with 1 sentence
+  2. Build index
+  3. Search for content
+- **Expected Result**: Tags generated, search works
+- **Success Criteria**: Minimal content is processed
+**Test Case: ROB-007 - Special Characters in Documents**
+- **Objective**: System handles special characters
+- **Steps**:
+  1. Upload document with special characters (é, 日本語, 🎉)
+  2. Build index
+  3. Search with special characters
+- **Expected Result**: Special characters preserved, search works
+- **Success Criteria**: Unicode handled correctly
+**Test Case: ROB-008 - Concurrent Sessions**
+- **Objective**: Multiple users can use system simultaneously
+- **Steps**:
+  1. Open application in two browser windows
+  2. Upload different documents in each
+  3. Verify isolation
+- **Expected Result**: Each session has separate data, no interference
+- **Success Criteria**: Session isolation works correctly
+**Test Case: ROB-009 - Session Expiration Handling**
+- **Objective**: System handles expired sessions
+- **Steps**:
+  1. Create session
+  2. Wait for expiration (if configured)
+  3. Try to access session
+- **Expected Result**: New session created or session restored
+- **Success Criteria**: Graceful session handling
+**Test Case: ROB-010 - Network Interruption**
+- **Objective**: System handles offline mode
+- **Steps**:
+  1. Disconnect network
+  2. Upload documents
+  3. Search documents
+- **Expected Result**: Works offline (after initial model downloads)
+- **Success Criteria**: Offline functionality works
+#### 3.3 Data Integrity
+**Test Case: ROB-011 - Document Count Accuracy**
+- **Objective**: Document count reflects unique documents
+- **Steps**:
+  1. Upload 3 documents
+  2. Check document count
+  3. Upload 2 more documents
+  4. Verify count updates to 5
+- **Expected Result**: Count matches unique documents, not chunks
+- **Success Criteria**: Accurate document counting
+**Test Case: ROB-012 - Tag Consistency**
+- **Objective**: Same document produces consistent tags
+- **Steps**:
+  1. Upload document
+  2. Note tags
+  3. Reset index
+  4. Upload same document again
+  5. Compare tags
+- **Expected Result**: Tags are consistent (may vary slightly due to randomness)
+- **Success Criteria**: Similar tags generated for same document
+**Test Case: ROB-013 - Index Reset**
+- **Objective**: Reset clears all data correctly
+- **Steps**:
+  1. Upload documents
+  2. Build index
+  3. Click "Reset Index"
+  4. Verify document count is 0
+  5. Try to search
+- **Expected Result**: Count resets, search returns no results
+- **Success Criteria**: Complete reset functionality
+### 4. Performance Tests
+#### 4.1 Response Time
+**Test Case: PERF-001 - Document Upload Speed**
+- **Objective**: Upload completes in reasonable time
+- **Steps**:
+  1. Upload 10 documents (total ~1MB)
+  2. Measure time to completion
+- **Expected Result**: Processing completes within 30 seconds
+- **Success Criteria**: < 30s for 10 documents
+**Test Case: PERF-002 - Search Latency**
+- **Objective**: Search returns results quickly
+- **Steps**:
+  1. Index 100 documents
+  2. Measure search latency for each method
+- **Expected Result**:
+  - Base RAG: < 1s
+  - Tag Filter: < 1s
+  - Hybrid: < 2s
+  - Hybrid Rerank: < 5s
+- **Success Criteria**: All methods meet latency targets
+**Test Case: PERF-003 - Evaluation Speed**
+- **Objective**: Evaluation completes in reasonable time
+- **Steps**:
+  1. Run evaluation with 10 queries, 3 k-values
+  2. Measure total time
+- **Expected Result**: Completes within 60 seconds
+- **Success Criteria**: < 60s for 10 queries × 4 pipelines × 3 k-values
+#### 4.2 Scalability
+**Test Case: PERF-004 - Large Document Set**
+- **Objective**: System handles 1000+ documents
+- **Steps**:
+  1. Index 1000 documents
+  2. Perform searches
+  3. Monitor memory and CPU
+- **Expected Result**: System remains responsive
+- **Success Criteria**: Memory usage reasonable, search latency acceptable
+**Test Case: PERF-005 - Concurrent Users**
+- **Objective**: Multiple users don't degrade performance
+- **Steps**:
+  1. Simulate 5 concurrent users
+  2. Each performs searches
+  3. Monitor performance
+- **Expected Result**: No significant performance degradation
+- **Success Criteria**: Latency stays within acceptable range
+### 5. Integration Tests
+#### 5.1 API Integration
+**Test Case: INT-001 - Gradio API Access**
+- **Objective**: Verify API endpoints work
+- **Steps**:
+  1. Use Gradio Client to call API
+  2. Test build_rag API
+  3. Test search API
+  4. Test evaluate API
+- **Expected Result**: All APIs return expected results
+- **Success Criteria**: API endpoints functional
+**Test Case: INT-002 - MCP Server Integration**
+- **Objective**: Verify MCP server functions
+- **Steps**:
+  1. Connect MCP client
+  2. Call MCP tools
+  3. Verify responses
+- **Expected Result**: MCP tools work correctly
+- **Success Criteria**: MCP integration functional
+#### 5.2 Data Flow
+**Test Case: INT-003 - End-to-End Workflow**
+- **Objective**: Complete workflow from upload to evaluation
+- **Steps**:
+  1. Upload documents
+  2. Build index
+  3. Search documents
+  4. Run evaluation
+  5. Download reports
+- **Expected Result**: All steps complete successfully
+- **Success Criteria**: Complete workflow functional
+## Test Execution Guidelines
+### Test Environment Setup
+1. **Prerequisites**:
+   - Python 3.8+
+   - All dependencies installed (`pip install -r requirements.txt`)
+   - spaCy model downloaded (`python -m spacy download en_core_web_sm`)
+   - Sample documents available in `sample_documents/`
+2. **Test Data**:
+   - Use provided sample documents
+   - Use `sample_evaluation_queries.json` for evaluation tests
+   - Use `sample_user_satisfaction.json` for satisfaction tests
+3. **Test Execution**:
+   - Manual tests: Follow step-by-step instructions
+   - Automated tests: Run pytest (if implemented)
+   - Document results and any issues found
+### Test Reporting
+For each test case:
+1. **Status**: Pass / Fail / Blocked / Not Tested
+2. **Notes**: Observations, issues, screenshots
+3. **Environment**: Browser, OS, Python version
+4. **Date**: Test execution date
+### Non-Technical User Testing
+For UX tests (UX-001 to UX-005):
+- Use actual non-technical users when possible
+- Provide minimal instruction
+- Observe user behavior
+- Note confusion points
+- Measure task completion time
+## Success Criteria Summary
+### Must Pass (Critical)
+- All Accuracy Tests (TAG-001 to MET-004)
+- All Robustness Tests (ROB-001 to ROB-013)
+- Core UX Tests (UX-001, UX-002, UX-005)
+### Should Pass (Important)
+- Advanced UX Tests (UX-006 to UX-012)
+- Performance Tests (PERF-001 to PERF-003)
+- Integration Tests (INT-001 to INT-003)
+### Nice to Have
+- Scalability Tests (PERF-004, PERF-005)
+- Edge case handling
+- Performance optimizations
+## Test Maintenance
+- Update test cases when features change
+- Add new test cases for new features
+- Review and refine test cases quarterly
+- Keep test data updated

EVALUATION_GUIDE.md ADDED Viewed

	@@ -0,0 +1,323 @@

+# Evaluation Guide - How to Run Evaluation with Samples
+## Overview
+The **Analytics & Evaluation** tab allows you to run comprehensive quantitative evaluation of all 4 retrieval methods using test queries with ground truth documents.
+## Input Format
+### 1. Evaluation Queries (JSON)
+**Required Format:**
+```json
+[
+  {
+    "query": "Your question here",
+    "ground_truth": ["chunk_content_1", "chunk_content_2"],
+    "k_values": [1, 3, 5],
+    "tags": ["tag1", "tag2"],
+    "tag_operator": "OR",
+    "vector_weight": 0.7,
+    "tag_weight": 0.3
+  }
+]
+```
+**Fields:**
+- **`query`** (required): The search question/query string
+- **`ground_truth`** (required): List of actual document contents that should be retrieved. These should match the **actual text content** of chunks in your indexed documents.
+- **`k_values`** (optional): List of k values to test (default: `[1, 3, 5]`)
+- **`tags`** (optional): Tags for tag-based pipelines
+- **`tag_operator`** (optional): `"OR"`, `"AND"`, or `"NOT"` (default: `"OR"`)
+- **`vector_weight`** (optional): For hybrid pipelines (default: `0.7`)
+- **`tag_weight`** (optional): For hybrid pipelines (default: `0.3`)
+### 2. User Satisfaction Scores (JSON, Optional)
+**Format:**
+```json
+{
+  "query_0": 4.5,
+  "query_1": 3.8,
+  "query_2": 5.0
+}
+```
+- Keys are `"query_0"`, `"query_1"`, etc. (index-based)
+- Values are satisfaction scores (typically 1-5)
+## Sample Evaluation Input
+### Example 1: Basic Evaluation
+```json
+[
+  {
+    "query": "What are the emergency procedures for fire incidents?",
+    "ground_truth": [
+      "In case of fire, immediately activate the nearest fire alarm and evacuate the building following the posted exit routes.",
+      "Fire safety protocols require all personnel to know the location of fire extinguishers and emergency exits.",
+      "During fire emergencies, do not use elevators and stay low to avoid smoke inhalation."
+    ],
+    "k_values": [1, 3, 5]
+  },
+  {
+    "query": "What equipment is needed for patient safety monitoring?",
+    "ground_truth": [
+      "Standard patient monitoring equipment includes blood pressure cuffs, pulse oximeters, and ECG monitors.",
+      "Safety monitoring requires regular calibration of medical devices and documented maintenance logs."
+    ],
+    "k_values": [1, 3, 5]
+  }
+]
+```
+### Example 2: With Tags
+```json
+[
+  {
+    "query": "What are surgical safety protocols?",
+    "ground_truth": [
+      "All surgical procedures require pre-operative checklists and sterile environment protocols.",
+      "Surgical safety includes patient identification verification and site marking procedures.",
+      "Post-operative care involves monitoring vital signs and wound care instructions."
+    ],
+    "k_values": [1, 3, 5],
+    "tags": ["surgery", "safety", "protocol"],
+    "tag_operator": "AND"
+  },
+  {
+    "query": "How to handle medical emergencies?",
+    "ground_truth": [
+      "Medical emergency response begins with assessing patient ABC (Airway, Breathing, Circulation).",
+      "Emergency protocols require immediate notification of medical team and preparation of emergency equipment."
+    ],
+    "k_values": [1, 3, 5],
+    "tags": ["emergency", "medical", "response"],
+    "tag_operator": "OR"
+  }
+]
+```
+### Example 3: With User Satisfaction
+**Evaluation Queries:**
+```json
+[
+  {
+    "query": "What are infection control measures?",
+    "ground_truth": [
+      "Infection control requires hand hygiene, use of personal protective equipment, and proper sterilization of instruments.",
+      "Standard precautions must be followed for all patients to prevent transmission of infectious diseases."
+    ],
+    "k_values": [1, 3, 5]
+  },
+  {
+    "query": "What are patient care guidelines?",
+    "ground_truth": [
+      "Patient care guidelines emphasize respect for patient autonomy, informed consent, and maintaining confidentiality.",
+      "Care protocols require documentation of all interventions and regular assessment of patient condition."
+    ],
+    "k_values": [1, 3, 5]
+  }
+]
+```
+**User Satisfaction Scores:**
+```json
+{
+  "query_0": 4.5,
+  "query_1": 4.2
+}
+```
+## Step-by-Step Instructions
+### Step 1: Upload Documents
+1. Go to **Upload & Tagging** tab
+2. Upload your PDF/TXT documents
+3. Click **"Build RAG Index"**
+4. Wait for indexing to complete
+### Step 2: Prepare Ground Truth
+**Important:** Ground truth must match the **actual text content** of chunks in your indexed documents.
+**How to find ground truth:**
+1. Use **Search & Compare** tab to search for similar queries
+2. Check the retrieved document content
+3. Copy the exact text from relevant chunks
+4. Use these as your `ground_truth` array
+**Example:**
+If a chunk contains:
+```
+"Fire safety protocols require all personnel to know the location of fire extinguishers and emergency exits."
+```
+Then use:
+```json
+"ground_truth": ["Fire safety protocols require all personnel to know the location of fire extinguishers and emergency exits."]
+```
+### Step 3: Enter Evaluation Queries
+1. Go to **Analytics & Evaluation** tab
+2. In **"Evaluation Queries (JSON)"** field, paste your JSON array
+3. Use the sample format above as a template
+### Step 4: (Optional) Add User Satisfaction
+1. In **"User Satisfaction Scores (JSON, optional)"** field
+2. Enter satisfaction scores as JSON object
+3. Use `query_0`, `query_1`, etc. as keys
+### Step 5: Set Output Filename
+1. In **"Output Filename"** field
+2. Enter filename (e.g., `evaluation_results.csv`)
+3. Results will be saved to `reports/` directory
+### Step 6: Run Evaluation
+1. Click **"Run Evaluation"** button
+2. Wait for evaluation to complete (may take several minutes)
+3. Results will appear in:
+   - **Evaluation Status**: Summary message
+   - **Evaluation Results**: DataFrame with all metrics
+   - **Summary Statistics**: Aggregated metrics by pipeline
+   - **Visualization Tabs**: Charts and graphs
+## Understanding Results
+### Metrics Explained
+- **Precision@k**: Fraction of retrieved documents that are relevant
+  - Range: 0.0 - 1.0 (higher is better)
+  - Example: 0.8 means 80% of retrieved docs are relevant
+- **nDCG@k**: Normalized Discounted Cumulative Gain
+  - Range: 0.0 - 1.0 (higher is better)
+  - Measures ranking quality with position weighting
+- **Hit@k**: Whether at least one relevant document is in top-k
+  - Value: 0.0 or 1.0 (1.0 = found at least one relevant doc)
+- **MRR**: Mean Reciprocal Rank
+  - Range: 0.0 - 1.0 (higher is better)
+  - Average of 1/rank where first relevant doc appears
+- **Semantic Similarity**: Average cosine similarity between query and retrieved docs
+  - Range: 0.0 - 1.0 (higher is better)
+- **Latency**: Response time in seconds (lower is better)
+- **User Satisfaction**: Average satisfaction score (if provided)
+  - Range: depends on your scale (typically 1-5)
+### Results DataFrame
+Columns include:
+- `query_id`: Query identifier
+- `query`: Query text
+- `pipeline`: Pipeline name (base_rag, tag_filter_rag, hybrid_rag, hybrid_rerank_rag)
+- `k`: Number of results requested
+- `precision_at_k`: Precision metric
+- `ndcg_at_k`: nDCG metric
+- `hit_at_k`: Hit metric
+- `mrr`: MRR metric
+- `semantic_similarity`: Similarity score
+- `latency`: Response time
+- `retrieved_count`: Number of documents retrieved
+- `user_satisfaction`: Satisfaction score (if provided)
+## Common Issues and Solutions
+### Issue 1: "No results found" or Low Precision
+**Problem:** Ground truth doesn't match indexed documents
+**Solution:**
+1. Check that ground truth text **exactly matches** chunk content
+2. Use **Search & Compare** to verify what's actually indexed
+3. Copy exact text from retrieved chunks
+### Issue 2: "Invalid JSON format"
+**Problem:** JSON syntax error
+**Solution:**
+1. Validate JSON using an online JSON validator
+2. Ensure all strings are in double quotes `"`, not single quotes `'`
+3. Ensure no trailing commas
+4. Check brackets and braces are balanced
+### Issue 3: Evaluation Takes Too Long
+**Problem:** Too many queries or high k values
+**Solution:**
+1. Start with 2-3 queries
+2. Use lower k values (e.g., `[1, 3]` instead of `[1, 3, 5, 10]`)
+3. Evaluation runs sequentially - be patient
+### Issue 4: All Metrics Are Zero
+**Problem:** Ground truth doesn't match any retrieved documents
+**Solution:**
+1. Verify documents are actually indexed (check document count)
+2. Check that ground truth text matches indexed chunk content exactly
+3. Use semantic matching threshold (system uses ~0.8 similarity threshold)
+## Tips for Better Evaluation
+1. **Start Small**: Begin with 2-3 queries to test the format
+2. **Verify Ground Truth**: Always check what's actually indexed before creating ground truth
+3. **Use Representative Queries**: Include queries that reflect real user needs
+4. **Test Different k Values**: Try `[1, 3, 5]` to see how results improve with more documents
+5. **Compare Methods**: Use evaluation to see which pipeline performs best for your data
+6. **Include Edge Cases**: Test with queries that might not have perfect matches
+## Output Files
+Evaluation generates several files in `reports/` directory:
+1. **CSV File**: `evaluation_results.csv` - Detailed metrics per query/pipeline/k
+2. **JSON File**: `evaluation_results.json` - Complete results with summary
+3. **PNG Charts**: Various visualization charts in `reports/visualizations/`
+4. **HTML Report**: Comprehensive report with embedded charts
+## Sample Workflow
+1. **Upload documents** → Index with tags
+2. **Search manually** → Find relevant chunks
+3. **Create queries** → Based on document topics
+4. **Extract ground truth** → Copy exact chunk text
+5. **Run evaluation** → Get quantitative metrics
+6. **Analyze results** → Compare pipeline performance
+7. **Iterate** → Refine queries and ground truth
+## Quick Reference
+**Minimal Valid Input:**
+```json
+[
+  {
+    "query": "Your question",
+    "ground_truth": ["Exact chunk text 1", "Exact chunk text 2"]
+  }
+]
+```
+**Full Input Example:**
+```json
+[
+  {
+    "query": "What are safety protocols?",
+    "ground_truth": ["Safety protocol text from indexed document"],
+    "k_values": [1, 3, 5],
+    "tags": ["safety", "protocol"],
+    "tag_operator": "OR"
+  }
+]
+```
+Remember: Ground truth must **exactly match** the content of your indexed document chunks!

HUGGINGFACE_DEPLOYMENT.md ADDED Viewed

	@@ -0,0 +1,325 @@

+# Hugging Face Spaces Deployment Guide
+Complete guide for deploying Auto Tagging RAG System to Hugging Face Spaces.
+## Quick Start
+### Prerequisites
+1. **Hugging Face Account**: Sign up at [huggingface.co](https://huggingface.co)
+2. **Git**: Installed and configured
+3. **Repository**: Your project code pushed to a Git repository
+### Step 1: Create a New Space
+1. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
+2. Click **"Create new Space"**
+3. Configure:
+   - **Space name**: `auto-tagging-rag` (or your preferred name)
+   - **SDK**: Select **Gradio**
+   - **Visibility**: Public or Private
+   - **Hardware**: CPU (free) or GPU (paid) if needed
+### Step 2: Clone and Setup
+```bash
+# Clone your Space repository
+git clone https://huggingface.co/spaces/YOUR_USERNAME/auto-tagging-rag
+cd auto-tagging-rag
+# Copy your project files to the Space repository
+cp -r /path/to/your/auto_tagging_rag/* .
+cp /path/to/your/auto_tagging_rag/.gitignore .  # If exists
+```
+### Step 3: Verify Files Structure
+Ensure these files are present:
+```
+.
+├── app.py                    # Gradio application entry point
+├── requirements.txt          # Python dependencies
+├── README.md                 # Space description
+└── core/                     # Core modules directory
+    ├── __init__.py
+    ├── ingest.py
+    ├── index.py
+    ├── retrieval.py
+    ├── eval.py
+    ├── tag_generator.py
+    ├── reranker.py
+    ├── comparison.py
+    ├── visualization.py
+    ├── report_generator.py
+    ├── session_manager.py
+    ├── session_rag.py
+    └── utils.py
+```
+### Step 4: Update README.md for Spaces
+Your `README.md` should include Hugging Face Spaces frontmatter (already included):
+```markdown
+---
+title: Auto Tagging RAG System
+emoji: 📚
+colorFrom: indigo
+colorTo: blue
+sdk: gradio
+sdk_version: 5.49.1
+app_file: app.py
+pinned: false
+license: mit
+python_version: "3.10"
+---
+```
+### Step 5: Configure Environment Variables
+In your Hugging Face Space (if using OpenAI features):
+1. Go to **Settings** → **Repository secrets**
+2. Add security-important variables:
+   - `OPENAI_API_KEY`: Your OpenAI API key (required only if using OpenAI embeddings or tag generation)
+     - Get from: [https://platform.openai.com/api-keys](https://platform.openai.com/api-keys)
+**Note**:
+- Only `OPENAI_API_KEY` is security-sensitive and should be set as a secret
+- All other configuration variables have sensible defaults
+- Environment variables are automatically injected into the Space container
+### Step 6: Install Dependencies
+Ensure `requirements.txt` includes all necessary packages:
+```txt
+gradio==5.49.1
+gradio-client==1.13.3
+langchain>=0.1.0
+langchain-community>=0.0.0
+chromadb>=0.4.0
+pypdf>=3.0.0
+PyPDF2>=3.0.0
+sentence-transformers>=2.2.0
+tiktoken>=0.5.0
+yake>=0.4.0
+keybert>=0.8.0
+spacy>=3.7.0
+janome>=0.5.0
+openai>=1.0.0
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+python-dotenv>=1.0.0
+PyYAML>=6.0
+numpy>=1.21.0
+pandas>=1.5.0
+scikit-learn>=1.2.0
+matplotlib>=3.5.0
+jinja2>=3.1.0
+mcp>=1.0.0
+fastapi>=0.110.0
+starlette>=0.36.3
+uvicorn>=0.23.0
+```
+**Important for Hugging Face Spaces**:
+- Models (SentenceTransformers, spaCy) are downloaded automatically on first run
+- No manual model download needed
+- Models are cached in `/tmp` or container storage
+### Step 7: Handle Model Downloads in Code
+Your `app.py` should handle model downloads gracefully. The system already does this:
+```python
+# In core/index.py - SentenceTransformers uses local_files_only=False on first run
+# In core/tag_generator.py - spaCy models are loaded with error handling
+```
+**For spaCy English model**: The code automatically downloads it on first use if not available locally.
+### Step 8: Configure Persistence Directory
+For Hugging Face Spaces, use a writable directory:
+```python
+# In app.py or core modules
+import os
+# Default for Spaces
+PERSIST_DIR = os.getenv("CHROMA_PERSIST_DIR", "/tmp/chroma_data")
+# For Spaces, /tmp is writable and persists during container lifecycle
+# Note: Data is cleared on Space restart
+```
+### Step 9: Commit and Push
+```bash
+git add .
+git commit -m "Initial commit: Auto Tagging RAG System"
+git push
+```
+### Step 10: Build and Deploy
+1. Hugging Face Spaces will automatically build your Space
+2. Check the **Logs** tab for build progress
+3. Wait for build to complete (usually 2-5 minutes)
+4. Your Space will be available at: `https://huggingface.co/spaces/YOUR_USERNAME/auto-tagging-rag`
+## Configuration for Spaces
+### Environment Variables in Space Settings
+Go to **Settings** → **Repository secrets** and add (if using OpenAI):
+| Variable | Description | Required | Example |
+|----------|-------------|----------|---------|
+| `OPENAI_API_KEY` | OpenAI API key for embeddings/tag generation | Only if using OpenAI | `sk-...` |
+**Security Note**:
+- Only `OPENAI_API_KEY` is security-sensitive and should be set as a Repository Secret
+- All other configuration variables have sensible defaults and don't need to be set
+- See `SETUP_GUIDE.md` for optional configuration variables
+### Hardware Requirements
+**CPU (Free Tier)**:
+- Sufficient for small to medium datasets
+- Good for testing and demonstrations
+- Model downloads may take longer on first run
+**GPU (Paid Tier)**:
+- Recommended for large datasets or production use
+- Faster embedding generation
+- Better reranking performance
+### Storage Considerations
+**Important Notes**:
+- Spaces have limited persistent storage
+- ChromaDB data stored in `/tmp` is cleared on Space restart
+- For persistent data, consider:
+  - Using Hugging Face Datasets (for document storage)
+  - External database (e.g., PostgreSQL via API)
+  - Hugging Face Hub for model artifacts
+## Troubleshooting
+### Build Failures
+**Issue**: Build fails with "Module not found"
+- **Solution**: Check `requirements.txt` includes all dependencies
+- Verify Python version matches `python_version: "3.10"` in README
+**Issue**: spaCy model not found
+- **Solution**: The code automatically downloads models on first run
+- Check logs for download progress
+- If persistent, add `python -m spacy download en_core_web_sm` to build process (not recommended - handled in code)
+### Runtime Errors
+**Issue**: "Permission denied" errors with `/data`
+- **Solution**: Use `/tmp/chroma_data` instead (already configured)
+**Issue**: Out of memory errors
+- **Solution**: Upgrade to GPU Space or reduce `MAX_TAGS_PER_CHUNK` and `k` values
+**Issue**: Models not loading
+- **Solution**: Check internet connectivity during first run
+- Models download automatically on first use
+- Subsequent runs use cached models
+### Performance Issues
+**Issue**: Slow first load
+- **Solution**: Normal - models download on first run
+- Subsequent loads are faster (models cached)
+**Issue**: Timeout errors
+- **Solution**: Increase Space timeout in settings
+- Or reduce batch processing size
+## Updating Your Space
+```bash
+# Make changes to your code
+cd auto-tagging-rag
+# Commit changes
+git add .
+git commit -m "Update: Description of changes"
+git push
+# Hugging Face Spaces will automatically rebuild
+```
+## Public vs Private Spaces
+**Public Spaces**:
+- Accessible to everyone
+- Great for demos and sharing
+- Free hosting (with usage limits)
+**Private Spaces**:
+- Requires Hugging Face Pro subscription
+- Access restricted to authorized users
+- Better for internal use cases
+## Best Practices
+1. **Optimize Model Loading**: Models load on first use - consider lazy loading
+2. **Error Handling**: Add comprehensive error handling for network issues
+3. **User Guidance**: Add clear instructions in README for users
+4. **Resource Management**: Monitor memory usage - Spaces have limits
+5. **Session Management**: Sessions persist in `/tmp` but clear on restart
+## Example Space Configuration
+### README.md Frontmatter
+```markdown
+---
+title: Auto Tagging RAG System
+emoji: 📚
+colorFrom: indigo
+colorTo: blue
+sdk: gradio
+sdk_version: 5.49.1
+app_file: app.py
+pinned: false
+license: mit
+python_version: "3.10"
+---
+# Auto Tagging RAG System
+[Your project description]
+```
+### .gitignore
+```
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+chroma_data/
+reports/
+*.log
+.env
+.DS_Store
+*.swp
+*.swo
+```
+## Support
+- **Documentation**: See `README.md` and `SETUP_GUIDE.md`
+- **Issues**: Report on GitHub or Hugging Face Space
+- **Community**: Hugging Face Spaces discussions

MODEL_DB_CONFIG.md ADDED Viewed

	@@ -0,0 +1,451 @@

+# Model & Database Configuration Notes
+## Overview
+This document details all libraries, models, APIs, and database configurations used in the Auto Tagging RAG System.
+## Table of Contents
+1. [Embedding Models](#embedding-models)
+2. [Tag Generation Libraries](#tag-generation-libraries)
+3. [Reranking Models](#reranking-models)
+4. [Database Configuration](#database-configuration)
+5. [API Integrations](#api-integrations)
+6. [Model Download & Caching](#model-download--caching)
+## Embedding Models
+### Primary: SentenceTransformers
+**Library**: `sentence-transformers>=2.2.0`
+**Default Model**: `all-MiniLM-L6-v2`
+- **Provider**: Hugging Face
+- **Dimensions**: 384
+- **Size**: ~80MB
+- **Language**: Multilingual (optimized for English)
+- **Performance**: Fast, good quality for most use cases
+- **Download**: Automatic on first use
+- **Location**: Cached in `~/.cache/huggingface/` or `~/.cache/torch/sentence_transformers/`
+**Usage**:
+```python
+from sentence_transformers import SentenceTransformer
+model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
+embeddings = model.encode(["Your text here"])
+```
+**Configuration**:
+- Set via `ST_EMBED_MODEL` environment variable
+- Alternative models: `all-mpnet-base-v2` (768 dims), `paraphrase-multilingual-MiniLM-L12-v2` (384 dims)
+**Offline Mode**:
+- Uses `local_files_only=True` to prevent network access after initial download
+- Model is cached locally for offline operation
+### Alternative: OpenAI Embeddings
+**Library**: `openai>=1.0.0`
+**Default Model**: `text-embedding-3-small`
+- **Provider**: OpenAI API
+- **Dimensions**: 1536
+- **Cost**: Pay-per-use
+- **Rate Limits**: Varies by plan
+- **Requires**: `OPENAI_API_KEY` environment variable
+**Alternative Models**:
+- `text-embedding-3-large`: 3072 dimensions, higher quality
+- `text-embedding-ada-002`: Legacy model, 1536 dimensions
+**Usage**:
+```python
+from openai import OpenAI
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+response = client.embeddings.create(
+    model="text-embedding-3-small",
+    input=["Your text here"]
+)
+```
+**Configuration**:
+- Set via `OPENAI_EMBED_MODEL` environment variable
+- Force usage with `USE_OPENAI_EMBEDDINGS=true`
+**ChromaDB Integration**:
+- When using OpenAI embeddings, ChromaDB collections are namespaced: `documents__oai_1536`
+- When using SentenceTransformers, collections are: `documents__st_384`
+- Switching providers requires re-indexing documents
+## Tag Generation Libraries
+### YAKE (Yet Another Keyword Extractor)
+**Library**: `yake>=0.4.0`
+**Purpose**: Language-independent keyword extraction
+- **Languages**: English, Japanese, and many others
+- **Method**: Statistical approach based on word co-occurrence
+- **Advantages**: No model downloads, fast, works offline
+- **Configuration**: Controlled via `MAX_TAGS_PER_CHUNK`, `MIN_TAG_LENGTH`, `MAX_TAG_LENGTH`
+**Usage**:
+```python
+import yake
+kw_extractor = yake.KeywordExtractor(
+    lan="en",  # or "ja"
+    n=3,       # max words in phrase
+    top=10     # max tags
+)
+keywords = kw_extractor.extract_keywords(text)
+```
+### KeyBERT
+**Library**: `keybert>=0.8.0`
+**Purpose**: Keyword extraction using BERT embeddings
+- **Base Model**: Uses SentenceTransformers models (default: `all-MiniLM-L6-v2`)
+- **Languages**: Works with multilingual BERT models
+- **Method**: Extracts keywords by comparing document embeddings with candidate phrase embeddings
+- **Advantages**: Higher quality than YAKE, leverages semantic understanding
+**Usage**:
+```python
+from keybert import KeyBERT
+kw_model = KeyBERT(model='all-MiniLM-L6-v2')
+keywords = kw_model.extract_keywords(text, top_n=10)
+```
+### spaCy
+**Library**: `spacy>=3.7.0`
+**Model**: `en_core_web_sm` (English)
+- **Size**: ~13MB
+- **Download**: `python -m spacy download en_core_web_sm`
+- **Location**: Cached in spaCy data directory
+**Purpose**: Noun phrase extraction for English
+- **Method**: POS tagging + noun phrase chunking
+- **Advantages**: High precision, grammar-aware
+- **Limitation**: English only
+**Usage**:
+```python
+import spacy
+nlp = spacy.load("en_core_web_sm")
+doc = nlp(text)
+noun_phrases = [chunk.text for chunk in doc.noun_chunks]
+```
+### Janome
+**Library**: `janome>=0.5.0`
+**Purpose**: Japanese morphological analysis
+- **Languages**: Japanese only
+- **Method**: MeCab-based tokenization and POS tagging
+- **Advantages**: No external dependencies, pure Python
+- **Download**: No model download required (built-in dictionaries)
+**Usage**:
+```python
+from janome.tokenizer import Tokenizer
+t = Tokenizer()
+tokens = t.tokenize(text)
+nouns = [token.surface for token in tokens if token.part_of_speech.startswith('名詞')]
+```
+### OpenAI Tag Generation
+**Library**: `openai>=1.0.0`
+**Model**: Configurable (default: `gpt-4o-mini`)
+- **Purpose**: AI-generated tags using language model
+- **Method**: Prompt-based generation
+- **Advantages**: Highest quality, contextual understanding
+- **Cost**: Pay-per-use
+- **Requires**: `OPENAI_API_KEY`
+**Usage**:
+```python
+from openai import OpenAI
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+response = client.chat.completions.create(
+    model="gpt-4o-mini",
+    messages=[{"role": "user", "content": f"Extract key tags from: {text}"}]
+)
+```
+**Configuration**:
+- Set model via `OPENAI_MODEL` environment variable
+- Options: `gpt-4o-mini`, `gpt-4`, `gpt-3.5-turbo`
+## Reranking Models
+### Cross-Encoder Reranker
+**Library**: `sentence-transformers` (cross-encoder models)
+**Default Model**: `cross-encoder/ms-marco-MiniLM-L-6-v2`
+- **Provider**: Hugging Face
+- **Size**: ~40MB
+- **Purpose**: Re-rank retrieved documents based on query-document relevance
+- **Method**: Cross-attention between query and document
+- **Download**: Automatic on first use
+- **Performance**: Most accurate reranking method
+**Usage**:
+```python
+from sentence_transformers import CrossEncoder
+model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
+scores = model.predict([(query, doc) for doc in documents])
+```
+**Configuration**:
+- Set model via `RERANKER_MODEL` environment variable
+- Alternative models: `cross-encoder/ms-marco-electra-base`, `cross-encoder/quora-distilroberta-base`
+### Semantic Reranking
+**Method**: Uses embedding similarity (vector store)
+- **Library**: `sentence-transformers` or `openai`
+- **Purpose**: Re-rank based on semantic similarity
+- **Advantages**: No additional model download
+- **Performance**: Good but less accurate than cross-encoder
+### Heuristic Reranking
+**Method**: Rule-based scoring
+- **Factors**: Tag overlap, keyword frequency, document length
+- **Advantages**: Fast, no model required
+- **Performance**: Baseline method
+**Configuration**:
+- Set strategy via `RERANKER_STRATEGY` environment variable
+- Options: `cross_encoder`, `semantic`, `heuristic`, `openai`
+## Database Configuration
+### ChromaDB
+**Library**: `chromadb>=0.4.0`
+**Type**: Vector database with persistent storage
+- **Storage Format**: SQLite (metadata) + binary files (vectors)
+- **Location**: Configurable via `CHROMA_PERSIST_DIR` (default: `./chroma_data`)
+- **Collection Naming**: Namespaced by embedding provider/dimension
+  - SentenceTransformers: `session_xxxxx__st_384`
+  - OpenAI: `session_xxxxx__oai_1536`
+**Configuration**:
+```python
+import chromadb
+client = chromadb.PersistentClient(path="./chroma_data")
+collection = client.create_collection(
+    name="documents",
+    metadata={"hnsw:space": "cosine"}
+)
+```
+**Metadata Schema**:
+- `source_name`: Original filename
+- `doc_id`: Unique document identifier
+- `chunk_index`: Chunk index within document
+- `tags`: Comma-separated tag string
+- `lang`: Language code (en/ja)
+- `user_tags`: User-provided tags (comma-separated)
+**Storage Structure**:
+```
+chroma_data/
+├── chroma.sqlite3          # Metadata database
+├── session_xxxxx__st_384/  # Vector data for each collection
+│   ├── data_level0.bin
+│   ├── header.bin
+│   ├── length.bin
+│   └── link_lists.bin
+```
+**Session Isolation**:
+- Each user session has separate ChromaDB collection
+- Collections prefixed with `session_` + 8-char session ID
+- Sessions persist across server restarts (collections stored on disk)
+**Cleanup**:
+- Sessions expire after `SESSION_TIMEOUT` seconds (default: 3600)
+- Expired sessions are cleaned up automatically
+- Manual cleanup: Delete `chroma_data/` directory
+## API Integrations
+### OpenAI API
+**Endpoint**: `https://api.openai.com/v1/`
+**Endpoints Used**:
+1. **Embeddings**: `POST /embeddings`
+   - Models: `text-embedding-3-small`, `text-embedding-3-large`
+   - Rate limits: Varies by plan
+2. **Chat Completions**: `POST /chat/completions`
+   - Models: `gpt-4o-mini`, `gpt-4`
+   - Used for: Tag generation, metadata detection
+**Authentication**:
+- API Key in `OPENAI_API_KEY` environment variable
+- Format: `sk-...`
+**Rate Limits**:
+- Free tier: Limited requests/minute
+- Paid tier: Higher limits
+- Handling: Retries with exponential backoff
+**Error Handling**:
+- Network errors: Graceful fallback to SentenceTransformers
+- Rate limits: Retry with backoff
+- Invalid key: Log warning, continue with fallback
+### Hugging Face Hub (Automatic)
+**Purpose**: Model downloads via `sentence-transformers`
+**Models Downloaded**:
+- Embeddings: `all-MiniLM-L6-v2`
+- Reranker: `cross-encoder/ms-marco-MiniLM-L-6-v2`
+**Caching**:
+- Location: `~/.cache/huggingface/` or `~/.cache/torch/sentence_transformers/`
+- First download: Requires internet
+- Subsequent runs: Uses cached models (offline)
+**Offline Mode**:
+- Set `local_files_only=True` in code
+- Prevents network access after initial download
+- Falls back gracefully if model not cached
+## Model Download & Caching
+### Automatic Downloads
+The following models are downloaded automatically on first use:
+1. **SentenceTransformers Embeddings** (~80MB)
+   - Triggered: First vector store initialization
+   - Location: `~/.cache/torch/sentence_transformers/all-MiniLM-L6-v2/`
+2. **spaCy English Model** (~13MB)
+   - Triggered: Manual download required (`python -m spacy download en_core_web_sm`)
+   - Location: spaCy data directory
+3. **Cross-Encoder Reranker** (~40MB)
+   - Triggered: First reranking operation
+   - Location: `~/.cache/torch/sentence_transformers/cross-encoder/ms-marco-MiniLM-L-6-v2/`
+### Manual Pre-download
+To download all models before first use:
+```bash
+# 1. spaCy English model
+python -m spacy download en_core_web_sm
+# 2. SentenceTransformers embeddings (via Python)
+python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
+# 3. Cross-encoder reranker (via Python)
+python3 -c "from sentence_transformers import CrossEncoder; CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')"
+```
+### Cache Management
+**View Cached Models**:
+```bash
+# SentenceTransformers
+ls ~/.cache/torch/sentence_transformers/
+# spaCy
+python -m spacy info en_core_web_sm
+```
+**Clear Cache** (if needed):
+```bash
+# SentenceTransformers
+rm -rf ~/.cache/torch/sentence_transformers/
+# spaCy (re-download)
+python -m spacy download en_core_web_sm
+```
+### Disk Space Requirements
+- **Minimum**: ~200MB for all models
+- **Recommended**: ~500MB for cache and data
+- **With Data**: Additional space for ChromaDB and reports
+## Offline Operation
+### Fully Offline After Setup
+After initial model downloads, the system works **completely offline**:
+1. ✅ All models cached locally
+2. ✅ No API calls (unless OpenAI explicitly enabled)
+3. ✅ No network access required
+4. ✅ Tiktoken fallback for token counting
+### Components Requiring Internet
+- **Initial Setup**: Model downloads (one-time)
+- **OpenAI Features**: If `OPENAI_API_KEY` is set (optional)
+- **Gradio Telemetry**: Anonymous usage stats (can be disabled)
+### Disable Network Access
+Set in code:
+- `local_files_only=True` for SentenceTransformers
+- Remove `OPENAI_API_KEY` from environment
+- Tiktoken falls back automatically if network unavailable
+## Version Compatibility
+### Python Version
+- **Minimum**: Python 3.8
+- **Recommended**: Python 3.10+
+- **Tested**: Python 3.8, 3.9, 3.10, 3.11
+### Library Versions
+See `requirements.txt` for exact versions. Key libraries:
+- `gradio>=5.49.1`
+- `chromadb>=0.4.0`
+- `sentence-transformers>=2.2.0`
+- `spacy>=3.7.0`
+### Model Compatibility
+- All models tested with current library versions
+- Breaking changes in model formats are handled automatically
+- Updates: Check Hugging Face for model updates
+## Performance Notes
+### Model Loading Times
+- SentenceTransformers: ~2-5 seconds (first load)
+- spaCy: ~1-2 seconds (first load)
+- Cross-encoder: ~2-5 seconds (first load)
+- Subsequent loads: Cached, faster
+### Memory Usage
+- Base: ~500MB (Python + libraries)
+- SentenceTransformers: ~200MB
+- spaCy: ~100MB
+- Cross-encoder: ~150MB
+- ChromaDB: Varies with document count
+- **Total**: ~1-2GB typical usage
+### Optimization Tips
+- Use smaller embedding models for lower memory
+- Disable reranking if not needed
+- Use CPU-only mode (default) - GPU optional
+- Clear ChromaDB cache if disk space limited

README.md CHANGED Viewed

@@ -1,12 +1,613 @@
 ---
-title: Auto Tagging Rag
-emoji: 😻
 colorFrom: indigo
-colorTo: green
 sdk: gradio
 sdk_version: 5.49.1
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Auto Tagging RAG System
+emoji: 📚
 colorFrom: indigo
+colorTo: blue
 sdk: gradio
 sdk_version: 5.49.1
 app_file: app.py
 pinned: false
+license: mit
+python_version: "3.10"
 ---
+# Auto Tagging RAG System
+A comprehensive system for evaluating multiple **RAG (Retrieval-Augmented Generation)** pipelines with support for flat tags, hybrid retrieval, reranking, and extensive evaluation metrics.
+## Features
+### Retrieval Pipelines
+- **Base RAG**: Standard vector similarity search
+- **Tag Filter RAG**: Filter documents using flat tags with AND/OR/NOT operators
+- **Hybrid RAG**: Combine vector search and tag-based filtering with weighted scoring
+- **Hybrid Rerank RAG**: Apply reranking after hybrid retrieval for refined results
+### Tag System
+- **Flat Tags**: Automatic non-hierarchical tag generation using:
+  - YAKE (Yet Another Keyword Extractor)
+  - KeyBERT (keyword extraction with BERT)
+  - spaCy (noun phrase extraction for English)
+  - Janome (Japanese tokenization and noun extraction)
+  - OpenAI-based generation (optional)
+- **Manual Tag Input**: Users can add custom tags during document upload
+  - Manual tags are combined with auto-generated tags
+  - Manual tags take priority (prepended to tag list)
+- **Multi-language Support**: English and Japanese tag generation
+### Evaluation & Analysis
+- **Extended Metrics**: Precision@k, nDCG@k, MRR, Hit@k, Semantic Similarity, Latency (mean, p50, p90), User Satisfaction
+- **Comparison Framework**: Side-by-side comparison of all retrieval methods
+- **Visualization**: Bar charts, line plots, scatter plots, box plots, stacked bar charts, Pareto charts
+- **Report Generation**: Comprehensive HTML, CSV, and JSON reports with embedded visualizations and representative examples
+### Additional Features
+- **Session Management**:
+  - Browser-based session persistence using localStorage
+  - Session ID automatically saved and restored on page refresh
+  - Multi-user support with isolated data and retrieval contexts
+  - Document count display (shows unique documents, not chunks)
+- **Gradio UI**: User-friendly interface for all operations
+- **MCP Server**: Model Context Protocol server for programmatic access
+- **API Export**: All main functions exposed via Gradio Client API
+## Repository Layout
+```
+.
+├── app.py                    # Gradio UI entry point; defines interface and exposed functions
+├── core/                     # Core logic modules
+│   ├── ingest.py             # Document loaders, chunking, flat tag generation
+│   ├── index.py              # Embeddings, vector DB (ChromaDB), metadata/tag filtering
+│   ├── retrieval.py          # All RAG pipelines (Base, TagFilter, Hybrid, HybridRerank)
+│   ├── eval.py               # Evaluation metrics and batch evaluation
+│   ├── tag_generator.py      # Flat tag generation (YAKE, KeyBERT, spaCy, Janome, OpenAI)
+│   ├── reranker.py           # Reranking strategies (cross-encoder, semantic, heuristic)
+│   ├── comparison.py         # RAG method comparison framework
+│   ├── visualization.py      # Chart generation (bar, line, scatter, box, stacked, Pareto)
+│   ├── report_generator.py   # Comprehensive report generation (HTML, CSV, JSON)
+│   ├── session_manager.py    # User session management and isolation
+│   ├── session_rag.py        # Session-aware RAG manager
+│   └── utils.py              # Shared helpers (e.g., PII masking, ID generation)
+├── tests/                    # pytest test suite (E2E tests)
+│   ├── conftest.py           # Pytest fixtures
+│   ├── test_accuracy.py      # Accuracy tests (tag generation, retrieval, metrics)
+│   ├── test_mcp_server.py    # MCP server tests
+│   ├── test_ux.py            # User experience tests
+│   ├── test_robustness.py    # Edge cases and error handling
+│   ├── test_user_scenarios.py # Non-technical user scenarios
+│   └── test_japanese_support.py # Japanese language support tests
+├── reports/                  # Evaluation results (CSV/JSON/PNG/HTML)
+├── requirements.txt          # Python dependencies
+└── README.md                 # This file
+```
+## Setup
+### Prerequisites
+- Python 3.8+
+- pip or conda
+### Installation
+1. Clone the repository:
+```bash
+git clone <repository-url>
+cd auto-tagging-rag
+```
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+**Note:** Models (SentenceTransformers, spaCy) are downloaded automatically on first run. No manual download required.
+3. Create necessary directories:
+```bash
+mkdir -p reports chroma_data
+```
+### Environment Variables
+**Security-Important Variables:**
+Create a `.env` file (copy from `.env.example`) for sensitive configuration:
+```bash
+cp .env.example .env
+# Edit .env and add your API key
+```
+- `OPENAI_API_KEY` (required only if using OpenAI features): Your OpenAI API key for embeddings, tag generation, and metadata detection
+  - Get your API key from: [https://platform.openai.com/api-keys](https://platform.openai.com/api-keys)
+  - **Never commit `.env` to version control** (already in `.gitignore`)
+**Note:** All other configuration variables have sensible defaults and can be overridden if needed. See `SETUP_GUIDE.md` for full configuration options.
+## Deployment
+### Hugging Face Spaces
+Deploy this app to Hugging Face Spaces for free hosting:
+1. **Create a Space**: Go to [Hugging Face Spaces](https://huggingface.co/spaces) and create a new Gradio Space
+2. **Push Your Code**: Clone your Space repository and push your code
+3. **Configure**: Set environment variables in Space Settings (optional)
+4. **Deploy**: Spaces automatically builds and deploys your app
+**See [HUGGINGFACE_DEPLOYMENT.md](HUGGINGFACE_DEPLOYMENT.md) for complete deployment guide.**
+**Key Points for Spaces**:
+- Models download automatically on first run (no manual download needed)
+- Use `/tmp/chroma_data` for persistence (clears on restart)
+- Environment variables configured in Space Settings
+- Free CPU tier available, GPU tier for better performance
+### Local Deployment
+See [SETUP_GUIDE.md](SETUP_GUIDE.md) for local deployment instructions.
+## Usage
+### Using the Gradio API (gradio_client)
+All main functions are exposed via the Gradio API with `api_name`:
+- `build_rag`: Build RAG index from uploaded files
+- `search`: Search documents using both pipelines
+- `chat`: Chat interface with RAG system
+- `evaluate`: Run quantitative evaluation
+Example usage:
+```python
+from gradio_client import Client
+client = Client("http://your-server:7860/")
+# Build RAG index
+result = client.predict(
+    files=["doc1.pdf", "doc2.pdf"],
+    language="en",
+    user_tags="hospital-protocol, urgent",  # Optional manual tags
+    api_name="/build_rag"
+)
+# Search documents
+results = client.predict(
+    query="What are emergency procedures?",
+    k=5,
+    tags="emergency, procedures",
+    tag_operator="OR",
+    api_name="/search_all"
+)
+```
+### MCP Server
+The system can run as an MCP (Model Context Protocol) server for programmatic access:
+```bash
+python app.py --mcp
+```
+#### Connecting to MCP Server
+Add to your MCP client configuration (e.g., for Claude Desktop):
+```json
+{
+  "mcpServers": {
+    "auto-tagging-rag": {
+      "command": "python",
+      "args": ["/path/to/app.py", "--mcp"],
+      "env": {}
+    }
+  }
+}
+```
+#### Available MCP Tools
+1. **search_documents**: Search documents using RAG system
+   - Parameters: `query`, `k`, `pipeline`, `tags`, `tag_operator`
+2. **evaluate_retrieval**: Evaluate RAG performance with batch queries
+   - Parameters: `queries` (array), `output_file`
+## UI Tabs
+### 📁 Tab 1: Upload & Tagging
+**Purpose**: Document processing and tag generation
+**Components:**
+- File upload area (PDF/TXT with drag & drop)
+- **Language Selection**: Auto-detect, English (en), or Japanese (ja)
+- **Manual Tag Input**: Add custom tags (comma-separated) that will be combined with auto-generated tags
+  - Example: `hospital-protocol, urgent, confidential`
+  - Manual tags are prepended to auto-generated tags
+- Progress bar for processing status
+- Processing Summary: Files processed, chunks created, tags generated, user tags count
+- Indexed Chunks (preview): Shows chunks with tags and preview text
+- Tag visualization display (cloud or list format)
+- Reset Index button to clear all data
+**After Build:**
+- Build Status (processed count, indexed chunks, tags generated)
+- File Summary (Filename, Chunks, Language, Tags, User Tags)
+- Indexed Chunks (preview with metadata and first 160 chars)
+- Tag Visualization (Top 20 tags with frequency)
+### 🔍 Tab 2: Search & Compare
+**Purpose**: Test different retrieval methods side-by-side
+**Components:**
+- Search query input box
+- **Method Selection**: Compare all 4 retrieval methods simultaneously:
+  - Base RAG (vector similarity)
+  - Tag Filter RAG (tag-based filtering)
+  - Hybrid RAG (weighted combination)
+  - Hybrid Rerank RAG (with reranking)
+- **Tags**: Comma-separated tags for tag-based filtering (optional)
+- **Tag Operator**: AND/OR/NOT for tag filtering logic
+- **Vector/Tag Weight Sliders**: Adjust hybrid retrieval weights (default 0.7/0.3)
+- **k**: Number of results to retrieve (slider: 1-20)
+- **Side-by-side Results Display**:
+  - Answers from each method
+  - Retrieved documents with scores
+  - Tags used in retrieval
+  - Tags found in results
+  - Response times
+  - Quick comparison metrics
+### 💬 Tab 3: Chat Interface
+**Purpose**: Natural conversation with tag-enhanced RAG
+**Components:**
+- Chat message input
+- **Pipeline Selection**: Choose one retrieval method:
+  - Base RAG
+  - Tag Filter RAG
+  - Hybrid RAG
+  - Hybrid Rerank RAG
+- **Tag Filter Toggle**: Enable/disable tag usage
+- **Tag Input**: Specify tags for tag-based pipelines (comma-separated)
+- Conversation history display
+- Visible source documents with tags
+- Adjust retrieval parameters (k)
+### 📊 Tab 4: Analytics & Evaluation
+**Purpose**: Performance visualization and metrics
+**Components:**
+- Input queries in JSON format with ground truth
+- **User Satisfaction Scores**: Optional satisfaction ratings per query
+- **Metrics Display**:
+  - Precision@k, nDCG@k, MRR, Hit@k
+  - Semantic Similarity
+  - Latency (mean, p50, p90)
+  - User Satisfaction (if provided)
+- **Visualization Tabs**:
+  - Bar Charts: Method comparison metrics
+  - Line Plots: Metric trends over k values
+  - Scatter Plots: Correlation analysis
+  - Box Plots: Distribution analysis
+  - Stacked Bar Charts: Method breakdown
+  - Pareto Charts: Performance ranking
+- **Summary Statistics**: Aggregated metrics across all queries
+- **Export Buttons**: CSV/PNG export for results and charts
+- **Report Generation**: Comprehensive HTML reports with embedded visualizations
+### ⚙️ Tab 5: Settings & Management
+**Purpose**: System configuration and user management
+**Components:**
+- **Tag Generation Parameters**:
+  - Max Tags Per Chunk (5-50)
+  - Min Tag Length (1-5 words)
+  - Max Tag Length (1-5 words)
+  - Tag Generation Method (auto, yake, keybert, spacy, janome, openai)
+- **Hybrid Search Weights Configuration**:
+  - Default Vector Weight (0.0-1.0)
+  - Default Tag Weight (0.0-1.0)
+- **Database Management**:
+  - Clear All Data
+  - Export Database
+  - Import Database
+- **API Key Configuration**: OpenAI API key management
+- **Embedding Configuration**: Select embedding provider (SentenceTransformers/OpenAI) and model
+### 🎨 Global UI Elements
+- **Session Indicator**: Shows current session ID (persisted in browser localStorage)
+- **Document Count**: Shows total number of unique documents indexed (updates automatically)
+- **Processing Status**: System status indicator
+- **Session Persistence**: Session automatically restored from localStorage on page refresh
+## Evaluation
+### Quantitative Evaluation
+The system evaluates all 4 retrieval pipelines (Base-RAG, Tag Filter RAG, Hybrid RAG, Hybrid Rerank RAG) on multiple metrics:
+**Metrics:**
+- **Precision@k**: Fraction of retrieved documents that are relevant
+- **nDCG@k**: Normalized Discounted Cumulative Gain at k
+- **MRR**: Mean Reciprocal Rank
+- **Hit@k**: Whether at least one relevant document is in top-k
+- **Semantic Similarity**: Average cosine similarity between query and retrieved documents
+- **Latency**: Response time (mean, p50, p90 percentiles)
+- **User Satisfaction**: Average satisfaction score (if provided)
+### Evaluation Input Format
+```json
+[
+  {
+    "query": "What are emergency procedures?",
+    "ground_truth": ["chunk_id_1", "chunk_id_2", "chunk_id_3"],
+    "k_values": [1, 3, 5],
+    "tags": ["emergency", "procedure", "triage"],
+    "user_satisfaction": 4.5
+  }
+]
+```
+**Note:** `ground_truth` should be a list of chunk IDs that are relevant to the query. The system will automatically retrieve chunk IDs from the indexed documents.
+### Evaluation Results
+Results are saved to `reports/` directory:
+- **CSV**: Detailed metrics per query and pipeline
+- **JSON Summary**: Aggregated statistics and metadata
+- **Aggregated Stats CSV**: Summary by pipeline and k value
+- **Examples JSON**: Representative examples (best/worst performing queries)
+- **Visualizations**: PNG charts for all visualization types
+- **HTML Report**: Comprehensive report with embedded visualizations
+## Metadata Schema
+Chunks are tagged with the following metadata:
+```json
+{
+  "doc_id": "uuid",
+  "chunk_id": "uuid",
+  "source_name": "filename.pdf",
+  "lang": "ja|en",
+  "tags": ["user-tag1", "user-tag2", "auto-tag1", "auto-tag2"],
+  "chunk_index": 0,
+  "chunk_size": 1000
+}
+```
+**Tag Storage:**
+- Tags are stored as comma-separated strings in ChromaDB metadata
+- Converted back to lists when retrieved
+- Manual tags (user-provided) are prepended to auto-generated tags
+- Tags are normalized (lowercase, deduplicated)
+**Document Counting:**
+- System counts unique documents (by `doc_id`), not chunks
+- Each document can have multiple chunks
+- Document count displayed in UI header
+## Documentation
+### Setup & Configuration
+- **[SETUP_GUIDE.md](SETUP_GUIDE.md)**: Complete setup instructions, environment variables, deployment options, and troubleshooting
+- **[MODEL_DB_CONFIG.md](MODEL_DB_CONFIG.md)**: Detailed documentation of all models, libraries, APIs, and database configuration
+- **[HUGGINGFACE_DEPLOYMENT.md](HUGGINGFACE_DEPLOYMENT.md)**: Guide for deploying to Hugging Face Spaces
+### Testing
+- **[E2E_TEST_DESIGN.md](E2E_TEST_DESIGN.md)**: Comprehensive E2E test cases covering accuracy, UX, robustness, and non-technical user scenarios (46 test cases)
+### Usage Guides
+- **[EVALUATION_GUIDE.md](EVALUATION_GUIDE.md)**: How to run evaluation with sample queries
+## Testing
+See **[E2E_TEST_DESIGN.md](E2E_TEST_DESIGN.md)** for comprehensive test design document with 46 test cases covering:
+**Test Categories**:
+- **Accuracy Tests** (13 cases): Tag generation, retrieval accuracy, evaluation metrics
+- **User Experience Tests** (12 cases): Non-technical user scenarios, advanced usage, UI/UX quality
+- **Robustness Tests** (13 cases): Error handling, edge cases, data integrity
+- **Performance Tests** (5 cases): Response time, scalability
+- **Integration Tests** (3 cases): API integration, end-to-end workflows
+**Test Execution**:
+- Manual tests: Follow step-by-step instructions in test design document
+- Automated tests: Run pytest (if implemented)
+```bash
+pytest tests/ -v
+```
+## Architecture
+### Retrieval Pipelines
+**1. Base-RAG:**
+- Vector similarity search using embeddings
+- Return top-k results ranked by cosine similarity
+- No filtering applied
+**2. Tag Filter RAG:**
+- Filter documents using flat tags with boolean operators (AND/OR/NOT)
+- Vector search within filtered subset
+- Return top-k results with tag context
+**3. Hybrid RAG:**
+- Perform both vector search and tag-based filtering
+- Combine scores with configurable weights (default: 0.7 vector, 0.3 tag)
+- Normalize and merge results
+- Return top-k results ranked by hybrid score
+**4. Hybrid Rerank RAG:**
+- Perform hybrid retrieval (steps 1-3 above)
+- Apply reranking using cross-encoder or semantic similarity
+- Return top-k reranked results
+### Tag-Based System Architecture
+**Tag Generation:**
+1. **Language Detection**: Auto-detect document language (English/Japanese)
+2. **Manual Tag Input**: User can add custom tags during upload (optional)
+3. **Method Selection**: Choose tag generation method based on availability and language
+4. **Tag Extraction**: Extract tags using selected method (YAKE, KeyBERT, spaCy, Janome, or OpenAI)
+5. **Tag Merging**: Combine manual tags with auto-generated tags
+   - Manual tags are prepended (higher priority)
+   - Deduplicate identical tags
+6. **Tag Filtering**: Remove stopwords, normalize, deduplicate
+7. **Storage**: Store tags in document metadata (comma-separated string in ChromaDB)
+**Tag Filtering:**
+- **OR Operator**: Documents matching any tag (union)
+- **AND Operator**: Documents matching all tags (intersection)
+- **NOT Operator**: Exclude documents matching specified tags
+- **Post-filtering**: For AND/NOT operators, filter results after initial retrieval
+### Reranking Strategies
+1. **Cross-Encoder**: Use transformer-based cross-encoder models (e.g., `cross-encoder/ms-marco-MiniLM-L-6-v2`)
+2. **OpenAI**: Use OpenAI API for semantic reranking (if available)
+3. **Semantic Similarity**: Re-rank by query-document semantic similarity
+4. **Heuristic**: Simple score-based reranking
+## Vector Database & Embeddings
+- **ChromaDB** with persistence
+- **Embeddings Provider**:
+  - OpenAI (if `OPENAI_API_KEY` present): `OPENAI_EMBED_MODEL` (default `text-embedding-3-small`)
+  - SentenceTransformers fallback: `ST_EMBED_MODEL` (default `all-MiniLM-L6-v2`)
+- **Collections**: Namespaced by provider/dimension and session to avoid mismatch
+  - Format: `session_{session_id[:8]}` for user sessions
+  - Each session has isolated ChromaDB collections
+- **Tag Filtering**: Supported for flat tags with OR/AND/NOT operators
+- **Document Counting**: Counts unique documents (by `doc_id`), not chunks
+- **Session Isolation**: Each user session has isolated ChromaDB collections
+## Session Management
+The system supports multi-user deployments with browser-based session persistence:
+- **Session Creation**: Automatic session creation on first page load
+- **Session Persistence**:
+  - Session ID automatically saved to browser localStorage
+  - Session restored automatically on page refresh
+  - Data persists across browser sessions and refreshes
+- **Session Isolation**: Each session has its own ChromaDB collection
+  - Collection name format: `session_{session_id[:8]}`
+  - All documents, tags, and metadata are isolated per session
+- **Session Display**:
+  - Session ID shown in UI header (first 8 characters)
+  - Document count displayed and updated automatically
+- **Session Timeout**: Configurable timeout (default 3600 seconds)
+- **Session Cleanup**: Automatic cleanup of expired sessions
+- **Session-Aware RAG**: All RAG operations are scoped to the user's session
+- **New Session Creation**: If session ID exists in localStorage but not on server, creates new session and updates localStorage
+## Deployment
+### Local Deployment
+```bash
+python app.py
+```
+The app will be available at `http://localhost:7860`
+### Hugging Face Spaces
+The app is configured for Hugging Face Spaces deployment. The `README.md` frontmatter includes Spaces-specific configuration.
+### Docker Deployment
+```dockerfile
+FROM python:3.9-slim
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+COPY . .
+CMD ["python", "app.py"]
+```
+### Environment Configuration
+For production deployments, ensure:
+- ChromaDB persistence directory is writable
+- OpenAI API key is set (if using OpenAI features)
+- Required model files are downloaded (spaCy, SentenceTransformers)
+- Sufficient disk space for vector database and reports
+## Dependencies
+All dependencies are included in `requirements.txt`. Key dependencies include:
+### Core Dependencies
+- `gradio>=5.49.1`: Web UI framework
+- `chromadb>=0.4.0`: Vector database
+- `sentence-transformers>=2.2.0`: Embeddings
+- `pypdf>=3.0.0`: PDF processing
+- `pandas>=1.5.0`: Data processing
+- `numpy>=1.21.0`: Numerical operations
+- `scikit-learn>=1.2.0`: Machine learning utilities
+### Tag Generation (English and Japanese)
+- `yake>=0.4.0`: Keyword extraction
+- `keybert>=0.8.0`: BERT-based keyword extraction
+- `spacy>=3.7.0`: NLP processing (English) - model `en_core_web_sm` required
+- `janome>=0.5.0`: Japanese tokenization
+### Visualization and Reports
+- `matplotlib>=3.5.0`: Chart generation
+- `jinja2>=3.1.0`: HTML template rendering
+### Optional
+- `openai>=1.0.0`: OpenAI API client (for OpenAI embeddings, tag generation, and reranking)
+See `requirements.txt` for the complete list.
+## Troubleshooting
+### Tag Generation Not Working
+- Ensure all dependencies are installed: `pip install -r requirements.txt`
+- For English: Download spaCy model: `python -m spacy download en_core_web_sm`
+- For Japanese: Ensure `janome` is installed (included in requirements.txt)
+- Verify documents are correctly detected as the appropriate language
+### Visualization Errors
+- Ensure all dependencies are installed: `pip install -r requirements.txt` (includes `matplotlib`)
+- Check that `reports/` directory is writable
+### Reranking Errors
+- Cross-encoder models are downloaded automatically on first use
+- For OpenAI reranking, ensure `OPENAI_API_KEY` is set
+- Check internet connection for model downloads
+### Session Issues
+- Session ID is stored in browser localStorage with key `rag_session_id`
+- Clear browser localStorage to reset session
+- Ensure ChromaDB persistence directory is writable
+- Check session timeout settings
+- Verify session cleanup is running
+- If session ID exists in localStorage but data is missing, a new session will be created automatically
+### Japanese Support Issues
+- Install `janome`: `pip install janome`
+- Ensure documents are correctly detected as Japanese
+- Check that Japanese tag generation method is selected
+## Acknowledgments
+Built for comprehensive RAG evaluation and comparison, supporting multiple retrieval strategies, tag-based filtering, hybrid approaches, and extensive evaluation metrics.

SETUP_GUIDE.md ADDED Viewed

	@@ -0,0 +1,512 @@

+# Setup Guide - Auto Tagging RAG System
+## Overview
+This guide provides detailed setup instructions for the Auto Tagging RAG System, including environment configuration, dependency installation, deployment options, and troubleshooting.
+## Table of Contents
+1. [Prerequisites](#prerequisites)
+2. [Installation](#installation)
+3. [Environment Configuration](#environment-configuration)
+4. [Deployment Options](#deployment-options)
+5. [Initial Setup](#initial-setup)
+6. [Verification](#verification)
+7. [Troubleshooting](#troubleshooting)
+## Prerequisites
+### System Requirements
+- **Python**: 3.8 or higher
+- **Operating System**: Linux, macOS, or Windows
+- **Memory**: Minimum 4GB RAM (8GB+ recommended)
+- **Storage**: At least 2GB free space for models and data
+- **Network**: Internet connection for initial model downloads (offline mode supported after setup)
+### Software Dependencies
+- Python package manager: `pip` or `conda`
+- Git (for cloning repository)
+## Installation
+### Step 1: Clone Repository
+```bash
+git clone <repository-url>
+cd auto_tagging_rag
+```
+### Step 2: Create Virtual Environment (Recommended)
+**Using venv:**
+```bash
+python3 -m venv venv
+source venv/bin/activate  # On Linux/macOS
+# or
+venv\Scripts\activate  # On Windows
+```
+**Using conda:**
+```bash
+conda create -n auto_tagging_rag python=3.10
+conda activate auto_tagging_rag
+```
+### Step 3: Install Dependencies
+```bash
+pip install -r requirements.txt
+```
+**Important**: Wait for all packages to install completely before proceeding to Step 4.
+### Step 4: Create Required Directories
+```bash
+mkdir -p reports chroma_data
+```
+These directories will store:
+- `reports/`: Evaluation results, visualizations, and generated reports
+- `chroma_data/`: Vector database persistent storage
+### Step 5: Model Downloads (Automatic)
+The first time you run the application, it will automatically download:
+- **SentenceTransformers**: `all-MiniLM-L6-v2` embedding model (~80MB)
+- **spaCy**: `en_core_web_sm` model (if using spaCy tag generation)
+**Important**:
+- Models download automatically on first use
+- No manual download required
+- Ensure internet connection for initial downloads
+- Models are cached locally for offline operation
+## Environment Configuration
+### Creating `.env` File
+Create a `.env` file in the project root:
+```bash
+touch .env  # Linux/macOS
+# or create .env file manually on Windows
+```
+### Environment Variables
+Copy the template below and configure as needed:
+```bash
+# ============================================
+# OpenAI Configuration (Optional)
+# ============================================
+# Required only if using OpenAI embeddings/tag generation
+OPENAI_API_KEY=your_openai_api_key_here
+# OpenAI Model Selection
+OPENAI_MODEL=gpt-4o-mini              # Chat model for tag generation
+OPENAI_EMBED_MODEL=text-embedding-3-small  # Embedding model (1536 dimensions)
+# Force OpenAI embeddings (true/false)
+USE_OPENAI_EMBEDDINGS=false           # Set to true to force OpenAI embeddings
+# ============================================
+# Embedding Configuration
+# ============================================
+# SentenceTransformers fallback model
+ST_EMBED_MODEL=all-MiniLM-L6-v2       # Used when OpenAI is not available
+# ============================================
+# Reranking Configuration
+# ============================================
+RERANKER_MODEL=cross-encoder/ms-marco-MiniLM-L-6-v2
+RERANKER_STRATEGY=cross_encoder       # Options: cross_encoder, openai, semantic, heuristic
+# ============================================
+# Tag Generation Configuration
+# ============================================
+TAG_GENERATION_METHOD=auto            # Options: yake, keybert, spacy, janome, openai, auto
+MAX_TAGS_PER_CHUNK=10                 # Maximum tags per document chunk
+MIN_TAG_LENGTH=2                      # Minimum tag length (words)
+MAX_TAG_LENGTH=5                      # Maximum tag length (words)
+# ============================================
+# Database & Storage Configuration
+# ============================================
+CHROMA_PERSIST_DIR=./chroma_data      # ChromaDB persistence directory
+SESSION_TIMEOUT=3600                  # Session timeout in seconds (default: 1 hour)
+# ============================================
+# UI & Behavior Configuration
+# ============================================
+DEFAULT_SEARCH_K=5                    # Default number of search results
+GRADIO_SERVER_PORT=7860               # Gradio server port
+GRADIO_SERVER_NAME=0.0.0.0            # Server host (0.0.0.0 for all interfaces)
+LOG_LEVEL=INFO                        # Logging level: DEBUG, INFO, WARNING, ERROR
+```
+### Environment Variable Descriptions
+#### OpenAI Configuration
+- **OPENAI_API_KEY**: Your OpenAI API key (get from https://platform.openai.com/api-keys)
+  - Required only if using OpenAI embeddings or tag generation
+  - Can be omitted for fully offline operation
+- **OPENAI_MODEL**: Chat model for tag generation (default: `gpt-4o-mini`)
+- **OPENAI_EMBED_MODEL**: Embedding model (default: `text-embedding-3-small`, 1536 dimensions)
+- **USE_OPENAI_EMBEDDINGS**: Force OpenAI embeddings even if API key exists (default: `false`)
+#### Embedding Configuration
+- **ST_EMBED_MODEL**: SentenceTransformers model (default: `all-MiniLM-L6-v2`)
+  - Automatically downloaded on first run
+  - Used when OpenAI is not available or disabled
+#### Reranking Configuration
+- **RERANKER_MODEL**: Cross-encoder model for reranking (default: `cross-encoder/ms-marco-MiniLM-L-6-v2`)
+  - Automatically downloaded on first use
+- **RERANKER_STRATEGY**: Reranking method (default: `cross_encoder`)
+  - Options: `cross_encoder`, `openai`, `semantic`, `heuristic`
+#### Tag Generation Configuration
+- **TAG_GENERATION_METHOD**: Tag generation method (default: `auto`)
+  - `auto`: Automatically selects best available method
+  - `yake`: YAKE keyword extraction (English/Japanese)
+  - `keybert`: KeyBERT with BERT embeddings
+  - `spacy`: spaCy noun phrase extraction (English only)
+  - `janome`: Japanese tokenization and noun extraction
+  - `openai`: OpenAI-based tag generation (requires API key)
+- **MAX_TAGS_PER_CHUNK**: Maximum tags generated per chunk (default: `10`)
+- **MIN_TAG_LENGTH**: Minimum tag length in words (default: `2`)
+- **MAX_TAG_LENGTH**: Maximum tag length in words (default: `5`)
+#### Database & Storage
+- **CHROMA_PERSIST_DIR**: Directory for ChromaDB storage (default: `./chroma_data`)
+- **SESSION_TIMEOUT**: Session expiration time in seconds (default: `3600` = 1 hour)
+#### UI & Behavior
+- **DEFAULT_SEARCH_K**: Default number of search results (default: `5`)
+- **GRADIO_SERVER_PORT**: Server port (default: `7860`)
+- **GRADIO_SERVER_NAME**: Server hostname (default: `0.0.0.0` for all interfaces)
+- **LOG_LEVEL**: Logging verbosity (default: `INFO`)
+### Minimal Configuration Example
+For offline operation without OpenAI:
+```bash
+# Minimal .env file for offline operation
+LOG_LEVEL=INFO
+```
+The system will use default values for all other settings.
+## Deployment Options
+### Option 1: Local Development
+**Run locally:**
+```bash
+python app.py
+```
+Access at: `http://localhost:7860`
+### Option 2: Hugging Face Spaces
+**Requirements**:
+- Hugging Face account
+- Space created (set SDK to `gradio`)
+**Deployment Steps**:
+1. **Push to Hugging Face**:
+```bash
+git remote add huggingface https://huggingface.co/spaces/<your-username>/<space-name>
+git push huggingface main
+```
+2. **Configure Space Settings**:
+   - **SDK**: `gradio`
+   - **Python Version**: `3.10`
+   - **Hardware**: CPU (or GPU if needed)
+3. **Add Secrets** (if using OpenAI):
+   - Go to Space Settings → Secrets
+   - Add `OPENAI_API_KEY` secret
+4. **Environment Variables**:
+   - Add via Space Settings → Variables tab
+   - Or use secrets for sensitive values
+**Note**: Hugging Face Spaces automatically installs from `requirements.txt` and runs `app.py`.
+### Option 3: Docker Deployment
+**Create Dockerfile**:
+```dockerfile
+FROM python:3.10-slim
+WORKDIR /app
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements
+COPY requirements.txt .
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Models download automatically on first run
+# Copy application
+COPY . .
+# Create directories
+RUN mkdir -p reports chroma_data
+# Expose port
+EXPOSE 7860
+# Run application
+CMD ["python", "app.py"]
+```
+**Build and Run**:
+```bash
+docker build -t auto-tagging-rag .
+docker run -p 7860:7860 --env-file .env auto-tagging-rag
+```
+### Option 4: Cloud Deployment (AWS/GCP/Azure)
+**For AWS EC2**:
+```bash
+# Install dependencies
+sudo apt-get update
+sudo apt-get install python3-pip python3-venv git
+# Clone repository
+git clone <repository-url>
+cd auto_tagging_rag
+# Setup virtual environment
+python3 -m venv venv
+source venv/bin/activate
+# Install dependencies
+pip install -r requirements.txt
+# Models download automatically on first run
+# Configure environment
+cp .env.example .env
+# Edit .env with your settings
+# Run with nohup or systemd
+nohup python app.py > app.log 2>&1 &
+```
+**Security Groups**: Open port 7860 (or your configured port) for HTTP access.
+## Initial Setup
+### First Run Checklist
+1. ✅ Python 3.8+ installed
+2. ✅ Dependencies installed (`pip install -r requirements.txt`)
+3. ✅ `.env` file created and configured (optional, only if using OpenAI)
+4. ✅ Directories created (`reports/`, `chroma_data/`)
+5. ✅ Internet connection available (for initial model downloads - models download automatically)
+### First Launch
+```bash
+python app.py
+```
+**Expected Output**:
+```
+INFO:rag_app:OpenAI API key detected. OpenAI-powered auto-detection is ENABLED.
+# or
+INFO:rag_app:OpenAI API key not found. Using SentenceTransformers embeddings.
+INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled...
+INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
+Running on local URL:  http://127.0.0.1:7860
+```
+**First-time downloads**:
+- SentenceTransformers model (~80MB) - downloads automatically
+- Reranker model (~40MB) - downloads on first use if reranking enabled
+### Offline Operation
+After initial setup, the system works **fully offline**:
+- All models are cached locally
+- No network access required
+- Tiktoken fallback handles network issues gracefully
+## Verification
+### Test Installation
+1. **Start Application**:
+```bash
+python app.py
+```
+2. **Access UI**: Open `http://localhost:7860` in browser
+3. **Test Upload**:
+   - Go to "Upload & Tagging" tab
+   - Upload `sample_documents/emergency_procedures.txt`
+   - Click "Build RAG Index"
+   - Verify: Tags generated, document count = 1
+4. **Test Search**:
+   - Go to "Search & Compare" tab
+   - Enter query: "What are emergency procedures?"
+   - Click "Search All Methods"
+   - Verify: Results appear for all 4 methods
+5. **Test Evaluation**:
+   - Go to "Analytics & Evaluation" tab
+   - Copy content from `sample_evaluation_queries.json`
+   - Paste into "Evaluation Queries" field
+   - Click "Run Evaluation"
+   - Verify: Charts appear, CSV generated
+### Verify Models
+**Check spaCy model**:
+```bash
+python3 -c "import spacy; nlp = spacy.load('en_core_web_sm'); print('✓ spaCy model loaded')"
+```
+**Check SentenceTransformers**:
+```bash
+python3 -c "from sentence_transformers import SentenceTransformer; model = SentenceTransformer('all-MiniLM-L6-v2'); print('✓ SentenceTransformers model loaded')"
+```
+**Check ChromaDB**:
+```bash
+python3 -c "import chromadb; client = chromadb.PersistentClient(path='./chroma_data'); print('✓ ChromaDB initialized')"
+```
+## Troubleshooting
+### Common Issues
+#### Issue 1: "No module named spacy"
+**Solution**:
+```bash
+pip install spacy>=3.7.0
+python -m spacy download en_core_web_sm
+```
+#### Issue 2: "spaCy model not found"
+**Solution**:
+```bash
+python -m spacy download en_core_web_sm
+# Verify
+python3 -c "import spacy; nlp = spacy.load('en_core_web_sm')"
+```
+#### Issue 3: "Permission denied: /data"
+**Solution**: ChromaDB defaults to `/data` which may not be writable. The app automatically falls back to `./chroma_data` or user cache directory.
+**Manual override**:
+```bash
+export CHROMA_PERSIST_DIR=./chroma_data
+python app.py
+```
+#### Issue 4: "Tiktoken network warning"
+**Explanation**: This is expected in offline mode. Tiktoken attempts to download encodings but falls back gracefully to character-based token counting.
+**Solution**: No action needed - the system handles this automatically.
+#### Issue 5: "OpenAI API key not working"
+**Solution**:
+1. Verify key in `.env` file: `OPENAI_API_KEY=sk-...`
+2. Check key is valid: https://platform.openai.com/api-keys
+3. Restart application after changing `.env`
+#### Issue 6: "ChromaDB collection not found after restart"
+**Solution**: Collections are namespaced by embedding provider/dimension. If you switch providers, you need to re-upload documents.
+**Check collections**:
+```python
+import chromadb
+client = chromadb.PersistentClient(path="./chroma_data")
+collections = client.list_collections()
+print([col.name for col in collections])
+```
+#### Issue 7: "Session not persisting after refresh"
+**Solution**:
+- Sessions are stored in browser `localStorage` automatically
+- Server restarts: Sessions are restored from ChromaDB collections if they exist
+- Check browser console for errors
+#### Issue 8: "Import errors"
+**Solution**:
+```bash
+# Reinstall all dependencies
+pip install --upgrade -r requirements.txt
+```
+#### Issue 9: "Memory errors with large documents"
+**Solution**:
+- Reduce `MAX_TAGS_PER_CHUNK` in `.env`
+- Process documents in smaller batches
+- Increase system RAM or use smaller embedding models
+#### Issue 10: "Port 7860 already in use"
+**Solution**:
+```bash
+# Find process using port
+lsof -i :7860  # Linux/macOS
+netstat -ano | findstr :7860  # Windows
+# Kill process or change port in .env
+GRADIO_SERVER_PORT=7861
+```
+### Getting Help
+1. **Check Logs**: Look at console output for error messages
+2. **Enable Debug Logging**: Set `LOG_LEVEL=DEBUG` in `.env`
+3. **Check Documentation**: See `README.md` and other `.md` files
+4. **Verify Installation**: Run verification steps above
+## Next Steps
+After successful setup:
+1. **Upload Sample Documents**: Use files in `sample_documents/`
+2. **Run Sample Evaluation**: Use `sample_evaluation_queries.json`
+3. **Explore Features**: Try all tabs in the UI
+4. **Read Guides**:
+   - `EVALUATION_GUIDE.md` - How to run evaluation
+   - `SEARCH_COMPARE_GUIDE.md` - How to use Search & Compare
+   - `TAG_GENERATION_GUIDE.md` - Tag generation details

app.py ADDED Viewed

	@@ -0,0 +1,1552 @@

+import gradio as gr
+import os
+import logging
+from dotenv import load_dotenv
+import tempfile
+import pandas as pd
+from typing import List, Dict, Any, Tuple, Optional
+import shutil
+import json
+import time
+import zipfile
+from core.ingest import FlatTagChunker
+from core.index import VectorStore
+from core.retrieval import RAGManager
+from core.eval import RAGEvaluator
+from core.utils import generate_id
+from core.comparison import RAGComparisonFramework
+from core.visualization import RAGVisualizer
+from core.report_generator import ReportGenerator
+from core.session_manager import SessionManager
+from core.session_rag import SessionAwareRAGManager
+import os as _os
+_OPENAI_ON = False
+try:
+    from openai import OpenAI as _OpenAI
+    _OPENAI_ON = True if _os.getenv("OPENAI_API_KEY") else False
+except Exception:
+    _OPENAI_ON = False
+# Load environment variables from .env if present, then configure logging
+load_dotenv()
+logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
+logger = logging.getLogger("rag_app")
+if os.getenv("OPENAI_API_KEY"):
+    logger.info("OpenAI API key detected. OpenAI-powered auto-detection is ENABLED.")
+    if os.getenv("OPENAI_MODEL"):
+        logger.info(f"OpenAI model: {os.getenv('OPENAI_MODEL')}")
+else:
+    logger.info("OpenAI API key not set. Falling back to heuristic auto-detection.")
+# Global variables
+rag_manager = None
+evaluator = None
+comparator = None
+visualizer = None
+report_generator = None
+session_manager = None
+session_rag_manager = None
+current_collection = "documents"
+persist_directory = None
+def initialize_system():
+    """Initialize the RAG system"""
+    global rag_manager, evaluator, comparator, visualizer, report_generator, session_manager, session_rag_manager, persist_directory
+    # Try /data/chroma first (for HF Spaces), fallback to ./chroma_data
+    persist_dir = "/data/chroma" if os.path.exists("/data/chroma") else "./chroma_data"
+    # Create directory with proper permissions, and check if we can write to it
+    try:
+        os.makedirs(persist_dir, exist_ok=True, mode=0o755)
+        # Test write permissions
+        test_file = os.path.join(persist_dir, ".test_write")
+        try:
+            with open(test_file, 'w') as f:
+                f.write("test")
+            os.remove(test_file)
+        except (PermissionError, OSError):
+            # If can't write to /data/chroma, use ./chroma_data
+            persist_dir = "./chroma_data"
+            os.makedirs(persist_dir, exist_ok=True, mode=0o755)
+    except (PermissionError, OSError) as e:
+        # If even ./chroma_data fails, try current directory
+        persist_dir = "./chroma_data"
+        os.makedirs(persist_dir, exist_ok=True, mode=0o755)
+    persist_directory = persist_dir
+    rag_manager = RAGManager(persist_directory=persist_dir)
+    evaluator = RAGEvaluator(rag_manager)
+    comparator = RAGComparisonFramework(evaluator)
+    visualizer = RAGVisualizer()
+    report_generator = ReportGenerator()
+    # Initialize session manager
+    session_timeout = int(os.getenv("SESSION_TIMEOUT", 3600))
+    session_manager = SessionManager(base_persist_dir=persist_dir, session_timeout=session_timeout)
+    session_rag_manager = SessionAwareRAGManager(rag_manager, session_manager)
+    return f"System initialized successfully! Using persist directory: {persist_dir}"
+def reset_index() -> str:
+    """Clear Chroma persistence and reinitialize the vector store."""
+    global rag_manager, evaluator, comparator, visualizer, report_generator, session_manager, session_rag_manager, persist_directory
+    try:
+        dir_path = persist_directory or ("/data/chroma" if os.path.exists("/data/chroma") else "./chroma_data")
+        if os.path.exists(dir_path):
+            shutil.rmtree(dir_path, ignore_errors=True)
+        os.makedirs(dir_path, exist_ok=True, mode=0o755)
+        persist_directory = dir_path
+        rag_manager = RAGManager(persist_directory=dir_path)
+        evaluator = RAGEvaluator(rag_manager)
+        comparator = RAGComparisonFramework(evaluator)
+        visualizer = RAGVisualizer()
+        report_generator = ReportGenerator()
+        session_timeout = int(os.getenv("SESSION_TIMEOUT", 3600))
+        session_manager = SessionManager(base_persist_dir=dir_path, session_timeout=session_timeout)
+        session_rag_manager = SessionAwareRAGManager(rag_manager, session_manager)
+        return f"Index reset complete. Using fresh directory: {dir_path}"
+    except Exception as ex:
+        return f"Failed to reset index: {ex}"
+def upload_documents(files: List[str], language: str, user_tags: Optional[str] = None, use_flat_tags: bool = True, collection_name: str = None, progress: Any = None) -> Tuple[str, List[Dict[str, Any]], Dict[str, Any], List[Dict[str, Any]]]:
+    """Upload and process documents.
+    Returns: (status_text, per_file_summaries, collection_stats, chunk_rows)
+    per_file_summaries: [{filename, chunks, language, tags}]
+    """
+    global rag_manager, persist_directory, current_collection
+    if not files:
+        return "No files provided!", [], {}, []
+    # Ensure system is initialized
+    if not rag_manager:
+        initialize_system()
+    # Use provided collection or default
+    if collection_name:
+        current_collection = collection_name
+    # Parse user tags from comma-separated string
+    user_tags_list = []
+    if user_tags and user_tags.strip():
+        user_tags_list = [tag.strip() for tag in user_tags.split(',') if tag.strip()]
+    # Choose chunker based on tag mode
+    # Use FlatTagChunker for flat tagging
+    chunker = FlatTagChunker()
+    all_chunks = []
+    per_file_summaries: List[Dict[str, Any]] = []
+    chunk_rows: List[Dict[str, Any]] = []
+    processed_count = 0
+    errors: List[str] = []
+    total = len(files)
+    for idx, file_path in enumerate(files, start=1):
+        if progress:
+            try:
+                progress(idx, total=total, desc=f"Processing {idx}/{total}: {os.path.basename(file_path)}")
+            except Exception:
+                pass
+        try:
+            chunks = chunker.chunk_document(file_path, language=language, user_tags=user_tags_list)
+            if chunks:
+                # Aggregate per-file metadata from chunks (majority vote)
+                from collections import Counter
+                langs = Counter([c.metadata.get('lang') for c in chunks if c.metadata.get('lang')])
+                # Count tags (including user tags)
+                tag_count = sum(len(c.metadata.get('tags', [])) for c in chunks)
+                # Count user tags vs auto tags (first tags are user tags)
+                user_tag_count = len(user_tags_list) if user_tags_list else 0
+                per_file_summaries.append({
+                    'Filename': os.path.basename(file_path),
+                    'Chunks': len(chunks),
+                    'Language': (langs.most_common(1)[0][0] if langs else None),
+                    'Tags': tag_count,
+                    'User Tags': user_tag_count
+                })
+                # Prepare per-chunk preview rows
+                for c in chunks:
+                    md = c.metadata or {}
+                    row = {
+                        'Filename': os.path.basename(file_path),
+                        'Language': md.get('lang'),
+                        'Tags': ', '.join(md.get('tags', [])[:5])  # Show first 5 tags
+                    }
+                    row['Preview'] = (c.content[:160] + '...') if c.content else ''
+                    chunk_rows.append(row)
+                all_chunks.extend(chunks)
+                processed_count += 1
+            else:
+                errors.append(f"Warning: {os.path.basename(file_path)} produced no chunks")
+        except Exception as e:
+            error_msg = f"{os.path.basename(file_path)}: {str(e)}"
+            errors.append(error_msg)
+    # Index if any chunk present
+    vector_store = rag_manager.vector_store
+    stats: Dict[str, Any] = {"document_count": 0, "collection_name": current_collection}
+    if all_chunks:
+        vector_store.add_documents(current_collection, all_chunks)
+        stats = vector_store.get_collection_stats(current_collection)
+    # Build result message
+    total_tags = sum(len(c.metadata.get('tags', [])) for c in all_chunks)
+    status_lines = [
+        f"Processed {processed_count}/{total} files",
+        f"Indexed chunks: {len(all_chunks)}",
+        f"Generated tags: {total_tags}"
+    ]
+    if errors:
+        status_lines.append("\nErrors/Warnings:\n" + "\n".join(f"- {e}" for e in errors))
+    return "\n".join(status_lines), per_file_summaries, stats, chunk_rows
+def init_session(client_session_id: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Initialize or restore a user session.
+    Args:
+        client_session_id: Optional session ID from client (localStorage)
+    Returns:
+        Dict with session_id and collection_name
+    """
+    global session_manager
+    if not session_manager:
+        initialize_system()
+    # Use get_or_create_session which checks ChromaDB for existing collections
+    # This handles server restarts - if ChromaDB collection exists, restore session
+    session = session_manager.get_or_create_session(session_id=client_session_id)
+    if client_session_id and session.session_id == client_session_id:
+        # Successfully restored existing session
+        logger.info(f"Restored session {client_session_id} (collection: {session.collection_name})")
+        return {"session_id": session.session_id, "collection_name": session.collection_name}
+    else:
+        # Created new session
+        logger.info(f"Created new session {session.session_id} (collection: {session.collection_name})")
+        return {"session_id": session.session_id, "collection_name": session.collection_name, "new_session": True}
+def get_document_count(session_state: Dict[str, Any]) -> str:
+    """Get total document count from the current session's collection - returns only the number"""
+    global rag_manager
+    if not session_state or not session_state.get("collection_name"):
+        return "0"
+    try:
+        if not rag_manager:
+            initialize_system()
+        collection_name = session_state.get("collection_name")
+        stats = rag_manager.vector_store.get_collection_stats(collection_name)
+        doc_count = stats.get("document_count", 0)
+        return str(doc_count)
+    except Exception as e:
+        logger.warning(f"Failed to get document count: {e}")
+        return "0"
+def build_with_session(files: List[Any], language: str, user_tags: str, session_state: Dict[str, Any], progress=gr.Progress()) -> Tuple[Dict[str, Any], str, pd.DataFrame, pd.DataFrame, str]:
+    """Build RAG index with session support and progress tracking"""
+    # Initialize or get session
+    if not session_state or not session_state.get("session_id"):
+        session_state = init_session()
+    # Refresh session to prevent expiration (get_session updates access time)
+    global session_manager
+    if session_manager and session_state.get("session_id"):
+        session_manager.get_session(session_state["session_id"])
+    # Build index using session collection with progress (always use flat tags)
+    status, stats_df, chunks_df = build_rag_index(
+        files, language, user_tags, use_flat_tags=True, collection_name=session_state.get("collection_name")
+    )
+    # Extract tags for visualization
+    tag_list_str = ""
+    if chunks_df is not None and not chunks_df.empty and 'Tags' in chunks_df.columns:
+        all_tags = []
+        for tags_str in chunks_df['Tags'].dropna():
+            if tags_str:
+                all_tags.extend([t.strip() for t in str(tags_str).split(',')])
+        from collections import Counter
+        tag_counts = Counter(all_tags)
+        top_tags = [f"{tag} ({count})" for tag, count in tag_counts.most_common(20)]
+        tag_list_str = "\n".join([f"- {tag}" for tag in top_tags]) if top_tags else "No tags generated"
+    # Document count will be updated separately after processing completes to avoid progress bar
+    return session_state, status, stats_df, chunks_df, tag_list_str
+def build_rag_index(files: List[Any], language: str, user_tags: str = "", use_flat_tags: bool = True, collection_name: str = None) -> Tuple[str, pd.DataFrame, pd.DataFrame]:
+    """Build RAG index from uploaded files"""
+    if not files:
+        return "No files provided!", None, None
+    # Gradio file objects already contain the full path in .name property
+    # No need to prepend /tmp/ - just use the path directly
+    file_paths = []
+    for file in files:
+        # Get the file path - Gradio provides it as .name or as a string
+        if isinstance(file, str):
+            file_path = file
+        elif hasattr(file, 'name') and file.name:
+            # file.name already contains the full path (e.g., /tmp/gradio/.../filename.txt)
+            file_path = file.name
+        else:
+            # Fallback for edge cases
+            return f"Error: Unable to get file path from uploaded file", None
+        # Normalize the path to handle any double slashes
+        file_path = os.path.normpath(file_path)
+        # Ensure the file exists
+        if not os.path.exists(file_path):
+            return f"Error: File not found at {file_path}", None
+        file_paths.append(file_path)
+    # Normalize "Auto" to None for auto-detection downstream
+    norm_language = None if not language or str(language).lower() == 'auto' else language
+    # Process documents (progress provided by gradio)
+    status_text, file_summaries, stats, chunk_rows = upload_documents(
+        file_paths, norm_language, user_tags,
+        use_flat_tags=use_flat_tags, collection_name=collection_name, progress=None
+    )
+    # Build per-file dataframe (no totals row)
+    per_file_df = pd.DataFrame(file_summaries) if file_summaries else pd.DataFrame(columns=['Filename','Chunks','Language','Tags','User Tags'])
+    chunks_df = pd.DataFrame(chunk_rows) if chunk_rows else pd.DataFrame(columns=['Filename','Language','Tags','Preview'])
+    return status_text, per_file_df, chunks_df
+def _format_result(result, method_name, query: str, k: int, used_tags=None):
+    """Format a single retrieval result for display"""
+    if not result or not result.sources:
+        return f"**{method_name}**: No results found", ""
+    # Generate answer
+    answer = _llm_answer(query, result.sources[:3])  # Use top 3 for answer generation
+    content_lines = [f"**{method_name}** (Latency: {result.latency:.3f}s)\n"]
+    content_lines.append(f"**Answer**: {answer}\n")
+    content_lines.append(f"\n**Retrieved Documents** ({len(result.sources)} total):\n")
+    # Get tags from retrieved documents
+    doc_tags = set()
+    for src in result.sources[:k]:
+        meta = src.get('metadata', {})
+        tags_from_meta = meta.get('tags', [])
+        # Handle both list and comma-separated string
+        if isinstance(tags_from_meta, str):
+            tags_from_meta = [t.strip() for t in tags_from_meta.split(',') if t.strip()]
+        doc_tags.update(tags_from_meta)
+    if used_tags:
+        content_lines.append(f"**Tags Used**: {', '.join(used_tags)}\n")
+    if doc_tags:
+        content_lines.append(f"**Tags in Results**: {', '.join(list(doc_tags)[:10])}\n")
+    content_lines.append("\n**Top Results**:\n")
+    for i, src in enumerate(result.sources[:k], 1):
+        score = src.get('score', 0)
+        meta = src.get('metadata', {})
+        source_name = meta.get('source_name', 'unknown')
+        src_tags = meta.get('tags', [])
+        if isinstance(src_tags, str):
+            src_tags = [t.strip() for t in src_tags.split(',') if t.strip()]
+        tag_str = f" [Tags: {', '.join(src_tags[:3])}]" if src_tags else ""
+        content_lines.append(f"{i}. [{source_name}] (Score: {score:.3f}){tag_str}")
+        content_lines.append(f"   {src.get('content', '')[:120]}...\n")
+    return "\n".join(content_lines), answer
+def search_all_methods(query: str, k: int, tags: str, tag_operator: str, vector_weight: float, tag_weight: float, session_state: Dict[str, Any]) -> Tuple[str, str, str, str, str, Dict[str, Any]]:
+    """Search using all 4 retrieval methods - processes sequentially to reduce load"""
+    global session_rag_manager, rag_manager
+    if not session_rag_manager or not rag_manager:
+        initialize_system()
+    # Get or refresh session
+    if not session_state or not session_state.get("session_id"):
+        session_state = init_session()
+    else:
+        # Refresh session to prevent expiration (get_session updates access time)
+        session_manager.get_session(session_state["session_id"])
+    # Get session-aware RAG manager
+    rag = session_rag_manager.get_rag(session_state["session_id"])
+    # Parse tags
+    tag_list = [t.strip() for t in tags.split(',') if t.strip()] if tags else []
+    results = {}
+    base_text = "**Base RAG**: Processing..."
+    tag_text = "**Tag Filter RAG**: Waiting..."
+    hybrid_text = "**Hybrid RAG**: Waiting..."
+    rerank_text = "**Hybrid Rerank RAG**: Waiting..."
+    try:
+        # 1. Base RAG (process first)
+        logger.info("Processing Base RAG...")
+        base_result = rag.base_rag.retrieve(query, k)
+        results['base'] = base_result
+        base_text, base_answer = _format_result(base_result, "Base RAG", query, k)
+        # 2. Tag Filter RAG (process second)
+        logger.info("Processing Tag Filter RAG...")
+        tag_result = rag.tag_filter_rag.retrieve(query, k, tags=tag_list, tag_operator=tag_operator)
+        results['tag'] = tag_result
+        tag_text, tag_answer = _format_result(tag_result, f"Tag Filter RAG ({tag_operator})", query, k, tag_list)
+        # 3. Hybrid RAG (process third)
+        logger.info("Processing Hybrid RAG...")
+        hybrid_result = rag.hybrid_rag.retrieve(query, k, tags=tag_list, vector_weight=vector_weight, tag_weight=tag_weight)
+        results['hybrid'] = hybrid_result
+        hybrid_text, hybrid_answer = _format_result(hybrid_result, f"Hybrid RAG (V:{vector_weight:.1f}, T:{tag_weight:.1f})", query, k, tag_list)
+        # 4. Hybrid Rerank RAG (process last - most expensive)
+        logger.info("Processing Hybrid Rerank RAG...")
+        rerank_result = rag.hybrid_rerank_rag.retrieve(query, k, tags=tag_list, vector_weight=vector_weight, tag_weight=tag_weight)
+        results['rerank'] = rerank_result
+        rerank_text, rerank_answer = _format_result(rerank_result, "Hybrid Rerank RAG", query, k, tag_list)
+    except Exception as e:
+        error_msg = f"Error during retrieval: {str(e)}"
+        logger.error(f"Search error: {error_msg}", exc_info=True)
+        return error_msg, error_msg, error_msg, error_msg, f"Error: {error_msg}", session_state
+    # Create quick comparison metrics
+    metrics = []
+    for method_name, result, answer in [
+        ("Base RAG", results.get('base'), base_answer if 'base_answer' in locals() else None),
+        ("Tag Filter RAG", results.get('tag'), tag_answer if 'tag_answer' in locals() else None),
+        ("Hybrid RAG", results.get('hybrid'), hybrid_answer if 'hybrid_answer' in locals() else None),
+        ("Hybrid Rerank RAG", results.get('rerank'), rerank_answer if 'rerank_answer' in locals() else None)
+    ]:
+        if result and result.sources:
+            metrics.append(f"- **{method_name}**: {len(result.sources)} docs, {result.latency:.3f}s, Score: {result.sources[0].get('score', 0):.3f}")
+        else:
+            metrics.append(f"- **{method_name}**: No results")
+    summary = f"""
+### Query: {query}
+### Quick Comparison:
+{chr(10).join(metrics)}
+### Tags: {', '.join(tag_list) if tag_list else 'None'}
+"""
+    logger.info("All search methods completed successfully")
+    return base_text, tag_text, hybrid_text, rerank_text, summary, session_state
+def _llm_answer(user_message: str, contexts: List[Dict[str, Any]]) -> str:
+    """Generate a natural, human-like answer grounded in contexts.
+    Uses OpenAI if configured; otherwise produces a conversational fallback.
+    """
+    # Build a compact context string
+    ctx_blocks = []
+    for i, c in enumerate(contexts, 1):
+        src = c.get('metadata', {}).get('source_name', 'unknown')
+        snippet = (c.get('content', '') or '')[:400]
+        ctx_blocks.append(f"[{i}] ({src}) {snippet}")
+    ctx_text = "\n\n".join(ctx_blocks)
+    # Prefer OpenAI
+    if os.getenv("OPENAI_API_KEY") and '_OpenAI' in globals() and _OpenAI is not None:
+        try:
+            client = _OpenAI()
+            system_prompt = (
+                "You are a helpful, professional assistant. Answer in a warm, natural, and concise tone. "
+                "ALWAYS ground the answer ONLY in the provided contexts. If information is missing, say so. "
+                "Style: Start with a clear 1-2 sentence answer. Then, if helpful, add 2-5 short bullet points with key facts. "
+                "Avoid hedging, avoid citations inline, avoid repeating the question."
+            )
+            content = (
+                f"User question:\n{user_message}\n\n"
+                f"Contexts (each begins with [n]):\n{ctx_text}\n\n"
+                "Write a natural, human-like response as specified."
+            )
+            resp = client.chat.completions.create(
+                model=os.getenv("OPENAI_MODEL", "gpt-4o-mini"),
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": content},
+                ],
+                temperature=0.2,
+            )
+            return resp.choices[0].message.content.strip()
+        except Exception:
+            pass
+    # Fallback heuristic: conversational synthesis from top snippets
+    if contexts:
+        snippets = []
+        for src in contexts[:3]:
+            txt = (src.get('content') or '').strip()
+            if txt:
+                snippets.append(txt)
+        joined = " ".join(snippets)
+        # Naive sentence split
+        sentences = [s.strip() for s in joined.replace('\n', ' ').split('.') if s.strip()]
+        bullets = sentences[:4]
+        lead = "Here’s what the documents say:"
+        if bullets:
+            bullets_text = "\n".join([f"- {b}." for b in bullets])
+            return f"{lead}\n\n{bullets_text}"
+    return "I don't have enough information to answer that yet."
+def chat_with_rag(message: str, history: List[Dict[str, str]], pipeline: str, k: int, tags: str, use_tags: bool, session_state: Dict[str, Any]) -> Tuple[str, List[Dict[str, str]], Dict[str, Any], str]:
+    """Chat interface with RAG: choose one pipeline, retrieve, then generate an LLM answer."""
+    global session_rag_manager, rag_manager, session_manager
+    if not session_rag_manager or not rag_manager:
+        return "System not initialized! Please build the RAG index first.", history, session_state or {}, ""
+    # Get or refresh session
+    if not session_state or not session_state.get("session_id"):
+        session_state = init_session()
+    else:
+        # Refresh session to prevent expiration (get_session updates access time)
+        session_manager.get_session(session_state["session_id"])
+    rag = session_rag_manager.get_rag(session_state["session_id"])
+    # Parse tags - only use if toggle is enabled
+    tag_list = [t.strip() for t in tags.split(',') if t.strip()] if (use_tags and tags) else []
+    # Retrieve with chosen pipeline
+    filters_note = ""
+    result = None
+    try:
+        if pipeline == "base_rag":
+            result = rag.base_rag.retrieve(message, k)
+        elif pipeline == "tag_filter_rag":
+            result = rag.tag_filter_rag.retrieve(message, k, tags=tag_list, tag_operator="OR")
+            if tag_list:
+                filters_note = f" (tags: {', '.join(tag_list)})"
+        elif pipeline == "hybrid_rag":
+            result = rag.hybrid_rag.retrieve(message, k, tags=tag_list, vector_weight=0.7, tag_weight=0.3)
+            if tag_list:
+                filters_note = f" (tags: {', '.join(tag_list)})"
+        elif pipeline == "hybrid_rerank_rag":
+            result = rag.hybrid_rerank_rag.retrieve(message, k, tags=tag_list, vector_weight=0.7, tag_weight=0.3)
+            if tag_list:
+                filters_note = f" (tags: {', '.join(tag_list)})"
+        else:
+            return f"Unknown pipeline: {pipeline}", history, session_state or {}, ""
+    except Exception as e:
+        return f"Error during retrieval: {str(e)}", history, session_state or {}, ""
+    if not result:
+        return "No results retrieved", history, session_state or {}, ""
+    # Generate grounded answer
+    answer = _llm_answer(message, result.sources)
+    # Format sources with tags for display
+    sources_display = []
+    sources_display.append(f"### 📎 Source Documents ({len(result.sources)} total)\n")
+    for i, s in enumerate(result.sources[:k], 1):
+        meta = s.get('metadata', {})
+        src_name = meta.get('source_name', 'unknown')
+        src_tags = meta.get('tags', [])
+        tag_str = f" **Tags:** {', '.join(src_tags[:5])}" if src_tags else ""
+        score = s.get('score', 0)
+        sources_display.append(f"{i}. **{src_name}** (Score: {score:.3f}){tag_str}")
+        sources_display.append(f"   {s.get('content', '')[:150]}...\n")
+    sources_text = "\n".join(sources_display)
+    # Sources list (compact for chat)
+    src_lines = []
+    for s in result.sources[:k]:
+        src_name = s.get('metadata', {}).get('source_name', 'unknown')
+        src_tags = s.get('metadata', {}).get('tags', [])
+        tag_str = f" [{', '.join(src_tags[:3])}]" if src_tags else ""
+        src_lines.append(f"• {src_name}{tag_str}")
+    pipeline_names = {
+        "base_rag": "Base-RAG",
+        "tag_filter_rag": "Tag Filter RAG",
+        "hybrid_rag": "Hybrid RAG",
+        "hybrid_rerank_rag": "Hybrid Rerank RAG"
+    }
+    pipeline_name = pipeline_names.get(pipeline, pipeline)
+    response = (
+        f"{answer}\n\n"
+        f"─────────────────────────────────────\n"
+        f"📎 Sources ({pipeline_name}{filters_note}):\n" + ("\n".join(src_lines) or "(none)")
+    )
+    history.append({"role": "user", "content": message})
+    history.append({"role": "assistant", "content": response})
+    return "", history, session_state, sources_text
+def run_evaluation(queries_json: str, output_filename: str, user_satisfaction_json: str = None, session_state: Dict[str, Any] = None) -> Tuple[str, pd.DataFrame, Dict[str, Any], List[Dict[str, Any]]]:
+    """Run quantitative evaluation with all pipelines"""
+    global evaluator, comparator, visualizer, report_generator, session_rag_manager, session_manager, rag_manager
+    if not evaluator:
+        return "System not initialized!", None, None, None
+    # Get or refresh session
+    if not session_state or not session_state.get("session_id"):
+        session_state = init_session()
+    else:
+        # Refresh session to prevent expiration
+        if session_manager and session_state.get("session_id"):
+            session_manager.get_session(session_state["session_id"])
+    # Use session-aware RAG manager for evaluation
+    if session_rag_manager and session_state and session_state.get("session_id"):
+        rag = session_rag_manager.get_rag(session_state["session_id"])
+        # Create a temporary evaluator with the session RAG manager
+        from core.eval import RAGEvaluator
+        evaluator = RAGEvaluator(rag)
+    try:
+        queries = json.loads(queries_json)
+    except json.JSONDecodeError as e:
+        return f"Invalid JSON format: {str(e)}", None, None, None
+    # Parse user satisfaction scores if provided
+    user_satisfaction = None
+    if user_satisfaction_json:
+        try:
+            user_satisfaction = json.loads(user_satisfaction_json)
+        except json.JSONDecodeError:
+            pass
+    # Run evaluation with all pipelines
+    df, summary_dict, raw_results = evaluator.batch_evaluate(
+        queries, output_filename,
+        pipelines=['base_rag', 'tag_filter_rag', 'hybrid_rag', 'hybrid_rerank_rag'],
+        user_satisfaction=user_satisfaction
+    )
+    # Generate summary statistics
+    summary = df.groupby(['pipeline', 'k']).agg({
+        'precision_at_k': 'mean',
+        'ndcg_at_k': 'mean',
+        'hit_at_k': 'mean',
+        'mrr': 'mean',
+        'semantic_similarity': 'mean',
+        'latency': 'mean',
+        'retrieved_count': 'mean'
+    }).reset_index()
+    if 'user_satisfaction' in df.columns:
+        summary['user_satisfaction'] = df.groupby(['pipeline', 'k'])['user_satisfaction'].mean().reset_index()['user_satisfaction']
+    # Create comparison plot data (for backward compatibility)
+    plot_data = pd.DataFrame({
+        'k': summary[summary['pipeline'] == 'base_rag']['k'],
+        'base_rag_hit@k': summary[summary['pipeline'] == 'base_rag']['hit_at_k'],
+        'tag_filter_rag_hit@k': summary[summary['pipeline'] == 'tag_filter_rag']['hit_at_k'],
+        'hybrid_rag_hit@k': summary[summary['pipeline'] == 'hybrid_rag']['hit_at_k'],
+        'hybrid_rerank_rag_hit@k': summary[summary['pipeline'] == 'hybrid_rerank_rag']['hit_at_k']
+    })
+    summary_dict = {
+        'total_queries': len(queries),
+        'pipelines': ['base_rag', 'tag_filter_rag', 'hybrid_rag', 'hybrid_rerank_rag'],
+        'summary_stats': summary.to_dict('records'),
+        'plot_data': plot_data.to_dict('records')
+    }
+    return f"Evaluation completed! Processed {len(queries)} queries. Results saved to {output_filename}", df, summary_dict, raw_results
+def run_evaluation_with_viz(queries_json: str, output_filename: str, user_satisfaction_json: str = None, session_state: Dict[str, Any] = None) -> Tuple[str, pd.DataFrame, str, str, str, Dict[str, str], str]:
+    """Run evaluation with visualization and report generation"""
+    global visualizer, report_generator
+    status, df, summary, raw_results = run_evaluation(queries_json, output_filename, user_satisfaction_json, session_state)
+    if df is None:
+        return status, None, "", None, None, {}, output_filename
+    # Generate visualizations
+    viz_files = {}
+    if visualizer:
+        try:
+            viz_dir = "reports/visualizations"
+            os.makedirs(viz_dir, exist_ok=True)
+            viz_files = visualizer.create_all_charts(df, output_dir=viz_dir)
+        except Exception as e:
+            logger.warning(f"Visualization generation failed: {e}")
+    # Generate report
+    report_paths = {}
+    if report_generator:
+        try:
+            report_paths = report_generator.generate_report(
+                df, summary,
+                report_name=output_filename.replace('.csv', ''),
+                visualizations=viz_files,
+                raw_results=raw_results
+            )
+        except Exception as e:
+            logger.warning(f"Report generation failed: {e}")
+    # Create summary text
+    summary_text = f"### Evaluation Summary\n\n"
+    summary_text += f"**Total Queries**: {summary.get('total_queries', len(df['query'].unique())) if df is not None else 0}\n\n"
+    if summary and 'summary_stats' in summary:
+        summary_text += "**Average Metrics by Pipeline**:\n\n"
+        # Group by pipeline to show all k values together
+        pipelines = {}
+        for stat in summary['summary_stats']:
+            pipeline = stat.get('pipeline', 'unknown')
+            if pipeline not in pipelines:
+                pipelines[pipeline] = []
+            pipelines[pipeline].append(stat)
+        # Show all pipelines, sorted by name
+        for pipeline in sorted(pipelines.keys()):
+            for stat in sorted(pipelines[pipeline], key=lambda x: x.get('k', 0)):
+                k = stat.get('k', 'N/A')
+                summary_text += f"- {pipeline} (k={k}): "
+                summary_text += f"Precision@{k}={stat.get('precision_at_k', 0):.3f}, "
+                summary_text += f"nDCG@{k}={stat.get('ndcg_at_k', 0):.3f}, "
+                summary_text += f"MRR={stat.get('mrr', 0):.3f}\n"
+    if report_paths:
+        summary_text += f"\n**Reports Generated**:\n"
+        for report_type, path in report_paths.items():
+            summary_text += f"- {report_type}: {path}\n"
+    # Get visualization files as images - ensure paths exist and are absolute
+    bar_chart = None
+    line_plot = None
+    if viz_files:
+        bar_path = viz_files.get('bar')
+        line_path = viz_files.get('line')
+        # Check if files exist and convert to absolute paths
+        if bar_path and os.path.exists(bar_path):
+            bar_chart = os.path.abspath(bar_path)
+        if line_path and os.path.exists(line_path):
+            line_plot = os.path.abspath(line_path)
+    return status, df, summary_text, bar_chart, line_plot, viz_files, output_filename
+def update_remaining_vizs(viz_files: Dict[str, str]):
+    """Update remaining visualization tabs"""
+    if not viz_files:
+        return None, None, None, None
+    # Check if files exist and convert to absolute paths
+    scatter = viz_files.get('scatter')
+    box = viz_files.get('box')
+    stacked_bar = viz_files.get('stacked_bar')
+    pareto = viz_files.get('pareto')
+    scatter_path = os.path.abspath(scatter) if scatter and os.path.exists(scatter) else None
+    box_path = os.path.abspath(box) if box and os.path.exists(box) else None
+    stacked_bar_path = os.path.abspath(stacked_bar) if stacked_bar and os.path.exists(stacked_bar) else None
+    pareto_path = os.path.abspath(pareto) if pareto and os.path.exists(pareto) else None
+    return scatter_path, box_path, stacked_bar_path, pareto_path
+# Diagnostics: simple OpenAI connectivity test
+## (removed) test_openai_connectivity helper
+# Initialize system
+initialize_system()
+# Create Gradio interface
+# Minimal CSS to keep layout stable when vertical scrollbar appears and improve mobile spacing
+APP_CSS = """
+html, body { scrollbar-gutter: stable both-edges; }
+body { overflow-y: scroll; }
+* { box-sizing: border-box; }
+@media (max-width: 768px) {
+  .gradio-container { padding-left: 8px; padding-right: 8px; }
+}
+"""
+with gr.Blocks(title="Auto Tagging RAG", css=APP_CSS) as demo:
+    # Global header with session and status indicators
+    with gr.Row():
+        gr.Markdown("# Auto Tagging RAG System")
+    with gr.Row():
+        session_indicator = gr.Markdown("**Session**: Not initialized", visible=True)
+        document_count_indicator = gr.Markdown("**Documents**: 0", visible=True)
+    # Session state
+    session_state = gr.State(value={"session_id": None, "collection_name": None})
+    # BrowserState to persist session ID in localStorage
+    browser_session_id = gr.BrowserState(default_value=None, storage_key="rag_session_id")
+    with gr.Tab("Upload & Tagging"):
+        gr.Markdown("## Upload and Process Documents")
+        with gr.Row():
+            with gr.Column():
+                file_upload = gr.File(
+                    label="Upload PDF/TXT Files",
+                    file_count="multiple",
+                    file_types=[".pdf", ".txt"]
+                )
+                language_dropdown = gr.Dropdown(
+                    choices=["Auto", "en", "ja"],
+                    label="Language",
+                    value="Auto"
+                )
+                manual_tags_input = gr.Textbox(
+                    label="Add Tags (comma-separated, optional)",
+                    placeholder="hospital-protocol, urgent, confidential",
+                    info="Add custom tags that will be combined with auto-generated tags",
+                    lines=2
+                )
+                build_btn = gr.Button("Build RAG Index", variant="primary")
+            with gr.Column():
+                build_output = gr.Textbox(label="Build Status", lines=4)
+                stats_table = gr.DataFrame(label="Processing Summary")
+                chunks_table = gr.DataFrame(label="Indexed Chunks (preview)")
+                reset_btn = gr.Button("Reset Index (Clear chroma_data)", variant="secondary")
+        # Tag visualization section
+        with gr.Accordion("Tag Visualization", open=False):
+            tag_visualization = gr.Markdown(label="Generated Tags (Top 20)", value="Tags will appear here after processing...")
+    with gr.Tab("Search & Compare"):
+        gr.Markdown("## Compare All Retrieval Methods Side-by-Side")
+        with gr.Row():
+            with gr.Column():
+                search_query = gr.Textbox(
+                    label="Search Query",
+                    placeholder="Enter your query...",
+                    lines=2
+                )
+                tags_input = gr.Textbox(
+                    label="Tags (comma-separated, optional)",
+                    placeholder="tag1, tag2, tag3",
+                    lines=1
+                )
+                with gr.Row():
+                    tag_operator = gr.Radio(
+                        choices=["OR", "AND", "NOT"],
+                        value="OR",
+                        label="Tag Operator",
+                        info="OR: any tag, AND: all tags, NOT: exclude tags"
+                    )
+                    k_slider = gr.Slider(
+                        minimum=1, maximum=20,
+                        value=int(os.getenv("DEFAULT_SEARCH_K", 5)),
+                        step=1,
+                        label="Number of results (k)"
+                    )
+                with gr.Row():
+                    vector_weight = gr.Slider(
+                        minimum=0.0, maximum=1.0, value=0.7, step=0.1,
+                        label="Vector Weight (for Hybrid)",
+                        info="Weight for vector similarity in hybrid search"
+                    )
+                    tag_weight = gr.Slider(
+                        minimum=0.0, maximum=1.0, value=0.3, step=0.1,
+                        label="Tag Weight (for Hybrid)",
+                        info="Weight for tag matching in hybrid search"
+                    )
+                search_btn = gr.Button("Search All Methods", variant="primary")
+                search_status = gr.Markdown("**Status**: Ready - Click 'Search All Methods' to start")
+            with gr.Column():
+                with gr.Row():
+                    base_results = gr.Textbox(
+                        label="Base RAG",
+                        lines=6,
+                        max_lines=10,
+                        value="Results will appear here..."
+                    )
+                    tag_results = gr.Textbox(
+                        label="Tag Filter RAG",
+                        lines=6,
+                        max_lines=10,
+                        value="Results will appear here..."
+                    )
+                with gr.Row():
+                    hybrid_results = gr.Textbox(
+                        label="Hybrid RAG",
+                        lines=6,
+                        max_lines=10,
+                        value="Results will appear here..."
+                    )
+                    rerank_results = gr.Textbox(
+                        label="Hybrid Rerank RAG",
+                        lines=6,
+                        max_lines=10,
+                        value="Results will appear here..."
+                    )
+        search_summary = gr.Markdown()
+    with gr.Tab("Chat Interface"):
+        gr.Markdown("## Natural Conversation with Tag-Enhanced RAG")
+        with gr.Row():
+            with gr.Column(scale=1):
+                pipeline_radio = gr.Radio(
+                    choices=["base_rag", "tag_filter_rag", "hybrid_rag", "hybrid_rerank_rag"],
+                    label="RAG Pipeline",
+                    value="hybrid_rag"
+                )
+                use_tags_toggle = gr.Checkbox(
+                    label="Enable Tag Filtering",
+                    value=False,
+                    info="Use tags for tag-based pipelines"
+                )
+                chat_tags_input = gr.Textbox(
+                    label="Tags (comma-separated, for tag-based pipelines)",
+                    placeholder="tag1, tag2",
+                    lines=1,
+                    visible=True
+                )
+                chat_k_slider = gr.Slider(
+                    minimum=1, maximum=10, value=3, step=1,
+                    label="Number of results"
+                )
+            with gr.Column(scale=2):
+                chatbot = gr.Chatbot(label="RAG Chat", type="messages", height=400)
+                chat_input = gr.Textbox(
+                    label="Message",
+                    placeholder="Ask a question...",
+                    lines=2
+                )
+                chat_btn = gr.Button("Send", variant="primary")
+        # Source documents display with tags
+        with gr.Accordion("📎 Source Documents", open=False):
+            chat_sources = gr.Markdown(label="Retrieved Documents with Tags", value="Source documents will appear here after chat...")
+    with gr.Tab("Analytics & Evaluation"):
+        gr.Markdown("## Performance Visualization and Metrics")
+        with gr.Row():
+            with gr.Column():
+                eval_queries = gr.Textbox(
+                    label="Evaluation Queries (JSON)",
+                    lines=8,
+                    placeholder='''[
+  {
+    "query": "What are emergency procedures?",
+    "ground_truth": ["chunk_id_1", "chunk_id_2"],
+    "k_values": [1, 3, 5],
+    "tags": ["emergency", "procedure"]
+  }
+]''',
+                    value='''[
+  {
+    "query": "What are the emergency procedures?",
+    "ground_truth": ["Emergency protocols for triage", "Patient assessment guidelines"],
+    "k_values": [1, 3, 5]
+  }
+]'''
+                )
+                user_satisfaction_input = gr.Textbox(
+                    label="User Satisfaction Scores (JSON, optional)",
+                    lines=3,
+                    placeholder='''{
+  "query_0": 4.5,
+  "query_1": 4.2,
+  "query_2": 4.8
+}''',
+                    value=""
+                )
+                eval_output_name = gr.Textbox(
+                    label="Output Filename",
+                    value="evaluation_results.csv"
+                )
+                with gr.Row():
+                    eval_btn = gr.Button("Run Evaluation", variant="primary")
+                    export_csv_btn = gr.Button("Download CSV", variant="secondary")
+                    export_png_btn = gr.Button("Download Charts", variant="secondary")
+            with gr.Column():
+                eval_output = gr.Textbox(label="Evaluation Status", lines=3)
+                eval_summary_text = gr.Markdown(label="Summary Statistics")
+                eval_results_table = gr.DataFrame(label="Evaluation Results")
+                query_history = gr.DataFrame(label="Query History with Performance Scores", visible=False)
+                export_csv_file = gr.File(visible=False)
+                export_charts_files = gr.File(visible=False)
+        with gr.Tabs():
+            with gr.Tab("Bar Charts"):
+                eval_bar_chart = gr.Image(label="Metric Comparison (Bar Chart)")
+            with gr.Tab("Line Plots"):
+                eval_line_plot = gr.Image(label="Metric Trends (Line Plot)")
+            with gr.Tab("Scatter Plots"):
+                eval_scatter_plot = gr.Image(label="Correlation Analysis (Scatter Plot)")
+            with gr.Tab("Box Plots"):
+                eval_box_plot = gr.Image(label="Distribution Analysis (Box Plot)")
+            with gr.Tab("Stacked Bar"):
+                eval_stacked_plot = gr.Image(label="Method Breakdown (Stacked Bar)")
+            with gr.Tab("Pareto"):
+                eval_pareto_plot = gr.Image(label="Performance Ranking (Pareto Chart)")
+    with gr.Tab("Settings & Management"):
+        gr.Markdown("## System Configuration and User Management")
+        with gr.Accordion("Tag Generation Parameters", open=False):
+            max_tags_slider = gr.Slider(
+                minimum=5, maximum=50, value=10, step=1,
+                label="Max Tags Per Chunk",
+                info="Maximum number of tags to generate per document chunk"
+            )
+            min_tag_length_slider = gr.Slider(
+                minimum=1, maximum=5, value=2, step=1,
+                label="Min Tag Length (words)",
+                info="Minimum number of words in a tag phrase"
+            )
+            max_tag_length_slider = gr.Slider(
+                minimum=1, maximum=5, value=3, step=1,
+                label="Max Tag Length (words)",
+                info="Maximum number of words in a tag phrase"
+            )
+            tag_method_dropdown = gr.Dropdown(
+                choices=["auto", "yake", "keybert", "spacy", "janome", "openai"],
+                value="auto",
+                label="Tag Generation Method",
+                info="Method for generating tags (auto selects best available)"
+            )
+            apply_tag_params_btn = gr.Button("Apply Tag Settings", variant="primary")
+            tag_params_status = gr.Textbox(label="Status", lines=2, interactive=False)
+        with gr.Accordion("Hybrid Search Weights", open=False):
+            default_vector_weight = gr.Slider(
+                minimum=0.0, maximum=1.0, value=0.7, step=0.1,
+                label="Default Vector Weight"
+            )
+            default_tag_weight = gr.Slider(
+                minimum=0.0, maximum=1.0, value=0.3, step=0.1,
+                label="Default Tag Weight"
+            )
+            apply_weights_btn = gr.Button("Apply Weight Settings", variant="primary")
+            weights_status = gr.Textbox(label="Status", lines=2, interactive=False)
+        with gr.Accordion("Database Management", open=False):
+            with gr.Row():
+                clear_data_btn = gr.Button("Clear All Data", variant="stop")
+                export_data_btn = gr.Button("Export Database", variant="secondary")
+                import_data_btn = gr.File(label="Import Database", file_count="single", file_types=[".sqlite3", ".db"])
+            db_status = gr.Textbox(label="Database Status", lines=2, interactive=False)
+        with gr.Accordion("Embedding Configuration", open=True):
+            gr.Markdown("**Select the embedding provider and model.** Switching providers requires re-indexing your documents.")
+            gr.Markdown("**Note:** For OpenAI embeddings, set `OPENAI_API_KEY` in your `.env` file or environment variables. API keys should not be set through the UI.")
+            with gr.Row():
+                with gr.Column():
+                    emb_provider = gr.Radio(
+                        choices=["SentenceTransformers", "OpenAI"],
+                        value="SentenceTransformers",
+                        label="Embeddings Provider",
+                        info="Choose between local SentenceTransformers models or OpenAI embeddings (requires OPENAI_API_KEY in .env)"
+                    )
+                    with gr.Row():
+                        apply_embed_btn = gr.Button("Apply Embedding Settings", variant="primary")
+            with gr.Row():
+                with gr.Column():
+                    st_model_in = gr.Textbox(
+                        label="SentenceTransformers Model",
+                        value=os.getenv("ST_EMBED_MODEL", "all-MiniLM-L6-v2"),
+                        interactive=False,
+                        info="Local embedding model (384 dimensions)"
+                    )
+                with gr.Column():
+                    oai_model_in = gr.Textbox(
+                        label="OpenAI Embedding Model",
+                        value=os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small"),
+                        interactive=False,
+                        info="OpenAI embedding model (1536 dimensions for small, 3072 for large)"
+                    )
+            embed_status = gr.Textbox(
+                label="Status",
+                lines=3,
+                interactive=False,
+                placeholder="Embedding configuration status will appear here..."
+            )
+        # Define handler before wiring it
+        def _apply_embeddings(provider, st_model, oai_model):
+            try:
+                use_oai = (provider == "OpenAI")
+                rag_manager.vector_store.configure_embeddings(use_oai, openai_model=oai_model, st_model_name=st_model)
+                status_msg = f"✅ Embeddings successfully configured!\n\n"
+                status_msg += f"Provider: {provider}\n"
+                if use_oai:
+                    status_msg += f"Model: {oai_model} (OpenAI)\n"
+                    status_msg += f"Dimensions: {3072 if 'large' in oai_model.lower() else 1536}\n"
+                else:
+                    status_msg += f"Model: {st_model} (SentenceTransformers)\n"
+                    status_msg += f"Dimensions: ~384\n"
+                status_msg += f"\n⚠️ Note: If switching providers, reset and rebuild your index in the Upload tab."
+                return status_msg
+            except Exception as ex:
+                return f"❌ Failed to set embeddings: {ex}\n\nPlease check your configuration and try again."
+        # Handler functions for Settings tab
+        def apply_tag_params(max_tags, min_len, max_len, method):
+            """Apply tag generation parameters"""
+            global rag_manager
+            if not rag_manager:
+                return "System not initialized. Please initialize first."
+            # Store settings in environment or config (simplified for now)
+            os.environ['MAX_TAGS_PER_CHUNK'] = str(max_tags)
+            os.environ['MIN_TAG_LENGTH'] = str(min_len)
+            os.environ['MAX_TAG_LENGTH'] = str(max_len)
+            os.environ['TAG_GENERATION_METHOD'] = method
+            return f"✅ Tag parameters updated:\n- Max tags: {max_tags}\n- Tag length: {min_len}-{max_len} words\n- Method: {method}"
+        def apply_weight_settings(vec_weight, tag_weight):
+            """Apply default hybrid search weights"""
+            os.environ['DEFAULT_VECTOR_WEIGHT'] = str(vec_weight)
+            os.environ['DEFAULT_TAG_WEIGHT'] = str(tag_weight)
+            return f"✅ Default weights updated:\n- Vector: {vec_weight}\n- Tag: {tag_weight}"
+        def clear_all_data():
+            """Clear all database data"""
+            try:
+                reset_index()
+                return "✅ All data cleared successfully. Please rebuild your index."
+            except Exception as e:
+                return f"❌ Error clearing data: {str(e)}"
+        def export_database():
+            """Export database"""
+            try:
+                db_path = persist_directory or "./chroma_data"
+                if os.path.exists(db_path):
+                    import shutil
+                    export_path = f"export_{int(time.time())}.tar.gz"
+                    shutil.make_archive(export_path.replace('.tar.gz', ''), 'gztar', db_path)
+                    return f"✅ Database exported to: {export_path}"
+                return "❌ No database found to export"
+            except Exception as e:
+                return f"❌ Export failed: {str(e)}"
+        apply_tag_params_btn.click(
+            fn=apply_tag_params,
+            inputs=[max_tags_slider, min_tag_length_slider, max_tag_length_slider, tag_method_dropdown],
+            outputs=[tag_params_status]
+        )
+        apply_weights_btn.click(
+            fn=apply_weight_settings,
+            inputs=[default_vector_weight, default_tag_weight],
+            outputs=[weights_status]
+        )
+        clear_data_btn.click(
+            fn=clear_all_data,
+            inputs=None,
+            outputs=[db_status]
+        )
+        export_data_btn.click(
+            fn=export_database,
+            inputs=None,
+            outputs=[db_status]
+        )
+        apply_embed_btn.click(
+            fn=_apply_embeddings,
+            inputs=[emb_provider, st_model_in, oai_model_in],
+            outputs=embed_status
+        )
+    # Initialize session on load with localStorage support
+    def init_session_on_load(browser_sid):
+        """Initialize session from localStorage or create new one"""
+        # Read session ID from BrowserState (localStorage)
+        # browser_sid could be None, str, or already be a session ID
+        if browser_sid and isinstance(browser_sid, str):
+            session_id = browser_sid.strip() if browser_sid.strip() else None
+        else:
+            session_id = None
+        result = init_session(session_id)
+        # Return session_state, the session_id (BrowserState will auto-save to localStorage), document count, and session indicator
+        session_id_str = result.get("session_id", "")
+        doc_count = get_document_count(result)
+        doc_count_str = f"**Documents**: {doc_count}"
+        session_indicator_str = f"**Session**: {session_id_str[:8]}..." if session_id_str else "**Session**: Not initialized"
+        logger.info(f"Session initialized: {session_id_str[:8]}... (from localStorage: {session_id is not None})")
+        return result, session_id_str, doc_count_str, session_indicator_str
+    # Read session ID from localStorage on page load and initialize session
+    demo.load(
+        fn=init_session_on_load,
+        inputs=[browser_session_id],
+        outputs=[session_state, browser_session_id, document_count_indicator, session_indicator],
+        queue=False
+    )
+    # Save session ID to BrowserState (localStorage) whenever session_state is updated
+    def update_browser_session(session_data: Dict[str, Any]) -> Tuple[Dict[str, Any], str, str, str]:
+        """Update session state, save to BrowserState (localStorage), and update document count and session indicator"""
+        session_id = session_data.get("session_id", "") if session_data else ""
+        doc_count = get_document_count(session_data)
+        doc_count_str = f"**Documents**: {doc_count}"
+        session_indicator_str = f"**Session**: {session_id[:8]}..." if session_id else "**Session**: Not initialized"
+        return session_data, session_id, doc_count_str, session_indicator_str
+    # Hook into session_state changes to save to BrowserState (localStorage) and update document count
+    session_state.change(
+        fn=update_browser_session,
+        inputs=[session_state],
+        outputs=[session_state, browser_session_id, document_count_indicator, session_indicator],
+        queue=False
+    )
+    # Event handlers
+    build_btn.click(
+        fn=build_with_session,
+        inputs=[file_upload, language_dropdown, manual_tags_input, session_state],
+        outputs=[session_state, build_output, stats_table, chunks_table, tag_visualization],
+        api_name="build_rag"
+    ).then(
+        fn=lambda s: gr.update(value=f"**Session**: {s.get('session_id', 'Unknown')[:8]}..." if s and s.get('session_id') else "**Session**: Not initialized"),
+        inputs=[session_state],
+        outputs=[session_indicator],
+        queue=False
+    ).then(
+        fn=lambda s: gr.update(value=f"**Documents**: {get_document_count(s)}"),
+        inputs=[session_state],
+        outputs=[document_count_indicator],
+        queue=False
+    )
+    def reset_index_with_count(session_state: Dict[str, Any]) -> Tuple[str, str]:
+        """Reset index and return updated document count"""
+        reset_msg = reset_index()
+        # After reset, document count should be 0
+        return reset_msg, "**Documents**: 0"
+    reset_btn.click(
+        fn=reset_index_with_count,
+        inputs=[session_state],
+        outputs=[build_output, document_count_indicator]
+    )
+    # Search all methods - process sequentially with status updates
+    def search_with_status(query: str, k: int, tags: str, tag_operator: str, vector_weight: float, tag_weight: float, session_state: Dict[str, Any]):
+        """Wrapper to update status during search"""
+        return search_all_methods(query, k, tags, tag_operator, vector_weight, tag_weight, session_state)
+    search_btn.click(
+        fn=lambda: gr.update(value="**Status**: 🔄 Processing Base RAG..."),
+        outputs=[search_status],
+        queue=False
+    ).then(
+        fn=search_all_methods,
+        inputs=[search_query, k_slider, tags_input, tag_operator, vector_weight, tag_weight, session_state],
+        outputs=[base_results, tag_results, hybrid_results, rerank_results, search_summary, session_state],
+        api_name="search_all"
+    ).then(
+        fn=lambda: gr.update(value="**Status**: ✅ All methods completed!"),
+        outputs=[search_status],
+        queue=False
+    )
+    # Toggle tag input visibility
+    def toggle_tag_input(use_tags):
+        return gr.update(visible=use_tags)
+    use_tags_toggle.change(
+        fn=toggle_tag_input,
+        inputs=[use_tags_toggle],
+        outputs=[chat_tags_input]
+    )
+    chat_btn.click(
+        fn=chat_with_rag,
+        inputs=[chat_input, chatbot, pipeline_radio, chat_k_slider, chat_tags_input, use_tags_toggle, session_state],
+        outputs=[chat_input, chatbot, session_state, chat_sources],
+        api_name="chat"
+    ).then(
+        lambda: None,
+        None,
+        chat_input,
+        queue=False
+    )
+    eval_viz_state = gr.State(value={})
+    eval_output_filename_state = gr.State(value="")
+    eval_btn.click(
+        fn=run_evaluation_with_viz,
+        inputs=[eval_queries, eval_output_name, user_satisfaction_input, session_state],
+        outputs=[eval_output, eval_results_table, eval_summary_text, eval_bar_chart, eval_line_plot, eval_viz_state, eval_output_filename_state],
+        api_name="evaluate"
+    ).then(
+        fn=update_remaining_vizs,
+        inputs=[eval_viz_state],
+        outputs=[eval_scatter_plot, eval_box_plot, eval_stacked_plot, eval_pareto_plot],
+        queue=False
+    )
+    # Export button handlers
+    def export_csv_wrapper(filename_state):
+        """Export CSV from current evaluation results - returns file for download"""
+        try:
+            # Use stored filename from evaluation if available, otherwise use provided filename
+            filename = filename_state if filename_state else "evaluation_results.csv"
+            # Ensure .csv extension
+            if not filename.endswith('.csv'):
+                filename = f"{filename}.csv"
+            csv_path = os.path.join("reports", filename)
+            if os.path.exists(csv_path):
+                # Return absolute path for download
+                return os.path.abspath(csv_path)
+            else:
+                # Try to find any CSV file in reports directory
+                reports_dir = "reports"
+                if os.path.exists(reports_dir):
+                    csv_files = [f for f in os.listdir(reports_dir) if f.endswith('.csv')]
+                    if csv_files:
+                        # Return the most recent one
+                        csv_files.sort(key=lambda x: os.path.getmtime(os.path.join(reports_dir, x)), reverse=True)
+                        return os.path.abspath(os.path.join(reports_dir, csv_files[0]))
+                return None
+        except Exception as e:
+            logger.error(f"CSV export error: {e}")
+            return None
+    def export_png_wrapper():
+        """Export PNG charts - creates a ZIP file for download"""
+        try:
+            viz_dir = "reports/visualizations"
+            if os.path.exists(viz_dir):
+                png_files = [f for f in os.listdir(viz_dir) if f.endswith('.png')]
+                if png_files:
+                    # Create a ZIP file containing all PNG charts
+                    zip_path = os.path.join("reports", "charts.zip")
+                    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                        for png_file in png_files:
+                            file_path = os.path.join(viz_dir, png_file)
+                            # Add file to ZIP with just the filename (no directory structure)
+                            zipf.write(file_path, arcname=png_file)
+                    # Return absolute path to ZIP file
+                    return os.path.abspath(zip_path)
+            return None
+        except Exception as e:
+            logger.error(f"Charts export error: {e}")
+            return None
+    # JavaScript to directly trigger browser download from File component value
+    # When File component gets a file path, Gradio creates a downloadable URL
+    # We extract that URL and trigger download immediately
+    auto_download_js = """
+    function(fileValue) {
+        // Wait a bit for Gradio to process the file and create download URL
+        setTimeout(() => {
+            if (fileValue) {
+                // File component value can be a string (path) or object with file info
+                let fileUrl = null;
+                let filename = 'download';
+                if (typeof fileValue === 'string') {
+                    // If it's already a URL (Gradio file endpoint)
+                    if (fileValue.startsWith('http') || fileValue.startsWith('/')) {
+                        fileUrl = fileValue;
+                        filename = fileValue.split('/').pop().split('?')[0] || 'download';
+                    } else {
+                        // If it's a file path, construct Gradio file URL
+                        const baseUrl = window.location.origin + window.location.pathname.replace(/\/$/, '');
+                        fileUrl = baseUrl + '/file=' + encodeURIComponent(fileValue);
+                        filename = fileValue.split('/').pop() || 'download';
+                    }
+                } else if (fileValue && fileValue.url) {
+                    // If it's an object with url property (Gradio FileData)
+                    fileUrl = fileValue.url;
+                    filename = fileValue.name || fileValue.url.split('/').pop().split('?')[0] || 'download';
+                }
+                if (fileUrl) {
+                    // Create and trigger download
+                    const link = document.createElement('a');
+                    link.href = fileUrl;
+                    link.download = filename;
+                    link.style.display = 'none';
+                    document.body.appendChild(link);
+                    link.click();
+                    setTimeout(() => document.body.removeChild(link), 100);
+                }
+            }
+        }, 300);
+        return fileValue;
+    }
+    """
+    export_csv_btn.click(
+        fn=export_csv_wrapper,
+        inputs=[eval_output_filename_state],
+        outputs=[export_csv_file]
+    ).then(
+        fn=None,
+        inputs=[export_csv_file],
+        outputs=None,
+        js=auto_download_js
+    )
+    export_png_btn.click(
+        fn=export_png_wrapper,
+        inputs=None,
+        outputs=[export_charts_files]
+    ).then(
+        fn=None,
+        inputs=[export_charts_files],
+        outputs=None,
+        js=auto_download_js
+    )
+    # Diagnostics trigger removed
+# MCP Server Implementation
+import asyncio
+import sys
+from typing import Any, List, Optional
+try:
+    from mcp.server import Server
+    from mcp.server.models import InitializationOptions
+    from mcp.types import Tool, TextContent
+    MCP_AVAILABLE = True
+except ImportError:
+    MCP_AVAILABLE = False
+    # Fallback for when MCP is not installed
+    Server = None
+    Tool = None
+    TextContent = None
+class RAGMCPServer:
+    """MCP server for RAG system"""
+    def __init__(self):
+        persist_dir = "/data/chroma" if os.path.exists("/data/chroma") else "./chroma_data"
+        self.rag_manager = RAGManager(persist_directory=persist_dir)
+        self.evaluator = RAGEvaluator(self.rag_manager)
+    async def list_tools(self) -> List[Tool]:
+        """List available MCP tools"""
+        return [
+            Tool(
+                name="search_documents",
+                description="Search documents using RAG system (Base-RAG, Tag Filter, Hybrid, or Hybrid Rerank)",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "query": {"type": "string", "description": "Search query"},
+                        "k": {"type": "integer", "description": "Number of results", "default": 5},
+                        "pipeline": {"type": "string", "enum": ["base_rag", "tag_filter_rag", "hybrid_rag", "hybrid_rerank_rag"], "default": "base_rag"},
+                        "tags": {"type": "array", "items": {"type": "string"}, "description": "Tags for tag-based search"},
+                        "tag_operator": {"type": "string", "enum": ["OR", "AND", "NOT"], "description": "Tag operator (OR/AND/NOT)", "default": "OR"},
+                    },
+                    "required": ["query"]
+                }
+            ),
+            Tool(
+                name="evaluate_retrieval",
+                description="Evaluate RAG performance with batch queries",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "queries": {
+                            "type": "array",
+                            "description": "List of query objects with query, ground_truth, k_values, and optional filters",
+                            "items": {"type": "object"}
+                        },
+                        "output_file": {"type": "string", "description": "Output filename for results"}
+                    },
+                    "required": ["queries"]
+                }
+            )
+        ]
+    async def call_tool(self, name: str, arguments: dict) -> List[TextContent]:
+        """Call an MCP tool by name"""
+        if name == "search_documents":
+            query = arguments.get("query")
+            k = arguments.get("k", 5)
+            pipeline = arguments.get("pipeline", "base_rag")
+            tags = arguments.get("tags", [])
+            tag_operator = arguments.get("tag_operator", "OR")
+            if pipeline == "base_rag":
+                result = self.rag_manager.base_rag.retrieve(query, k)
+            elif pipeline == "tag_filter_rag":
+                result = self.rag_manager.tag_filter_rag.retrieve(query, k, tags=tags, tag_operator=tag_operator)
+            elif pipeline == "hybrid_rag":
+                result = self.rag_manager.hybrid_rag.retrieve(query, k, tags=tags, vector_weight=0.7, tag_weight=0.3)
+            elif pipeline == "hybrid_rerank_rag":
+                result = self.rag_manager.hybrid_rerank_rag.retrieve(query, k, tags=tags, vector_weight=0.7, tag_weight=0.3)
+            else:
+                result = self.rag_manager.base_rag.retrieve(query, k)
+            response = {
+                "content": result.content,
+                "sources": [
+                    {
+                        "content": source['content'][:200],
+                        "metadata": source['metadata'],
+                        "score": source['score']
+                    } for source in result.sources
+                ],
+                "latency": result.latency,
+                "strategy": pipeline
+            }
+            return [TextContent(type="text", text=json.dumps(response, indent=2))]
+        elif name == "evaluate_retrieval":
+            queries = arguments.get("queries", [])
+            output_file = arguments.get("output_file")
+            df, results = self.evaluator.batch_evaluate(queries, output_file)
+            summary = df.groupby('pipeline').agg({
+                'hit_at_k': 'mean',
+                'mrr': 'mean',
+                'semantic_similarity': 'mean',
+                'latency': 'mean'
+            }).reset_index()
+            response = {
+                "summary": summary.to_dict('records'),
+                "total_queries": len(queries),
+                "output_file": output_file
+            }
+            return [TextContent(type="text", text=json.dumps(response, indent=2))]
+        else:
+            raise ValueError(f"Unknown tool: {name}")
+# Export for Gradio Client
+if __name__ == "__main__":
+    # If run as CLI, prefer plain Gradio serving. Spaces will import demo directly.
+    # Respect common hosting env vars.
+    host = os.getenv("HOST", "0.0.0.0")
+    port = int(os.getenv("PORT", os.getenv("GRADIO_SERVER_PORT", 7860)))
+    # Avoid SSR and API schema on Spaces to prevent response length errors
+    demo.launch(server_name=host, server_port=port, share=False, ssl_verify=False, ssr_mode=False)

core/comparison.py ADDED Viewed

	@@ -0,0 +1,148 @@

+"""
+Comparison Framework for RAG Methods
+This module provides utilities for comparing different RAG retrieval methods
+side-by-side with aggregated metrics and easy-to-use comparison functions.
+"""
+import pandas as pd
+from typing import List, Dict, Any, Optional, Tuple
+import numpy as np
+from .eval import RAGEvaluator
+from .retrieval import RetrievalResult
+class RAGComparisonFramework:
+    """Framework for comparing RAG retrieval methods side-by-side"""
+    # Method name mappings for display
+    METHOD_NAMES = {
+        'base_rag': 'Baseline',
+        'tag_filter_rag': '+Tags(Filter)',
+        'hybrid_rag': 'Hybrid(Weighted)',
+        'hybrid_rerank_rag': 'Hybrid+Rerank'
+    }
+    def __init__(self, evaluator: RAGEvaluator):
+        """
+        Initialize comparison framework.
+        Args:
+            evaluator: RAGEvaluator instance for evaluation
+        """
+        self.evaluator = evaluator
+    def compare_methods(self,
+                       queries: List[Dict[str, Any]],
+                       k_values: Optional[List[int]] = None,
+                       methods: Optional[List[str]] = None) -> pd.DataFrame:
+        """
+        Compare all methods side-by-side on given queries.
+        Args:
+            queries: List of query dictionaries
+            k_values: List of k values to evaluate (default: [1, 3, 5])
+            methods: List of methods to compare (default: all 4 methods)
+        Returns:
+            DataFrame with side-by-side comparison
+        """
+        if k_values is None:
+            k_values = [1, 3, 5]
+        if methods is None:
+            methods = ['base_rag', 'tag_filter_rag', 'hybrid_rag', 'hybrid_rerank_rag']
+        # Run evaluation
+        df, summary, raw_results = self.evaluator.batch_evaluate(
+            queries=queries,
+            pipelines=methods
+        )
+        return df
+    def get_comparison_table(self,
+                           df: pd.DataFrame,
+                           k_value: int,
+                           metrics: Optional[List[str]] = None) -> pd.DataFrame:
+        """
+        Get comparison table for specific k value.
+        Args:
+            df: Evaluation results DataFrame
+            k_value: k value to filter by
+            metrics: List of metrics to include (default: all)
+        Returns:
+            Comparison table DataFrame
+        """
+        if metrics is None:
+            metrics = ['precision_at_k', 'ndcg_at_k', 'mrr', 'hit_at_k', 'latency']
+        # Filter by k
+        k_df = df[df['k'] == k_value]
+        # Aggregate by pipeline
+        comparison = k_df.groupby('pipeline')[metrics].mean().reset_index()
+        # Rename pipelines
+        comparison['pipeline'] = comparison['pipeline'].map(
+            lambda x: self.METHOD_NAMES.get(x, x)
+        )
+        return comparison
+    def get_aggregated_comparison(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Get aggregated comparison across all k values.
+        Args:
+            df: Evaluation results DataFrame
+        Returns:
+            Aggregated comparison DataFrame
+        """
+        metrics = ['precision_at_k', 'ndcg_at_k', 'mrr', 'hit_at_k', 'latency']
+        # Aggregate by pipeline
+        aggregated = df.groupby('pipeline')[metrics].agg(['mean', 'std']).reset_index()
+        # Flatten column names
+        aggregated.columns = ['_'.join(col).strip('_') if col[1] else col[0]
+                             for col in aggregated.columns.values]
+        # Rename pipelines
+        aggregated['pipeline'] = aggregated['pipeline'].map(
+            lambda x: self.METHOD_NAMES.get(x, x)
+        )
+        return aggregated
+    def get_method_rankings(self, df: pd.DataFrame, k_value: int,
+                           metric: str = 'precision_at_k') -> pd.DataFrame:
+        """
+        Get method rankings by metric.
+        Args:
+            df: Evaluation results DataFrame
+            k_value: k value to filter by
+            metric: Metric to rank by
+        Returns:
+            Rankings DataFrame
+        """
+        k_df = df[df['k'] == k_value]
+        # Average metric by pipeline
+        rankings = k_df.groupby('pipeline')[metric].mean().reset_index()
+        rankings = rankings.sort_values(metric, ascending=False)
+        # Add ranking
+        rankings['rank'] = range(1, len(rankings) + 1)
+        # Rename pipelines
+        rankings['pipeline'] = rankings['pipeline'].map(
+            lambda x: self.METHOD_NAMES.get(x, x)
+        )
+        return rankings

core/eval.py ADDED Viewed

	@@ -0,0 +1,420 @@

+import time
+import json
+import pandas as pd
+from typing import List, Dict, Any, Tuple, Optional
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+from collections import defaultdict
+import math
+from .retrieval import RAGManager, RetrievalResult
+class RAGEvaluator:
+    """Evaluation framework for RAG systems"""
+    def __init__(self, rag_manager: RAGManager):
+        self.rag_manager = rag_manager
+    def evaluate_single_query(self, query: str, ground_truth: List[str],
+                            k_values: List[int] = [1, 3, 5, 10],
+                            level1: Optional[str] = None,
+                            level2: Optional[str] = None,
+                            level3: Optional[str] = None,
+                            doc_type: Optional[str] = None) -> Dict[str, Any]:
+        """Evaluate retrieval for a single query"""
+        base_results = {}
+        hier_results = {}
+        for k in k_values:
+            # Get results from both pipelines
+            base_result, hier_result = self.rag_manager.compare_retrieval(
+                query, k, level1, level2, level3, doc_type
+            )
+            base_results[k] = base_result
+            hier_results[k] = hier_result
+        # Calculate metrics
+        metrics = {
+            "query": query,
+            "ground_truth": ground_truth,
+            "base_rag": self._calculate_metrics(base_results, ground_truth),
+            "hier_rag": self._calculate_metrics(hier_results, ground_truth),
+            "filters": {
+                "level1": level1,
+                "level2": level2,
+                "level3": level3,
+                "doc_type": doc_type
+            }
+        }
+        return metrics
+    def _calculate_metrics(self, results_dict: Dict[int, RetrievalResult],
+                          ground_truth: List[str]) -> Dict[str, Any]:
+        """Calculate evaluation metrics including Precision@k and nDCG@k"""
+        metrics = {}
+        for k, result in results_dict.items():
+            retrieved_docs = [source['content'] for source in result.sources]
+            # Hit@k
+            hit_at_k = self._calculate_hit_at_k(retrieved_docs, ground_truth, k)
+            # Precision@k
+            precision_at_k = self._calculate_precision_at_k(retrieved_docs, ground_truth, k)
+            # nDCG@k
+            ndcg_at_k = self._calculate_ndcg_at_k(retrieved_docs, ground_truth, k)
+            # MRR
+            mrr = self._calculate_mrr(retrieved_docs, ground_truth)
+            # Semantic similarity
+            semantic_sim = self._calculate_semantic_similarity(retrieved_docs, ground_truth)
+            metrics[k] = {
+                "hit_at_k": hit_at_k,
+                "precision_at_k": precision_at_k,
+                "ndcg_at_k": ndcg_at_k,
+                "mrr": mrr,
+                "semantic_similarity": semantic_sim,
+                "latency": result.latency,
+                "retrieved_count": len(retrieved_docs)
+            }
+        return metrics
+    def _calculate_precision_at_k(self, retrieved: List[str], ground_truth: List[str], k: int) -> float:
+        """Calculate Precision@k metric"""
+        if not ground_truth or not retrieved:
+            return 0.0
+        relevant = 0
+        for doc in retrieved[:k]:
+            for gt_doc in ground_truth:
+                if self._documents_match(doc, gt_doc):
+                    relevant += 1
+                    break
+        return relevant / min(k, len(retrieved))
+    def _calculate_ndcg_at_k(self, retrieved: List[str], ground_truth: List[str], k: int) -> float:
+        """Calculate Normalized Discounted Cumulative Gain at k"""
+        if not ground_truth or not retrieved:
+            return 0.0
+        # Calculate DCG@k
+        dcg = 0.0
+        for i, doc in enumerate(retrieved[:k], 1):
+            for gt_doc in ground_truth:
+                if self._documents_match(doc, gt_doc):
+                    # Binary relevance (can be enhanced with graded relevance)
+                    relevance = 1.0
+                    dcg += relevance / math.log2(i + 1)
+                    break
+        # Calculate ideal DCG (IDCG)
+        idcg = 0.0
+        num_relevant = min(k, len(ground_truth))
+        for i in range(1, num_relevant + 1):
+            idcg += 1.0 / math.log2(i + 1)
+        # nDCG = DCG / IDCG
+        if idcg == 0:
+            return 0.0
+        return dcg / idcg
+    def _calculate_hit_at_k(self, retrieved: List[str], ground_truth: List[str], k: int) -> float:
+        """Calculate Hit@k metric"""
+        if not ground_truth:
+            return 0.0
+        # Simple exact match (can be enhanced with semantic matching)
+        for doc in retrieved[:k]:
+            for gt_doc in ground_truth:
+                if self._documents_match(doc, gt_doc):
+                    return 1.0
+        return 0.0
+    def _calculate_mrr(self, retrieved: List[str], ground_truth: List[str]) -> float:
+        """Calculate Mean Reciprocal Rank"""
+        if not ground_truth:
+            return 0.0
+        for rank, doc in enumerate(retrieved, 1):
+            for gt_doc in ground_truth:
+                if self._documents_match(doc, gt_doc):
+                    return 1.0 / rank
+        return 0.0
+    def _calculate_semantic_similarity(self, retrieved: List[str], ground_truth: List[str]) -> float:
+        """Calculate average semantic similarity"""
+        if not retrieved or not ground_truth:
+            return 0.0
+        # Use the same embedding model as the vector store
+        embeddings_retrieved = [self.rag_manager.vector_store.embed_text(doc) for doc in retrieved]
+        embeddings_gt = [self.rag_manager.vector_store.embed_text(doc) for doc in ground_truth]
+        # Calculate cosine similarity matrix
+        similarity_matrix = cosine_similarity(embeddings_retrieved, embeddings_gt)
+        # Return max similarity for each retrieved document, then average
+        max_similarities = np.max(similarity_matrix, axis=1)
+        return float(np.mean(max_similarities))
+    def _documents_match(self, doc1: str, doc2: str, threshold: float = 0.7) -> bool:
+        """Check if two documents match (semantically or exactly)
+        Uses semantic similarity with a threshold. Also checks for exact substring matches
+        to handle cases where ground truth is a substring of the actual chunk.
+        """
+        # Normalize strings for comparison
+        doc1_clean = doc1.strip().lower()
+        doc2_clean = doc2.strip().lower()
+        # Exact match or substring match (ground truth might be a substring of chunk)
+        if doc1_clean == doc2_clean or doc1_clean in doc2_clean or doc2_clean in doc1_clean:
+            return True
+        # Semantic similarity check
+        try:
+            embedding1 = self.rag_manager.vector_store.embed_text(doc1)
+            embedding2 = self.rag_manager.vector_store.embed_text(doc2)
+            similarity = cosine_similarity([embedding1], [embedding2])[0][0]
+            return similarity > threshold
+        except Exception as e:
+            # Fallback to exact match if embedding fails
+            return doc1_clean == doc2_clean
+    def batch_evaluate(self, queries: List[Dict[str, Any]],
+                      output_file: Optional[str] = None,
+                      pipelines: Optional[List[str]] = None,
+                      user_satisfaction: Optional[Dict[str, int]] = None) -> Tuple[pd.DataFrame, Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Batch evaluation on multiple queries across multiple pipelines.
+        Args:
+            queries: List of query dictionaries
+            output_file: Optional output filename
+            pipelines: List of pipeline names to evaluate (default: all 4 methods)
+            user_satisfaction: Optional dict mapping query_id to satisfaction score (1-5)
+        Returns:
+            Tuple of (DataFrame, summary dict, raw results list)
+        """
+        if pipelines is None:
+            pipelines = ['base_rag', 'tag_filter_rag', 'hybrid_rag', 'hybrid_rerank_rag']
+        results = []
+        all_latencies = defaultdict(list)
+        for i, query_data in enumerate(queries):
+            query_id = query_data.get('query_id', f'query_{i+1}')
+            query = query_data['query']
+            print(f"Evaluating query {i+1}/{len(queries)}: {query[:50]}...")
+            # Get user satisfaction score - try both query_id and index-based keys
+            user_sat_score = None
+            if user_satisfaction:
+                # Try query_id first
+                user_sat_score = user_satisfaction.get(query_id)
+                # If not found, try index-based key (query_0, query_1, etc.)
+                if user_sat_score is None:
+                    user_sat_score = user_satisfaction.get(f'query_{i}')
+            query_result = {
+                'query_id': query_id,
+                'query': query,
+                'ground_truth': query_data.get('ground_truth', []),
+                'user_satisfaction': user_sat_score,
+                'pipelines': {}
+            }
+            k_values = query_data.get('k_values', [1, 3, 5])
+            for pipeline in pipelines:
+                pipeline_results = {}
+                for k in k_values:
+                    try:
+                        # Retrieve using the specified pipeline
+                        retrieval_result = self._retrieve_from_pipeline(
+                            pipeline, query, k, query_data
+                        )
+                        # Calculate metrics
+                        retrieved_docs = [source['content'] for source in retrieval_result.sources]
+                        metrics = {
+                            'hit_at_k': self._calculate_hit_at_k(retrieved_docs, query_data.get('ground_truth', []), k),
+                            'precision_at_k': self._calculate_precision_at_k(retrieved_docs, query_data.get('ground_truth', []), k),
+                            'ndcg_at_k': self._calculate_ndcg_at_k(retrieved_docs, query_data.get('ground_truth', []), k),
+                            'mrr': self._calculate_mrr(retrieved_docs, query_data.get('ground_truth', [])),
+                            'semantic_similarity': self._calculate_semantic_similarity(retrieved_docs, query_data.get('ground_truth', [])),
+                            'latency': retrieval_result.latency,
+                            'retrieved_count': len(retrieved_docs)
+                        }
+                        pipeline_results[k] = metrics
+                        all_latencies[pipeline].append(retrieval_result.latency)
+                    except Exception as e:
+                        print(f"Error evaluating {pipeline} for query {query_id}: {e}")
+                        pipeline_results[k] = {
+                            'hit_at_k': 0.0,
+                            'precision_at_k': 0.0,
+                            'ndcg_at_k': 0.0,
+                            'mrr': 0.0,
+                            'semantic_similarity': 0.0,
+                            'latency': 0.0,
+                            'retrieved_count': 0
+                        }
+                query_result['pipelines'][pipeline] = pipeline_results
+            results.append(query_result)
+        # Convert to DataFrame
+        df = self._results_to_dataframe(results)
+        # Calculate summary statistics
+        summary = self._calculate_summary_statistics(df, all_latencies)
+        # Save results if output file specified
+        if output_file:
+            import os
+            reports_dir = os.path.join(os.getcwd(), "reports")
+            os.makedirs(reports_dir, exist_ok=True)
+            csv_path = os.path.join(reports_dir, output_file)
+            json_path = os.path.join(reports_dir, output_file.replace('.csv', '.json'))
+            df.to_csv(csv_path, index=False)
+            # Save with summary - convert numpy types to Python native types
+            save_data = {
+                'results': self._convert_to_native_types(results),
+                'summary': self._convert_to_native_types(summary)
+            }
+            with open(json_path, 'w') as f:
+                json.dump(save_data, f, indent=2)
+        return df, summary, results
+    def _retrieve_from_pipeline(self, pipeline: str, query: str, k: int,
+                               query_data: Dict[str, Any]) -> RetrievalResult:
+        """Retrieve from the specified pipeline"""
+        if pipeline == 'base_rag':
+            return self.rag_manager.base_rag.retrieve(query, k)
+        elif pipeline == 'tag_filter_rag':
+            tags = query_data.get('tags')
+            return self.rag_manager.tag_filter_rag.retrieve(
+                query, k, tags=tags, tag_operator=query_data.get('tag_operator', 'OR')
+            )
+        elif pipeline == 'hybrid_rag':
+            tags = query_data.get('tags')
+            return self.rag_manager.hybrid_rag.retrieve(
+                query, k,
+                tags=tags,
+                tag_operator=query_data.get('tag_operator', 'OR'),
+                vector_weight=query_data.get('vector_weight', 0.7),
+                tag_weight=query_data.get('tag_weight', 0.3)
+            )
+        elif pipeline == 'hybrid_rerank_rag':
+            tags = query_data.get('tags')
+            return self.rag_manager.hybrid_rerank_rag.retrieve(
+                query, k,
+                tags=tags,
+                tag_operator=query_data.get('tag_operator', 'OR'),
+                vector_weight=query_data.get('vector_weight', 0.7),
+                tag_weight=query_data.get('tag_weight', 0.3)
+            )
+        else:
+            raise ValueError(f"Unknown pipeline: {pipeline}")
+    def _calculate_summary_statistics(self, df: pd.DataFrame,
+                                     all_latencies: Dict[str, List[float]]) -> Dict[str, Any]:
+        """Calculate aggregated summary statistics"""
+        summary = {}
+        # Aggregate by pipeline and k
+        for pipeline in df['pipeline'].unique():
+            summary[pipeline] = {}
+            pipeline_df = df[df['pipeline'] == pipeline]
+            for k in df['k'].unique():
+                # Convert numpy int64 to Python int for dictionary key
+                k_int = int(k) if isinstance(k, (np.integer, np.int64)) else k
+                k_df = pipeline_df[pipeline_df['k'] == k]
+                summary[pipeline][k_int] = {
+                    'mean_precision_at_k': float(k_df['precision_at_k'].mean()),
+                    'mean_ndcg_at_k': float(k_df['ndcg_at_k'].mean()),
+                    'mean_hit_at_k': float(k_df['hit_at_k'].mean()),
+                    'mean_mrr': float(k_df['mrr'].mean()),
+                    'mean_semantic_similarity': float(k_df['semantic_similarity'].mean()),
+                    'mean_latency': float(k_df['latency'].mean()),
+                    'p50_latency': float(k_df['latency'].quantile(0.5)),
+                    'p90_latency': float(k_df['latency'].quantile(0.9))
+                }
+        # Overall latency percentiles per pipeline
+        for pipeline, latencies in all_latencies.items():
+            if latencies:
+                summary[pipeline]['latency_percentiles'] = {
+                    'p50': float(np.percentile(latencies, 50)),
+                    'p90': float(np.percentile(latencies, 90))
+                }
+        return summary
+    def _convert_to_native_types(self, obj):
+        """Recursively convert numpy types to Python native types for JSON serialization"""
+        if isinstance(obj, dict):
+            # Convert numpy int64 keys to Python int (json.dump can handle int keys)
+            return {int(k) if isinstance(k, (np.integer, np.int64, np.int32)) else k: self._convert_to_native_types(v)
+                    for k, v in obj.items()}
+        elif isinstance(obj, list):
+            return [self._convert_to_native_types(item) for item in obj]
+        elif isinstance(obj, (np.integer, np.int64, np.int32)):
+            return int(obj)
+        elif isinstance(obj, (np.floating, np.float64, np.float32)):
+            return float(obj)
+        elif isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, (np.bool_, bool)):
+            return bool(obj)
+        elif obj is None:
+            return None
+        else:
+            return obj
+    def _results_to_dataframe(self, results: List[Dict[str, Any]]) -> pd.DataFrame:
+        """Convert evaluation results to DataFrame"""
+        rows = []
+        for result in results:
+            query_id = result.get('query_id', 'unknown')
+            query = result['query']
+            for pipeline, pipeline_results in result.get('pipelines', {}).items():
+                for k, metrics in pipeline_results.items():
+                    rows.append({
+                        'query_id': query_id,
+                        'query': query,
+                        'k': k,
+                        'pipeline': pipeline,
+                        'hit_at_k': metrics.get('hit_at_k', 0.0),
+                        'precision_at_k': metrics.get('precision_at_k', 0.0),
+                        'ndcg_at_k': metrics.get('ndcg_at_k', 0.0),
+                        'mrr': metrics.get('mrr', 0.0),
+                        'semantic_similarity': metrics.get('semantic_similarity', 0.0),
+                        'latency': metrics.get('latency', 0.0),
+                        'retrieved_count': metrics.get('retrieved_count', 0),
+                        'user_satisfaction': result.get('user_satisfaction')
+                    })
+        return pd.DataFrame(rows)

core/index.py ADDED Viewed

	@@ -0,0 +1,389 @@

+import chromadb
+from chromadb.config import Settings
+from typing import List, Dict, Any, Optional, Union
+import os
+import numpy as np
+from .utils import Chunk
+import os as _os
+_OPENAI_EMBED = False
+try:
+    from openai import OpenAI as _OpenAI
+    _OPENAI_EMBED = True if _os.getenv("OPENAI_API_KEY") else False
+except Exception:
+    _OPENAI_EMBED = False
+try:
+    from sentence_transformers import SentenceTransformer
+except Exception:
+    SentenceTransformer = None
+class VectorStore:
+    """Vector database management"""
+    def __init__(self, persist_directory: str = "/data/chroma"):
+        self.persist_directory = persist_directory
+        # Ensure directory exists with proper permissions before creating client
+        os.makedirs(persist_directory, exist_ok=True, mode=0o755)
+        self.client = chromadb.PersistentClient(path=persist_directory)
+        # Default to SentenceTransformers; runtime switching handled via configure_embeddings()
+        self.use_openai = False
+        if SentenceTransformer is None:
+            raise RuntimeError("SentenceTransformers not available. Install sentence-transformers or switch to OpenAI via UI.")
+        self.st_model_name = os.getenv("ST_EMBED_MODEL", "all-MiniLM-L6-v2")
+        # Use local_files_only=True to work offline (model should already be cached)
+        # This prevents network requests to HuggingFace Hub
+        try:
+            self.embedding_model = SentenceTransformer(self.st_model_name, local_files_only=True)
+        except Exception:
+            # Fallback: if local files not found, try with network (for first-time setup)
+            self.embedding_model = SentenceTransformer(self.st_model_name)
+        # Get model output dimension
+        try:
+            self.embed_dim = int(getattr(self.embedding_model, "get_sentence_embedding_dimension")())
+        except Exception:
+            # Fallback: compute once
+            self.embed_dim = len(self.embedding_model.encode("test"))
+    def _reopen_client(self, new_path: str):
+        os.makedirs(new_path, exist_ok=True, mode=0o755)
+        self.persist_directory = new_path
+        self.client = chromadb.PersistentClient(path=new_path)
+    def _collection_suffix(self) -> str:
+        provider = "oai" if self.use_openai else "st"
+        return f"{provider}_{self.embed_dim}"
+    def _resolve_collection_name(self, base_name: str) -> str:
+        """Ensure separate collections per embedding dimension/provider to avoid mismatch."""
+        return f"{base_name}__{self._collection_suffix()}"
+    def configure_embeddings(self, use_openai: bool, openai_model: Optional[str] = None, st_model_name: Optional[str] = None):
+        """Reconfigure embedding backend at runtime.
+        Switching providers/dimensions implies a new collection suffix; existing data remains under old suffix.
+        """
+        self.use_openai = bool(use_openai)
+        if self.use_openai:
+            # Check at call-time to avoid stale module-level flags
+            if not os.getenv("OPENAI_API_KEY"):
+                raise RuntimeError("OpenAI not available or API key missing.")
+            self.openai_client = _OpenAI()
+            self.openai_model = openai_model or os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
+            if self.openai_model == "text-embedding-3-large":
+                self.embed_dim = 3072
+            else:
+                self.embed_dim = 1536
+        else:
+            if SentenceTransformer is None:
+                raise RuntimeError("SentenceTransformer not available.")
+            name = st_model_name or os.getenv("ST_EMBED_MODEL", "all-MiniLM-L6-v2")
+            # Only reload if changed
+            if not hasattr(self, 'st_model_name') or self.st_model_name != name:
+                self.st_model_name = name
+                # Use local_files_only=True to work offline
+                try:
+                    self.embedding_model = SentenceTransformer(self.st_model_name, local_files_only=True)
+                except Exception:
+                    # Fallback: if local files not found, try with network
+                    self.embedding_model = SentenceTransformer(self.st_model_name)
+            try:
+                self.embed_dim = int(getattr(self.embedding_model, "get_sentence_embedding_dimension")())
+            except Exception:
+                self.embed_dim = len(self.embedding_model.encode("test"))
+    def create_collection(self, name: str) -> chromadb.Collection:
+        """Create or get collection, namespaced by embedding provider/dimension."""
+        full_name = self._resolve_collection_name(name)
+        return self.client.get_or_create_collection(
+            name=full_name,
+            metadata={"hnsw:space": "cosine", "embed_dim": str(self.embed_dim)}
+        )
+    def _embed_one(self, text: str) -> List[float]:
+        """Generate embedding for a single text"""
+        if self.use_openai:
+            resp = self.openai_client.embeddings.create(model=self.openai_model, input=text)
+            return resp.data[0].embedding
+        return self.embedding_model.encode(text).tolist()
+    def _embed_batch(self, texts: List[str]) -> List[List[float]]:
+        """Generate embeddings for a batch of texts"""
+        if self.use_openai:
+            resp = self.openai_client.embeddings.create(model=self.openai_model, input=texts)
+            return [d.embedding for d in resp.data]
+        return self.embedding_model.encode(texts).tolist()
+    # Backward-compat API used by evaluator
+    def embed_text(self, text: Union[str, List[str]]) -> Union[List[float], List[List[float]]]:
+        """Compatibility wrapper:
+        - If text is str -> returns single embedding list[float]
+        - If text is list[str] -> returns list of embeddings list[list[float]]
+        """
+        if isinstance(text, list):
+            return self._embed_batch(text)
+        return self._embed_one(text)
+    def add_documents(self, collection_name: str, chunks: List[Chunk]):
+        """Add documents to vector store"""
+        collection = self.create_collection(collection_name)
+        # Generate embeddings
+        texts = [chunk.content for chunk in chunks]
+        embeddings = self._embed_batch(texts)
+        # Prepare metadata - ChromaDB doesn't support lists, so convert tags to string
+        metadatas = []
+        for chunk in chunks:
+            metadata = chunk.metadata.copy()
+            # Convert tags list to comma-separated string for ChromaDB
+            if 'tags' in metadata and isinstance(metadata['tags'], list):
+                metadata['tags'] = ', '.join(metadata['tags'])
+            # Add doc_id to metadata for document counting
+            if hasattr(chunk, 'doc_id') and chunk.doc_id:
+                metadata['doc_id'] = chunk.doc_id
+            metadatas.append(metadata)
+        ids = [chunk.chunk_id for chunk in chunks]
+        # Add to collection with writable fallback
+        try:
+            collection.add(
+                embeddings=embeddings,
+                documents=texts,
+                metadatas=metadatas,
+                ids=ids
+            )
+        except Exception as e:
+            msg = str(e).lower()
+            if "readonly" in msg or "read-only" in msg:
+                # Fallback to a user-writable directory and retry once
+                fallback_dir = os.getenv("CHROMA_PERSIST_DIR", os.path.join(os.getcwd(), "chroma_data"))
+                if os.path.abspath(fallback_dir) == os.path.abspath(self.persist_directory):
+                    # Choose a different fallback under the user's home cache
+                    home_cache = os.path.join(os.path.expanduser("~"), ".cache", "rag-evaluation-system", "chroma")
+                    fallback_dir = home_cache
+                self._reopen_client(fallback_dir)
+                collection = self.create_collection(collection_name)
+                collection.add(
+                    embeddings=embeddings,
+                    documents=texts,
+                    metadatas=metadatas,
+                    ids=ids
+                )
+            else:
+                raise
+    def search(self, collection_name: str, query: str,
+               filters: Optional[Dict[str, Any]] = None,
+               tag_filters: Optional[Dict[str, Any]] = None,
+               k: int = 5) -> List[Dict[str, Any]]:
+        """
+        Search in vector store with optional filters and tag filters.
+        Args:
+            collection_name: Collection name
+            query: Search query
+            filters: Standard metadata filters (doc_type, etc.)
+            tag_filters: Tag filters dict with 'tags' (list) and 'operator' ('OR', 'AND', 'NOT')
+            k: Number of results to return
+        Returns:
+            List of formatted search results
+        """
+        collection = self.create_collection(collection_name)
+        # Generate query embedding
+        query_embedding = self._embed_one(query)
+        # Build where clause combining standard filters and tag filters
+        where_clause = self._build_where_clause(filters, tag_filters)
+        # For tag filtering (OR/AND/NOT), we need to fetch more results and post-filter
+        # ChromaDB stores tags as comma-separated strings, so we filter in-memory
+        fetch_k = k
+        if tag_filters and tag_filters.get('tags'):
+            fetch_k = k * 10  # Fetch more for post-filtering
+        # Perform search
+        results = collection.query(
+            query_embeddings=[query_embedding],
+            n_results=fetch_k,
+            where=where_clause,
+            include=["documents", "metadatas", "distances"]
+        )
+        # Format results
+        formatted_results = []
+        for i in range(len(results['documents'][0])):
+            metadata = results['metadatas'][0][i].copy()
+            # Convert tags from string (ChromaDB format) back to list
+            if 'tags' in metadata and isinstance(metadata['tags'], str):
+                metadata['tags'] = [tag.strip() for tag in metadata['tags'].split(',') if tag.strip()]
+            formatted_results.append({
+                'content': results['documents'][0][i],
+                'metadata': metadata,
+                'distance': results['distances'][0][i],
+                'score': 1 - results['distances'][0][i],  # Convert to similarity score
+                'id': results.get('ids', [None])[0][i] if results.get('ids') else None
+            })
+        # Post-filter for all tag operators (OR/AND/NOT)
+        # Since tags are stored as comma-separated strings, we filter in-memory
+        if tag_filters and tag_filters.get('tags'):
+            formatted_results = self._post_filter_tags(
+                formatted_results,
+                tag_filters['tags'],
+                tag_filters.get('operator', 'OR').upper()
+            )
+            formatted_results = formatted_results[:k]
+        return formatted_results
+    def _build_where_clause(self, filters: Optional[Dict[str, Any]],
+                           tag_filters: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+        """Build ChromaDB where clause from filters and tag filters."""
+        conditions = []
+        # Add standard filters
+        if filters:
+            for key, value in filters.items():
+                conditions.append({key: {"$eq": value}})
+        # Add tag filters
+        # Note: ChromaDB doesn't support list fields or $contains, so tags are stored as comma-separated strings
+        # All tag filtering (OR/AND/NOT) is done in-memory after retrieval
+        if tag_filters:
+            tags = tag_filters.get('tags', [])
+            operator = tag_filters.get('operator', 'OR').upper()
+            # For tag filtering, we need to fetch more results and filter in-memory
+            # Don't add tag filters to where clause
+        if not conditions:
+            return None
+        elif len(conditions) == 1:
+            return conditions[0]
+        else:
+            return {"$and": conditions}
+    def _post_filter_tags(self, results: List[Dict[str, Any]],
+                         tags: List[str], operator: str) -> List[Dict[str, Any]]:
+        """
+        Post-filter results based on tag operator (OR/AND/NOT).
+        ChromaDB stores tags as comma-separated strings, so we filter in-memory.
+        Uses case-insensitive matching and substring matching for flexibility.
+        """
+        filtered = []
+        # Normalize search tags (lowercase, strip)
+        normalized_search_tags = [tag.strip().lower() for tag in tags if tag.strip()]
+        for result in results:
+            metadata = result.get('metadata', {})
+            result_tags = metadata.get('tags', [])
+            # Convert tags from string (ChromaDB format) to list if needed
+            if isinstance(result_tags, str):
+                result_tags = [tag.strip() for tag in result_tags.split(',') if tag.strip()]
+            elif not isinstance(result_tags, list):
+                result_tags = []
+            # Normalize result tags (lowercase, strip)
+            normalized_result_tags = [tag.strip().lower() for tag in result_tags if tag.strip()]
+            if operator == 'OR':
+                # Any tag must be present (case-insensitive, substring match)
+                # Check if any search tag matches any result tag (exact or substring)
+                matches = False
+                for search_tag in normalized_search_tags:
+                    for result_tag in normalized_result_tags:
+                        if search_tag == result_tag or search_tag in result_tag or result_tag in search_tag:
+                            matches = True
+                            break
+                    if matches:
+                        break
+                if matches:
+                    filtered.append(result)
+            elif operator == 'AND':
+                # All tags must be present (case-insensitive, substring match)
+                all_match = True
+                for search_tag in normalized_search_tags:
+                    tag_matches = False
+                    for result_tag in normalized_result_tags:
+                        if search_tag == result_tag or search_tag in result_tag or result_tag in search_tag:
+                            tag_matches = True
+                            break
+                    if not tag_matches:
+                        all_match = False
+                        break
+                if all_match:
+                    filtered.append(result)
+            elif operator == 'NOT':
+                # None of the tags should be present (case-insensitive)
+                no_match = True
+                for search_tag in normalized_search_tags:
+                    for result_tag in normalized_result_tags:
+                        if search_tag == result_tag or search_tag in result_tag or result_tag in search_tag:
+                            no_match = False
+                            break
+                    if not no_match:
+                        break
+                if no_match:
+                    filtered.append(result)
+        return filtered
+    def get_collection_stats(self, collection_name: str) -> Dict[str, Any]:
+        """Get collection statistics"""
+        try:
+            collection = self.create_collection(collection_name)
+            total_chunks = collection.count()
+            # Count unique documents by getting all metadata and counting unique doc_id or source_name
+            # Since ChromaDB doesn't have a direct way to get unique values, we fetch all metadata
+            unique_docs = set()
+            try:
+                # Try to get all items to count unique documents
+                # Use get() with a reasonable limit to avoid memory issues
+                # If collection is large, we might need to handle pagination
+                if total_chunks > 0:
+                    # Fetch all chunks (up to 10k limit, adjust if needed)
+                    limit = min(10000, total_chunks)
+                    results = collection.get(limit=limit)
+                    if results and results.get('metadatas'):
+                        # Extract unique doc_ids or source_names
+                        for metadata in results['metadatas']:
+                            # Try doc_id first (most reliable for unique document counting)
+                            doc_id = metadata.get('doc_id')
+                            if doc_id:
+                                unique_docs.add(doc_id)
+                            else:
+                                # Fallback to source_name if doc_id not available (for older data)
+                                source_name = metadata.get('source_name')
+                                if source_name:
+                                    unique_docs.add(source_name)
+                    doc_count = len(unique_docs) if unique_docs else 0
+                else:
+                    doc_count = 0
+            except Exception as e:
+                # If we can't get metadata, fall back to using chunk count as estimate
+                # This might overcount if documents have multiple chunks, but it's better than 0
+                doc_count = total_chunks
+                import logging
+                logging.getLogger("rag_vector_store").warning(f"Could not count unique documents: {e}, using chunk count as estimate")
+            return {
+                "document_count": doc_count,
+                "chunk_count": total_chunks,
+                "collection_name": self._resolve_collection_name(collection_name)
+            }
+        except Exception as e:
+            import logging
+            logging.getLogger("rag_vector_store").warning(f"Failed to get collection stats: {e}")
+            return {
+                "document_count": 0,
+                "chunk_count": 0,
+                "collection_name": self._resolve_collection_name(collection_name)
+            }

core/ingest.py ADDED Viewed

	@@ -0,0 +1,667 @@

+import os
+import yaml
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+import PyPDF2
+from .utils import Chunk, TextProcessor, generate_id
+import logging as _logging
+_logger = _logging.getLogger("rag_ingest")
+import os as _os
+_OPENAI_ENABLED = False
+try:
+    from openai import OpenAI as _OpenAI
+    _OPENAI_ENABLED = True if _os.getenv("OPENAI_API_KEY") else False
+except Exception:
+    _OPENAI_ENABLED = False
+class OpenAIMetadataDetector:
+    """Use OpenAI to detect language, doc_type, and hierarchy levels for a chunk.
+    Falls back to heuristics when OpenAI is not available.
+    """
+    def __init__(self, hierarchy_manager: 'HierarchyManager'):
+        self.hierarchy_manager = hierarchy_manager
+        self.client = _OpenAI() if _OPENAI_ENABLED else None
+        self.model = _os.getenv("OPENAI_MODEL", "gpt-4o-mini")
+    def detect(self, text: str) -> Dict[str, Any]:
+        if not self.client:
+            return {}
+        hierarchies = self.hierarchy_manager.list_hierarchies()
+        prompt = (
+            "You are a metadata extractor. Given a text chunk, infer: language (en|ja), "
+            "document_type (Policy|Manual|FAQ|Report|Note|Guideline), hierarchy_name, level1, level2, level3. "
+            "CRITICAL: hierarchy_name MUST be exactly one of the following: "
+            f"{hierarchies}. Do not invent other names. "
+            "Respond as strict JSON with keys: language, document_type, hierarchy_name, level1, level2, level3. "
+            "Be concise; if unsure, pick the closest.\n\nText:\n" + text[:2000]
+        )
+        try:
+            _logger.debug("Calling OpenAI for chunk metadata detection (model=%s)", self.model)
+            resp = self.client.chat.completions.create(
+                model=self.model,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.0,
+            )
+            content = resp.choices[0].message.content
+            import json as _json
+            data = _json.loads(content)
+            # Enforce allowed hierarchy set
+            if isinstance(data, dict) and data.get("hierarchy_name") not in hierarchies:
+                data["hierarchy_name"] = None
+            _logger.debug("OpenAI chunk metadata inferred: %s", data)
+            return data if isinstance(data, dict) else {}
+        except Exception:
+            _logger.exception("OpenAI chunk metadata detection failed; using heuristics.")
+            return {}
+# Try to import pypdf (newer, more robust PDF library)
+try:
+    from pypdf import PdfReader as PyPdfReader
+    PYPDF_AVAILABLE = True
+except ImportError:
+    PYPDF_AVAILABLE = False
+class DocumentLoader:
+    """Load documents from various formats"""
+    def __init__(self):
+        self.text_processor = TextProcessor()
+    def load_pdf(self, file_path: str) -> str:
+        """Load text from PDF file with fallback readers, preserving paragraphs"""
+        # Validate file exists and is readable
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"PDF file not found: {file_path}")
+        if not os.path.isfile(file_path):
+            raise ValueError(f"Path is not a file: {file_path}")
+        # Check file size
+        file_size = os.path.getsize(file_path)
+        if file_size == 0:
+            raise ValueError(f"PDF file is empty: {file_path}")
+        # Try pypdf first (more robust)
+        if PYPDF_AVAILABLE:
+            try:
+                with open(file_path, 'rb') as file:
+                    reader = PyPdfReader(file)
+                    text = ""
+                    for page in reader.pages:
+                        page_text = page.extract_text()
+                        if page_text:
+                            text += page_text + "\n"
+                    if text.strip():
+                        return self.text_processor.clean_text_preserve_newlines(text)
+            except Exception as e:
+                # If pypdf fails, try PyPDF2 as fallback
+                pass
+        # Fallback to PyPDF2
+        try:
+            with open(file_path, 'rb') as file:
+                # Try to read with strict=False for corrupted PDFs
+                try:
+                    reader = PyPDF2.PdfReader(file, strict=False)
+                except:
+                    # If strict=False doesn't work, try normal reader
+                    file.seek(0)
+                reader = PyPDF2.PdfReader(file)
+                text = ""
+                for i, page in enumerate(reader.pages):
+                    try:
+                        page_text = page.extract_text()
+                        if page_text:
+                            text += page_text + "\n"
+                    except Exception as page_error:
+                        # Skip pages that can't be extracted
+                        continue
+                if not text.strip():
+                    raise ValueError(f"No text could be extracted from PDF: {file_path}")
+                return self.text_processor.clean_text_preserve_newlines(text)
+        except Exception as e:
+            error_msg = str(e)
+            if "EOF marker not found" in error_msg or "EOF" in error_msg:
+                raise Exception(
+                    f"PDF file appears to be corrupted or incomplete: {file_path}. "
+                    f"This may be due to an incomplete upload or corrupted file. "
+                    f"Please try re-uploading the file or check if the PDF is valid."
+                )
+            else:
+                raise Exception(f"Error loading PDF {file_path}: {error_msg}")
+    def load_txt(self, file_path: str) -> str:
+        """Load text from TXT file preserving paragraphs"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                text = file.read()
+                return self.text_processor.clean_text_preserve_newlines(text)
+        except Exception as e:
+            raise Exception(f"Error loading TXT {file_path}: {str(e)}")
+    def load_document(self, file_path: str) -> str:
+        """Load document based on file extension"""
+        ext = Path(file_path).suffix.lower()
+        if ext == '.pdf':
+            return self.load_pdf(file_path)
+        elif ext == '.txt':
+            return self.load_txt(file_path)
+        else:
+            raise ValueError(f"Unsupported file format: {ext}")
+class HierarchyManager:
+    """Manage hierarchical metadata definitions"""
+    def __init__(self, hierarchies_dir: str = "hierarchies"):
+        self.hierarchies_dir = Path(hierarchies_dir)
+        self.hierarchies = {}
+        self.load_hierarchies()
+    def load_hierarchies(self):
+        """Load all hierarchy definitions"""
+        for yaml_file in self.hierarchies_dir.glob("*.yaml"):
+            with open(yaml_file, 'r', encoding='utf-8') as file:
+                hierarchy_name = yaml_file.stem
+                self.hierarchies[hierarchy_name] = yaml.safe_load(file)
+    def get_hierarchy(self, name: str) -> Dict[str, Any]:
+        """Get hierarchy definition by name"""
+        if name not in self.hierarchies:
+            raise ValueError(f"Hierarchy '{name}' not found")
+        return self.hierarchies[name]
+    def list_hierarchies(self) -> List[str]:
+        """List available hierarchies"""
+        return list(self.hierarchies.keys())
+class DocumentChunker:
+    """Chunk documents with hierarchical metadata"""
+    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.text_processor = TextProcessor()
+        self.hierarchy_manager = HierarchyManager()
+        self.ai_detector = OpenAIMetadataDetector(self.hierarchy_manager)
+    def chunk_document(self, file_path: str, hierarchy: Optional[str],
+                      doc_type: Optional[str], language: Optional[str]) -> List[Chunk]:
+        """Chunk document with hierarchical metadata per chunk.
+        - Auto-detects hierarchy/doc_type/language when None or 'Auto'.
+        - Assigns metadata per chunk to support multi-topic documents.
+        """
+        loader = DocumentLoader()
+        content = loader.load_document(file_path)
+        # Auto-detect language if needed
+        if not language or str(language).lower() == 'auto':
+            # Prefer OpenAI if available
+            ai_guess = self.ai_detector.detect(content)
+            _logger.debug("Language auto-detect: ai_guess=%s", ai_guess.get('language') if isinstance(ai_guess, dict) else None)
+            language = ai_guess.get('language') if isinstance(ai_guess, dict) and ai_guess.get('language') in ('en','ja') else (
+                'ja' if any('\u3040' <= ch <= '\u30ff' or '\u4e00' <= ch <= '\u9faf' for ch in content) else 'en'
+            )
+        # Prepare list of hierarchy names and definitions
+        hier_names = self.hierarchy_manager.list_hierarchies()
+        # If hierarchy is auto, we'll pick best per-chunk later; else load the chosen one
+        fixed_hierarchy_def = None
+        if hierarchy and hierarchy.lower() != 'auto':
+            fixed_hierarchy_def = self.hierarchy_manager.get_hierarchy(hierarchy)
+        # Simple structural chunking: split on double newlines first, then fall back to token windows
+        raw_blocks = [b.strip() for b in content.split('\n\n') if b.strip()]
+        if not raw_blocks:
+            raw_blocks = [content]
+        # Further split large blocks into overlapping windows
+        processed_blocks: List[str] = []
+        for block in raw_blocks:
+            words = block.split()
+            if len(words) <= self.chunk_size:
+                processed_blocks.append(block)
+            else:
+                step = max(1, self.chunk_size - self.chunk_overlap)
+                for i in range(0, len(words), step):
+                    processed_blocks.append(' '.join(words[i:i + self.chunk_size]))
+        # Phase 1: provisional labels for each block
+        provisional: List[Dict[str, Any]] = []
+        # Sticky explicit labels propagate until overridden by new explicit labels
+        sticky_l1: Optional[str] = None
+        sticky_l2: Optional[str] = None
+        for block in processed_blocks:
+            ai_used = False
+            ph_hdef = fixed_hierarchy_def
+            ph_hname = hierarchy if hierarchy and hierarchy.lower() != 'auto' else None
+            if ph_hdef is None:
+                ai_guess = self.ai_detector.detect(block)
+                guess_name = ai_guess.get('hierarchy_name') if isinstance(ai_guess, dict) else None
+                # 0) Explicit label "Hierarchy: <name>"
+                import re
+                mH = re.search(r"^\s*hierarchy\s*:\s*(.+)$", block, flags=re.IGNORECASE | re.MULTILINE)
+                if mH:
+                    explicit_h = mH.group(1).strip().lower()
+                    for name in hier_names:
+                        if name.lower() in explicit_h or explicit_h in name.lower():
+                            ph_hdef = self.hierarchy_manager.get_hierarchy(name)
+                            ph_hname = name
+                            ai_used = ai_used or False
+                # 1) If OpenAI guessed a known hierarchy
+                if ph_hdef is None and guess_name in hier_names:
+                    ph_hdef = self.hierarchy_manager.get_hierarchy(guess_name)
+                    ph_hname = guess_name
+                    ai_used = True
+                # 2) Weighted keyword scoring across all hierarchies (level1/2/3 + doc_types + filename hints)
+                if ph_hdef is None:
+                    best_score = -1
+                    best_name = None
+                    best_def = None
+                    block_lower = block.lower()
+                    filename_lower = os.path.basename(file_path).lower()
+                    for name in hier_names:
+                        hdef = self.hierarchy_manager.get_hierarchy(name)
+                        score = 0
+                        # level1
+                        for v in hdef['levels']['level1']['values']:
+                            if v.lower() in block_lower:
+                                score += 2
+                        # level2
+                        for l2_list in hdef['levels']['level2']['values'].values():
+                            for v in l2_list:
+                                if v.lower() in block_lower:
+                                    score += 2
+                        # level3
+                        for l3_list in hdef['levels']['level3']['values'].values():
+                            for v in l3_list:
+                                if v.lower() in block_lower:
+                                    score += 1
+                        # doc_types
+                        for dt in hdef.get('doc_types', []):
+                            if dt.lower() in block_lower:
+                                score += 1
+                        # filename hint
+                        if name.lower() in filename_lower:
+                            score += 3
+                        if score > best_score:
+                            best_score = score
+                            best_name = name
+                            best_def = hdef
+                    ph_hdef = best_def if best_def is not None else self.hierarchy_manager.get_hierarchy(hier_names[0])
+                    ph_hname = best_name or hier_names[0]
+            ph_dtype = doc_type
+            if not doc_type or str(doc_type).lower() == 'auto':
+                ai_guess = self.ai_detector.detect(block)
+                if isinstance(ai_guess, dict) and ai_guess.get('document_type'):
+                    ph_dtype = ai_guess['document_type']
+                    ai_used = True
+                else:
+                    dt_candidates = ph_hdef.get('doc_types', ["Policy", "Manual", "FAQ", "Report", "Note", "Guideline"])
+                    block_lower = block.lower()
+                    best_dt = dt_candidates[0]
+                    best_score = -1
+                    for dt in dt_candidates:
+                        s = 0
+                        if dt.lower() in block_lower:
+                            s += 1
+                        if dt.lower() == 'faq' and ('faq' in block_lower or 'q:' in block_lower):
+                            s += 1
+                        if dt.lower() == 'report' and ('report' in block_lower or 'summary' in block_lower):
+                            s += 1
+                        if s > best_score:
+                            best_score = s
+                            best_dt = dt
+                    ph_dtype = best_dt
+            content_lower = block.lower()
+            # Detect explicit labels in this block
+            import re
+            exp_l1 = exp_l2 = None
+            m1 = re.search(r"^\s*domain\s*:\s*(.+)$", content_lower, flags=re.MULTILINE)
+            m2 = re.search(r"^\s*section\s*:\s*(.+)$", content_lower, flags=re.MULTILINE)
+            if m1:
+                exp_l1 = m1.group(1).strip()
+            if m2:
+                exp_l2 = m2.group(1).strip()
+            # Provisional levels
+            ph_l1 = self._classify_level1(content_lower, ph_hdef)
+            ph_l2 = self._classify_level2(content_lower, ph_hdef, ph_l1)
+            # Override with explicit labels when present
+            def _best_match(name: str, candidates: list[str]) -> str:
+                name_l = name.lower()
+                for c in candidates:
+                    cl = c.lower()
+                    if cl == name_l or name_l in cl or cl in name_l:
+                        return c
+                return candidates[0] if candidates else "General"
+            if exp_l1:
+                ph_l1 = _best_match(exp_l1, ph_hdef['levels']['level1']['values'])
+                sticky_l1 = ph_l1
+            if exp_l2:
+                l2_candidates = ph_hdef['levels']['level2']['values'].get(ph_l1, [])
+                ph_l2 = _best_match(exp_l2, l2_candidates)
+                sticky_l2 = ph_l2
+            # Apply sticky labels when no explicit labels in this block
+            if not exp_l1 and sticky_l1:
+                ph_l1 = sticky_l1
+            if not exp_l2 and sticky_l2 and ph_hdef['levels']['level2']['values'].get(ph_l1):
+                ph_l2 = sticky_l2
+            provisional.append({
+                'text': block,
+                'hdef': ph_hdef,
+                'hname': ph_hname,
+                'dtype': ph_dtype,
+                'l1': ph_l1,
+                'l2': ph_l2,
+                'ai': ai_used
+            })
+        # Phase 2: merge adjacent blocks with same labels within size limit
+        merged_texts: List[str] = []
+        merged_meta: List[Dict[str, Any]] = []
+        if provisional:
+            current_text = provisional[0]['text']
+            current_meta = provisional[0]
+            for p in provisional[1:]:
+                same = (p['hname'] == current_meta['hname'] and p['l1'] == current_meta['l1'] and p['l2'] == current_meta['l2'])
+                candidate = current_text + "\n\n" + p['text'] if same else current_text
+                if same and self.text_processor.count_tokens(candidate) <= self.text_processor.count_tokens(current_text) + self.chunk_size:
+                    current_text = candidate
+                    current_meta['ai'] = current_meta['ai'] or p['ai']
+                else:
+                    merged_texts.append(current_text)
+                    merged_meta.append(current_meta)
+                    current_text = p['text']
+                    current_meta = p
+            merged_texts.append(current_text)
+            merged_meta.append(current_meta)
+        # Phase 3: finalize chunks
+        chunks: List[Chunk] = []
+        for text_block, meta in zip(merged_texts, merged_meta):
+            final_md = self._generate_metadata(
+                file_path=file_path,
+                hierarchy_def=meta['hdef'],
+                doc_type=meta['dtype'],
+                language=language,
+                content=text_block
+            )
+            if meta['hname']:
+                final_md['hierarchy'] = meta['hname']
+            final_md['ai_detected'] = meta['ai']
+            chunks.append(Chunk(
+                doc_id=generate_id(),
+                chunk_id=generate_id(),
+                content=text_block,
+                metadata=final_md
+            ))
+        return chunks
+    def _generate_metadata(self, file_path: str, hierarchy_def: Dict[str, Any],
+                          doc_type: str, language: str, content: str) -> Dict[str, Any]:
+        """Generate hierarchical metadata for chunk"""
+        # Simple rule-based classification with explicit label override
+        content_lower = content.lower()
+        # 1) Try to honor explicit labels like "Domain:", "Section:", "Topic:"
+        import re
+        explicit_l1 = explicit_l2 = explicit_l3 = None
+        m1 = re.search(r"^\s*domain\s*:\s*(.+)$", content_lower, flags=re.MULTILINE)
+        m2 = re.search(r"^\s*section\s*:\s*(.+)$", content_lower, flags=re.MULTILINE)
+        m3 = re.search(r"^\s*topic\s*:\s*(.+)$", content_lower, flags=re.MULTILINE)
+        if m1:
+            explicit_l1 = m1.group(1).strip()
+        if m2:
+            explicit_l2 = m2.group(1).strip()
+        if m3:
+            explicit_l3 = m3.group(1).strip()
+        def _best_match(name: str, candidates: list[str]) -> str:
+            name_l = name.lower()
+            # exact contains
+            for c in candidates:
+                if c.lower() == name_l or name_l in c.lower() or c.lower() in name_l:
+                    return c
+            # fallback: first candidate
+            return candidates[0] if candidates else "General"
+        if explicit_l1:
+            level1 = _best_match(explicit_l1, hierarchy_def['levels']['level1']['values'])
+        else:
+            level1 = self._classify_level1(content_lower, hierarchy_def)
+        if explicit_l2:
+            level2_candidates = hierarchy_def['levels']['level2']['values'].get(level1, [])
+            level2 = _best_match(explicit_l2, level2_candidates)
+        else:
+            level2 = self._classify_level2(content_lower, hierarchy_def, level1)
+        if explicit_l3:
+            level3_candidates = hierarchy_def['levels']['level3']['values'].get(level2, [])
+            level3 = _best_match(explicit_l3, level3_candidates)
+        else:
+            level3 = self._classify_level3(content_lower, hierarchy_def, level1, level2)
+        # Fallback mapping to 'Other' when nothing matches this hierarchy
+        def _any_present(values: list[str]) -> bool:
+            return any(v.lower() in content_lower for v in values)
+        # If no level1 value appears, set to 'Other'
+        if not _any_present(hierarchy_def['levels']['level1']['values']):
+            level1 = 'Other'
+        # If level2 options for chosen level1 exist but none appear, set to 'Other'
+        l2_opts = hierarchy_def['levels']['level2']['values'].get(level1, [])
+        if l2_opts and not _any_present(l2_opts):
+            level2 = 'Other'
+        # If level3 options for chosen level2 exist but none appear, set to 'Other'
+        l3_opts = hierarchy_def['levels']['level3']['values'].get(level2, [])
+        if l3_opts and not _any_present(l3_opts):
+            level3 = 'Other'
+        return {
+            "source_name": os.path.basename(file_path),
+            "lang": language,
+            "level1": level1,
+            "level2": level2,
+            "level3": level3,
+            "doc_type": doc_type,
+            "chunk_size": len(content),
+            "token_count": self.text_processor.count_tokens(content)
+        }
+    def _classify_level1(self, content: str, hierarchy_def: Dict[str, Any]) -> str:
+        """Classify level1 domain"""
+        level1_options = hierarchy_def['levels']['level1']['values']
+        # Simple keyword matching (enhance with ML model)
+        keyword_scores = {}
+        for domain in level1_options:
+            score = 0
+            # Add domain-specific keyword matching logic
+            if domain.lower() in content:
+                score += 1
+            keyword_scores[domain] = score
+        return max(keyword_scores.items(), key=lambda x: x[1])[0] if keyword_scores else level1_options[0]
+    def _classify_level2(self, content: str, hierarchy_def: Dict[str, Any], level1: str) -> str:
+        """Classify level2 section"""
+        level2_options = hierarchy_def['levels']['level2']['values'].get(level1, [])
+        if not level2_options:
+            return "General"
+        keyword_scores = {}
+        for section in level2_options:
+            score = 0
+            if section.lower() in content:
+                score += 1
+            keyword_scores[section] = score
+        return max(keyword_scores.items(), key=lambda x: x[1])[0] if keyword_scores else level2_options[0]
+    def _classify_level3(self, content: str, hierarchy_def: Dict[str, Any],
+                        level1: str, level2: str) -> str:
+        """Classify level3 topic"""
+        level3_options = hierarchy_def['levels']['level3']['values'].get(level2, [])
+        if not level3_options:
+            return "General"
+        keyword_scores = {}
+        for topic in level3_options:
+            score = 0
+            if topic.lower() in content:
+                score += 1
+            keyword_scores[topic] = score
+        return max(keyword_scores.items(), key=lambda x: x[1])[0] if keyword_scores else level3_options[0]
+class FlatTagChunker:
+    """Chunk documents and generate flat, non-hierarchical tags for each chunk."""
+    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200,
+                 max_tags: int = 10, min_tag_length: int = 2, max_tag_length: int = 3,
+                 use_openai_for_tags: bool = False):
+        """
+        Initialize flat tag chunker.
+        Args:
+            chunk_size: Target chunk size in characters
+            chunk_overlap: Overlap between chunks in characters
+            max_tags: Maximum tags per chunk
+            min_tag_length: Minimum words in a tag
+            max_tag_length: Maximum words in a tag
+            use_openai_for_tags: Whether to use OpenAI for tag generation
+        """
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.text_processor = TextProcessor()
+        # Import TagGenerator (lazy import to avoid circular dependencies)
+        from .tag_generator import TagGenerator
+        self.tag_generator = TagGenerator(
+            max_tags=max_tags,
+            min_tag_length=min_tag_length,
+            max_tag_length=max_tag_length
+        )
+        self.use_openai_for_tags = use_openai_for_tags
+    def chunk_document(self, file_path: str, language: Optional[str] = None,
+                      user_tags: Optional[List[str]] = None) -> List[Chunk]:
+        """
+        Chunk document and generate flat tags for each chunk.
+        Args:
+            file_path: Path to document file
+            language: Language code ('en', 'ja') or None for auto-detect
+            user_tags: Optional list of user-provided tags to add to auto-generated tags
+        Returns:
+            List of Chunk objects with tags in metadata
+        """
+        loader = DocumentLoader()
+        content = loader.load_document(file_path)
+        # Auto-detect language if needed
+        if not language or str(language).lower() == 'auto':
+            # Simple heuristic
+            language = 'ja' if any('\u3040' <= ch <= '\u30ff' or '\u4e00' <= ch <= '\u9faf' for ch in content) else 'en'
+        # Set language for tag generator
+        self.tag_generator.language = language
+        # Normalize user tags (lowercase, strip, remove empty)
+        normalized_user_tags = []
+        if user_tags:
+            for tag in user_tags:
+                if isinstance(tag, str) and tag.strip():
+                    normalized_user_tags.append(tag.strip().lower())
+        # Simple structural chunking: split on double newlines first
+        raw_blocks = [b.strip() for b in content.split('\n\n') if b.strip()]
+        if not raw_blocks:
+            raw_blocks = [content]
+        # Further split large blocks into chunks
+        chunks = []
+        for block in raw_blocks:
+            if len(block) <= self.chunk_size:
+                chunks.append(block)
+            else:
+                # Split by sentences, then combine into chunks
+                sentences = self.text_processor.split_sentences(block)
+                current_chunk = ""
+                for sentence in sentences:
+                    if len(current_chunk) + len(sentence) <= self.chunk_size:
+                        current_chunk += sentence + " "
+                    else:
+                        if current_chunk:
+                            chunks.append(current_chunk.strip())
+                        current_chunk = sentence + " "
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+        # Generate tags and create Chunk objects
+        result_chunks = []
+        source_name = os.path.basename(file_path)
+        doc_id = generate_id()  # Generate one doc_id for all chunks from this document
+        for i, chunk_text in enumerate(chunks):
+            # Generate auto tags for this chunk
+            auto_tags = self.tag_generator.generate_tags(
+                chunk_text,
+                methods=['all'] if not self.use_openai_for_tags else ['yake', 'openai'],
+                use_openai=self.use_openai_for_tags
+            )
+            # Merge user tags with auto-generated tags
+            # User tags are prepended (higher priority) and deduplicated
+            all_tags = []
+            seen_tags = set()
+            # Add user tags first (they get priority)
+            for tag in normalized_user_tags:
+                if tag not in seen_tags:
+                    all_tags.append(tag)
+                    seen_tags.add(tag)
+            # Add auto-generated tags (skip duplicates)
+            for tag in auto_tags:
+                if tag not in seen_tags:
+                    all_tags.append(tag)
+                    seen_tags.add(tag)
+            # Create chunk metadata
+            metadata = {
+                'source_name': source_name,
+                'chunk_index': i,
+                'chunk_size': len(chunk_text),
+                'lang': language,
+                'tags': all_tags  # Store as list (user tags + auto tags)
+            }
+            # Create Chunk object
+            chunk = Chunk(
+                doc_id=doc_id,
+                chunk_id=generate_id(),
+                content=chunk_text,
+                metadata=metadata
+            )
+            result_chunks.append(chunk)
+        _logger.info(f"Generated {len(result_chunks)} chunks with tags from {source_name}")
+        return result_chunks

core/report_generator.py ADDED Viewed

	@@ -0,0 +1,386 @@

+"""
+Report Generation Module for RAG Evaluation Results
+This module provides comprehensive report generation functionality including:
+- HTML/PDF report generation with aggregated statistics
+- Representative examples (best/worst performing queries)
+- Visualization embedding
+- Export functionality (CSV, PNG, HTML, PDF)
+"""
+import pandas as pd
+import numpy as np
+from typing import List, Dict, Any, Optional, Tuple
+import json
+import os
+from datetime import datetime
+from collections import defaultdict
+import logging
+_logger = logging.getLogger("rag_report_generator")
+try:
+    from jinja2 import Template
+    _JINJA2_AVAILABLE = True
+except ImportError:
+    _JINJA2_AVAILABLE = False
+    _logger.warning("Jinja2 not available. HTML report generation will be limited.")
+class ReportGenerator:
+    """Generate comprehensive evaluation reports"""
+    METHOD_NAMES = {
+        'base_rag': 'Baseline',
+        'tag_filter_rag': '+Tags(Filter)',
+        'hybrid_rag': 'Hybrid(Weighted)',
+        'hybrid_rerank_rag': 'Hybrid+Rerank',
+        'hier_rag': 'Hierarchical RAG'  # For backward compatibility
+    }
+    def __init__(self):
+        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    def generate_report(self,
+                       df: pd.DataFrame,
+                       summary: Dict[str, Any],
+                       output_dir: str = "reports",
+                       report_name: Optional[str] = None,
+                       visualizations: Optional[Dict[str, str]] = None,
+                       raw_results: Optional[List[Dict[str, Any]]] = None) -> Dict[str, str]:
+        """
+        Generate comprehensive evaluation report
+        Args:
+            df: Evaluation results DataFrame
+            summary: Summary statistics dictionary
+            output_dir: Output directory for reports
+            report_name: Base name for report files
+            visualizations: Dict mapping chart name to file path
+            raw_results: Raw evaluation results for examples
+        Returns:
+            Dict mapping file type to file path
+        """
+        os.makedirs(output_dir, exist_ok=True)
+        if report_name is None:
+            report_name = f"evaluation_report_{self.timestamp}"
+        report_paths = {}
+        # 1. Export CSV
+        csv_path = os.path.join(output_dir, f"{report_name}.csv")
+        df.to_csv(csv_path, index=False)
+        report_paths['csv'] = csv_path
+        # 2. Export JSON summary
+        json_path = os.path.join(output_dir, f"{report_name}_summary.json")
+        with open(json_path, 'w') as f:
+            json.dump(summary, f, indent=2, default=str)
+        report_paths['json'] = json_path
+        # 3. Export aggregated statistics CSV
+        agg_stats_path = os.path.join(output_dir, f"{report_name}_aggregated_stats.csv")
+        agg_stats_df = self._generate_aggregated_stats_table(df, summary)
+        agg_stats_df.to_csv(agg_stats_path, index=False)
+        report_paths['aggregated_csv'] = agg_stats_path
+        # 4. Export representative examples
+        if raw_results:
+            examples_path = os.path.join(output_dir, f"{report_name}_examples.json")
+            examples = self._extract_representative_examples(df, raw_results)
+            with open(examples_path, 'w') as f:
+                json.dump(examples, f, indent=2, default=str)
+            report_paths['examples'] = examples_path
+        # 5. Export visualizations (if not already exported)
+        if visualizations:
+            viz_dir = os.path.join(output_dir, "visualizations")
+            os.makedirs(viz_dir, exist_ok=True)
+            for chart_name, chart_path in visualizations.items():
+                if chart_path and os.path.exists(chart_path):
+                    # Copy to reports directory if not already there
+                    dest_path = os.path.join(viz_dir, f"{report_name}_{chart_name}.png")
+                    if not os.path.abspath(chart_path) == os.path.abspath(dest_path):
+                        import shutil
+                        shutil.copy2(chart_path, dest_path)
+                    report_paths[f'viz_{chart_name}'] = dest_path
+        # 6. Generate HTML report
+        html_path = os.path.join(output_dir, f"{report_name}.html")
+        html_content = self._generate_html_report(
+            df, summary, report_name, visualizations or {}, raw_results
+        )
+        with open(html_path, 'w', encoding='utf-8') as f:
+            f.write(html_content)
+        report_paths['html'] = html_path
+        return report_paths
+    def _generate_aggregated_stats_table(self, df: pd.DataFrame,
+                                        summary: Dict[str, Any]) -> pd.DataFrame:
+        """Generate aggregated statistics table"""
+        rows = []
+        # Aggregate by pipeline and k
+        for pipeline in df['pipeline'].unique():
+            for k in df['k'].unique():
+                pipeline_k_df = df[(df['pipeline'] == pipeline) & (df['k'] == k)]
+                if pipeline_k_df.empty:
+                    continue
+                rows.append({
+                    'pipeline': self.METHOD_NAMES.get(pipeline, pipeline),
+                    'k': k,
+                    'mean_precision@k': pipeline_k_df['precision_at_k'].mean(),
+                    'std_precision@k': pipeline_k_df['precision_at_k'].std(),
+                    'mean_ndcg@k': pipeline_k_df['ndcg_at_k'].mean(),
+                    'std_ndcg@k': pipeline_k_df['ndcg_at_k'].std(),
+                    'mean_hit@k': pipeline_k_df['hit_at_k'].mean(),
+                    'mean_mrr': pipeline_k_df['mrr'].mean(),
+                    'mean_semantic_similarity': pipeline_k_df['semantic_similarity'].mean(),
+                    'mean_latency': pipeline_k_df['latency'].mean(),
+                    'p50_latency': pipeline_k_df['latency'].quantile(0.5),
+                    'p90_latency': pipeline_k_df['latency'].quantile(0.9),
+                    'query_count': len(pipeline_k_df['query_id'].unique()),
+                    'mean_user_satisfaction': pipeline_k_df['user_satisfaction'].mean() if 'user_satisfaction' in pipeline_k_df.columns else None
+                })
+        return pd.DataFrame(rows)
+    def _extract_representative_examples(self,
+                                        df: pd.DataFrame,
+                                        raw_results: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Extract representative examples: best and worst performing queries"""
+        examples = {
+            'best_performing': [],
+            'worst_performing': [],
+            'most_improved': []
+        }
+        # Best performing queries (by precision@5)
+        best_df = df[df['k'] == 5].nlargest(3, 'precision_at_k')
+        for _, row in best_df.iterrows():
+            query_id = row['query_id']
+            query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
+            if query_result:
+                examples['best_performing'].append({
+                    'query': query_result['query'],
+                    'pipeline': row['pipeline'],
+                    'precision_at_k': row['precision_at_k'],
+                    'ndcg_at_k': row['ndcg_at_k'],
+                    'mrr': row['mrr']
+                })
+        # Worst performing queries
+        worst_df = df[df['k'] == 5].nsmallest(3, 'precision_at_k')
+        for _, row in worst_df.iterrows():
+            query_id = row['query_id']
+            query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
+            if query_result:
+                examples['worst_performing'].append({
+                    'query': query_result['query'],
+                    'pipeline': row['pipeline'],
+                    'precision_at_k': row['precision_at_k'],
+                    'ndcg_at_k': row['ndcg_at_k'],
+                    'mrr': row['mrr']
+                })
+        # Most improved (hybrid vs baseline)
+        if 'hybrid_rag' in df['pipeline'].values and 'base_rag' in df['pipeline'].values:
+            baseline_df = df[(df['pipeline'] == 'base_rag') & (df['k'] == 5)].set_index('query_id')
+            hybrid_df = df[(df['pipeline'] == 'hybrid_rag') & (df['k'] == 5)].set_index('query_id')
+            common_ids = baseline_df.index.intersection(hybrid_df.index)
+            if len(common_ids) > 0:
+                improvement = (hybrid_df.loc[common_ids, 'precision_at_k'] -
+                              baseline_df.loc[common_ids, 'precision_at_k']).nlargest(3)
+                for query_id in improvement.index:
+                    query_result = next((r for r in raw_results if r.get('query_id') == query_id), None)
+                    if query_result:
+                        examples['most_improved'].append({
+                            'query': query_result['query'],
+                            'baseline_precision': baseline_df.loc[query_id, 'precision_at_k'],
+                            'hybrid_precision': hybrid_df.loc[query_id, 'precision_at_k'],
+                            'improvement': improvement[query_id]
+                        })
+        return examples
+    def _generate_html_report(self,
+                             df: pd.DataFrame,
+                             summary: Dict[str, Any],
+                             report_name: str,
+                             visualizations: Dict[str, str],
+                             raw_results: Optional[List[Dict[str, Any]]]) -> str:
+        """Generate HTML report with all statistics and visualizations"""
+        # Generate aggregated stats table
+        agg_stats_df = self._generate_aggregated_stats_table(df, summary)
+        agg_stats_html = agg_stats_df.to_html(classes='table table-striped', table_id='aggregated_stats', escape=False)
+        # Generate summary statistics
+        summary_html = self._format_summary_html(summary)
+        # Generate representative examples
+        examples_html = ""
+        if raw_results:
+            examples = self._extract_representative_examples(df, raw_results)
+            examples_html = self._format_examples_html(examples)
+        # Generate visualization HTML
+        viz_html = self._format_visualizations_html(visualizations, report_name)
+        # Generate insights
+        insights_html = self._generate_insights(df, summary)
+        # Simple HTML template (without Jinja2 for compatibility)
+        html_template = f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>RAG Evaluation Report - {report_name}</title>
+    <style>
+        body {{
+            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+            line-height: 1.6;
+            color: #333;
+            max-width: 1200px;
+            margin: 0 auto;
+            padding: 20px;
+            background-color: #f5f5f5;
+        }}
+        h1 {{ color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px; }}
+        h2 {{ color: #34495e; margin-top: 30px; border-left: 4px solid #3498db; padding-left: 10px; }}
+        table {{ width: 100%; border-collapse: collapse; margin: 20px 0; background-color: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
+        th, td {{ padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }}
+        th {{ background-color: #3498db; color: white; font-weight: bold; }}
+        tr:hover {{ background-color: #f5f5f5; }}
+        .insights {{ background-color: #fff3cd; border-left: 4px solid #ffc107; padding: 15px; margin: 20px 0; border-radius: 5px; }}
+        .examples {{ background-color: #e7f3ff; border-left: 4px solid #2196F3; padding: 15px; margin: 20px 0; border-radius: 5px; }}
+        .viz-container {{ text-align: center; margin: 30px 0; background-color: white; padding: 20px; border-radius: 5px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
+        .viz-container img {{ max-width: 100%; height: auto; border: 1px solid #ddd; border-radius: 3px; }}
+    </style>
+</head>
+<body>
+    <h1>RAG Evaluation Report</h1>
+    <div class="metadata">
+        <p><strong>Report Name:</strong> {report_name}</p>
+        <p><strong>Generated:</strong> {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>
+        <p><strong>Total Queries:</strong> {len(df['query_id'].unique()) if 'query_id' in df.columns else len(df['query'].unique())}</p>
+        <p><strong>Pipelines Evaluated:</strong> {len(df['pipeline'].unique())}</p>
+    </div>
+    <h2>Summary Statistics</h2>
+    {summary_html}
+    <h2>Aggregated Performance Metrics</h2>
+    {agg_stats_html}
+    <h2>Visualizations</h2>
+    {viz_html}
+    <h2>Representative Examples</h2>
+    {examples_html}
+    <h2>Insights and Recommendations</h2>
+    <div class="insights">
+        {insights_html}
+    </div>
+</body>
+</html>
+        """
+        return html_template
+    def _format_summary_html(self, summary: Dict[str, Any]) -> str:
+        """Format summary statistics as HTML"""
+        html = "<table><tr><th>Pipeline</th><th>k</th><th>Mean Precision@k</th><th>Mean nDCG@k</th><th>Mean MRR</th><th>Mean Latency (s)</th><th>P50 Latency (s)</th><th>P90 Latency (s)</th></tr>"
+        for pipeline, pipeline_data in summary.items():
+            if isinstance(pipeline_data, dict) and 'latency_percentiles' not in pipeline_data:
+                for k, metrics in pipeline_data.items():
+                    if isinstance(k, int):
+                        html += f"<tr>"
+                        html += f"<td>{self.METHOD_NAMES.get(pipeline, pipeline)}</td>"
+                        html += f"<td>{k}</td>"
+                        html += f"<td>{metrics.get('mean_precision_at_k', 0):.3f}</td>"
+                        html += f"<td>{metrics.get('mean_ndcg_at_k', 0):.3f}</td>"
+                        html += f"<td>{metrics.get('mean_mrr', 0):.3f}</td>"
+                        html += f"<td>{metrics.get('mean_latency', 0):.3f}</td>"
+                        html += f"<td>{metrics.get('p50_latency', 0):.3f}</td>"
+                        html += f"<td>{metrics.get('p90_latency', 0):.3f}</td>"
+                        html += f"</tr>"
+        html += "</table>"
+        return html
+    def _format_examples_html(self, examples: Dict[str, Any]) -> str:
+        """Format representative examples as HTML"""
+        html = ""
+        if examples.get('best_performing'):
+            html += "<h3>Best Performing Queries</h3>"
+            for example in examples['best_performing']:
+                html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
+                html += f"<strong>Pipeline:</strong> {self.METHOD_NAMES.get(example['pipeline'], example['pipeline'])}<br>"
+                html += f"<strong>Precision@5:</strong> {example['precision_at_k']:.3f}</div>"
+        if examples.get('worst_performing'):
+            html += "<h3>Worst Performing Queries</h3>"
+            for example in examples['worst_performing']:
+                html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
+                html += f"<strong>Pipeline:</strong> {self.METHOD_NAMES.get(example['pipeline'], example['pipeline'])}<br>"
+                html += f"<strong>Precision@5:</strong> {example['precision_at_k']:.3f}</div>"
+        if examples.get('most_improved'):
+            html += "<h3>Most Improved Queries (Hybrid vs Baseline)</h3>"
+            for example in examples['most_improved']:
+                html += f"<div class='example-item'><strong>Query:</strong> {example['query']}<br>"
+                html += f"<strong>Improvement:</strong> +{example['improvement']:.3f}</div>"
+        if not html:
+            return "<p>No representative examples available.</p>"
+        return f"<div class='examples'>{html}</div>"
+    def _format_visualizations_html(self, visualizations: Dict[str, str], report_name: str) -> str:
+        """Format visualization images as HTML"""
+        if not visualizations:
+            return "<p>No visualizations available.</p>"
+        html = ""
+        for chart_key, chart_path in visualizations.items():
+            if chart_path and os.path.exists(chart_path):
+                rel_path = os.path.relpath(chart_path, os.path.dirname(chart_path))
+                html += f"<div class='viz-container'><img src='{rel_path}' alt='{chart_key}'></div>"
+        return html if html else "<p>No visualizations could be loaded.</p>"
+    def _generate_insights(self, df: pd.DataFrame, summary: Dict[str, Any]) -> str:
+        """Generate insights and recommendations from the evaluation results"""
+        insights = []
+        # Find best performing pipeline
+        if 'k' in df.columns:
+            k5_df = df[df['k'] == 5]
+            if len(k5_df) > 0:
+                best_pipeline = k5_df.groupby('pipeline')['precision_at_k'].mean().idxmax()
+                best_precision = k5_df.groupby('pipeline')['precision_at_k'].mean().max()
+                insights.append(f"<li><strong>Best Performing Pipeline:</strong> {self.METHOD_NAMES.get(best_pipeline, best_pipeline)} with average Precision@5 of {best_precision:.3f}</li>")
+        # Latency analysis
+        if 'latency' in df.columns:
+            avg_latency = df['latency'].mean()
+            if avg_latency < 0.5:
+                insights.append("<li><strong>Latency:</strong> System response time is excellent (&lt;0.5s)</li>")
+            elif avg_latency < 1.0:
+                insights.append("<li><strong>Latency:</strong> System response time is good (&lt;1.0s)</li>")
+            else:
+                insights.append(f"<li><strong>Latency:</strong> System response time may need optimization (avg: {avg_latency:.2f}s)</li>")
+        return f"<ul>{''.join(insights)}</ul>" if insights else "<p>No specific insights available.</p>"

core/reranker.py ADDED Viewed

	@@ -0,0 +1,183 @@

+"""
+Reranking Module for RAG Retrieval
+This module provides reranking functionality to reorder retrieved documents
+based on their relevance to the query using cross-encoder models or semantic similarity.
+"""
+import logging
+from typing import List, Dict, Any, Optional
+import numpy as np
+_logger = logging.getLogger("rag_reranker")
+# Try to import optional dependencies
+try:
+    from sentence_transformers import CrossEncoder
+    CROSSENCODER_AVAILABLE = True
+except ImportError:
+    CROSSENCODER_AVAILABLE = False
+    _logger.debug("sentence-transformers CrossEncoder not available")
+try:
+    from sentence_transformers import SentenceTransformer
+    SENTENCETRANSFORMERS_AVAILABLE = True
+except ImportError:
+    SENTENCETRANSFORMERS_AVAILABLE = False
+    _logger.debug("SentenceTransformers not available")
+try:
+    from sklearn.metrics.pairwise import cosine_similarity
+    SKLEARN_AVAILABLE = True
+except ImportError:
+    SKLEARN_AVAILABLE = False
+    _logger.debug("scikit-learn not available")
+class Reranker:
+    """
+    Reranker for retrieved documents.
+    Supports multiple reranking strategies:
+    - Cross-encoder models (best quality)
+    - Semantic similarity (fallback)
+    - Heuristic scoring (last resort)
+    """
+    def __init__(self, model_name: Optional[str] = None):
+        """
+        Initialize reranker.
+        Args:
+            model_name: Cross-encoder model name (default: cross-encoder/ms-marco-MiniLM-L-6-v2)
+        """
+        self.model_name = model_name or "cross-encoder/ms-marco-MiniLM-L-6-v2"
+        self._cross_encoder = None
+        self._embedding_model = None
+    def _initialize_cross_encoder(self):
+        """Initialize cross-encoder model."""
+        if not CROSSENCODER_AVAILABLE:
+            return None
+        if self._cross_encoder is None:
+            try:
+                self._cross_encoder = CrossEncoder(self.model_name)
+                _logger.info(f"Initialized CrossEncoder: {self.model_name}")
+            except Exception as e:
+                _logger.warning(f"Failed to initialize CrossEncoder: {e}")
+                return None
+        return self._cross_encoder
+    def _initialize_embedding_model(self):
+        """Initialize embedding model for semantic similarity."""
+        if not SENTENCETRANSFORMERS_AVAILABLE:
+            return None
+        if self._embedding_model is None:
+            try:
+                # Use a lightweight model for reranking
+                self._embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+            except Exception as e:
+                _logger.warning(f"Failed to initialize embedding model: {e}")
+                return None
+        return self._embedding_model
+    def rerank(self, query: str, results: List[Dict[str, Any]], top_k: Optional[int] = None) -> List[Dict[str, Any]]:
+        """
+        Rerank retrieved documents based on query relevance.
+        Args:
+            query: Search query
+            results: List of retrieved documents with content and metadata
+            top_k: Number of top results to return (None for all)
+        Returns:
+            Reranked list of documents
+        """
+        if not results:
+            return []
+        # Try cross-encoder reranking first
+        reranked = self._rerank_with_cross_encoder(query, results)
+        if reranked:
+            if top_k:
+                return reranked[:top_k]
+            return reranked
+        # Fallback to semantic similarity
+        reranked = self._rerank_with_similarity(query, results)
+        if reranked:
+            if top_k:
+                return reranked[:top_k]
+            return reranked
+        # Last resort: return original order
+        if top_k:
+            return results[:top_k]
+        return results
+    def _rerank_with_cross_encoder(self, query: str, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Rerank using cross-encoder model."""
+        model = self._initialize_cross_encoder()
+        if not model:
+            return []
+        try:
+            # Prepare query-document pairs
+            pairs = [(query, result['content']) for result in results]
+            # Get scores from cross-encoder
+            scores = model.predict(pairs)
+            # Add rerank scores to results
+            for i, result in enumerate(results):
+                result['rerank_score'] = float(scores[i])
+            # Sort by rerank score (descending)
+            reranked = sorted(results, key=lambda x: x.get('rerank_score', 0.0), reverse=True)
+            return reranked
+        except Exception as e:
+            _logger.warning(f"Cross-encoder reranking failed: {e}")
+            return []
+    def _rerank_with_similarity(self, query: str, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Rerank using semantic similarity."""
+        model = self._initialize_embedding_model()
+        if not model:
+            return []
+        try:
+            # Encode query
+            query_embedding = model.encode(query, convert_to_numpy=True)
+            # Encode all documents
+            documents = [result['content'] for result in results]
+            doc_embeddings = model.encode(documents, convert_to_numpy=True)
+            # Calculate cosine similarities
+            if SKLEARN_AVAILABLE:
+                similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
+            else:
+                # Manual cosine similarity
+                similarities = np.array([
+                    np.dot(query_embedding, doc_emb) /
+                    (np.linalg.norm(query_embedding) * np.linalg.norm(doc_emb))
+                    for doc_emb in doc_embeddings
+                ])
+            # Add similarity scores to results
+            for i, result in enumerate(results):
+                result['rerank_score'] = float(similarities[i])
+            # Sort by similarity score (descending)
+            reranked = sorted(results, key=lambda x: x.get('rerank_score', 0.0), reverse=True)
+            return reranked
+        except Exception as e:
+            _logger.warning(f"Similarity reranking failed: {e}")
+            return []

core/retrieval.py ADDED Viewed

	@@ -0,0 +1,507 @@

+import time
+from typing import List, Dict, Any, Optional, Tuple, TYPE_CHECKING
+from dataclasses import dataclass
+from .index import VectorStore
+if TYPE_CHECKING:
+    from .reranker import Reranker
+@dataclass
+class RetrievalResult:
+    """Result from retrieval pipeline"""
+    content: str
+    sources: List[Dict[str, Any]]
+    latency: float
+    metadata: Dict[str, Any]
+class BaseRAG:
+    """Standard RAG pipeline without hierarchical filtering"""
+    def __init__(self, vector_store: VectorStore, collection_name: str = "documents"):
+        self.vector_store = vector_store
+        self.collection_name = collection_name
+    def retrieve(self, query: str, k: int = 5) -> RetrievalResult:
+        """Retrieve documents using standard vector similarity"""
+        start_time = time.time()
+        results = self.vector_store.search(
+            collection_name=self.collection_name,
+            query=query,
+            k=k
+        )
+        latency = time.time() - start_time
+        return RetrievalResult(
+            content=self._format_results(results),
+            sources=results,
+            latency=latency,
+            metadata={"strategy": "base_rag", "k": k}
+        )
+    def _format_results(self, results: List[Dict[str, Any]]) -> str:
+        """Format retrieval results into text"""
+        formatted = []
+        for i, result in enumerate(results, 1):
+            formatted.append(f"[{i}] {result['content'][:200]}... (Score: {result['score']:.3f})")
+        return "\n\n".join(formatted)
+class HierarchicalRAG:
+    """Hierarchical RAG pipeline with metadata filtering"""
+    def __init__(self, vector_store: VectorStore, collection_name: str = "documents"):
+        self.vector_store = vector_store
+        self.collection_name = collection_name
+    def retrieve(self, query: str, k: int = 5,
+                level1: Optional[str] = None,
+                level2: Optional[str] = None,
+                level3: Optional[str] = None,
+                doc_type: Optional[str] = None) -> RetrievalResult:
+        """Retrieve documents with hierarchical filtering"""
+        start_time = time.time()
+        # Build metadata filters
+        filters = self._build_filters(level1, level2, level3, doc_type)
+        results = self.vector_store.search(
+            collection_name=self.collection_name,
+            query=query,
+            filters=filters,
+            k=k
+        )
+        latency = time.time() - start_time
+        return RetrievalResult(
+            content=self._format_results(results),
+            sources=results,
+            latency=latency,
+            metadata={
+                "strategy": "hier_rag",
+                "k": k,
+                "filters": filters
+            }
+        )
+    def _build_filters(self, level1: Optional[str], level2: Optional[str],
+                      level3: Optional[str], doc_type: Optional[str]) -> Dict[str, Any]:
+        """Build metadata filters for hierarchical search"""
+        filters = {}
+        if level1:
+            filters["level1"] = level1
+        if level2:
+            filters["level2"] = level2
+        if level3:
+            filters["level3"] = level3
+        if doc_type:
+            filters["doc_type"] = doc_type
+        return filters if filters else None
+    def _format_results(self, results: List[Dict[str, Any]]) -> str:
+        """Format retrieval results into text"""
+        formatted = []
+        for i, result in enumerate(results, 1):
+            metadata = result['metadata']
+            formatted.append(
+                f"[{i}] {result['content'][:200]}...\n"
+                f"    Domain: {metadata.get('level1', 'N/A')} > "
+                f"{metadata.get('level2', 'N/A')} > "
+                f"{metadata.get('level3', 'N/A')}\n"
+                f"    Score: {result['score']:.3f}"
+            )
+        return "\n\n".join(formatted)
+class TagFilterRAG:
+    """RAG pipeline with flat tag-based filtering"""
+    def __init__(self, vector_store: VectorStore, collection_name: str = "documents"):
+        self.vector_store = vector_store
+        self.collection_name = collection_name
+    def retrieve(self, query: str, k: int = 5,
+                 tags: Optional[List[str]] = None,
+                 tag_operator: str = "OR") -> RetrievalResult:
+        """
+        Retrieve documents using tag filtering.
+        Args:
+            query: Search query
+            k: Number of results
+            tags: List of tags to filter by
+            tag_operator: Tag filter operator - "AND", "OR", or "NOT"
+        """
+        start_time = time.time()
+        # Build tag filters
+        tag_filters = None
+        if tags and len(tags) > 0:
+            tag_filters = {
+                "tags": tags,
+                "operator": tag_operator.upper()
+            }
+        # Search with tag filters
+        results = self.vector_store.search(
+            collection_name=self.collection_name,
+            query=query,
+            filters=None,
+            tag_filters=tag_filters,
+            k=k
+        )
+        latency = time.time() - start_time
+        return RetrievalResult(
+            content=self._format_results(results),
+            sources=results,
+            latency=latency,
+            metadata={
+                "strategy": "tag_filter_rag",
+                "k": k,
+                "tags": tags,
+                "tag_operator": tag_operator
+            }
+        )
+    def _format_results(self, results: List[Dict[str, Any]]) -> str:
+        """Format retrieval results into text"""
+        formatted = []
+        for i, result in enumerate(results, 1):
+            metadata = result.get('metadata', {})
+            tags = metadata.get('tags', [])
+            tags_str = ", ".join(tags[:5]) if isinstance(tags, list) else str(tags)
+            if isinstance(tags, list) and len(tags) > 5:
+                tags_str += "..."
+            formatted.append(
+                f"[{i}] {result['content'][:200]}...\n"
+                f"    Tags: {tags_str}\n"
+                f"    Score: {result['score']:.3f}"
+            )
+        return "\n\n".join(formatted)
+class HybridRAG:
+    """Hybrid RAG pipeline combining vector search and tag search"""
+    def __init__(self, vector_store: VectorStore, collection_name: str = "documents"):
+        self.vector_store = vector_store
+        self.collection_name = collection_name
+        self.base_rag = BaseRAG(vector_store, collection_name)
+        self.tag_filter_rag = TagFilterRAG(vector_store, collection_name)
+    def retrieve(self, query: str, k: int = 5,
+                 tags: Optional[List[str]] = None,
+                 tag_operator: str = "OR",
+                 vector_weight: float = 0.7,
+                 tag_weight: float = 0.3) -> RetrievalResult:
+        """
+        Retrieve documents using hybrid approach: weighted combination of vector and tag search.
+        Args:
+            query: Search query text
+            k: Number of results to return
+            tags: List of tags for tag search (if None, extracts from query)
+            tag_operator: Tag filter operator - "AND", "OR", or "NOT"
+            vector_weight: Weight for vector similarity score (0.0-1.0)
+            tag_weight: Weight for tag matching score (0.0-1.0). Should sum to 1.0 with vector_weight
+        Returns:
+            RetrievalResult with hybrid-scored documents
+        """
+        start_time = time.time()
+        # Normalize weights
+        total_weight = vector_weight + tag_weight
+        if total_weight > 0:
+            vector_weight = vector_weight / total_weight
+            tag_weight = tag_weight / total_weight
+        else:
+            vector_weight = 0.5
+            tag_weight = 0.5
+        # Extract tags from query if not provided
+        if tags is None:
+            tags = self._extract_tags_from_query(query)
+        # Fetch more results to have enough for merging
+        fetch_k = k * 3
+        # Get vector search results
+        vector_results = self.base_rag.vector_store.search(
+            collection_name=self.collection_name,
+            query=query,
+            filters=None,
+            k=fetch_k
+        )
+        # Get tag search results (only if tags are provided)
+        tag_results = []
+        if tags and len(tags) > 0:
+            tag_filters = {
+                "tags": tags,
+                "operator": tag_operator.upper()
+            }
+            tag_results = self.tag_filter_rag.vector_store.search(
+                collection_name=self.collection_name,
+                query=query,
+                filters=None,
+                tag_filters=tag_filters,
+                k=fetch_k
+            )
+        # Merge and combine scores
+        hybrid_results = self._combine_results(
+            vector_results, tag_results, vector_weight, tag_weight
+        )
+        # Return top k
+        hybrid_results = hybrid_results[:k]
+        latency = time.time() - start_time
+        return RetrievalResult(
+            content=self._format_results(hybrid_results),
+            sources=hybrid_results,
+            latency=latency,
+            metadata={
+                "strategy": "hybrid_rag",
+                "k": k,
+                "vector_weight": vector_weight,
+                "tag_weight": tag_weight,
+                "tags": tags,
+                "tag_operator": tag_operator
+            }
+        )
+    def _extract_tags_from_query(self, query: str) -> List[str]:
+        """Extract potential tags from query text (simple keyword extraction)."""
+        # Simple approach: split query into words, filter stopwords
+        stopwords = {
+            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
+            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
+            'would', 'should', 'could', 'what', 'how', 'why', 'when', 'where', 'who'
+        }
+        words = query.lower().split()
+        tags = [w.strip('.,!?;:()[]{}"\'') for w in words if len(w) > 3 and w not in stopwords]
+        return tags[:5]  # Return top 5 words as tags
+    def _combine_results(self, vector_results: List[Dict[str, Any]],
+                        tag_results: List[Dict[str, Any]],
+                        vector_weight: float, tag_weight: float) -> List[Dict[str, Any]]:
+        """Combine vector and tag search results with weighted scoring."""
+        # Create combined dict keyed by chunk_id
+        combined_dict = {}
+        # Add vector results
+        for result in vector_results:
+            chunk_id = result.get('id', result.get('chunk_id', str(id(result))))
+            combined_dict[chunk_id] = {
+                **result,
+                'vector_score': result.get('score', 0.0),
+                'tag_score': 0.0
+            }
+        # Add tag results and combine scores
+        for result in tag_results:
+            chunk_id = result.get('id', result.get('chunk_id', str(id(result))))
+            tag_score = result.get('score', 0.0)
+            if chunk_id in combined_dict:
+                # Combine with existing vector score
+                combined_dict[chunk_id]['tag_score'] = tag_score
+            else:
+                # Add new result with only tag score
+                combined_dict[chunk_id] = {
+                    **result,
+                    'vector_score': 0.0,
+                    'tag_score': tag_score
+                }
+        # Normalize scores and compute hybrid scores
+        vector_scores = [r.get('vector_score', 0.0) for r in combined_dict.values()]
+        tag_scores = [r.get('tag_score', 0.0) for r in combined_dict.values()]
+        max_vector_score = max(vector_scores) if vector_scores else 1.0
+        min_vector_score = min(vector_scores) if vector_scores else 0.0
+        vector_range = max_vector_score - min_vector_score if max_vector_score > min_vector_score else 1.0
+        for chunk_id, result in combined_dict.items():
+            vector_score = result.get('vector_score', 0.0)
+            tag_score = result.get('tag_score', 0.0)
+            # Normalize vector score to 0-1 range
+            if vector_range > 0:
+                vector_score = (vector_score - min_vector_score) / vector_range
+            else:
+                vector_score = 1.0 if vector_score > 0 else 0.0
+            # Ensure tag score is in 0-1 range
+            tag_score = max(0.0, min(1.0, tag_score))
+            # Weighted combination
+            hybrid_score = (vector_weight * vector_score) + (tag_weight * tag_score)
+            result['score'] = hybrid_score
+            result['hybrid_score'] = hybrid_score
+            result['vector_score'] = vector_score
+            result['tag_score'] = tag_score
+        # Sort by hybrid score (descending)
+        sorted_results = sorted(
+            combined_dict.values(),
+            key=lambda x: x.get('hybrid_score', x.get('score', 0.0)),
+            reverse=True
+        )
+        return sorted_results
+    def _format_results(self, results: List[Dict[str, Any]]) -> str:
+        """Format hybrid retrieval results into text"""
+        formatted = []
+        for i, result in enumerate(results, 1):
+            metadata = result.get('metadata', {})
+            tags = metadata.get('tags', [])
+            tags_str = ", ".join(tags[:5]) if isinstance(tags, list) else str(tags)
+            if isinstance(tags, list) and len(tags) > 5:
+                tags_str += "..."
+            vector_score = result.get('vector_score', 0.0)
+            tag_score = result.get('tag_score', 0.0)
+            hybrid_score = result.get('hybrid_score', result.get('score', 0.0))
+            formatted.append(
+                f"[{i}] {result['content'][:200]}...\n"
+                f"    Tags: {tags_str}\n"
+                f"    Hybrid Score: {hybrid_score:.3f} "
+                f"(Vector: {vector_score:.3f}, Tag: {tag_score:.3f})"
+            )
+        return "\n\n".join(formatted)
+class HybridRerankRAG:
+    """Hybrid RAG pipeline with reranking applied after hybrid retrieval"""
+    def __init__(self, vector_store: VectorStore, collection_name: str = "documents",
+                 reranker: Optional[Any] = None):
+        self.vector_store = vector_store
+        self.collection_name = collection_name
+        self.hybrid_rag = HybridRAG(vector_store, collection_name)
+        # Lazy import of Reranker
+        if reranker is None:
+            from .reranker import Reranker
+            self.reranker = Reranker()
+        else:
+            self.reranker = reranker
+    def retrieve(self, query: str, k: int = 5,
+                 tags: Optional[List[str]] = None,
+                 tag_operator: str = "OR",
+                 vector_weight: float = 0.7,
+                 tag_weight: float = 0.3,
+                 rerank_top_k: Optional[int] = None) -> RetrievalResult:
+        """
+        Retrieve documents using hybrid approach with reranking.
+        Args:
+            query: Search query text
+            k: Number of results to return
+            tags: List of tags for tag search (if None, extracts from query)
+            tag_operator: Tag filter operator - "AND", "OR", or "NOT"
+            vector_weight: Weight for vector similarity score (0.0-1.0)
+            tag_weight: Weight for tag matching score (0.0-1.0)
+            rerank_top_k: Number of results to rerank (if None, uses k*2)
+        Returns:
+            RetrievalResult with reranked documents
+        """
+        start_time = time.time()
+        # Fetch more results for reranking (fetch 2k to rerank, return top k)
+        fetch_k = rerank_top_k or (k * 2)
+        # Get hybrid retrieval results
+        hybrid_result = self.hybrid_rag.retrieve(
+            query=query,
+            k=fetch_k,
+            tags=tags,
+            tag_operator=tag_operator,
+            vector_weight=vector_weight,
+            tag_weight=tag_weight
+        )
+        # Rerank the results
+        reranked_results = self.reranker.rerank(
+            query=query,
+            results=hybrid_result.sources,
+            top_k=k
+        )
+        latency = time.time() - start_time
+        return RetrievalResult(
+            content=self._format_results(reranked_results),
+            sources=reranked_results,
+            latency=latency,
+            metadata={
+                "strategy": "hybrid_rerank_rag",
+                "k": k,
+                "vector_weight": vector_weight,
+                "tag_weight": tag_weight,
+                "tags": tags,
+                "tag_operator": tag_operator,
+                "rerank_top_k": fetch_k
+            }
+        )
+    def _format_results(self, results: List[Dict[str, Any]]) -> str:
+        """Format reranked retrieval results into text"""
+        formatted = []
+        for i, result in enumerate(results, 1):
+            metadata = result.get('metadata', {})
+            tags = metadata.get('tags', [])
+            tags_str = ", ".join(tags[:5]) if isinstance(tags, list) else str(tags)
+            if isinstance(tags, list) and len(tags) > 5:
+                tags_str += "..."
+            rerank_score = result.get('rerank_score', result.get('score', 0.0))
+            hybrid_score = result.get('hybrid_score', result.get('score', 0.0))
+            vector_score = result.get('vector_score', 0.0)
+            tag_score = result.get('tag_score', 0.0)
+            formatted.append(
+                f"[{i}] {result['content'][:200]}...\n"
+                f"    Tags: {tags_str}\n"
+                f"    Rerank Score: {rerank_score:.3f} "
+                f"(Hybrid: {hybrid_score:.3f}, Vector: {vector_score:.3f}, Tag: {tag_score:.3f})"
+            )
+        return "\n\n".join(formatted)
+class RAGManager:
+    """Manager for both RAG pipelines"""
+    def __init__(self, persist_directory: str = "/data/chroma"):
+        self.vector_store = VectorStore(persist_directory)
+        self.base_rag = BaseRAG(self.vector_store)
+        self.hier_rag = HierarchicalRAG(self.vector_store)
+        self.tag_filter_rag = TagFilterRAG(self.vector_store)
+        self.hybrid_rag = HybridRAG(self.vector_store)
+        self.hybrid_rerank_rag = HybridRerankRAG(self.vector_store)
+    def compare_retrieval(self, query: str, k: int = 5,
+                         level1: Optional[str] = None,
+                         level2: Optional[str] = None,
+                         level3: Optional[str] = None,
+                         doc_type: Optional[str] = None) -> Tuple[RetrievalResult, RetrievalResult]:
+        """Compare Base-RAG vs Hier-RAG"""
+        base_result = self.base_rag.retrieve(query, k)
+        hier_result = self.hier_rag.retrieve(query, k, level1, level2, level3, doc_type)
+        return base_result, hier_result

core/session_manager.py ADDED Viewed

	@@ -0,0 +1,353 @@

+"""
+Session Management Module
+This module provides session isolation for multiple concurrent users,
+ensuring each user has isolated data and retrieval contexts.
+"""
+import uuid
+import time
+import logging
+import threading
+from typing import Dict, Optional, Any
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta
+from collections import defaultdict
+import os
+_logger = logging.getLogger("rag_session_manager")
+@dataclass
+class UserSession:
+    """Represents a user session with isolated data"""
+    session_id: str
+    user_id: str = ""
+    collection_name: str = ""
+    created_at: datetime = field(default_factory=datetime.now)
+    last_accessed: datetime = field(default_factory=datetime.now)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    def update_access(self):
+        """Update last accessed time"""
+        self.last_accessed = datetime.now()
+    def is_expired(self, timeout_seconds: int = 3600) -> bool:
+        """Check if session has expired"""
+        elapsed = (datetime.now() - self.last_accessed).total_seconds()
+        return elapsed > timeout_seconds
+class SessionManager:
+    """
+    Manages user sessions with isolated data contexts.
+    Each session gets:
+    - Unique collection name in ChromaDB
+    - Isolated RAG manager instance
+    - Session-specific data storage
+    """
+    def __init__(self,
+                 base_persist_dir: str = "./chroma_data",
+                 session_timeout: int = 3600,
+                 auto_cleanup: bool = True,
+                 cleanup_interval: int = 300):
+        """
+        Initialize session manager.
+        Args:
+            base_persist_dir: Base directory for ChromaDB persistence
+            session_timeout: Session timeout in seconds (default: 1 hour)
+            auto_cleanup: Enable automatic session cleanup
+            cleanup_interval: Cleanup interval in seconds (default: 5 minutes)
+        """
+        self.base_persist_dir = base_persist_dir
+        self.session_timeout = session_timeout
+        self.auto_cleanup = auto_cleanup
+        self.cleanup_interval = cleanup_interval
+        # Session storage: session_id -> UserSession
+        self.sessions: Dict[str, UserSession] = {}
+        # User to session mapping: user_id -> [session_ids]
+        self.user_sessions: Dict[str, list] = defaultdict(list)
+        # Thread lock for thread-safe operations
+        self._lock = threading.Lock()
+        # Cleanup thread (if enabled)
+        self._cleanup_thread = None
+        self._stop_cleanup = threading.Event()
+        if self.auto_cleanup:
+            self._start_cleanup_thread()
+    def _start_cleanup_thread(self):
+        """Start background thread for session cleanup"""
+        def cleanup_loop():
+            while not self._stop_cleanup.is_set():
+                self._stop_cleanup.wait(self.cleanup_interval)
+                if not self._stop_cleanup.is_set():
+                    self.cleanup_expired_sessions()
+        self._cleanup_thread = threading.Thread(target=cleanup_loop, daemon=True)
+        self._cleanup_thread.start()
+        _logger.info("Session cleanup thread started")
+    def create_session(self, user_id: Optional[str] = None) -> UserSession:
+        """
+        Create a new user session.
+        Args:
+            user_id: Optional user identifier (e.g., from Gradio)
+        Returns:
+            UserSession object
+        """
+        with self._lock:
+            # Generate session ID
+            session_id = str(uuid.uuid4())
+            # Generate collection name
+            collection_name = f"session_{session_id[:8]}"
+            # Create session
+            session = UserSession(
+                session_id=session_id,
+                user_id=user_id or f"user_{session_id[:8]}",
+                collection_name=collection_name
+            )
+            # Store session
+            self.sessions[session_id] = session
+            # Map user to session
+            self.user_sessions[session.user_id].append(session_id)
+            _logger.info(f"Created session {session_id} for user {session.user_id}")
+            return session
+    def get_session(self, session_id: str) -> Optional[UserSession]:
+        """
+        Get session by ID.
+        Args:
+            session_id: Session identifier
+        Returns:
+            UserSession or None if not found/expired
+        """
+        with self._lock:
+            session = self.sessions.get(session_id)
+            if session is None:
+                return None
+            # Check if expired
+            if session.is_expired(self.session_timeout):
+                _logger.info(f"Session {session_id} expired")
+                self._remove_session(session_id)
+                return None
+            # Update access time
+            session.update_access()
+            return session
+    def get_or_create_session(self, session_id: Optional[str] = None,
+                              user_id: Optional[str] = None) -> UserSession:
+        """
+        Get existing session or create new one.
+        If session_id is provided but session doesn't exist in memory,
+        checks if ChromaDB collection exists and restores the session.
+        Args:
+            session_id: Optional existing session ID
+            user_id: Optional user identifier
+        Returns:
+            UserSession object
+        """
+        if session_id:
+            session = self.get_session(session_id)
+            if session:
+                return session
+            # Session not in memory - check if ChromaDB collection exists
+            # This handles server restarts where sessions are lost from memory
+            # but ChromaDB collections persist on disk
+            collection_name = f"session_{session_id[:8]}"
+            if self._collection_exists(collection_name):
+                # Restore session from existing ChromaDB collection
+                _logger.info(f"Restoring session {session_id} from existing ChromaDB collection {collection_name}")
+                session = UserSession(
+                    session_id=session_id,
+                    user_id=user_id or f"user_{session_id[:8]}",
+                    collection_name=collection_name
+                )
+                with self._lock:
+                    self.sessions[session_id] = session
+                    self.user_sessions[session.user_id].append(session_id)
+                return session
+        return self.create_session(user_id)
+    def _collection_exists(self, collection_name: str) -> bool:
+        """
+        Check if a ChromaDB collection exists.
+        Since ChromaDB collections are namespaced by embedding provider/dimension,
+        we check if any collection starts with the base collection name.
+        Args:
+            collection_name: Base collection name to check (e.g., "session_abc12345")
+        Returns:
+            True if collection exists (with any suffix), False otherwise
+        """
+        try:
+            import chromadb
+            client = chromadb.PersistentClient(path=self.base_persist_dir)
+            collections = client.list_collections()
+            collection_names = [col.name for col in collections]
+            # Check if any collection name starts with the base collection name
+            # because ChromaDB adds suffixes like "__st_384" or "__oai_1536"
+            return any(name.startswith(collection_name + "__") for name in collection_names)
+        except Exception as e:
+            _logger.warning(f"Failed to check if collection {collection_name} exists: {e}")
+            return False
+    def _remove_session(self, session_id: str):
+        """Remove session (internal, assumes lock is held)"""
+        session = self.sessions.get(session_id)
+        if session:
+            # Remove from sessions
+            del self.sessions[session_id]
+            # Remove from user mapping
+            if session.user_id in self.user_sessions:
+                try:
+                    self.user_sessions[session.user_id].remove(session_id)
+                except ValueError:
+                    pass
+                # Clean up empty user entries
+                if not self.user_sessions[session.user_id]:
+                    del self.user_sessions[session.user_id]
+            _logger.info(f"Removed session {session_id}")
+    def remove_session(self, session_id: str):
+        """Remove session (public, thread-safe)"""
+        with self._lock:
+            self._remove_session(session_id)
+    def cleanup_expired_sessions(self) -> int:
+        """
+        Clean up expired sessions.
+        Returns:
+            Number of sessions cleaned up
+        """
+        with self._lock:
+            expired = [
+                session_id for session_id, session in self.sessions.items()
+                if session.is_expired(self.session_timeout)
+            ]
+            for session_id in expired:
+                self._remove_session(session_id)
+            if expired:
+                _logger.info(f"Cleaned up {len(expired)} expired sessions")
+            return len(expired)
+    def get_user_sessions(self, user_id: str) -> list:
+        """
+        Get all sessions for a user.
+        Args:
+            user_id: User identifier
+        Returns:
+            List of UserSession objects
+        """
+        with self._lock:
+            session_ids = self.user_sessions.get(user_id, [])
+            sessions = []
+            for session_id in session_ids[:]:  # Copy list
+                session = self.sessions.get(session_id)
+                if session:
+                    if session.is_expired(self.session_timeout):
+                        self._remove_session(session_id)
+                    else:
+                        sessions.append(session)
+            return sessions
+    def get_session_stats(self) -> Dict[str, Any]:
+        """Get statistics about active sessions"""
+        with self._lock:
+            active_sessions = [
+                s for s in self.sessions.values()
+                if not s.is_expired(self.session_timeout)
+            ]
+            return {
+                'total_sessions': len(self.sessions),
+                'active_sessions': len(active_sessions),
+                'unique_users': len(self.user_sessions),
+                'expired_sessions': len(self.sessions) - len(active_sessions)
+            }
+    def clear_all_sessions(self):
+        """Clear all sessions (for testing/cleanup)"""
+        with self._lock:
+            self.sessions.clear()
+            self.user_sessions.clear()
+            _logger.info("Cleared all sessions")
+    def shutdown(self):
+        """Shutdown session manager and cleanup thread"""
+        if self._cleanup_thread:
+            self._stop_cleanup.set()
+            self._cleanup_thread.join(timeout=5)
+            _logger.info("Session cleanup thread stopped")
+    def __del__(self):
+        """Cleanup on destruction"""
+        self.shutdown()
+# Global session manager instance
+_global_session_manager: Optional[SessionManager] = None
+def get_session_manager(base_persist_dir: str = "./chroma_data",
+                        session_timeout: int = 3600,
+                        auto_cleanup: bool = True) -> SessionManager:
+    """
+    Get or create global session manager instance.
+    Args:
+        base_persist_dir: Base directory for ChromaDB persistence
+        session_timeout: Session timeout in seconds
+        auto_cleanup: Enable automatic cleanup
+    Returns:
+        SessionManager instance
+    """
+    global _global_session_manager
+    if _global_session_manager is None:
+        _global_session_manager = SessionManager(
+            base_persist_dir=base_persist_dir,
+            session_timeout=session_timeout,
+            auto_cleanup=auto_cleanup
+        )
+    return _global_session_manager

core/session_rag.py ADDED Viewed

	@@ -0,0 +1,100 @@

+"""
+Session-Aware RAG Manager
+Helper module to create session-specific RAG managers and handle
+session isolation for multiple concurrent users.
+"""
+import logging
+from typing import Optional, Dict, Any
+from .retrieval import RAGManager, BaseRAG, TagFilterRAG, HybridRAG, HybridRerankRAG
+from .session_manager import SessionManager, UserSession
+_logger = logging.getLogger("rag_session_rag")
+class SessionAwareRAGManager:
+    """
+    Session-aware wrapper for RAGManager that provides per-session isolation.
+    This class creates session-specific RAG pipelines by using different
+    collection names per session, ensuring data isolation.
+    """
+    def __init__(self, base_rag_manager: RAGManager, session_manager: SessionManager):
+        """
+        Initialize session-aware RAG manager.
+        Args:
+            base_rag_manager: Base RAGManager instance (shared vector store)
+            session_manager: SessionManager instance for session handling
+        """
+        self.base_rag_manager = base_rag_manager
+        self.session_manager = session_manager
+        # Cache of session-specific RAG managers
+        self._session_managers: Dict[str, RAGManager] = {}
+    def get_session_manager(self, session: Optional[UserSession]) -> RAGManager:
+        """
+        Get RAG manager for a session.
+        Args:
+            session: UserSession object or None for default
+        Returns:
+            RAGManager instance for the session
+        """
+        if session is None:
+            # Use default collection
+            return self.base_rag_manager
+        # Get or create session-specific manager
+        collection_name = session.collection_name
+        if collection_name not in self._session_managers:
+            # Create new RAGManager with session-specific collection
+            session_rag = RAGManager(
+                persist_directory=self.base_rag_manager.vector_store.persist_directory
+            )
+            # Update collection names in all pipelines
+            session_rag.base_rag.collection_name = collection_name
+            session_rag.tag_filter_rag.collection_name = collection_name
+            session_rag.hybrid_rag.collection_name = collection_name
+            session_rag.hybrid_rerank_rag.collection_name = collection_name
+            session_rag.hybrid_rag.base_rag.collection_name = collection_name
+            session_rag.hybrid_rag.tag_filter_rag.collection_name = collection_name
+            session_rag.hybrid_rerank_rag.hybrid_rag.collection_name = collection_name
+            session_rag.hybrid_rerank_rag.hybrid_rag.base_rag.collection_name = collection_name
+            session_rag.hybrid_rerank_rag.hybrid_rag.tag_filter_rag.collection_name = collection_name
+            self._session_managers[collection_name] = session_rag
+            _logger.debug(f"Created session RAG manager for collection: {collection_name}")
+        return self._session_managers[collection_name]
+    def get_rag(self, session_id: str) -> RAGManager:
+        """
+        Get RAG manager for a session ID.
+        Args:
+            session_id: Session identifier
+        Returns:
+            RAGManager instance for the session
+        """
+        session = self.session_manager.get_session(session_id)
+        if session is None:
+            # If session doesn't exist, use default collection
+            _logger.warning(f"Session {session_id} not found, using default collection")
+            return self.base_rag_manager
+        return self.get_session_manager(session)
+    def cleanup_session(self, session_id: str):
+        """Clean up session-specific RAG manager"""
+        # Find collection name for session
+        session = self.session_manager.get_session(session_id)
+        if session:
+            collection_name = session.collection_name
+            if collection_name in self._session_managers:
+                del self._session_managers[collection_name]
+                _logger.debug(f"Cleaned up session RAG manager for collection: {collection_name}")

core/tag_generator.py ADDED Viewed

	@@ -0,0 +1,464 @@

+"""
+Tag Generation Module for Flat, Non-Hierarchical Tagging
+This module provides automatic tag generation using multiple methods:
+1. Keyphrase extraction (YAKE, KeyBERT)
+2. Noun phrase analysis (spaCy, Janome for Japanese)
+3. OpenAI-based generation (optional)
+Supports both English and Japanese text.
+"""
+import re
+import logging
+from typing import List, Dict, Any, Optional, Set
+from collections import Counter
+import os
+_logger = logging.getLogger("rag_tag_generator")
+# Try to import optional dependencies
+try:
+    import yake
+    YAKE_AVAILABLE = True
+except ImportError:
+    YAKE_AVAILABLE = False
+    _logger.debug("YAKE not available, will use fallback methods")
+try:
+    from keybert import KeyBERT
+    KEYBERT_AVAILABLE = True
+except ImportError:
+    KEYBERT_AVAILABLE = False
+    _logger.debug("KeyBERT not available, will use fallback methods")
+try:
+    import spacy
+    SPACY_AVAILABLE = True
+except ImportError:
+    SPACY_AVAILABLE = False
+    _logger.debug("spaCy not available for noun phrase extraction")
+try:
+    from janome.tokenizer import Tokenizer
+    JANOME_AVAILABLE = True
+except ImportError:
+    JANOME_AVAILABLE = False
+    _logger.debug("Janome not available for Japanese tokenization")
+# OpenAI for tag generation (optional, fallback)
+_OPENAI_ENABLED = False
+try:
+    from openai import OpenAI as _OpenAI
+    _OPENAI_ENABLED = True if os.getenv("OPENAI_API_KEY") else False
+except Exception:
+    _OPENAI_ENABLED = False
+class TagGenerator:
+    """
+    Automatic flat tag generation from text documents.
+    Supports multiple extraction methods:
+    - Keyphrase extraction (YAKE, KeyBERT)
+    - Noun phrase analysis (spaCy, Janome)
+    - OpenAI-based generation (optional)
+    """
+    def __init__(self,
+                 max_tags: int = 10,
+                 min_tag_length: int = 2,
+                 max_tag_length: int = 3,
+                 language: Optional[str] = None):
+        """
+        Initialize tag generator.
+        Args:
+            max_tags: Maximum number of tags to generate per chunk
+            min_tag_length: Minimum words in a tag phrase
+            max_tag_length: Maximum words in a tag phrase
+            language: Language code ('en', 'ja') or None for auto-detect
+        """
+        self.max_tags = max_tags
+        self.min_tag_length = min_tag_length
+        self.max_tag_length = max_tag_length
+        self.language = language
+        # Initialize models lazily
+        self._yake_extractor = None
+        self._keybert_model = None
+        self._spacy_model = None
+        self._janome_tokenizer = None
+        self._openai_client = None
+        # Common stopwords for filtering
+        self.stopwords_en = {
+            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
+            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
+            'would', 'should', 'could', 'may', 'might', 'must', 'can', 'this',
+            'that', 'these', 'those', 'it', 'its', 'they', 'them', 'their'
+        }
+        # Japanese stopwords (basic set)
+        self.stopwords_ja = {
+            'の', 'に', 'は', 'を', 'た', 'が', 'で', 'て', 'と', 'し', 'れ',
+            'さ', 'ある', 'いる', 'も', 'する', 'から', 'な', 'こと', 'として',
+            'い', 'や', 'れる', 'など', 'なっ', 'たら', 'なり', 'られる', 'など'
+        }
+    def _detect_language(self, text: str) -> str:
+        """Detect language from text."""
+        if self.language:
+            return self.language
+        # Simple heuristic: check for Japanese characters
+        if any('\u3040' <= ch <= '\u30ff' or '\u4e00' <= ch <= '\u9faf' for ch in text):
+            return 'ja'
+        return 'en'
+    def _initialize_yake(self, language: str):
+        """Initialize YAKE keyphrase extractor."""
+        if not YAKE_AVAILABLE:
+            return None
+        if self._yake_extractor is None:
+            try:
+                # YAKE language mapping
+                yake_lang = 'ja' if language == 'ja' else 'en'
+                self._yake_extractor = yake.KeywordExtractor(
+                    lan=yake_lang,
+                    n=self.max_tag_length,
+                    dedupLim=0.9,
+                    top=self.max_tags * 2,  # Extract more, then filter
+                    features=None
+                )
+            except Exception as e:
+                _logger.warning(f"Failed to initialize YAKE: {e}")
+                return None
+        return self._yake_extractor
+    def _initialize_keybert(self):
+        """Initialize KeyBERT model."""
+        if not KEYBERT_AVAILABLE:
+            return None
+        if self._keybert_model is None:
+            try:
+                self._keybert_model = KeyBERT()
+            except Exception as e:
+                _logger.warning(f"Failed to initialize KeyBERT: {e}")
+                return None
+        return self._keybert_model
+    def _initialize_spacy(self, language: str):
+        """Initialize spaCy model for noun phrase extraction."""
+        if not SPACY_AVAILABLE or language != 'en':
+            return None
+        if self._spacy_model is None:
+            try:
+                # Try to load English model
+                self._spacy_model = spacy.load("en_core_web_sm")
+            except OSError:
+                _logger.warning("spaCy English model not found. Install with: python -m spacy download en_core_web_sm")
+                return None
+            except Exception as e:
+                _logger.warning(f"Failed to initialize spaCy: {e}")
+                return None
+        return self._spacy_model
+    def _initialize_janome(self):
+        """Initialize Janome tokenizer for Japanese."""
+        if not JANOME_AVAILABLE:
+            return None
+        if self._janome_tokenizer is None:
+            try:
+                self._janome_tokenizer = Tokenizer()
+            except Exception as e:
+                _logger.warning(f"Failed to initialize Janome: {e}")
+                return None
+        return self._janome_tokenizer
+    def _initialize_openai(self):
+        """Initialize OpenAI client for tag generation."""
+        if not _OPENAI_ENABLED:
+            return None
+        if self._openai_client is None:
+            try:
+                self._openai_client = _OpenAI()
+            except Exception as e:
+                _logger.warning(f"Failed to initialize OpenAI: {e}")
+                return None
+        return self._openai_client
+    def _extract_with_yake(self, text: str, language: str) -> List[str]:
+        """Extract tags using YAKE."""
+        extractor = self._initialize_yake(language)
+        if not extractor:
+            return []
+        try:
+            keywords = extractor.extract_keywords(text)
+            tags = [kw[1] for kw in keywords[:self.max_tags * 2]]  # Extract more than needed
+            return self._filter_tags(tags, language)
+        except Exception as e:
+            _logger.warning(f"YAKE extraction failed: {e}")
+            return []
+    def _extract_with_keybert(self, text: str, language: str) -> List[str]:
+        """Extract tags using KeyBERT (English only)."""
+        if language != 'en':
+            return []
+        model = self._initialize_keybert()
+        if not model:
+            return []
+        try:
+            keywords = model.extract_keywords(
+                text,
+                keyphrase_ngram_range=(self.min_tag_length, self.max_tag_length),
+                top_n=self.max_tags * 2
+            )
+            tags = [kw[0] for kw in keywords]
+            return self._filter_tags(tags, language)
+        except Exception as e:
+            _logger.warning(f"KeyBERT extraction failed: {e}")
+            return []
+    def _extract_noun_phrases_spacy(self, text: str) -> List[str]:
+        """Extract noun phrases using spaCy (English only)."""
+        model = self._initialize_spacy('en')
+        if not model:
+            return []
+        try:
+            doc = model(text)
+            noun_phrases = []
+            for chunk in doc.noun_chunks:
+                if self.min_tag_length <= len(chunk.text.split()) <= self.max_tag_length:
+                    phrase = chunk.text.lower().strip()
+                    if phrase and phrase not in self.stopwords_en:
+                        noun_phrases.append(phrase)
+            return self._filter_tags(noun_phrases[:self.max_tags * 2], 'en')
+        except Exception as e:
+            _logger.warning(f"spaCy noun phrase extraction failed: {e}")
+            return []
+    def _extract_noun_phrases_janome(self, text: str) -> List[str]:
+        """Extract noun phrases using Janome (Japanese only)."""
+        tokenizer = self._initialize_janome()
+        if not tokenizer:
+            return []
+        try:
+            tokens = tokenizer.tokenize(text)
+            noun_phrases = []
+            current_phrase = []
+            for token in tokens:
+                # Extract nouns (名詞) and compound nouns
+                if token.part_of_speech.split(',')[0] == '名詞':
+                    current_phrase.append(token.surface)
+                else:
+                    if len(current_phrase) >= self.min_tag_length:
+                        phrase = ''.join(current_phrase)
+                        # Filter stopwords
+                        if phrase and not all(c in self.stopwords_ja for c in phrase):
+                            noun_phrases.append(phrase)
+                    current_phrase = []
+            # Handle last phrase
+            if len(current_phrase) >= self.min_tag_length:
+                phrase = ''.join(current_phrase)
+                if phrase and not all(c in self.stopwords_ja for c in phrase):
+                    noun_phrases.append(phrase)
+            return self._filter_tags(noun_phrases[:self.max_tags], 'ja')
+        except Exception as e:
+            _logger.warning(f"Janome noun phrase extraction failed: {e}")
+            return []
+    def _extract_with_openai(self, text: str, language: str) -> List[str]:
+        """Extract tags using OpenAI (optional fallback)."""
+        client = self._initialize_openai()
+        if not client:
+            return []
+        try:
+            lang_name = 'Japanese' if language == 'ja' else 'English'
+            prompt = (
+                f"Extract {self.max_tags} flat, non-hierarchical tags (keywords/phrases) from the following {lang_name} text. "
+                f"Tags should be {self.min_tag_length}-{self.max_tag_length} words each. "
+                "Return only a JSON array of tag strings, no explanation.\n\n"
+                f"Text:\n{text[:2000]}"
+            )
+            response = client.chat.completions.create(
+                model=os.getenv("OPENAI_MODEL", "gpt-4o-mini"),
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.0,
+            )
+            content = response.choices[0].message.content
+            import json
+            # Try to extract JSON array
+            if content.strip().startswith('['):
+                tags = json.loads(content)
+            else:
+                # Try to find JSON in the response
+                import re
+                json_match = re.search(r'\[.*\]', content, re.DOTALL)
+                if json_match:
+                    tags = json.loads(json_match.group())
+                else:
+                    # Fallback: split by lines or commas
+                    tags = [t.strip().strip('"\'') for t in content.split('\n') if t.strip()]
+            return self._filter_tags(tags[:self.max_tags], language)
+        except Exception as e:
+            _logger.warning(f"OpenAI tag extraction failed: {e}")
+            return []
+    def _filter_tags(self, tags: List[str], language: str) -> List[str]:
+        """Filter and normalize tags."""
+        stopwords = self.stopwords_ja if language == 'ja' else self.stopwords_en
+        filtered = []
+        seen = set()
+        for tag in tags:
+            if not tag or not isinstance(tag, str):
+                continue
+            # Normalize: lowercase, strip
+            tag = tag.lower().strip()
+            # Check length
+            if language == 'ja':
+                # For Japanese, count characters
+                if len(tag) < self.min_tag_length:
+                    continue
+            else:
+                # For English, count words
+                words = tag.split()
+                if len(words) < self.min_tag_length or len(words) > self.max_tag_length:
+                    continue
+            # Filter stopwords
+            if language == 'ja':
+                if all(c in stopwords for c in tag):
+                    continue
+            else:
+                words = tag.split()
+                if all(w in stopwords for w in words):
+                    continue
+            # Deduplicate
+            if tag not in seen:
+                seen.add(tag)
+                filtered.append(tag)
+        return filtered[:self.max_tags]
+    def generate_tags(self, text: str, methods: Optional[List[str]] = None,
+                     use_openai: bool = False) -> List[str]:
+        """
+        Generate tags from text using specified methods.
+        Args:
+            text: Input text
+            methods: List of methods ('yake', 'keybert', 'noun_phrases', 'openai', 'all')
+            use_openai: Whether to use OpenAI (requires API key)
+        Returns:
+            List of generated tags
+        """
+        if not text or not text.strip():
+            return []
+        # Detect language
+        language = self._detect_language(text)
+        # Determine methods to use
+        if methods is None or 'all' in methods:
+            methods = ['yake']
+            if KEYBERT_AVAILABLE and language == 'en':
+                methods.append('keybert')
+            if SPACY_AVAILABLE and language == 'en':
+                methods.append('noun_phrases')
+            if JANOME_AVAILABLE and language == 'ja':
+                methods.append('noun_phrases')
+            if use_openai and _OPENAI_ENABLED:
+                methods.append('openai')
+        # Collect tags from all methods
+        all_tags = []
+        for method in methods:
+            try:
+                if method == 'yake':
+                    tags = self._extract_with_yake(text, language)
+                    if tags:
+                        all_tags.extend(tags)
+                        _logger.debug(f"YAKE extracted {len(tags)} tags")
+                elif method == 'keybert' and language == 'en':
+                    tags = self._extract_with_keybert(text, language)
+                    if tags:
+                        all_tags.extend(tags)
+                        _logger.debug(f"KeyBERT extracted {len(tags)} tags")
+                elif method == 'noun_phrases':
+                    if language == 'en' and SPACY_AVAILABLE:
+                        tags = self._extract_noun_phrases_spacy(text)
+                        if tags:
+                            all_tags.extend(tags)
+                            _logger.debug(f"spaCy noun phrases: {len(tags)} tags")
+                    elif language == 'ja' and JANOME_AVAILABLE:
+                        tags = self._extract_noun_phrases_janome(text)
+                        if tags:
+                            all_tags.extend(tags)
+                            _logger.debug(f"Janome noun phrases: {len(tags)} tags")
+                elif method == 'openai' and use_openai:
+                    tags = self._extract_with_openai(text, language)
+                    if tags:
+                        all_tags.extend(tags)
+                        _logger.debug(f"OpenAI extracted {len(tags)} tags")
+            except Exception as e:
+                _logger.warning(f"Tag extraction method {method} failed: {e}")
+                continue
+        # Deduplicate and rank by frequency
+        if all_tags:
+            tag_counts = Counter(all_tags)
+            # Sort by frequency, then alphabetically
+            sorted_tags = sorted(tag_counts.items(), key=lambda x: (-x[1], x[0]))
+            return [tag for tag, _ in sorted_tags[:self.max_tags]]
+        # Fallback: simple keyword extraction
+        return self._fallback_extraction(text, language)
+    def _fallback_extraction(self, text: str, language: str) -> List[str]:
+        """Fallback tag extraction when no methods are available."""
+        stopwords = self.stopwords_ja if language == 'ja' else self.stopwords_en
+        if language == 'ja':
+            # For Japanese, extract non-stopword characters/words
+            words = list(text)
+            tags = [w for w in words if w not in stopwords and len(w) > 1]
+        else:
+            # For English, extract words
+            words = re.findall(r'\b\w+\b', text.lower())
+            tags = [w for w in words if w not in stopwords and len(w) >= self.min_tag_length]
+        # Return top N by frequency
+        tag_counts = Counter(tags)
+        sorted_tags = sorted(tag_counts.items(), key=lambda x: (-x[1], x[0]))
+        return [tag for tag, _ in sorted_tags[:self.max_tags]]

core/utils.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import uuid
+import re
+import logging
+from typing import Dict, Any, List, Optional
+from dataclasses import dataclass
+logger = logging.getLogger(__name__)
+# Try to import tiktoken, but make it optional
+try:
+    import tiktoken
+    TIKTOKEN_AVAILABLE = True
+except (ImportError, Exception) as e:
+    logger.warning(f"tiktoken not available: {e}. Token counting will use fallback method.")
+    TIKTOKEN_AVAILABLE = False
+    tiktoken = None
+# Global cache for tiktoken encoding (to avoid repeated download attempts)
+_TIKTOKEN_ENCODING = None
+_TIKTOKEN_LOAD_ATTEMPTED = False
+@dataclass
+class Chunk:
+    """Data class for document chunks"""
+    doc_id: str
+    chunk_id: str
+    content: str
+    metadata: Dict[str, Any]
+    embeddings: Optional[List[float]] = None
+class TextProcessor:
+    """Text processing utilities"""
+    def __init__(self):
+        global _TIKTOKEN_ENCODING, _TIKTOKEN_LOAD_ATTEMPTED
+        # Use cached encoding if available
+        if _TIKTOKEN_ENCODING is not None:
+            self.encoding = _TIKTOKEN_ENCODING
+            return
+        # Try to load encoding only once (cache result)
+        self.encoding = None
+        if TIKTOKEN_AVAILABLE and not _TIKTOKEN_LOAD_ATTEMPTED:
+            _TIKTOKEN_LOAD_ATTEMPTED = True
+            try:
+                encoding = tiktoken.get_encoding("cl100k_base")
+                _TIKTOKEN_ENCODING = encoding
+                self.encoding = encoding
+            except (ConnectionError, OSError, Exception) as e:
+                # Handle network errors, connection refused, and other issues
+                # This is expected when offline - tiktoken needs to download encoding file on first use
+                # Only log info once (not warning, since this is expected offline behavior)
+                logger.info(f"Tiktoken encoding not available (offline mode). Using fallback token counting (characters/4).")
+                _TIKTOKEN_ENCODING = None  # Cache the failure
+                self.encoding = None
+        elif _TIKTOKEN_LOAD_ATTEMPTED:
+            # Already attempted, use cached result (which is None if it failed)
+            self.encoding = _TIKTOKEN_ENCODING
+    def count_tokens(self, text: str) -> int:
+        """Count tokens in text using tiktoken if available, otherwise use character-based estimate"""
+        if self.encoding:
+            try:
+                return len(self.encoding.encode(text))
+            except Exception as e:
+                logger.debug(f"tiktoken encoding failed: {e}, using fallback")
+        # Fallback: approximate token count (rough estimate: 1 token ≈ 4 characters)
+        # This is a simple approximation, not as accurate as tiktoken
+        return len(text) // 4
+    def mask_pii(self, text: str) -> str:
+        """Mask personally identifiable information"""
+        # Email addresses
+        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]', text)
+        # Phone numbers
+        text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]', text)
+        # Credit card numbers
+        text = re.sub(r'\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b', '[CREDIT_CARD]', text)
+        # SSN
+        text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', text)
+        return text
+    def clean_text(self, text: str) -> str:
+        """Clean and normalize text"""
+        # Remove extra whitespace
+        text = re.sub(r'\s+', ' ', text)
+        # Remove special characters but keep basic punctuation
+        text = re.sub(r'[^\w\s.,!?;:()\-]', '', text)
+        return text.strip()
+    def clean_text_preserve_newlines(self, text: str) -> str:
+        """Normalize text but preserve paragraph breaks for chunking.
+        - Normalize Windows newlines to \n
+        - Trim spaces on each line
+        - Collapse 3+ newlines -> 2 newlines (keep blank lines as separators)
+        - Collapse multiple spaces within lines
+        - Keep basic punctuation
+        """
+        # Normalize line endings
+        text = text.replace('\r\n', '\n').replace('\r', '\n')
+        # Trim spaces on each line
+        text = '\n'.join(line.strip() for line in text.split('\n'))
+        # Collapse 3+ newlines to 2 newlines
+        text = re.sub(r'\n{3,}', '\n\n', text)
+        # Collapse multiple spaces within lines
+        text = re.sub(r'[ \t]+', ' ', text)
+        # Remove disallowed characters but keep punctuation and newlines
+        text = re.sub(r'[^\w\s\n.,!?;:()\-]', '', text)
+        return text.strip()
+    def split_sentences(self, text: str) -> List[str]:
+        """Split text into sentences using simple regex-based approach.
+        Args:
+            text: Input text to split
+        Returns:
+            List of sentences
+        """
+        # Simple sentence splitting: split on sentence-ending punctuation
+        # Pattern: period, exclamation, or question mark followed by space or end of string
+        sentences = re.split(r'([.!?]+(?:\s+|$))', text)
+        # Combine punctuation with preceding sentence
+        result = []
+        for i in range(0, len(sentences) - 1, 2):
+            sentence = sentences[i]
+            if i + 1 < len(sentences):
+                sentence += sentences[i + 1]
+            sentence = sentence.strip()
+            if sentence:
+                result.append(sentence)
+        # Handle last sentence if odd number of splits
+        if len(sentences) % 2 == 1 and sentences[-1].strip():
+            result.append(sentences[-1].strip())
+        # If no sentences found, return the whole text as one sentence
+        if not result:
+            return [text.strip()] if text.strip() else []
+        return result
+def generate_id() -> str:
+    """Generate unique ID"""
+    return str(uuid.uuid4())

core/visualization.py ADDED Viewed

	@@ -0,0 +1,291 @@

+"""
+Visualization Module for RAG Evaluation Results
+This module provides functions to create various charts and visualizations
+for RAG evaluation results including bar charts, line charts, scatter plots,
+box plots, stacked bars, and Pareto charts.
+"""
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib
+from typing import List, Dict, Any, Optional, Tuple
+from collections import Counter
+import os
+import logging
+# Use non-interactive backend for server environments
+matplotlib.use('Agg')
+_logger = logging.getLogger("rag_visualization")
+class RAGVisualizer:
+    """Visualization utilities for RAG evaluation results"""
+    # Method display names
+    METHOD_NAMES = {
+        'base_rag': 'Baseline',
+        'tag_filter_rag': '+Tags(Filter)',
+        'hybrid_rag': 'Hybrid(Weighted)',
+        'hybrid_rerank_rag': 'Hybrid+Rerank'
+    }
+    def __init__(self):
+        """Initialize visualizer"""
+        pass
+    def bar_chart_avg_performance(self, df: pd.DataFrame, metric: str = 'precision_at_k',
+                                  k_value: Optional[int] = None) -> plt.Figure:
+        """Create bar chart of average performance by method"""
+        # Filter by k if specified
+        if k_value:
+            df = df[df['k'] == k_value]
+        # Aggregate by pipeline
+        avg_metrics = df.groupby('pipeline')[metric].mean().reset_index()
+        # Rename pipelines
+        avg_metrics['pipeline'] = avg_metrics['pipeline'].map(
+            lambda x: self.METHOD_NAMES.get(x, x)
+        )
+        # Create figure
+        fig, ax = plt.subplots(figsize=(10, 6))
+        ax.bar(avg_metrics['pipeline'], avg_metrics[metric])
+        ax.set_xlabel('Method')
+        ax.set_ylabel(metric.replace('_', ' ').title())
+        ax.set_title(f'Average {metric.replace("_", " ").title()} by Method')
+        ax.tick_params(axis='x', rotation=45)
+        plt.tight_layout()
+        return fig
+    def box_plot_query_variance(self, df: pd.DataFrame, metric: str = 'precision_at_k',
+                                k_value: Optional[int] = None) -> plt.Figure:
+        """Create box plot showing query-level variance"""
+        # Filter by k if specified
+        if k_value:
+            df = df[df['k'] == k_value]
+        # Prepare data for box plot
+        data = [df[df['pipeline'] == pipeline][metric].values
+                for pipeline in df['pipeline'].unique()]
+        labels = [self.METHOD_NAMES.get(p, p) for p in df['pipeline'].unique()]
+        # Create figure
+        fig, ax = plt.subplots(figsize=(10, 6))
+        ax.boxplot(data, labels=labels)
+        ax.set_xlabel('Method')
+        ax.set_ylabel(metric.replace('_', ' ').title())
+        ax.set_title(f'Query-Level Variance: {metric.replace("_", " ").title()}')
+        ax.tick_params(axis='x', rotation=45)
+        plt.tight_layout()
+        return fig
+    def scatter_plot_tags_vs_ndcg(self, df: pd.DataFrame, k_value: Optional[int] = None) -> plt.Figure:
+        """Create scatter plot of tag count vs nDCG"""
+        # Filter by k if specified
+        if k_value:
+            df = df[df['k'] == k_value]
+        # Extract tag counts from metadata if available
+        # This would require additional processing in evaluation
+        # For now, create a simple scatter
+        fig, ax = plt.subplots(figsize=(10, 6))
+        for pipeline in df['pipeline'].unique():
+            pipeline_df = df[df['pipeline'] == pipeline]
+            ax.scatter(range(len(pipeline_df)), pipeline_df['ndcg_at_k'],
+                      label=self.METHOD_NAMES.get(pipeline, pipeline), alpha=0.6)
+        ax.set_xlabel('Query Index')
+        ax.set_ylabel('nDCG@k')
+        ax.set_title('nDCG by Method and Query')
+        ax.legend()
+        plt.tight_layout()
+        return fig
+    def line_plot_metrics_over_k(self, df: pd.DataFrame, metric: str = 'precision_at_k') -> plt.Figure:
+        """Create line plot showing metric trends over k values"""
+        # Aggregate by pipeline and k
+        metric_over_k = df.groupby(['pipeline', 'k'])[metric].mean().reset_index()
+        # Create figure
+        fig, ax = plt.subplots(figsize=(10, 6))
+        for pipeline in df['pipeline'].unique():
+            pipeline_df = metric_over_k[metric_over_k['pipeline'] == pipeline]
+            ax.plot(pipeline_df['k'], pipeline_df[metric],
+                   marker='o', label=self.METHOD_NAMES.get(pipeline, pipeline))
+        ax.set_xlabel('k (Number of Results)')
+        ax.set_ylabel(metric.replace('_', ' ').title())
+        ax.set_title(f'{metric.replace("_", " ").title()} Trends Over k Values')
+        ax.legend()
+        ax.grid(True, alpha=0.3)
+        plt.tight_layout()
+        return fig
+    def stacked_bar_user_ratings(self, df: pd.DataFrame) -> plt.Figure:
+        """Create stacked bar chart of user satisfaction ratings"""
+        if 'user_satisfaction' not in df.columns:
+            # Create empty figure if no data
+            fig, ax = plt.subplots(figsize=(10, 6))
+            ax.text(0.5, 0.5, 'No user satisfaction data available',
+                   ha='center', va='center', transform=ax.transAxes)
+            return fig
+        # Group by pipeline and satisfaction score
+        ratings = df.groupby(['pipeline', 'user_satisfaction']).size().unstack(fill_value=0)
+        # Rename pipelines
+        ratings.index = [self.METHOD_NAMES.get(p, p) for p in ratings.index]
+        # Create figure
+        fig, ax = plt.subplots(figsize=(10, 6))
+        ratings.plot(kind='bar', stacked=True, ax=ax)
+        ax.set_xlabel('Method')
+        ax.set_ylabel('Count')
+        ax.set_title('User Satisfaction Ratings Distribution')
+        ax.legend(title='Rating (1-5)')
+        ax.tick_params(axis='x', rotation=45)
+        plt.tight_layout()
+        return fig
+    def pareto_chart_method_ranking(self, df: pd.DataFrame, metric: str = 'precision_at_k',
+                                    k_value: Optional[int] = None) -> plt.Figure:
+        """Create Pareto chart ranking methods by performance"""
+        # Filter by k if specified
+        if k_value:
+            df = df[df['k'] == k_value]
+        # Aggregate by pipeline
+        avg_metrics = df.groupby('pipeline')[metric].mean().reset_index()
+        avg_metrics = avg_metrics.sort_values(metric, ascending=True)
+        # Rename pipelines
+        avg_metrics['pipeline_display'] = avg_metrics['pipeline'].map(
+            lambda x: self.METHOD_NAMES.get(x, x)
+        )
+        # Calculate cumulative percentage
+        total = avg_metrics[metric].sum()
+        avg_metrics['cumulative_pct'] = (avg_metrics[metric].cumsum() / total * 100)
+        # Create figure with dual y-axes
+        fig, ax1 = plt.subplots(figsize=(10, 6))
+        # Bar chart
+        ax1.barh(avg_metrics['pipeline_display'], avg_metrics[metric],
+                color='steelblue', alpha=0.7)
+        ax1.set_xlabel(metric.replace('_', ' ').title())
+        ax1.set_ylabel('Method')
+        ax1.set_title(f'Method Performance Ranking (Pareto Chart) - {metric.replace("_", " ").title()}')
+        # Cumulative line
+        ax2 = ax1.twinx()
+        ax2.plot(avg_metrics[metric].values, avg_metrics['cumulative_pct'].values,
+                'ro-', linewidth=2, markersize=8)
+        ax2.set_ylabel('Cumulative Percentage (%)', color='red')
+        ax2.tick_params(axis='y', labelcolor='red')
+        ax2.set_ylim([0, 105])
+        plt.tight_layout()
+        return fig
+    def save_figure(self, fig: plt.Figure, path: str):
+        """Save figure to file"""
+        fig.savefig(path, dpi=150, bbox_inches='tight')
+        plt.close(fig)
+    def create_all_charts(self, df: pd.DataFrame, output_dir: str = "reports/visualizations",
+                         k_value: Optional[int] = None) -> Dict[str, str]:
+        """
+        Create all available charts.
+        Args:
+            df: Evaluation results DataFrame
+            output_dir: Output directory for charts
+            k_value: Optional k value to filter
+        Returns:
+            Dict mapping chart name to file path (keys: 'bar', 'line', 'scatter', 'box', 'stacked_bar', 'pareto')
+        """
+        os.makedirs(output_dir, exist_ok=True)
+        chart_paths = {}
+        # Bar chart: Average performance
+        try:
+            fig = self.bar_chart_avg_performance(df, metric='precision_at_k', k_value=k_value)
+            path = os.path.join(output_dir, 'bar_avg_performance.png')
+            self.save_figure(fig, path)
+            chart_paths['bar'] = path
+        except Exception as e:
+            _logger.warning(f"Error creating bar chart: {e}")
+        # Line plot: Metrics over k values
+        try:
+            fig = self.line_plot_metrics_over_k(df, metric='precision_at_k')
+            path = os.path.join(output_dir, 'line_metrics_over_k.png')
+            self.save_figure(fig, path)
+            chart_paths['line'] = path
+        except Exception as e:
+            _logger.warning(f"Error creating line plot: {e}")
+        # Scatter plot: Tags vs nDCG
+        try:
+            fig = self.scatter_plot_tags_vs_ndcg(df, k_value=k_value)
+            path = os.path.join(output_dir, 'scatter_tags_vs_ndcg.png')
+            self.save_figure(fig, path)
+            chart_paths['scatter'] = path
+        except Exception as e:
+            _logger.warning(f"Error creating scatter plot: {e}")
+        # Box plot: Query variance
+        try:
+            fig = self.box_plot_query_variance(df, metric='precision_at_k', k_value=k_value)
+            path = os.path.join(output_dir, 'box_query_variance.png')
+            self.save_figure(fig, path)
+            chart_paths['box'] = path
+        except Exception as e:
+            _logger.warning(f"Error creating box plot: {e}")
+        # Stacked bar: User ratings (if available)
+        if 'user_satisfaction' in df.columns and df['user_satisfaction'].notna().any():
+            try:
+                fig = self.stacked_bar_user_ratings(df)
+                path = os.path.join(output_dir, 'stacked_bar_user_ratings.png')
+                self.save_figure(fig, path)
+                chart_paths['stacked_bar'] = path
+            except Exception as e:
+                _logger.warning(f"Error creating stacked bar chart: {e}")
+        else:
+            # Create placeholder chart if no user satisfaction data
+            try:
+                fig, ax = plt.subplots(figsize=(10, 6))
+                ax.text(0.5, 0.5, 'No user satisfaction data available',
+                       ha='center', va='center', transform=ax.transAxes, fontsize=14)
+                ax.set_title('User Satisfaction Ratings Distribution')
+                path = os.path.join(output_dir, 'stacked_bar_user_ratings.png')
+                self.save_figure(fig, path)
+                chart_paths['stacked_bar'] = path
+            except Exception as e:
+                _logger.warning(f"Error creating placeholder stacked bar chart: {e}")
+        # Pareto chart: Method ranking
+        try:
+            fig = self.pareto_chart_method_ranking(df, metric='precision_at_k', k_value=k_value)
+            path = os.path.join(output_dir, 'pareto_method_ranking.png')
+            self.save_figure(fig, path)
+            chart_paths['pareto'] = path
+        except Exception as e:
+            _logger.warning(f"Error creating pareto chart: {e}")
+        return chart_paths

requirements.txt ADDED Viewed

	@@ -0,0 +1,55 @@

+# requirements.txt
+# Auto Tagging RAG System Dependencies
+# Core Gradio dependencies (v5 UI components)
+gradio==5.49.1
+gradio-client==1.13.3
+# LangChain dependencies
+langchain>=0.1.0
+langchain-community>=0.0.0
+# Vector database
+chromadb>=0.4.0
+# PDF processing (pypdf is newer, PyPDF2 is legacy but still used)
+pypdf>=3.0.0
+PyPDF2>=3.0.0
+# Embeddings and NLP
+sentence-transformers>=2.2.0
+tiktoken>=0.5.0
+# Tag generation (English and Japanese)
+yake>=0.4.0
+keybert>=0.8.0
+spacy>=3.7.0
+janome>=0.5.0
+# OpenAI (optional, for API usage)
+openai>=1.0.0
+# Testing
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+# Utilities
+python-dotenv>=1.0.0
+PyYAML>=6.0  # Note: pip package name is PyYAML, not pyyaml
+# Data processing
+numpy>=1.21.0
+pandas>=1.5.0
+scikit-learn>=1.2.0
+# Visualization and reports
+matplotlib>=3.5.0
+jinja2>=3.1.0
+# MCP Server
+mcp>=1.0.0
+# FastAPI (for MCP/API endpoints)
+fastapi>=0.110.0
+starlette>=0.36.3
+uvicorn>=0.23.0

tests/README.md ADDED Viewed

	@@ -0,0 +1,283 @@

+# Test Suite - Auto Tagging RAG System
+## Overview
+This directory contains comprehensive pytest test cases for the Auto Tagging RAG System, including tests for the MCP server, accuracy, user experience, robustness, and non-technical user scenarios.
+## Test Files
+### `test_mcp_server.py`
+**MCP Server Tests** - Tests for Model Context Protocol server functionality
+- Tool listing (`list_tools`)
+- Document search tool (`search_documents`) with all pipelines
+- Evaluation tool (`evaluate_retrieval`)
+- Error handling and edge cases
+- Tag operators (OR/AND/NOT)
+- Default parameter handling
+### `test_accuracy.py`
+**Accuracy Tests** - Tests for tag generation, retrieval, and evaluation metrics
+- Tag generation accuracy (YAKE, KeyBERT, spaCy, Janome)
+- Retrieval accuracy across all pipelines
+- Evaluation metrics (Precision@k, nDCG@k, MRR)
+- Metric range validation
+### `test_ux.py`
+**User Experience Tests** - Tests for user workflows and interface
+- Document upload workflow
+- Manual tag input
+- Search workflows
+- Evaluation workflows
+- Session persistence
+- Tag visualization
+- Document count display
+### `test_robustness.py`
+**Robustness Tests** - Tests for error handling and edge cases
+- Empty query handling
+- Invalid k values
+- Missing tags
+- Invalid operators
+- Empty documents
+- Special characters
+- Large k values
+- Data integrity
+- Performance tests
+### `test_user_scenarios.py`
+**Non-Technical User Scenarios** - Tests for users without technical knowledge
+- First-time user document upload
+- Simple search queries
+- Evaluation with sample queries
+- Custom tag input
+- Session persistence
+- Real-world workflows
+### `test_japanese_support.py`
+**Japanese Language Support** - Tests for Japanese language processing
+- Japanese tag generation
+- Language detection
+- Japanese document processing
+- Japanese search queries
+- Mixed language handling
+### `conftest.py`
+**Pytest Fixtures** - Shared fixtures for all tests
+- Temporary persistence directories
+- RAGManager instances
+- Evaluator instances
+- Session managers
+- Sample documents and queries
+- MCP server instances
+- Populated RAG managers
+## Running Tests
+### Install Dependencies
+```bash
+pip install -r requirements.txt
+```
+Note: `pytest>=7.0.0` and `pytest-asyncio>=0.21.0` are included in `requirements.txt`.
+### Run All Tests
+```bash
+pytest tests/ -v
+```
+### Run Specific Test File
+```bash
+# MCP Server tests
+pytest tests/test_mcp_server.py -v
+# Accuracy tests
+pytest tests/test_accuracy.py -v
+# UX tests
+pytest tests/test_ux.py -v
+# Robustness tests
+pytest tests/test_robustness.py -v
+# User scenario tests
+pytest tests/test_user_scenarios.py -v
+# Japanese support tests
+pytest tests/test_japanese_support.py -v
+```
+### Run Specific Test Class
+```bash
+pytest tests/test_mcp_server.py::TestMCPServer -v
+```
+### Run Specific Test Case
+```bash
+pytest tests/test_mcp_server.py::TestMCPServer::test_search_documents_base_rag -v
+```
+### Run with Coverage
+```bash
+pip install pytest-cov
+pytest tests/ --cov=core --cov=app --cov-report=html
+```
+### Run Asynchronous Tests
+MCP server tests use `pytest.mark.asyncio`. Ensure `pytest-asyncio` is installed:
+```bash
+pip install pytest-asyncio
+pytest tests/test_mcp_server.py -v
+```
+## Test Structure
+### Test Categories
+1. **MCP Server Tests** (15+ tests)
+   - Tool listing and discovery
+   - All pipeline types (Base, Tag Filter, Hybrid, Hybrid Rerank)
+   - Tag operators
+   - Evaluation tool
+   - Error handling
+2. **Accuracy Tests** (10+ tests)
+   - Tag generation for English and Japanese
+   - Retrieval accuracy
+   - Evaluation metrics
+3. **User Experience Tests** (5+ tests)
+   - Workflow tests
+   - Interface tests
+   - User interactions
+4. **Robustness Tests** (10+ tests)
+   - Error handling
+   - Edge cases
+   - Data integrity
+   - Performance
+5. **User Scenario Tests** (8+ tests)
+   - Non-technical user workflows
+   - Real-world scenarios
+6. **Japanese Support Tests** (5+ tests)
+   - Japanese language processing
+   - Mixed language handling
+**Total**: 50+ test cases
+## Test Fixtures
+### Available Fixtures
+- `temp_persist_dir`: Temporary directory for ChromaDB (auto-cleanup)
+- `rag_manager`: RAGManager instance with temporary persistence
+- `evaluator`: RAGEvaluator instance
+- `session_manager`: SessionManager instance
+- `session_rag_manager`: SessionAwareRAGManager instance
+- `sample_documents`: Sample document data (emergency, medical, surgery)
+- `sample_queries`: Sample evaluation queries
+- `mcp_server`: MCP server instance for testing
+- `populated_rag_manager`: RAGManager with sample documents pre-indexed
+## Writing New Tests
+### Example Test Case
+```python
+def test_my_feature(populated_rag_manager):
+    """Test my new feature"""
+    query = "test query"
+    result = populated_rag_manager.base_rag.retrieve(query, k=3)
+    assert result is not None
+    assert len(result.sources) > 0
+```
+### Using Fixtures
+```python
+def test_with_fixture(rag_manager, sample_documents):
+    """Test using fixtures"""
+    doc_data = sample_documents["emergency"]
+    # Use doc_data for testing
+```
+### Async Tests (MCP Server)
+```python
+@pytest.mark.asyncio
+async def test_mcp_tool(mcp_server):
+    """Test MCP tool"""
+    result = await mcp_server.call_tool("search_documents", {"query": "test"})
+    assert result is not None
+```
+## Requirements
+### Python Version
+- Python 3.8+
+### Dependencies
+- `pytest>=7.0.0`
+- `pytest-asyncio>=0.21.0` (for MCP server tests)
+- `pytest-cov` (optional, for coverage)
+### Models
+- spaCy English model: `python -m spacy download en_core_web_sm`
+- SentenceTransformers: Downloads automatically
+- MCP package: `pip install mcp` (for MCP server tests)
+## Skipped Tests
+Some tests may be skipped if optional dependencies are not installed:
+- MCP server tests: Skipped if `mcp` package not installed
+- spaCy tests: Skipped if spaCy model not available
+- OpenAI tests: Skipped if OpenAI API key not configured
+## Test Maintenance
+- Update tests when features change
+- Add new tests for new features
+- Review and refine tests regularly
+- Keep test data updated
+## Troubleshooting
+### Import Errors
+```bash
+# Ensure you're in the project root
+cd /path/to/auto_tagging_rag
+pytest tests/ -v
+```
+### MCP Tests Fail
+```bash
+# Install MCP package
+pip install mcp
+pytest tests/test_mcp_server.py -v
+```
+### Model Not Found Errors
+```bash
+# Download required models
+python -m spacy download en_core_web_sm
+```
+### Session Errors
+- Tests use temporary directories (auto-cleanup)
+- If tests fail, check temp directory permissions
+---
+**Last Updated**: 2024
+**Test Suite Version**: 1.0

tests/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Tests package for Auto Tagging RAG System
2	+

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,154 @@

+"""
+Pytest fixtures for Auto Tagging RAG System tests
+"""
+import pytest
+import os
+import tempfile
+import shutil
+from pathlib import Path
+# Import core modules
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from core.retrieval import RAGManager
+from core.eval import RAGEvaluator
+from core.session_manager import SessionManager
+from core.session_rag import SessionAwareRAGManager
+@pytest.fixture(scope="function")
+def temp_persist_dir():
+    """Create a temporary directory for ChromaDB persistence"""
+    temp_dir = tempfile.mkdtemp()
+    yield temp_dir
+    # Cleanup
+    if os.path.exists(temp_dir):
+        shutil.rmtree(temp_dir, ignore_errors=True)
+@pytest.fixture(scope="function")
+def rag_manager(temp_persist_dir):
+    """Create a RAGManager instance with temporary persistence"""
+    return RAGManager(persist_directory=temp_persist_dir)
+@pytest.fixture(scope="function")
+def evaluator(rag_manager):
+    """Create a RAGEvaluator instance"""
+    return RAGEvaluator(rag_manager)
+@pytest.fixture(scope="function")
+def session_manager(temp_persist_dir):
+    """Create a SessionManager instance with temporary persistence"""
+    return SessionManager(base_persist_dir=temp_persist_dir)
+@pytest.fixture(scope="function")
+def session_rag_manager(rag_manager, session_manager):
+    """Create a SessionAwareRAGManager instance"""
+    return SessionAwareRAGManager(rag_manager, session_manager)
+@pytest.fixture(scope="function")
+def sample_documents():
+    """Sample documents for testing"""
+    return {
+        "emergency": {
+            "content": """
+            Emergency Procedures
+            In case of fire, immediately activate the nearest fire alarm and evacuate the building.
+            Fire safety protocols require all personnel to know the location of fire extinguishers.
+            Do not use elevators during a fire emergency. Stay low to avoid smoke inhalation.
+            """,
+            "tags": ["fire", "emergency", "safety", "evacuation"],
+            "language": "en"
+        },
+        "medical": {
+            "content": """
+            Medical Emergency Response
+            Medical emergency response begins with assessing patient ABC (Airway, Breathing, Circulation).
+            Emergency protocols require immediate notification of medical team.
+            If the patient is unresponsive, call for emergency medical services immediately.
+            """,
+            "tags": ["medical", "emergency", "patient", "response"],
+            "language": "en"
+        },
+        "surgery": {
+            "content": """
+            Surgical Safety Protocols
+            All surgical procedures require pre-operative checklists and sterile environment protocols.
+            Surgical safety includes patient identification verification and site marking procedures.
+            The surgical site should be marked with an indelible marker before the patient enters the operating room.
+            """,
+            "tags": ["surgery", "safety", "protocol", "patient"],
+            "language": "en"
+        }
+    }
+@pytest.fixture(scope="function")
+def sample_queries():
+    """Sample evaluation queries for testing"""
+    return [
+        {
+            "query": "What are the emergency procedures for fire incidents?",
+            "ground_truth": [
+                "In case of fire, immediately activate the nearest fire alarm and evacuate the building.",
+                "Fire safety protocols require all personnel to know the location of fire extinguishers."
+            ],
+            "k_values": [1, 3, 5],
+            "tags": ["fire", "emergency", "safety"],
+            "tag_operator": "OR"
+        },
+        {
+            "query": "How to handle medical emergencies?",
+            "ground_truth": [
+                "Medical emergency response begins with assessing patient ABC (Airway, Breathing, Circulation).",
+                "Emergency protocols require immediate notification of medical team."
+            ],
+            "k_values": [1, 3, 5],
+            "tags": ["medical", "emergency", "patient"],
+            "tag_operator": "OR"
+        }
+    ]
+@pytest.fixture(scope="function")
+def mcp_server(rag_manager):
+    """Create an MCP server instance for testing"""
+    try:
+        from app import RAGMCPServer
+        # Override the persist_dir in __init__ by modifying the instance
+        server = RAGMCPServer()
+        server.rag_manager = rag_manager
+        server.evaluator = RAGEvaluator(rag_manager)
+        return server
+    except ImportError:
+        pytest.skip("MCP server not available (mcp package not installed)")
+@pytest.fixture(scope="function")
+def populated_rag_manager(rag_manager, sample_documents):
+    """Create a RAGManager with sample documents already indexed"""
+    from core.ingest import FlatTagChunker
+    from core.utils import Chunk
+    # Create sample chunks
+    all_chunks = []
+    for doc_name, doc_data in sample_documents.items():
+        chunker = FlatTagChunker()
+        chunks = chunker.chunk_document(
+            doc_data["content"],
+            language=doc_data["language"],
+            user_tags=None
+        )
+        all_chunks.extend(chunks)
+    # Index chunks
+    if all_chunks:
+        rag_manager.vector_store.add_documents("documents", all_chunks)
+    return rag_manager

tests/test_accuracy.py ADDED Viewed

	@@ -0,0 +1,231 @@

+"""
+Accuracy tests for Auto Tagging RAG System
+Tests tag generation, retrieval accuracy, and evaluation metrics
+"""
+import pytest
+from pathlib import Path
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from core.tag_generator import TagGenerator
+from core.ingest import FlatTagChunker
+from core.utils import Chunk
+class TestTagGenerationAccuracy:
+    """Test tag generation accuracy"""
+    def test_english_tag_generation_yake(self):
+        """Test YAKE tag generation for English documents"""
+        text = """
+        Emergency Procedures
+        In case of fire, immediately activate the nearest fire alarm and evacuate the building.
+        Fire safety protocols require all personnel to know the location of fire extinguishers.
+        """
+        generator = TagGenerator()
+        tags = generator.generate_tags(text, method="yake", language="en", max_tags=10)
+        assert len(tags) > 0
+        assert isinstance(tags, list)
+        # Check for relevant tags (case-insensitive)
+        tag_str = " ".join(tags).lower()
+        assert any(keyword in tag_str for keyword in ["fire", "emergency", "safety"])
+    def test_english_tag_generation_keybert(self):
+        """Test KeyBERT tag generation for English documents"""
+        text = """
+        Medical Emergency Response
+        Medical emergency response begins with assessing patient ABC.
+        Emergency protocols require immediate notification of medical team.
+        """
+        generator = TagGenerator()
+        tags = generator.generate_tags(text, method="keybert", language="en", max_tags=10)
+        assert len(tags) > 0
+        assert isinstance(tags, list)
+        tag_str = " ".join(tags).lower()
+        assert any(keyword in tag_str for keyword in ["medical", "emergency", "patient"])
+    def test_english_tag_generation_spacy(self):
+        """Test spaCy tag generation for English documents"""
+        text = """
+        Surgical Safety Protocols
+        All surgical procedures require pre-operative checklists.
+        Surgical safety includes patient identification verification.
+        """
+        generator = TagGenerator()
+        try:
+            tags = generator.generate_tags(text, method="spacy", language="en", max_tags=10)
+            assert len(tags) > 0
+            assert isinstance(tags, list)
+        except Exception as e:
+            pytest.skip(f"spaCy model not available: {e}")
+    def test_japanese_tag_generation(self):
+        """Test Japanese tag generation"""
+        text = """
+        緊急時の手順
+        火災の場合は、最寄りの火災報知器をすぐに作動させ、建物から避難してください。
+        防火安全プロトコルでは、すべての職員が消火器の場所を知っている必要があります。
+        """
+        generator = TagGenerator()
+        tags = generator.generate_tags(text, method="janome", language="ja", max_tags=10)
+        assert len(tags) > 0
+        assert isinstance(tags, list)
+    def test_auto_tag_method_selection(self):
+        """Test automatic method selection"""
+        generator = TagGenerator()
+        text = "Emergency procedures for fire safety."
+        tags = generator.generate_tags(text, method="auto", language="en", max_tags=5)
+        assert len(tags) > 0
+        assert isinstance(tags, list)
+class TestRetrievalAccuracy:
+    """Test retrieval accuracy"""
+    def test_base_rag_retrieval(self, rag_manager, populated_rag_manager):
+        """Test Base RAG retrieval returns relevant documents"""
+        query = "What are emergency procedures for fire incidents?"
+        result = populated_rag_manager.base_rag.retrieve(query, k=3)
+        assert result is not None
+        assert len(result.sources) > 0
+        assert result.latency > 0
+        # Check if results contain relevant content
+        content_lower = " ".join([s['content'].lower() for s in result.sources])
+        assert "fire" in content_lower or "emergency" in content_lower
+    def test_tag_filter_rag_or_operator(self, populated_rag_manager):
+        """Test Tag Filter RAG with OR operator"""
+        query = "What are emergency procedures?"
+        tags = ["fire", "emergency"]
+        result = populated_rag_manager.tag_filter_rag.retrieve(
+            query, k=3, tags=tags, tag_operator="OR"
+        )
+        assert result is not None
+        # Should return documents with at least one matching tag
+        if len(result.sources) > 0:
+            for source in result.sources:
+                source_tags = source.get('metadata', {}).get('tags', [])
+                if isinstance(source_tags, str):
+                    source_tags = [t.strip() for t in source_tags.split(',')]
+                assert any(tag.lower() in str(source_tags).lower() for tag in tags)
+    def test_tag_filter_rag_and_operator(self, populated_rag_manager):
+        """Test Tag Filter RAG with AND operator"""
+        query = "What are emergency procedures?"
+        tags = ["emergency", "safety"]
+        result = populated_rag_manager.tag_filter_rag.retrieve(
+            query, k=3, tags=tags, tag_operator="AND"
+        )
+        assert result is not None
+        # Should return documents with all matching tags (or empty if none match all)
+    def test_hybrid_rag_retrieval(self, populated_rag_manager):
+        """Test Hybrid RAG retrieval"""
+        query = "How to handle medical emergencies?"
+        tags = ["medical", "emergency"]
+        result = populated_rag_manager.hybrid_rag.retrieve(
+            query, k=3, tags=tags, vector_weight=0.7, tag_weight=0.3
+        )
+        assert result is not None
+        assert len(result.sources) > 0
+        assert result.latency > 0
+    def test_hybrid_rerank_rag_retrieval(self, populated_rag_manager):
+        """Test Hybrid Rerank RAG retrieval"""
+        query = "What are surgical safety protocols?"
+        tags = ["surgery", "safety"]
+        result = populated_rag_manager.hybrid_rerank_rag.retrieve(
+            query, k=3, tags=tags, vector_weight=0.7, tag_weight=0.3
+        )
+        assert result is not None
+        assert len(result.sources) > 0
+        assert result.latency > 0
+        # Reranked results should have rerank_score or hybrid_score
+        if len(result.sources) > 0:
+            first_source = result.sources[0]
+            assert 'score' in first_source
+class TestEvaluationMetrics:
+    """Test evaluation metrics accuracy"""
+    def test_precision_at_k_calculation(self, evaluator, populated_rag_manager, sample_queries):
+        """Test Precision@k calculation"""
+        query_data = sample_queries[0]
+        query = query_data["query"]
+        ground_truth = query_data["ground_truth"]
+        # Retrieve results
+        result = populated_rag_manager.base_rag.retrieve(query, k=3)
+        # Calculate metrics manually
+        retrieved = [s['content'] for s in result.sources]
+        # Calculate precision@k
+        relevant_retrieved = 0
+        for gt in ground_truth:
+            for ret in retrieved:
+                if gt.lower() in ret.lower() or ret.lower() in gt.lower():
+                    relevant_retrieved += 1
+                    break
+        precision = relevant_retrieved / len(retrieved) if retrieved else 0
+        assert 0 <= precision <= 1
+        assert precision >= 0  # Should be non-negative
+    def test_evaluation_batch_evaluate(self, evaluator, populated_rag_manager, sample_queries):
+        """Test batch evaluation produces correct metrics"""
+        df, summary, results = evaluator.batch_evaluate(
+            sample_queries,
+            output_file=None,
+            pipelines=['base_rag', 'tag_filter_rag', 'hybrid_rag', 'hybrid_rerank_rag']
+        )
+        assert df is not None
+        assert len(df) > 0
+        # Check required columns
+        required_columns = ['query', 'k', 'pipeline', 'precision_at_k', 'ndcg_at_k', 'mrr']
+        for col in required_columns:
+            assert col in df.columns
+        # Check summary structure
+        assert isinstance(summary, dict)
+        assert 'summary_stats' in summary or isinstance(summary, dict)
+    def test_metrics_ranges(self, evaluator, populated_rag_manager, sample_queries):
+        """Test that all metrics are in valid ranges"""
+        df, summary, results = evaluator.batch_evaluate(
+            sample_queries[:1],  # Use one query for speed
+            output_file=None,
+            pipelines=['base_rag']
+        )
+        # Check metric ranges
+        assert all(0 <= val <= 1 for val in df['precision_at_k'].dropna())
+        assert all(0 <= val <= 1 for val in df['ndcg_at_k'].dropna())
+        assert all(0 <= val <= 1 for val in df['mrr'].dropna())
+        assert all(0 <= val <= 1 for val in df['hit_at_k'].dropna())

tests/test_japanese_support.py ADDED Viewed

	@@ -0,0 +1,87 @@

+"""
+Japanese language support tests for Auto Tagging RAG System
+"""
+import pytest
+from pathlib import Path
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from core.tag_generator import TagGenerator
+from core.ingest import FlatTagChunker
+class TestJapaneseLanguageSupport:
+    """Test Japanese language processing"""
+    def test_japanese_tag_generation(self):
+        """Test Japanese tag generation using Janome"""
+        text = """
+        緊急時の手順
+        火災の場合は、最寄りの火災報知器をすぐに作動させ、建物から避難してください。
+        防火安全プロトコルでは、すべての職員が消火器の場所を知っている必要があります。
+        """
+        generator = TagGenerator()
+        tags = generator.generate_tags(text, method="janome", language="ja", max_tags=10)
+        assert len(tags) > 0
+        assert isinstance(tags, list)
+    def test_japanese_language_detection(self):
+        """Test Japanese language auto-detection"""
+        from core.tag_generator import TagGenerator
+        text = "火災の場合は、すぐに避難してください。"
+        generator = TagGenerator()
+        # Should detect Japanese and use appropriate method
+        tags = generator.generate_tags(text, method="auto", language=None, max_tags=5)
+        assert len(tags) > 0
+    def test_japanese_document_processing(self, rag_manager):
+        """Test processing Japanese documents"""
+        text = """
+        医療緊急対応
+        医療緊急対応は、患者のABC（気道、呼吸、循環）を評価することから始まります。
+        緊急プロトコルでは、医療チームへの即座の通知が必要です。
+        """
+        chunker = FlatTagChunker()
+        chunks = chunker.chunk_document(text, language="ja", user_tags=None)
+        assert len(chunks) > 0
+        # Index Japanese documents
+        rag_manager.vector_store.add_documents("documents", chunks)
+        stats = rag_manager.vector_store.get_collection_stats("documents")
+        assert stats["chunk_count"] > 0
+    def test_japanese_search(self, rag_manager):
+        """Test searching Japanese documents"""
+        # Index Japanese document
+        text = "火災安全プロトコル"
+        chunker = FlatTagChunker()
+        chunks = chunker.chunk_document(text, language="ja", user_tags=None)
+        if chunks:
+            rag_manager.vector_store.add_documents("documents", chunks)
+            # Search with Japanese query
+            query = "火災"
+            result = rag_manager.base_rag.retrieve(query, k=3)
+            assert result is not None
+    def test_mixed_language_documents(self, rag_manager):
+        """Test handling of mixed language documents"""
+        text = "Emergency 緊急 Fire 火災 Safety 安全"
+        chunker = FlatTagChunker()
+        chunks = chunker.chunk_document(text, language=None, user_tags=None)
+        # Should handle mixed content gracefully
+        assert chunks is not None
+        assert isinstance(chunks, list)

tests/test_mcp_server.py ADDED Viewed

	@@ -0,0 +1,263 @@

+"""
+Test cases for MCP Server functionality
+"""
+import pytest
+import json
+import asyncio
+from pathlib import Path
+# Import core modules
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent))
+# Check if MCP is available
+try:
+    import mcp
+    MCP_AVAILABLE = True
+except ImportError:
+    MCP_AVAILABLE = False
+@pytest.mark.asyncio
+@pytest.mark.skipif(not MCP_AVAILABLE, reason="MCP package not installed")
+class TestMCPServer:
+    """Test cases for MCP Server"""
+    async def test_list_tools(self, mcp_server):
+        """Test that MCP server lists available tools"""
+        tools = await mcp_server.list_tools()
+        assert len(tools) >= 2
+        tool_names = [tool.name for tool in tools]
+        assert "search_documents" in tool_names
+        assert "evaluate_retrieval" in tool_names
+    async def test_search_documents_base_rag(self, mcp_server, populated_rag_manager):
+        """Test search_documents tool with Base RAG pipeline"""
+        mcp_server.rag_manager = populated_rag_manager
+        arguments = {
+            "query": "What are emergency procedures for fire?",
+            "k": 3,
+            "pipeline": "base_rag"
+        }
+        result = await mcp_server.call_tool("search_documents", arguments)
+        assert len(result) > 0
+        response = json.loads(result[0].text)
+        assert "content" in response
+        assert "sources" in response
+        assert "latency" in response
+        assert response["strategy"] == "base_rag"
+        assert len(response["sources"]) <= 3
+    async def test_search_documents_tag_filter_rag(self, mcp_server, populated_rag_manager):
+        """Test search_documents tool with Tag Filter RAG pipeline"""
+        mcp_server.rag_manager = populated_rag_manager
+        arguments = {
+            "query": "What are emergency procedures?",
+            "k": 3,
+            "pipeline": "tag_filter_rag",
+            "tags": ["fire", "emergency"],
+            "tag_operator": "OR"
+        }
+        result = await mcp_server.call_tool("search_documents", arguments)
+        assert len(result) > 0
+        response = json.loads(result[0].text)
+        assert "content" in response
+        assert "sources" in response
+        assert response["strategy"] == "tag_filter_rag"
+    async def test_search_documents_hybrid_rag(self, mcp_server, populated_rag_manager):
+        """Test search_documents tool with Hybrid RAG pipeline"""
+        mcp_server.rag_manager = populated_rag_manager
+        arguments = {
+            "query": "How to handle medical emergencies?",
+            "k": 3,
+            "pipeline": "hybrid_rag",
+            "tags": ["medical", "emergency"]
+        }
+        result = await mcp_server.call_tool("search_documents", arguments)
+        assert len(result) > 0
+        response = json.loads(result[0].text)
+        assert "content" in response
+        assert "sources" in response
+        assert response["strategy"] == "hybrid_rag"
+    async def test_search_documents_hybrid_rerank_rag(self, mcp_server, populated_rag_manager):
+        """Test search_documents tool with Hybrid Rerank RAG pipeline"""
+        mcp_server.rag_manager = populated_rag_manager
+        arguments = {
+            "query": "What are surgical safety protocols?",
+            "k": 3,
+            "pipeline": "hybrid_rerank_rag",
+            "tags": ["surgery", "safety"]
+        }
+        result = await mcp_server.call_tool("search_documents", arguments)
+        assert len(result) > 0
+        response = json.loads(result[0].text)
+        assert "content" in response
+        assert "sources" in response
+        assert response["strategy"] == "hybrid_rerank_rag"
+    async def test_search_documents_default_parameters(self, mcp_server, populated_rag_manager):
+        """Test search_documents tool with default parameters"""
+        mcp_server.rag_manager = populated_rag_manager
+        arguments = {
+            "query": "What are emergency procedures?"
+        }
+        result = await mcp_server.call_tool("search_documents", arguments)
+        assert len(result) > 0
+        response = json.loads(result[0].text)
+        assert "content" in response
+        assert "sources" in response
+        assert len(response["sources"]) <= 5  # Default k=5
+    async def test_search_documents_invalid_pipeline(self, mcp_server, populated_rag_manager):
+        """Test search_documents tool with invalid pipeline falls back to base_rag"""
+        mcp_server.rag_manager = populated_rag_manager
+        arguments = {
+            "query": "What are emergency procedures?",
+            "pipeline": "invalid_pipeline"
+        }
+        result = await mcp_server.call_tool("search_documents", arguments)
+        assert len(result) > 0
+        response = json.loads(result[0].text)
+        # Should fall back to base_rag
+        assert "content" in response
+        assert "sources" in response
+    async def test_evaluate_retrieval(self, mcp_server, populated_rag_manager, sample_queries):
+        """Test evaluate_retrieval tool"""
+        from core.eval import RAGEvaluator
+        mcp_server.rag_manager = populated_rag_manager
+        mcp_server.evaluator = RAGEvaluator(populated_rag_manager)
+        arguments = {
+            "queries": sample_queries,
+            "output_file": "test_evaluation.json"
+        }
+        result = await mcp_server.call_tool("evaluate_retrieval", arguments)
+        assert len(result) > 0
+        response = json.loads(result[0].text)
+        assert "summary" in response
+        assert "total_queries" in response
+        assert response["total_queries"] == len(sample_queries)
+        assert len(response["summary"]) > 0
+    async def test_evaluate_retrieval_no_output_file(self, mcp_server, populated_rag_manager, sample_queries):
+        """Test evaluate_retrieval tool without output file"""
+        from core.eval import RAGEvaluator
+        mcp_server.rag_manager = populated_rag_manager
+        mcp_server.evaluator = RAGEvaluator(populated_rag_manager)
+        arguments = {
+            "queries": sample_queries
+        }
+        result = await mcp_server.call_tool("evaluate_retrieval", arguments)
+        assert len(result) > 0
+        response = json.loads(result[0].text)
+        assert "summary" in response
+        assert "total_queries" in response
+    async def test_call_tool_invalid_tool(self, mcp_server):
+        """Test calling invalid tool raises error"""
+        with pytest.raises(ValueError, match="Unknown tool"):
+            await mcp_server.call_tool("invalid_tool", {})
+    async def test_search_documents_empty_query(self, mcp_server, populated_rag_manager):
+        """Test search_documents with empty query"""
+        mcp_server.rag_manager = populated_rag_manager
+        arguments = {
+            "query": "",
+            "k": 3
+        }
+        # Should not raise error, but may return empty results
+        result = await mcp_server.call_tool("search_documents", arguments)
+        assert len(result) > 0
+    async def test_search_documents_tag_operators(self, mcp_server, populated_rag_manager):
+        """Test search_documents with different tag operators"""
+        mcp_server.rag_manager = populated_rag_manager
+        operators = ["OR", "AND", "NOT"]
+        for operator in operators:
+            arguments = {
+                "query": "What are emergency procedures?",
+                "k": 3,
+                "pipeline": "tag_filter_rag",
+                "tags": ["fire", "emergency"],
+                "tag_operator": operator
+            }
+            result = await mcp_server.call_tool("search_documents", arguments)
+            assert len(result) > 0
+            response = json.loads(result[0].text)
+            assert "sources" in response
+    async def test_mcp_server_initialization(self, temp_persist_dir):
+        """Test MCP server initialization"""
+        try:
+            from app import RAGMCPServer
+            server = RAGMCPServer()
+            assert server.rag_manager is not None
+            assert server.evaluator is not None
+            # Test list_tools
+            tools = await server.list_tools()
+            assert len(tools) >= 2
+        except ImportError:
+            pytest.skip("MCP server not available")
+    async def test_search_documents_all_pipelines(self, mcp_server, populated_rag_manager):
+        """Test search_documents with all pipeline types"""
+        mcp_server.rag_manager = populated_rag_manager
+        pipelines = ["base_rag", "tag_filter_rag", "hybrid_rag", "hybrid_rerank_rag"]
+        for pipeline in pipelines:
+            arguments = {
+                "query": "What are emergency procedures?",
+                "k": 3,
+                "pipeline": pipeline,
+                "tags": ["emergency"]
+            }
+            result = await mcp_server.call_tool("search_documents", arguments)
+            assert len(result) > 0
+            response = json.loads(result[0].text)
+            assert response["strategy"] == pipeline

tests/test_robustness.py ADDED Viewed

	@@ -0,0 +1,194 @@

+"""
+Robustness tests for Auto Tagging RAG System
+Tests error handling, edge cases, and data integrity
+"""
+import pytest
+from pathlib import Path
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent))
+class TestErrorHandling:
+    """Test error handling and recovery"""
+    def test_empty_query_handling(self, populated_rag_manager):
+        """Test handling of empty queries"""
+        query = ""
+        # Should not crash, may return empty results
+        result = populated_rag_manager.base_rag.retrieve(query, k=3)
+        assert result is not None
+    def test_invalid_k_value(self, populated_rag_manager):
+        """Test handling of invalid k values"""
+        query = "test query"
+        # Test with k=0
+        result = populated_rag_manager.base_rag.retrieve(query, k=0)
+        assert result is not None
+        # Test with negative k
+        result = populated_rag_manager.base_rag.retrieve(query, k=-1)
+        assert result is not None
+    def test_missing_tags_in_tag_filter(self, populated_rag_manager):
+        """Test tag filter with tags that don't exist"""
+        query = "test query"
+        result = populated_rag_manager.tag_filter_rag.retrieve(
+            query, k=3, tags=["nonexistent-tag-xyz"], tag_operator="OR"
+        )
+        assert result is not None
+        # May return empty results, but shouldn't crash
+    def test_invalid_tag_operator(self, populated_rag_manager):
+        """Test handling of invalid tag operator"""
+        query = "test query"
+        # Should default to OR or handle gracefully
+        result = populated_rag_manager.tag_filter_rag.retrieve(
+            query, k=3, tags=["emergency"], tag_operator="INVALID"
+        )
+        assert result is not None
+    def test_empty_document_handling(self, rag_manager):
+        """Test handling of empty documents"""
+        from core.ingest import FlatTagChunker
+        chunker = FlatTagChunker()
+        # Empty document
+        chunks = chunker.chunk_document("", language="en", user_tags=None)
+        # Should handle gracefully (may return empty list)
+        assert chunks is not None
+        assert isinstance(chunks, list)
+class TestEdgeCases:
+    """Test edge cases and boundary conditions"""
+    def test_very_short_document(self, rag_manager):
+        """Test processing of very short documents"""
+        from core.ingest import FlatTagChunker
+        chunker = FlatTagChunker()
+        chunks = chunker.chunk_document("Emergency!", language="en", user_tags=None)
+        assert chunks is not None
+        assert isinstance(chunks, list)
+    def test_special_characters_in_document(self, rag_manager):
+        """Test handling of special characters"""
+        from core.ingest import FlatTagChunker
+        text = "Emergency! 🚨 Fire safety (protocol #1) requires: 1) Alert 2) Evacuate"
+        chunker = FlatTagChunker()
+        chunks = chunker.chunk_document(text, language="en", user_tags=None)
+        assert chunks is not None
+        assert len(chunks) > 0
+    def test_large_k_value(self, populated_rag_manager):
+        """Test retrieval with large k value"""
+        query = "test query"
+        result = populated_rag_manager.base_rag.retrieve(query, k=100)
+        assert result is not None
+        # Should not crash, may return fewer results than requested
+    def test_many_tags(self, populated_rag_manager):
+        """Test tag filtering with many tags"""
+        query = "test query"
+        many_tags = [f"tag_{i}" for i in range(50)]
+        result = populated_rag_manager.tag_filter_rag.retrieve(
+            query, k=3, tags=many_tags, tag_operator="OR"
+        )
+        assert result is not None
+class TestDataIntegrity:
+    """Test data integrity and consistency"""
+    def test_document_count_accuracy(self, rag_manager, sample_documents):
+        """Test document count reflects unique documents"""
+        from core.ingest import FlatTagChunker
+        # Index multiple chunks from same document
+        all_chunks = []
+        doc_data = sample_documents["emergency"]
+        chunker = FlatTagChunker()
+        # Create chunks multiple times to simulate chunking
+        for _ in range(2):
+            chunks = chunker.chunk_document(
+                doc_data["content"],
+                language=doc_data["language"],
+                user_tags=None
+            )
+            all_chunks.extend(chunks)
+        if all_chunks:
+            rag_manager.vector_store.add_documents("documents", all_chunks)
+            stats = rag_manager.vector_store.get_collection_stats("documents")
+            # Should count unique documents, not chunks
+            assert stats["document_count"] > 0
+    def test_tag_consistency(self, rag_manager):
+        """Test tag generation consistency"""
+        from core.tag_generator import TagGenerator
+        text = "Emergency procedures for fire safety and evacuation protocols."
+        generator = TagGenerator()
+        tags1 = generator.generate_tags(text, method="yake", language="en", max_tags=5)
+        tags2 = generator.generate_tags(text, method="yake", language="en", max_tags=5)
+        # Tags should be similar (may vary slightly due to randomness)
+        assert len(tags1) > 0
+        assert len(tags2) > 0
+    def test_session_isolation(self, session_manager):
+        """Test session isolation"""
+        # Create two sessions
+        session1 = session_manager.create_session(user_id="user1")
+        session2 = session_manager.create_session(user_id="user2")
+        assert session1.session_id != session2.session_id
+        assert session1.collection_name != session2.collection_name
+class TestPerformance:
+    """Test performance and resource usage"""
+    def test_retrieval_latency(self, populated_rag_manager):
+        """Test retrieval latency is reasonable"""
+        query = "What are emergency procedures?"
+        result = populated_rag_manager.base_rag.retrieve(query, k=3)
+        # Should complete within reasonable time (10 seconds for test)
+        assert result.latency < 10.0
+        assert result.latency > 0
+    def test_evaluation_performance(self, evaluator, populated_rag_manager, sample_queries):
+        """Test evaluation completes in reasonable time"""
+        import time
+        start = time.time()
+        df, summary, results = evaluator.batch_evaluate(
+            sample_queries[:2],  # Use 2 queries
+            output_file=None,
+            pipelines=['base_rag']
+        )
+        elapsed = time.time() - start
+        # Should complete within 60 seconds for 2 queries
+        assert elapsed < 60.0
+        assert df is not None

tests/test_user_scenarios.py ADDED Viewed

	@@ -0,0 +1,174 @@

+"""
+Non-technical user scenario tests for Auto Tagging RAG System
+Tests workflows that don't require technical knowledge
+"""
+import pytest
+from pathlib import Path
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent))
+class TestNonTechnicalUserScenarios:
+    """Test scenarios for non-technical users"""
+    def test_first_time_user_upload(self, rag_manager):
+        """Test first-time user can upload documents"""
+        from core.ingest import FlatTagChunker
+        # Simulate user action: upload document with default settings
+        text = """
+        Emergency Procedures
+        In case of fire, activate fire alarm and evacuate.
+        Know the location of fire extinguishers.
+        """
+        # User doesn't specify language (Auto-detect)
+        chunker = FlatTagChunker()
+        chunks = chunker.chunk_document(text, language=None, user_tags=None)
+        assert len(chunks) > 0
+        # User builds index
+        rag_manager.vector_store.add_documents("documents", chunks)
+        # Verify it worked
+        stats = rag_manager.vector_store.get_collection_stats("documents")
+        assert stats["chunk_count"] > 0
+    def test_simple_search_query(self, populated_rag_manager):
+        """Test non-technical user can search without understanding technical details"""
+        # User enters simple query
+        query = "What are emergency procedures?"
+        # Uses default pipeline (Base RAG)
+        result = populated_rag_manager.base_rag.retrieve(query, k=3)
+        # User sees results
+        assert result is not None
+        assert len(result.sources) > 0
+        assert result.content  # Results are readable
+    def test_search_without_tags(self, populated_rag_manager):
+        """Test user can search without understanding tags"""
+        query = "How to handle medical emergencies?"
+        # User doesn't provide tags, uses default pipeline
+        result = populated_rag_manager.base_rag.retrieve(query, k=3)
+        assert result is not None
+        assert len(result.sources) > 0
+    def test_evaluation_with_sample_queries(self, evaluator, populated_rag_manager):
+        """Test user can run evaluation with sample queries"""
+        # User copies sample queries (simplified)
+        queries = [
+            {
+                "query": "What are emergency procedures for fire?",
+                "ground_truth": ["Fire safety protocols"],
+                "k_values": [1, 3, 5]
+            }
+        ]
+        # User runs evaluation
+        df, summary, results = evaluator.batch_evaluate(
+            queries,
+            output_file=None,
+            pipelines=['base_rag']
+        )
+        # User sees results
+        assert df is not None
+        assert len(df) > 0
+    def test_user_adds_custom_tags(self, rag_manager):
+        """Test user can add custom tags without understanding auto-tagging"""
+        from core.ingest import FlatTagChunker
+        text = "Emergency procedures document."
+        # User adds custom tags
+        user_tags = ["important", "must-read"]
+        chunker = FlatTagChunker()
+        chunks = chunker.chunk_document(text, language="en", user_tags=user_tags)
+        # Verify user tags are included
+        if chunks:
+            chunk_tags = chunks[0].metadata.get('tags', [])
+            if isinstance(chunk_tags, str):
+                chunk_tags = [t.strip() for t in chunk_tags.split(',')]
+            chunk_tags_lower = [t.lower() for t in chunk_tags]
+            assert any(ut.lower() in chunk_tags_lower for ut in user_tags)
+    def test_session_persistence_for_user(self, session_manager):
+        """Test session persists across browser refresh for non-technical user"""
+        # User creates session (automatic)
+        session = session_manager.create_session(user_id="user_123")
+        session_id = session.session_id
+        # User refreshes browser - session should persist
+        retrieved = session_manager.get_session(session_id)
+        assert retrieved is not None
+        assert retrieved.session_id == session_id
+class TestRealWorldScenarios:
+    """Test real-world usage scenarios"""
+    def test_document_processing_workflow(self, rag_manager, sample_documents):
+        """Test complete document processing workflow"""
+        from core.ingest import FlatTagChunker
+        # User uploads multiple documents
+        all_chunks = []
+        for doc_name, doc_data in sample_documents.items():
+            chunker = FlatTagChunker()
+            chunks = chunker.chunk_document(
+                doc_data["content"],
+                language=doc_data["language"],
+                user_tags=None
+            )
+            all_chunks.extend(chunks)
+        # User builds index
+        if all_chunks:
+            rag_manager.vector_store.add_documents("documents", all_chunks)
+            stats = rag_manager.vector_store.get_collection_stats("documents")
+            assert stats["chunk_count"] > 0
+            assert stats["document_count"] > 0
+    def test_search_comparison_workflow(self, populated_rag_manager):
+        """Test comparing different search methods"""
+        query = "What are emergency procedures?"
+        # User compares methods
+        results = {}
+        results['base'] = populated_rag_manager.base_rag.retrieve(query, k=3)
+        results['tag'] = populated_rag_manager.tag_filter_rag.retrieve(
+            query, k=3, tags=["emergency"], tag_operator="OR"
+        )
+        results['hybrid'] = populated_rag_manager.hybrid_rag.retrieve(
+            query, k=3, tags=["emergency"], vector_weight=0.7, tag_weight=0.3
+        )
+        # All should return results
+        for method, result in results.items():
+            assert result is not None
+            assert result.latency > 0
+    def test_chat_interface_workflow(self, populated_rag_manager):
+        """Test natural conversation workflow"""
+        query = "Tell me about fire safety"
+        # User asks question
+        result = populated_rag_manager.base_rag.retrieve(query, k=3)
+        # User sees answer and sources
+        assert result is not None
+        assert result.content
+        assert len(result.sources) > 0

tests/test_ux.py ADDED Viewed

	@@ -0,0 +1,156 @@

+"""
+User Experience tests for Auto Tagging RAG System
+Tests UI workflows, user interactions, and usability
+"""
+import pytest
+from pathlib import Path
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from core.ingest import FlatTagChunker
+from core.utils import Chunk
+class TestUserWorkflows:
+    """Test user workflows and interactions"""
+    def test_document_upload_workflow(self, rag_manager, sample_documents):
+        """Test complete document upload workflow"""
+        doc_data = sample_documents["emergency"]
+        # Simulate upload workflow
+        chunker = FlatTagChunker()
+        chunks = chunker.chunk_document(
+            doc_data["content"],
+            language=doc_data["language"],
+            user_tags=None
+        )
+        assert len(chunks) > 0
+        # Index chunks
+        rag_manager.vector_store.add_documents("documents", chunks)
+        # Verify indexing
+        stats = rag_manager.vector_store.get_collection_stats("documents")
+        assert stats["chunk_count"] >= len(chunks)
+    def test_manual_tag_input(self, rag_manager, sample_documents):
+        """Test manual tag input during upload"""
+        doc_data = sample_documents["emergency"]
+        user_tags = ["custom-tag-1", "custom-tag-2"]
+        chunker = FlatTagChunker()
+        chunks = chunker.chunk_document(
+            doc_data["content"],
+            language=doc_data["language"],
+            user_tags=user_tags
+        )
+        # Check that user tags are included
+        if chunks:
+            chunk_tags = chunks[0].metadata.get('tags', [])
+            if isinstance(chunk_tags, str):
+                chunk_tags = [t.strip() for t in chunk_tags.split(',')]
+            # User tags should be present (may be lowercased)
+            chunk_tags_lower = [t.lower() for t in chunk_tags]
+            assert any(ut.lower() in chunk_tags_lower for ut in user_tags)
+    def test_search_workflow(self, populated_rag_manager):
+        """Test search workflow returns results"""
+        query = "What are emergency procedures?"
+        # Test all pipelines
+        pipelines = {
+            "base_rag": populated_rag_manager.base_rag,
+            "tag_filter_rag": populated_rag_manager.tag_filter_rag,
+            "hybrid_rag": populated_rag_manager.hybrid_rag,
+            "hybrid_rerank_rag": populated_rag_manager.hybrid_rerank_rag
+        }
+        for pipeline_name, pipeline in pipelines.items():
+            if pipeline_name == "tag_filter_rag":
+                result = pipeline.retrieve(query, k=3, tags=["emergency"], tag_operator="OR")
+            elif pipeline_name in ["hybrid_rag", "hybrid_rerank_rag"]:
+                result = pipeline.retrieve(query, k=3, tags=["emergency"], vector_weight=0.7, tag_weight=0.3)
+            else:
+                result = pipeline.retrieve(query, k=3)
+            assert result is not None
+            assert result.latency > 0
+    def test_evaluation_workflow(self, evaluator, populated_rag_manager, sample_queries):
+        """Test evaluation workflow produces results"""
+        df, summary, results = evaluator.batch_evaluate(
+            sample_queries[:2],  # Use 2 queries for speed
+            output_file=None,
+            pipelines=['base_rag']
+        )
+        assert df is not None
+        assert len(df) > 0
+        assert summary is not None
+        assert results is not None
+    def test_session_persistence(self, session_manager):
+        """Test session creation and retrieval"""
+        # Create session
+        session = session_manager.create_session(user_id="test_user")
+        session_id = session.session_id
+        # Retrieve session
+        retrieved = session_manager.get_session(session_id)
+        assert retrieved is not None
+        assert retrieved.session_id == session_id
+        assert retrieved.user_id == "test_user"
+class TestUserInterface:
+    """Test user interface components"""
+    def test_tag_visualization_format(self, rag_manager, sample_documents):
+        """Test tag visualization format"""
+        doc_data = sample_documents["emergency"]
+        chunker = FlatTagChunker()
+        chunks = chunker.chunk_document(
+            doc_data["content"],
+            language=doc_data["language"],
+            user_tags=None
+        )
+        # Extract tags for visualization
+        all_tags = []
+        for chunk in chunks:
+            tags = chunk.metadata.get('tags', [])
+            if isinstance(tags, str):
+                tags = [t.strip() for t in tags.split(',')]
+            all_tags.extend(tags)
+        # Tag visualization should be readable
+        assert len(all_tags) > 0
+        assert all(isinstance(tag, str) for tag in all_tags)
+    def test_document_count_display(self, rag_manager, sample_documents):
+        """Test document count accuracy"""
+        # Index documents
+        all_chunks = []
+        for doc_data in sample_documents.values():
+            chunker = FlatTagChunker()
+            chunks = chunker.chunk_document(
+                doc_data["content"],
+                language=doc_data["language"],
+                user_tags=None
+            )
+            all_chunks.extend(chunks)
+        if all_chunks:
+            rag_manager.vector_store.add_documents("documents", all_chunks)
+            stats = rag_manager.vector_store.get_collection_stats("documents")
+            # Document count should match number of unique documents
+            assert stats["document_count"] > 0
+            assert stats["document_count"] <= len(sample_documents)  # Should be <= number of documents