# bookmyservice-mhs / scripts / validate_nlp_setup.py
# NOTE(review): the lines above/below this header were web-page scrape
# artifacts (contributor caption "MukeshKapoor25's picture", commit message
# "feat(nlp): implement comprehensive advanced NLP pipeline for merchant
# search", commit hash 19aa29f) — converted to comments so the file parses.
#!/usr/bin/env python3
"""
Validation script for Advanced NLP Pipeline setup
Run this script to verify that all components are properly installed and configured.
"""
import asyncio
import sys
import time
import logging
from typing import Dict, Any, List
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def check_dependencies() -> Dict[str, bool]:
    """Check if all required dependencies are installed.

    Attempts to import each required third-party package and records the
    outcome, logging a ✓/✗ line per package as it goes.

    Returns:
        Mapping of import name (e.g. 'sklearn') to True if the module
        imports successfully, False otherwise. Key order matches the
        original hard-coded check order.
    """
    import importlib

    # Import name -> human-readable package name used in the log output.
    # Keeps the log messages identical to the previous per-package checks.
    packages = {
        'spacy': 'spaCy',
        'sklearn': 'scikit-learn',
        'numpy': 'numpy',
        'sentence_transformers': 'sentence-transformers',
        'transformers': 'transformers',
        'torch': 'torch',
    }
    dependencies: Dict[str, bool] = {}
    logger.info("Checking dependencies...")
    for module_name, display_name in packages.items():
        try:
            importlib.import_module(module_name)
            dependencies[module_name] = True
            logger.info(f"✓ {display_name} installed")
        except ImportError:
            dependencies[module_name] = False
            logger.error(f"✗ {display_name} not installed")
    return dependencies
def check_spacy_model() -> bool:
    """Report whether the spaCy model 'en_core_web_sm' can be loaded.

    Returns:
        True when the model loads; False otherwise (with remediation
        steps logged for the missing-model case).
    """
    logger.info("Checking spaCy model...")
    try:
        import spacy
        spacy.load("en_core_web_sm")
        logger.info("✓ spaCy model 'en_core_web_sm' loaded successfully")
        return True
    except OSError:
        # spaCy raises OSError when the model package is not installed.
        logger.error("✗ spaCy model 'en_core_web_sm' not found")
        logger.error(" Run: python -m spacy download en_core_web_sm")
    except Exception as e:
        logger.error(f"✗ Error loading spaCy model: {e}")
    return False
def check_sentence_transformer_model() -> bool:
    """Report whether the sentence transformer model 'all-MiniLM-L6-v2' loads.

    Returns:
        True on a successful load; False on any failure (a failure is
        non-fatal — the model can download on first use).
    """
    logger.info("Checking sentence transformer model...")
    try:
        from sentence_transformers import SentenceTransformer
        # Instantiation is what triggers the actual model load/download.
        SentenceTransformer('all-MiniLM-L6-v2')
        logger.info("✓ Sentence transformer model 'all-MiniLM-L6-v2' loaded successfully")
        return True
    except Exception as e:
        logger.error(f"✗ Error loading sentence transformer model: {e}")
        logger.error(" Model will be downloaded on first use")
        return False
async def test_advanced_nlp_pipeline() -> bool:
    """Smoke-test the full advanced NLP pipeline on a sample query.

    Returns:
        True when the pipeline produces a result containing every expected
        top-level key; False on import failure, processing error, or a
        malformed result.
    """
    logger.info("Testing Advanced NLP Pipeline...")
    try:
        from app.services.advanced_nlp import advanced_nlp_pipeline

        started = time.time()
        result = await advanced_nlp_pipeline.process_query("find a hair salon near me")
        elapsed = time.time() - started

        # Every key the rest of the system relies on must be present.
        expected = ('query', 'primary_intent', 'entities', 'similar_services', 'search_parameters')
        missing_keys = [key for key in expected if key not in result]
        if missing_keys:
            logger.error(f"✗ Missing keys in result: {missing_keys}")
            return False

        logger.info(f"✓ Advanced NLP Pipeline working (processed in {elapsed:.3f}s)")
        logger.info(f" Intent: {result['primary_intent']['intent']} (confidence: {result['primary_intent']['confidence']:.3f})")
        logger.info(f" Entities found: {len(result['entities'])}")
        logger.info(f" Similar services: {len(result['similar_services'])}")
        logger.info(f" Search parameters: {len(result['search_parameters'])}")
        return True
    except ImportError as e:
        logger.error(f"✗ Cannot import Advanced NLP Pipeline: {e}")
        return False
    except Exception as e:
        logger.error(f"✗ Error testing Advanced NLP Pipeline: {e}")
        return False
async def test_individual_components() -> Dict[str, bool]:
    """Exercise each NLP component in isolation.

    Returns:
        Mapping of component name to True when it instantiated and produced
        a result of the expected type, False otherwise. Every entry is
        False when the component module cannot be imported at all.
    """
    logger.info("Testing individual components...")
    outcomes: Dict[str, bool] = dict.fromkeys(
        ('intent_classifier', 'entity_extractor', 'semantic_matcher', 'context_processor'),
        False,
    )
    try:
        from app.services.advanced_nlp import (
            IntentClassifier, BusinessEntityExtractor,
            SemanticMatcher, ContextAwareProcessor
        )
    except ImportError as e:
        logger.error(f"✗ Cannot import NLP components: {e}")
        return outcomes

    # Intent classifier: must yield a non-empty intent with confidence >= 0.
    try:
        intent, confidence = IntentClassifier().get_primary_intent("find a salon")
        if intent and confidence >= 0:
            outcomes['intent_classifier'] = True
            logger.info(f"✓ Intent Classifier working (detected: {intent})")
        else:
            logger.error("✗ Intent Classifier returned invalid results")
    except Exception as e:
        logger.error(f"✗ Intent Classifier error: {e}")

    # Entity extractor: must return a dict keyed by entity type.
    try:
        found = BusinessEntityExtractor().extract_entities("luxury spa with parking")
        if isinstance(found, dict):
            outcomes['entity_extractor'] = True
            logger.info(f"✓ Entity Extractor working (found {len(found)} entity types)")
        else:
            logger.error("✗ Entity Extractor returned invalid results")
    except Exception as e:
        logger.error(f"✗ Entity Extractor error: {e}")

    # Semantic matcher: must return a list of matches.
    try:
        hits = SemanticMatcher().find_similar_services("hair salon")
        if isinstance(hits, list):
            outcomes['semantic_matcher'] = True
            logger.info(f"✓ Semantic Matcher working (found {len(hits)} matches)")
        else:
            logger.error("✗ Semantic Matcher returned invalid results")
    except Exception as e:
        logger.error(f"✗ Semantic Matcher error: {e}")

    # Context processor: async entry point, must return a dict.
    try:
        ctx = await ContextAwareProcessor().process_with_context(
            "spa treatment", {}, [("spa", 0.9)]
        )
        if isinstance(ctx, dict):
            outcomes['context_processor'] = True
            logger.info("✓ Context Processor working")
        else:
            logger.error("✗ Context Processor returned invalid results")
    except Exception as e:
        logger.error(f"✗ Context Processor error: {e}")

    return outcomes
def test_configuration() -> bool:
"""Test configuration loading"""
logger.info("Testing configuration...")
try:
from app.config.nlp_config import nlp_config
# Check if configuration is accessible
config_dict = nlp_config.get_config_dict()
if isinstance(config_dict, dict) and len(config_dict) > 0:
logger.info("βœ“ Configuration loaded successfully")
logger.info(f" Max workers: {nlp_config.ASYNC_PROCESSOR_MAX_WORKERS}")
logger.info(f" Cache duration: {nlp_config.CACHE_DURATION_SECONDS}s")
logger.info(f" Advanced NLP enabled: {nlp_config.ENABLE_ADVANCED_NLP}")
return True
else:
logger.error("βœ— Configuration is empty or invalid")
return False
except ImportError as e:
logger.error(f"βœ— Cannot import configuration: {e}")
return False
except Exception as e:
logger.error(f"βœ— Configuration error: {e}")
return False
async def run_performance_benchmark() -> Dict[str, float]:
    """Benchmark the pipeline over a handful of sample queries.

    Returns:
        Dict with 'average_time', 'success_rate' and 'total_queries', or
        an empty dict when the pipeline is unavailable or no query
        succeeds.
    """
    logger.info("Running performance benchmark...")
    sample_queries = [
        "find a hair salon",
        "best spa near me",
        "gym with parking",
        "luxury massage therapy",
        "dental clinic open now",
    ]
    try:
        from app.services.advanced_nlp import advanced_nlp_pipeline

        elapsed_total = 0.0
        ok_count = 0
        for query in sample_queries:
            try:
                started = time.time()
                outcome = await advanced_nlp_pipeline.process_query(query)
                duration = time.time() - started
                if 'error' in outcome:
                    # Pipeline signalled a per-query failure in-band.
                    logger.warning(f" '{query}' failed: {outcome.get('error', 'Unknown error')}")
                else:
                    elapsed_total += duration
                    ok_count += 1
                    logger.info(f" '{query}' processed in {duration:.3f}s")
            except Exception as e:
                logger.warning(f" '{query}' error: {e}")

        if not ok_count:
            logger.error("✗ No queries processed successfully")
            return {}

        mean_time = elapsed_total / ok_count
        logger.info("✓ Performance benchmark completed")
        logger.info(f" Average processing time: {mean_time:.3f}s")
        logger.info(f" Successful queries: {ok_count}/{len(sample_queries)}")
        return {
            'average_time': mean_time,
            'success_rate': ok_count / len(sample_queries),
            'total_queries': len(sample_queries)
        }
    except Exception as e:
        logger.error(f"✗ Performance benchmark failed: {e}")
        return {}
def generate_report(
    dependencies: Dict[str, bool],
    spacy_model: bool,
    sentence_model: bool,
    pipeline_test: bool,
    component_tests: Dict[str, bool],
    config_test: bool,
    performance: Dict[str, float]
) -> None:
    """Generate a comprehensive validation report.

    Prints a human-readable, emoji-annotated summary of every check to
    stdout: dependencies, models, pipeline, components, configuration,
    performance, an overall status, and targeted recommendations.

    Args:
        dependencies: Import name -> installed flag, from check_dependencies().
        spacy_model: Result of check_spacy_model().
        sentence_model: Result of check_sentence_transformer_model().
        pipeline_test: Result of test_advanced_nlp_pipeline().
        component_tests: Component name -> pass flag, from
            test_individual_components().
        config_test: Result of test_configuration().
        performance: Metrics dict from run_performance_benchmark(); may be
            empty when the benchmark failed.
    """
    print("\n" + "="*60)
    print("ADVANCED NLP PIPELINE VALIDATION REPORT")
    print("="*60)
    # Dependencies
    print("\n📦 DEPENDENCIES:")
    all_deps_ok = all(dependencies.values())
    for dep, status in dependencies.items():
        status_icon = "✓" if status else "✗"
        print(f" {status_icon} {dep}")
    print(f"\n Overall: {'✓ All dependencies installed' if all_deps_ok else '✗ Missing dependencies'}")
    # Models
    print("\n🤖 MODELS:")
    print(f" {'✓' if spacy_model else '✗'} spaCy model (en_core_web_sm)")
    print(f" {'✓' if sentence_model else '✗'} Sentence transformer model")
    # Pipeline
    print("\n🔧 PIPELINE:")
    print(f" {'✓' if pipeline_test else '✗'} Advanced NLP Pipeline")
    # Components
    print("\n⚙️ COMPONENTS:")
    for component, status in component_tests.items():
        status_icon = "✓" if status else "✗"
        component_name = component.replace('_', ' ').title()
        print(f" {status_icon} {component_name}")
    # Configuration
    print("\n⚙️ CONFIGURATION:")
    print(f" {'✓' if config_test else '✗'} Configuration loading")
    # Performance
    print("\n⚡ PERFORMANCE:")
    if performance:
        print(f" Average processing time: {performance.get('average_time', 0):.3f}s")
        print(f" Success rate: {performance.get('success_rate', 0)*100:.1f}%")
        # Thresholds: <0.5s good, <1.0s acceptable, otherwise flag as slow.
        if performance.get('average_time', 0) < 0.5:
            print(" ✓ Good performance")
        elif performance.get('average_time', 0) < 1.0:
            print(" ⚠ Acceptable performance")
        else:
            print(" ✗ Slow performance - consider optimization")
    else:
        print(" ✗ Performance test failed")
    # Overall Status
    print("\n" + "="*60)
    # NOTE(review): sentence_model does not gate the overall status —
    # presumably because the model auto-downloads on first use (see the
    # recommendation below); confirm this is intentional.
    overall_status = (
        all_deps_ok and spacy_model and pipeline_test and
        all(component_tests.values()) and config_test
    )
    if overall_status:
        print("🎉 OVERALL STATUS: ✓ READY FOR PRODUCTION")
        print("\nThe Advanced NLP Pipeline is properly installed and configured.")
        print("You can now use the enhanced natural language processing features.")
    else:
        print("⚠️ OVERALL STATUS: ✗ ISSUES FOUND")
        print("\nPlease address the issues above before using the Advanced NLP Pipeline.")
        print("The system will fall back to basic processing until issues are resolved.")
    # Recommendations — each line targets one specific failed check.
    print("\n📋 RECOMMENDATIONS:")
    if not all_deps_ok:
        print(" • Install missing dependencies: pip install -r requirements.txt")
    if not spacy_model:
        print(" • Download spaCy model: python -m spacy download en_core_web_sm")
    if not sentence_model:
        print(" • Sentence transformer model will download automatically on first use")
    if performance and performance.get('average_time', 0) > 0.5:
        print(" • Consider increasing ASYNC_PROCESSOR_MAX_WORKERS for better performance")
        print(" • Enable caching with longer CACHE_DURATION_SECONDS")
    if not all(component_tests.values()):
        print(" • Check logs above for specific component errors")
    print("\n" + "="*60)
async def main():
    """Run every validation check, print the report, and return an exit code.

    Returns:
        0 when all critical checks pass, 1 otherwise.
    """
    print("Starting Advanced NLP Pipeline validation...")
    print("This may take a few minutes on first run due to model downloads.\n")

    # Run all validation checks in sequence.
    deps = check_dependencies()
    spacy_ok = check_spacy_model()
    sentence_ok = check_sentence_transformer_model()
    pipeline_ok = await test_advanced_nlp_pipeline()
    components = await test_individual_components()
    config_ok = test_configuration()
    perf = await run_performance_benchmark()

    # Print the comprehensive human-readable report.
    generate_report(
        deps, spacy_ok, sentence_ok,
        pipeline_ok, components, config_ok, perf
    )

    # Sentence-transformer availability and benchmark results are advisory
    # only; they do not affect the exit code.
    success = (
        all(deps.values())
        and spacy_ok
        and pipeline_ok
        and all(components.values())
        and config_ok
    )
    return 0 if success else 1
if __name__ == "__main__":
    # Script entry point: run the async validator and exit with its code.
    try:
        sys.exit(asyncio.run(main()))
    except KeyboardInterrupt:
        # Ctrl-C during a (possibly long) model download or check.
        print("\n\nValidation interrupted by user.")
        sys.exit(1)
    except Exception as e:
        print(f"\n\nUnexpected error during validation: {e}")
        sys.exit(1)