# bookmyservice-mhs / scripts / validate_nlp_setup.py
# NOTE(review): the lines above/below this header were web-page scrape
# artifacts (contributor caption "MukeshKapoor25's picture", commit message
# "feat(nlp): implement comprehensive advanced NLP pipeline for merchant
# search", commit hash 19aa29f) — converted to comments so the file parses.
#!/usr/bin/env python3
"""
Validation script for Advanced NLP Pipeline setup
Run this script to verify that all components are properly installed and configured.
"""
import asyncio
import sys
import time
import logging
from typing import Dict, Any, List
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def check_dependencies() -> Dict[str, bool]:
    """Check if all required dependencies are installed.

    Attempts to import each required third-party package and records the
    outcome, logging a ✓/✗ line per package as it goes.

    Returns:
        Mapping of import name (e.g. 'sklearn') to True if the module
        imports successfully, False otherwise. Key order matches the
        original hard-coded check order.
    """
    import importlib

    # Import name -> human-readable package name used in the log output.
    # Keeps the log messages identical to the previous per-package checks.
    packages = {
        'spacy': 'spaCy',
        'sklearn': 'scikit-learn',
        'numpy': 'numpy',
        'sentence_transformers': 'sentence-transformers',
        'transformers': 'transformers',
        'torch': 'torch',
    }
    dependencies: Dict[str, bool] = {}
    logger.info("Checking dependencies...")
    for module_name, display_name in packages.items():
        try:
            importlib.import_module(module_name)
            dependencies[module_name] = True
            logger.info(f"✓ {display_name} installed")
        except ImportError:
            dependencies[module_name] = False
            logger.error(f"✗ {display_name} not installed")
    return dependencies
def check_spacy_model() -> bool:
    """Report whether the spaCy model 'en_core_web_sm' can be loaded.

    Returns:
        True when the model loads; False otherwise (with remediation
        steps logged for the missing-model case).
    """
    logger.info("Checking spaCy model...")
    try:
        import spacy
        spacy.load("en_core_web_sm")
        logger.info("✓ spaCy model 'en_core_web_sm' loaded successfully")
        return True
    except OSError:
        # spaCy raises OSError when the model package is not installed.
        logger.error("✗ spaCy model 'en_core_web_sm' not found")
        logger.error(" Run: python -m spacy download en_core_web_sm")
    except Exception as e:
        logger.error(f"✗ Error loading spaCy model: {e}")
    return False
def check_sentence_transformer_model() -> bool:
    """Report whether the sentence transformer model 'all-MiniLM-L6-v2' loads.

    Returns:
        True on a successful load; False on any failure (a failure is
        non-fatal — the model can download on first use).
    """
    logger.info("Checking sentence transformer model...")
    try:
        from sentence_transformers import SentenceTransformer
        # Instantiation is what triggers the actual model load/download.
        SentenceTransformer('all-MiniLM-L6-v2')
        logger.info("✓ Sentence transformer model 'all-MiniLM-L6-v2' loaded successfully")
        return True
    except Exception as e:
        logger.error(f"✗ Error loading sentence transformer model: {e}")
        logger.error(" Model will be downloaded on first use")
        return False
async def test_advanced_nlp_pipeline() -> bool:
    """Smoke-test the full advanced NLP pipeline on a sample query.

    Returns:
        True when the pipeline produces a result containing every expected
        top-level key; False on import failure, processing error, or a
        malformed result.
    """
    logger.info("Testing Advanced NLP Pipeline...")
    try:
        from app.services.advanced_nlp import advanced_nlp_pipeline

        started = time.time()
        result = await advanced_nlp_pipeline.process_query("find a hair salon near me")
        elapsed = time.time() - started

        # Every key the rest of the system relies on must be present.
        expected = ('query', 'primary_intent', 'entities', 'similar_services', 'search_parameters')
        missing_keys = [key for key in expected if key not in result]
        if missing_keys:
            logger.error(f"✗ Missing keys in result: {missing_keys}")
            return False

        logger.info(f"✓ Advanced NLP Pipeline working (processed in {elapsed:.3f}s)")
        logger.info(f" Intent: {result['primary_intent']['intent']} (confidence: {result['primary_intent']['confidence']:.3f})")
        logger.info(f" Entities found: {len(result['entities'])}")
        logger.info(f" Similar services: {len(result['similar_services'])}")
        logger.info(f" Search parameters: {len(result['search_parameters'])}")
        return True
    except ImportError as e:
        logger.error(f"✗ Cannot import Advanced NLP Pipeline: {e}")
        return False
    except Exception as e:
        logger.error(f"✗ Error testing Advanced NLP Pipeline: {e}")
        return False
async def test_individual_components() -> Dict[str, bool]:
    """Exercise each NLP component in isolation.

    Returns:
        Mapping of component name to True when it instantiated and produced
        a result of the expected type, False otherwise. Every entry is
        False when the component module cannot be imported at all.
    """
    logger.info("Testing individual components...")
    outcomes: Dict[str, bool] = dict.fromkeys(
        ('intent_classifier', 'entity_extractor', 'semantic_matcher', 'context_processor'),
        False,
    )
    try:
        from app.services.advanced_nlp import (
            IntentClassifier, BusinessEntityExtractor,
            SemanticMatcher, ContextAwareProcessor
        )
    except ImportError as e:
        logger.error(f"✗ Cannot import NLP components: {e}")
        return outcomes

    # Intent classifier: must yield a non-empty intent with confidence >= 0.
    try:
        intent, confidence = IntentClassifier().get_primary_intent("find a salon")
        if intent and confidence >= 0:
            outcomes['intent_classifier'] = True
            logger.info(f"✓ Intent Classifier working (detected: {intent})")
        else:
            logger.error("✗ Intent Classifier returned invalid results")
    except Exception as e:
        logger.error(f"✗ Intent Classifier error: {e}")

    # Entity extractor: must return a dict keyed by entity type.
    try:
        found = BusinessEntityExtractor().extract_entities("luxury spa with parking")
        if isinstance(found, dict):
            outcomes['entity_extractor'] = True
            logger.info(f"✓ Entity Extractor working (found {len(found)} entity types)")
        else:
            logger.error("✗ Entity Extractor returned invalid results")
    except Exception as e:
        logger.error(f"✗ Entity Extractor error: {e}")

    # Semantic matcher: must return a list of matches.
    try:
        hits = SemanticMatcher().find_similar_services("hair salon")
        if isinstance(hits, list):
            outcomes['semantic_matcher'] = True
            logger.info(f"✓ Semantic Matcher working (found {len(hits)} matches)")
        else:
            logger.error("✗ Semantic Matcher returned invalid results")
    except Exception as e:
        logger.error(f"✗ Semantic Matcher error: {e}")

    # Context processor: async entry point, must return a dict.
    try:
        ctx = await ContextAwareProcessor().process_with_context(
            "spa treatment", {}, [("spa", 0.9)]
        )
        if isinstance(ctx, dict):
            outcomes['context_processor'] = True
            logger.info("✓ Context Processor working")
        else:
            logger.error("✗ Context Processor returned invalid results")
    except Exception as e:
        logger.error(f"✗ Context Processor error: {e}")

    return outcomes
def test_configuration() -> bool:
"""Test configuration loading"""
logger.info("Testing configuration...")
try:
from app.config.nlp_config import nlp_config
# Check if configuration is accessible
config_dict = nlp_config.get_config_dict()
if isinstance(config_dict, dict) and len(config_dict) > 0:
logger.info("βœ“ Configuration loaded successfully")
logger.info(f" Max workers: {nlp_config.ASYNC_PROCESSOR_MAX_WORKERS}")
logger.info(f" Cache duration: {nlp_config.CACHE_DURATION_SECONDS}s")
logger.info(f" Advanced NLP enabled: {nlp_config.ENABLE_ADVANCED_NLP}")
return True
else:
logger.error("βœ— Configuration is empty or invalid")
return False
except ImportError as e:
logger.error(f"βœ— Cannot import configuration: {e}")
return False
except Exception as e:
logger.error(f"βœ— Configuration error: {e}")
return False
async def run_performance_benchmark() -> Dict[str, float]:
    """Benchmark the pipeline over a handful of sample queries.

    Returns:
        Dict with 'average_time', 'success_rate' and 'total_queries', or
        an empty dict when the pipeline is unavailable or no query
        succeeds.
    """
    logger.info("Running performance benchmark...")
    sample_queries = [
        "find a hair salon",
        "best spa near me",
        "gym with parking",
        "luxury massage therapy",
        "dental clinic open now",
    ]
    try:
        from app.services.advanced_nlp import advanced_nlp_pipeline

        elapsed_total = 0.0
        ok_count = 0
        for query in sample_queries:
            try:
                started = time.time()
                outcome = await advanced_nlp_pipeline.process_query(query)
                duration = time.time() - started
                if 'error' in outcome:
                    # Pipeline signalled a per-query failure in-band.
                    logger.warning(f" '{query}' failed: {outcome.get('error', 'Unknown error')}")
                else:
                    elapsed_total += duration
                    ok_count += 1
                    logger.info(f" '{query}' processed in {duration:.3f}s")
            except Exception as e:
                logger.warning(f" '{query}' error: {e}")

        if not ok_count:
            logger.error("✗ No queries processed successfully")
            return {}

        mean_time = elapsed_total / ok_count
        logger.info("✓ Performance benchmark completed")
        logger.info(f" Average processing time: {mean_time:.3f}s")
        logger.info(f" Successful queries: {ok_count}/{len(sample_queries)}")
        return {
            'average_time': mean_time,
            'success_rate': ok_count / len(sample_queries),
            'total_queries': len(sample_queries)
        }
    except Exception as e:
        logger.error(f"✗ Performance benchmark failed: {e}")
        return {}
def generate_report(
    dependencies: Dict[str, bool],
    spacy_model: bool,
    sentence_model: bool,
    pipeline_test: bool,
    component_tests: Dict[str, bool],
    config_test: bool,
    performance: Dict[str, float]
) -> None:
    """Generate a comprehensive validation report.

    Prints a human-readable, emoji-annotated summary of every check to
    stdout: dependencies, models, pipeline, components, configuration,
    performance, an overall status, and targeted recommendations.

    Args:
        dependencies: Import name -> installed flag, from check_dependencies().
        spacy_model: Result of check_spacy_model().
        sentence_model: Result of check_sentence_transformer_model().
        pipeline_test: Result of test_advanced_nlp_pipeline().
        component_tests: Component name -> pass flag, from
            test_individual_components().
        config_test: Result of test_configuration().
        performance: Metrics dict from run_performance_benchmark(); may be
            empty when the benchmark failed.
    """
    print("\n" + "="*60)
    print("ADVANCED NLP PIPELINE VALIDATION REPORT")
    print("="*60)
    # Dependencies
    print("\n📦 DEPENDENCIES:")
    all_deps_ok = all(dependencies.values())
    for dep, status in dependencies.items():
        status_icon = "✓" if status else "✗"
        print(f" {status_icon} {dep}")
    print(f"\n Overall: {'✓ All dependencies installed' if all_deps_ok else '✗ Missing dependencies'}")
    # Models
    print("\n🤖 MODELS:")
    print(f" {'✓' if spacy_model else '✗'} spaCy model (en_core_web_sm)")
    print(f" {'✓' if sentence_model else '✗'} Sentence transformer model")
    # Pipeline
    print("\n🔧 PIPELINE:")
    print(f" {'✓' if pipeline_test else '✗'} Advanced NLP Pipeline")
    # Components
    print("\n⚙️ COMPONENTS:")
    for component, status in component_tests.items():
        status_icon = "✓" if status else "✗"
        component_name = component.replace('_', ' ').title()
        print(f" {status_icon} {component_name}")
    # Configuration
    print("\n⚙️ CONFIGURATION:")
    print(f" {'✓' if config_test else '✗'} Configuration loading")
    # Performance
    print("\n⚡ PERFORMANCE:")
    if performance:
        print(f" Average processing time: {performance.get('average_time', 0):.3f}s")
        print(f" Success rate: {performance.get('success_rate', 0)*100:.1f}%")
        # Thresholds: <0.5s good, <1.0s acceptable, otherwise flag as slow.
        if performance.get('average_time', 0) < 0.5:
            print(" ✓ Good performance")
        elif performance.get('average_time', 0) < 1.0:
            print(" ⚠ Acceptable performance")
        else:
            print(" ✗ Slow performance - consider optimization")
    else:
        print(" ✗ Performance test failed")
    # Overall Status
    print("\n" + "="*60)
    # NOTE(review): sentence_model does not gate the overall status —
    # presumably because the model auto-downloads on first use (see the
    # recommendation below); confirm this is intentional.
    overall_status = (
        all_deps_ok and spacy_model and pipeline_test and
        all(component_tests.values()) and config_test
    )
    if overall_status:
        print("🎉 OVERALL STATUS: ✓ READY FOR PRODUCTION")
        print("\nThe Advanced NLP Pipeline is properly installed and configured.")
        print("You can now use the enhanced natural language processing features.")
    else:
        print("⚠️ OVERALL STATUS: ✗ ISSUES FOUND")
        print("\nPlease address the issues above before using the Advanced NLP Pipeline.")
        print("The system will fall back to basic processing until issues are resolved.")
    # Recommendations — each line targets one specific failed check.
    print("\n📋 RECOMMENDATIONS:")
    if not all_deps_ok:
        print(" • Install missing dependencies: pip install -r requirements.txt")
    if not spacy_model:
        print(" • Download spaCy model: python -m spacy download en_core_web_sm")
    if not sentence_model:
        print(" • Sentence transformer model will download automatically on first use")
    if performance and performance.get('average_time', 0) > 0.5:
        print(" • Consider increasing ASYNC_PROCESSOR_MAX_WORKERS for better performance")
        print(" • Enable caching with longer CACHE_DURATION_SECONDS")
    if not all(component_tests.values()):
        print(" • Check logs above for specific component errors")
    print("\n" + "="*60)
async def main():
    """Run every validation check, print the report, and return an exit code.

    Returns:
        0 when all critical checks pass, 1 otherwise.
    """
    print("Starting Advanced NLP Pipeline validation...")
    print("This may take a few minutes on first run due to model downloads.\n")

    # Run all validation checks in sequence.
    deps = check_dependencies()
    spacy_ok = check_spacy_model()
    sentence_ok = check_sentence_transformer_model()
    pipeline_ok = await test_advanced_nlp_pipeline()
    components = await test_individual_components()
    config_ok = test_configuration()
    perf = await run_performance_benchmark()

    # Print the comprehensive human-readable report.
    generate_report(
        deps, spacy_ok, sentence_ok,
        pipeline_ok, components, config_ok, perf
    )

    # Sentence-transformer availability and benchmark results are advisory
    # only; they do not affect the exit code.
    success = (
        all(deps.values())
        and spacy_ok
        and pipeline_ok
        and all(components.values())
        and config_ok
    )
    return 0 if success else 1
if __name__ == "__main__":
    # Script entry point: run the async validator and exit with its code.
    try:
        sys.exit(asyncio.run(main()))
    except KeyboardInterrupt:
        # Ctrl-C during a (possibly long) model download or check.
        print("\n\nValidation interrupted by user.")
        sys.exit(1)
    except Exception as e:
        print(f"\n\nUnexpected error during validation: {e}")
        sys.exit(1)