Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Validation script for Advanced NLP Pipeline setup | |
| Run this script to verify that all components are properly installed and configured. | |
| """ | |
| import asyncio | |
| import sys | |
| import time | |
| import logging | |
| from typing import Dict, Any, List | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
| logger = logging.getLogger(__name__) | |
def check_dependencies() -> Dict[str, bool]:
    """Check if all required dependencies are installed.

    Tries to import each third-party package the pipeline relies on and
    records whether the import succeeded. Import failures are logged but
    never raised.

    Returns:
        Mapping of import name (e.g. 'sklearn') -> True if importable.
    """
    import importlib

    # import name -> human-readable name used in the log messages
    packages = {
        'spacy': 'spaCy',
        'sklearn': 'scikit-learn',
        'numpy': 'numpy',
        'sentence_transformers': 'sentence-transformers',
        'transformers': 'transformers',
        'torch': 'torch',
    }
    # Key order deliberately matches the original report order.
    dependencies = {name: False for name in packages}
    logger.info("Checking dependencies...")
    for module_name, display_name in packages.items():
        try:
            importlib.import_module(module_name)
            dependencies[module_name] = True
            logger.info(f"β {display_name} installed")
        except ImportError:
            logger.error(f"β {display_name} not installed")
    return dependencies
def check_spacy_model() -> bool:
    """Verify that the spaCy English model can be loaded.

    Returns:
        True when 'en_core_web_sm' loads successfully, False otherwise.
    """
    logger.info("Checking spaCy model...")
    try:
        import spacy
        spacy.load("en_core_web_sm")
        logger.info("β spaCy model 'en_core_web_sm' loaded successfully")
        return True
    except OSError:
        # spaCy itself imported, but the model package is missing.
        logger.error("β spaCy model 'en_core_web_sm' not found")
        logger.error(" Run: python -m spacy download en_core_web_sm")
        return False
    except Exception as e:
        logger.error(f"β Error loading spaCy model: {e}")
        return False
def check_sentence_transformer_model() -> bool:
    """Verify that the sentence-transformer embedding model loads.

    Returns:
        True when 'all-MiniLM-L6-v2' loads, False on any failure.
    """
    logger.info("Checking sentence transformer model...")
    try:
        from sentence_transformers import SentenceTransformer
        SentenceTransformer('all-MiniLM-L6-v2')
        logger.info("β Sentence transformer model 'all-MiniLM-L6-v2' loaded successfully")
        return True
    except Exception as e:
        # Failure here is non-fatal: the model can download on first use.
        logger.error(f"β Error loading sentence transformer model: {e}")
        logger.error(" Model will be downloaded on first use")
        return False
async def test_advanced_nlp_pipeline() -> bool:
    """Run one query through the full pipeline and sanity-check the result.

    Returns:
        True when the pipeline produced a result containing every
        expected top-level key, False on import/processing failure.
    """
    logger.info("Testing Advanced NLP Pipeline...")
    try:
        from app.services.advanced_nlp import advanced_nlp_pipeline

        test_query = "find a hair salon near me"
        started = time.time()
        result = await advanced_nlp_pipeline.process_query(test_query)
        elapsed = time.time() - started

        # Contract: the pipeline result must always carry these keys.
        expected_keys = ['query', 'primary_intent', 'entities', 'similar_services', 'search_parameters']
        missing_keys = [key for key in expected_keys if key not in result]
        if missing_keys:
            logger.error(f"β Missing keys in result: {missing_keys}")
            return False

        logger.info(f"β Advanced NLP Pipeline working (processed in {elapsed:.3f}s)")
        logger.info(f" Intent: {result['primary_intent']['intent']} (confidence: {result['primary_intent']['confidence']:.3f})")
        logger.info(f" Entities found: {len(result['entities'])}")
        logger.info(f" Similar services: {len(result['similar_services'])}")
        logger.info(f" Search parameters: {len(result['search_parameters'])}")
        return True
    except ImportError as e:
        logger.error(f"β Cannot import Advanced NLP Pipeline: {e}")
        return False
    except Exception as e:
        logger.error(f"β Error testing Advanced NLP Pipeline: {e}")
        return False
async def test_individual_components() -> Dict[str, bool]:
    """Exercise each NLP component in isolation.

    Each component is constructed and called once; any exception or
    malformed return value marks that component as failed without
    aborting the remaining checks.

    Returns:
        Mapping of component name -> True when it produced a sane result.
    """
    logger.info("Testing individual components...")
    outcome = dict.fromkeys(
        ('intent_classifier', 'entity_extractor', 'semantic_matcher', 'context_processor'),
        False,
    )
    try:
        from app.services.advanced_nlp import (
            IntentClassifier, BusinessEntityExtractor,
            SemanticMatcher, ContextAwareProcessor
        )

        # --- Intent Classifier ---
        try:
            intent, confidence = IntentClassifier().get_primary_intent("find a salon")
            if intent and confidence >= 0:
                outcome['intent_classifier'] = True
                logger.info(f"β Intent Classifier working (detected: {intent})")
            else:
                logger.error("β Intent Classifier returned invalid results")
        except Exception as e:
            logger.error(f"β Intent Classifier error: {e}")

        # --- Entity Extractor ---
        try:
            entities = BusinessEntityExtractor().extract_entities("luxury spa with parking")
            if isinstance(entities, dict):
                outcome['entity_extractor'] = True
                logger.info(f"β Entity Extractor working (found {len(entities)} entity types)")
            else:
                logger.error("β Entity Extractor returned invalid results")
        except Exception as e:
            logger.error(f"β Entity Extractor error: {e}")

        # --- Semantic Matcher ---
        try:
            matches = SemanticMatcher().find_similar_services("hair salon")
            if isinstance(matches, list):
                outcome['semantic_matcher'] = True
                logger.info(f"β Semantic Matcher working (found {len(matches)} matches)")
            else:
                logger.error("β Semantic Matcher returned invalid results")
        except Exception as e:
            logger.error(f"β Semantic Matcher error: {e}")

        # --- Context Processor ---
        try:
            context_result = await ContextAwareProcessor().process_with_context(
                "spa treatment", {}, [("spa", 0.9)]
            )
            if isinstance(context_result, dict):
                outcome['context_processor'] = True
                logger.info("β Context Processor working")
            else:
                logger.error("β Context Processor returned invalid results")
        except Exception as e:
            logger.error(f"β Context Processor error: {e}")
    except ImportError as e:
        logger.error(f"β Cannot import NLP components: {e}")
    return outcome
| def test_configuration() -> bool: | |
| """Test configuration loading""" | |
| logger.info("Testing configuration...") | |
| try: | |
| from app.config.nlp_config import nlp_config | |
| # Check if configuration is accessible | |
| config_dict = nlp_config.get_config_dict() | |
| if isinstance(config_dict, dict) and len(config_dict) > 0: | |
| logger.info("β Configuration loaded successfully") | |
| logger.info(f" Max workers: {nlp_config.ASYNC_PROCESSOR_MAX_WORKERS}") | |
| logger.info(f" Cache duration: {nlp_config.CACHE_DURATION_SECONDS}s") | |
| logger.info(f" Advanced NLP enabled: {nlp_config.ENABLE_ADVANCED_NLP}") | |
| return True | |
| else: | |
| logger.error("β Configuration is empty or invalid") | |
| return False | |
| except ImportError as e: | |
| logger.error(f"β Cannot import configuration: {e}") | |
| return False | |
| except Exception as e: | |
| logger.error(f"β Configuration error: {e}") | |
| return False | |
async def run_performance_benchmark() -> Dict[str, float]:
    """Time the pipeline over a small fixed set of queries.

    Per-query failures are logged and skipped; the average only covers
    successful queries.

    Returns:
        Dict with 'average_time', 'success_rate' and 'total_queries',
        or an empty dict when no query could be processed.
    """
    logger.info("Running performance benchmark...")
    test_queries = [
        "find a hair salon",
        "best spa near me",
        "gym with parking",
        "luxury massage therapy",
        "dental clinic open now"
    ]
    try:
        from app.services.advanced_nlp import advanced_nlp_pipeline

        elapsed_total = 0.0
        ok_count = 0
        for query in test_queries:
            try:
                tick = time.time()
                result = await advanced_nlp_pipeline.process_query(query)
                took = time.time() - tick
                if 'error' not in result:
                    elapsed_total += took
                    ok_count += 1
                    logger.info(f" '{query}' processed in {took:.3f}s")
                else:
                    logger.warning(f" '{query}' failed: {result.get('error', 'Unknown error')}")
            except Exception as e:
                logger.warning(f" '{query}' error: {e}")

        if ok_count == 0:
            logger.error("β No queries processed successfully")
            return {}

        avg_time = elapsed_total / ok_count
        logger.info("β Performance benchmark completed")
        logger.info(f" Average processing time: {avg_time:.3f}s")
        logger.info(f" Successful queries: {ok_count}/{len(test_queries)}")
        return {
            'average_time': avg_time,
            'success_rate': ok_count / len(test_queries),
            'total_queries': len(test_queries)
        }
    except Exception as e:
        logger.error(f"β Performance benchmark failed: {e}")
        return {}
def generate_report(
    dependencies: Dict[str, bool],
    spacy_model: bool,
    sentence_model: bool,
    pipeline_test: bool,
    component_tests: Dict[str, bool],
    config_test: bool,
    performance: Dict[str, float]
) -> None:
    """Print a comprehensive validation report to stdout.

    Args:
        dependencies: Import name -> installed? (from check_dependencies).
        spacy_model: Whether the spaCy model loaded.
        sentence_model: Whether the sentence-transformer model loaded.
        pipeline_test: Whether the end-to-end pipeline test passed.
        component_tests: Component name -> passed? (from test_individual_components).
        config_test: Whether configuration loading succeeded.
        performance: Benchmark stats ('average_time', 'success_rate',
            'total_queries'); empty dict when the benchmark failed.
    """
    # NOTE(review): the status glyphs below (e.g. "β", "π¦", "β’") look
    # mojibake'd — presumably check marks / emoji / bullets originally;
    # confirm the file's source encoding before changing them.
    print("\n" + "="*60)
    print("ADVANCED NLP PIPELINE VALIDATION REPORT")
    print("="*60)
    # Dependencies
    print("\nπ¦ DEPENDENCIES:")
    all_deps_ok = all(dependencies.values())
    for dep, status in dependencies.items():
        status_icon = "β" if status else "β"
        print(f" {status_icon} {dep}")
    print(f"\n Overall: {'β All dependencies installed' if all_deps_ok else 'β Missing dependencies'}")
    # Models
    print("\nπ€ MODELS:")
    print(f" {'β' if spacy_model else 'β'} spaCy model (en_core_web_sm)")
    print(f" {'β' if sentence_model else 'β'} Sentence transformer model")
    # Pipeline
    print("\nπ§ PIPELINE:")
    print(f" {'β' if pipeline_test else 'β'} Advanced NLP Pipeline")
    # Components
    print("\nβοΈ COMPONENTS:")
    for component, status in component_tests.items():
        status_icon = "β" if status else "β"
        component_name = component.replace('_', ' ').title()
        print(f" {status_icon} {component_name}")
    # Configuration
    print("\nβοΈ CONFIGURATION:")
    print(f" {'β' if config_test else 'β'} Configuration loading")
    # Performance (thresholds: <0.5s good, <1.0s acceptable, else slow)
    print("\nβ‘ PERFORMANCE:")
    if performance:
        print(f" Average processing time: {performance.get('average_time', 0):.3f}s")
        print(f" Success rate: {performance.get('success_rate', 0)*100:.1f}%")
        if performance.get('average_time', 0) < 0.5:
            print(" β Good performance")
        elif performance.get('average_time', 0) < 1.0:
            print(" β Acceptable performance")
        else:
            print(" β Slow performance - consider optimization")
    else:
        print(" β Performance test failed")
    # Overall Status — note sentence_model and performance do NOT appear
    # in the expression below; they are treated as advisory only.
    print("\n" + "="*60)
    overall_status = (
        all_deps_ok and spacy_model and pipeline_test and
        all(component_tests.values()) and config_test
    )
    if overall_status:
        print("π OVERALL STATUS: β READY FOR PRODUCTION")
        print("\nThe Advanced NLP Pipeline is properly installed and configured.")
        print("You can now use the enhanced natural language processing features.")
    else:
        print("β οΈ OVERALL STATUS: β ISSUES FOUND")
        print("\nPlease address the issues above before using the Advanced NLP Pipeline.")
        print("The system will fall back to basic processing until issues are resolved.")
    # Recommendations — each hint is emitted only when the matching check failed.
    print("\nπ RECOMMENDATIONS:")
    if not all_deps_ok:
        print(" β’ Install missing dependencies: pip install -r requirements.txt")
    if not spacy_model:
        print(" β’ Download spaCy model: python -m spacy download en_core_web_sm")
    if not sentence_model:
        print(" β’ Sentence transformer model will download automatically on first use")
    if performance and performance.get('average_time', 0) > 0.5:
        print(" β’ Consider increasing ASYNC_PROCESSOR_MAX_WORKERS for better performance")
        print(" β’ Enable caching with longer CACHE_DURATION_SECONDS")
    if not all(component_tests.values()):
        print(" β’ Check logs above for specific component errors")
    print("\n" + "="*60)
async def main():
    """Run every validation stage, print the report, and return an exit code.

    Returns:
        0 when all critical checks pass, 1 otherwise. The sentence-
        transformer model and the performance benchmark do not affect
        the exit code.
    """
    print("Starting Advanced NLP Pipeline validation...")
    print("This may take a few minutes on first run due to model downloads.\n")

    # Run each validation stage in sequence.
    deps = check_dependencies()
    has_spacy_model = check_spacy_model()
    has_sentence_model = check_sentence_transformer_model()
    pipeline_ok = await test_advanced_nlp_pipeline()
    components = await test_individual_components()
    config_ok = test_configuration()
    perf = await run_performance_benchmark()

    # Summarize everything for the user.
    generate_report(
        deps, has_spacy_model, has_sentence_model,
        pipeline_ok, components, config_ok, perf
    )

    passed = (
        all(deps.values()) and has_spacy_model and pipeline_ok and
        all(components.values()) and config_ok
    )
    return 0 if passed else 1
| if __name__ == "__main__": | |
| try: | |
| exit_code = asyncio.run(main()) | |
| sys.exit(exit_code) | |
| except KeyboardInterrupt: | |
| print("\n\nValidation interrupted by user.") | |
| sys.exit(1) | |
| except Exception as e: | |
| print(f"\n\nUnexpected error during validation: {e}") | |
| sys.exit(1) |