#!/usr/bin/env python3
"""
Test script to verify all dependencies and PDF handling capabilities.

The script runs three checks in order and prints a ✓/✗ line for each:

1. Core LangChain imports (required) - exits with status 1 on failure.
2. PDF backends: PyPDFLoader, PyMuPDF (fitz) and pdfplumber (optional) -
   missing backends are reported in the final summary but never change the
   exit status.
3. A small BM25 retrieval round trip (required) - exits with status 1 on
   failure.
"""

import os
import sys
import traceback

# Shown whenever an error message mentions the rank_bm25 package.
RANK_BM25_HINT = " → Missing dependency: pip install rank-bm25==0.2.2"

print("=== CogniChat Dependencies & PDF Handling Test ===")

# Test imports
try:
    print("\nTesting core imports...")
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.retrievers import BM25Retriever
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_core.documents import Document
    print("✓ Core LangChain imports successful!")
except ImportError as e:
    print(f"✗ Import error: {e}")
    if "rank_bm25" in str(e):
        print(RANK_BM25_HINT)
    sys.exit(1)

# Test PDF loading capabilities
# Every unavailable backend is recorded here so the final summary does not
# claim full success when PDF fallbacks are actually missing.
pdf_issues = []
try:
    print("\nTesting PDF loading capabilities...")

    # Test PyPDF availability
    try:
        from langchain_community.document_loaders import PyPDFLoader
        print("✓ PyPDFLoader available")
    except ImportError:
        print("✗ PyPDFLoader not available")
        pdf_issues.append("PyPDFLoader not available")

    # Test PyMuPDF availability
    try:
        import fitz
        print("✓ PyMuPDF (fitz) available - can handle corrupted PDFs")
    except ImportError:
        print("✗ PyMuPDF (fitz) not available")
        pdf_issues.append("PyMuPDF (fitz) not available")

    # Test pdfplumber availability
    try:
        import pdfplumber
        print("✓ pdfplumber available - additional PDF parsing method")
    except ImportError:
        print("✗ pdfplumber not available")
        pdf_issues.append("pdfplumber not available")
except Exception as e:
    # A backend import can fail with something other than ImportError; the
    # PDF backends are optional, so report it and keep going.
    print(f"✗ Error testing PDF capabilities: {e}")
    pdf_issues.append(f"error while probing PDF backends: {e}")

# Test BM25 Retriever
try:
    print("\nTesting BM25 Retriever...")

    # Create some test documents
    test_docs = [
        Document(page_content="This is the first test document about machine learning."),
        Document(page_content="This is the second document discussing natural language processing."),
        Document(page_content="The third document covers artificial intelligence topics."),
    ]

    # Create BM25 retriever
    bm25_retriever = BM25Retriever.from_documents(test_docs)
    bm25_retriever.k = 2

    # Test retrieval
    query = "machine learning"
    # invoke() is the supported retriever entry point; the older
    # get_relevant_documents() is deprecated in langchain-core.
    results = bm25_retriever.invoke(query)
    if not results:
        # An empty result means the round trip did not really work.
        raise RuntimeError("BM25 retriever returned no documents")

    print("✓ BM25 retriever created and tested successfully!")
    print(f"✓ Retrieved {len(results)} documents for query: '{query}'")
except Exception as e:
    print(f"✗ Error testing BM25 retriever: {e}")
    # A missing rank_bm25 package may only surface once the retriever is
    # built, so the install hint is repeated here.
    if "rank_bm25" in str(e):
        print(RANK_BM25_HINT)
    traceback.print_exc()
    sys.exit(1)

if pdf_issues:
    print("\n=== Required tests passed, but some PDF capabilities are unavailable ===")
    for issue in pdf_issues:
        print(f" • {issue}")
else:
    print("\n=== All tests completed successfully! ===")

print("\nThe application should now handle:")
print(" • Regular file uploads and processing")
print(" • Corrupted PDF files with multiple fallback methods")
print(" • BM25 and FAISS hybrid retrieval")
print(" • Proper error messages for failed file processing")
print("\nMake sure to install all dependencies with:")
print(" pip install -r requirements.txt")

# Display dependency summary
print("\n=== Key Dependencies Added/Updated ===")
print(" • rank-bm25==0.2.2 (for BM25 retrieval)")
print(" • pymupdf==1.23.26 (PDF fallback method)")
print(" • pdfplumber==0.10.3 (additional PDF parsing)")