cognichat / test_dependencies.py
HYPERXD
fix
09498fa
#!/usr/bin/env python3
"""
Test script to verify all dependencies and PDF handling capabilities.
"""
import os
import sys
print("=== CogniChat Dependencies & PDF Handling Test ===")

# --- Core imports -------------------------------------------------------------
# These packages are hard requirements: if any of them cannot be imported the
# script reports the error and exits with status 1. HuggingFaceEmbeddings and
# RecursiveCharacterTextSplitter are imported only to prove they are
# installed; Document and BM25Retriever are used again by the BM25 check later
# in this script.
try:
    print("\nTesting core imports...")
    from langchain_community.embeddings import HuggingFaceEmbeddings  # noqa: F401
    from langchain_community.retrievers import BM25Retriever
    from langchain.text_splitter import RecursiveCharacterTextSplitter  # noqa: F401
    from langchain_core.documents import Document
    print("✓ Core LangChain imports successful!")
except ImportError as e:
    print(f"✗ Import error: {e}")
    # Give a targeted install hint when the failure mentions the BM25 backend.
    if "rank_bm25" in str(e):
        print(" → Missing dependency: pip install rank-bm25==0.2.2")
    sys.exit(1)
# --- PDF loading capabilities -------------------------------------------------
# Every PDF backend is optional: a missing one is reported but never aborts the
# script. The imported names are deliberately unused -- a successful import is
# the whole check.
try:
    print("\nTesting PDF loading capabilities...")

    # LangChain's PyPDF-based document loader.
    try:
        from langchain_community.document_loaders import PyPDFLoader  # noqa: F401
        print("✓ PyPDFLoader available")
    except ImportError:
        print("✗ PyPDFLoader not available")

    # PyMuPDF, which is imported under the module name `fitz`.
    try:
        import fitz  # noqa: F401
        print("✓ PyMuPDF (fitz) available - can handle corrupted PDFs")
    except ImportError:
        print("✗ PyMuPDF (fitz) not available")

    # pdfplumber.
    try:
        import pdfplumber  # noqa: F401
        print("✓ pdfplumber available - additional PDF parsing method")
    except ImportError:
        print("✗ pdfplumber not available")
except Exception as e:
    # Reached only when an import raises something other than ImportError;
    # the backends after the failing one are then not probed.
    print(f"✗ Error testing PDF capabilities: {e}")
# --- BM25 retriever smoke test ------------------------------------------------
# Builds a retriever over three tiny in-memory documents and runs one query.
# Any failure prints the error plus a traceback and exits with status 1.
try:
    print("\nTesting BM25 Retriever...")

    # Small fixed corpus; no files or network access are needed.
    test_docs = [
        Document(page_content="This is the first test document about machine learning."),
        Document(page_content="This is the second document discussing natural language processing."),
        Document(page_content="The third document covers artificial intelligence topics."),
    ]

    # Build the retriever and set its `k` field to 2.
    bm25_retriever = BM25Retriever.from_documents(test_docs)
    bm25_retriever.k = 2

    # Run a single query. `invoke()` is the supported retriever entry point;
    # `get_relevant_documents()` is deprecated in current LangChain releases.
    query = "machine learning"
    results = bm25_retriever.invoke(query)

    print("✓ BM25 retriever created and tested successfully!")
    print(f"✓ Retrieved {len(results)} documents for query: '{query}'")
except Exception as e:
    print(f"✗ Error testing BM25 retriever: {e}")
    # Imported here because the traceback is only needed on the failure path.
    import traceback
    traceback.print_exc()
    sys.exit(1)
# --- Final summary ------------------------------------------------------------
# Reached only when the core-import and BM25 checks did not exit early. The PDF
# backend checks are informational and do not prevent this summary.
print("\n=== All tests completed successfully! ===")
print("\nThe application should now handle:")
print(" • Regular file uploads and processing")
print(" • Corrupted PDF files with multiple fallback methods")
print(" • BM25 and FAISS hybrid retrieval")
print(" • Proper error messages for failed file processing")
print("\nMake sure to install all dependencies with:")
print(" pip install -r requirements.txt")

# Display dependency summary
print("\n=== Key Dependencies Added/Updated ===")
print(" • rank-bm25==0.2.2 (for BM25 retrieval)")
print(" • pymupdf==1.23.26 (PDF fallback method)")
print(" • pdfplumber==0.10.3 (additional PDF parsing)")