# File size: 3,169 Bytes
# 09498fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3
"""
Test script to verify all dependencies and PDF handling capabilities.
"""
import os
import sys

print("=== CogniChat Dependencies & PDF Handling Test ===")

# Test imports
# The BM25 check further down uses Document and BM25Retriever, so a failed
# import here ends the run with exit status 1 instead of continuing.
print("\nTesting core imports...")
try:
    # Only the imports live inside the try so that nothing else can be
    # mistaken for an import failure.
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.retrievers import BM25Retriever
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_core.documents import Document
except ImportError as e:
    print(f"✗ Import error: {e}")
    # Give an actionable hint when the error message names rank_bm25.
    if "rank_bm25" in str(e):
        print("  → Missing dependency: pip install rank-bm25==0.2.2")
    sys.exit(1)
else:
    print("✓ Core LangChain imports successful!")

# Test PDF loading capabilities
# Each backend below is imported only to see whether it is installed. The
# probes are independent: a missing package, or one that raises something
# other than ImportError while loading, is reported without skipping the
# remaining probes, and none of them aborts the script.
print("\nTesting PDF loading capabilities...")

# Test PyPDF availability
try:
    from langchain_community.document_loaders import PyPDFLoader  # noqa: F401
except ImportError:
    print("✗ PyPDFLoader not available")
except Exception as e:
    print(f"✗ Error testing PDF capabilities (PyPDFLoader): {e}")
else:
    print("✓ PyPDFLoader available")

# Test PyMuPDF availability
try:
    import fitz  # noqa: F401
except ImportError:
    print("✗ PyMuPDF (fitz) not available")
except Exception as e:
    print(f"✗ Error testing PDF capabilities (PyMuPDF): {e}")
else:
    print("✓ PyMuPDF (fitz) available - can handle corrupted PDFs")

# Test pdfplumber availability
try:
    import pdfplumber  # noqa: F401
except ImportError:
    print("✗ pdfplumber not available")
except Exception as e:
    print(f"✗ Error testing PDF capabilities (pdfplumber): {e}")
else:
    print("✓ pdfplumber available - additional PDF parsing method")

# Test BM25 Retriever
# Builds a three-document in-memory index and runs one query through it.
# Any failure is fatal: the error and traceback are printed and the script
# exits with status 1.
print("\nTesting BM25 Retriever...")
try:
    # Create some test documents
    test_docs = [
        Document(page_content="This is the first test document about machine learning."),
        Document(page_content="This is the second document discussing natural language processing."),
        Document(page_content="The third document covers artificial intelligence topics."),
    ]

    # Create BM25 retriever, limited to the two best matches
    bm25_retriever = BM25Retriever.from_documents(test_docs)
    bm25_retriever.k = 2

    # Test retrieval. invoke() replaces the deprecated
    # get_relevant_documents() entry point on LangChain retrievers.
    query = "machine learning"
    results = bm25_retriever.invoke(query)
except Exception as e:
    print(f"✗ Error testing BM25 retriever: {e}")
    # NOTE(review): rank_bm25 appears to be imported lazily when the retriever
    # is built, so a missing package would surface here rather than in the
    # import check at the top of the script - confirm against the installed
    # langchain_community version.
    if "rank_bm25" in str(e):
        print("  → Missing dependency: pip install rank-bm25==0.2.2")
    import traceback
    traceback.print_exc()
    sys.exit(1)
else:
    print("✓ BM25 retriever created and tested successfully!")
    print(f"✓ Retrieved {len(results)} documents for query: '{query}'")

# Closing summary. Execution only gets here when the core-import and BM25
# checks above did not call sys.exit(1); the PDF backend probes only report
# availability and never stop the run.
print("\n=== All tests completed successfully! ===")
print("\nThe application should now handle:")
for _capability in (
    "Regular file uploads and processing",
    "Corrupted PDF files with multiple fallback methods",
    "BM25 and FAISS hybrid retrieval",
    "Proper error messages for failed file processing",
):
    print(f"  • {_capability}")
print("\nMake sure to install all dependencies with:")
print("  pip install -r requirements.txt")

# Display dependency summary
print("\n=== Key Dependencies Added/Updated ===")
for _requirement, _purpose in (
    ("rank-bm25==0.2.2", "for BM25 retrieval"),
    ("pymupdf==1.23.26", "PDF fallback method"),
    ("pdfplumber==0.10.3", "additional PDF parsing"),
):
    print(f"  • {_requirement} ({_purpose})")