"""
Test script to verify all dependencies and PDF handling capabilities.

Checks, in order: the core LangChain imports, the optional PDF parsing
packages, and a small BM25Retriever query. The script exits with status 1
when the core imports or the BM25 check fail.
"""
import os
import sys

print("=== CogniChat Dependencies & PDF Handling Test ===")

# Core LangChain imports. BM25Retriever and Document are used by the BM25
# check further down; HuggingFaceEmbeddings and RecursiveCharacterTextSplitter
# are not used again in this script and are imported purely to confirm that
# they can be loaded.
try:
    print("\nTesting core imports...")
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.retrievers import BM25Retriever
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_core.documents import Document
    print("✅ Core LangChain imports successful!")

except ImportError as e:
    print(f"❌ Import error: {e}")
    # Give a concrete install command when the message names rank_bm25.
    if "rank_bm25" in str(e):
        print(" → Missing dependency: pip install rank-bm25==0.2.2")
    # Any failed core import is fatal: abort with a non-zero status.
    sys.exit(1)

# Optional PDF back-ends. Each package is probed in its own try/except so one
# missing package does not hide the status of the others; a missing package
# is only reported and never aborts the script.
try:
    print("\nTesting PDF loading capabilities...")

    try:
        from langchain_community.document_loaders import PyPDFLoader
        print("✅ PyPDFLoader available")
    except ImportError:
        print("❌ PyPDFLoader not available")

    try:
        import fitz
        print("✅ PyMuPDF (fitz) available - can handle corrupted PDFs")
    except ImportError:
        print("❌ PyMuPDF (fitz) not available")

    try:
        import pdfplumber
        print("✅ pdfplumber available - additional PDF parsing method")
    except ImportError:
        print("❌ pdfplumber not available")

except Exception as e:
    # Catches anything other than ImportError raised while probing (the inner
    # handlers only cover ImportError); the script continues afterwards.
    print(f"❌ Error testing PDF capabilities: {e}")

# BM25 smoke test: build a retriever over three tiny in-memory documents and
# run one query. Relies on BM25Retriever and Document imported earlier in
# this script. Any failure is fatal (exit status 1).
try:
    print("\nTesting BM25 Retriever...")

    test_docs = [
        Document(page_content="This is the first test document about machine learning."),
        Document(page_content="This is the second document discussing natural language processing."),
        Document(page_content="The third document covers artificial intelligence topics."),
    ]

    bm25_retriever = BM25Retriever.from_documents(test_docs)
    # Return at most two documents per query.
    bm25_retriever.k = 2

    query = "machine learning"
    # invoke() is the supported retriever entry point; get_relevant_documents()
    # is deprecated in langchain-core.
    results = bm25_retriever.invoke(query)
    print("✅ BM25 retriever created and tested successfully!")
    print(f"✅ Retrieved {len(results)} documents for query: '{query}'")

except Exception as e:
    print(f"❌ Error testing BM25 retriever: {e}")
    # NOTE(review): BM25Retriever appears to import rank_bm25 only when the
    # index is built, so a missing package would surface here rather than in
    # the import section -- hence the install hint is repeated.
    if "rank_bm25" in str(e):
        print(" → Missing dependency: pip install rank-bm25==0.2.2")
    import traceback
    traceback.print_exc()
    sys.exit(1)

# Final summary. Reached only when no earlier check called sys.exit().
print("\n=== All tests completed successfully! ===")
print("\nThe application should now handle:")
print(" • Regular file uploads and processing")
print(" • Corrupted PDF files with multiple fallback methods")
print(" • BM25 and FAISS hybrid retrieval")
print(" • Proper error messages for failed file processing")
print("\nMake sure to install all dependencies with:")
print(" pip install -r requirements.txt")

# Pinned versions of the packages the checks above look for.
print("\n=== Key Dependencies Added/Updated ===")
print(" • rank-bm25==0.2.2 (for BM25 retrieval)")
print(" • pymupdf==1.23.26 (PDF fallback method)")
print(" • pdfplumber==0.10.3 (additional PDF parsing)")