""" Test suite for PDF Processor module Tests text extraction, sentence segmentation, and LLM refinement """ import pytest import tempfile import os from pathlib import Path from utility.pdf_processor import PDFProcessor class TestPDFProcessor: """Test cases for PDFProcessor""" @pytest.fixture def processor(self): """Create a PDFProcessor instance""" return PDFProcessor() def test_initialization(self, processor): """Test PDFProcessor initialization""" assert processor is not None assert processor.llm_client is not None def test_clean_text(self, processor): """Test text cleaning""" dirty_text = "यो एक\n\nपरीक्षण है। \n \nधन्यवाद।" cleaned = processor.clean_text(dirty_text) assert "\n" not in cleaned assert " " not in cleaned assert cleaned == "यो एक परीक्षण है। धन्यवाद।" def test_split_into_sentences_with_danda(self, processor): """Test Nepali sentence splitting with danda""" text = "यो पहिलो वाक्य है। दोस्रो वाक्य छ। तेस्रो वाक्य छ।" sentences = processor.split_into_sentences(text) assert len(sentences) >= 3 assert "पहिलो" in sentences[0] assert "दोस्रो" in sentences[1] def test_split_into_sentences_empty(self, processor): """Test with empty text""" sentences = processor.split_into_sentences("") assert sentences == [] def test_split_into_sentences_short_text(self, processor): """Test with text shorter than minimum length""" text = "छ। छ।" # Too short fragments sentences = processor.split_into_sentences(text) assert len(sentences) == 0 def test_process_pdf_from_bytes(self, processor): """Test processing PDF from bytes""" # This requires a valid PDF file # For CI/CD, we can skip this if no PDF available pytest.skip("Requires actual PDF file") def test_process_pdf_nonexistent_file(self, processor): """Test error handling for non-existent file""" with pytest.raises(FileNotFoundError): processor.process_pdf(pdf_path="nonexistent.pdf") def test_refine_sentences_with_empty_list(self, processor): """Test LLM refinement with empty list""" sentences = processor.refine_sentences_with_llm([]) assert sentences == [] def test_refine_sentences_with_valid_text(self, processor): """Test LLM refinement with valid Nepali text""" # This requires API key, so we'll mock it in production pytest.skip("Requires Mistral API key") # Integration tests class TestPDFProcessorIntegration: """Integration tests for PDF processing workflow""" @pytest.fixture def processor(self): """Create a PDFProcessor instance""" return PDFProcessor() def test_end_to_end_pdf_processing(self, processor): """Test complete PDF processing pipeline""" pytest.skip("Requires actual PDF file for testing") def test_bias_detection_integration(self, processor): """Test integration with bias detection""" pytest.skip("Requires bias detection model") # Example manual tests def test_nepali_text_processing_manual(): """Manual test for Nepali text processing""" processor = PDFProcessor() # Test Nepali text nepali_text = "नेपालमा शिक्षा अहिले पनि समस्यामा छ। गरिबी र भुखमरी फैलिरहेको छ। सरकार कमजोर भएको छ।" sentences = processor.split_into_sentences(nepali_text) print(f"\nExtracted {len(sentences)} sentences:") for i, s in enumerate(sentences, 1): print(f" {i}. {s}") assert len(sentences) >= 2, "Should extract at least 2 sentences" def test_text_cleaning_manual(): """Manual test for text cleaning""" processor = PDFProcessor() dirty = "यो एक\n\nप्रमुख\n\nवाक्य है।" cleaned = processor.clean_text(dirty) print(f"\nOriginal: {repr(dirty)}") print(f"Cleaned: {repr(cleaned)}") assert "\n" not in cleaned assert " " not in cleaned if __name__ == "__main__": # Run manual tests print("Running manual tests...") test_nepali_text_processing_manual() test_text_cleaning_manual() print("\n✓ Manual tests passed!")