File size: 4,668 Bytes
3998131 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
"""
Test suite for PDF Processor module
Tests text extraction, sentence segmentation, and LLM refinement
"""
import pytest
import tempfile
import os
from pathlib import Path
from utility.pdf_processor import PDFProcessor
class TestPDFProcessor:
    """Unit tests for ``PDFProcessor``: cleaning, sentence splitting, refinement."""

    @pytest.fixture
    def processor(self):
        """Return a fresh ``PDFProcessor`` instance for each test."""
        return PDFProcessor()

    def test_initialization(self, processor):
        """A constructed processor exists and exposes an ``llm_client``."""
        assert processor is not None
        assert processor.llm_client is not None

    def test_clean_text(self, processor):
        """``clean_text`` is expected to drop newlines and leave single spaces."""
        dirty_text = "यो एक\n\nपरीक्षण है। \n \nधन्यवाद।"
        cleaned = processor.clean_text(dirty_text)
        assert "\n" not in cleaned
        # Check for a *doubled* space: the expected result below legitimately
        # contains single spaces between words, so a single-space check would
        # contradict the equality assertion.
        assert "  " not in cleaned
        assert cleaned == "यो एक परीक्षण है। धन्यवाद।"

    def test_split_into_sentences_with_danda(self, processor):
        """Sentences terminated by the danda (।) are split in order."""
        text = "यो पहिलो वाक्य है। दोस्रो वाक्य छ। तेस्रो वाक्य छ।"
        sentences = processor.split_into_sentences(text)
        assert len(sentences) >= 3
        assert "पहिलो" in sentences[0]
        assert "दोस्रो" in sentences[1]

    def test_split_into_sentences_empty(self, processor):
        """Empty input yields an empty list."""
        sentences = processor.split_into_sentences("")
        assert sentences == []

    def test_split_into_sentences_short_text(self, processor):
        """Fragments shorter than the minimum length are all discarded."""
        text = "छ। छ।"  # Too short fragments
        sentences = processor.split_into_sentences(text)
        assert len(sentences) == 0

    # Skipped via marker rather than ``pytest.skip()`` in the body so the
    # ``processor`` fixture is not set up for a test that never runs.
    @pytest.mark.skip(reason="Requires actual PDF file")
    def test_process_pdf_from_bytes(self, processor):
        """Process a PDF supplied as bytes.

        Needs a valid PDF file; skipped so CI/CD can run without one.
        """

    def test_process_pdf_nonexistent_file(self, processor):
        """A missing PDF path raises ``FileNotFoundError``."""
        with pytest.raises(FileNotFoundError):
            processor.process_pdf(pdf_path="nonexistent.pdf")

    def test_refine_sentences_with_empty_list(self, processor):
        """LLM refinement of an empty list returns an empty list."""
        sentences = processor.refine_sentences_with_llm([])
        assert sentences == []

    # Skipped via marker for the same reason as above: no fixture setup for a
    # test that cannot run without credentials.
    @pytest.mark.skip(reason="Requires Mistral API key")
    def test_refine_sentences_with_valid_text(self, processor):
        """LLM refinement of valid Nepali text.

        Needs an API key; intended to be replaced by a mocked client.
        """
# Integration tests
class TestPDFProcessorIntegration:
    """Integration tests for the PDF processing workflow.

    Both tests are placeholders that are skipped until their external
    prerequisites are available. They are skipped with ``pytest.mark.skip``
    rather than ``pytest.skip()`` inside the body so that the ``processor``
    fixture is not set up for tests that never run.
    """

    @pytest.fixture
    def processor(self):
        """Return a fresh ``PDFProcessor`` instance for each test."""
        return PDFProcessor()

    @pytest.mark.skip(reason="Requires actual PDF file for testing")
    def test_end_to_end_pdf_processing(self, processor):
        """Run the complete PDF processing pipeline end to end."""

    @pytest.mark.skip(reason="Requires bias detection model")
    def test_bias_detection_integration(self, processor):
        """Exercise the processor together with bias detection."""
# Example manual tests
def test_nepali_text_processing_manual():
    """Split a three-sentence Nepali sample and report what was extracted.

    Prints a header with the sentence count, then each sentence with a
    1-based index, and finally requires that at least two sentences were found.
    """
    sample = "नेपालमा शिक्षा अहिले पनि समस्यामा छ। गरिबी र भुखमरी फैलिरहेको छ। सरकार कमजोर भएको छ।"
    pieces = PDFProcessor().split_into_sentences(sample)

    print(f"\nExtracted {len(pieces)} sentences:")
    numbered_lines = [
        f" {number}. {piece}" for number, piece in enumerate(pieces, start=1)
    ]
    for line in numbered_lines:
        print(line)

    assert len(pieces) >= 2, "Should extract at least 2 sentences"
def test_text_cleaning_manual():
    """Clean a sample with embedded blank lines and print before/after.

    Asserts that the cleaned text contains no newline and no doubled space.
    """
    processor = PDFProcessor()
    dirty = "यो एक\n\nप्रमुख\n\nवाक्य है।"
    cleaned = processor.clean_text(dirty)
    print(f"\nOriginal: {repr(dirty)}")
    print(f"Cleaned: {repr(cleaned)}")
    assert "\n" not in cleaned
    # The input already has a word-separating space ("यो एक"), so the check
    # must be for a doubled space; a single-space check would reject correctly
    # cleaned text.
    assert "  " not in cleaned
if __name__ == "__main__":
    # Direct execution: run the manual checks in order without the pytest
    # runner. Any failed assertion aborts before the success message.
    print("Running manual tests...")
    for manual_check in (
        test_nepali_text_processing_manual,
        test_text_cleaning_manual,
    ):
        manual_check()
    print("\n✓ Manual tests passed!")
|