|
|
""" |
|
|
Example usage of the PDF Processor utility |
|
|
Demonstrates how to extract sentences from Nepali PDFs and prepare for bias detection |
|
|
""" |
|
|
|
|
|
from utility.pdf_processor import PDFProcessor |
|
|
import json |
|
|
|
|
|
|
|
|
def example_pdf_processing():
    """Example 1: Basic PDF processing with sentence extraction.

    Calls ``PDFProcessor.process_pdf`` on a placeholder path with
    ``refine_with_llm=True`` and prints either the numbered sentences from
    the result or the error it reports. Returns nothing.
    """
    print("=" * 60)
    print("Example 1: Basic PDF Processing")
    print("=" * 60)

    processor = PDFProcessor()

    # Placeholder path: point this at a real PDF before running the example.
    pdf_path = "path/to/your/nepali_document.pdf"

    result = processor.process_pdf(
        pdf_path=pdf_path,
        refine_with_llm=True,
    )

    if result["success"]:
        # Distinct markers for success and failure so the two outcomes can
        # be told apart at a glance in the console output.
        print("\n✓ Successfully processed PDF")
        print(f" Total sentences extracted: {result['total_sentences']}")
        print("\nExtracted sentences:")
        for i, sentence in enumerate(result["sentences"], 1):
            print(f" {i}. {sentence}")
    else:
        print(f"✗ Error: {result['error']}")
|
|
|
|
|
|
|
|
def example_pdf_processing_without_llm():
    """Example 2: PDF processing without LLM refinement (faster).

    Same flow as the basic example, but passes ``refine_with_llm=False`` to
    ``process_pdf`` and prints only the sentence count on success. Returns
    nothing.
    """
    print("\n" + "=" * 60)
    print("Example 2: PDF Processing (No LLM Refinement)")
    print("=" * 60)

    processor = PDFProcessor()

    # Placeholder path: point this at a real PDF before running the example.
    pdf_path = "path/to/your/nepali_document.pdf"

    result = processor.process_pdf(
        pdf_path=pdf_path,
        refine_with_llm=False,
    )

    if result["success"]:
        print("\n✓ Successfully processed PDF (without LLM refinement)")
        print(f" Total sentences: {result['total_sentences']}")
        print(" Processing time: Faster (no LLM calls)")
    else:
        print(f"✗ Error: {result['error']}")
|
|
|
|
|
|
|
|
def example_batch_processing():
    """Example 3: Process multiple PDFs.

    Calls ``process_pdf`` (with ``refine_with_llm=True``) once for each
    placeholder path, prints a one-line status per file, and collects a
    summary of every result.

    Returns:
        dict: Maps each PDF path to a dict with the keys ``"success"``,
        ``"total_sentences"`` and ``"sentences"``.
    """
    print("\n" + "=" * 60)
    print("Example 3: Batch PDF Processing")
    print("=" * 60)

    processor = PDFProcessor()

    # Placeholder paths: replace with real files before running the example.
    pdf_files = [
        "path/to/document1.pdf",
        "path/to/document2.pdf",
        "path/to/document3.pdf",
    ]

    all_results = {}

    for pdf_path in pdf_files:
        result = processor.process_pdf(
            pdf_path=pdf_path,
            refine_with_llm=True,
        )

        # NOTE(review): it is not visible here whether a failed result still
        # carries the sentence keys, so fall back to empty values rather than
        # letting one bad file abort the whole batch with a KeyError.
        all_results[pdf_path] = {
            "success": result["success"],
            "total_sentences": result.get("total_sentences", 0),
            "sentences": result.get("sentences", []),
        }

        if result["success"]:
            print(f"✓ {pdf_path}: {result['total_sentences']} sentences")
        else:
            print(f"✗ {pdf_path}: {result['error']}")

    return all_results
|
|
|
|
|
|
|
|
def example_prepare_for_bias_detection():
    """Example 4: Prepare extracted sentences for Bias Detection API.

    Processes a placeholder PDF with ``refine_with_llm=True`` and wraps the
    resulting sentences in a ``BatchBiasDetectionRequest`` with
    ``confidence_threshold=0.7``, printing a short preview of the payload.

    Returns:
        The ``BatchBiasDetectionRequest`` that was built, or ``None`` when
        the processing result reports failure.
    """
    print("\n" + "=" * 60)
    print("Example 4: Prepare for Bias Detection API")
    print("=" * 60)

    processor = PDFProcessor()

    # Placeholder path: point this at a real PDF before running the example.
    pdf_path = "path/to/your/nepali_document.pdf"

    result = processor.process_pdf(
        pdf_path=pdf_path,
        refine_with_llm=True,
    )

    # Guard clause: report the failure and stop before touching the payload.
    if not result["success"]:
        print(f"✗ Error: {result['error']}")
        return None

    sentences = result["sentences"]

    # Imported at call time so the api package is only required when
    # processing has actually succeeded.
    from api.schemas import BatchBiasDetectionRequest

    api_payload = BatchBiasDetectionRequest(
        texts=sentences,
        confidence_threshold=0.7,
    )

    print(f"\n✓ Prepared {len(sentences)} sentences for bias detection")
    print("\nAPI Payload Preview:")
    print(f" - Number of texts: {len(api_payload.texts)}")
    print(f" - Confidence threshold: {api_payload.confidence_threshold}")
    print("\nExample sentences to be analyzed:")
    # Preview at most the first three texts.
    for i, sentence in enumerate(api_payload.texts[:3], 1):
        print(f" {i}. {sentence}")

    return api_payload
|
|
|
|
|
|
|
|
def example_direct_api_integration():
    """Example 5: Integration pattern for API endpoint.

    Prints a banner followed by a static text guide that lists the PDF
    endpoints, the two usage flows, and sample cURL commands. It performs no
    PDF processing and returns nothing.
    """
    print("\n" + "=" * 60)
    print("Example 5: API Integration Pattern")
    print("=" * 60)

    # The guide is a plain (non-raw) literal, so each "\\" below is printed
    # as a single backslash, i.e. a shell line continuation.
    print("""
When integrating with FastAPI endpoints:

1. In /api/routes/pdf_processing.py:
- POST /api/v1/process-pdf
* Upload PDF file
* Returns extracted sentences

- POST /api/v1/process-pdf-to-bias
* Upload PDF file
* Returns bias detection results directly

- GET /api/v1/pdf-health
* Check PDF processor service status

2. Usage flow:
a) User uploads PDF via /api/v1/process-pdf
b) PDFProcessor extracts text and segments into sentences
c) Mistral LLM refines sentences (optional)
d) Return list of sentences

OR

a) User uploads PDF via /api/v1/process-pdf-to-bias
b) PDFProcessor extracts and refines sentences
c) Bias detection model analyzes each sentence
d) Return complete bias analysis results

3. Example cURL commands:

# Extract sentences only
curl -X POST "http://localhost:8000/api/v1/process-pdf" \\
-F "file=@document.pdf" \\
-F "refine_with_llm=true"

# Extract and analyze bias
curl -X POST "http://localhost:8000/api/v1/process-pdf-to-bias" \\
-F "file=@document.pdf" \\
-F "refine_with_llm=true" \\
-F "confidence_threshold=0.7"
""")
|
|
|
|
|
|
|
|
def example_error_handling():
    """Example 6: Error handling.

    Passes two placeholder paths to ``process_pdf`` with
    ``refine_with_llm=False`` and prints, for each one, either the error the
    result reports or the number of sentences extracted. Returns nothing.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 6: Error Handling")
    print(rule)

    processor = PDFProcessor()

    # Each entry is (path to try, label printed before the attempt).
    scenarios = (
        ("nonexistent.pdf", "File not found"),
        ("image_pdf.pdf", "Image-based PDF (no OCR)"),
    )

    for path, label in scenarios:
        print(f"\nTest: {label}")
        outcome = processor.process_pdf(pdf_path=path, refine_with_llm=False)

        if outcome["success"]:
            print(f" Success: {outcome['total_sentences']} sentences extracted")
        else:
            print(f" Handled error: {outcome['error']}")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: print a header, run the one example that needs no
    # real PDF on disk, then print a footer pointing at the docs.
    banner = "=" * 60

    print(f"\n{banner}")
    print("PDF Processor Usage Examples")
    print(banner)

    print("\nNote: Update file paths to your actual PDF files to run examples\n")

    example_direct_api_integration()

    print(f"\n{banner}")
    print("For more information, see docs/pdf_processing.md")
    print(banner)
|
|
|