File size: 8,113 Bytes
eb53bb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
"""

Simple demo script for document text extraction.

Demonstrates the complete workflow from training to inference.

"""

import os
import sys
import json
from pathlib import Path

# Add src to path so the local package imports below resolve when the
# script is run from any working directory.
sys.path.append(str(Path(__file__).parent))

from src.data_preparation import DocumentProcessor, NERDatasetCreator
from src.training_pipeline import TrainingPipeline, create_custom_config
from src.inference import DocumentInference


def run_quick_demo():
    """Run a quick demonstration of the text extraction system.

    Workflow:
      1. If no trained model exists at ``models/document_ner_model``,
         train one with a reduced config (2 epochs) for speed.
      2. Load the inference pipeline from the model directory.
      3. Run extraction over three built-in sample documents, printing
         structured data and entities for each.
      4. Save all results to ``results/demo_results.json`` and print a
         summary.

    Returns early (with a message) if the inference pipeline fails to load.
    """
    print("DOCUMENT TEXT EXTRACTION - QUICK DEMO")
    print("=" * 60)
    
    # Sample documents for demonstration
    demo_texts = [
        {
            "name": "Invoice Example 1",
            "text": "Invoice sent to Robert White on 15/09/2025 Invoice No: INV-1024 Amount: $1,250.00 Phone: (555) 123-4567"
        },
        {
            "name": "Invoice Example 2",
            "text": "Bill for Dr. Sarah Johnson dated March 10, 2025. Invoice Number: BL-2045. Total: $2,300.50 Email: sarah.johnson@email.com"
        },
        {
            "name": "Receipt Example",
            "text": "Receipt for Michael Brown 456 Oak Street Boston MA 02101 Invoice: REC-3089 Date: 2025-04-22 Amount: $890.75"
        }
    ]
    
    print("\nSample Documents:")
    for i, doc in enumerate(demo_texts, 1):
        print(f"{i}. {doc['name']}: {doc['text'][:60]}...")
    
    # Check if model exists; train a quick one if not.
    model_path = "models/document_ner_model"
    if not Path(model_path).exists():
        print(f"\nModel not found at {model_path}")
        print("Training a new model first...")
        
        # Train model with reduced settings so the demo stays fast.
        config = create_custom_config()
        config.num_epochs = 2  # Quick training for demo
        config.batch_size = 8
        
        pipeline = TrainingPipeline(config)
        model_path = pipeline.run_complete_pipeline()
        
        print(f"Model trained and saved to {model_path}")
    
    # Load inference pipeline
    print(f"\nLoading inference pipeline from {model_path}")
    try:
        inference = DocumentInference(model_path)
        print("Inference pipeline loaded successfully")
    except Exception as e:
        print(f"Failed to load inference pipeline: {e}")
        return
    
    # Process demo texts
    print(f"\nProcessing {len(demo_texts)} demo documents...")
    results = []
    
    for i, doc in enumerate(demo_texts, 1):
        print(f"\nProcessing Document {i}: {doc['name']}")
        print("-" * 50)
        print(f"Text: {doc['text']}")
        
        # Extract information
        result = inference.process_text_directly(doc['text'])
        results.append({
            'document_name': doc['name'],
            'original_text': doc['text'],
            'result': result
        })
        
        # Display results; an 'error' key marks a failed extraction.
        if 'error' not in result:
            structured_data = result.get('structured_data', {})
            entities = result.get('entities', [])
            
            print("\nExtraction Results:")
            if structured_data:
                print("Structured Data:")
                for key, value in structured_data.items():
                    print(f"   {key}: {value}")
            else:
                print("   No structured data extracted")
            
            if entities:
                print(f"Found {len(entities)} entities:")
                for entity in entities:
                    confidence = int(entity['confidence'] * 100)
                    print(f"   {entity['entity']}: '{entity['text']}' ({confidence}%)")
        else:
            print(f"Error: {result['error']}")
    
    # Save results (create the output directory if needed).
    output_path = "results/demo_results.json"
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    
    print(f"\nDemo results saved to: {output_path}")
    
    # Summary counts over the successful extractions only.
    successful_extractions = sum(1 for r in results if 'error' not in r['result'])
    total_entities = sum(len(r['result'].get('entities', [])) for r in results if 'error' not in r['result'])
    total_structured_fields = sum(len(r['result'].get('structured_data', {})) for r in results if 'error' not in r['result'])
    
    print("\nDemo Summary:")
    print(f"   Successfully processed: {successful_extractions}/{len(demo_texts)} documents")
    print(f"   Total entities found: {total_entities}")
    print(f"   Total structured fields: {total_structured_fields}")
    
    print("\nDemo completed successfully!")
    print("You can now:")
    print("   - Run the web API: python api/app.py")
    print("   - Process your own documents using inference.py")
    print("   - Retrain with your data using training_pipeline.py")

def train_model_only():
    """Train the model without running inference demo.

    Builds the default custom config, runs the complete training
    pipeline, and prints the path the trained model was saved to.
    """
    print("TRAINING MODEL ONLY")
    print("=" * 40)
    
    config = create_custom_config()
    pipeline = TrainingPipeline(config)
    
    model_path = pipeline.run_complete_pipeline()
    
    print("Model training completed!")
    print(f"Model saved to: {model_path}")


def test_specific_text():
    """Test extraction on user-provided text.

    Prompts the user for a single line of text, runs it through the
    trained model at ``models/document_ner_model``, and prints the
    extracted structured information and entities. Returns early if no
    trained model exists or no text is entered.
    """
    print("CUSTOM TEXT EXTRACTION")
    print("=" * 40)
    
    # Check if model exists before prompting the user.
    model_path = "models/document_ner_model"
    if not Path(model_path).exists():
        print("No trained model found. Please run training first.")
        return
    
    # Get text from user
    print("Enter text to extract information from:")
    print("(Example: Invoice sent to John Doe on 01/15/2025 Invoice No: INV-1001 Amount: $1,500.00)")
    text = input("Text: ").strip()
    
    if not text:
        print("No text provided.")
        return
    
    # Load inference and process
    try:
        inference = DocumentInference(model_path)
        result = inference.process_text_directly(text)
        
        print("\nExtraction Results:")
        if 'error' not in result:
            structured_data = result.get('structured_data', {})
            if structured_data:
                print("Structured Information:")
                for key, value in structured_data.items():
                    print(f"  {key}: {value}")
            else:
                print("No structured information found.")
            
            entities = result.get('entities', [])
            if entities:
                print(f"\nEntities Found ({len(entities)}):")
                for entity in entities:
                    confidence = int(entity['confidence'] * 100)
                    print(f"  {entity['entity']}: '{entity['text']}' ({confidence}%)")
        else:
            print(f"Error: {result['error']}")
    
    except Exception as e:
        print(f"Failed to process text: {e}")


def main():
    """Main demo function with options."""
    print("DOCUMENT TEXT EXTRACTION SYSTEM")
    print("=" * 50)
    print("Choose an option:")
    print("1. Run complete demo (train + inference)")
    print("2. Train model only")
    print("3. Test specific text (requires trained model)")
    print("4. Exit")
    
    # Dispatch table: each valid choice maps to its handler; option 4
    # simply prints a farewell. Invalid input re-prompts.
    handlers = {
        '1': run_quick_demo,
        '2': train_model_only,
        '3': test_specific_text,
        '4': lambda: print("👋 Goodbye!"),
    }
    
    while True:
        choice = input("\nEnter your choice (1-4): ").strip()
        handler = handlers.get(choice)
        if handler is not None:
            handler()
            break
        print("Invalid choice. Please enter 1, 2, 3, or 4.")


# Script entry point: show the interactive menu when run directly.
if __name__ == "__main__":
    main()