File size: 8,113 Bytes
eb53bb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
"""

Simple demo script for document text extraction.

Demonstrates the complete workflow from training to inference.

"""

import os
import sys
import json
from pathlib import Path

# Add src to path so the local package imports below resolve when the
# script is run from any working directory.
sys.path.append(str(Path(__file__).parent))

from src.data_preparation import DocumentProcessor, NERDatasetCreator
from src.training_pipeline import TrainingPipeline, create_custom_config
from src.inference import DocumentInference


def run_quick_demo():
    """Run a quick demonstration of the text extraction system.

    Workflow:
      1. If no trained model exists at ``models/document_ner_model``,
         train one with a reduced config (2 epochs) for speed.
      2. Load the inference pipeline from the model directory.
      3. Run extraction over three built-in sample documents, printing
         structured data and entities for each.
      4. Save all results to ``results/demo_results.json`` and print a
         summary.

    Returns early (with a message) if the inference pipeline fails to load.
    """
    print("DOCUMENT TEXT EXTRACTION - QUICK DEMO")
    print("=" * 60)
    
    # Sample documents for demonstration
    demo_texts = [
        {
            "name": "Invoice Example 1",
            "text": "Invoice sent to Robert White on 15/09/2025 Invoice No: INV-1024 Amount: $1,250.00 Phone: (555) 123-4567"
        },
        {
            "name": "Invoice Example 2",
            "text": "Bill for Dr. Sarah Johnson dated March 10, 2025. Invoice Number: BL-2045. Total: $2,300.50 Email: sarah.johnson@email.com"
        },
        {
            "name": "Receipt Example",
            "text": "Receipt for Michael Brown 456 Oak Street Boston MA 02101 Invoice: REC-3089 Date: 2025-04-22 Amount: $890.75"
        }
    ]
    
    print("\nSample Documents:")
    for i, doc in enumerate(demo_texts, 1):
        print(f"{i}. {doc['name']}: {doc['text'][:60]}...")
    
    # Check if model exists; train a quick one if not.
    model_path = "models/document_ner_model"
    if not Path(model_path).exists():
        print(f"\nModel not found at {model_path}")
        print("Training a new model first...")
        
        # Train model with reduced settings so the demo stays fast.
        config = create_custom_config()
        config.num_epochs = 2  # Quick training for demo
        config.batch_size = 8
        
        pipeline = TrainingPipeline(config)
        model_path = pipeline.run_complete_pipeline()
        
        print(f"Model trained and saved to {model_path}")
    
    # Load inference pipeline
    print(f"\nLoading inference pipeline from {model_path}")
    try:
        inference = DocumentInference(model_path)
        print("Inference pipeline loaded successfully")
    except Exception as e:
        print(f"Failed to load inference pipeline: {e}")
        return
    
    # Process demo texts
    print(f"\nProcessing {len(demo_texts)} demo documents...")
    results = []
    
    for i, doc in enumerate(demo_texts, 1):
        print(f"\nProcessing Document {i}: {doc['name']}")
        print("-" * 50)
        print(f"Text: {doc['text']}")
        
        # Extract information
        result = inference.process_text_directly(doc['text'])
        results.append({
            'document_name': doc['name'],
            'original_text': doc['text'],
            'result': result
        })
        
        # Display results; an 'error' key marks a failed extraction.
        if 'error' not in result:
            structured_data = result.get('structured_data', {})
            entities = result.get('entities', [])
            
            print("\nExtraction Results:")
            if structured_data:
                print("Structured Data:")
                for key, value in structured_data.items():
                    print(f"   {key}: {value}")
            else:
                print("   No structured data extracted")
            
            if entities:
                print(f"Found {len(entities)} entities:")
                for entity in entities:
                    confidence = int(entity['confidence'] * 100)
                    print(f"   {entity['entity']}: '{entity['text']}' ({confidence}%)")
        else:
            print(f"Error: {result['error']}")
    
    # Save results (create the output directory if needed).
    output_path = "results/demo_results.json"
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    
    print(f"\nDemo results saved to: {output_path}")
    
    # Summary counts over the successful extractions only.
    successful_extractions = sum(1 for r in results if 'error' not in r['result'])
    total_entities = sum(len(r['result'].get('entities', [])) for r in results if 'error' not in r['result'])
    total_structured_fields = sum(len(r['result'].get('structured_data', {})) for r in results if 'error' not in r['result'])
    
    print("\nDemo Summary:")
    print(f"   Successfully processed: {successful_extractions}/{len(demo_texts)} documents")
    print(f"   Total entities found: {total_entities}")
    print(f"   Total structured fields: {total_structured_fields}")
    
    print("\nDemo completed successfully!")
    print("You can now:")
    print("   - Run the web API: python api/app.py")
    print("   - Process your own documents using inference.py")
    print("   - Retrain with your data using training_pipeline.py")

def train_model_only():
    """Train the model without running inference demo.

    Builds the default custom config, runs the complete training
    pipeline, and prints the path the trained model was saved to.
    """
    print("TRAINING MODEL ONLY")
    print("=" * 40)
    
    config = create_custom_config()
    pipeline = TrainingPipeline(config)
    
    model_path = pipeline.run_complete_pipeline()
    
    print("Model training completed!")
    print(f"Model saved to: {model_path}")


def test_specific_text():
    """Test extraction on user-provided text.

    Prompts the user for a single line of text, runs it through the
    trained model at ``models/document_ner_model``, and prints the
    extracted structured information and entities. Returns early if no
    trained model exists or no text is entered.
    """
    print("CUSTOM TEXT EXTRACTION")
    print("=" * 40)
    
    # Check if model exists before prompting the user.
    model_path = "models/document_ner_model"
    if not Path(model_path).exists():
        print("No trained model found. Please run training first.")
        return
    
    # Get text from user
    print("Enter text to extract information from:")
    print("(Example: Invoice sent to John Doe on 01/15/2025 Invoice No: INV-1001 Amount: $1,500.00)")
    text = input("Text: ").strip()
    
    if not text:
        print("No text provided.")
        return
    
    # Load inference and process
    try:
        inference = DocumentInference(model_path)
        result = inference.process_text_directly(text)
        
        print("\nExtraction Results:")
        if 'error' not in result:
            structured_data = result.get('structured_data', {})
            if structured_data:
                print("Structured Information:")
                for key, value in structured_data.items():
                    print(f"  {key}: {value}")
            else:
                print("No structured information found.")
            
            entities = result.get('entities', [])
            if entities:
                print(f"\nEntities Found ({len(entities)}):")
                for entity in entities:
                    confidence = int(entity['confidence'] * 100)
                    print(f"  {entity['entity']}: '{entity['text']}' ({confidence}%)")
        else:
            print(f"Error: {result['error']}")
    
    except Exception as e:
        print(f"Failed to process text: {e}")


def main():
    """Main demo function with options."""
    print("DOCUMENT TEXT EXTRACTION SYSTEM")
    print("=" * 50)
    print("Choose an option:")
    print("1. Run complete demo (train + inference)")
    print("2. Train model only")
    print("3. Test specific text (requires trained model)")
    print("4. Exit")
    
    # Dispatch table: each valid choice maps to its handler; option 4
    # simply prints a farewell. Invalid input re-prompts.
    handlers = {
        '1': run_quick_demo,
        '2': train_model_only,
        '3': test_specific_text,
        '4': lambda: print("👋 Goodbye!"),
    }
    
    while True:
        choice = input("\nEnter your choice (1-4): ").strip()
        handler = handlers.get(choice)
        if handler is not None:
            handler()
            break
        print("Invalid choice. Please enter 1, 2, 3, or 4.")


# Script entry point: show the interactive menu when run directly.
if __name__ == "__main__":
    main()