"""
Simple demo script for document text extraction.

Demonstrates the complete workflow from training to inference.
"""

import json
import os
import sys
from pathlib import Path

# Add src to path for imports
sys.path.append(str(Path(__file__).parent))

# These imports intentionally follow the sys.path tweak above.
from src.data_preparation import DocumentProcessor, NERDatasetCreator
from src.training_pipeline import TrainingPipeline, create_custom_config
from src.inference import DocumentInference

# Location checked for an existing trained model (shared by the demo and
# the custom-text option).
MODEL_PATH = "models/document_ner_model"

# File the quick demo writes its JSON results to.
RESULTS_PATH = "results/demo_results.json"


def _print_entities(entities):
    """Print one line per entity: its label, its text and its confidence.

    Each entity is a mapping read through the keys 'entity', 'text' and
    'confidence'; the confidence is multiplied by 100 and truncated to an
    integer before being shown as a percentage.
    """
    for entity in entities:
        confidence = int(entity['confidence'] * 100)
        print(f" {entity['entity']}: '{entity['text']}' ({confidence}%)")


def run_quick_demo():
    """Run a quick demonstration of the text extraction system.

    Prints three sample documents, trains a model first when nothing exists
    at MODEL_PATH, runs inference on every sample, prints the extraction
    results, writes them to RESULTS_PATH as JSON and prints a summary.
    Returns early (after printing the reason) if the inference pipeline
    cannot be loaded.
    """
    print("DOCUMENT TEXT EXTRACTION - QUICK DEMO")
    print("=" * 60)

    # Sample documents for demonstration
    demo_texts = [
        {
            "name": "Invoice Example 1",
            "text": "Invoice sent to Robert White on 15/09/2025 Invoice No: INV-1024 Amount: $1,250.00 Phone: (555) 123-4567",
        },
        {
            "name": "Invoice Example 2",
            "text": "Bill for Dr. Sarah Johnson dated March 10, 2025. Invoice Number: BL-2045. Total: $2,300.50 Email: sarah.johnson@email.com",
        },
        {
            "name": "Receipt Example",
            "text": "Receipt for Michael Brown 456 Oak Street Boston MA 02101 Invoice: REC-3089 Date: 2025-04-22 Amount: $890.75",
        },
    ]

    print("\nSample Documents:")
    for i, doc in enumerate(demo_texts, 1):
        print(f"{i}. {doc['name']}: {doc['text'][:60]}...")

    # Check if model exists
    model_path = MODEL_PATH
    if not Path(model_path).exists():
        print(f"\nModel not found at {model_path}")
        print("Training a new model first...")

        # Train model
        config = create_custom_config()
        config.num_epochs = 2  # Quick training for demo
        config.batch_size = 8

        pipeline = TrainingPipeline(config)
        # From here on, use whatever path the pipeline reports.
        model_path = pipeline.run_complete_pipeline()
        print(f"Model trained and saved to {model_path}")

    # Load inference pipeline
    print(f"\nLoading inference pipeline from {model_path}")
    try:
        inference = DocumentInference(model_path)
        print("Inference pipeline loaded successfully")
    except Exception as e:
        print(f"Failed to load inference pipeline: {e}")
        return

    # Process demo texts
    print(f"\nProcessing {len(demo_texts)} demo documents...")
    results = []

    for i, doc in enumerate(demo_texts, 1):
        print(f"\nProcessing Document {i}: {doc['name']}")
        print("-" * 50)
        print(f"Text: {doc['text']}")

        # Extract information
        result = inference.process_text_directly(doc['text'])
        results.append({
            'document_name': doc['name'],
            'original_text': doc['text'],
            'result': result,
        })

        # Display results; an 'error' key in the result marks a failed extraction.
        if 'error' not in result:
            structured_data = result.get('structured_data', {})
            entities = result.get('entities', [])

            print("\nExtraction Results:")
            if structured_data:
                print("Structured Data:")
                for key, value in structured_data.items():
                    print(f" {key}: {value}")
            else:
                print(" No structured data extracted")

            if entities:
                print(f"Found {len(entities)} entities:")
                _print_entities(entities)
        else:
            print(f"Error: {result['error']}")

    # Save results
    output_path = RESULTS_PATH
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nDemo results saved to: {output_path}")

    # Summary: only results without an 'error' key count as successful.
    successful = [r['result'] for r in results if 'error' not in r['result']]
    successful_extractions = len(successful)
    total_entities = sum(len(res.get('entities', [])) for res in successful)
    total_structured_fields = sum(
        len(res.get('structured_data', {})) for res in successful
    )

    print("\nDemo Summary:")
    print(f" Successfully processed: {successful_extractions}/{len(demo_texts)} documents")
    print(f" Total entities found: {total_entities}")
    print(f" Total structured fields: {total_structured_fields}")

    print("\nDemo completed successfully!")
    print("You can now:")
    print(" - Run the web API: python api/app.py")
    print(" - Process your own documents using inference.py")
    print(" - Retrain with your data using training_pipeline.py")


def train_model_only():
    """Train the model without running inference demo.

    Builds a config with create_custom_config(), runs the complete training
    pipeline and prints the path the pipeline returns.
    """
    print("TRAINING MODEL ONLY")
    print("=" * 40)

    config = create_custom_config()
    pipeline = TrainingPipeline(config)
    model_path = pipeline.run_complete_pipeline()

    print("Model training completed!")
    print(f"Model saved to: {model_path}")


def test_specific_text():
    """Test extraction on user-provided text.

    Requires an existing model at MODEL_PATH; otherwise prints a hint and
    returns. Reads one line of text from stdin, returns if it is empty,
    then runs inference on it and prints the structured data and entities.
    Any exception raised while loading or processing is caught and reported.
    """
    print("CUSTOM TEXT EXTRACTION")
    print("=" * 40)

    # Check if model exists
    model_path = MODEL_PATH
    if not Path(model_path).exists():
        print("No trained model found. Please run training first.")
        return

    # Get text from user
    print("Enter text to extract information from:")
    print("(Example: Invoice sent to John Doe on 01/15/2025 Invoice No: INV-1001 Amount: $1,500.00)")
    text = input("Text: ").strip()

    if not text:
        print("No text provided.")
        return

    # Load inference and process
    try:
        inference = DocumentInference(model_path)
        result = inference.process_text_directly(text)

        print("\nExtraction Results:")
        if 'error' not in result:
            structured_data = result.get('structured_data', {})
            if structured_data:
                print("Structured Information:")
                for key, value in structured_data.items():
                    print(f" {key}: {value}")
            else:
                print("No structured information found.")

            entities = result.get('entities', [])
            if entities:
                print(f"\nEntities Found ({len(entities)}):")
                _print_entities(entities)
        else:
            print(f"Error: {result['error']}")
    except Exception as e:
        print(f"Failed to process text: {e}")


def main():
    """Main demo function with options.

    Prints the menu once, then keeps prompting until the user enters a
    valid choice (1-4) and runs the matching action.
    """
    print("DOCUMENT TEXT EXTRACTION SYSTEM")
    print("=" * 50)
    print("Choose an option:")
    print("1. Run complete demo (train + inference)")
    print("2. Train model only")
    print("3. Test specific text (requires trained model)")
    print("4. Exit")

    while True:
        choice = input("\nEnter your choice (1-4): ").strip()

        if choice == '1':
            run_quick_demo()
            break
        elif choice == '2':
            train_model_only()
            break
        elif choice == '3':
            test_specific_text()
            break
        elif choice == '4':
            print("👋 Goodbye!")
            break
        else:
            print("Invalid choice. Please enter 1, 2, 3, or 4.")


if __name__ == "__main__":
    main()