"""
Simple demo script for document text extraction.
Demonstrates the complete workflow from training to inference.
"""
import os
import sys
from pathlib import Path
import json

# Add src to path for imports
sys.path.append(str(Path(__file__).parent))
from src.data_preparation import DocumentProcessor, NERDatasetCreator
from src.training_pipeline import TrainingPipeline, create_custom_config
from src.inference import DocumentInference
def _print_extraction_result(result):
    """Pretty-print one inference result: structured fields first, then entities.

    Args:
        result: dict returned by ``DocumentInference.process_text_directly``.
            Contains either an ``'error'`` key, or ``'structured_data'`` (dict)
            and ``'entities'`` (list of dicts with ``entity``/``text``/``confidence``).
    """
    if 'error' in result:
        print(f"Error: {result['error']}")
        return
    structured_data = result.get('structured_data', {})
    entities = result.get('entities', [])
    print("\nExtraction Results:")
    if structured_data:
        print("Structured Data:")
        for key, value in structured_data.items():
            print(f" {key}: {value}")
    else:
        print(" No structured data extracted")
    if entities:
        print(f"Found {len(entities)} entities:")
        for entity in entities:
            # Confidence arrives as a 0..1 float; show it as a percentage.
            confidence = int(entity['confidence'] * 100)
            print(f" {entity['entity']}: '{entity['text']}' ({confidence}%)")


def run_quick_demo():
    """Run a quick demonstration of the text extraction system.

    Trains a small model if none is saved yet, runs inference over three
    built-in sample documents, prints the extracted entities and structured
    fields, and saves all results to ``results/demo_results.json``.
    """
    print("DOCUMENT TEXT EXTRACTION - QUICK DEMO")
    print("=" * 60)

    # Sample documents for demonstration
    demo_texts = [
        {
            "name": "Invoice Example 1",
            "text": "Invoice sent to Robert White on 15/09/2025 Invoice No: INV-1024 Amount: $1,250.00 Phone: (555) 123-4567"
        },
        {
            "name": "Invoice Example 2",
            "text": "Bill for Dr. Sarah Johnson dated March 10, 2025. Invoice Number: BL-2045. Total: $2,300.50 Email: sarah.johnson@email.com"
        },
        {
            "name": "Receipt Example",
            "text": "Receipt for Michael Brown 456 Oak Street Boston MA 02101 Invoice: REC-3089 Date: 2025-04-22 Amount: $890.75"
        }
    ]

    print("\nSample Documents:")
    for i, doc in enumerate(demo_texts, 1):
        print(f"{i}. {doc['name']}: {doc['text'][:60]}...")

    # Train a model on the fly if none is saved yet (few epochs, small batch,
    # so the demo stays quick).
    model_path = "models/document_ner_model"
    if not Path(model_path).exists():
        print(f"\nModel not found at {model_path}")
        print("Training a new model first...")
        config = create_custom_config()
        config.num_epochs = 2  # Quick training for demo
        config.batch_size = 8
        pipeline = TrainingPipeline(config)
        model_path = pipeline.run_complete_pipeline()
        print(f"Model trained and saved to {model_path}")

    # Load inference pipeline; without it nothing else can run.
    print(f"\nLoading inference pipeline from {model_path}")
    try:
        inference = DocumentInference(model_path)
        print("Inference pipeline loaded successfully")
    except Exception as e:
        print(f"Failed to load inference pipeline: {e}")
        return

    # Process each demo text, collecting results for the JSON dump below.
    print(f"\nProcessing {len(demo_texts)} demo documents...")
    results = []
    for i, doc in enumerate(demo_texts, 1):
        print(f"\nProcessing Document {i}: {doc['name']}")
        print("-" * 50)
        print(f"Text: {doc['text']}")
        result = inference.process_text_directly(doc['text'])
        results.append({
            'document_name': doc['name'],
            'original_text': doc['text'],
            'result': result
        })
        _print_extraction_result(result)

    # Persist all results as JSON (creating the results/ directory if needed).
    output_path = "results/demo_results.json"
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nDemo results saved to: {output_path}")

    # Summary statistics over the successful (error-free) extractions.
    successful_extractions = sum(1 for r in results if 'error' not in r['result'])
    total_entities = sum(len(r['result'].get('entities', [])) for r in results if 'error' not in r['result'])
    total_structured_fields = sum(len(r['result'].get('structured_data', {})) for r in results if 'error' not in r['result'])
    print("\nDemo Summary:")
    print(f" Successfully processed: {successful_extractions}/{len(demo_texts)} documents")
    print(f" Total entities found: {total_entities}")
    print(f" Total structured fields: {total_structured_fields}")
    print("\nDemo completed successfully!")
    print("You can now:")
    print(" - Run the web API: python api/app.py")
    print(" - Process your own documents using inference.py")
    print(" - Retrain with your data using training_pipeline.py")
def train_model_only():
    """Run the complete training pipeline without the inference demo."""
    print("TRAINING MODEL ONLY")
    print("=" * 40)
    # Build a default config, train end-to-end, and report where the
    # resulting model was written.
    saved_path = TrainingPipeline(create_custom_config()).run_complete_pipeline()
    print("Model training completed!")
    print(f"Model saved to: {saved_path}")
def test_specific_text():
    """Prompt for a piece of text and print what the trained model extracts."""
    print("CUSTOM TEXT EXTRACTION")
    print("=" * 40)

    # A trained model is a prerequisite; bail out early if it is missing.
    model_path = "models/document_ner_model"
    if not Path(model_path).exists():
        print("No trained model found. Please run training first.")
        return

    # Collect the text to analyse from the user.
    print("Enter text to extract information from:")
    print("(Example: Invoice sent to John Doe on 01/15/2025 Invoice No: INV-1001 Amount: $1,500.00)")
    user_text = input("Text: ").strip()
    if not user_text:
        print("No text provided.")
        return

    # Load the model, run extraction, and display everything inside one
    # try-block so any failure falls through to a single error message.
    try:
        engine = DocumentInference(model_path)
        extraction = engine.process_text_directly(user_text)

        print(f"\nExtraction Results:")
        if 'error' in extraction:
            print(f"Error: {extraction['error']}")
            return

        fields = extraction.get('structured_data', {})
        if fields:
            print("Structured Information:")
            for field_name, field_value in fields.items():
                print(f" {field_name}: {field_value}")
        else:
            print("No structured information found.")

        found = extraction.get('entities', [])
        if found:
            print(f"\nEntities Found ({len(found)}):")
            for item in found:
                pct = int(item['confidence'] * 100)
                print(f" {item['entity']}: '{item['text']}' ({pct}%)")
    except Exception as e:
        print(f"Failed to process text: {e}")
def main():
    """Show the interactive menu and dispatch the chosen action."""
    print("DOCUMENT TEXT EXTRACTION SYSTEM")
    print("=" * 50)
    print("Choose an option:")
    print("1. Run complete demo (train + inference)")
    print("2. Train model only")
    print("3. Test specific text (requires trained model)")
    print("4. Exit")

    # Map menu choices to their handlers; '4' (exit) is handled separately.
    actions = {
        '1': run_quick_demo,
        '2': train_model_only,
        '3': test_specific_text,
    }
    while True:
        choice = input("\nEnter your choice (1-4): ").strip()
        if choice == '4':
            print("👋 Goodbye!")
            break
        handler = actions.get(choice)
        if handler is not None:
            handler()
            break
        print("Invalid choice. Please enter 1, 2, 3, or 4.")
# Script entry point: run the interactive menu when executed directly.
# (Fixes a stray " |" extraction artifact after main() that broke the syntax.)
if __name__ == "__main__":
    main()