|
|
"""
|
|
|
Simple demo script for document text extraction.
|
|
|
Demonstrates the complete workflow from training to inference.
|
|
|
"""
|
|
|
|
|
|
import os
|
|
|
import sys
|
|
|
from pathlib import Path
|
|
|
import jso print(f" {entity['entity']}: '{entity['text']}' ({confidence}%)")
|
|
|
else:
|
|
|
print(f"Error: {result['error']}")
|
|
|
|
|
|
except Exception as e:
|
|
|
print(f"Failed to process text: {e}") Add src to path for imports
|
|
|
sys.path.append(str(Path(__file__).parent))
|
|
|
|
|
|
from src.data_preparation import DocumentProcessor, NERDatasetCreator
|
|
|
from src.training_pipeline import TrainingPipeline, create_custom_config
|
|
|
from src.inference import DocumentInference
|
|
|
|
|
|
|
|
|
def run_quick_demo():
    """Run a quick demonstration of the text extraction system.

    Trains a small model on the fly if no saved model exists, runs
    inference over three hard-coded sample documents, prints the
    per-document extraction results, and saves everything to
    ``results/demo_results.json``.
    """
    print("DOCUMENT TEXT EXTRACTION - QUICK DEMO")
    print("=" * 60)

    demo_texts = [
        {
            "name": "Invoice Example 1",
            "text": "Invoice sent to Robert White on 15/09/2025 Invoice No: INV-1024 Amount: $1,250.00 Phone: (555) 123-4567"
        },
        {
            "name": "Invoice Example 2",
            "text": "Bill for Dr. Sarah Johnson dated March 10, 2025. Invoice Number: BL-2045. Total: $2,300.50 Email: sarah.johnson@email.com"
        },
        {
            "name": "Receipt Example",
            "text": "Receipt for Michael Brown 456 Oak Street Boston MA 02101 Invoice: REC-3089 Date: 2025-04-22 Amount: $890.75"
        }
    ]

    print("\nSample Documents:")
    for i, doc in enumerate(demo_texts, 1):
        print(f"{i}. {doc['name']}: {doc['text'][:60]}...")

    # Train a lightweight model first when no saved model is available.
    model_path = "models/document_ner_model"
    if not Path(model_path).exists():
        print(f"\nModel not found at {model_path}")
        print("Training a new model first...")

        config = create_custom_config()
        config.num_epochs = 2   # small settings keep the demo fast
        config.batch_size = 8

        pipeline = TrainingPipeline(config)
        model_path = pipeline.run_complete_pipeline()

        print(f"Model trained and saved to {model_path}")

    print(f"\nLoading inference pipeline from {model_path}")
    try:
        inference = DocumentInference(model_path)
        print("Inference pipeline loaded successfully")
    except Exception as e:
        # Broad catch is acceptable at this CLI boundary: report and abort the demo.
        print(f"Failed to load inference pipeline: {e}")
        return

    print(f"\nProcessing {len(demo_texts)} demo documents...")
    results = []

    for i, doc in enumerate(demo_texts, 1):
        print(f"\nProcessing Document {i}: {doc['name']}")
        print("-" * 50)
        print(f"Text: {doc['text']}")

        result = inference.process_text_directly(doc['text'])
        results.append({
            'document_name': doc['name'],
            'original_text': doc['text'],
            'result': result
        })

        # A result dict either carries extraction output or an 'error' key.
        if 'error' not in result:
            structured_data = result.get('structured_data', {})
            entities = result.get('entities', [])

            print("\nExtraction Results:")
            if structured_data:
                print("Structured Data:")
                for key, value in structured_data.items():
                    print(f"  {key}: {value}")
            else:
                print("  No structured data extracted")

            if entities:
                print(f"Found {len(entities)} entities:")
                for entity in entities:
                    confidence = int(entity['confidence'] * 100)
                    print(f"  {entity['entity']}: '{entity['text']}' ({confidence}%)")
        else:
            print(f"Error: {result['error']}")

    # Persist raw results so they can be inspected after the demo.
    output_path = "results/demo_results.json"
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\nDemo results saved to: {output_path}")

    # Summary statistics over the successfully processed documents only.
    successful_extractions = sum(1 for r in results if 'error' not in r['result'])
    total_entities = sum(len(r['result'].get('entities', [])) for r in results if 'error' not in r['result'])
    total_structured_fields = sum(len(r['result'].get('structured_data', {})) for r in results if 'error' not in r['result'])

    print("\nDemo Summary:")
    print(f"  Successfully processed: {successful_extractions}/{len(demo_texts)} documents")
    print(f"  Total entities found: {total_entities}")
    print(f"  Total structured fields: {total_structured_fields}")

    print("\nDemo completed successfully!")
    print("You can now:")
    print("  - Run the web API: python api/app.py")
    print("  - Process your own documents using inference.py")
    print("  - Retrain with your data using training_pipeline.py")
|
def train_model_only():
    """Train the model end-to-end without running the inference demo.

    Builds the default custom config, runs the complete training
    pipeline, and prints where the resulting model was saved.
    """
    print("TRAINING MODEL ONLY")
    print("=" * 40)

    config = create_custom_config()
    pipeline = TrainingPipeline(config)

    model_path = pipeline.run_complete_pipeline()

    print("Model training completed!")
    print(f"Model saved to: {model_path}")
|
def test_specific_text():
    """Prompt for a line of text and print the extracted information.

    Requires a previously trained model at ``models/document_ner_model``;
    prints a message and returns early when no model exists or no text
    is entered.
    """
    print("CUSTOM TEXT EXTRACTION")
    print("=" * 40)

    # Guard clause: inference is impossible without a trained model.
    model_path = "models/document_ner_model"
    if not Path(model_path).exists():
        print("No trained model found. Please run training first.")
        return

    print("Enter text to extract information from:")
    print("(Example: Invoice sent to John Doe on 01/15/2025 Invoice No: INV-1001 Amount: $1,500.00)")
    text = input("Text: ").strip()

    if not text:
        print("No text provided.")
        return

    try:
        inference = DocumentInference(model_path)
        result = inference.process_text_directly(text)

        print("\nExtraction Results:")
        # A result dict either carries extraction output or an 'error' key.
        if 'error' not in result:
            structured_data = result.get('structured_data', {})
            if structured_data:
                print("Structured Information:")
                for key, value in structured_data.items():
                    print(f"  {key}: {value}")
            else:
                print("No structured information found.")

            entities = result.get('entities', [])
            if entities:
                print(f"\nEntities Found ({len(entities)}):")
                for entity in entities:
                    confidence = int(entity['confidence'] * 100)
                    print(f"  {entity['entity']}: '{entity['text']}' ({confidence}%)")
        else:
            print(f"Error: {result['error']}")

    except Exception as e:
        # Broad catch is acceptable at this top-level CLI boundary: report, don't crash.
        print(f"Failed to process text: {e}")
|
def main():
    """Show the interactive menu and dispatch the user's chosen action."""
    print("DOCUMENT TEXT EXTRACTION SYSTEM")
    print("=" * 50)
    print("Choose an option:")
    print("1. Run complete demo (train + inference)")
    print("2. Train model only")
    print("3. Test specific text (requires trained model)")
    print("4. Exit")

    # Map menu choices to their handlers; '4' (exit) is handled separately.
    actions = {
        '1': run_quick_demo,
        '2': train_model_only,
        '3': test_specific_text,
    }

    # Loop until a valid choice is entered, then run it once and exit.
    while True:
        choice = input("\nEnter your choice (1-4): ").strip()

        if choice in actions:
            actions[choice]()
            break
        if choice == '4':
            print("👋 Goodbye!")
            break
        print("Invalid choice. Please enter 1, 2, 3, or 4.")
|
if __name__ == "__main__":
|
|
|
main() |