#!/usr/bin/env python3
"""
Simplified Document Text Extraction API
Uses regex patterns instead of ML model for demonstration
"""

import json
import re
from datetime import datetime
from typing import Dict, List, Any, Optional
from pathlib import Path
import sys
import os

# Add current directory to Python path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

try:
    from fastapi import FastAPI, HTTPException, File, UploadFile
    from fastapi.responses import HTMLResponse, FileResponse
    from fastapi.middleware.cors import CORSMiddleware
    from pydantic import BaseModel
    import uvicorn
    HAS_FASTAPI = True
except ImportError:
    print("FastAPI not installed. Install with: pip install fastapi uvicorn python-multipart")
    HAS_FASTAPI = False


class SimpleDocumentProcessor:
    """Extract invoice-style entities from plain text with regular expressions.

    Attributes:
        patterns: Maps an entity type to its regex strings, tried in order.
            Capture group 1 (when present) is the reported entity text.
        confidence_scores: Fixed confidence reported for every hit of a type.
    """

    # Keys used in the structured summary for each entity type.
    _FIELD_NAMES = {
        'NAME': 'Name',
        'DATE': 'Date',
        'AMOUNT': 'Amount',
        'INVOICE_NO': 'InvoiceNo',
        'EMAIL': 'Email',
        'PHONE': 'Phone',
        'ADDRESS': 'Address',
    }

    # These types rely on [A-Z][a-z]+ to recognise capitalised words, so they
    # must not be matched with re.IGNORECASE; their keywords opt in to case
    # insensitivity locally through (?i:...).
    _CASE_SENSITIVE_TYPES = frozenset({'NAME', 'ADDRESS'})

    def __init__(self):
        """Set up the regex patterns and per-type confidence scores."""
        # Either a comma-grouped number (1,500) or a plain run of digits
        # (1500), with optional cents. Plain digits must be allowed as a
        # whole, otherwise "1500.00" is cut down to "150".
        number = r'(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?'

        # Define regex patterns for different entity types
        self.patterns = {
            'NAME': [
                r'\b(?i:Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',
                r'\b([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b',
                r'(?i:Invoice|Bill|Receipt)\s+(?i:sent\s+)?(?i:to\s+|for\s+)?([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',
            ],
            'DATE': [
                r'\b(\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})\b',
                r'\b(\d{2,4}[\/\-]\d{1,2}[\/\-]\d{1,2})\b',
                r'\b((?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{2,4})\b',
                r'\b((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{2,4})\b',
            ],
            'AMOUNT': [
                r'\$\s*(' + number + r')',
                r'(?:Amount|Total|Sum):\s*\$?\s*(' + number + r')',
                r'(' + number + r'\s*(?:USD|dollars?))',
            ],
            'INVOICE_NO': [
                # \s* sits outside the alternation so that "Invoice Number:"
                # and "Invoice #:" match as well as "Invoice No:".
                r'(?:Invoice|Bill|Receipt)\s*(?:No\.?|#|Number):\s*([A-Z]{2,4}[-\s]?\d{3,6})',
                r'(?:INV|BL|REC)[-\s]?(\d{3,6})',
                r'Reference:\s*([A-Z]{2,4}[-\s]?\d{3,6})',
            ],
            'EMAIL': [
                r'\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b',
            ],
            'PHONE': [
                # (?<!\w) instead of \b: a word boundary can never sit between
                # whitespace and "+" or "(", which made these two unmatchable
                # in normal running text.
                r'(?<!\w)(\+?1[-.\s]?\(?[2-9]\d{2}\)?[-.\s]?\d{3}[-.\s]?\d{4})\b',
                r'(?<!\w)(\([2-9]\d{2}\)\s*[2-9]\d{2}[-.\s]?\d{4})\b',
                r'\b([2-9]\d{2}[-.\s]?[2-9]\d{2}[-.\s]?\d{4})\b',
            ],
            'ADDRESS': [
                r'\b(\d+\s+[A-Z][a-z]+\s+(?i:Street|St|Avenue|Ave|Road|Rd|Lane|Ln|Drive|Dr|Boulevard|Blvd|Way))\b',
            ],
        }

        # Confidence scores for different entity types
        self.confidence_scores = {
            'NAME': 0.80,
            'DATE': 0.85,
            'AMOUNT': 0.85,
            'INVOICE_NO': 0.90,
            'EMAIL': 0.95,
            'PHONE': 0.90,
            'ADDRESS': 0.75,
        }

    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """Extract entities from text using regex patterns.

        Args:
            text: The document text to scan.

        Returns:
            One dict per hit with keys ``entity``, ``text``, ``start``, ``end``
            and ``confidence``. ``start``/``end`` delimit ``text`` itself, so
            ``text[start:end] == entity['text']``. Hits are ordered by entity
            type, then pattern, then position; a span already reported for the
            same type is not reported again.
        """
        entities = []
        seen = set()

        for entity_type, patterns in self.patterns.items():
            flags = 0 if entity_type in self._CASE_SENSITIVE_TYPES else re.IGNORECASE
            confidence = self.confidence_scores[entity_type]

            for pattern in patterns:
                for match in re.finditer(pattern, text, flags):
                    # Report the captured value, not the surrounding keyword;
                    # fall back to the whole match when there is no group 1.
                    has_capture = bool(match.groups()) and match.group(1) is not None
                    group = 1 if has_capture else 0
                    start, end = match.span(group)

                    # Several patterns of one type often hit the same value
                    # (e.g. "$1,500.00" and "Amount: $1,500.00").
                    key = (entity_type, start, end)
                    if key in seen:
                        continue
                    seen.add(key)

                    entities.append({
                        'entity': entity_type,
                        'text': match.group(group),
                        'start': start,
                        'end': end,
                        'confidence': confidence,
                    })

        return entities

    def create_structured_data(self, entities: List[Dict]) -> Dict[str, str]:
        """Create structured data from extracted entities.

        Args:
            entities: Entity dicts as produced by :meth:`extract_entities`.

        Returns:
            A dict with one entry per entity type present, keyed by its display
            name (unknown types keep their raw type as key). The value is the
            text of the highest-confidence entity; on a tie the earliest one
            in ``entities`` wins.
        """
        best_by_type = {}
        for entity in entities:
            current = best_by_type.get(entity['entity'])
            # Strict ">" keeps the first entity on equal confidence.
            if current is None or entity['confidence'] > current['confidence']:
                best_by_type[entity['entity']] = entity

        return {
            self._FIELD_NAMES.get(entity_type, entity_type): entity['text']
            for entity_type, entity in best_by_type.items()
        }

    def process_text(self, text: str) -> Dict[str, Any]:
        """Process text and extract structured information.

        Args:
            text: The document text to analyse.

        Returns:
            ``{'status': 'success', 'data': {...}}`` where ``data`` holds the
            original text, the entity list, the structured summary, an ISO
            timestamp taken from ``datetime.now()``, the entity count and the
            sorted list of entity types found.
        """
        entities = self.extract_entities(text)
        structured_data = self.create_structured_data(entities)

        return {
            'status': 'success',
            'data': {
                'original_text': text,
                'entities': entities,
                'structured_data': structured_data,
                'processing_timestamp': datetime.now().isoformat(),
                'total_entities_found': len(entities),
                'entity_types_found': sorted({entity['entity'] for entity in entities}),
            },
        }


# Pydantic models for API
if HAS_FASTAPI:
    class TextRequest(BaseModel):
        text: str
Extract structured information from documents using AI patterns
Note: File upload processing is simplified in this demo
POST /extract-from-text
Content-Type: application/json
{
"text": "Invoice sent to John Doe on 01/15/2025 Invoice No: INV-1001 Amount: $1,500.00"
}
POST /extract-from-file Content-Type: multipart/form-data file: [uploaded file]
{
"status": "success",
"data": {
"original_text": "...",
"entities": [...],
"structured_data": {...},
"processing_timestamp": "2025-09-27T...",
"total_entities_found": 7,
"entity_types_found": ["NAME", "DATE", "AMOUNT", "INVOICE_NO"]
}
}