#!/usr/bin/env python3
"""
Simplified Document Text Extraction API
Uses regex patterns instead of ML model for demonstration
"""

import json
import re
from datetime import datetime
from typing import Dict, List, Any, Optional
from pathlib import Path
import sys
import os

# Add current directory to Python path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

try:
    from fastapi import FastAPI, HTTPException, File, UploadFile
    from fastapi.responses import HTMLResponse, FileResponse
    from fastapi.middleware.cors import CORSMiddleware
    from pydantic import BaseModel
    import uvicorn
    HAS_FASTAPI = True
except ImportError:
    print("FastAPI not installed. Install with: pip install fastapi uvicorn python-multipart")
    HAS_FASTAPI = False


# Body served by GET /. Kept at module level so the multi-line literal does
# not have to be indented inside the route handler.
_INTERFACE_PAGE = """ Document Text Extraction Demo

Document Text Extraction

Extract structured information from documents using AI patterns

Enter Text to Extract:

Try These Examples:

Upload Document:


Note: File upload processing is simplified in this demo

API Documentation

Endpoints:

POST /extract-from-text
Content-Type: application/json
{
  "text": "Invoice sent to John Doe on 01/15/2025 Invoice No: INV-1001 Amount: $1,500.00"
}
POST /extract-from-file
Content-Type: multipart/form-data
file: [uploaded file]

Response Format:

{
  "status": "success",
  "data": {
    "original_text": "...",
    "entities": [...],
    "structured_data": {...},
    "processing_timestamp": "2025-09-27T...",
    "total_entities_found": 7,
    "entity_types_found": ["NAME", "DATE", "AMOUNT", "INVOICE_NO"]
  }
}
"""


class SimpleDocumentProcessor:
    """Simplified document processor using regex patterns"""

    # Entity types whose patterns depend on letter case. NAME detection is
    # built on "capitalised word" ([A-Z][a-z]+); matching it case-insensitively
    # turns every run of ordinary words into a "name".
    _CASE_SENSITIVE_TYPES = frozenset({'NAME'})

    # Entity type -> key used in the structured output.
    _FIELD_NAMES = {
        'NAME': 'Name',
        'DATE': 'Date',
        'AMOUNT': 'Amount',
        'INVOICE_NO': 'InvoiceNo',
        'EMAIL': 'Email',
        'PHONE': 'Phone',
        'ADDRESS': 'Address',
    }

    def __init__(self):
        # Define regex patterns for different entity types
        self.patterns = {
            'NAME': [
                r'\b(?:Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',
                r'\b([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b',
                r'(?:Invoice|Bill|Receipt)\s+(?:sent\s+)?(?:to\s+|for\s+)?([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',
            ],
            'DATE': [
                r'\b(\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})\b',
                r'\b(\d{2,4}[\/\-]\d{1,2}[\/\-]\d{1,2})\b',
                r'\b((?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{2,4})\b',
                r'\b((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{2,4})\b',
            ],
            'AMOUNT': [
                r'\$\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)',
                r'(?:Amount|Total|Sum):\s*\$?\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)',
                r'(\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s*(?:USD|dollars?))',
            ],
            'INVOICE_NO': [
                r'(?:Invoice|Bill|Receipt)(?:\s+No\.?|#|Number):\s*([A-Z]{2,4}[-\s]?\d{3,6})',
                r'(?:INV|BL|REC)[-\s]?(\d{3,6})',
                r'Reference:\s*([A-Z]{2,4}[-\s]?\d{3,6})',
            ],
            'EMAIL': [
                r'\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b',
            ],
            'PHONE': [
                # A leading \b cannot sit in front of "+" or "(" when the
                # previous character is whitespace, so numbers such as
                # "(555) 234-5678" were never found. (?<!\w) expresses the
                # intended "not glued to a preceding word" condition.
                r'(?<!\w)(\+?1[-.\s]?\(?[2-9]\d{2}\)?[-.\s]?\d{3}[-.\s]?\d{4})\b',
                r'(?<!\w)(\([2-9]\d{2}\)\s*[2-9]\d{2}[-.\s]?\d{4})\b',
                r'\b([2-9]\d{2}[-.\s]?[2-9]\d{2}[-.\s]?\d{4})\b',
            ],
            'ADDRESS': [
                r'\b(\d+\s+[A-Z][a-z]+\s+(?:Street|St|Avenue|Ave|Road|Rd|Lane|Ln|Drive|Dr|Boulevard|Blvd|Way))\b',
            ]
        }

        # Confidence scores for different entity types
        self.confidence_scores = {
            'NAME': 0.80,
            'DATE': 0.85,
            'AMOUNT': 0.85,
            'INVOICE_NO': 0.90,
            'EMAIL': 0.95,
            'PHONE': 0.90,
            'ADDRESS': 0.75
        }

    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """Extract entities from text using regex patterns.

        Args:
            text: The raw text to scan.

        Returns:
            A list of dicts with keys ``entity`` (type name), ``text`` (the
            extracted value), ``start``/``end`` (offsets of that value in
            ``text``, so ``text[start:end] == entity['text']``) and
            ``confidence``. Entities appear in pattern-definition order; a
            span reported by several patterns of the same type is listed once.
        """
        entities = []
        seen_spans = set()

        for entity_type, patterns in self.patterns.items():
            if entity_type in self._CASE_SENSITIVE_TYPES:
                flags = 0
            else:
                flags = re.IGNORECASE

            for pattern in patterns:
                for match in re.finditer(pattern, text, flags):
                    # Report the captured value when there is one; fall back
                    # to the whole match for patterns without a group.
                    has_value_group = bool(match.groups()) and match.group(1) is not None
                    group = 1 if has_value_group else 0
                    start, end = match.span(group)

                    span_key = (entity_type, start, end)
                    if span_key in seen_spans:
                        continue
                    seen_spans.add(span_key)

                    entities.append({
                        'entity': entity_type,
                        'text': match.group(group),
                        'start': start,
                        'end': end,
                        'confidence': self.confidence_scores[entity_type]
                    })

        return entities

    def create_structured_data(self, entities: List[Dict]) -> Dict[str, str]:
        """Create structured data from extracted entities.

        Args:
            entities: Entity dicts as produced by ``extract_entities``.

        Returns:
            A mapping from output field name (e.g. ``'InvoiceNo'``) to the
            text of the highest-confidence entity of that type. On equal
            confidence the earliest entity in ``entities`` wins. Unknown
            entity types are emitted under their own type name.
        """
        best_by_type = {}
        for entity in entities:
            current = best_by_type.get(entity['entity'])
            # Strict ">" keeps the first entity on ties.
            if current is None or entity['confidence'] > current['confidence']:
                best_by_type[entity['entity']] = entity

        return {
            self._FIELD_NAMES.get(entity_type, entity_type): best['text']
            for entity_type, best in best_by_type.items()
        }

    def process_text(self, text: str) -> Dict[str, Any]:
        """Process text and extract structured information.

        Args:
            text: The raw text to analyse.

        Returns:
            A dict with ``status`` set to ``'success'`` and a ``data`` dict
            holding the original text, the entity list, the structured
            summary, a local ISO timestamp, the entity count and the sorted
            list of entity types that were found.
        """
        entities = self.extract_entities(text)
        structured_data = self.create_structured_data(entities)

        # Get unique entity types
        entity_types = sorted({entity['entity'] for entity in entities})

        return {
            'status': 'success',
            'data': {
                'original_text': text,
                'entities': entities,
                'structured_data': structured_data,
                'processing_timestamp': datetime.now().isoformat(),
                'total_entities_found': len(entities),
                'entity_types_found': entity_types
            }
        }


# Pydantic models for API
if HAS_FASTAPI:
    class TextRequest(BaseModel):
        """Request body for POST /extract-from-text."""
        text: str


def create_app():
    """Create and configure FastAPI app.

    Returns:
        A FastAPI application exposing ``/``, ``/extract-from-text``,
        ``/extract-from-file`` and ``/health``.

    Raises:
        ImportError: If the FastAPI dependencies could not be imported.
    """
    if not HAS_FASTAPI:
        raise ImportError("FastAPI dependencies not installed")

    app = FastAPI(
        title="Simple Document Text Extraction API",
        description="Extract structured information from documents using regex patterns",
        version="1.0.0"
    )

    # Enable CORS. Wildcard origins must not be combined with credentials;
    # this API uses no cookies or auth headers, so credentials stay off.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=False,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Initialize processor
    processor = SimpleDocumentProcessor()

    @app.get("/", response_class=HTMLResponse)
    async def get_interface():
        """Serve the web interface"""
        return _INTERFACE_PAGE

    @app.post("/extract-from-text")
    async def extract_from_text(request: TextRequest):
        """Extract entities from text"""
        try:
            return processor.process_text(request.text)
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    @app.post("/extract-from-file")
    async def extract_from_file(file: UploadFile = File(...)):
        """Extract entities from uploaded file.

        ``.txt`` uploads are decoded as UTF-8 and processed; any other file
        type is replaced by a fixed sample sentence (demo behaviour).
        """
        try:
            # Read file content
            content = await file.read()

            # The client may omit the filename entirely.
            filename = file.filename or "unknown"

            # For demo purposes, convert to text (simplified)
            if filename.lower().endswith('.txt'):
                try:
                    text = content.decode('utf-8')
                except UnicodeDecodeError as e:
                    # Bad input is a client error, not a server failure.
                    raise HTTPException(
                        status_code=400,
                        detail="Uploaded .txt file is not valid UTF-8"
                    ) from e
            else:
                # For other file types, use sample text in demo
                text = "Demo processing for " + filename + ": Invoice sent to John Doe on 01/15/2025 Invoice No: INV-1001 Amount: $1,500.00"

            return processor.process_text(text)

        except HTTPException:
            # Keep deliberate HTTP errors (e.g. the 400 above) intact.
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    @app.get("/health")
    async def health_check():
        """Health check endpoint"""
        return {"status": "healthy", "timestamp": datetime.now().isoformat()}

    return app


def main():
    """Main function to run the API server"""
    if not HAS_FASTAPI:
        print("FastAPI dependencies not installed.")
        print("📦 Install with: pip install fastapi uvicorn python-multipart")
        return

    print("Starting Simple Document Text Extraction API...")
    print("Access the web interface at: http://localhost:7000")
    print("API documentation at: http://localhost:7000/docs")
    print("Health check at: http://localhost:7000/health")
    print("\nServer starting...")

    app = create_app()
    uvicorn.run(app, host="0.0.0.0", port=7000, log_level="info")


if __name__ == "__main__":
    main()