# ocr-layout-detection-poc / api_client.py
# Ayaan Sharif
# Add file validation and better error handling
# 255e6fd
#!/usr/bin/env python3
"""
OCR Layout Detection API Client
================================
Simple script to interact with the OCR Layout Detection service.

Usage:
    python api_client.py <path_to_file>

Examples:
    python api_client.py invoice.pdf
    python api_client.py document.jpg
    python api_client.py signature.png --signature-only
"""
import os
import sys
import json
import argparse
from pathlib import Path
from gradio_client import Client, handle_file
# Remote endpoint configuration
# Identifier handed to gradio_client.Client to locate the service.
SPACE_URL = "Ayaan-Sharif/ocr-layout-detection-poc"
# Optional access token; None when the HF_TOKEN environment variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
def analyze_document(file_path, mode="Fast", enable_ocr=True, enable_tables=True,
                     detect_signatures=False, signature_conf=0.05):
    """
    Run the full layout-analysis endpoint on a single document.

    Args:
        file_path: Path to the PDF or image file to upload
        mode: Processing mode, "Fast" or "Accurate"
        enable_ocr: Whether text should be extracted with OCR
        enable_tables: Whether tables should be detected and extracted
        detect_signatures: Whether signatures should be detected as well (slower)
        signature_conf: Confidence threshold for signatures (documented
            range 0.01-0.5; the value is forwarded without validation)

    Returns:
        dict with the keys "visualization", "summary", "markdown" and "json",
        or None when any step of the request raised an exception.
    """
    print(f"πŸ“„ Analyzing document: {file_path}")
    print(f" Mode: {mode} | OCR: {enable_ocr} | Tables: {enable_tables} | Signatures: {detect_signatures}")

    try:
        api = Client(SPACE_URL, hf_token=HF_TOKEN)
        request = dict(
            file=handle_file(file_path),
            mode=mode,
            enable_ocr=enable_ocr,
            enable_tables=enable_tables,
            run_signature_yolo=detect_signatures,
            signature_conf=signature_conf,
            api_name="/gradio_interface",
        )
        # The endpoint answers with exactly four outputs, in this order:
        # visualization image, summary text, markdown text, JSON text.
        viz, summary_text, markdown_text, layout_json = api.predict(**request)
        print("βœ… Analysis complete!")
        return dict(
            visualization=viz,
            summary=summary_text,
            markdown=markdown_text,
            json=layout_json,
        )
    except Exception as exc:
        # Best-effort client: report the failure and signal it with None.
        print(f"❌ Error: {exc}")
        return None
def detect_signatures_only(file_path, multiscale=True, conf=0.03, iou=0.45, augment=True):
    """
    Call the signature-only endpoint (no OCR or layout analysis).

    Args:
        file_path: Path to the PDF or image file to upload
        multiscale: Try multiple scales (1.0, 1.5, 2.0) for better detection
        conf: Confidence threshold (documented range 0.01-0.5, lower = more
            detections; forwarded without validation)
        iou: IoU threshold for NMS (documented range 0.1-0.9)
        augment: Use augmentation (slower but better recall)

    Returns:
        dict with the keys "annotated_image", "summary" and "json",
        or None when any step of the request raised an exception.
    """
    print(f"✍️ Detecting signatures in: {file_path}")
    print(f" Multiscale: {multiscale} | Conf: {conf} | IoU: {iou} | Augment: {augment}")

    try:
        api = Client(SPACE_URL, hf_token=HF_TOKEN)
        request = dict(
            file=handle_file(file_path),
            try_scales=multiscale,
            conf=conf,
            iou=iou,
            augment=augment,
            api_name="/signature_only_infer",
        )
        # The endpoint answers with exactly three outputs, in this order:
        # annotated image, summary text, JSON detections.
        image, summary_text, detections = api.predict(**request)
        print("βœ… Signature detection complete!")
        return dict(
            annotated_image=image,
            summary=summary_text,
            json=detections,
        )
    except Exception as exc:
        # Best-effort client: report the failure and signal it with None.
        print(f"❌ Error: {exc}")
        return None
def _result_file_path(entry):
    """Return the local path carried by an image result, or None if absent."""
    # Image outputs may arrive either as a mapping with a "path" key or
    # directly as a path; accept both instead of assuming a dict.
    if isinstance(entry, dict):
        entry = entry.get("path")
    if isinstance(entry, (str, os.PathLike)):
        return os.fspath(entry)
    return None


def _result_text(value):
    """Return *value* unchanged if it is a string, else serialize it as JSON."""
    if isinstance(value, str):
        return value
    return json.dumps(value, ensure_ascii=False, indent=2)


def save_results(results, output_dir="output"):
    """Save API results to files.

    Args:
        results: dict as returned by analyze_document or
            detect_signatures_only, or None. Missing or empty entries are
            skipped.
        output_dir: Directory that receives the files; created if needed.

    Files written (each only when the corresponding entry is present):
        visualization.png         from "visualization"
        signatures_annotated.png  from "annotated_image"
        content.md                from "markdown"
        layout.json               from "json"
        summary.txt               from "summary"

    Image entries may be a path or a dict with a "path" key; text entries
    that are not already strings (e.g. parsed JSON) are serialized with
    json.dumps before being written.
    """
    import shutil  # local import: shutil is not imported at file level

    # Nothing to save: return before touching the filesystem so that a failed
    # run does not leave an empty output directory behind.
    if results is None:
        return
    os.makedirs(output_dir, exist_ok=True)

    # (result key, output file name, label used in the progress message)
    image_outputs = (
        ("visualization", "visualization.png", "visualization"),
        ("annotated_image", "signatures_annotated.png", "annotated image"),
    )
    for key, filename, label in image_outputs:
        source_path = _result_file_path(results.get(key))
        if source_path and os.path.exists(source_path):
            output_path = os.path.join(output_dir, filename)
            shutil.copy(source_path, output_path)
            print(f"πŸ’Ύ Saved {label}: {output_path}")

    text_outputs = (
        ("markdown", "content.md", "markdown"),
        ("json", "layout.json", "JSON"),
        ("summary", "summary.txt", "summary"),
    )
    for key, filename, label in text_outputs:
        value = results.get(key)
        if not value:
            continue
        output_path = os.path.join(output_dir, filename)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(_result_text(value))
        print(f"πŸ’Ύ Saved {label}: {output_path}")
def main():
    """Command-line entry point.

    Parses the arguments, validates the input path, calls either the
    signature-only endpoint or the full analysis endpoint, and saves the
    results to the output directory.

    Exits with status 1 when the input path does not exist, is not a regular
    file, or when the API call failed (the endpoint helper returned a falsy
    result). An unrecognized file extension only produces a warning.
    """
    parser = argparse.ArgumentParser(
        description="OCR Layout Detection API Client",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Full document analysis with OCR
python api_client.py invoice.pdf
# Accurate mode with signature detection
python api_client.py document.pdf --mode Accurate --detect-signatures
# Signature detection only (faster)
python api_client.py contract.jpg --signature-only
# Custom output directory
python api_client.py file.pdf --output results/
"""
    )
    parser.add_argument("file", help="Path to document (PDF, JPG, PNG)")
    parser.add_argument("--mode", choices=["Fast", "Accurate"], default="Fast",
                        help="Processing mode (default: Fast)")
    parser.add_argument("--no-ocr", action="store_true", help="Disable OCR")
    parser.add_argument("--no-tables", action="store_true", help="Disable table detection")
    parser.add_argument("--detect-signatures", action="store_true",
                        help="Also detect signatures in full analysis")
    parser.add_argument("--signature-conf", type=float, default=0.05,
                        help="Signature confidence threshold (default: 0.05)")
    parser.add_argument("--signature-only", action="store_true",
                        help="Only detect signatures (faster, no OCR)")
    parser.add_argument("--output", "-o", default="output",
                        help="Output directory (default: output)")
    args = parser.parse_args()

    # Validate the input path. A bare existence check would let a directory
    # through and only fail later during upload, so require a regular file.
    source = Path(args.file)
    if not source.exists():
        print(f"❌ Error: File not found: {args.file}")
        sys.exit(1)
    if not source.is_file():
        print(f"❌ Error: Not a file: {args.file}")
        sys.exit(1)

    # Check file type (warning only: the request is still attempted)
    supported = (".pdf", ".jpg", ".jpeg", ".png", ".tiff", ".bmp")
    ext = source.suffix.lower()
    if ext not in supported:
        print(f"⚠️ Warning: Unsupported file type: {ext}")
        print(" Supported: .pdf, .jpg, .jpeg, .png, .tiff, .bmp")

    print(f"\nπŸš€ Starting API call to {SPACE_URL}\n")

    # Call appropriate API endpoint
    if args.signature_only:
        # NOTE: --signature-conf is not forwarded here; signature-only mode
        # runs with detect_signatures_only's own defaults.
        results = detect_signatures_only(args.file)
    else:
        results = analyze_document(
            args.file,
            mode=args.mode,
            enable_ocr=not args.no_ocr,
            enable_tables=not args.no_tables,
            detect_signatures=args.detect_signatures,
            signature_conf=args.signature_conf,
        )

    # Save results
    if not results:
        print("\n❌ Failed to process document")
        sys.exit(1)

    print(f"\nπŸ“ Saving results to: {args.output}/")
    save_results(results, args.output)
    print("\n✨ Done!")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()