# Hugging Face upload artifact: uploaded by arjunbhargav212 ("Upload 63 files", commit 5b14aa2, verified)
"""Command-line interface for docstrange."""
import argparse
import sys
import os
import json
from pathlib import Path
from typing import List
from .extractor import DocumentExtractor
from .exceptions import ConversionError, UnsupportedFormatError, FileNotFoundError
from . import __version__
def print_version():
    """Print version information."""
    banner = (
        f"docstrange v{__version__}",
        "Convert any document, text, or URL into LLM-ready data format",
        "with advanced intelligent document processing capabilities.",
    )
    for line in banner:
        print(line)
def print_supported_formats(extractor: DocumentExtractor):
    """Print supported formats in a nice format."""
    print("Supported input formats:")
    print()
    formats = extractor.get_supported_formats()
    # Ordered (label, membership-set) pairs; each category shows only the
    # formats the extractor actually reports, in the extractor's order.
    groupings = [
        ("Documents", {'.pdf', '.docx', '.doc', '.txt', '.text'}),
        ("Data Files", {'.xlsx', '.xls', '.csv'}),
        ("Presentations", {'.ppt', '.pptx'}),
        ("Web", {'URLs'}),
        ("Images", {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif'}),
        ("Web Files", {'.html', '.htm'}),
    ]
    for category, members in groupings:
        matched = [f for f in formats if f in members]
        if not matched:
            continue
        print(f" {category}:")
        for fmt in matched:
            print(f" - {fmt}")
        print()
def process_single_input(extractor: DocumentExtractor, input_item: str, output_format: str, verbose: bool = False) -> dict:
    """Process a single input item and return result with metadata.

    The item is classified as a URL (http/https prefix), an existing
    local file path, or — as a fallback — raw text.

    NOTE(review): output_format is currently unused here; it is kept for
    interface compatibility with the caller.
    """
    if verbose:
        print(f"Processing: {input_item}", file=sys.stderr)

    def _failure(message: str) -> dict:
        # Uniform error envelope for every failure mode.
        return {"success": False, "error": message, "input_item": input_item}

    try:
        if input_item.startswith(('http://', 'https://')):
            # URL input — only supported in local mode.
            if extractor.cloud_mode:
                raise ConversionError("URL processing is not supported in cloud mode. Use local mode for URLs.")
            result = extractor.extract_url(input_item)
            input_type = "URL"
        elif os.path.exists(input_item):
            # Local file input.
            result = extractor.extract(input_item)
            input_type = "File"
        else:
            # Fallback: treat the argument as raw text (local mode only).
            if extractor.cloud_mode:
                raise ConversionError("Text processing is not supported in cloud mode. Use local mode for text.")
            result = extractor.extract_text(input_item)
            input_type = "Text"
    except FileNotFoundError:
        return _failure("File not found")
    except UnsupportedFormatError:
        return _failure("Unsupported format")
    except ConversionError as e:
        return _failure(f"Conversion error: {e}")
    except Exception as e:
        return _failure(f"Unexpected error: {e}")
    return {
        "success": True,
        "result": result,
        "input_type": input_type,
        "input_item": input_item,
    }
def handle_login(force_reauth: bool = False) -> int:
    """Handle login command.

    Returns 0 on successful authentication, 1 otherwise.
    """

    def _show_token(tok: str) -> None:
        # Masked token display plus cache confirmation.
        print(f"🔑 Access Token: {tok[:12]}...{tok[-4:]}")
        print("💾 Credentials cached securely")

    try:
        from .services.auth_service import get_authenticated_token
        print("\n🔐 DocStrange Authentication")
        print("=" * 50)
        token = get_authenticated_token(force_reauth=force_reauth)
        if not token:
            print("❌ Authentication failed.")
            return 1
        print("✅ Authentication successful!")
        # Best-effort: show cached user details when available.
        try:
            from .services.auth_service import AuthService
            cached_creds = AuthService().get_cached_credentials()
            if cached_creds and cached_creds.get('auth0_direct'):
                print(f"👤 Logged in as: {cached_creds.get('user_email', 'Unknown')}")
                print(f"👤 Name: {cached_creds.get('user_name', 'Unknown')}")
                print(f"🔐 Via: Auth0 Google Login")
            _show_token(token)
        except Exception:
            _show_token(token)
        print("\n💡 You can now use DocStrange cloud features without specifying --api-key")
        print("🌐 Your CLI is authenticated with the same Google account used on docstrange.nanonets.com")
        return 0
    except ImportError:
        print("❌ Authentication service not available.", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"❌ Authentication error: {e}", file=sys.stderr)
        return 1
def handle_logout() -> int:
    """Handle logout command.

    Returns 0 when cached credentials were cleared, 1 on failure.
    """
    try:
        from .services.auth_service import clear_auth
        clear_auth()
        print("✅ Logged out successfully.")
        print("💾 Cached authentication credentials cleared.")
    except ImportError:
        print("❌ Authentication service not available.", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"❌ Error clearing credentials: {e}", file=sys.stderr)
        return 1
    return 0
def handle_api_keys_command(argv: List[str]) -> int:
    """Handle API key management commands.

    Usage:
        docstrange api-keys list
        docstrange api-keys add <key>
        docstrange api-keys remove <key>
        docstrange api-keys stats

    Args:
        argv: Remaining CLI tokens. A leading "api-keys" token (as forwarded
            by main() from sys.argv[1:]) is tolerated and skipped.

    Returns:
        Process exit code: 0 on success, 1 on error.
    """
    from .services.api_key_pool import ApiKeyPool

    # BUG FIX: main() forwards sys.argv[1:], whose first element is the
    # subcommand name itself ("api-keys"). Previously that token was compared
    # against "list"/"add"/... and every invocation fell into the
    # unknown-command branch. Strip it so the actual action is inspected.
    if argv and argv[0] == "api-keys":
        argv = argv[1:]

    pool = ApiKeyPool.get_instance()

    def _print_stats(stats: dict) -> None:
        # Stat lines shared by the "list" and "stats" actions.
        print(f"Total keys: {stats['total_keys']}")
        print(f"Available: {stats['available']}")
        print(f"Rate limited: {stats['rate_limited']}")
        print(f"Total requests: {stats['total_requests']}")

    if not argv or argv[0] == "list":
        keys = pool.get_all_keys()
        stats = pool.get_pool_stats()
        print("\n🔑 API Key Pool")
        print("=" * 40)
        _print_stats(stats)
        print()
        if keys:
            print("Keys:")
            for i, masked in enumerate(keys, 1):
                print(f" {i}. {masked}")
        else:
            print("No API keys configured.")
            print("\n💡 Add keys with: docstrange api-keys add <key>")
            print("💡 Or set NANONETS_API_KEYS env var (comma-separated)")
        return 0
    elif argv[0] == "add":
        if len(argv) < 2:
            print("❌ Usage: docstrange api-keys add <key>", file=sys.stderr)
            return 1
        key = argv[1]
        if pool.add_key(key, source="cli"):
            pool.save_config()
            print(f"✅ API key added: {key[:8]}...{key[-4:]}")
        else:
            # Duplicate keys are not an error; warn and exit successfully.
            print("⚠️ API key already exists in pool")
        return 0
    elif argv[0] == "remove":
        if len(argv) < 2:
            print("❌ Usage: docstrange api-keys remove <key>", file=sys.stderr)
            return 1
        key = argv[1]
        if pool.remove_key(key):
            pool.save_config()
            print(f"✅ API key removed: {key[:8]}...{key[-4:]}")
            return 0
        print("❌ API key not found in pool", file=sys.stderr)
        return 1
    elif argv[0] == "stats":
        stats = pool.get_pool_stats()
        print("\n📊 API Key Pool Statistics")
        print("=" * 40)
        _print_stats(stats)
        return 0
    else:
        print(f"❌ Unknown api-keys command: {argv[0]}", file=sys.stderr)
        print("Usage: docstrange api-keys [list|add|remove|stats]", file=sys.stderr)
        return 1
def _load_json_schema(path: str) -> dict:
    """Load a JSON schema file, exiting the process on failure.

    Shared by the single- and multi-result JSON rendering paths so the
    error message stays consistent.
    """
    try:
        with open(path, 'r') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading JSON schema: {e}", file=sys.stderr)
        sys.exit(1)


def _render_single(result, args) -> str:
    """Render one extraction result in the requested output format.

    Exits the process with status 1 on extraction/rendering errors.
    """
    if args.output == "markdown":
        return result.extract_markdown()
    if args.output == "html":
        return result.extract_html()
    if args.output == "json":
        json_schema = _load_json_schema(args.json_schema) if args.json_schema else None
        try:
            result_json = result.extract_data(
                specified_fields=args.extract_fields,
                json_schema=json_schema,
            )
            return json.dumps(result_json, indent=2)
        except Exception as e:
            print(f"Error during JSON extraction: {e}", file=sys.stderr)
            sys.exit(1)
    if args.output == "csv":
        try:
            return result.extract_csv(include_all_tables=True)
        except ValueError as e:
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(1)
    # Default: plain text.
    return result.extract_text()


def _render_multiple(results, errors, args) -> str:
    """Render and combine several extraction results in the requested format.

    Exits the process with status 1 on extraction/rendering errors.
    """
    if args.output == "markdown":
        return "\n\n---\n\n".join(r.extract_markdown() for r in results)
    if args.output == "html":
        return "\n\n<hr>\n\n".join(r.extract_html() for r in results)
    if args.output == "json":
        json_schema = _load_json_schema(args.json_schema) if args.json_schema else None
        try:
            extracted = [
                r.extract_data(
                    specified_fields=args.extract_fields,
                    json_schema=json_schema,
                )
                for r in results
            ]
            combined = {
                "results": extracted,
                "count": len(results),
                "errors": [{"input": e["input_item"], "error": e["error"]} for e in errors] if errors else []
            }
            return json.dumps(combined, indent=2)
        except Exception as e:
            print(f"Error during JSON extraction: {e}", file=sys.stderr)
            sys.exit(1)
    if args.output == "csv":
        csv_outputs = []
        for i, r in enumerate(results):
            try:
                content = r.extract_csv(include_all_tables=True)
            except ValueError:
                # Skip files without tables.
                continue
            if content.strip():
                csv_outputs.append(f"=== File {i + 1} ===\n{content}")
        if not csv_outputs:
            print("Error: No tables found in any of the input files", file=sys.stderr)
            sys.exit(1)
        return "\n\n".join(csv_outputs)
    # Default: plain text.
    return "\n\n---\n\n".join(r.extract_text() for r in results)


def main():
    """Main CLI function.

    Parses arguments, dispatches subcommand-style positionals
    (login / api-keys / web), runs extraction over each input, renders
    the requested output format, and returns a process exit code
    (0 when everything succeeded, 1 otherwise).
    """
    parser = argparse.ArgumentParser(
        description="Convert documents to LLM-ready formats with intelligent document processing",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Authentication (browser-based login)
docstrange login # One-click browser login
docstrange login --reauth # Force re-authentication
# API Key Management
docstrange api-keys list # List all configured API keys
docstrange api-keys add <key> # Add an API key to the rotation pool
docstrange api-keys remove <key> # Remove an API key
docstrange api-keys stats # Show pool usage statistics
# Start web interface
docstrange web # Start web interface at http://localhost:8000
# Convert a PDF to markdown (default cloud mode)
docstrange document.pdf
# Convert with free API key with increased limits
docstrange document.pdf --api-key YOUR_API_KEY
# Convert with multiple API keys for automatic rotation
docstrange document.pdf --api-keys KEY1 KEY2 KEY3
# Force local GPU processing
docstrange document.pdf --gpu-mode
# Convert to different output formats
docstrange document.pdf --output html
docstrange document.pdf --output json
docstrange document.pdf --output csv # Extract tables as CSV
# Use specific model for cloud processing
docstrange document.pdf --model gemini
docstrange document.pdf --model openapi --output json
docstrange document.pdf --model nanonets --output csv
# Convert a URL (works in all modes)
docstrange https://example.com --output html
# Convert plain text (works in all modes)
docstrange "Hello world" --output json
# Convert multiple files
docstrange file1.pdf file2.docx file3.xlsx --output markdown
# Extract specific fields using cloud processing
docstrange invoice.pdf --output json --extract-fields invoice_number total_amount vendor_name
# Extract using JSON schema with cloud processing
docstrange document.pdf --output json --json-schema schema.json
# Save output to file
docstrange document.pdf --output-file output.md
# Use environment variable for API key
export NANONETS_API_KEY=your_api_key
docstrange document.pdf
# List supported formats
docstrange --list-formats
# Show version
docstrange --version
"""
    )
    parser.add_argument(
        "input",
        nargs="*",
        help="Input file(s), URL(s), or text to extract"
    )
    parser.add_argument(
        "--output", "-o",
        choices=["markdown", "html", "json", "text", "csv"],
        default="markdown",
        help="Output format (default: markdown)"
    )
    # Processing mode arguments
    parser.add_argument(
        "--gpu-mode",
        action="store_true",
        help="Force local GPU processing (disables cloud mode, requires GPU)"
    )
    parser.add_argument(
        "--api-key",
        help="API key for increased cloud access (get it free from https://app.nanonets.com/#/keys)"
    )
    parser.add_argument(
        "--api-keys",
        nargs="+",
        help="Multiple API keys for automatic rotation when one hits rate limit"
    )
    parser.add_argument(
        "--model",
        choices=["gemini", "openapi", "nanonets"],
        help="Model to use for cloud processing (gemini, openapi, nanonets)"
    )
    parser.add_argument(
        "--ollama-url",
        default="http://localhost:11434",
        help="Ollama server URL for local field extraction (default: http://localhost:11434)"
    )
    parser.add_argument(
        "--ollama-model",
        default="llama3.2",
        help="Ollama model for local field extraction (default: llama3.2)"
    )
    parser.add_argument(
        "--extract-fields",
        nargs="+",
        help="Extract specific fields using cloud processing (e.g., --extract-fields invoice_number total_amount)"
    )
    parser.add_argument(
        "--json-schema",
        help="JSON schema file for structured extraction using cloud processing"
    )
    parser.add_argument(
        "--preserve-layout",
        action="store_true",
        default=True,
        help="Preserve document layout (default: True)"
    )
    parser.add_argument(
        "--include-images",
        action="store_true",
        help="Include images in output"
    )
    parser.add_argument(
        "--ocr-enabled",
        action="store_true",
        help="Enable intelligent document processing for images and PDFs"
    )
    parser.add_argument(
        "--output-file", "-f",
        help="Output file path (if not specified, prints to stdout)"
    )
    parser.add_argument(
        "--list-formats",
        action="store_true",
        help="List supported input formats and exit"
    )
    parser.add_argument(
        "--version",
        action="store_true",
        help="Show version information and exit"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose output"
    )
    parser.add_argument(
        "--login",
        action="store_true",
        help="Perform browser-based authentication login"
    )
    parser.add_argument(
        "--reauth",
        action="store_true",
        help="Force re-authentication (use with --login)"
    )
    parser.add_argument(
        "--logout",
        action="store_true",
        help="Clear cached authentication credentials"
    )
    args = parser.parse_args()

    # Handle version flag
    if args.version:
        print_version()
        return 0

    # Handle list formats flag
    if args.list_formats:
        # Create an extractor only to query its supported formats.
        extractor = DocumentExtractor(
            api_key=args.api_key,
            model=args.model,
            gpu=args.gpu_mode
        )
        print_supported_formats(extractor)
        return 0

    # Subcommand-style positionals: "login", "api-keys", "web".
    if args.input and args.input[0] == "login":
        # FIX: argparse already parsed --reauth; no need to rescan sys.argv.
        return handle_login(args.reauth)

    if args.input and args.input[0] == "api-keys":
        # BUG FIX: forward only the tokens after the "api-keys" subcommand.
        # The old code passed sys.argv[1:], whose first element was the
        # literal "api-keys", so the handler never matched a known action.
        return handle_api_keys_command(args.input[1:])

    if args.input and args.input[0] == "web":
        try:
            from .web_app import run_web_app
            print("Starting DocStrange web interface...")
            print("Open your browser and go to: http://localhost:8000")
            print("Press Ctrl+C to stop the server")
            run_web_app(host='0.0.0.0', port=8000, debug=False)
            return 0
        except ImportError:
            print("❌ Web interface not available. Install Flask: pip install Flask", file=sys.stderr)
            return 1

    # Handle login flags
    if args.login or args.logout:
        if args.logout:
            return handle_logout()
        return handle_login(args.reauth)

    # Check if input is provided
    if not args.input:
        parser.error("No input specified. Please provide file(s), URL(s), or text to extract.")

    # Cloud mode is default. Without login/API key it's limited calls.
    # Use 'docstrange login' (recommended) or --api-key for 10k docs/month for free.
    extractor = DocumentExtractor(
        api_key=args.api_key,
        api_keys=args.api_keys,
        model=args.model,
        gpu=args.gpu_mode
    )

    if args.verbose:
        mode = "local" if args.gpu_mode else "cloud"
        print(f"Initialized extractor in {mode} mode:")
        print(f" - Output format: {args.output}")
        if mode == "cloud":
            pool_stats = extractor.get_api_key_pool_stats()
            print(f" - API Key Pool: {pool_stats['available']}/{pool_stats['total_keys']} keys available")
            if args.model:
                print(f" - Model: {args.model}")
        else:
            print(f" - Local processing: GPU")
        print()

    # Process each input, collecting successes and failures separately.
    results = []
    errors = []
    for i, input_item in enumerate(args.input, 1):
        if args.verbose and len(args.input) > 1:
            print(f"[{i}/{len(args.input)}] Processing: {input_item}", file=sys.stderr)
        outcome = process_single_input(extractor, input_item, args.output, args.verbose)
        if outcome["success"]:
            results.append(outcome["result"])
            if not args.verbose:
                print(f"Processing ... : {input_item}", file=sys.stderr)
        else:
            errors.append(outcome)
            print(f"❌ Failed: {input_item} - {outcome['error']}", file=sys.stderr)

    # Abort if nothing succeeded.
    if not results:
        print("❌ No files were successfully processed.", file=sys.stderr)
        if errors:
            print("Errors encountered:", file=sys.stderr)
            for error in errors:
                print(f" - {error['input_item']}: {error['error']}", file=sys.stderr)
        return 1

    # Render output (single vs. combined multi-result form).
    if len(results) == 1:
        output_content = _render_single(results[0], args)
    else:
        output_content = _render_multiple(results, errors, args)

    # Write output to file or stdout.
    if args.output_file:
        try:
            with open(args.output_file, 'w', encoding='utf-8') as f:
                f.write(output_content)
            print(f"✅ Output written to: {args.output_file}", file=sys.stderr)
        except Exception as e:
            print(f"❌ Failed to write output file: {e}", file=sys.stderr)
            return 1
    else:
        print(output_content)

    # Summary
    if args.verbose or len(args.input) > 1:
        print(f"\nSummary: {len(results)} successful, {len(errors)} failed", file=sys.stderr)
    return 0 if not errors else 1
if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit status.
    raise SystemExit(main())