"""Command-line interface for docstrange."""
import argparse
import sys
import os
import json
from pathlib import Path
from typing import List
from .extractor import DocumentExtractor
from .exceptions import ConversionError, UnsupportedFormatError, FileNotFoundError
from . import __version__
def print_version():
    """Display the docstrange version banner and a short description."""
    banner = (
        f"docstrange v{__version__}",
        "Convert any document, text, or URL into LLM-ready data format",
        "with advanced intelligent document processing capabilities.",
    )
    for line in banner:
        print(line)
def print_supported_formats(extractor: DocumentExtractor):
    """Print the extractor's supported input formats, grouped by category."""
    print("Supported input formats:")
    print()
    formats = extractor.get_supported_formats()
    # Ordered (category, known members) pairs; only members actually
    # reported by the extractor are shown, preserving this display order.
    groupings = [
        ("Documents", {'.pdf', '.docx', '.doc', '.txt', '.text'}),
        ("Data Files", {'.xlsx', '.xls', '.csv'}),
        ("Presentations", {'.ppt', '.pptx'}),
        ("Web", {'URLs'}),
        ("Images", {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif'}),
        ("Web Files", {'.html', '.htm'}),
    ]
    for category, members in groupings:
        matched = [fmt for fmt in formats if fmt in members]
        if not matched:
            continue
        print(f" {category}:")
        for fmt in matched:
            print(f" - {fmt}")
    print()
def process_single_input(extractor: DocumentExtractor, input_item: str, output_format: str, verbose: bool = False) -> dict:
    """Extract one input (URL, existing file path, or raw text).

    Never raises: conversion problems are folded into the returned dict.

    Returns:
        On success: {"success": True, "result": <ConversionResult>,
        "input_type": "URL"|"File"|"Text", "input_item": input_item}.
        On failure: {"success": False, "error": <message>, "input_item": ...}.
    """
    if verbose:
        print(f"Processing: {input_item}", file=sys.stderr)
    try:
        if input_item.startswith(('http://', 'https://')):
            # URL input — only supported by local processing.
            if extractor.cloud_mode:
                raise ConversionError("URL processing is not supported in cloud mode. Use local mode for URLs.")
            extracted, kind = extractor.extract_url(input_item), "URL"
        elif os.path.exists(input_item):
            # Existing path on disk — treat as a document file.
            extracted, kind = extractor.extract(input_item), "File"
        else:
            # Neither a URL nor a file: interpret the argument as raw text.
            if extractor.cloud_mode:
                raise ConversionError("Text processing is not supported in cloud mode. Use local mode for text.")
            extracted, kind = extractor.extract_text(input_item), "Text"
    except FileNotFoundError:
        return {"success": False, "error": "File not found", "input_item": input_item}
    except UnsupportedFormatError:
        return {"success": False, "error": "Unsupported format", "input_item": input_item}
    except ConversionError as e:
        return {"success": False, "error": f"Conversion error: {e}", "input_item": input_item}
    except Exception as e:
        return {"success": False, "error": f"Unexpected error: {e}", "input_item": input_item}
    return {
        "success": True,
        "result": extracted,
        "input_type": kind,
        "input_item": input_item,
    }
def handle_login(force_reauth: bool = False) -> int:
    """Run the browser-based authentication flow.

    Args:
        force_reauth: When True, ignore cached credentials and re-authenticate.

    Returns:
        0 on successful login, 1 on any failure.
    """
    try:
        from .services.auth_service import get_authenticated_token

        print("\nπ DocStrange Authentication")
        print("=" * 50)
        token = get_authenticated_token(force_reauth=force_reauth)
        if not token:
            print("β Authentication failed.")
            return 1
        print("β Authentication successful!")
        masked_token = f"π Access Token: {token[:12]}...{token[-4:]}"
        # Best effort: surface the logged-in identity from cached credentials.
        try:
            from .services.auth_service import AuthService
            creds = AuthService().get_cached_credentials()
            if creds and creds.get('auth0_direct'):
                print(f"π€ Logged in as: {creds.get('user_email', 'Unknown')}")
                print(f"π€ Name: {creds.get('user_name', 'Unknown')}")
                print("π Via: Auth0 Google Login")
            print(masked_token)
            print("πΎ Credentials cached securely")
        except Exception:
            # Identity lookup is cosmetic; still confirm the token was cached.
            print(masked_token)
            print("πΎ Credentials cached securely")
        print("\nπ‘ You can now use DocStrange cloud features without specifying --api-key")
        print("π Your CLI is authenticated with the same Google account used on docstrange.nanonets.com")
        return 0
    except ImportError:
        print("β Authentication service not available.", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"β Authentication error: {e}", file=sys.stderr)
        return 1
def handle_logout() -> int:
    """Handle the logout command by clearing cached credentials.

    Returns:
        0 when the cached credentials were cleared, 1 on any failure.
    """
    try:
        from .services.auth_service import clear_auth
        clear_auth()
    except ImportError:
        print("β Authentication service not available.", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"β Error clearing credentials: {e}", file=sys.stderr)
        return 1
    else:
        print("β Logged out successfully.")
        print("πΎ Cached authentication credentials cleared.")
        return 0
def handle_api_keys_command(argv: List[str]) -> int:
    """Handle API key management commands.

    Usage:
        docstrange api-keys list
        docstrange api-keys add <key>
        docstrange api-keys remove <key>
        docstrange api-keys stats

    Args:
        argv: Arguments following the "api-keys" token; an empty list
            defaults to the "list" sub-command.

    Returns:
        0 on success, 1 on a usage error or unknown sub-command.
    """
    from .services.api_key_pool import ApiKeyPool

    def _print_stats(title: str, stats: dict) -> None:
        # Shared stats banner for the "list" and "stats" sub-commands.
        print(f"\n{title}")
        print("=" * 40)
        print(f"Total keys: {stats['total_keys']}")
        print(f"Available: {stats['available']}")
        print(f"Rate limited: {stats['rate_limited']}")
        print(f"Total requests: {stats['total_requests']}")

    pool = ApiKeyPool.get_instance()
    command = argv[0] if argv else "list"
    if command == "list":
        keys = pool.get_all_keys()
        _print_stats("π API Key Pool", pool.get_pool_stats())
        print()
        if keys:
            print("Keys:")
            for i, masked in enumerate(keys, 1):
                print(f" {i}. {masked}")
        else:
            print("No API keys configured.")
            print("\nπ‘ Add keys with: docstrange api-keys add <key>")
            print("π‘ Or set NANONETS_API_KEYS env var (comma-separated)")
        return 0
    if command == "add":
        if len(argv) < 2:
            print("β Usage: docstrange api-keys add <key>", file=sys.stderr)
            return 1
        key = argv[1]
        if pool.add_key(key, source="cli"):
            pool.save_config()
            print(f"β API key added: {key[:8]}...{key[-4:]}")
        else:
            # A duplicate key is reported but not treated as an error.
            print("β οΈ API key already exists in pool")
        return 0
    if command == "remove":
        if len(argv) < 2:
            print("β Usage: docstrange api-keys remove <key>", file=sys.stderr)
            return 1
        key = argv[1]
        if pool.remove_key(key):
            pool.save_config()
            print(f"β API key removed: {key[:8]}...{key[-4:]}")
            return 0
        print("β API key not found in pool", file=sys.stderr)
        return 1
    if command == "stats":
        _print_stats("π API Key Pool Statistics", pool.get_pool_stats())
        return 0
    print(f"β Unknown api-keys command: {command}", file=sys.stderr)
    print("Usage: docstrange api-keys [list|add|remove|stats]", file=sys.stderr)
    return 1
def main():
    """Entry point for the docstrange CLI.

    Parses command-line arguments, dispatches sub-commands ("login",
    "api-keys", "web"), extracts every input (file, URL, or raw text),
    renders the requested output format, and writes the result to stdout
    or to --output-file.

    Returns:
        0 when every input was processed successfully, 1 otherwise.
        (argparse errors exit with code 2; some fatal output errors call
        sys.exit(1) directly, skipping the summary.)
    """
    # RawDescriptionHelpFormatter keeps the epilog examples verbatim.
    parser = argparse.ArgumentParser(
        description="Convert documents to LLM-ready formats with intelligent document processing",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Authentication (browser-based login)
docstrange login # One-click browser login
docstrange login --reauth # Force re-authentication
# API Key Management
docstrange api-keys list # List all configured API keys
docstrange api-keys add <key> # Add an API key to the rotation pool
docstrange api-keys remove <key> # Remove an API key
docstrange api-keys stats # Show pool usage statistics
# Start web interface
docstrange web # Start web interface at http://localhost:8000
# Convert a PDF to markdown (default cloud mode)
docstrange document.pdf
# Convert with free API key with increased limits
docstrange document.pdf --api-key YOUR_API_KEY
# Convert with multiple API keys for automatic rotation
docstrange document.pdf --api-keys KEY1 KEY2 KEY3
# Force local GPU processing
docstrange document.pdf --gpu-mode
# Convert to different output formats
docstrange document.pdf --output html
docstrange document.pdf --output json
docstrange document.pdf --output csv # Extract tables as CSV
# Use specific model for cloud processing
docstrange document.pdf --model gemini
docstrange document.pdf --model openapi --output json
docstrange document.pdf --model nanonets --output csv
# Convert a URL (works in all modes)
docstrange https://example.com --output html
# Convert plain text (works in all modes)
docstrange "Hello world" --output json
# Convert multiple files
docstrange file1.pdf file2.docx file3.xlsx --output markdown
# Extract specific fields using cloud processing
docstrange invoice.pdf --output json --extract-fields invoice_number total_amount vendor_name
# Extract using JSON schema with cloud processing
docstrange document.pdf --output json --json-schema schema.json
# Save output to file
docstrange document.pdf --output-file output.md
# Use environment variable for API key
export NANONETS_API_KEY=your_api_key
docstrange document.pdf
# List supported formats
docstrange --list-formats
# Show version
docstrange --version
"""
    )
    parser.add_argument(
        "input",
        nargs="*",
        help="Input file(s), URL(s), or text to extract"
    )
    parser.add_argument(
        "--output", "-o",
        choices=["markdown", "html", "json", "text", "csv"],
        default="markdown",
        help="Output format (default: markdown)"
    )
    # Processing mode arguments
    parser.add_argument(
        "--gpu-mode",
        action="store_true",
        help="Force local GPU processing (disables cloud mode, requires GPU)"
    )
    parser.add_argument(
        "--api-key",
        help="API key for increased cloud access (get it free from https://app.nanonets.com/#/keys)"
    )
    parser.add_argument(
        "--api-keys",
        nargs="+",
        help="Multiple API keys for automatic rotation when one hits rate limit"
    )
    parser.add_argument(
        "--model",
        choices=["gemini", "openapi", "nanonets"],
        help="Model to use for cloud processing (gemini, openapi, nanonets)"
    )
    parser.add_argument(
        "--ollama-url",
        default="http://localhost:11434",
        help="Ollama server URL for local field extraction (default: http://localhost:11434)"
    )
    parser.add_argument(
        "--ollama-model",
        default="llama3.2",
        help="Ollama model for local field extraction (default: llama3.2)"
    )
    parser.add_argument(
        "--extract-fields",
        nargs="+",
        help="Extract specific fields using cloud processing (e.g., --extract-fields invoice_number total_amount)"
    )
    parser.add_argument(
        "--json-schema",
        help="JSON schema file for structured extraction using cloud processing"
    )
    parser.add_argument(
        "--preserve-layout",
        action="store_true",
        default=True,
        help="Preserve document layout (default: True)"
    )
    parser.add_argument(
        "--include-images",
        action="store_true",
        help="Include images in output"
    )
    parser.add_argument(
        "--ocr-enabled",
        action="store_true",
        help="Enable intelligent document processing for images and PDFs"
    )
    parser.add_argument(
        "--output-file", "-f",
        help="Output file path (if not specified, prints to stdout)"
    )
    parser.add_argument(
        "--list-formats",
        action="store_true",
        help="List supported input formats and exit"
    )
    parser.add_argument(
        "--version",
        action="store_true",
        help="Show version information and exit"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose output"
    )
    parser.add_argument(
        "--login",
        action="store_true",
        help="Perform browser-based authentication login"
    )
    parser.add_argument(
        "--reauth",
        action="store_true",
        help="Force re-authentication (use with --login)"
    )
    parser.add_argument(
        "--logout",
        action="store_true",
        help="Clear cached authentication credentials"
    )
    args = parser.parse_args()
    # Handle version flag
    if args.version:
        print_version()
        return 0
    # Handle list formats flag
    if args.list_formats:
        # Create an extractor only to query its supported formats.
        extractor = DocumentExtractor(
            api_key=args.api_key,
            model=args.model,
            gpu=args.gpu_mode
        )
        print_supported_formats(extractor)
        return 0
    # Handle authentication commands: "login" as a positional command.
    if args.input and args.input[0] == "login":
        force_reauth = "--reauth" in sys.argv
        return handle_login(force_reauth)
    # Handle API key management commands.
    # BUGFIX: pass only the arguments after the "api-keys" token. The old
    # sys.argv[1:] slice included "api-keys" itself as argv[0], so every
    # sub-command (list/add/remove/stats) was rejected as unknown.
    if args.input and args.input[0] == "api-keys":
        return handle_api_keys_command(args.input[1:])
    # Handle web command
    if args.input and args.input[0] == "web":
        try:
            from .web_app import run_web_app
            print("Starting DocStrange web interface...")
            print("Open your browser and go to: http://localhost:8000")
            print("Press Ctrl+C to stop the server")
            run_web_app(host='0.0.0.0', port=8000, debug=False)
            return 0
        except ImportError:
            print("β Web interface not available. Install Flask: pip install Flask", file=sys.stderr)
            return 1
    # Handle login/logout flags (logout wins when both are given).
    if args.login or args.logout:
        if args.logout:
            return handle_logout()
        else:
            return handle_login(args.reauth)
    # Check if input is provided; parser.error exits with code 2.
    if not args.input:
        parser.error("No input specified. Please provide file(s), URL(s), or text to extract.")
    # Cloud mode is default. Without login/API key it's limited calls.
    # Use 'docstrange login' (recommended) or --api-key for 10k docs/month for free.
    # Initialize extractor
    extractor = DocumentExtractor(
        api_key=args.api_key,
        api_keys=args.api_keys,
        model=args.model,
        gpu=args.gpu_mode
    )
    if args.verbose:
        mode = "local" if args.gpu_mode else "cloud"
        print(f"Initialized extractor in {mode} mode:")
        print(f" - Output format: {args.output}")
        if mode == "cloud":
            pool_stats = extractor.get_api_key_pool_stats()
            print(f" - API Key Pool: {pool_stats['available']}/{pool_stats['total_keys']} keys available")
            if args.model:
                print(f" - Model: {args.model}")
        else:
            print(" - Local processing: GPU")
        print()
    # Process inputs; successes collect results, failures collect error dicts.
    results = []
    errors = []
    for i, input_item in enumerate(args.input, 1):
        if args.verbose and len(args.input) > 1:
            print(f"[{i}/{len(args.input)}] Processing: {input_item}", file=sys.stderr)
        result = process_single_input(extractor, input_item, args.output, args.verbose)
        if result["success"]:
            results.append(result["result"])
            if not args.verbose:
                print(f"Processing ... : {input_item}", file=sys.stderr)
        else:
            errors.append(result)
            print(f"β Failed: {input_item} - {result['error']}", file=sys.stderr)
    # Bail out when nothing succeeded.
    if not results:
        print("β No files were successfully processed.", file=sys.stderr)
        if errors:
            print("Errors encountered:", file=sys.stderr)
            for error in errors:
                print(f" - {error['input_item']}: {error['error']}", file=sys.stderr)
        return 1
    # Generate output
    if len(results) == 1:
        # Single result
        result = results[0]
        if args.output == "markdown":
            output_content = result.extract_markdown()
        elif args.output == "html":
            output_content = result.extract_html()
        elif args.output == "json":
            # Handle field extraction if specified
            json_schema = None
            if args.json_schema:
                try:
                    with open(args.json_schema, 'r') as f:
                        json_schema = json.load(f)
                except Exception as e:
                    print(f"Error loading JSON schema: {e}", file=sys.stderr)
                    sys.exit(1)
            try:
                result_json = result.extract_data(
                    specified_fields=args.extract_fields,
                    json_schema=json_schema,
                )
                output_content = json.dumps(result_json, indent=2)
            except Exception as e:
                print(f"Error during JSON extraction: {e}", file=sys.stderr)
                sys.exit(1)
        elif args.output == "csv":
            try:
                output_content = result.extract_csv(include_all_tables=True)
            except ValueError as e:
                print(f"Error: {e}", file=sys.stderr)
                sys.exit(1)
        else:  # text
            output_content = result.extract_text()
    else:
        # Multiple results - combine them
        if args.output == "markdown":
            output_content = "\n\n---\n\n".join(r.extract_markdown() for r in results)
        elif args.output == "html":
            output_content = "\n\n<hr>\n\n".join(r.extract_html() for r in results)
        elif args.output == "json":
            # Handle field extraction for multiple results
            json_schema = None
            if args.json_schema:
                try:
                    with open(args.json_schema, 'r') as f:
                        json_schema = json.load(f)
                except Exception as e:
                    print(f"Error loading JSON schema: {e}", file=sys.stderr)
                    sys.exit(1)
            try:
                extracted_results = []
                for r in results:
                    result_json = r.extract_data(
                        specified_fields=args.extract_fields,
                        json_schema=json_schema,
                    )
                    extracted_results.append(result_json)
                combined_json = {
                    "results": extracted_results,
                    "count": len(results),
                    "errors": [{"input": e["input_item"], "error": e["error"]} for e in errors] if errors else []
                }
                output_content = json.dumps(combined_json, indent=2)
            except Exception as e:
                print(f"Error during JSON extraction: {e}", file=sys.stderr)
                sys.exit(1)
        elif args.output == "csv":
            csv_outputs = []
            for i, r in enumerate(results):
                try:
                    csv_content = r.extract_csv(include_all_tables=True)
                    if csv_content.strip():
                        csv_outputs.append(f"=== File {i + 1} ===\n{csv_content}")
                except ValueError:
                    # Skip files without tables
                    continue
            if not csv_outputs:
                print("Error: No tables found in any of the input files", file=sys.stderr)
                sys.exit(1)
            output_content = "\n\n".join(csv_outputs)
        else:  # text
            output_content = "\n\n---\n\n".join(r.extract_text() for r in results)
    # Write output
    if args.output_file:
        try:
            with open(args.output_file, 'w', encoding='utf-8') as f:
                f.write(output_content)
            print(f"β Output written to: {args.output_file}", file=sys.stderr)
        except Exception as e:
            print(f"β Failed to write output file: {e}", file=sys.stderr)
            return 1
    else:
        print(output_content)
    # Summary
    if args.verbose or len(args.input) > 1:
        print(f"\nSummary: {len(results)} successful, {len(errors)} failed", file=sys.stderr)
    return 0 if not errors else 1
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())