mikeboone's picture
feat: vendor legitdata source package into repository
1502291
#!/usr/bin/env python3
"""
LegitData CLI - Command line interface for realistic data generation.
Usage:
legitdata generate --ddl schema.sql --url https://example.com \\
--use-case "Retail Analytics" --connection "snowflake://..." --size medium
legitdata preview --ddl schema.sql --url https://example.com \\
--use-case "Retail Analytics" --rows 5
"""
import argparse
import json
import sys
from typing import Optional
def create_parser() -> argparse.ArgumentParser:
    """Build the ``legitdata`` argument parser.

    The parser exposes three subcommands -- ``generate``, ``preview`` and
    ``cache`` -- and records the chosen one in ``args.command``.

    Returns:
        The configured top-level parser.
    """
    root = argparse.ArgumentParser(
        prog='legitdata',
        description='Generate realistic synthetic data for analytics warehouses',
    )
    commands = root.add_subparsers(dest='command', help='Commands')

    # Each option is a (flags, add_argument keyword settings) pair.
    # These three are declared identically by ``generate`` and ``preview``.
    ddl_option = (
        ('--ddl', '-d'),
        {'required': True, 'help': 'Path to DDL file or DDL string'},
    )
    url_option = (
        ('--url', '-u'),
        {'required': True, 'help': 'Company website URL for context'},
    )
    key_option = (
        ('--anthropic-key',),
        {
            'type': str,
            'default': None,
            'help': 'Anthropic API key (or set ANTHROPIC_API_KEY env var)',
        },
    )

    # (subcommand name, subcommand help, options in registration order)
    layout = (
        (
            'generate',
            'Generate and insert data',
            (
                ddl_option,
                url_option,
                (
                    ('--use-case', '-c'),
                    {
                        'required': True,
                        'help': 'Analytics use case (e.g., "Retail Analytics")',
                    },
                ),
                (
                    ('--connection', '-n'),
                    {'required': True, 'help': 'Database connection string'},
                ),
                (
                    ('--size', '-s'),
                    {
                        'default': 'medium',
                        'choices': ['small', 'medium', 'large', 'xl'],
                        'help': 'Size preset (default: medium)',
                    },
                ),
                (
                    ('--rows', '-r'),
                    {
                        'type': str,
                        'default': None,
                        'help': 'Custom row counts as JSON (e.g., \'{"fact_sales": 5000}\')',
                    },
                ),
                (
                    ('--no-truncate',),
                    {
                        'action': 'store_true',
                        'help': 'Do not truncate tables before insert',
                    },
                ),
                (
                    ('--dry-run',),
                    {
                        'action': 'store_true',
                        'help': 'Preview operations without writing to database',
                    },
                ),
                (
                    ('--no-cache',),
                    {'action': 'store_true', 'help': 'Disable caching'},
                ),
                key_option,
            ),
        ),
        (
            'preview',
            'Preview generated data',
            (
                ddl_option,
                url_option,
                (
                    ('--use-case', '-c'),
                    {'required': True, 'help': 'Analytics use case'},
                ),
                (
                    ('--table', '-t'),
                    {
                        'type': str,
                        'default': None,
                        'help': 'Specific table to preview',
                    },
                ),
                (
                    ('--rows', '-r'),
                    {
                        'type': int,
                        'default': 5,
                        'help': 'Number of rows to preview (default: 5)',
                    },
                ),
                key_option,
            ),
        ),
        (
            'cache',
            'Manage cache',
            (
                (
                    ('--clear',),
                    {'action': 'store_true', 'help': 'Clear all cached data'},
                ),
                (
                    ('--stats',),
                    {'action': 'store_true', 'help': 'Show cache statistics'},
                ),
                (
                    ('--type',),
                    {
                        'type': str,
                        'default': None,
                        'choices': ['context', 'search', 'classification', 'generation'],
                        'help': 'Specific cache type to clear',
                    },
                ),
            ),
        ),
    )

    for name, summary, options in layout:
        sub = commands.add_parser(name, help=summary)
        for flags, settings in options:
            sub.add_argument(*flags, **settings)

    return root
def get_anthropic_client(api_key: Optional[str] = None):
    """Return an Anthropic client, or ``None`` when one cannot be created.

    Args:
        api_key: API key to use. When empty or ``None``, the
            ``ANTHROPIC_API_KEY`` environment variable is used instead.

    Returns:
        An ``anthropic.Anthropic`` instance, or ``None`` if no key is
        available or the ``anthropic`` package cannot be imported. In both
        ``None`` cases a warning is written to stderr so it does not mix
        with data printed on stdout.
    """
    import os

    key = api_key or os.environ.get('ANTHROPIC_API_KEY')
    if not key:
        print(
            "Warning: No Anthropic API key provided. AI features will be limited.",
            file=sys.stderr,
        )
        return None

    # Only the import is guarded: an ImportError raised while constructing
    # the client would not mean the package is missing.
    try:
        from anthropic import Anthropic
    except ImportError:
        print(
            "Warning: anthropic package not installed. Install with: pip install anthropic",
            file=sys.stderr,
        )
        return None

    return Anthropic(api_key=key)
def cmd_generate(args) -> int:
    """Execute the ``generate`` command.

    Args:
        args: Parsed arguments of the ``generate`` subcommand. Reads
            ``rows``, ``anthropic_key``, ``url``, ``use_case``,
            ``connection``, ``no_cache``, ``dry_run``, ``ddl``, ``size``
            and ``no_truncate``.

    Returns:
        0 on success. 1 if ``--rows`` is not valid JSON or is not a JSON
        object, if the DDL cannot be loaded, or if generation raises.
        Error messages are written to stderr.
    """
    # Validate the custom row counts first so bad input is reported before
    # the generator package is imported or any client is created.
    row_counts = None
    if args.rows:
        try:
            row_counts = json.loads(args.rows)
        except json.JSONDecodeError as e:
            print(f"Error parsing row counts JSON: {e}", file=sys.stderr)
            return 1
        # JSON ``null`` is kept as "no override" (same as omitting --rows);
        # any other non-object value cannot map table names to counts.
        if row_counts is not None and not isinstance(row_counts, dict):
            print(
                "Error parsing row counts JSON: expected an object mapping "
                f"table names to row counts, got {type(row_counts).__name__}",
                file=sys.stderr,
            )
            return 1

    from legitdata import LegitGenerator

    # Get Anthropic client
    anthropic_client = get_anthropic_client(args.anthropic_key)

    # Create generator
    gen = LegitGenerator(
        url=args.url,
        use_case=args.use_case,
        connection_string=args.connection,
        anthropic_client=anthropic_client,
        cache_enabled=not args.no_cache,
        dry_run=args.dry_run,
    )

    # Load DDL
    try:
        gen.load_ddl(args.ddl)
    except Exception as e:
        print(f"Error loading DDL: {e}", file=sys.stderr)
        return 1

    # Generate
    try:
        results = gen.generate(
            size=args.size,
            row_counts=row_counts,
            truncate_first=not args.no_truncate,
        )
        print("\nResults:")
        for table, count in results.items():
            print(f" {table}: {count} rows")
        return 0
    except Exception as e:
        # Message and traceback both go to stderr so they stay together.
        print(f"Error generating data: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        return 1
def cmd_preview(args) -> int:
    """Execute the ``preview`` command.

    Generates sample rows without a database connection and prints them to
    stdout, one table at a time; columns whose value is ``None`` are not
    printed.

    Args:
        args: Parsed arguments of the ``preview`` subcommand. Reads
            ``rows``, ``anthropic_key``, ``url``, ``use_case``, ``ddl``
            and ``table``.

    Returns:
        0 on success. 1 if ``--rows`` is negative, if the DDL cannot be
        loaded, or if previewing raises. Error messages are written to
        stderr.
    """
    # Reject a negative row count up front, before any package import.
    # NOTE(review): assumes negative counts have no special meaning in
    # gen.preview -- confirm against the generator.
    if args.rows is not None and args.rows < 0:
        print(
            f"Error: --rows must not be negative (got {args.rows})",
            file=sys.stderr,
        )
        return 1

    from legitdata import LegitGenerator

    # Get Anthropic client
    anthropic_client = get_anthropic_client(args.anthropic_key)

    # Create generator (no connection needed for preview)
    gen = LegitGenerator(
        url=args.url,
        use_case=args.use_case,
        connection_string="",  # Not needed for preview
        anthropic_client=anthropic_client,
        dry_run=True,
    )

    # Load DDL
    try:
        gen.load_ddl(args.ddl)
    except Exception as e:
        print(f"Error loading DDL: {e}", file=sys.stderr)
        return 1

    # Preview
    try:
        preview_data = gen.preview(
            table_name=args.table,
            num_rows=args.rows,
        )
        print("\n" + "=" * 60)
        print("PREVIEW DATA")
        print("=" * 60)
        for table_name, rows in preview_data.items():
            print(f"\n{table_name}:")
            print("-" * 40)
            for i, row in enumerate(rows):
                print(f"Row {i + 1}:")
                for key, value in row.items():
                    if value is not None:
                        print(f" {key}: {value}")
                print()
        return 0
    except Exception as e:
        # Message and traceback both go to stderr so they stay together.
        print(f"Error previewing data: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        return 1
def cmd_cache(args) -> int:
    """Execute the ``cache`` command.

    Args:
        args: Parsed arguments of the ``cache`` subcommand. Reads
            ``clear``, ``stats`` and ``type``.

    Returns:
        0 after clearing the cache or printing its statistics; 1 when
        neither ``--clear`` nor ``--stats`` was requested. When both flags
        are given, only the clear is performed.
    """
    # Nothing to do: report the usage hint on stderr without importing the
    # cache module or constructing a cache object.
    if not (args.clear or args.stats):
        print("Use --clear or --stats", file=sys.stderr)
        return 1

    from legitdata.cache import FileCache

    cache = FileCache()

    if args.clear:
        count = cache.clear(args.type)
        print(f"Cleared {count} cache entries")
        return 0

    # Only --stats remains at this point.
    stats = cache.get_stats()
    print("Cache Statistics:")
    print(f" Total entries: {stats['total']}")
    for cache_type, count in stats['by_type'].items():
        print(f" {cache_type}: {count}")
    return 0
def main(argv: Optional[list] = None) -> int:
    """Main entry point.

    Args:
        argv: Command-line arguments, without the program name. When
            ``None`` (the default), argparse reads ``sys.argv[1:]``, so
            calling ``main()`` behaves as before.

    Returns:
        The exit status of the selected subcommand, or 0 after printing
        the help text when no subcommand was given.
    """
    parser = create_parser()
    args = parser.parse_args(argv)

    handlers = {
        'generate': cmd_generate,
        'preview': cmd_preview,
        'cache': cmd_cache,
    }
    handler = handlers.get(args.command)
    if handler is None:
        # No subcommand supplied: show usage instead of failing.
        parser.print_help()
        return 0
    return handler(args)
# Script entry point: exit the process with the status returned by main().
if __name__ == '__main__':
    raise SystemExit(main())