Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| LegitData CLI - Command line interface for realistic data generation. | |
| Usage: | |
| legitdata generate --ddl schema.sql --url https://example.com \\ | |
| --use-case "Retail Analytics" --connection "snowflake://..." --size medium | |
| legitdata preview --ddl schema.sql --url https://example.com \\ | |
| --use-case "Retail Analytics" --rows 5 | |
| """ | |
| import argparse | |
| import json | |
| import sys | |
| from typing import Optional | |
def create_parser() -> argparse.ArgumentParser:
    """Build and return the top-level argument parser with all subcommands."""
    root = argparse.ArgumentParser(
        prog='legitdata',
        description='Generate realistic synthetic data for analytics warehouses',
    )
    commands = root.add_subparsers(dest='command', help='Commands')

    # --- generate: produce data and insert it into the warehouse ---
    generate_cmd = commands.add_parser('generate', help='Generate and insert data')
    generate_cmd.add_argument('--ddl', '-d', required=True,
                              help='Path to DDL file or DDL string')
    generate_cmd.add_argument('--url', '-u', required=True,
                              help='Company website URL for context')
    generate_cmd.add_argument('--use-case', '-c', required=True,
                              help='Analytics use case (e.g., "Retail Analytics")')
    generate_cmd.add_argument('--connection', '-n', required=True,
                              help='Database connection string')
    generate_cmd.add_argument('--size', '-s', default='medium',
                              choices=['small', 'medium', 'large', 'xl'],
                              help='Size preset (default: medium)')
    generate_cmd.add_argument('--rows', '-r', type=str, default=None,
                              help='Custom row counts as JSON (e.g., \'{"fact_sales": 5000}\')')
    generate_cmd.add_argument('--no-truncate', action='store_true',
                              help='Do not truncate tables before insert')
    generate_cmd.add_argument('--dry-run', action='store_true',
                              help='Preview operations without writing to database')
    generate_cmd.add_argument('--no-cache', action='store_true',
                              help='Disable caching')
    generate_cmd.add_argument('--anthropic-key', type=str, default=None,
                              help='Anthropic API key (or set ANTHROPIC_API_KEY env var)')

    # --- preview: show sample rows without touching the database ---
    preview_cmd = commands.add_parser('preview', help='Preview generated data')
    preview_cmd.add_argument('--ddl', '-d', required=True,
                             help='Path to DDL file or DDL string')
    preview_cmd.add_argument('--url', '-u', required=True,
                             help='Company website URL for context')
    preview_cmd.add_argument('--use-case', '-c', required=True,
                             help='Analytics use case')
    preview_cmd.add_argument('--table', '-t', type=str, default=None,
                             help='Specific table to preview')
    preview_cmd.add_argument('--rows', '-r', type=int, default=5,
                             help='Number of rows to preview (default: 5)')
    preview_cmd.add_argument('--anthropic-key', type=str, default=None,
                             help='Anthropic API key (or set ANTHROPIC_API_KEY env var)')

    # --- cache: inspect or clear the on-disk cache ---
    cache_cmd = commands.add_parser('cache', help='Manage cache')
    cache_cmd.add_argument('--clear', action='store_true',
                           help='Clear all cached data')
    cache_cmd.add_argument('--stats', action='store_true',
                           help='Show cache statistics')
    cache_cmd.add_argument('--type', type=str, default=None,
                           choices=['context', 'search', 'classification', 'generation'],
                           help='Specific cache type to clear')

    return root
def get_anthropic_client(api_key: Optional[str] = None):
    """Return an Anthropic client, or None when one cannot be created.

    The key is resolved from the explicit *api_key* argument first, then
    from the ANTHROPIC_API_KEY environment variable. A warning is printed
    and None returned when no key is available or when the `anthropic`
    package is not installed.
    """
    import os

    resolved_key = api_key if api_key else os.environ.get('ANTHROPIC_API_KEY')
    if not resolved_key:
        print("Warning: No Anthropic API key provided. AI features will be limited.")
        return None

    try:
        from anthropic import Anthropic
    except ImportError:
        print("Warning: anthropic package not installed. Install with: pip install anthropic")
        return None
    return Anthropic(api_key=resolved_key)
def cmd_generate(args) -> int:
    """Run the `generate` subcommand; return a process exit code (0 on success)."""
    from legitdata import LegitGenerator

    # Optional per-table row-count overrides, supplied as a JSON object string.
    custom_counts = None
    if args.rows:
        try:
            custom_counts = json.loads(args.rows)
        except json.JSONDecodeError as exc:
            print(f"Error parsing row counts JSON: {exc}")
            return 1

    client = get_anthropic_client(args.anthropic_key)

    generator = LegitGenerator(
        url=args.url,
        use_case=args.use_case,
        connection_string=args.connection,
        anthropic_client=client,
        cache_enabled=not args.no_cache,
        dry_run=args.dry_run,
    )

    try:
        generator.load_ddl(args.ddl)
    except Exception as exc:
        print(f"Error loading DDL: {exc}")
        return 1

    try:
        outcome = generator.generate(
            size=args.size,
            row_counts=custom_counts,
            truncate_first=not args.no_truncate,
        )
        print("\nResults:")
        for table_name, inserted in outcome.items():
            print(f" {table_name}: {inserted} rows")
        return 0
    except Exception as exc:
        print(f"Error generating data: {exc}")
        import traceback
        traceback.print_exc()
        return 1
def cmd_preview(args) -> int:
    """Run the `preview` subcommand; return a process exit code (0 on success)."""
    from legitdata import LegitGenerator

    client = get_anthropic_client(args.anthropic_key)

    # Preview never writes to a warehouse, so no connection string is needed
    # and dry_run is forced on.
    generator = LegitGenerator(
        url=args.url,
        use_case=args.use_case,
        connection_string="",  # Not needed for preview
        anthropic_client=client,
        dry_run=True,
    )

    try:
        generator.load_ddl(args.ddl)
    except Exception as exc:
        print(f"Error loading DDL: {exc}")
        return 1

    try:
        sample = generator.preview(
            table_name=args.table,
            num_rows=args.rows,
        )
        print("\n" + "=" * 60)
        print("PREVIEW DATA")
        print("=" * 60)
        for table_name, table_rows in sample.items():
            print(f"\n{table_name}:")
            print("-" * 40)
            for index, record in enumerate(table_rows):
                print(f"Row {index + 1}:")
                # Skip null columns so the preview stays compact.
                for column, cell in record.items():
                    if cell is not None:
                        print(f" {column}: {cell}")
                print()
        return 0
    except Exception as exc:
        print(f"Error previewing data: {exc}")
        import traceback
        traceback.print_exc()
        return 1
def cmd_cache(args) -> int:
    """Run the `cache` subcommand; return a process exit code (0 on success)."""
    from legitdata.cache import FileCache

    store = FileCache()

    if args.clear:
        removed = store.clear(args.type)
        print(f"Cleared {removed} cache entries")
        return 0

    if args.stats:
        summary = store.get_stats()
        print("Cache Statistics:")
        print(f" Total entries: {summary['total']}")
        for cache_kind, entry_count in summary['by_type'].items():
            print(f" {cache_kind}: {entry_count}")
        return 0

    # Neither flag given: nothing to do, treat as a usage error.
    print("Use --clear or --stats")
    return 1
def main() -> int:
    """CLI entry point: parse argv and dispatch to the chosen subcommand."""
    parser = create_parser()
    args = parser.parse_args()

    # Dispatch table keyed by subcommand name.
    handlers = {
        'generate': cmd_generate,
        'preview': cmd_preview,
        'cache': cmd_cache,
    }
    handler = handlers.get(args.command)
    if handler is None:
        # No subcommand given: show usage and exit successfully.
        parser.print_help()
        return 0
    return handler(args)
# Run the CLI when executed as a script; propagate the exit code to the shell.
if __name__ == '__main__':
    sys.exit(main())