""" CLI for keyword expansion toolkit. Usage: # Fetch reviews from Steam (can be resumed) python -m scripts.expand_keywords fetch --resume # Train FastText model python -m scripts.expand_keywords train # Expand dictionary and export candidates python -m scripts.expand_keywords expand --threshold 0.55 # Generate new keywords.py python -m scripts.expand_keywords generate --auto-approve 0.7 # Run all steps python -m scripts.expand_keywords run --resume # Show statistics python -m scripts.expand_keywords stats """ import argparse import asyncio import logging import sys from pathlib import Path # Add project root to path for imports PROJECT_ROOT = Path(__file__).parent.parent.parent sys.path.insert(0, str(PROJECT_ROOT)) from scripts.expand_keywords.config import GAMES, MODELS_DIR, OUTPUT_DIR, SETTINGS from scripts.expand_keywords.expander import KeywordExpander from scripts.expand_keywords.fetcher import ReviewFetcher from scripts.expand_keywords.preprocessor import Preprocessor, extract_ngrams_from_keywords from scripts.expand_keywords.trainer import FastTextTrainer # Configure logging logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) logger = logging.getLogger(__name__) def load_existing_keywords() -> dict[str, list[str]]: """Load existing TOPIC_KEYWORDS from keywords.py.""" keywords_path = PROJECT_ROOT / "backend" / "app" / "core" / "keywords.py" if not keywords_path.exists(): raise FileNotFoundError(f"Keywords file not found: {keywords_path}") # Execute keywords.py to get TOPIC_KEYWORDS namespace: dict = {} exec(keywords_path.read_text(encoding="utf-8"), namespace) keywords = namespace.get("TOPIC_KEYWORDS") if not keywords: raise ValueError("TOPIC_KEYWORDS not found in keywords.py") return keywords async def cmd_fetch(args: argparse.Namespace) -> None: """Fetch reviews from Steam.""" logger.info("Starting review fetch...") fetcher = ReviewFetcher() # Show current progress stats = fetcher.get_stats() logger.info(f"Current stats: {stats['reviews_total']} reviews from {stats['games_completed']} games") await fetcher.fetch_all( resume=args.resume, limit_games=args.limit, ) # Show final stats stats = fetcher.get_stats() logger.info(f"Final stats: {stats['reviews_total']} reviews from {stats['games_completed']} games") def cmd_train(args: argparse.Namespace) -> None: """Train FastText model.""" logger.info("Starting model training...") # Load existing keywords for frozen n-grams keywords = load_existing_keywords() existing_ngrams = extract_ngrams_from_keywords(keywords) logger.info(f"Loaded {len(existing_ngrams)} n-grams from existing dictionary") # Load reviews fetcher = ReviewFetcher() reviews = fetcher.load_all_reviews() if not reviews: logger.error("No reviews found. Run 'fetch' first.") return logger.info(f"Loaded {len(reviews)} reviews") # Preprocess preprocessor = Preprocessor(existing_ngrams=existing_ngrams) sentences = preprocessor.preprocess_corpus(reviews) preprocessor.save() # Train trainer = FastTextTrainer() trainer.train(sentences) trainer.save() logger.info("Training complete!") def cmd_expand(args: argparse.Namespace) -> None: """Expand dictionary and export candidates.""" logger.info("Starting dictionary expansion...") # Load components keywords = load_existing_keywords() preprocessor = Preprocessor() try: preprocessor.load() except FileNotFoundError: logger.error("Preprocessor not found. Run 'train' first.") return trainer = FastTextTrainer() try: model = trainer.load() except FileNotFoundError: logger.error("Model not found. Run 'train' first.") return # Expand expander = KeywordExpander( model=model, existing_keywords=keywords, word_frequencies=preprocessor.get_word_frequencies(), similarity_threshold=args.threshold, ) # Export candidates (with threshold in filename if requested) expander.export_candidates(include_threshold_in_name=args.compare) # Show stats stats = expander.get_expansion_stats() logger.info(f"Expansion complete: {stats['total_candidates']} candidates") logger.info(f" Auto-approved: {stats['auto_approved']}") logger.info(f" Needs review: {stats['needs_review']}") def cmd_compare(args: argparse.Namespace) -> None: """Compare multiple thresholds.""" logger.info("Comparing thresholds...") # Load components keywords = load_existing_keywords() preprocessor = Preprocessor() try: preprocessor.load() except FileNotFoundError: logger.error("Preprocessor not found. Run 'train' first.") return trainer = FastTextTrainer() try: model = trainer.load() except FileNotFoundError: logger.error("Model not found. Run 'train' first.") return thresholds = args.thresholds results = [] for threshold in thresholds: expander = KeywordExpander( model=model, existing_keywords=keywords, word_frequencies=preprocessor.get_word_frequencies(), similarity_threshold=threshold, ) # Export with threshold in name expander.export_candidates(include_threshold_in_name=True) stats = expander.get_expansion_stats() results.append((threshold, stats)) # Print comparison table print("\n" + "=" * 60) print("THRESHOLD COMPARISON") print("=" * 60) print(f"{'Threshold':<12} {'Total':<10} {'Auto-OK':<10} {'Review':<10}") print("-" * 60) for threshold, stats in results: print(f"{threshold:<12.2f} {stats['total_candidates']:<10} {stats['auto_approved']:<10} {stats['needs_review']:<10}") print("-" * 60) print(f"\nOutput files saved to: {OUTPUT_DIR}/") print("Compare candidates_t*.json to see differences.") def cmd_generate(args: argparse.Namespace) -> None: """Generate new keywords.py.""" logger.info("Generating expanded keywords.py...") # Load components keywords = load_existing_keywords() preprocessor = Preprocessor() try: preprocessor.load() except FileNotFoundError: logger.error("Preprocessor not found. Run 'train' first.") return trainer = FastTextTrainer() try: model = trainer.load() except FileNotFoundError: logger.error("Model not found. Run 'train' first.") return # Generate expander = KeywordExpander( model=model, existing_keywords=keywords, word_frequencies=preprocessor.get_word_frequencies(), ) output_path = expander.generate_keywords_py( auto_approve_threshold=args.auto_approve, ) logger.info(f"Generated: {output_path}") async def cmd_run(args: argparse.Namespace) -> None: """Run all steps: fetch, train, expand, generate.""" logger.info("Running complete pipeline...") # Step 1: Fetch await cmd_fetch(args) # Step 2: Train cmd_train(args) # Step 3: Expand cmd_expand(args) # Step 4: Generate cmd_generate(args) logger.info("Pipeline complete!") def cmd_stats(args: argparse.Namespace) -> None: """Show statistics.""" # Fetcher stats fetcher = ReviewFetcher() fetch_stats = fetcher.get_stats() print("\n=== Fetch Statistics ===") print(f"Games configured: {fetch_stats['games_total']}") print(f"Games completed: {fetch_stats['games_completed']}") print(f"Games in progress: {fetch_stats['games_in_progress']}") print(f"Total reviews: {fetch_stats['reviews_total']}") if fetch_stats["reviews_per_game"]: print("\nReviews per game:") for name, count in sorted(fetch_stats["reviews_per_game"].items()): print(f" {name}: {count}") # Model stats model_path = MODELS_DIR / "fasttext.model" if model_path.exists(): print("\n=== Model Statistics ===") trainer = FastTextTrainer() model = trainer.load() print(f"Vocabulary size: {len(model.wv)}") # Expansion stats (if available) candidates_path = OUTPUT_DIR / "candidates.json" if candidates_path.exists(): import json with open(candidates_path, "r", encoding="utf-8") as f: data = json.load(f) print("\n=== Expansion Statistics ===") print(f"Total candidates: {data['metadata']['total_candidates']}") for cat, cands in data["categories"].items(): print(f" {cat}: {len(cands)}") def cmd_similar(args: argparse.Namespace) -> None: """Find similar words for testing.""" trainer = FastTextTrainer() try: model = trainer.load() except FileNotFoundError: logger.error("Model not found. Run 'train' first.") return word = args.word topn = args.topn similar = trainer.get_similar(word, topn=topn) if similar: print(f"\nWords similar to '{word}':") for w, sim in similar: print(f" {w}: {sim:.3f}") else: print(f"Word '{word}' not found in vocabulary") def main(): parser = argparse.ArgumentParser( description="Keyword expansion toolkit using FastText", formatter_class=argparse.RawDescriptionHelpFormatter, ) subparsers = parser.add_subparsers(dest="command", help="Available commands") # fetch command fetch_parser = subparsers.add_parser("fetch", help="Fetch reviews from Steam") fetch_parser.add_argument( "--resume", "-r", action="store_true", help="Resume from previous progress", ) fetch_parser.add_argument( "--limit", "-l", type=int, default=None, help="Limit number of games (for testing)", ) # train command train_parser = subparsers.add_parser("train", help="Train FastText model") # expand command expand_parser = subparsers.add_parser("expand", help="Expand dictionary") expand_parser.add_argument( "--threshold", "-t", type=float, default=SETTINGS["similarity_threshold"], help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})", ) expand_parser.add_argument( "--compare", "-c", action="store_true", help="Include threshold in output filename (for comparison)", ) # compare command compare_parser = subparsers.add_parser("compare", help="Compare multiple thresholds") compare_parser.add_argument( "--thresholds", "-t", type=float, nargs="+", default=[0.45, 0.50, 0.55, 0.60, 0.65, 0.70], help="Thresholds to compare (default: 0.45 0.50 0.55 0.60 0.65 0.70)", ) # generate command generate_parser = subparsers.add_parser("generate", help="Generate keywords.py") generate_parser.add_argument( "--auto-approve", "-a", type=float, default=SETTINGS["auto_approve_threshold"], help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})", ) # run command (all steps) run_parser = subparsers.add_parser("run", help="Run all steps") run_parser.add_argument( "--resume", "-r", action="store_true", help="Resume fetch from previous progress", ) run_parser.add_argument( "--limit", "-l", type=int, default=None, help="Limit number of games (for testing)", ) run_parser.add_argument( "--threshold", "-t", type=float, default=SETTINGS["similarity_threshold"], help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})", ) run_parser.add_argument( "--auto-approve", "-a", type=float, default=SETTINGS["auto_approve_threshold"], help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})", ) # stats command stats_parser = subparsers.add_parser("stats", help="Show statistics") # similar command (for testing) similar_parser = subparsers.add_parser("similar", help="Find similar words") similar_parser.add_argument("word", help="Word to find similar words for") similar_parser.add_argument( "--topn", "-n", type=int, default=20, help="Number of results (default: 20)", ) args = parser.parse_args() if not args.command: parser.print_help() return # Execute command if args.command == "fetch": asyncio.run(cmd_fetch(args)) elif args.command == "train": cmd_train(args) elif args.command == "expand": cmd_expand(args) elif args.command == "compare": cmd_compare(args) elif args.command == "generate": cmd_generate(args) elif args.command == "run": asyncio.run(cmd_run(args)) elif args.command == "stats": cmd_stats(args) elif args.command == "similar": cmd_similar(args) if __name__ == "__main__": main()