Spaces:
Running
Running
| """ | |
CLI for the keyword expansion toolkit.

Usage:
    # Fetch reviews from Steam (can be resumed)
    python -m scripts.expand_keywords fetch --resume

    # Train FastText model
    python -m scripts.expand_keywords train

    # Expand dictionary and export candidates
    python -m scripts.expand_keywords expand --threshold 0.55

    # Compare several similarity thresholds side by side
    python -m scripts.expand_keywords compare --thresholds 0.50 0.60

    # Generate new keywords.py
    python -m scripts.expand_keywords generate --auto-approve 0.7

    # Run all steps
    python -m scripts.expand_keywords run --resume

    # Show statistics
    python -m scripts.expand_keywords stats

    # Look up words similar to a given word (for testing)
    python -m scripts.expand_keywords similar <word> --topn 10
| """ | |
| import argparse | |
| import asyncio | |
| import logging | |
| import sys | |
| from pathlib import Path | |
| # Add project root to path for imports | |
| PROJECT_ROOT = Path(__file__).parent.parent.parent | |
| sys.path.insert(0, str(PROJECT_ROOT)) | |
| from scripts.expand_keywords.config import GAMES, MODELS_DIR, OUTPUT_DIR, SETTINGS | |
| from scripts.expand_keywords.expander import KeywordExpander | |
| from scripts.expand_keywords.fetcher import ReviewFetcher | |
| from scripts.expand_keywords.preprocessor import Preprocessor, extract_ngrams_from_keywords | |
| from scripts.expand_keywords.trainer import FastTextTrainer | |
# Configure the root logger once at import time, then create this module's logger.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def load_existing_keywords(keywords_path: "Path | None" = None) -> dict[str, list[str]]:
    """Load the existing TOPIC_KEYWORDS mapping from a keywords.py file.

    Args:
        keywords_path: File to read. When omitted, defaults to
            ``PROJECT_ROOT / "backend" / "app" / "core" / "keywords.py"``.

    Returns:
        The object bound to ``TOPIC_KEYWORDS`` after executing the file.

    Raises:
        FileNotFoundError: If the keywords file does not exist.
        ValueError: If the file defines no (or an empty) ``TOPIC_KEYWORDS``.
    """
    if keywords_path is None:
        keywords_path = PROJECT_ROOT / "backend" / "app" / "core" / "keywords.py"
    keywords_path = Path(keywords_path)
    if not keywords_path.exists():
        raise FileNotFoundError(f"Keywords file not found: {keywords_path}")

    # SECURITY: the file is executed as Python code. Only point this at a
    # trusted, project-owned keywords.py - never at user-supplied input.
    namespace: dict = {}
    source = keywords_path.read_text(encoding="utf-8")
    # Compiling with the real filename makes tracebacks raised inside
    # keywords.py point at that file instead of "<string>".
    exec(compile(source, str(keywords_path), "exec"), namespace)

    keywords = namespace.get("TOPIC_KEYWORDS")
    if not keywords:
        raise ValueError("TOPIC_KEYWORDS not found in keywords.py")
    return keywords
async def cmd_fetch(args: argparse.Namespace) -> None:
    """Download reviews, logging the fetcher's stats before and after.

    Reads ``args.resume`` and ``args.limit`` and forwards them to
    ``ReviewFetcher.fetch_all``.
    """
    logger.info("Starting review fetch...")
    review_fetcher = ReviewFetcher()

    # Progress recorded before this invocation
    stats = review_fetcher.get_stats()
    logger.info(f"Current stats: {stats['reviews_total']} reviews from {stats['games_completed']} games")

    await review_fetcher.fetch_all(resume=args.resume, limit_games=args.limit)

    # Progress after the fetch has finished
    stats = review_fetcher.get_stats()
    logger.info(f"Final stats: {stats['reviews_total']} reviews from {stats['games_completed']} games")
def cmd_train(args: argparse.Namespace) -> None:
    """Preprocess the fetched reviews and train the FastText model.

    Logs an error and returns early when no reviews have been fetched yet.
    ``args`` is accepted for a uniform command signature and is not read.
    """
    logger.info("Starting model training...")

    # N-grams taken from the current dictionary are handed to the preprocessor
    existing_ngrams = extract_ngrams_from_keywords(load_existing_keywords())
    logger.info(f"Loaded {len(existing_ngrams)} n-grams from existing dictionary")

    reviews = ReviewFetcher().load_all_reviews()
    if not reviews:
        logger.error("No reviews found. Run 'fetch' first.")
        return
    logger.info(f"Loaded {len(reviews)} reviews")

    # Preprocess the corpus and persist the preprocessor state
    corpus_preprocessor = Preprocessor(existing_ngrams=existing_ngrams)
    tokenized = corpus_preprocessor.preprocess_corpus(reviews)
    corpus_preprocessor.save()

    # Train on the preprocessed sentences and persist the model
    model_trainer = FastTextTrainer()
    model_trainer.train(tokenized)
    model_trainer.save()

    logger.info("Training complete!")
def cmd_expand(args: argparse.Namespace) -> None:
    """Expand the dictionary and export candidate keywords.

    Reads ``args.threshold`` and, when the namespace provides it,
    ``args.compare``. Logs an error and returns early if the saved
    preprocessor or model cannot be found.
    """
    logger.info("Starting dictionary expansion...")

    # Load components
    keywords = load_existing_keywords()

    preprocessor = Preprocessor()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return

    trainer = FastTextTrainer()
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    # Expand
    expander = KeywordExpander(
        model=model,
        existing_keywords=keywords,
        word_frequencies=preprocessor.get_word_frequencies(),
        similarity_threshold=args.threshold,
    )

    # The "run" sub-command reuses this step, but its parser defines no
    # --compare flag; default to False instead of raising AttributeError.
    include_threshold = getattr(args, "compare", False)

    # Export candidates (with threshold in filename if requested)
    expander.export_candidates(include_threshold_in_name=include_threshold)

    # Show stats
    stats = expander.get_expansion_stats()
    logger.info(f"Expansion complete: {stats['total_candidates']} candidates")
    logger.info(f" Auto-approved: {stats['auto_approved']}")
    logger.info(f" Needs review: {stats['needs_review']}")
def cmd_compare(args: argparse.Namespace) -> None:
    """Run the expansion once per threshold and print a comparison table.

    Reads ``args.thresholds``. Every run exports its candidates with the
    threshold included in the output filename. Logs an error and returns
    early if the saved preprocessor or model cannot be found.
    """
    logger.info("Comparing thresholds...")

    # Load the dictionary, the saved preprocessor and the trained model
    keywords = load_existing_keywords()

    preprocessor = Preprocessor()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return

    trainer = FastTextTrainer()
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    # One expansion per requested threshold
    results = []
    for threshold in args.thresholds:
        expander = KeywordExpander(
            model=model,
            existing_keywords=keywords,
            word_frequencies=preprocessor.get_word_frequencies(),
            similarity_threshold=threshold,
        )
        # Export with threshold in name
        expander.export_candidates(include_threshold_in_name=True)
        results.append((threshold, expander.get_expansion_stats()))

    # Print comparison table
    heavy_rule = "=" * 60
    light_rule = "-" * 60
    print("\n" + heavy_rule)
    print("THRESHOLD COMPARISON")
    print(heavy_rule)
    print(f"{'Threshold':<12} {'Total':<10} {'Auto-OK':<10} {'Review':<10}")
    print(light_rule)
    for threshold, stats in results:
        print(f"{threshold:<12.2f} {stats['total_candidates']:<10} {stats['auto_approved']:<10} {stats['needs_review']:<10}")
    print(light_rule)
    print(f"\nOutput files saved to: {OUTPUT_DIR}/")
    print("Compare candidates_t*.json to see differences.")
def cmd_generate(args: argparse.Namespace) -> None:
    """Write an expanded keywords.py and log where it was saved.

    Reads ``args.auto_approve``. Logs an error and returns early if the
    saved preprocessor or model cannot be found.
    """
    logger.info("Generating expanded keywords.py...")

    current_keywords = load_existing_keywords()

    # Both saved artifacts come from the 'train' step
    saved_preprocessor = Preprocessor()
    try:
        saved_preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return

    model_trainer = FastTextTrainer()
    try:
        trained_model = model_trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    # No similarity threshold is passed here, so the expander uses its own default
    expander = KeywordExpander(
        model=trained_model,
        existing_keywords=current_keywords,
        word_frequencies=saved_preprocessor.get_word_frequencies(),
    )
    output_path = expander.generate_keywords_py(auto_approve_threshold=args.auto_approve)

    logger.info(f"Generated: {output_path}")
async def cmd_run(args: argparse.Namespace) -> None:
    """Run the whole pipeline in order: fetch, train, expand, generate.

    The same ``args`` namespace is handed to every step. The steps return
    nothing, so a step that logs an error and returns early does not stop
    the steps after it.
    """
    logger.info("Running complete pipeline...")

    # Step 1 is the only coroutine
    await cmd_fetch(args)

    # Steps 2-4 are plain functions, executed in sequence
    for step in (cmd_train, cmd_expand, cmd_generate):
        step(args)

    logger.info("Pipeline complete!")
def cmd_stats(args: argparse.Namespace) -> None:
    """Print fetch statistics plus model and expansion statistics if available.

    Model statistics are printed only when ``MODELS_DIR / "fasttext.model"``
    exists; expansion statistics only when ``OUTPUT_DIR / "candidates.json"``
    exists. ``args`` is accepted for a uniform command signature and is not read.
    """
    # Fetcher stats
    fetch_stats = ReviewFetcher().get_stats()
    print("\n=== Fetch Statistics ===")
    print(f"Games configured: {fetch_stats['games_total']}")
    print(f"Games completed: {fetch_stats['games_completed']}")
    print(f"Games in progress: {fetch_stats['games_in_progress']}")
    print(f"Total reviews: {fetch_stats['reviews_total']}")

    per_game = fetch_stats["reviews_per_game"]
    if per_game:
        print("\nReviews per game:")
        for name, count in sorted(per_game.items()):
            print(f" {name}: {count}")

    # Model stats
    if (MODELS_DIR / "fasttext.model").exists():
        print("\n=== Model Statistics ===")
        model = FastTextTrainer().load()
        print(f"Vocabulary size: {len(model.wv)}")

    # Expansion stats (if available)
    candidates_path = OUTPUT_DIR / "candidates.json"
    if not candidates_path.exists():
        return

    import json

    with open(candidates_path, "r", encoding="utf-8") as handle:
        data = json.load(handle)

    print("\n=== Expansion Statistics ===")
    print(f"Total candidates: {data['metadata']['total_candidates']}")
    for cat, cands in data["categories"].items():
        print(f" {cat}: {len(cands)}")
def cmd_similar(args: argparse.Namespace) -> None:
    """Print the words most similar to ``args.word`` (for testing).

    Reads ``args.word`` and ``args.topn``. Logs an error and returns early
    if the trained model cannot be found.
    """
    trainer = FastTextTrainer()
    try:
        # The returned model is not needed here; the lookup goes through the trainer
        trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    word = args.word
    similar = trainer.get_similar(word, topn=args.topn)

    if not similar:
        print(f"Word '{word}' not found in vocabulary")
        return

    print(f"\nWords similar to '{word}':")
    for w, sim in similar:
        print(f" {w}: {sim:.3f}")
def _add_fetch_options(sub: argparse.ArgumentParser, resume_help: str) -> None:
    """Register the --resume and --limit options used by the fetch step."""
    sub.add_argument("--resume", "-r", action="store_true", help=resume_help)
    sub.add_argument(
        "--limit", "-l",
        type=int,
        default=None,
        help="Limit number of games (for testing)",
    )


def _add_threshold_option(sub: argparse.ArgumentParser) -> None:
    """Register the --threshold option used by the expand step."""
    sub.add_argument(
        "--threshold", "-t",
        type=float,
        default=SETTINGS["similarity_threshold"],
        help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})",
    )


def _add_auto_approve_option(sub: argparse.ArgumentParser) -> None:
    """Register the --auto-approve option used by the generate step."""
    sub.add_argument(
        "--auto-approve", "-a",
        type=float,
        default=SETTINGS["auto_approve_threshold"],
        help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})",
    )


def _build_parser() -> argparse.ArgumentParser:
    """Create the top-level parser with one sub-parser per command."""
    parser = argparse.ArgumentParser(
        description="Keyword expansion toolkit using FastText",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    commands = parser.add_subparsers(dest="command", help="Available commands")

    # fetch command
    _add_fetch_options(
        commands.add_parser("fetch", help="Fetch reviews from Steam"),
        "Resume from previous progress",
    )

    # train command (no options)
    commands.add_parser("train", help="Train FastText model")

    # expand command
    expand = commands.add_parser("expand", help="Expand dictionary")
    _add_threshold_option(expand)
    expand.add_argument(
        "--compare", "-c",
        action="store_true",
        help="Include threshold in output filename (for comparison)",
    )

    # compare command
    compare = commands.add_parser("compare", help="Compare multiple thresholds")
    compare.add_argument(
        "--thresholds", "-t",
        type=float,
        nargs="+",
        default=[0.45, 0.50, 0.55, 0.60, 0.65, 0.70],
        help="Thresholds to compare (default: 0.45 0.50 0.55 0.60 0.65 0.70)",
    )

    # generate command
    _add_auto_approve_option(commands.add_parser("generate", help="Generate keywords.py"))

    # run command (all steps)
    run = commands.add_parser("run", help="Run all steps")
    _add_fetch_options(run, "Resume fetch from previous progress")
    _add_threshold_option(run)
    _add_auto_approve_option(run)

    # stats command (no options)
    commands.add_parser("stats", help="Show statistics")

    # similar command (for testing)
    similar = commands.add_parser("similar", help="Find similar words")
    similar.add_argument("word", help="Word to find similar words for")
    similar.add_argument(
        "--topn", "-n",
        type=int,
        default=20,
        help="Number of results (default: 20)",
    )

    return parser


def main():
    """Parse the command line and dispatch to the selected sub-command.

    Prints the help text and returns when no sub-command is given.
    """
    parser = _build_parser()
    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    # Coroutine commands need an event loop; the others are called directly
    coroutine_commands = {"fetch": cmd_fetch, "run": cmd_run}
    plain_commands = {
        "train": cmd_train,
        "expand": cmd_expand,
        "compare": cmd_compare,
        "generate": cmd_generate,
        "stats": cmd_stats,
        "similar": cmd_similar,
    }

    if args.command in coroutine_commands:
        asyncio.run(coroutine_commands[args.command](args))
    elif args.command in plain_commands:
        plain_commands[args.command](args)


if __name__ == "__main__":
    main()