# GitHub Action — deploy: worker release from GitHub (commit 8ff1b66)
"""
CLI for keyword expansion toolkit.
Usage:
# Fetch reviews from Steam (can be resumed)
python -m scripts.expand_keywords fetch --resume
# Train FastText model
python -m scripts.expand_keywords train
# Expand dictionary and export candidates
python -m scripts.expand_keywords expand --threshold 0.55
# Generate new keywords.py
python -m scripts.expand_keywords generate --auto-approve 0.7
# Run all steps
python -m scripts.expand_keywords run --resume
# Show statistics
python -m scripts.expand_keywords stats
"""
import argparse
import asyncio
import logging
import sys
from pathlib import Path
# Add project root to path for imports
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
from scripts.expand_keywords.config import GAMES, MODELS_DIR, OUTPUT_DIR, SETTINGS
from scripts.expand_keywords.expander import KeywordExpander
from scripts.expand_keywords.fetcher import ReviewFetcher
from scripts.expand_keywords.preprocessor import Preprocessor, extract_ngrams_from_keywords
from scripts.expand_keywords.trainer import FastTextTrainer
# Configure logging
# Root-logger setup shared by every module in the toolkit: timestamped,
# INFO-level lines in the form "2024-01-01 12:00:00 [INFO] name: message".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Module-level logger for this CLI entry point.
logger = logging.getLogger(__name__)
def load_existing_keywords() -> dict[str, list[str]]:
    """Read the TOPIC_KEYWORDS mapping out of the project's keywords.py.

    The file is executed in a throwaway namespace rather than imported,
    so the backend package does not need to be importable here.

    Returns:
        Mapping of topic name to its keyword list.

    Raises:
        FileNotFoundError: keywords.py does not exist at the expected path.
        ValueError: the file defines no (non-empty) TOPIC_KEYWORDS.
    """
    keywords_path = PROJECT_ROOT / "backend" / "app" / "core" / "keywords.py"
    if not keywords_path.exists():
        raise FileNotFoundError(f"Keywords file not found: {keywords_path}")

    # Trusted project-local file; exec'ing it is the cheapest way to get the dict.
    scope: dict = {}
    exec(keywords_path.read_text(encoding="utf-8"), scope)

    topic_keywords = scope.get("TOPIC_KEYWORDS")
    if not topic_keywords:
        raise ValueError("TOPIC_KEYWORDS not found in keywords.py")
    return topic_keywords
async def cmd_fetch(args: argparse.Namespace) -> None:
    """Fetch Steam reviews, logging totals before and after the run.

    Honors ``args.resume`` (continue from saved progress) and
    ``args.limit`` (cap the number of games, for testing).
    """
    logger.info("Starting review fetch...")
    fetcher = ReviewFetcher()

    # Report where we currently stand before fetching anything.
    before = fetcher.get_stats()
    logger.info(f"Current stats: {before['reviews_total']} reviews from {before['games_completed']} games")

    await fetcher.fetch_all(resume=args.resume, limit_games=args.limit)

    # And where we ended up.
    after = fetcher.get_stats()
    logger.info(f"Final stats: {after['reviews_total']} reviews from {after['games_completed']} games")
def cmd_train(args: argparse.Namespace) -> None:
    """Train the FastText model on all fetched reviews.

    Pipeline: freeze n-grams from the existing dictionary, load the
    review corpus, preprocess it, then train and persist the model.
    Aborts with an error log when no reviews have been fetched yet.
    """
    logger.info("Starting model training...")

    # N-grams from the current dictionary must survive tokenization intact.
    frozen_ngrams = extract_ngrams_from_keywords(load_existing_keywords())
    logger.info(f"Loaded {len(frozen_ngrams)} n-grams from existing dictionary")

    corpus = ReviewFetcher().load_all_reviews()
    if not corpus:
        logger.error("No reviews found. Run 'fetch' first.")
        return
    logger.info(f"Loaded {len(corpus)} reviews")

    # Tokenize the corpus and persist preprocessor state for later stages.
    preprocessor = Preprocessor(existing_ngrams=frozen_ngrams)
    sentences = preprocessor.preprocess_corpus(corpus)
    preprocessor.save()

    trainer = FastTextTrainer()
    trainer.train(sentences)
    trainer.save()
    logger.info("Training complete!")
def cmd_expand(args: argparse.Namespace) -> None:
    """Expand dictionary and export candidates.

    Loads the saved preprocessor and FastText model, builds a
    KeywordExpander at ``args.threshold``, exports the candidate file,
    and logs summary statistics. Aborts with an error log when either
    artifact is missing (i.e. 'train' has not been run).
    """
    logger.info("Starting dictionary expansion...")
    # Load components
    keywords = load_existing_keywords()
    preprocessor = Preprocessor()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return
    trainer = FastTextTrainer()
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return
    # Expand
    expander = KeywordExpander(
        model=model,
        existing_keywords=keywords,
        word_frequencies=preprocessor.get_word_frequencies(),
        similarity_threshold=args.threshold,
    )
    # BUG FIX: this handler is also invoked by the 'run' subcommand, whose
    # parser defines no --compare flag; args.compare would raise
    # AttributeError there. Read it defensively with a False default.
    expander.export_candidates(include_threshold_in_name=getattr(args, "compare", False))
    # Show stats
    stats = expander.get_expansion_stats()
    logger.info(f"Expansion complete: {stats['total_candidates']} candidates")
    logger.info(f"  Auto-approved: {stats['auto_approved']}")
    logger.info(f"  Needs review: {stats['needs_review']}")
def cmd_compare(args: argparse.Namespace) -> None:
    """Run the expansion at each of ``args.thresholds`` and print a table.

    Exports one candidate file per threshold (threshold embedded in the
    filename) and prints a side-by-side comparison of candidate counts.
    """
    logger.info("Comparing thresholds...")
    # Load shared components once; bail out if training artifacts are missing.
    keywords = load_existing_keywords()
    preprocessor = Preprocessor()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return
    trainer = FastTextTrainer()
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    rows: list[tuple[float, dict]] = []
    for threshold in args.thresholds:
        expander = KeywordExpander(
            model=model,
            existing_keywords=keywords,
            word_frequencies=preprocessor.get_word_frequencies(),
            similarity_threshold=threshold,
        )
        # Threshold goes into the filename so the exports don't clobber each other.
        expander.export_candidates(include_threshold_in_name=True)
        rows.append((threshold, expander.get_expansion_stats()))

    # Render the comparison table.
    rule = "=" * 60
    print("\n" + rule)
    print("THRESHOLD COMPARISON")
    print(rule)
    print(f"{'Threshold':<12} {'Total':<10} {'Auto-OK':<10} {'Review':<10}")
    print("-" * 60)
    for threshold, stats in rows:
        print(f"{threshold:<12.2f} {stats['total_candidates']:<10} {stats['auto_approved']:<10} {stats['needs_review']:<10}")
    print("-" * 60)
    print(f"\nOutput files saved to: {OUTPUT_DIR}/")
    print("Compare candidates_t*.json to see differences.")
def cmd_generate(args: argparse.Namespace) -> None:
    """Write an expanded keywords.py using ``args.auto_approve`` as the cutoff.

    Requires the preprocessor and model saved by 'train'; logs an error
    and returns early when either is missing.
    """
    logger.info("Generating expanded keywords.py...")
    # Load shared components; abort cleanly if training artifacts are absent.
    keywords = load_existing_keywords()
    preprocessor = Preprocessor()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return
    trainer = FastTextTrainer()
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    # Expander uses its default similarity threshold here; only the
    # auto-approve cutoff is configurable for generation.
    expander = KeywordExpander(
        model=model,
        existing_keywords=keywords,
        word_frequencies=preprocessor.get_word_frequencies(),
    )
    output_path = expander.generate_keywords_py(auto_approve_threshold=args.auto_approve)
    logger.info(f"Generated: {output_path}")
async def cmd_run(args: argparse.Namespace) -> None:
    """Execute the whole pipeline in order: fetch, train, expand, generate.

    The single parsed namespace is forwarded to every step.
    """
    logger.info("Running complete pipeline...")
    await cmd_fetch(args)    # 1: download reviews
    cmd_train(args)          # 2: train FastText model
    cmd_expand(args)         # 3: export expansion candidates
    cmd_generate(args)       # 4: write new keywords.py
    logger.info("Pipeline complete!")
def cmd_stats(args: argparse.Namespace) -> None:
    """Print fetch, model, and expansion statistics to stdout.

    Model and expansion sections are only shown when their artifacts
    (fasttext.model, candidates.json) exist on disk.
    """
    # --- Fetch statistics (always available) ---
    fetch_stats = ReviewFetcher().get_stats()
    print("\n=== Fetch Statistics ===")
    print(f"Games configured: {fetch_stats['games_total']}")
    print(f"Games completed: {fetch_stats['games_completed']}")
    print(f"Games in progress: {fetch_stats['games_in_progress']}")
    print(f"Total reviews: {fetch_stats['reviews_total']}")
    per_game = fetch_stats["reviews_per_game"]
    if per_game:
        print("\nReviews per game:")
        for name in sorted(per_game):
            print(f"  {name}: {per_game[name]}")

    # --- Model statistics (only if a trained model exists) ---
    if (MODELS_DIR / "fasttext.model").exists():
        print("\n=== Model Statistics ===")
        model = FastTextTrainer().load()
        print(f"Vocabulary size: {len(model.wv)}")

    # --- Expansion statistics (only if candidates were exported) ---
    candidates_path = OUTPUT_DIR / "candidates.json"
    if candidates_path.exists():
        import json
        data = json.loads(candidates_path.read_text(encoding="utf-8"))
        print("\n=== Expansion Statistics ===")
        print(f"Total candidates: {data['metadata']['total_candidates']}")
        for category, candidates in data["categories"].items():
            print(f"  {category}: {len(candidates)}")
def cmd_similar(args: argparse.Namespace) -> None:
    """Print up to ``args.topn`` words similar to ``args.word``.

    Convenience command for sanity-checking a trained model. Logs an
    error and returns when no model has been trained yet; prints a
    not-found message when the word is absent from the vocabulary.
    """
    trainer = FastTextTrainer()
    try:
        # Loaded only for its side effect of populating the trainer;
        # the previous unused `model =` binding has been dropped.
        trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    similar = trainer.get_similar(args.word, topn=args.topn)
    if similar:
        print(f"\nWords similar to '{args.word}':")
        for candidate, score in similar:
            print(f"  {candidate}: {score:.3f}")
    else:
        print(f"Word '{args.word}' not found in vocabulary")
def main() -> None:
    """Parse CLI arguments and dispatch to the matching command handler.

    Improvements over the original: unused subparser bindings removed
    (`train`/`stats` take no options), and the long if/elif chain is
    replaced with a dispatch table; behavior is unchanged.
    """
    parser = argparse.ArgumentParser(
        description="Keyword expansion toolkit using FastText",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # fetch command
    fetch_parser = subparsers.add_parser("fetch", help="Fetch reviews from Steam")
    fetch_parser.add_argument(
        "--resume", "-r",
        action="store_true",
        help="Resume from previous progress",
    )
    fetch_parser.add_argument(
        "--limit", "-l",
        type=int,
        default=None,
        help="Limit number of games (for testing)",
    )

    # train command (no options)
    subparsers.add_parser("train", help="Train FastText model")

    # expand command
    expand_parser = subparsers.add_parser("expand", help="Expand dictionary")
    expand_parser.add_argument(
        "--threshold", "-t",
        type=float,
        default=SETTINGS["similarity_threshold"],
        help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})",
    )
    expand_parser.add_argument(
        "--compare", "-c",
        action="store_true",
        help="Include threshold in output filename (for comparison)",
    )

    # compare command
    compare_parser = subparsers.add_parser("compare", help="Compare multiple thresholds")
    compare_parser.add_argument(
        "--thresholds", "-t",
        type=float,
        nargs="+",
        default=[0.45, 0.50, 0.55, 0.60, 0.65, 0.70],
        help="Thresholds to compare (default: 0.45 0.50 0.55 0.60 0.65 0.70)",
    )

    # generate command
    generate_parser = subparsers.add_parser("generate", help="Generate keywords.py")
    generate_parser.add_argument(
        "--auto-approve", "-a",
        type=float,
        default=SETTINGS["auto_approve_threshold"],
        help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})",
    )

    # run command (all steps)
    run_parser = subparsers.add_parser("run", help="Run all steps")
    run_parser.add_argument(
        "--resume", "-r",
        action="store_true",
        help="Resume fetch from previous progress",
    )
    run_parser.add_argument(
        "--limit", "-l",
        type=int,
        default=None,
        help="Limit number of games (for testing)",
    )
    run_parser.add_argument(
        "--threshold", "-t",
        type=float,
        default=SETTINGS["similarity_threshold"],
        help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})",
    )
    run_parser.add_argument(
        "--auto-approve", "-a",
        type=float,
        default=SETTINGS["auto_approve_threshold"],
        help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})",
    )

    # stats command (no options)
    subparsers.add_parser("stats", help="Show statistics")

    # similar command (for testing)
    similar_parser = subparsers.add_parser("similar", help="Find similar words")
    similar_parser.add_argument("word", help="Word to find similar words for")
    similar_parser.add_argument(
        "--topn", "-n",
        type=int,
        default=20,
        help="Number of results (default: 20)",
    )

    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        return

    # Dispatch table: async handlers are wrapped so every entry is a plain
    # callable taking the parsed namespace.
    handlers = {
        "fetch": lambda a: asyncio.run(cmd_fetch(a)),
        "train": cmd_train,
        "expand": cmd_expand,
        "compare": cmd_compare,
        "generate": cmd_generate,
        "run": lambda a: asyncio.run(cmd_run(a)),
        "stats": cmd_stats,
        "similar": cmd_similar,
    }
    handlers[args.command](args)
# Script entry point: only dispatch when executed directly (or via -m).
if __name__ == "__main__":
    main()