Spaces:

klzn
/

sentimentstream-worker

Running

File size: 13,287 Bytes

8ff1b66

"""
CLI for keyword expansion toolkit.

Usage:
    # Fetch reviews from Steam (can be resumed)
    python -m scripts.expand_keywords fetch --resume

    # Train FastText model
    python -m scripts.expand_keywords train

    # Expand dictionary and export candidates
    python -m scripts.expand_keywords expand --threshold 0.55

    # Generate new keywords.py
    python -m scripts.expand_keywords generate --auto-approve 0.7

    # Run all steps
    python -m scripts.expand_keywords run --resume

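    # Show statistics
    python -m scripts.expand_keywords stats

    # Compare candidate counts across several similarity thresholds
    python -m scripts.expand_keywords compare --thresholds 0.50 0.60 0.70

    # List the words most similar to a given word (for testing the model)
    python -m scripts.expand_keywords similar lag --topn 10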
"""

import argparse
import asyncio
import logging
import sys
from pathlib import Path

# Add project root to path for imports.
# PROJECT_ROOT is the directory three levels above this file; it is also the
# base from which load_existing_keywords() locates backend/app/core/keywords.py.
PROJECT_ROOT = Path(__file__).parent.parent.parent
# Inserted at position 0 so it is searched before every other sys.path entry
# when the `scripts.expand_keywords.*` imports below are resolved.
sys.path.insert(0, str(PROJECT_ROOT))

from scripts.expand_keywords.config import GAMES, MODELS_DIR, OUTPUT_DIR, SETTINGS
from scripts.expand_keywords.expander import KeywordExpander
from scripts.expand_keywords.fetcher import ReviewFetcher
from scripts.expand_keywords.preprocessor import Preprocessor, extract_ngrams_from_keywords
from scripts.expand_keywords.trainer import FastTextTrainer

# Configure logging.
# Runs at import time: INFO level, each record formatted as
# "<timestamp> [<LEVEL>] <logger name>: <message>" with a second-resolution
# timestamp.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Module-level logger used by every cmd_* function in this file.
logger = logging.getLogger(__name__)


def load_existing_keywords() -> dict[str, list[str]]:
    """Load the existing ``TOPIC_KEYWORDS`` value from the backend's keywords.py.

    The file ``PROJECT_ROOT/backend/app/core/keywords.py`` is executed in a
    fresh namespace and whatever it binds to ``TOPIC_KEYWORDS`` is returned
    unchanged.

    Returns:
        The ``TOPIC_KEYWORDS`` object defined by keywords.py.

    Raises:
        FileNotFoundError: If keywords.py does not exist.
        ValueError: If keywords.py does not define ``TOPIC_KEYWORDS`` or
            defines it as an empty/falsy value.
    """
    keywords_path = PROJECT_ROOT / "backend" / "app" / "core" / "keywords.py"

    if not keywords_path.exists():
        raise FileNotFoundError(f"Keywords file not found: {keywords_path}")

    # NOTE(security): this runs keywords.py as code. That is acceptable only
    # because the file is part of this repository; never point it at
    # untrusted input.
    source = keywords_path.read_text(encoding="utf-8")

    # Compile with the real filename so syntax errors and tracebacks raised
    # from keywords.py name that file instead of the anonymous "<string>".
    code = compile(source, str(keywords_path), "exec")

    namespace: dict = {}
    exec(code, namespace)

    if "TOPIC_KEYWORDS" not in namespace:
        raise ValueError("TOPIC_KEYWORDS not found in keywords.py")

    keywords = namespace["TOPIC_KEYWORDS"]
    if not keywords:
        raise ValueError("TOPIC_KEYWORDS in keywords.py is empty")

    return keywords


async def cmd_fetch(args: argparse.Namespace) -> None:
    """Run the review fetch, logging review/game totals before and after.

    Args:
        args: Parsed CLI namespace. ``args.resume`` and ``args.limit`` are
            forwarded to ``ReviewFetcher.fetch_all`` as ``resume`` and
            ``limit_games`` respectively.
    """
    logger.info("Starting review fetch...")
    review_fetcher = ReviewFetcher()

    # Totals reported by the fetcher before this run does any work.
    stats = review_fetcher.get_stats()
    logger.info(f"Current stats: {stats['reviews_total']} reviews from {stats['games_completed']} games")

    await review_fetcher.fetch_all(resume=args.resume, limit_games=args.limit)

    # Same totals again once fetch_all has returned.
    stats = review_fetcher.get_stats()
    logger.info(f"Final stats: {stats['reviews_total']} reviews from {stats['games_completed']} games")


def cmd_train(args: argparse.Namespace) -> None:
    """Preprocess the stored reviews and train + save the FastText model.

    Steps, in order: derive n-grams from the existing keyword dictionary,
    load all fetched reviews, preprocess them (saving the preprocessor), then
    train and save the model. If no reviews are available an error is logged
    and the function returns without training.

    Args:
        args: Parsed CLI namespace; not read by this command.
    """
    logger.info("Starting model training...")

    # n-grams derived from the current dictionary are handed to the
    # preprocessor as its ``existing_ngrams``.
    existing_ngrams = extract_ngrams_from_keywords(load_existing_keywords())
    logger.info(f"Loaded {len(existing_ngrams)} n-grams from existing dictionary")

    reviews = ReviewFetcher().load_all_reviews()
    if not reviews:
        logger.error("No reviews found. Run 'fetch' first.")
        return
    logger.info(f"Loaded {len(reviews)} reviews")

    # Preprocess first and persist the preprocessor before training starts.
    corpus_prep = Preprocessor(existing_ngrams=existing_ngrams)
    sentences = corpus_prep.preprocess_corpus(reviews)
    corpus_prep.save()

    model_trainer = FastTextTrainer()
    model_trainer.train(sentences)
    model_trainer.save()

    logger.info("Training complete!")


def cmd_expand(args: argparse.Namespace) -> None:
    """Expand the keyword dictionary and export the resulting candidates.

    Loads the existing keywords, the saved preprocessor and the trained
    model, builds a ``KeywordExpander`` with ``args.threshold`` as the
    similarity threshold, exports the candidates and logs summary counts.
    If the preprocessor or the model cannot be loaded (``FileNotFoundError``)
    an error is logged and the function returns without exporting.

    Args:
        args: Parsed CLI namespace. ``args.threshold`` is required.
            ``args.compare`` is optional and treated as ``False`` when the
            attribute is absent.
    """
    logger.info("Starting dictionary expansion...")

    # Load components
    keywords = load_existing_keywords()

    preprocessor = Preprocessor()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return

    trainer = FastTextTrainer()
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    # Expand
    expander = KeywordExpander(
        model=model,
        existing_keywords=keywords,
        word_frequencies=preprocessor.get_word_frequencies(),
        similarity_threshold=args.threshold,
    )

    # This function is also called by cmd_run with the `run` sub-command's
    # namespace, which defines --threshold but no --compare option. Reading
    # args.compare directly would raise AttributeError there, so fall back
    # to the plain (un-suffixed) output filename.
    include_threshold = getattr(args, "compare", False)

    # Export candidates (with threshold in filename if requested)
    expander.export_candidates(include_threshold_in_name=include_threshold)

    # Show stats
    stats = expander.get_expansion_stats()
    logger.info(f"Expansion complete: {stats['total_candidates']} candidates")
    logger.info(f"  Auto-approved: {stats['auto_approved']}")
    logger.info(f"  Needs review: {stats['needs_review']}")


def cmd_compare(args: argparse.Namespace) -> None:
    """Run the expansion once per threshold and print a comparison table.

    For every value in ``args.thresholds`` a ``KeywordExpander`` is built,
    its candidates are exported with the threshold in the filename, and its
    expansion stats are collected. The collected stats are then printed as a
    table on stdout. If the preprocessor or the model cannot be loaded
    (``FileNotFoundError``) an error is logged and nothing is compared.

    Args:
        args: Parsed CLI namespace; ``args.thresholds`` is the iterable of
            similarity thresholds to evaluate.
    """
    logger.info("Comparing thresholds...")

    keywords = load_existing_keywords()

    preprocessor = Preprocessor()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return

    trainer = FastTextTrainer()
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    def _stats_for(threshold):
        # One full expansion + export at the given threshold.
        expander = KeywordExpander(
            model=model,
            existing_keywords=keywords,
            word_frequencies=preprocessor.get_word_frequencies(),
            similarity_threshold=threshold,
        )
        expander.export_candidates(include_threshold_in_name=True)
        return expander.get_expansion_stats()

    # Thresholds are processed strictly in the order given on the CLI.
    results = [(threshold, _stats_for(threshold)) for threshold in args.thresholds]

    heavy_rule = "=" * 60
    light_rule = "-" * 60

    print("\n" + heavy_rule)
    print("THRESHOLD COMPARISON")
    print(heavy_rule)
    print(f"{'Threshold':<12} {'Total':<10} {'Auto-OK':<10} {'Review':<10}")
    print(light_rule)

    for threshold, stats in results:
        print(f"{threshold:<12.2f} {stats['total_candidates']:<10} {stats['auto_approved']:<10} {stats['needs_review']:<10}")

    print(light_rule)
    print(f"\nOutput files saved to: {OUTPUT_DIR}/")
    print("Compare candidates_t*.json to see differences.")


def cmd_generate(args: argparse.Namespace) -> None:
    """Write an expanded keywords.py and log where it was generated.

    Loads the existing keywords, the saved preprocessor and the trained
    model, then asks a ``KeywordExpander`` to generate the file using
    ``args.auto_approve`` as the auto-approve threshold. If the preprocessor
    or the model cannot be loaded (``FileNotFoundError``) an error is logged
    and nothing is generated.

    Args:
        args: Parsed CLI namespace; only ``args.auto_approve`` is read.
    """
    logger.info("Generating expanded keywords.py...")

    keywords = load_existing_keywords()

    preprocessor = Preprocessor()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return

    trainer = FastTextTrainer()
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    # No similarity_threshold is passed here, so the expander falls back to
    # whatever default its constructor defines.
    output_path = KeywordExpander(
        model=model,
        existing_keywords=keywords,
        word_frequencies=preprocessor.get_word_frequencies(),
    ).generate_keywords_py(auto_approve_threshold=args.auto_approve)

    logger.info(f"Generated: {output_path}")


async def cmd_run(args: argparse.Namespace) -> None:
    """Run the whole pipeline: fetch, then train, expand and generate.

    The same ``args`` namespace is handed to every step. The steps report
    problems only by logging and returning, so a later step still runs after
    an earlier one has bailed out.

    Args:
        args: Parsed CLI namespace shared by all four steps.
    """
    logger.info("Running complete pipeline...")

    # Step 1 is the only coroutine; it must finish before training starts.
    await cmd_fetch(args)

    # Steps 2-4 are synchronous and run in this fixed order.
    for step in (cmd_train, cmd_expand, cmd_generate):
        step(args)

    logger.info("Pipeline complete!")


def cmd_stats(args: argparse.Namespace) -> None:
    """Print fetch, model and expansion statistics to stdout.

    The fetch section is always printed. The model section appears only when
    ``MODELS_DIR / "fasttext.model"`` exists, and the expansion section only
    when ``OUTPUT_DIR / "candidates.json"`` exists.

    Args:
        args: Parsed CLI namespace; not read by this command.
    """
    fetch_stats = ReviewFetcher().get_stats()

    print("\n=== Fetch Statistics ===")
    print(f"Games configured: {fetch_stats['games_total']}")
    print(f"Games completed: {fetch_stats['games_completed']}")
    print(f"Games in progress: {fetch_stats['games_in_progress']}")
    print(f"Total reviews: {fetch_stats['reviews_total']}")

    per_game = fetch_stats["reviews_per_game"]
    if per_game:
        print("\nReviews per game:")
        # Sorted by game name for a stable listing.
        for name, count in sorted(per_game.items()):
            print(f"  {name}: {count}")

    # Model section: only when a trained model file is present.
    if (MODELS_DIR / "fasttext.model").exists():
        print("\n=== Model Statistics ===")
        model = FastTextTrainer().load()
        print(f"Vocabulary size: {len(model.wv)}")

    # Expansion section: only when candidates have been exported.
    candidates_path = OUTPUT_DIR / "candidates.json"
    if candidates_path.exists():
        import json

        data = json.loads(candidates_path.read_text(encoding="utf-8"))
        print("\n=== Expansion Statistics ===")
        print(f"Total candidates: {data['metadata']['total_candidates']}")
        for cat, cands in data["categories"].items():
            print(f"  {cat}: {len(cands)}")


def cmd_similar(args: argparse.Namespace) -> None:
    """Print the words the trained model considers similar to ``args.word``.

    If the model cannot be loaded (``FileNotFoundError``) an error is logged
    and nothing is printed. If the lookup yields nothing, a "not found in
    vocabulary" message is printed instead of a list.

    Args:
        args: Parsed CLI namespace; ``args.word`` is the query word and
            ``args.topn`` the number of results requested.
    """
    trainer = FastTextTrainer()
    try:
        # The returned model is not needed here; the lookup below goes
        # through the trainer itself.
        trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    word = args.word
    neighbours = trainer.get_similar(word, topn=args.topn)

    if not neighbours:
        print(f"Word '{word}' not found in vocabulary")
        return

    print(f"\nWords similar to '{word}':")
    for w, sim in neighbours:
        print(f"  {w}: {sim:.3f}")


def main():
    """Parse the command line and dispatch to the matching ``cmd_*`` handler.

    Registers the sub-commands ``fetch``, ``train``, ``expand``, ``compare``,
    ``generate``, ``run``, ``stats`` and ``similar``. When no sub-command is
    given the help text is printed and the function returns. ``fetch`` and
    ``run`` are coroutines and are driven with ``asyncio.run``; the other
    handlers are called directly.
    """
    parser = argparse.ArgumentParser(
        description="Keyword expansion toolkit using FastText",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # --- option helpers shared by more than one sub-command -----------------
    def add_resume(sub, help_text):
        sub.add_argument("--resume", "-r", action="store_true", help=help_text)

    def add_limit(sub):
        sub.add_argument(
            "--limit", "-l",
            type=int,
            default=None,
            help="Limit number of games (for testing)",
        )

    def add_threshold(sub):
        sub.add_argument(
            "--threshold", "-t",
            type=float,
            default=SETTINGS["similarity_threshold"],
            help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})",
        )

    def add_auto_approve(sub):
        sub.add_argument(
            "--auto-approve", "-a",
            type=float,
            default=SETTINGS["auto_approve_threshold"],
            help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})",
        )

    # --- sub-commands, registered in the order they appear in --help --------
    fetch_parser = subparsers.add_parser("fetch", help="Fetch reviews from Steam")
    add_resume(fetch_parser, "Resume from previous progress")
    add_limit(fetch_parser)

    # train takes no options of its own
    subparsers.add_parser("train", help="Train FastText model")

    expand_parser = subparsers.add_parser("expand", help="Expand dictionary")
    add_threshold(expand_parser)
    expand_parser.add_argument(
        "--compare", "-c",
        action="store_true",
        help="Include threshold in output filename (for comparison)",
    )

    compare_parser = subparsers.add_parser("compare", help="Compare multiple thresholds")
    compare_parser.add_argument(
        "--thresholds", "-t",
        type=float,
        nargs="+",
        default=[0.45, 0.50, 0.55, 0.60, 0.65, 0.70],
        help="Thresholds to compare (default: 0.45 0.50 0.55 0.60 0.65 0.70)",
    )

    generate_parser = subparsers.add_parser("generate", help="Generate keywords.py")
    add_auto_approve(generate_parser)

    # run (all steps) accepts the union of the fetch/expand/generate options
    # registered below.
    run_parser = subparsers.add_parser("run", help="Run all steps")
    add_resume(run_parser, "Resume fetch from previous progress")
    add_limit(run_parser)
    add_threshold(run_parser)
    add_auto_approve(run_parser)

    # stats takes no options of its own
    subparsers.add_parser("stats", help="Show statistics")

    similar_parser = subparsers.add_parser("similar", help="Find similar words")
    similar_parser.add_argument("word", help="Word to find similar words for")
    similar_parser.add_argument(
        "--topn", "-n",
        type=int,
        default=20,
        help="Number of results (default: 20)",
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    # --- dispatch -----------------------------------------------------------
    coroutine_commands = {
        "fetch": cmd_fetch,
        "run": cmd_run,
    }
    plain_commands = {
        "train": cmd_train,
        "expand": cmd_expand,
        "compare": cmd_compare,
        "generate": cmd_generate,
        "stats": cmd_stats,
        "similar": cmd_similar,
    }

    if args.command in coroutine_commands:
        asyncio.run(coroutine_commands[args.command](args))
    elif args.command in plain_commands:
        plain_commands[args.command](args)


if __name__ == "__main__":
    main()