Spaces:

klzn
/

sentimentstream-worker

Running

sentimentstream-worker / scripts /expand_keywords /main.py

GitHub Action

deploy: worker release from GitHub

8ff1b66 24 days ago

13.3 kB

	"""
	CLI for keyword expansion toolkit.

	Usage:
	# Fetch reviews from Steam (can be resumed)
	python -m scripts.expand_keywords fetch --resume

	# Train FastText model
	python -m scripts.expand_keywords train

	# Expand dictionary and export candidates
	python -m scripts.expand_keywords expand --threshold 0.55

	# Generate new keywords.py
	python -m scripts.expand_keywords generate --auto-approve 0.7

	# Run all steps
	python -m scripts.expand_keywords run --resume

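	# Show statistics
	python -m scripts.expand_keywords stats

	# Compare candidate counts across several similarity thresholds
	python -m scripts.expand_keywords compare --thresholds 0.50 0.60 0.70

	# Find words similar to WORD in the trained model
	python -m scripts.expand_keywords similar WORD --topn 20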
	"""

	import argparse
	import asyncio
	import logging
	import sys
	from pathlib import Path

	# Add project root to path for imports.
	# PROJECT_ROOT is the directory three `.parent` hops above this file; it is
	# inserted at the front of sys.path so the absolute `scripts.*` imports that
	# follow are looked up there first. This must run before those imports.
	PROJECT_ROOT = Path(__file__).parent.parent.parent
	sys.path.insert(0, str(PROJECT_ROOT))

	from scripts.expand_keywords.config import GAMES, MODELS_DIR, OUTPUT_DIR, SETTINGS
	from scripts.expand_keywords.expander import KeywordExpander
	from scripts.expand_keywords.fetcher import ReviewFetcher
	from scripts.expand_keywords.preprocessor import Preprocessor, extract_ngrams_from_keywords
	from scripts.expand_keywords.trainer import FastTextTrainer

	# Configure logging at import time: INFO level, with each record rendered as
	# "<timestamp> [<LEVEL>] <logger name>: <message>" and timestamps formatted
	# as YYYY-MM-DD HH:MM:SS.
	logging.basicConfig(
	level=logging.INFO,
	format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
	datefmt="%Y-%m-%d %H:%M:%S",
	)
	# Module-level logger shared by the command functions in this file.
	logger = logging.getLogger(__name__)


	def load_existing_keywords() -> dict[str, list[str]]:
	    """Return the ``TOPIC_KEYWORDS`` mapping defined in the backend's keywords.py.

	    The file at ``PROJECT_ROOT/backend/app/core/keywords.py`` is read as UTF-8
	    text and executed in a fresh, empty namespace; the ``TOPIC_KEYWORDS`` name
	    is then pulled out of that namespace.

	    Returns:
	        Whatever object keywords.py bound to ``TOPIC_KEYWORDS``.

	    Raises:
	        FileNotFoundError: If keywords.py does not exist at the expected path.
	        ValueError: If ``TOPIC_KEYWORDS`` is missing from the file or is falsy
	            (for example an empty dict).
	    """
	    source_file = PROJECT_ROOT / "backend" / "app" / "core" / "keywords.py"
	    if not source_file.exists():
	        raise FileNotFoundError(f"Keywords file not found: {source_file}")

	    # NOTE(review): exec() runs the whole file as arbitrary Python. That is
	    # acceptable only while keywords.py is a trusted, in-repo file; do not
	    # point this at externally supplied content.
	    module_globals: dict = {}
	    source_text = source_file.read_text(encoding="utf-8")
	    exec(source_text, module_globals)

	    topic_keywords = module_globals.get("TOPIC_KEYWORDS")
	    if topic_keywords:
	        return topic_keywords
	    raise ValueError("TOPIC_KEYWORDS not found in keywords.py")


	async def cmd_fetch(args: argparse.Namespace) -> None:
	    """Fetch Steam reviews and log the fetcher's totals before and after.

	    Args:
	        args: Parsed CLI arguments. ``args.resume`` and ``args.limit`` are
	            forwarded to ``ReviewFetcher.fetch_all`` as ``resume`` and
	            ``limit_games`` respectively.
	    """
	    logger.info("Starting review fetch...")

	    review_fetcher = ReviewFetcher()

	    # Totals reported by the fetcher before this run does any work.
	    before = review_fetcher.get_stats()
	    logger.info(f"Current stats: {before['reviews_total']} reviews from {before['games_completed']} games")

	    await review_fetcher.fetch_all(resume=args.resume, limit_games=args.limit)

	    # Totals reported once fetch_all has returned.
	    after = review_fetcher.get_stats()
	    logger.info(f"Final stats: {after['reviews_total']} reviews from {after['games_completed']} games")


	def cmd_train(args: argparse.Namespace) -> None:
	    """Preprocess the fetched reviews and train/save the FastText model.

	    Steps, in order: derive n-grams from the existing keyword dictionary, load
	    all fetched reviews, preprocess them (saving the preprocessor), then train
	    and save the model. If no reviews are available an error is logged and the
	    function returns without training.

	    Args:
	        args: Parsed CLI arguments; not read by this command.
	    """
	    logger.info("Starting model training...")

	    # N-grams taken from the current dictionary are handed to the preprocessor.
	    frozen_ngrams = extract_ngrams_from_keywords(load_existing_keywords())
	    logger.info(f"Loaded {len(frozen_ngrams)} n-grams from existing dictionary")

	    corpus = ReviewFetcher().load_all_reviews()
	    if not corpus:
	        logger.error("No reviews found. Run 'fetch' first.")
	        return
	    logger.info(f"Loaded {len(corpus)} reviews")

	    text_preprocessor = Preprocessor(existing_ngrams=frozen_ngrams)
	    training_sentences = text_preprocessor.preprocess_corpus(corpus)
	    text_preprocessor.save()

	    model_trainer = FastTextTrainer()
	    model_trainer.train(training_sentences)
	    model_trainer.save()

	    logger.info("Training complete!")


	def cmd_expand(args: argparse.Namespace) -> None:
	    """Expand the keyword dictionary and export the candidate list.

	    Loads the existing keywords, the saved preprocessor and the trained model,
	    builds a ``KeywordExpander`` with ``args.threshold`` as the similarity
	    threshold, exports the candidates and logs summary counts. If the
	    preprocessor or the model has not been saved yet, an error is logged and
	    the function returns without exporting anything.

	    Args:
	        args: Parsed CLI arguments. ``args.threshold`` is required.
	            ``args.compare`` is optional and treated as ``False`` when the
	            namespace does not define it.
	    """
	    logger.info("Starting dictionary expansion...")

	    # Load components
	    keywords = load_existing_keywords()

	    preprocessor = Preprocessor()
	    try:
	        preprocessor.load()
	    except FileNotFoundError:
	        logger.error("Preprocessor not found. Run 'train' first.")
	        return

	    trainer = FastTextTrainer()
	    try:
	        model = trainer.load()
	    except FileNotFoundError:
	        logger.error("Model not found. Run 'train' first.")
	        return

	    # Expand
	    expander = KeywordExpander(
	        model=model,
	        existing_keywords=keywords,
	        word_frequencies=preprocessor.get_word_frequencies(),
	        similarity_threshold=args.threshold,
	    )

	    # Export candidates (with threshold in filename if requested).
	    # This command is also invoked by `run`, whose subparser defines no
	    # --compare option; reading args.compare directly raised AttributeError
	    # there, so fall back to the flag's store_true default of False.
	    include_threshold = getattr(args, "compare", False)
	    expander.export_candidates(include_threshold_in_name=include_threshold)

	    # Show stats
	    stats = expander.get_expansion_stats()
	    logger.info(f"Expansion complete: {stats['total_candidates']} candidates")
	    logger.info(f" Auto-approved: {stats['auto_approved']}")
	    logger.info(f" Needs review: {stats['needs_review']}")


	def cmd_compare(args: argparse.Namespace) -> None:
	    """Run the expansion once per threshold and print a comparison table.

	    For every value in ``args.thresholds`` a ``KeywordExpander`` is built, its
	    candidates are exported with the threshold in the file name, and its
	    expansion stats are collected. A fixed-width table of the collected stats
	    is then printed to stdout. If the preprocessor or the model has not been
	    saved yet, an error is logged and the function returns early.

	    Args:
	        args: Parsed CLI arguments; ``args.thresholds`` is the sequence of
	            similarity thresholds to evaluate.
	    """
	    logger.info("Comparing thresholds...")

	    existing = load_existing_keywords()

	    prep = Preprocessor()
	    try:
	        prep.load()
	    except FileNotFoundError:
	        logger.error("Preprocessor not found. Run 'train' first.")
	        return

	    ft_trainer = FastTextTrainer()
	    try:
	        ft_model = ft_trainer.load()
	    except FileNotFoundError:
	        logger.error("Model not found. Run 'train' first.")
	        return

	    # One (threshold, stats) row per evaluated threshold, in input order.
	    rows = []
	    for cutoff in args.thresholds:
	        candidate_expander = KeywordExpander(
	            model=ft_model,
	            existing_keywords=existing,
	            word_frequencies=prep.get_word_frequencies(),
	            similarity_threshold=cutoff,
	        )
	        # Threshold goes into the file name so runs do not overwrite each other.
	        candidate_expander.export_candidates(include_threshold_in_name=True)
	        rows.append((cutoff, candidate_expander.get_expansion_stats()))

	    heavy_rule = "=" * 60
	    light_rule = "-" * 60

	    print("\n" + heavy_rule)
	    print("THRESHOLD COMPARISON")
	    print(heavy_rule)
	    print(f"{'Threshold':<12} {'Total':<10} {'Auto-OK':<10} {'Review':<10}")
	    print(light_rule)

	    for cutoff, counts in rows:
	        print(f"{cutoff:<12.2f} {counts['total_candidates']:<10} {counts['auto_approved']:<10} {counts['needs_review']:<10}")

	    print(light_rule)
	    print(f"\nOutput files saved to: {OUTPUT_DIR}/")
	    print("Compare candidates_t*.json to see differences.")


	def cmd_generate(args: argparse.Namespace) -> None:
	    """Generate an expanded keywords.py from the trained model.

	    Loads the existing keywords, the saved preprocessor and the trained model,
	    then asks a ``KeywordExpander`` to write the new file and logs the path it
	    returns. If the preprocessor or the model has not been saved yet, an error
	    is logged and the function returns without generating anything.

	    Args:
	        args: Parsed CLI arguments; ``args.auto_approve`` is passed on as
	            ``auto_approve_threshold``.
	    """
	    logger.info("Generating expanded keywords.py...")

	    current_keywords = load_existing_keywords()

	    prep = Preprocessor()
	    try:
	        prep.load()
	    except FileNotFoundError:
	        logger.error("Preprocessor not found. Run 'train' first.")
	        return

	    ft_trainer = FastTextTrainer()
	    try:
	        ft_model = ft_trainer.load()
	    except FileNotFoundError:
	        logger.error("Model not found. Run 'train' first.")
	        return

	    # NOTE(review): no similarity_threshold is passed here, so presumably the
	    # expander's own default is used even when the caller supplied --threshold
	    # (as `run` does) - confirm this is intended.
	    keyword_expander = KeywordExpander(
	        model=ft_model,
	        existing_keywords=current_keywords,
	        word_frequencies=prep.get_word_frequencies(),
	    )

	    generated_file = keyword_expander.generate_keywords_py(
	        auto_approve_threshold=args.auto_approve,
	    )

	    logger.info(f"Generated: {generated_file}")


	async def cmd_run(args: argparse.Namespace) -> None:
	    """Run the whole pipeline: fetch, train, expand, then generate.

	    The same ``args`` namespace is handed to every step. Only the fetch step
	    is awaited; the remaining steps are ordinary synchronous calls.

	    Args:
	        args: Parsed CLI arguments shared by all four steps.
	    """
	    logger.info("Running complete pipeline...")

	    # Step 1: fetch (the only coroutine in the pipeline).
	    await cmd_fetch(args)

	    # Steps 2-4: train, expand, generate - strictly in this order.
	    # NOTE(review): these steps report problems by logging and returning, so a
	    # failed step does not stop the ones after it.
	    for step in (cmd_train, cmd_expand, cmd_generate):
	        step(args)

	    logger.info("Pipeline complete!")


	def cmd_stats(args: argparse.Namespace) -> None:
	    """Print fetch, model and expansion statistics to stdout.

	    Fetch statistics are always printed. Model statistics are printed only if
	    ``MODELS_DIR/fasttext.model`` exists, and expansion statistics only if
	    ``OUTPUT_DIR/candidates.json`` exists.

	    Args:
	        args: Parsed CLI arguments; not read by this command.
	    """
	    fetch_info = ReviewFetcher().get_stats()

	    print("\n=== Fetch Statistics ===")
	    print(f"Games configured: {fetch_info['games_total']}")
	    print(f"Games completed: {fetch_info['games_completed']}")
	    print(f"Games in progress: {fetch_info['games_in_progress']}")
	    print(f"Total reviews: {fetch_info['reviews_total']}")

	    per_game = fetch_info["reviews_per_game"]
	    if per_game:
	        print("\nReviews per game:")
	        for title, review_count in sorted(per_game.items()):
	            print(f" {title}: {review_count}")

	    # Model section: skipped entirely when no model file is present.
	    if (MODELS_DIR / "fasttext.model").exists():
	        print("\n=== Model Statistics ===")
	        loaded_model = FastTextTrainer().load()
	        print(f"Vocabulary size: {len(loaded_model.wv)}")

	    # Expansion section: nothing more to show without an exported file.
	    candidates_file = OUTPUT_DIR / "candidates.json"
	    if not candidates_file.exists():
	        return

	    import json

	    with candidates_file.open("r", encoding="utf-8") as handle:
	        payload = json.load(handle)

	    print("\n=== Expansion Statistics ===")
	    print(f"Total candidates: {payload['metadata']['total_candidates']}")
	    for category, candidates in payload["categories"].items():
	        print(f" {category}: {len(candidates)}")


	def cmd_similar(args: argparse.Namespace) -> None:
	    """Print the words the trained model considers similar to ``args.word``.

	    If the model has not been saved yet, an error is logged and nothing is
	    printed. If the lookup yields no results, a "not found in vocabulary"
	    message is printed instead of the list.

	    Args:
	        args: Parsed CLI arguments; ``args.word`` is the query and
	            ``args.topn`` the number of results requested.
	    """
	    ft_trainer = FastTextTrainer()
	    try:
	        # The returned model is not needed here; the lookup goes via the trainer.
	        ft_trainer.load()
	    except FileNotFoundError:
	        logger.error("Model not found. Run 'train' first.")
	        return

	    query = args.word
	    neighbours = ft_trainer.get_similar(query, topn=args.topn)

	    if not neighbours:
	        print(f"Word '{query}' not found in vocabulary")
	        return

	    print(f"\nWords similar to '{query}':")
	    for token, score in neighbours:
	        print(f" {token}: {score:.3f}")


	def main():
	    """Parse the command line and dispatch to the matching ``cmd_*`` function.

	    Registers the sub-commands ``fetch``, ``train``, ``expand``, ``compare``,
	    ``generate``, ``run``, ``stats`` and ``similar``. When no sub-command is
	    given, the help text is printed and the function returns. The coroutine
	    commands (``fetch`` and ``run``) are driven with ``asyncio.run``; all other
	    commands are called directly.
	    """
	    parser = argparse.ArgumentParser(
	        description="Keyword expansion toolkit using FastText",
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	    )
	    commands = parser.add_subparsers(dest="command", help="Available commands")

	    # Local helpers for options that are registered on more than one command.
	    def add_fetch_options(sub, resume_help):
	        sub.add_argument(
	            "--resume", "-r",
	            action="store_true",
	            help=resume_help,
	        )
	        sub.add_argument(
	            "--limit", "-l",
	            type=int,
	            default=None,
	            help="Limit number of games (for testing)",
	        )

	    def add_threshold_option(sub):
	        sub.add_argument(
	            "--threshold", "-t",
	            type=float,
	            default=SETTINGS["similarity_threshold"],
	            help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})",
	        )

	    def add_auto_approve_option(sub):
	        sub.add_argument(
	            "--auto-approve", "-a",
	            type=float,
	            default=SETTINGS["auto_approve_threshold"],
	            help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})",
	        )

	    # fetch
	    fetch_cmd = commands.add_parser("fetch", help="Fetch reviews from Steam")
	    add_fetch_options(fetch_cmd, "Resume from previous progress")

	    # train (takes no options)
	    commands.add_parser("train", help="Train FastText model")

	    # expand
	    expand_cmd = commands.add_parser("expand", help="Expand dictionary")
	    add_threshold_option(expand_cmd)
	    expand_cmd.add_argument(
	        "--compare", "-c",
	        action="store_true",
	        help="Include threshold in output filename (for comparison)",
	    )

	    # compare
	    compare_cmd = commands.add_parser("compare", help="Compare multiple thresholds")
	    compare_cmd.add_argument(
	        "--thresholds", "-t",
	        type=float,
	        nargs="+",
	        default=[0.45, 0.50, 0.55, 0.60, 0.65, 0.70],
	        help="Thresholds to compare (default: 0.45 0.50 0.55 0.60 0.65 0.70)",
	    )

	    # generate
	    generate_cmd = commands.add_parser("generate", help="Generate keywords.py")
	    add_auto_approve_option(generate_cmd)

	    # run (all steps): the fetch options plus both thresholds
	    run_cmd = commands.add_parser("run", help="Run all steps")
	    add_fetch_options(run_cmd, "Resume fetch from previous progress")
	    add_threshold_option(run_cmd)
	    add_auto_approve_option(run_cmd)

	    # stats (takes no options)
	    commands.add_parser("stats", help="Show statistics")

	    # similar (for testing)
	    similar_cmd = commands.add_parser("similar", help="Find similar words")
	    similar_cmd.add_argument("word", help="Word to find similar words for")
	    similar_cmd.add_argument(
	        "--topn", "-n",
	        type=int,
	        default=20,
	        help="Number of results (default: 20)",
	    )

	    args = parser.parse_args()

	    if not args.command:
	        parser.print_help()
	        return

	    # Coroutine commands need an event loop; the rest are plain calls.
	    coroutine_commands = {
	        "fetch": cmd_fetch,
	        "run": cmd_run,
	    }
	    plain_commands = {
	        "train": cmd_train,
	        "expand": cmd_expand,
	        "compare": cmd_compare,
	        "generate": cmd_generate,
	        "stats": cmd_stats,
	        "similar": cmd_similar,
	    }

	    if args.command in coroutine_commands:
	        asyncio.run(coroutine_commands[args.command](args))
	    elif args.command in plain_commands:
	        plain_commands[args.command](args)


	# Script entry point: run the CLI only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	main()