Spaces:

InnoTrack
/

Graduation_Project-v1.2

Running

App Files Files Community

Graduation_Project-v1.2 / src /recommendation_engine /feature_generator.py

bat-6

update

4552666 23 days ago

Raw

History Blame Contribute Delete

6.72 kB

	import logging
	from typing import List, Dict, Any, Set

	from src.recommendation_engine.context_builder import build_project_context
	from src.recommendation_engine.prompt_builder import build_feature_prompt
	from src.recommendation_engine.llm_client import generate_text
	from src.recommendation_engine.validator import validate_generated_list
	from src.recommendation_engine.novelty_checker import is_feature_novel

	from src.similarity_model import compare_two_ideas

	from src.recommendation_engine.config import (
	DEFAULT_FEATURE_COUNT,
	GENERATION_BATCH_SIZE
	)

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Upper bound on the number of LLM generation attempts made by
	# generate_features before it falls back to the static suggestions.
	MAX_RETRIES = 5

	# compare_two_ideas score at or above which is_duplicate_local treats two
	# features as duplicates of each other.
	SIMILARITY_THRESHOLD_LOCAL = 0.82

	def normalize(text: str) -> str:
	    """Return *text* lower-cased with whitespace collapsed to single spaces.

	    The value is passed through ``str()`` first, so non-string inputs are
	    normalized via their string representation.
	    """
	    lowered = str(text).strip().lower()
	    tokens = lowered.split()
	    return " ".join(tokens)

	# Substrings that mark a candidate feature as generic boilerplate. They are
	# matched against normalized (lower-cased) text by is_generic_feature.
	GENERIC_PATTERNS = [
	    "dashboard",
	    "login",
	    "signup",
	    "authentication",
	    "analytics module",
	    "ai module",
	    "admin panel",
	    "settings page",
	    "reports system",
	    "user management",
	]

	def is_generic_feature(text: str) -> bool:
	    """Tell whether *text* is too generic to be a useful recommendation.

	    A feature counts as generic when its normalized form has fewer than two
	    words, or when it contains any entry of ``GENERIC_PATTERNS`` as a
	    substring.
	    """
	    lowered = normalize(text)

	    # One-word (or empty) candidates carry too little information.
	    if len(lowered.split()) < 2:
	        return True

	    return any(pattern in lowered for pattern in GENERIC_PATTERNS)

	def clean_features(features: List[str]) -> List[str]:
	    """Filter raw candidate features down to usable ones.

	    Each item is converted with ``str()`` and stripped. It is kept, in its
	    original order, only when it has between 3 and 10 whitespace-separated
	    words (inclusive) and ``is_generic_feature`` does not reject it.

	    Args:
	        features: Candidate feature strings.

	    Returns:
	        The stripped candidates that passed both filters.
	    """
	    kept: List[str] = []

	    for raw in features:
	        candidate = str(raw).strip()
	        word_count = len(candidate.split())

	        # An empty candidate has zero words, so this range test also
	        # discards blanks.
	        if not 3 <= word_count <= 10:
	            continue

	        if not is_generic_feature(candidate):
	            kept.append(candidate)

	    return kept

	def is_duplicate_local(feature: str, existing: List[str]) -> bool:
	    """Tell whether *feature* is a near-duplicate of any entry in *existing*.

	    Entries are compared in order with ``compare_two_ideas``; the first one
	    scoring at or above ``SIMILARITY_THRESHOLD_LOCAL`` is logged and ends
	    the search.

	    Args:
	        feature: Candidate feature text.
	        existing: Features to compare the candidate against.

	    Returns:
	        True when a sufficiently similar entry is found, otherwise False.
	    """
	    # Lazy pairs of (entry, score): nothing past the first hit is scored.
	    scored = (
	        (candidate, compare_two_ideas(feature, candidate))
	        for candidate in existing
	    )
	    first_hit = next(
	        (pair for pair in scored if pair[1] >= SIMILARITY_THRESHOLD_LOCAL),
	        None,
	    )

	    if first_hit is None:
	        return False

	    old, score = first_hit
	    logger.info(f"[LOCAL DUPLICATE] {feature} ~ {old} ({score:.2f})")
	    return True

	def fallback_features(title: str) -> List[str]:
	    """Return five static feature suggestions chosen by keywords in *title*.

	    The lower-cased title is scanned for health, education and security
	    keywords, in that order; the first group with a matching substring
	    decides the list. With no match (or an empty/None title) a
	    domain-neutral list is returned.

	    Args:
	        title: Project title; None is treated as an empty string.

	    Returns:
	        A newly built list of five suggestions.
	    """
	    lowered = (title or "").lower()

	    # (keywords, suggestions) pairs, checked top to bottom.
	    domain_table = (
	        (
	            ("health", "hospital", "medical", "clinic"),
	            [
	                "Real-time patient monitoring",
	                "Emergency alert notification system",
	                "AI-assisted diagnosis support",
	                "Medical data visualization dashboard",
	                "Predictive patient risk analysis",
	            ],
	        ),
	        (
	            ("education", "learning", "student", "school"),
	            [
	                "Adaptive learning recommendation engine",
	                "Student performance prediction system",
	                "Automated assignment evaluation",
	                "Gamified engagement tracking",
	                "Personalized study path generation",
	            ],
	        ),
	        (
	            ("security", "cyber", "threat"),
	            [
	                "Real-time threat detection engine",
	                "Behavior anomaly monitoring",
	                "Automated attack alert system",
	                "Security event visualization",
	                "Risk prediction analytics",
	            ],
	        ),
	    )

	    for keywords, suggestions in domain_table:
	        for keyword in keywords:
	            if keyword in lowered:
	                return suggestions

	    return [
	        "Real-time intelligent monitoring",
	        "Predictive analytics engine",
	        "Smart recommendation system",
	        "Automated decision support",
	        "Dynamic performance optimization",
	    ]

	def generate_features(
	    title: str,
	    description: str,
	    abstract: str = "",
	    features: List[str] = None,
	    previous_generated_features: List[str] = None,
	    top_k: int = DEFAULT_FEATURE_COUNT
	) -> Dict[str, Any]:
	    """Recommend up to ``top_k`` new features for a project.

	    Candidates are requested from the LLM in batches, parsed with
	    ``validate_generated_list`` and filtered with ``clean_features``. A
	    candidate is accepted only when all of the following hold:

	    * its normalized text is not already accepted, not among the project's
	      existing features and not among ``previous_generated_features``;
	    * ``is_feature_novel`` accepts it against the existing features;
	    * ``is_duplicate_local`` finds no near-duplicate among the features
	      accepted so far.

	    If ``MAX_RETRIES`` attempts do not yield ``top_k`` features, the list
	    is topped up from ``fallback_features(title)``.

	    Args:
	        title: Project title.
	        description: Project description.
	        abstract: Optional project abstract.
	        features: Features the project already has; None means none.
	        previous_generated_features: Features returned by earlier calls
	            that should not be recommended again; None means none.
	        top_k: Requested number of recommendations, clamped to 1..20.

	    Returns:
	        A dict with the keys ``project_title``, ``current_features``,
	        ``recommended_features``, ``originality_score`` and
	        ``similar_projects``. ``recommended_features`` may hold fewer than
	        ``top_k`` entries.
	    """
	    features = features or []
	    previous_generated_features = previous_generated_features or []

	    # Clamp the requested count to the supported 1..20 range.
	    top_k = max(1, min(top_k, 20))

	    # A plain "|" separator: the former "\|" was an invalid escape sequence
	    # that logged a stray backslash and warns on recent Python versions.
	    logger.info(f"Starting feature generation | title={title}")

	    context = build_project_context(
	        title=title,
	        description=description,
	        abstract=abstract,
	        features=features
	    )

	    final_features: List[str] = []
	    final_norm_set: Set[str] = set()

	    existing_features = context.get("features", [])

	    # Normalized lookup sets for cheap exact-match rejection.
	    existing_norm = {normalize(f) for f in existing_features}
	    previous_norm = {normalize(f) for f in previous_generated_features}

	    # Ask for more candidates than needed because many get filtered out.
	    # The value depends only on top_k, so it is computed once up front.
	    generation_count = max(
	        top_k * 4,
	        GENERATION_BATCH_SIZE
	    )

	    attempts = 0

	    while len(final_features) < top_k and attempts < MAX_RETRIES:

	        attempts += 1

	        logger.info(f"Generation attempt #{attempts}")

	        prompt = build_feature_prompt(
	            context=context,
	            count=generation_count,
	            previous_features=previous_generated_features
	        )

	        raw_text = generate_text(
	            prompt,
	            task="feature"
	        )

	        if not raw_text:
	            # An empty response still consumes one attempt.
	            logger.warning("Empty feature response")
	            continue

	        generated = validate_generated_list(
	            text=raw_text,
	            top_k=generation_count
	        )

	        generated = clean_features(generated)

	        logger.info(f"Generated {len(generated)} candidate features")

	        for feat in generated:

	            norm = normalize(feat)

	            if not norm:
	                continue

	            # Exact (normalized) repeats are rejected before the more
	            # expensive similarity checks run.
	            if (
	                norm in final_norm_set
	                or norm in existing_norm
	                or norm in previous_norm
	            ):
	                continue

	            if not is_feature_novel(feat, existing_features):
	                continue

	            if is_duplicate_local(feat, final_features):
	                continue

	            final_features.append(feat)
	            final_norm_set.add(norm)

	            logger.info(f"[NEW FEATURE] {feat}")

	            if len(final_features) >= top_k:
	                break

	    if len(final_features) < top_k:

	        logger.warning("Using fallback features")

	        fallback = fallback_features(title)

	        for feat in fallback:

	            norm = normalize(feat)

	            # NOTE(review): unlike the LLM path, fallback entries are not
	            # checked against previous_norm, so a previously generated
	            # suggestion can reappear here - confirm this is intended.
	            if (
	                norm not in final_norm_set
	                and norm not in existing_norm
	            ):

	                final_features.append(feat)
	                final_norm_set.add(norm)

	                if len(final_features) >= top_k:
	                    break

	    logger.info(f"Final generated features: {final_features}")

	    return {
	        "project_title": context.get("project_title", title),
	        "current_features": existing_features,
	        "recommended_features": final_features,
	        "originality_score": context.get("originality_score", 1.0),
	        "similar_projects": context.get("similar_titles", [])
	    }