"""Generate novel feature recommendations for a project.

Pipeline: build a project context, ask the LLM for candidate features,
filter out malformed / generic / duplicate candidates, and top up from a
static fallback list when the LLM does not yield enough usable results.
"""

import logging
from typing import List, Dict, Any, Optional, Set

from src.recommendation_engine.context_builder import build_project_context
from src.recommendation_engine.prompt_builder import build_feature_prompt
from src.recommendation_engine.llm_client import generate_text
from src.recommendation_engine.validator import validate_generated_list
from src.recommendation_engine.novelty_checker import is_feature_novel
from src.similarity_model import compare_two_ideas
from src.recommendation_engine.config import (
    DEFAULT_FEATURE_COUNT,
    GENERATION_BATCH_SIZE,
)

logger = logging.getLogger(__name__)

# Maximum number of LLM round-trips before giving up and using the fallback.
MAX_RETRIES = 5

# Two candidates scoring at or above this value (per compare_two_ideas) are
# treated as the same feature within a single generation run.
SIMILARITY_THRESHOLD_LOCAL = 0.82

# Upper bound applied to the caller-supplied ``top_k``.
MAX_FEATURE_COUNT = 20


def normalize(text: str) -> str:
    """Return *text* lower-cased, stripped, with whitespace runs collapsed."""
    return " ".join(str(text).strip().lower().split())


# Substrings that mark a candidate as too generic to recommend.
GENERIC_PATTERNS = [
    "dashboard",
    "login",
    "signup",
    "authentication",
    "analytics module",
    "ai module",
    "admin panel",
    "settings page",
    "reports system",
    "user management",
]


def is_generic_feature(text: str) -> bool:
    """Return True when *text* is too short or matches a generic pattern.

    A feature is generic when its normalized form has fewer than two words
    or contains any entry of ``GENERIC_PATTERNS`` as a substring.
    """
    low = normalize(text)
    if len(low.split()) < 2:
        return True
    return any(pattern in low for pattern in GENERIC_PATTERNS)


def clean_features(features: List[str]) -> List[str]:
    """Filter raw candidates down to well-formed, non-generic features.

    Keeps entries that are non-empty after stripping, contain between 3 and
    10 whitespace-separated words (inclusive), and are not generic according
    to ``is_generic_feature``. Order is preserved; a ``None`` input is
    treated as an empty list.
    """
    kept: List[str] = []
    for raw in features or []:
        candidate = str(raw).strip()
        if not candidate:
            continue
        word_count = len(candidate.split())
        if word_count < 3 or word_count > 10:
            continue
        if is_generic_feature(candidate):
            continue
        kept.append(candidate)
    return kept


def is_duplicate_local(feature: str, existing: List[str]) -> bool:
    """Return True when *feature* is near-identical to any item in *existing*.

    Similarity is measured with ``compare_two_ideas``; a score greater than
    or equal to ``SIMILARITY_THRESHOLD_LOCAL`` counts as a duplicate and is
    logged at INFO level.
    """
    for old in existing:
        score = compare_two_ideas(feature, old)
        if score >= SIMILARITY_THRESHOLD_LOCAL:
            logger.info("[LOCAL DUPLICATE] %s ~ %s (%.2f)", feature, old, score)
            return True
    return False


# Domain keyword groups and their canned suggestions, checked in order.
_FALLBACK_BY_DOMAIN = (
    (
        ("health", "hospital", "medical", "clinic"),
        (
            "Real-time patient monitoring",
            "Emergency alert notification system",
            "AI-assisted diagnosis support",
            "Medical data visualization dashboard",
            "Predictive patient risk analysis",
        ),
    ),
    (
        ("education", "learning", "student", "school"),
        (
            "Adaptive learning recommendation engine",
            "Student performance prediction system",
            "Automated assignment evaluation",
            "Gamified engagement tracking",
            "Personalized study path generation",
        ),
    ),
    (
        ("security", "cyber", "threat"),
        (
            "Real-time threat detection engine",
            "Behavior anomaly monitoring",
            "Automated attack alert system",
            "Security event visualization",
            "Risk prediction analytics",
        ),
    ),
)

# Suggestions used when the title matches none of the domain keyword groups.
_FALLBACK_DEFAULT = (
    "Real-time intelligent monitoring",
    "Predictive analytics engine",
    "Smart recommendation system",
    "Automated decision support",
    "Dynamic performance optimization",
)


def fallback_features(title: str) -> List[str]:
    """Return a static list of five features chosen by keywords in *title*.

    The title is lower-cased (``None`` is treated as empty) and matched by
    substring against the health, education and security keyword groups, in
    that order; the first matching group wins. A new list is returned on
    every call so callers may mutate it freely.
    """
    lowered = (title or "").lower()
    for keywords, suggestions in _FALLBACK_BY_DOMAIN:
        if any(keyword in lowered for keyword in keywords):
            return list(suggestions)
    return list(_FALLBACK_DEFAULT)


def generate_features(
    title: str,
    description: str,
    abstract: str = "",
    features: Optional[List[str]] = None,
    previous_generated_features: Optional[List[str]] = None,
    top_k: int = DEFAULT_FEATURE_COUNT,
) -> Dict[str, Any]:
    """Generate up to *top_k* new feature recommendations for a project.

    Args:
        title: Project title; also selects the fallback feature set.
        description: Project description, forwarded to the context builder.
        abstract: Optional project abstract, forwarded to the context builder.
        features: Features the project already has. ``None`` means none.
        previous_generated_features: Features returned by earlier calls that
            the LLM-generated results must not repeat. ``None`` means none.
        top_k: Desired number of recommendations, clamped to the range
            1..``MAX_FEATURE_COUNT``.

    Returns:
        A dict with the keys ``project_title``, ``current_features``,
        ``recommended_features``, ``originality_score`` and
        ``similar_projects``. ``recommended_features`` may hold fewer than
        *top_k* entries when neither the LLM nor the fallback list supplies
        enough distinct features.
    """
    features = features or []
    previous_generated_features = previous_generated_features or []
    top_k = max(1, min(top_k, MAX_FEATURE_COUNT))

    logger.info("Starting feature generation | title=%s", title)

    context = build_project_context(
        title=title,
        description=description,
        abstract=abstract,
        features=features,
    )

    final_features: List[str] = []
    final_norm_set: Set[str] = set()

    # ``or []`` guards against the context carrying an explicit None.
    existing_features = context.get("features") or []
    existing_norm = {normalize(f) for f in existing_features}
    previous_norm = {normalize(f) for f in previous_generated_features}

    # Over-request so that enough candidates survive filtering; the value
    # does not change between retries, so it is computed once.
    generation_count = max(top_k * 4, GENERATION_BATCH_SIZE)

    attempts = 0
    while len(final_features) < top_k and attempts < MAX_RETRIES:
        attempts += 1
        logger.info("Generation attempt #%d", attempts)

        prompt = build_feature_prompt(
            context=context,
            count=generation_count,
            previous_features=previous_generated_features,
        )
        raw_text = generate_text(prompt, task="feature")
        if not raw_text:
            # An empty response still consumes one retry.
            logger.warning("Empty feature response")
            continue

        generated = validate_generated_list(
            text=raw_text,
            top_k=generation_count,
        )
        generated = clean_features(generated)
        logger.info("Generated %d candidate features", len(generated))

        for feat in generated:
            norm = normalize(feat)
            if not norm:
                continue
            # Cheap exact-match checks run before the similarity checks.
            if (
                norm in final_norm_set
                or norm in existing_norm
                or norm in previous_norm
            ):
                continue
            if not is_feature_novel(feat, existing_features):
                continue
            if is_duplicate_local(feat, final_features):
                continue

            final_features.append(feat)
            final_norm_set.add(norm)
            logger.info("[NEW FEATURE] %s", feat)

            if len(final_features) >= top_k:
                break

    if len(final_features) < top_k:
        logger.warning("Using fallback features")
        # NOTE(review): unlike the LLM path, fallback entries are not checked
        # against previous_generated_features, so repeated calls may return
        # the same fallback items. Left as-is so a result is still produced
        # when the LLM is unavailable; confirm this is the intended trade-off.
        for feat in fallback_features(title):
            norm = normalize(feat)
            if norm not in final_norm_set and norm not in existing_norm:
                final_features.append(feat)
                final_norm_set.add(norm)
            if len(final_features) >= top_k:
                break

    logger.info("Final generated features: %s", final_features)

    return {
        "project_title": context.get("project_title", title),
        "current_features": existing_features,
        "recommended_features": final_features,
        "originality_score": context.get("originality_score", 1.0),
        "similar_projects": context.get("similar_titles", []),
    }