bat-6's picture
update
4552666
Raw
History Blame Contribute Delete
4.76 kB
import logging
import re
from typing import List
from functools import lru_cache
from src.similarity_model import (
compare_two_ideas,
find_similar_projects
)
from src.recommendation_engine.config import (
IDEA_DUPLICATE_THRESHOLD,
FEATURE_DUPLICATE_THRESHOLD
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Phrases that mark a feature or idea title as too generic to count as novel.
# is_generic() tests each entry as a plain substring of the normalized text,
# so the multi-word entries ending in "system" are already covered by
# "system" itself; they are kept for readability.
GENERIC_PATTERNS = [
    "dashboard",
    "platform",
    "system",
    "application",
    "website",
    "ai module",
    "analytics module",
    "smart system",
    "management system",
]
def normalize(text: str) -> str:
    """Return a canonical lowercase form of *text* for comparisons.

    The value is coerced with ``str()``, lowercased and stripped; every
    character other than ``a-z``, ``0-9`` or whitespace is replaced by a
    space, and whitespace runs are collapsed to a single space.
    """
    lowered = str(text).lower().strip()
    # Punctuation and non-ASCII letters become separators, not deletions.
    separated = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", separated).strip()
def is_generic(text: str) -> bool:
    """Return True if the normalized *text* contains any generic pattern.

    Matching is a plain substring test against each entry of
    ``GENERIC_PATTERNS``.
    """
    cleaned = normalize(text)
    return any(phrase in cleaned for phrase in GENERIC_PATTERNS)
def token_overlap_score(a: str, b: str) -> float:
    """Return the Jaccard overlap of the normalized word sets of *a* and *b*.

    The score is ``|A & B| / |A | B|``; it is ``0.0`` when either side has
    no tokens after normalization.
    """
    left = set(normalize(a).split())
    right = set(normalize(b).split())
    if not (left and right):
        return 0.0
    return len(left & right) / len(left | right)
def is_feature_novel(
    feature: str,
    existing_features: List[str]
) -> bool:
    """Decide whether *feature* is new relative to *existing_features*.

    Returns False when the normalized feature is empty, is generic, equals
    a normalized existing feature, or when the larger of its
    ``compare_two_ideas`` score and its token-overlap score against any
    non-empty existing feature reaches ``FEATURE_DUPLICATE_THRESHOLD + 0.08``.
    Returns True otherwise.
    """
    candidate = normalize(feature)
    if not candidate:
        return False

    if is_generic(candidate):
        logger.debug(f"[GENERIC FEATURE] {candidate}")
        return False

    known = [normalize(item) for item in existing_features]
    if candidate in known:
        logger.debug(f"[EXACT DUP] {candidate}")
        return False

    # Features are held to a slightly stricter bar than the raw threshold.
    cutoff = FEATURE_DUPLICATE_THRESHOLD + 0.08

    for previous in known:
        if not previous:
            continue

        semantic = compare_two_ideas(candidate, previous)
        lexical = token_overlap_score(candidate, previous)
        logger.debug(
            f"[COMPARE] {candidate} ~ {previous} "
            f"(semantic={semantic:.2f}, overlap={lexical:.2f})"
        )

        # Either signal alone is enough to call it a duplicate.
        if max(semantic, lexical) >= cutoff:
            logger.debug(f"[FEATURE DUPLICATE] {candidate} ~ {previous}")
            return False

    return True
def filter_duplicate_features(
    generated_features: List[str],
    existing_features: List[str]
) -> List[str]:
    """Return the generated features that pass ``is_feature_novel``.

    Each candidate is stripped and compared (in normalized form) against
    *existing_features* plus every candidate already accepted in this call.
    Accepted items keep their stripped original text and input order.
    """
    kept = []
    accepted_keys = set()

    for candidate in generated_features:
        stripped = str(candidate).strip()
        key = normalize(stripped)

        # Skip blanks and repeats of something already accepted.
        if not stripped or key in accepted_keys:
            continue

        if is_feature_novel(key, existing_features + kept):
            accepted_keys.add(key)
            kept.append(stripped)

    return kept
@lru_cache(maxsize=256)
def _max_similarity_from_db(idea: str) -> float:
    """Return the highest ``hybrid_score`` among the top matches for *idea*.

    Exceptions propagate to the caller on purpose: ``lru_cache`` stores only
    values that are actually returned, so a failed lookup is never memoized.
    Returns ``0.0`` when the lookup yields nothing usable.
    """
    results = find_similar_projects(
        title=idea,
        description=idea,
        top_k=3
    )
    # Only objects exposing ``iloc`` are iterated (presumably a pandas
    # DataFrame; confirm against find_similar_projects).
    if not hasattr(results, "iloc") or len(results) == 0:
        return 0.0
    return max(
        (float(row.get("hybrid_score", 0)) for _, row in results.iterrows()),
        default=0.0,
    )


def _cached_db_check(idea: str) -> float:
    """Return the best similarity score for *idea*, or ``0.0`` on failure.

    Successful lookups are cached (up to 256 distinct ideas). A failure is
    logged and reported as ``0.0`` for this call only, so a transient
    database error does not pin the idea to a "no match" score for the
    lifetime of the cache entry.
    """
    try:
        return _max_similarity_from_db(idea)
    except Exception as e:
        # Deliberate best-effort: a broken lookup must not crash the caller.
        logger.warning("[DB ERROR] %s", e)
        return 0.0


# Keep the lru_cache management hooks reachable under the public name.
_cached_db_check.cache_clear = _max_similarity_from_db.cache_clear
_cached_db_check.cache_info = _max_similarity_from_db.cache_info
def is_idea_novel(idea_title: str) -> bool:
    """Return True when *idea_title* is specific and not already known.

    An empty or generic title is rejected outright. Otherwise the title's
    score from ``_cached_db_check`` must be strictly below
    ``IDEA_DUPLICATE_THRESHOLD``.
    """
    title = normalize(idea_title)
    if not title:
        return False

    if is_generic(title):
        logger.info(f"[GENERIC IDEA REJECTED] {title}")
        return False

    similarity = _cached_db_check(title)
    logger.info(f"[DB CHECK] {title}")
    logger.info(f"[SIMILARITY SCORE] {similarity:.4f}")

    return similarity < IDEA_DUPLICATE_THRESHOLD
def score_feature_novelty(
    feature: str,
    existing_features: List[str]
) -> float:
    """Return a novelty score in ``[0.0, 1.0]`` for *feature*.

    The score is ``1 - s`` rounded to 4 decimals, where ``s`` is the highest
    similarity between the normalized feature and any existing feature
    (the larger of ``compare_two_ideas`` and the token-overlap score).
    Returns ``1.0`` when there is nothing meaningful to compare against.

    Args:
        feature: Candidate feature text.
        existing_features: Feature texts to compare against.
    """
    if not existing_features:
        return 1.0

    feature = normalize(feature)

    # Normalize once and drop repeats so each distinct text is compared a
    # single time; dict.fromkeys keeps the original order.
    candidates = dict.fromkeys(normalize(old) for old in existing_features)

    scores = []
    for old_norm in candidates:
        # Entries that normalize to nothing carry no signal; skipping them
        # matches what is_feature_novel does.
        if not old_norm:
            continue
        semantic = compare_two_ideas(feature, old_norm)
        overlap = token_overlap_score(feature, old_norm)
        scores.append(max(semantic, overlap))

    if not scores:
        return 1.0

    novelty = 1.0 - max(scores)
    # Guard against similarity values outside [0, 1] (e.g. a negative
    # score or float drift just above 1) leaking out as novelty.
    return round(min(1.0, max(0.0, novelty)), 4)