bat-6's picture
feat: implement recommendation engine architecture with chatbot logic, intent classification, and project validation modules.
809b701
Raw
History Blame Contribute Delete
11 kB
import logging
import re
from collections import Counter
from typing import Dict, Any, List
from functools import lru_cache
from difflib import get_close_matches
import pandas as pd
from src.similarity_model import (
find_similar_projects,
extract_features
)
from src.recommendation_engine.config import (
SIMILARITY_TOP_K,
MAX_FEATURES
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Keyword vocabulary for each project domain.
#
# Keys are the display names of the domains; values are lowercase keywords.
# Keywords containing a space are matched as substrings of the normalized
# text, single-word keywords are matched against whole tokens (see
# detect_domains). Dictionary order is significant: domains are scanned in
# this order and only the first few hits are kept.
DOMAIN_KEYWORDS = {
    "AI & Machine Learning": [
        "ai",
        "artificial intelligence",
        "machine learning",
        "ml",
        "deep learning",
        "neural network",
        "nlp",
        "computer vision",
    ],
    "Business & Finance": [
        "fintech",
        "finance",
        "bank",
        "payment",
        "crypto",
        "blockchain",
        "business",
        "trading",
    ],
    "Cloud & DevOps": [
        "cloud",
        "devops",
        "aws",
        "azure",
        "docker",
        "kubernetes",
        "infrastructure",
    ],
    "Cybersecurity": [
        "security",
        "cyber",
        "cybersecurity",
        "threat",
        "attack",
        "malware",
        "hacking",
    ],
    "Education": [
        "education",
        "school",
        "learning",
        "edtech",
        "student",
        "university",
        "academic",
    ],
    "Healthcare": [
        "hospital",
        "health",
        "medical",
        "healthcare",
        "clinic",
        "patient",
        "care",
    ],
    "IoT & Embedded Systems": [
        "iot",
        "embedded",
        "hardware",
        "sensor",
        "arduino",
        "raspberry",
        "smart home",
    ],
    "Web & Mobile Development": [
        "web",
        "mobile",
        "app",
        "ios",
        "android",
        "frontend",
        "backend",
        "fullstack",
        "website",
    ],
    "Data Science & Analytics": [
        "data",
        "analytics",
        "science",
        "big data",
        "dashboard",
        "statistics",
    ],
    "E-Commerce & Marketplaces": [
        "ecommerce",
        "shopping",
        "retail",
        "store",
        "marketplace",
        "shop",
    ],
    "Smart Systems": [
        "smart system",
        "automation",
        "smart city",
        "smart",
    ],
    "Networking & Communication": [
        "networking",
        "communication",
        "telecom",
        "5g",
        "network",
    ],
    "Game Development": [
        "game",
        "gaming",
        "unity",
        "unreal",
        "ar",
        "vr",
    ],
    # Catch-all bucket for users who express no preference.
    "Others": [
        "general",
        "random",
        "anything",
        "any",
        "whatever",
        "surprise me",
        "mixed",
        "all",
        "open",
        "everything",
        "other",
    ],
}
def normalize(text: str) -> str:
    """Return *text* as lowercase, single-spaced ASCII alphanumerics.

    The value is first coerced with ``str()``, so non-string input (for
    example ``None`` or a number) is accepted. Every character other than
    ``a-z``, ``0-9`` or whitespace is replaced by a space, runs of
    whitespace are collapsed to one space, and the ends are trimmed.
    """
    lowered = str(text).lower().strip()
    # Punctuation and any non-ASCII letters act as word separators.
    without_symbols = re.sub(r"[^a-z0-9\s]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", without_symbols)
    return collapsed.strip()
def clean_list(
    items: List[str],
    limit: int = 20
) -> List[str]:
    """Normalize *items*, drop blanks and duplicates, and keep at most *limit*.

    Each entry is passed through ``normalize``; entries that normalize to an
    empty string are discarded. The first occurrence of each normalized value
    wins and the original relative order is preserved. The cap is applied
    after de-duplication.
    """
    # A dict keeps insertion order, so it doubles as an ordered set here.
    unique = {}
    for raw in items:
        cleaned = normalize(raw)
        if cleaned:
            unique.setdefault(cleaned, None)
    return list(unique)[:limit]
def detect_domains(text: str) -> List[str]:
    """Return up to three domains whose keywords occur in *text*.

    The text is normalized first. A keyword containing a space counts as
    present when it is a substring of the normalized text; a single-word
    keyword must equal one of the whitespace-separated tokens. Domains are
    tested in ``DOMAIN_KEYWORDS`` order.

    The returned labels have been passed through ``clean_list`` and are
    therefore normalized (e.g. ``"ai machine learning"``), not the original
    ``DOMAIN_KEYWORDS`` keys.
    """
    cleaned = normalize(text)
    tokens = set(cleaned.split())

    def mentions(keyword: str) -> bool:
        # Phrases are searched in the full text, single words among tokens.
        if " " in keyword:
            return keyword in cleaned
        return keyword in tokens

    hits = [
        domain
        for domain, keywords in DOMAIN_KEYWORDS.items()
        if any(mentions(keyword) for keyword in keywords)
    ]
    return clean_list(hits, limit=3)
def extract_domain(text: str) -> str:
    """Resolve free-form *text* to a domain label, or ``""`` if none fits.

    Resolution order (first success wins):

    1. ``"ai"`` / ``"ml"`` map to ``"artificial intelligence"``.
    2. Exact match against a normalized ``DOMAIN_KEYWORDS`` key, returning
       the original key.
    3. Fuzzy match (cutoff 0.85) against the normalized keys.
    4. Keyword detection via ``detect_domains``; this step returns the
       normalized label produced by that function, preferring any domain
       other than the catch-all ``"others"``.
    5. Fuzzy match (cutoff 0.75) against individual keywords.
    6. *text* being a substring of some keyword.
    7. Any "Others" keyword being a substring of *text*.

    Falsy input, and input that normalizes to an empty string
    (punctuation-only or non-ASCII text), returns ``""``.
    """
    if not text:
        return ""

    text = normalize(text)
    if not text:
        # An empty string is a substring of every keyword, so without this
        # guard the substring fallback below would return the first domain.
        return ""

    if text in ("ai", "ml"):
        return "artificial intelligence"

    # Map normalized domain names to their original keys.
    normalized_domains = {normalize(d): d for d in DOMAIN_KEYWORDS}
    if text in normalized_domains:
        return normalized_domains[text]

    # Tolerate small typos in a full domain name.
    match_domain = get_close_matches(
        text,
        list(normalized_domains),
        n=1,
        cutoff=0.85
    )
    if match_domain:
        return normalized_domains[match_domain[0]]

    domains = detect_domains(text)
    if domains:
        # detect_domains reports the catch-all bucket as "others"; prefer a
        # specific domain when one was detected alongside it.
        for d in domains:
            if d != "others":
                return d
        return domains[0]

    # Keyword -> owning domain; a later domain wins on duplicate keywords.
    word_map = {
        w: domain
        for domain, words in DOMAIN_KEYWORDS.items()
        for w in words
    }
    match = get_close_matches(
        text,
        list(word_map),
        n=1,
        cutoff=0.75
    )
    if match:
        return word_map[match[0]]

    # Partial keyword typed by the user (prefixes are substrings too).
    for domain, words in DOMAIN_KEYWORDS.items():
        if any(text in w for w in words):
            return domain

    others_keywords = DOMAIN_KEYWORDS.get("Others", [])
    if any(ow in text for ow in others_keywords):
        return "Others"

    return ""
@lru_cache(maxsize=100)
def cached_similarity(title: str, description: str):
    """Memoized wrapper around ``find_similar_projects``.

    Results are cached per ``(title, description)`` pair, keeping up to 100
    entries. Both arguments must be hashable. A cache hit returns the very
    same object as the first call, so callers should treat it as read-only.
    """
    query = {
        "title": title,
        "description": description,
        "top_k": SIMILARITY_TOP_K,
    }
    return find_similar_projects(**query)
def extract_common_features(
    results: pd.DataFrame
) -> List[str]:
    """Return the 12 most frequent matched features across *results*.

    Reads the ``matched_features`` column, whose cells are expected to be
    iterables of dicts carrying a ``feature_b`` entry. Each feature is
    normalized before counting. Anything that is not a DataFrame, or a
    DataFrame without that column, yields ``[]``.

    Cells that are not iterable (NaN, None, numbers), items that are not
    dicts, and missing/None/NaN ``feature_b`` values are skipped rather than
    raising or being counted as the literal strings "none"/"nan".
    """
    if not isinstance(results, pd.DataFrame):
        return []
    if "matched_features" not in results.columns:
        return []

    counter = Counter()
    for matches in results["matched_features"]:
        try:
            items = iter(matches)
        except TypeError:
            # Empty cell (NaN/None) or a scalar: nothing to count.
            continue
        for item in items:
            if not isinstance(item, dict):
                continue
            raw = item.get("feature_b")
            # `raw != raw` is true only for NaN.
            if raw is None or (isinstance(raw, float) and raw != raw):
                continue
            feat = normalize(raw)
            if feat:
                counter[feat] += 1

    return [feat for feat, _ in counter.most_common(12)]
def extract_titles(
    results: pd.DataFrame
) -> List[str]:
    """Return up to 10 distinct, normalized project titles from *results*.

    Reads the ``project_title`` column. Anything that is not a DataFrame, or
    a DataFrame without that column, yields ``[]``. Missing values
    (NaN/None) and empty titles are skipped; NaN is truthy, so without the
    explicit drop it would surface as the title "nan".
    """
    if not isinstance(results, pd.DataFrame):
        return []
    if "project_title" not in results.columns:
        return []

    titles = [
        str(value).strip()
        for value in results["project_title"].dropna()
        if value
    ]
    return clean_list(titles, limit=10)
# Hint groups, each paired with the normalized domain labels that trigger it.
# Besides the short names, the triggers include the normalized forms of the
# DOMAIN_KEYWORDS keys ("ai machine learning", "cybersecurity"), because
# those are the labels detect_domains / extract_domain actually produce.
_ARCHITECTURE_HINTS = (
    (
        ("artificial intelligence", "ai machine learning"),
        [
            "AI inference pipeline",
            "Model prediction workflow",
            "Data preprocessing module",
        ],
    ),
    (
        ("healthcare",),
        [
            "Emergency handling workflow",
            "Patient monitoring logic",
            "Medical alert system",
        ],
    ),
    (
        ("security", "cybersecurity"),
        [
            "Threat detection pipeline",
            "Behavior anomaly analysis",
            "Risk monitoring engine",
        ],
    ),
    (
        ("education",),
        [
            "Adaptive learning workflow",
            "Student performance analytics",
            "Recommendation engine",
        ],
    ),
)


def build_architecture_hints(
    domains: List[str]
) -> List[str]:
    """Return architecture hints for the given domain labels.

    Labels are normalized before matching, so both canonical keys such as
    ``"Healthcare"`` and already-normalized labels such as ``"healthcare"``
    are recognized. ``None`` is treated as an empty list and a bare string
    as a single label. The result is passed through ``clean_list`` and is
    therefore normalized, de-duplicated and capped at 10 entries.
    """
    if isinstance(domains, str):
        domains = [domains]
    labels = {normalize(d) for d in (domains or [])}

    hints = []
    for triggers, group in _ARCHITECTURE_HINTS:
        if labels.intersection(triggers):
            hints.extend(group)
    return clean_list(hints, limit=10)
def build_project_context(
    title: str,
    description: str,
    abstract: str = "",
    features: List[str] = None
) -> Dict[str, Any]:
    """Assemble the recommendation context for a single project.

    Domains are detected from the combined title, abstract and description.
    User-supplied *features* are merged with the output of
    ``extract_features`` and cleaned (capped at ``MAX_FEATURES``). Similar
    projects are looked up through ``cached_similarity``.

    When the lookup raises, returns something other than a non-empty
    DataFrame, or returns a DataFrame with a ``message`` column, the context
    reports no similar projects, an originality of 99.0 and a context
    strength of 0.0. Otherwise the context strength is the mean of the
    ``hybrid_score`` column (0 when the column is absent or holds no usable
    values) and the originality is derived from it by
    ``calibrate_originality``.

    Returns:
        Dict with keys ``project_title``, ``domain``, ``domains``,
        ``features``, ``similar_titles``, ``common_features``,
        ``unique_features``, ``architecture_hints``, ``originality_score``
        and ``context_strength``.
    """
    logger.info("Building project context")

    full_text = f"{title}. {abstract}. {description}"
    domains = detect_domains(full_text)
    main_domain = domains[0] if domains else "general"

    auto_features = extract_features(full_text)
    # Coerce both sides so a tuple (or None) does not break concatenation.
    user_features = clean_list(
        list(features or []) + list(auto_features or []),
        MAX_FEATURES
    )

    try:
        results = cached_similarity(title, description)
    except Exception as e:
        # Best effort: a failed lookup degrades to "no similar projects".
        logger.warning("Similarity failed: %s", e)
        results = None

    # Baseline context, returned as-is when there is nothing to compare to.
    context = {
        "project_title": title,
        "domain": main_domain,
        "domains": domains,
        "features": user_features,
        "similar_titles": [],
        "common_features": [],
        "unique_features": user_features,
        "architecture_hints": build_architecture_hints(domains),
        "originality_score": 99.0,
        "context_strength": 0.0
    }

    if (
        not isinstance(results, pd.DataFrame)
        or len(results) == 0
        or "message" in results.columns
    ):
        return context

    common_features = extract_common_features(results)
    common_lookup = set(common_features)
    unique_features = [f for f in user_features if f not in common_lookup]

    hybrid_scores = results.get("hybrid_score", pd.Series([0]))
    context_strength = float(hybrid_scores.mean())
    if pd.isna(context_strength):
        # An all-NaN score column means "no usable similarity", not "max".
        context_strength = 0.0

    context.update({
        "similar_titles": extract_titles(results),
        "common_features": common_features,
        "unique_features": unique_features,
        "originality_score": calibrate_originality(context_strength),
        "context_strength": round(context_strength, 4)
    })
    return context
def calibrate_originality(similarity: float) -> float:
    """Map a similarity score to an originality percentage.

    Piecewise linear calibration curve; *similarity* is clamped to [0, 1]:

    - S <= 0.45: maps linearly onto O in [85.0, 99.0] (S=0 -> 99, S=0.45 -> 85)
    - S > 0.45: maps linearly onto O in [5.0, 85.0] (S=1 -> 5)

    A NaN similarity is treated as 0.0 (no measurable similarity). Without
    that, ``min(1.0, nan)`` evaluates to 1.0 and NaN would be reported as the
    lowest possible originality.

    Returns:
        The originality percentage rounded to two decimals.
    """
    import math  # local import: the module does not import math at top level

    s = float(similarity)
    if math.isnan(s):
        s = 0.0
    s = max(0.0, min(1.0, s))

    if s <= 0.45:
        originality = 99.0 - (s / 0.45) * 14.0
    else:
        originality = 85.0 - ((s - 0.45) / 0.55) * 80.0
    return round(originality, 2)
def build_domain_context(
    domain: str
) -> Dict[str, Any]:
    """Assemble the recommendation context for a whole domain.

    *domain* is resolved with ``extract_domain``. When that yields nothing,
    or the catch-all "others", the normalized raw input is used as a custom
    domain (``None`` becomes ``""`` rather than the literal "none"). Similar
    projects are then looked up through ``cached_similarity`` using the
    domain label as both title and description.

    When the lookup raises, returns something other than a non-empty
    DataFrame, or returns a DataFrame with a ``message`` column, the context
    lists no titles or features and a context strength of 0.0. Otherwise the
    context strength is the mean of the ``hybrid_score`` column (0 when the
    column is absent or holds no usable values).

    Returns:
        Dict with keys ``domain``, ``existing_titles``, ``common_features``,
        ``architecture_hints`` and ``context_strength``.
    """
    extracted = extract_domain(domain)
    if extracted and extracted.lower() != "others":
        domain_clean = extracted
    else:
        logger.info("[DOMAIN INFO] Using custom dynamic domain: %s", domain)
        domain_clean = normalize(domain) if domain is not None else ""

    logger.info("Building domain context: %s", domain_clean)

    try:
        results = cached_similarity(domain_clean, domain_clean)
    except Exception as e:
        # Best effort: a failed lookup degrades to an empty context.
        logger.warning("Domain similarity failed: %s", e)
        results = None

    # Baseline context, returned as-is when the lookup produced nothing.
    context = {
        "domain": domain_clean,
        "existing_titles": [],
        "common_features": [],
        "architecture_hints": build_architecture_hints([domain_clean]),
        "context_strength": 0.0
    }

    if (
        not isinstance(results, pd.DataFrame)
        or len(results) == 0
        or "message" in results.columns
    ):
        return context

    hybrid_scores = results.get("hybrid_score", pd.Series([0]))
    context_strength = float(hybrid_scores.mean())
    if pd.isna(context_strength):
        # An all-NaN score column carries no usable similarity signal.
        context_strength = 0.0

    context.update({
        "existing_titles": extract_titles(results),
        "common_features": extract_common_features(results),
        "context_strength": round(context_strength, 4)
    })
    return context