Spaces:

InnoTrack
/

Graduation_Project-v1.2

Running

File size: 10,993 Bytes

import logging
import re

from collections import Counter
from typing import Dict, Any, List
from functools import lru_cache
from difflib import get_close_matches

import pandas as pd

from src.similarity_model import (
    find_similar_projects,
    extract_features
)

from src.recommendation_engine.config import (
    SIMILARITY_TOP_K,
    MAX_FEATURES
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Domain label -> keywords that signal that domain.
#
# How the table is consumed in this module:
#   * detect_domains() matches single-word keywords against whole tokens of
#     the normalized text, and multi-word keywords ("smart home") as
#     substrings of the normalized text.
#   * extract_domain() additionally fuzzy-matches user input against the
#     labels and the keywords, and finally falls back to substring checks.
#
# Key order matters: detect_domains() keeps only the first three matching
# domains in this order, and extract_domain()'s substring fallback returns the
# first domain that matches.  "Others" is the catch-all entry and is kept last.
#
# Keywords are written lowercase because they are compared against text that
# has already been passed through normalize().
DOMAIN_KEYWORDS = {
    "AI & Machine Learning": [
        "ai", "artificial intelligence", "machine learning", "ml", "deep learning",
        "neural network", "nlp", "computer vision"
    ],
    "Business & Finance": [
        "fintech", "finance", "bank", "payment", "crypto", "blockchain", "business", "trading"
    ],
    "Cloud & DevOps": [
        "cloud", "devops", "aws", "azure", "docker", "kubernetes", "infrastructure"
    ],
    "Cybersecurity": [
        "security", "cyber", "cybersecurity", "threat", "attack", "malware", "hacking"
    ],
    "Education": [
        "education", "school", "learning", "edtech", "student", "university", "academic"
    ],
    "Healthcare": [
        "hospital", "health", "medical", "healthcare", "clinic", "patient", "care"
    ],
    "IoT & Embedded Systems": [
        "iot", "embedded", "hardware", "sensor", "arduino", "raspberry", "smart home"
    ],
    "Web & Mobile Development": [
        "web", "mobile", "app", "ios", "android", "frontend", "backend", "fullstack", "website"
    ],
    "Data Science & Analytics": [
        "data", "analytics", "science", "big data", "dashboard", "statistics"
    ],
    "E-Commerce & Marketplaces": [
        "ecommerce", "shopping", "retail", "store", "marketplace", "shop"
    ],
    "Smart Systems": [
        "smart system", "automation", "smart city", "smart"
    ],
    "Networking & Communication": [
        "networking", "communication", "telecom", "5g", "network"
    ],
    "Game Development": [
        "game", "gaming", "unity", "unreal", "ar", "vr"
    ],
    # Catch-all bucket: phrases meaning "no particular domain".
    "Others": [
        "general", "random", "anything", "any", "whatever", "surprise me", "mixed", "all", "open", "everything", "other"
    ]
}

def normalize(text: str) -> str:
    """Return *text* lowercased and reduced to ``a-z``, ``0-9`` and single spaces.

    The value is coerced with ``str()`` first, so non-string input is accepted
    (``None`` becomes ``"none"``).  Every character that is not a lowercase
    ASCII letter, a digit or whitespace is replaced by a space; whitespace runs
    are then collapsed to one space and the ends are trimmed.
    """
    lowered = str(text).lower().strip()
    # Punctuation becomes a separator rather than being deleted, so
    # "e-commerce" turns into "e commerce" instead of "ecommerce".
    separated = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", separated).strip()

def clean_list(
    items: List[str],
    limit: int = 20
) -> List[str]:
    """Normalize, de-duplicate and truncate a list of strings.

    Each item is passed through ``normalize``; items that normalize to an
    empty string are dropped, duplicates (compared after normalization) keep
    only their first occurrence, and at most ``limit`` entries are returned in
    their original order.
    """
    normalized = (normalize(entry) for entry in items)

    # dict keys preserve first-insertion order, giving an ordered set.
    unique = dict.fromkeys(value for value in normalized if value)

    return list(unique)[:limit]

def detect_domains(text: str) -> List[str]:
    """Return up to three domain labels whose keywords occur in *text*.

    The text is normalized first.  Domains are tested in ``DOMAIN_KEYWORDS``
    order and a domain is reported once as soon as any of its keywords hits.
    The result goes through ``clean_list``, so the returned labels are the
    *normalized* domain names (e.g. ``"ai machine learning"``), not the
    original dictionary keys.
    """
    cleaned = normalize(text)
    tokens = set(cleaned.split())

    def _mentions(keyword: str) -> bool:
        # Multi-word keywords are looked up as substrings of the whole text;
        # single words must equal a complete token, so "ar" does not fire
        # inside "smart".
        return keyword in cleaned if " " in keyword else keyword in tokens

    hits = [
        domain
        for domain, keywords in DOMAIN_KEYWORDS.items()
        if any(_mentions(keyword) for keyword in keywords)
    ]

    return clean_list(hits, limit=3)

def extract_domain(text: str) -> str:
    """Resolve free-form user input to a domain label.

    Resolution order (first hit wins):

    1. ``"ai"`` / ``"ml"`` -> the alias ``"artificial intelligence"``.
    2. Exact match against a normalized ``DOMAIN_KEYWORDS`` key.
    3. Fuzzy match (cutoff 0.85) against the normalized keys.
    4. Keyword detection via ``detect_domains``.
    5. Fuzzy match (cutoff 0.75) against individual keywords.
    6. The input being a substring of some keyword.
    7. An ``"Others"`` keyword being a substring of the input.

    Returns:
        Steps 2, 3, 5, 6 and 7 return the original ``DOMAIN_KEYWORDS`` key
        (e.g. ``"Healthcare"``); step 4 returns the normalized label produced
        by ``detect_domains`` (e.g. ``"healthcare"``).  An empty string is
        returned when the input is empty, contains no letters or digits, or
        matches nothing.
    """
    if not text:
        return ""

    query = normalize(text)

    # Input made only of punctuation/whitespace normalizes to "".  Stop here:
    # "" is a substring of every keyword, so the substring fallback further
    # down would otherwise report the first domain in the table.
    if not query:
        return ""

    if query in ("ai", "ml"):
        return "artificial intelligence"

    # Normalized domain name -> original key.
    canonical_by_normalized = {
        normalize(name): name for name in DOMAIN_KEYWORDS
    }
    if query in canonical_by_normalized:
        return canonical_by_normalized[query]

    near_names = get_close_matches(
        query,
        list(canonical_by_normalized),
        n=1,
        cutoff=0.85
    )
    if near_names:
        return canonical_by_normalized[near_names[0]]

    detected = detect_domains(query)
    if detected:
        # Prefer a specific domain over the catch-all "others" label; fall
        # back to the first hit when "others" is all there is.
        return next(
            (label for label in detected if label != "others"),
            detected[0]
        )

    # Keyword -> owning domain, in table order.
    keyword_to_domain = {}
    for domain, keywords in DOMAIN_KEYWORDS.items():
        for keyword in keywords:
            keyword_to_domain[keyword] = domain

    near_keywords = get_close_matches(
        query,
        list(keyword_to_domain),
        n=1,
        cutoff=0.75
    )
    if near_keywords:
        return keyword_to_domain[near_keywords[0]]

    # Partial input such as "kuber" -> "kubernetes".
    for domain, keywords in DOMAIN_KEYWORDS.items():
        if any(query in keyword for keyword in keywords):
            return domain

    # Catch-all phrases embedded in a longer token (whole-token hits were
    # already handled by detect_domains above).
    if any(keyword in query for keyword in DOMAIN_KEYWORDS.get("Others", [])):
        return "Others"

    return ""

@lru_cache(maxsize=100)
def cached_similarity(
    title: str,
    description: str
):
    """Memoized wrapper around ``find_similar_projects``.

    Results are cached per ``(title, description)`` pair, keeping the 100 most
    recently used pairs.  A repeated call returns the *same* cached object, so
    callers should treat the result as read-only.  Both arguments must be
    hashable; exceptions raised by the underlying search are not cached.

    Returns whatever ``find_similar_projects`` returns for
    ``top_k=SIMILARITY_TOP_K`` (callers in this module check for a pandas
    DataFrame).
    """

    return find_similar_projects(
        title=title,
        description=description,
        top_k=SIMILARITY_TOP_K
    )

def extract_common_features(
    results: pd.DataFrame,
    top_n: int = 12
) -> List[str]:
    """Return the features that occur most often across the similar projects.

    Reads the ``matched_features`` column of *results*.  Each cell is expected
    to be an iterable of dicts; the ``feature_b`` value of every dict is
    normalized and counted.

    Args:
        results: Similarity search output.  Anything that is not a DataFrame,
            or a DataFrame without a ``matched_features`` column, yields ``[]``.
        top_n: Maximum number of features to return (default 12).

    Returns:
        Normalized feature strings, most frequent first.
    """
    if not isinstance(results, pd.DataFrame):
        return []

    if "matched_features" not in results.columns:
        return []

    counter = Counter()

    for matches in results["matched_features"]:

        try:
            entries = iter(matches)
        except TypeError:
            # Missing cell (None / NaN) or some other non-iterable value:
            # nothing to count for this row.
            continue

        for item in entries:

            if not isinstance(item, dict):
                continue

            raw = item.get("feature_b")

            # A missing or null feature must not be counted; normalize()
            # would turn None into the literal string "none".
            if raw is None:
                continue

            feat = normalize(raw)

            if feat:
                counter[feat] += 1

    return [feat for feat, _ in counter.most_common(top_n)]

def extract_titles(
    results: pd.DataFrame
) -> List[str]:
    """Return up to ten distinct project titles from a similarity result.

    Reads the ``project_title`` column of *results*.  Missing values
    (``None`` / NaN / ``pd.NA``) and empty titles are skipped, so a null cell
    never shows up as the title ``"nan"``.  The remaining titles are passed
    through ``clean_list``, which normalizes them (lowercase, punctuation
    removed) and removes duplicates.

    Returns ``[]`` when *results* is not a DataFrame, has no
    ``project_title`` column, or contains no usable title.
    """
    if not isinstance(results, pd.DataFrame):
        return []

    if "project_title" not in results.columns:
        return []

    present = results["project_title"].dropna()

    titles = [
        str(value).strip()
        for value in present
        if value
    ]

    if not titles:
        return []

    return clean_list(titles, limit=10)

def build_architecture_hints(
    domains: List[str]
) -> List[str]:
    """Return architecture hints for the given domain labels.

    Each label is normalized and compared by substring against a small set of
    markers, so the labels this module actually produces are recognised in
    either form: the normalized ones from ``detect_domains``
    (``"ai machine learning"``, ``"cybersecurity"``) and the original
    ``DOMAIN_KEYWORDS`` keys (``"Healthcare"``, ``"Education"``), as well as
    the ``"artificial intelligence"`` alias.

    Args:
        domains: Domain labels; ``None`` or an empty list yields ``[]``.

    Returns:
        Up to ten hints, de-duplicated and normalized by ``clean_list``
        (hence lowercase).
    """
    # (markers, hints): a domain gets the hints when any marker occurs in its
    # normalized label.
    hint_table = (
        (
            ("artificial intelligence", "machine learning"),
            [
                "AI inference pipeline",
                "Model prediction workflow",
                "Data preprocessing module"
            ],
        ),
        (
            ("healthcare",),
            [
                "Emergency handling workflow",
                "Patient monitoring logic",
                "Medical alert system"
            ],
        ),
        (
            # Also matches "cybersecurity".
            ("security",),
            [
                "Threat detection pipeline",
                "Behavior anomaly analysis",
                "Risk monitoring engine"
            ],
        ),
        (
            ("education",),
            [
                "Adaptive learning workflow",
                "Student performance analytics",
                "Recommendation engine"
            ],
        ),
    )

    labels = [normalize(label) for label in (domains or [])]

    hints = []

    for markers, domain_hints in hint_table:
        if any(marker in label for label in labels for marker in markers):
            hints.extend(domain_hints)

    return clean_list(hints, limit=10)

def build_project_context(
    title: str,
    description: str,
    abstract: str = "",
    features: List[str] = None
) -> Dict[str, Any]:
    """Build the recommendation context for a single project.

    Steps: detect domains from title + abstract + description, merge the
    user-supplied features with automatically extracted ones, look up similar
    projects, and summarise how the project compares to them.

    Args:
        title: Project title.
        description: Project description.
        abstract: Optional abstract, included in domain/feature detection.
        features: Optional user-supplied features (any iterable of strings).

    Returns:
        A dict with the keys ``project_title``, ``domain``, ``domains``,
        ``features``, ``similar_titles``, ``common_features``,
        ``unique_features``, ``architecture_hints``, ``originality_score`` and
        ``context_strength``.  When the similarity search fails or yields no
        usable rows, the similarity-derived fields keep their defaults (empty
        lists, ``originality_score`` 99.0, ``context_strength`` 0.0).
    """
    logger.info("Building project context")

    full_text = f"{title}. {abstract}. {description}"

    # Domain detection
    domains = detect_domains(full_text)
    main_domain = domains[0] if domains else "general"

    # Feature extraction: user features first so they win de-duplication order.
    auto_features = extract_features(full_text)
    user_features = clean_list(
        [*(features or []), *auto_features],
        MAX_FEATURES
    )

    # Similarity search is best-effort: a failure degrades to "no results".
    try:
        results = cached_similarity(title, description)
    except Exception as e:
        logger.warning("Similarity failed: %s", e)
        results = None

    # Defaults describe the "nothing comparable found" outcome.
    context = {
        "project_title": title,
        "domain": main_domain,
        "domains": domains,
        "features": user_features,
        "similar_titles": [],
        "common_features": [],
        "unique_features": user_features,
        "architecture_hints": build_architecture_hints(domains),
        "originality_score": 99.0,
        "context_strength": 0.0
    }

    # A frame carrying a "message" column is treated as "no results".
    if (
        not isinstance(results, pd.DataFrame)
        or len(results) == 0
        or "message" in results.columns
    ):
        return context

    common_features = extract_common_features(results)
    common_lookup = set(common_features)

    hybrid_scores = results.get("hybrid_score", pd.Series([0]))
    context_strength = float(hybrid_scores.mean())

    # An all-NaN score column has a NaN mean; report it as "no similarity"
    # instead of leaking NaN into the payload and the originality curve.
    if pd.isna(context_strength):
        context_strength = 0.0

    context.update(
        similar_titles=extract_titles(results),
        common_features=common_features,
        unique_features=[
            feat for feat in user_features if feat not in common_lookup
        ],
        originality_score=calibrate_originality(context_strength),
        context_strength=round(context_strength, 4)
    )

    return context

def calibrate_originality(similarity: float) -> float:
    """
    Piecewise linear calibration curve mapping database similarity to originality percentage.
    - S <= 0.45: maps linearly to O in [85.0%, 99.0%]
    - S > 0.45: maps linearly to O in [5.0%, 85.0%]

    The similarity is clamped to [0.0, 1.0] before mapping.  A NaN similarity
    (e.g. the mean of an all-NaN score column) is treated as 0.0, which gives
    99.0 - the same value build_project_context reports when no similarity
    results are available.  Without that guard ``min(1.0, nan)`` evaluates to
    1.0 and an unknown score would be reported as the lowest originality.

    Returns the originality percentage rounded to two decimals.
    """
    import math

    s = float(similarity)
    if math.isnan(s):
        s = 0.0
    s = max(0.0, min(1.0, s))

    if s <= 0.45:
        originality = 99.0 - (s / 0.45) * 14.0
    else:
        originality = 85.0 - ((s - 0.45) / 0.55) * 80.0
    return round(originality, 2)




def build_domain_context(
    domain: str
) -> Dict[str, Any]:
    """Build the recommendation context for a whole domain.

    The user-supplied *domain* is resolved with ``extract_domain``.  When it
    cannot be resolved, or resolves to the catch-all "Others", the normalized
    raw input is used as a custom domain instead.  The resulting label is then
    used as both title and description for the similarity search.

    Args:
        domain: Free-form domain text.  ``None`` is treated as an empty string
            rather than becoming the literal label ``"none"``.

    Returns:
        A dict with the keys ``domain``, ``existing_titles``,
        ``common_features``, ``architecture_hints`` and ``context_strength``.
        When the similarity search fails or yields no usable rows, the lists
        are empty and ``context_strength`` is 0.0.
    """
    extracted = extract_domain(domain)

    if extracted and extracted.lower() != "others":
        domain_clean = extracted
    else:
        logger.info(
            "[DOMAIN INFO] Using custom dynamic domain: %s", domain
        )
        domain_clean = normalize(domain or "")

    logger.info("Building domain context: %s", domain_clean)

    # Similarity search is best-effort: a failure degrades to "no results".
    try:
        results = cached_similarity(domain_clean, domain_clean)
    except Exception as e:
        logger.warning("Domain similarity failed: %s", e)
        results = None

    # Defaults describe the "nothing found for this domain" outcome.
    context = {
        "domain": domain_clean,
        "existing_titles": [],
        "common_features": [],
        "architecture_hints": build_architecture_hints([domain_clean]),
        "context_strength": 0.0
    }

    # A frame carrying a "message" column is treated as "no results".
    if (
        not isinstance(results, pd.DataFrame)
        or len(results) == 0
        or "message" in results.columns
    ):
        return context

    hybrid_scores = results.get("hybrid_score", pd.Series([0]))
    context_strength = float(hybrid_scores.mean())

    # An all-NaN score column has a NaN mean; report it as 0.0 rather than
    # leaking NaN into the payload.
    if pd.isna(context_strength):
        context_strength = 0.0

    context.update(
        existing_titles=extract_titles(results),
        common_features=extract_common_features(results),
        context_strength=round(context_strength, 4)
    )

    return context