# Space: InnoTrack / Graduation_Project-v1.2 (status: Running)
# File size: 3,806 bytes

import re
from typing import List

# Phrases that mark an item as generic boilerplate. is_generic() checks each
# one as a substring of the normalized (lower-cased, punctuation-free) text.
GENERIC_PATTERNS = [
    # Account / access plumbing
    "dashboard", "login", "signup", "authentication", "admin panel",
    # "<something> system/platform" catch-alls
    "analytics system", "analytics platform", "management system",
    "tracking system", "monitoring system", "ai module", "smart system",
    # Delivery channel rather than an actual feature
    "web platform", "mobile app", "website",
    # Stock back-office pages
    "reports page", "user management",
]

# Lead-in phrases of introductory sentences. is_low_quality() rejects any
# item whose normalized text begins with one of these.
BAD_STARTS = [
    "here are", "below are", "these are",
    "the following", "project ideas", "features include",
]

# Filler nouns. is_low_quality() counts how many words of an item appear
# here and rejects the item when they make up at least half of it.
LOW_VALUE_WORDS = ["system", "platform", "application", "website", "solution"]

def clean_text(text: str) -> str:
    """Reduce one raw list item to a bare, title-like phrase.

    The following are removed, in this order: a leading enumeration
    ("1. ", "2) "), leading bullet glyphs, markdown bold markers, single and
    double quotes, parenthesised asides, a leading "assistant"/"bot" speaker
    prefix, everything after the first colon when the item is longer than six
    words, and trailing punctuation. Whitespace runs are collapsed last.

    Args:
        text: Raw item. Falsy values yield an empty string; other values are
            converted with ``str()``.

    Returns:
        The cleaned phrase, possibly empty.
    """
    if not text:
        return ""

    text = str(text).strip()

    # Leading enumeration such as "1. ", "2) " or "3 - ".
    text = re.sub(r"^\d+[\)\.\-\s]+", "", text)

    # Leading bullet glyphs (and any whitespace mixed in with them).
    text = re.sub(r"^[\-\*\•\→\▪\s]+", "", text)

    # Markdown bold markers.
    text = text.replace("**", "")

    # Both kinds of quote characters.
    text = text.replace('"', "").replace("'", "")

    # Parenthesised asides (non-greedy, so each pair is removed separately).
    text = re.sub(r"\(.*?\)", "", text)

    # The speaker prefix has to go BEFORE the colon truncation below:
    # otherwise "Assistant: <long item>" is cut down to just "Assistant".
    text = re.sub(r"^(assistant|bot)\s*[:\-]\s*", "", text, flags=re.I)

    # Long "Title: description" items keep only the title part.
    if ":" in text and len(text.split()) > 6:
        text = text.split(":")[0]

    # Trailing punctuation. Whitespace is part of the class so punctuation
    # followed by a space (e.g. left behind by a removed parenthetical) is
    # still stripped.
    text = re.sub(r"[\s.,\-:;]+$", "", text)

    # Collapse internal whitespace runs.
    text = re.sub(r"\s+", " ", text).strip()

    return text

def normalize_key(text: str) -> str:
    """Build a comparison key: lower-case ASCII letters, digits and single spaces.

    Every character other than a-z, 0-9 and whitespace is deleted (not
    replaced by a space), then whitespace runs are collapsed and the ends
    trimmed.
    """
    alnum_only = re.sub(r"[^a-z0-9\s]", "", text.lower())
    # split()/join collapses whitespace runs and trims both ends in one go.
    return " ".join(alnum_only.split())

def is_generic(text: str) -> bool:
    """Return True when the normalized text contains any generic phrase.

    Matching is by substring against GENERIC_PATTERNS, so a pattern may match
    inside a longer word or phrase.
    """
    key = normalize_key(text)
    return any(phrase in key for phrase in GENERIC_PATTERNS)

def is_low_quality(text: str) -> bool:
    """Return True when an item is too short, too long, introductory or filler.

    An item is rejected when its normalized form has fewer than 3 or more
    than 12 words, starts with one of BAD_STARTS, or has at least half of its
    words in LOW_VALUE_WORDS.
    """
    key = normalize_key(text)
    tokens = key.split()
    total = len(tokens)

    # Length window: 3 to 12 words inclusive.
    if total < 3 or total > 12:
        return True

    # Introductory sentence rather than an actual item.
    if key.startswith(tuple(BAD_STARTS)):
        return True

    # Mostly filler nouns.
    filler = len([token for token in tokens if token in LOW_VALUE_WORDS])
    return filler >= total / 2

def is_valid_item(text: str) -> bool:
    """Return True for a non-empty item that is neither generic nor low quality."""
    # Short-circuits in the same order as the individual checks: emptiness,
    # then the generic-phrase test, then the quality heuristics.
    return bool(text) and not is_generic(text) and not is_low_quality(text)

def filter_items(items: List[str]) -> List[str]:
    """Clean, validate and de-duplicate items, preserving input order.

    Each item is cleaned with clean_text() and dropped when it is empty or
    fails is_valid_item(). Survivors are compared by their normalize_key()
    form: exact key repeats are dropped, and so are near-duplicates that
    share enough words with an already accepted key.

    Returns:
        The cleaned text of every accepted item, first occurrence wins.
    """
    accepted: List[str] = []
    accepted_keys = set()

    def _overlaps_accepted(candidate: str) -> bool:
        # Near-duplicate rule: the number of distinct shared words must reach
        # max(2, shorter word count - 1).
        cand_words = candidate.split()
        cand_vocab = set(cand_words)
        for prior in accepted_keys:
            prior_words = prior.split()
            shared = cand_vocab & set(prior_words)
            needed = max(2, min(len(cand_words), len(prior_words)) - 1)
            if len(shared) >= needed:
                return True
        return False

    for raw in items:
        cleaned = clean_text(raw)
        if not cleaned or not is_valid_item(cleaned):
            continue

        key = normalize_key(cleaned)
        if key in accepted_keys or _overlaps_accepted(key):
            continue

        accepted_keys.add(key)
        accepted.append(cleaned)

    return accepted

def smart_split(text: str) -> List[str]:
    """Split raw generated text into candidate list items.

    The text is broken on line endings first, then each line is broken on
    inline numbering markers ("1." / "2)"), so "1. Foo 2. Bar" on a single
    line yields two items. One leading bullet character is removed from each
    piece. Empty pieces are discarded.

    A number only counts as a marker when it is not glued to a preceding word
    character or dot and is not followed by another digit. That keeps version
    numbers and decimals such as "v2.0" or "Python 3.10" in one piece.

    Args:
        text: Raw multi-line text. Falsy values yield an empty list.

    Returns:
        Non-empty, stripped pieces in order of appearance.
    """
    if not text:
        return []

    pieces: List[str] = []

    # Treat bare carriage returns as line breaks too.
    for line in text.replace("\r", "\n").split("\n"):
        line = line.strip()
        if not line:
            continue

        # (?<![\w.]) -> marker may not continue a word/number ("v2.", ".10")
        # (?!\d)     -> marker may not be the integer part of a decimal ("3.1")
        for part in re.split(r"(?<![\w.])\d+[.)](?!\d)\s*", line):
            part = part.strip()
            if not part:
                continue

            # Remove leading bullets or hyphens instead of splitting on them.
            part = re.sub(r"^[-•▪*]\s*", "", part).strip()
            if part:
                pieces.append(part)

    return pieces

def validate_generated_list(
    text: str,
    top_k: int = 10
) -> List[str]:
    """Turn raw generated text into at most ``top_k`` cleaned, unique items.

    The text is split with smart_split(), filtered with filter_items(), and
    the result is truncated by slicing with ``top_k``. Falsy text yields an
    empty list.
    """
    if text:
        return filter_items(smart_split(text))[:top_k]
    return []