| import re |
| from typing import List |
|
|
# Lower-case phrases that mark an item as boilerplate. `is_generic` looks for
# each phrase as a substring of the normalized item text, so entries must be
# lower-case and free of punctuation.
GENERIC_PATTERNS = [
    # Stock screens / account plumbing
    "dashboard",
    "login",
    "signup",
    "authentication",
    "admin panel",
    # Catch-all "<something> system/platform" phrases
    "analytics system",
    "analytics platform",
    "management system",
    "tracking system",
    "monitoring system",
    "ai module",
    "smart system",
    # Delivery channels rather than features
    "web platform",
    "mobile app",
    "website",
    # Stock pages
    "reports page",
    "user management",
]
|
|
# Lower-case lead-ins that mark a line as list preamble rather than an item.
# `is_low_quality` rejects any normalized text that starts with one of these.
BAD_STARTS = [
    "here are",
    "below are",
    "these are",
    "the following",
    "project ideas",
    "features include",
]
|
|
# Filler nouns. `is_low_quality` rejects an item when at least half of its
# words come from this list.
LOW_VALUE_WORDS = [
    "system",
    "platform",
    "application",
    "website",
    "solution",
]
|
|
def clean_text(text: str) -> str:
    """Strip list/markdown decoration from a single generated item.

    The steps run in this order:

    1. Leading numbering (``1.``, ``2)``, ``3 -``) and bullet glyphs.
    2. Bold markers (``**``) and straight quotes.
    3. Parenthetical asides.
    4. A leading ``assistant:`` / ``bot:`` speaker label.
    5. For items longer than six words, everything after the first colon
       (keeps the "title" of a ``title: description`` line).
    6. Whitespace runs are collapsed, then trailing punctuation is dropped.

    Args:
        text: Raw item text. Falsy values (``None``, ``""``) are accepted.

    Returns:
        The cleaned item, or ``""`` when the input is falsy.
    """
    if not text:
        return ""

    text = str(text).strip()

    # Leading list decoration: "1. ", "2) ", "- ", "* ", "• ", ...
    text = re.sub(r"^\d+[\)\.\-\s]+", "", text)
    text = re.sub(r"^[\-\*\•\→\▪\s]+", "", text)

    # Inline markup and quoting.
    text = text.replace("**", "")
    text = text.replace('"', "").replace("'", "")

    # Parenthetical asides.
    text = re.sub(r"\(.*?\)", "", text)

    # The speaker label must go BEFORE the colon split below; otherwise
    # "Assistant: <long idea>" is truncated to just "Assistant".
    text = re.sub(r"^\s*(assistant|bot)\s*[:\-]\s*", "", text, flags=re.I)

    # "Title: long description ..." -> keep only the title part.
    if ":" in text and len(text.split()) > 6:
        text = text.split(":")[0]

    # Normalize whitespace first so that trailing punctuation left in front
    # of a removed parenthetical ("foo. (bar)") is actually at the end.
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"[.,\-:;\s]+$", "", text)

    return text
|
|
def normalize_key(text: str) -> str:
    """Return a comparison key for *text*.

    The key is lower-cased, keeps only ``a-z``, ``0-9`` and whitespace, and
    has every whitespace run collapsed to a single space with no leading or
    trailing space.
    """
    alnum_only = re.sub(r"[^a-z0-9\s]", "", text.lower())
    # split() with no argument drops leading/trailing whitespace and treats
    # each whitespace run as one separator.
    return " ".join(alnum_only.split())
|
|
def is_generic(text: str) -> bool:
    """Return True when *text* contains any phrase from GENERIC_PATTERNS.

    Matching is a plain substring test against the normalized form of
    *text* (see ``normalize_key``).
    """
    normalized = normalize_key(text)
    return any(phrase in normalized for phrase in GENERIC_PATTERNS)
|
|
def is_low_quality(text: str) -> bool:
    """Return True when *text* is too short, too long, preamble, or filler.

    An item is low quality when, after normalization, any of these hold:

    * it has fewer than 3 or more than 12 words;
    * it starts with one of BAD_STARTS;
    * at least half of its words are in LOW_VALUE_WORDS.
    """
    normalized = normalize_key(text)
    tokens = normalized.split()
    total = len(tokens)

    # Accept only 3..12 words inclusive.
    if not 3 <= total <= 12:
        return True

    # str.startswith accepts a tuple of candidate prefixes.
    if normalized.startswith(tuple(BAD_STARTS)):
        return True

    filler = [tok for tok in tokens if tok in LOW_VALUE_WORDS]
    return len(filler) >= total / 2
|
|
def is_valid_item(text: str) -> bool:
    """Return True when *text* is non-empty, not generic and not low quality.

    The checks short-circuit in that order, so ``is_generic`` and
    ``is_low_quality`` are never called with an empty value.
    """
    return bool(text) and not is_generic(text) and not is_low_quality(text)
|
|
def filter_items(items: List[str]) -> List[str]:
    """Clean, validate and de-duplicate *items*, preserving input order.

    Each item is passed through ``clean_text`` and ``is_valid_item``. A
    surviving item is dropped as a duplicate when its normalized key was
    already accepted, or when it shares enough words with an accepted key:
    the number of shared distinct words must be at least
    ``max(2, min(word_count_a, word_count_b) - 1)``.

    Returns:
        The cleaned text of every accepted item, first occurrence wins.
    """
    kept: List[str] = []
    # normalized key -> (word count, set of distinct words) of accepted items
    accepted = {}

    for raw in items:
        candidate = clean_text(raw)
        if not candidate or not is_valid_item(candidate):
            continue

        key = normalize_key(candidate)
        if key in accepted:
            continue

        words = key.split()
        word_count = len(words)
        vocab = set(words)

        near_duplicate = any(
            len(vocab & other_vocab) >= max(2, min(word_count, other_count) - 1)
            for other_count, other_vocab in accepted.values()
        )
        if near_duplicate:
            continue

        accepted[key] = (word_count, vocab)
        kept.append(candidate)

    return kept
|
|
def smart_split(text: str) -> List[str]:
    """Split raw generated text into candidate list items.

    The text is broken on line endings, and each line is further broken on
    inline list numbering (``1.``, ``2)``), so ``"1. Foo 2. Bar"`` yields two
    items. One leading bullet glyph (``-``, ``•``, ``▪``, ``*``) is removed
    from each piece, and empty pieces are dropped.

    A numbering marker is only recognised at the start of a line or after
    whitespace, and only when the ``.``/``)`` is not followed by another
    digit. This keeps decimals and version numbers such as ``2.0`` or
    ``3.11`` intact instead of splitting the item in the middle.

    Args:
        text: Raw multi-line text. Falsy values return an empty list.

    Returns:
        The non-empty pieces in their original order.
    """
    if not text:
        return []

    # (?:^|\s)  marker starts the line or follows whitespace
    # (?!\d)    "2.0" / "3.11" are numbers, not list markers
    marker = r"(?:^|\s)\d+[.)](?!\d)\s*"

    pieces: List[str] = []
    for line in text.replace("\r", "\n").split("\n"):
        line = line.strip()
        if not line:
            continue

        for part in re.split(marker, line):
            part = re.sub(r"^[-•▪*]\s*", "", part.strip()).strip()
            if part:
                pieces.append(part)

    return pieces
|
|
def validate_generated_list(text: str, top_k: int = 10) -> List[str]:
    """Turn raw generated text into at most *top_k* cleaned, unique items.

    The text is split with ``smart_split``, filtered with ``filter_items``,
    and the result is truncated with ``[:top_k]``. Falsy *text* returns an
    empty list without calling either helper.
    """
    return filter_items(smart_split(text))[:top_k] if text else []
|
|