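# NOTE: the next six lines appear to be web-page residue captured with the
# file (author, commit, UI controls, size); kept as comments so the module parses.
# bat-6's picture
# update
# 6459fd1
# Raw
# History Blame Contribute Delete
# 3.81 kB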
import re
from typing import List
# Phrases considered too generic to be useful. is_generic() rejects any item
# whose normalized text contains one of these as a substring.
GENERIC_PATTERNS = [
    "dashboard",
    "login",
    "signup",
    "authentication",
    "admin panel",
    "analytics system",
    "analytics platform",
    "management system",
    "tracking system",
    "monitoring system",
    "ai module",
    "smart system",
    "web platform",
    "mobile app",
    "website",
    "reports page",
    "user management",
]
# Openers typical of introductory sentences rather than list items.
# is_low_quality() rejects normalized text that starts with any of these.
BAD_STARTS = [
    "here are",
    "below are",
    "these are",
    "the following",
    "project ideas",
    "features include",
]
# Filler nouns that carry little meaning. is_low_quality() rejects an item
# when at least half of its words come from this list.
LOW_VALUE_WORDS = [
    "system",
    "platform",
    "application",
    "website",
    "solution",
]
def clean_text(text: str) -> str:
    """Strip list and markdown decoration from a single generated line.

    Steps, in order: trim; drop a leading enumeration marker and bullet
    glyphs; remove ``**`` and quote characters; remove parenthesised
    asides; drop a leading ``assistant``/``bot`` speaker label; keep only
    the part before the first colon when the line is longer than six
    words; strip trailing punctuation; collapse whitespace.

    Args:
        text: Raw line. Falsy values (``None``, ``""``) yield ``""``;
            other non-string values are converted with ``str()``.

    Returns:
        The cleaned text, possibly empty.
    """
    if not text:
        return ""

    text = str(text).strip()

    # Leading enumeration ("1.", "2)", "3 -") followed by bullet glyphs.
    text = re.sub(r"^\d+[\)\.\-\s]+", "", text)
    text = re.sub(r"^[\-\*\•\→\▪\s]+", "", text)

    # Markdown bold markers and quote characters.
    text = text.replace("**", "")
    text = text.replace('"', "").replace("'", "")

    # Parenthesised asides.
    text = re.sub(r"\(.*?\)", "", text)

    # The speaker label must be removed BEFORE the colon truncation below;
    # otherwise "Assistant: <long title>" is cut down to just "Assistant".
    text = re.sub(r"^\s*(assistant|bot)\s*[:\-]\s*", "", text, flags=re.I)

    # "Title: long description ..." -> keep only the title.
    if ":" in text and len(text.split()) > 6:
        text = text.split(":")[0]

    # Trailing punctuation, including any whitespace mixed into it, so that
    # "Foo. " (left over after removing an aside) loses its period too.
    text = re.sub(r"[\s.,\-:;]+$", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
def normalize_key(text: str) -> str:
    """Return a comparison key for *text*.

    The key is lower-cased, keeps only ``a-z``, ``0-9`` and whitespace, and
    has every whitespace run collapsed to a single space with no leading or
    trailing space.
    """
    lowered = text.lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", "", lowered)
    # split()/join collapses whitespace runs and trims both ends in one step.
    return " ".join(alnum_only.split())
def is_generic(text: str) -> bool:
    """Return True if the normalized text contains any GENERIC_PATTERNS entry.

    Matching is by plain substring on the output of normalize_key(), so a
    pattern also matches inside longer words (e.g. "website" in "websites").
    """
    normalized = normalize_key(text)
    return any(phrase in normalized for phrase in GENERIC_PATTERNS)
def is_low_quality(text: str) -> bool:
    """Return True if *text* fails the basic quality heuristics.

    Working on the normalize_key() form, an item is low quality when it:
      * has fewer than 3 or more than 12 words,
      * starts with one of BAD_STARTS, or
      * has at least half of its words in LOW_VALUE_WORDS.
    """
    normalized = normalize_key(text)
    tokens = normalized.split()
    total = len(tokens)

    if not 3 <= total <= 12:
        return True

    for opener in BAD_STARTS:
        if normalized.startswith(opener):
            return True

    filler = [token for token in tokens if token in LOW_VALUE_WORDS]
    return len(filler) >= total / 2
def is_valid_item(text: str) -> bool:
    """Return True when *text* is non-empty, not generic and not low quality."""
    return bool(text) and not is_generic(text) and not is_low_quality(text)
def filter_items(items: List[str]) -> List[str]:
    """Clean, validate and de-duplicate *items*, preserving input order.

    Each item goes through clean_text() and is_valid_item(). Survivors are
    dropped when their normalize_key() equals an already accepted key, or
    when they are a near-duplicate of one: the number of shared distinct
    words is at least ``max(2, shorter_word_count - 1)``.

    Returns:
        The cleaned texts that were accepted, first occurrence wins.
    """
    kept = []
    # Accepted key -> (word count, set of distinct words), computed once
    # per key instead of on every comparison.
    accepted = {}

    for raw in items:
        candidate = clean_text(raw)
        if not candidate or not is_valid_item(candidate):
            continue

        key = normalize_key(candidate)
        if key in accepted:
            continue

        words = key.split()
        word_count = len(words)
        word_set = set(words)

        is_near_duplicate = any(
            len(word_set & other_set) >= max(2, min(word_count, other_count) - 1)
            for other_count, other_set in accepted.values()
        )
        if is_near_duplicate:
            continue

        accepted[key] = (word_count, word_set)
        kept.append(candidate)

    return kept
def smart_split(text: str) -> List[str]:
    """Split generated text into candidate list items.

    The text is split on line breaks, and each line is further split on
    inline enumeration markers such as ``1.`` or ``2)``. A single leading
    bullet (``-``, ``•``, ``▪``, ``*``) is removed from every piece and
    empty pieces are dropped.

    A digit run followed by ``.`` or ``)`` counts as an enumeration marker
    only when it is not glued to a preceding word character or dot and is
    not followed by another digit. This keeps decimals and version numbers
    ("Web 2.0", "v1.2") and words ending in digits ("IPv4.") intact.

    Args:
        text: Raw multi-line text; falsy input yields an empty list.

    Returns:
        The non-empty pieces in their original order.
    """
    if not text:
        return []

    # (?<![\w.]) : marker must not continue a word or a dotted number
    # (?!\d)     : "2." in "2.0" is a decimal point, not a list marker
    marker = re.compile(r"(?<![\w.])\d+[.)](?!\d)\s*")

    pieces = []
    for line in text.replace("\r", "\n").split("\n"):
        line = line.strip()
        if not line:
            continue
        for part in marker.split(line):
            part = part.strip()
            if not part:
                continue
            # Remove one leading bullet instead of splitting on hyphens,
            # so hyphenated words inside the item survive.
            part = re.sub(r"^[-•▪*]\s*", "", part).strip()
            if part:
                pieces.append(part)
    return pieces
def validate_generated_list(text: str, top_k: int = 10) -> List[str]:
    """Turn raw generated text into at most *top_k* clean list items.

    The text is split with smart_split(), passed through filter_items(),
    and the result is truncated with ``[:top_k]``. Falsy *text* yields an
    empty list.
    """
    if text:
        return filter_items(smart_split(text))[:top_k]
    return []