from __future__ import annotations

import re
from typing import List, Tuple

# scikit-learn is only needed by ``extract_keywords_from_text``. Guarding the
# import keeps the pure-stdlib helpers below importable without it; the
# original error is re-raised (chained) when keyword extraction is requested.
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
except ImportError as exc:
    TfidfVectorizer = None  # type: ignore[assignment,misc]
    _SKLEARN_IMPORT_ERROR = exc
else:
    _SKLEARN_IMPORT_ERROR = None


def normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace in *text* to one space and trim the ends."""
    return re.sub(r"\s+", " ", text).strip()


def extract_keywords_from_text(text: str, top_k: int = 30) -> List[str]:
    """Return up to *top_k* keyword phrases from *text*, best-scoring first.

    Candidates are English-stop-word-filtered word 1- to 3-grams scored with
    TF-IDF. Only a single document is fitted, so the IDF factor is the same
    for every term and the ranking is driven by term frequency.

    Args:
        text: Source text. An empty string yields an empty list.
        top_k: Maximum number of keywords to return. Values <= 0 yield ``[]``.

    Returns:
        Whitespace-normalized, de-duplicated keywords longer than two
        characters, ordered by descending score.

    Raises:
        ImportError: If scikit-learn is not installed.
    """
    if not text or top_k <= 0:
        return []
    if TfidfVectorizer is None:
        raise ImportError(
            "scikit-learn is required for extract_keywords_from_text"
        ) from _SKLEARN_IMPORT_ERROR

    vectorizer = TfidfVectorizer(
        analyzer="word",
        ngram_range=(1, 3),
        stop_words="english",
        max_features=5000,
    )
    try:
        tfidf = vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when the text
        # holds nothing but whitespace, punctuation or stop words. Treat that
        # the same as empty input instead of crashing the caller.
        return []

    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf.toarray()[0]
    pairs: List[Tuple[str, float]] = list(zip(feature_names, scores))
    # Stable sort: equally scored terms keep the vectorizer's feature order.
    pairs.sort(key=lambda pair: pair[1], reverse=True)

    # Filter and de-duplicate *before* applying the top_k cut-off so that
    # discarded short terms do not eat into the requested number of keywords.
    seen = set()
    keywords: List[str] = []
    for term, _score in pairs:
        if len(term) <= 2:
            continue
        cleaned = normalize_whitespace(term)
        if cleaned in seen:
            continue
        seen.add(cleaned)
        keywords.append(cleaned)
        if len(keywords) == top_k:
            break
    return keywords


def clamp_to_char_limit(text: str, max_chars: int) -> str:
    """Trim *text* so the result never exceeds *max_chars* characters.

    The text is stripped first and returned unchanged if it already fits.
    Otherwise it is cut at the last newline inside the limit, provided that
    newline lies within the final 500 characters of the window; failing that
    it is cut hard. A truncated result always ends with a single ``"\\n"``,
    and that newline is counted against the limit.

    Args:
        text: Text to clamp.
        max_chars: Maximum length of the returned string. Values <= 0 yield
            an empty string.

    Returns:
        The stripped text, or a truncated version of at most *max_chars*
        characters.
    """
    text = text.strip()
    if len(text) <= max_chars:
        return text
    if max_chars <= 0:
        # Nothing fits, not even the trailing newline.
        return ""

    cut = text[:max_chars]
    last_nl = cut.rfind("\n")
    # ``rfind`` returns -1 when no newline exists; it must not be mistaken for
    # a valid cut position when ``max_chars`` is below 500.
    if last_nl != -1 and last_nl > max_chars - 500:  # avoid cutting too far back
        return cut[:last_nl].rstrip() + "\n"

    # Hard cut: reserve one character for the trailing newline so the result
    # stays within the limit.
    return cut[: max_chars - 1].rstrip() + "\n"