Spaces:
Runtime error
Runtime error
| from __future__ import annotations | |
| from typing import List, Tuple | |
| import re | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
def normalize_whitespace(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and strip the ends."""
    # str.split() with no argument splits on arbitrary whitespace runs and
    # discards empty fields, so re-joining with single spaces is equivalent
    # to the regex substitute-then-strip approach.
    return " ".join(text.split())
def extract_keywords_from_text(text: str, top_k: int = 30) -> List[str]:
    """Extract up to ``top_k`` keyword phrases from *text* via TF-IDF.

    Scores word 1- to 3-grams (English stop words removed, vocabulary capped
    at 5000 features) over the single document, ranks by TF-IDF score, and
    returns whitespace-normalized, order-preserving-deduplicated candidates
    longer than 2 characters.

    Args:
        text: Input document. Empty input yields an empty list.
        top_k: Maximum number of ranked candidates considered before
            filtering and deduplication.

    Returns:
        Keyword strings, best-scoring first.
    """
    if not text:
        return []
    vectorizer = TfidfVectorizer(
        analyzer="word",
        ngram_range=(1, 3),
        stop_words="english",
        max_features=5000,
    )
    try:
        tfidf = vectorizer.fit_transform([text])
    except ValueError:
        # fit_transform raises ValueError("empty vocabulary ...") when the
        # text contains only stop words, punctuation, or whitespace; treat
        # that as "no keywords" instead of crashing.
        return []
    feature_array = vectorizer.get_feature_names_out()
    scores = tfidf.toarray()[0]
    pairs: List[Tuple[str, float]] = list(zip(feature_array, scores))
    pairs.sort(key=lambda p: p[1], reverse=True)
    # Keep top candidates, drop very short tokens, and tidy whitespace.
    keywords = [normalize_whitespace(k) for k, _ in pairs[:top_k] if len(k) > 2]
    # Deduplicate while preserving rank order.
    seen = set()
    deduped = []
    for k in keywords:
        if k not in seen:
            seen.add(k)
            deduped.append(k)
    return deduped
def clamp_to_char_limit(text: str, max_chars: int) -> str:
    """Trim *text* to at most ``max_chars`` characters, preferring to cut at
    a newline boundary near the limit.

    Args:
        text: Text to clamp; leading/trailing whitespace is stripped first.
        max_chars: Maximum number of characters to keep.

    Returns:
        The stripped text unchanged if it already fits; otherwise a prefix of
        at most ``max_chars`` characters ending in a single trailing newline.
    """
    text = text.strip()
    if len(text) <= max_chars:
        return text
    cut = text[:max_chars]
    last_nl = cut.rfind("\n")
    # Cut at the last newline only if one actually exists AND it falls within
    # the final 500 characters before the limit. The explicit -1 check fixes
    # a bug: rfind returns -1 when no newline is present, and for
    # max_chars < 501 the old `last_nl > max_chars - 500` test treated -1 as
    # a valid index, silently dropping the final character via cut[:-1].
    if last_nl != -1 and last_nl > max_chars - 500:
        return cut[:last_nl].rstrip() + "\n"
    # Fallback: hard cut at the character limit.
    return cut.rstrip() + "\n"