# NOTE: this file was recovered from a web code-viewer export.
# Original metadata: commit 7498f2c, 1,612 bytes, 51 lines.
from __future__ import annotations
from typing import List, Tuple
import re
from sklearn.feature_extraction.text import TfidfVectorizer
def normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace to a single space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def extract_keywords_from_text(text: str, top_k: int = 30) -> List[str]:
    """Extract up to *top_k* keyword phrases from *text*, ranked by TF-IDF.

    Uses word 1- to 3-grams with English stop words removed. With a single
    document the IDF component is constant, so ranking is effectively by
    term frequency.

    Args:
        text: Raw input text; may be empty.
        top_k: Maximum number of keywords to return.

    Returns:
        Deduplicated, whitespace-normalized keywords in descending score
        order; empty list when *text* is empty or contains no usable tokens.
    """
    if not text:
        return []
    vectorizer = TfidfVectorizer(
        analyzer="word",
        ngram_range=(1, 3),
        stop_words="english",
        max_features=5000,
    )
    try:
        tfidf = vectorizer.fit_transform([text])
    except ValueError:
        # Raised ("empty vocabulary") when the text contains only stop
        # words, punctuation, or whitespace — treat as "no keywords"
        # rather than crashing.
        return []
    feature_array = vectorizer.get_feature_names_out()
    scores = tfidf.toarray()[0]
    pairs: List[Tuple[str, float]] = sorted(
        zip(feature_array, scores), key=lambda p: p[1], reverse=True
    )
    # Filter short tokens BEFORE taking top_k: the original sliced first
    # and filtered after, which could silently return fewer than top_k.
    keywords = [normalize_whitespace(k) for k, _ in pairs if len(k) > 2]
    # Deduplicate while preserving score order, stopping at top_k.
    seen = set()
    deduped: List[str] = []
    for k in keywords:
        if k not in seen:
            seen.add(k)
            deduped.append(k)
            if len(deduped) == top_k:
                break
    return deduped
def clamp_to_char_limit(text: str, max_chars: int) -> str:
    """Trim *text* to roughly *max_chars* characters, preferring a newline cut.

    The text is stripped first and returned unchanged if it already fits.
    Otherwise it is cut at the last newline within the limit, provided that
    newline lies no more than 500 characters back from the limit; failing
    that, it is hard-cut at the limit. Truncated output always ends with a
    single newline (which may make the result max_chars + 1 characters long,
    matching the original behavior).

    Args:
        text: The text to clamp.
        max_chars: Character budget for the (stripped) text.

    Returns:
        The possibly-truncated text.
    """
    text = text.strip()
    if len(text) <= max_chars:
        return text
    cut = text[:max_chars]
    last_nl = cut.rfind("\n")
    # Guard last_nl != -1: rfind returns -1 when no newline exists, and the
    # original compared -1 > max_chars - 500 (true for max_chars < 499),
    # then did cut[:-1] — silently dropping the final character.
    if last_nl != -1 and last_nl > max_chars - 500:  # avoid cutting too far back
        return cut[:last_nl].rstrip() + "\n"
    # Fallback: hard cut at the limit.
    return cut.rstrip() + "\n"