File size: 1,612 Bytes
7498f2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from __future__ import annotations
from typing import List, Tuple
import re

from sklearn.feature_extraction.text import TfidfVectorizer


def normalize_whitespace(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


def extract_keywords_from_text(text: str, top_k: int = 30) -> List[str]:
    """Extract up to *top_k* keyword phrases from *text* ranked by TF-IDF weight.

    Uses word uni/bi/tri-grams with English stopwords removed. Returns an
    empty list for empty input or when no vocabulary survives stopword and
    token filtering (e.g. text made entirely of stopwords).

    Args:
        text: Raw input text; whitespace and casing are handled by the vectorizer.
        top_k: Maximum number of keywords to return before length filtering.

    Returns:
        Deduplicated, order-preserving list of normalized keyword strings
        (each longer than 2 characters).
    """
    if not text:
        return []
    # Word-level TF-IDF with n-grams up to 3 to capture multi-word phrases.
    vectorizer = TfidfVectorizer(
        analyzer="word",
        ngram_range=(1, 3),
        stop_words="english",
        max_features=5000,
    )
    try:
        tfidf = vectorizer.fit_transform([text])
    except ValueError:
        # sklearn raises "empty vocabulary" when every token is a stopword
        # or too short — treat that as "no keywords" rather than crashing.
        return []
    feature_array = vectorizer.get_feature_names_out()
    scores = tfidf.toarray()[0]
    pairs: List[Tuple[str, float]] = list(zip(feature_array, scores))
    pairs.sort(key=lambda p: p[1], reverse=True)
    # Keep top-scoring terms, drop very short ones, and normalize whitespace.
    keywords = [normalize_whitespace(k) for k, _ in pairs[:top_k] if len(k) > 2]
    # dict.fromkeys deduplicates while preserving first-seen order.
    return list(dict.fromkeys(keywords))


def clamp_to_char_limit(text: str, max_chars: int) -> str:
    """Truncate *text* to at most *max_chars* characters, preferring a newline cut.

    If the text (after stripping) already fits, it is returned unchanged.
    Otherwise the text is cut at *max_chars*; when a newline exists within
    the last 500 characters of the cut, truncation happens there instead so
    a line is not split mid-way. Truncated results end with a single newline.

    NOTE(review): the appended "\n" can make the result max_chars + 1 long
    when the cut has no trailing whitespace — preserved from the original
    contract; callers appear to treat the limit as approximate.

    Args:
        text: Input text; leading/trailing whitespace is stripped first.
        max_chars: Character budget for the (stripped) text.

    Returns:
        The stripped text, possibly truncated with a trailing newline.
    """
    text = text.strip()
    if len(text) <= max_chars:
        return text
    cut = text[:max_chars]
    # Prefer cutting at the last newline, but only if one was actually found
    # (rfind returns -1 otherwise, which would silently drop a character)
    # and it is not more than 500 characters before the limit.
    last_nl = cut.rfind("\n")
    if last_nl != -1 and last_nl > max_chars - 500:
        return cut[:last_nl].rstrip() + "\n"
    # Fallback: hard cut at the limit.
    return cut.rstrip() + "\n"