# Initial deployment of Multi-Agent Job Application Assistant (commit 7498f2c)
from __future__ import annotations
from typing import List, Tuple
import re
from sklearn.feature_extraction.text import TfidfVectorizer
def normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace in *text* into a single space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def extract_keywords_from_text(text: str, top_k: int = 30) -> List[str]:
    """Extract up to *top_k* ranked keyword phrases from *text*.

    Runs TF-IDF over word uni/bi/tri-grams (English stop words removed) on the
    single document, ranks features by score, then cleans and deduplicates the
    results before returning the top phrases in rank order.

    Args:
        text: Source text; falsy input yields an empty list.
        top_k: Maximum number of keywords to return.

    Returns:
        Up to ``top_k`` normalized, deduplicated keyword strings,
        highest-scoring first.
    """
    if not text:
        return []
    # TF-IDF over word n-grams (1-3) to capture multi-word phrases.
    # NOTE: with only one document, IDF is constant, so the ranking reduces to
    # normalized term frequency — adequate for lightweight keyword picking.
    vectorizer = TfidfVectorizer(
        analyzer="word",
        ngram_range=(1, 3),
        stop_words="english",
        max_features=5000,
    )
    tfidf = vectorizer.fit_transform([text])
    feature_array = vectorizer.get_feature_names_out()
    scores = tfidf.toarray()[0]
    pairs: List[Tuple[str, float]] = sorted(
        zip(feature_array, scores), key=lambda p: p[1], reverse=True
    )
    # Clean and deduplicate BEFORE slicing so we can still return a full
    # top_k even when short or duplicate features are filtered out (the
    # original sliced first and could silently come up short).
    seen = set()
    deduped: List[str] = []
    for raw, _score in pairs:
        keyword = normalize_whitespace(raw)
        if len(keyword) > 2 and keyword not in seen:
            seen.add(keyword)
            deduped.append(keyword)
        if len(deduped) == top_k:
            break
    return deduped
def clamp_to_char_limit(text: str, max_chars: int) -> str:
    """Trim *text* so the result never exceeds *max_chars* characters.

    Text already within the limit is returned stripped and unchanged.
    Otherwise the text is cut, preferring the last newline boundary that is
    within 500 characters of the limit, and a trailing newline is appended.

    Args:
        text: The text to clamp; leading/trailing whitespace is stripped.
        max_chars: Maximum allowed length of the returned string.

    Returns:
        A string of at most ``max_chars`` characters.
    """
    text = text.strip()
    if len(text) <= max_chars:
        return text
    # Reserve one character for the trailing "\n" so the result stays within
    # the limit (the original cut at max_chars and then appended "\n",
    # producing max_chars + 1 characters).
    cut = text[: max_chars - 1]
    # Prefer cutting at the last newline, but only if one exists and it is
    # not unreasonably far back (within 500 chars of the limit). Guarding
    # against rfind's -1 avoids accidentally taking this branch via
    # cut[:-1] when no newline is present and max_chars < 499.
    last_nl = cut.rfind("\n")
    if last_nl != -1 and last_nl > max_chars - 500:
        return cut[:last_nl].rstrip() + "\n"
    # Fallback: hard cut.
    return cut.rstrip() + "\n"