Ringkas-In / src /core /keyword_extractor.py
anthonysigid's picture
deploy SummAIrizer apps to spaces
2a16478
Raw
History Blame Contribute Delete
2.14 kB
from typing import List, Tuple
from keybert import KeyBERT
class KeywordExtractor:
    """Best-effort keyword/keyphrase extractor backed by KeyBERT.

    The wrapper never raises from its public methods: if the model cannot be
    loaded, ``kw_model`` is set to ``None`` and every extraction call returns
    an empty list; if KeyBERT fails at extraction time, the error is printed
    and an empty list is returned.
    """

    # Multilingual model used when the caller does not pick one; the original
    # author notes it works well for Indonesian text.
    DEFAULT_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"

    def __init__(self, model_name: str = DEFAULT_MODEL_NAME):
        """Load the KeyBERT model.

        Args:
            model_name: Name of the embedding model handed to ``KeyBERT``.
                Defaults to a lightweight multilingual model.
        """
        self.model_name = model_name
        print(f"Loading KeyBERT model: {model_name}...")
        try:
            self.kw_model = KeyBERT(model=model_name)
        except Exception as e:
            # Deliberate best-effort: a missing/broken model must not crash
            # the application, it only disables keyword extraction.
            print(f"Error loading KeyBERT: {e}")
            self.kw_model = None

    def extract_keywords(self, text: str, top_n: int = 5) -> List[str]:
        """Extract keywords from ``text`` and return them without scores.

        Args:
            text: Document to analyse.
            top_n: Maximum number of keywords requested from KeyBERT.

        Returns:
            The keyword strings in the order KeyBERT returned them, or an
            empty list when ``text`` is empty, the model is unavailable, or
            extraction fails.
        """
        # Delegate so both public methods share one KeyBERT call and one
        # error-handling policy.
        scored = self.extract_with_scores(text, top_n=top_n)
        return [pair[0] for pair in scored]

    def extract_with_scores(self, text: str, top_n: int = 5) -> List[Tuple[str, float]]:
        """Extract keywords from ``text`` together with their scores.

        Args:
            text: Document to analyse.
            top_n: Maximum number of keywords requested from KeyBERT.

        Returns:
            A list of ``(keyword, score)`` tuples as produced by KeyBERT, or
            an empty list when ``text`` is empty, the model is unavailable,
            or extraction fails.
        """
        if not text or not self.kw_model:
            return []

        try:
            # Candidate phrases are 1- to 2-word n-grams; no stop-word list
            # is applied.
            return self.kw_model.extract_keywords(
                text,
                keyphrase_ngram_range=(1, 2),
                stop_words=None,
                top_n=top_n,
            )
        except Exception as e:
            # Same best-effort policy as extract_keywords: report and return
            # nothing instead of propagating the failure to the caller.
            print(f"Keyword extraction error: {e}")
            return []
# Example usage:
# extractor = KeywordExtractor()
# keywords = extractor.extract_keywords("Pemerintah mengalokasikan dana pendidikan sebesar 20 triliun rupiah.")
# print(keywords)