Spaces:
Paused
Paused
import spacy
import pytextrank
from spacy.tokens import Span
# Register a scrubber under the name referenced by the "textrank" config
# ({"@misc": "plural_scrubber"}); without this registration spaCy cannot
# resolve the scrubber and add_pipe fails.
@spacy.registry.misc("plural_scrubber")
def plural_scrubber():
    """Return a scrubber callable for pytextrank's TextRank component.

    The returned function maps a phrase Span to its lemma text, so plural
    and singular variants of the same phrase collapse into one key term.
    """
    def scrubber_func(span: Span) -> str:
        return span.lemma_
    return scrubber_func
def model_selector(target_language: str):
    """Build a spaCy pipeline with a TextRank component for one language.

    Args:
        target_language: ISO 639-3-style code (e.g. "spa", "fra") — TODO
            confirm code scheme against callers. Any unsupported code
            falls back to the large English model.

    Returns:
        A spaCy Language pipeline with the "textrank" component added.
    """
    # Subset of supported non-English models.
    language_model = {
        "spa": "es_core_news_sm",
        "fra": "fr_core_news_sm",
        "pol": "pl_core_news_sm",
        "deu": "de_core_news_sm",
        "ita": "it_core_news_sm",
        "por": "pt_core_news_sm",
        "nld": "nl_core_news_sm",
        "fin": "fi_core_news_sm",
        "ron": "ro_core_news_sm",
        "rus": "ru_core_news_sm",
    }
    # Single dict lookup with an English fallback replaces the
    # try/except-KeyError double lookup.
    nlp = spacy.load(language_model.get(target_language, "en_core_web_lg"))
    # Add TextRank to the pipeline: ignore the model's default stopwords
    # (when tagged NOUN) and lemmatize phrases via the registered scrubber.
    nlp.add_pipe("textrank", config={
        "stopwords": {token: ["NOUN"] for token in nlp.Defaults.stop_words},
        "scrubber": {"@misc": "plural_scrubber"},
    })
    return nlp
def extract_terms(text, target_language, length):
    """Extract the top-ranked key phrases from *text*.

    Args:
        text: Input text to analyse.
        target_language: Language code forwarded to model_selector.
        length: Size measure of the source text; scales how many
            top-ranked phrases are kept.

    Returns:
        A list of unique phrase strings (1-3 items depending on length).
    """
    nlp = model_selector(target_language)
    # Perform fact extraction on overall summary and segment summaries.
    doc = nlp(text)
    # Scale the number of key terms with document length:
    # <100 -> 1, 100-299 -> 2, >=300 -> 3.
    # (Exhaustive if/elif/else replaces the original mixed elif/if chain.)
    if length < 100:
        top_n = 1
    elif length < 300:
        top_n = 2
    else:
        top_n = 3
    # Set comprehension deduplicates the top-ranked phrase texts.
    return list({phrase.text for phrase in doc._.phrases[:top_n]})