Spaces:
Paused
Paused
| ##################################################### | |
| ### DOCUMENT PROCESSOR [Keywords] | |
| ##################################################### | |
| ### Jonathan Wang | |
| # ABOUT: | |
| # This creates an app to chat with PDFs. | |
# This is the Keywords module,
# which creates keywords based on documents.
| ##################################################### | |
| ### TODO Board: | |
| # TODO(Jonathan Wang): Add Maximum marginal relevance to the merger for better keywords. | |
| # TODO(Jonathan Wang): create own version of Rake keywords | |
| ##################################################### | |
| ### PROGRAM SETTINGS | |
| ##################################################### | |
| ### PROGRAM IMPORTS | |
| from __future__ import annotations | |
| from typing import Any, Callable, Optional | |
| # Keywords | |
# from multi_rake import Rake  # removed because of compile issues and lack of maintenance
| import yake | |
| from llama_index.core.bridge.pydantic import Field | |
| from llama_index.core.schema import BaseNode | |
| # Own Modules | |
| from metadata_adder import MetadataAdder | |
| ##################################################### | |
| ### SCRIPT | |
def get_keywords(input_text: str, top_k: int | None = None) -> str:
    """
    Given a string, get its keywords using YAKE.
    (RAKE is currently disabled; see TODO board for the planned RAKE+YAKE
    Distribution Based Fusion merger.)

    Inputs:
        input_text (str): the input text to get keywords from.
        top_k (int | None): the maximum number of keywords to keep.
            None (the default) keeps all extracted keywords, matching the
            original single-argument behavior.
    Returns:
        str: The keywords, joined into a comma-separated string.
    """
    # NOTE(review): top_k was added because KeywordMetadataAdder invokes
    # keywords_function(text, num_keywords) with two positional arguments;
    # the previous one-argument signature raised TypeError in that path.

    # TODO(Jonathan Wang): re-add RAKE and merge with YAKE via
    # _merge_on_scores(...) once multi_rake's compile issues are resolved.

    # YAKE: extract up to 3-gram keywords, deduplicating near-identical ones.
    kw_extractor = yake.KeywordExtractor(lan="en", dedupLim=0.9, n=3)
    raw_keywords = kw_extractor.extract_keywords(input_text)

    # YAKE scores are "lower is better"; invert so higher is better,
    # then rank keywords best-first.
    scored = {keyword.lower(): 1 - score for keyword, score in raw_keywords}
    ranked = sorted(scored, key=lambda kw: scored[kw], reverse=True)

    if top_k is not None:
        ranked = ranked[:top_k]

    # kinda regretting forcing this into a string
    return ", ".join(ranked)
class KeywordMetadataAdder(MetadataAdder):
    """Adds keyword metadata to a document.

    Args:
        metadata_name: The name of the metadata to add to the document.
            Defaults to 'keyword_metadata'.
        keywords_function: A function for keywords, given a source string
            and the number of keywords to get.
        num_keywords: The number of keywords to extract. Defaults to 5.
    """

    # Contract: (text, num_keywords) -> comma-separated keyword string.
    keywords_function: Callable[[str, int], str] = Field(
        description="The function to use to extract keywords from the text. Input is string and number of keywords to extract. Output is string of keywords.",
        default=get_keywords,
    )
    num_keywords: int = Field(
        default=5,
        description="The number of keywords to extract from the text. Defaults to 5.",
    )

    def __init__(
        self,
        metadata_name: str = "keyword_metadata",
        # Annotation fixed to match the Field declaration and the actual
        # two-argument call in get_node_metadata (was Callable[[str], str]).
        keywords_function: Callable[[str, int], str] = get_keywords,
        num_keywords: int = 5,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        super().__init__(
            metadata_name=metadata_name,
            keywords_function=keywords_function,
            num_keywords=num_keywords,
            **kwargs,
        )  # ah yes i love oop :)

    @classmethod  # was missing: the method takes `cls`, so calling it on the class raised TypeError
    def class_name(cls) -> str:
        return "KeywordMetadataAdder"

    def get_node_metadata(self, node: BaseNode) -> str | None:
        """Return keywords for the node's text, or None if the node has no text."""
        if not hasattr(node, "text") or node.text is None:
            return None
        return self.keywords_function(node.get_content(), self.num_keywords)