import re

from nltk.tokenize import word_tokenize

# Load the document
with open('document.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Preprocess: lowercase, tokenize, drop non-alphabetic tokens, then
# rejoin so multi-word phrases can still be matched against the text
# (word_tokenize may require a one-time nltk.download('punkt'))
tokens = word_tokenize(text.lower())
tokens = [t for t in tokens if t.isalpha()]
normalized_text = ' '.join(tokens)

# Define key words
key_words = ['chronic kidney disease', 'heart failure', 'cirrhosis',
             'ascites', 'ESRD', 'liver disease']

# Flag each term that appears as a whole word or phrase; the term is
# lowercased to match the normalized text, and re.escape guards
# against regex metacharacters in the terms
found_key_words = []
for key_word in key_words:
    pattern = re.compile(r'\b' + re.escape(key_word.lower()) + r'\b')
    if pattern.search(normalized_text):
        found_key_words.append(key_word)

# Return the list of key words found in the document
print(found_key_words)
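
# A minimal extension (an assumption, not part of the original task):
# count how often each term occurs instead of only flagging its
# presence, reusing key_words and normalized_text from above.
from collections import Counter

term_counts = Counter()
for key_word in key_words:
    pattern = re.compile(r'\b' + re.escape(key_word.lower()) + r'\b')
    term_counts[key_word] = len(pattern.findall(normalized_text))

print(term_counts.most_common())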