Create indexing_search.py
Browse files- indexing_search.py +22 -0
indexing_search.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def neighbor_keywords_with_threshold(text, threshold=8):
    """Locate status keywords that occur near each MMR/mismatch mention.

    The text is lower-cased and split on whitespace. Surrounding punctuation
    is stripped from every token before comparison, so mentions such as
    "(MMR)," or "intact." are recognised. Token positions are unaffected by
    the stripping: every whitespace-separated token keeps its index.

    Args:
        text: The free text to scan.
        threshold: Maximum distance, in tokens, between a target word and a
            keyword on either side for the keyword to be reported.

    Returns:
        A dict keyed by the token index of each target word found. Each value
        has the form ``{target: {"left": {...}, "right": {...}}}`` where the
        inner dicts map a keyword to its token index. If the same keyword
        occurs more than once on one side of a target, the occurrence with
        the highest index is the one retained.
    """
    import string

    keywords = frozenset(("no", "not", "normal", "intact", "deficient"))
    target_words = frozenset(("mmr", "mismatch"))

    # Strip punctuation glued to a token; a bare whitespace split would leave
    # "mismatch." or "(mmr)" unequal to the lookup words and drop the match.
    words = [token.strip(string.punctuation) for token in text.lower().split()]
    total = len(words)

    result = {}
    for i, word in enumerate(words):
        if word not in target_words:
            continue

        left, right = {}, {}
        result[i] = {word: {"left": left, "right": right}}

        # Clamp the window to the bounds of the token list.
        window_start = max(0, i - threshold)
        window_stop = min(i + threshold + 1, total)
        for j in range(window_start, window_stop):
            neighbor = words[j]
            if neighbor in keywords:
                side = left if j < i else right
                side[neighbor] = j

    return result
|
| 18 |
+
|
| 19 |
+
# Demo run: scan a sample pathology-style report and print the keyword
# positions found around each MMR/mismatch mention.
text = (
    "PMS-2 Positive Tumors displaying loss of any MMR protein are mismatch "
    "repair deficient and considered to be MSI-High (MSI-H), whereas those "
    "with intact MMR proteins are expected to be microsatellite stable (MSS) "
    "or MSI-low (MSI-L). $$ IHC shows normal expression of MLH-1, MSH-2, "
    "MSH-6, and PMS-2. $$ The results of the IC analysis suggest the presence "
    "of normal DNA mismatch $$ Positive MSH-6. Another MMR protein is also "
    "involved."
)

output = neighbor_keywords_with_threshold(text, 8)
print(output)
|