| from cid import CaseInsensitiveDict |
| import re |
|
|
|
|
|
|
| |
| |
# Term-substitution table consumed by map_terms(). Each non-blank line of the
# file is split on its first comma only, giving a (key, replacement) pair, so
# the replacement text may itself contain commas.
with open('utils/summarize_utils/map_terms.txt') as f:
    # Blank lines are skipped: they would split into a single element and
    # make dict() raise ValueError while the module is being imported.
    mt_dict = dict(
        line.rstrip().split(',', 1) for line in f if line.strip()
    )
|
|
| |
| |
| |
# Substitution table consumed by map_week_num(). Each non-blank line of the
# file is split on its first comma only, giving a (key, replacement) pair, so
# the replacement text may itself contain commas.
with open('utils/summarize_utils/map_nums.txt') as f:
    # Blank lines are skipped: they would split into a single element and
    # make dict() raise ValueError while the module is being imported.
    num_dict = dict(
        line.rstrip().split(',', 1) for line in f if line.strip()
    )
| |
| |
# Stop-word list: the whole file is decoded with the 'unicode_escape' codec
# and broken into individual words on any run of whitespace.
with open(
    'utils/summarize_utils/stopwords-en.txt',
    mode='r',
    encoding='unicode_escape',
) as f:
    stopwords = str.split(f.read())
| |
def get_first_word(alloc, masking, status):
    """Return the opening words of the summary sentence.

    The article is 'An ' when ``alloc`` is 'n/a' and ``masking`` is
    'none (open label)' (both compared case-insensitively), otherwise 'A '.
    For the statuses 'Active, not recruiting' and 'Recruiting' the article is
    lower-cased and prefixed with 'It is in '; for any other status the
    capitalised article is returned on its own.
    """
    print('Getting first word..')
    # `and` keeps the original evaluation order: masking is only inspected
    # when the allocation already compared equal to 'n/a'.
    needs_an = alloc.lower() == 'n/a' and masking.lower() == 'none (open label)'
    article = 'An ' if needs_an else 'A '
    if status in ('Active, not recruiting', 'Recruiting'):
        return 'It is in ' + article.lower()
    return article
|
|
| |
| |
def get_mask(masking):
    """Translate a masking value into the phrase used in the summary.

    'double', 'none (open label)' and 'quadruple' (compared
    case-insensitively, as whole values) become 'double-blind, ',
    'open-label, ' and 'quadruple-blind, '. Any other string is returned
    unchanged. A non-string value (for example None) yields None.
    """
    print('Getting mask..')
    if not isinstance(masking, str):
        # Previously reached through a swallowed exception; kept as an
        # explicit "no phrase available" result.
        return None

    phrases = {
        'double': 'double-blind, ',
        'none (open label)': 'open-label, ',
        # Exact match: a substring test against 'quadruple' also matched
        # the empty string and fragments such as 'quad'.
        'quadruple': 'quadruple-blind, ',
    }
    return phrases.get(masking.lower(), masking)
| |
| |
def get_stype(stype):
    """Return the study-type phrase for the summary.

    'interventional' (case-insensitive) gives 'interventional study ';
    every other value gives 'observational study '.
    """
    print('Getting study type...')
    is_interventional = stype.lower() == 'interventional'
    return 'interventional study ' if is_interventional else 'observational study '
| |
| |
def get_imodel(imodel):
    """Return the intervention model lower-cased and followed by ', '.

    Returns None when ``imodel`` is None. The previous check was made on
    ``imodel.lower()``, which can never be None and raised AttributeError
    for a None argument instead of reaching the fallback branch.
    """
    print('Getting imodel...')
    if imodel is None:
        return None
    return imodel.lower() + ', '
| |
| |
def get_obj(otitle, bsumm, ddesc):
    """Extract an objective sentence from the title, summary or description.

    The three texts are scanned for the keywords below and the first source
    with a hit wins, in the order ``otitle``, ``bsumm``, ``ddesc``:

    * ``otitle``: sentences (split on '. ') containing the first matching
      keyword are concatenated, the text from the keyword onwards is kept
      (minus the final character of that line), and the result is passed
      through ``non_abbr``.
    * ``bsumm`` / ``ddesc``: sentences containing the first matching keyword
      are joined (with ', ' for ``bsumm``, with '' for ``ddesc``) and passed
      through ``non_abbr``.

    Returns 'No Objective Found' when no keyword is present anywhere, and
    None when the title pattern cannot be matched.

    NOTE(review): keywords are looked up in the *lower-cased* summary and
    description, so capitalised keywords can never hit there, and the
    sentence filter then runs against the original-case text, which can
    leave the result empty. This is kept as-is; confirm the intended
    matching rules before changing it.
    """
    print('Getting objective for..')

    keywords = ['to Demonstrate',
                'to Evaluate',
                'to Investigate',
                'to Assess',
                'to Determine',

                'Placebo', 'Purpose', 'aim', 'purpose', 'main purpose',
                'Aim', 'Objective', 'objective', 'Main Objective', 'Selection Study',
                'Main Purpose', 'Main Aim', 'Study', 'STUDY', 'study',
                'Ascending Multiple-dose', 'Adaptive', 'Dose Escalation',
                'assess', 'Bioavailability', 'investigate', 'Investigating'
                ]

    otitle_result = [ele for ele in keywords if ele in otitle]
    print('otitle_result:', otitle_result)
    bsumm_result = [ele for ele in keywords if ele in bsumm.lower()]
    print('bsumm_result:', bsumm_result)
    ddesc_result = [ele for ele in keywords if ele in ddesc.lower()]
    print('ddesc_result:', ddesc_result)

    if otitle_result:
        print('im in otitle')
        word = otitle_result[0]
        print('word in otitle:', word)
        sobj = ''.join(
            sentence + '.' for sentence in otitle.split('. ') if word in sentence
        )
        print('matched sobj', sobj)

        # The keyword is escaped so it is always matched literally; the
        # trailing '.' deliberately stays a wildcard, as before.
        pattern = re.escape(word) + '(.*)' + '.'
        found = re.search(pattern, sobj)
        print('result of pattern search:', found)
        if found is None:
            # Only reachable when the keyword is followed directly by a line
            # break; this used to surface as a swallowed AttributeError.
            return None
        result = word + found.group(1)
        print('result group:', result)
        result = non_abbr(result)
        print('non-abbr result:', result)
        return result

    if bsumm_result:
        print('im in bsumm')
        word = bsumm_result[0]
        sobj = ', '.join(
            sentence + '.' for sentence in bsumm.split('. ') if word in sentence
        )
        return non_abbr(sobj)

    if ddesc_result:
        word = ddesc_result[0]
        sobj = ''.join(
            sentence + '.' for sentence in ddesc.split('. ') if word in sentence
        )
        return non_abbr(sobj)

    return 'No Objective Found'
| |
| |
| |
def get_osid(osid, sid):
    """Format the study identifiers as a parenthesised list.

    Both arguments are '|'-separated strings (or None).

    * both present: '(<osid parts joined by "; ">; <sid parts joined by ", ">) '
    * only one present: '(<parts joined by "; ">) '
    * both None: None
    * nothing usable but at least one empty string: ''

    An empty string is treated like a missing value, so an empty ``sid`` no
    longer drops the formatting of ``osid`` and an empty ``osid`` no longer
    produces output such as '(; X) ' or '() '.
    """
    print('Getting Study Ids...')
    if osid is None and sid is None:
        return None

    if osid and sid:
        return '(' + '; '.join(osid.split('|')) + '; ' + ', '.join(sid.split('|')) + ') '
    if osid:
        return '(' + '; '.join(osid.split('|')) + ') '
    if sid:
        return '(' + '; '.join(sid.split('|')) + ') '
    return ''
|
|
| |
| |
def join_and(items):
    """Join strings with ', ', placing ', and ' before the last one.

    With zero or one item the plain ', '-join is returned (an empty string
    or the single item).
    """
    if len(items) <= 1:
        return ', '.join(items)
    leading = ', '.join(items[:-1])
    return leading + ', and ' + items[-1]
| |
def get_locs(locations):
    """Build the ' in <locations>, ' fragment of the summary.

    A '|'-separated value is de-duplicated, sorted and combined with
    ``join_and``; a value without '|' is used as-is. An empty result is
    returned unchanged (no ' in ' prefix is added).
    """
    print('Getting Locations...')
    print(locations)
    print(len(locations))

    if locations == '':
        print('outside location split else:', locations)
        return locations

    print('location is not empty')
    if '|' in locations:
        distinct_sorted = sorted(set(locations.split('|')))
        res = join_and(distinct_sorted)
        print('inside location split if:', res)
    else:
        res = locations
        print('inside location split else:', res)

    # The joined value can still be empty (e.g. the input was just '|').
    if res != '':
        res = ' in ' + res + ', '
    return res
|
|
| |
| |
# Recruitment status -> phrase appended to the summary sentence.
status_dict = {'Not yet recruiting': ', is planned ',

               'Active, not recruiting': ' (enrollment complete) ',
               'Completed': ', is complete ',
               'Terminated': ', has been terminated',
               'Suspended': ', has been suspended',
               'Withdrawn': ', has been withdrawn'
               }


def get_status(status):
    """Return the summary phrase for a recruitment status.

    An exact key of ``status_dict`` returns its phrase. For backward
    compatibility a non-empty fragment that occurs in exactly one key still
    resolves to that key's phrase. Anything else (unknown, empty or
    ambiguous status) returns ''. A non-string status returns None.

    Previously an unknown status came back as the literal text '[]' (which
    remove_period_spaces strips again) and an empty status matched every
    key at once.
    """
    print('Getting trial type...')
    if not isinstance(status, str):
        return None

    if status in status_dict:
        return status_dict[status]

    if status:
        partial = [phrase for key, phrase in status_dict.items() if status in key]
        if len(partial) == 1:
            return partial[0]
    return ''
|
|
| |
| |
def non_abbr(string):
    """Lower-case every word that is not already fully upper-case.

    The text is split on single spaces; a token for which ``str.isupper()``
    is true (e.g. 'XYZ', 'COVID-19', 'A') is kept verbatim, every other
    token is lower-cased. Spacing is preserved because the tokens are joined
    with single spaces again.
    """
    return ' '.join(
        token if token.isupper() else token.lower()
        for token in string.split(' ')
    )
| |
| |
def get_ystop(ystop):
    """Return ', due to <reason>' for a non-empty stop reason.

    The reason is normalised with ``non_abbr`` first. An empty string
    yields None.
    """
    print('Getting ystop...')
    if ystop == '':
        return None
    reason = non_abbr(ystop)
    return ', ' + 'due to ' + reason
| |
| |
def get_age(minage, maxage):
    """Describe the eligible age range.

    With a non-empty ``maxage`` the result is
    'aged between <minage> and <maxage>'; otherwise it is
    'with minimum age of <minage>'.
    """
    if maxage == '':
        return 'with minimum age of ' + minage
    return 'aged between ' + minage + ' and ' + maxage
| |
|
|
| |
def get_url(nctid, lupd):
    """Return the trailing source reference for the summary.

    The result has the form
    ' (ClinicalTrials.gov, <lupd>, https://clinicaltrials.gov/ct2/show/<nctid>)'.
    """
    print('Cooking up final url...')
    base_url = 'https://clinicaltrials.gov/ct2/show/'
    pieces = (' (', 'ClinicalTrials.gov, ', lupd, ', ', base_url, nctid, ')')
    return ''.join(pieces)
| |
| |
def map_week_num(myText):
    """Replace every key of ``num_dict`` found in ``myText`` by its value.

    Keys are matched case-insensitively and only where they are not directly
    preceded or followed by a word character. The replacement is looked up
    through a ``CaseInsensitiveDict`` built from ``num_dict``, using the text
    exactly as it was matched.
    """
    lookup = CaseInsensitiveDict(num_dict)
    alternatives = '|'.join(re.escape(key) for key in lookup.keys())
    matcher = re.compile(r'(?<!\w)(' + alternatives + r')(?!\w)', flags=re.IGNORECASE)

    def _replacement(match):
        return lookup[match.group()]

    return matcher.sub(_replacement, myText)
| |
| |
def map_terms(myText):
    """Replace every key of ``mt_dict`` found in ``myText`` by its value.

    Keys are matched case-insensitively and only where they are not directly
    preceded or followed by a word character. The replacement is looked up
    through a ``CaseInsensitiveDict`` built from ``mt_dict``, using the text
    exactly as it was matched.
    """
    lookup = CaseInsensitiveDict(mt_dict)
    alternatives = '|'.join(re.escape(key) for key in lookup.keys())
    matcher = re.compile(r'(?<!\w)(' + alternatives + r')(?!\w)', flags=re.IGNORECASE)

    def _replacement(match):
        return lookup[match.group()]

    return matcher.sub(_replacement, myText)
| |
| |
def remove_period_spaces(text):
    """Apply the fixed clean-up substitutions to a generated summary.

    The substitutions run strictly in the order listed below (later ones
    operate on the output of earlier ones), then the first 'due to' becomes
    'because of', 'subjects' is reworded to 'participants' after 'male' /
    'female', and finally all whitespace runs are collapsed to single spaces
    with leading/trailing whitespace removed.
    """
    ordered_substitutions = (
        ('||', ''),
        ('Korea, Republic of', 'S Korea'),
        ('[]', ''),
        (', This', ', this'),
        (') The', ') the'),
        ('in The The', 'in the'),
        ('The', 'the'),
        ('the the', 'the'),
        ('this is a', ''),
        ('.,', ','),
        ('., ', ','),
        (',', ', '),
    )
    for old, new in ordered_substitutions:
        text = text.replace(old, new)

    # Only the first occurrence is reworded.
    text = text.replace("due to", "because of", 1)

    for old, new in (
        ("male subjects", "male participants"),
        ("female subjects", "female participants"),
    ):
        text = text.replace(old, new)

    return " ".join(text.split())
| |
| |
def unique_list(text_str):
    """Drop repeated whitespace-separated words, keeping first occurrences.

    The surviving words are returned in their original order, joined by
    single spaces.
    """
    # dict.fromkeys keeps insertion order, so the first sighting of each
    # word determines its position.
    first_seen = dict.fromkeys(text_str.split())
    return ' '.join(first_seen)
| |
| |
def repos_condition(my_string):
    """Move the 'with <condition>' phrase directly behind '<N> subjects '.

    The condition is the text following the first 'with', up to the first
    comma (or, when the line holds no comma after it, up to the first
    period). Every '<digits> subjects ' occurrence is replaced by the first
    such occurrence followed by 'with<condition> ', and the original
    'with<condition>,' phrase is then removed.

    Prints 'not found' and returns None when the text has no 'with' phrase
    or no '<digits> subjects ' occurrence (or is not a string).

    The condition text is now always handled literally: it used to be
    spliced into a regular expression unescaped, so conditions containing
    characters such as '(' , ')' or '+' were never removed.
    """
    if not isinstance(my_string, str):
        print("not found")
        return None

    with_phrase = re.search('with(.*),', my_string)
    if with_phrase:
        fs = with_phrase.group(1).split(',')[0]
    else:
        with_phrase = re.search('with(.*).', my_string)
        if with_phrase is None:
            print("not found")
            return None
        fs = with_phrase.group(1).split('.')[0]

    subject_count = re.search(r"\d+\s+subjects\s", my_string)
    if subject_count is None:
        print("not found")
        return None

    insertion = subject_count.group(0) + "with" + fs + " "
    # A callable replacement keeps backslashes in the condition text from
    # being interpreted as regex replacement escapes.
    moved = re.sub(r"\d+\s+subjects\s", lambda _match: insertion, my_string)

    result = moved.replace("with" + fs + ",", "")
    print("--------------")
    return result
|
|
| |
|
|
| |
def repos_study_design(text):
    """Move the design words between 'subjects' and 'study' before 'phase'.

    The text between the first 'subjects' and the last 'study' on the same
    line (matched case-insensitively) is removed together with the word
    'study' and re-inserted immediately before the first 'phase'
    (case-insensitive). The text is returned unchanged when either the
    'subjects ... study' span or the word 'phase' is missing. A non-string
    argument prints 'nothing happened' and returns None.

    The span is now located in the text itself rather than in a lower-cased
    copy, so spans containing upper-case letters are actually removed
    instead of being left in place with a lower-cased duplicate inserted.
    """
    if not isinstance(text, str):
        print("nothing happened")
        return None

    design = re.search('subjects(.*)study', text, flags=re.IGNORECASE)
    if design is None:
        return text

    descriptor = design.group(1)
    # The trailing 'study' with the casing it has in the text.
    study_word = design.group(0)[-len('study'):]
    newtext = text.replace(descriptor + study_word, '')

    phase = re.search('phase', newtext, flags=re.IGNORECASE)
    if phase is None:
        return text

    idx = phase.start()
    return newtext[:idx] + descriptor + newtext[idx:]
| |
| |
def purpose_issue(summary):
    """Flag summaries that contain one of the known problem phrases.

    Returns 'Yes - Grammar/Endpoint related Mistakes in Summary' when any of
    the phrases below occurs in ``summary``, otherwise 'No'.
    """
    flag_words = (
        'will also be evaluated',
        'will be evaluated',
        'No Objective Found',
        'subjects), is',
        'subjects, is complete',
    )
    for phrase in flag_words:
        if phrase in summary:
            return "Yes - Grammar/Endpoint related Mistakes in Summary"
    return "No"
| |
| |
def dupe_check(text, rr_value, stopwords=stopwords):
    """Flag a summary in which a non-stop-word appears more than once.

    The check only runs when ``rr_value`` is 'No'; any other value is
    returned untouched. Space-separated tokens whose lower-case form is a
    stop word are dropped, the remainder is re-split on whitespace, and an
    exact (case-sensitive) repeat among those words yields
    ' Yes - Duplicate Words maybe found in Summary'. Otherwise ``rr_value``
    is returned.
    """
    if rr_value != 'No':
        return rr_value

    # Built once per call; the stop-word list used to be lower-cased and
    # scanned again for every single token.
    stop_set = {word.lower() for word in stopwords}
    clean_text = ' '.join(
        token for token in text.split(' ') if token.lower() not in stop_set
    )
    words = clean_text.split()
    if len(words) > len(set(words)):
        return " Yes - Duplicate Words maybe found in Summary"
    return rr_value
| |
| |
def count_caps(summary, rr_value):
    """Flag a summary that contains more than ten upper-case words.

    The check only runs when ``rr_value`` is 'No'; any other value is
    returned untouched. Runs of capital letters and whitespace delimited by
    word boundaries are collected and their words counted; more than ten
    gives 'Yes - Summary May Contain Lot of Words in Upper Case', otherwise
    ``rr_value`` is returned.
    """
    if rr_value != 'No':
        return rr_value

    upper_runs = re.findall(r"\b[A-Z\s]+\b", summary)
    match_length = len(' '.join(upper_runs).split())
    if match_length > 10:
        return 'Yes - Summary May Contain Lot of Words in Upper Case'
    return rr_value
| |
| |
def route_miss(summary, rr_value, int_dec):
    """Flag a summary that shares no content word with the intervention text.

    The check only runs when ``rr_value`` is 'No'; any other value is
    returned untouched. Both ``summary`` and ``int_dec`` are stripped of stop
    words (compared in lower case) and split into words; 'No' is returned
    when at least one summary word also occurs in ``int_dec``, otherwise
    'Yes - Route/Dose info might have been missed'.
    """
    if rr_value != 'No':
        return rr_value

    # Built once per call; the stop-word list used to be lower-cased and
    # scanned again for every single token of both texts.
    stop_set = {word.lower() for word in stopwords}

    def _content_words(raw):
        # Same two-step tokenisation as before: filter on single-space
        # tokens, then re-split the survivors on any whitespace.
        kept = ' '.join(
            token for token in raw.split(' ') if token.lower() not in stop_set
        )
        return kept.split()

    summ_list = _content_words(summary)
    int_words = set(_content_words(int_dec))
    if any(word in int_words for word in summ_list):
        return "No"
    return "Yes - Route/Dose info might have been missed"
|
|
|
|