# prac.py
import re

import fitz  # PyMuPDF
import gradio as gr
import spacy

from config import (
    keywords, numeric_regex, exclude_brackets_regex, date_regex,
    table_regex, gender_regex, age_regex, author_pattern,
    exclude_words, key_sections, follow_up
)

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")


def normalize_text(text):
    """Repair common PDF-extraction artifacts in *text* and collapse it to one line.

    Applies a fixed sequence of regex substitutions (order matters), then joins
    all lines with single spaces.
    """
    # Space between a delimiter run and the following letter, e.g. "x),y" -> "x), y"
    text = re.sub(r'([a-zA-Z0-9])([\),.!?;-]+)([a-zA-Z])', r'\1\2 \3', text)
    # Drop '.' between two lowercase letters, e.g. "et al. xxx"
    text = re.sub(r'([a-z])([\.])([\s]*)([a-z])', r'\1 \3\4', text)
    # Join three dot-separated numbers with dashes, e.g. "000.55.66" -> "000-55-66"
    text = re.sub(r'([0-9]+)([\.]+)([0-9]+)([\.]+)([0-9]+)', r'\1-\3-\5', text)
    # Space between a letter and a number
    text = re.sub(r'([a-z])([\.]*)([0-9])', r'\1\2 \3', text)
    # Insert '. ' where a lowercase word runs into an uppercase one,
    # e.g. "removed by day threeHe continued" -> "three. He"
    text = re.sub(r'(\s)([a-z0-9]+)([A-Z])([\w]+)', r'\1\2. \3\4', text)
    # Sentence break across a newline, e.g. "xxx5\nYyy" -> "xxx5. Yyy"
    text = re.sub(r'([a-z0-9])([\n]+)([A-Z])', r'\1\. \3', text)
    # Remove runs of extra '.'s, if any
    text = re.sub(r'(\.)([\s]*)([\.]+)', r'\1', text)
    # Re-join broken hyphenations, e.g. "trans - anethole" -> "trans-anethole"
    text = re.sub(r'([a-zA-Z0-9])([\s]*)([-])([\s]*)([a-zA-Z0-9])', r'\1\3\5', text)
    return " ".join(line.strip() for line in text.splitlines())


def extract_sentences(text):
    """Split *text* into sentences at terminal punctuation."""
    # NOTE(review): the split pattern was truncated in this copy of the file
    # (everything after "r'(?" was lost). A lookbehind on sentence-ending
    # punctuation matches how the result is consumed downstream — confirm
    # against the original source.
    return re.split(r'(?<=[.!?])\s+', text)
def highlight_keywords(sentence, user_keywords):
    """Wrap every case-insensitive occurrence of a keyword in *sentence* with <mark> tags.

    Single-word keywords are matched on word boundaries; multi-word phrases are
    matched verbatim. Returns the sentence unchanged when no keywords are given.
    """
    if not user_keywords:
        return sentence

    # Separate single-word and multi-word keywords: only single words get \b
    # boundaries (a \b around an escaped phrase could misbehave at its edges).
    single_words = [kw for kw in user_keywords if ' ' not in kw]
    phrases = [kw for kw in user_keywords if ' ' in kw]

    # Escape keywords so user input is matched literally
    escaped_single_words = [re.escape(kw) for kw in single_words]
    escaped_phrases = [re.escape(kw) for kw in phrases]

    patterns = []
    if escaped_single_words:
        patterns.append(r'\b(?:' + '|'.join(escaped_single_words) + r')\b')
    if escaped_phrases:
        patterns.append(r'(?:' + '|'.join(escaped_phrases) + r')')
    if not patterns:
        return sentence
    combined_pattern = re.compile('|'.join(patterns), re.IGNORECASE)

    def replacer(match):
        # NOTE(review): the <mark></mark> tags were stripped from this copy of
        # the file; restored to match the interface CSS that styles `mark`.
        return f"<mark>{match.group(0)}</mark>"

    return combined_pattern.sub(replacer, sentence)


def process_file(file_path, user_keywords, check_time_duration=False):
    """
    Process a PDF file: extract sentences that match the criteria, filter them
    by *user_keywords* (skipped in time-duration mode) and highlight keywords.

    Returns (highlighted_sentences, comma-separated author string).
    """
    doc = fitz.open(file_path)
    try:
        # Authors are only looked for on the first page.
        authors_str = ', '.join(extract_authors(doc[0]))
        all_extracted_sentences = []
        for page in doc:
            text = normalize_text(page.get_text())
            for sentence in extract_sentences(text):
                if matches_criteria(sentence, check_time_duration):
                    all_extracted_sentences.append(sentence.strip())
    finally:
        # bugfix: close the document even if extraction raises (the original
        # only closed it on the success path, leaking the file handle)
        doc.close()

    if not check_time_duration:
        all_extracted_sentences = [s for s in all_extracted_sentences
                                   if matches_keyword(s, user_keywords)]
    highlighted_sentences = [highlight_keywords(s, user_keywords)
                             for s in all_extracted_sentences]
    return highlighted_sentences, authors_str


def process_text(input_text, user_keywords, check_time_duration=False):
    """
    Process raw input text the same way as process_file, but without author
    extraction.

    Returns (highlighted_sentences, placeholder author string).
    """
    refined_text = normalize_text(input_text)
    extracted_sentences = [s.strip() for s in extract_sentences(refined_text)
                           if matches_criteria(s, check_time_duration)]
    if not check_time_duration:
        extracted_sentences = [s for s in extracted_sentences
                               if matches_keyword(s, user_keywords)]
    highlighted_sentences = [highlight_keywords(s, user_keywords)
                             for s in extracted_sentences]
    return highlighted_sentences, "Authors not extracted from text input."
def handle_input(file_path=None, input_text=None, keyword_group=None,
                 custom_keywords=None, time_duration=False):
    """
    Handle user input from the Gradio interface, process the file or text,
    and return an HTML string with the authors and highlighted sentences.
    """
    # Decide on which keywords to use: a predefined group plus any custom ones.
    user_keywords = []
    if keyword_group:
        user_keywords = keywords.get(keyword_group, [])
    if custom_keywords:
        user_keywords.extend(kw.strip() for kw in custom_keywords.split(",")
                             if kw.strip())
    if not user_keywords and not time_duration:
        return "No keyword provided."

    if file_path:
        extracted_sentences, authors_str = process_file(
            file_path, user_keywords, time_duration)
    elif input_text:
        extracted_sentences, authors_str = process_text(
            input_text, user_keywords, time_duration)
    else:
        return "No input provided."

    if extracted_sentences:
        # Combine authors and highlighted sentences into HTML.
        # NOTE(review): the HTML tags in these f-strings were stripped from this
        # copy of the file; reconstructed as simple paragraphs, which matches
        # the interface CSS (`p { margin-bottom: 10px; }`) — confirm against
        # the original markup. join() replaces the original quadratic `+=`.
        parts = [f"<p><strong>Authors:</strong> {authors_str}</p>"]
        parts.extend(f"<p>{sentence}</p>" for sentence in extracted_sentences)
        return "".join(parts)
    return "No matching sentences found."


# Gradio Interface
iface = gr.Interface(
    fn=handle_input,
    inputs=[
        gr.File(label="Upload PDF or Text File", type="filepath"),
        gr.Textbox(label="Enter Text", placeholder="Type or paste text here..."),
        gr.Radio(
            choices=list(keywords.keys()),
            label="Information related to..."
        ),
        gr.Textbox(
            label="Enter Custom Keywords",
            placeholder="e.g., migraine, headache"
        ),
        # gr.Checkbox(
        #     label="Check Time Duration Criteria",
        #     value=False
        # )
    ],
    outputs=gr.HTML(label="Processed Output"),
    title="BioMedical Information Extraction",
    description="""
    Made by: Sumit Kumar (2311006), Ramavath Tharun (21219)
    Supervisor: Dr. Tanmay Basu
    Indian Institute of Science Education and Research
    Upload a PDF file or enter text, then select a keyword group or enter custom keywords to extract and highlight relevant sentences.
    """,
    examples=None,  # Example files or texts can be added here if desired
    allow_flagging="never",
    # bugfix: was cache_examples=True, which is invalid with examples=None —
    # there is nothing to cache and recent Gradio versions reject it
    cache_examples=False,
    # Custom CSS to style the <mark> highlight tag
    css="""
    mark {
        background-color: blue;
        padding: 0;
        border-radius: 2px;
    }
    /* Optional: Adjust paragraph spacing */
    p {
        margin-bottom: 10px;
    }
    """
)

iface.launch(share=True)