# Scrape artifact from the hosting page (Hugging Face Spaces status banner:
# "Spaces: Sleeping") — not part of the program; kept as a comment so the
# file remains valid Python.
import re
import uuid
from datetime import datetime

import pandas as pd
import PyPDF2
import streamlit as st
import tiktoken

from azure_openai import evaluation_process, process_insight, process_compare, risk_score_process, process_chunks
from helper_functions import extract_text_from_pdf
def count_tokens_with_tiktoken(text):
    """
    Count the tokens in a given text using tiktoken's cl100k_base encoding.

    :param text: Input text.
    :return: Tuple ``(token_count, tokens)`` where ``tokens`` is the list of
        encoded token ids.  (The previous docstring claimed only a count was
        returned; the function has always returned this pair, and callers
        unpack it as ``count, tokens = ...``.)
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)
    return len(tokens), tokens
def split_text_into_chunks_with_tiktoken_and_pages(text, page_texts, chunk_size, overlap):
    """
    Split text into token chunks with overlap using tiktoken, tracking the
    start and end page numbers of each chunk.

    :param text: Combined text of the document.
    :param page_texts: List of tuples ``[(page_number, page_text), ...]``.
    :param chunk_size: Maximum size of each chunk in tokens (must be > 0).
    :param overlap: Number of overlapping tokens between consecutive chunks
        (must satisfy ``0 <= overlap < chunk_size``).
    :return: List of dicts with keys ``ChunkText``, ``TokenCount``,
        ``StartPage``, ``EndPage``, ``ChunkID``.
    :raises ValueError: If ``chunk_size`` is not positive or ``overlap`` is
        negative or >= ``chunk_size``.  (Previously ``overlap == chunk_size``
        crashed with an opaque ``range() arg 3 must not be zero`` and
        ``overlap > chunk_size`` silently produced no chunks.)
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of tokens")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    # Encode once with a single tokenizer instance; the original tokenized
    # the full text twice (via count_tokens_with_tiktoken plus a second
    # get_encoding here).
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)

    # Map each token position in the combined text to its page number.
    # NOTE(review): pages are tokenized independently of the combined text,
    # so the map length may differ slightly from len(tokens); out-of-range
    # positions fall back below.
    page_token_map = []
    for page_number, page_text in page_texts:
        page_token_map.extend([page_number] * len(tokenizer.encode(page_text)))

    chunks = []
    step = chunk_size - overlap  # guaranteed positive by the checks above
    for start in range(0, len(tokens), step):
        end = min(start + chunk_size, len(tokens))
        chunk_tokens = tokens[start:end]
        chunk_text = tokenizer.decode(chunk_tokens)
        start_page = page_token_map[start] if start < len(page_token_map) else None
        if end - 1 < len(page_token_map):
            end_page = page_token_map[end - 1]
        else:
            # Guard against empty page_texts (previously an IndexError).
            end_page = page_texts[-1][0] if page_texts else None
        chunks.append({
            "ChunkText": chunk_text,
            "TokenCount": len(chunk_tokens),
            "StartPage": start_page,
            "EndPage": end_page,
            "ChunkID": str(uuid.uuid4()),
        })
    return chunks
def split_dataframe_with_combined_text_and_pages(df, chunk_size, overlap):
    """
    Chunk the combined text of every document row in *df* using tiktoken.

    Each resulting chunk carries its start and end page numbers.

    :param df: DataFrame with columns ['Title', 'Text', 'PageTexts'].
    :param chunk_size: Maximum chunk size in tokens.
    :param overlap: Overlapping token count between consecutive chunks.
    :return: DataFrame with columns
        ['ChunkText', 'TokenCount', 'StartPage', 'EndPage', 'ChunkID'].
    """
    all_chunks = []
    for _, record in df.iterrows():
        all_chunks.extend(
            split_text_into_chunks_with_tiktoken_and_pages(
                record['Text'], record['PageTexts'], chunk_size, overlap
            )
        )
    return pd.DataFrame(all_chunks)
def main():
    """
    Streamlit entry point for the RegIntel Risk Analyser.

    Pipeline (each stage gated on the previous one succeeding):
      1. Upload external + internal Reg Intel PDFs, then a draft submission PDF.
      2. Extract text and report token counts for all three documents.
      3. Chunk each document (10000-token chunks, 1000-token overlap).
      4. Evaluate both intel documents against the chosen topic; abort if
         neither is relevant.
      5. Generate insights, keep only those classified 'impact'.
      6. Compare impact insights to the draft submission; keep rows flagged
         ReviewNeeded.
      7. Risk-score the remaining comparisons and display the result.
    """
    st.set_page_config(page_title="RegIntel Risk Analyser", page_icon=":vertical_traffic_light:")
    st.title("External RegIntel Risk Analyser :vertical_traffic_light:")
    # Focus area that steers every downstream azure_openai call.
    topic = st.selectbox("Please choose a focus for the system", ("Labelling",
                         "Institutional Review Board/Independent Ethics Committee",
                         "Investigator", "Sponsor",
                         "Clinical Trial Protocol and protocol amendments",
                         "Investigator's Brochure", "Conduct of Clinical Trial",
                         "Monitoring", "Auditing",
                         "Data handling and record keeping",
                         "clinical trial reports",
                         "Responsibilities of the Sponsor and Investigator",
                         "Sponsor Inspection Preparation"),)
    uploaded_extintl_file_insight = st.file_uploader("Upload a External Reg Intel", type="pdf")
    uploaded_interintel_file_insight = st.file_uploader("Upload a Internal Reg Intel", type="pdf")
    # The draft-submission uploader only appears once both intel files exist.
    if uploaded_extintl_file_insight is not None and uploaded_interintel_file_insight is not None:
        uploaded_file_SOP = st.file_uploader("Upload the draft submission file", type="pdf")
        if uploaded_file_SOP is not None:
            # --- Stage 2: text extraction + token counts -------------------
            with st.spinner("Processing External Reg Intel"):
                ext_intl_text_insight, ext_intl_page_texts_insight = extract_text_from_pdf(uploaded_extintl_file_insight)
                token_count_insight, _ = count_tokens_with_tiktoken(ext_intl_text_insight)
                st.sidebar.success("External Reg Intel file successfully processed")
                st.write("Token Count")
                st.write(f"The PDF contains **{token_count_insight}** tokens.")
            with st.spinner("Processing Internal Reg Intel"):
                int_intl_text_insight, int_intl_page_texts_insight = extract_text_from_pdf(uploaded_interintel_file_insight)
                # NOTE(review): reuses token_count_insight, overwriting the
                # external count after it has been displayed — confirm this
                # shadowing is intentional.
                token_count_insight, _ = count_tokens_with_tiktoken(int_intl_text_insight)
                st.sidebar.success("Internal Reg Intel file successfully processed")
                st.write("Token Count")
                st.write(f"The PDF contains **{token_count_insight}** tokens.")
            with st.spinner("Processing the draft submission file Text..."):
                text_SOP, page_texts_SOP = extract_text_from_pdf(uploaded_file_SOP)
                token_count_SOP, _ = count_tokens_with_tiktoken(text_SOP)
                st.sidebar.success("draft submission file successfully processed")
                st.write("Token Count")
                st.write(f"The PDF contains **{token_count_SOP}** tokens.")
            # --- Stage 3: chunking (10000-token chunks, 1000 overlap) ------
            with st.spinner("Processing the Insight Document..."):
                df_ei_input_insight = pd.DataFrame([{ "Title": uploaded_extintl_file_insight.name, "Text": ext_intl_text_insight, "PageTexts": ext_intl_page_texts_insight }])
                df_ei_insight_chunks = split_dataframe_with_combined_text_and_pages(df_ei_input_insight, 10000, 1000)
                st.write("Processed External Reg Intel")
                st.sidebar.success("Processed External Reg Intel")
                st.write(df_ei_insight_chunks)
            with st.spinner("Processing the Insight Document..."):
                df_ii_input_insight = pd.DataFrame([{ "Title": uploaded_interintel_file_insight.name, "Text": int_intl_text_insight, "PageTexts": int_intl_page_texts_insight }])
                df_ii_insight_chunks = split_dataframe_with_combined_text_and_pages(df_ii_input_insight, 10000, 1000)
                st.write("Processed Internal Reg Intel")
                st.sidebar.success("Processed Internal Reg Intel")
                st.write(df_ii_insight_chunks)
            with st.spinner("Processing the draft submission file..."):
                df_input_SOP = pd.DataFrame([{ "Title": uploaded_file_SOP.name, "Text": text_SOP, "PageTexts": page_texts_SOP }])
                df_sop_chunks = split_dataframe_with_combined_text_and_pages(df_input_SOP, 10000, 1000)
                st.write("Processed draft submission file")
                st.sidebar.success("Processed draft submission file")
                st.write(df_sop_chunks)
            # --- Stage 4: topic-relevance evaluation -----------------------
            with st.spinner("Evaluating document"):
                # evaluation_process returns (per-chunk eval df, consensus, score df).
                df_ei_eval, ei_con, ei_score = evaluation_process(df_ei_insight_chunks, topic,"ext")
                ei_score["source"]="external intel"
                df_ei_eval["source"]="external intel"
                df_ii_eval, ii_con, ii_score = evaluation_process(df_ii_insight_chunks, topic,"intl")
                ii_score["source"]="internal intel"
                df_ii_eval["source"]="internal intel"
                score = pd.concat([ei_score, ii_score])
                st.write("External & Internal Inteligence Evaluation")
                st.sidebar.success(f"Evaluation Concensus: {ei_con}")
                st.write(f"Evaluation Concensus: {ei_con}")
                st.write("Evaluation Scores:")
                st.write(score)
                # NOTE(review): consensus is compared as the *string* "False",
                # implying evaluation_process returns strings, not booleans —
                # confirm against azure_openai.evaluation_process.
                if ei_con == "False" and ii_con == "False":
                    st.sidebar.error("Document Not Relevant To Topic")
                    st.write("Document Not Relevant To Topic")
                    st.write("Exiting RegIntel Analysis")
                    return
            # --- Stage 5: insight generation, filtered to 'impact' ---------
            with st.spinner("Creating insights"):
                df_ei_insights = process_chunks(df_ei_insight_chunks, topic,"ext")
                df_ii_insights = process_chunks(df_ii_insight_chunks, topic,"intl")
                df_ei_insights["source"]="external intel"
                df_ii_insights["source"]="internal intel"
                df_insights = pd.concat([df_ei_insights, df_ii_insights])
                st.subheader("External & Internal Inteligence Insights")
                st.sidebar.success("External & Internal Inteligence Insights Created")
                st.write(df_insights)
                filtered_insights_on_impact = df_insights[df_insights['classification'] == 'impact']
                if filtered_insights_on_impact.empty:
                    st.write("No impact insights")
                    st.sidebar.error("No impact insights")
                    return
            # --- Stage 6: compare impact insights to the draft submission --
            with st.spinner("Comparing Impact Classified Insights To draft submission file"):
                df_compare = process_compare(filtered_insights_on_impact, df_sop_chunks, topic)
                st.subheader("Comparison of Insights to draft submission file's")
                st.sidebar.success("Comparison of External & Internal Intel to draft submission file's Complete")
                st.write(df_compare)
                filtered_comparisons_df = df_compare[df_compare['ReviewNeeded'] == True]
                if filtered_comparisons_df.empty:
                    st.write("No reviews needed for this draft submission file")
                    st.sidebar.error("No reviews needed for this draft submission file")
                    return
            # --- Stage 7: risk scoring -------------------------------------
            with st.spinner("Risk Assessing Insights To draft submission file"):
                df_risks = risk_score_process(filtered_comparisons_df, topic)
                st.subheader("Risk Score of Insights to draft submission file's")
                st.sidebar.success("Risk Score of Insights to draft submission file's Completed")
                st.write(df_risks)
# Standard script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()